├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── check_environment.py ├── convert_deps.py ├── environment.yml ├── notebooks ├── 00-README.ipynb ├── 01-Indexing.ipynb ├── 02-Alignment.ipynb ├── 03-Groupby.ipynb ├── 04-Tidy-Data.ipynb ├── 05-Timeseries.ipynb ├── 06-Dask.ipynb ├── 07-Performance.ipynb ├── 08-Pandas-NumPy-ScikitLearn.ipynb ├── 09-Visualization.ipynb ├── 10-Iterators.ipynb ├── data │ ├── beer-raw-small.txt.gz │ ├── cpi.csv │ ├── flights-ts.csv.gz │ ├── flights_coord.csv │ ├── games.csv │ ├── gdp.csv │ ├── ny-flights.csv.gz │ ├── rpi.csv │ ├── stocks.csv │ ├── subset.csv.gz │ ├── tidy_checkpoint.csv │ └── tips.csv ├── mydask.png └── solutions │ ├── aligment_concat.py │ ├── aligment_merge.py │ ├── alignment_00.py │ ├── alignment_01.py │ ├── alignment_02.py │ ├── alignment_03.py │ ├── alignment_positive.py │ ├── alignment_real_gdp09.py │ ├── dropna_columns.py │ ├── eda_00.py │ ├── eda_01.py │ ├── eda_02.py │ ├── eda_03.py │ ├── eda_04.py │ ├── groupby_00.py │ ├── groupby_00b.py │ ├── groupby_01.py │ ├── groupby_02.py │ ├── groupby_03.py │ ├── groupby_04.py │ ├── groupby_abv.py │ ├── groupby_format_review.py │ ├── indexing_00.py │ ├── indexing_01.py │ ├── indexing_02.py │ ├── indexing_cancelled.py │ ├── indexing_datetime.py │ ├── indexing_drop_columns.py │ ├── indexing_drop_index.py │ ├── indexing_ex1_engine_columns.py │ ├── indexing_ex2_5th.py │ ├── indexing_ex3_years.py │ ├── indexing_ex4_mpg.py │ ├── indexing_ex5_mpg_and_cylinders.py │ ├── indexing_loc.py │ ├── indexing_thoughts.py │ ├── performance_00.py │ ├── performance_01.py │ ├── performance_02.py │ ├── performance_concat.py │ ├── performance_kd.py │ ├── readme_00.py │ ├── sklearn_pandas_split.py │ ├── tidy_00.py │ ├── tidy_01.py │ ├── tidy_02.py │ ├── tidy_03.py │ ├── tidy_04.py │ ├── tidy_05.py │ ├── tidy_06.py │ ├── tidy_07.py │ ├── tidy_drest.py │ ├── tidy_sanity.py │ ├── timeseries_departure.py │ ├── timeseries_monthly_ma.py │ ├── timeseries_resample.py │ ├── timeseries_resample_agg.py │ ├── timeseries_timedelta.py │ ├── visualize_00.py │ ├── visualize_01.py │ ├── visualize_02a.py │ ├── visualize_02b.py │ ├── visualize_03.py │ ├── visualize_04.py │ ├── visualize_05.py │ ├── visualize_06.py │ ├── visualize_07.py │ ├── visualize_08.py │ ├── visualize_09.py │ └── visualize_10.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.md 2 | *.pdf 3 | *_files/ 4 | __pycache__/ 5 | !README.md 6 | *.ipynb_checkpoints/ 7 | notebooks/data/beer/*.gz 8 | *.DS_Store 9 | pandas-scipy 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 
14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution 4.0 International Public License 58 | 59 | By exercising the Licensed Rights (defined below), You accept and agree 60 | to be bound by the terms and conditions of this Creative Commons 61 | Attribution 4.0 International Public License ("Public License"). To the 62 | extent this Public License may be interpreted as a contract, You are 63 | granted the Licensed Rights in consideration of Your acceptance of 64 | these terms and conditions, and the Licensor grants You such rights in 65 | consideration of benefits the Licensor receives from making the 66 | Licensed Material available under these terms and conditions. 67 | 68 | 69 | Section 1 -- Definitions. 70 | 71 | a. Adapted Material means material subject to Copyright and Similar 72 | Rights that is derived from or based upon the Licensed Material 73 | and in which the Licensed Material is translated, altered, 74 | arranged, transformed, or otherwise modified in a manner requiring 75 | permission under the Copyright and Similar Rights held by the 76 | Licensor. 
For purposes of this Public License, where the Licensed 77 | Material is a musical work, performance, or sound recording, 78 | Adapted Material is always produced where the Licensed Material is 79 | synched in timed relation with a moving image. 80 | 81 | b. Adapter's License means the license You apply to Your Copyright 82 | and Similar Rights in Your contributions to Adapted Material in 83 | accordance with the terms and conditions of this Public License. 84 | 85 | c. Copyright and Similar Rights means copyright and/or similar rights 86 | closely related to copyright including, without limitation, 87 | performance, broadcast, sound recording, and Sui Generis Database 88 | Rights, without regard to how the rights are labeled or 89 | categorized. For purposes of this Public License, the rights 90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 91 | Rights. 92 | 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. Share means to provide material to the public by any means or 116 | process that requires permission under the Licensed Rights, such 117 | as reproduction, public display, public performance, distribution, 118 | dissemination, communication, or importation, and to make material 119 | available to the public including in ways that members of the 120 | public may access the material from a place and at a time 121 | individually chosen by them. 122 | 123 | j. Sui Generis Database Rights means rights other than copyright 124 | resulting from Directive 96/9/EC of the European Parliament and of 125 | the Council of 11 March 1996 on the legal protection of databases, 126 | as amended and/or succeeded, as well as other essentially 127 | equivalent rights anywhere in the world. 128 | 129 | k. You means the individual or entity exercising the Licensed Rights 130 | under this Public License. Your has a corresponding meaning. 131 | 132 | 133 | Section 2 -- Scope. 134 | 135 | a. License grant. 136 | 137 | 1. Subject to the terms and conditions of this Public License, 138 | the Licensor hereby grants You a worldwide, royalty-free, 139 | non-sublicensable, non-exclusive, irrevocable license to 140 | exercise the Licensed Rights in the Licensed Material to: 141 | 142 | a. reproduce and Share the Licensed Material, in whole or 143 | in part; and 144 | 145 | b. produce, reproduce, and Share Adapted Material. 146 | 147 | 2. Exceptions and Limitations. 
For the avoidance of doubt, where 148 | Exceptions and Limitations apply to Your use, this Public 149 | License does not apply, and You do not need to comply with 150 | its terms and conditions. 151 | 152 | 3. Term. The term of this Public License is specified in Section 153 | 6(a). 154 | 155 | 4. Media and formats; technical modifications allowed. The 156 | Licensor authorizes You to exercise the Licensed Rights in 157 | all media and formats whether now known or hereafter created, 158 | and to make technical modifications necessary to do so. The 159 | Licensor waives and/or agrees not to assert any right or 160 | authority to forbid You from making technical modifications 161 | necessary to exercise the Licensed Rights, including 162 | technical modifications necessary to circumvent Effective 163 | Technological Measures. For purposes of this Public License, 164 | simply making modifications authorized by this Section 2(a) 165 | (4) never produces Adapted Material. 166 | 167 | 5. Downstream recipients. 168 | 169 | a. Offer from the Licensor -- Licensed Material. Every 170 | recipient of the Licensed Material automatically 171 | receives an offer from the Licensor to exercise the 172 | Licensed Rights under the terms and conditions of this 173 | Public License. 174 | 175 | b. No downstream restrictions. You may not offer or impose 176 | any additional or different terms or conditions on, or 177 | apply any Effective Technological Measures to, the 178 | Licensed Material if doing so restricts exercise of the 179 | Licensed Rights by any recipient of the Licensed 180 | Material. 181 | 182 | 6. No endorsement. Nothing in this Public License constitutes or 183 | may be construed as permission to assert or imply that You 184 | are, or that Your use of the Licensed Material is, connected 185 | with, or sponsored, endorsed, or granted official status by, 186 | the Licensor or others designated to receive attribution as 187 | provided in Section 3(a)(1)(A)(i). 188 | 189 | b. Other rights. 190 | 191 | 1. Moral rights, such as the right of integrity, are not 192 | licensed under this Public License, nor are publicity, 193 | privacy, and/or other similar personality rights; however, to 194 | the extent possible, the Licensor waives and/or agrees not to 195 | assert any such rights held by the Licensor to the limited 196 | extent necessary to allow You to exercise the Licensed 197 | Rights, but not otherwise. 198 | 199 | 2. Patent and trademark rights are not licensed under this 200 | Public License. 201 | 202 | 3. To the extent possible, the Licensor waives any right to 203 | collect royalties from You for the exercise of the Licensed 204 | Rights, whether directly or through a collecting society 205 | under any voluntary or waivable statutory or compulsory 206 | licensing scheme. In all other cases the Licensor expressly 207 | reserves any right to collect such royalties. 208 | 209 | 210 | Section 3 -- License Conditions. 211 | 212 | Your exercise of the Licensed Rights is expressly made subject to the 213 | following conditions. 214 | 215 | a. Attribution. 216 | 217 | 1. If You Share the Licensed Material (including in modified 218 | form), You must: 219 | 220 | a. retain the following if it is supplied by the Licensor 221 | with the Licensed Material: 222 | 223 | i. 
identification of the creator(s) of the Licensed 224 | Material and any others designated to receive 225 | attribution, in any reasonable manner requested by 226 | the Licensor (including by pseudonym if 227 | designated); 228 | 229 | ii. a copyright notice; 230 | 231 | iii. a notice that refers to this Public License; 232 | 233 | iv. a notice that refers to the disclaimer of 234 | warranties; 235 | 236 | v. a URI or hyperlink to the Licensed Material to the 237 | extent reasonably practicable; 238 | 239 | b. indicate if You modified the Licensed Material and 240 | retain an indication of any previous modifications; and 241 | 242 | c. indicate the Licensed Material is licensed under this 243 | Public License, and include the text of, or the URI or 244 | hyperlink to, this Public License. 245 | 246 | 2. You may satisfy the conditions in Section 3(a)(1) in any 247 | reasonable manner based on the medium, means, and context in 248 | which You Share the Licensed Material. For example, it may be 249 | reasonable to satisfy the conditions by providing a URI or 250 | hyperlink to a resource that includes the required 251 | information. 252 | 253 | 3. If requested by the Licensor, You must remove any of the 254 | information required by Section 3(a)(1)(A) to the extent 255 | reasonably practicable. 256 | 257 | 4. If You Share Adapted Material You produce, the Adapter's 258 | License You apply must not prevent recipients of the Adapted 259 | Material from complying with this Public License. 260 | 261 | 262 | Section 4 -- Sui Generis Database Rights. 263 | 264 | Where the Licensed Rights include Sui Generis Database Rights that 265 | apply to Your use of the Licensed Material: 266 | 267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 268 | to extract, reuse, reproduce, and Share all or a substantial 269 | portion of the contents of the database; 270 | 271 | b. if You include all or a substantial portion of the database 272 | contents in a database in which You have Sui Generis Database 273 | Rights, then the database in which You have Sui Generis Database 274 | Rights (but not its individual contents) is Adapted Material; and 275 | 276 | c. You must comply with the conditions in Section 3(a) if You Share 277 | all or a substantial portion of the contents of the database. 278 | 279 | For the avoidance of doubt, this Section 4 supplements and does not 280 | replace Your obligations under this Public License where the Licensed 281 | Rights include other Copyright and Similar Rights. 282 | 283 | 284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 285 | 286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 296 | 297 | b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 306 | 307 | c. The disclaimer of warranties and limitation of liability provided 308 | above shall be interpreted in a manner that, to the extent 309 | possible, most closely approximates an absolute disclaimer and 310 | waiver of all liability. 311 | 312 | 313 | Section 6 -- Term and Termination. 314 | 315 | a. This Public License applies for the term of the Copyright and 316 | Similar Rights licensed here. However, if You fail to comply with 317 | this Public License, then Your rights under this Public License 318 | terminate automatically. 319 | 320 | b. Where Your right to use the Licensed Material has terminated under 321 | Section 6(a), it reinstates: 322 | 323 | 1. automatically as of the date the violation is cured, provided 324 | it is cured within 30 days of Your discovery of the 325 | violation; or 326 | 327 | 2. upon express reinstatement by the Licensor. 328 | 329 | For the avoidance of doubt, this Section 6(b) does not affect any 330 | right the Licensor may have to seek remedies for Your violations 331 | of this Public License. 332 | 333 | c. For the avoidance of doubt, the Licensor may also offer the 334 | Licensed Material under separate terms or conditions or stop 335 | distributing the Licensed Material at any time; however, doing so 336 | will not terminate this Public License. 337 | 338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 339 | License. 340 | 341 | 342 | Section 7 -- Other Terms and Conditions. 343 | 344 | a. The Licensor shall not be bound by any additional or different 345 | terms or conditions communicated by You unless expressly agreed. 346 | 347 | b. Any arrangements, understandings, or agreements regarding the 348 | Licensed Material not stated herein are separate from and 349 | independent of the terms and conditions of this Public License. 350 | 351 | 352 | Section 8 -- Interpretation. 353 | 354 | a. For the avoidance of doubt, this Public License does not, and 355 | shall not be interpreted to, reduce, limit, restrict, or impose 356 | conditions on any use of the Licensed Material that could lawfully 357 | be made without permission under this Public License. 358 | 359 | b. To the extent possible, if any provision of this Public License is 360 | deemed unenforceable, it shall be automatically reformed to the 361 | minimum extent necessary to make it enforceable. If the provision 362 | cannot be reformed, it shall be severed from this Public License 363 | without affecting the enforceability of the remaining terms and 364 | conditions. 365 | 366 | c. No term or condition of this Public License will be waived and no 367 | failure to comply consented to unless expressly agreed to by the 368 | Licensor. 369 | 370 | d. 
Nothing in this Public License constitutes or may be interpreted 371 | as a limitation upon, or waiver of, any privileges and immunities 372 | that apply to the Licensor or You, including from the legal 373 | processes of any jurisdiction or authority. 374 | 375 | 376 | ======================================================================= 377 | 378 | Creative Commons is not a party to its public 379 | licenses. Notwithstanding, Creative Commons may elect to apply one of 380 | its public licenses to material it publishes and in those instances 381 | will be considered the “Licensor.” The text of the Creative Commons 382 | public licenses is dedicated to the public domain under the CC0 Public 383 | Domain Dedication. Except for the limited purpose of indicating that 384 | material is shared under a Creative Commons public license or as 385 | otherwise permitted by the Creative Commons policies published at 386 | creativecommons.org/policies, Creative Commons does not authorize the 387 | use of the trademark "Creative Commons" or any other trademark or logo 388 | of Creative Commons without its prior written consent including, 389 | without limitation, in connection with any unauthorized modifications 390 | to any of its public licenses or any other arrangements, 391 | understandings, or agreements concerning use of licensed material. For 392 | the avoidance of doubt, this paragraph does not form part of the 393 | public licenses. 394 | 395 | Creative Commons may be contacted at creativecommons.org. 396 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SRC = $(wildcard notebooks/*.ipynb) 2 | 3 | strip: 4 | nbstripout notebooks/*.ipynb 5 | 6 | requirements.txt: environment.yml 7 | python convert_deps.py 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SciPy 2018 Tutorial: Pandas .head() to .tail() 2 | 3 | [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/deniederhut/Pandas-Tutorial-SciPyConf-2018/master) 4 | 5 | ### https://github.com/deniederhut/Pandas-Tutorial-SciPyConf-2018 6 | 7 | Cluster URL: http://104.155.138.8 8 | 9 | #### Presented by: 10 | - [Tom Augspurger](https://tomaugspurger.github.io/), [Anaconda, Inc.](https://anaconda.org/) 11 | - [Joris Van den Bossche](https://jorisvandenbossche.github.io/), [Université Paris-Saclay Center for Data Science](https://www.datascience-paris-saclay.fr/) 12 | - [Dillon Niederhut](https://dillon.niederhut.us), [Enthought Inc.](https://www.enthought.com) 13 | 14 | 15 | ## First-Time Setup 16 | 17 | #### 1. Install Python 18 | 19 | If you don't already have a working python distribution, you may download one of 20 | 21 | * Miniconda ([https://conda.io/miniconda.html](https://conda.io/miniconda.html)) 22 | * Python.org ([https://www.python.org/downloads/](https://www.python.org/downloads/)) 23 | 24 | You'll need Python 3. 25 | 26 | #### 2. Download Tutorial Materials 27 | 28 | This GitHub repository is all that is needed in terms of tutorial content. 
The simplest solution is to download the material using this link: 29 | 30 | [https://github.com/deniederhut/Pandas-Tutorial-SciPyConf-2018/archive/master.zip](https://github.com/deniederhut/Pandas-Tutorial-SciPyConf-2018/archive/master.zip) 31 | 32 | If you're familiar with Git, you can also clone this repository with: 33 | 34 | ```sh 35 | git clone git@github.com:deniederhut/Pandas-Tutorial-SciPyConf-2018.git 36 | ``` 37 | 38 | It will create a new folder named Pandas-Tutorial-SciPyConf-2018/ with all the 39 | content you will need, including: 40 | 41 | - `requirements.txt` - the package requirements for this tutorial 42 | - `check_environment.py` - a script for testing your installation 43 | - `notebooks/` - the Jupyter notebooks we'll use during the tutoral 44 | 45 | #### 3. Install Required Packages 46 | 47 | If you are using conda, you can install the necessary packages by opening a terminal and entering the following: 48 | 49 | ```sh 50 | conda update conda --yes 51 | conda --version # Should be about 4.5.4 52 | conda env create --file=environment.yml 53 | conda activate pandas-scipy 54 | ``` 55 | 56 | If you are using Python from python.org or your system, you can install the necessary packages by opening a terminal and entering the following: 57 | 58 | ```sh 59 | # Create a new environment 60 | python3 -m venv pandas-scipy 61 | source pandas-scipy/bin/activate 62 | 63 | pip install -U pip wheel setuptools 64 | pip install -U -r requirements.txt 65 | ``` 66 | 67 | #### 4. Test the Installation 68 | 69 | To make sure everything was installed correctly, open a terminal, and change its directory (`cd`) so that your working directory is `Pandas-Tutorial-SciPyConf-2018`. The enter the following: 70 | 71 | ```sh 72 | python check_environment.py 73 | ``` 74 | 75 | #### 5. Start the Notebook 76 | 77 | ```sh 78 | jupyter notebook 79 | ``` 80 | 81 | ## Questions? Problems? 82 | 83 | You may post messages to the slack channel for this tutorial at: [https://scipy2018.slack.com](https://scipy2018.slack.com) 84 | -------------------------------------------------------------------------------- /check_environment.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | packages = ['pandas', 'IPython', 'statsmodels', 'sklearn', 'seaborn', 4 | 'toolz', 'requests', 'scipy'] 5 | 6 | bad = [] 7 | for package in packages: 8 | try: 9 | importlib.import_module(package) 10 | except ImportError: 11 | bad.append("Can't import %s" % package) 12 | else: 13 | if len(bad) > 0: 14 | print('\n'.join(bad)) 15 | else: 16 | try: 17 | import pandas as pd 18 | df = pd.read_csv("notebooks/data/cpi.csv") 19 | print("All good. 
Enjoy the tutorial!") 20 | except Exception as e: 21 | print("Couldn't read CPI") 22 | print(e) 23 | -------------------------------------------------------------------------------- /convert_deps.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert the conda environment.yml to a pip requirements.txt 3 | """ 4 | import yaml 5 | 6 | exclude = {'python=3.6.6', 'nomkl'} 7 | rename = {'pytables': 'tables'} 8 | 9 | with open("environment.yml") as f: 10 | dev = yaml.load(f) 11 | 12 | required = dev['dependencies'] 13 | required = [rename.get(dep, dep).replace("=", "==") for dep in required 14 | if not isinstance(dep, dict) 15 | and dep not in exclude] 16 | pip, = [x for x in dev['dependencies'] if isinstance(x, dict)] 17 | required.extend(pip['pip']) 18 | 19 | 20 | with open("requirements.txt", 'wt') as f: 21 | f.write('\n'.join(required)) 22 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: pandas-scipy 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - python=3.6.6 7 | - numpy=1.14.5 8 | - pandas=0.23.1 9 | - matplotlib=2.2.2 10 | - seaborn=0.8.1 11 | - ipython=6.4.0 12 | - jupyter=1.0.0 13 | - notebook=5.5.0 14 | - dask=0.18.1 15 | - distributed=1.22.0 16 | - toolz=0.9.0 17 | - pandas-datareader=0.6.0 18 | - scikit-learn=0.19.1 19 | - scipy=1.1.0 20 | - statsmodels=0.9.0 21 | - xlrd=1.1.0 22 | - pip: 23 | - lifetimes==0.6.0.0 24 | -------------------------------------------------------------------------------- /notebooks/00-README.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Welcome to the course!\n", 15 | "This notebook will outline our structure for the course, and introduce you to the notebook if you haven't used it before." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Background Expectations\n", 23 | "\n", 24 | "- Hopefully you've used Python before\n", 25 | "- Experience with NumPy will be helpful, but not required\n", 26 | "- Pandas will be the primary focus\n", 27 | "- We'll see bits of scikit-learn and statsmodels" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Course Format\n", 35 | "\n", 36 | "- We'll work through notebooks together (execute each cell)\n", 37 | "- You'll do exercises\n", 38 | "- During exercises, we'll follow-up on questions\n", 39 | "- We'll demonstrate the solutions" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Jupyter Notebook\n", 47 | "\n", 48 | "> The Jupyter Notebook is a web application that allows you to create and share documents that contain live code, equations, visualizations and explanatory text.\n", 49 | "\n", 50 | "- Two Modes: Edit and Command\n", 51 | "- Command -> Edit: `Enter`\n", 52 | "- Edit -> Command: `Esc`\n", 53 | "- Execute a Cell: `Shift+Enter`\n", 54 | "- Down: `j/Down Arrow`\n", 55 | "- Up: `k/Up Arrow`" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Tab Completion\n", 63 | "\n", 64 | "IPython will tab complete method names and function arguments\n", 65 | "\n", 66 | "Use `shift+tab` to inside a function call to show the signature" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# type str.\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "# type str.split()\n" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Exercises\n", 92 | "\n", 93 | "- Lots of small exercises to check understanding\n", 94 | "- Each exercise includes\n", 95 | " + A prompt / question to be answered\n", 96 | " + An empty cell for code\n", 97 | " + A \"magic\" cell that loads a solution\n", 98 | "- Execute the magic cell twice" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "
\n", 106 | "

Exercise: Print 'Hello, world!'

\n", 107 | "
\n", 108 | "\n", 109 | "

Print the text \"Hello, world!\"

" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# Your code here\n" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "# %load solutions/readme_00.py" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Make sure to run the solution cell twice. I'd encourage you to always\n", 135 | "run the solution cell, as later steps in the notebooks will depend on earlier\n", 136 | "steps being correct." 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "## Pandas Cheat Sheet\n", 144 | "\n", 145 | "https://github.com/pandas-dev/pandas/blob/master/doc/cheatsheet/Pandas_Cheat_Sheet.pdf\n", 146 | "\n", 147 | "![cheat sheet](figures/cheat-sheet-preview.png)\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "## Notebooks\n", 155 | "\n", 156 | "1. [Indexing](01-Indexing.ipynb)\n", 157 | "2. [Alignment](02-Alignment.ipynb)\n", 158 | "3. [Iterators & Groupby](03-Iterators-Groupby.ipynb)\n", 159 | "4. [Visualization](04-Visualization.ipynb)\n", 160 | "5. [Tidy Data](05-Tidy-Data.ipynb)\n", 161 | "6. [Performance](06-Performance.ipynb)\n", 162 | "7. [Timeseries](07-Timeseries.ipynb)\n", 163 | "8. [Ecosystem](08-Pandas-NumPy-ScikitLearn.ipynb)" 164 | ] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 3", 170 | "language": "python", 171 | "name": "python3" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 3 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython3", 183 | "version": "3.6.6" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 2 188 | } 189 | -------------------------------------------------------------------------------- /notebooks/03-Groupby.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# The groupby operation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import numpy as np\n", 17 | "import pandas as pd\n", 18 | "import matplotlib.pyplot as plt" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "%matplotlib inline" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "plt.style.use('default')\n", 37 | "plt.rcParams['figure.figsize'] = (12, 6)\n", 38 | "pd.options.display.max_rows = 10" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "I've provided the reviews by the top 100 reviewers.\n", 46 | "We'll use it for talking about groupby." 
47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "df = pd.read_csv(\"data/subset.csv.gz\", compression=\"gzip\",\n", 56 | " parse_dates=['time'])\n", 57 | "df.head()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "df.info()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "## Aside: Namespaces\n", 74 | "\n", 75 | "Pandas has been expanding its use of namespaces (or accessors) on `DataFrame` to group together related methods. This also limits the number of methods direclty attached to `DataFrame` itself, which can be overwhelming.\n", 76 | "\n", 77 | "Currently, we have these namespaces:\n", 78 | "\n", 79 | "- `.str`: defined on `Series` and `Index`es containing strings (object dtype)\n", 80 | "- `.dt`: defined on `Series` with `datetime` or `timedelta` dtype\n", 81 | "- `.cat`: defined on `Series` and `Indexes` with `category` dtype\n", 82 | "- `.plot`: defined on `Series` and `DataFrames`" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "
\n", 90 | "

Exercise: Reviews by Hour

\n", 91 | "
\n", 92 | "\n", 93 | "

Make a barplot of the count of reviews by hour of the day.

" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "- Hint: Use the `.dt` namespace to get the `hour` component of a `datetime`\n", 101 | "- Hint: We've seen `Series.value_counts` for getting the count of each value\n", 102 | "- Hint: Use `.sort_index` to make sure the data is ordered by hour, not count\n", 103 | "- Hint: Use the [`.plot`](http://pandas.pydata.org/pandas-docs/stable/api.html#plotting) namespace to get a `bar` chart" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "%load solutions/groupby_03.py" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "
\n", 127 | "

Exercise: Pale Ales

\n", 128 | "
\n", 129 | "

\n", 130 | "Make a variable `pale_ales` that filters `df` to just rows where `beer_style` contains the string `'pale ale'` (ignoring case)\n", 131 | "

\n", 132 | "- Hint: Use the `df.beer_style.str` namespace and find a method for checking whether a string contains another string." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "%load solutions/groupby_04.py" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "# Groupby" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "Groupby operations come up in a lot of contexts.\n", 163 | "At its root, groupby about doing an operation on many subsets of the data, each of which shares something in common.\n", 164 | "The components of a groupby operation are:" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "## Components of a groupby\n", 172 | "\n", 173 | "1. **split** a table into groups\n", 174 | "2. **apply** a function to each group\n", 175 | "3. **combine** the results into a single DataFrame or Series" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "In pandas the `split` step looks like\n", 183 | "\n", 184 | "```python\n", 185 | "df.groupby( grouper )\n", 186 | "```\n", 187 | "\n", 188 | "`grouper` can be many things\n", 189 | "\n", 190 | "- Series (or string indicating a column in `df`)\n", 191 | "- function (to be applied on the index)\n", 192 | "- dict : groups by *values*\n", 193 | "- `levels=[ names of levels in a MultiIndex ]`" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "## Split\n", 201 | "\n", 202 | "Break a table into smaller logical tables according to some rule" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "gr = df.groupby(\"beer_name\")\n", 212 | "gr" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "We haven't really done any actual work yet, but pandas knows what it needs to know to break the larger `df` into many smaller pieces, one for each distinct `beer_name`." 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "## Apply & Combine\n", 227 | "\n", 228 | "To finish the groupby, we apply a method to the groupby object." 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "review_cols = ['review_appearance', 'review_aroma', 'review_overall',\n", 238 | " 'review_palate', 'review_taste']\n", 239 | "\n", 240 | "df.groupby('beer_name')[review_cols].agg('mean')" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "In this case, the function we applied was `'mean'`.\n", 248 | "Pandas has implemented cythonized versions of certain common methods like mean, sum, etc.\n", 249 | "You can also pass in regular functions like `np.mean`.\n", 250 | "\n", 251 | "In terms of split, apply, combine, split was `df.groupby('beer_name')`. 
\n", 252 | "We apply the `mean` function by passing in `'mean'`.\n", 253 | "Finally, by using the `.agg` method (for aggregate) we tell pandas to combine the results with one output row per group.\n", 254 | "\n", 255 | "You can also pass in regular functions like `np.mean`." 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "df.groupby('beer_name')[review_cols].agg(np.mean).head()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "Finally, [certain methods](http://pandas.pydata.org/pandas-docs/stable/api.html#id35) have been attached to `Groupby` objects." 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "df.groupby('beer_name')[review_cols].mean()" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "
\n", 288 | "

Exercise: Highest Variance

\n", 289 | "
\n", 290 | "\n", 291 | "

Find the `beer_style`s with the greatest variance in `abv`.

\n", 292 | "\n", 293 | "- hint: `.std` calculates the standard deviation (`.var` for variance), and is available on `GroupBy` objects like `gr.abv`.\n", 294 | "- hint: use `.sort_values` to sort a Series by the values (it took us a while to come up with that name)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "%load solutions/groupby_abv.py" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "## `.agg` output shape\n", 318 | "\n", 319 | "The output shape is determined by the grouper, data, and aggregation\n", 320 | "\n", 321 | "- Grouper: Controls the output index\n", 322 | " * single grouper -> Index\n", 323 | " * array-like grouper -> MultiIndex\n", 324 | "- Subject (Groupee): Controls the output data values\n", 325 | " * single column -> Series (or DataFrame if multiple aggregations)\n", 326 | " * multiple columns -> DataFrame\n", 327 | "- Aggregation: Controls the output columns\n", 328 | " * single aggfunc -> Index in the colums\n", 329 | " * multiple aggfuncs -> MultiIndex in the columns (Or 1-D Index if groupee is 1-D)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "\n", 337 | "We'll go into MultiIndexes in a bit, but for know, think of them as regular Indexes with multiple levels (columns)." 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "# single grouper, single groupee, single aggregation\n", 347 | "df.groupby('beer_style').review_overall.agg('mean')" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "# multiple groupers, multiple groupee, single aggregation\n", 357 | "df.groupby(['brewer_id', 'beer_name'])[review_cols].agg(['mean', 'min', 'max', 'std', 'count'])" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "metadata": {}, 363 | "source": [ 364 | "
\n", 365 | "

Exercise: Rating by length

\n", 366 | "
\n", 367 | "\n", 368 | "

Plot the relationship between review length (number of characters) and average `review_overall`.

\n", 369 | "\n", 370 | "- Hint: use `.plot(style='k.')`\n", 371 | "- We've grouped by columns so far, you can also group by any series with the same length" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "%load solutions/groupby_00.py" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "
\n", 395 | "

Exercise: Reviews by Length

\n", 396 | "
\n", 397 | "\n", 398 | "

Find the relationship between review length (number of **words** and average `review_overall`.)

\n", 399 | "\n", 400 | "- Hint: You can pass a [regular expression](https://docs.python.org/3/howto/regex.html#matching-characters) to any of the `.str` methods." 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "%load solutions/groupby_00b.py" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "
\n", 424 | "

Exercise: Rating by number of Reviews

\n", 425 | "
\n", 426 | "\n", 427 | "

Find the relationship between the number of reviews for a beer and the average `review_overall`.

\n" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [ 443 | "%load solutions/groupby_01.py" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "## Transform\n", 451 | "\n", 452 | "A *transform* is a function whose output is the same shape as the input." 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "Recall that a groupby has three steps: split, apply, combine.\n", 460 | "So far, all of the functions we've applied have been *aggregations*: the rule for \"combine\" is one row per group.\n", 461 | "\n", 462 | "You can use `Groupby.transform` when you have an operation that should be done *groupwise*, but the result should be the same shape.\n", 463 | "For example, suppose we wanted to normalize each reviewer's scores by their average score. " 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "# Define demean(v: array) -> array\n", 473 | "def demean(v):\n", 474 | " return v - v.mean()" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": {}, 480 | "source": [ 481 | "Just calling `demean` on the entire Series will normalize by the *global* average." 482 | ] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "execution_count": null, 487 | "metadata": {}, 488 | "outputs": [], 489 | "source": [ 490 | "demean(df.review_overall)" 491 | ] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "Now, let's demean each individual's reviews by their own average.\n", 498 | "This could be useful if, for example, you were building a recommendation system.\n", 499 | "A rating of 4 from someone's whose average is 2 is in some sense more meaningful that a 4 from someone who always gives 4s." 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "normalized = df.groupby(\"profile_name\")[review_cols].transform(demean)\n", 509 | "normalized.head()" 510 | ] 511 | }, 512 | { 513 | "cell_type": "markdown", 514 | "metadata": {}, 515 | "source": [ 516 | "We used `.transform` because the desired output was the same shape as the input.\n", 517 | "Just like `.agg` informs pandas that you want `1 input group → 1 output row`, the `.transform` method informs pandas that you want `1 input row → 1 output row`.\n", 518 | "\n", 519 | "`.transform` operates on each column independently." 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "
\n", 527 | "

Exercise: Personal Trend?

\n", 528 | "
\n", 529 | "\n", 530 | "

Do reviewer's `review_overall` trend over a person's time reviewing?

\n", 531 | "\n", 532 | "Hint: Need an indictor that tracks which review this is for that person. That is, we need a cumulative count of reviews per person." 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": null, 545 | "metadata": {}, 546 | "outputs": [], 547 | "source": [ 548 | "%load solutions/groupby_02.py" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "## General `.apply`\n", 556 | "\n", 557 | "We've seen `.agg` for outputting 1 row per group, and `.transform` for outputting 1 row per input row.\n", 558 | "\n", 559 | "The final kind of function application is `.apply`.\n", 560 | "This can do pretty much whatever you want.\n", 561 | "We'll see an example in a later notebook." 562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "metadata": {}, 567 | "source": [ 568 | "## Summary\n", 569 | "\n", 570 | "- We used groupby to analyze data by subsets\n", 571 | "- We used `agg` to summarize groups and `transform` to perform group-wise transformations" 572 | ] 573 | } 574 | ], 575 | "metadata": { 576 | "kernelspec": { 577 | "display_name": "Python 3", 578 | "language": "python", 579 | "name": "python3" 580 | }, 581 | "language_info": { 582 | "codemirror_mode": { 583 | "name": "ipython", 584 | "version": 3 585 | }, 586 | "file_extension": ".py", 587 | "mimetype": "text/x-python", 588 | "name": "python", 589 | "nbconvert_exporter": "python", 590 | "pygments_lexer": "ipython3", 591 | "version": "3.5.5" 592 | } 593 | }, 594 | "nbformat": 4, 595 | "nbformat_minor": 1 596 | } 597 | -------------------------------------------------------------------------------- /notebooks/05-Timeseries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Timeseries" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Pandas started out in the financial world, so it naturally has strong support for timeseries data.\n", 15 | "We'll look at some pandas data types and methods for manipulating timeseries data.\n", 16 | "Afterwords, we'll use [statsmodels' state space framework](http://www.statsmodels.org/stable/statespace.html) to model timeseries data." 
17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "import pandas as pd\n", 27 | "import seaborn as sns\n", 28 | "import matplotlib.pyplot as plt" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "%matplotlib inline" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "plt.style.use('default')\n", 47 | "plt.rcParams['figure.figsize'] = (12, 6)\n", 48 | "pd.options.display.max_rows = 10" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## Datatypes\n", 56 | "\n", 57 | "- `pd.Timestamp` (nanosecond resolution `datetime.datetime`)\n", 58 | "- `pd.Timedelta` (nanosecond resolution `datetime.timedelta`)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "Pandas provides highly-performant (mostly) drop-in replacements for `datetime.datetime` (`pd.Timestamp`) and `datetime.tiemedelta` (`pd.Timedelta`).\n", 66 | "These have been tailored for efficient storage in NumPy arrays.\n", 67 | "For the most part you'll be working with `DatetimeIndex`es or `TimedeltaIndex`es, or Series / DataFrames containing these.\n", 68 | "\n", 69 | "The biggest limitation is that pandas stores `Timestamp`s at nanosecond resolution. Since they're backed by NumPy's 64-bit integer, the minimum and maximum values are" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "pd.Timestamp.min, pd.Timestamp.max" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "If this is a problem, [there are workarounds](http://pandas.pydata.org/pandas-docs/stable/timeseries.html#representing-out-of-bounds-spans)." 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "We'll go back to the BTS data set on flights.\n", 93 | "This time I've provided the number of flights per hour for two airports in Chicago: Midway (MDW) and O'Hare (ORD). The data go back to January 1st, 2000." 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "df = pd.read_csv(\"data/flights-ts.csv.gz\", index_col=0, parse_dates=True)\n", 103 | "df.head()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## Resampling\n", 111 | "\n", 112 | "Resampling is similar to a groupby, but specialized for datetimes.\n", 113 | "Instead of specifying a column of values to group by, you specify a `rule`: the desired output frequency.\n", 114 | "The original data is binned into each group created by your rule." 
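, "\n", "A minimal sketch of the idea (assuming, as here, that `df` has a `DatetimeIndex`):\n", "\n", "```python\n", "# bin the hourly flight counts into calendar days and total each bin\n", "df.resample('D').sum()\n", "```\n", "\n", "The cells below do the same thing with an `'MS'` (month start) rule."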
115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "resampler = df.resample(\"MS\") # MS=Month Start\n", 124 | "resampler" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "There's an extensive list of frequency codes: http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases.\n", 132 | "\n", 133 | "If you examine the raw data in `df`, you'll notice that it's not at a fixed frequency.\n", 134 | "Hours where there weren't any flights just simply aren't present.\n", 135 | "This isn't a problem though; resample is perfect for going from \"ragged\" timeseries data to fixed-frequency data.\n", 136 | "\n", 137 | "Just like with `.groupby`, `.resample` returns a deferred object that hasn't really done any work yet.\n", 138 | "It has methods for aggregation, transformation, and general function application." 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "resampler.sum()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "resampler.sum().plot();" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "
\n", 164 | "

Exercise: Resample

\n", 165 | "
\n", 166 | "

Plot the standard deviation for the number of flights from `MDW` and `ORD` at a weekly frequency

" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "# Your solution\n" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "%load solutions/timeseries_resample.py" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "
\n", 192 | "

Exercise: Resample-Agg

\n", 193 | "
\n", 194 | "

Compute the the total number of flights (sum), mean, and median flights *per Quarter*.

" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "%load solutions/timeseries_resample_agg.py" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "## Rolling, Expanding\n", 218 | "\n", 219 | "Applying functions to windows, moving through your data." 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "These are very similar to groupby and resample. Let's get the daily number of flights with a `resample` quick." 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "daily = df.resample('D').sum()\n", 236 | "daily" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": {}, 242 | "source": [ 243 | "Suppose you wanted a 30-day moving (or rolling) average.\n", 244 | "This is possible with the `.rolling` method. Like `groupby` and `resample`, this object is just going to store the information to know what subset of data to operate on next; it doesn't actually do any work yet:" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "daily.rolling(30, center=True)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "The first argument is the window size.\n", 261 | "Since `daily` is at daily frequency, 30 means a 30-day window.\n", 262 | "`center=True` says to label each window with the middle-most point.\n", 263 | "To actually do work, you call a method like `.mean`;" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "fig, ax = plt.subplots()\n", 273 | "daily.rolling(30).mean().rename(columns=lambda x: x + \" (30D MA)\").plot(ax=ax, alpha=.25,\n", 274 | " color=['C0', 'C1'])\n", 275 | "daily.plot(ax=ax, alpha=.25, color=['C0', 'C1'], legend=False);" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "It's common to combine resampling and rolling." 
283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "df.resample(\"D\").sum().rolling(30).corr(pairwise=True).xs(\"MDW\", level=1)['ORD'].plot(\n", 292 | " title=\"O'Hare : Midway cross-correlation (30D MA)\", figsize=(12, 4)\n", 293 | ");" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "## Timezones\n", 301 | "\n", 302 | "pandas can store an array of datetimes with a common timezone.\n", 303 | "Right now the index for `df` is timezone naïve, but we can convert to a timezone with `tz_convert`:" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "df.index.tzinfo # None, timezone naïve" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "df.index.tz_localize(\"US/Central\")" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "Timezones, as usual, are annoying to deal with.\n", 329 | "We've hit a daylight savings time issue.\n", 330 | "As the error says, 2000-04-02T02:00:00 isn't actaully a valid time in US/Central.\n", 331 | "I checked the BTS website, and these timestamps are supposed to be local time, so presumably some data was recorded incorrectly.\n", 332 | "pandas is strict by default, so it we need to tell it to ignore those errors: " 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "idx = df.index.tz_localize(\"US/Central\", ambiguous=\"NaT\", errors='coerce')\n", 342 | "idx" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "pd.isnull(idx).sum() # 25 bad values" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "Notice the dtype: `datetime64[ns, US/Central]`.\n", 359 | "That means nanosecond resolution in the US/Central time zone.\n", 360 | "Once you have a datetime with timezone, you can convert timezones with `tz_convert`:" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": null, 366 | "metadata": {}, 367 | "outputs": [], 368 | "source": [ 369 | "idx.tz_convert(\"UTC\")" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "## Offsets\n", 377 | "\n", 378 | "I wish the standard library `datetime` module had something like this.\n", 379 | "Let's generate some fake data with `pd.date_range`" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "dates = pd.date_range(\"2016-01-01\", end=\"2016-12-31\", freq='D')\n", 389 | "dates" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "There are a whole bunch of offsets available in the `pd.tseries.offsets` namespace. 
For example, to move 3 business days into the future:" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "dates + pd.tseries.offsets.BDay(3)" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "Or to move to the next month end:" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "dates + pd.tseries.offsets.MonthEnd()" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "## Timedelta Math\n", 429 | "\n", 430 | "Being able to add columns of dates and timedeltas turns out to be quite convenient.\n", 431 | "Let's go all the way back to our first example with flight delays from New York airports." 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "flights = pd.read_csv(\"data/ny-flights.csv.gz\", parse_dates=['dep', 'arr'])\n", 441 | "flights.head()" 442 | ] 443 | }, 444 | { 445 | "cell_type": "markdown", 446 | "metadata": {}, 447 | "source": [ 448 | "
\n", 449 | "

Exercise: Convert Timedelta

\n", 450 | "
\n", 451 | "

Convert `flights.dep_delay` and `flights.arr_delay` to timedelta dtype.

\n", 452 | "\n", 453 | "- Hint: recall our type conversion methods: `pd.to_*`\n", 454 | "- Make new columns in `flights` called `dep_delay_td` and `arr_delay_td`\n", 455 | "- Check the `unit` argument for the conversion method. The delay columns are in *minutes*." 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "# Your solution\n" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "%load solutions/timeseries_timedelta.py" 474 | ] 475 | }, 476 | { 477 | "cell_type": "markdown", 478 | "metadata": {}, 479 | "source": [ 480 | "
\n", 481 | "

Exercise: Timedelta Math

\n", 482 | "
\n", 483 | "

Compute the actual time the flight left, but adding the departure time `dep` and the delay `dep_delay`." 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "%load solutions/timeseries_departure.py" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "# Modeling Timeseries\n", 507 | "\n", 508 | "Timeseries are an interesting problem to model.\n", 509 | "If we're lucky, we have a long history of past data that we can (maybe) use to predict the future.\n", 510 | "We can exploit regularity in the timeseries (seasonal patterns, periods of high values are typically followed by another high value, etc.) to better predict the future.\n", 511 | "\n", 512 | "Statsmodels has a nice framework for fitting timeseries models and evaluating their output." 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": null, 518 | "metadata": {}, 519 | "outputs": [], 520 | "source": [ 521 | "import statsmodels.formula.api as smf\n", 522 | "import statsmodels.tsa.api as smt\n", 523 | "import statsmodels.api as sm" 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": {}, 529 | "source": [ 530 | "Let's model Monthly flights from `ORD`." 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "y = daily.ORD.resample(\"MS\").sum()\n", 540 | "y.plot();" 541 | ] 542 | }, 543 | { 544 | "cell_type": "markdown", 545 | "metadata": {}, 546 | "source": [ 547 | "That final value is odd because it's not a complete month. Let's drop it." 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": null, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [ 556 | "y = daily.ORD.resample(\"MS\").sum().iloc[:-1]\n", 557 | "y.head()" 558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "metadata": {}, 563 | "source": [ 564 | "It's common to estimate the parameters on *differenced* values.\n", 565 | "That is, make a new series $y'$ where $y_t' = y_t - y_{t-1}$. Pandas makes this simple with the `.diff` method." 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": {}, 572 | "outputs": [], 573 | "source": [ 574 | "y_prime = y.diff()\n", 575 | "y_prime.head()" 576 | ] 577 | }, 578 | { 579 | "cell_type": "markdown", 580 | "metadata": {}, 581 | "source": [ 582 | "We'll drop that first NaN:" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "y_prime = y.diff().dropna()\n", 592 | "y_prime.plot();" 593 | ] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "metadata": {}, 598 | "source": [ 599 | "Think back to regular linear regression: Predict some variable $y$ with some matrix $X$:\n", 600 | "\n", 601 | "$y = \\beta_0 + \\beta_1 X_1 + \\beta_2 X_2 ... 
+ \\beta_p X_p + \\varepsilon$\n", 602 | "\n", 603 | "When modelling timeseries, past values of $y$ make for good components of $X$.\n", 604 | "We can do this with the pandas `.shift` method:" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": {}, 611 | "outputs": [], 612 | "source": [ 613 | "y_prime.shift()" 614 | ] 615 | }, 616 | { 617 | "cell_type": "markdown", 618 | "metadata": {}, 619 | "source": [ 620 | "So the value for `2001-01-01` (-867) is now labeled `2000-02-01`. We can collect many of these with a list comprehension and a `concat`." 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "lagged = pd.concat([y_prime.shift(i) for i in range(9)], axis=1,\n", 630 | " keys=['y', 'L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8'])\n", 631 | "lagged" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [ 640 | "mod_lagged = smf.ols('y ~ L1 + L2 + L3 + L4 + L5 + L6 + L7 + L8', lagged)\n", 641 | "res_lagged = mod_lagged.fit()\n", 642 | "\n", 643 | "res_lagged.summary()" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": null, 649 | "metadata": {}, 650 | "outputs": [], 651 | "source": [ 652 | "ax = res_lagged.fittedvalues.plot(label=\"predicted\", figsize=(12, 4), legend=True)\n", 653 | "y_prime.plot(label=\"actual\", legend=True);" 654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "metadata": {}, 659 | "source": [ 660 | "In practice, you won't be doing the `shift`ing and `diff`ing yourself.\n", 661 | "It's more convenient to let statsmodels do that for us.\n", 662 | "Then we don't have to worry about un-differencing the fitted / predicted results to interpret them correctly.\n", 663 | "Also, the solvers we'll see next are a bit more sophisticated than a linear regression." 664 | ] 665 | }, 666 | { 667 | "cell_type": "markdown", 668 | "metadata": {}, 669 | "source": [ 670 | "## AutoRegressive Model\n", 671 | "\n", 672 | "Predict $y_{t+1}$, given $y_0, y_1, \\ldots y_t$" 673 | ] 674 | }, 675 | { 676 | "cell_type": "markdown", 677 | "metadata": {}, 678 | "source": [ 679 | "Let's fit an autoregressive (AR) model. Autoregressive part just means using past values of $y$ to predict the future (like we did above).\n", 680 | "We'll use statsmodel's `SARIMAX` model. The AR part of SARIMAX is for autoregressive.\n", 681 | "It also handles seasonality (**S**), differencing (**I** for integrated), moving average (**MA**), and exogenous regressors (**X**).\n", 682 | "\n", 683 | "We'll stick to a simple AR(8) model (use the last 8 periods) with a single period of differencing." 
684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "metadata": {}, 690 | "outputs": [], 691 | "source": [ 692 | "mod = smt.SARIMAX(y, order=(8, 1, 0)) # AR(8), first difference, no MA\n", 693 | "res = mod.fit()" 694 | ] 695 | }, 696 | { 697 | "cell_type": "markdown", 698 | "metadata": {}, 699 | "source": [ 700 | "As usual with statsmodels, we get a nice summary with the fitted coefficeints and some test statistics (which we'll ignore)" 701 | ] 702 | }, 703 | { 704 | "cell_type": "code", 705 | "execution_count": null, 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [ 709 | "res.summary()" 710 | ] 711 | }, 712 | { 713 | "cell_type": "markdown", 714 | "metadata": {}, 715 | "source": [ 716 | "The results instance has all the usual attributes and methods, like `fittedvalues`." 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": null, 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [ 725 | "ax = res.fittedvalues.iloc[1:].plot(label=\"Fitted\", legend=True, figsize=(12, 4))\n", 726 | "y.plot(ax=ax, label=\"Actual\", legend=True);" 727 | ] 728 | }, 729 | { 730 | "cell_type": "markdown", 731 | "metadata": {}, 732 | "source": [ 733 | "## Forecasting\n", 734 | "\n", 735 | "The real value of timeseries analysis is to predict the future.\n", 736 | "We can use the `.get_prediction` method to get the predicted values, along with a confidence interval." 737 | ] 738 | }, 739 | { 740 | "cell_type": "markdown", 741 | "metadata": {}, 742 | "source": [ 743 | "First, we'll look at one-period-ahead forecasts.\n", 744 | "Basically, this simulates looking at our data the last day of the month, and making the forecast for the next month.\n", 745 | "Keep in mind though that we fit our parameters on the entire dataset. The isn't an out-of-sample prediction." 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": null, 751 | "metadata": {}, 752 | "outputs": [], 753 | "source": [ 754 | "pred = res.get_prediction(start='2001-03-01')\n", 755 | "pred_ci = pred.conf_int()" 756 | ] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": null, 761 | "metadata": {}, 762 | "outputs": [], 763 | "source": [ 764 | "ax = y.plot(label='observed')\n", 765 | "pred.predicted_mean.plot(ax=ax, label='Forecast', alpha=.7)\n", 766 | "ax.fill_between(pred_ci.index,\n", 767 | " pred_ci.iloc[:, 0],\n", 768 | " pred_ci.iloc[:, 1], color='k', alpha=.2)\n", 769 | "plt.legend()\n", 770 | "sns.despine()" 771 | ] 772 | }, 773 | { 774 | "cell_type": "markdown", 775 | "metadata": {}, 776 | "source": [ 777 | "Alternatively, we can make dynamic forecasts as of some month (January 2013 in the example below). That means the forecast from that point forward only use information available as of January 2013 (though again, we fit the model on the entire dataset). The predictions are generated in a similar way: a bunch of one-step forecasts. Only instead of plugging in the actual values beyond January 2013, we plug in the forecast values." 
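Note that both the one-step-ahead and the dynamic predictions below are made over the same sample the model was fit on. For a genuinely out-of-sample forecast past the last observed month, statsmodels results objects also provide `get_forecast`. A minimal sketch (not part of the original notebook), reusing the fitted `res` and the monthly series `y` from above:

```python
# Forecast 12 months beyond the end of the observed data.
fc = res.get_forecast(steps=12)
fc_mean = fc.predicted_mean   # point forecasts, indexed by future month starts
fc_ci = fc.conf_int()         # default 95% interval: lower / upper columns

ax = y.plot(label="observed", figsize=(12, 4), legend=True)
fc_mean.plot(ax=ax, label="forecast", legend=True)
ax.fill_between(fc_ci.index, fc_ci.iloc[:, 0], fc_ci.iloc[:, 1], color="k", alpha=.2);
```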
778 | ] 779 | }, 780 | { 781 | "cell_type": "code", 782 | "execution_count": null, 783 | "metadata": {}, 784 | "outputs": [], 785 | "source": [ 786 | "pred_dy = res.get_prediction(start='2002-03-01', dynamic='2013-01-01')\n", 787 | "pred_dy_ci = pred_dy.conf_int()" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": null, 793 | "metadata": {}, 794 | "outputs": [], 795 | "source": [ 796 | "ax = y.plot(label='observed')\n", 797 | "pred_dy.predicted_mean.plot(ax=ax, label='Forecast')\n", 798 | "ax.fill_between(pred_dy_ci.index,\n", 799 | " pred_dy_ci.iloc[:, 0],\n", 800 | " pred_dy_ci.iloc[:, 1], color='k', alpha=.25)\n", 801 | "ylim = ax.get_ylim()\n", 802 | "ax.fill_betweenx(ylim, pd.Timestamp('2013-01-01'), y.index[-1],\n", 803 | " alpha=.1, zorder=-1)\n", 804 | "ax.set_ylim(ylim)\n", 805 | "ax.annotate('Dynamic $\\\\longrightarrow$',\n", 806 | " (pd.Timestamp('2013-02-01'), 16000))\n", 807 | "\n", 808 | "plt.legend()\n", 809 | "sns.despine()\n", 810 | "plt.tight_layout()" 811 | ] 812 | }, 813 | { 814 | "cell_type": "markdown", 815 | "metadata": {}, 816 | "source": [ 817 | "There are *a lot* of issues we didn't cover here.\n", 818 | "Seasonality, non-stationarity, autocorrellation, unit roots, and more.\n", 819 | "Timeseries modeling is fraught with traps that will throw off your predictions.\n", 820 | "Still, this should give you a taste of what's possbile." 821 | ] 822 | }, 823 | { 824 | "cell_type": "markdown", 825 | "metadata": {}, 826 | "source": [ 827 | "## Further Resources\n", 828 | "\n", 829 | "- [statsmodels state space documentation](http://www.statsmodels.org/dev/statespace.html)\n", 830 | "- [statsmodels state space examples](http://www.statsmodels.org/dev/examples/index.html#statespace)\n", 831 | "- [pyflux](http://www.pyflux.com), another time series modeling library\n", 832 | "- Sean Abu's [post on ARIMA](http://www.seanabu.com/2016/03/22/time-series-seasonal-ARIMA-model-in-python/)\n", 833 | "- Jeffrey Yau's [talks at PyData](https://www.youtube.com/watch?v=tJ-O3hk1vRw)\n", 834 | "- My [blog post](http://tomaugspurger.github.io/modern-7-timeseries.html)" 835 | ] 836 | } 837 | ], 838 | "metadata": { 839 | "kernelspec": { 840 | "display_name": "Python 3", 841 | "language": "python", 842 | "name": "python3" 843 | }, 844 | "language_info": { 845 | "codemirror_mode": { 846 | "name": "ipython", 847 | "version": 3 848 | }, 849 | "file_extension": ".py", 850 | "mimetype": "text/x-python", 851 | "name": "python", 852 | "nbconvert_exporter": "python", 853 | "pygments_lexer": "ipython3", 854 | "version": "3.6.6" 855 | } 856 | }, 857 | "nbformat": 4, 858 | "nbformat_minor": 2 859 | } 860 | -------------------------------------------------------------------------------- /notebooks/06-Dask.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\"Dask\n", 11 | "\n", 12 | "# Personal Dask Cluster\n", 13 | "\n", 14 | "Go to `` for your own Dask cluster. Use your first and last name provided to SciPy for the username and \"dask\" for the password.\n", 15 | "(We don't actually do any authentication)." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Dask Quickstart\n", 23 | "\n", 24 | "Dask scales python.\n", 25 | "Today, we'll focus on how it scales pandas, but know that it's more general.\n", 26 | "\n", 27 | "Pandas is fundamentally for in-memory datasets.\n", 28 | "You can't have a DataFrame larger than your machine's RAM.\n", 29 | "\n", 30 | "Dask dataframe lets you work with larger than memory datasets.\n", 31 | "Dask breaks large problems into many small problems (task graph).\n", 32 | "It then executes those small problems in parallel and in a small memory footprint (scheduler).\n", 33 | "It provides user interfaces, like `dask.dataframe` or `dask.array`, which feel like NumPy and pandas." 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import numpy as np\n", 43 | "import pandas as pd\n", 44 | "import dask.dataframe as dd\n", 45 | "\n", 46 | "df = dd.from_pandas(\n", 47 | " pd.DataFrame({'A': np.random.choice(['a', 'b', 'c'], size=100),\n", 48 | " 'B': np.random.randn(100),\n", 49 | " 'C': np.random.uniform(size=100)}),\n", 50 | " npartitions=4\n", 51 | ")\n", 52 | "df" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "A dask dataframe has most of the same methods as pandas." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "df.B + df.C" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "df[['B', 'C']].sum()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "Dask DataFrame's methods are lazy.\n", 85 | "This lets Dask build up a large chain of operations that can be executed in parallel.\n", 86 | "When you say `df.sum()`, instead of computing the sum immediately, Dask builds up a *task graph*.\n", 87 | "\n", 88 | "```python\n", 89 | "df[['B', 'C']].sum().visualize(rankdir='LR')\n", 90 | "```\n", 91 | "\n", 92 | "" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "When you're ready for a concrete result, call `compute`." 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "df[['B', 'C']].sum().compute()" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "Calling `compute` hands the task graph to the *scheduler*, which executes the graph in parallel.\n", 116 | "Dask has several schedulers, depending on how you want to do the computation (using many threads, processes, or machines).\n", 117 | "We'll be using the distributed scheduler, so we can see how dask scales pandas to a cluster of machines.\n", 118 | "But Dask also works well on a single machine.\n", 119 | "You write normal pandas operations, but the computation happens in a low-memory footprint." 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "# Distributed DataFrames and Efficiency\n", 127 | "\n", 128 | "We will cover the following topics:\n", 129 | "\n", 130 | "1. Persist common intermediate results in memory with `persist`\n", 131 | "2. Partitions and partition size\n", 132 | "3. 
Using indices to improve efficiency" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "from dask_kubernetes import KubeCluster\n", 142 | "from dask.distributed import Client" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "The next cell will start up some workers for you. This make take a few minutes, but they widget will update automatically when the workers are ready. You don't need to do anything with the manual or adaptive scaling." 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "cluster = KubeCluster(n_workers=8)\n", 159 | "cluster" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "**Be sure to open the diagnostics UI.**" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "client = Client(cluster)\n", 176 | "client" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "## Moving to distributed\n", 184 | "\n", 185 | "A few things change when moving from local to distributed computing.\n", 186 | "\n", 187 | "1. Environment: Each worker is a separate machine, and needs to have the required libraries installed. This cluster was setup using [Kubernetes](http://dask.pydata.org/en/latest/setup/kubernetes.html#).\n", 188 | "2. File system: Previously, every worker (threads, processes, or even the distributed scheduler in local mode) had access to your laptops file system. In a distributed environment, you'll need some kind of shared file system to read data (cloud storage like S3 or GCS, or a network file system)\n", 189 | "3. Communication: Moving data between machines is relatively expensive. When possible, the distributed scheduler will ensure that tasks are scheduled to be run on workers that already have the required data. But some tasks will require data from multiple machines." 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "## The full airline dataset\n", 197 | "\n", 198 | "We have the full airline dataset stored on `GCS`. This is the same as the one you've been working with, but includes all originating airports and a few extra columns. We change the `read_csv` call slightly to avoid the extra columns.\n", 199 | "\n", 200 | "`dask.dataframe` has support for reading directly from `GCS`, so we can use our `read_csv` call from before." 
201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "import dask.dataframe as dd\n", 210 | "\n", 211 | "columns = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',\n", 212 | " 'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',\n", 213 | " 'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',\n", 214 | " 'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',\n", 215 | " 'Cancelled']\n", 216 | "\n", 217 | "df = dd.read_csv('gcs://anaconda-public-data/airline/(199)|(200)*.csv',\n", 218 | " parse_dates={'Date': [0, 1, 2]},\n", 219 | " dtype={'TailNum': object,\n", 220 | " 'CRSElapsedTime': float,\n", 221 | " 'Distance': float,\n", 222 | " 'Cancelled': bool},\n", 223 | " usecols=columns)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "df.head()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Persist data in distributed memory\n", 240 | "\n", 241 | "Every time we run an operation like `df[~df.Cancelled].DepDelay.max().compute()` we read through our dataset from disk. This can be slow, especially because we're reading data from CSV. We usually have two options to make this faster:\n", 242 | "\n", 243 | "1. Persist relevant data in memory, either on our computer or on a cluster\n", 244 | "2. Use a faster on-disk format, like HDF5 or Parquet\n", 245 | "\n", 246 | "In this section we persist our data in memory. On a single machine this is often done by doing a bit of pre-processing and data reduction with dask dataframe and then `compute`-ing to a Pandas dataframe and using Pandas in the future. \n", 247 | "\n", 248 | "```python\n", 249 | "df = dd.read_csv(...)\n", 250 | "df = df[df.Origin == 'LGA'] # filter down to smaller dataset\n", 251 | "pdf = df.compute() # convert to pandas\n", 252 | "pdf ... # continue with familiar Pandas workflows\n", 253 | "```\n", 254 | "\n", 255 | "However on a distributed cluster when even our cleaned data is too large we still can't use Pandas. In this case we ask Dask to persist data in memory with the `dask.persist` function. This is what we'll do today. This will help us to understand when data is lazy and when it is computing.\n", 256 | "\n", 257 | "You can trigger computations using the persist method:\n", 258 | "\n", 259 | " x = x.persist()\n", 260 | "\n", 261 | "or the dask.persist function for multiple inputs:\n", 262 | "\n", 263 | " x, y = dask.persist(x, y)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "### Exercise\n", 271 | "\n", 272 | "Persist the dataframe into memory.\n", 273 | "\n", 274 | "- How long does the cell take to execute (look at the \"busy\" indicator in the top-right)?\n", 275 | "- After it has persisted how long does it take to compute `df[~df.Cancelled].DepDelay.count().compute()`?\n", 276 | "- Looking at the plots in the diagnostic web page (the link was printed above), what is taking up most of the time? 
(You can over over rectangles to see what function they represent)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "df = # TODO: persist dataframe in memory" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "%time _ = df.Cancelled[~df.Cancelled].count().compute()" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": {}, 300 | "source": [ 301 | "### Exercise\n", 302 | "\n", 303 | "Repeat the groupby computation from the previous notebooks. What is taking all of the time now?" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "# What was the average departure delay from each airport?\n", 313 | "df[~df.Cancelled].groupby('Origin').DepDelay.mean().nlargest(10).compute()" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "## Partitions\n", 321 | "\n", 322 | "One `dask.dataframe` is composed of several Pandas dataframes. The organization of these dataframes can significantly impact performance. In this section we discuss two common factors that commonly impact performance:\n", 323 | "\n", 324 | "1. The number of Pandas dataframes can affect overhead. If the dataframes are too small then Dask might spend more time deciding what to do than Pandas spends actually doing it. Ideally computations should take 100's of milliseconds.\n", 325 | "\n", 326 | "2. If we know how the dataframes are sorted then certain operations become much faster" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "### Number of partitions and partition size\n", 334 | "\n", 335 | "When we read in our data from CSV files we get potentially multiple Pandas dataframe for each file. Look at the metadata below to determine a few things about the current partitioning:\n", 336 | "- How many partitions are there?\n", 337 | "- Are the splits along the index between partitions known? If so, what are they?" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "# Number of partitions\n", 347 | "df.npartitions" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "# Are the splits between partitions known?\n", 357 | "df.known_divisions" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "# The splits between partitions. If unknown these are all `None`\n", 367 | "df.divisions" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "### Exercise: How large is the DataFrame?\n", 375 | "\n", 376 | "- How would you compute the memory usage of a single pandas DataFrame?\n", 377 | "- Given your knowledge of Dask, how would you do it for a Dask DataFrame?" 
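One possible approach, as a hedged sketch rather than the contents of `memory-usage.py`; it assumes your dask version mirrors pandas' `DataFrame.memory_usage`:

```python
# pandas: per-column bytes; deep=True also counts the contents of object (string) columns.
# pdf.memory_usage(deep=True).sum()

# dask: same API, but lazy -- the total is only computed when we ask for it.
nbytes = df.memory_usage(deep=True).sum().compute()
print(f"{nbytes / 1e9:.2f} GB")
```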
378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "# Your code here...\n" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "%load memory-usage.py" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "## Sorted Index column\n", 403 | "\n", 404 | "*This section doesn't have any exercises. Just follow along.*\n", 405 | "\n", 406 | "Many dataframe operations like loc-indexing, groupby-apply, and joins are *much* faster on a sorted index. For example, if we want to get data for a particular day of data it *really* helps to know where that day is, otherwise we need to search over all of our data.\n", 407 | "\n", 408 | "The Pandas model gives us a sorted index column. Dask.dataframe copies this model, and it remembers the min and max values of every partition's index.\n", 409 | "\n", 410 | "By default, our data doesn't have an index." 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "df.head()" 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "So if we search for a particular day it takes a while because it has to pass through all of the data." 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "%time df[df.Date == '1992-05-05'].compute()" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "df[df.Date == '1992-05-05'].visualize(optimize_graph=True)" 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": {}, 450 | "source": [ 451 | "However if we set the `Date` column as the index then this operation can be much much faster.\n", 452 | "\n", 453 | "Calling `set_index` followed by `persist` results in a new set of dataframe partitions stored in memory, sorted along the index column. To do this dask has to\n", 454 | "\n", 455 | "- Shuffle the data by date, resulting in the same number of output partitions\n", 456 | "- Set the index for each partition\n", 457 | "- Store the resulting partitions in distributed memory\n", 458 | "\n", 459 | "This can be a (relatively) expensive operation, but allows certain queries to be more optimized. \n", 460 | "\n", 461 | "Watch the diagnostics page while the next line is running to see how the shuffle and index operation progresses." 
462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "%%time\n", 471 | "df = df.set_index('Date').persist()" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "After the index is set, we now have known divisions:" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "# Number of partitions\n", 488 | "df.npartitions" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "# Are the splits between partitions known?\n", 498 | "df.known_divisions" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": null, 504 | "metadata": {}, 505 | "outputs": [], 506 | "source": [ 507 | "# The splits between partitions.\n", 508 | "df.divisions" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "# The repr for a dask dataframe can also be useful for\n", 518 | "# seeing partition information\n", 519 | "df" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "Repeating the same query for all flights on a specific date, we can see that we're much faster after setting the index:" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "%time df.loc['1992-05-05'].compute()" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": {}, 541 | "source": [ 542 | "If you look at the resulting graph, you can see that dask was able to optimize the computation to only look at a single partition:" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "df.loc['1992-05-05'].visualize(optimize_graph=True)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "### Timeseries operations\n", 559 | "\n", 560 | "When the index of a dask dataframe is a known `DatetimeIndes`, traditional pandas timeseries operations are supported. For example, now that we have a sorted index we can resample the `DepDelay` column into 1 month bins." 
561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "%matplotlib inline" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": {}, 576 | "outputs": [], 577 | "source": [ 578 | "%%time \n", 579 | "(df.DepDelay\n", 580 | " .resample('1M')\n", 581 | " .mean()\n", 582 | " .fillna(method='ffill')\n", 583 | " .compute()\n", 584 | " .plot(figsize=(10, 5)));" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": {}, 598 | "outputs": [], 599 | "source": [ 600 | "# When you're done with the `airlines` dataset\n", 601 | "client.restart()" 602 | ] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "metadata": {}, 607 | "source": [ 608 | "## Exercise: Explore the NYC Taxi dataset\n", 609 | "\n", 610 | "We have some of the NYC Taxi ride dataset in parquet format stored in GCS." 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [ 619 | "taxi = dd.read_parquet(\"gcs://anaconda-public-data/nyc-taxi/nyc.parquet\")\n", 620 | "taxi.head()" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "metadata": {}, 626 | "source": [ 627 | "Some questions?\n", 628 | "\n", 629 | "- How large is the dataset? Will it fit in your cluster's RAM if you persist it?\n", 630 | "- What's the average tip percent by hour?" 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": null, 636 | "metadata": {}, 637 | "outputs": [], 638 | "source": [] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": {}, 644 | "outputs": [], 645 | "source": [ 646 | "# clean up, when finished with the notebook\n", 647 | "client.close()\n", 648 | "cluster.close()" 649 | ] 650 | } 651 | ], 652 | "metadata": { 653 | "kernelspec": { 654 | "display_name": "Python 3", 655 | "language": "python", 656 | "name": "python3" 657 | }, 658 | "language_info": { 659 | "codemirror_mode": { 660 | "name": "ipython", 661 | "version": 3 662 | }, 663 | "file_extension": ".py", 664 | "mimetype": "text/x-python", 665 | "name": "python", 666 | "nbconvert_exporter": "python", 667 | "pygments_lexer": "ipython3", 668 | "version": "3.6.5" 669 | } 670 | }, 671 | "nbformat": 4, 672 | "nbformat_minor": 2 673 | } 674 | -------------------------------------------------------------------------------- /notebooks/09-Visualization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Visualization\n", 8 | "\n", 9 | "We have a *ton* of options for viz in python.\n", 10 | "I'm going to focus on matplotlib and seaborn, because they work well for the types of analyses I usually do.\n", 11 | "At the end, I'll mention Altair, which is new but has a really good design." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "import seaborn.apionly as sns\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "plt.style.use('default')" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "%matplotlib inline" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "To start with, we'll fetch some data from yahoo using the `pandas_datareader` package." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "stocks = pd.read_csv(\"data/stocks.csv\", index_col=\"Date\", parse_dates=True)\n", 50 | "stocks.head()" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## Matplotlib\n", 58 | "\n", 59 | "- foundation for seaborn and pandas plotting\n", 60 | "- full control over every detail" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "We won't say too much about matplotlib directly.\n", 68 | "It's a large library with several different levels of API.\n", 69 | "Additionally, both seaborn and pandas use matplotlib internally so you can consider these two higher-level, domain specific APIs built on top of matplotlib proper.\n", 70 | "This works well, as you can use the higher-level library most of the time, but you still have the full power and control of matplotlib when you need it.\n", 71 | "\n", 72 | "People familiar with matplotlib will have used the `axes.plot` method; It takes an `x`, `y` and a bunch of keyword arguments." 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "fig, ax = plt.subplots(figsize=(12, 6))\n", 82 | "ax.plot(stocks.Open);" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "Notice that matplotlib is now pandas-aware.\n", 90 | "`ax.plot` knows that when it's passed a `Series`, like `stocks.Open`, then `stocks.index` makes for a good `x` axis.\n", 91 | "\n", 92 | "As of matplotlib 1.5, all the plot methods `.plot, .bar, .scatter`, etc. take an optional *data* argument. When passed, you can use strings as the `x` and `y`. matplotlib will use these strings as keys for `data.__getitem__`. This means `data` can be DataFrames, dictionaries, even H5py files." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "fig, ax = plt.subplots(figsize=(12, 6))\n", 102 | "ax.plot('Open', data=stocks)\n", 103 | "plt.legend();" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "Matplotlib's strength (and weakness) is it's customizability. With enough work, you can make essentially any figure you want." 
111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "import statsmodels.tsa.api as smt" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "ϵ, t = smt.filters.hpfilter(stocks.Close, lamb=129600*30)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "start = pd.Timestamp('2007-12-01')\n", 138 | "end = pd.Timestamp('2009-06-01')\n", 139 | "\n", 140 | "fig, ax = plt.subplots(figsize=(12, 6))\n", 141 | "ax.plot(t, linewidth=2, label=\"Trend\")\n", 142 | "\n", 143 | "ax.fill_between(t.index, t - ϵ, t + ϵ, alpha=.15, color='b')\n", 144 | "\n", 145 | "ylim = ax.get_ylim()\n", 146 | "ax.fill_between([start, end], *ylim, color='k', alpha=.2)\n", 147 | "ax.set_ylim(*ylim)\n", 148 | "\n", 149 | "ax.annotate(\"Housing Bubble\", (pd.Timestamp(\"2006-01\"), 255),\n", 150 | " fontsize=12, color='red')\n", 151 | "ax.annotate(\"Recession\", (pd.Timestamp(\"2008-01\"), 255),\n", 152 | " fontsize=12, color='red')\n", 153 | "\n", 154 | "ax.legend()\n", 155 | "sns.despine()" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(8, 4), sharex=True)\n", 165 | "\n", 166 | "ax1.plot(\"Open\", data=stocks, color='red')\n", 167 | "ax2.plot(\"Volume\", data=stocks)\n", 168 | "ax2.fill_between(stocks.index, 0, \"Volume\", data=stocks, alpha=.25)\n", 169 | "ax2.set_ylim(0)\n", 170 | "ax1.legend()\n", 171 | "\n", 172 | "plt.tight_layout()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "## Pandas Plotting\n", 180 | "\n", 181 | "> Usually convenient\n", 182 | "\n", 183 | "- Previously, nicer aesthetics (not since matplotlib 2.0)\n", 184 | "- Nicer labeling (but matplotlib is better now)\n", 185 | "- Easier (though less flexible) subplotting" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "stocks[['Open', 'High', \"Low\", \"Close\"]].plot(subplots=True, figsize=(8, 8));" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "stocks.Volume.plot.area();" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "I'd recommend pandas builtin plotting for when you need a quick visualization.\n", 211 | "For simple customizations, you can typically followup a pandas plot with an `ax.set`:" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "ax = stocks.Close.plot.density()\n", 221 | "ax.set(xlabel=\"Close\", title=\"Closing Price (density)\", ylim=.0001);" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "pd.cut(stocks.Close, 10).value_counts().sort_index().plot.barh(figsize=(4, 8));" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "For more elaborate customizations, it can make sense to just start with matplotlib." 
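One pattern that combines the two, shown here as a sketch that is not in the notebook: let matplotlib lay out the figure, then pass each `ax` to pandas, so you keep pandas' convenient labeling while retaining matplotlib's control over the layout (it assumes the `stocks` DataFrame loaded above):

```python
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(10, 6), sharex=True)
stocks.Close.plot(ax=ax1, title="Close")
stocks.Volume.plot.area(ax=ax2, alpha=.4, title="Volume")
fig.tight_layout()
```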
238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "## [Seaborn](http://seaborn.pydata.org/)\n", 245 | "\n", 246 | "> Seaborn provides a high-level interface for drawing attractive statistical graphics.\n", 247 | "\n", 248 | "- Statistical aggregations (`countplot`, bootstrapped standard errors, `regplot`)\n", 249 | "- Easier distribution plotting\n", 250 | "- Easier faceting by variable" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "It is *not* a replacement of matplotlib. Rather, it provides a nice API for\n", 258 | "many common statistical methods. Some of the distinguishing features are\n", 259 | "\n", 260 | "We'll make an update of [this notebook](https://gist.github.com/mwaskom/8224591) from Michael Waskom, the author of seaborn, which explores the titanic dataset:" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "## Titanic Dataset\n", 268 | "\n", 269 | "- Survived\n", 270 | "- Class\n", 271 | "- Sex\n", 272 | "- Age\n", 273 | "- Embarked\n", 274 | "- Man / Woman / Child\n", 275 | "- Deck" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "t = sns.load_dataset('titanic')\n", 285 | "t['class'] = t['class'].cat.as_ordered()\n", 286 | "t.head()" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "t.info()" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "pal = dict(man=\"#4682B4\", woman=\"#CD5C5C\", child=\"#2E8B57\", male=\"#6495ED\",\n", 305 | " female=\"#F08080\")\n", 306 | "\n", 307 | "with sns.color_palette('viridis', n_colors=3) as v:\n", 308 | " pal.update(**dict(zip(['First', 'Second', 'Third'], v.as_hex())))\n", 309 | "\n", 310 | "sns.set(context='talk', style='white')" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "## Exploratory Analysis\n", 318 | "\n", 319 | "1. Who were the passengers?\n", 320 | "2. Who survived?" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "## Who were the passengers?\n", 328 | "\n", 329 | "Explore them across different dimensions; We'll start with *categorical* data like sex or class.\n", 330 | "\n", 331 | "What's the count of passengers by sex?" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "sns.countplot(x=\"sex\", data=t, palette=pal);" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "By class?" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "sns.countplot(x=\"class\", data=t, palette=pal);" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "By \"who\" (man, woman, or child)?" 
364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "sns.countplot(\"who\", data=t, palette=pal)\n", 373 | "sns.despine()" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "Seaborn is built up of a heirarchy of convenience functions and methods.\n", 381 | "For instance, `countplot` is essentially a specialized version of `factorplot` where `kind` is set to count." 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "sns.factorplot(x=\"class\", data=t, kind=\"count\", hue=\"sex\",\n", 391 | " palette=pal, size=7);" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": {}, 397 | "source": [ 398 | "Most seaborn functions have the same API. You pass the requeired arguments (`x`, `y`, etc. depending on the plot), a `data` argument.\n", 399 | "Include additional arguments like `hue`, `col`, etc. as needed.\n", 400 | "\n", 401 | "Why have both `countplot` and `factorplot(..., kind='count')`? The specialized versions like `countplot` are handy when working with `Grid`s, which we'll see later on." 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": {}, 407 | "source": [ 408 | "

\n", 409 | "

Exercise: Embarked by class

\n", 410 | "
\n", 411 | "\n", 412 | "

Make a `factorplot` with the counts of `embarked`, with the `hue` split\n", 413 | "by `class`.

" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [ 422 | "# Your solution\n" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "%load solutions/visualize_00.py\n" 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "
\n", 439 | "

Exercise: Age by class

\n", 440 | "
\n", 441 | "

\n", 442 | "Make a pointplot of `age` by `class`. Look at the `kind` parameter to `sns.factorplot`.

" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "# Your solution here\n" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "%load solutions/visualize_01.py" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "## Distributions\n", 468 | "\n", 469 | "Let's moving to plotting *quantitative* data.\n", 470 | "We'll do this while introducting a new abstraction from seaborn, the `Grid` (`Grid`s work with either quantitative or qualitative data)." 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "## Grids\n", 478 | "\n", 479 | "You initalize a `Grid` with all the agruments needed to layout the grid that\n", 480 | "the data will be plotted on:\n", 481 | "\n", 482 | "- `data`: DataFrame\n", 483 | "- `row` : variable to facet rows by\n", 484 | "- `col` : variable to facet columns by\n", 485 | "- `hue` : variable to split colors by" 486 | ] 487 | }, 488 | { 489 | "cell_type": "markdown", 490 | "metadata": {}, 491 | "source": [ 492 | "There are several kinds of `Grid`s in seaborn; we'll start with the `FacetGrid`." 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [ 501 | "g = sns.FacetGrid(t, hue=\"sex\", aspect=2.5, palette=pal)" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "We haven't actually done any plotting really.\n", 509 | "Just the necessary work to layout the axes.\n", 510 | "\n", 511 | "To actually plot something, `map` plotting functions over the `FacetGrid`. The arguments to `g.map` are passed through to the underlying plot method like `sns.kdeplot`." 
512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "g = sns.FacetGrid(t, hue=\"sex\", aspect=2.5, palette=pal)\n", 521 | "g.map(sns.kdeplot, \"age\", shade=True)\n", 522 | "g.set(xlim=(0, 80), ylim=0)\n", 523 | "g.add_legend();" 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": {}, 529 | "source": [ 530 | "To show how the faceting works, pass `row` or `column` when setting up the `FacetGrid`:" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "g = sns.FacetGrid(t, row=\"sex\", hue=\"sex\", aspect=2.5, palette=pal)\n", 540 | "g.map(sns.kdeplot, \"age\", shade=True)\n", 541 | "g.set(xlim=(0, 80), ylim=0)\n", 542 | "g.add_legend();" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "g = sns.FacetGrid(t, hue=\"who\", aspect=2.5, palette=pal)\n", 552 | "g.map(sns.kdeplot, \"age\", shade=True)\n", 553 | "g.set(xlim=(0, 80), ylim=0)\n", 554 | "g.add_legend();" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "g = sns.FacetGrid(t, hue=\"class\", aspect=3, palette=\"YlGn_r\")\n", 564 | "g.map(sns.kdeplot, \"age\", shade=True)\n", 565 | "g.set(xlim=(0, 80), ylim=0)\n", 566 | "g.add_legend();" 567 | ] 568 | }, 569 | { 570 | "cell_type": "markdown", 571 | "metadata": {}, 572 | "source": [ 573 | "You can get quite complicated results, without much additional work." 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "g = (sns.FacetGrid(t, col=\"sex\", row=\"class\", size=2.5, aspect=2.5,\n", 583 | " palette=pal, hue=\"sex\")\n", 584 | " .map(sns.kdeplot, \"age\", shade=True)\n", 585 | " .map(sns.rugplot, \"age\")\n", 586 | " .set(xlim=(0, 80), ylim=0));" 587 | ] 588 | }, 589 | { 590 | "cell_type": "markdown", 591 | "metadata": {}, 592 | "source": [ 593 | "This is a great asset when exploring a new dataset." 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": null, 599 | "metadata": {}, 600 | "outputs": [], 601 | "source": [ 602 | "sns.factorplot('deck', data=t, palette='PuBu_d',\n", 603 | " kind=\"count\");" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": {}, 609 | "source": [ 610 | "Your data isn't always in perfect shape to be plotted, so you'll be mixing in data manipulations with actual plotting:" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [ 619 | "sns.violinplot(\"class\", \"fare\", data=t, orient=\"v\",\n", 620 | " palette=\"YlGn\")\n", 621 | "sns.despine(left=True)" 622 | ] 623 | }, 624 | { 625 | "cell_type": "markdown", 626 | "metadata": {}, 627 | "source": [ 628 | "
\n", 629 | "

Exercise: Trimming

\n", 630 | "
\n", 631 | "

\n", 632 | "Create a new column in `t` called `fare_` that topcodes `fare` to be no more than `3 * t.fare.median()`. That is, anything higher than 3x the median should just be set to 3x the median.

\n", 633 | "\n", 634 | "Hint: you can use `np.where` to simulate an `if x then y else z` on arrays of data." 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": null, 640 | "metadata": {}, 641 | "outputs": [], 642 | "source": [ 643 | "# Your solution here\n" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": null, 649 | "metadata": {}, 650 | "outputs": [], 651 | "source": [ 652 | "%load solutions/visualize_02a.py" 653 | ] 654 | }, 655 | { 656 | "cell_type": "markdown", 657 | "metadata": {}, 658 | "source": [ 659 | "Now make the violinplot on fares that we tried above:" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "metadata": {}, 666 | "outputs": [], 667 | "source": [ 668 | "# Your solution here\n" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": null, 674 | "metadata": {}, 675 | "outputs": [], 676 | "source": [ 677 | "%load solutions/visualize_02b.py\n" 678 | ] 679 | }, 680 | { 681 | "cell_type": "markdown", 682 | "metadata": {}, 683 | "source": [ 684 | "Seaborn makes it easy to split by an additional variable, like `sex`." 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": null, 690 | "metadata": {}, 691 | "outputs": [], 692 | "source": [ 693 | "sns.violinplot(\"class\", \"fare_\", data=t, orient=\"v\",\n", 694 | " palette=\"YlGn\", hue='sex', split=True)\n", 695 | "sns.despine(left=True)\n", 696 | "plt.ylim(0);\n" 697 | ] 698 | }, 699 | { 700 | "cell_type": "markdown", 701 | "metadata": {}, 702 | "source": [ 703 | "## Plotting Relationships\n", 704 | "\n", 705 | "We've seen summary statistics (like countplot), univariate distributions, and basic relationships between one variable and a categorical variable.\n", 706 | "\n", 707 | "Seaborn also provides tools for visualizng bivariate relationships between quantitative variables." 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "metadata": {}, 714 | "outputs": [], 715 | "source": [ 716 | "ax = sns.jointplot(\"age\", \"fare_\", data=t, color=\"g\", size=8);" 717 | ] 718 | }, 719 | { 720 | "cell_type": "markdown", 721 | "metadata": {}, 722 | "source": [ 723 | "## Who Survived?\n", 724 | "\n", 725 | "Let's turn to the variable of interest: who survived?" 726 | ] 727 | }, 728 | { 729 | "cell_type": "markdown", 730 | "metadata": {}, 731 | "source": [ 732 | "
\n", 733 | "

Exercise: Who Survived?

\n", 734 | "
\n", 735 | "\n", 736 | "

Explore the `alive` variable

" 737 | ] 738 | }, 739 | { 740 | "cell_type": "markdown", 741 | "metadata": {}, 742 | "source": [ 743 | "What does the count of alive look like?" 744 | ] 745 | }, 746 | { 747 | "cell_type": "code", 748 | "execution_count": null, 749 | "metadata": {}, 750 | "outputs": [], 751 | "source": [] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": null, 756 | "metadata": {}, 757 | "outputs": [], 758 | "source": [ 759 | "%load solutions/visualize_03.py" 760 | ] 761 | }, 762 | { 763 | "cell_type": "markdown", 764 | "metadata": {}, 765 | "source": [ 766 | "What's the relationship between class and survived?" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": null, 772 | "metadata": {}, 773 | "outputs": [], 774 | "source": [] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": null, 779 | "metadata": {}, 780 | "outputs": [], 781 | "source": [ 782 | "%load solutions/visualize_04.py\n" 783 | ] 784 | }, 785 | { 786 | "cell_type": "markdown", 787 | "metadata": {}, 788 | "source": [ 789 | "What's the relationship between who and survived" 790 | ] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "execution_count": null, 795 | "metadata": {}, 796 | "outputs": [], 797 | "source": [] 798 | }, 799 | { 800 | "cell_type": "code", 801 | "execution_count": null, 802 | "metadata": {}, 803 | "outputs": [], 804 | "source": [ 805 | "%load solutions/visualize_05.py\n" 806 | ] 807 | }, 808 | { 809 | "cell_type": "markdown", 810 | "metadata": {}, 811 | "source": [ 812 | "What's the interaction of `sex` with `class`, when predicting `survived`? Split the `hue` by `sex`.\n", 813 | "\n", 814 | "Hint: `class` is a categorical (AKA factor), so use `factorplot`." 815 | ] 816 | }, 817 | { 818 | "cell_type": "code", 819 | "execution_count": null, 820 | "metadata": {}, 821 | "outputs": [], 822 | "source": [] 823 | }, 824 | { 825 | "cell_type": "code", 826 | "execution_count": null, 827 | "metadata": {}, 828 | "outputs": [], 829 | "source": [ 830 | "%load solutions/visualize_06.py\n" 831 | ] 832 | }, 833 | { 834 | "cell_type": "markdown", 835 | "metadata": {}, 836 | "source": [ 837 | "How about the interaction of `'who'` with class?" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": {}, 844 | "outputs": [], 845 | "source": [] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": null, 850 | "metadata": {}, 851 | "outputs": [], 852 | "source": [ 853 | "%load solutions/visualize_07.py\n" 854 | ] 855 | }, 856 | { 857 | "cell_type": "markdown", 858 | "metadata": {}, 859 | "source": [ 860 | "What's the relationship between `survived` and `adult_male`?" 861 | ] 862 | }, 863 | { 864 | "cell_type": "code", 865 | "execution_count": null, 866 | "metadata": {}, 867 | "outputs": [], 868 | "source": [] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": null, 873 | "metadata": {}, 874 | "outputs": [], 875 | "source": [ 876 | "%load solutions/visualize_08.py\n" 877 | ] 878 | }, 879 | { 880 | "cell_type": "markdown", 881 | "metadata": {}, 882 | "source": [ 883 | "## Regression plots\n", 884 | "\n", 885 | "You can plot relationships with best fit lines (and bootstrapped standard errors) using `lmplot`." 
886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": null, 891 | "metadata": {}, 892 | "outputs": [], 893 | "source": [ 894 | "sns.lmplot(\"age\", \"survived\", t, logistic=True, y_jitter=.05);" 895 | ] 896 | }, 897 | { 898 | "cell_type": "markdown", 899 | "metadata": {}, 900 | "source": [ 901 | "Since we have a binary target (`survived`), we use `logistic`. It can be more informative to bin the x variable." 902 | ] 903 | }, 904 | { 905 | "cell_type": "code", 906 | "execution_count": null, 907 | "metadata": {}, 908 | "outputs": [], 909 | "source": [ 910 | "bins = [15, 30, 45, 60]\n", 911 | "sns.lmplot(\"age\", \"survived\", t, logistic=True,\n", 912 | " x_bins=bins);" 913 | ] 914 | }, 915 | { 916 | "cell_type": "markdown", 917 | "metadata": {}, 918 | "source": [ 919 | "
\n", 920 | "

Exercise: Survived by gender

\n", 921 | "
\n", 922 | "\n", 923 | "\n", 924 | "

Can you split that relationship by `sex`?

" 925 | ] 926 | }, 927 | { 928 | "cell_type": "code", 929 | "execution_count": null, 930 | "metadata": {}, 931 | "outputs": [], 932 | "source": [] 933 | }, 934 | { 935 | "cell_type": "code", 936 | "execution_count": null, 937 | "metadata": {}, 938 | "outputs": [], 939 | "source": [ 940 | "%load solutions/visualize_09.py\n" 941 | ] 942 | }, 943 | { 944 | "cell_type": "markdown", 945 | "metadata": {}, 946 | "source": [ 947 | "How about class?" 948 | ] 949 | }, 950 | { 951 | "cell_type": "code", 952 | "execution_count": null, 953 | "metadata": {}, 954 | "outputs": [], 955 | "source": [] 956 | }, 957 | { 958 | "cell_type": "code", 959 | "execution_count": null, 960 | "metadata": {}, 961 | "outputs": [], 962 | "source": [ 963 | "%load solutions/visualize_10.py\n" 964 | ] 965 | }, 966 | { 967 | "cell_type": "markdown", 968 | "metadata": {}, 969 | "source": [ 970 | "## Seaborn Summary\n", 971 | "\n", 972 | "- Many small functions with a consistent API (`x`, `y`, `data`, etc.)\n", 973 | "- `Grid`s offer an abstraction for (relatively) easy faceting" 974 | ] 975 | } 976 | ], 977 | "metadata": { 978 | "kernelspec": { 979 | "display_name": "Python 3", 980 | "language": "python", 981 | "name": "python3" 982 | }, 983 | "language_info": { 984 | "codemirror_mode": { 985 | "name": "ipython", 986 | "version": 3 987 | }, 988 | "file_extension": ".py", 989 | "mimetype": "text/x-python", 990 | "name": "python", 991 | "nbconvert_exporter": "python", 992 | "pygments_lexer": "ipython3", 993 | "version": "3.6.1" 994 | } 995 | }, 996 | "nbformat": 4, 997 | "nbformat_minor": 1 998 | } 999 | -------------------------------------------------------------------------------- /notebooks/10-Iterators.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Iterators" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Topics\n", 15 | "\n", 16 | "- Stream larger-than-memory data through a pipeline\n", 17 | "- Composable thanks to the iterator protocol" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "My favorite \"feature\" of pandas is that it's written in Python.\n", 25 | "Python has great language-level features for handling streams of data\n", 26 | "that may not fit in memory.\n", 27 | "This can be a useful pre-processing step to reading the data into a DataFrame or\n", 28 | "NumPy array.\n", 29 | "You can get quite far using just the builtin data structures as David Beazley proves in [this PyData keynote](https://www.youtube.com/watch?v=lyDLAutA88s)." 
30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "import os\n", 39 | "import gzip\n", 40 | "from itertools import islice, takewhile\n", 41 | "\n", 42 | "import numpy as np\n", 43 | "import pandas as pd\n", 44 | "import seaborn as sns\n", 45 | "import dask.dataframe as dd\n", 46 | "from toolz import partition_all, partitionby\n", 47 | "import matplotlib.pyplot as plt" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "%matplotlib inline" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "pd.options.display.max_rows = 10\n", 66 | "sns.set(context='talk')\n", 67 | "plt.style.use(\"default\")" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## Beer Reviews Dataset\n", 75 | "\n", 76 | "- A review is a list of lines\n", 77 | "- Each review line is formated like `meta/field: value`\n", 78 | "- Reviews are separated by blank lines (i.e. the line is just `'\\n'`)\n" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "Stanford has a [dataset on beer reviews](https://snap.stanford.edu/data/web-BeerAdvocate.html). The raw file is too large for me to include, but I split off a couple subsets for us to work with.\n", 86 | "\n", 87 | "Pandas can't read this file natively, but we have Python!\n", 88 | "We'll use Python to parse the raw file and tranform it into a tabular format." 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "with gzip.open(\"data/beer-raw-small.txt.gz\", \"r\") as f:\n", 98 | " print(f.read(1500).decode('utf-8'))" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "The full compressed raw dataset is about 500MB, so reading it all into memory might not be pleasent (we're working with a small subset that would fit in memory, but pretend it didn't).\n", 106 | "Fortunately, Python's iterator protocol and generators make dealing with large streams of data pleasent." 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "## Developing a solution\n", 114 | "\n", 115 | "Let's build a solution together. I'll provide some guidance as we go along." 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# Get a handle to the data\n", 125 | "f = gzip.open(\"data/beer-raw-small.txt.gz\", \"rt\")" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "Usually you'd use a context manager like `with gzip.open(...) as f`, but for debugging, it's OK to do it this way." 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "## Parsing Tasks\n", 140 | "\n", 141 | "1. split the raw text stream into individual reviews\n", 142 | "2. transform each individual review into a data container\n", 143 | "3. combine a chunk of transformed individual reviews into a collection\n", 144 | "4. 
store the chunk to disk" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "Let's grab the first review using [`takewhile`](https://docs.python.org/3/library/itertools.html#itertools.takewhile) till the first `'\\n'`.\n", 152 | "`takewhile` scans a stream, returning each item (line) until it hits the sentinal value it's looking for." 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "from itertools import takewhile" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "f.seek(0); # make the cell idempotent\n", 171 | "first = list(takewhile(lambda x: x != '\\n', f))\n", 172 | "first" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "
\n", 180 | "

Exercise: Format Review

\n", 181 | "
\n", 182 | "

Write a function `format_review` that converts an item like `first` into a dict

" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "It will have one entry per line, where the are the stuff to the left of the colon and the values are the stuff to the right.\n", 190 | "For example, the first line would be\n", 191 | "\n", 192 | "`'beer/name: Sausa Weizen\\n',` => `'beer/name': 'Sausa Weizen'`\n", 193 | "\n", 194 | "Make sure to clean up the line endings too.\n", 195 | "\n", 196 | "- Hint: Check out the [python string methods](https://docs.python.org/3/library/stdtypes.html#string-methods)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "You can check your function against `expected` by evaluating the next cell.\n", 204 | "If you get a failure, adjust your `format_review` until it passes." 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "import unittest\n", 214 | "from typing import List, Dict\n", 215 | "\n", 216 | "f.seek(0); # make the cell idempotent\n", 217 | "review = list(takewhile(lambda x: x != '\\n', f))\n", 218 | "\n", 219 | "\n", 220 | "def format_review(review: List[str]) -> Dict[str, str]:\n", 221 | " \"\"\"Your code goes below\"\"\"\n", 222 | " \n", 223 | "\n", 224 | "class TestFormat(unittest.TestCase):\n", 225 | " maxDiff = None\n", 226 | "\n", 227 | " def test_format_review(self):\n", 228 | " result = format_review(review)\n", 229 | " expected = {\n", 230 | " 'beer/ABV': '5.00',\n", 231 | " 'beer/beerId': '47986',\n", 232 | " 'beer/brewerId': '10325',\n", 233 | " 'beer/name': 'Sausa Weizen',\n", 234 | " 'beer/style': 'Hefeweizen',\n", 235 | " 'review/appearance': '2.5',\n", 236 | " 'review/aroma': '2',\n", 237 | " 'review/overall': '1.5',\n", 238 | " 'review/palate': '1.5',\n", 239 | " 'review/profileName': 'stcules',\n", 240 | " 'review/taste': '1.5',\n", 241 | " 'review/text': 'A lot of foam. But a lot.\\tIn the smell some banana, and then lactic and tart. Not a good start.\\tQuite dark orange in color, with a lively carbonation (now visible, under the foam).\\tAgain tending to lactic sourness.\\tSame for the taste. With some yeast and banana.\\t\\t',\n", 242 | " 'review/time': '1234817823'\n", 243 | " }\n", 244 | " self.assertEqual(result, expected)\n", 245 | "\n", 246 | "suite = unittest.TestLoader().loadTestsFromModule(TestFormat())\n", 247 | "unittest.TextTestRunner().run(suite)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "%load solutions/groupby_format_review.py" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "Notice that optional argument to split, which controls the number of splits made; If a review text had contained a literal `': '`, we'd be in trouble since it'd get split again.\n", 264 | "\n", 265 | "Make sure you executed the above solution cell twice (first to load, second to execute) as we'll be using that `format_review` function down below" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "## To a DataFrame\n", 273 | "\n", 274 | "Assuming we've processed many reviews into a list, we'll then build up a DataFrame." 
275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "r = [format_review(first)] # imagine a list of many reviews\n", 284 | "\n", 285 | "col_names = {\n", 286 | " 'beer/ABV': 'abv',\n", 287 | " 'beer/beerId': 'beer_id',\n", 288 | " 'beer/brewerId': 'brewer_id',\n", 289 | " 'beer/name': 'beer_name',\n", 290 | " 'beer/style': 'beer_style',\n", 291 | " 'review/appearance': 'review_appearance',\n", 292 | " 'review/aroma': 'review_aroma',\n", 293 | " 'review/overall': 'review_overall',\n", 294 | " 'review/palate': 'review_palate',\n", 295 | " 'review/profileName': 'profile_name',\n", 296 | " 'review/taste': 'review_taste',\n", 297 | " 'review/text': 'text',\n", 298 | " 'review/time': 'time'\n", 299 | "}\n", 300 | "df = pd.DataFrame(r)\n", 301 | "numeric = ['abv', 'review_appearance', 'review_aroma',\n", 302 | " 'review_overall', 'review_palate', 'review_taste']\n", 303 | "df = (df.rename(columns=col_names)\n", 304 | " .replace('', np.nan))\n", 305 | "df[numeric] = df[numeric].astype(float)\n", 306 | "df['time'] = pd.to_datetime(df.time.astype(int), unit='s')\n", 307 | "df" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": {}, 313 | "source": [ 314 | "Again, writing that as a function:" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "def as_dataframe(reviews):\n", 324 | " col_names = {\n", 325 | " 'beer/ABV': 'abv',\n", 326 | " 'beer/beerId': 'beer_id',\n", 327 | " 'beer/brewerId': 'brewer_id',\n", 328 | " 'beer/name': 'beer_name',\n", 329 | " 'beer/style': 'beer_style',\n", 330 | " 'review/appearance': 'review_appearance',\n", 331 | " 'review/aroma': 'review_aroma',\n", 332 | " 'review/overall': 'review_overall',\n", 333 | " 'review/palate': 'review_palate',\n", 334 | " 'review/profileName': 'profile_name',\n", 335 | " 'review/taste': 'review_taste',\n", 336 | " 'review/text': 'text',\n", 337 | " 'review/time': 'time'\n", 338 | " }\n", 339 | " df = pd.DataFrame(list(reviews))\n", 340 | " numeric = ['abv', 'review_appearance', 'review_aroma',\n", 341 | " 'review_overall', 'review_palate', 'review_taste']\n", 342 | " df = (df.rename(columns=col_names)\n", 343 | " .replace('', np.nan))\n", 344 | " df[numeric] = df[numeric].astype(float)\n", 345 | " df['time'] = pd.to_datetime(df.time.astype(int), unit='s')\n", 346 | " return df" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "## Full pipeline\n", 354 | "\n", 355 | "1. `file -> review_lines : List[str]`\n", 356 | "2. `review_lines -> reviews : Dict[str, str]`\n", 357 | "3. `reviews -> DataFrames`\n", 358 | "4. 
`DataFrames -> CSV`" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": {}, 364 | "source": [ 365 | "The full pipeline would look something like:" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "from toolz import partition_all, partitionby\n", 375 | "\n", 376 | "\n", 377 | "BATCH_SIZE = 100 # Number of reviews to process per chunk\n", 378 | " # Intentionally small for demostration \n", 379 | "\n", 380 | "\n", 381 | "with gzip.open(\"data/beer-raw-small.txt.gz\", \"rt\") as f:\n", 382 | "\n", 383 | " # Filter out a null byte at the end\n", 384 | " lines = (x for x in f if not x.startswith('\\x00'))\n", 385 | " \n", 386 | " review_lines_and_newlines = partitionby(lambda x: x == '\\n', lines)\n", 387 | " # that goes [review, \\n, review, \\n, ...]\n", 388 | " # so filter out the newlines\n", 389 | " review_lines = filter(lambda x: x != ('\\n',), review_lines_and_newlines)\n", 390 | " \n", 391 | " # generator expression to go from List[str] -> Dict[str, str]\n", 392 | " reviews = (format_review(x) for x in review_lines)\n", 393 | " \n", 394 | " # `reviews` yields one dict per review.\n", 395 | " # Won't fit in memory, so do `BATCH_SIZE` per chunk\n", 396 | " chunks = partition_all(BATCH_SIZE, reviews)\n", 397 | " dfs = (as_dataframe(chunk) for chunk in chunks)\n", 398 | " os.makedirs(\"data/beer/\", exist_ok=True)\n", 399 | "\n", 400 | " # the first time we read from disk\n", 401 | " for i, df in enumerate(dfs):\n", 402 | " df.to_csv(\"data/beer/chunk_%s.csv.gz\" % i, index=False,\n", 403 | " compression=\"gzip\")\n", 404 | " print(i, end='\\r')" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "\n", 412 | "This runs comfortably in memory. At any given time, we only have `BATCH_SIZE` reviews in memory." 
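As a rough follow-up sketch (not part of the original notebook; the glob pattern simply mirrors the chunk filenames written above), the chunks can later be read back with pandas once they fit in memory, or handed to `dask.dataframe.read_csv` when they do not:

```python
import glob

import pandas as pd

# Stitch the per-chunk CSVs back into a single DataFrame
# (this assumes the combined result fits in memory).
files = sorted(glob.glob("data/beer/chunk_*.csv.gz"))
beer = pd.concat((pd.read_csv(f, parse_dates=["time"]) for f in files),
                 ignore_index=True)
```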
413 | ] 414 | } 415 | ], 416 | "metadata": { 417 | "kernelspec": { 418 | "display_name": "Python 3", 419 | "language": "python", 420 | "name": "python3" 421 | }, 422 | "language_info": { 423 | "codemirror_mode": { 424 | "name": "ipython", 425 | "version": 3 426 | }, 427 | "file_extension": ".py", 428 | "mimetype": "text/x-python", 429 | "name": "python", 430 | "nbconvert_exporter": "python", 431 | "pygments_lexer": "ipython3", 432 | "version": "3.5.5" 433 | } 434 | }, 435 | "nbformat": 4, 436 | "nbformat_minor": 1 437 | } 438 | -------------------------------------------------------------------------------- /notebooks/data/beer-raw-small.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deniederhut/Pandas-Tutorial-SciPyConf-2018/d30a9aac47b4a16b6170da77f982fe8592a4eb73/notebooks/data/beer-raw-small.txt.gz -------------------------------------------------------------------------------- /notebooks/data/cpi.csv: -------------------------------------------------------------------------------- 1 | DATE,CPIAUCSL 2 | 1947-01-01,21.48 3 | 1947-02-01,21.62 4 | 1947-03-01,22.0 5 | 1947-04-01,22.0 6 | 1947-05-01,21.95 7 | 1947-06-01,22.08 8 | 1947-07-01,22.23 9 | 1947-08-01,22.4 10 | 1947-09-01,22.84 11 | 1947-10-01,22.91 12 | 1947-11-01,23.06 13 | 1947-12-01,23.41 14 | 1948-01-01,23.68 15 | 1948-02-01,23.67 16 | 1948-03-01,23.5 17 | 1948-04-01,23.82 18 | 1948-05-01,24.01 19 | 1948-06-01,24.15 20 | 1948-07-01,24.4 21 | 1948-08-01,24.43 22 | 1948-09-01,24.36 23 | 1948-10-01,24.31 24 | 1948-11-01,24.16 25 | 1948-12-01,24.05 26 | 1949-01-01,24.01 27 | 1949-02-01,23.91 28 | 1949-03-01,23.91 29 | 1949-04-01,23.92 30 | 1949-05-01,23.91 31 | 1949-06-01,23.92 32 | 1949-07-01,23.7 33 | 1949-08-01,23.7 34 | 1949-09-01,23.75 35 | 1949-10-01,23.67 36 | 1949-11-01,23.7 37 | 1949-12-01,23.61 38 | 1950-01-01,23.51 39 | 1950-02-01,23.61 40 | 1950-03-01,23.64 41 | 1950-04-01,23.65 42 | 1950-05-01,23.77 43 | 1950-06-01,23.88 44 | 1950-07-01,24.07 45 | 1950-08-01,24.2 46 | 1950-09-01,24.34 47 | 1950-10-01,24.5 48 | 1950-11-01,24.6 49 | 1950-12-01,24.98 50 | 1951-01-01,25.38 51 | 1951-02-01,25.83 52 | 1951-03-01,25.88 53 | 1951-04-01,25.92 54 | 1951-05-01,25.99 55 | 1951-06-01,25.93 56 | 1951-07-01,25.91 57 | 1951-08-01,25.86 58 | 1951-09-01,26.03 59 | 1951-10-01,26.16 60 | 1951-11-01,26.32 61 | 1951-12-01,26.47 62 | 1952-01-01,26.45 63 | 1952-02-01,26.41 64 | 1952-03-01,26.39 65 | 1952-04-01,26.46 66 | 1952-05-01,26.47 67 | 1952-06-01,26.53 68 | 1952-07-01,26.68 69 | 1952-08-01,26.69 70 | 1952-09-01,26.63 71 | 1952-10-01,26.69 72 | 1952-11-01,26.69 73 | 1952-12-01,26.71 74 | 1953-01-01,26.64 75 | 1953-02-01,26.59 76 | 1953-03-01,26.63 77 | 1953-04-01,26.69 78 | 1953-05-01,26.7 79 | 1953-06-01,26.77 80 | 1953-07-01,26.79 81 | 1953-08-01,26.85 82 | 1953-09-01,26.89 83 | 1953-10-01,26.95 84 | 1953-11-01,26.85 85 | 1953-12-01,26.87 86 | 1954-01-01,26.94 87 | 1954-02-01,26.99 88 | 1954-03-01,26.93 89 | 1954-04-01,26.86 90 | 1954-05-01,26.93 91 | 1954-06-01,26.94 92 | 1954-07-01,26.86 93 | 1954-08-01,26.85 94 | 1954-09-01,26.81 95 | 1954-10-01,26.72 96 | 1954-11-01,26.78 97 | 1954-12-01,26.77 98 | 1955-01-01,26.77 99 | 1955-02-01,26.82 100 | 1955-03-01,26.79 101 | 1955-04-01,26.79 102 | 1955-05-01,26.77 103 | 1955-06-01,26.71 104 | 1955-07-01,26.76 105 | 1955-08-01,26.72 106 | 1955-09-01,26.85 107 | 1955-10-01,26.82 108 | 1955-11-01,26.88 109 | 1955-12-01,26.87 110 | 1956-01-01,26.83 111 | 1956-02-01,26.86 112 | 1956-03-01,26.89 113 | 1956-04-01,26.93 
114 | 1956-05-01,27.03 115 | 1956-06-01,27.15 116 | 1956-07-01,27.29 117 | 1956-08-01,27.31 118 | 1956-09-01,27.35 119 | 1956-10-01,27.51 120 | 1956-11-01,27.51 121 | 1956-12-01,27.63 122 | 1957-01-01,27.67 123 | 1957-02-01,27.8 124 | 1957-03-01,27.86 125 | 1957-04-01,27.93 126 | 1957-05-01,28.0 127 | 1957-06-01,28.11 128 | 1957-07-01,28.19 129 | 1957-08-01,28.28 130 | 1957-09-01,28.32 131 | 1957-10-01,28.32 132 | 1957-11-01,28.41 133 | 1957-12-01,28.47 134 | 1958-01-01,28.64 135 | 1958-02-01,28.7 136 | 1958-03-01,28.87 137 | 1958-04-01,28.94 138 | 1958-05-01,28.94 139 | 1958-06-01,28.91 140 | 1958-07-01,28.89 141 | 1958-08-01,28.94 142 | 1958-09-01,28.91 143 | 1958-10-01,28.91 144 | 1958-11-01,28.95 145 | 1958-12-01,28.97 146 | 1959-01-01,29.01 147 | 1959-02-01,29.0 148 | 1959-03-01,28.97 149 | 1959-04-01,28.98 150 | 1959-05-01,29.04 151 | 1959-06-01,29.11 152 | 1959-07-01,29.15 153 | 1959-08-01,29.18 154 | 1959-09-01,29.25 155 | 1959-10-01,29.35 156 | 1959-11-01,29.35 157 | 1959-12-01,29.41 158 | 1960-01-01,29.37 159 | 1960-02-01,29.41 160 | 1960-03-01,29.41 161 | 1960-04-01,29.54 162 | 1960-05-01,29.57 163 | 1960-06-01,29.61 164 | 1960-07-01,29.55 165 | 1960-08-01,29.61 166 | 1960-09-01,29.61 167 | 1960-10-01,29.75 168 | 1960-11-01,29.78 169 | 1960-12-01,29.81 170 | 1961-01-01,29.84 171 | 1961-02-01,29.84 172 | 1961-03-01,29.84 173 | 1961-04-01,29.81 174 | 1961-05-01,29.84 175 | 1961-06-01,29.84 176 | 1961-07-01,29.92 177 | 1961-08-01,29.94 178 | 1961-09-01,29.98 179 | 1961-10-01,29.98 180 | 1961-11-01,29.98 181 | 1961-12-01,30.01 182 | 1962-01-01,30.04 183 | 1962-02-01,30.11 184 | 1962-03-01,30.17 185 | 1962-04-01,30.21 186 | 1962-05-01,30.24 187 | 1962-06-01,30.21 188 | 1962-07-01,30.22 189 | 1962-08-01,30.28 190 | 1962-09-01,30.42 191 | 1962-10-01,30.38 192 | 1962-11-01,30.38 193 | 1962-12-01,30.38 194 | 1963-01-01,30.44 195 | 1963-02-01,30.48 196 | 1963-03-01,30.51 197 | 1963-04-01,30.48 198 | 1963-05-01,30.51 199 | 1963-06-01,30.61 200 | 1963-07-01,30.69 201 | 1963-08-01,30.75 202 | 1963-09-01,30.72 203 | 1963-10-01,30.75 204 | 1963-11-01,30.78 205 | 1963-12-01,30.88 206 | 1964-01-01,30.94 207 | 1964-02-01,30.91 208 | 1964-03-01,30.94 209 | 1964-04-01,30.95 210 | 1964-05-01,30.98 211 | 1964-06-01,31.01 212 | 1964-07-01,31.02 213 | 1964-08-01,31.05 214 | 1964-09-01,31.08 215 | 1964-10-01,31.12 216 | 1964-11-01,31.21 217 | 1964-12-01,31.25 218 | 1965-01-01,31.28 219 | 1965-02-01,31.28 220 | 1965-03-01,31.31 221 | 1965-04-01,31.38 222 | 1965-05-01,31.48 223 | 1965-06-01,31.61 224 | 1965-07-01,31.58 225 | 1965-08-01,31.55 226 | 1965-09-01,31.62 227 | 1965-10-01,31.65 228 | 1965-11-01,31.75 229 | 1965-12-01,31.85 230 | 1966-01-01,31.88 231 | 1966-02-01,32.08 232 | 1966-03-01,32.18 233 | 1966-04-01,32.28 234 | 1966-05-01,32.35 235 | 1966-06-01,32.38 236 | 1966-07-01,32.45 237 | 1966-08-01,32.65 238 | 1966-09-01,32.75 239 | 1966-10-01,32.85 240 | 1966-11-01,32.88 241 | 1966-12-01,32.92 242 | 1967-01-01,32.9 243 | 1967-02-01,33.0 244 | 1967-03-01,33.0 245 | 1967-04-01,33.1 246 | 1967-05-01,33.1 247 | 1967-06-01,33.3 248 | 1967-07-01,33.4 249 | 1967-08-01,33.5 250 | 1967-09-01,33.6 251 | 1967-10-01,33.7 252 | 1967-11-01,33.9 253 | 1967-12-01,34.0 254 | 1968-01-01,34.1 255 | 1968-02-01,34.2 256 | 1968-03-01,34.3 257 | 1968-04-01,34.4 258 | 1968-05-01,34.5 259 | 1968-06-01,34.7 260 | 1968-07-01,34.9 261 | 1968-08-01,35.0 262 | 1968-09-01,35.1 263 | 1968-10-01,35.3 264 | 1968-11-01,35.4 265 | 1968-12-01,35.6 266 | 1969-01-01,35.7 267 | 1969-02-01,35.8 268 | 1969-03-01,36.1 269 | 
1969-04-01,36.3 270 | 1969-05-01,36.4 271 | 1969-06-01,36.6 272 | 1969-07-01,36.8 273 | 1969-08-01,36.9 274 | 1969-09-01,37.1 275 | 1969-10-01,37.3 276 | 1969-11-01,37.5 277 | 1969-12-01,37.7 278 | 1970-01-01,37.9 279 | 1970-02-01,38.1 280 | 1970-03-01,38.3 281 | 1970-04-01,38.5 282 | 1970-05-01,38.6 283 | 1970-06-01,38.8 284 | 1970-07-01,38.9 285 | 1970-08-01,39.0 286 | 1970-09-01,39.2 287 | 1970-10-01,39.4 288 | 1970-11-01,39.6 289 | 1970-12-01,39.8 290 | 1971-01-01,39.9 291 | 1971-02-01,39.9 292 | 1971-03-01,40.0 293 | 1971-04-01,40.1 294 | 1971-05-01,40.3 295 | 1971-06-01,40.5 296 | 1971-07-01,40.6 297 | 1971-08-01,40.7 298 | 1971-09-01,40.8 299 | 1971-10-01,40.9 300 | 1971-11-01,41.0 301 | 1971-12-01,41.1 302 | 1972-01-01,41.2 303 | 1972-02-01,41.4 304 | 1972-03-01,41.4 305 | 1972-04-01,41.5 306 | 1972-05-01,41.6 307 | 1972-06-01,41.7 308 | 1972-07-01,41.8 309 | 1972-08-01,41.9 310 | 1972-09-01,42.1 311 | 1972-10-01,42.2 312 | 1972-11-01,42.4 313 | 1972-12-01,42.5 314 | 1973-01-01,42.7 315 | 1973-02-01,43.0 316 | 1973-03-01,43.4 317 | 1973-04-01,43.7 318 | 1973-05-01,43.9 319 | 1973-06-01,44.2 320 | 1973-07-01,44.2 321 | 1973-08-01,45.0 322 | 1973-09-01,45.2 323 | 1973-10-01,45.6 324 | 1973-11-01,45.9 325 | 1973-12-01,46.3 326 | 1974-01-01,46.8 327 | 1974-02-01,47.3 328 | 1974-03-01,47.8 329 | 1974-04-01,48.1 330 | 1974-05-01,48.6 331 | 1974-06-01,49.0 332 | 1974-07-01,49.3 333 | 1974-08-01,49.9 334 | 1974-09-01,50.6 335 | 1974-10-01,51.0 336 | 1974-11-01,51.5 337 | 1974-12-01,51.9 338 | 1975-01-01,52.3 339 | 1975-02-01,52.6 340 | 1975-03-01,52.8 341 | 1975-04-01,53.0 342 | 1975-05-01,53.1 343 | 1975-06-01,53.5 344 | 1975-07-01,54.0 345 | 1975-08-01,54.2 346 | 1975-09-01,54.6 347 | 1975-10-01,54.9 348 | 1975-11-01,55.3 349 | 1975-12-01,55.6 350 | 1976-01-01,55.8 351 | 1976-02-01,55.9 352 | 1976-03-01,56.0 353 | 1976-04-01,56.1 354 | 1976-05-01,56.4 355 | 1976-06-01,56.7 356 | 1976-07-01,57.0 357 | 1976-08-01,57.3 358 | 1976-09-01,57.6 359 | 1976-10-01,57.9 360 | 1976-11-01,58.1 361 | 1976-12-01,58.4 362 | 1977-01-01,58.7 363 | 1977-02-01,59.3 364 | 1977-03-01,59.6 365 | 1977-04-01,60.0 366 | 1977-05-01,60.2 367 | 1977-06-01,60.5 368 | 1977-07-01,60.8 369 | 1977-08-01,61.1 370 | 1977-09-01,61.3 371 | 1977-10-01,61.6 372 | 1977-11-01,62.0 373 | 1977-12-01,62.3 374 | 1978-01-01,62.7 375 | 1978-02-01,63.0 376 | 1978-03-01,63.4 377 | 1978-04-01,63.9 378 | 1978-05-01,64.5 379 | 1978-06-01,65.0 380 | 1978-07-01,65.5 381 | 1978-08-01,65.9 382 | 1978-09-01,66.5 383 | 1978-10-01,67.1 384 | 1978-11-01,67.5 385 | 1978-12-01,67.9 386 | 1979-01-01,68.5 387 | 1979-02-01,69.2 388 | 1979-03-01,69.9 389 | 1979-04-01,70.6 390 | 1979-05-01,71.4 391 | 1979-06-01,72.2 392 | 1979-07-01,73.0 393 | 1979-08-01,73.7 394 | 1979-09-01,74.4 395 | 1979-10-01,75.2 396 | 1979-11-01,76.0 397 | 1979-12-01,76.9 398 | 1980-01-01,78.0 399 | 1980-02-01,79.0 400 | 1980-03-01,80.1 401 | 1980-04-01,80.9 402 | 1980-05-01,81.7 403 | 1980-06-01,82.5 404 | 1980-07-01,82.6 405 | 1980-08-01,83.2 406 | 1980-09-01,83.9 407 | 1980-10-01,84.7 408 | 1980-11-01,85.6 409 | 1980-12-01,86.4 410 | 1981-01-01,87.2 411 | 1981-02-01,88.0 412 | 1981-03-01,88.6 413 | 1981-04-01,89.1 414 | 1981-05-01,89.7 415 | 1981-06-01,90.5 416 | 1981-07-01,91.5 417 | 1981-08-01,92.2 418 | 1981-09-01,93.1 419 | 1981-10-01,93.4 420 | 1981-11-01,93.8 421 | 1981-12-01,94.1 422 | 1982-01-01,94.4 423 | 1982-02-01,94.7 424 | 1982-03-01,94.7 425 | 1982-04-01,95.0 426 | 1982-05-01,95.9 427 | 1982-06-01,97.0 428 | 1982-07-01,97.5 429 | 1982-08-01,97.7 430 | 
1982-09-01,97.7 431 | 1982-10-01,98.1 432 | 1982-11-01,98.0 433 | 1982-12-01,97.7 434 | 1983-01-01,97.9 435 | 1983-02-01,98.0 436 | 1983-03-01,98.1 437 | 1983-04-01,98.8 438 | 1983-05-01,99.2 439 | 1983-06-01,99.4 440 | 1983-07-01,99.8 441 | 1983-08-01,100.1 442 | 1983-09-01,100.4 443 | 1983-10-01,100.8 444 | 1983-11-01,101.1 445 | 1983-12-01,101.4 446 | 1984-01-01,102.1 447 | 1984-02-01,102.6 448 | 1984-03-01,102.9 449 | 1984-04-01,103.3 450 | 1984-05-01,103.5 451 | 1984-06-01,103.7 452 | 1984-07-01,104.1 453 | 1984-08-01,104.4 454 | 1984-09-01,104.7 455 | 1984-10-01,105.1 456 | 1984-11-01,105.3 457 | 1984-12-01,105.5 458 | 1985-01-01,105.7 459 | 1985-02-01,106.3 460 | 1985-03-01,106.8 461 | 1985-04-01,107.0 462 | 1985-05-01,107.2 463 | 1985-06-01,107.5 464 | 1985-07-01,107.7 465 | 1985-08-01,107.9 466 | 1985-09-01,108.1 467 | 1985-10-01,108.5 468 | 1985-11-01,109.0 469 | 1985-12-01,109.5 470 | 1986-01-01,109.9 471 | 1986-02-01,109.7 472 | 1986-03-01,109.1 473 | 1986-04-01,108.7 474 | 1986-05-01,109.0 475 | 1986-06-01,109.4 476 | 1986-07-01,109.5 477 | 1986-08-01,109.6 478 | 1986-09-01,110.0 479 | 1986-10-01,110.2 480 | 1986-11-01,110.4 481 | 1986-12-01,110.8 482 | 1987-01-01,111.4 483 | 1987-02-01,111.8 484 | 1987-03-01,112.2 485 | 1987-04-01,112.7 486 | 1987-05-01,113.0 487 | 1987-06-01,113.5 488 | 1987-07-01,113.8 489 | 1987-08-01,114.3 490 | 1987-09-01,114.7 491 | 1987-10-01,115.0 492 | 1987-11-01,115.4 493 | 1987-12-01,115.6 494 | 1988-01-01,116.0 495 | 1988-02-01,116.2 496 | 1988-03-01,116.5 497 | 1988-04-01,117.2 498 | 1988-05-01,117.5 499 | 1988-06-01,118.0 500 | 1988-07-01,118.5 501 | 1988-08-01,119.0 502 | 1988-09-01,119.5 503 | 1988-10-01,119.9 504 | 1988-11-01,120.3 505 | 1988-12-01,120.7 506 | 1989-01-01,121.2 507 | 1989-02-01,121.6 508 | 1989-03-01,122.2 509 | 1989-04-01,123.1 510 | 1989-05-01,123.7 511 | 1989-06-01,124.1 512 | 1989-07-01,124.5 513 | 1989-08-01,124.5 514 | 1989-09-01,124.8 515 | 1989-10-01,125.4 516 | 1989-11-01,125.9 517 | 1989-12-01,126.3 518 | 1990-01-01,127.5 519 | 1990-02-01,128.0 520 | 1990-03-01,128.6 521 | 1990-04-01,128.9 522 | 1990-05-01,129.1 523 | 1990-06-01,129.9 524 | 1990-07-01,130.5 525 | 1990-08-01,131.6 526 | 1990-09-01,132.5 527 | 1990-10-01,133.4 528 | 1990-11-01,133.7 529 | 1990-12-01,134.2 530 | 1991-01-01,134.7 531 | 1991-02-01,134.8 532 | 1991-03-01,134.8 533 | 1991-04-01,135.1 534 | 1991-05-01,135.6 535 | 1991-06-01,136.0 536 | 1991-07-01,136.2 537 | 1991-08-01,136.6 538 | 1991-09-01,137.0 539 | 1991-10-01,137.2 540 | 1991-11-01,137.8 541 | 1991-12-01,138.2 542 | 1992-01-01,138.3 543 | 1992-02-01,138.6 544 | 1992-03-01,139.1 545 | 1992-04-01,139.4 546 | 1992-05-01,139.7 547 | 1992-06-01,140.1 548 | 1992-07-01,140.5 549 | 1992-08-01,140.8 550 | 1992-09-01,141.1 551 | 1992-10-01,141.7 552 | 1992-11-01,142.1 553 | 1992-12-01,142.3 554 | 1993-01-01,142.8 555 | 1993-02-01,143.1 556 | 1993-03-01,143.3 557 | 1993-04-01,143.8 558 | 1993-05-01,144.2 559 | 1993-06-01,144.3 560 | 1993-07-01,144.5 561 | 1993-08-01,144.8 562 | 1993-09-01,145.0 563 | 1993-10-01,145.6 564 | 1993-11-01,146.0 565 | 1993-12-01,146.3 566 | 1994-01-01,146.3 567 | 1994-02-01,146.7 568 | 1994-03-01,147.1 569 | 1994-04-01,147.2 570 | 1994-05-01,147.5 571 | 1994-06-01,147.9 572 | 1994-07-01,148.4 573 | 1994-08-01,149.0 574 | 1994-09-01,149.3 575 | 1994-10-01,149.4 576 | 1994-11-01,149.8 577 | 1994-12-01,150.1 578 | 1995-01-01,150.5 579 | 1995-02-01,150.9 580 | 1995-03-01,151.2 581 | 1995-04-01,151.8 582 | 1995-05-01,152.1 583 | 1995-06-01,152.4 584 | 1995-07-01,152.6 585 | 
1995-08-01,152.9 586 | 1995-09-01,153.1 587 | 1995-10-01,153.5 588 | 1995-11-01,153.7 589 | 1995-12-01,153.9 590 | 1996-01-01,154.7 591 | 1996-02-01,155.0 592 | 1996-03-01,155.5 593 | 1996-04-01,156.1 594 | 1996-05-01,156.4 595 | 1996-06-01,156.7 596 | 1996-07-01,157.0 597 | 1996-08-01,157.2 598 | 1996-09-01,157.7 599 | 1996-10-01,158.2 600 | 1996-11-01,158.7 601 | 1996-12-01,159.1 602 | 1997-01-01,159.4 603 | 1997-02-01,159.7 604 | 1997-03-01,159.8 605 | 1997-04-01,159.9 606 | 1997-05-01,159.9 607 | 1997-06-01,160.2 608 | 1997-07-01,160.4 609 | 1997-08-01,160.8 610 | 1997-09-01,161.2 611 | 1997-10-01,161.5 612 | 1997-11-01,161.7 613 | 1997-12-01,161.8 614 | 1998-01-01,162.0 615 | 1998-02-01,162.0 616 | 1998-03-01,162.0 617 | 1998-04-01,162.2 618 | 1998-05-01,162.6 619 | 1998-06-01,162.8 620 | 1998-07-01,163.2 621 | 1998-08-01,163.4 622 | 1998-09-01,163.5 623 | 1998-10-01,163.9 624 | 1998-11-01,164.1 625 | 1998-12-01,164.4 626 | 1999-01-01,164.7 627 | 1999-02-01,164.7 628 | 1999-03-01,164.8 629 | 1999-04-01,165.9 630 | 1999-05-01,166.0 631 | 1999-06-01,166.0 632 | 1999-07-01,166.7 633 | 1999-08-01,167.1 634 | 1999-09-01,167.8 635 | 1999-10-01,168.1 636 | 1999-11-01,168.4 637 | 1999-12-01,168.8 638 | 2000-01-01,169.3 639 | 2000-02-01,170.0 640 | 2000-03-01,171.0 641 | 2000-04-01,170.9 642 | 2000-05-01,171.2 643 | 2000-06-01,172.2 644 | 2000-07-01,172.7 645 | 2000-08-01,172.7 646 | 2000-09-01,173.6 647 | 2000-10-01,173.9 648 | 2000-11-01,174.2 649 | 2000-12-01,174.6 650 | 2001-01-01,175.6 651 | 2001-02-01,176.0 652 | 2001-03-01,176.1 653 | 2001-04-01,176.4 654 | 2001-05-01,177.3 655 | 2001-06-01,177.7 656 | 2001-07-01,177.4 657 | 2001-08-01,177.4 658 | 2001-09-01,178.1 659 | 2001-10-01,177.6 660 | 2001-11-01,177.5 661 | 2001-12-01,177.4 662 | 2002-01-01,177.7 663 | 2002-02-01,178.0 664 | 2002-03-01,178.5 665 | 2002-04-01,179.3 666 | 2002-05-01,179.5 667 | 2002-06-01,179.6 668 | 2002-07-01,180.0 669 | 2002-08-01,180.5 670 | 2002-09-01,180.8 671 | 2002-10-01,181.2 672 | 2002-11-01,181.5 673 | 2002-12-01,181.8 674 | 2003-01-01,182.6 675 | 2003-02-01,183.6 676 | 2003-03-01,183.9 677 | 2003-04-01,183.2 678 | 2003-05-01,182.9 679 | 2003-06-01,183.1 680 | 2003-07-01,183.7 681 | 2003-08-01,184.5 682 | 2003-09-01,185.1 683 | 2003-10-01,184.9 684 | 2003-11-01,185.0 685 | 2003-12-01,185.5 686 | 2004-01-01,186.3 687 | 2004-02-01,186.7 688 | 2004-03-01,187.1 689 | 2004-04-01,187.4 690 | 2004-05-01,188.2 691 | 2004-06-01,188.9 692 | 2004-07-01,189.1 693 | 2004-08-01,189.2 694 | 2004-09-01,189.8 695 | 2004-10-01,190.8 696 | 2004-11-01,191.7 697 | 2004-12-01,191.7 698 | 2005-01-01,191.6 699 | 2005-02-01,192.4 700 | 2005-03-01,193.1 701 | 2005-04-01,193.7 702 | 2005-05-01,193.6 703 | 2005-06-01,193.7 704 | 2005-07-01,194.9 705 | 2005-08-01,196.1 706 | 2005-09-01,198.8 707 | 2005-10-01,199.1 708 | 2005-11-01,198.1 709 | 2005-12-01,198.1 710 | 2006-01-01,199.3 711 | 2006-02-01,199.4 712 | 2006-03-01,199.7 713 | 2006-04-01,200.7 714 | 2006-05-01,201.3 715 | 2006-06-01,201.8 716 | 2006-07-01,202.9 717 | 2006-08-01,203.8 718 | 2006-09-01,202.8 719 | 2006-10-01,201.9 720 | 2006-11-01,202.0 721 | 2006-12-01,203.1 722 | 2007-01-01,203.437 723 | 2007-02-01,204.226 724 | 2007-03-01,205.28799999999998 725 | 2007-04-01,205.90400000000002 726 | 2007-05-01,206.755 727 | 2007-06-01,207.234 728 | 2007-07-01,207.60299999999998 729 | 2007-08-01,207.667 730 | 2007-09-01,208.547 731 | 2007-10-01,209.19 732 | 2007-11-01,210.834 733 | 2007-12-01,211.445 734 | 2008-01-01,212.174 735 | 2008-02-01,212.687 736 | 
2008-03-01,213.44799999999998 737 | 2008-04-01,213.942 738 | 2008-05-01,215.208 739 | 2008-06-01,217.463 740 | 2008-07-01,219.016 741 | 2008-08-01,218.69 742 | 2008-09-01,218.877 743 | 2008-10-01,216.995 744 | 2008-11-01,213.153 745 | 2008-12-01,211.398 746 | 2009-01-01,211.933 747 | 2009-02-01,212.705 748 | 2009-03-01,212.495 749 | 2009-04-01,212.709 750 | 2009-05-01,213.02200000000002 751 | 2009-06-01,214.79 752 | 2009-07-01,214.726 753 | 2009-08-01,215.445 754 | 2009-09-01,215.861 755 | 2009-10-01,216.50900000000001 756 | 2009-11-01,217.234 757 | 2009-12-01,217.347 758 | 2010-01-01,217.488 759 | 2010-02-01,217.28099999999998 760 | 2010-03-01,217.35299999999998 761 | 2010-04-01,217.403 762 | 2010-05-01,217.29 763 | 2010-06-01,217.199 764 | 2010-07-01,217.605 765 | 2010-08-01,217.923 766 | 2010-09-01,218.275 767 | 2010-10-01,219.035 768 | 2010-11-01,219.59 769 | 2010-12-01,220.472 770 | 2011-01-01,221.187 771 | 2011-02-01,221.898 772 | 2011-03-01,223.046 773 | 2011-04-01,224.093 774 | 2011-05-01,224.80599999999998 775 | 2011-06-01,224.80599999999998 776 | 2011-07-01,225.395 777 | 2011-08-01,226.106 778 | 2011-09-01,226.597 779 | 2011-10-01,226.75 780 | 2011-11-01,227.169 781 | 2011-12-01,227.22299999999998 782 | 2012-01-01,227.86 783 | 2012-02-01,228.377 784 | 2012-03-01,228.894 785 | 2012-04-01,229.28599999999997 786 | 2012-05-01,228.722 787 | 2012-06-01,228.50599999999997 788 | 2012-07-01,228.475 789 | 2012-08-01,229.84400000000002 790 | 2012-09-01,230.987 791 | 2012-10-01,231.655 792 | 2012-11-01,231.278 793 | 2012-12-01,231.27200000000002 794 | 2013-01-01,231.641 795 | 2013-02-01,233.005 796 | 2013-03-01,232.313 797 | 2013-04-01,231.856 798 | 2013-05-01,231.895 799 | 2013-06-01,232.357 800 | 2013-07-01,232.74900000000002 801 | 2013-08-01,233.24900000000002 802 | 2013-09-01,233.642 803 | 2013-10-01,233.799 804 | 2013-11-01,234.21 805 | 2013-12-01,234.847 806 | 2014-01-01,235.43599999999998 807 | 2014-02-01,235.62099999999998 808 | 2014-03-01,235.89700000000002 809 | 2014-04-01,236.495 810 | 2014-05-01,236.803 811 | 2014-06-01,237.016 812 | 2014-07-01,237.25900000000001 813 | 2014-08-01,237.16299999999998 814 | 2014-09-01,237.51 815 | 2014-10-01,237.65099999999998 816 | 2014-11-01,237.261 817 | 2014-12-01,236.46400000000003 818 | 2015-01-01,234.954 819 | 2015-02-01,235.415 820 | 2015-03-01,235.859 821 | 2015-04-01,236.197 822 | 2015-05-01,236.87599999999998 823 | -------------------------------------------------------------------------------- /notebooks/data/flights-ts.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deniederhut/Pandas-Tutorial-SciPyConf-2018/d30a9aac47b4a16b6170da77f982fe8592a4eb73/notebooks/data/flights-ts.csv.gz -------------------------------------------------------------------------------- /notebooks/data/gdp.csv: -------------------------------------------------------------------------------- 1 | DATE,GDP 2 | 1947-01-01,243.1 3 | 1947-04-01,246.3 4 | 1947-07-01,250.1 5 | 1947-10-01,260.3 6 | 1948-01-01,266.2 7 | 1948-04-01,272.9 8 | 1948-07-01,279.5 9 | 1948-10-01,280.7 10 | 1949-01-01,275.4 11 | 1949-04-01,271.7 12 | 1949-07-01,273.3 13 | 1949-10-01,271.0 14 | 1950-01-01,281.2 15 | 1950-04-01,290.7 16 | 1950-07-01,308.5 17 | 1950-10-01,320.3 18 | 1951-01-01,336.4 19 | 1951-04-01,344.5 20 | 1951-07-01,351.8 21 | 1951-10-01,356.6 22 | 1952-01-01,360.2 23 | 1952-04-01,361.4 24 | 1952-07-01,368.1 25 | 1952-10-01,381.2 26 | 1953-01-01,388.5 27 | 1953-04-01,392.3 28 | 1953-07-01,391.7 29 | 
1953-10-01,386.5 30 | 1954-01-01,385.9 31 | 1954-04-01,386.7 32 | 1954-07-01,391.6 33 | 1954-10-01,400.3 34 | 1955-01-01,413.8 35 | 1955-04-01,422.2 36 | 1955-07-01,430.9 37 | 1955-10-01,437.8 38 | 1956-01-01,440.5 39 | 1956-04-01,446.8 40 | 1956-07-01,452.0 41 | 1956-10-01,461.3 42 | 1957-01-01,470.6 43 | 1957-04-01,472.8 44 | 1957-07-01,480.3 45 | 1957-10-01,475.7 46 | 1958-01-01,468.4 47 | 1958-04-01,472.8 48 | 1958-07-01,486.7 49 | 1958-10-01,500.4 50 | 1959-01-01,511.1 51 | 1959-04-01,524.2 52 | 1959-07-01,525.2 53 | 1959-10-01,529.3 54 | 1960-01-01,543.3 55 | 1960-04-01,542.7 56 | 1960-07-01,546.0 57 | 1960-10-01,541.1 58 | 1961-01-01,545.9 59 | 1961-04-01,557.4 60 | 1961-07-01,568.2 61 | 1961-10-01,581.6 62 | 1962-01-01,595.2 63 | 1962-04-01,602.6 64 | 1962-07-01,609.6 65 | 1962-10-01,613.1 66 | 1963-01-01,622.7 67 | 1963-04-01,631.8 68 | 1963-07-01,645.0 69 | 1963-10-01,654.8 70 | 1964-01-01,671.1 71 | 1964-04-01,680.8 72 | 1964-07-01,692.8 73 | 1964-10-01,698.4 74 | 1965-01-01,719.2 75 | 1965-04-01,732.4 76 | 1965-07-01,750.2 77 | 1965-10-01,773.1 78 | 1966-01-01,797.3 79 | 1966-04-01,807.2 80 | 1966-07-01,820.8 81 | 1966-10-01,834.9 82 | 1967-01-01,846.0 83 | 1967-04-01,851.1 84 | 1967-07-01,866.6 85 | 1967-10-01,883.2 86 | 1968-01-01,911.1 87 | 1968-04-01,936.3 88 | 1968-07-01,952.3 89 | 1968-10-01,970.1 90 | 1969-01-01,995.4 91 | 1969-04-01,1011.4 92 | 1969-07-01,1032.0 93 | 1969-10-01,1040.7 94 | 1970-01-01,1053.5 95 | 1970-04-01,1070.1 96 | 1970-07-01,1088.5 97 | 1970-10-01,1091.5 98 | 1971-01-01,1137.8 99 | 1971-04-01,1159.4 100 | 1971-07-01,1180.3 101 | 1971-10-01,1193.6 102 | 1972-01-01,1233.8 103 | 1972-04-01,1270.1 104 | 1972-07-01,1293.8 105 | 1972-10-01,1332.0 106 | 1973-01-01,1380.7 107 | 1973-04-01,1417.6 108 | 1973-07-01,1436.8 109 | 1973-10-01,1479.1 110 | 1974-01-01,1494.7 111 | 1974-04-01,1534.2 112 | 1974-07-01,1563.4 113 | 1974-10-01,1603.0 114 | 1975-01-01,1619.6 115 | 1975-04-01,1656.4 116 | 1975-07-01,1713.8 117 | 1975-10-01,1765.9 118 | 1976-01-01,1824.5 119 | 1976-04-01,1856.9 120 | 1976-07-01,1890.5 121 | 1976-10-01,1938.4 122 | 1977-01-01,1992.5 123 | 1977-04-01,2060.2 124 | 1977-07-01,2122.4 125 | 1977-10-01,2168.7 126 | 1978-01-01,2208.7 127 | 1978-04-01,2336.6 128 | 1978-07-01,2398.9 129 | 1978-10-01,2482.2 130 | 1979-01-01,2531.6 131 | 1979-04-01,2595.9 132 | 1979-07-01,2670.4 133 | 1979-10-01,2730.7 134 | 1980-01-01,2796.5 135 | 1980-04-01,2799.9 136 | 1980-07-01,2860.0 137 | 1980-10-01,2993.5 138 | 1981-01-01,3131.8 139 | 1981-04-01,3167.3 140 | 1981-07-01,3261.2 141 | 1981-10-01,3283.5 142 | 1982-01-01,3273.8 143 | 1982-04-01,3331.3 144 | 1982-07-01,3367.1 145 | 1982-10-01,3407.8 146 | 1983-01-01,3480.3 147 | 1983-04-01,3583.8 148 | 1983-07-01,3692.3 149 | 1983-10-01,3796.1 150 | 1984-01-01,3912.8 151 | 1984-04-01,4015.0 152 | 1984-07-01,4087.4 153 | 1984-10-01,4147.6 154 | 1985-01-01,4237.0 155 | 1985-04-01,4302.3 156 | 1985-07-01,4394.6 157 | 1985-10-01,4453.1 158 | 1986-01-01,4516.3 159 | 1986-04-01,4555.2 160 | 1986-07-01,4619.6 161 | 1986-10-01,4669.4 162 | 1987-01-01,4736.2 163 | 1987-04-01,4821.5 164 | 1987-07-01,4900.5 165 | 1987-10-01,5022.7 166 | 1988-01-01,5090.6 167 | 1988-04-01,5207.7 168 | 1988-07-01,5299.5 169 | 1988-10-01,5412.7 170 | 1989-01-01,5527.4 171 | 1989-04-01,5628.4 172 | 1989-07-01,5711.6 173 | 1989-10-01,5763.4 174 | 1990-01-01,5890.8 175 | 1990-04-01,5974.7 176 | 1990-07-01,6029.5 177 | 1990-10-01,6023.3 178 | 1991-01-01,6054.9 179 | 1991-04-01,6143.6 180 | 1991-07-01,6218.4 181 | 1991-10-01,6279.3 182 | 
1992-01-01,6380.8 183 | 1992-04-01,6492.3 184 | 1992-07-01,6586.5 185 | 1992-10-01,6697.6 186 | 1993-01-01,6748.2 187 | 1993-04-01,6829.6 188 | 1993-07-01,6904.2 189 | 1993-10-01,7032.8 190 | 1994-01-01,7136.3 191 | 1994-04-01,7269.8 192 | 1994-07-01,7352.3 193 | 1994-10-01,7476.7 194 | 1995-01-01,7545.3 195 | 1995-04-01,7604.9 196 | 1995-07-01,7706.5 197 | 1995-10-01,7799.5 198 | 1996-01-01,7893.1 199 | 1996-04-01,8061.5 200 | 1996-07-01,8159.0 201 | 1996-10-01,8287.1 202 | 1997-01-01,8402.1 203 | 1997-04-01,8551.9 204 | 1997-07-01,8691.8 205 | 1997-10-01,8788.3 206 | 1998-01-01,8889.7 207 | 1998-04-01,8994.7 208 | 1998-07-01,9146.5 209 | 1998-10-01,9325.7 210 | 1999-01-01,9447.1 211 | 1999-04-01,9557.0 212 | 1999-07-01,9712.3 213 | 1999-10-01,9926.1 214 | 2000-01-01,10031.0 215 | 2000-04-01,10278.3 216 | 2000-07-01,10357.4 217 | 2000-10-01,10472.3 218 | 2001-01-01,10508.1 219 | 2001-04-01,10638.4 220 | 2001-07-01,10639.5 221 | 2001-10-01,10701.3 222 | 2002-01-01,10834.4 223 | 2002-04-01,10934.8 224 | 2002-07-01,11037.1 225 | 2002-10-01,11103.8 226 | 2003-01-01,11230.1 227 | 2003-04-01,11370.7 228 | 2003-07-01,11625.1 229 | 2003-10-01,11816.8 230 | 2004-01-01,11988.4 231 | 2004-04-01,12181.4 232 | 2004-07-01,12367.7 233 | 2004-10-01,12562.2 234 | 2005-01-01,12813.7 235 | 2005-04-01,12974.1 236 | 2005-07-01,13205.4 237 | 2005-10-01,13381.6 238 | 2006-01-01,13648.9 239 | 2006-04-01,13799.8 240 | 2006-07-01,13908.5 241 | 2006-10-01,14066.4 242 | 2007-01-01,14233.2 243 | 2007-04-01,14422.3 244 | 2007-07-01,14569.7 245 | 2007-10-01,14685.3 246 | 2008-01-01,14668.4 247 | 2008-04-01,14813.0 248 | 2008-07-01,14843.0 249 | 2008-10-01,14549.9 250 | 2009-01-01,14383.9 251 | 2009-04-01,14340.4 252 | 2009-07-01,14384.1 253 | 2009-10-01,14566.5 254 | 2010-01-01,14681.1 255 | 2010-04-01,14888.6 256 | 2010-07-01,15057.7 257 | 2010-10-01,15230.2 258 | 2011-01-01,15238.4 259 | 2011-04-01,15460.9 260 | 2011-07-01,15587.1 261 | 2011-10-01,15785.3 262 | 2012-01-01,15973.9 263 | 2012-04-01,16121.9 264 | 2012-07-01,16227.9 265 | 2012-10-01,16297.3 266 | 2013-01-01,16475.4 267 | 2013-04-01,16541.4 268 | 2013-07-01,16749.3 269 | 2013-10-01,16999.9 270 | 2014-01-01,17025.2 271 | -------------------------------------------------------------------------------- /notebooks/data/ny-flights.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deniederhut/Pandas-Tutorial-SciPyConf-2018/d30a9aac47b4a16b6170da77f982fe8592a4eb73/notebooks/data/ny-flights.csv.gz -------------------------------------------------------------------------------- /notebooks/data/rpi.csv: -------------------------------------------------------------------------------- 1 | 0,1,2,3,4,5,6,7,8,9,10,11 2 | 2014-15 NBA RPI Rankings,,,,,,,,,,, 3 | RK,TEAM,RPI,W,L,PCT,SOS,PWR,PF,PA,EWL,EWP 4 | 1,Golden State,.582,67,15,.817,.504,1,9016,8188,68-14,.831 5 | 2,Houston,.552,56,26,.683,.509,7,8522,8240,52-30,.635 6 | ,LA,.552,56,26,.683,.509,4,8751,8211,61-21,.741 7 | 4,Atlanta,.551,60,22,.732,.491,5,8409,7964,58-24,.710 8 | 5,Memphis,.549,55,27,.671,.509,8,8062,7796,52-30,.635 9 | 6,San Antonio,.548,55,27,.671,.506,2,8461,7953,60-22,.735 10 | 7,Dallas,.536,50,32,.610,.511,10,8628,8390,50-32,.613 11 | 8,Cleveland,.535,53,29,.646,.497,3,8457,8090,55-27,.675 12 | 9,Portland,.534,51,31,.622,.504,9,8429,8082,55-27,.667 13 | 10,Chicago,.523,50,32,.610,.494,6,8265,8019,51-31,.622 14 | 11,Toronto,.517,49,33,.598,.491,11,8527,8275,51-31,.621 15 | ,New 
Orleans,.517,45,37,.549,.507,12,8147,8082,44-38,.533 16 | 13,Oklahoma City,.516,45,37,.549,.505,15,8524,8345,48-34,.587 17 | 14,Washington,.509,46,36,.561,.491,14,8080,8021,43-39,.530 18 | 15,Phoenix,.500,39,43,.476,.508,21,8397,8471,38-44,.464 19 | 16,Utah,.498,38,44,.463,.509,13,7801,7783,42-40,.510 20 | 17,Milwaukee,.494,41,41,.500,.492,18,8023,7988,42-40,.518 21 | 18,Boston,.490,40,42,.488,.490,17,8312,8299,42-40,.506 22 | 19,Brooklyn,.487,38,44,.463,.495,19,8038,8274,31-51,.383 23 | 20,Indiana,.484,38,44,.463,.491,16,7981,7958,42-40,.512 24 | 21,Miami,.483,37,45,.451,.493,20,7764,7977,32-50,.390 25 | 22,Charlotte,.471,33,49,.402,.494,22,7721,7981,30-52,.367 26 | 23,Denver,.470,30,52,.366,.505,23,8320,8611,30-52,.362 27 | ,Sacramento,.470,29,53,.354,.509,26,8310,8614,29-53,.356 28 | 25,Detroit,.467,32,50,.390,.493,24,8077,8159,38-44,.458 29 | 26,Orlando,.447,25,57,.305,.494,25,7847,8316,23-59,.277 30 | 27,LA Lakers,.444,21,61,.256,.507,28,8073,8634,20-62,.248 31 | 28,Minnesota,.430,16,66,.195,.508,30,8016,8737,16-66,.194 32 | 29,Philadelphia,.425,18,64,.220,.494,27,7542,8278,15-67,.177 33 | 30,New York,.419,17,65,.207,.490,29,7535,8299,14-68,.169 34 | -------------------------------------------------------------------------------- /notebooks/data/subset.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deniederhut/Pandas-Tutorial-SciPyConf-2018/d30a9aac47b4a16b6170da77f982fe8592a4eb73/notebooks/data/subset.csv.gz -------------------------------------------------------------------------------- /notebooks/data/tips.csv: -------------------------------------------------------------------------------- 1 | total_bill,tip,sex,smoker,day,time,size 2 | 16.99,1.01,Female,No,Sun,Dinner,2 3 | 10.34,1.66,Male,No,Sun,Dinner,3 4 | 21.01,3.5,Male,No,Sun,Dinner,3 5 | 23.68,3.31,Male,No,Sun,Dinner,2 6 | 24.59,3.61,Female,No,Sun,Dinner,4 7 | 25.29,4.71,Male,No,Sun,Dinner,4 8 | 8.77,2.0,Male,No,Sun,Dinner,2 9 | 26.88,3.12,Male,No,Sun,Dinner,4 10 | 15.04,1.96,Male,No,Sun,Dinner,2 11 | 14.78,3.23,Male,No,Sun,Dinner,2 12 | 10.27,1.71,Male,No,Sun,Dinner,2 13 | 35.26,5.0,Female,No,Sun,Dinner,4 14 | 15.42,1.57,Male,No,Sun,Dinner,2 15 | 18.43,3.0,Male,No,Sun,Dinner,4 16 | 14.83,3.02,Female,No,Sun,Dinner,2 17 | 21.58,3.92,Male,No,Sun,Dinner,2 18 | 10.33,1.67,Female,No,Sun,Dinner,3 19 | 16.29,3.71,Male,No,Sun,Dinner,3 20 | 16.97,3.5,Female,No,Sun,Dinner,3 21 | 20.65,3.35,Male,No,Sat,Dinner,3 22 | 17.92,4.08,Male,No,Sat,Dinner,2 23 | 20.29,2.75,Female,No,Sat,Dinner,2 24 | 15.77,2.23,Female,No,Sat,Dinner,2 25 | 39.42,7.58,Male,No,Sat,Dinner,4 26 | 19.82,3.18,Male,No,Sat,Dinner,2 27 | 17.81,2.34,Male,No,Sat,Dinner,4 28 | 13.37,2.0,Male,No,Sat,Dinner,2 29 | 12.69,2.0,Male,No,Sat,Dinner,2 30 | 21.7,4.3,Male,No,Sat,Dinner,2 31 | 19.65,3.0,Female,No,Sat,Dinner,2 32 | 9.55,1.45,Male,No,Sat,Dinner,2 33 | 18.35,2.5,Male,No,Sat,Dinner,4 34 | 15.06,3.0,Female,No,Sat,Dinner,2 35 | 20.69,2.45,Female,No,Sat,Dinner,4 36 | 17.78,3.27,Male,No,Sat,Dinner,2 37 | 24.06,3.6,Male,No,Sat,Dinner,3 38 | 16.31,2.0,Male,No,Sat,Dinner,3 39 | 16.93,3.07,Female,No,Sat,Dinner,3 40 | 18.69,2.31,Male,No,Sat,Dinner,3 41 | 31.27,5.0,Male,No,Sat,Dinner,3 42 | 16.04,2.24,Male,No,Sat,Dinner,3 43 | 17.46,2.54,Male,No,Sun,Dinner,2 44 | 13.94,3.06,Male,No,Sun,Dinner,2 45 | 9.68,1.32,Male,No,Sun,Dinner,2 46 | 30.4,5.6,Male,No,Sun,Dinner,4 47 | 18.29,3.0,Male,No,Sun,Dinner,2 48 | 22.23,5.0,Male,No,Sun,Dinner,2 49 | 32.4,6.0,Male,No,Sun,Dinner,4 50 | 28.55,2.05,Male,No,Sun,Dinner,3 51 
| 18.04,3.0,Male,No,Sun,Dinner,2 52 | 12.54,2.5,Male,No,Sun,Dinner,2 53 | 10.29,2.6,Female,No,Sun,Dinner,2 54 | 34.81,5.2,Female,No,Sun,Dinner,4 55 | 9.94,1.56,Male,No,Sun,Dinner,2 56 | 25.56,4.34,Male,No,Sun,Dinner,4 57 | 19.49,3.51,Male,No,Sun,Dinner,2 58 | 38.01,3.0,Male,Yes,Sat,Dinner,4 59 | 26.41,1.5,Female,No,Sat,Dinner,2 60 | 11.24,1.76,Male,Yes,Sat,Dinner,2 61 | 48.27,6.73,Male,No,Sat,Dinner,4 62 | 20.29,3.21,Male,Yes,Sat,Dinner,2 63 | 13.81,2.0,Male,Yes,Sat,Dinner,2 64 | 11.02,1.98,Male,Yes,Sat,Dinner,2 65 | 18.29,3.76,Male,Yes,Sat,Dinner,4 66 | 17.59,2.64,Male,No,Sat,Dinner,3 67 | 20.08,3.15,Male,No,Sat,Dinner,3 68 | 16.45,2.47,Female,No,Sat,Dinner,2 69 | 3.07,1.0,Female,Yes,Sat,Dinner,1 70 | 20.23,2.01,Male,No,Sat,Dinner,2 71 | 15.01,2.09,Male,Yes,Sat,Dinner,2 72 | 12.02,1.97,Male,No,Sat,Dinner,2 73 | 17.07,3.0,Female,No,Sat,Dinner,3 74 | 26.86,3.14,Female,Yes,Sat,Dinner,2 75 | 25.28,5.0,Female,Yes,Sat,Dinner,2 76 | 14.73,2.2,Female,No,Sat,Dinner,2 77 | 10.51,1.25,Male,No,Sat,Dinner,2 78 | 17.92,3.08,Male,Yes,Sat,Dinner,2 79 | 27.2,4.0,Male,No,Thur,Lunch,4 80 | 22.76,3.0,Male,No,Thur,Lunch,2 81 | 17.29,2.71,Male,No,Thur,Lunch,2 82 | 19.44,3.0,Male,Yes,Thur,Lunch,2 83 | 16.66,3.4,Male,No,Thur,Lunch,2 84 | 10.07,1.83,Female,No,Thur,Lunch,1 85 | 32.68,5.0,Male,Yes,Thur,Lunch,2 86 | 15.98,2.03,Male,No,Thur,Lunch,2 87 | 34.83,5.17,Female,No,Thur,Lunch,4 88 | 13.03,2.0,Male,No,Thur,Lunch,2 89 | 18.28,4.0,Male,No,Thur,Lunch,2 90 | 24.71,5.85,Male,No,Thur,Lunch,2 91 | 21.16,3.0,Male,No,Thur,Lunch,2 92 | 28.97,3.0,Male,Yes,Fri,Dinner,2 93 | 22.49,3.5,Male,No,Fri,Dinner,2 94 | 5.75,1.0,Female,Yes,Fri,Dinner,2 95 | 16.32,4.3,Female,Yes,Fri,Dinner,2 96 | 22.75,3.25,Female,No,Fri,Dinner,2 97 | 40.17,4.73,Male,Yes,Fri,Dinner,4 98 | 27.28,4.0,Male,Yes,Fri,Dinner,2 99 | 12.03,1.5,Male,Yes,Fri,Dinner,2 100 | 21.01,3.0,Male,Yes,Fri,Dinner,2 101 | 12.46,1.5,Male,No,Fri,Dinner,2 102 | 11.35,2.5,Female,Yes,Fri,Dinner,2 103 | 15.38,3.0,Female,Yes,Fri,Dinner,2 104 | 44.3,2.5,Female,Yes,Sat,Dinner,3 105 | 22.42,3.48,Female,Yes,Sat,Dinner,2 106 | 20.92,4.08,Female,No,Sat,Dinner,2 107 | 15.36,1.64,Male,Yes,Sat,Dinner,2 108 | 20.49,4.06,Male,Yes,Sat,Dinner,2 109 | 25.21,4.29,Male,Yes,Sat,Dinner,2 110 | 18.24,3.76,Male,No,Sat,Dinner,2 111 | 14.31,4.0,Female,Yes,Sat,Dinner,2 112 | 14.0,3.0,Male,No,Sat,Dinner,2 113 | 7.25,1.0,Female,No,Sat,Dinner,1 114 | 38.07,4.0,Male,No,Sun,Dinner,3 115 | 23.95,2.55,Male,No,Sun,Dinner,2 116 | 25.71,4.0,Female,No,Sun,Dinner,3 117 | 17.31,3.5,Female,No,Sun,Dinner,2 118 | 29.93,5.07,Male,No,Sun,Dinner,4 119 | 10.65,1.5,Female,No,Thur,Lunch,2 120 | 12.43,1.8,Female,No,Thur,Lunch,2 121 | 24.08,2.92,Female,No,Thur,Lunch,4 122 | 11.69,2.31,Male,No,Thur,Lunch,2 123 | 13.42,1.68,Female,No,Thur,Lunch,2 124 | 14.26,2.5,Male,No,Thur,Lunch,2 125 | 15.95,2.0,Male,No,Thur,Lunch,2 126 | 12.48,2.52,Female,No,Thur,Lunch,2 127 | 29.8,4.2,Female,No,Thur,Lunch,6 128 | 8.52,1.48,Male,No,Thur,Lunch,2 129 | 14.52,2.0,Female,No,Thur,Lunch,2 130 | 11.38,2.0,Female,No,Thur,Lunch,2 131 | 22.82,2.18,Male,No,Thur,Lunch,3 132 | 19.08,1.5,Male,No,Thur,Lunch,2 133 | 20.27,2.83,Female,No,Thur,Lunch,2 134 | 11.17,1.5,Female,No,Thur,Lunch,2 135 | 12.26,2.0,Female,No,Thur,Lunch,2 136 | 18.26,3.25,Female,No,Thur,Lunch,2 137 | 8.51,1.25,Female,No,Thur,Lunch,2 138 | 10.33,2.0,Female,No,Thur,Lunch,2 139 | 14.15,2.0,Female,No,Thur,Lunch,2 140 | 16.0,2.0,Male,Yes,Thur,Lunch,2 141 | 13.16,2.75,Female,No,Thur,Lunch,2 142 | 17.47,3.5,Female,No,Thur,Lunch,2 143 | 34.3,6.7,Male,No,Thur,Lunch,6 144 | 
41.19,5.0,Male,No,Thur,Lunch,5 145 | 27.05,5.0,Female,No,Thur,Lunch,6 146 | 16.43,2.3,Female,No,Thur,Lunch,2 147 | 8.35,1.5,Female,No,Thur,Lunch,2 148 | 18.64,1.36,Female,No,Thur,Lunch,3 149 | 11.87,1.63,Female,No,Thur,Lunch,2 150 | 9.78,1.73,Male,No,Thur,Lunch,2 151 | 7.51,2.0,Male,No,Thur,Lunch,2 152 | 14.07,2.5,Male,No,Sun,Dinner,2 153 | 13.13,2.0,Male,No,Sun,Dinner,2 154 | 17.26,2.74,Male,No,Sun,Dinner,3 155 | 24.55,2.0,Male,No,Sun,Dinner,4 156 | 19.77,2.0,Male,No,Sun,Dinner,4 157 | 29.85,5.14,Female,No,Sun,Dinner,5 158 | 48.17,5.0,Male,No,Sun,Dinner,6 159 | 25.0,3.75,Female,No,Sun,Dinner,4 160 | 13.39,2.61,Female,No,Sun,Dinner,2 161 | 16.49,2.0,Male,No,Sun,Dinner,4 162 | 21.5,3.5,Male,No,Sun,Dinner,4 163 | 12.66,2.5,Male,No,Sun,Dinner,2 164 | 16.21,2.0,Female,No,Sun,Dinner,3 165 | 13.81,2.0,Male,No,Sun,Dinner,2 166 | 17.51,3.0,Female,Yes,Sun,Dinner,2 167 | 24.52,3.48,Male,No,Sun,Dinner,3 168 | 20.76,2.24,Male,No,Sun,Dinner,2 169 | 31.71,4.5,Male,No,Sun,Dinner,4 170 | 10.59,1.61,Female,Yes,Sat,Dinner,2 171 | 10.63,2.0,Female,Yes,Sat,Dinner,2 172 | 50.81,10.0,Male,Yes,Sat,Dinner,3 173 | 15.81,3.16,Male,Yes,Sat,Dinner,2 174 | 7.25,5.15,Male,Yes,Sun,Dinner,2 175 | 31.85,3.18,Male,Yes,Sun,Dinner,2 176 | 16.82,4.0,Male,Yes,Sun,Dinner,2 177 | 32.9,3.11,Male,Yes,Sun,Dinner,2 178 | 17.89,2.0,Male,Yes,Sun,Dinner,2 179 | 14.48,2.0,Male,Yes,Sun,Dinner,2 180 | 9.6,4.0,Female,Yes,Sun,Dinner,2 181 | 34.63,3.55,Male,Yes,Sun,Dinner,2 182 | 34.65,3.68,Male,Yes,Sun,Dinner,4 183 | 23.33,5.65,Male,Yes,Sun,Dinner,2 184 | 45.35,3.5,Male,Yes,Sun,Dinner,3 185 | 23.17,6.5,Male,Yes,Sun,Dinner,4 186 | 40.55,3.0,Male,Yes,Sun,Dinner,2 187 | 20.69,5.0,Male,No,Sun,Dinner,5 188 | 20.9,3.5,Female,Yes,Sun,Dinner,3 189 | 30.46,2.0,Male,Yes,Sun,Dinner,5 190 | 18.15,3.5,Female,Yes,Sun,Dinner,3 191 | 23.1,4.0,Male,Yes,Sun,Dinner,3 192 | 15.69,1.5,Male,Yes,Sun,Dinner,2 193 | 19.81,4.19,Female,Yes,Thur,Lunch,2 194 | 28.44,2.56,Male,Yes,Thur,Lunch,2 195 | 15.48,2.02,Male,Yes,Thur,Lunch,2 196 | 16.58,4.0,Male,Yes,Thur,Lunch,2 197 | 7.56,1.44,Male,No,Thur,Lunch,2 198 | 10.34,2.0,Male,Yes,Thur,Lunch,2 199 | 43.11,5.0,Female,Yes,Thur,Lunch,4 200 | 13.0,2.0,Female,Yes,Thur,Lunch,2 201 | 13.51,2.0,Male,Yes,Thur,Lunch,2 202 | 18.71,4.0,Male,Yes,Thur,Lunch,3 203 | 12.74,2.01,Female,Yes,Thur,Lunch,2 204 | 13.0,2.0,Female,Yes,Thur,Lunch,2 205 | 16.4,2.5,Female,Yes,Thur,Lunch,2 206 | 20.53,4.0,Male,Yes,Thur,Lunch,4 207 | 16.47,3.23,Female,Yes,Thur,Lunch,3 208 | 26.59,3.41,Male,Yes,Sat,Dinner,3 209 | 38.73,3.0,Male,Yes,Sat,Dinner,4 210 | 24.27,2.03,Male,Yes,Sat,Dinner,2 211 | 12.76,2.23,Female,Yes,Sat,Dinner,2 212 | 30.06,2.0,Male,Yes,Sat,Dinner,3 213 | 25.89,5.16,Male,Yes,Sat,Dinner,4 214 | 48.33,9.0,Male,No,Sat,Dinner,4 215 | 13.27,2.5,Female,Yes,Sat,Dinner,2 216 | 28.17,6.5,Female,Yes,Sat,Dinner,3 217 | 12.9,1.1,Female,Yes,Sat,Dinner,2 218 | 28.15,3.0,Male,Yes,Sat,Dinner,5 219 | 11.59,1.5,Male,Yes,Sat,Dinner,2 220 | 7.74,1.44,Male,Yes,Sat,Dinner,2 221 | 30.14,3.09,Female,Yes,Sat,Dinner,4 222 | 12.16,2.2,Male,Yes,Fri,Lunch,2 223 | 13.42,3.48,Female,Yes,Fri,Lunch,2 224 | 8.58,1.92,Male,Yes,Fri,Lunch,1 225 | 15.98,3.0,Female,No,Fri,Lunch,3 226 | 13.42,1.58,Male,Yes,Fri,Lunch,2 227 | 16.27,2.5,Female,Yes,Fri,Lunch,2 228 | 10.09,2.0,Female,Yes,Fri,Lunch,2 229 | 20.45,3.0,Male,No,Sat,Dinner,4 230 | 13.28,2.72,Male,No,Sat,Dinner,2 231 | 22.12,2.88,Female,Yes,Sat,Dinner,2 232 | 24.01,2.0,Male,Yes,Sat,Dinner,4 233 | 15.69,3.0,Male,Yes,Sat,Dinner,3 234 | 11.61,3.39,Male,No,Sat,Dinner,2 235 | 10.77,1.47,Male,No,Sat,Dinner,2 236 | 
15.53,3.0,Male,Yes,Sat,Dinner,2 237 | 10.07,1.25,Male,No,Sat,Dinner,2 238 | 12.6,1.0,Male,Yes,Sat,Dinner,2 239 | 32.83,1.17,Male,Yes,Sat,Dinner,2 240 | 35.83,4.67,Female,No,Sat,Dinner,3 241 | 29.03,5.92,Male,No,Sat,Dinner,3 242 | 27.18,2.0,Female,Yes,Sat,Dinner,2 243 | 22.67,2.0,Male,Yes,Sat,Dinner,2 244 | 17.82,1.75,Male,No,Sat,Dinner,2 245 | 18.78,3.0,Female,No,Thur,Dinner,2 246 | -------------------------------------------------------------------------------- /notebooks/mydask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deniederhut/Pandas-Tutorial-SciPyConf-2018/d30a9aac47b4a16b6170da77f982fe8592a4eb73/notebooks/mydask.png -------------------------------------------------------------------------------- /notebooks/solutions/aligment_concat.py: -------------------------------------------------------------------------------- 1 | df = pd.concat([gdp, cpi], axis='columns') 2 | df.head() -------------------------------------------------------------------------------- /notebooks/solutions/aligment_merge.py: -------------------------------------------------------------------------------- 1 | outer = (pd.merge(gdp_bad, cpi_bad, on="DATE", how='outer') 2 | .sort_values("DATE")) 3 | outer.head() -------------------------------------------------------------------------------- /notebooks/solutions/alignment_00.py: -------------------------------------------------------------------------------- 1 | cpi = pd.read_csv("data/cpi.csv", parse_dates=['DATE']) 2 | gdp = pd.read_csv("data/gdp.csv", parse_dates=['DATE']) 3 | -------------------------------------------------------------------------------- /notebooks/solutions/alignment_01.py: -------------------------------------------------------------------------------- 1 | # Option 1: The "manual way" 2 | common = pd.Index(cpi.DATE).intersection(gdp.DATE) 3 | rgdp = (gdp.loc[gdp.DATE.isin(common), 'GDP'].values / 4 | cpi.loc[cpi.DATE.isin(common), 'CPIAUCSL']) 5 | display.display(rgdp.head()) 6 | 7 | # Option 2: "merge" 8 | m = pd.merge(gdp, cpi, on="DATE") 9 | rgdp = m['GDP'] / m['CPIAUCSL'] 10 | rgdp.head() 11 | -------------------------------------------------------------------------------- /notebooks/solutions/alignment_02.py: -------------------------------------------------------------------------------- 1 | gdp = gdp.set_index("DATE").squeeze() 2 | cpi = cpi.set_index("DATE").squeeze().rename("cpi") 3 | side_by_side(gdp.head(), cpi.head()) 4 | -------------------------------------------------------------------------------- /notebooks/solutions/alignment_03.py: -------------------------------------------------------------------------------- 1 | res = DataReader(series, start='2000-01-01', data_source="fred") 2 | res = res.rename(columns=dict(zip(series, names))) 3 | 4 | fig, ax = plt.subplots(figsize=(12, 8)) 5 | res[['quits', 'layoffs']].plot.area( 6 | color=area_colors, ax=ax) 7 | res[['hires', 'openings']].plot( 8 | ax=ax, color=line_colors, linewidth=3); 9 | -------------------------------------------------------------------------------- /notebooks/solutions/alignment_positive.py: -------------------------------------------------------------------------------- 1 | (pct_change > 0).mean() -------------------------------------------------------------------------------- /notebooks/solutions/alignment_real_gdp09.py: -------------------------------------------------------------------------------- 1 | cpi09 = cpi / cpi.loc['2009'].mean() * 100 2 | gdp / cpi09 
-------------------------------------------------------------------------------- /notebooks/solutions/dropna_columns.py: -------------------------------------------------------------------------------- 1 | df.dropna(axis="columns") -------------------------------------------------------------------------------- /notebooks/solutions/eda_00.py: -------------------------------------------------------------------------------- 1 | def from_dollars(col): 2 | return pd.to_numeric(col.str.lstrip('$')) 3 | -------------------------------------------------------------------------------- /notebooks/solutions/eda_01.py: -------------------------------------------------------------------------------- 1 | cols = ['state_bottle_cost', 'state_bottle_retail', 'sale'] 2 | df[cols] = df[cols].apply(from_dollars) 3 | -------------------------------------------------------------------------------- /notebooks/solutions/eda_02.py: -------------------------------------------------------------------------------- 1 | url = 'https://en.wikipedia.org/wiki/List_of_largest_Iowa_cities_by_population' 2 | popn = (pd.read_html(url, header=0)[0] 3 | .set_index("City") 4 | .rename(lambda x: x.lower())) 5 | popn.head() 6 | -------------------------------------------------------------------------------- /notebooks/solutions/eda_03.py: -------------------------------------------------------------------------------- 1 | per_cap = (df.groupby(df.city.str.lower()) 2 | .volume_sold.sum() / 3 | popn.Population.astype(float)).dropna() 4 | per_cap.plot.barh(figsize=(10, 10), color='k', width=.9); 5 | -------------------------------------------------------------------------------- /notebooks/solutions/eda_04.py: -------------------------------------------------------------------------------- 1 | pd.concat([df.groupby(df.city.str.lower())[['sale', 'volume_sold']].sum(), 2 | popn.Population], axis=1, join='inner').pipe(sns.pairplot); 3 | -------------------------------------------------------------------------------- /notebooks/solutions/groupby_00.py: -------------------------------------------------------------------------------- 1 | review_length = df.text.str.len() 2 | gr = df.groupby(review_length).review_overall 3 | gr.mean().plot(style='k.') -------------------------------------------------------------------------------- /notebooks/solutions/groupby_00b.py: -------------------------------------------------------------------------------- 1 | (df.groupby(df.text.str.count('\w+')) 2 | .review_overall 3 | .mean().plot(style='k.')) 4 | -------------------------------------------------------------------------------- /notebooks/solutions/groupby_01.py: -------------------------------------------------------------------------------- 1 | (df.groupby('beer_id') 2 | .review_overall 3 | .agg(['mean', 'count']) 4 | .plot.scatter(x='count', y='mean', color='k', 5 | marker='.', alpha=.25)); 6 | -------------------------------------------------------------------------------- /notebooks/solutions/groupby_02.py: -------------------------------------------------------------------------------- 1 | order = df.groupby("profile_name").review_overall.cumcount() 2 | df.groupby(order).review_overall.mean().plot() 3 | -------------------------------------------------------------------------------- /notebooks/solutions/groupby_03.py: -------------------------------------------------------------------------------- 1 | # Make a barplot of review times by hour 2 | (df.time.dt.hour 3 | .value_counts() 4 | .sort_index() 5 | .plot.bar(rot=0, color='k', 
width=.8)); 6 | -------------------------------------------------------------------------------- /notebooks/solutions/groupby_04.py: -------------------------------------------------------------------------------- 1 | pas = df[df.beer_style.str.lower().str.contains("pale ale")] 2 | pas.head() -------------------------------------------------------------------------------- /notebooks/solutions/groupby_abv.py: -------------------------------------------------------------------------------- 1 | df.groupby('beer_style').abv.std().sort_values(ascending=False) -------------------------------------------------------------------------------- /notebooks/solutions/groupby_format_review.py: -------------------------------------------------------------------------------- 1 | def format_review(review): 2 | return dict([line.strip('\n').split(": ", 1) for line in review]) -------------------------------------------------------------------------------- /notebooks/solutions/indexing_00.py: -------------------------------------------------------------------------------- 1 | first[['origin', 'dest']] -------------------------------------------------------------------------------- /notebooks/solutions/indexing_01.py: -------------------------------------------------------------------------------- 1 | flights[(flights.dep.dt.hour <= 6) | 2 | (flights.dep.dt.hour >= 18)] 3 | -------------------------------------------------------------------------------- /notebooks/solutions/indexing_02.py: -------------------------------------------------------------------------------- 1 | m1 = flights.origin == 'ATL' 2 | most_common = flights.loc[m1, 'dest'].value_counts().index[:3] 3 | m2 = flights.dest.isin(most_common) 4 | 5 | flights[m1 & m2].head() 6 | -------------------------------------------------------------------------------- /notebooks/solutions/indexing_cancelled.py: -------------------------------------------------------------------------------- 1 | flights.loc[flights.cancelled == 1] -------------------------------------------------------------------------------- /notebooks/solutions/indexing_datetime.py: -------------------------------------------------------------------------------- 1 | delays.loc['2014-01-03T12':'2014-01-10T12'] -------------------------------------------------------------------------------- /notebooks/solutions/indexing_drop_columns.py: -------------------------------------------------------------------------------- 1 | flights.drop('airline_id', axis='columns') -------------------------------------------------------------------------------- /notebooks/solutions/indexing_drop_index.py: -------------------------------------------------------------------------------- 1 | first.drop(['EV', 'F9']) -------------------------------------------------------------------------------- /notebooks/solutions/indexing_ex1_engine_columns.py: -------------------------------------------------------------------------------- 1 | cars[['cylinders', 'displacement', 'horsepower']] -------------------------------------------------------------------------------- /notebooks/solutions/indexing_ex2_5th.py: -------------------------------------------------------------------------------- 1 | cars.iloc[::5] -------------------------------------------------------------------------------- /notebooks/solutions/indexing_ex3_years.py: -------------------------------------------------------------------------------- 1 | yearly.loc[[70, 75, 80, 82], ['horsepower', 'weight']] 
-------------------------------------------------------------------------------- /notebooks/solutions/indexing_ex4_mpg.py: -------------------------------------------------------------------------------- 1 | cars[cars.mpg >= 30] 2 | -------------------------------------------------------------------------------- /notebooks/solutions/indexing_ex5_mpg_and_cylinders.py: -------------------------------------------------------------------------------- 1 | len(cars[(cars.mpg >= 30) & (cars.cylinders >= 5)]) -------------------------------------------------------------------------------- /notebooks/solutions/indexing_loc.py: -------------------------------------------------------------------------------- 1 | first.loc[['US', 'VX', 'WN'], ['tail_num', 'origin', 'dest']] 2 | -------------------------------------------------------------------------------- /notebooks/solutions/indexing_thoughts.py: -------------------------------------------------------------------------------- 1 | first.loc['AA', 'fl_num'] = -1 2 | first.head() -------------------------------------------------------------------------------- /notebooks/solutions/performance_00.py: -------------------------------------------------------------------------------- 1 | ids = flights.ORIGIN_AIRPORT_ID.value_counts() 2 | ids = ids[ids >= 500].index 3 | ids 4 | -------------------------------------------------------------------------------- /notebooks/solutions/performance_01.py: -------------------------------------------------------------------------------- 1 | subset = coord[coord.AIRPORT_ID.isin(ids)] 2 | subset.head() 3 | -------------------------------------------------------------------------------- /notebooks/solutions/performance_02.py: -------------------------------------------------------------------------------- 1 | from sklearn.neighbors import KDTree 2 | 3 | # the result of KDTree.query is a list of index 4 | # *positions*, we'll use id_map to go from 5 | # positions back to airport names 6 | id_map = dict(enumerate(locs.index)) 7 | 8 | tree = KDTree(locs) 9 | 10 | distances, indexes = tree.query(locs.values, k=2) 11 | indexes = indexes[:, 1] 12 | distances = distances[:, 1] 13 | neighbors = pd.Series(indexes, index=locs.index).map(id_map) 14 | neighbors.head() 15 | -------------------------------------------------------------------------------- /notebooks/solutions/performance_concat.py: -------------------------------------------------------------------------------- 1 | 2 | pd.concat([pd.DataFrame(set_, columns=['A', 'B', 'C']) for set_ in records], 3 | ignore_index=True) -------------------------------------------------------------------------------- /notebooks/solutions/performance_kd.py: -------------------------------------------------------------------------------- 1 | coord = pd.read_csv("data/flights_coord.csv") 2 | coord.head() -------------------------------------------------------------------------------- /notebooks/solutions/readme_00.py: -------------------------------------------------------------------------------- 1 | print("Hello, world!") 2 | -------------------------------------------------------------------------------- /notebooks/solutions/sklearn_pandas_split.py: -------------------------------------------------------------------------------- 1 | y = df['tip'] 2 | X = df.drop('tip', axis=1) -------------------------------------------------------------------------------- /notebooks/solutions/tidy_00.py: -------------------------------------------------------------------------------- 1 | df['winning_team'] 
= np.where( 2 | df.home_points > df.away_points, 3 | df.home_team, 4 | df.away_team) 5 | 6 | win = pd.melt(df, id_vars='winning_team', value_vars=['away_team', 'home_team'], 7 | var_name='home_or_away', value_name='team') 8 | win['won'] = win.winning_team == win.team 9 | win_pct = win.groupby(['team', 'home_or_away']).won.mean() 10 | win_pct.head() 11 | -------------------------------------------------------------------------------- /notebooks/solutions/tidy_01.py: -------------------------------------------------------------------------------- 1 | df['home_win'] = df.home_points > df.away_points 2 | df['point_spread'] = df.home_points - df.away_points 3 | df.head() 4 | -------------------------------------------------------------------------------- /notebooks/solutions/tidy_02.py: -------------------------------------------------------------------------------- 1 | # RPI 2 | df['home_strength'] = df.home_team.map(rpi.rename(mapping)['RPI']) 3 | df['away_strength'] = df.away_team.map(rpi.rename(mapping)['RPI']) 4 | df.head() 5 | -------------------------------------------------------------------------------- /notebooks/solutions/tidy_03.py: -------------------------------------------------------------------------------- 1 | df['rest_spread'] = df['home_rest'] - df['away_rest'] 2 | df.head() -------------------------------------------------------------------------------- /notebooks/solutions/tidy_04.py: -------------------------------------------------------------------------------- 1 | fig, ax = plt.subplots(figsize=(12, 6)) 2 | sns.barplot(x='rest_spread', y='home_win', 3 | data=df.loc[(-3 <= df.rest_spread) & 4 | (df.rest_spread <= 3)], 5 | color='#4c72b0', ax=ax) 6 | sns.despine() -------------------------------------------------------------------------------- /notebooks/solutions/tidy_05.py: -------------------------------------------------------------------------------- 1 | def compute_away_streaks(v): 2 | streaks = [] 3 | current_streak = 0 4 | 5 | for row in v: 6 | if row == 'away_team': 7 | current_streak += 1 8 | else: 9 | current_streak = 0 10 | streaks.append(current_streak) 11 | return pd.Series(streaks, index=v.index) -------------------------------------------------------------------------------- /notebooks/solutions/tidy_06.py: -------------------------------------------------------------------------------- 1 | # fill 1 for teams that start on the road 2 | away_streaks = ( 3 | tidy.groupby("team") 4 | .variable 5 | .transform(compute_away_streaks).fillna(1)) 6 | away_streaks.head() 7 | -------------------------------------------------------------------------------- /notebooks/solutions/tidy_07.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | 3 | def log_shape(func): 4 | @wraps(func) 5 | def deco(*args, **kwargs): 6 | result = func(*args, **kwargs) 7 | logger.info("In %s [%s]", func.__name__, result.shape) 8 | return result 9 | return deco 10 | -------------------------------------------------------------------------------- /notebooks/solutions/tidy_drest.py: -------------------------------------------------------------------------------- 1 | df['rest_spread'].mean() -------------------------------------------------------------------------------- /notebooks/solutions/tidy_sanity.py: -------------------------------------------------------------------------------- 1 | (df.home_rest - df.away_rest).mean() -------------------------------------------------------------------------------- 
/notebooks/solutions/timeseries_departure.py: -------------------------------------------------------------------------------- 1 | flights.dep + flights.dep_delay_td -------------------------------------------------------------------------------- /notebooks/solutions/timeseries_monthly_ma.py: -------------------------------------------------------------------------------- 1 | ma = y.rolling(4).mean() 2 | ax = ma.plot(legend=True, label="MA[4]", figsize=(12, 4)) 3 | y.plot(ax=ax, label="Observed", legend=True); -------------------------------------------------------------------------------- /notebooks/solutions/timeseries_resample.py: -------------------------------------------------------------------------------- 1 | df.resample("W").std().plot(); -------------------------------------------------------------------------------- /notebooks/solutions/timeseries_resample_agg.py: -------------------------------------------------------------------------------- 1 | df.resample("Q").agg(['sum', 'mean', 'median']) -------------------------------------------------------------------------------- /notebooks/solutions/timeseries_timedelta.py: -------------------------------------------------------------------------------- 1 | flights['dep_delay_td'] = pd.to_timedelta(flights['dep_delay'], unit='T') 2 | flights['arr_delay_td'] = pd.to_timedelta(flights['arr_delay'], unit='T') 3 | flights.info() -------------------------------------------------------------------------------- /notebooks/solutions/visualize_00.py: -------------------------------------------------------------------------------- 1 | sns.factorplot('embarked', data=t, kind="count", hue="class"); 2 | -------------------------------------------------------------------------------- /notebooks/solutions/visualize_01.py: -------------------------------------------------------------------------------- 1 | sns.factorplot('age', 'class', data=t); -------------------------------------------------------------------------------- /notebooks/solutions/visualize_02a.py: -------------------------------------------------------------------------------- 1 | m = t.fare.median() 2 | 3 | t['fare_'] = np.where(t.fare < m * 3, t.fare, m * 3) 4 | t.head() 5 | -------------------------------------------------------------------------------- /notebooks/solutions/visualize_02b.py: -------------------------------------------------------------------------------- 1 | sns.violinplot("class", "fare_", data=t, orient="v", 2 | palette="YlGn") 3 | sns.despine(left=True) 4 | plt.ylim(0); 5 | -------------------------------------------------------------------------------- /notebooks/solutions/visualize_03.py: -------------------------------------------------------------------------------- 1 | sns.countplot("alive", data=t, palette="OrRd_d"); 2 | -------------------------------------------------------------------------------- /notebooks/solutions/visualize_04.py: -------------------------------------------------------------------------------- 1 | sns.factorplot("class", "survived", data=t).set(ylim=(0, 1)); 2 | -------------------------------------------------------------------------------- /notebooks/solutions/visualize_05.py: -------------------------------------------------------------------------------- 1 | sns.factorplot("who", "survived", data=t).set(ylim=(0, 1)); 2 | -------------------------------------------------------------------------------- /notebooks/solutions/visualize_06.py: -------------------------------------------------------------------------------- 1 | 
sns.factorplot("class", "survived", data=t, 2 | hue="sex", palette=pal).set(ylim=(0, 1)); 3 | -------------------------------------------------------------------------------- /notebooks/solutions/visualize_07.py: -------------------------------------------------------------------------------- 1 | g = sns.factorplot("class", "survived", data=t, 2 | hue="who", palette=pal, col="who", 3 | aspect=.5) 4 | g.set(ylim=(0, 1)) 5 | g.despine(left=True); 6 | -------------------------------------------------------------------------------- /notebooks/solutions/visualize_08.py: -------------------------------------------------------------------------------- 1 | fg = sns.factorplot("adult_male", "survived", data=t, 2 | col="class", hue="class", size=6, 3 | aspect=.33, palette="BuPu_d") 4 | fg.set(ylim=(0, 1)) 5 | fg.despine(left=True); 6 | -------------------------------------------------------------------------------- /notebooks/solutions/visualize_09.py: -------------------------------------------------------------------------------- 1 | sns.lmplot("age", "survived", t, hue="sex", 2 | logistic=True, x_bins=bins, 3 | palette=pal); 4 | -------------------------------------------------------------------------------- /notebooks/solutions/visualize_10.py: -------------------------------------------------------------------------------- 1 | sns.lmplot("age", "survived", t, hue="class", 2 | logistic=True, x_bins=bins, 3 | palette=pal); -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.14.5 2 | pandas==0.23.1 3 | matplotlib==2.2.2 4 | seaborn==0.8.1 5 | ipython==6.4.0 6 | jupyter==1.0.0 7 | notebook==5.5.0 8 | dask==0.18.1 9 | distributed==1.22.0 10 | toolz==0.9.0 11 | pandas-datareader==0.6.0 12 | scikit-learn==0.19.1 13 | scipy==1.1.0 14 | statsmodels==0.9.0 15 | xlrd==1.1.0 16 | lifetimes==0.6.0.0 --------------------------------------------------------------------------------