├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── check_environment.py
├── convert_deps.py
├── environment.yml
├── notebooks
    ├── 00-README.ipynb
    ├── 01-Indexing.ipynb
    ├── 02-Alignment.ipynb
    ├── 03-Groupby.ipynb
    ├── 04-Tidy-Data.ipynb
    ├── 05-Timeseries.ipynb
    ├── 06-Dask.ipynb
    ├── 07-Performance.ipynb
    ├── 08-Pandas-NumPy-ScikitLearn.ipynb
    ├── 09-Visualization.ipynb
    ├── 10-Iterators.ipynb
    ├── data
    │   ├── beer-raw-small.txt.gz
    │   ├── cpi.csv
    │   ├── flights-ts.csv.gz
    │   ├── flights_coord.csv
    │   ├── games.csv
    │   ├── gdp.csv
    │   ├── ny-flights.csv.gz
    │   ├── rpi.csv
    │   ├── stocks.csv
    │   ├── subset.csv.gz
    │   ├── tidy_checkpoint.csv
    │   └── tips.csv
    ├── mydask.png
    └── solutions
    │   ├── aligment_concat.py
    │   ├── aligment_merge.py
    │   ├── alignment_00.py
    │   ├── alignment_01.py
    │   ├── alignment_02.py
    │   ├── alignment_03.py
    │   ├── alignment_positive.py
    │   ├── alignment_real_gdp09.py
    │   ├── dropna_columns.py
    │   ├── eda_00.py
    │   ├── eda_01.py
    │   ├── eda_02.py
    │   ├── eda_03.py
    │   ├── eda_04.py
    │   ├── groupby_00.py
    │   ├── groupby_00b.py
    │   ├── groupby_01.py
    │   ├── groupby_02.py
    │   ├── groupby_03.py
    │   ├── groupby_04.py
    │   ├── groupby_abv.py
    │   ├── groupby_format_review.py
    │   ├── indexing_00.py
    │   ├── indexing_01.py
    │   ├── indexing_02.py
    │   ├── indexing_cancelled.py
    │   ├── indexing_datetime.py
    │   ├── indexing_drop_columns.py
    │   ├── indexing_drop_index.py
    │   ├── indexing_ex1_engine_columns.py
    │   ├── indexing_ex2_5th.py
    │   ├── indexing_ex3_years.py
    │   ├── indexing_ex4_mpg.py
    │   ├── indexing_ex5_mpg_and_cylinders.py
    │   ├── indexing_loc.py
    │   ├── indexing_thoughts.py
    │   ├── performance_00.py
    │   ├── performance_01.py
    │   ├── performance_02.py
    │   ├── performance_concat.py
    │   ├── performance_kd.py
    │   ├── readme_00.py
    │   ├── sklearn_pandas_split.py
    │   ├── tidy_00.py
    │   ├── tidy_01.py
    │   ├── tidy_02.py
    │   ├── tidy_03.py
    │   ├── tidy_04.py
    │   ├── tidy_05.py
    │   ├── tidy_06.py
    │   ├── tidy_07.py
    │   ├── tidy_drest.py
    │   ├── tidy_sanity.py
    │   ├── timeseries_departure.py
    │   ├── timeseries_monthly_ma.py
    │   ├── timeseries_resample.py
    │   ├── timeseries_resample_agg.py
    │   ├── timeseries_timedelta.py
    │   ├── visualize_00.py
    │   ├── visualize_01.py
    │   ├── visualize_02a.py
    │   ├── visualize_02b.py
    │   ├── visualize_03.py
    │   ├── visualize_04.py
    │   ├── visualize_05.py
    │   ├── visualize_06.py
    │   ├── visualize_07.py
    │   ├── visualize_08.py
    │   ├── visualize_09.py
    │   └── visualize_10.py
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.md
 2 | *.pdf
 3 | *_files/
 4 | __pycache__/
 5 | !README.md
 6 | *.ipynb_checkpoints/
 7 | notebooks/data/beer/*.gz
 8 | *.DS_Store
 9 | pandas-scipy
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | Attribution 4.0 International
  2 | 
  3 | =======================================================================
  4 | 
  5 | Creative Commons Corporation ("Creative Commons") is not a law firm and
  6 | does not provide legal services or legal advice. Distribution of
  7 | Creative Commons public licenses does not create a lawyer-client or
  8 | other relationship. Creative Commons makes its licenses and related
  9 | information available on an "as-is" basis. Creative Commons gives no
 10 | warranties regarding its licenses, any material licensed under their
 11 | terms and conditions, or any related information. Creative Commons
 12 | disclaims all liability for damages resulting from their use to the
 13 | fullest extent possible.
 14 | 
 15 | Using Creative Commons Public Licenses
 16 | 
 17 | Creative Commons public licenses provide a standard set of terms and
 18 | conditions that creators and other rights holders may use to share
 19 | original works of authorship and other material subject to copyright
 20 | and certain other rights specified in the public license below. The
 21 | following considerations are for informational purposes only, are not
 22 | exhaustive, and do not form part of our licenses.
 23 | 
 24 |      Considerations for licensors: Our public licenses are
 25 |      intended for use by those authorized to give the public
 26 |      permission to use material in ways otherwise restricted by
 27 |      copyright and certain other rights. Our licenses are
 28 |      irrevocable. Licensors should read and understand the terms
 29 |      and conditions of the license they choose before applying it.
 30 |      Licensors should also secure all rights necessary before
 31 |      applying our licenses so that the public can reuse the
 32 |      material as expected. Licensors should clearly mark any
 33 |      material not subject to the license. This includes other CC-
 34 |      licensed material, or material used under an exception or
 35 |      limitation to copyright. More considerations for licensors:
 36 | 	wiki.creativecommons.org/Considerations_for_licensors
 37 | 
 38 |      Considerations for the public: By using one of our public
 39 |      licenses, a licensor grants the public permission to use the
 40 |      licensed material under specified terms and conditions. If
 41 |      the licensor's permission is not necessary for any reason--for
 42 |      example, because of any applicable exception or limitation to
 43 |      copyright--then that use is not regulated by the license. Our
 44 |      licenses grant only permissions under copyright and certain
 45 |      other rights that a licensor has authority to grant. Use of
 46 |      the licensed material may still be restricted for other
 47 |      reasons, including because others have copyright or other
 48 |      rights in the material. A licensor may make special requests,
 49 |      such as asking that all changes be marked or described.
 50 |      Although not required by our licenses, you are encouraged to
 51 |      respect those requests where reasonable. More_considerations
 52 |      for the public: 
 53 | 	wiki.creativecommons.org/Considerations_for_licensees
 54 | 
 55 | =======================================================================
 56 | 
 57 | Creative Commons Attribution 4.0 International Public License
 58 | 
 59 | By exercising the Licensed Rights (defined below), You accept and agree
 60 | to be bound by the terms and conditions of this Creative Commons
 61 | Attribution 4.0 International Public License ("Public License"). To the
 62 | extent this Public License may be interpreted as a contract, You are
 63 | granted the Licensed Rights in consideration of Your acceptance of
 64 | these terms and conditions, and the Licensor grants You such rights in
 65 | consideration of benefits the Licensor receives from making the
 66 | Licensed Material available under these terms and conditions.
 67 | 
 68 | 
 69 | Section 1 -- Definitions.
 70 | 
 71 |   a. Adapted Material means material subject to Copyright and Similar
 72 |      Rights that is derived from or based upon the Licensed Material
 73 |      and in which the Licensed Material is translated, altered,
 74 |      arranged, transformed, or otherwise modified in a manner requiring
 75 |      permission under the Copyright and Similar Rights held by the
 76 |      Licensor. For purposes of this Public License, where the Licensed
 77 |      Material is a musical work, performance, or sound recording,
 78 |      Adapted Material is always produced where the Licensed Material is
 79 |      synched in timed relation with a moving image.
 80 | 
 81 |   b. Adapter's License means the license You apply to Your Copyright
 82 |      and Similar Rights in Your contributions to Adapted Material in
 83 |      accordance with the terms and conditions of this Public License.
 84 | 
 85 |   c. Copyright and Similar Rights means copyright and/or similar rights
 86 |      closely related to copyright including, without limitation,
 87 |      performance, broadcast, sound recording, and Sui Generis Database
 88 |      Rights, without regard to how the rights are labeled or
 89 |      categorized. For purposes of this Public License, the rights
 90 |      specified in Section 2(b)(1)-(2) are not Copyright and Similar
 91 |      Rights.
 92 | 
 93 |   d. Effective Technological Measures means those measures that, in the
 94 |      absence of proper authority, may not be circumvented under laws
 95 |      fulfilling obligations under Article 11 of the WIPO Copyright
 96 |      Treaty adopted on December 20, 1996, and/or similar international
 97 |      agreements.
 98 | 
 99 |   e. Exceptions and Limitations means fair use, fair dealing, and/or
100 |      any other exception or limitation to Copyright and Similar Rights
101 |      that applies to Your use of the Licensed Material.
102 | 
103 |   f. Licensed Material means the artistic or literary work, database,
104 |      or other material to which the Licensor applied this Public
105 |      License.
106 | 
107 |   g. Licensed Rights means the rights granted to You subject to the
108 |      terms and conditions of this Public License, which are limited to
109 |      all Copyright and Similar Rights that apply to Your use of the
110 |      Licensed Material and that the Licensor has authority to license.
111 | 
112 |   h. Licensor means the individual(s) or entity(ies) granting rights
113 |      under this Public License.
114 | 
115 |   i. Share means to provide material to the public by any means or
116 |      process that requires permission under the Licensed Rights, such
117 |      as reproduction, public display, public performance, distribution,
118 |      dissemination, communication, or importation, and to make material
119 |      available to the public including in ways that members of the
120 |      public may access the material from a place and at a time
121 |      individually chosen by them.
122 | 
123 |   j. Sui Generis Database Rights means rights other than copyright
124 |      resulting from Directive 96/9/EC of the European Parliament and of
125 |      the Council of 11 March 1996 on the legal protection of databases,
126 |      as amended and/or succeeded, as well as other essentially
127 |      equivalent rights anywhere in the world.
128 | 
129 |   k. You means the individual or entity exercising the Licensed Rights
130 |      under this Public License. Your has a corresponding meaning.
131 | 
132 | 
133 | Section 2 -- Scope.
134 | 
135 |   a. License grant.
136 | 
137 |        1. Subject to the terms and conditions of this Public License,
138 |           the Licensor hereby grants You a worldwide, royalty-free,
139 |           non-sublicensable, non-exclusive, irrevocable license to
140 |           exercise the Licensed Rights in the Licensed Material to:
141 | 
142 |             a. reproduce and Share the Licensed Material, in whole or
143 |                in part; and
144 | 
145 |             b. produce, reproduce, and Share Adapted Material.
146 | 
147 |        2. Exceptions and Limitations. For the avoidance of doubt, where
148 |           Exceptions and Limitations apply to Your use, this Public
149 |           License does not apply, and You do not need to comply with
150 |           its terms and conditions.
151 | 
152 |        3. Term. The term of this Public License is specified in Section
153 |           6(a).
154 | 
155 |        4. Media and formats; technical modifications allowed. The
156 |           Licensor authorizes You to exercise the Licensed Rights in
157 |           all media and formats whether now known or hereafter created,
158 |           and to make technical modifications necessary to do so. The
159 |           Licensor waives and/or agrees not to assert any right or
160 |           authority to forbid You from making technical modifications
161 |           necessary to exercise the Licensed Rights, including
162 |           technical modifications necessary to circumvent Effective
163 |           Technological Measures. For purposes of this Public License,
164 |           simply making modifications authorized by this Section 2(a)
165 |           (4) never produces Adapted Material.
166 | 
167 |        5. Downstream recipients.
168 | 
169 |             a. Offer from the Licensor -- Licensed Material. Every
170 |                recipient of the Licensed Material automatically
171 |                receives an offer from the Licensor to exercise the
172 |                Licensed Rights under the terms and conditions of this
173 |                Public License.
174 | 
175 |             b. No downstream restrictions. You may not offer or impose
176 |                any additional or different terms or conditions on, or
177 |                apply any Effective Technological Measures to, the
178 |                Licensed Material if doing so restricts exercise of the
179 |                Licensed Rights by any recipient of the Licensed
180 |                Material.
181 | 
182 |        6. No endorsement. Nothing in this Public License constitutes or
183 |           may be construed as permission to assert or imply that You
184 |           are, or that Your use of the Licensed Material is, connected
185 |           with, or sponsored, endorsed, or granted official status by,
186 |           the Licensor or others designated to receive attribution as
187 |           provided in Section 3(a)(1)(A)(i).
188 | 
189 |   b. Other rights.
190 | 
191 |        1. Moral rights, such as the right of integrity, are not
192 |           licensed under this Public License, nor are publicity,
193 |           privacy, and/or other similar personality rights; however, to
194 |           the extent possible, the Licensor waives and/or agrees not to
195 |           assert any such rights held by the Licensor to the limited
196 |           extent necessary to allow You to exercise the Licensed
197 |           Rights, but not otherwise.
198 | 
199 |        2. Patent and trademark rights are not licensed under this
200 |           Public License.
201 | 
202 |        3. To the extent possible, the Licensor waives any right to
203 |           collect royalties from You for the exercise of the Licensed
204 |           Rights, whether directly or through a collecting society
205 |           under any voluntary or waivable statutory or compulsory
206 |           licensing scheme. In all other cases the Licensor expressly
207 |           reserves any right to collect such royalties.
208 | 
209 | 
210 | Section 3 -- License Conditions.
211 | 
212 | Your exercise of the Licensed Rights is expressly made subject to the
213 | following conditions.
214 | 
215 |   a. Attribution.
216 | 
217 |        1. If You Share the Licensed Material (including in modified
218 |           form), You must:
219 | 
220 |             a. retain the following if it is supplied by the Licensor
221 |                with the Licensed Material:
222 | 
223 |                  i. identification of the creator(s) of the Licensed
224 |                     Material and any others designated to receive
225 |                     attribution, in any reasonable manner requested by
226 |                     the Licensor (including by pseudonym if
227 |                     designated);
228 | 
229 |                 ii. a copyright notice;
230 | 
231 |                iii. a notice that refers to this Public License;
232 | 
233 |                 iv. a notice that refers to the disclaimer of
234 |                     warranties;
235 | 
236 |                  v. a URI or hyperlink to the Licensed Material to the
237 |                     extent reasonably practicable;
238 | 
239 |             b. indicate if You modified the Licensed Material and
240 |                retain an indication of any previous modifications; and
241 | 
242 |             c. indicate the Licensed Material is licensed under this
243 |                Public License, and include the text of, or the URI or
244 |                hyperlink to, this Public License.
245 | 
246 |        2. You may satisfy the conditions in Section 3(a)(1) in any
247 |           reasonable manner based on the medium, means, and context in
248 |           which You Share the Licensed Material. For example, it may be
249 |           reasonable to satisfy the conditions by providing a URI or
250 |           hyperlink to a resource that includes the required
251 |           information.
252 | 
253 |        3. If requested by the Licensor, You must remove any of the
254 |           information required by Section 3(a)(1)(A) to the extent
255 |           reasonably practicable.
256 | 
257 |        4. If You Share Adapted Material You produce, the Adapter's
258 |           License You apply must not prevent recipients of the Adapted
259 |           Material from complying with this Public License.
260 | 
261 | 
262 | Section 4 -- Sui Generis Database Rights.
263 | 
264 | Where the Licensed Rights include Sui Generis Database Rights that
265 | apply to Your use of the Licensed Material:
266 | 
267 |   a. for the avoidance of doubt, Section 2(a)(1) grants You the right
268 |      to extract, reuse, reproduce, and Share all or a substantial
269 |      portion of the contents of the database;
270 | 
271 |   b. if You include all or a substantial portion of the database
272 |      contents in a database in which You have Sui Generis Database
273 |      Rights, then the database in which You have Sui Generis Database
274 |      Rights (but not its individual contents) is Adapted Material; and
275 | 
276 |   c. You must comply with the conditions in Section 3(a) if You Share
277 |      all or a substantial portion of the contents of the database.
278 | 
279 | For the avoidance of doubt, this Section 4 supplements and does not
280 | replace Your obligations under this Public License where the Licensed
281 | Rights include other Copyright and Similar Rights.
282 | 
283 | 
284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability.
285 | 
286 |   a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
287 |      EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
288 |      AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
289 |      ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
290 |      IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
291 |      WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
292 |      PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
293 |      ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
294 |      KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
295 |      ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
296 | 
297 |   b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
298 |      TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
299 |      NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
300 |      INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
301 |      COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
302 |      USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
303 |      ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
304 |      DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
305 |      IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
306 | 
307 |   c. The disclaimer of warranties and limitation of liability provided
308 |      above shall be interpreted in a manner that, to the extent
309 |      possible, most closely approximates an absolute disclaimer and
310 |      waiver of all liability.
311 | 
312 | 
313 | Section 6 -- Term and Termination.
314 | 
315 |   a. This Public License applies for the term of the Copyright and
316 |      Similar Rights licensed here. However, if You fail to comply with
317 |      this Public License, then Your rights under this Public License
318 |      terminate automatically.
319 | 
320 |   b. Where Your right to use the Licensed Material has terminated under
321 |      Section 6(a), it reinstates:
322 | 
323 |        1. automatically as of the date the violation is cured, provided
324 |           it is cured within 30 days of Your discovery of the
325 |           violation; or
326 | 
327 |        2. upon express reinstatement by the Licensor.
328 | 
329 |      For the avoidance of doubt, this Section 6(b) does not affect any
330 |      right the Licensor may have to seek remedies for Your violations
331 |      of this Public License.
332 | 
333 |   c. For the avoidance of doubt, the Licensor may also offer the
334 |      Licensed Material under separate terms or conditions or stop
335 |      distributing the Licensed Material at any time; however, doing so
336 |      will not terminate this Public License.
337 | 
338 |   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
339 |      License.
340 | 
341 | 
342 | Section 7 -- Other Terms and Conditions.
343 | 
344 |   a. The Licensor shall not be bound by any additional or different
345 |      terms or conditions communicated by You unless expressly agreed.
346 | 
347 |   b. Any arrangements, understandings, or agreements regarding the
348 |      Licensed Material not stated herein are separate from and
349 |      independent of the terms and conditions of this Public License.
350 | 
351 | 
352 | Section 8 -- Interpretation.
353 | 
354 |   a. For the avoidance of doubt, this Public License does not, and
355 |      shall not be interpreted to, reduce, limit, restrict, or impose
356 |      conditions on any use of the Licensed Material that could lawfully
357 |      be made without permission under this Public License.
358 | 
359 |   b. To the extent possible, if any provision of this Public License is
360 |      deemed unenforceable, it shall be automatically reformed to the
361 |      minimum extent necessary to make it enforceable. If the provision
362 |      cannot be reformed, it shall be severed from this Public License
363 |      without affecting the enforceability of the remaining terms and
364 |      conditions.
365 | 
366 |   c. No term or condition of this Public License will be waived and no
367 |      failure to comply consented to unless expressly agreed to by the
368 |      Licensor.
369 | 
370 |   d. Nothing in this Public License constitutes or may be interpreted
371 |      as a limitation upon, or waiver of, any privileges and immunities
372 |      that apply to the Licensor or You, including from the legal
373 |      processes of any jurisdiction or authority.
374 | 
375 | 
376 | =======================================================================
377 | 
378 | Creative Commons is not a party to its public
379 | licenses. Notwithstanding, Creative Commons may elect to apply one of
380 | its public licenses to material it publishes and in those instances
381 | will be considered the “Licensor.” The text of the Creative Commons
382 | public licenses is dedicated to the public domain under the CC0 Public
383 | Domain Dedication. Except for the limited purpose of indicating that
384 | material is shared under a Creative Commons public license or as
385 | otherwise permitted by the Creative Commons policies published at
386 | creativecommons.org/policies, Creative Commons does not authorize the
387 | use of the trademark "Creative Commons" or any other trademark or logo
388 | of Creative Commons without its prior written consent including,
389 | without limitation, in connection with any unauthorized modifications
390 | to any of its public licenses or any other arrangements,
391 | understandings, or agreements concerning use of licensed material. For
392 | the avoidance of doubt, this paragraph does not form part of the
393 | public licenses.
394 | 
395 | Creative Commons may be contacted at creativecommons.org.
396 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | SRC  = $(wildcard notebooks/*.ipynb)
2 | 
3 | strip:
4 | 	nbstripout notebooks/*.ipynb
5 | 
6 | requirements.txt: environment.yml
7 | 	python convert_deps.py
8 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # SciPy 2018 Tutorial: Pandas .head() to .tail()
 2 | 
 3 | [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/deniederhut/Pandas-Tutorial-SciPyConf-2018/master)
 4 | 
 5 | ### https://github.com/deniederhut/Pandas-Tutorial-SciPyConf-2018
 6 | 
 7 | Cluster URL: http://104.155.138.8
 8 | 
 9 | #### Presented by:
10 | - [Tom Augspurger](https://tomaugspurger.github.io/), [Anaconda, Inc.](https://anaconda.org/)
11 | - [Joris Van den Bossche](https://jorisvandenbossche.github.io/), [Université Paris-Saclay Center for Data Science](https://www.datascience-paris-saclay.fr/)
12 | - [Dillon Niederhut](https://dillon.niederhut.us), [Enthought Inc.](https://www.enthought.com)
13 | 
14 | 
15 | ## First-Time Setup
16 | 
17 | #### 1. Install Python
18 | 
19 | If you don't already have a working python distribution, you may download one of
20 | 
21 | * Miniconda ([https://conda.io/miniconda.html](https://conda.io/miniconda.html))
22 | * Python.org  ([https://www.python.org/downloads/](https://www.python.org/downloads/))
23 | 
24 | You'll need Python 3.
25 | 
26 | #### 2. Download Tutorial Materials
27 | 
28 | This GitHub repository is all that is needed in terms of tutorial content. The simplest solution is to download the material using this link:
29 | 
30 | [https://github.com/deniederhut/Pandas-Tutorial-SciPyConf-2018/archive/master.zip](https://github.com/deniederhut/Pandas-Tutorial-SciPyConf-2018/archive/master.zip)
31 | 
32 | If you're familiar with Git, you can also clone this repository with:
33 | 
34 | ```sh
35 | git clone git@github.com:deniederhut/Pandas-Tutorial-SciPyConf-2018.git
36 | ```
37 | 
38 | It will create a new folder named Pandas-Tutorial-SciPyConf-2018/ with all the
39 | content you will need, including:
40 | 
41 | - `requirements.txt` - the package requirements for this tutorial
42 | - `check_environment.py` - a script for testing your installation
43 | - `notebooks/` - the Jupyter notebooks we'll use during the tutoral
44 | 
45 | #### 3. Install Required Packages
46 | 
47 | If you are using conda, you can install the necessary packages by opening a terminal and entering the following:
48 | 
49 | ```sh
50 | conda update conda --yes
51 | conda --version  # Should be about 4.5.4
52 | conda env create --file=environment.yml
53 | conda activate pandas-scipy
54 | ```
55 | 
56 | If you are using Python from python.org or your system, you can install the necessary packages by opening a terminal and entering the following:
57 | 
58 | ```sh
59 | # Create a new environment
60 | python3 -m venv pandas-scipy
61 | source pandas-scipy/bin/activate
62 | 
63 | pip install -U pip wheel setuptools
64 | pip install -U -r requirements.txt
65 | ```
66 | 
67 | #### 4. Test the Installation
68 | 
69 | To make sure everything was installed correctly, open a terminal, and change its directory (`cd`) so that your working directory is `Pandas-Tutorial-SciPyConf-2018`. The enter the following:
70 | 
71 | ```sh
72 | python check_environment.py
73 | ```
74 | 
75 | #### 5. Start the Notebook
76 | 
77 | ```sh
78 | jupyter notebook
79 | ```
80 | 
81 | ## Questions? Problems?
82 | 
83 | You may post messages to the slack channel for this tutorial at: [https://scipy2018.slack.com](https://scipy2018.slack.com)
84 | 


--------------------------------------------------------------------------------
/check_environment.py:
--------------------------------------------------------------------------------
 1 | import importlib
 2 | 
 3 | packages = ['pandas', 'IPython', 'statsmodels', 'sklearn', 'seaborn',
 4 |             'toolz', 'requests', 'scipy']
 5 | 
 6 | bad = []
 7 | for package in packages:
 8 |     try:
 9 |         importlib.import_module(package)
10 |     except ImportError:
11 |         bad.append("Can't import %s" % package)
12 | else:
13 |     if len(bad) > 0:
14 |         print('\n'.join(bad))
15 |     else:
16 |         try:
17 |             import pandas as pd
18 |             df = pd.read_csv("notebooks/data/cpi.csv")
19 |             print("All good. Enjoy the tutorial!")
20 |         except Exception as e:
21 |             print("Couldn't read CPI")
22 |             print(e)
23 | 


--------------------------------------------------------------------------------
/convert_deps.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Convert the conda environment.yml to a pip requirements.txt
 3 | """
 4 | import yaml
 5 | 
 6 | exclude = {'python=3.6.6', 'nomkl'}
 7 | rename = {'pytables': 'tables'}
 8 | 
 9 | with open("environment.yml") as f:
10 |     dev = yaml.load(f)
11 | 
12 | required = dev['dependencies']
13 | required = [rename.get(dep, dep).replace("=", "==") for dep in required
14 |             if not isinstance(dep, dict)
15 |             and dep not in exclude]
16 | pip, = [x for x in dev['dependencies'] if isinstance(x, dict)]
17 | required.extend(pip['pip'])
18 | 
19 | 
20 | with open("requirements.txt", 'wt') as f:
21 |     f.write('\n'.join(required))
22 | 


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
 1 | name: pandas-scipy
 2 | channels:
 3 |   - conda-forge
 4 |   - defaults
 5 | dependencies:
 6 |   - python=3.6.6
 7 |   - numpy=1.14.5
 8 |   - pandas=0.23.1
 9 |   - matplotlib=2.2.2
10 |   - seaborn=0.8.1
11 |   - ipython=6.4.0
12 |   - jupyter=1.0.0
13 |   - notebook=5.5.0
14 |   - dask=0.18.1
15 |   - distributed=1.22.0
16 |   - toolz=0.9.0
17 |   - pandas-datareader=0.6.0
18 |   - scikit-learn=0.19.1
19 |   - scipy=1.1.0
20 |   - statsmodels=0.9.0
21 |   - xlrd=1.1.0
22 |   - pip:
23 |     - lifetimes==0.6.0.0
24 | 


--------------------------------------------------------------------------------
/notebooks/00-README.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Introduction"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Welcome to the course!\n",
 15 |     "This notebook will outline our structure for the course, and introduce you to the notebook if you haven't used it before."
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "markdown",
 20 |    "metadata": {},
 21 |    "source": [
 22 |     "## Background Expectations\n",
 23 |     "\n",
 24 |     "- Hopefully you've used Python before\n",
 25 |     "- Experience with NumPy will be helpful, but not required\n",
 26 |     "- Pandas will be the primary focus\n",
 27 |     "- We'll see bits of scikit-learn and statsmodels"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Course Format\n",
 35 |     "\n",
 36 |     "- We'll work through notebooks together (execute each cell)\n",
 37 |     "- You'll do exercises\n",
 38 |     "- During exercises, we'll follow-up on questions\n",
 39 |     "- We'll demonstrate the solutions"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "markdown",
 44 |    "metadata": {},
 45 |    "source": [
 46 |     "## Jupyter Notebook\n",
 47 |     "\n",
 48 |     "> The Jupyter Notebook is a web application that allows you to create and share documents that contain live code, equations, visualizations and explanatory text.\n",
 49 |     "\n",
 50 |     "- Two Modes: Edit and Command\n",
 51 |     "- Command -> Edit: `Enter`\n",
 52 |     "- Edit -> Command: `Esc`\n",
 53 |     "- Execute a Cell: `Shift+Enter`\n",
 54 |     "- Down: `j/Down Arrow`\n",
 55 |     "- Up: `k/Up Arrow`"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "markdown",
 60 |    "metadata": {},
 61 |    "source": [
 62 |     "## Tab Completion\n",
 63 |     "\n",
 64 |     "IPython will tab complete method names and function arguments\n",
 65 |     "\n",
 66 |     "Use `shift+tab` to inside a function call to show the signature"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": null,
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "# type str.<TAB>\n"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": null,
 81 |    "metadata": {},
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "# type str.split(<shift+TAB>)\n"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "markdown",
 89 |    "metadata": {},
 90 |    "source": [
 91 |     "## Exercises\n",
 92 |     "\n",
 93 |     "- Lots of small exercises to check understanding\n",
 94 |     "- Each exercise includes\n",
 95 |     "    + A prompt / question to be answered\n",
 96 |     "    + An empty cell for code\n",
 97 |     "    + A \"magic\" cell that loads a solution\n",
 98 |     "- Execute the magic cell twice"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "markdown",
103 |    "metadata": {},
104 |    "source": [
105 |     "<div class=\"alert alert-success\" data-title=\"Print 'Hello, world!'\">\n",
106 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Print 'Hello, world!'</h1>\n",
107 |     "</div>\n",
108 |     "\n",
109 |     "<p>Print the text \"Hello, world!\"</p>"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": null,
115 |    "metadata": {},
116 |    "outputs": [],
117 |    "source": [
118 |     "# Your code here\n"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "metadata": {},
125 |    "outputs": [],
126 |    "source": [
127 |     "# %load solutions/readme_00.py"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "markdown",
132 |    "metadata": {},
133 |    "source": [
134 |     "Make sure to run the solution cell twice. I'd encourage you to always\n",
135 |     "run the solution cell, as later steps in the notebooks will depend on earlier\n",
136 |     "steps being correct."
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "markdown",
141 |    "metadata": {},
142 |    "source": [
143 |     "## Pandas Cheat Sheet\n",
144 |     "\n",
145 |     "https://github.com/pandas-dev/pandas/blob/master/doc/cheatsheet/Pandas_Cheat_Sheet.pdf\n",
146 |     "\n",
147 |     "![cheat sheet](figures/cheat-sheet-preview.png)\n"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "markdown",
152 |    "metadata": {},
153 |    "source": [
154 |     "## Notebooks\n",
155 |     "\n",
156 |     "1. [Indexing](01-Indexing.ipynb)\n",
157 |     "2. [Alignment](02-Alignment.ipynb)\n",
158 |     "3. [Iterators & Groupby](03-Iterators-Groupby.ipynb)\n",
159 |     "4. [Visualization](04-Visualization.ipynb)\n",
160 |     "5. [Tidy Data](05-Tidy-Data.ipynb)\n",
161 |     "6. [Performance](06-Performance.ipynb)\n",
162 |     "7. [Timeseries](07-Timeseries.ipynb)\n",
163 |     "8. [Ecosystem](08-Pandas-NumPy-ScikitLearn.ipynb)"
164 |    ]
165 |   }
166 |  ],
167 |  "metadata": {
168 |   "kernelspec": {
169 |    "display_name": "Python 3",
170 |    "language": "python",
171 |    "name": "python3"
172 |   },
173 |   "language_info": {
174 |    "codemirror_mode": {
175 |     "name": "ipython",
176 |     "version": 3
177 |    },
178 |    "file_extension": ".py",
179 |    "mimetype": "text/x-python",
180 |    "name": "python",
181 |    "nbconvert_exporter": "python",
182 |    "pygments_lexer": "ipython3",
183 |    "version": "3.6.6"
184 |   }
185 |  },
186 |  "nbformat": 4,
187 |  "nbformat_minor": 2
188 | }
189 | 


--------------------------------------------------------------------------------
/notebooks/03-Groupby.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# The groupby operation"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import numpy as np\n",
 17 |     "import pandas as pd\n",
 18 |     "import matplotlib.pyplot as plt"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": null,
 24 |    "metadata": {},
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "%matplotlib inline"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": null,
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "plt.style.use('default')\n",
 37 |     "plt.rcParams['figure.figsize'] = (12, 6)\n",
 38 |     "pd.options.display.max_rows = 10"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "markdown",
 43 |    "metadata": {},
 44 |    "source": [
 45 |     "I've provided the reviews by the top 100 reviewers.\n",
 46 |     "We'll use it for talking about groupby."
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": null,
 52 |    "metadata": {},
 53 |    "outputs": [],
 54 |    "source": [
 55 |     "df = pd.read_csv(\"data/subset.csv.gz\", compression=\"gzip\",\n",
 56 |     "                 parse_dates=['time'])\n",
 57 |     "df.head()"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": null,
 63 |    "metadata": {},
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "df.info()"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "markdown",
 71 |    "metadata": {},
 72 |    "source": [
 73 |     "## Aside: Namespaces\n",
 74 |     "\n",
 75 |     "Pandas has been expanding its use of namespaces (or accessors) on `DataFrame` to group together related methods. This also limits the number of methods direclty attached to `DataFrame` itself, which can be overwhelming.\n",
 76 |     "\n",
 77 |     "Currently, we have these namespaces:\n",
 78 |     "\n",
 79 |     "- `.str`: defined on `Series` and `Index`es containing strings (object dtype)\n",
 80 |     "- `.dt`: defined on `Series` with `datetime` or `timedelta` dtype\n",
 81 |     "- `.cat`: defined on `Series` and `Indexes` with `category` dtype\n",
 82 |     "- `.plot`: defined on `Series` and `DataFrames`"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "metadata": {},
 88 |    "source": [
 89 |     "<div class=\"alert alert-success\" data-title=\"Reviews by Hour\">\n",
 90 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Reviews by Hour</h1>\n",
 91 |     "</div>\n",
 92 |     "\n",
 93 |     "<p>Make a barplot of the count of reviews by hour of the day.</p>"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "markdown",
 98 |    "metadata": {},
 99 |    "source": [
100 |     "- Hint: Use the `.dt` namespace to get the `hour` component of a `datetime`\n",
101 |     "- Hint: We've seen `Series.value_counts` for getting the count of each value\n",
102 |     "- Hint: Use `.sort_index` to make sure the data is ordered by hour, not count\n",
103 |     "- Hint: Use the [`.plot`](http://pandas.pydata.org/pandas-docs/stable/api.html#plotting) namespace to get a `bar` chart"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": null,
109 |    "metadata": {},
110 |    "outputs": [],
111 |    "source": []
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "%load solutions/groupby_03.py"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "markdown",
124 |    "metadata": {},
125 |    "source": [
126 |     "<div class=\"alert alert-success\" data-title=\"Pale Ales\">\n",
127 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Pale Ales</h1>\n",
128 |     "</div>\n",
129 |     "<p>\n",
130 |     "Make a variable `pale_ales` that filters `df` to just rows where `beer_style` contains the string `'pale ale'` (ignoring case)\n",
131 |     "</p>\n",
132 |     "- Hint: Use the `df.beer_style.str` namespace and find a method for checking whether a string contains another string."
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": null,
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": []
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": null,
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "%load solutions/groupby_04.py"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "markdown",
153 |    "metadata": {},
154 |    "source": [
155 |     "# Groupby"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "markdown",
160 |    "metadata": {},
161 |    "source": [
162 |     "Groupby operations come up in a lot of contexts.\n",
163 |     "At its root, groupby about doing an operation on many subsets of the data, each of which shares something in common.\n",
164 |     "The components of a groupby operation are:"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "markdown",
169 |    "metadata": {},
170 |    "source": [
171 |     "## Components of a groupby\n",
172 |     "\n",
173 |     "1. **split** a table into groups\n",
174 |     "2. **apply** a function to each group\n",
175 |     "3. **combine** the results into a single DataFrame or Series"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "markdown",
180 |    "metadata": {},
181 |    "source": [
182 |     "In pandas the `split` step looks like\n",
183 |     "\n",
184 |     "```python\n",
185 |     "df.groupby( grouper )\n",
186 |     "```\n",
187 |     "\n",
188 |     "`grouper` can be many things\n",
189 |     "\n",
190 |     "- Series (or string indicating a column in `df`)\n",
191 |     "- function (to be applied on the index)\n",
192 |     "- dict : groups by *values*\n",
193 |     "- `levels=[ names of levels in a MultiIndex ]`"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "markdown",
198 |    "metadata": {},
199 |    "source": [
200 |     "## Split\n",
201 |     "\n",
202 |     "Break a table into smaller logical tables according to some rule"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": null,
208 |    "metadata": {},
209 |    "outputs": [],
210 |    "source": [
211 |     "gr = df.groupby(\"beer_name\")\n",
212 |     "gr"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "markdown",
217 |    "metadata": {},
218 |    "source": [
219 |     "We haven't really done any actual work yet, but pandas knows what it needs to know to break the larger `df` into many smaller pieces, one for each distinct `beer_name`."
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "markdown",
224 |    "metadata": {},
225 |    "source": [
226 |     "## Apply & Combine\n",
227 |     "\n",
228 |     "To finish the groupby, we apply a method to the groupby object."
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": null,
234 |    "metadata": {},
235 |    "outputs": [],
236 |    "source": [
237 |     "review_cols = ['review_appearance', 'review_aroma', 'review_overall',\n",
238 |     "               'review_palate', 'review_taste']\n",
239 |     "\n",
240 |     "df.groupby('beer_name')[review_cols].agg('mean')"
241 |    ]
242 |   },
243 |   {
244 |    "cell_type": "markdown",
245 |    "metadata": {},
246 |    "source": [
247 |     "In this case, the function we applied was `'mean'`.\n",
248 |     "Pandas has implemented cythonized versions of certain common methods like mean, sum, etc.\n",
249 |     "You can also pass in regular functions like `np.mean`.\n",
250 |     "\n",
251 |     "In terms of split, apply, combine, split was `df.groupby('beer_name')`. \n",
252 |     "We apply the `mean` function by passing in `'mean'`.\n",
253 |     "Finally, by using the `.agg` method (for aggregate) we tell pandas to combine the results with one output row per group.\n",
254 |     "\n",
255 |     "You can also pass in regular functions like `np.mean`."
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": null,
261 |    "metadata": {},
262 |    "outputs": [],
263 |    "source": [
264 |     "df.groupby('beer_name')[review_cols].agg(np.mean).head()"
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "markdown",
269 |    "metadata": {},
270 |    "source": [
271 |     "Finally, [certain methods](http://pandas.pydata.org/pandas-docs/stable/api.html#id35) have been attached to `Groupby` objects."
272 |    ]
273 |   },
274 |   {
275 |    "cell_type": "code",
276 |    "execution_count": null,
277 |    "metadata": {},
278 |    "outputs": [],
279 |    "source": [
280 |     "df.groupby('beer_name')[review_cols].mean()"
281 |    ]
282 |   },
283 |   {
284 |    "cell_type": "markdown",
285 |    "metadata": {},
286 |    "source": [
287 |     "<div class=\"alert alert-success\" data-title=\"Highest Variance\">\n",
288 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Highest Variance</h1>\n",
289 |     "</div>\n",
290 |     "\n",
291 |     "<p>Find the `beer_style`s with the greatest variance in `abv`.</p>\n",
292 |     "\n",
293 |     "- hint: `.std` calculates the standard deviation (`.var` for variance), and is available on `GroupBy` objects like `gr.abv`.\n",
294 |     "- hint: use `.sort_values` to sort a Series by the values (it took us a while to come up with that name)"
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "code",
299 |    "execution_count": null,
300 |    "metadata": {},
301 |    "outputs": [],
302 |    "source": []
303 |   },
304 |   {
305 |    "cell_type": "code",
306 |    "execution_count": null,
307 |    "metadata": {},
308 |    "outputs": [],
309 |    "source": [
310 |     "%load solutions/groupby_abv.py"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "markdown",
315 |    "metadata": {},
316 |    "source": [
317 |     "## `.agg` output shape\n",
318 |     "\n",
319 |     "The output shape is determined by the grouper, data, and aggregation\n",
320 |     "\n",
321 |     "- Grouper: Controls the output index\n",
322 |     "    * single grouper -> Index\n",
323 |     "    * array-like grouper -> MultiIndex\n",
324 |     "- Subject (Groupee): Controls the output data values\n",
325 |     "    * single column -> Series (or DataFrame if multiple aggregations)\n",
326 |     "    * multiple columns -> DataFrame\n",
327 |     "- Aggregation: Controls the output columns\n",
328 |     "    * single aggfunc -> Index in the colums\n",
329 |     "    * multiple aggfuncs -> MultiIndex in the columns (Or 1-D Index if groupee is 1-D)"
330 |    ]
331 |   },
332 |   {
333 |    "cell_type": "markdown",
334 |    "metadata": {},
335 |    "source": [
336 |     "\n",
337 |     "We'll go into MultiIndexes in a bit, but for know, think of them as regular Indexes with multiple levels (columns)."
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "code",
342 |    "execution_count": null,
343 |    "metadata": {},
344 |    "outputs": [],
345 |    "source": [
346 |     "# single grouper, single groupee, single aggregation\n",
347 |     "df.groupby('beer_style').review_overall.agg('mean')"
348 |    ]
349 |   },
350 |   {
351 |    "cell_type": "code",
352 |    "execution_count": null,
353 |    "metadata": {},
354 |    "outputs": [],
355 |    "source": [
356 |     "# multiple groupers, multiple groupee, single aggregation\n",
357 |     "df.groupby(['brewer_id', 'beer_name'])[review_cols].agg(['mean', 'min', 'max', 'std', 'count'])"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "markdown",
362 |    "metadata": {},
363 |    "source": [
364 |     "<div class=\"alert alert-success\" data-title=\"Rating by length\">\n",
365 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Rating by length</h1>\n",
366 |     "</div>\n",
367 |     "\n",
368 |     "<p>Plot the relationship between review length (number of characters) and average `review_overall`.</p>\n",
369 |     "\n",
370 |     "- Hint: use `.plot(style='k.')`\n",
371 |     "- We've grouped by columns so far, you can also group by any series with the same length"
372 |    ]
373 |   },
374 |   {
375 |    "cell_type": "code",
376 |    "execution_count": null,
377 |    "metadata": {},
378 |    "outputs": [],
379 |    "source": []
380 |   },
381 |   {
382 |    "cell_type": "code",
383 |    "execution_count": null,
384 |    "metadata": {},
385 |    "outputs": [],
386 |    "source": [
387 |     "%load solutions/groupby_00.py"
388 |    ]
389 |   },
390 |   {
391 |    "cell_type": "markdown",
392 |    "metadata": {},
393 |    "source": [
394 |     "<div class=\"alert alert-success\" data-title=\"Reviews by Length\">\n",
395 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Reviews by Length</h1>\n",
396 |     "</div>\n",
397 |     "\n",
398 |     "<p>Find the relationship between review length (number of **words** and average `review_overall`.)</p>\n",
399 |     "\n",
400 |     "- Hint: You can pass a [regular expression](https://docs.python.org/3/howto/regex.html#matching-characters) to any of the `.str` methods."
401 |    ]
402 |   },
403 |   {
404 |    "cell_type": "code",
405 |    "execution_count": null,
406 |    "metadata": {},
407 |    "outputs": [],
408 |    "source": []
409 |   },
410 |   {
411 |    "cell_type": "code",
412 |    "execution_count": null,
413 |    "metadata": {},
414 |    "outputs": [],
415 |    "source": [
416 |     "%load solutions/groupby_00b.py"
417 |    ]
418 |   },
419 |   {
420 |    "cell_type": "markdown",
421 |    "metadata": {},
422 |    "source": [
423 |     "<div class=\"alert alert-success\" data-title=\"Rating by number of Reviews\">\n",
424 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Rating by number of Reviews</h1>\n",
425 |     "</div>\n",
426 |     "\n",
427 |     "<p>Find the relationship between the number of reviews for a beer and the average `review_overall`.</p>\n"
428 |    ]
429 |   },
430 |   {
431 |    "cell_type": "code",
432 |    "execution_count": null,
433 |    "metadata": {},
434 |    "outputs": [],
435 |    "source": []
436 |   },
437 |   {
438 |    "cell_type": "code",
439 |    "execution_count": null,
440 |    "metadata": {},
441 |    "outputs": [],
442 |    "source": [
443 |     "%load solutions/groupby_01.py"
444 |    ]
445 |   },
446 |   {
447 |    "cell_type": "markdown",
448 |    "metadata": {},
449 |    "source": [
450 |     "## Transform\n",
451 |     "\n",
452 |     "A *transform* is a function whose output is the same shape as the input."
453 |    ]
454 |   },
455 |   {
456 |    "cell_type": "markdown",
457 |    "metadata": {},
458 |    "source": [
459 |     "Recall that a groupby has three steps: split, apply, combine.\n",
460 |     "So far, all of the functions we've applied have been *aggregations*: the rule for \"combine\" is one row per group.\n",
461 |     "\n",
462 |     "You can use `Groupby.transform` when you have an operation that should be done *groupwise*, but the result should be the same shape.\n",
463 |     "For example, suppose we wanted to normalize each reviewer's scores by their average score. "
464 |    ]
465 |   },
466 |   {
467 |    "cell_type": "code",
468 |    "execution_count": null,
469 |    "metadata": {},
470 |    "outputs": [],
471 |    "source": [
472 |     "# Define demean(v: array) -> array\n",
473 |     "def demean(v):\n",
474 |     "    return v - v.mean()"
475 |    ]
476 |   },
477 |   {
478 |    "cell_type": "markdown",
479 |    "metadata": {},
480 |    "source": [
481 |     "Just calling `demean` on the entire Series will normalize by the *global* average."
482 |    ]
483 |   },
484 |   {
485 |    "cell_type": "code",
486 |    "execution_count": null,
487 |    "metadata": {},
488 |    "outputs": [],
489 |    "source": [
490 |     "demean(df.review_overall)"
491 |    ]
492 |   },
493 |   {
494 |    "cell_type": "markdown",
495 |    "metadata": {},
496 |    "source": [
497 |     "Now, let's demean each individual's reviews by their own average.\n",
498 |     "This could be useful if, for example, you were building a recommendation system.\n",
499 |     "A rating of 4 from someone's whose average is 2 is in some sense more meaningful that a 4 from someone who always gives 4s."
500 |    ]
501 |   },
502 |   {
503 |    "cell_type": "code",
504 |    "execution_count": null,
505 |    "metadata": {},
506 |    "outputs": [],
507 |    "source": [
508 |     "normalized = df.groupby(\"profile_name\")[review_cols].transform(demean)\n",
509 |     "normalized.head()"
510 |    ]
511 |   },
512 |   {
513 |    "cell_type": "markdown",
514 |    "metadata": {},
515 |    "source": [
516 |     "We used `.transform` because the desired output was the same shape as the input.\n",
517 |     "Just like `.agg` informs pandas that you want `1 input group → 1 output row`, the `.transform` method informs pandas that you want `1 input row → 1 output row`.\n",
518 |     "\n",
519 |     "`.transform` operates on each column independently."
520 |    ]
521 |   },
522 |   {
523 |    "cell_type": "markdown",
524 |    "metadata": {},
525 |    "source": [
526 |     "<div class=\"alert alert-success\" data-title=\"Personal Trend?\">\n",
527 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Personal Trend?</h1>\n",
528 |     "</div>\n",
529 |     "\n",
530 |     "<p>Do reviewer's `review_overall` trend over a person's time reviewing?</p>\n",
531 |     "\n",
532 |     "Hint: Need an indictor that tracks which review this is for that person. That is, we need a cumulative count of reviews per person."
533 |    ]
534 |   },
535 |   {
536 |    "cell_type": "code",
537 |    "execution_count": null,
538 |    "metadata": {},
539 |    "outputs": [],
540 |    "source": []
541 |   },
542 |   {
543 |    "cell_type": "code",
544 |    "execution_count": null,
545 |    "metadata": {},
546 |    "outputs": [],
547 |    "source": [
548 |     "%load solutions/groupby_02.py"
549 |    ]
550 |   },
551 |   {
552 |    "cell_type": "markdown",
553 |    "metadata": {},
554 |    "source": [
555 |     "## General `.apply`\n",
556 |     "\n",
557 |     "We've seen `.agg` for outputting 1 row per group, and `.transform` for outputting 1 row per input row.\n",
558 |     "\n",
559 |     "The final kind of function application is `.apply`.\n",
560 |     "This can do pretty much whatever you want.\n",
561 |     "We'll see an example in a later notebook."
562 |    ]
563 |   },
564 |   {
565 |    "cell_type": "markdown",
566 |    "metadata": {},
567 |    "source": [
568 |     "## Summary\n",
569 |     "\n",
570 |     "- We used groupby to analyze data by subsets\n",
571 |     "- We used `agg` to summarize groups and `transform` to perform group-wise transformations"
572 |    ]
573 |   }
574 |  ],
575 |  "metadata": {
576 |   "kernelspec": {
577 |    "display_name": "Python 3",
578 |    "language": "python",
579 |    "name": "python3"
580 |   },
581 |   "language_info": {
582 |    "codemirror_mode": {
583 |     "name": "ipython",
584 |     "version": 3
585 |    },
586 |    "file_extension": ".py",
587 |    "mimetype": "text/x-python",
588 |    "name": "python",
589 |    "nbconvert_exporter": "python",
590 |    "pygments_lexer": "ipython3",
591 |    "version": "3.5.5"
592 |   }
593 |  },
594 |  "nbformat": 4,
595 |  "nbformat_minor": 1
596 | }
597 | 


--------------------------------------------------------------------------------
/notebooks/05-Timeseries.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Timeseries"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "Pandas started out in the financial world, so it naturally has strong support for timeseries data.\n",
 15 |     "We'll look at some pandas data types and methods for manipulating timeseries data.\n",
 16 |     "Afterwords, we'll use [statsmodels' state space framework](http://www.statsmodels.org/stable/statespace.html) to model timeseries data."
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": null,
 22 |    "metadata": {},
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "import numpy as np\n",
 26 |     "import pandas as pd\n",
 27 |     "import seaborn as sns\n",
 28 |     "import matplotlib.pyplot as plt"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": null,
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "%matplotlib inline"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": null,
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "plt.style.use('default')\n",
 47 |     "plt.rcParams['figure.figsize'] = (12, 6)\n",
 48 |     "pd.options.display.max_rows = 10"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "markdown",
 53 |    "metadata": {},
 54 |    "source": [
 55 |     "## Datatypes\n",
 56 |     "\n",
 57 |     "- `pd.Timestamp` (nanosecond resolution `datetime.datetime`)\n",
 58 |     "- `pd.Timedelta` (nanosecond resolution `datetime.timedelta`)"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "markdown",
 63 |    "metadata": {},
 64 |    "source": [
 65 |     "Pandas provides highly-performant (mostly) drop-in replacements for `datetime.datetime` (`pd.Timestamp`) and `datetime.tiemedelta` (`pd.Timedelta`).\n",
 66 |     "These have been tailored for efficient storage in NumPy arrays.\n",
 67 |     "For the most part you'll be working with `DatetimeIndex`es or `TimedeltaIndex`es, or Series / DataFrames containing these.\n",
 68 |     "\n",
 69 |     "The biggest limitation is that pandas stores `Timestamp`s at nanosecond resolution. Since they're backed by NumPy's 64-bit integer, the minimum and maximum values are"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "code",
 74 |    "execution_count": null,
 75 |    "metadata": {},
 76 |    "outputs": [],
 77 |    "source": [
 78 |     "pd.Timestamp.min, pd.Timestamp.max"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "markdown",
 83 |    "metadata": {},
 84 |    "source": [
 85 |     "If this is a problem, [there are workarounds](http://pandas.pydata.org/pandas-docs/stable/timeseries.html#representing-out-of-bounds-spans)."
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "markdown",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "We'll go back to the BTS data set on flights.\n",
 93 |     "This time I've provided the number of flights per hour for two airports in Chicago: Midway (MDW) and O'Hare (ORD). The data go back to January 1st, 2000."
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": null,
 99 |    "metadata": {},
100 |    "outputs": [],
101 |    "source": [
102 |     "df = pd.read_csv(\"data/flights-ts.csv.gz\", index_col=0, parse_dates=True)\n",
103 |     "df.head()"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "markdown",
108 |    "metadata": {},
109 |    "source": [
110 |     "## Resampling\n",
111 |     "\n",
112 |     "Resampling is similar to a groupby, but specialized for datetimes.\n",
113 |     "Instead of specifying a column of values to group by, you  specify a `rule`: the desired output frequency.\n",
114 |     "The original data is binned into each group created by your rule."
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": null,
120 |    "metadata": {},
121 |    "outputs": [],
122 |    "source": [
123 |     "resampler = df.resample(\"MS\")  # MS=Month Start\n",
124 |     "resampler"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "There's an extensive list of frequency codes: http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases.\n",
132 |     "\n",
133 |     "If you examine the raw data in `df`, you'll notice that it's not at a fixed frequency.\n",
134 |     "Hours where there weren't any flights just simply aren't present.\n",
135 |     "This isn't a problem though; resample is perfect for going from \"ragged\" timeseries data to fixed-frequency data.\n",
136 |     "\n",
137 |     "Just like with `.groupby`, `.resample` returns a deferred object that hasn't really done any work yet.\n",
138 |     "It has methods for aggregation, transformation, and general function application."
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "metadata": {},
145 |    "outputs": [],
146 |    "source": [
147 |     "resampler.sum()"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "resampler.sum().plot();"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "markdown",
161 |    "metadata": {},
162 |    "source": [
163 |     "<div class=\"alert alert-success\" data-title=\"Resample\">\n",
164 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Resample</h1>\n",
165 |     "</div>\n",
166 |     "<p>Plot the standard deviation for the number of flights from `MDW` and `ORD` at a weekly frequency</p>"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": null,
172 |    "metadata": {},
173 |    "outputs": [],
174 |    "source": [
175 |     "# Your solution\n"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": null,
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": [
184 |     "%load solutions/timeseries_resample.py"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "markdown",
189 |    "metadata": {},
190 |    "source": [
191 |     "<div class=\"alert alert-success\" data-title=\"Resample-Agg\">\n",
192 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Resample-Agg</h1>\n",
193 |     "</div>\n",
194 |     "<p>Compute the the total number of flights (sum), mean, and median flights *per Quarter*.</p>"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": null,
200 |    "metadata": {},
201 |    "outputs": [],
202 |    "source": []
203 |   },
204 |   {
205 |    "cell_type": "code",
206 |    "execution_count": null,
207 |    "metadata": {},
208 |    "outputs": [],
209 |    "source": [
210 |     "%load solutions/timeseries_resample_agg.py"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "markdown",
215 |    "metadata": {},
216 |    "source": [
217 |     "## Rolling, Expanding\n",
218 |     "\n",
219 |     "Applying functions to windows, moving through your data."
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "markdown",
224 |    "metadata": {},
225 |    "source": [
226 |     "These are very similar to groupby and resample. Let's get the daily number of flights with a `resample` quick."
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": null,
232 |    "metadata": {},
233 |    "outputs": [],
234 |    "source": [
235 |     "daily = df.resample('D').sum()\n",
236 |     "daily"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "markdown",
241 |    "metadata": {},
242 |    "source": [
243 |     "Suppose you wanted a 30-day moving (or rolling) average.\n",
244 |     "This is possible with the `.rolling` method. Like `groupby` and `resample`, this object is just going to store the information to know what subset of data to operate on next; it doesn't actually do any work yet:"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": null,
250 |    "metadata": {},
251 |    "outputs": [],
252 |    "source": [
253 |     "daily.rolling(30, center=True)"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "markdown",
258 |    "metadata": {},
259 |    "source": [
260 |     "The first argument is the window size.\n",
261 |     "Since `daily` is at daily frequency, 30 means a 30-day window.\n",
262 |     "`center=True` says to label each window with the middle-most point.\n",
263 |     "To actually do work, you call a method like `.mean`;"
264 |    ]
265 |   },
266 |   {
267 |    "cell_type": "code",
268 |    "execution_count": null,
269 |    "metadata": {},
270 |    "outputs": [],
271 |    "source": [
272 |     "fig, ax = plt.subplots()\n",
273 |     "daily.rolling(30).mean().rename(columns=lambda x: x + \" (30D MA)\").plot(ax=ax, alpha=.25,\n",
274 |     "                                                                        color=['C0', 'C1'])\n",
275 |     "daily.plot(ax=ax, alpha=.25, color=['C0', 'C1'], legend=False);"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "markdown",
280 |    "metadata": {},
281 |    "source": [
282 |     "It's common to combine resampling and rolling."
283 |    ]
284 |   },
285 |   {
286 |    "cell_type": "code",
287 |    "execution_count": null,
288 |    "metadata": {},
289 |    "outputs": [],
290 |    "source": [
291 |     "df.resample(\"D\").sum().rolling(30).corr(pairwise=True).xs(\"MDW\", level=1)['ORD'].plot(\n",
292 |     "    title=\"O'Hare : Midway cross-correlation (30D MA)\", figsize=(12, 4)\n",
293 |     ");"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "markdown",
298 |    "metadata": {},
299 |    "source": [
300 |     "## Timezones\n",
301 |     "\n",
302 |     "pandas can store an array of datetimes with a common timezone.\n",
303 |     "Right now the index for `df` is timezone naïve, but we can convert to a timezone with `tz_convert`:"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": null,
309 |    "metadata": {},
310 |    "outputs": [],
311 |    "source": [
312 |     "df.index.tzinfo  # None, timezone naïve"
313 |    ]
314 |   },
315 |   {
316 |    "cell_type": "code",
317 |    "execution_count": null,
318 |    "metadata": {},
319 |    "outputs": [],
320 |    "source": [
321 |     "df.index.tz_localize(\"US/Central\")"
322 |    ]
323 |   },
324 |   {
325 |    "cell_type": "markdown",
326 |    "metadata": {},
327 |    "source": [
328 |     "Timezones, as usual, are annoying to deal with.\n",
329 |     "We've hit a daylight savings time issue.\n",
330 |     "As the error says, 2000-04-02T02:00:00 isn't actaully a valid time in US/Central.\n",
331 |     "I checked the BTS website, and these timestamps are supposed to be local time, so presumably some data was recorded incorrectly.\n",
332 |     "pandas is strict by default, so it we need to tell it to ignore those errors: "
333 |    ]
334 |   },
335 |   {
336 |    "cell_type": "code",
337 |    "execution_count": null,
338 |    "metadata": {},
339 |    "outputs": [],
340 |    "source": [
341 |     "idx = df.index.tz_localize(\"US/Central\", ambiguous=\"NaT\", errors='coerce')\n",
342 |     "idx"
343 |    ]
344 |   },
345 |   {
346 |    "cell_type": "code",
347 |    "execution_count": null,
348 |    "metadata": {},
349 |    "outputs": [],
350 |    "source": [
351 |     "pd.isnull(idx).sum()  # 25 bad values"
352 |    ]
353 |   },
354 |   {
355 |    "cell_type": "markdown",
356 |    "metadata": {},
357 |    "source": [
358 |     "Notice the dtype: `datetime64[ns, US/Central]`.\n",
359 |     "That means nanosecond resolution in the US/Central time zone.\n",
360 |     "Once you have a datetime with timezone, you can convert timezones with `tz_convert`:"
361 |    ]
362 |   },
363 |   {
364 |    "cell_type": "code",
365 |    "execution_count": null,
366 |    "metadata": {},
367 |    "outputs": [],
368 |    "source": [
369 |     "idx.tz_convert(\"UTC\")"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "markdown",
374 |    "metadata": {},
375 |    "source": [
376 |     "## Offsets\n",
377 |     "\n",
378 |     "I wish the standard library `datetime` module had something like this.\n",
379 |     "Let's generate some fake data with `pd.date_range`"
380 |    ]
381 |   },
382 |   {
383 |    "cell_type": "code",
384 |    "execution_count": null,
385 |    "metadata": {},
386 |    "outputs": [],
387 |    "source": [
388 |     "dates = pd.date_range(\"2016-01-01\", end=\"2016-12-31\", freq='D')\n",
389 |     "dates"
390 |    ]
391 |   },
392 |   {
393 |    "cell_type": "markdown",
394 |    "metadata": {},
395 |    "source": [
396 |     "There are a whole bunch of offsets available in the `pd.tseries.offsets` namespace. For example, to move 3 business days into the future:"
397 |    ]
398 |   },
399 |   {
400 |    "cell_type": "code",
401 |    "execution_count": null,
402 |    "metadata": {},
403 |    "outputs": [],
404 |    "source": [
405 |     "dates + pd.tseries.offsets.BDay(3)"
406 |    ]
407 |   },
408 |   {
409 |    "cell_type": "markdown",
410 |    "metadata": {},
411 |    "source": [
412 |     "Or to move to the next month end:"
413 |    ]
414 |   },
415 |   {
416 |    "cell_type": "code",
417 |    "execution_count": null,
418 |    "metadata": {},
419 |    "outputs": [],
420 |    "source": [
421 |     "dates + pd.tseries.offsets.MonthEnd()"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "markdown",
426 |    "metadata": {},
427 |    "source": [
428 |     "## Timedelta Math\n",
429 |     "\n",
430 |     "Being able to add columns of dates and timedeltas turns out to be quite convenient.\n",
431 |     "Let's go all the way back to our first example with flight delays from New York airports."
432 |    ]
433 |   },
434 |   {
435 |    "cell_type": "code",
436 |    "execution_count": null,
437 |    "metadata": {},
438 |    "outputs": [],
439 |    "source": [
440 |     "flights = pd.read_csv(\"data/ny-flights.csv.gz\", parse_dates=['dep', 'arr'])\n",
441 |     "flights.head()"
442 |    ]
443 |   },
444 |   {
445 |    "cell_type": "markdown",
446 |    "metadata": {},
447 |    "source": [
448 |     "<div class=\"alert alert-success\" data-title=\"Convert Timedelta\">\n",
449 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Convert Timedelta</h1>\n",
450 |     "</div>\n",
451 |     "<p>Convert `flights.dep_delay` and `flights.arr_delay` to timedelta dtype.</p>\n",
452 |     "\n",
453 |     "- Hint: recall our type conversion methods: `pd.to_*`\n",
454 |     "- Make new columns in `flights` called `dep_delay_td` and `arr_delay_td`\n",
455 |     "- Check the `unit` argument for the conversion method. The delay columns are in *minutes*."
456 |    ]
457 |   },
458 |   {
459 |    "cell_type": "code",
460 |    "execution_count": null,
461 |    "metadata": {},
462 |    "outputs": [],
463 |    "source": [
464 |     "# Your solution\n"
465 |    ]
466 |   },
467 |   {
468 |    "cell_type": "code",
469 |    "execution_count": null,
470 |    "metadata": {},
471 |    "outputs": [],
472 |    "source": [
473 |     "%load solutions/timeseries_timedelta.py"
474 |    ]
475 |   },
476 |   {
477 |    "cell_type": "markdown",
478 |    "metadata": {},
479 |    "source": [
480 |     "<div class=\"alert alert-success\" data-title=\"Timedelta Math\">\n",
481 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Timedelta Math</h1>\n",
482 |     "</div>\n",
483 |     "<p>Compute the actual time the flight left, but adding the departure time `dep` and the delay `dep_delay`."
484 |    ]
485 |   },
486 |   {
487 |    "cell_type": "code",
488 |    "execution_count": null,
489 |    "metadata": {},
490 |    "outputs": [],
491 |    "source": []
492 |   },
493 |   {
494 |    "cell_type": "code",
495 |    "execution_count": null,
496 |    "metadata": {},
497 |    "outputs": [],
498 |    "source": [
499 |     "%load solutions/timeseries_departure.py"
500 |    ]
501 |   },
502 |   {
503 |    "cell_type": "markdown",
504 |    "metadata": {},
505 |    "source": [
506 |     "# Modeling Timeseries\n",
507 |     "\n",
508 |     "Timeseries are an interesting problem to model.\n",
509 |     "If we're lucky, we have a long history of past data that we can (maybe) use to predict the future.\n",
510 |     "We can exploit regularity in the timeseries (seasonal patterns, periods of high values are typically followed by another high value, etc.) to better predict the future.\n",
511 |     "\n",
512 |     "Statsmodels has a nice framework for fitting timeseries models and evaluating their output."
513 |    ]
514 |   },
515 |   {
516 |    "cell_type": "code",
517 |    "execution_count": null,
518 |    "metadata": {},
519 |    "outputs": [],
520 |    "source": [
521 |     "import statsmodels.formula.api as smf\n",
522 |     "import statsmodels.tsa.api as smt\n",
523 |     "import statsmodels.api as sm"
524 |    ]
525 |   },
526 |   {
527 |    "cell_type": "markdown",
528 |    "metadata": {},
529 |    "source": [
530 |     "Let's model Monthly flights from `ORD`."
531 |    ]
532 |   },
533 |   {
534 |    "cell_type": "code",
535 |    "execution_count": null,
536 |    "metadata": {},
537 |    "outputs": [],
538 |    "source": [
539 |     "y = daily.ORD.resample(\"MS\").sum()\n",
540 |     "y.plot();"
541 |    ]
542 |   },
543 |   {
544 |    "cell_type": "markdown",
545 |    "metadata": {},
546 |    "source": [
547 |     "That final value is odd because it's not a complete month. Let's drop it."
548 |    ]
549 |   },
550 |   {
551 |    "cell_type": "code",
552 |    "execution_count": null,
553 |    "metadata": {},
554 |    "outputs": [],
555 |    "source": [
556 |     "y = daily.ORD.resample(\"MS\").sum().iloc[:-1]\n",
557 |     "y.head()"
558 |    ]
559 |   },
560 |   {
561 |    "cell_type": "markdown",
562 |    "metadata": {},
563 |    "source": [
564 |     "It's common to estimate the parameters on *differenced* values.\n",
565 |     "That is, make a new series $y'$ where $y_t' = y_t - y_{t-1}$. Pandas makes this simple with the `.diff` method."
566 |    ]
567 |   },
568 |   {
569 |    "cell_type": "code",
570 |    "execution_count": null,
571 |    "metadata": {},
572 |    "outputs": [],
573 |    "source": [
574 |     "y_prime = y.diff()\n",
575 |     "y_prime.head()"
576 |    ]
577 |   },
578 |   {
579 |    "cell_type": "markdown",
580 |    "metadata": {},
581 |    "source": [
582 |     "We'll drop that first NaN:"
583 |    ]
584 |   },
585 |   {
586 |    "cell_type": "code",
587 |    "execution_count": null,
588 |    "metadata": {},
589 |    "outputs": [],
590 |    "source": [
591 |     "y_prime = y.diff().dropna()\n",
592 |     "y_prime.plot();"
593 |    ]
594 |   },
595 |   {
596 |    "cell_type": "markdown",
597 |    "metadata": {},
598 |    "source": [
599 |     "Think back to regular linear regression: Predict some variable $y$ with some matrix $X$:\n",
600 |     "\n",
601 |     "$y = \\beta_0 + \\beta_1 X_1 + \\beta_2 X_2 ... + \\beta_p X_p + \\varepsilon$\n",
602 |     "\n",
603 |     "When modelling timeseries, past values of $y$ make for good components of $X$.\n",
604 |     "We can do this with the pandas `.shift` method:"
605 |    ]
606 |   },
607 |   {
608 |    "cell_type": "code",
609 |    "execution_count": null,
610 |    "metadata": {},
611 |    "outputs": [],
612 |    "source": [
613 |     "y_prime.shift()"
614 |    ]
615 |   },
616 |   {
617 |    "cell_type": "markdown",
618 |    "metadata": {},
619 |    "source": [
620 |     "So the value for `2001-01-01` (-867) is now labeled `2000-02-01`. We can collect many of these with a list comprehension and a `concat`."
621 |    ]
622 |   },
623 |   {
624 |    "cell_type": "code",
625 |    "execution_count": null,
626 |    "metadata": {},
627 |    "outputs": [],
628 |    "source": [
629 |     "lagged = pd.concat([y_prime.shift(i) for i in range(9)], axis=1,\n",
630 |     "                   keys=['y', 'L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8'])\n",
631 |     "lagged"
632 |    ]
633 |   },
634 |   {
635 |    "cell_type": "code",
636 |    "execution_count": null,
637 |    "metadata": {},
638 |    "outputs": [],
639 |    "source": [
640 |     "mod_lagged = smf.ols('y ~ L1 + L2 + L3 + L4 + L5 + L6 + L7 + L8', lagged)\n",
641 |     "res_lagged = mod_lagged.fit()\n",
642 |     "\n",
643 |     "res_lagged.summary()"
644 |    ]
645 |   },
646 |   {
647 |    "cell_type": "code",
648 |    "execution_count": null,
649 |    "metadata": {},
650 |    "outputs": [],
651 |    "source": [
652 |     "ax = res_lagged.fittedvalues.plot(label=\"predicted\", figsize=(12, 4), legend=True)\n",
653 |     "y_prime.plot(label=\"actual\", legend=True);"
654 |    ]
655 |   },
656 |   {
657 |    "cell_type": "markdown",
658 |    "metadata": {},
659 |    "source": [
660 |     "In practice, you won't be doing the `shift`ing and `diff`ing yourself.\n",
661 |     "It's more convenient to let statsmodels do that for us.\n",
662 |     "Then we don't have to worry about un-differencing the fitted / predicted results to interpret them correctly.\n",
663 |     "Also, the solvers we'll see next are a bit more sophisticated than a linear regression."
664 |    ]
665 |   },
666 |   {
667 |    "cell_type": "markdown",
668 |    "metadata": {},
669 |    "source": [
670 |     "## AutoRegressive Model\n",
671 |     "\n",
672 |     "Predict $y_{t+1}$, given $y_0, y_1, \\ldots y_t$"
673 |    ]
674 |   },
675 |   {
676 |    "cell_type": "markdown",
677 |    "metadata": {},
678 |    "source": [
679 |     "Let's fit an autoregressive (AR) model. Autoregressive part just means using past values of $y$ to predict the future (like we did above).\n",
680 |     "We'll use statsmodel's `SARIMAX` model. The AR part of SARIMAX is for autoregressive.\n",
681 |     "It also handles seasonality (**S**), differencing (**I** for integrated), moving average (**MA**), and exogenous regressors (**X**).\n",
682 |     "\n",
683 |     "We'll stick to a simple AR(8) model (use the last 8 periods) with a single period of differencing."
684 |    ]
685 |   },
686 |   {
687 |    "cell_type": "code",
688 |    "execution_count": null,
689 |    "metadata": {},
690 |    "outputs": [],
691 |    "source": [
692 |     "mod = smt.SARIMAX(y, order=(8, 1, 0))  # AR(8), first difference, no MA\n",
693 |     "res = mod.fit()"
694 |    ]
695 |   },
696 |   {
697 |    "cell_type": "markdown",
698 |    "metadata": {},
699 |    "source": [
700 |     "As usual with statsmodels, we get a nice summary with the fitted coefficeints and some test statistics (which we'll ignore)"
701 |    ]
702 |   },
703 |   {
704 |    "cell_type": "code",
705 |    "execution_count": null,
706 |    "metadata": {},
707 |    "outputs": [],
708 |    "source": [
709 |     "res.summary()"
710 |    ]
711 |   },
712 |   {
713 |    "cell_type": "markdown",
714 |    "metadata": {},
715 |    "source": [
716 |     "The results instance has all the usual attributes and methods, like `fittedvalues`."
717 |    ]
718 |   },
719 |   {
720 |    "cell_type": "code",
721 |    "execution_count": null,
722 |    "metadata": {},
723 |    "outputs": [],
724 |    "source": [
725 |     "ax = res.fittedvalues.iloc[1:].plot(label=\"Fitted\", legend=True, figsize=(12, 4))\n",
726 |     "y.plot(ax=ax, label=\"Actual\", legend=True);"
727 |    ]
728 |   },
729 |   {
730 |    "cell_type": "markdown",
731 |    "metadata": {},
732 |    "source": [
733 |     "## Forecasting\n",
734 |     "\n",
735 |     "The real value of timeseries analysis is to predict the future.\n",
736 |     "We can use the `.get_prediction` method to get the predicted values, along with a confidence interval."
737 |    ]
738 |   },
739 |   {
740 |    "cell_type": "markdown",
741 |    "metadata": {},
742 |    "source": [
743 |     "First, we'll look at one-period-ahead forecasts.\n",
744 |     "Basically, this simulates looking at our data the last day of the month, and making the forecast for the next month.\n",
745 |     "Keep in mind though that we fit our parameters on the entire dataset. The isn't an out-of-sample prediction."
746 |    ]
747 |   },
748 |   {
749 |    "cell_type": "code",
750 |    "execution_count": null,
751 |    "metadata": {},
752 |    "outputs": [],
753 |    "source": [
754 |     "pred = res.get_prediction(start='2001-03-01')\n",
755 |     "pred_ci = pred.conf_int()"
756 |    ]
757 |   },
758 |   {
759 |    "cell_type": "code",
760 |    "execution_count": null,
761 |    "metadata": {},
762 |    "outputs": [],
763 |    "source": [
764 |     "ax = y.plot(label='observed')\n",
765 |     "pred.predicted_mean.plot(ax=ax, label='Forecast', alpha=.7)\n",
766 |     "ax.fill_between(pred_ci.index,\n",
767 |     "                pred_ci.iloc[:, 0],\n",
768 |     "                pred_ci.iloc[:, 1], color='k', alpha=.2)\n",
769 |     "plt.legend()\n",
770 |     "sns.despine()"
771 |    ]
772 |   },
773 |   {
774 |    "cell_type": "markdown",
775 |    "metadata": {},
776 |    "source": [
777 |     "Alternatively, we can make dynamic forecasts as of some month (January 2013 in the example below). That means the forecast from that point forward only use information available as of January 2013 (though again, we fit the model on the entire dataset). The predictions are generated in a similar way: a bunch of one-step forecasts. Only instead of plugging in the actual values beyond January 2013, we plug in the forecast values."
778 |    ]
779 |   },
780 |   {
781 |    "cell_type": "code",
782 |    "execution_count": null,
783 |    "metadata": {},
784 |    "outputs": [],
785 |    "source": [
786 |     "pred_dy = res.get_prediction(start='2002-03-01', dynamic='2013-01-01')\n",
787 |     "pred_dy_ci = pred_dy.conf_int()"
788 |    ]
789 |   },
790 |   {
791 |    "cell_type": "code",
792 |    "execution_count": null,
793 |    "metadata": {},
794 |    "outputs": [],
795 |    "source": [
796 |     "ax = y.plot(label='observed')\n",
797 |     "pred_dy.predicted_mean.plot(ax=ax, label='Forecast')\n",
798 |     "ax.fill_between(pred_dy_ci.index,\n",
799 |     "                pred_dy_ci.iloc[:, 0],\n",
800 |     "                pred_dy_ci.iloc[:, 1], color='k', alpha=.25)\n",
801 |     "ylim = ax.get_ylim()\n",
802 |     "ax.fill_betweenx(ylim, pd.Timestamp('2013-01-01'), y.index[-1],\n",
803 |     "                 alpha=.1, zorder=-1)\n",
804 |     "ax.set_ylim(ylim)\n",
805 |     "ax.annotate('Dynamic $\\\\longrightarrow$',\n",
806 |     "            (pd.Timestamp('2013-02-01'), 16000))\n",
807 |     "\n",
808 |     "plt.legend()\n",
809 |     "sns.despine()\n",
810 |     "plt.tight_layout()"
811 |    ]
812 |   },
813 |   {
814 |    "cell_type": "markdown",
815 |    "metadata": {},
816 |    "source": [
817 |     "There are *a lot* of issues we didn't cover here.\n",
818 |     "Seasonality, non-stationarity, autocorrellation, unit roots, and more.\n",
819 |     "Timeseries modeling is fraught with traps that will throw off your predictions.\n",
820 |     "Still, this should give you a taste of what's possbile."
821 |    ]
822 |   },
823 |   {
824 |    "cell_type": "markdown",
825 |    "metadata": {},
826 |    "source": [
827 |     "## Further Resources\n",
828 |     "\n",
829 |     "- [statsmodels state space documentation](http://www.statsmodels.org/dev/statespace.html)\n",
830 |     "- [statsmodels state space examples](http://www.statsmodels.org/dev/examples/index.html#statespace)\n",
831 |     "- [pyflux](http://www.pyflux.com), another time series modeling library\n",
832 |     "- Sean Abu's [post on ARIMA](http://www.seanabu.com/2016/03/22/time-series-seasonal-ARIMA-model-in-python/)\n",
833 |     "- Jeffrey Yau's [talks at PyData](https://www.youtube.com/watch?v=tJ-O3hk1vRw)\n",
834 |     "- My [blog post](http://tomaugspurger.github.io/modern-7-timeseries.html)"
835 |    ]
836 |   }
837 |  ],
838 |  "metadata": {
839 |   "kernelspec": {
840 |    "display_name": "Python 3",
841 |    "language": "python",
842 |    "name": "python3"
843 |   },
844 |   "language_info": {
845 |    "codemirror_mode": {
846 |     "name": "ipython",
847 |     "version": 3
848 |    },
849 |    "file_extension": ".py",
850 |    "mimetype": "text/x-python",
851 |    "name": "python",
852 |    "nbconvert_exporter": "python",
853 |    "pygments_lexer": "ipython3",
854 |    "version": "3.6.6"
855 |   }
856 |  },
857 |  "nbformat": 4,
858 |  "nbformat_minor": 2
859 | }
860 | 


--------------------------------------------------------------------------------
/notebooks/06-Dask.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "<img src=\"http://dask.readthedocs.io/en/latest/_images/dask_horizontal.svg\"\n",
  8 |     "     align=\"right\"\n",
  9 |     "     width=\"30%\"\n",
 10 |     "     alt=\"Dask logo\\\">\n",
 11 |     "\n",
 12 |     "# Personal Dask Cluster\n",
 13 |     "\n",
 14 |     "Go to `<cluster url>` for your own Dask cluster. Use your first and last name provided to SciPy for the username and \"dask\" for the password.\n",
 15 |     "(We don't actually do any authentication)."
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "markdown",
 20 |    "metadata": {},
 21 |    "source": [
 22 |     "## Dask Quickstart\n",
 23 |     "\n",
 24 |     "Dask scales python.\n",
 25 |     "Today, we'll focus on how it scales pandas, but know that it's more general.\n",
 26 |     "\n",
 27 |     "Pandas is fundamentally for in-memory datasets.\n",
 28 |     "You can't have a DataFrame larger than your machine's RAM.\n",
 29 |     "\n",
 30 |     "Dask dataframe lets you work with larger than memory datasets.\n",
 31 |     "Dask breaks large problems into many small problems (task graph).\n",
 32 |     "It then executes those small problems in parallel and in a small memory footprint (scheduler).\n",
 33 |     "It provides user interfaces, like `dask.dataframe` or `dask.array`, which feel like NumPy and pandas."
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "import numpy as np\n",
 43 |     "import pandas as pd\n",
 44 |     "import dask.dataframe as dd\n",
 45 |     "\n",
 46 |     "df = dd.from_pandas(\n",
 47 |     "    pd.DataFrame({'A': np.random.choice(['a', 'b', 'c'], size=100),\n",
 48 |     "                  'B': np.random.randn(100),\n",
 49 |     "                  'C': np.random.uniform(size=100)}),\n",
 50 |     "    npartitions=4\n",
 51 |     ")\n",
 52 |     "df"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "markdown",
 57 |    "metadata": {},
 58 |    "source": [
 59 |     "A dask dataframe has most of the same methods as pandas."
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "code",
 64 |    "execution_count": null,
 65 |    "metadata": {},
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "df.B + df.C"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": null,
 74 |    "metadata": {},
 75 |    "outputs": [],
 76 |    "source": [
 77 |     "df[['B', 'C']].sum()"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "markdown",
 82 |    "metadata": {},
 83 |    "source": [
 84 |     "Dask DataFrame's methods are lazy.\n",
 85 |     "This lets Dask build up a large chain of operations that can be executed in parallel.\n",
 86 |     "When you say `df.sum()`, instead of computing the sum immediately, Dask builds up a *task graph*.\n",
 87 |     "\n",
 88 |     "```python\n",
 89 |     "df[['B', 'C']].sum().visualize(rankdir='LR')\n",
 90 |     "```\n",
 91 |     "\n",
 92 |     "<img src=\"graph.png\">"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "markdown",
 97 |    "metadata": {},
 98 |    "source": [
 99 |     "When you're ready for a concrete result, call `compute`."
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": null,
105 |    "metadata": {},
106 |    "outputs": [],
107 |    "source": [
108 |     "df[['B', 'C']].sum().compute()"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "markdown",
113 |    "metadata": {},
114 |    "source": [
115 |     "Calling `compute` hands the task graph to the *scheduler*, which executes the graph in parallel.\n",
116 |     "Dask has several schedulers, depending on how you want to do the computation (using many threads, processes, or machines).\n",
117 |     "We'll be using the distributed scheduler, so we can see how dask scales pandas to a cluster of machines.\n",
118 |     "But Dask also works well on a single machine.\n",
119 |     "You write normal pandas operations, but the computation happens in a low-memory footprint."
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "markdown",
124 |    "metadata": {},
125 |    "source": [
126 |     "# Distributed DataFrames and Efficiency\n",
127 |     "\n",
128 |     "We will cover the following topics:\n",
129 |     "\n",
130 |     "1. Persist common intermediate results in memory with `persist`\n",
131 |     "2. Partitions and partition size\n",
132 |     "3. Using indices to improve efficiency"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": null,
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "from dask_kubernetes import KubeCluster\n",
142 |     "from dask.distributed import Client"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "markdown",
147 |    "metadata": {},
148 |    "source": [
149 |     "The next cell will start up some workers for you. This make take a few minutes, but they widget will update automatically when the workers are ready. You don't need to do anything with the manual or adaptive scaling."
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": null,
155 |    "metadata": {},
156 |    "outputs": [],
157 |    "source": [
158 |     "cluster = KubeCluster(n_workers=8)\n",
159 |     "cluster"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "markdown",
164 |    "metadata": {},
165 |    "source": [
166 |     "**Be sure to open the diagnostics UI.**"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": null,
172 |    "metadata": {},
173 |    "outputs": [],
174 |    "source": [
175 |     "client = Client(cluster)\n",
176 |     "client"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "markdown",
181 |    "metadata": {},
182 |    "source": [
183 |     "## Moving to distributed\n",
184 |     "\n",
185 |     "A few things change when moving from local to distributed computing.\n",
186 |     "\n",
187 |     "1. Environment: Each worker is a separate machine, and needs to have the required libraries installed. This cluster was setup using [Kubernetes](http://dask.pydata.org/en/latest/setup/kubernetes.html#).\n",
188 |     "2. File system: Previously, every worker (threads, processes, or even the distributed scheduler in local mode) had access to your laptops file system. In a distributed environment, you'll need some kind of shared file system to read data (cloud storage like S3 or GCS, or a network file system)\n",
189 |     "3. Communication: Moving data between machines is relatively expensive. When possible, the distributed scheduler will ensure that tasks are scheduled to be run on workers that already have the required data. But some tasks will require data from multiple machines."
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "markdown",
194 |    "metadata": {},
195 |    "source": [
196 |     "## The full airline dataset\n",
197 |     "\n",
198 |     "We have the full airline dataset stored on `GCS`. This is the same as the one you've been working with, but includes all originating airports and a few extra columns. We change the `read_csv` call slightly to avoid the extra columns.\n",
199 |     "\n",
200 |     "`dask.dataframe` has support for reading directly from `GCS`, so we can use our `read_csv` call from before."
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": null,
206 |    "metadata": {},
207 |    "outputs": [],
208 |    "source": [
209 |     "import dask.dataframe as dd\n",
210 |     "\n",
211 |     "columns = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',\n",
212 |     "           'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',\n",
213 |     "           'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',\n",
214 |     "           'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',\n",
215 |     "           'Cancelled']\n",
216 |     "\n",
217 |     "df = dd.read_csv('gcs://anaconda-public-data/airline/(199)|(200)*.csv',\n",
218 |     "                 parse_dates={'Date': [0, 1, 2]},\n",
219 |     "                 dtype={'TailNum': object,\n",
220 |     "                        'CRSElapsedTime': float,\n",
221 |     "                        'Distance': float,\n",
222 |     "                        'Cancelled': bool},\n",
223 |     "                 usecols=columns)"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": null,
229 |    "metadata": {},
230 |    "outputs": [],
231 |    "source": [
232 |     "df.head()"
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "markdown",
237 |    "metadata": {},
238 |    "source": [
239 |     "### Persist data in distributed memory\n",
240 |     "\n",
241 |     "Every time we run an operation like `df[~df.Cancelled].DepDelay.max().compute()` we read through our dataset from disk.  This can be slow, especially because we're reading data from CSV.  We usually have two options to make this faster:\n",
242 |     "\n",
243 |     "1.  Persist relevant data in memory, either on our computer or on a cluster\n",
244 |     "2.  Use a faster on-disk format, like HDF5 or Parquet\n",
245 |     "\n",
246 |     "In this section we persist our data in memory.  On a single machine this is often done by doing a bit of pre-processing and data reduction with dask dataframe and then `compute`-ing to a Pandas dataframe and using Pandas in the future.  \n",
247 |     "\n",
248 |     "```python\n",
249 |     "df = dd.read_csv(...)\n",
250 |     "df = df[df.Origin == 'LGA']  # filter down to smaller dataset\n",
251 |     "pdf = df.compute()  # convert to pandas\n",
252 |     "pdf ... # continue with familiar Pandas workflows\n",
253 |     "```\n",
254 |     "\n",
255 |     "However on a distributed cluster when even our cleaned data is too large we still can't use Pandas.  In this case we ask Dask to persist data in memory with the `dask.persist` function.  This is what we'll do today.  This will help us to understand when data is lazy and when it is computing.\n",
256 |     "\n",
257 |     "You can trigger computations using the persist method:\n",
258 |     "\n",
259 |     "    x = x.persist()\n",
260 |     "\n",
261 |     "or the dask.persist function for multiple inputs:\n",
262 |     "\n",
263 |     "    x, y = dask.persist(x, y)"
264 |    ]
265 |   },
266 |   {
267 |    "cell_type": "markdown",
268 |    "metadata": {},
269 |    "source": [
270 |     "### Exercise\n",
271 |     "\n",
272 |     "Persist the dataframe into memory.\n",
273 |     "\n",
274 |     "- How long does the cell take to execute (look at the \"busy\" indicator in the top-right)?\n",
275 |     "- After it has persisted how long does it take to compute `df[~df.Cancelled].DepDelay.count().compute()`?\n",
276 |     "- Looking at the plots in the diagnostic web page (the link was printed above), what is taking up most of the time? (You can over over rectangles to see what function they represent)"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "code",
281 |    "execution_count": null,
282 |    "metadata": {},
283 |    "outputs": [],
284 |    "source": [
285 |     "df = # TODO: persist dataframe in memory"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": null,
291 |    "metadata": {},
292 |    "outputs": [],
293 |    "source": [
294 |     "%time _ = df.Cancelled[~df.Cancelled].count().compute()"
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "markdown",
299 |    "metadata": {},
300 |    "source": [
301 |     "### Exercise\n",
302 |     "\n",
303 |     "Repeat the groupby computation from the previous notebooks. What is taking all of the time now?"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": null,
309 |    "metadata": {},
310 |    "outputs": [],
311 |    "source": [
312 |     "# What was the average departure delay from each airport?\n",
313 |     "df[~df.Cancelled].groupby('Origin').DepDelay.mean().nlargest(10).compute()"
314 |    ]
315 |   },
316 |   {
317 |    "cell_type": "markdown",
318 |    "metadata": {},
319 |    "source": [
320 |     "## Partitions\n",
321 |     "\n",
322 |     "One `dask.dataframe` is composed of several Pandas dataframes.  The organization of these dataframes can significantly impact performance.  In this section we discuss two common factors that commonly impact performance:\n",
323 |     "\n",
324 |     "1. The number of Pandas dataframes can affect overhead.  If the dataframes are too small then Dask might spend more time deciding what to do than Pandas spends actually doing it.  Ideally computations should take 100's of milliseconds.\n",
325 |     "\n",
326 |     "2. If we know how the dataframes are sorted then certain operations become much faster"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "markdown",
331 |    "metadata": {},
332 |    "source": [
333 |     "### Number of partitions and partition size\n",
334 |     "\n",
335 |     "When we read in our data from CSV files we get potentially multiple Pandas dataframe for each file. Look at the metadata below to determine a few things about the current partitioning:\n",
336 |     "- How many partitions are there?\n",
337 |     "- Are the splits along the index between partitions known? If so, what are they?"
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "code",
342 |    "execution_count": null,
343 |    "metadata": {},
344 |    "outputs": [],
345 |    "source": [
346 |     "# Number of partitions\n",
347 |     "df.npartitions"
348 |    ]
349 |   },
350 |   {
351 |    "cell_type": "code",
352 |    "execution_count": null,
353 |    "metadata": {},
354 |    "outputs": [],
355 |    "source": [
356 |     "# Are the splits between partitions known?\n",
357 |     "df.known_divisions"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "code",
362 |    "execution_count": null,
363 |    "metadata": {},
364 |    "outputs": [],
365 |    "source": [
366 |     "# The splits between partitions. If unknown these are all `None`\n",
367 |     "df.divisions"
368 |    ]
369 |   },
370 |   {
371 |    "cell_type": "markdown",
372 |    "metadata": {},
373 |    "source": [
374 |     "### Exercise: How large is the DataFrame?\n",
375 |     "\n",
376 |     "- How would you compute the memory usage of a single pandas DataFrame?\n",
377 |     "- Given your knowledge of Dask, how would you do it for a Dask DataFrame?"
378 |    ]
379 |   },
380 |   {
381 |    "cell_type": "code",
382 |    "execution_count": null,
383 |    "metadata": {},
384 |    "outputs": [],
385 |    "source": [
386 |     "# Your code here...\n"
387 |    ]
388 |   },
389 |   {
390 |    "cell_type": "code",
391 |    "execution_count": null,
392 |    "metadata": {},
393 |    "outputs": [],
394 |    "source": [
395 |     "%load memory-usage.py"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "markdown",
400 |    "metadata": {},
401 |    "source": [
402 |     "## Sorted Index column\n",
403 |     "\n",
404 |     "*This section doesn't have any exercises.  Just follow along.*\n",
405 |     "\n",
406 |     "Many dataframe operations like loc-indexing, groupby-apply, and joins are *much* faster on a sorted index.  For example, if we want to get data for a particular day of data it *really* helps to know where that day is, otherwise we need to search over all of our data.\n",
407 |     "\n",
408 |     "The Pandas model gives us a sorted index column.  Dask.dataframe copies this model, and it remembers the min and max values of every partition's index.\n",
409 |     "\n",
410 |     "By default, our data doesn't have an index."
411 |    ]
412 |   },
413 |   {
414 |    "cell_type": "code",
415 |    "execution_count": null,
416 |    "metadata": {},
417 |    "outputs": [],
418 |    "source": [
419 |     "df.head()"
420 |    ]
421 |   },
422 |   {
423 |    "cell_type": "markdown",
424 |    "metadata": {},
425 |    "source": [
426 |     "So if we search for a particular day it takes a while because it has to pass through all of the data."
427 |    ]
428 |   },
429 |   {
430 |    "cell_type": "code",
431 |    "execution_count": null,
432 |    "metadata": {},
433 |    "outputs": [],
434 |    "source": [
435 |     "%time df[df.Date == '1992-05-05'].compute()"
436 |    ]
437 |   },
438 |   {
439 |    "cell_type": "code",
440 |    "execution_count": null,
441 |    "metadata": {},
442 |    "outputs": [],
443 |    "source": [
444 |     "df[df.Date == '1992-05-05'].visualize(optimize_graph=True)"
445 |    ]
446 |   },
447 |   {
448 |    "cell_type": "markdown",
449 |    "metadata": {},
450 |    "source": [
451 |     "However if we set the `Date` column as the index then this operation can be much much faster.\n",
452 |     "\n",
453 |     "Calling `set_index` followed by `persist` results in a new set of dataframe partitions stored in memory, sorted along the index column. To do this dask has to\n",
454 |     "\n",
455 |     "- Shuffle the data by date, resulting in the same number of output partitions\n",
456 |     "- Set the index for each partition\n",
457 |     "- Store the resulting partitions in distributed memory\n",
458 |     "\n",
459 |     "This can be a (relatively) expensive operation, but allows certain queries to be more optimized. \n",
460 |     "\n",
461 |     "Watch the diagnostics page while the next line is running to see how the shuffle and index operation progresses."
462 |    ]
463 |   },
464 |   {
465 |    "cell_type": "code",
466 |    "execution_count": null,
467 |    "metadata": {},
468 |    "outputs": [],
469 |    "source": [
470 |     "%%time\n",
471 |     "df = df.set_index('Date').persist()"
472 |    ]
473 |   },
474 |   {
475 |    "cell_type": "markdown",
476 |    "metadata": {},
477 |    "source": [
478 |     "After the index is set, we now have known divisions:"
479 |    ]
480 |   },
481 |   {
482 |    "cell_type": "code",
483 |    "execution_count": null,
484 |    "metadata": {},
485 |    "outputs": [],
486 |    "source": [
487 |     "# Number of partitions\n",
488 |     "df.npartitions"
489 |    ]
490 |   },
491 |   {
492 |    "cell_type": "code",
493 |    "execution_count": null,
494 |    "metadata": {},
495 |    "outputs": [],
496 |    "source": [
497 |     "# Are the splits between partitions known?\n",
498 |     "df.known_divisions"
499 |    ]
500 |   },
501 |   {
502 |    "cell_type": "code",
503 |    "execution_count": null,
504 |    "metadata": {},
505 |    "outputs": [],
506 |    "source": [
507 |     "# The splits between partitions.\n",
508 |     "df.divisions"
509 |    ]
510 |   },
511 |   {
512 |    "cell_type": "code",
513 |    "execution_count": null,
514 |    "metadata": {},
515 |    "outputs": [],
516 |    "source": [
517 |     "# The repr for a dask dataframe can also be useful for\n",
518 |     "# seeing partition information\n",
519 |     "df"
520 |    ]
521 |   },
522 |   {
523 |    "cell_type": "markdown",
524 |    "metadata": {},
525 |    "source": [
526 |     "Repeating the same query for all flights on a specific date, we can see that we're much faster after setting the index:"
527 |    ]
528 |   },
529 |   {
530 |    "cell_type": "code",
531 |    "execution_count": null,
532 |    "metadata": {},
533 |    "outputs": [],
534 |    "source": [
535 |     "%time df.loc['1992-05-05'].compute()"
536 |    ]
537 |   },
538 |   {
539 |    "cell_type": "markdown",
540 |    "metadata": {},
541 |    "source": [
542 |     "If you look at the resulting graph, you can see that dask was able to optimize the computation to only look at a single partition:"
543 |    ]
544 |   },
545 |   {
546 |    "cell_type": "code",
547 |    "execution_count": null,
548 |    "metadata": {},
549 |    "outputs": [],
550 |    "source": [
551 |     "df.loc['1992-05-05'].visualize(optimize_graph=True)"
552 |    ]
553 |   },
554 |   {
555 |    "cell_type": "markdown",
556 |    "metadata": {},
557 |    "source": [
558 |     "### Timeseries operations\n",
559 |     "\n",
560 |     "When the index of a dask dataframe is a known `DatetimeIndes`, traditional pandas timeseries operations are supported. For example, now that we have a sorted index we can resample the `DepDelay` column into 1 month bins."
561 |    ]
562 |   },
563 |   {
564 |    "cell_type": "code",
565 |    "execution_count": null,
566 |    "metadata": {},
567 |    "outputs": [],
568 |    "source": [
569 |     "%matplotlib inline"
570 |    ]
571 |   },
572 |   {
573 |    "cell_type": "code",
574 |    "execution_count": null,
575 |    "metadata": {},
576 |    "outputs": [],
577 |    "source": [
578 |     "%%time \n",
579 |     "(df.DepDelay\n",
580 |     "   .resample('1M')\n",
581 |     "   .mean()\n",
582 |     "   .fillna(method='ffill')\n",
583 |     "   .compute()\n",
584 |     "   .plot(figsize=(10, 5)));"
585 |    ]
586 |   },
587 |   {
588 |    "cell_type": "code",
589 |    "execution_count": null,
590 |    "metadata": {},
591 |    "outputs": [],
592 |    "source": []
593 |   },
594 |   {
595 |    "cell_type": "code",
596 |    "execution_count": null,
597 |    "metadata": {},
598 |    "outputs": [],
599 |    "source": [
600 |     "# When you're done with the `airlines` dataset\n",
601 |     "client.restart()"
602 |    ]
603 |   },
604 |   {
605 |    "cell_type": "markdown",
606 |    "metadata": {},
607 |    "source": [
608 |     "## Exercise: Explore the NYC Taxi dataset\n",
609 |     "\n",
610 |     "We have some of the NYC Taxi ride dataset in parquet format stored in GCS."
611 |    ]
612 |   },
613 |   {
614 |    "cell_type": "code",
615 |    "execution_count": null,
616 |    "metadata": {},
617 |    "outputs": [],
618 |    "source": [
619 |     "taxi = dd.read_parquet(\"gcs://anaconda-public-data/nyc-taxi/nyc.parquet\")\n",
620 |     "taxi.head()"
621 |    ]
622 |   },
623 |   {
624 |    "cell_type": "markdown",
625 |    "metadata": {},
626 |    "source": [
627 |     "Some questions?\n",
628 |     "\n",
629 |     "- How large is the dataset? Will it fit in your cluster's RAM if you persist it?\n",
630 |     "- What's the average tip percent by hour?"
631 |    ]
632 |   },
633 |   {
634 |    "cell_type": "code",
635 |    "execution_count": null,
636 |    "metadata": {},
637 |    "outputs": [],
638 |    "source": []
639 |   },
640 |   {
641 |    "cell_type": "code",
642 |    "execution_count": null,
643 |    "metadata": {},
644 |    "outputs": [],
645 |    "source": [
646 |     "# clean up, when finished with the notebook\n",
647 |     "client.close()\n",
648 |     "cluster.close()"
649 |    ]
650 |   }
651 |  ],
652 |  "metadata": {
653 |   "kernelspec": {
654 |    "display_name": "Python 3",
655 |    "language": "python",
656 |    "name": "python3"
657 |   },
658 |   "language_info": {
659 |    "codemirror_mode": {
660 |     "name": "ipython",
661 |     "version": 3
662 |    },
663 |    "file_extension": ".py",
664 |    "mimetype": "text/x-python",
665 |    "name": "python",
666 |    "nbconvert_exporter": "python",
667 |    "pygments_lexer": "ipython3",
668 |    "version": "3.6.5"
669 |   }
670 |  },
671 |  "nbformat": 4,
672 |  "nbformat_minor": 2
673 | }
674 | 


--------------------------------------------------------------------------------
/notebooks/09-Visualization.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Visualization\n",
  8 |     "\n",
  9 |     "We have a *ton* of options for viz in python.\n",
 10 |     "I'm going to focus on matplotlib and seaborn, because they work well for the types of analyses I usually do.\n",
 11 |     "At the end, I'll mention Altair, which is new but has a really good design."
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": null,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import pandas as pd\n",
 21 |     "import numpy as np\n",
 22 |     "import seaborn.apionly as sns\n",
 23 |     "import matplotlib.pyplot as plt\n",
 24 |     "plt.style.use('default')"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "metadata": {},
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "%matplotlib inline"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "markdown",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "To start with, we'll fetch some data from yahoo using the `pandas_datareader` package."
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": null,
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "stocks = pd.read_csv(\"data/stocks.csv\", index_col=\"Date\", parse_dates=True)\n",
 50 |     "stocks.head()"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "markdown",
 55 |    "metadata": {},
 56 |    "source": [
 57 |     "## Matplotlib\n",
 58 |     "\n",
 59 |     "- foundation for seaborn and pandas plotting\n",
 60 |     "- full control over every detail"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "We won't say too much about matplotlib directly.\n",
 68 |     "It's a large library with several different levels of API.\n",
 69 |     "Additionally, both seaborn and pandas use matplotlib internally so you can consider these two higher-level, domain specific APIs built on top of matplotlib proper.\n",
 70 |     "This works well, as you can use the higher-level library most of the time, but you still have the full power and control of matplotlib when you need it.\n",
 71 |     "\n",
 72 |     "People familiar with matplotlib will have used the `axes.plot` method; It takes an `x`, `y` and a bunch of keyword arguments."
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": null,
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "fig, ax = plt.subplots(figsize=(12, 6))\n",
 82 |     "ax.plot(stocks.Open);"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "metadata": {},
 88 |    "source": [
 89 |     "Notice that matplotlib is now pandas-aware.\n",
 90 |     "`ax.plot` knows that when it's passed a `Series`, like `stocks.Open`, then `stocks.index` makes for a good `x` axis.\n",
 91 |     "\n",
 92 |     "As of matplotlib 1.5, all the plot methods `.plot, .bar, .scatter`, etc. take an optional *data* argument. When passed, you can use strings as the `x` and `y`. matplotlib will use these strings as keys for `data.__getitem__`. This means `data` can be DataFrames, dictionaries, even H5py files."
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": null,
 98 |    "metadata": {},
 99 |    "outputs": [],
100 |    "source": [
101 |     "fig, ax = plt.subplots(figsize=(12, 6))\n",
102 |     "ax.plot('Open', data=stocks)\n",
103 |     "plt.legend();"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "markdown",
108 |    "metadata": {},
109 |    "source": [
110 |     "Matplotlib's strength (and weakness) is it's customizability. With enough work, you can make essentially any figure you want."
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "import statsmodels.tsa.api as smt"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": null,
125 |    "metadata": {},
126 |    "outputs": [],
127 |    "source": [
128 |     "ϵ, t = smt.filters.hpfilter(stocks.Close, lamb=129600*30)"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "metadata": {},
135 |    "outputs": [],
136 |    "source": [
137 |     "start = pd.Timestamp('2007-12-01')\n",
138 |     "end = pd.Timestamp('2009-06-01')\n",
139 |     "\n",
140 |     "fig, ax = plt.subplots(figsize=(12, 6))\n",
141 |     "ax.plot(t, linewidth=2, label=\"Trend\")\n",
142 |     "\n",
143 |     "ax.fill_between(t.index, t - ϵ, t + ϵ, alpha=.15, color='b')\n",
144 |     "\n",
145 |     "ylim = ax.get_ylim()\n",
146 |     "ax.fill_between([start, end], *ylim, color='k', alpha=.2)\n",
147 |     "ax.set_ylim(*ylim)\n",
148 |     "\n",
149 |     "ax.annotate(\"Housing Bubble\", (pd.Timestamp(\"2006-01\"),  255),\n",
150 |     "            fontsize=12, color='red')\n",
151 |     "ax.annotate(\"Recession\", (pd.Timestamp(\"2008-01\"),  255),\n",
152 |     "            fontsize=12, color='red')\n",
153 |     "\n",
154 |     "ax.legend()\n",
155 |     "sns.despine()"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": [
164 |     "fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(8, 4), sharex=True)\n",
165 |     "\n",
166 |     "ax1.plot(\"Open\", data=stocks, color='red')\n",
167 |     "ax2.plot(\"Volume\", data=stocks)\n",
168 |     "ax2.fill_between(stocks.index, 0, \"Volume\", data=stocks, alpha=.25)\n",
169 |     "ax2.set_ylim(0)\n",
170 |     "ax1.legend()\n",
171 |     "\n",
172 |     "plt.tight_layout()"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "markdown",
177 |    "metadata": {},
178 |    "source": [
179 |     "## Pandas Plotting\n",
180 |     "\n",
181 |     "> Usually convenient\n",
182 |     "\n",
183 |     "- Previously, nicer aesthetics (not since matplotlib 2.0)\n",
184 |     "- Nicer labeling (but matplotlib is better now)\n",
185 |     "- Easier (though less flexible) subplotting"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "code",
190 |    "execution_count": null,
191 |    "metadata": {},
192 |    "outputs": [],
193 |    "source": [
194 |     "stocks[['Open', 'High', \"Low\", \"Close\"]].plot(subplots=True, figsize=(8, 8));"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": null,
200 |    "metadata": {},
201 |    "outputs": [],
202 |    "source": [
203 |     "stocks.Volume.plot.area();"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "markdown",
208 |    "metadata": {},
209 |    "source": [
210 |     "I'd recommend pandas builtin plotting for when you need a quick visualization.\n",
211 |     "For simple customizations, you can typically followup a pandas plot with an `ax.set`:"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": null,
217 |    "metadata": {},
218 |    "outputs": [],
219 |    "source": [
220 |     "ax = stocks.Close.plot.density()\n",
221 |     "ax.set(xlabel=\"Close\", title=\"Closing Price (density)\", ylim=.0001);"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "code",
226 |    "execution_count": null,
227 |    "metadata": {},
228 |    "outputs": [],
229 |    "source": [
230 |     "pd.cut(stocks.Close, 10).value_counts().sort_index().plot.barh(figsize=(4, 8));"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "markdown",
235 |    "metadata": {},
236 |    "source": [
237 |     "For more elaborate customizations, it can make sense to just start with matplotlib."
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "markdown",
242 |    "metadata": {},
243 |    "source": [
244 |     "## [Seaborn](http://seaborn.pydata.org/)\n",
245 |     "\n",
246 |     "> Seaborn provides a high-level interface for drawing attractive statistical graphics.\n",
247 |     "\n",
248 |     "- Statistical aggregations (`countplot`, bootstrapped standard errors, `regplot`)\n",
249 |     "- Easier distribution plotting\n",
250 |     "- Easier faceting by variable"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "markdown",
255 |    "metadata": {},
256 |    "source": [
257 |     "It is *not* a replacement of matplotlib. Rather, it provides a nice API for\n",
258 |     "many common statistical methods. Some of the distinguishing features are\n",
259 |     "\n",
260 |     "We'll make an update of [this notebook](https://gist.github.com/mwaskom/8224591) from Michael Waskom, the author of seaborn, which explores the titanic dataset:"
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "markdown",
265 |    "metadata": {},
266 |    "source": [
267 |     "## Titanic Dataset\n",
268 |     "\n",
269 |     "- Survived\n",
270 |     "- Class\n",
271 |     "- Sex\n",
272 |     "- Age\n",
273 |     "- Embarked\n",
274 |     "- Man / Woman / Child\n",
275 |     "- Deck"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "code",
280 |    "execution_count": null,
281 |    "metadata": {},
282 |    "outputs": [],
283 |    "source": [
284 |     "t = sns.load_dataset('titanic')\n",
285 |     "t['class'] = t['class'].cat.as_ordered()\n",
286 |     "t.head()"
287 |    ]
288 |   },
289 |   {
290 |    "cell_type": "code",
291 |    "execution_count": null,
292 |    "metadata": {},
293 |    "outputs": [],
294 |    "source": [
295 |     "t.info()"
296 |    ]
297 |   },
298 |   {
299 |    "cell_type": "code",
300 |    "execution_count": null,
301 |    "metadata": {},
302 |    "outputs": [],
303 |    "source": [
304 |     "pal = dict(man=\"#4682B4\", woman=\"#CD5C5C\", child=\"#2E8B57\", male=\"#6495ED\",\n",
305 |     "           female=\"#F08080\")\n",
306 |     "\n",
307 |     "with sns.color_palette('viridis', n_colors=3) as v:\n",
308 |     "    pal.update(**dict(zip(['First', 'Second', 'Third'], v.as_hex())))\n",
309 |     "\n",
310 |     "sns.set(context='talk', style='white')"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "markdown",
315 |    "metadata": {},
316 |    "source": [
317 |     "## Exploratory Analysis\n",
318 |     "\n",
319 |     "1. Who were the passengers?\n",
320 |     "2. Who survived?"
321 |    ]
322 |   },
323 |   {
324 |    "cell_type": "markdown",
325 |    "metadata": {},
326 |    "source": [
327 |     "## Who were the passengers?\n",
328 |     "\n",
329 |     "Explore them across different dimensions; We'll start with *categorical* data like sex or class.\n",
330 |     "\n",
331 |     "What's the count of passengers by sex?"
332 |    ]
333 |   },
334 |   {
335 |    "cell_type": "code",
336 |    "execution_count": null,
337 |    "metadata": {},
338 |    "outputs": [],
339 |    "source": [
340 |     "sns.countplot(x=\"sex\", data=t, palette=pal);"
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "markdown",
345 |    "metadata": {},
346 |    "source": [
347 |     "By class?"
348 |    ]
349 |   },
350 |   {
351 |    "cell_type": "code",
352 |    "execution_count": null,
353 |    "metadata": {},
354 |    "outputs": [],
355 |    "source": [
356 |     "sns.countplot(x=\"class\", data=t, palette=pal);"
357 |    ]
358 |   },
359 |   {
360 |    "cell_type": "markdown",
361 |    "metadata": {},
362 |    "source": [
363 |     "By \"who\" (man, woman, or child)?"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": null,
369 |    "metadata": {},
370 |    "outputs": [],
371 |    "source": [
372 |     "sns.countplot(\"who\", data=t, palette=pal)\n",
373 |     "sns.despine()"
374 |    ]
375 |   },
376 |   {
377 |    "cell_type": "markdown",
378 |    "metadata": {},
379 |    "source": [
380 |     "Seaborn is built up of a heirarchy of convenience functions and methods.\n",
381 |     "For instance, `countplot` is essentially a specialized version of `factorplot` where `kind` is set to count."
382 |    ]
383 |   },
384 |   {
385 |    "cell_type": "code",
386 |    "execution_count": null,
387 |    "metadata": {},
388 |    "outputs": [],
389 |    "source": [
390 |     "sns.factorplot(x=\"class\", data=t, kind=\"count\", hue=\"sex\",\n",
391 |     "               palette=pal, size=7);"
392 |    ]
393 |   },
394 |   {
395 |    "cell_type": "markdown",
396 |    "metadata": {},
397 |    "source": [
398 |     "Most seaborn functions have the same API. You pass the requeired arguments (`x`, `y`, etc. depending on the plot), a `data` argument.\n",
399 |     "Include additional arguments like `hue`, `col`, etc. as needed.\n",
400 |     "\n",
401 |     "Why have both `countplot` and `factorplot(..., kind='count')`? The specialized versions like `countplot` are handy when working with `Grid`s, which we'll see later on."
402 |    ]
403 |   },
404 |   {
405 |    "cell_type": "markdown",
406 |    "metadata": {},
407 |    "source": [
408 |     "<div class=\"alert alert-success\" data-title=\"Embarked by class\">\n",
409 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Embarked by class</h1>\n",
410 |     "</div>\n",
411 |     "\n",
412 |     "<p>Make a `factorplot` with the counts of `embarked`, with the `hue` split\n",
413 |     "by `class`.</p>"
414 |    ]
415 |   },
416 |   {
417 |    "cell_type": "code",
418 |    "execution_count": null,
419 |    "metadata": {},
420 |    "outputs": [],
421 |    "source": [
422 |     "# Your solution\n"
423 |    ]
424 |   },
425 |   {
426 |    "cell_type": "code",
427 |    "execution_count": null,
428 |    "metadata": {},
429 |    "outputs": [],
430 |    "source": [
431 |     "%load solutions/visualize_00.py\n"
432 |    ]
433 |   },
434 |   {
435 |    "cell_type": "markdown",
436 |    "metadata": {},
437 |    "source": [
438 |     "<div class=\"alert alert-success\" data-title=\"Age by class\">\n",
439 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Age by class</h1>\n",
440 |     "</div>\n",
441 |     "<p>\n",
442 |     "Make a pointplot of `age` by `class`. Look at the `kind` parameter to `sns.factorplot`.</p>"
443 |    ]
444 |   },
445 |   {
446 |    "cell_type": "code",
447 |    "execution_count": null,
448 |    "metadata": {},
449 |    "outputs": [],
450 |    "source": [
451 |     "# Your solution here\n"
452 |    ]
453 |   },
454 |   {
455 |    "cell_type": "code",
456 |    "execution_count": null,
457 |    "metadata": {},
458 |    "outputs": [],
459 |    "source": [
460 |     "%load solutions/visualize_01.py"
461 |    ]
462 |   },
463 |   {
464 |    "cell_type": "markdown",
465 |    "metadata": {},
466 |    "source": [
467 |     "## Distributions\n",
468 |     "\n",
469 |     "Let's moving to plotting *quantitative* data.\n",
470 |     "We'll do this while introducting a new abstraction from seaborn, the `Grid` (`Grid`s work with either quantitative or qualitative data)."
471 |    ]
472 |   },
473 |   {
474 |    "cell_type": "markdown",
475 |    "metadata": {},
476 |    "source": [
477 |     "## Grids\n",
478 |     "\n",
479 |     "You initalize a `Grid` with all the agruments needed to layout the grid that\n",
480 |     "the data will be plotted on:\n",
481 |     "\n",
482 |     "- `data`: DataFrame\n",
483 |     "- `row` : variable to facet rows by\n",
484 |     "- `col` : variable to facet columns by\n",
485 |     "- `hue` : variable to split colors by"
486 |    ]
487 |   },
488 |   {
489 |    "cell_type": "markdown",
490 |    "metadata": {},
491 |    "source": [
492 |     "There are several kinds of `Grid`s in seaborn; we'll start with the `FacetGrid`."
493 |    ]
494 |   },
495 |   {
496 |    "cell_type": "code",
497 |    "execution_count": null,
498 |    "metadata": {},
499 |    "outputs": [],
500 |    "source": [
501 |     "g = sns.FacetGrid(t, hue=\"sex\", aspect=2.5, palette=pal)"
502 |    ]
503 |   },
504 |   {
505 |    "cell_type": "markdown",
506 |    "metadata": {},
507 |    "source": [
508 |     "We haven't actually done any plotting really.\n",
509 |     "Just the necessary work to layout the axes.\n",
510 |     "\n",
511 |     "To actually plot something, `map` plotting functions over the `FacetGrid`. The arguments to `g.map` are passed through to the underlying plot method like `sns.kdeplot`."
512 |    ]
513 |   },
514 |   {
515 |    "cell_type": "code",
516 |    "execution_count": null,
517 |    "metadata": {},
518 |    "outputs": [],
519 |    "source": [
520 |     "g = sns.FacetGrid(t, hue=\"sex\", aspect=2.5, palette=pal)\n",
521 |     "g.map(sns.kdeplot, \"age\", shade=True)\n",
522 |     "g.set(xlim=(0, 80), ylim=0)\n",
523 |     "g.add_legend();"
524 |    ]
525 |   },
526 |   {
527 |    "cell_type": "markdown",
528 |    "metadata": {},
529 |    "source": [
530 |     "To show how the faceting works, pass `row` or `column` when setting up the `FacetGrid`:"
531 |    ]
532 |   },
533 |   {
534 |    "cell_type": "code",
535 |    "execution_count": null,
536 |    "metadata": {},
537 |    "outputs": [],
538 |    "source": [
539 |     "g = sns.FacetGrid(t, row=\"sex\", hue=\"sex\", aspect=2.5, palette=pal)\n",
540 |     "g.map(sns.kdeplot, \"age\", shade=True)\n",
541 |     "g.set(xlim=(0, 80), ylim=0)\n",
542 |     "g.add_legend();"
543 |    ]
544 |   },
545 |   {
546 |    "cell_type": "code",
547 |    "execution_count": null,
548 |    "metadata": {},
549 |    "outputs": [],
550 |    "source": [
551 |     "g = sns.FacetGrid(t, hue=\"who\", aspect=2.5, palette=pal)\n",
552 |     "g.map(sns.kdeplot, \"age\", shade=True)\n",
553 |     "g.set(xlim=(0, 80), ylim=0)\n",
554 |     "g.add_legend();"
555 |    ]
556 |   },
557 |   {
558 |    "cell_type": "code",
559 |    "execution_count": null,
560 |    "metadata": {},
561 |    "outputs": [],
562 |    "source": [
563 |     "g = sns.FacetGrid(t, hue=\"class\", aspect=3, palette=\"YlGn_r\")\n",
564 |     "g.map(sns.kdeplot, \"age\", shade=True)\n",
565 |     "g.set(xlim=(0, 80), ylim=0)\n",
566 |     "g.add_legend();"
567 |    ]
568 |   },
569 |   {
570 |    "cell_type": "markdown",
571 |    "metadata": {},
572 |    "source": [
573 |     "You can get quite complicated results, without much additional work."
574 |    ]
575 |   },
576 |   {
577 |    "cell_type": "code",
578 |    "execution_count": null,
579 |    "metadata": {},
580 |    "outputs": [],
581 |    "source": [
582 |     "g = (sns.FacetGrid(t, col=\"sex\", row=\"class\", size=2.5, aspect=2.5,\n",
583 |     "                   palette=pal, hue=\"sex\")\n",
584 |     "     .map(sns.kdeplot, \"age\", shade=True)\n",
585 |     "     .map(sns.rugplot, \"age\")\n",
586 |     "     .set(xlim=(0, 80), ylim=0));"
587 |    ]
588 |   },
589 |   {
590 |    "cell_type": "markdown",
591 |    "metadata": {},
592 |    "source": [
593 |     "This is a great asset when exploring a new dataset."
594 |    ]
595 |   },
596 |   {
597 |    "cell_type": "code",
598 |    "execution_count": null,
599 |    "metadata": {},
600 |    "outputs": [],
601 |    "source": [
602 |     "sns.factorplot('deck', data=t, palette='PuBu_d',\n",
603 |     "               kind=\"count\");"
604 |    ]
605 |   },
606 |   {
607 |    "cell_type": "markdown",
608 |    "metadata": {},
609 |    "source": [
610 |     "Your data isn't always in perfect shape to be plotted, so you'll be mixing in data manipulations with actual plotting:"
611 |    ]
612 |   },
613 |   {
614 |    "cell_type": "code",
615 |    "execution_count": null,
616 |    "metadata": {},
617 |    "outputs": [],
618 |    "source": [
619 |     "sns.violinplot(\"class\", \"fare\", data=t, orient=\"v\",\n",
620 |     "               palette=\"YlGn\")\n",
621 |     "sns.despine(left=True)"
622 |    ]
623 |   },
624 |   {
625 |    "cell_type": "markdown",
626 |    "metadata": {},
627 |    "source": [
628 |     "<div class=\"alert alert-success\" data-title=\"Trimming\">\n",
629 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Trimming</h1>\n",
630 |     "</div>\n",
631 |     "<p>\n",
632 |     "Create a new column in `t` called `fare_` that topcodes `fare` to be no more than `3 * t.fare.median()`. That is, anything higher than 3x the median should just be set to 3x the median.</p>\n",
633 |     "\n",
634 |     "Hint: you can use `np.where` to simulate an `if x then y else z` on arrays of data."
635 |    ]
636 |   },
637 |   {
638 |    "cell_type": "code",
639 |    "execution_count": null,
640 |    "metadata": {},
641 |    "outputs": [],
642 |    "source": [
643 |     "# Your solution here\n"
644 |    ]
645 |   },
646 |   {
647 |    "cell_type": "code",
648 |    "execution_count": null,
649 |    "metadata": {},
650 |    "outputs": [],
651 |    "source": [
652 |     "%load solutions/visualize_02a.py"
653 |    ]
654 |   },
655 |   {
656 |    "cell_type": "markdown",
657 |    "metadata": {},
658 |    "source": [
659 |     "Now make the violinplot on fares that we tried above:"
660 |    ]
661 |   },
662 |   {
663 |    "cell_type": "code",
664 |    "execution_count": null,
665 |    "metadata": {},
666 |    "outputs": [],
667 |    "source": [
668 |     "# Your solution here\n"
669 |    ]
670 |   },
671 |   {
672 |    "cell_type": "code",
673 |    "execution_count": null,
674 |    "metadata": {},
675 |    "outputs": [],
676 |    "source": [
677 |     "%load solutions/visualize_02b.py\n"
678 |    ]
679 |   },
680 |   {
681 |    "cell_type": "markdown",
682 |    "metadata": {},
683 |    "source": [
684 |     "Seaborn makes it easy to split by an additional variable, like `sex`."
685 |    ]
686 |   },
687 |   {
688 |    "cell_type": "code",
689 |    "execution_count": null,
690 |    "metadata": {},
691 |    "outputs": [],
692 |    "source": [
693 |     "sns.violinplot(\"class\", \"fare_\", data=t, orient=\"v\",\n",
694 |     "               palette=\"YlGn\", hue='sex', split=True)\n",
695 |     "sns.despine(left=True)\n",
696 |     "plt.ylim(0);\n"
697 |    ]
698 |   },
699 |   {
700 |    "cell_type": "markdown",
701 |    "metadata": {},
702 |    "source": [
703 |     "## Plotting Relationships\n",
704 |     "\n",
705 |     "We've seen summary statistics (like countplot), univariate distributions, and basic relationships between one variable and a categorical variable.\n",
706 |     "\n",
707 |     "Seaborn also provides tools for visualizng bivariate relationships between quantitative variables."
708 |    ]
709 |   },
710 |   {
711 |    "cell_type": "code",
712 |    "execution_count": null,
713 |    "metadata": {},
714 |    "outputs": [],
715 |    "source": [
716 |     "ax = sns.jointplot(\"age\", \"fare_\", data=t, color=\"g\", size=8);"
717 |    ]
718 |   },
719 |   {
720 |    "cell_type": "markdown",
721 |    "metadata": {},
722 |    "source": [
723 |     "## Who Survived?\n",
724 |     "\n",
725 |     "Let's turn to the variable of interest: who survived?"
726 |    ]
727 |   },
728 |   {
729 |    "cell_type": "markdown",
730 |    "metadata": {},
731 |    "source": [
732 |     "<div class=\"alert alert-success\" data-title=\"Who Survived?\">\n",
733 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Who Survived?</h1>\n",
734 |     "</div>\n",
735 |     "\n",
736 |     "<p>Explore the `alive` variable</p>"
737 |    ]
738 |   },
739 |   {
740 |    "cell_type": "markdown",
741 |    "metadata": {},
742 |    "source": [
743 |     "What does the count of alive look like?"
744 |    ]
745 |   },
746 |   {
747 |    "cell_type": "code",
748 |    "execution_count": null,
749 |    "metadata": {},
750 |    "outputs": [],
751 |    "source": []
752 |   },
753 |   {
754 |    "cell_type": "code",
755 |    "execution_count": null,
756 |    "metadata": {},
757 |    "outputs": [],
758 |    "source": [
759 |     "%load solutions/visualize_03.py"
760 |    ]
761 |   },
762 |   {
763 |    "cell_type": "markdown",
764 |    "metadata": {},
765 |    "source": [
766 |     "What's the relationship between class and survived?"
767 |    ]
768 |   },
769 |   {
770 |    "cell_type": "code",
771 |    "execution_count": null,
772 |    "metadata": {},
773 |    "outputs": [],
774 |    "source": []
775 |   },
776 |   {
777 |    "cell_type": "code",
778 |    "execution_count": null,
779 |    "metadata": {},
780 |    "outputs": [],
781 |    "source": [
782 |     "%load solutions/visualize_04.py\n"
783 |    ]
784 |   },
785 |   {
786 |    "cell_type": "markdown",
787 |    "metadata": {},
788 |    "source": [
789 |     "What's the relationship between who and survived"
790 |    ]
791 |   },
792 |   {
793 |    "cell_type": "code",
794 |    "execution_count": null,
795 |    "metadata": {},
796 |    "outputs": [],
797 |    "source": []
798 |   },
799 |   {
800 |    "cell_type": "code",
801 |    "execution_count": null,
802 |    "metadata": {},
803 |    "outputs": [],
804 |    "source": [
805 |     "%load solutions/visualize_05.py\n"
806 |    ]
807 |   },
808 |   {
809 |    "cell_type": "markdown",
810 |    "metadata": {},
811 |    "source": [
812 |     "What's the interaction of `sex` with `class`, when predicting `survived`? Split the `hue` by `sex`.\n",
813 |     "\n",
814 |     "Hint: `class` is a categorical (AKA factor), so use `factorplot`."
815 |    ]
816 |   },
817 |   {
818 |    "cell_type": "code",
819 |    "execution_count": null,
820 |    "metadata": {},
821 |    "outputs": [],
822 |    "source": []
823 |   },
824 |   {
825 |    "cell_type": "code",
826 |    "execution_count": null,
827 |    "metadata": {},
828 |    "outputs": [],
829 |    "source": [
830 |     "%load solutions/visualize_06.py\n"
831 |    ]
832 |   },
833 |   {
834 |    "cell_type": "markdown",
835 |    "metadata": {},
836 |    "source": [
837 |     "How about the interaction of `'who'` with class?"
838 |    ]
839 |   },
840 |   {
841 |    "cell_type": "code",
842 |    "execution_count": null,
843 |    "metadata": {},
844 |    "outputs": [],
845 |    "source": []
846 |   },
847 |   {
848 |    "cell_type": "code",
849 |    "execution_count": null,
850 |    "metadata": {},
851 |    "outputs": [],
852 |    "source": [
853 |     "%load solutions/visualize_07.py\n"
854 |    ]
855 |   },
856 |   {
857 |    "cell_type": "markdown",
858 |    "metadata": {},
859 |    "source": [
860 |     "What's the relationship between `survived` and `adult_male`?"
861 |    ]
862 |   },
863 |   {
864 |    "cell_type": "code",
865 |    "execution_count": null,
866 |    "metadata": {},
867 |    "outputs": [],
868 |    "source": []
869 |   },
870 |   {
871 |    "cell_type": "code",
872 |    "execution_count": null,
873 |    "metadata": {},
874 |    "outputs": [],
875 |    "source": [
876 |     "%load solutions/visualize_08.py\n"
877 |    ]
878 |   },
879 |   {
880 |    "cell_type": "markdown",
881 |    "metadata": {},
882 |    "source": [
883 |     "## Regression plots\n",
884 |     "\n",
885 |     "You can plot relationships with best fit lines (and bootstrapped standard errors) using `lmplot`."
886 |    ]
887 |   },
888 |   {
889 |    "cell_type": "code",
890 |    "execution_count": null,
891 |    "metadata": {},
892 |    "outputs": [],
893 |    "source": [
894 |     "sns.lmplot(\"age\", \"survived\", t, logistic=True, y_jitter=.05);"
895 |    ]
896 |   },
897 |   {
898 |    "cell_type": "markdown",
899 |    "metadata": {},
900 |    "source": [
901 |     "Since we have a binary target (`survived`), we use `logistic`. It can be more informative to bin the x variable."
902 |    ]
903 |   },
904 |   {
905 |    "cell_type": "code",
906 |    "execution_count": null,
907 |    "metadata": {},
908 |    "outputs": [],
909 |    "source": [
910 |     "bins = [15, 30, 45, 60]\n",
911 |     "sns.lmplot(\"age\", \"survived\", t, logistic=True,\n",
912 |     "           x_bins=bins);"
913 |    ]
914 |   },
915 |   {
916 |    "cell_type": "markdown",
917 |    "metadata": {},
918 |    "source": [
919 |     "<div class=\"alert alert-success\" data-title=\"Survived by gender\">\n",
920 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Survived by gender</h1>\n",
921 |     "</div>\n",
922 |     "\n",
923 |     "\n",
924 |     "<p>Can you split that relationship by `sex`?</p>"
925 |    ]
926 |   },
927 |   {
928 |    "cell_type": "code",
929 |    "execution_count": null,
930 |    "metadata": {},
931 |    "outputs": [],
932 |    "source": []
933 |   },
934 |   {
935 |    "cell_type": "code",
936 |    "execution_count": null,
937 |    "metadata": {},
938 |    "outputs": [],
939 |    "source": [
940 |     "%load solutions/visualize_09.py\n"
941 |    ]
942 |   },
943 |   {
944 |    "cell_type": "markdown",
945 |    "metadata": {},
946 |    "source": [
947 |     "How about class?"
948 |    ]
949 |   },
950 |   {
951 |    "cell_type": "code",
952 |    "execution_count": null,
953 |    "metadata": {},
954 |    "outputs": [],
955 |    "source": []
956 |   },
957 |   {
958 |    "cell_type": "code",
959 |    "execution_count": null,
960 |    "metadata": {},
961 |    "outputs": [],
962 |    "source": [
963 |     "%load solutions/visualize_10.py\n"
964 |    ]
965 |   },
966 |   {
967 |    "cell_type": "markdown",
968 |    "metadata": {},
969 |    "source": [
970 |     "## Seaborn Summary\n",
971 |     "\n",
972 |     "- Many small functions with a consistent API (`x`, `y`, `data`, etc.)\n",
973 |     "- `Grid`s offer an abstraction for (relatively) easy faceting"
974 |    ]
975 |   }
976 |  ],
977 |  "metadata": {
978 |   "kernelspec": {
979 |    "display_name": "Python 3",
980 |    "language": "python",
981 |    "name": "python3"
982 |   },
983 |   "language_info": {
984 |    "codemirror_mode": {
985 |     "name": "ipython",
986 |     "version": 3
987 |    },
988 |    "file_extension": ".py",
989 |    "mimetype": "text/x-python",
990 |    "name": "python",
991 |    "nbconvert_exporter": "python",
992 |    "pygments_lexer": "ipython3",
993 |    "version": "3.6.1"
994 |   }
995 |  },
996 |  "nbformat": 4,
997 |  "nbformat_minor": 1
998 | }
999 | 


--------------------------------------------------------------------------------
/notebooks/10-Iterators.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Iterators"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "## Topics\n",
 15 |     "\n",
 16 |     "- Stream larger-than-memory data through a pipeline\n",
 17 |     "- Composable thanks to the iterator protocol"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "My favorite \"feature\" of pandas is that it's written in Python.\n",
 25 |     "Python has great language-level features for handling streams of data\n",
 26 |     "that may not fit in memory.\n",
 27 |     "This can be a useful pre-processing step to reading the data into a DataFrame or\n",
 28 |     "NumPy array.\n",
 29 |     "You can get quite far using just the builtin data structures as David Beazley proves in [this PyData keynote](https://www.youtube.com/watch?v=lyDLAutA88s)."
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": null,
 35 |    "metadata": {},
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "import os\n",
 39 |     "import gzip\n",
 40 |     "from itertools import islice, takewhile\n",
 41 |     "\n",
 42 |     "import numpy as np\n",
 43 |     "import pandas as pd\n",
 44 |     "import seaborn as sns\n",
 45 |     "import dask.dataframe as dd\n",
 46 |     "from toolz import partition_all, partitionby\n",
 47 |     "import matplotlib.pyplot as plt"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": null,
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "%matplotlib inline"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": null,
 62 |    "metadata": {},
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "pd.options.display.max_rows = 10\n",
 66 |     "sns.set(context='talk')\n",
 67 |     "plt.style.use(\"default\")"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "markdown",
 72 |    "metadata": {},
 73 |    "source": [
 74 |     "## Beer Reviews Dataset\n",
 75 |     "\n",
 76 |     "- A review is a list of lines\n",
 77 |     "- Each review line is formated like `meta/field: value`\n",
 78 |     "- Reviews are separated by blank lines (i.e. the line is just `'\\n'`)\n"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "markdown",
 83 |    "metadata": {},
 84 |    "source": [
 85 |     "Stanford has a [dataset on beer reviews](https://snap.stanford.edu/data/web-BeerAdvocate.html). The raw file is too large for me to include, but I split off a couple subsets for us to work with.\n",
 86 |     "\n",
 87 |     "Pandas can't read this file natively, but we have Python!\n",
 88 |     "We'll use Python to parse the raw file and tranform it into a tabular format."
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": null,
 94 |    "metadata": {},
 95 |    "outputs": [],
 96 |    "source": [
 97 |     "with gzip.open(\"data/beer-raw-small.txt.gz\", \"r\") as f:\n",
 98 |     "    print(f.read(1500).decode('utf-8'))"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "markdown",
103 |    "metadata": {},
104 |    "source": [
105 |     "The full compressed raw dataset is about 500MB, so reading it all into memory might not be pleasent (we're working with a small subset that would fit in memory, but pretend it didn't).\n",
106 |     "Fortunately, Python's iterator protocol and generators make dealing with large streams of data pleasent."
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "markdown",
111 |    "metadata": {},
112 |    "source": [
113 |     "## Developing a solution\n",
114 |     "\n",
115 |     "Let's build a solution together. I'll provide some guidance as we go along."
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": null,
121 |    "metadata": {},
122 |    "outputs": [],
123 |    "source": [
124 |     "# Get a handle to the data\n",
125 |     "f = gzip.open(\"data/beer-raw-small.txt.gz\", \"rt\")"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "markdown",
130 |    "metadata": {},
131 |    "source": [
132 |     "Usually you'd use a context manager like `with gzip.open(...) as f`, but for debugging, it's OK to do it this way."
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "markdown",
137 |    "metadata": {},
138 |    "source": [
139 |     "## Parsing Tasks\n",
140 |     "\n",
141 |     "1. split the raw text stream into individual reviews\n",
142 |     "2. transform each individual review into a data container\n",
143 |     "3. combine a chunk of transformed individual reviews into a collection\n",
144 |     "4. store the chunk to disk"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "markdown",
149 |    "metadata": {},
150 |    "source": [
151 |     "Let's grab the first review using [`takewhile`](https://docs.python.org/3/library/itertools.html#itertools.takewhile) till the first `'\\n'`.\n",
152 |     "`takewhile` scans a stream, returning each item (line) until it hits the sentinal value it's looking for."
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "code",
157 |    "execution_count": null,
158 |    "metadata": {},
159 |    "outputs": [],
160 |    "source": [
161 |     "from itertools import takewhile"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": null,
167 |    "metadata": {},
168 |    "outputs": [],
169 |    "source": [
170 |     "f.seek(0);  # make the cell idempotent\n",
171 |     "first = list(takewhile(lambda x: x != '\\n', f))\n",
172 |     "first"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "markdown",
177 |    "metadata": {},
178 |    "source": [
179 |     "<div class=\"alert alert-success\" data-title=\"Format Review\">\n",
180 |     "  <h1><i class=\"fa fa-tasks\" aria-hidden=\"true\"></i> Exercise: Format Review</h1>\n",
181 |     "</div>\n",
182 |     "<p>Write a function `format_review` that converts an item like `first` into a dict</p>"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "markdown",
187 |    "metadata": {},
188 |    "source": [
189 |     "It will have one entry per line, where the are the stuff to the left of the colon and the values are the stuff to the right.\n",
190 |     "For example, the first line would be\n",
191 |     "\n",
192 |     "`'beer/name: Sausa Weizen\\n',` => `'beer/name': 'Sausa Weizen'`\n",
193 |     "\n",
194 |     "Make sure to clean up the line endings too.\n",
195 |     "\n",
196 |     "- Hint: Check out the [python string methods](https://docs.python.org/3/library/stdtypes.html#string-methods)"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "markdown",
201 |    "metadata": {},
202 |    "source": [
203 |     "You can check your function against `expected` by evaluating the next cell.\n",
204 |     "If you get a failure, adjust your `format_review` until it passes."
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": null,
210 |    "metadata": {},
211 |    "outputs": [],
212 |    "source": [
213 |     "import unittest\n",
214 |     "from typing import List, Dict\n",
215 |     "\n",
216 |     "f.seek(0);  # make the cell idempotent\n",
217 |     "review = list(takewhile(lambda x: x != '\\n', f))\n",
218 |     "\n",
219 |     "\n",
220 |     "def format_review(review: List[str]) -> Dict[str, str]:\n",
221 |     "    \"\"\"Your code goes below\"\"\"\n",
222 |     "    \n",
223 |     "\n",
224 |     "class TestFormat(unittest.TestCase):\n",
225 |     "    maxDiff = None\n",
226 |     "\n",
227 |     "    def test_format_review(self):\n",
228 |     "        result = format_review(review)\n",
229 |     "        expected = {\n",
230 |     "            'beer/ABV': '5.00',\n",
231 |     "            'beer/beerId': '47986',\n",
232 |     "            'beer/brewerId': '10325',\n",
233 |     "            'beer/name': 'Sausa Weizen',\n",
234 |     "            'beer/style': 'Hefeweizen',\n",
235 |     "            'review/appearance': '2.5',\n",
236 |     "            'review/aroma': '2',\n",
237 |     "            'review/overall': '1.5',\n",
238 |     "            'review/palate': '1.5',\n",
239 |     "            'review/profileName': 'stcules',\n",
240 |     "            'review/taste': '1.5',\n",
241 |     "            'review/text': 'A lot of foam. But a lot.\\tIn the smell some banana, and then lactic and tart. Not a good start.\\tQuite dark orange in color, with a lively carbonation (now visible, under the foam).\\tAgain tending to lactic sourness.\\tSame for the taste. With some yeast and banana.\\t\\t',\n",
242 |     "            'review/time': '1234817823'\n",
243 |     "        }\n",
244 |     "        self.assertEqual(result, expected)\n",
245 |     "\n",
246 |     "suite = unittest.TestLoader().loadTestsFromModule(TestFormat())\n",
247 |     "unittest.TextTestRunner().run(suite)"
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "code",
252 |    "execution_count": null,
253 |    "metadata": {},
254 |    "outputs": [],
255 |    "source": [
256 |     "%load solutions/groupby_format_review.py"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "markdown",
261 |    "metadata": {},
262 |    "source": [
263 |     "Notice that optional argument to split, which controls the number of splits made; If a review text had contained a literal `': '`, we'd be in trouble since it'd get split again.\n",
264 |     "\n",
265 |     "Make sure you executed the above solution cell twice (first to load, second to execute) as we'll be using that `format_review` function down below"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "markdown",
270 |    "metadata": {},
271 |    "source": [
272 |     "## To a DataFrame\n",
273 |     "\n",
274 |     "Assuming we've processed many reviews into a list, we'll then build up a DataFrame."
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": null,
280 |    "metadata": {},
281 |    "outputs": [],
282 |    "source": [
283 |     "r = [format_review(first)]  # imagine a list of many reviews\n",
284 |     "\n",
285 |     "col_names = {\n",
286 |     "    'beer/ABV': 'abv',\n",
287 |     "    'beer/beerId': 'beer_id',\n",
288 |     "    'beer/brewerId': 'brewer_id',\n",
289 |     "    'beer/name': 'beer_name',\n",
290 |     "    'beer/style': 'beer_style',\n",
291 |     "    'review/appearance': 'review_appearance',\n",
292 |     "    'review/aroma': 'review_aroma',\n",
293 |     "    'review/overall': 'review_overall',\n",
294 |     "    'review/palate': 'review_palate',\n",
295 |     "    'review/profileName': 'profile_name',\n",
296 |     "    'review/taste': 'review_taste',\n",
297 |     "    'review/text': 'text',\n",
298 |     "    'review/time': 'time'\n",
299 |     "}\n",
300 |     "df = pd.DataFrame(r)\n",
301 |     "numeric = ['abv', 'review_appearance', 'review_aroma',\n",
302 |     "           'review_overall', 'review_palate', 'review_taste']\n",
303 |     "df = (df.rename(columns=col_names)\n",
304 |     "        .replace('', np.nan))\n",
305 |     "df[numeric] = df[numeric].astype(float)\n",
306 |     "df['time'] = pd.to_datetime(df.time.astype(int), unit='s')\n",
307 |     "df"
308 |    ]
309 |   },
310 |   {
311 |    "cell_type": "markdown",
312 |    "metadata": {},
313 |    "source": [
314 |     "Again, writing that as a function:"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": null,
320 |    "metadata": {},
321 |    "outputs": [],
322 |    "source": [
323 |     "def as_dataframe(reviews):\n",
324 |     "    col_names = {\n",
325 |     "        'beer/ABV': 'abv',\n",
326 |     "        'beer/beerId': 'beer_id',\n",
327 |     "        'beer/brewerId': 'brewer_id',\n",
328 |     "        'beer/name': 'beer_name',\n",
329 |     "        'beer/style': 'beer_style',\n",
330 |     "        'review/appearance': 'review_appearance',\n",
331 |     "        'review/aroma': 'review_aroma',\n",
332 |     "        'review/overall': 'review_overall',\n",
333 |     "        'review/palate': 'review_palate',\n",
334 |     "        'review/profileName': 'profile_name',\n",
335 |     "        'review/taste': 'review_taste',\n",
336 |     "        'review/text': 'text',\n",
337 |     "        'review/time': 'time'\n",
338 |     "    }\n",
339 |     "    df = pd.DataFrame(list(reviews))\n",
340 |     "    numeric = ['abv', 'review_appearance', 'review_aroma',\n",
341 |     "               'review_overall', 'review_palate', 'review_taste']\n",
342 |     "    df = (df.rename(columns=col_names)\n",
343 |     "            .replace('', np.nan))\n",
344 |     "    df[numeric] = df[numeric].astype(float)\n",
345 |     "    df['time'] = pd.to_datetime(df.time.astype(int), unit='s')\n",
346 |     "    return df"
347 |    ]
348 |   },
349 |   {
350 |    "cell_type": "markdown",
351 |    "metadata": {},
352 |    "source": [
353 |     "## Full pipeline\n",
354 |     "\n",
355 |     "1. `file -> review_lines : List[str]`\n",
356 |     "2. `review_lines -> reviews : Dict[str, str]`\n",
357 |     "3. `reviews -> DataFrames`\n",
358 |     "4. `DataFrames -> CSV`"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "markdown",
363 |    "metadata": {},
364 |    "source": [
365 |     "The full pipeline would look something like:"
366 |    ]
367 |   },
368 |   {
369 |    "cell_type": "code",
370 |    "execution_count": null,
371 |    "metadata": {},
372 |    "outputs": [],
373 |    "source": [
374 |     "from toolz import partition_all, partitionby\n",
375 |     "\n",
376 |     "\n",
377 |     "BATCH_SIZE = 100  # Number of reviews to process per chunk\n",
378 |     "                  # Intentionally small for demostration    \n",
379 |     "\n",
380 |     "\n",
381 |     "with gzip.open(\"data/beer-raw-small.txt.gz\", \"rt\") as f:\n",
382 |     "\n",
383 |     "    # Filter out a null byte at the end\n",
384 |     "    lines = (x for x in f if not x.startswith('\\x00'))\n",
385 |     "    \n",
386 |     "    review_lines_and_newlines = partitionby(lambda x: x == '\\n', lines)\n",
387 |     "    # that goes [review, \\n, review, \\n, ...]\n",
388 |     "    # so filter out the newlines\n",
389 |     "    review_lines = filter(lambda x: x != ('\\n',), review_lines_and_newlines)\n",
390 |     "    \n",
391 |     "    # generator expression to go from List[str] -> Dict[str, str]\n",
392 |     "    reviews = (format_review(x) for x in review_lines)\n",
393 |     "    \n",
394 |     "    # `reviews` yields one dict per review.\n",
395 |     "    # Won't fit in memory, so do `BATCH_SIZE` per chunk\n",
396 |     "    chunks = partition_all(BATCH_SIZE, reviews)\n",
397 |     "    dfs = (as_dataframe(chunk) for chunk in chunks)\n",
398 |     "    os.makedirs(\"data/beer/\", exist_ok=True)\n",
399 |     "\n",
400 |     "    # the first time we read from disk\n",
401 |     "    for i, df in enumerate(dfs):\n",
402 |     "        df.to_csv(\"data/beer/chunk_%s.csv.gz\" % i, index=False,\n",
403 |     "                  compression=\"gzip\")\n",
404 |     "        print(i, end='\\r')"
405 |    ]
406 |   },
407 |   {
408 |    "cell_type": "markdown",
409 |    "metadata": {},
410 |    "source": [
411 |     "\n",
412 |     "This runs comfortably in memory. At any given time, we only have `BATCH_SIZE` reviews in memory."
413 |    ]
414 |   }
415 |  ],
416 |  "metadata": {
417 |   "kernelspec": {
418 |    "display_name": "Python 3",
419 |    "language": "python",
420 |    "name": "python3"
421 |   },
422 |   "language_info": {
423 |    "codemirror_mode": {
424 |     "name": "ipython",
425 |     "version": 3
426 |    },
427 |    "file_extension": ".py",
428 |    "mimetype": "text/x-python",
429 |    "name": "python",
430 |    "nbconvert_exporter": "python",
431 |    "pygments_lexer": "ipython3",
432 |    "version": "3.5.5"
433 |   }
434 |  },
435 |  "nbformat": 4,
436 |  "nbformat_minor": 1
437 | }
438 | 


--------------------------------------------------------------------------------
/notebooks/data/beer-raw-small.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deniederhut/Pandas-Tutorial-SciPyConf-2018/d30a9aac47b4a16b6170da77f982fe8592a4eb73/notebooks/data/beer-raw-small.txt.gz


--------------------------------------------------------------------------------
/notebooks/data/cpi.csv:
--------------------------------------------------------------------------------
  1 | DATE,CPIAUCSL
  2 | 1947-01-01,21.48
  3 | 1947-02-01,21.62
  4 | 1947-03-01,22.0
  5 | 1947-04-01,22.0
  6 | 1947-05-01,21.95
  7 | 1947-06-01,22.08
  8 | 1947-07-01,22.23
  9 | 1947-08-01,22.4
 10 | 1947-09-01,22.84
 11 | 1947-10-01,22.91
 12 | 1947-11-01,23.06
 13 | 1947-12-01,23.41
 14 | 1948-01-01,23.68
 15 | 1948-02-01,23.67
 16 | 1948-03-01,23.5
 17 | 1948-04-01,23.82
 18 | 1948-05-01,24.01
 19 | 1948-06-01,24.15
 20 | 1948-07-01,24.4
 21 | 1948-08-01,24.43
 22 | 1948-09-01,24.36
 23 | 1948-10-01,24.31
 24 | 1948-11-01,24.16
 25 | 1948-12-01,24.05
 26 | 1949-01-01,24.01
 27 | 1949-02-01,23.91
 28 | 1949-03-01,23.91
 29 | 1949-04-01,23.92
 30 | 1949-05-01,23.91
 31 | 1949-06-01,23.92
 32 | 1949-07-01,23.7
 33 | 1949-08-01,23.7
 34 | 1949-09-01,23.75
 35 | 1949-10-01,23.67
 36 | 1949-11-01,23.7
 37 | 1949-12-01,23.61
 38 | 1950-01-01,23.51
 39 | 1950-02-01,23.61
 40 | 1950-03-01,23.64
 41 | 1950-04-01,23.65
 42 | 1950-05-01,23.77
 43 | 1950-06-01,23.88
 44 | 1950-07-01,24.07
 45 | 1950-08-01,24.2
 46 | 1950-09-01,24.34
 47 | 1950-10-01,24.5
 48 | 1950-11-01,24.6
 49 | 1950-12-01,24.98
 50 | 1951-01-01,25.38
 51 | 1951-02-01,25.83
 52 | 1951-03-01,25.88
 53 | 1951-04-01,25.92
 54 | 1951-05-01,25.99
 55 | 1951-06-01,25.93
 56 | 1951-07-01,25.91
 57 | 1951-08-01,25.86
 58 | 1951-09-01,26.03
 59 | 1951-10-01,26.16
 60 | 1951-11-01,26.32
 61 | 1951-12-01,26.47
 62 | 1952-01-01,26.45
 63 | 1952-02-01,26.41
 64 | 1952-03-01,26.39
 65 | 1952-04-01,26.46
 66 | 1952-05-01,26.47
 67 | 1952-06-01,26.53
 68 | 1952-07-01,26.68
 69 | 1952-08-01,26.69
 70 | 1952-09-01,26.63
 71 | 1952-10-01,26.69
 72 | 1952-11-01,26.69
 73 | 1952-12-01,26.71
 74 | 1953-01-01,26.64
 75 | 1953-02-01,26.59
 76 | 1953-03-01,26.63
 77 | 1953-04-01,26.69
 78 | 1953-05-01,26.7
 79 | 1953-06-01,26.77
 80 | 1953-07-01,26.79
 81 | 1953-08-01,26.85
 82 | 1953-09-01,26.89
 83 | 1953-10-01,26.95
 84 | 1953-11-01,26.85
 85 | 1953-12-01,26.87
 86 | 1954-01-01,26.94
 87 | 1954-02-01,26.99
 88 | 1954-03-01,26.93
 89 | 1954-04-01,26.86
 90 | 1954-05-01,26.93
 91 | 1954-06-01,26.94
 92 | 1954-07-01,26.86
 93 | 1954-08-01,26.85
 94 | 1954-09-01,26.81
 95 | 1954-10-01,26.72
 96 | 1954-11-01,26.78
 97 | 1954-12-01,26.77
 98 | 1955-01-01,26.77
 99 | 1955-02-01,26.82
100 | 1955-03-01,26.79
101 | 1955-04-01,26.79
102 | 1955-05-01,26.77
103 | 1955-06-01,26.71
104 | 1955-07-01,26.76
105 | 1955-08-01,26.72
106 | 1955-09-01,26.85
107 | 1955-10-01,26.82
108 | 1955-11-01,26.88
109 | 1955-12-01,26.87
110 | 1956-01-01,26.83
111 | 1956-02-01,26.86
112 | 1956-03-01,26.89
113 | 1956-04-01,26.93
114 | 1956-05-01,27.03
115 | 1956-06-01,27.15
116 | 1956-07-01,27.29
117 | 1956-08-01,27.31
118 | 1956-09-01,27.35
119 | 1956-10-01,27.51
120 | 1956-11-01,27.51
121 | 1956-12-01,27.63
122 | 1957-01-01,27.67
123 | 1957-02-01,27.8
124 | 1957-03-01,27.86
125 | 1957-04-01,27.93
126 | 1957-05-01,28.0
127 | 1957-06-01,28.11
128 | 1957-07-01,28.19
129 | 1957-08-01,28.28
130 | 1957-09-01,28.32
131 | 1957-10-01,28.32
132 | 1957-11-01,28.41
133 | 1957-12-01,28.47
134 | 1958-01-01,28.64
135 | 1958-02-01,28.7
136 | 1958-03-01,28.87
137 | 1958-04-01,28.94
138 | 1958-05-01,28.94
139 | 1958-06-01,28.91
140 | 1958-07-01,28.89
141 | 1958-08-01,28.94
142 | 1958-09-01,28.91
143 | 1958-10-01,28.91
144 | 1958-11-01,28.95
145 | 1958-12-01,28.97
146 | 1959-01-01,29.01
147 | 1959-02-01,29.0
148 | 1959-03-01,28.97
149 | 1959-04-01,28.98
150 | 1959-05-01,29.04
151 | 1959-06-01,29.11
152 | 1959-07-01,29.15
153 | 1959-08-01,29.18
154 | 1959-09-01,29.25
155 | 1959-10-01,29.35
156 | 1959-11-01,29.35
157 | 1959-12-01,29.41
158 | 1960-01-01,29.37
159 | 1960-02-01,29.41
160 | 1960-03-01,29.41
161 | 1960-04-01,29.54
162 | 1960-05-01,29.57
163 | 1960-06-01,29.61
164 | 1960-07-01,29.55
165 | 1960-08-01,29.61
166 | 1960-09-01,29.61
167 | 1960-10-01,29.75
168 | 1960-11-01,29.78
169 | 1960-12-01,29.81
170 | 1961-01-01,29.84
171 | 1961-02-01,29.84
172 | 1961-03-01,29.84
173 | 1961-04-01,29.81
174 | 1961-05-01,29.84
175 | 1961-06-01,29.84
176 | 1961-07-01,29.92
177 | 1961-08-01,29.94
178 | 1961-09-01,29.98
179 | 1961-10-01,29.98
180 | 1961-11-01,29.98
181 | 1961-12-01,30.01
182 | 1962-01-01,30.04
183 | 1962-02-01,30.11
184 | 1962-03-01,30.17
185 | 1962-04-01,30.21
186 | 1962-05-01,30.24
187 | 1962-06-01,30.21
188 | 1962-07-01,30.22
189 | 1962-08-01,30.28
190 | 1962-09-01,30.42
191 | 1962-10-01,30.38
192 | 1962-11-01,30.38
193 | 1962-12-01,30.38
194 | 1963-01-01,30.44
195 | 1963-02-01,30.48
196 | 1963-03-01,30.51
197 | 1963-04-01,30.48
198 | 1963-05-01,30.51
199 | 1963-06-01,30.61
200 | 1963-07-01,30.69
201 | 1963-08-01,30.75
202 | 1963-09-01,30.72
203 | 1963-10-01,30.75
204 | 1963-11-01,30.78
205 | 1963-12-01,30.88
206 | 1964-01-01,30.94
207 | 1964-02-01,30.91
208 | 1964-03-01,30.94
209 | 1964-04-01,30.95
210 | 1964-05-01,30.98
211 | 1964-06-01,31.01
212 | 1964-07-01,31.02
213 | 1964-08-01,31.05
214 | 1964-09-01,31.08
215 | 1964-10-01,31.12
216 | 1964-11-01,31.21
217 | 1964-12-01,31.25
218 | 1965-01-01,31.28
219 | 1965-02-01,31.28
220 | 1965-03-01,31.31
221 | 1965-04-01,31.38
222 | 1965-05-01,31.48
223 | 1965-06-01,31.61
224 | 1965-07-01,31.58
225 | 1965-08-01,31.55
226 | 1965-09-01,31.62
227 | 1965-10-01,31.65
228 | 1965-11-01,31.75
229 | 1965-12-01,31.85
230 | 1966-01-01,31.88
231 | 1966-02-01,32.08
232 | 1966-03-01,32.18
233 | 1966-04-01,32.28
234 | 1966-05-01,32.35
235 | 1966-06-01,32.38
236 | 1966-07-01,32.45
237 | 1966-08-01,32.65
238 | 1966-09-01,32.75
239 | 1966-10-01,32.85
240 | 1966-11-01,32.88
241 | 1966-12-01,32.92
242 | 1967-01-01,32.9
243 | 1967-02-01,33.0
244 | 1967-03-01,33.0
245 | 1967-04-01,33.1
246 | 1967-05-01,33.1
247 | 1967-06-01,33.3
248 | 1967-07-01,33.4
249 | 1967-08-01,33.5
250 | 1967-09-01,33.6
251 | 1967-10-01,33.7
252 | 1967-11-01,33.9
253 | 1967-12-01,34.0
254 | 1968-01-01,34.1
255 | 1968-02-01,34.2
256 | 1968-03-01,34.3
257 | 1968-04-01,34.4
258 | 1968-05-01,34.5
259 | 1968-06-01,34.7
260 | 1968-07-01,34.9
261 | 1968-08-01,35.0
262 | 1968-09-01,35.1
263 | 1968-10-01,35.3
264 | 1968-11-01,35.4
265 | 1968-12-01,35.6
266 | 1969-01-01,35.7
267 | 1969-02-01,35.8
268 | 1969-03-01,36.1
269 | 1969-04-01,36.3
270 | 1969-05-01,36.4
271 | 1969-06-01,36.6
272 | 1969-07-01,36.8
273 | 1969-08-01,36.9
274 | 1969-09-01,37.1
275 | 1969-10-01,37.3
276 | 1969-11-01,37.5
277 | 1969-12-01,37.7
278 | 1970-01-01,37.9
279 | 1970-02-01,38.1
280 | 1970-03-01,38.3
281 | 1970-04-01,38.5
282 | 1970-05-01,38.6
283 | 1970-06-01,38.8
284 | 1970-07-01,38.9
285 | 1970-08-01,39.0
286 | 1970-09-01,39.2
287 | 1970-10-01,39.4
288 | 1970-11-01,39.6
289 | 1970-12-01,39.8
290 | 1971-01-01,39.9
291 | 1971-02-01,39.9
292 | 1971-03-01,40.0
293 | 1971-04-01,40.1
294 | 1971-05-01,40.3
295 | 1971-06-01,40.5
296 | 1971-07-01,40.6
297 | 1971-08-01,40.7
298 | 1971-09-01,40.8
299 | 1971-10-01,40.9
300 | 1971-11-01,41.0
301 | 1971-12-01,41.1
302 | 1972-01-01,41.2
303 | 1972-02-01,41.4
304 | 1972-03-01,41.4
305 | 1972-04-01,41.5
306 | 1972-05-01,41.6
307 | 1972-06-01,41.7
308 | 1972-07-01,41.8
309 | 1972-08-01,41.9
310 | 1972-09-01,42.1
311 | 1972-10-01,42.2
312 | 1972-11-01,42.4
313 | 1972-12-01,42.5
314 | 1973-01-01,42.7
315 | 1973-02-01,43.0
316 | 1973-03-01,43.4
317 | 1973-04-01,43.7
318 | 1973-05-01,43.9
319 | 1973-06-01,44.2
320 | 1973-07-01,44.2
321 | 1973-08-01,45.0
322 | 1973-09-01,45.2
323 | 1973-10-01,45.6
324 | 1973-11-01,45.9
325 | 1973-12-01,46.3
326 | 1974-01-01,46.8
327 | 1974-02-01,47.3
328 | 1974-03-01,47.8
329 | 1974-04-01,48.1
330 | 1974-05-01,48.6
331 | 1974-06-01,49.0
332 | 1974-07-01,49.3
333 | 1974-08-01,49.9
334 | 1974-09-01,50.6
335 | 1974-10-01,51.0
336 | 1974-11-01,51.5
337 | 1974-12-01,51.9
338 | 1975-01-01,52.3
339 | 1975-02-01,52.6
340 | 1975-03-01,52.8
341 | 1975-04-01,53.0
342 | 1975-05-01,53.1
343 | 1975-06-01,53.5
344 | 1975-07-01,54.0
345 | 1975-08-01,54.2
346 | 1975-09-01,54.6
347 | 1975-10-01,54.9
348 | 1975-11-01,55.3
349 | 1975-12-01,55.6
350 | 1976-01-01,55.8
351 | 1976-02-01,55.9
352 | 1976-03-01,56.0
353 | 1976-04-01,56.1
354 | 1976-05-01,56.4
355 | 1976-06-01,56.7
356 | 1976-07-01,57.0
357 | 1976-08-01,57.3
358 | 1976-09-01,57.6
359 | 1976-10-01,57.9
360 | 1976-11-01,58.1
361 | 1976-12-01,58.4
362 | 1977-01-01,58.7
363 | 1977-02-01,59.3
364 | 1977-03-01,59.6
365 | 1977-04-01,60.0
366 | 1977-05-01,60.2
367 | 1977-06-01,60.5
368 | 1977-07-01,60.8
369 | 1977-08-01,61.1
370 | 1977-09-01,61.3
371 | 1977-10-01,61.6
372 | 1977-11-01,62.0
373 | 1977-12-01,62.3
374 | 1978-01-01,62.7
375 | 1978-02-01,63.0
376 | 1978-03-01,63.4
377 | 1978-04-01,63.9
378 | 1978-05-01,64.5
379 | 1978-06-01,65.0
380 | 1978-07-01,65.5
381 | 1978-08-01,65.9
382 | 1978-09-01,66.5
383 | 1978-10-01,67.1
384 | 1978-11-01,67.5
385 | 1978-12-01,67.9
386 | 1979-01-01,68.5
387 | 1979-02-01,69.2
388 | 1979-03-01,69.9
389 | 1979-04-01,70.6
390 | 1979-05-01,71.4
391 | 1979-06-01,72.2
392 | 1979-07-01,73.0
393 | 1979-08-01,73.7
394 | 1979-09-01,74.4
395 | 1979-10-01,75.2
396 | 1979-11-01,76.0
397 | 1979-12-01,76.9
398 | 1980-01-01,78.0
399 | 1980-02-01,79.0
400 | 1980-03-01,80.1
401 | 1980-04-01,80.9
402 | 1980-05-01,81.7
403 | 1980-06-01,82.5
404 | 1980-07-01,82.6
405 | 1980-08-01,83.2
406 | 1980-09-01,83.9
407 | 1980-10-01,84.7
408 | 1980-11-01,85.6
409 | 1980-12-01,86.4
410 | 1981-01-01,87.2
411 | 1981-02-01,88.0
412 | 1981-03-01,88.6
413 | 1981-04-01,89.1
414 | 1981-05-01,89.7
415 | 1981-06-01,90.5
416 | 1981-07-01,91.5
417 | 1981-08-01,92.2
418 | 1981-09-01,93.1
419 | 1981-10-01,93.4
420 | 1981-11-01,93.8
421 | 1981-12-01,94.1
422 | 1982-01-01,94.4
423 | 1982-02-01,94.7
424 | 1982-03-01,94.7
425 | 1982-04-01,95.0
426 | 1982-05-01,95.9
427 | 1982-06-01,97.0
428 | 1982-07-01,97.5
429 | 1982-08-01,97.7
430 | 1982-09-01,97.7
431 | 1982-10-01,98.1
432 | 1982-11-01,98.0
433 | 1982-12-01,97.7
434 | 1983-01-01,97.9
435 | 1983-02-01,98.0
436 | 1983-03-01,98.1
437 | 1983-04-01,98.8
438 | 1983-05-01,99.2
439 | 1983-06-01,99.4
440 | 1983-07-01,99.8
441 | 1983-08-01,100.1
442 | 1983-09-01,100.4
443 | 1983-10-01,100.8
444 | 1983-11-01,101.1
445 | 1983-12-01,101.4
446 | 1984-01-01,102.1
447 | 1984-02-01,102.6
448 | 1984-03-01,102.9
449 | 1984-04-01,103.3
450 | 1984-05-01,103.5
451 | 1984-06-01,103.7
452 | 1984-07-01,104.1
453 | 1984-08-01,104.4
454 | 1984-09-01,104.7
455 | 1984-10-01,105.1
456 | 1984-11-01,105.3
457 | 1984-12-01,105.5
458 | 1985-01-01,105.7
459 | 1985-02-01,106.3
460 | 1985-03-01,106.8
461 | 1985-04-01,107.0
462 | 1985-05-01,107.2
463 | 1985-06-01,107.5
464 | 1985-07-01,107.7
465 | 1985-08-01,107.9
466 | 1985-09-01,108.1
467 | 1985-10-01,108.5
468 | 1985-11-01,109.0
469 | 1985-12-01,109.5
470 | 1986-01-01,109.9
471 | 1986-02-01,109.7
472 | 1986-03-01,109.1
473 | 1986-04-01,108.7
474 | 1986-05-01,109.0
475 | 1986-06-01,109.4
476 | 1986-07-01,109.5
477 | 1986-08-01,109.6
478 | 1986-09-01,110.0
479 | 1986-10-01,110.2
480 | 1986-11-01,110.4
481 | 1986-12-01,110.8
482 | 1987-01-01,111.4
483 | 1987-02-01,111.8
484 | 1987-03-01,112.2
485 | 1987-04-01,112.7
486 | 1987-05-01,113.0
487 | 1987-06-01,113.5
488 | 1987-07-01,113.8
489 | 1987-08-01,114.3
490 | 1987-09-01,114.7
491 | 1987-10-01,115.0
492 | 1987-11-01,115.4
493 | 1987-12-01,115.6
494 | 1988-01-01,116.0
495 | 1988-02-01,116.2
496 | 1988-03-01,116.5
497 | 1988-04-01,117.2
498 | 1988-05-01,117.5
499 | 1988-06-01,118.0
500 | 1988-07-01,118.5
501 | 1988-08-01,119.0
502 | 1988-09-01,119.5
503 | 1988-10-01,119.9
504 | 1988-11-01,120.3
505 | 1988-12-01,120.7
506 | 1989-01-01,121.2
507 | 1989-02-01,121.6
508 | 1989-03-01,122.2
509 | 1989-04-01,123.1
510 | 1989-05-01,123.7
511 | 1989-06-01,124.1
512 | 1989-07-01,124.5
513 | 1989-08-01,124.5
514 | 1989-09-01,124.8
515 | 1989-10-01,125.4
516 | 1989-11-01,125.9
517 | 1989-12-01,126.3
518 | 1990-01-01,127.5
519 | 1990-02-01,128.0
520 | 1990-03-01,128.6
521 | 1990-04-01,128.9
522 | 1990-05-01,129.1
523 | 1990-06-01,129.9
524 | 1990-07-01,130.5
525 | 1990-08-01,131.6
526 | 1990-09-01,132.5
527 | 1990-10-01,133.4
528 | 1990-11-01,133.7
529 | 1990-12-01,134.2
530 | 1991-01-01,134.7
531 | 1991-02-01,134.8
532 | 1991-03-01,134.8
533 | 1991-04-01,135.1
534 | 1991-05-01,135.6
535 | 1991-06-01,136.0
536 | 1991-07-01,136.2
537 | 1991-08-01,136.6
538 | 1991-09-01,137.0
539 | 1991-10-01,137.2
540 | 1991-11-01,137.8
541 | 1991-12-01,138.2
542 | 1992-01-01,138.3
543 | 1992-02-01,138.6
544 | 1992-03-01,139.1
545 | 1992-04-01,139.4
546 | 1992-05-01,139.7
547 | 1992-06-01,140.1
548 | 1992-07-01,140.5
549 | 1992-08-01,140.8
550 | 1992-09-01,141.1
551 | 1992-10-01,141.7
552 | 1992-11-01,142.1
553 | 1992-12-01,142.3
554 | 1993-01-01,142.8
555 | 1993-02-01,143.1
556 | 1993-03-01,143.3
557 | 1993-04-01,143.8
558 | 1993-05-01,144.2
559 | 1993-06-01,144.3
560 | 1993-07-01,144.5
561 | 1993-08-01,144.8
562 | 1993-09-01,145.0
563 | 1993-10-01,145.6
564 | 1993-11-01,146.0
565 | 1993-12-01,146.3
566 | 1994-01-01,146.3
567 | 1994-02-01,146.7
568 | 1994-03-01,147.1
569 | 1994-04-01,147.2
570 | 1994-05-01,147.5
571 | 1994-06-01,147.9
572 | 1994-07-01,148.4
573 | 1994-08-01,149.0
574 | 1994-09-01,149.3
575 | 1994-10-01,149.4
576 | 1994-11-01,149.8
577 | 1994-12-01,150.1
578 | 1995-01-01,150.5
579 | 1995-02-01,150.9
580 | 1995-03-01,151.2
581 | 1995-04-01,151.8
582 | 1995-05-01,152.1
583 | 1995-06-01,152.4
584 | 1995-07-01,152.6
585 | 1995-08-01,152.9
586 | 1995-09-01,153.1
587 | 1995-10-01,153.5
588 | 1995-11-01,153.7
589 | 1995-12-01,153.9
590 | 1996-01-01,154.7
591 | 1996-02-01,155.0
592 | 1996-03-01,155.5
593 | 1996-04-01,156.1
594 | 1996-05-01,156.4
595 | 1996-06-01,156.7
596 | 1996-07-01,157.0
597 | 1996-08-01,157.2
598 | 1996-09-01,157.7
599 | 1996-10-01,158.2
600 | 1996-11-01,158.7
601 | 1996-12-01,159.1
602 | 1997-01-01,159.4
603 | 1997-02-01,159.7
604 | 1997-03-01,159.8
605 | 1997-04-01,159.9
606 | 1997-05-01,159.9
607 | 1997-06-01,160.2
608 | 1997-07-01,160.4
609 | 1997-08-01,160.8
610 | 1997-09-01,161.2
611 | 1997-10-01,161.5
612 | 1997-11-01,161.7
613 | 1997-12-01,161.8
614 | 1998-01-01,162.0
615 | 1998-02-01,162.0
616 | 1998-03-01,162.0
617 | 1998-04-01,162.2
618 | 1998-05-01,162.6
619 | 1998-06-01,162.8
620 | 1998-07-01,163.2
621 | 1998-08-01,163.4
622 | 1998-09-01,163.5
623 | 1998-10-01,163.9
624 | 1998-11-01,164.1
625 | 1998-12-01,164.4
626 | 1999-01-01,164.7
627 | 1999-02-01,164.7
628 | 1999-03-01,164.8
629 | 1999-04-01,165.9
630 | 1999-05-01,166.0
631 | 1999-06-01,166.0
632 | 1999-07-01,166.7
633 | 1999-08-01,167.1
634 | 1999-09-01,167.8
635 | 1999-10-01,168.1
636 | 1999-11-01,168.4
637 | 1999-12-01,168.8
638 | 2000-01-01,169.3
639 | 2000-02-01,170.0
640 | 2000-03-01,171.0
641 | 2000-04-01,170.9
642 | 2000-05-01,171.2
643 | 2000-06-01,172.2
644 | 2000-07-01,172.7
645 | 2000-08-01,172.7
646 | 2000-09-01,173.6
647 | 2000-10-01,173.9
648 | 2000-11-01,174.2
649 | 2000-12-01,174.6
650 | 2001-01-01,175.6
651 | 2001-02-01,176.0
652 | 2001-03-01,176.1
653 | 2001-04-01,176.4
654 | 2001-05-01,177.3
655 | 2001-06-01,177.7
656 | 2001-07-01,177.4
657 | 2001-08-01,177.4
658 | 2001-09-01,178.1
659 | 2001-10-01,177.6
660 | 2001-11-01,177.5
661 | 2001-12-01,177.4
662 | 2002-01-01,177.7
663 | 2002-02-01,178.0
664 | 2002-03-01,178.5
665 | 2002-04-01,179.3
666 | 2002-05-01,179.5
667 | 2002-06-01,179.6
668 | 2002-07-01,180.0
669 | 2002-08-01,180.5
670 | 2002-09-01,180.8
671 | 2002-10-01,181.2
672 | 2002-11-01,181.5
673 | 2002-12-01,181.8
674 | 2003-01-01,182.6
675 | 2003-02-01,183.6
676 | 2003-03-01,183.9
677 | 2003-04-01,183.2
678 | 2003-05-01,182.9
679 | 2003-06-01,183.1
680 | 2003-07-01,183.7
681 | 2003-08-01,184.5
682 | 2003-09-01,185.1
683 | 2003-10-01,184.9
684 | 2003-11-01,185.0
685 | 2003-12-01,185.5
686 | 2004-01-01,186.3
687 | 2004-02-01,186.7
688 | 2004-03-01,187.1
689 | 2004-04-01,187.4
690 | 2004-05-01,188.2
691 | 2004-06-01,188.9
692 | 2004-07-01,189.1
693 | 2004-08-01,189.2
694 | 2004-09-01,189.8
695 | 2004-10-01,190.8
696 | 2004-11-01,191.7
697 | 2004-12-01,191.7
698 | 2005-01-01,191.6
699 | 2005-02-01,192.4
700 | 2005-03-01,193.1
701 | 2005-04-01,193.7
702 | 2005-05-01,193.6
703 | 2005-06-01,193.7
704 | 2005-07-01,194.9
705 | 2005-08-01,196.1
706 | 2005-09-01,198.8
707 | 2005-10-01,199.1
708 | 2005-11-01,198.1
709 | 2005-12-01,198.1
710 | 2006-01-01,199.3
711 | 2006-02-01,199.4
712 | 2006-03-01,199.7
713 | 2006-04-01,200.7
714 | 2006-05-01,201.3
715 | 2006-06-01,201.8
716 | 2006-07-01,202.9
717 | 2006-08-01,203.8
718 | 2006-09-01,202.8
719 | 2006-10-01,201.9
720 | 2006-11-01,202.0
721 | 2006-12-01,203.1
722 | 2007-01-01,203.437
723 | 2007-02-01,204.226
724 | 2007-03-01,205.28799999999998
725 | 2007-04-01,205.90400000000002
726 | 2007-05-01,206.755
727 | 2007-06-01,207.234
728 | 2007-07-01,207.60299999999998
729 | 2007-08-01,207.667
730 | 2007-09-01,208.547
731 | 2007-10-01,209.19
732 | 2007-11-01,210.834
733 | 2007-12-01,211.445
734 | 2008-01-01,212.174
735 | 2008-02-01,212.687
736 | 2008-03-01,213.44799999999998
737 | 2008-04-01,213.942
738 | 2008-05-01,215.208
739 | 2008-06-01,217.463
740 | 2008-07-01,219.016
741 | 2008-08-01,218.69
742 | 2008-09-01,218.877
743 | 2008-10-01,216.995
744 | 2008-11-01,213.153
745 | 2008-12-01,211.398
746 | 2009-01-01,211.933
747 | 2009-02-01,212.705
748 | 2009-03-01,212.495
749 | 2009-04-01,212.709
750 | 2009-05-01,213.02200000000002
751 | 2009-06-01,214.79
752 | 2009-07-01,214.726
753 | 2009-08-01,215.445
754 | 2009-09-01,215.861
755 | 2009-10-01,216.50900000000001
756 | 2009-11-01,217.234
757 | 2009-12-01,217.347
758 | 2010-01-01,217.488
759 | 2010-02-01,217.28099999999998
760 | 2010-03-01,217.35299999999998
761 | 2010-04-01,217.403
762 | 2010-05-01,217.29
763 | 2010-06-01,217.199
764 | 2010-07-01,217.605
765 | 2010-08-01,217.923
766 | 2010-09-01,218.275
767 | 2010-10-01,219.035
768 | 2010-11-01,219.59
769 | 2010-12-01,220.472
770 | 2011-01-01,221.187
771 | 2011-02-01,221.898
772 | 2011-03-01,223.046
773 | 2011-04-01,224.093
774 | 2011-05-01,224.80599999999998
775 | 2011-06-01,224.80599999999998
776 | 2011-07-01,225.395
777 | 2011-08-01,226.106
778 | 2011-09-01,226.597
779 | 2011-10-01,226.75
780 | 2011-11-01,227.169
781 | 2011-12-01,227.22299999999998
782 | 2012-01-01,227.86
783 | 2012-02-01,228.377
784 | 2012-03-01,228.894
785 | 2012-04-01,229.28599999999997
786 | 2012-05-01,228.722
787 | 2012-06-01,228.50599999999997
788 | 2012-07-01,228.475
789 | 2012-08-01,229.84400000000002
790 | 2012-09-01,230.987
791 | 2012-10-01,231.655
792 | 2012-11-01,231.278
793 | 2012-12-01,231.27200000000002
794 | 2013-01-01,231.641
795 | 2013-02-01,233.005
796 | 2013-03-01,232.313
797 | 2013-04-01,231.856
798 | 2013-05-01,231.895
799 | 2013-06-01,232.357
800 | 2013-07-01,232.74900000000002
801 | 2013-08-01,233.24900000000002
802 | 2013-09-01,233.642
803 | 2013-10-01,233.799
804 | 2013-11-01,234.21
805 | 2013-12-01,234.847
806 | 2014-01-01,235.43599999999998
807 | 2014-02-01,235.62099999999998
808 | 2014-03-01,235.89700000000002
809 | 2014-04-01,236.495
810 | 2014-05-01,236.803
811 | 2014-06-01,237.016
812 | 2014-07-01,237.25900000000001
813 | 2014-08-01,237.16299999999998
814 | 2014-09-01,237.51
815 | 2014-10-01,237.65099999999998
816 | 2014-11-01,237.261
817 | 2014-12-01,236.46400000000003
818 | 2015-01-01,234.954
819 | 2015-02-01,235.415
820 | 2015-03-01,235.859
821 | 2015-04-01,236.197
822 | 2015-05-01,236.87599999999998
823 | 


--------------------------------------------------------------------------------
/notebooks/data/flights-ts.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deniederhut/Pandas-Tutorial-SciPyConf-2018/d30a9aac47b4a16b6170da77f982fe8592a4eb73/notebooks/data/flights-ts.csv.gz


--------------------------------------------------------------------------------
/notebooks/data/gdp.csv:
--------------------------------------------------------------------------------
  1 | DATE,GDP
  2 | 1947-01-01,243.1
  3 | 1947-04-01,246.3
  4 | 1947-07-01,250.1
  5 | 1947-10-01,260.3
  6 | 1948-01-01,266.2
  7 | 1948-04-01,272.9
  8 | 1948-07-01,279.5
  9 | 1948-10-01,280.7
 10 | 1949-01-01,275.4
 11 | 1949-04-01,271.7
 12 | 1949-07-01,273.3
 13 | 1949-10-01,271.0
 14 | 1950-01-01,281.2
 15 | 1950-04-01,290.7
 16 | 1950-07-01,308.5
 17 | 1950-10-01,320.3
 18 | 1951-01-01,336.4
 19 | 1951-04-01,344.5
 20 | 1951-07-01,351.8
 21 | 1951-10-01,356.6
 22 | 1952-01-01,360.2
 23 | 1952-04-01,361.4
 24 | 1952-07-01,368.1
 25 | 1952-10-01,381.2
 26 | 1953-01-01,388.5
 27 | 1953-04-01,392.3
 28 | 1953-07-01,391.7
 29 | 1953-10-01,386.5
 30 | 1954-01-01,385.9
 31 | 1954-04-01,386.7
 32 | 1954-07-01,391.6
 33 | 1954-10-01,400.3
 34 | 1955-01-01,413.8
 35 | 1955-04-01,422.2
 36 | 1955-07-01,430.9
 37 | 1955-10-01,437.8
 38 | 1956-01-01,440.5
 39 | 1956-04-01,446.8
 40 | 1956-07-01,452.0
 41 | 1956-10-01,461.3
 42 | 1957-01-01,470.6
 43 | 1957-04-01,472.8
 44 | 1957-07-01,480.3
 45 | 1957-10-01,475.7
 46 | 1958-01-01,468.4
 47 | 1958-04-01,472.8
 48 | 1958-07-01,486.7
 49 | 1958-10-01,500.4
 50 | 1959-01-01,511.1
 51 | 1959-04-01,524.2
 52 | 1959-07-01,525.2
 53 | 1959-10-01,529.3
 54 | 1960-01-01,543.3
 55 | 1960-04-01,542.7
 56 | 1960-07-01,546.0
 57 | 1960-10-01,541.1
 58 | 1961-01-01,545.9
 59 | 1961-04-01,557.4
 60 | 1961-07-01,568.2
 61 | 1961-10-01,581.6
 62 | 1962-01-01,595.2
 63 | 1962-04-01,602.6
 64 | 1962-07-01,609.6
 65 | 1962-10-01,613.1
 66 | 1963-01-01,622.7
 67 | 1963-04-01,631.8
 68 | 1963-07-01,645.0
 69 | 1963-10-01,654.8
 70 | 1964-01-01,671.1
 71 | 1964-04-01,680.8
 72 | 1964-07-01,692.8
 73 | 1964-10-01,698.4
 74 | 1965-01-01,719.2
 75 | 1965-04-01,732.4
 76 | 1965-07-01,750.2
 77 | 1965-10-01,773.1
 78 | 1966-01-01,797.3
 79 | 1966-04-01,807.2
 80 | 1966-07-01,820.8
 81 | 1966-10-01,834.9
 82 | 1967-01-01,846.0
 83 | 1967-04-01,851.1
 84 | 1967-07-01,866.6
 85 | 1967-10-01,883.2
 86 | 1968-01-01,911.1
 87 | 1968-04-01,936.3
 88 | 1968-07-01,952.3
 89 | 1968-10-01,970.1
 90 | 1969-01-01,995.4
 91 | 1969-04-01,1011.4
 92 | 1969-07-01,1032.0
 93 | 1969-10-01,1040.7
 94 | 1970-01-01,1053.5
 95 | 1970-04-01,1070.1
 96 | 1970-07-01,1088.5
 97 | 1970-10-01,1091.5
 98 | 1971-01-01,1137.8
 99 | 1971-04-01,1159.4
100 | 1971-07-01,1180.3
101 | 1971-10-01,1193.6
102 | 1972-01-01,1233.8
103 | 1972-04-01,1270.1
104 | 1972-07-01,1293.8
105 | 1972-10-01,1332.0
106 | 1973-01-01,1380.7
107 | 1973-04-01,1417.6
108 | 1973-07-01,1436.8
109 | 1973-10-01,1479.1
110 | 1974-01-01,1494.7
111 | 1974-04-01,1534.2
112 | 1974-07-01,1563.4
113 | 1974-10-01,1603.0
114 | 1975-01-01,1619.6
115 | 1975-04-01,1656.4
116 | 1975-07-01,1713.8
117 | 1975-10-01,1765.9
118 | 1976-01-01,1824.5
119 | 1976-04-01,1856.9
120 | 1976-07-01,1890.5
121 | 1976-10-01,1938.4
122 | 1977-01-01,1992.5
123 | 1977-04-01,2060.2
124 | 1977-07-01,2122.4
125 | 1977-10-01,2168.7
126 | 1978-01-01,2208.7
127 | 1978-04-01,2336.6
128 | 1978-07-01,2398.9
129 | 1978-10-01,2482.2
130 | 1979-01-01,2531.6
131 | 1979-04-01,2595.9
132 | 1979-07-01,2670.4
133 | 1979-10-01,2730.7
134 | 1980-01-01,2796.5
135 | 1980-04-01,2799.9
136 | 1980-07-01,2860.0
137 | 1980-10-01,2993.5
138 | 1981-01-01,3131.8
139 | 1981-04-01,3167.3
140 | 1981-07-01,3261.2
141 | 1981-10-01,3283.5
142 | 1982-01-01,3273.8
143 | 1982-04-01,3331.3
144 | 1982-07-01,3367.1
145 | 1982-10-01,3407.8
146 | 1983-01-01,3480.3
147 | 1983-04-01,3583.8
148 | 1983-07-01,3692.3
149 | 1983-10-01,3796.1
150 | 1984-01-01,3912.8
151 | 1984-04-01,4015.0
152 | 1984-07-01,4087.4
153 | 1984-10-01,4147.6
154 | 1985-01-01,4237.0
155 | 1985-04-01,4302.3
156 | 1985-07-01,4394.6
157 | 1985-10-01,4453.1
158 | 1986-01-01,4516.3
159 | 1986-04-01,4555.2
160 | 1986-07-01,4619.6
161 | 1986-10-01,4669.4
162 | 1987-01-01,4736.2
163 | 1987-04-01,4821.5
164 | 1987-07-01,4900.5
165 | 1987-10-01,5022.7
166 | 1988-01-01,5090.6
167 | 1988-04-01,5207.7
168 | 1988-07-01,5299.5
169 | 1988-10-01,5412.7
170 | 1989-01-01,5527.4
171 | 1989-04-01,5628.4
172 | 1989-07-01,5711.6
173 | 1989-10-01,5763.4
174 | 1990-01-01,5890.8
175 | 1990-04-01,5974.7
176 | 1990-07-01,6029.5
177 | 1990-10-01,6023.3
178 | 1991-01-01,6054.9
179 | 1991-04-01,6143.6
180 | 1991-07-01,6218.4
181 | 1991-10-01,6279.3
182 | 1992-01-01,6380.8
183 | 1992-04-01,6492.3
184 | 1992-07-01,6586.5
185 | 1992-10-01,6697.6
186 | 1993-01-01,6748.2
187 | 1993-04-01,6829.6
188 | 1993-07-01,6904.2
189 | 1993-10-01,7032.8
190 | 1994-01-01,7136.3
191 | 1994-04-01,7269.8
192 | 1994-07-01,7352.3
193 | 1994-10-01,7476.7
194 | 1995-01-01,7545.3
195 | 1995-04-01,7604.9
196 | 1995-07-01,7706.5
197 | 1995-10-01,7799.5
198 | 1996-01-01,7893.1
199 | 1996-04-01,8061.5
200 | 1996-07-01,8159.0
201 | 1996-10-01,8287.1
202 | 1997-01-01,8402.1
203 | 1997-04-01,8551.9
204 | 1997-07-01,8691.8
205 | 1997-10-01,8788.3
206 | 1998-01-01,8889.7
207 | 1998-04-01,8994.7
208 | 1998-07-01,9146.5
209 | 1998-10-01,9325.7
210 | 1999-01-01,9447.1
211 | 1999-04-01,9557.0
212 | 1999-07-01,9712.3
213 | 1999-10-01,9926.1
214 | 2000-01-01,10031.0
215 | 2000-04-01,10278.3
216 | 2000-07-01,10357.4
217 | 2000-10-01,10472.3
218 | 2001-01-01,10508.1
219 | 2001-04-01,10638.4
220 | 2001-07-01,10639.5
221 | 2001-10-01,10701.3
222 | 2002-01-01,10834.4
223 | 2002-04-01,10934.8
224 | 2002-07-01,11037.1
225 | 2002-10-01,11103.8
226 | 2003-01-01,11230.1
227 | 2003-04-01,11370.7
228 | 2003-07-01,11625.1
229 | 2003-10-01,11816.8
230 | 2004-01-01,11988.4
231 | 2004-04-01,12181.4
232 | 2004-07-01,12367.7
233 | 2004-10-01,12562.2
234 | 2005-01-01,12813.7
235 | 2005-04-01,12974.1
236 | 2005-07-01,13205.4
237 | 2005-10-01,13381.6
238 | 2006-01-01,13648.9
239 | 2006-04-01,13799.8
240 | 2006-07-01,13908.5
241 | 2006-10-01,14066.4
242 | 2007-01-01,14233.2
243 | 2007-04-01,14422.3
244 | 2007-07-01,14569.7
245 | 2007-10-01,14685.3
246 | 2008-01-01,14668.4
247 | 2008-04-01,14813.0
248 | 2008-07-01,14843.0
249 | 2008-10-01,14549.9
250 | 2009-01-01,14383.9
251 | 2009-04-01,14340.4
252 | 2009-07-01,14384.1
253 | 2009-10-01,14566.5
254 | 2010-01-01,14681.1
255 | 2010-04-01,14888.6
256 | 2010-07-01,15057.7
257 | 2010-10-01,15230.2
258 | 2011-01-01,15238.4
259 | 2011-04-01,15460.9
260 | 2011-07-01,15587.1
261 | 2011-10-01,15785.3
262 | 2012-01-01,15973.9
263 | 2012-04-01,16121.9
264 | 2012-07-01,16227.9
265 | 2012-10-01,16297.3
266 | 2013-01-01,16475.4
267 | 2013-04-01,16541.4
268 | 2013-07-01,16749.3
269 | 2013-10-01,16999.9
270 | 2014-01-01,17025.2
271 | 


--------------------------------------------------------------------------------
/notebooks/data/ny-flights.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deniederhut/Pandas-Tutorial-SciPyConf-2018/d30a9aac47b4a16b6170da77f982fe8592a4eb73/notebooks/data/ny-flights.csv.gz


--------------------------------------------------------------------------------
/notebooks/data/rpi.csv:
--------------------------------------------------------------------------------
 1 | 0,1,2,3,4,5,6,7,8,9,10,11
 2 | 2014-15 NBA RPI Rankings,,,,,,,,,,,
 3 | RK,TEAM,RPI,W,L,PCT,SOS,PWR,PF,PA,EWL,EWP
 4 | 1,Golden State,.582,67,15,.817,.504,1,9016,8188,68-14,.831
 5 | 2,Houston,.552,56,26,.683,.509,7,8522,8240,52-30,.635
 6 | ,LA,.552,56,26,.683,.509,4,8751,8211,61-21,.741
 7 | 4,Atlanta,.551,60,22,.732,.491,5,8409,7964,58-24,.710
 8 | 5,Memphis,.549,55,27,.671,.509,8,8062,7796,52-30,.635
 9 | 6,San Antonio,.548,55,27,.671,.506,2,8461,7953,60-22,.735
10 | 7,Dallas,.536,50,32,.610,.511,10,8628,8390,50-32,.613
11 | 8,Cleveland,.535,53,29,.646,.497,3,8457,8090,55-27,.675
12 | 9,Portland,.534,51,31,.622,.504,9,8429,8082,55-27,.667
13 | 10,Chicago,.523,50,32,.610,.494,6,8265,8019,51-31,.622
14 | 11,Toronto,.517,49,33,.598,.491,11,8527,8275,51-31,.621
15 | ,New Orleans,.517,45,37,.549,.507,12,8147,8082,44-38,.533
16 | 13,Oklahoma City,.516,45,37,.549,.505,15,8524,8345,48-34,.587
17 | 14,Washington,.509,46,36,.561,.491,14,8080,8021,43-39,.530
18 | 15,Phoenix,.500,39,43,.476,.508,21,8397,8471,38-44,.464
19 | 16,Utah,.498,38,44,.463,.509,13,7801,7783,42-40,.510
20 | 17,Milwaukee,.494,41,41,.500,.492,18,8023,7988,42-40,.518
21 | 18,Boston,.490,40,42,.488,.490,17,8312,8299,42-40,.506
22 | 19,Brooklyn,.487,38,44,.463,.495,19,8038,8274,31-51,.383
23 | 20,Indiana,.484,38,44,.463,.491,16,7981,7958,42-40,.512
24 | 21,Miami,.483,37,45,.451,.493,20,7764,7977,32-50,.390
25 | 22,Charlotte,.471,33,49,.402,.494,22,7721,7981,30-52,.367
26 | 23,Denver,.470,30,52,.366,.505,23,8320,8611,30-52,.362
27 | ,Sacramento,.470,29,53,.354,.509,26,8310,8614,29-53,.356
28 | 25,Detroit,.467,32,50,.390,.493,24,8077,8159,38-44,.458
29 | 26,Orlando,.447,25,57,.305,.494,25,7847,8316,23-59,.277
30 | 27,LA Lakers,.444,21,61,.256,.507,28,8073,8634,20-62,.248
31 | 28,Minnesota,.430,16,66,.195,.508,30,8016,8737,16-66,.194
32 | 29,Philadelphia,.425,18,64,.220,.494,27,7542,8278,15-67,.177
33 | 30,New York,.419,17,65,.207,.490,29,7535,8299,14-68,.169
34 | 


--------------------------------------------------------------------------------
/notebooks/data/subset.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deniederhut/Pandas-Tutorial-SciPyConf-2018/d30a9aac47b4a16b6170da77f982fe8592a4eb73/notebooks/data/subset.csv.gz


--------------------------------------------------------------------------------
/notebooks/data/tips.csv:
--------------------------------------------------------------------------------
  1 | total_bill,tip,sex,smoker,day,time,size
  2 | 16.99,1.01,Female,No,Sun,Dinner,2
  3 | 10.34,1.66,Male,No,Sun,Dinner,3
  4 | 21.01,3.5,Male,No,Sun,Dinner,3
  5 | 23.68,3.31,Male,No,Sun,Dinner,2
  6 | 24.59,3.61,Female,No,Sun,Dinner,4
  7 | 25.29,4.71,Male,No,Sun,Dinner,4
  8 | 8.77,2.0,Male,No,Sun,Dinner,2
  9 | 26.88,3.12,Male,No,Sun,Dinner,4
 10 | 15.04,1.96,Male,No,Sun,Dinner,2
 11 | 14.78,3.23,Male,No,Sun,Dinner,2
 12 | 10.27,1.71,Male,No,Sun,Dinner,2
 13 | 35.26,5.0,Female,No,Sun,Dinner,4
 14 | 15.42,1.57,Male,No,Sun,Dinner,2
 15 | 18.43,3.0,Male,No,Sun,Dinner,4
 16 | 14.83,3.02,Female,No,Sun,Dinner,2
 17 | 21.58,3.92,Male,No,Sun,Dinner,2
 18 | 10.33,1.67,Female,No,Sun,Dinner,3
 19 | 16.29,3.71,Male,No,Sun,Dinner,3
 20 | 16.97,3.5,Female,No,Sun,Dinner,3
 21 | 20.65,3.35,Male,No,Sat,Dinner,3
 22 | 17.92,4.08,Male,No,Sat,Dinner,2
 23 | 20.29,2.75,Female,No,Sat,Dinner,2
 24 | 15.77,2.23,Female,No,Sat,Dinner,2
 25 | 39.42,7.58,Male,No,Sat,Dinner,4
 26 | 19.82,3.18,Male,No,Sat,Dinner,2
 27 | 17.81,2.34,Male,No,Sat,Dinner,4
 28 | 13.37,2.0,Male,No,Sat,Dinner,2
 29 | 12.69,2.0,Male,No,Sat,Dinner,2
 30 | 21.7,4.3,Male,No,Sat,Dinner,2
 31 | 19.65,3.0,Female,No,Sat,Dinner,2
 32 | 9.55,1.45,Male,No,Sat,Dinner,2
 33 | 18.35,2.5,Male,No,Sat,Dinner,4
 34 | 15.06,3.0,Female,No,Sat,Dinner,2
 35 | 20.69,2.45,Female,No,Sat,Dinner,4
 36 | 17.78,3.27,Male,No,Sat,Dinner,2
 37 | 24.06,3.6,Male,No,Sat,Dinner,3
 38 | 16.31,2.0,Male,No,Sat,Dinner,3
 39 | 16.93,3.07,Female,No,Sat,Dinner,3
 40 | 18.69,2.31,Male,No,Sat,Dinner,3
 41 | 31.27,5.0,Male,No,Sat,Dinner,3
 42 | 16.04,2.24,Male,No,Sat,Dinner,3
 43 | 17.46,2.54,Male,No,Sun,Dinner,2
 44 | 13.94,3.06,Male,No,Sun,Dinner,2
 45 | 9.68,1.32,Male,No,Sun,Dinner,2
 46 | 30.4,5.6,Male,No,Sun,Dinner,4
 47 | 18.29,3.0,Male,No,Sun,Dinner,2
 48 | 22.23,5.0,Male,No,Sun,Dinner,2
 49 | 32.4,6.0,Male,No,Sun,Dinner,4
 50 | 28.55,2.05,Male,No,Sun,Dinner,3
 51 | 18.04,3.0,Male,No,Sun,Dinner,2
 52 | 12.54,2.5,Male,No,Sun,Dinner,2
 53 | 10.29,2.6,Female,No,Sun,Dinner,2
 54 | 34.81,5.2,Female,No,Sun,Dinner,4
 55 | 9.94,1.56,Male,No,Sun,Dinner,2
 56 | 25.56,4.34,Male,No,Sun,Dinner,4
 57 | 19.49,3.51,Male,No,Sun,Dinner,2
 58 | 38.01,3.0,Male,Yes,Sat,Dinner,4
 59 | 26.41,1.5,Female,No,Sat,Dinner,2
 60 | 11.24,1.76,Male,Yes,Sat,Dinner,2
 61 | 48.27,6.73,Male,No,Sat,Dinner,4
 62 | 20.29,3.21,Male,Yes,Sat,Dinner,2
 63 | 13.81,2.0,Male,Yes,Sat,Dinner,2
 64 | 11.02,1.98,Male,Yes,Sat,Dinner,2
 65 | 18.29,3.76,Male,Yes,Sat,Dinner,4
 66 | 17.59,2.64,Male,No,Sat,Dinner,3
 67 | 20.08,3.15,Male,No,Sat,Dinner,3
 68 | 16.45,2.47,Female,No,Sat,Dinner,2
 69 | 3.07,1.0,Female,Yes,Sat,Dinner,1
 70 | 20.23,2.01,Male,No,Sat,Dinner,2
 71 | 15.01,2.09,Male,Yes,Sat,Dinner,2
 72 | 12.02,1.97,Male,No,Sat,Dinner,2
 73 | 17.07,3.0,Female,No,Sat,Dinner,3
 74 | 26.86,3.14,Female,Yes,Sat,Dinner,2
 75 | 25.28,5.0,Female,Yes,Sat,Dinner,2
 76 | 14.73,2.2,Female,No,Sat,Dinner,2
 77 | 10.51,1.25,Male,No,Sat,Dinner,2
 78 | 17.92,3.08,Male,Yes,Sat,Dinner,2
 79 | 27.2,4.0,Male,No,Thur,Lunch,4
 80 | 22.76,3.0,Male,No,Thur,Lunch,2
 81 | 17.29,2.71,Male,No,Thur,Lunch,2
 82 | 19.44,3.0,Male,Yes,Thur,Lunch,2
 83 | 16.66,3.4,Male,No,Thur,Lunch,2
 84 | 10.07,1.83,Female,No,Thur,Lunch,1
 85 | 32.68,5.0,Male,Yes,Thur,Lunch,2
 86 | 15.98,2.03,Male,No,Thur,Lunch,2
 87 | 34.83,5.17,Female,No,Thur,Lunch,4
 88 | 13.03,2.0,Male,No,Thur,Lunch,2
 89 | 18.28,4.0,Male,No,Thur,Lunch,2
 90 | 24.71,5.85,Male,No,Thur,Lunch,2
 91 | 21.16,3.0,Male,No,Thur,Lunch,2
 92 | 28.97,3.0,Male,Yes,Fri,Dinner,2
 93 | 22.49,3.5,Male,No,Fri,Dinner,2
 94 | 5.75,1.0,Female,Yes,Fri,Dinner,2
 95 | 16.32,4.3,Female,Yes,Fri,Dinner,2
 96 | 22.75,3.25,Female,No,Fri,Dinner,2
 97 | 40.17,4.73,Male,Yes,Fri,Dinner,4
 98 | 27.28,4.0,Male,Yes,Fri,Dinner,2
 99 | 12.03,1.5,Male,Yes,Fri,Dinner,2
100 | 21.01,3.0,Male,Yes,Fri,Dinner,2
101 | 12.46,1.5,Male,No,Fri,Dinner,2
102 | 11.35,2.5,Female,Yes,Fri,Dinner,2
103 | 15.38,3.0,Female,Yes,Fri,Dinner,2
104 | 44.3,2.5,Female,Yes,Sat,Dinner,3
105 | 22.42,3.48,Female,Yes,Sat,Dinner,2
106 | 20.92,4.08,Female,No,Sat,Dinner,2
107 | 15.36,1.64,Male,Yes,Sat,Dinner,2
108 | 20.49,4.06,Male,Yes,Sat,Dinner,2
109 | 25.21,4.29,Male,Yes,Sat,Dinner,2
110 | 18.24,3.76,Male,No,Sat,Dinner,2
111 | 14.31,4.0,Female,Yes,Sat,Dinner,2
112 | 14.0,3.0,Male,No,Sat,Dinner,2
113 | 7.25,1.0,Female,No,Sat,Dinner,1
114 | 38.07,4.0,Male,No,Sun,Dinner,3
115 | 23.95,2.55,Male,No,Sun,Dinner,2
116 | 25.71,4.0,Female,No,Sun,Dinner,3
117 | 17.31,3.5,Female,No,Sun,Dinner,2
118 | 29.93,5.07,Male,No,Sun,Dinner,4
119 | 10.65,1.5,Female,No,Thur,Lunch,2
120 | 12.43,1.8,Female,No,Thur,Lunch,2
121 | 24.08,2.92,Female,No,Thur,Lunch,4
122 | 11.69,2.31,Male,No,Thur,Lunch,2
123 | 13.42,1.68,Female,No,Thur,Lunch,2
124 | 14.26,2.5,Male,No,Thur,Lunch,2
125 | 15.95,2.0,Male,No,Thur,Lunch,2
126 | 12.48,2.52,Female,No,Thur,Lunch,2
127 | 29.8,4.2,Female,No,Thur,Lunch,6
128 | 8.52,1.48,Male,No,Thur,Lunch,2
129 | 14.52,2.0,Female,No,Thur,Lunch,2
130 | 11.38,2.0,Female,No,Thur,Lunch,2
131 | 22.82,2.18,Male,No,Thur,Lunch,3
132 | 19.08,1.5,Male,No,Thur,Lunch,2
133 | 20.27,2.83,Female,No,Thur,Lunch,2
134 | 11.17,1.5,Female,No,Thur,Lunch,2
135 | 12.26,2.0,Female,No,Thur,Lunch,2
136 | 18.26,3.25,Female,No,Thur,Lunch,2
137 | 8.51,1.25,Female,No,Thur,Lunch,2
138 | 10.33,2.0,Female,No,Thur,Lunch,2
139 | 14.15,2.0,Female,No,Thur,Lunch,2
140 | 16.0,2.0,Male,Yes,Thur,Lunch,2
141 | 13.16,2.75,Female,No,Thur,Lunch,2
142 | 17.47,3.5,Female,No,Thur,Lunch,2
143 | 34.3,6.7,Male,No,Thur,Lunch,6
144 | 41.19,5.0,Male,No,Thur,Lunch,5
145 | 27.05,5.0,Female,No,Thur,Lunch,6
146 | 16.43,2.3,Female,No,Thur,Lunch,2
147 | 8.35,1.5,Female,No,Thur,Lunch,2
148 | 18.64,1.36,Female,No,Thur,Lunch,3
149 | 11.87,1.63,Female,No,Thur,Lunch,2
150 | 9.78,1.73,Male,No,Thur,Lunch,2
151 | 7.51,2.0,Male,No,Thur,Lunch,2
152 | 14.07,2.5,Male,No,Sun,Dinner,2
153 | 13.13,2.0,Male,No,Sun,Dinner,2
154 | 17.26,2.74,Male,No,Sun,Dinner,3
155 | 24.55,2.0,Male,No,Sun,Dinner,4
156 | 19.77,2.0,Male,No,Sun,Dinner,4
157 | 29.85,5.14,Female,No,Sun,Dinner,5
158 | 48.17,5.0,Male,No,Sun,Dinner,6
159 | 25.0,3.75,Female,No,Sun,Dinner,4
160 | 13.39,2.61,Female,No,Sun,Dinner,2
161 | 16.49,2.0,Male,No,Sun,Dinner,4
162 | 21.5,3.5,Male,No,Sun,Dinner,4
163 | 12.66,2.5,Male,No,Sun,Dinner,2
164 | 16.21,2.0,Female,No,Sun,Dinner,3
165 | 13.81,2.0,Male,No,Sun,Dinner,2
166 | 17.51,3.0,Female,Yes,Sun,Dinner,2
167 | 24.52,3.48,Male,No,Sun,Dinner,3
168 | 20.76,2.24,Male,No,Sun,Dinner,2
169 | 31.71,4.5,Male,No,Sun,Dinner,4
170 | 10.59,1.61,Female,Yes,Sat,Dinner,2
171 | 10.63,2.0,Female,Yes,Sat,Dinner,2
172 | 50.81,10.0,Male,Yes,Sat,Dinner,3
173 | 15.81,3.16,Male,Yes,Sat,Dinner,2
174 | 7.25,5.15,Male,Yes,Sun,Dinner,2
175 | 31.85,3.18,Male,Yes,Sun,Dinner,2
176 | 16.82,4.0,Male,Yes,Sun,Dinner,2
177 | 32.9,3.11,Male,Yes,Sun,Dinner,2
178 | 17.89,2.0,Male,Yes,Sun,Dinner,2
179 | 14.48,2.0,Male,Yes,Sun,Dinner,2
180 | 9.6,4.0,Female,Yes,Sun,Dinner,2
181 | 34.63,3.55,Male,Yes,Sun,Dinner,2
182 | 34.65,3.68,Male,Yes,Sun,Dinner,4
183 | 23.33,5.65,Male,Yes,Sun,Dinner,2
184 | 45.35,3.5,Male,Yes,Sun,Dinner,3
185 | 23.17,6.5,Male,Yes,Sun,Dinner,4
186 | 40.55,3.0,Male,Yes,Sun,Dinner,2
187 | 20.69,5.0,Male,No,Sun,Dinner,5
188 | 20.9,3.5,Female,Yes,Sun,Dinner,3
189 | 30.46,2.0,Male,Yes,Sun,Dinner,5
190 | 18.15,3.5,Female,Yes,Sun,Dinner,3
191 | 23.1,4.0,Male,Yes,Sun,Dinner,3
192 | 15.69,1.5,Male,Yes,Sun,Dinner,2
193 | 19.81,4.19,Female,Yes,Thur,Lunch,2
194 | 28.44,2.56,Male,Yes,Thur,Lunch,2
195 | 15.48,2.02,Male,Yes,Thur,Lunch,2
196 | 16.58,4.0,Male,Yes,Thur,Lunch,2
197 | 7.56,1.44,Male,No,Thur,Lunch,2
198 | 10.34,2.0,Male,Yes,Thur,Lunch,2
199 | 43.11,5.0,Female,Yes,Thur,Lunch,4
200 | 13.0,2.0,Female,Yes,Thur,Lunch,2
201 | 13.51,2.0,Male,Yes,Thur,Lunch,2
202 | 18.71,4.0,Male,Yes,Thur,Lunch,3
203 | 12.74,2.01,Female,Yes,Thur,Lunch,2
204 | 13.0,2.0,Female,Yes,Thur,Lunch,2
205 | 16.4,2.5,Female,Yes,Thur,Lunch,2
206 | 20.53,4.0,Male,Yes,Thur,Lunch,4
207 | 16.47,3.23,Female,Yes,Thur,Lunch,3
208 | 26.59,3.41,Male,Yes,Sat,Dinner,3
209 | 38.73,3.0,Male,Yes,Sat,Dinner,4
210 | 24.27,2.03,Male,Yes,Sat,Dinner,2
211 | 12.76,2.23,Female,Yes,Sat,Dinner,2
212 | 30.06,2.0,Male,Yes,Sat,Dinner,3
213 | 25.89,5.16,Male,Yes,Sat,Dinner,4
214 | 48.33,9.0,Male,No,Sat,Dinner,4
215 | 13.27,2.5,Female,Yes,Sat,Dinner,2
216 | 28.17,6.5,Female,Yes,Sat,Dinner,3
217 | 12.9,1.1,Female,Yes,Sat,Dinner,2
218 | 28.15,3.0,Male,Yes,Sat,Dinner,5
219 | 11.59,1.5,Male,Yes,Sat,Dinner,2
220 | 7.74,1.44,Male,Yes,Sat,Dinner,2
221 | 30.14,3.09,Female,Yes,Sat,Dinner,4
222 | 12.16,2.2,Male,Yes,Fri,Lunch,2
223 | 13.42,3.48,Female,Yes,Fri,Lunch,2
224 | 8.58,1.92,Male,Yes,Fri,Lunch,1
225 | 15.98,3.0,Female,No,Fri,Lunch,3
226 | 13.42,1.58,Male,Yes,Fri,Lunch,2
227 | 16.27,2.5,Female,Yes,Fri,Lunch,2
228 | 10.09,2.0,Female,Yes,Fri,Lunch,2
229 | 20.45,3.0,Male,No,Sat,Dinner,4
230 | 13.28,2.72,Male,No,Sat,Dinner,2
231 | 22.12,2.88,Female,Yes,Sat,Dinner,2
232 | 24.01,2.0,Male,Yes,Sat,Dinner,4
233 | 15.69,3.0,Male,Yes,Sat,Dinner,3
234 | 11.61,3.39,Male,No,Sat,Dinner,2
235 | 10.77,1.47,Male,No,Sat,Dinner,2
236 | 15.53,3.0,Male,Yes,Sat,Dinner,2
237 | 10.07,1.25,Male,No,Sat,Dinner,2
238 | 12.6,1.0,Male,Yes,Sat,Dinner,2
239 | 32.83,1.17,Male,Yes,Sat,Dinner,2
240 | 35.83,4.67,Female,No,Sat,Dinner,3
241 | 29.03,5.92,Male,No,Sat,Dinner,3
242 | 27.18,2.0,Female,Yes,Sat,Dinner,2
243 | 22.67,2.0,Male,Yes,Sat,Dinner,2
244 | 17.82,1.75,Male,No,Sat,Dinner,2
245 | 18.78,3.0,Female,No,Thur,Dinner,2
246 | 


--------------------------------------------------------------------------------
/notebooks/mydask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deniederhut/Pandas-Tutorial-SciPyConf-2018/d30a9aac47b4a16b6170da77f982fe8592a4eb73/notebooks/mydask.png


--------------------------------------------------------------------------------
/notebooks/solutions/aligment_concat.py:
--------------------------------------------------------------------------------
1 | df = pd.concat([gdp, cpi], axis='columns')
2 | df.head()


--------------------------------------------------------------------------------
/notebooks/solutions/aligment_merge.py:
--------------------------------------------------------------------------------
1 | outer = (pd.merge(gdp_bad, cpi_bad, on="DATE", how='outer')
2 |            .sort_values("DATE"))
3 | outer.head()


--------------------------------------------------------------------------------
/notebooks/solutions/alignment_00.py:
--------------------------------------------------------------------------------
1 | cpi = pd.read_csv("data/cpi.csv", parse_dates=['DATE'])
2 | gdp = pd.read_csv("data/gdp.csv", parse_dates=['DATE'])
3 | 


--------------------------------------------------------------------------------
/notebooks/solutions/alignment_01.py:
--------------------------------------------------------------------------------
 1 | # Option 1: The "manual way"
 2 | common = pd.Index(cpi.DATE).intersection(gdp.DATE)
 3 | rgdp = (gdp.loc[gdp.DATE.isin(common), 'GDP'].values /
 4 |         cpi.loc[cpi.DATE.isin(common), 'CPIAUCSL'])
 5 | display.display(rgdp.head())
 6 | 
 7 | # Option 2: "merge"
 8 | m = pd.merge(gdp, cpi, on="DATE")
 9 | rgdp = m['GDP'] / m['CPIAUCSL']
10 | rgdp.head()
11 | 


--------------------------------------------------------------------------------
/notebooks/solutions/alignment_02.py:
--------------------------------------------------------------------------------
1 | gdp = gdp.set_index("DATE").squeeze()
2 | cpi = cpi.set_index("DATE").squeeze().rename("cpi")
3 | side_by_side(gdp.head(), cpi.head())
4 | 


--------------------------------------------------------------------------------
/notebooks/solutions/alignment_03.py:
--------------------------------------------------------------------------------
1 | res = DataReader(series, start='2000-01-01', data_source="fred")
2 | res = res.rename(columns=dict(zip(series, names)))
3 | 
4 | fig, ax = plt.subplots(figsize=(12, 8))
5 | res[['quits', 'layoffs']].plot.area(
6 |     color=area_colors, ax=ax)
7 | res[['hires', 'openings']].plot(
8 |     ax=ax, color=line_colors, linewidth=3);
9 | 


--------------------------------------------------------------------------------
/notebooks/solutions/alignment_positive.py:
--------------------------------------------------------------------------------
1 | (pct_change > 0).mean()


--------------------------------------------------------------------------------
/notebooks/solutions/alignment_real_gdp09.py:
--------------------------------------------------------------------------------
1 | cpi09 = cpi / cpi.loc['2009'].mean() * 100
2 | gdp / cpi09


--------------------------------------------------------------------------------
/notebooks/solutions/dropna_columns.py:
--------------------------------------------------------------------------------
1 | df.dropna(axis="columns")


--------------------------------------------------------------------------------
/notebooks/solutions/eda_00.py:
--------------------------------------------------------------------------------
1 | def from_dollars(col):
2 |     return pd.to_numeric(col.str.lstrip('$'))
3 | 


--------------------------------------------------------------------------------
/notebooks/solutions/eda_01.py:
--------------------------------------------------------------------------------
1 | cols = ['state_bottle_cost', 'state_bottle_retail', 'sale']
2 | df[cols] = df[cols].apply(from_dollars)
3 | 


--------------------------------------------------------------------------------
/notebooks/solutions/eda_02.py:
--------------------------------------------------------------------------------
1 | url = 'https://en.wikipedia.org/wiki/List_of_largest_Iowa_cities_by_population'
2 | popn = (pd.read_html(url, header=0)[0]
3 |           .set_index("City")
4 |           .rename(lambda x: x.lower()))
5 | popn.head()
6 | 


--------------------------------------------------------------------------------
/notebooks/solutions/eda_03.py:
--------------------------------------------------------------------------------
1 | per_cap = (df.groupby(df.city.str.lower())
2 |              .volume_sold.sum() /
3 |            popn.Population.astype(float)).dropna()
4 | per_cap.plot.barh(figsize=(10, 10), color='k', width=.9);
5 | 


--------------------------------------------------------------------------------
/notebooks/solutions/eda_04.py:
--------------------------------------------------------------------------------
1 | pd.concat([df.groupby(df.city.str.lower())[['sale', 'volume_sold']].sum(),
2 |            popn.Population], axis=1, join='inner').pipe(sns.pairplot);
3 | 


--------------------------------------------------------------------------------
/notebooks/solutions/groupby_00.py:
--------------------------------------------------------------------------------
1 | review_length = df.text.str.len()
2 | gr = df.groupby(review_length).review_overall
3 | gr.mean().plot(style='k.')


--------------------------------------------------------------------------------
/notebooks/solutions/groupby_00b.py:
--------------------------------------------------------------------------------
1 | (df.groupby(df.text.str.count('\w+'))
2 |    .review_overall
3 |    .mean().plot(style='k.'))
4 | 


--------------------------------------------------------------------------------
/notebooks/solutions/groupby_01.py:
--------------------------------------------------------------------------------
1 | (df.groupby('beer_id')
2 |    .review_overall
3 |    .agg(['mean', 'count'])
4 |    .plot.scatter(x='count', y='mean', color='k',
5 |                  marker='.', alpha=.25));
6 | 


--------------------------------------------------------------------------------
/notebooks/solutions/groupby_02.py:
--------------------------------------------------------------------------------
1 | order = df.groupby("profile_name").review_overall.cumcount()
2 | df.groupby(order).review_overall.mean().plot()
3 | 


--------------------------------------------------------------------------------
/notebooks/solutions/groupby_03.py:
--------------------------------------------------------------------------------
1 | # Make a barplot of review times by hour
2 | (df.time.dt.hour
3 |    .value_counts()
4 |    .sort_index()
5 |    .plot.bar(rot=0, color='k', width=.8));
6 | 


--------------------------------------------------------------------------------
/notebooks/solutions/groupby_04.py:
--------------------------------------------------------------------------------
1 | pas = df[df.beer_style.str.lower().str.contains("pale ale")]
2 | pas.head()


--------------------------------------------------------------------------------
/notebooks/solutions/groupby_abv.py:
--------------------------------------------------------------------------------
1 | df.groupby('beer_style').abv.std().sort_values(ascending=False)


--------------------------------------------------------------------------------
/notebooks/solutions/groupby_format_review.py:
--------------------------------------------------------------------------------
1 | def format_review(review):
2 |     return dict([line.strip('\n').split(": ", 1) for line in review])


--------------------------------------------------------------------------------
/notebooks/solutions/indexing_00.py:
--------------------------------------------------------------------------------
1 | first[['origin', 'dest']]


--------------------------------------------------------------------------------
/notebooks/solutions/indexing_01.py:
--------------------------------------------------------------------------------
1 | flights[(flights.dep.dt.hour <= 6) |
2 |         (flights.dep.dt.hour >= 18)]
3 | 


--------------------------------------------------------------------------------
/notebooks/solutions/indexing_02.py:
--------------------------------------------------------------------------------
1 | m1 = flights.origin == 'ATL'
2 | most_common = flights.loc[m1, 'dest'].value_counts().index[:3]
3 | m2 = flights.dest.isin(most_common)
4 | 
5 | flights[m1 & m2].head()
6 | 


--------------------------------------------------------------------------------
/notebooks/solutions/indexing_cancelled.py:
--------------------------------------------------------------------------------
1 | flights.loc[flights.cancelled == 1]


--------------------------------------------------------------------------------
/notebooks/solutions/indexing_datetime.py:
--------------------------------------------------------------------------------
1 | delays.loc['2014-01-03T12':'2014-01-10T12']


--------------------------------------------------------------------------------
/notebooks/solutions/indexing_drop_columns.py:
--------------------------------------------------------------------------------
1 | flights.drop('airline_id', axis='columns')


--------------------------------------------------------------------------------
/notebooks/solutions/indexing_drop_index.py:
--------------------------------------------------------------------------------
1 | first.drop(['EV', 'F9'])


--------------------------------------------------------------------------------
/notebooks/solutions/indexing_ex1_engine_columns.py:
--------------------------------------------------------------------------------
1 | cars[['cylinders', 'displacement', 'horsepower']]


--------------------------------------------------------------------------------
/notebooks/solutions/indexing_ex2_5th.py:
--------------------------------------------------------------------------------
1 | cars.iloc[::5]


--------------------------------------------------------------------------------
/notebooks/solutions/indexing_ex3_years.py:
--------------------------------------------------------------------------------
1 | yearly.loc[[70, 75, 80, 82], ['horsepower', 'weight']]


--------------------------------------------------------------------------------
/notebooks/solutions/indexing_ex4_mpg.py:
--------------------------------------------------------------------------------
1 | cars[cars.mpg >= 30]
2 | 


--------------------------------------------------------------------------------
/notebooks/solutions/indexing_ex5_mpg_and_cylinders.py:
--------------------------------------------------------------------------------
1 | len(cars[(cars.mpg >= 30) & (cars.cylinders >= 5)])


--------------------------------------------------------------------------------
/notebooks/solutions/indexing_loc.py:
--------------------------------------------------------------------------------
1 | first.loc[['US', 'VX', 'WN'], ['tail_num', 'origin', 'dest']]
2 | 


--------------------------------------------------------------------------------
/notebooks/solutions/indexing_thoughts.py:
--------------------------------------------------------------------------------
1 | first.loc['AA', 'fl_num'] = -1
2 | first.head()


--------------------------------------------------------------------------------
/notebooks/solutions/performance_00.py:
--------------------------------------------------------------------------------
1 | ids = flights.ORIGIN_AIRPORT_ID.value_counts()
2 | ids = ids[ids >= 500].index
3 | ids
4 | 


--------------------------------------------------------------------------------
/notebooks/solutions/performance_01.py:
--------------------------------------------------------------------------------
1 | subset = coord[coord.AIRPORT_ID.isin(ids)]
2 | subset.head()
3 | 


--------------------------------------------------------------------------------
/notebooks/solutions/performance_02.py:
--------------------------------------------------------------------------------
 1 | from sklearn.neighbors import KDTree
 2 | 
 3 | # the result of KDTree.query is a list of index
 4 | # *positions*, we'll use id_map to go from
 5 | # positions back to airport names
 6 | id_map = dict(enumerate(locs.index))
 7 | 
 8 | tree = KDTree(locs)
 9 | 
10 | distances, indexes = tree.query(locs.values, k=2)
11 | indexes = indexes[:, 1]
12 | distances = distances[:, 1]
13 | neighbors = pd.Series(indexes, index=locs.index).map(id_map)
14 | neighbors.head()
15 | 


--------------------------------------------------------------------------------
/notebooks/solutions/performance_concat.py:
--------------------------------------------------------------------------------
1 | 
2 | pd.concat([pd.DataFrame(set_, columns=['A', 'B', 'C']) for set_ in records],
3 |           ignore_index=True)


--------------------------------------------------------------------------------
/notebooks/solutions/performance_kd.py:
--------------------------------------------------------------------------------
1 | coord = pd.read_csv("data/flights_coord.csv")
2 | coord.head()


--------------------------------------------------------------------------------
/notebooks/solutions/readme_00.py:
--------------------------------------------------------------------------------
1 | print("Hello, world!")
2 | 


--------------------------------------------------------------------------------
/notebooks/solutions/sklearn_pandas_split.py:
--------------------------------------------------------------------------------
1 | y = df['tip']
2 | X = df.drop('tip', axis=1)


--------------------------------------------------------------------------------
/notebooks/solutions/tidy_00.py:
--------------------------------------------------------------------------------
 1 | df['winning_team'] = np.where(
 2 |     df.home_points > df.away_points,
 3 |     df.home_team,
 4 |     df.away_team)
 5 | 
 6 | win = pd.melt(df, id_vars='winning_team', value_vars=['away_team', 'home_team'],
 7 |               var_name='home_or_away', value_name='team')
 8 | win['won'] = win.winning_team == win.team
 9 | win_pct = win.groupby(['team', 'home_or_away']).won.mean()
10 | win_pct.head()
11 | 


--------------------------------------------------------------------------------
/notebooks/solutions/tidy_01.py:
--------------------------------------------------------------------------------
1 | df['home_win'] = df.home_points > df.away_points
2 | df['point_spread'] = df.home_points - df.away_points
3 | df.head()
4 | 


--------------------------------------------------------------------------------
/notebooks/solutions/tidy_02.py:
--------------------------------------------------------------------------------
1 | # RPI
2 | df['home_strength'] = df.home_team.map(rpi.rename(mapping)['RPI'])
3 | df['away_strength'] = df.away_team.map(rpi.rename(mapping)['RPI'])
4 | df.head()
5 | 


--------------------------------------------------------------------------------
/notebooks/solutions/tidy_03.py:
--------------------------------------------------------------------------------
1 | df['rest_spread'] = df['home_rest'] - df['away_rest']
2 | df.head()


--------------------------------------------------------------------------------
/notebooks/solutions/tidy_04.py:
--------------------------------------------------------------------------------
1 | fig, ax = plt.subplots(figsize=(12, 6))
2 | sns.barplot(x='rest_spread', y='home_win',
3 |             data=df.loc[(-3 <= df.rest_spread) &
4 |                         (df.rest_spread <= 3)],
5 |             color='#4c72b0', ax=ax)
6 | sns.despine()


--------------------------------------------------------------------------------
/notebooks/solutions/tidy_05.py:
--------------------------------------------------------------------------------
 1 | def compute_away_streaks(v):
 2 |     streaks = []
 3 |     current_streak = 0
 4 | 
 5 |     for row in v:
 6 |         if row == 'away_team':
 7 |             current_streak += 1
 8 |         else:
 9 |             current_streak = 0
10 |         streaks.append(current_streak)
11 |     return pd.Series(streaks, index=v.index)


--------------------------------------------------------------------------------
/notebooks/solutions/tidy_06.py:
--------------------------------------------------------------------------------
1 | # fill 1 for teams that start on the road
2 | away_streaks = (
3 |     tidy.groupby("team")
4 |         .variable
5 |         .transform(compute_away_streaks).fillna(1))
6 | away_streaks.head()
7 | 


--------------------------------------------------------------------------------
/notebooks/solutions/tidy_07.py:
--------------------------------------------------------------------------------
 1 | from functools import wraps
 2 | 
 3 | def log_shape(func):
 4 |     @wraps(func)
 5 |     def deco(*args, **kwargs):
 6 |         result = func(*args, **kwargs)
 7 |         logger.info("In %s [%s]", func.__name__, result.shape)
 8 |         return result
 9 |     return deco
10 | 


--------------------------------------------------------------------------------
/notebooks/solutions/tidy_drest.py:
--------------------------------------------------------------------------------
1 | df['rest_spread'].mean()


--------------------------------------------------------------------------------
/notebooks/solutions/tidy_sanity.py:
--------------------------------------------------------------------------------
1 | (df.home_rest - df.away_rest).mean()


--------------------------------------------------------------------------------
/notebooks/solutions/timeseries_departure.py:
--------------------------------------------------------------------------------
1 | flights.dep + flights.dep_delay_td


--------------------------------------------------------------------------------
/notebooks/solutions/timeseries_monthly_ma.py:
--------------------------------------------------------------------------------
1 | ma = y.rolling(4).mean()
2 | ax = ma.plot(legend=True, label="MA[4]", figsize=(12, 4))
3 | y.plot(ax=ax, label="Observed", legend=True);


--------------------------------------------------------------------------------
/notebooks/solutions/timeseries_resample.py:
--------------------------------------------------------------------------------
1 | df.resample("W").std().plot();


--------------------------------------------------------------------------------
/notebooks/solutions/timeseries_resample_agg.py:
--------------------------------------------------------------------------------
1 | df.resample("Q").agg(['sum', 'mean', 'median'])


--------------------------------------------------------------------------------
/notebooks/solutions/timeseries_timedelta.py:
--------------------------------------------------------------------------------
1 | flights['dep_delay_td'] = pd.to_timedelta(flights['dep_delay'], unit='T')
2 | flights['arr_delay_td'] = pd.to_timedelta(flights['arr_delay'], unit='T')
3 | flights.info()


--------------------------------------------------------------------------------
/notebooks/solutions/visualize_00.py:
--------------------------------------------------------------------------------
1 | sns.factorplot('embarked', data=t, kind="count", hue="class");
2 | 


--------------------------------------------------------------------------------
/notebooks/solutions/visualize_01.py:
--------------------------------------------------------------------------------
1 | sns.factorplot('age', 'class', data=t);


--------------------------------------------------------------------------------
/notebooks/solutions/visualize_02a.py:
--------------------------------------------------------------------------------
1 | m = t.fare.median()
2 | 
3 | t['fare_'] = np.where(t.fare < m * 3, t.fare, m * 3)
4 | t.head()
5 | 


--------------------------------------------------------------------------------
/notebooks/solutions/visualize_02b.py:
--------------------------------------------------------------------------------
1 | sns.violinplot("class", "fare_", data=t, orient="v",
2 |                palette="YlGn")
3 | sns.despine(left=True)
4 | plt.ylim(0);
5 | 


--------------------------------------------------------------------------------
/notebooks/solutions/visualize_03.py:
--------------------------------------------------------------------------------
1 | sns.countplot("alive", data=t, palette="OrRd_d");
2 | 


--------------------------------------------------------------------------------
/notebooks/solutions/visualize_04.py:
--------------------------------------------------------------------------------
1 | sns.factorplot("class", "survived", data=t).set(ylim=(0, 1));
2 | 


--------------------------------------------------------------------------------
/notebooks/solutions/visualize_05.py:
--------------------------------------------------------------------------------
1 | sns.factorplot("who", "survived", data=t).set(ylim=(0, 1));
2 | 


--------------------------------------------------------------------------------
/notebooks/solutions/visualize_06.py:
--------------------------------------------------------------------------------
1 | sns.factorplot("class", "survived", data=t,
2 |                hue="sex", palette=pal).set(ylim=(0, 1));
3 | 


--------------------------------------------------------------------------------
/notebooks/solutions/visualize_07.py:
--------------------------------------------------------------------------------
1 | g = sns.factorplot("class", "survived", data=t,
2 |                    hue="who", palette=pal, col="who",
3 |                    aspect=.5)
4 | g.set(ylim=(0, 1))
5 | g.despine(left=True);
6 | 


--------------------------------------------------------------------------------
/notebooks/solutions/visualize_08.py:
--------------------------------------------------------------------------------
1 | fg = sns.factorplot("adult_male", "survived", data=t,
2 |                     col="class", hue="class", size=6,
3 |                     aspect=.33, palette="BuPu_d")
4 | fg.set(ylim=(0, 1))
5 | fg.despine(left=True);
6 | 


--------------------------------------------------------------------------------
/notebooks/solutions/visualize_09.py:
--------------------------------------------------------------------------------
1 | sns.lmplot("age", "survived", t, hue="sex",
2 |            logistic=True, x_bins=bins,
3 |            palette=pal);
4 | 


--------------------------------------------------------------------------------
/notebooks/solutions/visualize_10.py:
--------------------------------------------------------------------------------
1 | sns.lmplot("age", "survived", t, hue="class",
2 |            logistic=True, x_bins=bins,
3 |            palette=pal);


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy==1.14.5
 2 | pandas==0.23.1
 3 | matplotlib==2.2.2
 4 | seaborn==0.8.1
 5 | ipython==6.4.0
 6 | jupyter==1.0.0
 7 | notebook==5.5.0
 8 | dask==0.18.1
 9 | distributed==1.22.0
10 | toolz==0.9.0
11 | pandas-datareader==0.6.0
12 | scikit-learn==0.19.1
13 | scipy==1.1.0
14 | statsmodels==0.9.0
15 | xlrd==1.1.0
16 | lifetimes==0.6.0.0


--------------------------------------------------------------------------------