├── .gitignore ├── LICENSE.md ├── Makefile ├── README.md ├── cover └── modern-pandas-cover.png ├── markdown └── style.css ├── modern-1-url.txt ├── modern_1_intro.ipynb ├── modern_2_method_chaining.ipynb ├── modern_3_indexes.ipynb ├── modern_4_performance.ipynb ├── modern_5_tidy.ipynb ├── modern_6_visualization.ipynb ├── modern_7_timeseries.ipynb ├── modern_8_out_of_core.ipynb ├── prep.py ├── url_4.txt └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | *.epub 3 | *.html 4 | markdown/ 5 | models/ 6 | *.pdf 7 | *.epub 8 | *.fthr 9 | __pycache__ 10 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Attribution 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. 
A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution 4.0 International Public License 58 | 59 | By exercising the Licensed Rights (defined below), You accept and agree 60 | to be bound by the terms and conditions of this Creative Commons 61 | Attribution 4.0 International Public License ("Public License"). To the 62 | extent this Public License may be interpreted as a contract, You are 63 | granted the Licensed Rights in consideration of Your acceptance of 64 | these terms and conditions, and the Licensor grants You such rights in 65 | consideration of benefits the Licensor receives from making the 66 | Licensed Material available under these terms and conditions. 67 | 68 | 69 | Section 1 -- Definitions. 70 | 71 | a. Adapted Material means material subject to Copyright and Similar 72 | Rights that is derived from or based upon the Licensed Material 73 | and in which the Licensed Material is translated, altered, 74 | arranged, transformed, or otherwise modified in a manner requiring 75 | permission under the Copyright and Similar Rights held by the 76 | Licensor. For purposes of this Public License, where the Licensed 77 | Material is a musical work, performance, or sound recording, 78 | Adapted Material is always produced where the Licensed Material is 79 | synched in timed relation with a moving image. 80 | 81 | b. Adapter's License means the license You apply to Your Copyright 82 | and Similar Rights in Your contributions to Adapted Material in 83 | accordance with the terms and conditions of this Public License. 84 | 85 | c. Copyright and Similar Rights means copyright and/or similar rights 86 | closely related to copyright including, without limitation, 87 | performance, broadcast, sound recording, and Sui Generis Database 88 | Rights, without regard to how the rights are labeled or 89 | categorized. For purposes of this Public License, the rights 90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 91 | Rights. 92 | 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. 
Share means to provide material to the public by any means or 116 | process that requires permission under the Licensed Rights, such 117 | as reproduction, public display, public performance, distribution, 118 | dissemination, communication, or importation, and to make material 119 | available to the public including in ways that members of the 120 | public may access the material from a place and at a time 121 | individually chosen by them. 122 | 123 | j. Sui Generis Database Rights means rights other than copyright 124 | resulting from Directive 96/9/EC of the European Parliament and of 125 | the Council of 11 March 1996 on the legal protection of databases, 126 | as amended and/or succeeded, as well as other essentially 127 | equivalent rights anywhere in the world. 128 | 129 | k. You means the individual or entity exercising the Licensed Rights 130 | under this Public License. Your has a corresponding meaning. 131 | 132 | 133 | Section 2 -- Scope. 134 | 135 | a. License grant. 136 | 137 | 1. Subject to the terms and conditions of this Public License, 138 | the Licensor hereby grants You a worldwide, royalty-free, 139 | non-sublicensable, non-exclusive, irrevocable license to 140 | exercise the Licensed Rights in the Licensed Material to: 141 | 142 | a. reproduce and Share the Licensed Material, in whole or 143 | in part; and 144 | 145 | b. produce, reproduce, and Share Adapted Material. 146 | 147 | 2. Exceptions and Limitations. For the avoidance of doubt, where 148 | Exceptions and Limitations apply to Your use, this Public 149 | License does not apply, and You do not need to comply with 150 | its terms and conditions. 151 | 152 | 3. Term. The term of this Public License is specified in Section 153 | 6(a). 154 | 155 | 4. Media and formats; technical modifications allowed. The 156 | Licensor authorizes You to exercise the Licensed Rights in 157 | all media and formats whether now known or hereafter created, 158 | and to make technical modifications necessary to do so. The 159 | Licensor waives and/or agrees not to assert any right or 160 | authority to forbid You from making technical modifications 161 | necessary to exercise the Licensed Rights, including 162 | technical modifications necessary to circumvent Effective 163 | Technological Measures. For purposes of this Public License, 164 | simply making modifications authorized by this Section 2(a) 165 | (4) never produces Adapted Material. 166 | 167 | 5. Downstream recipients. 168 | 169 | a. Offer from the Licensor -- Licensed Material. Every 170 | recipient of the Licensed Material automatically 171 | receives an offer from the Licensor to exercise the 172 | Licensed Rights under the terms and conditions of this 173 | Public License. 174 | 175 | b. No downstream restrictions. You may not offer or impose 176 | any additional or different terms or conditions on, or 177 | apply any Effective Technological Measures to, the 178 | Licensed Material if doing so restricts exercise of the 179 | Licensed Rights by any recipient of the Licensed 180 | Material. 181 | 182 | 6. No endorsement. Nothing in this Public License constitutes or 183 | may be construed as permission to assert or imply that You 184 | are, or that Your use of the Licensed Material is, connected 185 | with, or sponsored, endorsed, or granted official status by, 186 | the Licensor or others designated to receive attribution as 187 | provided in Section 3(a)(1)(A)(i). 188 | 189 | b. Other rights. 190 | 191 | 1. 
Moral rights, such as the right of integrity, are not 192 | licensed under this Public License, nor are publicity, 193 | privacy, and/or other similar personality rights; however, to 194 | the extent possible, the Licensor waives and/or agrees not to 195 | assert any such rights held by the Licensor to the limited 196 | extent necessary to allow You to exercise the Licensed 197 | Rights, but not otherwise. 198 | 199 | 2. Patent and trademark rights are not licensed under this 200 | Public License. 201 | 202 | 3. To the extent possible, the Licensor waives any right to 203 | collect royalties from You for the exercise of the Licensed 204 | Rights, whether directly or through a collecting society 205 | under any voluntary or waivable statutory or compulsory 206 | licensing scheme. In all other cases the Licensor expressly 207 | reserves any right to collect such royalties. 208 | 209 | 210 | Section 3 -- License Conditions. 211 | 212 | Your exercise of the Licensed Rights is expressly made subject to the 213 | following conditions. 214 | 215 | a. Attribution. 216 | 217 | 1. If You Share the Licensed Material (including in modified 218 | form), You must: 219 | 220 | a. retain the following if it is supplied by the Licensor 221 | with the Licensed Material: 222 | 223 | i. identification of the creator(s) of the Licensed 224 | Material and any others designated to receive 225 | attribution, in any reasonable manner requested by 226 | the Licensor (including by pseudonym if 227 | designated); 228 | 229 | ii. a copyright notice; 230 | 231 | iii. a notice that refers to this Public License; 232 | 233 | iv. a notice that refers to the disclaimer of 234 | warranties; 235 | 236 | v. a URI or hyperlink to the Licensed Material to the 237 | extent reasonably practicable; 238 | 239 | b. indicate if You modified the Licensed Material and 240 | retain an indication of any previous modifications; and 241 | 242 | c. indicate the Licensed Material is licensed under this 243 | Public License, and include the text of, or the URI or 244 | hyperlink to, this Public License. 245 | 246 | 2. You may satisfy the conditions in Section 3(a)(1) in any 247 | reasonable manner based on the medium, means, and context in 248 | which You Share the Licensed Material. For example, it may be 249 | reasonable to satisfy the conditions by providing a URI or 250 | hyperlink to a resource that includes the required 251 | information. 252 | 253 | 3. If requested by the Licensor, You must remove any of the 254 | information required by Section 3(a)(1)(A) to the extent 255 | reasonably practicable. 256 | 257 | 4. If You Share Adapted Material You produce, the Adapter's 258 | License You apply must not prevent recipients of the Adapted 259 | Material from complying with this Public License. 260 | 261 | 262 | Section 4 -- Sui Generis Database Rights. 263 | 264 | Where the Licensed Rights include Sui Generis Database Rights that 265 | apply to Your use of the Licensed Material: 266 | 267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 268 | to extract, reuse, reproduce, and Share all or a substantial 269 | portion of the contents of the database; 270 | 271 | b. if You include all or a substantial portion of the database 272 | contents in a database in which You have Sui Generis Database 273 | Rights, then the database in which You have Sui Generis Database 274 | Rights (but not its individual contents) is Adapted Material; and 275 | 276 | c. 
You must comply with the conditions in Section 3(a) if You Share 277 | all or a substantial portion of the contents of the database. 278 | 279 | For the avoidance of doubt, this Section 4 supplements and does not 280 | replace Your obligations under this Public License where the Licensed 281 | Rights include other Copyright and Similar Rights. 282 | 283 | 284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 285 | 286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 296 | 297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 306 | 307 | c. The disclaimer of warranties and limitation of liability provided 308 | above shall be interpreted in a manner that, to the extent 309 | possible, most closely approximates an absolute disclaimer and 310 | waiver of all liability. 311 | 312 | 313 | Section 6 -- Term and Termination. 314 | 315 | a. This Public License applies for the term of the Copyright and 316 | Similar Rights licensed here. However, if You fail to comply with 317 | this Public License, then Your rights under this Public License 318 | terminate automatically. 319 | 320 | b. Where Your right to use the Licensed Material has terminated under 321 | Section 6(a), it reinstates: 322 | 323 | 1. automatically as of the date the violation is cured, provided 324 | it is cured within 30 days of Your discovery of the 325 | violation; or 326 | 327 | 2. upon express reinstatement by the Licensor. 328 | 329 | For the avoidance of doubt, this Section 6(b) does not affect any 330 | right the Licensor may have to seek remedies for Your violations 331 | of this Public License. 332 | 333 | c. For the avoidance of doubt, the Licensor may also offer the 334 | Licensed Material under separate terms or conditions or stop 335 | distributing the Licensed Material at any time; however, doing so 336 | will not terminate this Public License. 337 | 338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 339 | License. 340 | 341 | 342 | Section 7 -- Other Terms and Conditions. 343 | 344 | a. The Licensor shall not be bound by any additional or different 345 | terms or conditions communicated by You unless expressly agreed. 346 | 347 | b. 
Any arrangements, understandings, or agreements regarding the 348 | Licensed Material not stated herein are separate from and 349 | independent of the terms and conditions of this Public License. 350 | 351 | 352 | Section 8 -- Interpretation. 353 | 354 | a. For the avoidance of doubt, this Public License does not, and 355 | shall not be interpreted to, reduce, limit, restrict, or impose 356 | conditions on any use of the Licensed Material that could lawfully 357 | be made without permission under this Public License. 358 | 359 | b. To the extent possible, if any provision of this Public License is 360 | deemed unenforceable, it shall be automatically reformed to the 361 | minimum extent necessary to make it enforceable. If the provision 362 | cannot be reformed, it shall be severed from this Public License 363 | without affecting the enforceability of the remaining terms and 364 | conditions. 365 | 366 | c. No term or condition of this Public License will be waived and no 367 | failure to comply consented to unless expressly agreed to by the 368 | Licensor. 369 | 370 | d. Nothing in this Public License constitutes or may be interpreted 371 | as a limitation upon, or waiver of, any privileges and immunities 372 | that apply to the Licensor or You, including from the legal 373 | processes of any jurisdiction or authority. 374 | 375 | 376 | ======================================================================= 377 | 378 | Creative Commons is not a party to its public 379 | licenses. Notwithstanding, Creative Commons may elect to apply one of 380 | its public licenses to material it publishes and in those instances 381 | will be considered the “Licensor.” The text of the Creative Commons 382 | public licenses is dedicated to the public domain under the CC0 Public 383 | Domain Dedication. Except for the limited purpose of indicating that 384 | material is shared under a Creative Commons public license or as 385 | otherwise permitted by the Creative Commons policies published at 386 | creativecommons.org/policies, Creative Commons does not authorize the 387 | use of the trademark "Creative Commons" or any other trademark or logo 388 | of Creative Commons without its prior written consent including, 389 | without limitation, in connection with any unauthorized modifications 390 | to any of its public licenses or any other arrangements, 391 | understandings, or agreements concerning use of licensed material. For 392 | the avoidance of doubt, this paragraph does not form part of the 393 | public licenses. 394 | 395 | Creative Commons may be contacted at creativecommons.org. 
396 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | all_markdown: markdown/modern_1_intro.md \ 3 | markdown/modern_2_method_chaining.md \ 4 | markdown/modern_3_indexes.md \ 5 | markdown/modern_4_performance.md \ 6 | markdown/modern_5_tidy.md \ 7 | markdown/modern_6_visualization.md \ 8 | markdown/modern_7_timeseries.md 9 | 10 | 11 | all_markdown_processed: markdown/modern_1_intro_processed.md \ 12 | markdown/modern_2_method_chaining_processed.md \ 13 | markdown/modern_3_indexes_processed.md \ 14 | markdown/modern_4_performance_processed.md \ 15 | markdown/modern_5_tidy_processed.md \ 16 | markdown/modern_6_visualization_processed.md \ 17 | markdown/modern_7_timeseries_processed.md 18 | 19 | 20 | markdown/modern.epub: all_markdown_processed markdown/style.css 21 | cd markdown && \ 22 | pandoc -f markdown-markdown_in_html_blocks --epub-cover-image=../cover/modern-pandas-cover.png --epub-stylesheet=style.css --chapters -S -o $(notdir $@) \ 23 | title.txt \ 24 | modern_1_intro_processed.md \ 25 | modern_2_method_chaining_processed.md \ 26 | modern_3_indexes_processed.md \ 27 | modern_4_performance_processed.md \ 28 | modern_5_tidy_processed.md \ 29 | modern_6_visualization_processed.md \ 30 | modern_7_timeseries_processed.md 31 | # markdown/modern_8_out_of_core.md \ 32 | 33 | markdown/modern.pdf: all_markdown_processed 34 | cd markdown && \ 35 | pandoc -f markdown-markdown_in_html_blocks -V documentclass=memoir --chapters -S --latex-engine=xelatex --toc --template=$(HOME)/.pandoc/templates/default.latex -o $(notdir $@) \ 36 | title.txt \ 37 | modern_1_intro_processed.md \ 38 | modern_2_method_chaining_processed.md \ 39 | modern_3_indexes_processed.md \ 40 | modern_4_performance_processed.md \ 41 | modern_5_tidy_processed.md \ 42 | modern_6_visualization_processed.md \ 43 | modern_7_timeseries_processed.md 44 | 45 | markdown/%.md: %.ipynb 46 | jupyter nbconvert --execute --allow-errors --ExecutePreprocessor.timeout=9999999 --to=markdown --output=$(basename $(notdir $@)) $< 47 | $(eval BASE:=$(basename $(notdir $@))) 48 | if [ -d $(BASE)_files ]; then \ 49 | rm -rf markdown/$(BASE)_files; \ 50 | fi 51 | if [ -d $(BASE)_files ]; then \ 52 | mv $(BASE)_files markdown/$(BASE)_files;\ 53 | fi 54 | mv $(BASE).md $@ 55 | 56 | markdown/%_processed.md: markdown/%.md 57 | pandoc -f markdown-markdown_in_html_blocks $< | pandoc -f html -t markdown+pipe_tables -o $@ 58 | 59 | markdown/sample.epub: markdown/modern_1_intro_processed.md 60 | cd markdown && \ 61 | pandoc -f markdown-markdown_in_html_blocks --epub-cover-image=../cover/modern-pandas-cover.png --epub-stylesheet=style.css --chapters -S -o $(notdir $@) \ 62 | title.txt \ 63 | modern_1_intro_processed.md 64 | 65 | markdown/sample.pdf: markdown/modern_1_intro_processed.md 66 | cd markdown && \ 67 | pandoc -f markdown-markdown_in_html_blocks -V documentclass=memoir -S --latex-engine=xelatex --template=$(HOME)/.pandoc/templates/default.latex -o $(notdir $@) \ 68 | title.txt \ 69 | modern_1_intro_processed.md 70 | 71 | test.epub: markdown/modern_1_intro_processed.md 72 | cd markdown && \ 73 | pandoc -f markdown-markdown_in_html_blocks $(notdir $<) | pandoc -f html -t markdown+pipe_tables -o test_processed.md && \ 74 | pandoc -f markdown-markdown_in_html_blocks --epub-cover-image=../cover/modern-pandas-cover.png --epub-stylesheet=style.css --chapters -S -o $(notdir $@) \ 75 | test_processed.md 76 | 
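Editor's aside (not part of the original repo): the `markdown/%.md` pattern rule above shells out to `jupyter nbconvert`. For readers who would rather drive the same execute-and-export step from Python, here is a minimal sketch using nbconvert's programmatic API (file names are illustrative, not part of the Makefile):

    import nbformat
    from nbconvert import MarkdownExporter
    from nbconvert.preprocessors import ExecutePreprocessor

    # Execute the notebook in place, mirroring `--execute --allow-errors`.
    nb = nbformat.read("modern_1_intro.ipynb", as_version=4)
    ExecutePreprocessor(timeout=9999999, allow_errors=True).preprocess(
        nb, {"metadata": {"path": "."}}
    )

    # Export to markdown, mirroring `--to=markdown`.
    body, resources = MarkdownExporter().from_notebook_node(nb)
    with open("markdown/modern_1_intro.md", "w") as f:
        f.write(body)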
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Effective Pandas 2 | 3 | ![Effective Pandas](cover/modern-pandas-cover.png) 4 | 5 | A collection of notebooks behind my [series](http://tomaugspurger.github.io/modern-1-intro.html) on 6 | writing idiomatic pandas. 7 | 8 | ## Contents 9 | 10 | - [Modern Pandas](modern_1_intro.ipynb) 11 | - [Method Chaining](modern_2_method_chaining.ipynb) 12 | - [Indexes](modern_3_indexes.ipynb) 13 | - [Fast Pandas](modern_4_performance.ipynb) 14 | - [Tidy Data](modern_5_tidy.ipynb) 15 | - [Visualization](modern_6_visualization.ipynb) 16 | - [Time Series](modern_7_timeseries.ipynb) 17 | - [Out of Core](modern_8_out_of_core.ipynb) (partial) 18 | 19 | -------------------------------------------------------------------------------- /cover/modern-pandas-cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TomAugspurger/effective-pandas/bb3f49066bd85accf9d5b84b16870e02e223aaee/cover/modern-pandas-cover.png -------------------------------------------------------------------------------- /markdown/style.css: -------------------------------------------------------------------------------- 1 | table { width: 53%; 2 | text-align: right; 3 | font-size: 14px; 4 | line-height: 1.4; 5 | margin-top: 1.4rem; 6 | margin-bottom: 1.4rem; 7 | margin-left: 1%; 8 | margin-right: 1%; 9 | border-top: 2px solid #333333; 10 | border-bottom: 2px solid #333333; 11 | border-collapse: separate; 12 | border-spacing: 0 5px; 13 | -webkit-font-feature-settings: 'tnum'; /* This is technically redundant */ 14 | -moz-font-feature-settings: 'tnum'; 15 | -ms-font-feature-settings: 'tnum'; } 16 | 17 | -------------------------------------------------------------------------------- /modern-1-url.txt: -------------------------------------------------------------------------------- 1 | 
UserTableName=On_Time_Performance&DBShortName=On_Time&RawDataTable=T_ONTIME&sqlstr=+SELECT+FL_DATE%2CUNIQUE_CARRIER%2CAIRLINE_ID%2CTAIL_NUM%2CFL_NUM%2CORIGIN_AIRPORT_ID%2CORIGIN_AIRPORT_SEQ_ID%2CORIGIN_CITY_MARKET_ID%2CORIGIN%2CORIGIN_CITY_NAME%2CDEST_AIRPORT_ID%2CDEST_AIRPORT_SEQ_ID%2CDEST_CITY_MARKET_ID%2CDEST%2CDEST_CITY_NAME%2CCRS_DEP_TIME%2CDEP_TIME%2CDEP_DELAY%2CTAXI_OUT%2CWHEELS_OFF%2CWHEELS_ON%2CTAXI_IN%2CCRS_ARR_TIME%2CARR_TIME%2CARR_DELAY%2CCANCELLED%2CCANCELLATION_CODE%2CCARRIER_DELAY%2CWEATHER_DELAY%2CNAS_DELAY%2CSECURITY_DELAY%2CLATE_AIRCRAFT_DELAY+FROM++T_ONTIME+WHERE+Month+%3D1+AND+YEAR%3D2017&varlist=FL_DATE%2CUNIQUE_CARRIER%2CAIRLINE_ID%2CTAIL_NUM%2CFL_NUM%2CORIGIN_AIRPORT_ID%2CORIGIN_AIRPORT_SEQ_ID%2CORIGIN_CITY_MARKET_ID%2CORIGIN%2CORIGIN_CITY_NAME%2CDEST_AIRPORT_ID%2CDEST_AIRPORT_SEQ_ID%2CDEST_CITY_MARKET_ID%2CDEST%2CDEST_CITY_NAME%2CCRS_DEP_TIME%2CDEP_TIME%2CDEP_DELAY%2CTAXI_OUT%2CWHEELS_OFF%2CWHEELS_ON%2CTAXI_IN%2CCRS_ARR_TIME%2CARR_TIME%2CARR_DELAY%2CCANCELLED%2CCANCELLATION_CODE%2CCARRIER_DELAY%2CWEATHER_DELAY%2CNAS_DELAY%2CSECURITY_DELAY%2CLATE_AIRCRAFT_DELAY&grouplist=&suml=&sumRegion=&filter1=title%3D&filter2=title%3D&geo=All%A0&time=January&timename=Month&GEOGRAPHY=All&XYEAR=2017&FREQUENCY=1&VarDesc=Year&VarType=Num&VarDesc=Quarter&VarType=Num&VarDesc=Month&VarType=Num&VarDesc=DayofMonth&VarType=Num&VarDesc=DayOfWeek&VarType=Num&VarName=FL_DATE&VarDesc=FlightDate&VarType=Char&VarName=UNIQUE_CARRIER&VarDesc=UniqueCarrier&VarType=Char&VarName=AIRLINE_ID&VarDesc=AirlineID&VarType=Num&VarDesc=Carrier&VarType=Char&VarName=TAIL_NUM&VarDesc=TailNum&VarType=Char&VarName=FL_NUM&VarDesc=FlightNum&VarType=Char&VarName=ORIGIN_AIRPORT_ID&VarDesc=OriginAirportID&VarType=Num&VarName=ORIGIN_AIRPORT_SEQ_ID&VarDesc=OriginAirportSeqID&VarType=Num&VarName=ORIGIN_CITY_MARKET_ID&VarDesc=OriginCityMarketID&VarType=Num&VarName=ORIGIN&VarDesc=Origin&VarType=Char&VarName=ORIGIN_CITY_NAME&VarDesc=OriginCityName&VarType=Char&VarDesc=OriginState&VarType=Char&VarDesc=OriginStateFips&VarType=Char&VarDesc=OriginStateName&VarType=Char&VarDesc=OriginWac&VarType=Num&VarName=DEST_AIRPORT_ID&VarDesc=DestAirportID&VarType=Num&VarName=DEST_AIRPORT_SEQ_ID&VarDesc=DestAirportSeqID&VarType=Num&VarName=DEST_CITY_MARKET_ID&VarDesc=DestCityMarketID&VarType=Num&VarName=DEST&VarDesc=Dest&VarType=Char&VarName=DEST_CITY_NAME&VarDesc=DestCityName&VarType=Char&VarDesc=DestState&VarType=Char&VarDesc=DestStateFips&VarType=Char&VarDesc=DestStateName&VarType=Char&VarDesc=DestWac&VarType=Num&VarName=CRS_DEP_TIME&VarDesc=CRSDepTime&VarType=Char&VarName=DEP_TIME&VarDesc=DepTime&VarType=Char&VarName=DEP_DELAY&VarDesc=DepDelay&VarType=Num&VarDesc=DepDelayMinutes&VarType=Num&VarDesc=DepDel15&VarType=Num&VarDesc=DepartureDelayGroups&VarType=Num&VarDesc=DepTimeBlk&VarType=Char&VarName=TAXI_OUT&VarDesc=TaxiOut&VarType=Num&VarName=WHEELS_OFF&VarDesc=WheelsOff&VarType=Char&VarName=WHEELS_ON&VarDesc=WheelsOn&VarType=Char&VarName=TAXI_IN&VarDesc=TaxiIn&VarType=Num&VarName=CRS_ARR_TIME&VarDesc=CRSArrTime&VarType=Char&VarName=ARR_TIME&VarDesc=ArrTime&VarType=Char&VarName=ARR_DELAY&VarDesc=ArrDelay&VarType=Num&VarDesc=ArrDelayMinutes&VarType=Num&VarDesc=ArrDel15&VarType=Num&VarDesc=ArrivalDelayGroups&VarType=Num&VarDesc=ArrTimeBlk&VarType=Char&VarName=CANCELLED&VarDesc=Cancelled&VarType=Num&VarName=CANCELLATION_CODE&VarDesc=CancellationCode&VarType=Char&VarDesc=Diverted&VarType=Num&VarDesc=CRSElapsedTime&VarType=Num&VarDesc=ActualElapsedTime&VarType=Num&VarDesc=AirTime&VarType=Num&VarDesc=Flights&VarType=Num&VarDesc=Distance&VarType
=Num&VarDesc=DistanceGroup&VarType=Num&VarName=CARRIER_DELAY&VarDesc=CarrierDelay&VarType=Num&VarName=WEATHER_DELAY&VarDesc=WeatherDelay&VarType=Num&VarName=NAS_DELAY&VarDesc=NASDelay&VarType=Num&VarName=SECURITY_DELAY&VarDesc=SecurityDelay&VarType=Num&VarName=LATE_AIRCRAFT_DELAY&VarDesc=LateAircraftDelay&VarType=Num&VarDesc=FirstDepTime&VarType=Char&VarDesc=TotalAddGTime&VarType=Num&VarDesc=LongestAddGTime&VarType=Num&VarDesc=DivAirportLandings&VarType=Num&VarDesc=DivReachedDest&VarType=Num&VarDesc=DivActualElapsedTime&VarType=Num&VarDesc=DivArrDelay&VarType=Num&VarDesc=DivDistance&VarType=Num&VarDesc=Div1Airport&VarType=Char&VarDesc=Div1AirportID&VarType=Num&VarDesc=Div1AirportSeqID&VarType=Num&VarDesc=Div1WheelsOn&VarType=Char&VarDesc=Div1TotalGTime&VarType=Num&VarDesc=Div1LongestGTime&VarType=Num&VarDesc=Div1WheelsOff&VarType=Char&VarDesc=Div1TailNum&VarType=Char&VarDesc=Div2Airport&VarType=Char&VarDesc=Div2AirportID&VarType=Num&VarDesc=Div2AirportSeqID&VarType=Num&VarDesc=Div2WheelsOn&VarType=Char&VarDesc=Div2TotalGTime&VarType=Num&VarDesc=Div2LongestGTime&VarType=Num&VarDesc=Div2WheelsOff&VarType=Char&VarDesc=Div2TailNum&VarType=Char&VarDesc=Div3Airport&VarType=Char&VarDesc=Div3AirportID&VarType=Num&VarDesc=Div3AirportSeqID&VarType=Num&VarDesc=Div3WheelsOn&VarType=Char&VarDesc=Div3TotalGTime&VarType=Num&VarDesc=Div3LongestGTime&VarType=Num&VarDesc=Div3WheelsOff&VarType=Char&VarDesc=Div3TailNum&VarType=Char&VarDesc=Div4Airport&VarType=Char&VarDesc=Div4AirportID&VarType=Num&VarDesc=Div4AirportSeqID&VarType=Num&VarDesc=Div4WheelsOn&VarType=Char&VarDesc=Div4TotalGTime&VarType=Num&VarDesc=Div4LongestGTime&VarType=Num&VarDesc=Div4WheelsOff&VarType=Char&VarDesc=Div4TailNum&VarType=Char&VarDesc=Div5Airport&VarType=Char&VarDesc=Div5AirportID&VarType=Num&VarDesc=Div5AirportSeqID&VarType=Num&VarDesc=Div5WheelsOn&VarType=Char&VarDesc=Div5TotalGTime&VarType=Num&VarDesc=Div5LongestGTime&VarType=Num&VarDesc=Div5WheelsOff&VarType=Char&VarDesc=Div5TailNum&VarType=Char 2 | -------------------------------------------------------------------------------- /modern_1_intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Effective Pandas\n", 8 | "\n", 9 | "## Introduction\n", 10 | "\n", 11 | "This series is about how to make effective use of [pandas](http://pandas.pydata.org), a data analysis library for the Python programming language.\n", 12 | "It's targeted at an intermediate level: people who have some experience with pandas, but are looking to improve.\n", 13 | "\n", 14 | "## Prior Art\n", 15 | "\n", 16 | "There are many great resources for learning pandas; this is not one of them.\n", 17 | "For beginners, I typically recommend [Greg Reda's](https://twitter.com/gjreda) [3-part introduction](http://gregreda.com/2013/10/26/intro-to-pandas-data-structures/), especially if they're familiar with SQL. Of course, there's the pandas [documentation](http://pandas.pydata.org/) itself. I gave [a talk](https://www.youtube.com/watch?v=otCriSKVV_8) at PyData Seattle targeted as an introduction if you prefer video form. Wes McKinney's [Python for Data Analysis](http://shop.oreilly.com/product/0636920023784.do) is still the goto book (and is also a really good introduction to NumPy as well). 
Jake VanderPlas's [Python Data Science Handbook](http://shop.oreilly.com/product/0636920034919.do), in early release, is great too.\n", 18 | "Kevin Markham has a [video series](http://www.dataschool.io/easier-data-analysis-with-pandas/) for beginners learning pandas.\n", 19 | "\n", 20 | "With all those resources (and many more that I've slighted through omission), why write another? Surely the law of diminishing returns is kicking in by now.\n", 21 | "Still, I thought there was room for a guide that is up to date (as of March 2016) and emphasizes idiomatic pandas code (code that is *pandorable*).\n", 22 | "This series probably won't be appropriate for people completely new to Python,\n", 23 | "NumPy, or pandas.\n", 24 | "By luck, this first post happened to cover topics that are relatively introductory,\n", 25 | "so read some of the linked material and come back, or [let me know](https://twitter.com/tomaugspurger) if you\n", 26 | "have questions.\n", 27 | "\n", 28 | "## Get the Data\n", 29 | "\n", 30 | "We'll be working with [flight delay data](http://www.transtats.bts.gov/databases.asp?Mode_ID=1&Mode_Desc=Aviation&Subject_ID2=0) from the BTS (R users can install Hadley's [NYCFlights13](https://github.com/hadley/nycflights13) dataset for similar data).\n", 31 | "\n" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 1, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import os\n", 41 | "import zipfile\n", 42 | "\n", 43 | "import requests\n", 44 | "import numpy as np\n", 45 | "import pandas as pd\n", 46 | "import seaborn as sns\n", 47 | "import matplotlib.pyplot as plt\n", 48 | "\n", 49 | "if int(os.environ.get(\"MODERN_PANDAS_EPUB\", 0)):\n", 50 | "    import prep" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import requests\n", 60 | "\n", 61 | "headers = {\n", 62 | "    'Referer': 'https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time',\n", 63 | "    'Origin': 'https://www.transtats.bts.gov',\n", 64 | "    'Content-Type': 'application/x-www-form-urlencoded',\n", 65 | "}\n", 66 | "\n", 67 | "params = (\n", 68 | "    ('Table_ID', '236'),\n", 69 | "    ('Has_Group', '3'),\n", 70 | "    ('Is_Zipped', '0'),\n", 71 | ")\n", 72 | "\n", 73 | "with open('modern-1-url.txt', encoding='utf-8') as f:\n", 74 | "    data = f.read().strip()\n", 75 | "\n", 76 | "os.makedirs('data', exist_ok=True)\n", 77 | "dest = \"data/flights.csv.zip\"\n", 78 | "\n", 79 | "if not os.path.exists(dest):\n", 80 | "    r = requests.post('https://www.transtats.bts.gov/DownLoad_Table.asp',\n", 81 | "                      headers=headers, params=params, data=data, stream=True)\n", 82 | "\n", 83 | "    with open(\"data/flights.csv.zip\", 'wb') as f:\n", 84 | "        for chunk in r.iter_content(chunk_size=102400):\n", 85 | "            if chunk:\n", 86 | "                f.write(chunk)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "That download returned a ZIP file.\n", 94 | "There's an open [Pull Request](https://github.com/pydata/pandas/pull/12175) for automatically decompressing ZIP archives with a single CSV,\n", 95 | "but for now we have to extract it ourselves and then read it in."
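(Editor's note, not in the original post: support for this has since been added. Recent pandas versions, roughly 0.18.1 onward, can read a ZIP archive containing a single CSV directly, inferring the compression from the file extension. A minimal sketch:

    # Replaces the extract-then-read dance in the next cell on newer pandas.
    df = pd.read_csv("data/flights.csv.zip", parse_dates=["FL_DATE"])

The manual route shown next still works, and is what the original post used.)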
96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 3, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "\n", 108 | "RangeIndex: 450017 entries, 0 to 450016\n", 109 | "Data columns (total 33 columns):\n", 110 | "fl_date 450017 non-null datetime64[ns]\n", 111 | "unique_carrier 450017 non-null object\n", 112 | "airline_id 450017 non-null int64\n", 113 | "tail_num 449378 non-null object\n", 114 | "fl_num 450017 non-null int64\n", 115 | "origin_airport_id 450017 non-null int64\n", 116 | "origin_airport_seq_id 450017 non-null int64\n", 117 | "origin_city_market_id 450017 non-null int64\n", 118 | "origin 450017 non-null object\n", 119 | "origin_city_name 450017 non-null object\n", 120 | "dest_airport_id 450017 non-null int64\n", 121 | "dest_airport_seq_id 450017 non-null int64\n", 122 | "dest_city_market_id 450017 non-null int64\n", 123 | "dest 450017 non-null object\n", 124 | "dest_city_name 450017 non-null object\n", 125 | "crs_dep_time 450017 non-null int64\n", 126 | "dep_time 441476 non-null float64\n", 127 | "dep_delay 441476 non-null float64\n", 128 | "taxi_out 441244 non-null float64\n", 129 | "wheels_off 441244 non-null float64\n", 130 | "wheels_on 440746 non-null float64\n", 131 | "taxi_in 440746 non-null float64\n", 132 | "crs_arr_time 450017 non-null int64\n", 133 | "arr_time 440746 non-null float64\n", 134 | "arr_delay 439645 non-null float64\n", 135 | "cancelled 450017 non-null float64\n", 136 | "cancellation_code 8886 non-null object\n", 137 | "carrier_delay 97699 non-null float64\n", 138 | "weather_delay 97699 non-null float64\n", 139 | "nas_delay 97699 non-null float64\n", 140 | "security_delay 97699 non-null float64\n", 141 | "late_aircraft_delay 97699 non-null float64\n", 142 | "unnamed: 32 0 non-null float64\n", 143 | "dtypes: datetime64[ns](1), float64(15), int64(10), object(7)\n", 144 | "memory usage: 113.3+ MB\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "zf = zipfile.ZipFile(\"data/flights.csv.zip\")\n", 150 | "fp = zf.extract(zf.filelist[0].filename, path='data/')\n", 151 | "df = pd.read_csv(fp, parse_dates=[\"FL_DATE\"]).rename(columns=str.lower)\n", 152 | "\n", 153 | "df.info()" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "## Indexing\n", 161 | "\n", 162 | "Or, *explicit is better than implicit*.\n", 163 | "By my count, 7 of the top-15 voted pandas questions on [Stackoverflow](http://stackoverflow.com/questions/tagged/pandas?sort=votes&pageSize=15) are about indexing. 
This seems as good a place as any to start.\n", 164 | "\n", 165 | "By indexing, we mean the selection of subsets of a DataFrame or Series.\n", 166 | "`DataFrames` (and to a lesser extent, `Series`) pose a difficult set of challenges:\n", 167 | "\n", 168 | "- Like lists, you can index by location.\n", 169 | "- Like dictionaries, you can index by label.\n", 170 | "- Like NumPy arrays, you can index by boolean masks.\n", 171 | "- Any of these indexers could be scalar indexes, or they could be arrays, or they could be `slice`s.\n", 172 | "- Any of these should work on the index (row labels) or columns of a DataFrame.\n", 173 | "- And any of these should work on hierarchical indexes.\n", 174 | "\n", 175 | "The complexity of pandas' indexing is a microcosm of the complexity of the pandas API in general.\n", 176 | "There's a reason for the complexity (well, most of it), but that's not *much* consolation while you're learning.\n", 177 | "Still, all of these ways of indexing really are useful enough to justify their inclusion in the library. (An editor's aside just below sketches the first three modes in code.)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "## Slicing\n", 185 | "\n", 186 | "Slicing is the most common of those indexing tasks, so we'll start there.\n", 187 | "\n", 188 | "Brief history digression: For years the preferred method for row and/or column selection was `.ix`." 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 5, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/html": [ 201 | "
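Editor's aside (not in the original post): for concreteness, the three basic modes from the list above, sketched against the `df` we just loaded (the column positions are illustrative):

    df.loc[10:15, ['fl_date', 'tail_num']]   # by label (these row labels happen to be integers)
    df.iloc[10:15, [0, 3]]                   # by position (columns 0 and 3 are fl_date, tail_num)
    df[df['dep_delay'] > 60]                 # by boolean mask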
\n", 202 | "\n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | "
fl_datetail_num
102014-01-01N3LGAA
112014-01-01N368AA
122014-01-01N3DDAA
132014-01-01N332AA
142014-01-01N327AA
152014-01-01N3LBAA
\n", 243 | "
" 244 | ], 245 | "text/plain": [ 246 | " fl_date tail_num\n", 247 | "10 2014-01-01 N3LGAA\n", 248 | "11 2014-01-01 N368AA\n", 249 | "12 2014-01-01 N3DDAA\n", 250 | "13 2014-01-01 N332AA\n", 251 | "14 2014-01-01 N327AA\n", 252 | "15 2014-01-01 N3LBAA" 253 | ] 254 | }, 255 | "execution_count": 5, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "df.ix[10:15, ['fl_date', 'tail_num']]" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "However this simple little operation hides some complexity. What if, rather than our default `range(n)` index, we had an integer index like" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 6, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "data": { 278 | "text/html": [ 279 | "
\n", 280 | "\n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | "
fl_dateunique_carrier
airline_id
193932014-01-01WN
196902014-01-01HA
197902014-01-01DL
198052014-01-01AA
199302014-01-01AS
\n", 321 | "
" 322 | ], 323 | "text/plain": [ 324 | " fl_date unique_carrier\n", 325 | "airline_id \n", 326 | "19393 2014-01-01 WN\n", 327 | "19690 2014-01-01 HA\n", 328 | "19790 2014-01-01 DL\n", 329 | "19805 2014-01-01 AA\n", 330 | "19930 2014-01-01 AS" 331 | ] 332 | }, 333 | "execution_count": 6, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "first = df.groupby('airline_id')[['fl_date', 'unique_carrier']].first()\n", 340 | "first.head()" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "Can you predict ahead of time what our slice from above will give when passed to `.ix`?" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 7, 353 | "metadata": {}, 354 | "outputs": [ 355 | { 356 | "data": { 357 | "text/html": [ 358 | "
\n", 359 | "\n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | "
fl_datetail_num
airline_id
\n", 375 | "
" 376 | ], 377 | "text/plain": [ 378 | "Empty DataFrame\n", 379 | "Columns: [fl_date, tail_num]\n", 380 | "Index: []" 381 | ] 382 | }, 383 | "execution_count": 7, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "first.ix[10:15, ['fl_date', 'tail_num']]" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "Surprise, an empty DataFrame! Which in data analysis is rarely a good thing. What happened?\n", 397 | "\n", 398 | "We had an integer index, so the call to `.ix` used its label-based mode. It was looking for integer *labels* between 10:15 (inclusive). It didn't find any. Since we sliced a range it returned an empty DataFrame, rather than raising a KeyError.\n", 399 | "\n", 400 | "By way of contrast, suppose we had a string index, rather than integers." 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 8, 406 | "metadata": {}, 407 | "outputs": [ 408 | { 409 | "data": { 410 | "text/html": [ 411 | "
\n", 412 | "\n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | "
fl_datetail_num
unique_carrier
UA2014-01-01N14214
US2014-01-01N650AW
VX2014-01-01N637VA
WN2014-01-01N412WN
\n", 448 | "
" 449 | ], 450 | "text/plain": [ 451 | " fl_date tail_num\n", 452 | "unique_carrier \n", 453 | "UA 2014-01-01 N14214\n", 454 | "US 2014-01-01 N650AW\n", 455 | "VX 2014-01-01 N637VA\n", 456 | "WN 2014-01-01 N412WN" 457 | ] 458 | }, 459 | "execution_count": 8, 460 | "metadata": {}, 461 | "output_type": "execute_result" 462 | } 463 | ], 464 | "source": [ 465 | "first = df.groupby('unique_carrier').first()\n", 466 | "first.ix[10:15, ['fl_date', 'tail_num']]" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "\n", 474 | "And it works again! Now that we had a string index, `.ix` used its positional-mode. It looked for *rows* 10-15 (exclusive on the right).\n", 475 | "\n", 476 | "But you can't reliably predict what the outcome of the slice will be ahead of time. It's on the *reader* of the code (probably your future self) to know the dtypes so you can reckon whether `.ix` will use label indexing (returning the empty DataFrame) or positional indexing (like the last example).\n", 477 | "In general, methods whose behavior depends on the data, like `.ix` dispatching to label-based indexing on integer Indexes but location-based indexing on non-integer, are hard to use correctly. We've been trying to stamp them out in pandas.\n", 478 | "\n", 479 | "Since pandas 0.12, these tasks have been cleanly separated into two methods:\n", 480 | "\n", 481 | "1. `.loc` for label-based indexing\n", 482 | "2. `.iloc` for positional indexing" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": 9, 488 | "metadata": {}, 489 | "outputs": [ 490 | { 491 | "data": { 492 | "text/html": [ 493 | "
\n", 494 | "\n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | "
fl_datetail_num
unique_carrier
AA2014-01-01N338AA
AS2014-01-01N524AS
DL2014-01-01N911DL
\n", 525 | "
" 526 | ], 527 | "text/plain": [ 528 | " fl_date tail_num\n", 529 | "unique_carrier \n", 530 | "AA 2014-01-01 N338AA\n", 531 | "AS 2014-01-01 N524AS\n", 532 | "DL 2014-01-01 N911DL" 533 | ] 534 | }, 535 | "execution_count": 9, 536 | "metadata": {}, 537 | "output_type": "execute_result" 538 | } 539 | ], 540 | "source": [ 541 | "first.loc[['AA', 'AS', 'DL'], ['fl_date', 'tail_num']]" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 10, 547 | "metadata": {}, 548 | "outputs": [ 549 | { 550 | "data": { 551 | "text/html": [ 552 | "
\n", 553 | "\n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | "
fl_dateairline_id
unique_carrier
AA2014-01-0119805
AS2014-01-0119930
DL2014-01-0119790
\n", 584 | "
" 585 | ], 586 | "text/plain": [ 587 | " fl_date airline_id\n", 588 | "unique_carrier \n", 589 | "AA 2014-01-01 19805\n", 590 | "AS 2014-01-01 19930\n", 591 | "DL 2014-01-01 19790" 592 | ] 593 | }, 594 | "execution_count": 10, 595 | "metadata": {}, 596 | "output_type": "execute_result" 597 | } 598 | ], 599 | "source": [ 600 | "first.iloc[[0, 1, 3], [0, 1]]" 601 | ] 602 | }, 603 | { 604 | "cell_type": "markdown", 605 | "metadata": {}, 606 | "source": [ 607 | "`.ix` is still around, and isn't being deprecated any time soon. Occasionally it's useful. But if you've been using `.ix` out of habit, or if you didn't know any better, maybe give `.loc` and `.iloc` a shot. For the intrepid reader, Joris Van den Bossche (a core pandas dev) [compiled a great overview](https://github.com/pydata/pandas/issues/9595) of the pandas `__getitem__` API.\n", 608 | "A later post in this series will go into more detail on using Indexes effectively;\n", 609 | "they are useful objects in their own right, but for now we'll move on to a closely related topic.\n", 610 | "\n", 611 | "## SettingWithCopy\n", 612 | "\n", 613 | "Pandas used to get *a lot* of questions about assignments seemingly not working. We'll take [this StackOverflow](http://stackoverflow.com/q/16553298/1889400) question as a representative question." 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 11, 619 | "metadata": {}, 620 | "outputs": [ 621 | { 622 | "data": { 623 | "text/html": [ 624 | "
\n", 625 | "\n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | "
ab
0110
1220
2330
3440
4550
\n", 661 | "
" 662 | ], 663 | "text/plain": [ 664 | " a b\n", 665 | "0 1 10\n", 666 | "1 2 20\n", 667 | "2 3 30\n", 668 | "3 4 40\n", 669 | "4 5 50" 670 | ] 671 | }, 672 | "execution_count": 11, 673 | "metadata": {}, 674 | "output_type": "execute_result" 675 | } 676 | ], 677 | "source": [ 678 | "f = pd.DataFrame({'a':[1,2,3,4,5], 'b':[10,20,30,40,50]})\n", 679 | "f" 680 | ] 681 | }, 682 | { 683 | "cell_type": "markdown", 684 | "metadata": {}, 685 | "source": [ 686 | "The user wanted to take the rows of `b` where `a` was 3 or less, and set them equal to `b / 10`\n", 687 | "We'll use boolean indexing to select those rows `f['a'] <= 3`," 688 | ] 689 | }, 690 | { 691 | "cell_type": "code", 692 | "execution_count": 12, 693 | "metadata": {}, 694 | "outputs": [ 695 | { 696 | "data": { 697 | "text/html": [ 698 | "
\n", 699 | "\n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | "
ab
0110
1220
2330
3440
4550
\n", 735 | "
" 736 | ], 737 | "text/plain": [ 738 | " a b\n", 739 | "0 1 10\n", 740 | "1 2 20\n", 741 | "2 3 30\n", 742 | "3 4 40\n", 743 | "4 5 50" 744 | ] 745 | }, 746 | "execution_count": 12, 747 | "metadata": {}, 748 | "output_type": "execute_result" 749 | } 750 | ], 751 | "source": [ 752 | "# ignore the context manager for now\n", 753 | "with pd.option_context('mode.chained_assignment', None):\n", 754 | " f[f['a'] <= 3]['b'] = f[f['a'] <= 3 ]['b'] / 10\n", 755 | "f" 756 | ] 757 | }, 758 | { 759 | "cell_type": "markdown", 760 | "metadata": {}, 761 | "source": [ 762 | "And nothing happened. Well, something did happen, but nobody witnessed it. If an object without any references is modified, does it make a sound?\n", 763 | "\n", 764 | "The warning I silenced above with the context manager links to [an explanation](http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy) that's quite helpful. I'll summarize the high points here.\n", 765 | "\n", 766 | "The \"failure\" to update `f` comes down to what's called *chained indexing*, a practice to be avoided.\n", 767 | "The \"chained\" comes from indexing multiple times, one after another, rather than one single indexing operation.\n", 768 | "Above we had two operations on the left-hand side, one `__getitem__` and one `__setitem__` (in python, the square brackets are syntactic sugar for `__getitem__` or `__setitem__` if it's for assignment). So `f[f['a'] <= 3]['b']` becomes\n", 769 | "\n", 770 | "1. `getitem`: `f[f['a'] <= 3]`\n", 771 | "2. `setitem`: `_['b'] = ...` # using `_` to represent the result of 1.\n", 772 | "\n", 773 | "In general, pandas can't guarantee whether that first `__getitem__` returns a view or a copy of the underlying data.\n", 774 | "The changes *will* be made to the thing I called `_` above, the result of the `__getitem__` in `1`.\n", 775 | "But we don't know that `_` shares the same memory as our original `f`.\n", 776 | "And so we can't be sure that whatever changes are being made to `_` will be reflected in `f`.\n", 777 | "\n", 778 | "Done properly, you would write" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": 13, 784 | "metadata": {}, 785 | "outputs": [ 786 | { 787 | "data": { 788 | "text/html": [ 789 | "
\n", 790 | "\n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | "
ab
011.0
122.0
233.0
3440.0
4550.0
\n", 826 | "
" 827 | ], 828 | "text/plain": [ 829 | " a b\n", 830 | "0 1 1.0\n", 831 | "1 2 2.0\n", 832 | "2 3 3.0\n", 833 | "3 4 40.0\n", 834 | "4 5 50.0" 835 | ] 836 | }, 837 | "execution_count": 13, 838 | "metadata": {}, 839 | "output_type": "execute_result" 840 | } 841 | ], 842 | "source": [ 843 | "f.loc[f['a'] <= 3, 'b'] = f.loc[f['a'] <= 3, 'b'] / 10\n", 844 | "f" 845 | ] 846 | }, 847 | { 848 | "cell_type": "markdown", 849 | "metadata": {}, 850 | "source": [ 851 | "Now this is all in a single call to `__setitem__` and pandas can ensure that the assignment happens properly.\n", 852 | "\n", 853 | "The rough rule is any time you see back-to-back square brackets, `][`, you're in asking for trouble. Replace that with a `.loc[..., ...]` and you'll be set.\n", 854 | "\n", 855 | "The other bit of advice is that a SettingWithCopy warning is raised when the *assignment* is made.\n", 856 | "The potential copy could be made earlier in your code.\n", 857 | "\n", 858 | "## Multidimensional Indexing\n", 859 | "\n", 860 | "MultiIndexes might just be my favorite feature of pandas.\n", 861 | "They let you represent higher-dimensional datasets in a familiar two-dimensional table, which my brain can sometimes handle.\n", 862 | "Each additional level of the MultiIndex represents another dimension.\n", 863 | "The cost of this is somewhat harder label indexing.\n", 864 | "\n", 865 | "My very first bug report to pandas, back in [November 2012](https://github.com/pydata/pandas/issues/2207),\n", 866 | "was about indexing into a MultiIndex.\n", 867 | "I bring it up now because I genuinely couldn't tell whether the result I got was a bug or not.\n", 868 | "Also, from that bug report\n", 869 | "\n", 870 | "> Sorry if this isn't actually a bug. Still very new to python. Thanks!\n", 871 | "\n", 872 | "Adorable.\n", 873 | "\n", 874 | "That operation was made much easier by [this](http://pandas.pydata.org/pandas-docs/version/0.18.0/whatsnew.html#multiindexing-using-slicers) addition in 2014, which lets you slice arbitrary levels of a MultiIndex..\n", 875 | "Let's make a MultiIndexed DataFrame to work with." 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": 14, 881 | "metadata": {}, 882 | "outputs": [ 883 | { 884 | "data": { 885 | "text/html": [ 886 | "
\n", 887 | "\n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | "
airline_idfl_numorigin_airport_idorigin_airport_seq_id
unique_carrierorigindesttail_numfl_date
AAABQDFWN200AA2014-01-06198051662101401014002
2014-01-27198051090101401014002
N202AA2014-01-27198051332101401014002
N426AA2014-01-09198051662101401014002
2014-01-15198051467101401014002
\n", 956 | "
" 957 | ], 958 | "text/plain": [ 959 | " airline_id fl_num \\\n", 960 | "unique_carrier origin dest tail_num fl_date \n", 961 | "AA ABQ DFW N200AA 2014-01-06 19805 1662 \n", 962 | " 2014-01-27 19805 1090 \n", 963 | " N202AA 2014-01-27 19805 1332 \n", 964 | " N426AA 2014-01-09 19805 1662 \n", 965 | " 2014-01-15 19805 1467 \n", 966 | "\n", 967 | " origin_airport_id \\\n", 968 | "unique_carrier origin dest tail_num fl_date \n", 969 | "AA ABQ DFW N200AA 2014-01-06 10140 \n", 970 | " 2014-01-27 10140 \n", 971 | " N202AA 2014-01-27 10140 \n", 972 | " N426AA 2014-01-09 10140 \n", 973 | " 2014-01-15 10140 \n", 974 | "\n", 975 | " origin_airport_seq_id \n", 976 | "unique_carrier origin dest tail_num fl_date \n", 977 | "AA ABQ DFW N200AA 2014-01-06 1014002 \n", 978 | " 2014-01-27 1014002 \n", 979 | " N202AA 2014-01-27 1014002 \n", 980 | " N426AA 2014-01-09 1014002 \n", 981 | " 2014-01-15 1014002 " 982 | ] 983 | }, 984 | "execution_count": 14, 985 | "metadata": {}, 986 | "output_type": "execute_result" 987 | } 988 | ], 989 | "source": [ 990 | "hdf = df.set_index(['unique_carrier', 'origin', 'dest', 'tail_num', 'fl_date']).sort_index()\n", 991 | "hdf[hdf.columns[:4]].head()" 992 | ] 993 | }, 994 | { 995 | "cell_type": "markdown", 996 | "metadata": {}, 997 | "source": [ 998 | "And just to clear up some terminology, the *levels* of a MultiIndex are the\n", 999 | "former column names (`unique_carrier`, `origin`...).\n", 1000 | "The labels are the actual values in a level, (`'AA'`, `'ABQ'`, ...).\n", 1001 | "Levels can be referred to by name or position, with 0 being the outermost level.\n", 1002 | "\n", 1003 | "Slicing the outermost index level is pretty easy, we just use our regular `.loc[row_indexer, column_indexer]`. We'll select the columns `dep_time` and `dep_delay` where the carrier was American Airlines, Delta, or US Airways." 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "code", 1008 | "execution_count": 15, 1009 | "metadata": {}, 1010 | "outputs": [ 1011 | { 1012 | "data": { 1013 | "text/html": [ 1014 | "
\n", 1015 | "\n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | "
dep_timedep_delay
unique_carrierorigindesttail_numfl_date
AAABQDFWN200AA2014-01-061246.071.0
2014-01-27605.00.0
N202AA2014-01-27822.0-13.0
N426AA2014-01-091135.00.0
2014-01-151022.0-8.0
.....................
USTUSPHXN824AW2014-01-161900.0-10.0
2014-01-201903.0-7.0
N836AW2014-01-081928.018.0
2014-01-291908.0-2.0
N837AW2014-01-101902.0-8.0
\n", 1110 | "

139194 rows × 2 columns

\n", 1111 | "
" 1112 | ], 1113 | "text/plain": [ 1114 | " dep_time dep_delay\n", 1115 | "unique_carrier origin dest tail_num fl_date \n", 1116 | "AA ABQ DFW N200AA 2014-01-06 1246.0 71.0\n", 1117 | " 2014-01-27 605.0 0.0\n", 1118 | " N202AA 2014-01-27 822.0 -13.0\n", 1119 | " N426AA 2014-01-09 1135.0 0.0\n", 1120 | " 2014-01-15 1022.0 -8.0\n", 1121 | "... ... ...\n", 1122 | "US TUS PHX N824AW 2014-01-16 1900.0 -10.0\n", 1123 | " 2014-01-20 1903.0 -7.0\n", 1124 | " N836AW 2014-01-08 1928.0 18.0\n", 1125 | " 2014-01-29 1908.0 -2.0\n", 1126 | " N837AW 2014-01-10 1902.0 -8.0\n", 1127 | "\n", 1128 | "[139194 rows x 2 columns]" 1129 | ] 1130 | }, 1131 | "execution_count": 15, 1132 | "metadata": {}, 1133 | "output_type": "execute_result" 1134 | } 1135 | ], 1136 | "source": [ 1137 | "hdf.loc[['AA', 'DL', 'US'], ['dep_time', 'dep_delay']]" 1138 | ] 1139 | }, 1140 | { 1141 | "cell_type": "markdown", 1142 | "metadata": {}, 1143 | "source": [ 1144 | "So far, so good. What if you wanted to select the rows whose origin was Chicago O'Hare (`ORD`) or Des Moines International Airport (DSM).\n", 1145 | "Well, `.loc` wants `[row_indexer, column_indexer]` so let's wrap the two elements of our row indexer (the list of carriers and the list of origins) in a tuple to make it a single unit:" 1146 | ] 1147 | }, 1148 | { 1149 | "cell_type": "code", 1150 | "execution_count": 16, 1151 | "metadata": {}, 1152 | "outputs": [ 1153 | { 1154 | "data": { 1155 | "text/html": [ 1156 | "
\n", 1157 | "\n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | "
dep_timedep_delay
unique_carrierorigindesttail_numfl_date
AADSMDFWN200AA2014-01-12603.0-7.0
2014-01-17751.0101.0
N424AA2014-01-101759.0-1.0
2014-01-151818.018.0
N426AA2014-01-071835.035.0
.....................
USORDPHXN806AW2014-01-261406.0-4.0
N830AW2014-01-281401.0-9.0
N833AW2014-01-101500.050.0
N837AW2014-01-191408.0-2.0
N839AW2014-01-141406.0-4.0
\n", 1254 | "

5205 rows × 2 columns

\n", 1255 | "
" 1256 | ], 1257 | "text/plain": [ 1258 | " dep_time dep_delay\n", 1259 | "unique_carrier origin dest tail_num fl_date \n", 1260 | "AA DSM DFW N200AA 2014-01-12 603.0 -7.0\n", 1261 | " 2014-01-17 751.0 101.0\n", 1262 | " N424AA 2014-01-10 1759.0 -1.0\n", 1263 | " 2014-01-15 1818.0 18.0\n", 1264 | " N426AA 2014-01-07 1835.0 35.0\n", 1265 | "... ... ...\n", 1266 | "US ORD PHX N806AW 2014-01-26 1406.0 -4.0\n", 1267 | " N830AW 2014-01-28 1401.0 -9.0\n", 1268 | " N833AW 2014-01-10 1500.0 50.0\n", 1269 | " N837AW 2014-01-19 1408.0 -2.0\n", 1270 | " N839AW 2014-01-14 1406.0 -4.0\n", 1271 | "\n", 1272 | "[5205 rows x 2 columns]" 1273 | ] 1274 | }, 1275 | "execution_count": 16, 1276 | "metadata": {}, 1277 | "output_type": "execute_result" 1278 | } 1279 | ], 1280 | "source": [ 1281 | "hdf.loc[(['AA', 'DL', 'US'], ['ORD', 'DSM']), ['dep_time', 'dep_delay']]" 1282 | ] 1283 | }, 1284 | { 1285 | "cell_type": "markdown", 1286 | "metadata": {}, 1287 | "source": [ 1288 | "Now try to do any flight from ORD or DSM, not just from those carriers.\n", 1289 | "This used to be a pain.\n", 1290 | "You might have to turn to the `.xs` method, or pass in `df.index.get_level_values(0)` and zip that up with the indexers your wanted, or maybe reset the index and do a boolean mask, and set the index again... ugh.\n", 1291 | "\n", 1292 | "But now, you can use an `IndexSlice`." 1293 | ] 1294 | }, 1295 | { 1296 | "cell_type": "code", 1297 | "execution_count": 17, 1298 | "metadata": {}, 1299 | "outputs": [ 1300 | { 1301 | "data": { 1302 | "text/html": [ 1303 | "
\n", 1304 | "\n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | "
dep_timedep_delay
unique_carrierorigindesttail_numfl_date
AADSMDFWN200AA2014-01-12603.0-7.0
2014-01-17751.0101.0
N424AA2014-01-101759.0-1.0
2014-01-151818.018.0
N426AA2014-01-071835.035.0
.....................
WNDSMMDWN941WN2014-01-171759.014.0
N943WN2014-01-102229.0284.0
N963WN2014-01-22656.0-4.0
N967WN2014-01-30654.0-6.0
N969WN2014-01-191747.02.0
\n", 1401 | "

22380 rows × 2 columns

\n", 1402 | "
" 1403 | ], 1404 | "text/plain": [ 1405 | " dep_time dep_delay\n", 1406 | "unique_carrier origin dest tail_num fl_date \n", 1407 | "AA DSM DFW N200AA 2014-01-12 603.0 -7.0\n", 1408 | " 2014-01-17 751.0 101.0\n", 1409 | " N424AA 2014-01-10 1759.0 -1.0\n", 1410 | " 2014-01-15 1818.0 18.0\n", 1411 | " N426AA 2014-01-07 1835.0 35.0\n", 1412 | "... ... ...\n", 1413 | "WN DSM MDW N941WN 2014-01-17 1759.0 14.0\n", 1414 | " N943WN 2014-01-10 2229.0 284.0\n", 1415 | " N963WN 2014-01-22 656.0 -4.0\n", 1416 | " N967WN 2014-01-30 654.0 -6.0\n", 1417 | " N969WN 2014-01-19 1747.0 2.0\n", 1418 | "\n", 1419 | "[22380 rows x 2 columns]" 1420 | ] 1421 | }, 1422 | "execution_count": 17, 1423 | "metadata": {}, 1424 | "output_type": "execute_result" 1425 | } 1426 | ], 1427 | "source": [ 1428 | "hdf.loc[pd.IndexSlice[:, ['ORD', 'DSM']], ['dep_time', 'dep_delay']]" 1429 | ] 1430 | }, 1431 | { 1432 | "cell_type": "markdown", 1433 | "metadata": {}, 1434 | "source": [ 1435 | "The `:` says include every label in this level.\n", 1436 | "The `IndexSlice` object is just sugar for the actual python `slice` object needed to remove slice each level." 1437 | ] 1438 | }, 1439 | { 1440 | "cell_type": "code", 1441 | "execution_count": 18, 1442 | "metadata": {}, 1443 | "outputs": [ 1444 | { 1445 | "data": { 1446 | "text/plain": [ 1447 | "(slice(None, None, None), ['ORD', 'DSM'])" 1448 | ] 1449 | }, 1450 | "execution_count": 18, 1451 | "metadata": {}, 1452 | "output_type": "execute_result" 1453 | } 1454 | ], 1455 | "source": [ 1456 | "pd.IndexSlice[:, ['ORD', 'DSM']]" 1457 | ] 1458 | }, 1459 | { 1460 | "cell_type": "markdown", 1461 | "metadata": {}, 1462 | "source": [ 1463 | "We use `IndexSlice` since `hdf.loc[(:, ['ORD', 'DSM'])]` isn't valid python syntax.\n", 1464 | "Now we can slice to our heart's content; all flights from O'Hare to Des Moines in the first half of January? Sure, why not?" 1465 | ] 1466 | }, 1467 | { 1468 | "cell_type": "code", 1469 | "execution_count": 19, 1470 | "metadata": {}, 1471 | "outputs": [ 1472 | { 1473 | "data": { 1474 | "text/html": [ 1475 | "
\n", 1476 | "\n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | " \n", 1574 | " \n", 1575 | " \n", 1576 | " \n", 1577 | " \n", 1578 | " \n", 1579 | " \n", 1580 | " \n", 1581 | " \n", 1582 | " \n", 1583 | " \n", 1584 | " \n", 1585 | " \n", 1586 | " \n", 1587 | " \n", 1588 | " \n", 1589 | " \n", 1590 | " \n", 1591 | " \n", 1592 | " \n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | "
dep_timedep_delayarr_timearr_delay
unique_carrierorigindesttail_numfl_date
EVORDDSMNaN2014-01-07NaNNaNNaNNaN
N111212014-01-05NaNNaNNaNNaN
N111812014-01-121514.06.01625.0-2.0
N115362014-01-101723.04.01853.019.0
N115392014-01-011127.0127.01304.0149.0
...........................
UAORDDSMN242122014-01-092023.08.02158.034.0
N732562014-01-152019.04.02127.03.0
N782852014-01-072020.05.02136.012.0
2014-01-132014.0-1.02114.0-10.0
N841UA2014-01-111825.020.01939.019.0
\n", 1600 | "

153 rows × 4 columns

\n", 1601 | "
" 1602 | ], 1603 | "text/plain": [ 1604 | " dep_time dep_delay arr_time \\\n", 1605 | "unique_carrier origin dest tail_num fl_date \n", 1606 | "EV ORD DSM NaN 2014-01-07 NaN NaN NaN \n", 1607 | " N11121 2014-01-05 NaN NaN NaN \n", 1608 | " N11181 2014-01-12 1514.0 6.0 1625.0 \n", 1609 | " N11536 2014-01-10 1723.0 4.0 1853.0 \n", 1610 | " N11539 2014-01-01 1127.0 127.0 1304.0 \n", 1611 | "... ... ... ... \n", 1612 | "UA ORD DSM N24212 2014-01-09 2023.0 8.0 2158.0 \n", 1613 | " N73256 2014-01-15 2019.0 4.0 2127.0 \n", 1614 | " N78285 2014-01-07 2020.0 5.0 2136.0 \n", 1615 | " 2014-01-13 2014.0 -1.0 2114.0 \n", 1616 | " N841UA 2014-01-11 1825.0 20.0 1939.0 \n", 1617 | "\n", 1618 | " arr_delay \n", 1619 | "unique_carrier origin dest tail_num fl_date \n", 1620 | "EV ORD DSM NaN 2014-01-07 NaN \n", 1621 | " N11121 2014-01-05 NaN \n", 1622 | " N11181 2014-01-12 -2.0 \n", 1623 | " N11536 2014-01-10 19.0 \n", 1624 | " N11539 2014-01-01 149.0 \n", 1625 | "... ... \n", 1626 | "UA ORD DSM N24212 2014-01-09 34.0 \n", 1627 | " N73256 2014-01-15 3.0 \n", 1628 | " N78285 2014-01-07 12.0 \n", 1629 | " 2014-01-13 -10.0 \n", 1630 | " N841UA 2014-01-11 19.0 \n", 1631 | "\n", 1632 | "[153 rows x 4 columns]" 1633 | ] 1634 | }, 1635 | "execution_count": 19, 1636 | "metadata": {}, 1637 | "output_type": "execute_result" 1638 | } 1639 | ], 1640 | "source": [ 1641 | "hdf.loc[pd.IndexSlice[:, 'ORD', 'DSM', :, '2014-01-01':'2014-01-15'],\n", 1642 | " ['dep_time', 'dep_delay', 'arr_time', 'arr_delay']]" 1643 | ] 1644 | }, 1645 | { 1646 | "cell_type": "markdown", 1647 | "metadata": {}, 1648 | "source": [ 1649 | "We'll talk more about working with Indexes (including MultiIndexes) in a later post. I have an unproven thesis that they're underused because `IndexSlice` is underused, causing people to think they're more unwieldy than they actually are. But let's close out part one.\n", 1650 | "\n", 1651 | "## WrapUp\n", 1652 | "\n", 1653 | "This first post covered Indexing, a topic that's central to pandas.\n", 1654 | "The power provided by the DataFrame comes with some unavoidable complexities.\n", 1655 | "Best practices (using `.loc` and `.iloc`) will spare you many a headache.\n", 1656 | "We then toured a couple of commonly misunderstood sub-topics, setting with copy and Hierarchical Indexing." 1657 | ] 1658 | } 1659 | ], 1660 | "metadata": { 1661 | "kernelspec": { 1662 | "display_name": "Python 3", 1663 | "language": "python", 1664 | "name": "python3" 1665 | }, 1666 | "language_info": { 1667 | "codemirror_mode": { 1668 | "name": "ipython", 1669 | "version": 3 1670 | }, 1671 | "file_extension": ".py", 1672 | "mimetype": "text/x-python", 1673 | "name": "python", 1674 | "nbconvert_exporter": "python", 1675 | "pygments_lexer": "ipython3", 1676 | "version": "3.6.1" 1677 | } 1678 | }, 1679 | "nbformat": 4, 1680 | "nbformat_minor": 1 1681 | } 1682 | -------------------------------------------------------------------------------- /modern_4_performance.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Performance\n", 8 | "\n", 9 | "[Wes McKinney](https://twitter.com/wesmckinn), the creator of pandas, is kind of obsessed with performance. 
From micro-optimizations for element access, to [embedding](https://github.com/pydata/pandas/tree/master/pandas/src/klib) a fast hash table inside pandas, we all benefit from his and others' hard work.\n", 10 | "This post will focus mainly on making efficient use of pandas and NumPy.\n", 11 | "\n", 12 | "One thing I'll explicitly not touch on is storage formats.\n", 13 | "Performance is just one of many factors that go into choosing a storage format.\n", 14 | "Just know that pandas can talk to [many formats](http://pandas.pydata.org/pandas-docs/version/0.18.0/io.html), and the format that strikes the right balance between performance, portability, data-types, metadata handling, etc., is an [ongoing](http://blog.cloudera.com/blog/2016/03/feather-a-fast-on-disk-format-for-data-frames-for-r-and-python-powered-by-apache-arrow/) topic of discussion." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "%matplotlib inline\n", 26 | "\n", 27 | "import os\n", 28 | "import numpy as np\n", 29 | "import pandas as pd\n", 30 | "import matplotlib.pyplot as plt\n", 31 | "import seaborn as sns\n", 32 | "\n", 33 | "if int(os.environ.get(\"MODERN_PANDAS_EPUB\", 0)):\n", 34 | " import prep # noqa\n", 35 | "\n", 36 | "sns.set_style('ticks')\n", 37 | "sns.set_context('talk')\n", 38 | "pd.options.display.max_rows = 10" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Constructors\n", 46 | "\n", 47 | "It's pretty common to have many similar sources (say a bunch of CSVs) that need to be combined into a single DataFrame. There are two routes to the same end:\n", 48 | "\n", 49 | "1. Initialize one DataFrame and append to that\n", 50 | "2. 
Make many smaller DataFrames and concatenate at the end\n",
51 | "\n",
52 | "For pandas, the second option is faster.\n",
53 | "DataFrame appends are expensive relative to a list append.\n",
54 | "Depending on the values, pandas might have to recast the data to a different type.\n",
55 | "And indexes are immutable, so each time you append pandas has to create an entirely new one.\n",
56 | "\n",
57 | "In the last section we downloaded a bunch of weather files, one per state, writing each to a separate CSV.\n",
58 | "One could imagine coming back later to read them in, using the following code.\n",
59 | "\n",
60 | "The idiomatic python way:\n",
61 | "\n",
62 | "```python\n",
63 | "files = glob.glob('weather/*.csv')\n",
64 | "columns = ['station', 'date', 'tmpf', 'relh', 'sped', 'mslp',\n",
65 | "           'p01i', 'vsby', 'gust_mph', 'skyc1', 'skyc2', 'skyc3']\n",
66 | "\n",
67 | "# init empty DataFrame, like you might for a list\n",
68 | "weather = pd.DataFrame(columns=columns)\n",
69 | "\n",
70 | "for fp in files:\n",
71 | "    city = pd.read_csv(fp, names=columns)\n",
72 | "    weather = weather.append(city)  # append returns a new DataFrame\n",
73 | "```\n",
74 | "\n",
75 | "This is pretty standard code, quite similar to building up a list of tuples, say.\n",
76 | "The only nitpick is that you'd probably use a list-comprehension if you were just making a list.\n",
77 | "But we don't have special syntax for DataFrame-comprehensions (if only), so you'd fall back to the \"initialize empty container, append to said container\" pattern.\n",
78 | "\n",
79 | "But there's a better, pandorable, way:\n",
80 | "\n",
81 | "```python\n",
82 | "files = glob.glob('weather/*.csv')\n",
83 | "weather_dfs = [pd.read_csv(fp, names=columns) for fp in files]\n",
84 | "weather = pd.concat(weather_dfs)\n",
85 | "```\n",
86 | "\n",
87 | "Subjectively this is cleaner and more beautiful.\n",
88 | "There are fewer lines of code.\n",
89 | "You don't have this extraneous detail of building an empty DataFrame.\n",
90 | "And objectively the pandorable way is faster, as we'll test next."
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {},
96 | "source": [
97 | "We'll define two functions for building an identical DataFrame. The first, `append_df`, creates an empty DataFrame and appends to it. The second, `concat_df`, creates many DataFrames and concatenates them at the end. We also write a short decorator that runs the functions a handful of times and records the results."
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 2,
103 | "metadata": {
104 | "collapsed": true
105 | },
106 | "outputs": [],
107 | "source": [
108 | "import time\n",
109 | "\n",
110 | "size_per = 5000\n",
111 | "N = 100\n",
112 | "cols = list('abcd')\n",
113 | "\n",
114 | "def timed(n=30):\n",
115 | "    '''\n",
116 | "    Running a microbenchmark. 
Never use this.\n",
117 | "    '''\n",
118 | "    def deco(func):\n",
119 | "        def wrapper(*args, **kwargs):\n",
120 | "            timings = []\n",
121 | "            for i in range(n):\n",
122 | "                t0 = time.time()\n",
123 | "                func(*args, **kwargs)\n",
124 | "                t1 = time.time()\n",
125 | "                timings.append(t1 - t0)\n",
126 | "            return timings\n",
127 | "        return wrapper\n",
128 | "    return deco\n",
129 | "    \n",
130 | "@timed(60)\n",
131 | "def append_df():\n",
132 | "    '''\n",
133 | "    The pythonic (bad) way\n",
134 | "    '''\n",
135 | "    df = pd.DataFrame(columns=cols)\n",
136 | "    for _ in range(N):\n",
137 | "        df = df.append(pd.DataFrame(np.random.randn(size_per, 4), columns=cols))\n",
138 | "    return df\n",
139 | "\n",
140 | "@timed(60)\n",
141 | "def concat_df():\n",
142 | "    '''\n",
143 | "    The pandorable (good) way\n",
144 | "    '''\n",
145 | "    dfs = [pd.DataFrame(np.random.randn(size_per, 4), columns=cols)\n",
146 | "           for _ in range(N)]\n",
147 | "    return pd.concat(dfs, ignore_index=True)"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 3,
153 | "metadata": {},
154 | "outputs": [
155 | {
156 | "data": {
157 | "text/html": [
158 | "
\n", 159 | "\n", 172 | "\n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | "
level_0MethodTime (s)
00Append0.158359
10Concat0.093890
21Append0.150050
31Concat0.092446
42Append0.149565
\n", 214 | "
" 215 | ], 216 | "text/plain": [ 217 | " level_0 Method Time (s)\n", 218 | "0 0 Append 0.158359\n", 219 | "1 0 Concat 0.093890\n", 220 | "2 1 Append 0.150050\n", 221 | "3 1 Concat 0.092446\n", 222 | "4 2 Append 0.149565" 223 | ] 224 | }, 225 | "execution_count": 3, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "t_append = append_df()\n", 232 | "t_concat = concat_df()\n", 233 | "\n", 234 | "timings = (pd.DataFrame({\"Append\": t_append, \"Concat\": t_concat})\n", 235 | " .stack()\n", 236 | " .reset_index()\n", 237 | " .rename(columns={0: 'Time (s)',\n", 238 | " 'level_1': 'Method'}))\n", 239 | "timings.head()" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 4, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAQwAAAGcCAYAAAAs6p7QAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XtUlOW+B/AvlxmZMc0pGTRND+IS3SUCGW6EUgE3aylp\n0VicEYrtwfZ4ItMuipdteYLKMkulLUGtAzK1PWQmq4O2TSOkoxRdTAsr1HErKcLsBkxhbvCePzzO\ncfagPCjMjM73sxZLed5nnvm9A/Plfd55L36SJEkgIhLg7+kCiOj6wcAgImEMDCISxsAgImEMDCIS\nxsAgImEMDCISxsAgImEMDCISxsAgImE+Hxh2ux0NDQ2w2+2eLoXI6/l8YDQ2NiIxMRGNjY2eLoXI\n6/l8YBCROAYGEQljYBCRMAYGEQljYBCRMAYGEQljYBCRMAYGEQljYBCRMAYGEQljYBCRMLcGRl1d\nHTQaDSIjIzF79mwcOHDgiv1zc3OxZs0ap7b6+nrMnTsXUVFRSEpKwo4dO/qyZCK6hNsCw2KxQKfT\nITU1FbW1tcjIyEB2djasVqtLX5PJhJycHJSWljq1t7e3Y/78+UhOTsbXX3+Nl156CcuWLcOpU6fc\ntRpEPi3QXU9UU1MDf39/aLVaAIBGo0FJSQkqKyuRnJzs1Fer1SI6Otql/dNPP8XgwYPxyCOPAADu\nvvtuvP/++xg4cKB7VqKX2Gw2GI3GXhnr4mn5gYG986McPHgwZDJZr4xFNx63BYbBYEBYWJhTW2ho\nKOrr612Cobi4GCEhIcjJyXFq/+GHHxAaGoply5bh008/hVqtxjPPPIMxY8YI1WAymdDS0uLU5u7T\n2m02G3Q6HZqamtz6vKLUajUKCgoYGtQltwVGW1sbFAqFU1tQUBDMZrNL35CQkC7HaG1txY4dO5CX\nl4f/+I//QFVVFZ588kmUl5dj5MiR3dag1+uRn59/dStARO4LDIVC4RIOZrMZSqVSeAy5XI5x48bh\n/vvvBwAkJSVh/PjxqK6uFgqM9PR0pKSkOLU1NjYiMzNTuIZrJZPJUFBQ0CtTkqamJqxcuRLAhR3E\narX6msfklISuxG2BMWrUKOj1eqc2g8Hg8ga+ktDQUOzbt8+prbOzE6I3oFepVFCpVE5tnnhzyGQy\nDB06tFfHVKvVvT4m0T9z26cksbGxsFqtKC0thc1mw9atW2E0GhEfHy88RnJyMpqamlBSUoLOzk7s\n3r0bP/zwAxISEvqwciK6yG2BIZfLUVRUhIqKCsTExECv12PTpk1QKpXIyspCQUFBt2OEhIRg8+bN\n2LlzJ+6++2689tpreOONNzBs2DA3rAER+Umi2/M3qIaGBiQmJmLPnj0YPny4p8vpkdOnT+Oxxx4D\nABQWFnJKQn2Oh4YTkTAGBhEJY2AQkTAGBhEJY2AQkTAGBhEJY2AQkTAGBhEJY2AQkTAGBhEJY2AQ\nkTAGBhEJY2AQkTAGBhEJY2AQkTAGBhEJY2AQkTAGBhEJY2AQkTAGBhEJc9t9Sa5ndrsdzc3Nni7D\nxaW3W/TWWy8GBwf32n1fyfP4kxTQ3NzsuDq3t7p4BzRvw6uZ31g4JSEiYdzC6CHFiGnwl4nfD7av\nSVInAMDPz3uyv9PWhvYTlZ4ug/oAA6OH/GVK+MsHeLoMIo/wnj9LROT1GBhEJIyBQUTCGBhEJIyB\nQUTCGBhEJIyBQUTCGBhEJMytgVFXVweNRoPIyEjMnj0bBw4cuGL/3NxcrFmzxqlt9erVuPPOOxEV\nFeX4OnXqVF+WTUT/x22BYbFYoNPpkJqaitraWmRkZCA7OxtWq9Wlr8lkQk5ODkpLS12WHT58GGvX\nrsW3337r+LrtttvcsQpEPs9tgVFTUwN/f39otVrIZDJoNBqoVCpUVrqec6DVahEQEIDk5GSn9s7O\nTvz0008YN26cu8omoku47VwSg8GAsLAwp7bQ0FDU19e7BENxcTFCQkKQk5Pj1H78+HGYzWasWbMG\n33zzDYYMGYInn3wS06ZNE6rBZDKhpaXFqa2xsbFH69FpO9+j/r6Ir9GNy22B0dbWBoVC4dQWFBQE\ns9ns0jckJKTLMc6ePYuYmBhkZWVh/PjxqKqqwqJFi1BWVobw8PBua9Dr9cjPz+9x7Xa73fH/9hOf\n9fjxvuzS146uf24LDIVC4RIOZrMZSqX4qeKRkZEoKSlxfJ+UlITY2Fh89tlnQoGRnp6OlJQUp7bG\nxkZkZmYK10Dky9wWGKNGjYJer3dqMxgMLm/gK9m/fz/+/ve/Iy0tzdFmsVjQr18/ocerVCqoVCqn\nNplM1u3jLr3EnGLEVPjL+gtW7Js6becdW2K8PN+NxW0/zdjYWFitVpSWliItLQ3l5eUwGo2Ij48X\nHsPf3x9r1qzB6NGjERUVhZ07d+K7777Dyy+/3IeV/1MNsv68Hgb5LLd9SiKXy1FUVISKigrExMRA\nr9dj06ZNUCqVyMrKQkFBQbdjTJo0CcuXL8fy5ctx11134Z133kFBQcFl93kQUe9y6/bi2LFjsWXL\nFpf2t99+u8v+XW05zJkzB3PmzOn12oioezw0nIiEMTCISBgDg4iE8TOvHuq0tXm6BCfeepsBujEx\nMHqI99sgX+Y9f5aIyOtxC0NAcHAwCgsLPV2Gi6amJsc9VXNzc6FWqz1ckavg4GBPl0
C9iIEhIDAw\n0OtvKKxWq72+Rrr+cUpCRMIYGEQkjIFBRMIYGEQkjIFBRMIYGEQkjIFBRMIYGEQkjIFBRMIYGEQk\njIFBRMIYGEQkjIFBRMIYGEQkjKe3e4DNZoPRaLzmcZqamrr8/7UYPHiw0N3gyDcxMNzMZrNBp9P1\n2hv8oosX0rlWarUaBQUFDA3qEqckRCSMWxhuJpPJUFBQ0CtTEgCw2+0Aeu+mx5yS0JUwMDxAJpPx\ncnp0XeKUhIiEMTCISBgDg4iEMTCISBgDg4iEMTCISBgDg4iEuTUw6urqoNFoEBkZidmzZ+PAgQNX\n7J+bm4s1a9Z0uezIkSOIiIjAzz//3BelElEX3BYYFosFOp0OqampqK2tRUZGBrKzs2G1Wl36mkwm\n5OTkoLS0tMuxbDYblixZAovF0tdlE9El3BYYNTU18Pf3h1arhUwmg0ajgUqlQmVlpUtfrVaLgIAA\nJCcndznW+vXrERsb2+MaTCYTDAaD09fJkyd7PA6Rr3LboeEGgwFhYWFObaGhoaivr3cJhuLiYoSE\nhCAnJ8dlnK+++gqff/45ysrK8Pbbb/eoBr1ej/z8/J4XT0QA3BgYbW1tUCgUTm1BQUEwm80ufUNC\nQroc49y5c1ixYgXWr18PuVze4xrS09ORkpLi1NbY2IjMzMwej0Xki9wWGAqFwiUczGYzlEql8Bgv\nvPACUlNTMXbs2KuqQaVSQaVSObXxzEwicW7bhzFq1CgYDAanNoPBgNGjRwuPsXPnThQVFWHixImY\nOHEiACAtLQ0fffRRr9ZKRF1z2xZGbGwsrFYrSktLkZaWhvLychiNRsTHxwuPcfDgQafvw8PDsWXL\nFowZM6a3yyWiLrhtC0Mul6OoqAgVFRWIiYmBXq/Hpk2boFQqkZWVhYKCAneVQkRXyU+SJMnTRXhS\nQ0MDEhMTsWfPHgwfPtzT5RB5NR4aTkTCGBhEJIyBQUTCGBhEJIyBQUTCGBhEJIyBQUTCGBhEJIyB\nQUTCGBhEJIyBQUTCGBhEJIyBQUTCGBhEJIyBQUTCGBhEJIyBQUTCGBhEJIyBQUTCGBhEJIyBQUTC\nGBhEJIyBQUTCGBhEJIyBQUTCGBhEJIyBQUTCGBhEJIyBQUTCGBhEJIyBQUTCGBhEJIyBQUTC3BoY\ndXV10Gg0iIyMxOzZs3HgwIEr9s/NzcWaNWuc2vR6PRISEhAVFYUHH3wQX331VV+WTESXcFtgWCwW\n6HQ6pKamora2FhkZGcjOzobVanXpazKZkJOTg9LSUqf2ffv24S9/+QuKiorw7bffIi0tDY8//jg6\nOzvdtRpEPs1tgVFTUwN/f39otVrIZDJoNBqoVCpUVla69NVqtQgICEBycrJT++TJk/HJJ58gLCwM\nra2tMJlMGDRoEPz9xVbDZDLBYDA4fZ08ebJX1o/IFwS664kMBgPCwsKc2kJDQ1FfX+8SDMXFxQgJ\nCUFOTo7LOP3790dNTQ0yMzMRGBiIjRs3Cteg1+uRn59/dStARO4LjLa2NigUCqe2oKAgmM1ml74h\nISFXHCs6OhqHDh3Crl27sGjRImzbts0ljLqSnp6OlJQUp7bGxkZkZmZ2vwJE5L7AUCgULuFgNpuh\nVCp7PJZcLgcAzJw5E3/961+xd+9eocBQqVRQqVRObTKZrMfPT+Sr3LYPY9SoUTAYDE5tBoMBo0eP\nFh6jrKwMS5cudWqz2WwYMGBAr9RIRFfmtsCIjY2F1WpFaWkpbDYbtm7dCqPRiPj4eOExJkyYgL/9\n7W/Yv38/Ojo68P777+PEiRNISEjow8qJ6KJupyTt7e3Yvn079u7di0OHDqGlpQV+fn649dZbceed\nd2LKlCmYOXNmt1MLuVyOoqIiPP/881i3bh1GjhyJTZs2QalUIisrCxMnToROp7viGOHh4Xj11Vfx\nwgsvoKmpCWPHjsV//ud/4pZbbunZWhPRVfGTJEnqaoHdbsdbb72FkpISjBgxAvHx8Rg9ejQGDRqE\njo4OmEwm/PTTT/jqq69w4sQJZGRk4LHHHnPsX7heNDQ0IDExEXv27MHw4cM9XQ6RV7vsFsZDDz2E\n+Ph4fPTRR91+anH8+HFs2bIFDz30ELZv397rRRKRd7jsFkZTUxPUanWPBruax3gatzCIxF12p2d3\nb/zTp0+jo6OjR48houub0KckZ86cwRNPPIEffvgBFosFWq0W06ZNw7Rp03D48OG+rpGIvIRQYKxe\nvRotLS1QqVT48MMPUV9fj//6r/9CUlIScnNz+7pGIvISQkd61tTUYOvWrbjtttuwe/duTJs2DRMm\nTMAtt9zicqg1Ed24hLYwZDIZOjo6cP78eXz55ZeYMmUKAKC5uZlHWRL5EKEtjMmTJ2P58uVQKBRQ\nKBSYOnUqqqurkZubi6SkpL6ukYi8hNAWxgsvvIDIyEjcdNNNeOutt9C/f38cO3YMCQkJWL58eV/X\nSERe4rLHYUiSBD8/vx4NdjWP8TQeh0Ek7rJbGA8++CB2794tNEhnZyd27NiBBx98sNcKIyLvc9l9\nGOvXr8cLL7yAvLw8JCYmIi4uDqNHj3ZcT+LXX3/Fjz/+iNraWuzcuRNjx47FG2+84bbCicj9Ljsl\nuejgwYN49913sXfvXphMJqcpx6233op7770XaWlpiIiI6PNi+wKnJETiuv2UJCIiwhEGv/zyC/7x\nj3/Az88ParW625PSiOjG0qNL9A0bNgzDhg3rq1qIyMvxzmdEJIyBQUTCGBhEJKxHgXHmzBnU1NTA\nbDbDaDT2VU1E5KWEAqOtrQ2LFy/GlClTMG/ePDQ3N2PVqlXQarX49ddf+7pGIvISQoHx6quvorGx\nETt37kS/fv0AAE8//TQsFgtefPHFPi2QiLyHUGDs2bMHy5YtQ2hoqKMtLCwMq1evRnV1dZ8VR0Te\nRSgwzp07h5tuusn1wf7+sNvtvV4UEXknocCIj49HQUGB00V/TSYTXn31VcTFxfVZcUTkXYQCY+XK\nlTh+/DhiY2NhNpuRlZWFadOmobW1FStWrOjrGonISwgdGq5Wq1FWVob9+/fj2LFjsNvtCAsLQ1xc\n3HV3/Qsiuno9Opfk97//Pe666y7H9zabDQCuu9sjEtHVEQqML7/8EqtXr8bx48fR2dnpaL94hS3e\nm4TINwgFxp///GeMHj0aS5cuRVBQUF/XREReSigwmpqaUFBQ4HQcBhH5HqFPSaZPn46qqqq+roWI\nvJzQFsZTTz2FWbNm4b//+79x++23w9/fOWdee+21PimOiLyLUGCsWLECfn5+GD58OPdhEPkwocD4\n6quvoNfrMX78+Gt6srq6OqxatQpHjhzByJEjsXr1akRGRl62f25uLmQyGZYuXepo2717N9avX49f\nfvkFQ4cOxaJFizB9+vRrqouIxAjtwxg5ciSsVus1PZHFYoFOp0Nqaipqa2uRkZGB7OzsLsc1mUzI\nyclBaWmpU7vBYMCSJUuwfPlyfP3111i2b
BmWLFmCo0ePXlNtRCRGaAtDp9MhJycHGRkZGDFiBAID\nnR8WHx/f7Rg1NTXw9/eHVqsFAGg0GpSUlKCyshLJyclOfbVaLaKjo13af/nlFzz00EOIjY11PG9o\naCgOHTqEsLCwbmswmUxoaWlxamtsbOz2cUR0gfBOTwBdXvtC9MAtg8Hg8qYODQ1FfX29SzAUFxcj\nJCQEOTk5Tu3x8fFO4XTy5EnU19dj7NixIqsBvV6P/Px8ob5E5EooMH788cdrfqK2tjYoFAqntqCg\nIJjNZpe+Ivc7OXPmDObPn48HHnhAODDS09ORkpLi1NbY2IjMzEyhxxP5ussGhtVqdZwj0t3+C5Fz\nSRQKhUs4mM1mKJVKkTqd1NXVQafTYerUqXj++eeFH6dSqRy3erxIJpP1+PmJfNVlA2PChAn4/PPP\nceuttyIiIqLLs1J7ci7JqFGjoNfrndoMBoPLX/zu7N27F4sXL8bjjz+OefPm9eixRHRtLhsYJSUl\nuPnmmwEAmzdvvuYnio2NhdVqRWlpKdLS0lBeXg6j0Si0w/Si+vp6LFy4EHl5eZg5c+Y110REPXPZ\nwPjyyy8xfvx4BAYGIiYm5pqfSC6Xo6ioCM8//zzWrVuHkSNHYtOmTVAqlcjKysLEiROh0+muOMbm\nzZthNpuxcuVKrFy50tGek5ODhx9++JprJKIru+zd28eNG+eYktzIePd2InGXPXDrMjlCRD7sih+r\n2mw2oSM8ecUtIt9wxcCYNm2a0CC84haRb7hiYGzYsMHxSQkR0WUDw8/PD9HR0Tf8Tk8iEsednkQk\n7LKB8cADDzhuvExEBFxhSvLSSy+5sw4iug4IXUCHiAhgYBBRDzAwiEgYA4OIhDEwiEgYA4OIhDEw\niEgYA4OIhDEwiEgYA4OIhDEwiEgYA4OIhDEwiEgYA4OIhDEwiEgYA4OIhDEwiEgYA4OIhDEwiEgY\nA4OIhDEwiEgYA4OIhDEwiEgYA4OIhDEwiEiYWwOjrq4OGo0GkZGRmD17Ng4cOHDF/rm5uVizZk2X\ny3bu3ImHH364L8okostwW2BYLBbodDqkpqaitrYWGRkZyM7OhtVqdelrMpmQk5OD0tJSl2U2mw2F\nhYV49tlnecNoIjdzW2DU1NTA398fWq0WMpkMGo0GKpUKlZWVLn21Wi0CAgKQnJzssmzVqlWorq7G\nH//4R3eUTUSXcFtgGAwGhIWFObWFhoaivr7epW9xcTHy8vKgVCpdli1atAilpaUYMWJEj2swmUww\nGAxOXydPnuzxOES+6rJ3b+9tbW1tUCgUTm1BQUEwm80ufUNCQi47zpWWdUev1yM/P/+qH0/k69wW\nGAqFwiUczGZzl1sRfSU9PR0pKSlObY2NjcjMzHRbDUTXM7cFxqhRo6DX653aDAaDyxu4L6lUKqhU\nKqc2mUzmtucnut65bR9GbGwsrFYrSktLYbPZsHXrVhiNRsTHx7urBCK6Rm4LDLlcjqKiIlRUVCAm\nJgZ6vR6bNm2CUqlEVlYWCgoK3FUKEV0lP8nHD2ZoaGhAYmIi9uzZg+HDh3u6HCKvxkPDiUgYA4OI\nhDEwiEgYA4OIhDEwiEgYA4OIhDEwiEgYA4OIhDEwiEgYA4OIhDEwiEgYA4OIhDEwiEgYA4OIhDEw\niEgYA4OIhDEwiEgYA4OIhDEwiEgYA4OIhDEwiEgYA4OIhDEwiEgYA4OIhDEwiEgYA4OIhDEwiEgY\nA4OIhDEwiEgYA4OIhAV6ugAid7LZbDAajb0ylt1uBwAEBvbO22jw4MGQyWS9MlZfYWCQz7DZbNDp\ndGhqavJ0KV1Sq9UoKCjw6tDglISIhLl1C6Ourg6rVq3CkSNHMHLkSKxevRqRkZGX7Z+bmwuZTIal\nS5c62vbt24cXX3wRDQ0N+N3vfoe8vDyEhoa6o3y6zslkMhQUFPTKlKSpqQkrV64EcOH3VK1WX/OY\nnJJcwmKxQKfTQafTYc6cOSgvL0d2djY+/fRTyOVyp74mkwlr1qzBhx9+iHnz5jnajUYjsrOzsXbt\nWsTHx6OwsBBPP/00tm3b5q7VoOucTCbD0KFDe3VMtVrd62N6K7dNSWpqauDv7w+tVguZTAaNRgOV\nSoXKykqXvlqtFgEBAUhOTnZq37VrF8aNG4eEhATI5XIsWLAAJ0+exPfff++u1SDyaW4LDIPBgLCw\nMKe20NBQ1NfXu/QtLi5GXl4elEqlU/uxY8ecxggICMDtt9+OI0eOCNVgMplgMBicvk6ePHkVa0Pk\nm9w2JWlra4NCoXBqCwoKgtlsdukbEhLS5Rjt7e246aabnNoUCgXa29uFatDr9cjPzxesmIj+mdsC\nQ6FQuISD2Wx22Yro6Rjt7e3CY6SnpyMlJcWprbGxEZmZmcI1EPkytwXGqFGjoNfrndoMBoPLG7i7\nMT7++GPH9x0dHThx4gRGjx4t9HiVSgWVSuXU5u17pYm8idv2YcTGxsJqtaK0tBQ2mw1bt26F0WhE\nfHy88BjTp0/H999/j127dsFqtWLTpk0YMmQIfve73/Vh5UR0kdsCQy6Xo6ioCBUVFYiJiYFer8em\nTZugVCqRlZWFgoKCbscIDg7GX/7yF+Tn52PSpEnYt28fNm7cCD8/PzesARH5SZIkeboIT2poaEBi\nYiL27NmD4cOHe7ocuk6cPn0ajz32GACgsLCQx2EQEf0zBgYRCWNgEJEwBgYRCWNgEJEwBgYRCWNg\nEJEwBgYRCWNgEJEwBgYRCWNgEJEwBgYRCWNgEJEw3siIvJrdbkdzc7Ony3Bx6c2QvPXGSMHBwb12\nV7aLGBjk1Zqbmx2nkXuri/cn8TZ9cdo9pyREJIxbGHTdGBg/FP5K7/mVlTovXHvKz997rvjW2WbH\n2c9P99n43vPqE3XDXxmIgJt40WZP4pSEiIQxMIhIGAODiIQxMIhIGAODiIQxMIhIGD9WpetGR5vd\n0yV4vb5+jRgY5NXs9v9/A/zWhwck3Ygufe16C6ckRCSMWxjk1S4923JA/FAEeNGh4d6oo83u2BLr\n7TNVAQYGXUcCeGi4x3FKQkTCGBhEJIyBQUTCGBhEJMytgVFXVweNRoPIyEjMnj0bBw4c6LJfcXEx\n7rnnHkRHR+OZZ55BW1ubY1lJSQkSEhIwceJEPPHEEzAaje4qn8jnuS0wLBYLdDodUlNTUVtbi4yM\nDGRnZ8NqtTr1q6ysxDvvvIPNmzejqqoKra2t2LBhAwBgx44dePPNN/Haa69h//79GD16NBYsWOCu\nVSDyeW4LjJqaGvj7+0Or1UImk0Gj0UClUqGystKpX3l5OTQaDUJDQzFgwAA8+eST2Lp1Kzo6OrBr\n1y489NBDiIqKgkwmwxNPPIEjR47gp59+ctdqEPk0tx2HYTAYEBYW5tQWGhqK+vp6JCcnO9qOHTuG\n6dOnO/X57bffcObMGXR2diIoKMixzM/PD35+fvj73/+O8PDwbmswmUxoaWlxamtsbLzaVSI36/Sy\n
c0m89ZqefcltgdHW1gaFQuHUFhQUBLPZ7NTW3t7uFAoXH9Pe3o6EhASsW7cOSUlJGDVqFAoLC2Gx\nWGCxWIRq0Ov1yM/Pv8Y1IU/py4vbkhi3BYZCoXAJB7PZDKVS6dQWFBTkFADt7e0AgP79++P+++9H\nU1MT/v3f/x02mw0ZGRkICwvDwIEDhWpIT09HSkqKU1tjYyMyMzOvYo2IfI/bAmPUqFHQ6/VObQaD\nweUNHBYWhmPHjjn1GTBgANRqNZqamjBjxgzHjW3Onj2LN998E+PGjROqQaVSQaVSObXJZDzU2JsF\nBwejsLDQ02W4aGpqctzAKDc3F2q12sMVuQoODu71Md0WGLGxsbBarSgtLUVaWhrKy8thNBoRHx/v\n1G/WrFl47rnnkJycjKFDh2LDhg2477774O/vj3379qGoqAilpaWQy+XIzc3F5MmTvfKHRb0jMDCw\n1+/e1dvUarXX19hb3PYpiVwuR1FRESoqKhATEwO9Xo9NmzZBqVQiKysLBQUFAICEhATMnz8ff/rT\nnzB16lQMGDAAS5YsAQDMnj0b9957L2bMmIGEhARIkoRXXnnFXatA5PP8JEmSPF2EJzU0NCAxMRF7\n9uzB8OHDPV0OXSdOnz7tmBr3xT1MvRUPDSciYQwMIhLGwCAiYQwMIhLGwCAiYQwMIhLGiwCTT7HZ\nbL1yDZWmpqYu/38tBg8e7PVHHjMwyGfYbDbodLpee4NfdPEQ8WulVqtRUFDg1aHBKQkRCeMWBvkM\nmUyGgoKCXrus48VbEfbWDYM4JSHyMjKZzGcO4+4LnJIQkTAGBhEJY2AQkTAGBhEJY2AQkTAGBhEJ\nY2AQkTAGBhEJY2AQkTAGBhEJY2AQkTAGBhEJ8/mTzzo6OgDwLu7ku4YMGSJ8xq3PB0ZzczMAYO7c\nuR6uhMgzenITL5+/85nZbMb333+P4OBgBAQEeLqcHjt58iQyMzNRXFyM22+/3dPl+JQb5bXnFkYP\nBAUFYeLEiZ4u46rZbDYAF37ovNWje/nia8+dnkQkjIFBRMIYGEQkjIFxnRs0aBCys7MxaNAgT5fi\nc3zxtff5T0mISBy3MIhIGAODiIQxMIhIGAODiIQxMIhIGAODiIQxMIhIGAODyAucPHnS0yUIYWB4\nyLPPPos777wTZ86c8XQpTn7++WeEh4d7uow+t3fvXjz66KOYNGkSYmJi8G//9m84dOiQR2rZs2cP\nFi9e7JHn7ikGhge0traiqqoKycnJ2LJli6fL8TllZWVYtmwZMjMz8fnnn6O6uhpxcXF49NFHUV9f\n7/Z6Wltb0dnZ6fbnvRoMDA/Yvn07Jk6ciLlz56KsrAxWqxUAsHHjRixevBiPPPIIIiMjMWfOHBw+\nfBgA8MUXX2DmzJl4/vnnERUVhcTERFRUVDjGPHXqFHQ6HSZNmoQ//OEP+OCDDxzLMjIy8Prrr2P2\n7NmIjo6Gx1nTAAAKPklEQVRGeno6GhoaAACdnZ1Yt24dJk2ahPj4eKcxb0RtbW14+eWXkZubi2nT\npkEmk6Ffv36YN28etFotjh49CqPRiKeffhqTJk3ClClT8Morrzh+Rjk5OcjNzYVWq0VUVBRSU1Px\nww8/OMZ/7733kJiYiOjoaDz66KOOqUZdXR0yMzMRHx+PCRMmYN68eTAajTh48CCee+45HD58GHFx\ncR55TXpEIrebOXOmtHv3bkmSJCklJUUqLy+XJEmSNmzYIIWHh0sVFRWS1WqVNm7cKE2dOlWyWCxS\nTU2NNGbMGCk3N1eyWCxSdXW1dOedd0o///yzZLfbpfvuu09au3atZLFYpMOHD0txcXHS/v37JUmS\npPT0dCkxMVE6ceKEdPbsWUmr1Up//vOfJUmSJL1eL/3hD3+QGhoapJaWFumRRx6RxowZ45kXxg2q\nq6uliIgIyWazXbbPww8/LD311FPSb7/9JjU2NkoPPvig9Oqrr0qSJElLly6VJk6cKB0+fFhqb2+X\nFi1aJM2bN0+SJEmqqqqS7r77bum7776T7Ha79Morr0gPP/ywJEmSlJSUJG3evFnq7OyUfv31V0mj\n0Uivv/66JEmS9MEHH0gPPPBAH6957+AWhpt98803OHv2LKZOnQoASEtLw7vvvutYHhsbixkzZkAm\nk2HBggVoa2vDN998AwBQKpV45plnIJfLER8fj3vuuQc7d+7EoUOHcPr0aSxevBhyuRxjx45FWloa\n3n//fce4s2bNwu23344BAwZg+vTpOH78OABgx44dmDt3LoYNG4abb74ZCxcudNtr4QktLS0YOHDg\nZS9Jd+LECXz77bdYsWIFbrrpJoSEhODJJ5/Ehx9+6OiTkJCAsWPHIigoCDNmzHC8lhUVFbj//vsR\nERGBgIAAPP7441ixYgUA4J133sHcuXPR3t6OM2fOQKVSed3+KxE+f4k+dysrK4PJZMK9994LALDb\n7WhpacH3338PABgxYoSjb0BAAIKDg2E0GhEcHIwhQ4agX79+juVDhgyB0WjEqVOncO7cOcTExDiW\ndXR04I477nB8f8sttzj+HxgYCOn/TlI2Go0ICQlxLLvRLzU3ePBgtLa2wmazQSaTOS1rbW1FU1MT\nlEql0+t12223wWg0Oi7Jd6XX8tIdxkqlEuPHjwcAHDx4EPPnz8f58+cRHh6O1tZWp3GuFwwMN/rt\nt9/w8ccfo7i42CkY8vLyoNfrMWzYMDQ1NTna7XY7mpqaMGTIEHR0dOAf//gHOjo6HBcrPnXqFCIi\nIqBWqxESEoLPPvvM8Vij0ej4Rb4StVqNU6dOOb6/Hv/q9URUVBRkMhn27t2LxMREp2UrVqzA+fPn\n0dbWhl9//dXxhm5oaMCgQYNcAuafhYSEOL1+586dQ35+Ph555BEsXboU7733HiZMmAAAWLZsmdDP\nx9twSuJG5eXlGDFiBO666y4EBwc7vjQaDSoqKmAymVBdXY19+/bBZrPhzTffhEqlQlRUFIALfwEL\nCwths9lQVVWFmpoazJw5ExMmTEBQUBDefvtt2Gw2NDY24o9//KPTVOdyZs2ahZKSEhw7dgznzp3D\nhg0b+vpl8Kh+/frhqaeewqpVq/DZZ5/Bbrc73tj79u3DsmXLEBsbi7y8PJw/fx5nzpzBhg0bcN99\n93U79n333Yft27ejrq4OdrsdBQUF+O6779De3g7gwgWnJUlCVVUVPv74Y8cWi1wux/nz56+LAOEW\nhhuVlZUhJSXFpX3y5MlQqVQoKytDREQEioqKkJ2djTvuuANvvfWWY4ti4MCBaGxsRHx8PG699Vas\nX78eI0eOBAAUFhYiNzcXRUVFCAgIwIwZM/D44493W5NGo0FzczPmzp0LSZLwr//6r6iuru7dFfcy\nc+fOxcCBA5Gfn49nn30W/v7+iIiIQGlpKcaMGYO1a9ciLy/PsQUya9YsPP30092OGxsbi2effRaL\nFy+G0WhEdHQ01q1bh6FDh2LBggV49NFH0dHRgbCwMKSlp
aGmpgYAcPfddzv+/Z//+R+naae34RW3\nvMjGjRtRX1/f5V/5L774AgsXLsQXX3zhgcqILuCUhIiEMTCISBinJEQkjFsYRCSMgUFEwhgYRCSM\ngUFCwsPDER4e3uXp3wcPHkR4eDgyMjKEx/vb3/7mOCpy27ZtvX6mZkZGBtauXdurYxIDg3pAJpPh\nk08+cWnftWsX/Pz8hMf55ZdfsHDhQpw7d643yyM3YGCQsJiYGOzevdul/ZNPPkFkZKTwOPxg7vrF\nwCBhSUlJqKurQ2Njo6Ptp59+wrlz5xAdHe3U9+jRo5g3bx4mTJiAhIQEvPHGG45zJy4ecj1jxgxs\n27bN8ZjCwkLExcUhKioKOTk5sFgsjmWff/45NBoNJkyYgMTERPz1r391er5t27YhMTERkZGRyM3N\nRUdHR6+vPzEwqAeGDx+O8PBwp62MTz75BElJSfD3//9fJYvFgqysLIwZMwbbt2/Hiy++iI8//hiv\nv/46ADiu0/Huu+9ixowZAC6cXXvgwAGUlJRgw4YN2LlzJ8rKygAAtbW1+NOf/oTk5GRs374dCxYs\nwMsvv4wdO3YAAPbt24dVq1ZBp9Phgw8+gNlsxtdff+2W18TXMDCoR6ZPn+4UGLt27UJycrJTn48+\n+ghKpRI5OTkIDQ3F73//e6xcuRKlpaWw2+2O08ZVKhWCgoIAAP7+/njppZcwevRo3HPPPYiLi0Nd\nXR0AYPPmzZg2bRrmz5+P0NBQaDQapKeno6ioCACwZcsWJCcnY86cOQgLC8Nzzz0HtVrtjpfD5zAw\nqEeSkpJQW1uLs2fP4sSJEzhz5ozThXuAC9MRg8GAqKgox1d2djasVqvTtTcudfPNN+Pmm292fD9w\n4EDHlOTo0aOIiIhw6h8dHY1jx45BkiQcPXoU48aNcyyTyWRO31Pv4ent1CNjx47FbbfdhsrKSjQ3\nNyMhIcHlcnd2ux133XUXcnNzXR4/ZMgQp4sEXXTxFP5LXdw5KpfLXT6F6ezsREdHh6P9n3ekXu4S\nfHRtuIVBPZaUlITKykrs3r3bZToCAGFhYTh+/DiGDh2KkSNHYuTIkTh9+jRee+01SJLUo49gL453\n4MABp7ZvvvkG//Iv/wLgwjEiBw8edCzr6OjAjz/+2PMVo24xMKjHpk+fjqqqKhw9ehSTJ092WT5r\n1iwAFy7JX19fj9raWqxYsQKBgYHo168flEolgAufsJw/f77b55s3bx4qKytRVFSE48ePY+vWrXjv\nvfeQnp4O4MJBWp9++ik2b94Mg8GAl156CadPn+7FNaaLGBjUY1FRUejfvz/uvfdeyOVyl+VKpRLv\nvPMOTCYTNBoNFi5ciLi4OMcURaVSITU1FUuWLHF8EnIld9xxB9avX4/y8nKkpKSgqKgIy5cvR1pa\nmqOedevW4d1338Xs2bPR0tKCKVOm9O5KEwCe3k5EPcAtDCISxsAgImEMDCISxsAgImEMDCISxsAg\nImEMDCISxsAgImH/C+Prsyn3CTzmAAAAAElFTkSuQmCC\n", 250 | "text/plain": [ 251 | "" 252 | ] 253 | }, 254 | "metadata": {}, 255 | "output_type": "display_data" 256 | } 257 | ], 258 | "source": [ 259 | "plt.figure(figsize=(4, 6))\n", 260 | "sns.boxplot(x='Method', y='Time (s)', data=timings)\n", 261 | "sns.despine()\n", 262 | "plt.tight_layout()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "## Datatypes\n", 270 | "\n", 271 | "The pandas type system essentially [NumPy's](http://docs.scipy.org/doc/numpy/reference/arrays.scalars.html) with a few extensions (`categorical`, `datetime64` with timezone, `timedelta64`).\n", 272 | "An advantage of the DataFrame over a 2-dimensional NumPy array is that the DataFrame can have columns of various types within a single table.\n", 273 | "That said, each column should have a specific dtype; you don't want to be mixing bools with ints with strings within a single column.\n", 274 | "For one thing, this is slow.\n", 275 | "It forces the column to be have an `object` dtype (the fallback python-object container type), which means you don't get any of the type-specific optimizations in pandas or NumPy.\n", 276 | "For another, it means you're probably violating the maxims of tidy data, which we'll discuss next time.\n", 277 | "\n", 278 | "When should you have `object` columns?\n", 279 | "There are a few places where the NumPy / pandas type system isn't as rich as you might like.\n", 280 | "There's no integer NA (at the moment anyway), so if you have any missing values, represented by `NaN`, your otherwise integer column will be floats.\n", 281 | "There's also no `date` dtype (distinct from `datetime`).\n", 282 | "Consider the needs of your application: can you treat an integer `1` as `1.0`?\n", 283 | "Can you treat `date(2016, 1, 1)` as `datetime(2016, 1, 1, 0, 0)`?\n", 284 | "In my experience, this is rarely a problem other than when writing to something with a stricter schema like a database.\n", 285 | "But at that point 
it's fine to cast to one of the less performant types, since you're just not doing numeric operations anymore.\n",
286 | "\n",
287 | "The last case of `object` dtype data is text data.\n",
288 | "Pandas doesn't have any fixed-width string dtypes, so you're stuck with python objects.\n",
289 | "There is an important exception here, and that's low-cardinality text data, for which you'll want to use the `category` dtype (see below).\n",
290 | "\n",
291 | "If you have object data (either strings or python objects) that needs to be converted, check out the [`to_numeric`](http://pandas.pydata.org/pandas-docs/version/0.18.0/generated/pandas.to_numeric.html), [`to_datetime`](http://pandas.pydata.org/pandas-docs/version/0.18.0/generated/pandas.to_datetime.html) and [`to_timedelta`](http://pandas.pydata.org/pandas-docs/version/0.18.0/generated/pandas.to_timedelta.html) methods.\n",
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {},
297 | "source": [
298 | "## Iteration, Apply, And Vectorization\n",
299 | "\n",
300 | "We know that [\"Python is slow\"](https://jakevdp.github.io/blog/2014/05/09/why-python-is-slow/) (scare quotes since that statement is too broad to be meaningful).\n",
301 | "There are various steps that can be taken to improve your code's performance, from relatively simple changes, to rewriting your code in a lower-level language, to trying to parallelize it.\n",
302 | "And while you might have many options, there's typically an order you would proceed in.\n",
303 | "\n",
304 | "First (and I know it's cliché to say so, but still) benchmark your code.\n",
305 | "Make sure you actually need to spend time optimizing it.\n",
306 | "There are [many](https://github.com/nvdv/vprof) [options](https://jiffyclub.github.io/snakeviz/) [for](https://github.com/rkern/line_profiler) [benchmarking](https://docs.python.org/3.5/library/timeit.html) and visualizing where things are slow.\n",
307 | "\n",
308 | "Second, consider your algorithm.\n",
309 | "Make sure you aren't doing more work than you need to.\n",
310 | "A common one I see is doing a full sort on an array, just to select the `N` largest or smallest items.\n",
311 | "Pandas has methods for that."
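As a quick aside, those converters look something like this in practice — a minimal sketch, where the column names and values are made up for illustration:

```python
import pandas as pd

raw = pd.DataFrame({'amount': ['1', '2', 'N/A'],   # strings, not numbers
                    'when': ['2016-01-01', '2016-01-02', '2016-01-03']})

raw['amount'] = pd.to_numeric(raw['amount'], errors='coerce')  # 'N/A' -> NaN
raw['when'] = pd.to_datetime(raw['when'])                      # datetime64[ns]
```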
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 5,
317 | "metadata": {
318 | "collapsed": true
319 | },
320 | "outputs": [],
321 | "source": [
322 | "df = pd.read_csv(\"data/347136217_T_ONTIME.csv\")\n",
323 | "delays = df['DEP_DELAY']"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": 6,
329 | "metadata": {},
330 | "outputs": [
331 | {
332 | "data": {
333 | "text/plain": [
334 | "112623    1480.0\n",
335 | "158136    1545.0\n",
336 | "152911    1934.0\n",
337 | "60246     1970.0\n",
338 | "59719     2755.0\n",
339 | "Name: DEP_DELAY, dtype: float64"
340 | ]
341 | },
342 | "execution_count": 6,
343 | "metadata": {},
344 | "output_type": "execute_result"
345 | }
346 | ],
347 | "source": [
348 | "# Select the 5 largest delays\n",
349 | "delays.nlargest(5).sort_values()"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": 7,
355 | "metadata": {},
356 | "outputs": [
357 | {
358 | "data": {
359 | "text/plain": [
360 | "300895   -59.0\n",
361 | "235921   -58.0\n",
362 | "197897   -56.0\n",
363 | "332533   -56.0\n",
364 | "344542   -55.0\n",
365 | "Name: DEP_DELAY, dtype: float64"
366 | ]
367 | },
368 | "execution_count": 7,
369 | "metadata": {},
370 | "output_type": "execute_result"
371 | }
372 | ],
373 | "source": [
374 | "delays.nsmallest(5).sort_values()"
375 | ]
376 | },
377 | {
378 | "cell_type": "markdown",
379 | "metadata": {},
380 | "source": [
381 | "We follow up the `nlargest` or `nsmallest` with a sort (the result of `nlargest`/`nsmallest` isn't in ascending order), but it's much easier to sort 5 items than 500,000. The timings bear this out:"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": 8,
387 | "metadata": {},
388 | "outputs": [
389 | {
390 | "name": "stdout",
391 | "output_type": "stream",
392 | "text": [
393 | "29.9 ms ± 1.01 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
394 | ]
395 | }
396 | ],
397 | "source": [
398 | "%timeit delays.sort_values().tail(5)"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": 9,
404 | "metadata": {},
405 | "outputs": [
406 | {
407 | "name": "stdout",
408 | "output_type": "stream",
409 | "text": [
410 | "7.85 ms ± 142 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each)\n"
411 | ]
412 | }
413 | ],
414 | "source": [
415 | "%timeit delays.nlargest(5).sort_values()"
416 | ]
417 | },
418 | {
419 | "cell_type": "markdown",
420 | "metadata": {},
421 | "source": [
422 | "\"Use the right algorithm\" is easy to say, but harder to apply in practice since you have to actually figure out the best algorithm to use.\n",
423 | "That one comes down to experience.\n",
424 | "\n",
425 | "Assuming you're at a spot that needs optimizing, and you've got the correct algorithm, *and* there isn't a readily available optimized version of what you need in pandas/numpy/scipy/scikit-learn/statsmodels/..., then what?\n",
426 | "\n",
427 | "The first place to turn is probably a vectorized NumPy implementation.\n",
428 | "Vectorization here means operating directly on arrays, rather than looping over lists of scalars.\n",
429 | "This is generally much less work than rewriting it in something like Cython, and you can get pretty good results just by making *effective* use of NumPy and pandas.\n",
430 | "While not every operation can be vectorized, many can.\n",
431 | "\n",
432 | "Let's work through an example calculating the [Great-circle distance](https://en.wikipedia.org/wiki/Great-circle_distance) between airports.\n",
433 | "Grab the table of airport latitudes and longitudes from the [BTS website](http://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=288&DB_Short_Name=Aviation%20Support%20Tables) and extract it to a CSV."
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": 10,
439 | "metadata": {
440 | "collapsed": true
441 | },
442 | "outputs": [],
443 | "source": [
444 | "from utils import download_airports\n",
445 | "import zipfile"
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": 11,
451 | "metadata": {
452 | "collapsed": true
453 | },
454 | "outputs": [],
455 | "source": [
456 | "if not os.path.exists(\"data/airports.csv.zip\"):\n",
457 | "    download_airports()"
458 | ]
459 | },
460 | {
461 | "cell_type": "code",
462 | "execution_count": 14,
463 | "metadata": {},
464 | "outputs": [
465 | {
466 | "data": {
467 | "text/html": [
468 | "
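Before we dive in, here's a tiny loop-vs-vectorized comparison to make "operating directly on arrays" concrete (a sketch; the exact speedup will vary by machine and array size):

```python
import numpy as np

x = np.random.randn(1000000)

# Python-level loop: one interpreter round trip per element.
total = 0.0
for v in x:
    total += v * v

# Vectorized: the loop runs in compiled code over the whole array at once.
total = (x * x).sum()
```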
\n", 469 | "\n", 482 | "\n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | "
LATITUDELONGITUDE
AIRPORT
8F333.623889-101.240833
A0358.457500-154.023333
A0960.482222-146.582222
A1863.541667-150.993889
A2459.331667-135.896667
\n", 523 | "
" 524 | ], 525 | "text/plain": [ 526 | " LATITUDE LONGITUDE\n", 527 | "AIRPORT \n", 528 | "8F3 33.623889 -101.240833\n", 529 | "A03 58.457500 -154.023333\n", 530 | "A09 60.482222 -146.582222\n", 531 | "A18 63.541667 -150.993889\n", 532 | "A24 59.331667 -135.896667" 533 | ] 534 | }, 535 | "execution_count": 14, 536 | "metadata": {}, 537 | "output_type": "execute_result" 538 | } 539 | ], 540 | "source": [ 541 | "coord = (pd.read_csv(\"data/airports.csv.zip\", index_col=['AIRPORT'],\n", 542 | " usecols=['AIRPORT', 'LATITUDE', 'LONGITUDE'])\n", 543 | " .groupby(level=0).first()\n", 544 | " .dropna()\n", 545 | " .sample(n=500, random_state=42)\n", 546 | " .sort_index())\n", 547 | "\n", 548 | "coord.head()" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "For whatever reason, suppose we're interested in all the pairwise distances (I've limited it to just a sample of 500 airports to make this manageable.\n", 556 | "In the real world you *probably* don't need *all* the pairwise distances and would be better off with a [tree](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KDTree.html). Remember: think about what you actually need, and find the right algorithm for that).\n", 557 | "\n", 558 | "MultiIndexes have an alternative `from_product` constructor for getting the [Cartesian product](https://en.wikipedia.org/wiki/Cartesian_product) of the arrays you pass in.\n", 559 | "We'll give it `coords.index` twice (to get its Cartesian product with itself).\n", 560 | "That gives a MultiIndex of all the combination.\n", 561 | "With some minor reshaping of `coords` we'll have a DataFrame with all the latitude/longitude pairs." 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 45, 567 | "metadata": {}, 568 | "outputs": [ 569 | { 570 | "data": { 571 | "text/html": [ 572 | "
\n", 573 | "\n", 586 | "\n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | "
LATITUDE_1LONGITUDE_1LATITUDE_2LONGITUDE_2
origindest
8F38F333.623889-101.24083333.623889-101.240833
A0333.623889-101.24083358.457500-154.023333
A0933.623889-101.24083360.482222-146.582222
A1833.623889-101.24083363.541667-150.993889
A2433.623889-101.24083359.331667-135.896667
\n", 644 | "
" 645 | ], 646 | "text/plain": [ 647 | " LATITUDE_1 LONGITUDE_1 LATITUDE_2 LONGITUDE_2\n", 648 | "origin dest \n", 649 | "8F3 8F3 33.623889 -101.240833 33.623889 -101.240833\n", 650 | " A03 33.623889 -101.240833 58.457500 -154.023333\n", 651 | " A09 33.623889 -101.240833 60.482222 -146.582222\n", 652 | " A18 33.623889 -101.240833 63.541667 -150.993889\n", 653 | " A24 33.623889 -101.240833 59.331667 -135.896667" 654 | ] 655 | }, 656 | "execution_count": 45, 657 | "metadata": {}, 658 | "output_type": "execute_result" 659 | } 660 | ], 661 | "source": [ 662 | "idx = pd.MultiIndex.from_product([coord.index, coord.index],\n", 663 | " names=['origin', 'dest'])\n", 664 | "\n", 665 | "pairs = pd.concat([coord.add_suffix('_1').reindex(idx, level='origin'),\n", 666 | " coord.add_suffix('_2').reindex(idx, level='dest')],\n", 667 | " axis=1)\n", 668 | "pairs.head()" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 47, 674 | "metadata": { 675 | "collapsed": true 676 | }, 677 | "outputs": [], 678 | "source": [ 679 | "idx = idx[idx.get_level_values(0) <= idx.get_level_values(1)]" 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": 48, 685 | "metadata": {}, 686 | "outputs": [ 687 | { 688 | "data": { 689 | "text/plain": [ 690 | "125250" 691 | ] 692 | }, 693 | "execution_count": 48, 694 | "metadata": {}, 695 | "output_type": "execute_result" 696 | } 697 | ], 698 | "source": [ 699 | "len(idx)" 700 | ] 701 | }, 702 | { 703 | "cell_type": "markdown", 704 | "metadata": {}, 705 | "source": [ 706 | "We'll break that down a bit, but don't lose sight of the real target: our great-circle distance calculation.\n", 707 | "\n", 708 | "The `add_suffix` (and `add_prefix`) method is handy for quickly renaming the columns." 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": 49, 714 | "metadata": {}, 715 | "outputs": [ 716 | { 717 | "data": { 718 | "text/html": [ 719 | "
\n", 720 | "\n", 733 | "\n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | "
LATITUDE_1LONGITUDE_1
AIRPORT
8F333.623889-101.240833
A0358.457500-154.023333
A0960.482222-146.582222
A1863.541667-150.993889
A2459.331667-135.896667
\n", 774 | "
" 775 | ], 776 | "text/plain": [ 777 | " LATITUDE_1 LONGITUDE_1\n", 778 | "AIRPORT \n", 779 | "8F3 33.623889 -101.240833\n", 780 | "A03 58.457500 -154.023333\n", 781 | "A09 60.482222 -146.582222\n", 782 | "A18 63.541667 -150.993889\n", 783 | "A24 59.331667 -135.896667" 784 | ] 785 | }, 786 | "execution_count": 49, 787 | "metadata": {}, 788 | "output_type": "execute_result" 789 | } 790 | ], 791 | "source": [ 792 | "coord.add_suffix('_1').head()" 793 | ] 794 | }, 795 | { 796 | "cell_type": "markdown", 797 | "metadata": {}, 798 | "source": [ 799 | "Alternatively you could use the more general `.rename` like `coord.rename(columns=lambda x: x + '_1')`.\n", 800 | "\n", 801 | "Next, we have the `reindex`.\n", 802 | "Like I mentioned in the prior chapter, indexes are crucial to pandas.\n", 803 | "`.reindex` is all about aligning a Series or DataFrame to a given index.\n", 804 | "In this case we use `.reindex` to align our original DataFrame to the new\n", 805 | "MultiIndex of combinations.\n", 806 | "By default, the output will have the original value if that index label was already present, and `NaN` otherwise.\n", 807 | "If we just called `coord.reindex(idx)`, with no additional arguments, we'd get a DataFrame of all `NaN`s." 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": 50, 813 | "metadata": {}, 814 | "outputs": [ 815 | { 816 | "data": { 817 | "text/html": [ 818 | "
\n", 819 | "\n", 832 | "\n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | "
LATITUDELONGITUDE
origindest
8F38F3NaNNaN
A03NaNNaN
A09NaNNaN
A18NaNNaN
A24NaNNaN
\n", 876 | "
" 877 | ], 878 | "text/plain": [ 879 | " LATITUDE LONGITUDE\n", 880 | "origin dest \n", 881 | "8F3 8F3 NaN NaN\n", 882 | " A03 NaN NaN\n", 883 | " A09 NaN NaN\n", 884 | " A18 NaN NaN\n", 885 | " A24 NaN NaN" 886 | ] 887 | }, 888 | "execution_count": 50, 889 | "metadata": {}, 890 | "output_type": "execute_result" 891 | } 892 | ], 893 | "source": [ 894 | "coord.reindex(idx).head()" 895 | ] 896 | }, 897 | { 898 | "cell_type": "markdown", 899 | "metadata": {}, 900 | "source": [ 901 | "That's because there weren't any values of `idx` that were in `coord.index`,\n", 902 | "which makes sense since `coord.index` is just a regular one-level Index, while `idx` is a MultiIndex.\n", 903 | "We use the `level` keyword to handle the transition from the original single-level Index, to the two-leveled `idx`.\n", 904 | "\n", 905 | "> `level` : int or name\n", 906 | ">\n", 907 | " Broadcast across a level, matching Index values on the\n", 908 | " passed MultiIndex level\n", 909 | "\n" 910 | ] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": 51, 915 | "metadata": {}, 916 | "outputs": [ 917 | { 918 | "data": { 919 | "text/html": [ 920 | "
\n", 921 | "\n", 934 | "\n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | "
LATITUDELONGITUDE
origindest
8F38F333.623889-101.240833
A0358.457500-154.023333
A0960.482222-146.582222
A1863.541667-150.993889
A2459.331667-135.896667
\n", 978 | "
" 979 | ], 980 | "text/plain": [ 981 | " LATITUDE LONGITUDE\n", 982 | "origin dest \n", 983 | "8F3 8F3 33.623889 -101.240833\n", 984 | " A03 58.457500 -154.023333\n", 985 | " A09 60.482222 -146.582222\n", 986 | " A18 63.541667 -150.993889\n", 987 | " A24 59.331667 -135.896667" 988 | ] 989 | }, 990 | "execution_count": 51, 991 | "metadata": {}, 992 | "output_type": "execute_result" 993 | } 994 | ], 995 | "source": [ 996 | "coord.reindex(idx, level='dest').head()" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "markdown", 1001 | "metadata": {}, 1002 | "source": [ 1003 | "If you ever need to do an operation that mixes regular single-level indexes with Multilevel Indexes, look for a level keyword argument.\n", 1004 | "For example, all the arithmatic methods (`.mul`, `.add`, etc.) have them.\n", 1005 | "\n", 1006 | "This is a bit wasteful since the distance from airport `A` to `B` is the same as `B` to `A`.\n", 1007 | "We could easily fix this with a `idx = idx[idx.get_level_values(0) <= idx.get_level_values(1)]`, but we'll ignore that for now.\n", 1008 | "\n", 1009 | "\n", 1010 | "Quick tangent, I got some... let's say skepticism, on my last piece about the value of indexes.\n", 1011 | "Here's an alternative version for the skeptics" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": 52, 1017 | "metadata": { 1018 | "collapsed": true 1019 | }, 1020 | "outputs": [], 1021 | "source": [ 1022 | "from itertools import product, chain\n", 1023 | "coord2 = coord.reset_index()" 1024 | ] 1025 | }, 1026 | { 1027 | "cell_type": "code", 1028 | "execution_count": 53, 1029 | "metadata": {}, 1030 | "outputs": [ 1031 | { 1032 | "data": { 1033 | "text/html": [ 1034 | "
\n", 1035 | "\n", 1048 | "\n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | "
LATITUDE_1LONGITUDE_1LATITUDE_2LONGITUDE_2
origindest
8F38F333.623889-101.24083333.623889-101.240833
A0333.623889-101.24083358.457500-154.023333
A0933.623889-101.24083360.482222-146.582222
A1833.623889-101.24083363.541667-150.993889
A2433.623889-101.24083359.331667-135.896667
\n", 1106 | "
" 1107 | ], 1108 | "text/plain": [ 1109 | " LATITUDE_1 LONGITUDE_1 LATITUDE_1 LONGITUDE_2\n", 1110 | "origin dest \n", 1111 | "8F3 8F3 33.623889 -101.240833 33.623889 -101.240833\n", 1112 | " A03 33.623889 -101.240833 58.457500 -154.023333\n", 1113 | " A09 33.623889 -101.240833 60.482222 -146.582222\n", 1114 | " A18 33.623889 -101.240833 63.541667 -150.993889\n", 1115 | " A24 33.623889 -101.240833 59.331667 -135.896667" 1116 | ] 1117 | }, 1118 | "execution_count": 53, 1119 | "metadata": {}, 1120 | "output_type": "execute_result" 1121 | } 1122 | ], 1123 | "source": [ 1124 | "x = product(coord2.add_suffix('_1').itertuples(index=False),\n", 1125 | " coord2.add_suffix('_2').itertuples(index=False))\n", 1126 | "y = [list(chain.from_iterable(z)) for z in x]\n", 1127 | "\n", 1128 | "df2 = (pd.DataFrame(y, columns=['origin', 'LATITUDE_1', 'LONGITUDE_1',\n", 1129 | " 'dest', 'LATITUDE_1', 'LONGITUDE_2'])\n", 1130 | " .set_index(['origin', 'dest']))\n", 1131 | "df2.head()" 1132 | ] 1133 | }, 1134 | { 1135 | "cell_type": "markdown", 1136 | "metadata": {}, 1137 | "source": [ 1138 | "It's also readable (it's Python after all), though a bit slower.\n", 1139 | "To me the `.reindex` method seems more natural.\n", 1140 | "My thought process was, \"I need all the combinations of origin & destination (`MultiIndex.from_product`).\n", 1141 | "Now I need to align this original DataFrame to this new MultiIndex (`coords.reindex`).\"\n", 1142 | "\n", 1143 | "With that diversion out of the way, let's turn back to our great-circle distance calculation.\n", 1144 | "Our first implementation is pure python.\n", 1145 | "The algorithm itself isn't too important, all that matters is that we're doing math operations on scalars." 1146 | ] 1147 | }, 1148 | { 1149 | "cell_type": "code", 1150 | "execution_count": 54, 1151 | "metadata": { 1152 | "collapsed": true 1153 | }, 1154 | "outputs": [], 1155 | "source": [ 1156 | "import math\n", 1157 | "\n", 1158 | "def gcd_py(lat1, lng1, lat2, lng2):\n", 1159 | " '''\n", 1160 | " Calculate great circle distance between two points.\n", 1161 | " http://www.johndcook.com/blog/python_longitude_latitude/\n", 1162 | " \n", 1163 | " Parameters\n", 1164 | " ----------\n", 1165 | " lat1, lng1, lat2, lng2: float\n", 1166 | " \n", 1167 | " Returns\n", 1168 | " -------\n", 1169 | " distance:\n", 1170 | " distance from ``(lat1, lng1)`` to ``(lat2, lng2)`` in kilometers.\n", 1171 | " '''\n", 1172 | " # python2 users will have to use ascii identifiers (or upgrade)\n", 1173 | " degrees_to_radians = math.pi / 180.0\n", 1174 | " ϕ1 = (90 - lat1) * degrees_to_radians\n", 1175 | " ϕ2 = (90 - lat2) * degrees_to_radians\n", 1176 | " \n", 1177 | " θ1 = lng1 * degrees_to_radians\n", 1178 | " θ2 = lng2 * degrees_to_radians\n", 1179 | " \n", 1180 | " cos = (math.sin(ϕ1) * math.sin(ϕ2) * math.cos(θ1 - θ2) +\n", 1181 | " math.cos(ϕ1) * math.cos(ϕ2))\n", 1182 | " # round to avoid precision issues on identical points causing ValueErrors\n", 1183 | " cos = round(cos, 8)\n", 1184 | " arc = math.acos(cos)\n", 1185 | " return arc * 6373 # radius of earth, in kilometers" 1186 | ] 1187 | }, 1188 | { 1189 | "cell_type": "markdown", 1190 | "metadata": {}, 1191 | "source": [ 1192 | "The second implementation uses NumPy.\n", 1193 | "Aside from numpy having a builtin `deg2rad` convenience function (which is probably a bit slower than multiplying by a constant $\\frac{\\pi}{180}$), basically all we've done is swap the `math` prefix for `np`.\n", 1194 | "Thanks to NumPy's broadcasting, we can write code that works on scalars or 
arrays of conformable shape." 1195 | ] 1196 | }, 1197 | { 1198 | "cell_type": "code", 1199 | "execution_count": 55, 1200 | "metadata": { 1201 | "collapsed": true 1202 | }, 1203 | "outputs": [], 1204 | "source": [ 1205 | "def gcd_vec(lat1, lng1, lat2, lng2):\n", 1206 | " '''\n", 1207 | " Calculate great circle distance.\n", 1208 | " http://www.johndcook.com/blog/python_longitude_latitude/\n", 1209 | " \n", 1210 | " Parameters\n", 1211 | " ----------\n", 1212 | " lat1, lng1, lat2, lng2: float or array of float\n", 1213 | " \n", 1214 | " Returns\n", 1215 | " -------\n", 1216 | " distance:\n", 1217 | " distance from ``(lat1, lng1)`` to ``(lat2, lng2)`` in kilometers.\n", 1218 | " '''\n", 1219 | " # python2 users will have to use ascii identifiers\n", 1220 | " ϕ1 = np.deg2rad(90 - lat1)\n", 1221 | " ϕ2 = np.deg2rad(90 - lat2)\n", 1222 | " \n", 1223 | " θ1 = np.deg2rad(lng1)\n", 1224 | " θ2 = np.deg2rad(lng2)\n", 1225 | " \n", 1226 | " cos = (np.sin(ϕ1) * np.sin(ϕ2) * np.cos(θ1 - θ2) +\n", 1227 | " np.cos(ϕ1) * np.cos(ϕ2))\n", 1228 | " arc = np.arccos(cos) # floating-point error can push cos just past 1 for identical points (hence the RuntimeWarning below); np.clip(cos, -1, 1) would avoid it\n", 1229 | " return arc * 6373" 1230 | ] 1231 | }, 1232 | { 1233 | "cell_type": "markdown", 1234 | "metadata": {}, 1235 | "source": [ 1236 | "To use the Python version on our DataFrame, we can either iterate..." 1237 | ] 1238 | }, 1239 | { 1240 | "cell_type": "code", 1241 | "execution_count": 56, 1242 | "metadata": {}, 1243 | "outputs": [ 1244 | { 1245 | "name": "stdout", 1246 | "output_type": "stream", 1247 | "text": [ 1248 | "CPU times: user 823 ms, sys: 9.51 ms, total: 833 ms\n", 1249 | "Wall time: 832 ms\n" 1250 | ] 1251 | }, 1252 | { 1253 | "data": { 1254 | "text/plain": [ 1255 | "origin dest\n", 1256 | "8F3 8F3 0.000000\n", 1257 | " A03 4744.967448\n", 1258 | " A09 4407.533212\n", 1259 | " A18 4744.593127\n", 1260 | " A24 3820.092688\n", 1261 | " ... \n", 1262 | "ZZU YUY 12643.665960\n", 1263 | " YYL 13687.592278\n", 1264 | " ZBR 4999.647307\n", 1265 | " ZXO 14925.531303\n", 1266 | " ZZU 0.000000\n", 1267 | "Length: 250000, dtype: float64" 1268 | ] 1269 | }, 1270 | "execution_count": 56, 1271 | "metadata": {}, 1272 | "output_type": "execute_result" 1273 | } 1274 | ], 1275 | "source": [ 1276 | "%%time\n", 1277 | "pd.Series([gcd_py(*x) for x in pairs.itertuples(index=False)],\n", 1278 | " index=pairs.index)" 1279 | ] 1280 | }, 1281 | { 1282 | "cell_type": "markdown", 1283 | "metadata": {}, 1284 | "source": [ 1285 | "Or use `DataFrame.apply`." 1286 | ] 1287 | }, 1288 | { 1289 | "cell_type": "code", 1290 | "execution_count": 57, 1291 | "metadata": {}, 1292 | "outputs": [ 1293 | { 1294 | "name": "stdout", 1295 | "output_type": "stream", 1296 | "text": [ 1297 | "CPU times: user 14.4 s, sys: 58.6 ms, total: 14.5 s\n", 1298 | "Wall time: 14.5 s\n" 1299 | ] 1300 | } 1301 | ], 1302 | "source": [ 1303 | "%%time\n", 1304 | "r = pairs.apply(lambda x: gcd_py(x['LATITUDE_1'], x['LONGITUDE_1'],\n", 1305 | " x['LATITUDE_2'], x['LONGITUDE_2']), axis=1);" 1306 | ] 1307 | }, 1308 | { 1309 | "cell_type": "markdown", 1310 | "metadata": {}, 1311 | "source": [ 1312 | "But as you can see, you don't want to use `apply`, especially with `axis=1` (calling the function on each row). It does a lot of extra work behind the scenes, handling dtypes and inferring the correct output shape, all of which is pure overhead in this case. On top of that, it essentially has to use a for loop internally.\n", 1313 | "\n", 1314 | "You *rarely* want to use `DataFrame.apply`, and you should almost never use it with `axis=1`. It's better to write functions that take arrays and pass those in directly, as we did with the vectorized version." 1315 | ] 1316 | }, 1317 | { 1318 | "cell_type": "code", 1319 | "execution_count": 58, 1320 | "metadata": {}, 1321 | "outputs": [ 1322 | { 1323 | "name": "stdout", 1324 | "output_type": "stream", 1325 | "text": [ 1326 | "CPU times: user 34.9 ms, sys: 21.1 ms, total: 56 ms\n", 1327 | "Wall time: 38.9 ms\n" 1328 | ] 1329 | }, 1330 | { 1331 | "name": "stderr", 1332 | "output_type": "stream", 1333 | "text": [ 1334 | "/Users/taugspurger/miniconda3/envs/modern-pandas/lib/python3.6/site-packages/ipykernel_launcher.py:24: RuntimeWarning: invalid value encountered in arccos\n" 1335 | ] 1336 | } 1337 | ], 1338 | "source": [ 1339 | "%%time\n", 1340 | "r = gcd_vec(pairs['LATITUDE_1'], pairs['LONGITUDE_1'],\n", 1341 | " pairs['LATITUDE_2'], pairs['LONGITUDE_2'])" 1342 | ] 1343 | }, 1344 | { 1345 | "cell_type": "code", 1346 | "execution_count": 60, 1347 | "metadata": {}, 1348 | "outputs": [ 1349 | { 1350 | "data": { 1351 | "text/plain": [ 1352 | "origin dest\n", 1353 | "8F3 8F3 0.000000\n", 1354 | " A03 4744.967484\n", 1355 | " A09 4407.533240\n", 1356 | " A18 4744.593111\n", 1357 | " A24 3820.092639\n", 1358 | "dtype: float64" 1359 | ] 1360 | }, 1361 | "execution_count": 60, 1362 | "metadata": {}, 1363 | "output_type": "execute_result" 1364 | } 1365 | ], 1366 | "source": [ 1367 | "r.head()" 1368 | ] 1369 | }, 1370 | { 1371 | "cell_type": "markdown", 1372 | "metadata": {}, 1373 | "source": [ 1374 | "I try not to use the word \"easy\" when teaching, but that optimization was easy, right?\n", 1375 | "Why, then, do I come across uses of `apply`, in my code and others', even when the vectorized version is available?\n", 1376 | "The difficulty lies in knowing about broadcasting, and seeing where to apply it.\n", 1377 | "\n", 1378 | "For example, the README for [lifetimes](https://github.com/CamDavidsonPilon/lifetimes) (by Cam Davidson Pilon, also author of [Bayesian Methods for Hackers](https://github.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers), [lifelines](https://github.com/CamDavidsonPilon/lifelines), and [Data Origami](https://dataorigami.net)) used to have an example of passing [this method](https://github.com/CamDavidsonPilon/lifetimes/blob/5b4f7de0720413b6951ac0a4b0082bd50255a231/lifetimes/estimation.py#L249) into a `DataFrame.apply`.\n", 1379 | "\n", 1380 | "```python\n", 1381 | "data.apply(lambda r: bgf.conditional_expected_number_of_purchases_up_to_time(\n", 1382 | " t, r['frequency'], r['recency'], r['T']), axis=1\n", 1383 | ")\n", 1384 | "```\n", 1385 | "\n", 1386 | "If you look at the function [I linked to](https://github.com/CamDavidsonPilon/lifetimes/blob/5b4f7de0720413b6951ac0a4b0082bd50255a231/lifetimes/estimation.py#L249), it's doing a fairly complicated computation involving a negative log likelihood and the Gamma function from `scipy.special`.\n", 1387 | "But crucially, it was already vectorized.\n", 1388 | "We were able to change the example to just pass the arrays (Series in this case) into the function, rather than applying the function to each row.\n", 1389 | "\n", 1390 | "```python\n", 1391 | "bgf.conditional_expected_number_of_purchases_up_to_time(\n", 1392 | " t, data['frequency'], data['recency'], data['T']\n", 1393 | ")\n", 1394 | "```\n", 1395 | "\n", 1396 | "This got us another 30x speedup on the example dataset.\n", 1397 | "I bring this up because it's very natural to have to translate an equation to code and think, \"Ok, now I need 
to apply this function to each row\", so you reach for `DataFrame.apply`.\n", 1398 | "See if you can just pass in the NumPy array or Series itself instead.\n", 1399 | "\n", 1400 | "Not all operations are this easy to vectorize.\n", 1401 | "Some operations are iterative by nature, and rely on the results of surrounding computations to proceed. In cases like this you can hope that one of the scientific Python libraries has implemented it efficiently for you, or write your own solution using Numba / C / Cython / Fortran.\n", 1402 | "\n", 1403 | "Other examples take a bit more thought or knowledge to vectorize.\n", 1404 | "Let's look at [this](http://nbviewer.jupyter.org/github/jreback/pydata2015-london/blob/master/notebooks/idioms.ipynb)\n", 1405 | "example, taken from Jeff Reback's PyData London talk, which normalizes a dataset group-wise by subtracting the mean and dividing by the standard deviation for each group." 1406 | ] 1407 | }, 1408 | { 1409 | "cell_type": "code", 1410 | "execution_count": 61, 1411 | "metadata": { 1412 | "collapsed": true 1413 | }, 1414 | "outputs": [], 1415 | "source": [ 1416 | "import random\n", 1417 | "\n", 1418 | "def create_frame(n, n_groups):\n", 1419 | " # just setup code, not benchmarking this\n", 1420 | " stamps = pd.date_range('20010101', periods=n, freq='ms')\n", 1421 | " random.shuffle(stamps.values) \n", 1422 | " return pd.DataFrame({'name': np.random.randint(0,n_groups,size=n),\n", 1423 | " 'stamp': stamps,\n", 1424 | " 'value': np.random.randint(0,n,size=n),\n", 1425 | " 'value2': np.random.randn(n)})\n", 1426 | "\n", 1427 | "\n", 1428 | "df = create_frame(1000000,10000)\n", 1429 | "\n", 1430 | "def f_apply(df):\n", 1431 | " # Typical transform\n", 1432 | " return df.groupby('name').value2.apply(lambda x: (x-x.mean())/x.std())\n", 1433 | "\n", 1434 | "def f_unwrap(df):\n", 1435 | " # \"unwrapped\"\n", 1436 | " g = df.groupby('name').value2\n", 1437 | " v = df.value2\n", 1438 | " return (v-g.transform(np.mean))/g.transform(np.std)\n" 1439 | ] 1440 | }, 1441 | { 1442 | "cell_type": "markdown", 1443 | "metadata": {}, 1444 | "source": [ 1445 | "Timing it, we see that the \"unwrapped\" version gets quite a bit better performance." 1446 | ] 1447 | }, 1448 | { 1449 | "cell_type": "code", 1450 | "execution_count": null, 1451 | "metadata": { 1452 | "collapsed": true 1453 | }, 1454 | "outputs": [], 1455 | "source": [ 1456 | "%timeit f_apply(df)" 1457 | ] 1458 | }, 1459 | { 1460 | "cell_type": "code", 1461 | "execution_count": null, 1462 | "metadata": { 1463 | "collapsed": true 1464 | }, 1465 | "outputs": [], 1466 | "source": [ 1467 | "%timeit f_unwrap(df)" 1468 | ] 1469 | }, 1470 | { 1471 | "cell_type": "markdown", 1472 | "metadata": {}, 1473 | "source": [ 1474 | "Pandas GroupBy objects intercept calls for common functions like mean, sum, etc. and substitute them with optimized Cython versions.\n", 1475 | "So the unwrapped `.transform(np.mean)` and `.transform(np.std)` are fast, while the `x.mean` and `x.std` in the `.apply(lambda x: (x - x.mean())/x.std())` aren't.\n", 1476 | "\n", 1477 | "`Groupby.apply` is always going to be around, because it offers maximum flexibility. If you need to [fit a model on each group and create additional columns in the process](http://stackoverflow.com/q/35924126/1889400), it can handle that. It just might not be the fastest (which may be OK sometimes).\n", 
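As a concrete illustration (made-up data, not the author's benchmark), here's the kind of per-group work where `GroupBy.apply`'s flexibility earns its keep: fitting a model per group and attaching the fitted values as a new column.

```python
import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
frame = pd.DataFrame({'g': np.repeat(['a', 'b'], 50),
                      'x': np.tile(np.arange(50.), 2)})
frame['y'] = 2 * frame['x'] + rng.randn(100)

def fit_group(grp):
    # one least-squares line per group, plus a new 'fitted' column
    slope, intercept = np.polyfit(grp['x'], grp['y'], 1)
    return grp.assign(fitted=slope * grp['x'] + intercept)

result = frame.groupby('g', group_keys=False).apply(fit_group)
```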
1478 | "\n", 1479 | "This last example is admittedly niche.\n", 1480 | "I'd like to think that there aren't too many places in pandas where the natural way to write an operation, like the `.apply(lambda x: (x - x.mean()) / x.std())` above, is slower than the less obvious alternative.\n", 1481 | "Ideally the user wouldn't have to know about GroupBy having special fast implementations of common methods.\n", 1482 | "But that's where we are now." 1483 | ] 1484 | }, 1485 | { 1486 | "cell_type": "markdown", 1487 | "metadata": {}, 1488 | "source": [ 1489 | "## Categoricals" 1490 | ] 1491 | }, 1492 | { 1493 | "cell_type": "markdown", 1494 | "metadata": {}, 1495 | "source": [ 1496 | "\n", 1497 | "Thanks to some great work by [Jan Schulz](https://twitter.com/janschulz), [Jeff Reback](https://twitter.com/jreback), and others, pandas 0.15 gained a new [Categorical](http://pandas.pydata.org/pandas-docs/version/0.18.0/categorical.html) data type. Categoricals are nice for many reasons beyond just efficiency, but we'll focus on that here.\n", 1498 | "\n", 1499 | "Categoricals are an efficient way of representing data (typically strings) that have a low *cardinality*, i.e. few distinct values relative to the size of the array. Internally, a Categorical stores the categories once, and an array of `codes`, which are just integers that indicate which category belongs there. Since it's cheaper to store a `code` than a `category`, we save on memory (shown next).\n", 1500 | "\n" 1501 | ] 1502 | }, 1503 | { 1504 | "cell_type": "code", 1505 | "execution_count": null, 1506 | "metadata": { 1507 | "collapsed": true 1508 | }, 1509 | "outputs": [], 1510 | "source": [ 1511 | "import string\n", 1512 | "\n", 1513 | "s = pd.Series(np.random.choice(list(string.ascii_letters), 100000))\n", 1514 | "print('{:0.2f} KB'.format(s.memory_usage(index=False) / 1000))" 1515 | ] 1516 | }, 1517 | { 1518 | "cell_type": "code", 1519 | "execution_count": null, 1520 | "metadata": { 1521 | "collapsed": true 1522 | }, 1523 | "outputs": [], 1524 | "source": [ 1525 | "c = s.astype('category')\n", 1526 | "print('{:0.2f} KB'.format(c.memory_usage(index=False) / 1000))" 1527 | ] 1528 | }, 1529 | { 1530 | "cell_type": "markdown", 1531 | "metadata": {}, 1532 | "source": [ 1533 | "Beyond saving memory, having codes and a fixed set of categories offers up a bunch of algorithmic optimizations that pandas and others can take advantage of.\n", 1534 | "\n", 1535 | "[Matthew Rocklin](https://twitter.com/mrocklin) has a very nice [post](http://matthewrocklin.com/blog/work/2015/06/18/Categoricals) on using categoricals, and optimizing code in general." 1536 | ] 1537 | }, 1538 | { 1539 | "cell_type": "markdown", 1540 | "metadata": {}, 1541 | "source": [ 1542 | "## Going Further\n", 1543 | "\n", 1544 | "The pandas documentation has a section on [enhancing performance](http://pandas.pydata.org/pandas-docs/version/0.18.0/enhancingperf.html), focusing on using Cython or `numba` to speed up a computation. I've focused more on the lower-hanging fruit of picking the right algorithm, vectorizing your code, and using pandas or numpy more effectively. There are further optimizations available if these aren't enough." 
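A small sketch of that categories-plus-codes layout:

```python
import pandas as pd

c = pd.Categorical(['a', 'b', 'a', 'c', 'a'])
c.categories  # Index(['a', 'b', 'c'], dtype='object'): each category stored once
c.codes       # array([0, 1, 0, 2, 0], dtype=int8): one small integer per element
```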
1545 | ] 1546 | }, 1547 | { 1548 | "cell_type": "markdown", 1549 | "metadata": {}, 1550 | "source": [ 1551 | "## Summary\n", 1552 | "\n", 1553 | "This post was more about how to make effective use of NumPy and pandas than about writing your own highly optimized code.\n", 1554 | "In my day-to-day data analysis work it's not worth the time to write and compile a Cython extension.\n", 1555 | "I'd rather rely on pandas to be fast at what matters (label lookup on large arrays, factorizations for groupbys and merges, numerics).\n", 1556 | "If you want to learn more about what pandas does to make things fast, check out Jeff Tratner's PyData Seattle [talk](http://www.jeffreytratner.com/slides/pandas-under-the-hood-pydata-seattle-2015.pdf) on pandas' internals.\n", 1557 | "\n", 1558 | "Next time we'll look at a different kind of optimization: using the Tidy Data principles to facilitate efficient data analysis.\n" 1559 | ] 1560 | } 1561 | ], 1562 | "metadata": { 1563 | "kernelspec": { 1564 | "display_name": "Python 3", 1565 | "language": "python", 1566 | "name": "python3" 1567 | }, 1568 | "language_info": { 1569 | "codemirror_mode": { 1570 | "name": "ipython", 1571 | "version": 3 1572 | }, 1573 | "file_extension": ".py", 1574 | "mimetype": "text/x-python", 1575 | "name": "python", 1576 | "nbconvert_exporter": "python", 1577 | "pygments_lexer": "ipython3", 1578 | "version": "3.6.1" 1579 | } 1580 | }, 1581 | "nbformat": 4, 1582 | "nbformat_minor": 1 1583 | } 1584 | -------------------------------------------------------------------------------- /modern_8_out_of_core.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Want to guess what the most popular pandas-related question on StackOverflow is about? http://stackoverflow.com/questions/14262433/large-data-work-flows-using-pandas\n", 8 | "\n", 9 | "The strength of pandas really is in medium-data analytics, which can roughly be described as \"datasets that fit in memory, comfortably\".\n", 10 | "Depending on your data needs (and ability to buy time on a big [EC2 instance](https://aws.amazon.com/ec2/instance-types/) with, say, 244 GiB of RAM), this section may not apply to you.\n", 11 | "\n", 12 | "Pandas is not meant for \"Big Data\", but then again you probably don't have big data." 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "# Chunking and Iteration\n", 20 | "\n", 21 | "The first strategy for handling larger-than-memory data is chunking, or batching, your data and iterating over each batch.\n", 22 | "This immediately rules out algorithms that require the full dataset to be in memory at once, but with a bit of cleverness you can work around that limitation for many problems." 
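As a minimal sketch of the pattern (the filename assumes the MovieLens `ratings.csv` that shows up later in this chapter; any large CSV works), here's a global mean computed one chunk at a time, so the full file is never in memory at once:

```python
import pandas as pd

total = 0.0
count = 0
for chunk in pd.read_csv('ml-latest/ratings.csv', chunksize=1000000):
    total += chunk['rating'].sum()
    count += len(chunk)

mean_rating = total / count
```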
23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 4, 28 | "metadata": { 29 | "collapsed": false 30 | }, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "'201401'" 36 | ] 37 | }, 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "output_type": "execute_result" 41 | } 42 | ], 43 | "source": [ 44 | "pd.Timestamp('2014-01-01').strftime(\"%Y%m\")" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 18, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "from distributed import Executor\n", 56 | "\n", 57 | "executor = Executor('127.0.0.1:8786')" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 24, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "from distributed.diagnostics import progress" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 25, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "progress?" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 8, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "import os\n", 91 | "import requests" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 12, 97 | "metadata": { 98 | "collapsed": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "def download_month(month):\n", 103 | " os.makedirs('comext', exist_ok=True)\n", 104 | " base = (\"http://ec.europa.eu/eurostat/estat-navtree-portlet-prod/\"\n", 105 | " \"BulkDownloadListing?sort=1&\"\n", 106 | " \"downfile=comext%2F2015S1%2Fdata%2Fnc{:%Y%m}.7z\")\n", 107 | " r = requests.get(base.format(month), stream=True)\n", 108 | " filename = 'comext/{:%Y-%m}.tsv.7z'.format(month)\n", 109 | " with open(filename, 'wb') as f:\n", 110 | " for chunk in r.iter_content(chunk_size=1024):\n", 111 | " if chunk:\n", 112 | " f.write(chunk)\n", 113 | " return filename\n", 114 | "\n", 115 | " " 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 26, 121 | "metadata": { 122 | "collapsed": false 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "dates = pd.date_range(start='2012-01-01', end='2014-12-01', freq='m')\n", 127 | "futures = executor.map(download_month, dates)\n", 128 | "progress(*futures)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 61, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "!rename -S .gz .7z comext/*.gz" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 73, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "rm nc201401.dat" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 83, 156 | "metadata": { 157 | "collapsed": false 158 | }, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "\n", 165 | "7-Zip [64] 15.09 beta : Copyright (c) 1999-2015 Igor Pavlov : 2015-10-16\n", 166 | "p7zip Version 15.09 beta (locale=utf8,Utf16=on,HugeFiles=on,64 bits,4 CPUs x64)\n", 167 | "\n", 168 | "Scanning the drive for archives:\n", 169 | " 0M Scan\b\b\b\b\b\b\b\b\b \b\b\b\b\b\b\b\b\b1 file, 38021410 bytes (37 MiB)\n", 170 | "\n", 171 | "Extracting archive: comext/2014-01.tsv.7z\n", 172 | "--\n", 173 | "Path = comext/2014-01.tsv.7z\n", 174 | "Type = 7z\n", 175 | "Physical Size = 38021410\n", 176 | "Headers Size = 126\n", 177 | "Method = 
LZMA:26\n", 178 | "Solid = -\n", 179 | "Blocks = 1\n", 180 | "\n", 181 | "Everything is Ok\n", 182 | "\n", 183 | "Size: 244210771\n", 184 | "Compressed: 38021410\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "!7z x -ocomext comext/*.7z" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 89, 195 | "metadata": { 196 | "collapsed": false 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "df = pd.read_csv('comext/nc201401.dat', dtype={'DECLARANT': 'object'})" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 90, 206 | "metadata": { 207 | "collapsed": false 208 | }, 209 | "outputs": [ 210 | { 211 | "data": { 212 | "text/html": [ 213 | "
\n", 214 | "\n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | "
DECLARANTPARTNERPRODUCT_NCFLOWSTAT_REGIMEPERIODVALUE_1000ECUQUANTITY_TONSUP_QUANTITY
0001301142014012910.83521.5NaN
1001301242014011234.51250.7NaN
20013010121001420140192.112.04.0
30013010121002420140110.550.51.0
40013010129901420140132.971.12.0
\n", 292 | "
" 293 | ], 294 | "text/plain": [ 295 | " DECLARANT PARTNER PRODUCT_NC FLOW STAT_REGIME PERIOD VALUE_1000ECU \\\n", 296 | "0 001 3 01 1 4 201401 2910.83 \n", 297 | "1 001 3 01 2 4 201401 1234.51 \n", 298 | "2 001 3 01012100 1 4 201401 92.11 \n", 299 | "3 001 3 01012100 2 4 201401 10.55 \n", 300 | "4 001 3 01012990 1 4 201401 32.97 \n", 301 | "\n", 302 | " QUANTITY_TON SUP_QUANTITY \n", 303 | "0 521.5 NaN \n", 304 | "1 250.7 NaN \n", 305 | "2 2.0 4.0 \n", 306 | "3 0.5 1.0 \n", 307 | "4 1.1 2.0 " 308 | ] 309 | }, 310 | "execution_count": 90, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | } 314 | ], 315 | "source": [ 316 | "df.head()" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 1, 322 | "metadata": { 323 | "collapsed": true 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "import dask.dataframe as dd" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 2, 333 | "metadata": { 334 | "collapsed": true 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "import zipfile" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 14, 344 | "metadata": { 345 | "collapsed": true 346 | }, 347 | "outputs": [], 348 | "source": [ 349 | "zf = zipfile.ZipFile('ml-latest.zip')\n", 350 | "\n", 351 | "zf.extractall()" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 3, 357 | "metadata": { 358 | "collapsed": false 359 | }, 360 | "outputs": [ 361 | { 362 | "name": "stdout", 363 | "output_type": "stream", 364 | "text": [ 365 | "README.txt links.csv movies.csv ratings.csv tags.csv\r\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "ls ml-latest/" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 4, 376 | "metadata": { 377 | "collapsed": false 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "df = pd.read_csv('ml-latest/ratings.csv')\n", 382 | "df['timestamp'] = pd.to_datetime(df.timestamp, unit='s')" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 5, 388 | "metadata": { 389 | "collapsed": false 390 | }, 391 | "outputs": [], 392 | "source": [ 393 | "ratings = dd.from_pandas(df, npartitions=100)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 6, 399 | "metadata": { 400 | "collapsed": false 401 | }, 402 | "outputs": [ 403 | { 404 | "data": { 405 | "text/html": [ 406 | "
\n", 407 | "\n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | "
userIdmovieIdratingtimestamp
011692.52008-03-07 22:08:14
1124713.02008-03-07 22:03:58
21485165.02008-03-07 22:03:55
3225713.52015-07-06 06:50:33
421094874.02015-07-06 06:51:36
\n", 455 | "
" 456 | ], 457 | "text/plain": [ 458 | " userId movieId rating timestamp\n", 459 | "0 1 169 2.5 2008-03-07 22:08:14\n", 460 | "1 1 2471 3.0 2008-03-07 22:03:58\n", 461 | "2 1 48516 5.0 2008-03-07 22:03:55\n", 462 | "3 2 2571 3.5 2015-07-06 06:50:33\n", 463 | "4 2 109487 4.0 2015-07-06 06:51:36" 464 | ] 465 | }, 466 | "execution_count": 6, 467 | "metadata": {}, 468 | "output_type": "execute_result" 469 | } 470 | ], 471 | "source": [ 472 | "ratings.head()" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 43, 478 | "metadata": { 479 | "collapsed": false 480 | }, 481 | "outputs": [], 482 | "source": [ 483 | "s = df.head(1000000)" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 44, 489 | "metadata": { 490 | "collapsed": false 491 | }, 492 | "outputs": [], 493 | "source": [ 494 | "s2 = dd.from_pandas(s, npartitions=20)" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 45, 500 | "metadata": { 501 | "collapsed": true 502 | }, 503 | "outputs": [], 504 | "source": [ 505 | "def sessionize(ts):\n", 506 | " return (ts.sort_values().diff() >= pd.Timedelta(1, unit='h')).fillna(True).cumsum()" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 48, 512 | "metadata": { 513 | "collapsed": true 514 | }, 515 | "outputs": [], 516 | "source": [ 517 | "from dask.diagnostics import Profiler, ResourceProfiler, CacheProfiler" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": 56, 523 | "metadata": { 524 | "collapsed": false 525 | }, 526 | "outputs": [], 527 | "source": [ 528 | "with Profiler() as prof, ResourceProfiler() as rprof:\n", 529 | " out = ratings.groupby('userId').timestamp.apply(sessionize, columns='timstamp').compute()" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 57, 535 | "metadata": { 536 | "collapsed": false 537 | }, 538 | "outputs": [ 539 | { 540 | "data": { 541 | "text/plain": [ 542 | "" 543 | ] 544 | }, 545 | "execution_count": 57, 546 | "metadata": {}, 547 | "output_type": "execute_result" 548 | } 549 | ], 550 | "source": [ 551 | "prof.visualize()" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 58, 557 | "metadata": { 558 | "collapsed": false 559 | }, 560 | "outputs": [ 561 | { 562 | "data": { 563 | "text/plain": [ 564 | "" 565 | ] 566 | }, 567 | "execution_count": 58, 568 | "metadata": {}, 569 | "output_type": "execute_result" 570 | } 571 | ], 572 | "source": [ 573 | "rprof.visualize()" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 47, 579 | "metadata": { 580 | "collapsed": false 581 | }, 582 | "outputs": [ 583 | { 584 | "name": "stdout", 585 | "output_type": "stream", 586 | "text": [ 587 | "CPU times: user 7.8 s, sys: 143 ms, total: 7.94 s\n", 588 | "Wall time: 7.93 s\n" 589 | ] 590 | }, 591 | { 592 | "data": { 593 | "text/plain": [ 594 | "userId \n", 595 | "1 2 0\n", 596 | " 1 0\n", 597 | " 0 0\n", 598 | "2 3 0\n", 599 | " 4 0\n", 600 | " ... 
\n", 601 | "10790 999616 111\n", 602 | " 999595 111\n", 603 | " 999596 111\n", 604 | " 999758 112\n", 605 | " 999475 113\n", 606 | "dtype: int64" 607 | ] 608 | }, 609 | "execution_count": 47, 610 | "metadata": {}, 611 | "output_type": "execute_result" 612 | } 613 | ], 614 | "source": [ 615 | "%%time\n", 616 | "s.groupby('userId').timestamp.apply(sessionize)" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": 31, 622 | "metadata": { 623 | "collapsed": false 624 | }, 625 | "outputs": [ 626 | { 627 | "name": "stdout", 628 | "output_type": "stream", 629 | "text": [ 630 | "CPU times: user 1.35 s, sys: 24 ms, total: 1.37 s\n", 631 | "Wall time: 1.38 s\n" 632 | ] 633 | }, 634 | { 635 | "data": { 636 | "text/plain": [ 637 | "userId userId movieId\n", 638 | "1 1 48516 0\n", 639 | " 2471 0\n", 640 | " 169 0\n", 641 | "2 2 2571 0\n", 642 | " 109487 0\n", 643 | " ..\n", 644 | "1052 1052 50872 1\n", 645 | " 59315 1\n", 646 | " 47099 2\n", 647 | " 1246 2\n", 648 | " 356 2\n", 649 | "dtype: int64" 650 | ] 651 | }, 652 | "execution_count": 31, 653 | "metadata": {}, 654 | "output_type": "execute_result" 655 | } 656 | ], 657 | "source": [ 658 | "%%time\n", 659 | "s2.groupby(level=0).timestamp.apply(sessionize)" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": 67, 665 | "metadata": { 666 | "collapsed": false 667 | }, 668 | "outputs": [ 669 | { 670 | "data": { 671 | "text/plain": [ 672 | "userId \n", 673 | "1 2 0\n", 674 | " 1 0\n", 675 | " 0 0\n", 676 | "2 3 0\n", 677 | " 4 0\n", 678 | " ..\n", 679 | "247753 22884374 0\n", 680 | " 22884369 0\n", 681 | " 22884373 0\n", 682 | " 22884368 1\n", 683 | " 22884365 1\n", 684 | "dtype: int64" 685 | ] 686 | }, 687 | "execution_count": 67, 688 | "metadata": {}, 689 | "output_type": "execute_result" 690 | } 691 | ], 692 | "source": [ 693 | "df.groupby(['userId']).timestamp.apply(sessionize)" 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "execution_count": 43, 699 | "metadata": { 700 | "collapsed": false 701 | }, 702 | "outputs": [ 703 | { 704 | "ename": "AttributeError", 705 | "evalue": "'str' object has no attribute '_name'", 706 | "output_type": "error", 707 | "traceback": [ 708 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 709 | "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", 710 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mratings\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'userId'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrating\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 711 | "\u001b[0;32m/Users/tom.augspurger/Envs/blog/lib/python3.5/site-packages/dask/dataframe/groupby.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, columns)\u001b[0m\n\u001b[1;32m 205\u001b[0m \"\"\"\n\u001b[1;32m 206\u001b[0m \u001b[0;31m# df = set_index(self.df, self.index, **self.kwargs)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 207\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_name\u001b[0m \u001b[0;34m==\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_name\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 208\u001b[0m return map_partitions(_groupby_level0_getitem_apply,\n\u001b[1;32m 209\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 712 | "\u001b[0;31mAttributeError\u001b[0m: 'str' object has no attribute '_name'" 713 | ] 714 | } 715 | ], 716 | "source": [ 717 | "ratings.groupby('userId').rating.apply(np.mean)" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": 33, 723 | "metadata": { 724 | "collapsed": false 725 | }, 726 | "outputs": [ 727 | { 728 | "data": { 729 | "text/html": [ 730 | "
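The traceback above comes from a bug in this (old) version of dask's groupby-apply code path. The built-in aggregations take a different, working path, so a sketch of the workaround is to stick to those:

```python
# Built-in aggregations avoid the broken apply path entirely.
mean_ratings = ratings.groupby('userId').rating.mean().compute()
```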
\n", 731 | "\n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | "
userIdmovieIdratingtimestamp
011692.52008-03-07 22:08:14
1124713.02008-03-07 22:03:58
21485165.02008-03-07 22:03:55
3225713.52015-07-06 06:50:33
421094874.02015-07-06 06:51:36
...............
95419663.02002-11-19 21:02:20
96421325.02002-11-19 20:26:41
97421744.02004-06-29 21:35:21
98422484.02002-11-19 20:45:14
99422894.02002-11-19 20:38:55
\n", 821 | "

100 rows × 4 columns

\n", 822 | "
" 823 | ], 824 | "text/plain": [ 825 | " userId movieId rating timestamp\n", 826 | "0 1 169 2.5 2008-03-07 22:08:14\n", 827 | "1 1 2471 3.0 2008-03-07 22:03:58\n", 828 | "2 1 48516 5.0 2008-03-07 22:03:55\n", 829 | "3 2 2571 3.5 2015-07-06 06:50:33\n", 830 | "4 2 109487 4.0 2015-07-06 06:51:36\n", 831 | ".. ... ... ... ...\n", 832 | "95 4 1966 3.0 2002-11-19 21:02:20\n", 833 | "96 4 2132 5.0 2002-11-19 20:26:41\n", 834 | "97 4 2174 4.0 2004-06-29 21:35:21\n", 835 | "98 4 2248 4.0 2002-11-19 20:45:14\n", 836 | "99 4 2289 4.0 2002-11-19 20:38:55\n", 837 | "\n", 838 | "[100 rows x 4 columns]" 839 | ] 840 | }, 841 | "execution_count": 33, 842 | "metadata": {}, 843 | "output_type": "execute_result" 844 | } 845 | ], 846 | "source": [ 847 | "df" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": 32, 853 | "metadata": { 854 | "collapsed": false 855 | }, 856 | "outputs": [ 857 | { 858 | "ename": "SyntaxError", 859 | "evalue": "invalid syntax (, line 3)", 860 | "output_type": "error", 861 | "traceback": [ 862 | "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m3\u001b[0m\n\u001b[0;31m =======\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" 863 | ] 864 | } 865 | ], 866 | "source": [ 867 | "# %load ml-latest/README.txt\n", 868 | "Summary\n", 869 | "=======\n", 870 | "\n", 871 | "This dataset (ml-latest) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 22884377 ratings and 586994 tag applications across 34208 movies. These data were created by 247753 users between January 09, 1995 and January 29, 2016. This dataset was generated on January 29, 2016.\n", 872 | "\n", 873 | "Users were selected at random for inclusion. All selected users had rated at least 1 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.\n", 874 | "\n", 875 | "The data are contained in four files, `links.csv`, `movies.csv`, `ratings.csv` and `tags.csv`. More details about the contents and use of all these files follows.\n", 876 | "\n", 877 | "This is a *development* dataset. As such, it may change over time and is not an appropriate dataset for shared research results. See available *benchmark* datasets if that is your intent.\n", 878 | "\n", 879 | "This and other GroupLens data sets are publicly available for download at .\n", 880 | "\n", 881 | "\n", 882 | "Usage License\n", 883 | "=============\n", 884 | "\n", 885 | "Neither the University of Minnesota nor any of the researchers involved can guarantee the correctness of the data, its suitability for any particular purpose, or the validity of results based on the use of the data set. 
The data set may be used for any research purposes under the following conditions:\n", 886 | "\n", 887 | "* The user may not state or imply any endorsement from the University of Minnesota or the GroupLens Research Group.\n", 888 | "* The user must acknowledge the use of the data set in publications resulting from the use of the data set (see below for citation information).\n", 889 | "* The user may not redistribute the data without separate permission.\n", 890 | "* The user may not use this information for any commercial or revenue-bearing purposes without first obtaining permission from a faculty member of the GroupLens Research Project at the University of Minnesota.\n", 891 | "* The executable software scripts are provided \"as is\" without warranty of any kind, either expressed or implied, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose. The entire risk as to the quality and performance of them is with you. Should the program prove defective, you assume the cost of all necessary servicing, repair or correction.\n", 892 | "\n", 893 | "In no event shall the University of Minnesota, its affiliates or employees be liable to you for any damages arising out of the use or inability to use these programs (including but not limited to loss of data or data being rendered inaccurate).\n", 894 | "\n", 895 | "If you have any further questions or comments, please email \n", 896 | "\n", 897 | "\n", 898 | "Citation\n", 899 | "========\n", 900 | "\n", 901 | "To acknowledge use of the dataset in publications, please cite the following paper:\n", 902 | "\n", 903 | "> F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages. DOI=\n", 904 | "\n", 905 | "\n", 906 | "Further Information About GroupLens\n", 907 | "===================================\n", 908 | "\n", 909 | "GroupLens is a research group in the Department of Computer Science and Engineering at the University of Minnesota. Since its inception in 1992, GroupLens's research projects have explored a variety of fields including:\n", 910 | "\n", 911 | "* recommender systems\n", 912 | "* online communities\n", 913 | "* mobile and ubiquitious technologies\n", 914 | "* digital libraries\n", 915 | "* local geographic information systems\n", 916 | "\n", 917 | "GroupLens Research operates a movie recommender based on collaborative filtering, MovieLens, which is the source of these data. We encourage you to visit to try it out! If you have exciting ideas for experimental work to conduct on MovieLens, send us an email at - we are always interested in working with external collaborators.\n", 918 | "\n", 919 | "\n", 920 | "Content and Use of Files\n", 921 | "========================\n", 922 | "\n", 923 | "Formatting and Encoding\n", 924 | "-----------------------\n", 925 | "\n", 926 | "The dataset files are written as [comma-separated values](http://en.wikipedia.org/wiki/Comma-separated_values) files with a single header row. Columns that contain commas (`,`) are escaped using double-quotes (`\"`). These files are encoded as UTF-8. If accented characters in movie titles or tag values (e.g. 
Misérables, Les (1995)) display incorrectly, make sure that any program reading the data, such as a text editor, terminal, or script, is configured for UTF-8.\n", 927 | "\n", 928 | "User Ids\n", 929 | "--------\n", 930 | "\n", 931 | "MovieLens users were selected at random for inclusion. Their ids have been anonymized. User ids are consistent between `ratings.csv` and `tags.csv` (i.e., the same id refers to the same user across the two files).\n", 932 | "\n", 933 | "Movie Ids\n", 934 | "---------\n", 935 | "\n", 936 | "Only movies with at least one rating or tag are included in the dataset. These movie ids are consistent with those used on the MovieLens web site (e.g., id `1` corresponds to the URL ). Movie ids are consistent between `ratings.csv`, `tags.csv`, `movies.csv`, and `links.csv` (i.e., the same id refers to the same movie across these four data files).\n", 937 | "\n", 938 | "\n", 939 | "Ratings Data File Structure (ratings.csv)\n", 940 | "-----------------------------------------\n", 941 | "\n", 942 | "All ratings are contained in the file `ratings.csv`. Each line of this file after the header row represents one rating of one movie by one user, and has the following format:\n", 943 | "\n", 944 | " userId,movieId,rating,timestamp\n", 945 | "\n", 946 | "The lines within this file are ordered first by userId, then, within user, by movieId.\n", 947 | "\n", 948 | "Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).\n", 949 | "\n", 950 | "Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.\n", 951 | "\n", 952 | "Tags Data File Structure (tags.csv)\n", 953 | "-----------------------------------\n", 954 | "\n", 955 | "All tags are contained in the file `tags.csv`. Each line of this file after the header row represents one tag applied to one movie by one user, and has the following format:\n", 956 | "\n", 957 | " userId,movieId,tag,timestamp\n", 958 | "\n", 959 | "The lines within this file are ordered first by userId, then, within user, by movieId.\n", 960 | "\n", 961 | "Tags are user-generated metadata about movies. Each tag is typically a single word or short phrase. The meaning, value, and purpose of a particular tag is determined by each user.\n", 962 | "\n", 963 | "Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.\n", 964 | "\n", 965 | "Movies Data File Structure (movies.csv)\n", 966 | "---------------------------------------\n", 967 | "\n", 968 | "Movie information is contained in the file `movies.csv`. Each line of this file after the header row represents one movie, and has the following format:\n", 969 | "\n", 970 | " movieId,title,genres\n", 971 | "\n", 972 | "Movie titles are entered manually or imported from , and include the year of release in parentheses. 
Errors and inconsistencies may exist in these titles.\n", 973 | "\n", 974 | "Genres are a pipe-separated list, and are selected from the following:\n", 975 | "\n", 976 | "* Action\n", 977 | "* Adventure\n", 978 | "* Animation\n", 979 | "* Children's\n", 980 | "* Comedy\n", 981 | "* Crime\n", 982 | "* Documentary\n", 983 | "* Drama\n", 984 | "* Fantasy\n", 985 | "* Film-Noir\n", 986 | "* Horror\n", 987 | "* Musical\n", 988 | "* Mystery\n", 989 | "* Romance\n", 990 | "* Sci-Fi\n", 991 | "* Thriller\n", 992 | "* War\n", 993 | "* Western\n", 994 | "* (no genres listed)\n", 995 | "\n", 996 | "Links Data File Structure (links.csv)\n", 997 | "---------------------------------------\n", 998 | "\n", 999 | "Identifiers that can be used to link to other sources of movie data are contained in the file `links.csv`. Each line of this file after the header row represents one movie, and has the following format:\n", 1000 | "\n", 1001 | " movieId,imdbId,tmdbId\n", 1002 | "\n", 1003 | "movieId is an identifier for movies used by . E.g., the movie Toy Story has the link .\n", 1004 | "\n", 1005 | "imdbId is an identifier for movies used by . E.g., the movie Toy Story has the link .\n", 1006 | "\n", 1007 | "tmdbId is an identifier for movies used by . E.g., the movie Toy Story has the link .\n", 1008 | "\n", 1009 | "Use of the resources listed above is subject to the terms of each provider.\n", 1010 | "\n", 1011 | "Cross-Validation\n", 1012 | "----------------\n", 1013 | "\n", 1014 | "Prior versions of the MovieLens dataset included either pre-computed cross-folds or scripts to perform this computation. We no longer bundle either of these features with the dataset, since most modern toolkits provide this as a built-in feature. If you wish to learn about standard approaches to cross-fold computation in the context of recommender systems evaluation, see [LensKit](http://lenskit.org) for tools, documentation, and open-source code examples.\n" 1015 | ] 1016 | }, 1017 | { 1018 | "cell_type": "code", 1019 | "execution_count": null, 1020 | "metadata": { 1021 | "collapsed": true 1022 | }, 1023 | "outputs": [], 1024 | "source": [] 1025 | }, 1026 | { 1027 | "cell_type": "code", 1028 | "execution_count": null, 1029 | "metadata": { 1030 | "collapsed": true 1031 | }, 1032 | "outputs": [], 1033 | "source": [] 1034 | } 1035 | ], 1036 | "metadata": { 1037 | "kernelspec": { 1038 | "display_name": "Python 3", 1039 | "language": "python", 1040 | "name": "python3" 1041 | }, 1042 | "language_info": { 1043 | "codemirror_mode": { 1044 | "name": "ipython", 1045 | "version": 3 1046 | }, 1047 | "file_extension": ".py", 1048 | "mimetype": "text/x-python", 1049 | "name": "python", 1050 | "nbconvert_exporter": "python", 1051 | "pygments_lexer": "ipython3", 1052 | "version": "3.5.1" 1053 | } 1054 | }, 1055 | "nbformat": 4, 1056 | "nbformat_minor": 0 1057 | } 1058 | -------------------------------------------------------------------------------- /prep.py: -------------------------------------------------------------------------------- 1 | import os 2 | from functools import wraps 3 | import pandas as pd 4 | from sklearn.externals import joblib 5 | 6 | def _repr_html_(self): 7 | self = self.copy() 8 | 9 | if self.index.nlevels > 1: 10 | return None 11 | else: 12 | name = self.index.name or 'index' 13 | if self.columns.name is None: 14 | self.columns.name = name 15 | 16 | max_rows = pd.get_option("display.max_rows") 17 | max_cols = pd.get_option("display.max_columns") 18 | show_dimensions = pd.get_option("display.show_dimensions") 19 | 20 | 
return self.to_html(max_rows=max_rows, max_cols=max_cols, 21 | show_dimensions=show_dimensions, notebook=True) 22 | 23 | if int(os.environ.get("MODERN_PANDAS_EPUB", 0)): 24 | pd.DataFrame._repr_html_ = _repr_html_ 25 | 26 | 27 | def cached(name): 28 | def deco(func): 29 | 30 | @wraps(func) 31 | def wrapper(*args, **kwargs): 32 | os.makedirs('models', exist_ok=True) 33 | cache = os.path.join('models', name + '.pkl') 34 | if os.path.exists(cache): 35 | return joblib.load(cache) 36 | result = func(*args, **kwargs) 37 | joblib.dump(result, cache) 38 | return result 39 | return wrapper 40 | return deco 41 | -------------------------------------------------------------------------------- /url_4.txt: -------------------------------------------------------------------------------- 1 | UserTableName=Master_Coordinate&DBShortName=Aviation_Support_Tables&RawDataTable=T_MASTER_CORD&sqlstr=+SELECT+AIRPORT_ID%2CAIRPORT%2CDISPLAY_AIRPORT_CITY_NAME_FULL%2CAIRPORT_STATE_CODE%2CLATITUDE%2CLONGITUDE+FROM++T_MASTER_CORD&varlist=AIRPORT_ID%2CAIRPORT%2CDISPLAY_AIRPORT_CITY_NAME_FULL%2CAIRPORT_STATE_CODE%2CLATITUDE%2CLONGITUDE&grouplist=&suml=&sumRegion=&filter1=title%3D&filter2=title%3D&geo=Not+Applicable&time=Not+Applicable&timename=N%2FA&GEOGRAPHY=All&XYEAR=All&FREQUENCY=All&VarDesc=AirportSeqID&VarType=Num&VarName=AIRPORT_ID&VarDesc=AirportID&VarType=Num&VarName=AIRPORT&VarDesc=Airport&VarType=Char&VarDesc=AirportName&VarType=Char&VarName=DISPLAY_AIRPORT_CITY_NAME_FULL&VarDesc=AirportCityName&VarType=Char&VarDesc=AirportWacSeqID2&VarType=Num&VarDesc=AirportWac&VarType=Num&VarDesc=AirportCountryName&VarType=Char&VarDesc=AirportCountryCodeISO&VarType=Char&VarDesc=AirportStateName&VarType=Char&VarName=AIRPORT_STATE_CODE&VarDesc=AirportStateCode&VarType=Char&VarDesc=AirportStateFips&VarType=Char&VarDesc=CityMarketSeqID&VarType=Num&VarDesc=CityMarketID&VarType=Num&VarDesc=CityMarketName&VarType=Char&VarDesc=CityMarketWacSeqID2&VarType=Num&VarDesc=CityMarketWac&VarType=Num&VarDesc=LatDegrees&VarType=Num&VarDesc=LatHemisphere&VarType=Char&VarDesc=LatMinutes&VarType=Num&VarDesc=LatSeconds&VarType=Num&VarName=LATITUDE&VarDesc=Latitude&VarType=Num&VarDesc=LonDegrees&VarType=Num&VarDesc=LonHemisphere&VarType=Char&VarDesc=LonMinutes&VarType=Num&VarDesc=LonSeconds&VarType=Num&VarName=LONGITUDE&VarDesc=Longitude&VarType=Num&VarDesc=UTCLocalTimeVariation&VarType=Char&VarDesc=AirportStartDate&VarType=Char&VarDesc=AirportEndDate&VarType=Char&VarDesc=AirportIsClosed&VarType=Num&VarDesc=AirportIsLatest&VarType=Num 2 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import calendar 3 | 4 | 5 | def download_airports(): 6 | cookies = { 7 | 'ASPSESSIONIDCAQCSDSS': 'GOBIGIDBLGILGICKLFHKIHMN', 8 | '__utmt_ritaTracker': '1', 9 | '__utmt_GSA_CP': '1', 10 | '__utma': '261918792.554646962.1504352085.1504352085.1504352085.1', 11 | '__utmb': '261918792.2.10.1504352085', 12 | '__utmc': '261918792', 13 | '__utmz': '261918792.1504352085.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 14 | } 15 | headers = { 16 | 'Origin': 'https://www.transtats.bts.gov', 17 | 'Accept-Encoding': 'gzip, deflate, br', 18 | 'Accept-Language': 'en-US,en;q=0.8', 19 | 'Upgrade-Insecure-Requests': '1', 20 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36', 21 | 'Content-Type': 'application/x-www-form-urlencoded', 22 | 'Accept': 
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 23 | 'Cache-Control': 'max-age=0', 24 | 'Referer': 'https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=288&DB_Short_Name=Aviation%20Support%20Tables', 25 | 'Connection': 'keep-alive', 26 | } 27 | 28 | params = ( 29 | ('Table_ID', '288'), 30 | ('Has_Group', '0'), 31 | ('Is_Zipped', '0'), 32 | ) 33 | 34 | data = [ 35 | ('UserTableName', 'Master_Coordinate'), 36 | ('DBShortName', 'Aviation_Support_Tables'), 37 | ('RawDataTable', 'T_MASTER_CORD'), 38 | ('sqlstr', ' SELECT AIRPORT_ID,AIRPORT,DISPLAY_AIRPORT_NAME,DISPLAY_AIRPORT_CITY_NAME_FULL,LATITUDE,LONGITUDE FROM T_MASTER_CORD'), 39 | ('varlist', 'AIRPORT_ID,AIRPORT,DISPLAY_AIRPORT_NAME,DISPLAY_AIRPORT_CITY_NAME_FULL,LATITUDE,LONGITUDE'), 40 | ('grouplist', ''), 41 | ('suml', ''), 42 | ('sumRegion', ''), 43 | ('filter1', 'title='), 44 | ('filter2', 'title='), 45 | ('geo', 'Not Applicable'), 46 | ('time', 'Not Applicable'), 47 | ('timename', 'N/A'), 48 | ('GEOGRAPHY', 'All'), 49 | ('XYEAR', 'All'), 50 | ('FREQUENCY', 'All'), 51 | ('VarDesc', 'AirportSeqID'), 52 | ('VarDesc', 'AirportID'), 53 | ('VarDesc', 'Airport'), 54 | ('VarDesc', 'AirportName'), 55 | ('VarDesc', 'AirportCityName'), 56 | ('VarDesc', 'AirportWacSeqID2'), 57 | ('VarDesc', 'AirportWac'), 58 | ('VarDesc', 'AirportCountryName'), 59 | ('VarDesc', 'AirportCountryCodeISO'), 60 | ('VarDesc', 'AirportStateName'), 61 | ('VarDesc', 'AirportStateCode'), 62 | ('VarDesc', 'AirportStateFips'), 63 | ('VarDesc', 'CityMarketSeqID'), 64 | ('VarDesc', 'CityMarketID'), 65 | ('VarDesc', 'CityMarketName'), 66 | ('VarDesc', 'CityMarketWacSeqID2'), 67 | ('VarDesc', 'CityMarketWac'), 68 | ('VarDesc', 'LatDegrees'), 69 | ('VarDesc', 'LatHemisphere'), 70 | ('VarDesc', 'LatMinutes'), 71 | ('VarDesc', 'LatSeconds'), 72 | ('VarDesc', 'Latitude'), 73 | ('VarDesc', 'LonDegrees'), 74 | ('VarDesc', 'LonHemisphere'), 75 | ('VarDesc', 'LonMinutes'), 76 | ('VarDesc', 'LonSeconds'), 77 | ('VarDesc', 'Longitude'), 78 | ('VarDesc', 'UTCLocalTimeVariation'), 79 | ('VarDesc', 'AirportStartDate'), 80 | ('VarDesc', 'AirportEndDate'), 81 | ('VarDesc', 'AirportIsClosed'), 82 | ('VarDesc', 'AirportIsLatest'), 83 | ('VarType', 'Num'), 84 | ('VarType', 'Num'), 85 | ('VarType', 'Char'), 86 | ('VarType', 'Char'), 87 | ('VarType', 'Char'), 88 | ('VarType', 'Num'), 89 | ('VarType', 'Num'), 90 | ('VarType', 'Char'), 91 | ('VarType', 'Char'), 92 | ('VarType', 'Char'), 93 | ('VarType', 'Char'), 94 | ('VarType', 'Char'), 95 | ('VarType', 'Num'), 96 | ('VarType', 'Num'), 97 | ('VarType', 'Char'), 98 | ('VarType', 'Num'), 99 | ('VarType', 'Num'), 100 | ('VarType', 'Num'), 101 | ('VarType', 'Char'), 102 | ('VarType', 'Num'), 103 | ('VarType', 'Num'), 104 | ('VarType', 'Num'), 105 | ('VarType', 'Num'), 106 | ('VarType', 'Char'), 107 | ('VarType', 'Num'), 108 | ('VarType', 'Num'), 109 | ('VarType', 'Num'), 110 | ('VarType', 'Char'), 111 | ('VarType', 'Char'), 112 | ('VarType', 'Char'), 113 | ('VarType', 'Num'), 114 | ('VarType', 'Num'), 115 | ('VarName', 'AIRPORT_ID'), 116 | ('VarName', 'AIRPORT'), 117 | ('VarName', 'DISPLAY_AIRPORT_NAME'), 118 | ('VarName', 'DISPLAY_AIRPORT_CITY_NAME_FULL'), 119 | ('VarName', 'LATITUDE'), 120 | ('VarName', 'LONGITUDE'), 121 | ] 122 | 123 | r = requests.post('https://www.transtats.bts.gov/DownLoad_Table.asp', headers=headers, params=params, cookies=cookies, data=data) 124 | with open("data/airports.csv.zip", "wb") as f:  # binary write mode: r.content is bytes 125 | f.write(r.content) 126 | 127 | 128 | def download_timeseries(date): 129 | month_name = calendar.month_name[date.month]
130 | year = date.year 131 | month = date.month 132 | data = [ 133 | ('UserTableName', 'On_Time_Performance'), 134 | ('DBShortName', 'On_Time'), 135 | ('RawDataTable', 'T_ONTIME'), 136 | ('sqlstr', ' SELECT FL_DATE,ORIGIN,CRS_DEP_TIME,DEP_TIME,CRS_ARR_TIME,ARR_TIME FROM T_ONTIME WHERE Month ={} AND YEAR={}'.format(month, year)), 137 | ('varlist', 'FL_DATE,ORIGIN,CRS_DEP_TIME,DEP_TIME,CRS_ARR_TIME,ARR_TIME'), 138 | ('filter1', 'title='), 139 | ('filter2', 'title='), 140 | ('geo', 'All'), 141 | ('time', month_name), 142 | ('timename', 'Month'), 143 | ('GEOGRAPHY', 'All'), 144 | ('XYEAR', str(year)), 145 | ('FREQUENCY', '1'), 146 | ('VarDesc', 'Year'), 147 | ('VarType', 'Num'), 148 | ('VarDesc', 'Quarter'), 149 | ('VarType', 'Num'), 150 | ('VarDesc', 'Month'), 151 | ('VarType', 'Num'), 152 | ('VarDesc', 'DayofMonth'), 153 | ('VarType', 'Num'), 154 | ('VarDesc', 'DayOfWeek'), 155 | ('VarType', 'Num'), 156 | ('VarName', 'FL_DATE'), 157 | ('VarDesc', 'FlightDate'), 158 | ('VarType', 'Char'), 159 | ('VarDesc', 'UniqueCarrier'), 160 | ('VarType', 'Char'), 161 | ('VarDesc', 'AirlineID'), 162 | ('VarType', 'Num'), 163 | ('VarDesc', 'Carrier'), 164 | ('VarType', 'Char'), 165 | ('VarDesc', 'TailNum'), 166 | ('VarType', 'Char'), 167 | ('VarDesc', 'FlightNum'), 168 | ('VarType', 'Char'), 169 | ('VarDesc', 'OriginAirportID'), 170 | ('VarType', 'Num'), 171 | ('VarDesc', 'OriginAirportSeqID'), 172 | ('VarType', 'Num'), 173 | ('VarDesc', 'OriginCityMarketID'), 174 | ('VarType', 'Num'), 175 | ('VarName', 'ORIGIN'), 176 | ('VarDesc', 'Origin'), 177 | ('VarType', 'Char'), 178 | ('VarDesc', 'OriginCityName'), 179 | ('VarType', 'Char'), 180 | ('VarDesc', 'OriginState'), 181 | ('VarType', 'Char'), 182 | ('VarDesc', 'OriginStateFips'), 183 | ('VarType', 'Char'), 184 | ('VarDesc', 'OriginStateName'), 185 | ('VarType', 'Char'), 186 | ('VarDesc', 'OriginWac'), 187 | ('VarType', 'Num'), 188 | ('VarDesc', 'DestAirportID'), 189 | ('VarType', 'Num'), 190 | ('VarDesc', 'DestAirportSeqID'), 191 | ('VarType', 'Num'), 192 | ('VarDesc', 'DestCityMarketID'), 193 | ('VarType', 'Num'), 194 | ('VarDesc', 'Dest'), 195 | ('VarType', 'Char'), 196 | ('VarDesc', 'DestCityName'), 197 | ('VarType', 'Char'), 198 | ('VarDesc', 'DestState'), 199 | ('VarType', 'Char'), 200 | ('VarDesc', 'DestStateFips'), 201 | ('VarType', 'Char'), 202 | ('VarDesc', 'DestStateName'), 203 | ('VarType', 'Char'), 204 | ('VarDesc', 'DestWac'), 205 | ('VarType', 'Num'), 206 | ('VarName', 'CRS_DEP_TIME'), 207 | ('VarDesc', 'CRSDepTime'), 208 | ('VarType', 'Char'), 209 | ('VarName', 'DEP_TIME'), 210 | ('VarDesc', 'DepTime'), 211 | ('VarType', 'Char'), 212 | ('VarDesc', 'DepDelay'), 213 | ('VarType', 'Num'), 214 | ('VarDesc', 'DepDelayMinutes'), 215 | ('VarType', 'Num'), 216 | ('VarDesc', 'DepDel15'), 217 | ('VarType', 'Num'), 218 | ('VarDesc', 'DepartureDelayGroups'), 219 | ('VarType', 'Num'), 220 | ('VarDesc', 'DepTimeBlk'), 221 | ('VarType', 'Char'), 222 | ('VarDesc', 'TaxiOut'), 223 | ('VarType', 'Num'), 224 | ('VarDesc', 'WheelsOff'), 225 | ('VarType', 'Char'), 226 | ('VarDesc', 'WheelsOn'), 227 | ('VarType', 'Char'), 228 | ('VarDesc', 'TaxiIn'), 229 | ('VarType', 'Num'), 230 | ('VarName', 'CRS_ARR_TIME'), 231 | ('VarDesc', 'CRSArrTime'), 232 | ('VarType', 'Char'), 233 | ('VarName', 'ARR_TIME'), 234 | ('VarDesc', 'ArrTime'), 235 | ('VarType', 'Char'), 236 | ('VarDesc', 'ArrDelay'), 237 | ('VarType', 'Num'), 238 | ('VarDesc', 'ArrDelayMinutes'), 239 | ('VarType', 'Num'), 240 | ('VarDesc', 'ArrDel15'), 241 | ('VarType', 'Num'), 242 
| ('VarDesc', 'ArrivalDelayGroups'), 243 | ('VarType', 'Num'), 244 | ('VarDesc', 'ArrTimeBlk'), 245 | ('VarType', 'Char'), 246 | ('VarDesc', 'Cancelled'), 247 | ('VarType', 'Num'), 248 | ('VarDesc', 'CancellationCode'), 249 | ('VarType', 'Char'), 250 | ('VarDesc', 'Diverted'), 251 | ('VarType', 'Num'), 252 | ('VarDesc', 'CRSElapsedTime'), 253 | ('VarType', 'Num'), 254 | ('VarDesc', 'ActualElapsedTime'), 255 | ('VarType', 'Num'), 256 | ('VarDesc', 'AirTime'), 257 | ('VarType', 'Num'), 258 | ('VarDesc', 'Flights'), 259 | ('VarType', 'Num'), 260 | ('VarDesc', 'Distance'), 261 | ('VarType', 'Num'), 262 | ('VarDesc', 'DistanceGroup'), 263 | ('VarType', 'Num'), 264 | ('VarDesc', 'CarrierDelay'), 265 | ('VarType', 'Num'), 266 | ('VarDesc', 'WeatherDelay'), 267 | ('VarType', 'Num'), 268 | ('VarDesc', 'NASDelay'), 269 | ('VarType', 'Num'), 270 | ('VarDesc', 'SecurityDelay'), 271 | ('VarType', 'Num'), 272 | ('VarDesc', 'LateAircraftDelay'), 273 | ('VarType', 'Num'), 274 | ('VarDesc', 'FirstDepTime'), 275 | ('VarType', 'Char'), 276 | ('VarDesc', 'TotalAddGTime'), 277 | ('VarType', 'Num'), 278 | ('VarDesc', 'LongestAddGTime'), 279 | ('VarType', 'Num'), 280 | ('VarDesc', 'DivAirportLandings'), 281 | ('VarType', 'Num'), 282 | ('VarDesc', 'DivReachedDest'), 283 | ('VarType', 'Num'), 284 | ('VarDesc', 'DivActualElapsedTime'), 285 | ('VarType', 'Num'), 286 | ('VarDesc', 'DivArrDelay'), 287 | ('VarType', 'Num'), 288 | ('VarDesc', 'DivDistance'), 289 | ('VarType', 'Num'), 290 | ('VarDesc', 'Div1Airport'), 291 | ('VarType', 'Char'), 292 | ('VarDesc', 'Div1AirportID'), 293 | ('VarType', 'Num'), 294 | ('VarDesc', 'Div1AirportSeqID'), 295 | ('VarType', 'Num'), 296 | ('VarDesc', 'Div1WheelsOn'), 297 | ('VarType', 'Char'), 298 | ('VarDesc', 'Div1TotalGTime'), 299 | ('VarType', 'Num'), 300 | ('VarDesc', 'Div1LongestGTime'), 301 | ('VarType', 'Num'), 302 | ('VarDesc', 'Div1WheelsOff'), 303 | ('VarType', 'Char'), 304 | ('VarDesc', 'Div1TailNum'), 305 | ('VarType', 'Char'), 306 | ('VarDesc', 'Div2Airport'), 307 | ('VarType', 'Char'), 308 | ('VarDesc', 'Div2AirportID'), 309 | ('VarType', 'Num'), 310 | ('VarDesc', 'Div2AirportSeqID'), 311 | ('VarType', 'Num'), 312 | ('VarDesc', 'Div2WheelsOn'), 313 | ('VarType', 'Char'), 314 | ('VarDesc', 'Div2TotalGTime'), 315 | ('VarType', 'Num'), 316 | ('VarDesc', 'Div2LongestGTime'), 317 | ('VarType', 'Num'), 318 | ('VarDesc', 'Div2WheelsOff'), 319 | ('VarType', 'Char'), 320 | ('VarDesc', 'Div2TailNum'), 321 | ('VarType', 'Char'), 322 | ('VarDesc', 'Div3Airport'), 323 | ('VarType', 'Char'), 324 | ('VarDesc', 'Div3AirportID'), 325 | ('VarType', 'Num'), 326 | ('VarDesc', 'Div3AirportSeqID'), 327 | ('VarType', 'Num'), 328 | ('VarDesc', 'Div3WheelsOn'), 329 | ('VarType', 'Char'), 330 | ('VarDesc', 'Div3TotalGTime'), 331 | ('VarType', 'Num'), 332 | ('VarDesc', 'Div3LongestGTime'), 333 | ('VarType', 'Num'), 334 | ('VarDesc', 'Div3WheelsOff'), 335 | ('VarType', 'Char'), 336 | ('VarDesc', 'Div3TailNum'), 337 | ('VarType', 'Char'), 338 | ('VarDesc', 'Div4Airport'), 339 | ('VarType', 'Char'), 340 | ('VarDesc', 'Div4AirportID'), 341 | ('VarType', 'Num'), 342 | ('VarDesc', 'Div4AirportSeqID'), 343 | ('VarType', 'Num'), 344 | ('VarDesc', 'Div4WheelsOn'), 345 | ('VarType', 'Char'), 346 | ('VarDesc', 'Div4TotalGTime'), 347 | ('VarType', 'Num'), 348 | ('VarDesc', 'Div4LongestGTime'), 349 | ('VarType', 'Num'), 350 | ('VarDesc', 'Div4WheelsOff'), 351 | ('VarType', 'Char'), 352 | ('VarDesc', 'Div4TailNum'), 353 | ('VarType', 'Char'), 354 | ('VarDesc', 'Div5Airport'), 355 | ('VarType', 'Char'), 356 | 
('VarDesc', 'Div5AirportID'), 357 | ('VarType', 'Num'), 358 | ('VarDesc', 'Div5AirportSeqID'), 359 | ('VarType', 'Num'), 360 | ('VarDesc', 'Div5WheelsOn'), 361 | ('VarType', 'Char'), 362 | ('VarDesc', 'Div5TotalGTime'), 363 | ('VarType', 'Num'), 364 | ('VarDesc', 'Div5LongestGTime'), 365 | ('VarType', 'Num'), 366 | ('VarDesc', 'Div5WheelsOff'), 367 | ('VarType', 'Char'), 368 | ('VarDesc', 'Div5TailNum'), 369 | ('VarType', 'Char') 370 | ] 371 | cookies = { 372 | 'ASPSESSIONIDCAQCSDSS': 'GOBIGIDBLGILGICKLFHKIHMN', 373 | '__utmt_ritaTracker': '1', 374 | '__utmt_GSA_CP': '1', 375 | '__utma': '261918792.554646962.1504352085.1504442392.1504442407.3', 376 | '__utmb': '261918792.8.10.1504442407', 377 | '__utmc': '261918792', 378 | '__utmz': '261918792.1504442407.3.2.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided)', 379 | } 380 | 381 | headers = { 382 | 'Origin': 'https://www.transtats.bts.gov', 383 | 'Accept-Encoding': 'gzip, deflate, br', 384 | 'Accept-Language': 'en-US,en;q=0.8', 385 | 'Upgrade-Insecure-Requests': '1', 386 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36', 387 | 'Content-Type': 'application/x-www-form-urlencoded', 388 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 389 | 'Cache-Control': 'max-age=0', 390 | 'Referer': 'https://www.transtats.bts.gov/DL_SelectFields.asp', 391 | 'Connection': 'keep-alive', 392 | } 393 | 394 | params = ( 395 | ('Table_ID', '236'), 396 | ('Has_Group', '3'), 397 | ('Is_Zipped', '0'), 398 | ) 399 | 400 | r = requests.post('https://www.transtats.bts.gov/DownLoad_Table.asp',  # the BTS form endpoint expects a POST with the form fields in the body, as in download_airports 401 | headers=headers, params=params, 402 | cookies=cookies, data=data) 403 | with open("data/timeseries/{:%Y-%m}.zip".format(date.to_timestamp()), "wb") as f: 404 | f.write(r.content) 405 | --------------------------------------------------------------------------------
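
The ratings and tags layouts documented in the MovieLens README above map directly onto pandas readers. A minimal loading sketch, assuming the CSVs have been unzipped into a hypothetical data/ml-latest-small/ directory; per the README, timestamps are seconds since the Unix epoch, hence unit="s":

    import pandas as pd

    # userId,movieId,rating,timestamp -- ratings on a 0.5-5.0 star scale
    ratings = pd.read_csv("data/ml-latest-small/ratings.csv")
    ratings["timestamp"] = pd.to_datetime(ratings["timestamp"], unit="s")

    # userId,movieId,tag,timestamp -- free-text, user-generated tags
    tags = pd.read_csv("data/ml-latest-small/tags.csv")
    tags["timestamp"] = pd.to_datetime(tags["timestamp"], unit="s")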
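
movies.csv and links.csv follow suit: genres are a pipe-separated list, and links.csv maps each movieId to its IMDb and TMDb identifiers. A sketch under the same path assumption:

    import pandas as pd

    movies = pd.read_csv("data/ml-latest-small/movies.csv")
    # expand the pipe-separated genre strings into one indicator column per genre
    genre_indicators = movies["genres"].str.get_dummies(sep="|")

    # one row per movie: movieId,imdbId,tmdbId; join on the shared movieId key
    links = pd.read_csv("data/ml-latest-small/links.csv")
    movies = movies.merge(links, on="movieId", how="left")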
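
The cached decorator in prep.py pickles a function's return value to models/<name>.pkl with joblib and short-circuits to that pickle on later calls. Note that the cache key is the name alone, so changed arguments do not invalidate a stale result. A usage sketch with a hypothetical fit function, assuming it runs from the repository root so prep is importable:

    import numpy as np
    from sklearn.linear_model import LinearRegression
    from prep import cached

    @cached("linear-model")  # hypothetical name; result lands in models/linear-model.pkl
    def fit():
        X = np.arange(10).reshape(-1, 1)
        return LinearRegression().fit(X, np.arange(10))

    model = fit()  # first call fits and pickles; subsequent calls load the pickle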
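
download_timeseries accepts anything exposing .year, .month, and .to_timestamp(), which in practice means a pandas Period, and writes one zip of BTS on-time data per month. A calling sketch, assuming the output directory is created first and noting that the hard-coded session cookies in utils.py may need refreshing before the request succeeds:

    import os
    import pandas as pd
    from utils import download_timeseries

    os.makedirs("data/timeseries", exist_ok=True)
    for month in pd.period_range("2017-01", "2017-03", freq="M"):
        download_timeseries(month)  # writes data/timeseries/YYYY-MM.zip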