├── .gitignore
├── README.md
├── Tidy Data.ipynb
└── data
    ├── 2014-baby-names-illinois.csv
    ├── 2015-baby-names-illinois.csv
    ├── billboard.csv
    ├── pew-raw.csv
    ├── tb-raw.csv
    ├── weather-raw.csv
    └── weather.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | data/*.tex
3 | data/*.r
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tidy Data in Python
2 |
3 | **Author**
4 | Jean-Nicholas Hould
5 |
6 | **Description**
7 | This notebook demonstrates how to transform messy datasets into the tidy data format using Python and pandas.
8 |
9 | **Additional Information**
10 | For any additional details, please read my [blog post](http://www.jeannicholashould.com/tidy-data-in-python.html), which covers this notebook in detail.
11 |
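12 | **Example**
13 |
14 | The core operation used throughout the notebook is `pd.melt`, which turns column headers into values of a single variable. A minimal sketch (the toy columns below are illustrative, not one of the datasets in this repository):
15 |
16 | ```python
17 | import pandas as pd
18 |
19 | messy = pd.DataFrame({"name": ["a", "b"], "2014": [1, 2], "2015": [3, 4]})
20 | tidy = pd.melt(messy, id_vars=["name"], var_name="year", value_name="count")
21 | ```
22 |
23 | Each row of `tidy` now holds exactly one observation: a name, a year and a count.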
--------------------------------------------------------------------------------
/Tidy Data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Tidy Data in Python\n",
8 | "by [Jean-Nicholas Hould](http://www.jeannicholashould.com/)"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 6,
14 | "metadata": {
15 | "collapsed": false
16 | },
17 | "outputs": [],
18 | "source": [
19 | "import pandas as pd\n",
20 | "import datetime\n",
21 | "from os import listdir\n",
22 | "from os.path import isfile, join\n",
23 | "import glob\n",
24 | "import re"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Column headers are values, not variable names"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "metadata": {},
37 | "source": [
38 | "### Pew Research Center Dataset"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 7,
44 | "metadata": {
45 | "collapsed": false
46 | },
47 | "outputs": [
48 | {
49 | "data": {
50 | "text/html": [
51 | "
\n",
52 | "
\n",
53 | " \n",
54 | " \n",
55 | " | \n",
56 | " religion | \n",
57 | " <$10k | \n",
58 | " $10-20k | \n",
59 | " $20-30k | \n",
60 | " $30-40k | \n",
61 | " $40-50k | \n",
62 | " $50-75k | \n",
63 | "
\n",
64 | " \n",
65 | " \n",
66 | " \n",
67 | " 0 | \n",
68 | " Agnostic | \n",
69 | " 27 | \n",
70 | " 34 | \n",
71 | " 60 | \n",
72 | " 81 | \n",
73 | " 76 | \n",
74 | " 137 | \n",
75 | "
\n",
76 | " \n",
77 | " 1 | \n",
78 | " Atheist | \n",
79 | " 12 | \n",
80 | " 27 | \n",
81 | " 37 | \n",
82 | " 52 | \n",
83 | " 35 | \n",
84 | " 70 | \n",
85 | "
\n",
86 | " \n",
87 | " 2 | \n",
88 | " Buddhist | \n",
89 | " 27 | \n",
90 | " 21 | \n",
91 | " 30 | \n",
92 | " 34 | \n",
93 | " 33 | \n",
94 | " 58 | \n",
95 | "
\n",
96 | " \n",
97 | " 3 | \n",
98 | " Catholic | \n",
99 | " 418 | \n",
100 | " 617 | \n",
101 | " 732 | \n",
102 | " 670 | \n",
103 | " 638 | \n",
104 | " 1116 | \n",
105 | "
\n",
106 | " \n",
107 | " 4 | \n",
108 | " Dont know/refused | \n",
109 | " 15 | \n",
110 | " 14 | \n",
111 | " 15 | \n",
112 | " 11 | \n",
113 | " 10 | \n",
114 | " 35 | \n",
115 | "
\n",
116 | " \n",
117 | " 5 | \n",
118 | " Evangelical Prot | \n",
119 | " 575 | \n",
120 | " 869 | \n",
121 | " 1064 | \n",
122 | " 982 | \n",
123 | " 881 | \n",
124 | " 1486 | \n",
125 | "
\n",
126 | " \n",
127 | " 6 | \n",
128 | " Hindu | \n",
129 | " 1 | \n",
130 | " 9 | \n",
131 | " 7 | \n",
132 | " 9 | \n",
133 | " 11 | \n",
134 | " 34 | \n",
135 | "
\n",
136 | " \n",
137 | " 7 | \n",
138 | " Historically Black Prot | \n",
139 | " 228 | \n",
140 | " 244 | \n",
141 | " 236 | \n",
142 | " 238 | \n",
143 | " 197 | \n",
144 | " 223 | \n",
145 | "
\n",
146 | " \n",
147 | " 8 | \n",
148 | " Jehovahs Witness | \n",
149 | " 20 | \n",
150 | " 27 | \n",
151 | " 24 | \n",
152 | " 24 | \n",
153 | " 21 | \n",
154 | " 30 | \n",
155 | "
\n",
156 | " \n",
157 | " 9 | \n",
158 | " Jewish | \n",
159 | " 19 | \n",
160 | " 19 | \n",
161 | " 25 | \n",
162 | " 25 | \n",
163 | " 30 | \n",
164 | " 95 | \n",
165 | "
\n",
166 | " \n",
167 | "
\n",
168 | "
"
169 | ],
170 | "text/plain": [
171 | " religion <$10k $10-20k $20-30k $30-40k $40-50k \\\n",
172 | "0 Agnostic 27 34 60 81 76 \n",
173 | "1 Atheist 12 27 37 52 35 \n",
174 | "2 Buddhist 27 21 30 34 33 \n",
175 | "3 Catholic 418 617 732 670 638 \n",
176 | "4 Dont know/refused 15 14 15 11 10 \n",
177 | "5 Evangelical Prot 575 869 1064 982 881 \n",
178 | "6 Hindu 1 9 7 9 11 \n",
179 | "7 Historically Black Prot 228 244 236 238 197 \n",
180 | "8 Jehovahs Witness 20 27 24 24 21 \n",
181 | "9 Jewish 19 19 25 25 30 \n",
182 | "\n",
183 | " $50-75k \n",
184 | "0 137 \n",
185 | "1 70 \n",
186 | "2 58 \n",
187 | "3 1116 \n",
188 | "4 35 \n",
189 | "5 1486 \n",
190 | "6 34 \n",
191 | "7 223 \n",
192 | "8 30 \n",
193 | "9 95 "
194 | ]
195 | },
196 | "execution_count": 7,
197 | "metadata": {},
198 | "output_type": "execute_result"
199 | }
200 | ],
201 | "source": [
202 | "df = pd.read_csv(\"./data/pew-raw.csv\")\n",
203 | "df"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 8,
209 | "metadata": {
210 | "collapsed": false
211 | },
212 | "outputs": [
213 | {
214 | "data": {
215 | "text/html": [
216 | "\n",
217 | "
\n",
218 | " \n",
219 | " \n",
220 | " | \n",
221 | " religion | \n",
222 | " income | \n",
223 | " freq | \n",
224 | "
\n",
225 | " \n",
226 | " \n",
227 | " \n",
228 | " 0 | \n",
229 | " Agnostic | \n",
230 | " <$10k | \n",
231 | " 27 | \n",
232 | "
\n",
233 | " \n",
234 | " 30 | \n",
235 | " Agnostic | \n",
236 | " $30-40k | \n",
237 | " 81 | \n",
238 | "
\n",
239 | " \n",
240 | " 40 | \n",
241 | " Agnostic | \n",
242 | " $40-50k | \n",
243 | " 76 | \n",
244 | "
\n",
245 | " \n",
246 | " 50 | \n",
247 | " Agnostic | \n",
248 | " $50-75k | \n",
249 | " 137 | \n",
250 | "
\n",
251 | " \n",
252 | " 10 | \n",
253 | " Agnostic | \n",
254 | " $10-20k | \n",
255 | " 34 | \n",
256 | "
\n",
257 | " \n",
258 | " 20 | \n",
259 | " Agnostic | \n",
260 | " $20-30k | \n",
261 | " 60 | \n",
262 | "
\n",
263 | " \n",
264 | " 41 | \n",
265 | " Atheist | \n",
266 | " $40-50k | \n",
267 | " 35 | \n",
268 | "
\n",
269 | " \n",
270 | " 21 | \n",
271 | " Atheist | \n",
272 | " $20-30k | \n",
273 | " 37 | \n",
274 | "
\n",
275 | " \n",
276 | " 11 | \n",
277 | " Atheist | \n",
278 | " $10-20k | \n",
279 | " 27 | \n",
280 | "
\n",
281 | " \n",
282 | " 31 | \n",
283 | " Atheist | \n",
284 | " $30-40k | \n",
285 | " 52 | \n",
286 | "
\n",
287 | " \n",
288 | "
\n",
289 | "
"
290 | ],
291 | "text/plain": [
292 | " religion income freq\n",
293 | "0 Agnostic <$10k 27\n",
294 | "30 Agnostic $30-40k 81\n",
295 | "40 Agnostic $40-50k 76\n",
296 | "50 Agnostic $50-75k 137\n",
297 | "10 Agnostic $10-20k 34\n",
298 | "20 Agnostic $20-30k 60\n",
299 | "41 Atheist $40-50k 35\n",
300 | "21 Atheist $20-30k 37\n",
301 | "11 Atheist $10-20k 27\n",
302 | "31 Atheist $30-40k 52"
303 | ]
304 | },
305 | "execution_count": 8,
306 | "metadata": {},
307 | "output_type": "execute_result"
308 | }
309 | ],
310 | "source": [
311 | "formatted_df = pd.melt(df,[\"religion\"], var_name=\"income\", value_name=\"freq\")\n",
312 | "formatted_df = formatted_df.sort_values(by=[\"religion\"])\n",
313 | "formatted_df.head(10)"
314 | ]
315 | },
316 | {
317 | "cell_type": "markdown",
318 | "metadata": {},
319 | "source": [
320 | "### Billboard Top 100 Dataset"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": 69,
326 | "metadata": {
327 | "collapsed": false
328 | },
329 | "outputs": [
330 | {
331 | "data": {
332 | "text/html": [
333 | "\n",
334 | "
\n",
335 | " \n",
336 | " \n",
337 | " | \n",
338 | " year | \n",
339 | " artist.inverted | \n",
340 | " track | \n",
341 | " time | \n",
342 | " genre | \n",
343 | " date.entered | \n",
344 | " date.peaked | \n",
345 | " x1st.week | \n",
346 | " x2nd.week | \n",
347 | " x3rd.week | \n",
348 | " ... | \n",
349 | " x67th.week | \n",
350 | " x68th.week | \n",
351 | " x69th.week | \n",
352 | " x70th.week | \n",
353 | " x71st.week | \n",
354 | " x72nd.week | \n",
355 | " x73rd.week | \n",
356 | " x74th.week | \n",
357 | " x75th.week | \n",
358 | " x76th.week | \n",
359 | "
\n",
360 | " \n",
361 | " \n",
362 | " \n",
363 | " 0 | \n",
364 | " 2000 | \n",
365 | " Destiny's Child | \n",
366 | " Independent Women Part I | \n",
367 | " 3:38 | \n",
368 | " Rock | \n",
369 | " 2000-09-23 | \n",
370 | " 2000-11-18 | \n",
371 | " 78 | \n",
372 | " 63.0 | \n",
373 | " 49.0 | \n",
374 | " ... | \n",
375 | " NaN | \n",
376 | " NaN | \n",
377 | " NaN | \n",
378 | " NaN | \n",
379 | " NaN | \n",
380 | " NaN | \n",
381 | " NaN | \n",
382 | " NaN | \n",
383 | " NaN | \n",
384 | " NaN | \n",
385 | "
\n",
386 | " \n",
387 | " 1 | \n",
388 | " 2000 | \n",
389 | " Santana | \n",
390 | " Maria, Maria | \n",
391 | " 4:18 | \n",
392 | " Rock | \n",
393 | " 2000-02-12 | \n",
394 | " 2000-04-08 | \n",
395 | " 15 | \n",
396 | " 8.0 | \n",
397 | " 6.0 | \n",
398 | " ... | \n",
399 | " NaN | \n",
400 | " NaN | \n",
401 | " NaN | \n",
402 | " NaN | \n",
403 | " NaN | \n",
404 | " NaN | \n",
405 | " NaN | \n",
406 | " NaN | \n",
407 | " NaN | \n",
408 | " NaN | \n",
409 | "
\n",
410 | " \n",
411 | " 2 | \n",
412 | " 2000 | \n",
413 | " Savage Garden | \n",
414 | " I Knew I Loved You | \n",
415 | " 4:07 | \n",
416 | " Rock | \n",
417 | " 1999-10-23 | \n",
418 | " 2000-01-29 | \n",
419 | " 71 | \n",
420 | " 48.0 | \n",
421 | " 43.0 | \n",
422 | " ... | \n",
423 | " NaN | \n",
424 | " NaN | \n",
425 | " NaN | \n",
426 | " NaN | \n",
427 | " NaN | \n",
428 | " NaN | \n",
429 | " NaN | \n",
430 | " NaN | \n",
431 | " NaN | \n",
432 | " NaN | \n",
433 | "
\n",
434 | " \n",
435 | " 3 | \n",
436 | " 2000 | \n",
437 | " Madonna | \n",
438 | " Music | \n",
439 | " 3:45 | \n",
440 | " Rock | \n",
441 | " 2000-08-12 | \n",
442 | " 2000-09-16 | \n",
443 | " 41 | \n",
444 | " 23.0 | \n",
445 | " 18.0 | \n",
446 | " ... | \n",
447 | " NaN | \n",
448 | " NaN | \n",
449 | " NaN | \n",
450 | " NaN | \n",
451 | " NaN | \n",
452 | " NaN | \n",
453 | " NaN | \n",
454 | " NaN | \n",
455 | " NaN | \n",
456 | " NaN | \n",
457 | "
\n",
458 | " \n",
459 | " 4 | \n",
460 | " 2000 | \n",
461 | " Aguilera, Christina | \n",
462 | " Come On Over Baby (All I Want Is You) | \n",
463 | " 3:38 | \n",
464 | " Rock | \n",
465 | " 2000-08-05 | \n",
466 | " 2000-10-14 | \n",
467 | " 57 | \n",
468 | " 47.0 | \n",
469 | " 45.0 | \n",
470 | " ... | \n",
471 | " NaN | \n",
472 | " NaN | \n",
473 | " NaN | \n",
474 | " NaN | \n",
475 | " NaN | \n",
476 | " NaN | \n",
477 | " NaN | \n",
478 | " NaN | \n",
479 | " NaN | \n",
480 | " NaN | \n",
481 | "
\n",
482 | " \n",
483 | " 5 | \n",
484 | " 2000 | \n",
485 | " Janet | \n",
486 | " Doesn't Really Matter | \n",
487 | " 4:17 | \n",
488 | " Rock | \n",
489 | " 2000-06-17 | \n",
490 | " 2000-08-26 | \n",
491 | " 59 | \n",
492 | " 52.0 | \n",
493 | " 43.0 | \n",
494 | " ... | \n",
495 | " NaN | \n",
496 | " NaN | \n",
497 | " NaN | \n",
498 | " NaN | \n",
499 | " NaN | \n",
500 | " NaN | \n",
501 | " NaN | \n",
502 | " NaN | \n",
503 | " NaN | \n",
504 | " NaN | \n",
505 | "
\n",
506 | " \n",
507 | " 6 | \n",
508 | " 2000 | \n",
509 | " Destiny's Child | \n",
510 | " Say My Name | \n",
511 | " 4:31 | \n",
512 | " Rock | \n",
513 | " 1999-12-25 | \n",
514 | " 2000-03-18 | \n",
515 | " 83 | \n",
516 | " 83.0 | \n",
517 | " 44.0 | \n",
518 | " ... | \n",
519 | " NaN | \n",
520 | " NaN | \n",
521 | " NaN | \n",
522 | " NaN | \n",
523 | " NaN | \n",
524 | " NaN | \n",
525 | " NaN | \n",
526 | " NaN | \n",
527 | " NaN | \n",
528 | " NaN | \n",
529 | "
\n",
530 | " \n",
531 | " 7 | \n",
532 | " 2000 | \n",
533 | " Iglesias, Enrique | \n",
534 | " Be With You | \n",
535 | " 3:36 | \n",
536 | " Latin | \n",
537 | " 2000-04-01 | \n",
538 | " 2000-06-24 | \n",
539 | " 63 | \n",
540 | " 45.0 | \n",
541 | " 34.0 | \n",
542 | " ... | \n",
543 | " NaN | \n",
544 | " NaN | \n",
545 | " NaN | \n",
546 | " NaN | \n",
547 | " NaN | \n",
548 | " NaN | \n",
549 | " NaN | \n",
550 | " NaN | \n",
551 | " NaN | \n",
552 | " NaN | \n",
553 | "
\n",
554 | " \n",
555 | " 8 | \n",
556 | " 2000 | \n",
557 | " Sisqo | \n",
558 | " Incomplete | \n",
559 | " 3:52 | \n",
560 | " Rock | \n",
561 | " 2000-06-24 | \n",
562 | " 2000-08-12 | \n",
563 | " 77 | \n",
564 | " 66.0 | \n",
565 | " 61.0 | \n",
566 | " ... | \n",
567 | " NaN | \n",
568 | " NaN | \n",
569 | " NaN | \n",
570 | " NaN | \n",
571 | " NaN | \n",
572 | " NaN | \n",
573 | " NaN | \n",
574 | " NaN | \n",
575 | " NaN | \n",
576 | " NaN | \n",
577 | "
\n",
578 | " \n",
579 | " 9 | \n",
580 | " 2000 | \n",
581 | " Lonestar | \n",
582 | " Amazed | \n",
583 | " 4:25 | \n",
584 | " Country | \n",
585 | " 1999-06-05 | \n",
586 | " 2000-03-04 | \n",
587 | " 81 | \n",
588 | " 54.0 | \n",
589 | " 44.0 | \n",
590 | " ... | \n",
591 | " NaN | \n",
592 | " NaN | \n",
593 | " NaN | \n",
594 | " NaN | \n",
595 | " NaN | \n",
596 | " NaN | \n",
597 | " NaN | \n",
598 | " NaN | \n",
599 | " NaN | \n",
600 | " NaN | \n",
601 | "
\n",
602 | " \n",
603 | "
\n",
604 | "
10 rows × 83 columns
\n",
605 | "
"
606 | ],
607 | "text/plain": [
608 | " year artist.inverted track time \\\n",
609 | "0 2000 Destiny's Child Independent Women Part I 3:38 \n",
610 | "1 2000 Santana Maria, Maria 4:18 \n",
611 | "2 2000 Savage Garden I Knew I Loved You 4:07 \n",
612 | "3 2000 Madonna Music 3:45 \n",
613 | "4 2000 Aguilera, Christina Come On Over Baby (All I Want Is You) 3:38 \n",
614 | "5 2000 Janet Doesn't Really Matter 4:17 \n",
615 | "6 2000 Destiny's Child Say My Name 4:31 \n",
616 | "7 2000 Iglesias, Enrique Be With You 3:36 \n",
617 | "8 2000 Sisqo Incomplete 3:52 \n",
618 | "9 2000 Lonestar Amazed 4:25 \n",
619 | "\n",
620 | " genre date.entered date.peaked x1st.week x2nd.week x3rd.week \\\n",
621 | "0 Rock 2000-09-23 2000-11-18 78 63.0 49.0 \n",
622 | "1 Rock 2000-02-12 2000-04-08 15 8.0 6.0 \n",
623 | "2 Rock 1999-10-23 2000-01-29 71 48.0 43.0 \n",
624 | "3 Rock 2000-08-12 2000-09-16 41 23.0 18.0 \n",
625 | "4 Rock 2000-08-05 2000-10-14 57 47.0 45.0 \n",
626 | "5 Rock 2000-06-17 2000-08-26 59 52.0 43.0 \n",
627 | "6 Rock 1999-12-25 2000-03-18 83 83.0 44.0 \n",
628 | "7 Latin 2000-04-01 2000-06-24 63 45.0 34.0 \n",
629 | "8 Rock 2000-06-24 2000-08-12 77 66.0 61.0 \n",
630 | "9 Country 1999-06-05 2000-03-04 81 54.0 44.0 \n",
631 | "\n",
632 | " ... x67th.week x68th.week x69th.week x70th.week x71st.week \\\n",
633 | "0 ... NaN NaN NaN NaN NaN \n",
634 | "1 ... NaN NaN NaN NaN NaN \n",
635 | "2 ... NaN NaN NaN NaN NaN \n",
636 | "3 ... NaN NaN NaN NaN NaN \n",
637 | "4 ... NaN NaN NaN NaN NaN \n",
638 | "5 ... NaN NaN NaN NaN NaN \n",
639 | "6 ... NaN NaN NaN NaN NaN \n",
640 | "7 ... NaN NaN NaN NaN NaN \n",
641 | "8 ... NaN NaN NaN NaN NaN \n",
642 | "9 ... NaN NaN NaN NaN NaN \n",
643 | "\n",
644 | " x72nd.week x73rd.week x74th.week x75th.week x76th.week \n",
645 | "0 NaN NaN NaN NaN NaN \n",
646 | "1 NaN NaN NaN NaN NaN \n",
647 | "2 NaN NaN NaN NaN NaN \n",
648 | "3 NaN NaN NaN NaN NaN \n",
649 | "4 NaN NaN NaN NaN NaN \n",
650 | "5 NaN NaN NaN NaN NaN \n",
651 | "6 NaN NaN NaN NaN NaN \n",
652 | "7 NaN NaN NaN NaN NaN \n",
653 | "8 NaN NaN NaN NaN NaN \n",
654 | "9 NaN NaN NaN NaN NaN \n",
655 | "\n",
656 | "[10 rows x 83 columns]"
657 | ]
658 | },
659 | "execution_count": 69,
660 | "metadata": {},
661 | "output_type": "execute_result"
662 | }
663 | ],
664 | "source": [
665 | "df = pd.read_csv(\"./data/billboard.csv\", encoding=\"mac_latin2\")\n",
666 | "df.head(10)"
667 | ]
668 | },
669 | {
670 | "cell_type": "code",
671 | "execution_count": 70,
672 | "metadata": {
673 | "collapsed": false
674 | },
675 | "outputs": [
676 | {
677 | "data": {
678 | "text/html": [
679 | "\n",
680 | "
\n",
681 | " \n",
682 | " \n",
683 | " | \n",
684 | " year | \n",
685 | " artist.inverted | \n",
686 | " track | \n",
687 | " time | \n",
688 | " genre | \n",
689 | " week | \n",
690 | " rank | \n",
691 | " date | \n",
692 | "
\n",
693 | " \n",
694 | " \n",
695 | " \n",
696 | " 246 | \n",
697 | " 2000 | \n",
698 | " 2 Pac | \n",
699 | " Baby Don't Cry (Keep Ya Head Up II) | \n",
700 | " 4:22 | \n",
701 | " Rap | \n",
702 | " 1 | \n",
703 | " 87 | \n",
704 | " 2000-02-26 | \n",
705 | "
\n",
706 | " \n",
707 | " 563 | \n",
708 | " 2000 | \n",
709 | " 2 Pac | \n",
710 | " Baby Don't Cry (Keep Ya Head Up II) | \n",
711 | " 4:22 | \n",
712 | " Rap | \n",
713 | " 2 | \n",
714 | " 82 | \n",
715 | " 2000-03-04 | \n",
716 | "
\n",
717 | " \n",
718 | " 880 | \n",
719 | " 2000 | \n",
720 | " 2 Pac | \n",
721 | " Baby Don't Cry (Keep Ya Head Up II) | \n",
722 | " 4:22 | \n",
723 | " Rap | \n",
724 | " 3 | \n",
725 | " 72 | \n",
726 | " 2000-03-11 | \n",
727 | "
\n",
728 | " \n",
729 | " 1197 | \n",
730 | " 2000 | \n",
731 | " 2 Pac | \n",
732 | " Baby Don't Cry (Keep Ya Head Up II) | \n",
733 | " 4:22 | \n",
734 | " Rap | \n",
735 | " 4 | \n",
736 | " 77 | \n",
737 | " 2000-03-18 | \n",
738 | "
\n",
739 | " \n",
740 | " 1514 | \n",
741 | " 2000 | \n",
742 | " 2 Pac | \n",
743 | " Baby Don't Cry (Keep Ya Head Up II) | \n",
744 | " 4:22 | \n",
745 | " Rap | \n",
746 | " 5 | \n",
747 | " 87 | \n",
748 | " 2000-03-25 | \n",
749 | "
\n",
750 | " \n",
751 | " 1831 | \n",
752 | " 2000 | \n",
753 | " 2 Pac | \n",
754 | " Baby Don't Cry (Keep Ya Head Up II) | \n",
755 | " 4:22 | \n",
756 | " Rap | \n",
757 | " 6 | \n",
758 | " 94 | \n",
759 | " 2000-04-01 | \n",
760 | "
\n",
761 | " \n",
762 | " 2148 | \n",
763 | " 2000 | \n",
764 | " 2 Pac | \n",
765 | " Baby Don't Cry (Keep Ya Head Up II) | \n",
766 | " 4:22 | \n",
767 | " Rap | \n",
768 | " 7 | \n",
769 | " 99 | \n",
770 | " 2000-04-08 | \n",
771 | "
\n",
772 | " \n",
773 | " 287 | \n",
774 | " 2000 | \n",
775 | " 2Ge+her | \n",
776 | " The Hardest Part Of Breaking Up (Is Getting Ba... | \n",
777 | " 3:15 | \n",
778 | " R&B | \n",
779 | " 1 | \n",
780 | " 91 | \n",
781 | " 2000-09-02 | \n",
782 | "
\n",
783 | " \n",
784 | " 604 | \n",
785 | " 2000 | \n",
786 | " 2Ge+her | \n",
787 | " The Hardest Part Of Breaking Up (Is Getting Ba... | \n",
788 | " 3:15 | \n",
789 | " R&B | \n",
790 | " 2 | \n",
791 | " 87 | \n",
792 | " 2000-09-09 | \n",
793 | "
\n",
794 | " \n",
795 | " 921 | \n",
796 | " 2000 | \n",
797 | " 2Ge+her | \n",
798 | " The Hardest Part Of Breaking Up (Is Getting Ba... | \n",
799 | " 3:15 | \n",
800 | " R&B | \n",
801 | " 3 | \n",
802 | " 92 | \n",
803 | " 2000-09-16 | \n",
804 | "
\n",
805 | " \n",
806 | "
\n",
807 | "
"
808 | ],
809 | "text/plain": [
810 | " year artist.inverted track \\\n",
811 | "246 2000 2 Pac Baby Don't Cry (Keep Ya Head Up II) \n",
812 | "563 2000 2 Pac Baby Don't Cry (Keep Ya Head Up II) \n",
813 | "880 2000 2 Pac Baby Don't Cry (Keep Ya Head Up II) \n",
814 | "1197 2000 2 Pac Baby Don't Cry (Keep Ya Head Up II) \n",
815 | "1514 2000 2 Pac Baby Don't Cry (Keep Ya Head Up II) \n",
816 | "1831 2000 2 Pac Baby Don't Cry (Keep Ya Head Up II) \n",
817 | "2148 2000 2 Pac Baby Don't Cry (Keep Ya Head Up II) \n",
818 | "287 2000 2Ge+her The Hardest Part Of Breaking Up (Is Getting Ba... \n",
819 | "604 2000 2Ge+her The Hardest Part Of Breaking Up (Is Getting Ba... \n",
820 | "921 2000 2Ge+her The Hardest Part Of Breaking Up (Is Getting Ba... \n",
821 | "\n",
822 | " time genre week rank date \n",
823 | "246 4:22 Rap 1 87 2000-02-26 \n",
824 | "563 4:22 Rap 2 82 2000-03-04 \n",
825 | "880 4:22 Rap 3 72 2000-03-11 \n",
826 | "1197 4:22 Rap 4 77 2000-03-18 \n",
827 | "1514 4:22 Rap 5 87 2000-03-25 \n",
828 | "1831 4:22 Rap 6 94 2000-04-01 \n",
829 | "2148 4:22 Rap 7 99 2000-04-08 \n",
830 | "287 3:15 R&B 1 91 2000-09-02 \n",
831 | "604 3:15 R&B 2 87 2000-09-09 \n",
832 | "921 3:15 R&B 3 92 2000-09-16 "
833 | ]
834 | },
835 | "execution_count": 70,
836 | "metadata": {},
837 | "output_type": "execute_result"
838 | }
839 | ],
840 | "source": [
841 | "# Melting\n",
842 | "id_vars = [\"year\",\"artist.inverted\",\"track\",\"time\",\"genre\",\"date.entered\",\"date.peaked\"]\n",
843 | "df = pd.melt(frame=df,id_vars=id_vars, var_name=\"week\", value_name=\"rank\")\n",
844 | "\n",
845 | "# Formatting \n",
846 | "df[\"week\"] = df['week'].str.extract('(\\d+)', expand=False).astype(int)\n",
847 | "df[\"rank\"] = df[\"rank\"].astype(int)\n",
848 | "\n",
849 | "# Cleaning out unnecessary rows\n",
850 | "df = df.dropna()\n",
851 | "\n",
852 | "# Create \"date\" columns\n",
853 | "df['date'] = pd.to_datetime(df['date.entered']) + pd.to_timedelta(df['week'], unit='w') - pd.DateOffset(weeks=1)\n",
854 | "\n",
855 | "df = df[[\"year\", \"artist.inverted\", \"track\", \"time\", \"genre\", \"week\", \"rank\", \"date\"]]\n",
856 | "df = df.sort_values(ascending=True, by=[\"year\",\"artist.inverted\",\"track\",\"week\",\"rank\"])\n",
857 | "\n",
858 | "# Assigning the tidy dataset to a variable for future usage\n",
859 | "billboard = df\n",
860 | "\n",
861 | "df.head(10)"
862 | ]
863 | },
864 | {
865 | "cell_type": "markdown",
866 | "metadata": {},
867 | "source": [
868 | "## Multiple types in one table"
869 | ]
870 | },
871 | {
872 | "cell_type": "code",
873 | "execution_count": 11,
874 | "metadata": {
875 | "collapsed": false
876 | },
877 | "outputs": [
878 | {
879 | "data": {
880 | "text/html": [
881 | "\n",
882 | "
\n",
883 | " \n",
884 | " \n",
885 | " | \n",
886 | " year | \n",
887 | " artist.inverted | \n",
888 | " track | \n",
889 | " time | \n",
890 | " genre | \n",
891 | " song_id | \n",
892 | "
\n",
893 | " \n",
894 | " \n",
895 | " \n",
896 | " 0 | \n",
897 | " 2000 | \n",
898 | " 2 Pac | \n",
899 | " Baby Don't Cry (Keep Ya Head Up II) | \n",
900 | " 4:22 | \n",
901 | " Rap | \n",
902 | " 0 | \n",
903 | "
\n",
904 | " \n",
905 | " 1 | \n",
906 | " 2000 | \n",
907 | " 2Ge+her | \n",
908 | " The Hardest Part Of Breaking Up (Is Getting Ba... | \n",
909 | " 3:15 | \n",
910 | " R&B | \n",
911 | " 1 | \n",
912 | "
\n",
913 | " \n",
914 | " 2 | \n",
915 | " 2000 | \n",
916 | " 3 Doors Down | \n",
917 | " Kryptonite | \n",
918 | " 3:53 | \n",
919 | " Rock | \n",
920 | " 2 | \n",
921 | "
\n",
922 | " \n",
923 | " 3 | \n",
924 | " 2000 | \n",
925 | " 3 Doors Down | \n",
926 | " Loser | \n",
927 | " 4:24 | \n",
928 | " Rock | \n",
929 | " 3 | \n",
930 | "
\n",
931 | " \n",
932 | " 4 | \n",
933 | " 2000 | \n",
934 | " 504 Boyz | \n",
935 | " Wobble Wobble | \n",
936 | " 3:35 | \n",
937 | " Rap | \n",
938 | " 4 | \n",
939 | "
\n",
940 | " \n",
941 | " 5 | \n",
942 | " 2000 | \n",
943 | " 98° | \n",
944 | " Give Me Just One Night (Una Noche) | \n",
945 | " 3:24 | \n",
946 | " Rock | \n",
947 | " 5 | \n",
948 | "
\n",
949 | " \n",
950 | " 6 | \n",
951 | " 2000 | \n",
952 | " A*Teens | \n",
953 | " Dancing Queen | \n",
954 | " 3:44 | \n",
955 | " Pop | \n",
956 | " 6 | \n",
957 | "
\n",
958 | " \n",
959 | " 7 | \n",
960 | " 2000 | \n",
961 | " Aaliyah | \n",
962 | " I Don't Wanna | \n",
963 | " 4:15 | \n",
964 | " Rock | \n",
965 | " 7 | \n",
966 | "
\n",
967 | " \n",
968 | " 8 | \n",
969 | " 2000 | \n",
970 | " Aaliyah | \n",
971 | " Try Again | \n",
972 | " 4:03 | \n",
973 | " Rock | \n",
974 | " 8 | \n",
975 | "
\n",
976 | " \n",
977 | " 9 | \n",
978 | " 2000 | \n",
979 | " Adams, Yolanda | \n",
980 | " Open My Heart | \n",
981 | " 5:30 | \n",
982 | " Gospel | \n",
983 | " 9 | \n",
984 | "
\n",
985 | " \n",
986 | "
\n",
987 | "
"
988 | ],
989 | "text/plain": [
990 | " year artist.inverted track \\\n",
991 | "0 2000 2 Pac Baby Don't Cry (Keep Ya Head Up II) \n",
992 | "1 2000 2Ge+her The Hardest Part Of Breaking Up (Is Getting Ba... \n",
993 | "2 2000 3 Doors Down Kryptonite \n",
994 | "3 2000 3 Doors Down Loser \n",
995 | "4 2000 504 Boyz Wobble Wobble \n",
996 | "5 2000 98° Give Me Just One Night (Una Noche) \n",
997 | "6 2000 A*Teens Dancing Queen \n",
998 | "7 2000 Aaliyah I Don't Wanna \n",
999 | "8 2000 Aaliyah Try Again \n",
1000 | "9 2000 Adams, Yolanda Open My Heart \n",
1001 | "\n",
1002 | " time genre song_id \n",
1003 | "0 4:22 Rap 0 \n",
1004 | "1 3:15 R&B 1 \n",
1005 | "2 3:53 Rock 2 \n",
1006 | "3 4:24 Rock 3 \n",
1007 | "4 3:35 Rap 4 \n",
1008 | "5 3:24 Rock 5 \n",
1009 | "6 3:44 Pop 6 \n",
1010 | "7 4:15 Rock 7 \n",
1011 | "8 4:03 Rock 8 \n",
1012 | "9 5:30 Gospel 9 "
1013 | ]
1014 | },
1015 | "execution_count": 11,
1016 | "metadata": {},
1017 | "output_type": "execute_result"
1018 | }
1019 | ],
1020 | "source": [
1021 | "songs_cols = [\"year\", \"artist.inverted\", \"track\", \"time\", \"genre\"]\n",
1022 | "songs = billboard[songs_cols].drop_duplicates()\n",
1023 | "songs = songs.reset_index(drop=True)\n",
1024 | "songs[\"song_id\"] = songs.index\n",
1025 | "songs.head(10)"
1026 | ]
1027 | },
1028 | {
1029 | "cell_type": "code",
1030 | "execution_count": 12,
1031 | "metadata": {
1032 | "collapsed": false
1033 | },
1034 | "outputs": [
1035 | {
1036 | "data": {
1037 | "text/html": [
1038 | "\n",
1039 | "
\n",
1040 | " \n",
1041 | " \n",
1042 | " | \n",
1043 | " song_id | \n",
1044 | " date | \n",
1045 | " rank | \n",
1046 | "
\n",
1047 | " \n",
1048 | " \n",
1049 | " \n",
1050 | " 0 | \n",
1051 | " 0 | \n",
1052 | " 2000-02-26 | \n",
1053 | " 87 | \n",
1054 | "
\n",
1055 | " \n",
1056 | " 1 | \n",
1057 | " 0 | \n",
1058 | " 2000-03-04 | \n",
1059 | " 82 | \n",
1060 | "
\n",
1061 | " \n",
1062 | " 2 | \n",
1063 | " 0 | \n",
1064 | " 2000-03-11 | \n",
1065 | " 72 | \n",
1066 | "
\n",
1067 | " \n",
1068 | " 3 | \n",
1069 | " 0 | \n",
1070 | " 2000-03-18 | \n",
1071 | " 77 | \n",
1072 | "
\n",
1073 | " \n",
1074 | " 4 | \n",
1075 | " 0 | \n",
1076 | " 2000-03-25 | \n",
1077 | " 87 | \n",
1078 | "
\n",
1079 | " \n",
1080 | " 5 | \n",
1081 | " 0 | \n",
1082 | " 2000-04-01 | \n",
1083 | " 94 | \n",
1084 | "
\n",
1085 | " \n",
1086 | " 6 | \n",
1087 | " 0 | \n",
1088 | " 2000-04-08 | \n",
1089 | " 99 | \n",
1090 | "
\n",
1091 | " \n",
1092 | " 7 | \n",
1093 | " 1 | \n",
1094 | " 2000-09-02 | \n",
1095 | " 91 | \n",
1096 | "
\n",
1097 | " \n",
1098 | " 8 | \n",
1099 | " 1 | \n",
1100 | " 2000-09-09 | \n",
1101 | " 87 | \n",
1102 | "
\n",
1103 | " \n",
1104 | " 9 | \n",
1105 | " 1 | \n",
1106 | " 2000-09-16 | \n",
1107 | " 92 | \n",
1108 | "
\n",
1109 | " \n",
1110 | "
\n",
1111 | "
"
1112 | ],
1113 | "text/plain": [
1114 | " song_id date rank\n",
1115 | "0 0 2000-02-26 87\n",
1116 | "1 0 2000-03-04 82\n",
1117 | "2 0 2000-03-11 72\n",
1118 | "3 0 2000-03-18 77\n",
1119 | "4 0 2000-03-25 87\n",
1120 | "5 0 2000-04-01 94\n",
1121 | "6 0 2000-04-08 99\n",
1122 | "7 1 2000-09-02 91\n",
1123 | "8 1 2000-09-09 87\n",
1124 | "9 1 2000-09-16 92"
1125 | ]
1126 | },
1127 | "execution_count": 12,
1128 | "metadata": {},
1129 | "output_type": "execute_result"
1130 | }
1131 | ],
1132 | "source": [
1133 | "ranks = pd.merge(billboard, songs, on=[\"year\",\"artist.inverted\", \"track\", \"time\", \"genre\"])\n",
1134 | "ranks = ranks[[\"song_id\", \"date\",\"rank\"]]\n",
1135 | "ranks.head(10)"
1136 | ]
1137 | },
1138 | {
1139 | "cell_type": "markdown",
1140 | "metadata": {
1141 | "collapsed": true
1142 | },
1143 | "source": [
1144 | "## Multiple variables stored in one column"
1145 | ]
1146 | },
1147 | {
1148 | "cell_type": "markdown",
1149 | "metadata": {},
1150 | "source": [
1151 | "### Tubercolosis Example"
1152 | ]
1153 | },
1154 | {
1155 | "cell_type": "markdown",
1156 | "metadata": {},
1157 | "source": [
1158 | "A few notes on the raw data set:\n",
1159 | "\n",
1160 | "- The columns starting with \"m\" or \"f\" contain multiple variables: \n",
1161 | " - Sex (\"m\" or \"f\")\n",
1162 | " - Age Group (\"0-14\",\"15-24\", \"25-34\", \"45-54\", \"55-64\", \"65\", \"unknown\")\n",
1163 | "- Mixture of 0s and missing values(\"NaN\"). This is due to the data collection process and the distinction is important for this dataset."
1164 | ]
1165 | },
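1166 | {
1167 | "cell_type": "code",
1168 | "execution_count": null,
1169 | "metadata": {
1170 | "collapsed": false
1171 | },
1172 | "outputs": [],
1173 | "source": [
1174 | "# Illustrative sketch, not part of the original analysis: how the regex used\n",
1175 | "# below splits a packed column name such as \"m1524\" into sex and age bounds.\n",
1176 | "# Note that \"m65\" and \"mu\" do not fit the three-group pattern, so those rows\n",
1177 | "# become NaN after extraction and are removed by the dropna() further down.\n",
1178 | "re.match(\"(\\D)(\\d+)(\\d{2})\", \"m1524\").groups()  # ('m', '15', '24')"
1179 | ]
1180 | },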
1166 | {
1167 | "cell_type": "code",
1168 | "execution_count": 49,
1169 | "metadata": {
1170 | "collapsed": false
1171 | },
1172 | "outputs": [
1173 | {
1174 | "data": {
1175 | "text/html": [
1176 | "\n",
1177 | "
\n",
1178 | " \n",
1179 | " \n",
1180 | " | \n",
1181 | " country | \n",
1182 | " year | \n",
1183 | " m014 | \n",
1184 | " m1524 | \n",
1185 | " m2534 | \n",
1186 | " m3544 | \n",
1187 | " m4554 | \n",
1188 | " m5564 | \n",
1189 | " m65 | \n",
1190 | " mu | \n",
1191 | " f014 | \n",
1192 | "
\n",
1193 | " \n",
1194 | " \n",
1195 | " \n",
1196 | " 0 | \n",
1197 | " AD | \n",
1198 | " 2000 | \n",
1199 | " 0.0 | \n",
1200 | " 0.0 | \n",
1201 | " 1.0 | \n",
1202 | " 0.0 | \n",
1203 | " 0 | \n",
1204 | " 0 | \n",
1205 | " 0.0 | \n",
1206 | " NaN | \n",
1207 | " NaN | \n",
1208 | "
\n",
1209 | " \n",
1210 | " 1 | \n",
1211 | " AE | \n",
1212 | " 2000 | \n",
1213 | " 2.0 | \n",
1214 | " 4.0 | \n",
1215 | " 4.0 | \n",
1216 | " 6.0 | \n",
1217 | " 5 | \n",
1218 | " 12 | \n",
1219 | " 10.0 | \n",
1220 | " NaN | \n",
1221 | " 3.0 | \n",
1222 | "
\n",
1223 | " \n",
1224 | " 2 | \n",
1225 | " AF | \n",
1226 | " 2000 | \n",
1227 | " 52.0 | \n",
1228 | " 228.0 | \n",
1229 | " 183.0 | \n",
1230 | " 149.0 | \n",
1231 | " 129 | \n",
1232 | " 94 | \n",
1233 | " 80.0 | \n",
1234 | " NaN | \n",
1235 | " 93.0 | \n",
1236 | "
\n",
1237 | " \n",
1238 | " 3 | \n",
1239 | " AG | \n",
1240 | " 2000 | \n",
1241 | " 0.0 | \n",
1242 | " 0.0 | \n",
1243 | " 0.0 | \n",
1244 | " 0.0 | \n",
1245 | " 0 | \n",
1246 | " 0 | \n",
1247 | " 1.0 | \n",
1248 | " NaN | \n",
1249 | " 1.0 | \n",
1250 | "
\n",
1251 | " \n",
1252 | " 4 | \n",
1253 | " AL | \n",
1254 | " 2000 | \n",
1255 | " 2.0 | \n",
1256 | " 19.0 | \n",
1257 | " 21.0 | \n",
1258 | " 14.0 | \n",
1259 | " 24 | \n",
1260 | " 19 | \n",
1261 | " 16.0 | \n",
1262 | " NaN | \n",
1263 | " 3.0 | \n",
1264 | "
\n",
1265 | " \n",
1266 | " 5 | \n",
1267 | " AM | \n",
1268 | " 2000 | \n",
1269 | " 2.0 | \n",
1270 | " 152.0 | \n",
1271 | " 130.0 | \n",
1272 | " 131.0 | \n",
1273 | " 63 | \n",
1274 | " 26 | \n",
1275 | " 21.0 | \n",
1276 | " NaN | \n",
1277 | " 1.0 | \n",
1278 | "
\n",
1279 | " \n",
1280 | " 6 | \n",
1281 | " AN | \n",
1282 | " 2000 | \n",
1283 | " 0.0 | \n",
1284 | " 0.0 | \n",
1285 | " 1.0 | \n",
1286 | " 2.0 | \n",
1287 | " 0 | \n",
1288 | " 0 | \n",
1289 | " 0.0 | \n",
1290 | " NaN | \n",
1291 | " 0.0 | \n",
1292 | "
\n",
1293 | " \n",
1294 | " 7 | \n",
1295 | " AO | \n",
1296 | " 2000 | \n",
1297 | " 186.0 | \n",
1298 | " 999.0 | \n",
1299 | " 1003.0 | \n",
1300 | " 912.0 | \n",
1301 | " 482 | \n",
1302 | " 312 | \n",
1303 | " 194.0 | \n",
1304 | " NaN | \n",
1305 | " 247.0 | \n",
1306 | "
\n",
1307 | " \n",
1308 | " 8 | \n",
1309 | " AR | \n",
1310 | " 2000 | \n",
1311 | " 97.0 | \n",
1312 | " 278.0 | \n",
1313 | " 594.0 | \n",
1314 | " 402.0 | \n",
1315 | " 419 | \n",
1316 | " 368 | \n",
1317 | " 330.0 | \n",
1318 | " NaN | \n",
1319 | " 121.0 | \n",
1320 | "
\n",
1321 | " \n",
1322 | " 9 | \n",
1323 | " AS | \n",
1324 | " 2000 | \n",
1325 | " NaN | \n",
1326 | " NaN | \n",
1327 | " NaN | \n",
1328 | " NaN | \n",
1329 | " 1 | \n",
1330 | " 1 | \n",
1331 | " NaN | \n",
1332 | " NaN | \n",
1333 | " NaN | \n",
1334 | "
\n",
1335 | " \n",
1336 | "
\n",
1337 | "
"
1338 | ],
1339 | "text/plain": [
1340 | " country year m014 m1524 m2534 m3544 m4554 m5564 m65 mu f014\n",
1341 | "0 AD 2000 0.0 0.0 1.0 0.0 0 0 0.0 NaN NaN\n",
1342 | "1 AE 2000 2.0 4.0 4.0 6.0 5 12 10.0 NaN 3.0\n",
1343 | "2 AF 2000 52.0 228.0 183.0 149.0 129 94 80.0 NaN 93.0\n",
1344 | "3 AG 2000 0.0 0.0 0.0 0.0 0 0 1.0 NaN 1.0\n",
1345 | "4 AL 2000 2.0 19.0 21.0 14.0 24 19 16.0 NaN 3.0\n",
1346 | "5 AM 2000 2.0 152.0 130.0 131.0 63 26 21.0 NaN 1.0\n",
1347 | "6 AN 2000 0.0 0.0 1.0 2.0 0 0 0.0 NaN 0.0\n",
1348 | "7 AO 2000 186.0 999.0 1003.0 912.0 482 312 194.0 NaN 247.0\n",
1349 | "8 AR 2000 97.0 278.0 594.0 402.0 419 368 330.0 NaN 121.0\n",
1350 | "9 AS 2000 NaN NaN NaN NaN 1 1 NaN NaN NaN"
1351 | ]
1352 | },
1353 | "execution_count": 49,
1354 | "metadata": {},
1355 | "output_type": "execute_result"
1356 | }
1357 | ],
1358 | "source": [
1359 | "df = pd.read_csv(\"./data/tb-raw.csv\")\n",
1360 | "df"
1361 | ]
1362 | },
1363 | {
1364 | "cell_type": "code",
1365 | "execution_count": 50,
1366 | "metadata": {
1367 | "collapsed": false
1368 | },
1369 | "outputs": [
1370 | {
1371 | "data": {
1372 | "text/html": [
1373 | "\n",
1374 | "
\n",
1375 | " \n",
1376 | " \n",
1377 | " | \n",
1378 | " country | \n",
1379 | " year | \n",
1380 | " cases | \n",
1381 | " sex | \n",
1382 | " age | \n",
1383 | "
\n",
1384 | " \n",
1385 | " \n",
1386 | " \n",
1387 | " 0 | \n",
1388 | " AD | \n",
1389 | " 2000 | \n",
1390 | " 0.0 | \n",
1391 | " m | \n",
1392 | " 0-14 | \n",
1393 | "
\n",
1394 | " \n",
1395 | " 10 | \n",
1396 | " AD | \n",
1397 | " 2000 | \n",
1398 | " 0.0 | \n",
1399 | " m | \n",
1400 | " 15-24 | \n",
1401 | "
\n",
1402 | " \n",
1403 | " 20 | \n",
1404 | " AD | \n",
1405 | " 2000 | \n",
1406 | " 1.0 | \n",
1407 | " m | \n",
1408 | " 25-34 | \n",
1409 | "
\n",
1410 | " \n",
1411 | " 30 | \n",
1412 | " AD | \n",
1413 | " 2000 | \n",
1414 | " 0.0 | \n",
1415 | " m | \n",
1416 | " 35-44 | \n",
1417 | "
\n",
1418 | " \n",
1419 | " 40 | \n",
1420 | " AD | \n",
1421 | " 2000 | \n",
1422 | " 0.0 | \n",
1423 | " m | \n",
1424 | " 45-54 | \n",
1425 | "
\n",
1426 | " \n",
1427 | " 50 | \n",
1428 | " AD | \n",
1429 | " 2000 | \n",
1430 | " 0.0 | \n",
1431 | " m | \n",
1432 | " 55-64 | \n",
1433 | "
\n",
1434 | " \n",
1435 | " 81 | \n",
1436 | " AE | \n",
1437 | " 2000 | \n",
1438 | " 3.0 | \n",
1439 | " f | \n",
1440 | " 0-14 | \n",
1441 | "
\n",
1442 | " \n",
1443 | " 1 | \n",
1444 | " AE | \n",
1445 | " 2000 | \n",
1446 | " 2.0 | \n",
1447 | " m | \n",
1448 | " 0-14 | \n",
1449 | "
\n",
1450 | " \n",
1451 | " 11 | \n",
1452 | " AE | \n",
1453 | " 2000 | \n",
1454 | " 4.0 | \n",
1455 | " m | \n",
1456 | " 15-24 | \n",
1457 | "
\n",
1458 | " \n",
1459 | " 21 | \n",
1460 | " AE | \n",
1461 | " 2000 | \n",
1462 | " 4.0 | \n",
1463 | " m | \n",
1464 | " 25-34 | \n",
1465 | "
\n",
1466 | " \n",
1467 | "
\n",
1468 | "
"
1469 | ],
1470 | "text/plain": [
1471 | " country year cases sex age\n",
1472 | "0 AD 2000 0.0 m 0-14\n",
1473 | "10 AD 2000 0.0 m 15-24\n",
1474 | "20 AD 2000 1.0 m 25-34\n",
1475 | "30 AD 2000 0.0 m 35-44\n",
1476 | "40 AD 2000 0.0 m 45-54\n",
1477 | "50 AD 2000 0.0 m 55-64\n",
1478 | "81 AE 2000 3.0 f 0-14\n",
1479 | "1 AE 2000 2.0 m 0-14\n",
1480 | "11 AE 2000 4.0 m 15-24\n",
1481 | "21 AE 2000 4.0 m 25-34"
1482 | ]
1483 | },
1484 | "execution_count": 50,
1485 | "metadata": {},
1486 | "output_type": "execute_result"
1487 | }
1488 | ],
1489 | "source": [
1490 | "df = pd.melt(df, id_vars=[\"country\",\"year\"], value_name=\"cases\", var_name=\"sex_and_age\")\n",
1491 | "\n",
1492 | "# Extract Sex, Age lower bound and Age upper bound group\n",
1493 | "tmp_df = df[\"sex_and_age\"].str.extract(\"(\\D)(\\d+)(\\d{2})\", expand=False) \n",
1494 | "\n",
1495 | "# Name columns\n",
1496 | "tmp_df.columns = [\"sex\", \"age_lower\", \"age_upper\"]\n",
1497 | "\n",
1498 | "# Create `age`column based on `age_lower` and `age_upper`\n",
1499 | "tmp_df[\"age\"] = tmp_df[\"age_lower\"] + \"-\" + tmp_df[\"age_upper\"]\n",
1500 | "\n",
1501 | "# Merge \n",
1502 | "df = pd.concat([df, tmp_df], axis=1)\n",
1503 | "\n",
1504 | "# Drop unnecessary columns and rows\n",
1505 | "df = df.drop(['sex_and_age',\"age_lower\",\"age_upper\"], axis=1)\n",
1506 | "df = df.dropna()\n",
1507 | "df = df.sort_values(ascending=True,by=[\"country\", \"year\", \"sex\", \"age\"])\n",
1508 | "df.head(10)"
1509 | ]
1510 | },
1511 | {
1512 | "cell_type": "markdown",
1513 | "metadata": {},
1514 | "source": [
1515 | "## Variables are stored in both rows and columns"
1516 | ]
1517 | },
1518 | {
1519 | "cell_type": "markdown",
1520 | "metadata": {},
1521 | "source": [
1522 | "### Global Historical Climatology Network Dataset"
1523 | ]
1524 | },
1525 | {
1526 | "cell_type": "code",
1527 | "execution_count": 24,
1528 | "metadata": {
1529 | "collapsed": false
1530 | },
1531 | "outputs": [],
1532 | "source": [
1533 | "df = pd.read_csv(\"./data/weather-raw.csv\")"
1534 | ]
1535 | },
1536 | {
1537 | "cell_type": "code",
1538 | "execution_count": 25,
1539 | "metadata": {
1540 | "collapsed": false
1541 | },
1542 | "outputs": [
1543 | {
1544 | "data": {
1545 | "text/html": [
1546 | "\n",
1547 | "
\n",
1548 | " \n",
1549 | " \n",
1550 | " | \n",
1551 | " id | \n",
1552 | " year | \n",
1553 | " month | \n",
1554 | " element | \n",
1555 | " day_raw | \n",
1556 | " value | \n",
1557 | "
\n",
1558 | " \n",
1559 | " \n",
1560 | " \n",
1561 | " 0 | \n",
1562 | " MX17004 | \n",
1563 | " 2010 | \n",
1564 | " 1 | \n",
1565 | " tmax | \n",
1566 | " d1 | \n",
1567 | " NaN | \n",
1568 | "
\n",
1569 | " \n",
1570 | " 1 | \n",
1571 | " MX17004 | \n",
1572 | " 2010 | \n",
1573 | " 1 | \n",
1574 | " tmin | \n",
1575 | " d1 | \n",
1576 | " NaN | \n",
1577 | "
\n",
1578 | " \n",
1579 | " 2 | \n",
1580 | " MX17004 | \n",
1581 | " 2010 | \n",
1582 | " 2 | \n",
1583 | " tmax | \n",
1584 | " d1 | \n",
1585 | " NaN | \n",
1586 | "
\n",
1587 | " \n",
1588 | " 3 | \n",
1589 | " MX17004 | \n",
1590 | " 2010 | \n",
1591 | " 2 | \n",
1592 | " tmin | \n",
1593 | " d1 | \n",
1594 | " NaN | \n",
1595 | "
\n",
1596 | " \n",
1597 | " 4 | \n",
1598 | " MX17004 | \n",
1599 | " 2010 | \n",
1600 | " 3 | \n",
1601 | " tmax | \n",
1602 | " d1 | \n",
1603 | " NaN | \n",
1604 | "
\n",
1605 | " \n",
1606 | " 5 | \n",
1607 | " MX17004 | \n",
1608 | " 2010 | \n",
1609 | " 3 | \n",
1610 | " tmin | \n",
1611 | " d1 | \n",
1612 | " NaN | \n",
1613 | "
\n",
1614 | " \n",
1615 | " 6 | \n",
1616 | " MX17004 | \n",
1617 | " 2010 | \n",
1618 | " 4 | \n",
1619 | " tmax | \n",
1620 | " d1 | \n",
1621 | " NaN | \n",
1622 | "
\n",
1623 | " \n",
1624 | " 7 | \n",
1625 | " MX17004 | \n",
1626 | " 2010 | \n",
1627 | " 4 | \n",
1628 | " tmin | \n",
1629 | " d1 | \n",
1630 | " NaN | \n",
1631 | "
\n",
1632 | " \n",
1633 | " 8 | \n",
1634 | " MX17004 | \n",
1635 | " 2010 | \n",
1636 | " 5 | \n",
1637 | " tmax | \n",
1638 | " d1 | \n",
1639 | " NaN | \n",
1640 | "
\n",
1641 | " \n",
1642 | " 9 | \n",
1643 | " MX17004 | \n",
1644 | " 2010 | \n",
1645 | " 5 | \n",
1646 | " tmin | \n",
1647 | " d1 | \n",
1648 | " NaN | \n",
1649 | "
\n",
1650 | " \n",
1651 | "
\n",
1652 | "
"
1653 | ],
1654 | "text/plain": [
1655 | " id year month element day_raw value\n",
1656 | "0 MX17004 2010 1 tmax d1 NaN\n",
1657 | "1 MX17004 2010 1 tmin d1 NaN\n",
1658 | "2 MX17004 2010 2 tmax d1 NaN\n",
1659 | "3 MX17004 2010 2 tmin d1 NaN\n",
1660 | "4 MX17004 2010 3 tmax d1 NaN\n",
1661 | "5 MX17004 2010 3 tmin d1 NaN\n",
1662 | "6 MX17004 2010 4 tmax d1 NaN\n",
1663 | "7 MX17004 2010 4 tmin d1 NaN\n",
1664 | "8 MX17004 2010 5 tmax d1 NaN\n",
1665 | "9 MX17004 2010 5 tmin d1 NaN"
1666 | ]
1667 | },
1668 | "execution_count": 25,
1669 | "metadata": {},
1670 | "output_type": "execute_result"
1671 | }
1672 | ],
1673 | "source": [
1674 | "df = pd.melt(df, id_vars=[\"id\", \"year\",\"month\",\"element\"], var_name=\"day_raw\")\n",
1675 | "df.head(10)"
1676 | ]
1677 | },
1678 | {
1679 | "cell_type": "code",
1680 | "execution_count": 26,
1681 | "metadata": {
1682 | "collapsed": false
1683 | },
1684 | "outputs": [
1685 | {
1686 | "data": {
1687 | "text/html": [
1688 | "\n",
1689 | "
\n",
1690 | " \n",
1691 | " \n",
1692 | " element | \n",
1693 | " id | \n",
1694 | " date | \n",
1695 | " tmax | \n",
1696 | " tmin | \n",
1697 | "
\n",
1698 | " \n",
1699 | " \n",
1700 | " \n",
1701 | " 0 | \n",
1702 | " MX17004 | \n",
1703 | " 2010-02-02 | \n",
1704 | " 27.3 | \n",
1705 | " 14.4 | \n",
1706 | "
\n",
1707 | " \n",
1708 | " 1 | \n",
1709 | " MX17004 | \n",
1710 | " 2010-02-03 | \n",
1711 | " 24.1 | \n",
1712 | " 14.4 | \n",
1713 | "
\n",
1714 | " \n",
1715 | " 2 | \n",
1716 | " MX17004 | \n",
1717 | " 2010-03-05 | \n",
1718 | " 32.1 | \n",
1719 | " 14.2 | \n",
1720 | "
\n",
1721 | " \n",
1722 | "
\n",
1723 | "
"
1724 | ],
1725 | "text/plain": [
1726 | "element id date tmax tmin\n",
1727 | "0 MX17004 2010-02-02 27.3 14.4\n",
1728 | "1 MX17004 2010-02-03 24.1 14.4\n",
1729 | "2 MX17004 2010-03-05 32.1 14.2"
1730 | ]
1731 | },
1732 | "execution_count": 26,
1733 | "metadata": {},
1734 | "output_type": "execute_result"
1735 | }
1736 | ],
1737 | "source": [
1738 | "# Extracting day\n",
1739 | "df[\"day\"] = df[\"day_raw\"].str.extract(\"d(\\d+)\", expand=False) \n",
1740 | "df[\"id\"] = \"MX17004\"\n",
1741 | "\n",
1742 | "# To numeric values\n",
1743 | "df[[\"year\",\"month\",\"day\"]] = df[[\"year\",\"month\",\"day\"]].apply(lambda x: pd.to_numeric(x, errors='ignore'))\n",
1744 | "\n",
1745 | "# Creating a date from the different columns\n",
1746 | "def create_date_from_year_month_day(row):\n",
1747 | " return datetime.datetime(year=row[\"year\"], month=int(row[\"month\"]), day=row[\"day\"])\n",
1748 | "\n",
1749 | "df[\"date\"] = df.apply(lambda row: create_date_from_year_month_day(row), axis=1)\n",
1750 | "df = df.drop(['year',\"month\",\"day\", \"day_raw\"], axis=1)\n",
1751 | "df = df.dropna()\n",
1752 | "\n",
1753 | "# Unmelting column \"element\"\n",
1754 | "df = df.pivot_table(index=[\"id\",\"date\"], columns=\"element\", values=\"value\")\n",
1755 | "df.reset_index(drop=False, inplace=True)\n",
1756 | "df"
1757 | ]
1758 | },
1759 | {
1760 | "cell_type": "markdown",
1761 | "metadata": {
1762 | "collapsed": true
1763 | },
1764 | "source": [
1765 | "## One type in multiple tables"
1766 | ]
1767 | },
1768 | {
1769 | "cell_type": "markdown",
1770 | "metadata": {},
1771 | "source": [
1772 | "### Baby Names in Illinois"
1773 | ]
1774 | },
1775 | {
1776 | "cell_type": "code",
1777 | "execution_count": 5,
1778 | "metadata": {
1779 | "collapsed": false
1780 | },
1781 | "outputs": [
1782 | {
1783 | "data": {
1784 | "text/html": [
1785 | "\n",
1786 | "
\n",
1787 | " \n",
1788 | " \n",
1789 | " | \n",
1790 | " rank | \n",
1791 | " name | \n",
1792 | " frequency | \n",
1793 | " sex | \n",
1794 | " year | \n",
1795 | "
\n",
1796 | " \n",
1797 | " \n",
1798 | " \n",
1799 | " 0 | \n",
1800 | " 1 | \n",
1801 | " Noah | \n",
1802 | " 837 | \n",
1803 | " Male | \n",
1804 | " 2014 | \n",
1805 | "
\n",
1806 | " \n",
1807 | " 1 | \n",
1808 | " 2 | \n",
1809 | " Alexander | \n",
1810 | " 747 | \n",
1811 | " Male | \n",
1812 | " 2014 | \n",
1813 | "
\n",
1814 | " \n",
1815 | " 2 | \n",
1816 | " 3 | \n",
1817 | " William | \n",
1818 | " 687 | \n",
1819 | " Male | \n",
1820 | " 2014 | \n",
1821 | "
\n",
1822 | " \n",
1823 | " 3 | \n",
1824 | " 4 | \n",
1825 | " Michael | \n",
1826 | " 680 | \n",
1827 | " Male | \n",
1828 | " 2014 | \n",
1829 | "
\n",
1830 | " \n",
1831 | " 4 | \n",
1832 | " 5 | \n",
1833 | " Liam | \n",
1834 | " 670 | \n",
1835 | " Male | \n",
1836 | " 2014 | \n",
1837 | "
\n",
1838 | " \n",
1839 | "
\n",
1840 | "
"
1841 | ],
1842 | "text/plain": [
1843 | " rank name frequency sex year\n",
1844 | "0 1 Noah 837 Male 2014\n",
1845 | "1 2 Alexander 747 Male 2014\n",
1846 | "2 3 William 687 Male 2014\n",
1847 | "3 4 Michael 680 Male 2014\n",
1848 | "4 5 Liam 670 Male 2014"
1849 | ]
1850 | },
1851 | "execution_count": 5,
1852 | "metadata": {},
1853 | "output_type": "execute_result"
1854 | }
1855 | ],
1856 | "source": [
1857 | "def extract_year(string):\n",
1858 | " match = re.match(\".+(\\d{4})\", string) \n",
1859 | " if match != None: return match.group(1)\n",
1860 | " \n",
1861 | "path = './data'\n",
1862 | "allFiles = glob.glob(path + \"/201*-baby-names-illinois.csv\")\n",
1863 | "frame = pd.DataFrame()\n",
1864 | "df_list= []\n",
1865 | "for file_ in allFiles:\n",
1866 | " df = pd.read_csv(file_,index_col=None, header=0)\n",
1867 | " df.columns = map(str.lower, df.columns)\n",
1868 | " df[\"year\"] = extract_year(file_)\n",
1869 | " df_list.append(df)\n",
1870 | " \n",
1871 | "df = pd.concat(df_list)\n",
1872 | "df.head(5)"
1873 | ]
1874 | }
1875 | ],
1876 | "metadata": {
1877 | "kernelspec": {
1878 | "display_name": "Python 2",
1879 | "language": "python",
1880 | "name": "python2"
1881 | },
1882 | "language_info": {
1883 | "codemirror_mode": {
1884 | "name": "ipython",
1885 | "version": 2
1886 | },
1887 | "file_extension": ".py",
1888 | "mimetype": "text/x-python",
1889 | "name": "python",
1890 | "nbconvert_exporter": "python",
1891 | "pygments_lexer": "ipython2",
1892 | "version": "2.7.11"
1893 | }
1894 | },
1895 | "nbformat": 4,
1896 | "nbformat_minor": 1
1897 | }
1898 |
--------------------------------------------------------------------------------
/data/2014-baby-names-illinois.csv:
--------------------------------------------------------------------------------
1 | rank,name,frequency,sex
2 | 1,Noah,837,Male
3 | 2,Alexander,747,Male
4 | 3,William,687,Male
5 | 4,Michael,680,Male
6 | 5,Liam,670,Male
7 | 6,Jacob,654,Male
8 | 7,Benjamin,649,Male
9 | 8,Mason,604,Male
10 | 9,Daniel,593,Male
11 | 10,Logan,593,Male
12 | 11,Ethan,579,Male
13 | 12,Anthony,564,Male
14 | 13,Aiden,535,Male
15 | 14,Jayden,530,Male
16 | 15,Joseph,498,Male
17 | 16,James,486,Male
18 | 17,Lucas,481,Male
19 | 18,Henry,477,Male
20 | 19,Jackson,469,Male
21 | 20,David,467,Male
22 | 21,Nathan,446,Male
23 | 22,Elijah,445,Male
24 | 23,Matthew,436,Male
25 | 24,Andrew,433,Male
26 | 25,John,432,Male
27 | 26,Isaac,419,Male
28 | 27,Dylan,417,Male
29 | 28,Jack,411,Male
30 | 29,Joshua,411,Male
31 | 30,Owen,411,Male
32 | 31,Julian,407,Male
33 | 32,Gabriel,399,Male
34 | 33,Ryan,399,Male
35 | 34,Oliver,396,Male
36 | 35,Carter,393,Male
37 | 36,Sebastian,380,Male
38 | 37,Charles,377,Male
39 | 38,Luke,371,Male
40 | 39,Jonathan,364,Male
41 | 40,Samuel,353,Male
42 | 41,Christopher,340,Male
43 | 42,Evan,326,Male
44 | 43,Connor,314,Male
45 | 44,Caleb,313,Male
46 | 45,Christian,308,Male
47 | 46,Thomas,306,Male
48 | 47,Nicholas,300,Male
49 | 48,Wyatt,300,Male
50 | 49,Hunter,298,Male
51 | 50,Adrian,288,Male
52 | 51,Angel,285,Male
53 | 52,Cameron,278,Male
54 | 53,Aaron,274,Male
55 | 54,Landon,266,Male
56 | 55,Nolan,266,Male
57 | 56,Jordan,263,Male
58 | 57,Kevin,261,Male
59 | 58,Gavin,258,Male
60 | 59,Adam,249,Male
61 | 60,Brandon,247,Male
62 | 61,Eli,237,Male
63 | 62,Parker,235,Male
64 | 63,Isaiah,234,Male
65 | 64,Jaxon,232,Male
66 | 65,Levi,232,Male
67 | 66,Tyler,227,Male
68 | 67,Dominic,224,Male
69 | 68,Josiah,224,Male
70 | 69,Jeremiah,222,Male
71 | 70,Austin,220,Male
72 | 71,Robert,210,Male
73 | 72,Cooper,204,Male
74 | 73,Leonardo,203,Male
75 | 74,Ian,202,Male
76 | 75,Blake,200,Male
77 | 76,Brayden,200,Male
78 | 77,Camden,200,Male
79 | 78,Zachary,198,Male
80 | 79,Damian,197,Male
81 | 80,Jace,196,Male
82 | 81,Vincent,196,Male
83 | 82,Ayden,192,Male
84 | 83,Leo,191,Male
85 | 84,Chase,189,Male
86 | 85,Colton,188,Male
87 | 86,Grayson,186,Male
88 | 87,Lincoln,186,Male
89 | 88,Mateo,185,Male
90 | 89,Jose,182,Male
91 | 90,Maxwell,181,Male
92 | 91,Giovanni,178,Male
93 | 92,Jason,177,Male
94 | 93,Kayden,175,Male
95 | 94,Nathaniel,175,Male
96 | 95,Miles,174,Male
97 | 96,Patrick,173,Male
98 | 97,Max,170,Male
99 | 98,Brody,168,Male
100 | 99,Jaxson,168,Male
101 | 100,George,166,Male
102 | 101,Theodore,166,Male
--------------------------------------------------------------------------------
/data/2015-baby-names-illinois.csv:
--------------------------------------------------------------------------------
1 | rank,name,frequency,sex
2 | 1,Noah,863,Male
3 | 2,Liam,709,Male
4 | 3,Alexander,703,Male
5 | 4,Jacob,650,Male
6 | 5,William,618,Male
7 | 6,Michael,617,Male
8 | 7,Benjamin,616,Male
9 | 8,Daniel,601,Male
10 | 9,Mason,594,Male
11 | 10,James,576,Male
12 | 11,Logan,568,Male
13 | 12,Ethan,560,Male
14 | 13,Aiden,547,Male
15 | 14,Anthony,524,Male
16 | 15,Henry,514,Male
17 | 16,Oliver,502,Male
18 | 17,Jayden,480,Male
19 | 18,Lucas,471,Male
20 | 19,Matthew,449,Male
21 | 20,Jackson,447,Male
22 | 21,Owen,446,Male
23 | 22,Sebastian,433,Male
24 | 23,Carter,429,Male
25 | 24,Joseph,427,Male
26 | 25,Isaac,421,Male
27 | 26,Elijah,413,Male
28 | 27,John,412,Male
29 | 28,Dylan,403,Male
30 | 29,David,402,Male
31 | 30,Julian,397,Male
32 | 31,Jack,384,Male
33 | 32,Nathan,382,Male
34 | 33,Samuel,379,Male
35 | 34,Andrew,377,Male
36 | 35,Gabriel,375,Male
37 | 36,Joshua,363,Male
38 | 37,Christopher,349,Male
39 | 38,Ryan,335,Male
40 | 39,Caleb,331,Male
41 | 40,Jonathan,328,Male
42 | 41,Charles,320,Male
43 | 42,Luke,308,Male
44 | 43,Wyatt,301,Male
45 | 44,Christian,299,Male
46 | 45,Thomas,299,Male
47 | 46,Dominic,286,Male
48 | 47,Cameron,280,Male
49 | 48,Adrian,279,Male
50 | 49,Nolan,279,Male
51 | 50,Angel,278,Male
52 | 51,Nicholas,274,Male
53 | 52,Connor,269,Male
54 | 53,Levi,268,Male
55 | 54,Hunter,267,Male
56 | 55,Landon,265,Male
57 | 56,Mateo,262,Male
58 | 57,Aaron,260,Male
59 | 58,Grayson,258,Male
60 | 59,Adam,256,Male
61 | 60,Isaiah,254,Male
62 | 61,Jordan,250,Male
63 | 62,Evan,249,Male
64 | 63,Leonardo,246,Male
65 | 64,Leo,244,Male
66 | 65,Jaxon,241,Male
67 | 66,Gavin,236,Male
68 | 67,Josiah,235,Male
69 | 68,Eli,223,Male
70 | 69,Theodore,220,Male
71 | 70,Lincoln,218,Male
72 | 71,Brandon,217,Male
73 | 72,Tyler,215,Male
74 | 73,Brayden,208,Male
75 | 74,Austin,205,Male
76 | 75,Robert,205,Male
77 | 76,Emmett,204,Male
78 | 77,Parker,204,Male
79 | 78,Jeremiah,200,Male
80 | 79,Kevin,200,Male
81 | 80,Colton,199,Male
82 | 81,Ian,199,Male
83 | 82,Vincent,198,Male
84 | 83,Zachary,197,Male
85 | 84,Chase,193,Male
86 | 85,Ayden,187,Male
87 | 86,Cooper,186,Male
88 | 87,Easton,183,Male
89 | 88,Declan,182,Male
90 | 89,Jaxson,180,Male
91 | 90,Xavier,175,Male
92 | 91,Jace,172,Male
93 | 92,Damian,171,Male
94 | 93,Jose,171,Male
95 | 94,Kayden,171,Male
96 | 95,Patrick,169,Male
97 | 96,Giovanni,168,Male
98 | 97,Hudson,167,Male
99 | 98,Camden,165,Male
100 | 99,Max,164,Male
101 | 100,Maxwell,155,Male
--------------------------------------------------------------------------------
/data/billboard.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nickhould/tidy-data-python/2ee13434796f7ef2a0674870c651b1ca1e1b9597/data/billboard.csv
--------------------------------------------------------------------------------
/data/pew-raw.csv:
--------------------------------------------------------------------------------
1 | religion, <$10k, $10-20k,$20-30k,$30-40k, $40-50k,$50-75k
2 | Agnostic,27,34,60,81,76,137
3 | Atheist,12,27,37,52,35,70
4 | Buddhist,27,21,30,34,33,58
5 | Catholic,418,617,732,670,638,1116
6 | Dont know/refused,15,14,15,11,10,35
7 | Evangelical Prot ,575,869,1064,982,881,1486
8 | Hindu ,1,9,7,9,11,34
9 | Historically Black Prot ,228,244,236,238,197,223
10 | Jehovahs Witness ,20,27,24,24,21,30
11 | Jewish ,19,19,25,25,30,95
--------------------------------------------------------------------------------
/data/tb-raw.csv:
--------------------------------------------------------------------------------
1 | country,year,m014,m1524,m2534,m3544,m4554,m5564,m65,mu,f014
2 | AD,2000,0,0,1,0,0,0,0,,
3 | AE,2000,2,4,4,6,5,12,10,,3
4 | AF,2000,52,228,183,149,129,94,80,,93
5 | AG,2000,0,0,0,0,0,0,1,,1
6 | AL,2000,2,19,21,14,24,19,16,,3
7 | AM,2000,2,152,130,131,63,26,21,,1
8 | AN,2000,0,0,1,2,0,0,0,,0
9 | AO,2000,186,999,1003,912,482,312,194,,247
10 | AR,2000,97,278,594,402,419,368,330,,121
11 | AS,2000,,,,,1,1,,,
--------------------------------------------------------------------------------
/data/weather-raw.csv:
--------------------------------------------------------------------------------
1 | id,year,month,element,d1,d2,d3,d4,d5,d6,d7,d8
2 | MX17004,2010,1,tmax,,,,,,,,
3 | MX17004,2010,1,tmin,,,,,,,,
4 | MX17004 ,2010,2,tmax,,27.3,24.1,,,,,
5 | MX17004,2010,2,tmin,,14.4,14.4,,,,,
6 | MX17004,2010,3,tmax,,,,,32.1,,,
7 | MX17004,2010,3,tmin,,,,,14.2,,,
8 | MX17004,2010,4,tmax,,,,,,,,
9 | MX17004,2010,4,tmin,,,,,,,,
10 | MX17004,2010,5,tmax,,,,,,,,
11 | MX17004,2010,5,tmin,,,,,,,,
--------------------------------------------------------------------------------