├── .gitignore
├── Creating Datasets.ipynb
├── README.md
├── Tidy Data in Python.ipynb
├── chapter1.md
├── course.yml
├── datasets
    ├── df1.csv
    ├── df2.csv
    ├── eyes.csv
    ├── lunch.csv
    └── messy.csv
├── img
    ├── author_image.png
    └── shield_image.png
└── requirements.sh


/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_STORE
2 | .cache
3 | .ipynb_checkpoints
4 | .spyderproject
5 | 


--------------------------------------------------------------------------------
/Creating Datasets.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import pandas as pd\n",
 12 |     "import numpy as np"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 16,
 18 |    "metadata": {
 19 |     "collapsed": false
 20 |    },
 21 |    "outputs": [
 22 |     {
 23 |      "name": "stdout",
 24 |      "output_type": "stream",
 25 |      "text": [
 26 |       "  First     Last  Treatment A  Treatment B\n",
 27 |       "0  John    Smith          NaN            2\n",
 28 |       "1  Jane      Doe         16.0           11\n",
 29 |       "2  Mary  Johnson          3.0            1\n"
 30 |      ]
 31 |     }
 32 |    ],
 33 |    "source": [
 34 |     "messy = pd.DataFrame({'First' : ['John', 'Jane', 'Mary'], \n",
 35 |     "                      'Last' : ['Smith', 'Doe', 'Johnson'], \n",
 36 |     "                      'Treatment A' : [np.nan, 16, 3], \n",
 37 |     "                      'Treatment B' : [2, 11, 1]})\n",
 38 |     "print(messy)"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": 8,
 44 |    "metadata": {
 45 |     "collapsed": false
 46 |    },
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "messy.to_csv(path_or_buf='datasets/messy.csv',sep=',',index=False)"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": 12,
 55 |    "metadata": {
 56 |     "collapsed": false
 57 |    },
 58 |    "outputs": [
 59 |     {
 60 |      "name": "stdout",
 61 |      "output_type": "stream",
 62 |      "text": [
 63 |       "   year  avg_free  avg_reduced  avg_full\n",
 64 |       "0  1969       2.9          0.0      16.5\n",
 65 |       "1  1970       4.6          0.0      17.8\n",
 66 |       "2  1971       5.8          0.5      17.8\n",
 67 |       "3  1972       7.3          0.5      16.6\n",
 68 |       "4  1973       8.1          0.5      16.1\n",
 69 |       "5  1974       8.6          0.5      15.5\n"
 70 |      ]
 71 |     }
 72 |    ],
 73 |    "source": [
 74 |     "lunch = pd.read_csv('datasets/lunch.csv',sep=',',nrows=6)\n",
 75 |     "lunch.drop(lunch.columns[[4, 5, 6]], axis=1, inplace=True)\n",
 76 |     "print(lunch)"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 13,
 82 |    "metadata": {
 83 |     "collapsed": false
 84 |    },
 85 |    "outputs": [],
 86 |    "source": [
 87 |     "lunch.to_csv('datasets/df2.csv',sep=',', index=False)"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": 17,
 93 |    "metadata": {
 94 |     "collapsed": false
 95 |    },
 96 |    "outputs": [
 97 |     {
 98 |      "name": "stdout",
 99 |      "output_type": "stream",
100 |      "text": [
101 |       "        Name  Brown  Blue  Black\n",
102 |       "0     Esther      0     1      0\n",
103 |       "1  Elizabeth      1     0      0\n",
104 |       "2   Michelle      0     0      1\n"
105 |      ]
106 |     }
107 |    ],
108 |    "source": [
109 |     "eye_color = pd.DataFrame({'Name' : ['Esther', 'Elizabeth', 'Michelle'], \n",
110 |     "                      'Brown' : [0, 1, 0], \n",
111 |     "                      'Blue' : [1, 0, 0],\n",
112 |     "                      'Black':[0, 0, 1]})\n",
113 |     "eye_color = eye_color[['Name','Brown','Blue','Black']]\n",
114 |     "print(eye_color)"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 18,
120 |    "metadata": {
121 |     "collapsed": true
122 |    },
123 |    "outputs": [],
124 |    "source": [
125 |     "eye_color.to_csv('datasets/eye_color.csv', sep=',', index=False)"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": null,
131 |    "metadata": {
132 |     "collapsed": true
133 |    },
134 |    "outputs": [],
135 |    "source": []
136 |   }
137 |  ],
138 |  "metadata": {
139 |   "anaconda-cloud": {},
140 |   "kernelspec": {
141 |    "display_name": "Python [Root]",
142 |    "language": "python",
143 |    "name": "Python [Root]"
144 |   },
145 |   "language_info": {
146 |    "codemirror_mode": {
147 |     "name": "ipython",
148 |     "version": 3
149 |    },
150 |    "file_extension": ".py",
151 |    "mimetype": "text/x-python",
152 |    "name": "python",
153 |    "nbconvert_exporter": "python",
154 |    "pygments_lexer": "ipython3",
155 |    "version": "3.5.1"
156 |   }
157 |  },
158 |  "nbformat": 4,
159 |  "nbformat_minor": 0
160 | }
161 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tidy Data in Python: mini-course
2 | 
3 | Source files for the Tidy Data in Python mini-course.
4 | 
5 | You can take the course at www.datacamp.com/courses/1273
6 | 


--------------------------------------------------------------------------------
/Tidy Data in Python.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# Tidy Data in Python"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "## Tidy data and messy data"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 4,
 22 |    "metadata": {
 23 |     "collapsed": false
 24 |    },
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "import pandas as pd"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 5,
 33 |    "metadata": {
 34 |     "collapsed": false
 35 |    },
 36 |    "outputs": [
 37 |     {
 38 |      "name": "stdout",
 39 |      "output_type": "stream",
 40 |      "text": [
 41 |       "  First     Last  Treatment A  Treatment B\n",
 42 |       "0  John    Smith          NaN            2\n",
 43 |       "1  Jane      Doe         16.0           11\n",
 44 |       "2  Mary  Johnson          3.0            1\n"
 45 |      ]
 46 |     }
 47 |    ],
 48 |    "source": [
 49 |     "#example showing a violation of rule 2\n",
 50 |     "messy = pd.read_csv('datasets/messy.csv', sep=',')\n",
 51 |     "print(messy)"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 10,
 57 |    "metadata": {
 58 |     "collapsed": false
 59 |    },
 60 |    "outputs": [
 61 |     {
 62 |      "name": "stdout",
 63 |      "output_type": "stream",
 64 |      "text": [
 65 |       "      owner  type  num\n",
 66 |       "0     Jason   dog    2\n",
 67 |       "1     Jason   cat    4\n",
 68 |       "2     Jason  bird    3\n",
 69 |       "3      Lisa   dog    7\n",
 70 |       "4      Lisa   cat   10\n",
 71 |       "5      Lisa  bird    9\n",
 72 |       "6  Terrence   dog    8\n",
 73 |       "7  Terrence   cat    5\n",
 74 |       "8  Terrence  bird    1\n",
 75 |       "\n",
 76 |       "   year  avg_free  avg_reduced  avg_full\n",
 77 |       "0  1969       2.9          0.0      16.5\n",
 78 |       "1  1970       4.6          0.0      17.8\n",
 79 |       "2  1971       5.8          0.5      17.8\n",
 80 |       "3  1972       7.3          0.5      16.6\n",
 81 |       "4  1973       8.1          0.5      16.1\n",
 82 |       "5  1974       8.6          0.5      15.5\n"
 83 |      ]
 84 |     }
 85 |    ],
 86 |    "source": [
 87 |     "#two datasets for multiple choice question\n",
 88 |     "df1 = pd.read_csv('datasets/df1.csv', sep = ',')\n",
 89 |     "df2 = pd.read_csv('datasets/df2.csv', sep = ',')\n",
 90 |     "\n",
 91 |     "print(df1)\n",
 92 |     "print()\n",
 93 |     "print(df2)"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "markdown",
 98 |    "metadata": {},
 99 |    "source": [
100 |     "Which dataset is messy and why?\n",
101 |     "\n",
102 |     "A. df1 is messy because ..."
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {},
108 |    "source": [
109 |     "## Melt"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 14,
115 |    "metadata": {
116 |     "collapsed": false
117 |    },
118 |    "outputs": [
119 |     {
120 |      "name": "stdout",
121 |      "output_type": "stream",
122 |      "text": [
123 |       "    year     variable  value\n",
124 |       "0   1969     avg_free    2.9\n",
125 |       "1   1970     avg_free    4.6\n",
126 |       "2   1971     avg_free    5.8\n",
127 |       "3   1972     avg_free    7.3\n",
128 |       "4   1973     avg_free    8.1\n",
129 |       "5   1974     avg_free    8.6\n",
130 |       "6   1969  avg_reduced    0.0\n",
131 |       "7   1970  avg_reduced    0.0\n",
132 |       "8   1971  avg_reduced    0.5\n",
133 |       "9   1972  avg_reduced    0.5\n",
134 |       "10  1973  avg_reduced    0.5\n",
135 |       "11  1974  avg_reduced    0.5\n",
136 |       "12  1969     avg_full   16.5\n",
137 |       "13  1970     avg_full   17.8\n",
138 |       "14  1971     avg_full   17.8\n",
139 |       "15  1972     avg_full   16.6\n",
140 |       "16  1973     avg_full   16.1\n",
141 |       "17  1974     avg_full   15.5\n"
142 |      ]
143 |     }
144 |    ],
145 |    "source": [
146 |     "df2_tidy = pd.melt(df2, id_vars=['year'])\n",
147 |     "print(df2_tidy)"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "markdown",
152 |    "metadata": {},
153 |    "source": [
154 |     "## Renaming Columns"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": 16,
160 |    "metadata": {
161 |     "collapsed": false
162 |    },
163 |    "outputs": [
164 |     {
165 |      "name": "stdout",
166 |      "output_type": "stream",
167 |      "text": [
168 |       "    year lunch option  people\n",
169 |       "0   1969     avg_free     2.9\n",
170 |       "1   1970     avg_free     4.6\n",
171 |       "2   1971     avg_free     5.8\n",
172 |       "3   1972     avg_free     7.3\n",
173 |       "4   1973     avg_free     8.1\n",
174 |       "5   1974     avg_free     8.6\n",
175 |       "6   1969  avg_reduced     0.0\n",
176 |       "7   1970  avg_reduced     0.0\n",
177 |       "8   1971  avg_reduced     0.5\n",
178 |       "9   1972  avg_reduced     0.5\n",
179 |       "10  1973  avg_reduced     0.5\n",
180 |       "11  1974  avg_reduced     0.5\n",
181 |       "12  1969     avg_full    16.5\n",
182 |       "13  1970     avg_full    17.8\n",
183 |       "14  1971     avg_full    17.8\n",
184 |       "15  1972     avg_full    16.6\n",
185 |       "16  1973     avg_full    16.1\n",
186 |       "17  1974     avg_full    15.5\n"
187 |      ]
188 |     }
189 |    ],
190 |    "source": [
191 |     "df2_tidy.rename(columns = {'variable':'lunch option','value':'people'}, inplace = True)\n",
192 |     "print(df2_tidy)"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "metadata": {},
198 |    "source": [
199 |     "## More Messiness"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 18,
205 |    "metadata": {
206 |     "collapsed": false
207 |    },
208 |    "outputs": [
209 |     {
210 |      "name": "stdout",
211 |      "output_type": "stream",
212 |      "text": [
213 |       "        Name  Brown  Blue  Black\n",
214 |       "0     Esther      0     1      0\n",
215 |       "1  Elizabeth      1     0      0\n",
216 |       "2   Michelle      0     0      1\n"
217 |      ]
218 |     }
219 |    ],
220 |    "source": [
221 |     "eye_color = pd.read_csv('datasets/eye_color.csv',sep=',')\n",
222 |     "print(eye_color)"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "markdown",
227 |    "metadata": {},
228 |    "source": [
229 |     "What rule does this dataset violate?"
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "markdown",
234 |    "metadata": {},
235 |    "source": [
236 |     "## Deal with it!"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "code",
241 |    "execution_count": 39,
242 |    "metadata": {
243 |     "collapsed": false
244 |    },
245 |    "outputs": [
246 |     {
247 |      "name": "stdout",
248 |      "output_type": "stream",
249 |      "text": [
250 |       "        Name variable  value\n",
251 |       "0     Esther    Brown      0\n",
252 |       "1  Elizabeth    Brown      1\n",
253 |       "2   Michelle    Brown      0\n",
254 |       "3     Esther     Blue      1\n",
255 |       "4  Elizabeth     Blue      0\n",
256 |       "5   Michelle     Blue      0\n",
257 |       "6     Esther    Black      0\n",
258 |       "7  Elizabeth    Black      0\n",
259 |       "8   Michelle    Black      1\n"
260 |      ]
261 |     }
262 |    ],
263 |    "source": [
264 |     "eye_color_tidy = pd.melt(eye_color, id_vars = ['Name'])\n",
265 |     "print(eye_color_tidy)"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "markdown",
270 |    "metadata": {},
271 |    "source": [
272 |     "## Further Cleaning"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": 40,
278 |    "metadata": {
279 |     "collapsed": false,
280 |     "scrolled": true
281 |    },
282 |    "outputs": [
283 |     {
284 |      "name": "stdout",
285 |      "output_type": "stream",
286 |      "text": [
287 |       "        Name variable  value\n",
288 |       "1  Elizabeth    Brown      1\n",
289 |       "3     Esther     Blue      1\n",
290 |       "8   Michelle    Black      1\n",
291 |       "\n",
292 |       "        Name variable\n",
293 |       "1  Elizabeth    Brown\n",
294 |       "3     Esther     Blue\n",
295 |       "8   Michelle    Black\n",
296 |       "\n",
297 |       "        Name eye color\n",
298 |       "1  Elizabeth     Brown\n",
299 |       "3     Esther      Blue\n",
300 |       "8   Michelle     Black\n"
301 |      ]
302 |     }
303 |    ],
304 |    "source": [
305 |     "eye_color_tidy = eye_color_tidy[eye_color_tidy.value == 1]\n",
306 |     "print(eye_color_tidy)\n",
307 |     "\n",
308 |     "del eye_color_tidy['value']\n",
309 |     "print()\n",
310 |     "print(eye_color_tidy)\n",
311 |     "\n",
312 |     "eye_color_tidy.rename(columns = {'variable':'eye color'}, inplace = True)\n",
313 |     "print()\n",
314 |     "print(eye_color_tidy)"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": null,
320 |    "metadata": {
321 |     "collapsed": true
322 |    },
323 |    "outputs": [],
324 |    "source": []
325 |   },
326 |   {
327 |    "cell_type": "code",
328 |    "execution_count": null,
329 |    "metadata": {
330 |     "collapsed": true
331 |    },
332 |    "outputs": [],
333 |    "source": []
334 |   }
335 |  ],
336 |  "metadata": {
337 |   "anaconda-cloud": {},
338 |   "kernelspec": {
339 |    "display_name": "Python [Root]",
340 |    "language": "python",
341 |    "name": "Python [Root]"
342 |   },
343 |   "language_info": {
344 |    "codemirror_mode": {
345 |     "name": "ipython",
346 |     "version": 3
347 |    },
348 |    "file_extension": ".py",
349 |    "mimetype": "text/x-python",
350 |    "name": "python",
351 |    "nbconvert_exporter": "python",
352 |    "pygments_lexer": "ipython3",
353 |    "version": "3.5.1"
354 |   }
355 |  },
356 |  "nbformat": 4,
357 |  "nbformat_minor": 0
358 | }
359 | 


--------------------------------------------------------------------------------
/chapter1.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title       : Tidy Data in Python
  3 | description : It is often said that data scientists spend only 20% of their time analyzing their data, and 80% of their time cleaning it. Indeed, maintaining a tidy, easy-to-use dataset is crucial in our age of big data. In the paper Tidy Data, veteran statistician Hadley Wickham gives definitions of tidy and messy data so that all data scientists can keep their work organized. In this mini-course, you'll learn to transform messy datasets into tidy datasets using the pandas package in Python. Let's get started!
  4 | 
  5 | attachments :
  6 | 
  7 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2 key:7ad68bd87f
  8 | ## Tidy Data and Messy Data
  9 | 
 10 | What exactly marks the difference between *tidy* data and *messy* data? It is not only how organized and intuitive the datasets look to our human eyes, but also how easily and efficiently they can be processed by computers. In his seminal paper [Tidy Data](https://www.jstatsoft.org/article/view/v059i10), Hadley Wickham proposed three standards for tidy data:
 11 | 
 12 | 1. Each variable forms a column
 13 | 2. Each observation forms a row
 14 | 3. Each type of observational unit forms a table.
 15 | 
 16 | In this course, we'll focus on the first two rules and show you how we can use the Python package [pandas](http://pandas.pydata.org/) to deal with datasets violating them. To get started, execute `messy` in the IPython shell. This dataset, which appears in Wickham's paper, shows the number of people who choose either of two treatments in a hospital. Observe its structure in comparison with Wickham's rules. This dataset is *messy* because it violates rule #2: it combines Treatment A and Treatment B, two distinct observations, in a single row.
 17 | 
 18 | Now let's look at two more datasets. Execute `df1` and `df2` in your IPython shell to check out two other preloaded datasets, both featured in DataCamp's [*Cleaning Data in R*](https://campus.datacamp.com/courses/cleaning-data-in-r) course. The former shows the type and number of pets owned by three co-workers, and the latter shows the average BMI in three countries over several years. Which one of these datasets is messy, and why?
 19 | 
 20 | *** =instructions
 21 | - df1 is messy because it violates rule #1.
 22 | - df1 is messy because it violates rule #2.
 23 | - df2 is messy because it violates rule #2.
 24 |  
 25 | *** =hint
 26 | What are the observations and variables in these two datasets?
 27 | 
 28 | *** =pre_exercise_code
 29 | ```{r}
 30 | # The pre exercise code runs code to initialize the user's workspace.
 31 | import pandas as pd
 32 | url1 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1273/datasets/df1.csv'
 33 | url2 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1273/datasets/df2.csv'
 34 | url3 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1273/datasets/messy.csv'
 35 | 
 36 | df1 = pd.read_csv(url1, sep = ',')
 37 | df2 = pd.read_csv(url2, sep = ',')
 38 | messy = pd.read_csv(url3, sep=',')
 39 | ```
 40 | 
 41 | *** =sct
 42 | ```{r}
 43 | # SCT written with pythonwhat: https://github.com/datacamp/pythonwhat/wiki
 44 | 
 45 | msg_1 = "In `df1`, `dogs`, `cats`, `birds` are each a variable so it does not violate rule #1."
 46 | msg_2 = "The three people in `df1` represent three different observations so it does not violate rule #1."
 47 | msg_3 = "Exactly! See detailed explanation in next exercise."
 48 | test_mc(3, [msg_1, msg_2, msg_3])
 49 | ```
 50 | 
 51 | --- type:NormalExercise lang:python xp:100 skills:2 key:431ad8bd98
 52 | ## Using Melt to Tidy Data
 53 | 
 54 | In `df2`, the columns `Y1980`, `Y1981`, `Y1982`, and `Y1983` mark the years when BMI is observed. Thus, for each country they represent four different observations and should be separated into four rows. A great tool to achieve this is the melt function in the pandas package. Its basic syntax is `pd.melt(df, id_vars = lst)`, where `df` is the name of the dataframe we're dealing with and `lst` is a list of all the columns that we want to keep as columns. All the other columns will be "melted" together in different rows. To get a more concrete idea, try `melt` yourself to *tidy* the dataset `df2`!
 55 | 
 56 | *** =instructions
 57 | - Import `pandas` using the alias `pd`.
 58 | - Melt `df2`! We want to maintain the `Country` column and melt all the rest.
 59 | - Click "Submit" to print out the new melted DataFrame.
 60 | 
 61 | *** =hint
 62 | - To import package x with the alias y, use the command `import x as y`.
 63 | - `id_vars` should be `['Country']`.
 64 | - You don't need to change the code we provided for you.
 65 | 
 66 | *** =pre_exercise_code
 67 | ```{python}
 68 | import pandas as pd
 69 | url2 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1273/datasets/df2.csv'
 70 | df2 = pd.read_csv(url2, sep = ',')
 71 | ```
 72 | 
 73 | *** =sample_code
 74 | ```{python}
 75 | # Import pandas as pd
 76 | 
 77 | 
 78 | # Melt df2 into new dataframe: df2_melted
 79 | df2_melted = ____
 80 | 
 81 | # print df2_melted
 82 | print(df2_melted)
 83 | ```
 84 | 
 85 | *** =solution
 86 | ```{python}
 87 | # Import pandas as pd
 88 | import pandas as pd
 89 | 
 90 | # Melt df2 into new dataframe: df2_melted
 91 | df2_melted = pd.melt(df2, id_vars=['Country'])
 92 | 
 93 | # print df2_melted
 94 | print(df2_melted)
 95 | ```
 96 | 
 97 | *** =sct
 98 | ```{python}
 99 | test_import("pandas")
100 | test_correct(
101 |     lambda: test_object("df2_melted"),
102 |     lambda: test_function("pandas.melt",
103 |      not_called_msg="Make sure to call the function `pd.melt()`.",
104 |      incorrect_msg="Did you pass the correct arguments to `pd.melt()`?")
105 | )
106 | test_function("print", incorrect_msg="Don't change any code we provided!")
107 | success_msg("Great job!")
108 | ```
109 | 
110 | --- type:NormalExercise lang:python xp:100 skills:2 key:3be71779cd
111 | ## Renaming Columns
112 | 
113 | See how easy that was? You tidied up your dataset with a single command! Now we just need a bit more fine-tuning. Change the column names with pandas' rename function. Its syntax is `df.rename(columns = d, inplace = False)`, where `d` is a dictionary where the keys are the columns you want to change, and the values are the new names for these columns. The code `inplace = False` means the result would be stored in a new DataFrame instead of the original one.
114 | 
115 | *** =instructions
116 | - Rename the `variable` column of `df2_melted` to `Year` and the `value` column to `Income`.
117 | - Click "Submit Answer" to print out the new tidy DataFrame.
118 | 
119 | *** =hint
120 | - Here `d` should be `{'variable':'Year','value':'Income'}`.
121 | - You don't need to change the code we provided for you.
122 | 
123 | *** =pre_exercise_code
124 | ```{python}
125 | #import pandas
126 | import pandas as pd
127 | url2 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1273/datasets/df2.csv'
128 | df2 = pd.read_csv(url2, sep = ',')
129 | df2_melted = pd.melt(df2, id_vars = ['Country'])
130 | ```
131 | 
132 | *** =sample_code
133 | ```{python}
134 | # Import pandas
135 | import pandas as pd
136 | 
137 | # Rename the columns of df2_melted: df2_tidy
138 | df2_tidy = ____
139 | 
140 | # Print out df2_tidy
141 | print(df2_tidy)
142 | ```
143 | 
144 | *** =solution
145 | ```{python}
146 | # Import pandas
147 | import pandas as pd
148 | 
149 | # Rename the columns of df2_melted: df2_tidy
150 | df2_tidy = df2_melted.rename(columns = {'variable': 'Year', 'value': 'Income'}, inplace = False)
151 | 
152 | # Print out df2_tidy
153 | print(df2_tidy)
154 | ```
155 | 
156 | *** =sct
157 | ```{python}
158 | test_import("pandas")
159 | test_correct(
160 |     lambda: test_object("df2_tidy"),
161 |     lambda: test_function("df2_melted.rename",     
162 |      not_called_msg="Make sure to call the function `df2_melted.rename()`.",
163 |      incorrect_msg="Did you pass the correct arguments to `df2_melted.rename()`?"
164 |     )
165 | )
166 | test_function("print", incorrect_msg="Don't change any code we provided!")
167 | success_msg("Great job!")
168 | ```
169 | 
170 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2 key:d40684ea0d
171 | ## More messiness
172 | 
173 | Great job! Now that you're familiar with messy and tidy data, let's take a look at another dataset. Execute `eyes` in your shell to print a dataset that was featured in DataCamp's [Cleaning Data in R course](https://campus.datacamp.com/courses/cleaning-data-in-r). This dataset is about the eye colors of three women and whether or not they wear glasses. What problem does this dataset have?
174 | 
175 | *** =instructions
176 | - It violates rule #1 of tidy data: there are several columns that represent the same variable.
177 | - It violates rule #1 of tidy data: there are several variables represented in the same column.
178 | - It violates rule #2 of tidy data: there are several rows that represent the same observation.
179 | - It violates rule #2 of tidy data: there are several observations represented in the same row.
180 | 
181 | *** =hint
182 | Think about what the chart wants to show and how the columns relate to it!
183 | 
184 | *** =pre_exercise_code
185 | ```{r}
186 | # The pre exercise code runs code to initialize the user's workspace.
187 | # You can use it to load packages, initialize datasets and draw a plot in the viewer
188 | 
189 | import pandas as pd
190 | url4 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1273/datasets/eyes.csv'
191 | eyes = pd.read_csv(url4,sep=',')
192 | ```
193 | 
194 | *** =sct
195 | ```{r}
196 | # SCT written with pythonwhat: https://github.com/datacamp/pythonwhat/wiki
197 | 
198 | msg_2 = "No column represents more than one variable."
199 | msg_3 = "Each person is one observation and is correctly charted as one observation."
200 | msg_success = "Exactly!"
201 | test_mc(1, [msg_success, msg_2, msg_3, msg_3])
202 | ```
203 | 
204 | --- type:NormalExercise lang:python xp:100 skills:2 key:5d0f6f3efd
205 | ## Deal with it!
206 | 
207 | In the previous exercise, the three columns--`Black`, `Blue`, and `Brown`--represent the same variable: eye color. It would make much more sense to merge them into one column. Use `melt` to do it!
208 | 
209 | *** =instructions
210 | - Use `melt` to leave `Name` and `Wear_Glasses` intact and combine everything else.
211 | - Rename the `variable` column to `Eye_Color`.
212 | - Hit "Submit Answer" to print out the resulting dataframe.
213 | 
214 | *** =hint
215 | - The basic syntax for melt is `pd.melt(df, id_vars = lst)`. Here `lst` should be `['Name', 'Wear_Glasses']`.
216 | - The basic syntax for rename is `df.rename(columns = d, inplace = False)`. Here `d` should be the dictionary `{'variable': 'Eye_Color'}`.
217 | - You don't need to change the code we provided for you.
218 | 
219 | *** =pre_exercise_code
220 | ```{python}
221 | import pandas as pd
222 | url4 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1273/datasets/eyes.csv'
223 | eyes = pd.read_csv(url4,sep=',')
224 | ```
225 | 
226 | *** =sample_code
227 | ```{python}
228 | # Import pandas
229 | import pandas as pd
230 | 
231 | # Melt the Black, Blue, and Brown columns of eyes: eyes_melted
232 | eyes_melted = ____
233 | 
234 | # Rename the variable column and save to eyes_renamed
235 | eyes_renamed = ____
236 | 
237 | # print out eyes_renamed
238 | print(eyes_renamed)
239 | ```
240 | 
241 | *** =solution
242 | ```{python}
243 | # Import pandas
244 | import pandas as pd
245 | 
246 | # Melt the Black, Blue, and Brown columns of eyes: eyes_melted
247 | eyes_melted = pd.melt(eyes, id_vars=['Name', 'Wear_Glasses'])
248 | 
249 | # Rename the variable column and save to eyes_renamed
250 | eyes_renamed = eyes_melted.rename(columns = {'variable': 'Eye_Color'}, inplace = False)
251 | 
252 | # print out eyes_renamed
253 | print(eyes_renamed)
254 | ```
255 | 
256 | *** =sct
257 | ```{python}
258 | test_import("pandas")
259 | test_function("pandas.melt")
260 | test_correct(
261 |     lambda: test_object("eyes_melted"),
262 |     lambda: test_function("pandas.melt",
263 |      not_called_msg="Make sure to call the function `pd.melt()`.",
264 |      incorrect_msg="Did you pass the correct arguments to `pd.melt()`?")
265 | )
266 | test_correct(
267 |     lambda: test_object("eyes_renamed"),
268 |     lambda: test_function("eyes_melted.rename",     
269 |      not_called_msg="Make sure to call the function `eyes_melted.rename()`.",
270 |      incorrect_msg="Did you pass the correct arguments to `eyes_melted.rename()`?"
271 |     )
272 | )
273 | test_function("print", incorrect_msg="Don't change any code we provided!")
274 | success_msg("Great job!")
275 | ```
276 | 
277 | --- type:NormalExercise lang:python xp:100 skills:2 key:99639b8387
278 | ## Further Cleaning
279 | 
280 | What did you notice in the last exercise? While the three columns melt into one, the dataset still has some problems. First of all, when we know Elizabeth has brown eyes, it's redundant to record that she doesn't have blue or black eyes. Therefore, what we want to do is to get rid of all rows whose value in the `value` column is 0. It is very easy to do this in pandas using the following command:
281 | 
282 | ```
283 | df1 = df2[df2.column == value]
284 | ```
285 | 
286 | where `column` is the name of the column we are examining and `value` is the value we want to keep. This step will give us one row for each girl that tells us only her correct eye color. Now the `value` column is no longer necessary, so let's delete it:
287 | 
288 | ```
289 | df.drop(lst, axis = 1)
290 | ```
291 | 
292 | Here `lst` is a list of the columns we want to get rid of, and `axis = 1` specifies that we want to drop columns instead of rows.
293 | 
294 | *** =instructions
295 | - Filter the dataset to keep only the rows where `value` is 1. 
296 | - Delete the `value` column.
297 | 
298 | *** =hint
299 | - To filter the dataset, you should have `df.column == 1`.
300 | - To drop the `value` column, you should have the argument `(['value'], axis = 1)`.
301 | 
302 | *** =pre_exercise_code
303 | ```{python}
304 | import pandas as pd
305 | url4 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1273/datasets/eyes.csv'
306 | eyes = pd.read_csv(url4,sep=',')
307 | eyes_renamed = pd.melt(eyes, id_vars = ['Name', 'Wear_Glasses'])
308 | eyes_renamed.rename(columns = {'variable':'Eye_Color'}, inplace = True)
309 | ```
310 | 
311 | *** =sample_code
312 | ```{python}
313 | # Import pandas
314 | import pandas as pd
315 | 
316 | # Filter eyes_renamed and save to eyes_filtered
317 | eyes_filtered = ____
318 | 
319 | # Delete the `value` column and save to eyes_tidy
320 | eyes_tidy = ____
321 | 
322 | # print eyes_tidy
323 | print(eyes_tidy)
324 | ```
325 | 
326 | *** =solution
327 | ```{python}
328 | # Import pandas
329 | import pandas as pd
330 | 
331 | # Filter eyes_renamed and save to eyes_filtered
332 | eyes_filtered = eyes_renamed[eyes_renamed.value == 1]
333 | 
334 | # Delete the `value` column and save to eyes_tidy
335 | eyes_tidy = eyes_filtered.drop(['value'], axis=1)
336 | 
337 | # print eyes_tidy
338 | print(eyes_tidy)
339 | ```
340 | 
341 | *** =sct
342 | ```{python}
343 | test_import("pandas")
344 | test_object("eyes_filtered")
345 | test_correct(
346 |     lambda: test_object("eyes_tidy"),
347 |     lambda: test_function("eyes_filtered.drop",     
348 |      not_called_msg="Make sure to call the function `eyes_filtered.drop()`.",
349 |      incorrect_msg="Did you pass the correct arguments to `eyes_filtered.drop()`?"
350 |     )
351 | )
352 | test_function("print", incorrect_msg="Don't change any code we provided!")
353 | success_msg("Great job!")
354 | ```
355 | 


--------------------------------------------------------------------------------
/course.yml:
--------------------------------------------------------------------------------
 1 | title                : Tidy Data in Python Mini-Course
 2 | author_field         : Vincent Lan
 3 | description          : Most of the world's data are not sorted in a clean and organized fashion; nor are they easy to process. As a data scientist, you need to know what the standards for tidy data are and how to create tidy datasets from messy ones. This mini-course will prepare you for these tasks.
 4 | author_bio           : Vincent Lan is a Statistics student at Harvard University and a course development intern at DataCamp.
 5 | university           : DataCamp
 6 | difficulty_level     : 2
 7 | time_needed          : 0.5 hour
 8 | programming_language : python
 9 | from                 : "python-base-prod:20"
10 | 


--------------------------------------------------------------------------------
/datasets/df1.csv:
--------------------------------------------------------------------------------
1 | owner,dogs,cats,birds
2 | Jason,2,4,3
3 | Lisa,7,10,9
4 | Terrence,8,5,1
5 | 


--------------------------------------------------------------------------------
/datasets/df2.csv:
--------------------------------------------------------------------------------
1 | "Country","Y1980","Y1981","Y1982","Y1983"
2 | "Afghanistan",21.48678,21.46552,21.45145,21.43822
3 | "Albania",25.22533,25.23981,25.25636,25.27176
4 | "Algeria",22.25703,22.34745,22.43647,22.52105
5 | 


--------------------------------------------------------------------------------
/datasets/eyes.csv:
--------------------------------------------------------------------------------
1 | Name,Brown,Blue,Black,Wear_Glasses
2 | Esther,0,1,0,FALSE
3 | Elizabeth,1,0,0,FALSE
4 | Michelle,0,0,1,TRUE
5 | 


--------------------------------------------------------------------------------
/datasets/lunch.csv:
--------------------------------------------------------------------------------
 1 | "year","avg_free","avg_reduced","avg_full","avg_total","total_served","perc_free_red"
 2 | 1969,2.9,0,16.5,19.4,3368.2,15.1
 3 | 1970,4.6,0,17.8,22.4,3565.1,20.7
 4 | 1971,5.8,0.5,17.8,24.1,3848.3,26.1
 5 | 1972,7.3,0.5,16.6,24.4,3972.1,32.4
 6 | 1973,8.1,0.5,16.1,24.7,4008.8,35
 7 | 1974,8.6,0.5,15.5,24.6,3981.6,37.1
 8 | 1975,9.4,0.6,14.9,24.9,4063,40.3
 9 | 1976,10.2,0.8,14.6,25.6,4147.9,43.1
10 | 1977,10.5,1.3,14.5,26.2,4250,44.8
11 | 1978,10.3,1.5,14.9,26.7,4294.1,44.4
12 | 1979,10,1.7,15.3,27,4357.4,43.6
13 | 1980,10,1.9,14.7,26.6,4387,45.1
14 | 1981,10.6,1.9,13.3,25.8,4210.6,48.6
15 | 1982,9.8,1.6,11.5,22.9,3755,50.2
16 | 1983,10.3,1.5,11.2,23,3803.3,51.7
17 | 1984,10.3,1.5,11.5,23.4,3826.2,51
18 | 1985,9.9,1.6,12.1,23.6,3890.1,49.1
19 | 1986,10,1.6,12.2,23.7,3942.5,49.1
20 | 1987,10,1.6,12.4,23.9,3939.9,48.6
21 | 1988,9.8,1.6,12.8,24.2,4032.9,47.4
22 | 1989,9.7,1.6,12.9,24.2,4004.9,47.2
23 | 1990,9.8,1.7,12.6,24.1,4009,48.3
24 | 1991,10.3,1.8,12.2,24.2,4050.7,50.4
25 | 1992,11.2,1.7,11.7,24.6,4101.4,53.1
26 | 1993,11.7,1.7,11.4,24.9,4137.7,54.8
27 | 1994,12.2,1.8,11.3,25.3,4201.6,55.9
28 | 1995,12.4,1.9,11.4,25.7,4253.3,56.4
29 | 1996,12.6,2,11.3,25.9,4313.2,56.9
30 | 1997,12.9,2.1,11.3,26.3,4409,57.6
31 | 1998,13,2.2,11.4,26.6,4425,57.8
32 | 1999,13,2.4,11.6,27,4513.6,57.6
33 | 2000,13,2.5,11.9,27.3,4575,57.1
34 | 2001,12.9,2.6,12,27.5,4585.2,56.8
35 | 2002,13.3,2.6,12,28,4716.6,57.6
36 | 2003,13.7,2.7,11.9,28.4,4762.9,58.5
37 | 2004,14.1,2.8,12,29,4842.4,59.1
38 | 2005,14.6,2.9,12.2,29.6,4976.4,59.4
39 | 2006,14.8,2.9,12.4,30.1,5027.9,59.3
40 | 2007,15,3.1,12.6,30.6,5071.3,59.3
41 | 2008,15.4,3.1,12.5,31,5208.5,60.1
42 | 2009,16.3,3.2,11.9,31.3,5186.1,62.6
43 | 2010,17.6,3,11.1,31.8,5278.4,65.3
44 | 2011,18.4,2.7,10.8,31.8,5274.5,66.6
45 | 2012,18.7,2.7,10.2,31.7,5214.7,68.2
46 | 2013,18.9,2.6,9.2,30.7,5097.6,70.5
47 | 2014,19.2,2.5,8.8,30.5,5020.3,71.6
48 | 


--------------------------------------------------------------------------------
/datasets/messy.csv:
--------------------------------------------------------------------------------
1 | First,Last,Treatment A,Treatment B
2 | John,Smith,,2
3 | Jane,Doe,16.0,11
4 | Mary,Johnson,3.0,1
5 | 


--------------------------------------------------------------------------------
/img/author_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datacamp/community-courses-tidy-data-in-python-mini-course/186f84d57553d2b148abb1fd273d97bc17239e30/img/author_image.png


--------------------------------------------------------------------------------
/img/shield_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datacamp/community-courses-tidy-data-in-python-mini-course/186f84d57553d2b148abb1fd273d97bc17239e30/img/shield_image.png


--------------------------------------------------------------------------------
/requirements.sh:
--------------------------------------------------------------------------------
1 | pip3 install pandas==0.19.1
2 | 


--------------------------------------------------------------------------------