├── .gitignore ├── Creating Datasets.ipynb ├── README.md ├── Tidy Data in Python.ipynb ├── chapter1.md ├── course.yml ├── datasets ├── df1.csv ├── df2.csv ├── eyes.csv ├── lunch.csv └── messy.csv ├── img ├── author_image.png └── shield_image.png └── requirements.sh /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_STORE 2 | .cache 3 | .ipynb_checkpoints 4 | .spyderproject 5 | -------------------------------------------------------------------------------- /Creating Datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 16, 18 | "metadata": { 19 | "collapsed": false 20 | }, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | " First Last Treatment A Treatment B\n", 27 | "0 John Smith NaN 2\n", 28 | "1 Jane Doe 16.0 11\n", 29 | "2 Mary Johnson 3.0 1\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "messy = pd.DataFrame({'First' : ['John', 'Jane', 'Mary'], \n", 35 | " 'Last' : ['Smith', 'Doe', 'Johnson'], \n", 36 | " 'Treatment A' : [np.nan, 16, 3], \n", 37 | " 'Treatment B' : [2, 11, 1]})\n", 38 | "print(messy)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 8, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "messy.to_csv(path_or_buf='datasets/messy.csv',sep=',',index=False)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 12, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | " year avg_free avg_reduced avg_full\n", 64 | "0 1969 2.9 0.0 16.5\n", 65 | "1 1970 
4.6 0.0 17.8\n", 66 | "2 1971 5.8 0.5 17.8\n", 67 | "3 1972 7.3 0.5 16.6\n", 68 | "4 1973 8.1 0.5 16.1\n", 69 | "5 1974 8.6 0.5 15.5\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "lunch = pd.read_csv('datasets/lunch.csv',sep=',',nrows=6)\n", 75 | "lunch.drop(lunch.columns[[4, 5, 6]], axis=1, inplace=True)\n", 76 | "print(lunch)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 13, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "lunch.to_csv('datasets/df2.csv',sep=',', index=False)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 17, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | " Name Brown Blue Black\n", 102 | "0 Esther 0 1 0\n", 103 | "1 Elizabeth 1 0 0\n", 104 | "2 Michelle 0 0 1\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "eye_color = pd.DataFrame({'Name' : ['Esther', 'Elizabeth', 'Michelle'], \n", 110 | " 'Brown' : [0, 1, 0], \n", 111 | " 'Blue' : [1, 0, 0],\n", 112 | " 'Black':[0, 0, 1]})\n", 113 | "eye_color = eye_color[['Name','Brown','Blue','Black']]\n", 114 | "print(eye_color)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 18, 120 | "metadata": { 121 | "collapsed": true 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "eye_color.to_csv('datasets/eye_color.csv', sep=',', index=False)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [] 136 | } 137 | ], 138 | "metadata": { 139 | "anaconda-cloud": {}, 140 | "kernelspec": { 141 | "display_name": "Python [Root]", 142 | "language": "python", 143 | "name": "Python [Root]" 144 | }, 145 | "language_info": { 146 | "codemirror_mode": { 147 | "name": "ipython", 148 | "version": 3 149 | }, 150 | "file_extension": ".py", 151 | "mimetype": "text/x-python", 152 
| "name": "python", 153 | "nbconvert_exporter": "python", 154 | "pygments_lexer": "ipython3", 155 | "version": "3.5.1" 156 | } 157 | }, 158 | "nbformat": 4, 159 | "nbformat_minor": 0 160 | } 161 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tidy Data in Python: mini-course 2 | 3 | Source files for the Tidy Data in Python mini-course. 4 | 5 | You can take the course at www.datacamp.com/courses/1273 6 | -------------------------------------------------------------------------------- /Tidy Data in Python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "# Tidy Data in Python" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Tidy data and messy data" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 4, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "import pandas as pd" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 5, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | " First Last Treatment A Treatment B\n", 42 | "0 John Smith NaN 2\n", 43 | "1 Jane Doe 16.0 11\n", 44 | "2 Mary Johnson 3.0 1\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "#example showing a violation of rule 2\n", 50 | "messy = pd.read_csv('datasets/messy.csv', sep=',')\n", 51 | "print(messy)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 10, 57 | "metadata": { 58 | "collapsed": false 59 | }, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | " owner type num\n", 66 | "0 Jason dog 2\n", 67 | "1 Jason 
cat 4\n", 68 | "2 Jason bird 3\n", 69 | "3 Lisa dog 7\n", 70 | "4 Lisa cat 10\n", 71 | "5 Lisa bird 9\n", 72 | "6 Terrence dog 8\n", 73 | "7 Terrence cat 5\n", 74 | "8 Terrence bird 1\n", 75 | "\n", 76 | " year avg_free avg_reduced avg_full\n", 77 | "0 1969 2.9 0.0 16.5\n", 78 | "1 1970 4.6 0.0 17.8\n", 79 | "2 1971 5.8 0.5 17.8\n", 80 | "3 1972 7.3 0.5 16.6\n", 81 | "4 1973 8.1 0.5 16.1\n", 82 | "5 1974 8.6 0.5 15.5\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "#two datasets for multiple choice question\n", 88 | "df1 = pd.read_csv('datasets/df1.csv', sep = ',')\n", 89 | "df2 = pd.read_csv('datasets/df2.csv', sep = ',')\n", 90 | "\n", 91 | "print(df1)\n", 92 | "print()\n", 93 | "print(df2)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "Which dataset is messy and why?\n", 101 | "\n", 102 | "A. df1 is messy because ..." 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## Melt" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 14, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | " year variable value\n", 124 | "0 1969 avg_free 2.9\n", 125 | "1 1970 avg_free 4.6\n", 126 | "2 1971 avg_free 5.8\n", 127 | "3 1972 avg_free 7.3\n", 128 | "4 1973 avg_free 8.1\n", 129 | "5 1974 avg_free 8.6\n", 130 | "6 1969 avg_reduced 0.0\n", 131 | "7 1970 avg_reduced 0.0\n", 132 | "8 1971 avg_reduced 0.5\n", 133 | "9 1972 avg_reduced 0.5\n", 134 | "10 1973 avg_reduced 0.5\n", 135 | "11 1974 avg_reduced 0.5\n", 136 | "12 1969 avg_full 16.5\n", 137 | "13 1970 avg_full 17.8\n", 138 | "14 1971 avg_full 17.8\n", 139 | "15 1972 avg_full 16.6\n", 140 | "16 1973 avg_full 16.1\n", 141 | "17 1974 avg_full 15.5\n" 142 | ] 143 | } 144 | ], 145 | "source": [ 146 | "df2_tidy = pd.melt(df2, id_vars=['year'])\n", 147 | "print(df2_tidy)" 148 | ] 149 | }, 150 | { 151 | 
"cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "## Renaming Columns" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 16, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | " year lunch option people\n", 169 | "0 1969 avg_free 2.9\n", 170 | "1 1970 avg_free 4.6\n", 171 | "2 1971 avg_free 5.8\n", 172 | "3 1972 avg_free 7.3\n", 173 | "4 1973 avg_free 8.1\n", 174 | "5 1974 avg_free 8.6\n", 175 | "6 1969 avg_reduced 0.0\n", 176 | "7 1970 avg_reduced 0.0\n", 177 | "8 1971 avg_reduced 0.5\n", 178 | "9 1972 avg_reduced 0.5\n", 179 | "10 1973 avg_reduced 0.5\n", 180 | "11 1974 avg_reduced 0.5\n", 181 | "12 1969 avg_full 16.5\n", 182 | "13 1970 avg_full 17.8\n", 183 | "14 1971 avg_full 17.8\n", 184 | "15 1972 avg_full 16.6\n", 185 | "16 1973 avg_full 16.1\n", 186 | "17 1974 avg_full 15.5\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "df2_tidy.rename(columns = {'variable':'lunch option','value':'people'}, inplace = True)\n", 192 | "print(df2_tidy)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "## More Messiness" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 18, 205 | "metadata": { 206 | "collapsed": false 207 | }, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | " Name Brown Blue Black\n", 214 | "0 Esther 0 1 0\n", 215 | "1 Elizabeth 1 0 0\n", 216 | "2 Michelle 0 0 1\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "eye_color = pd.read_csv('datasets/eye_color.csv',sep=',')\n", 222 | "print(eye_color)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "What rule does this dataset violate?" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "## Deal with it!" 
237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 39, 242 | "metadata": { 243 | "collapsed": false 244 | }, 245 | "outputs": [ 246 | { 247 | "name": "stdout", 248 | "output_type": "stream", 249 | "text": [ 250 | " Name variable value\n", 251 | "0 Esther Brown 0\n", 252 | "1 Elizabeth Brown 1\n", 253 | "2 Michelle Brown 0\n", 254 | "3 Esther Blue 1\n", 255 | "4 Elizabeth Blue 0\n", 256 | "5 Michelle Blue 0\n", 257 | "6 Esther Black 0\n", 258 | "7 Elizabeth Black 0\n", 259 | "8 Michelle Black 1\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "eye_color_tidy = pd.melt(eye_color, id_vars = ['Name'])\n", 265 | "print(eye_color_tidy)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "## Further Cleaning" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 40, 278 | "metadata": { 279 | "collapsed": false, 280 | "scrolled": true 281 | }, 282 | "outputs": [ 283 | { 284 | "name": "stdout", 285 | "output_type": "stream", 286 | "text": [ 287 | " Name variable value\n", 288 | "1 Elizabeth Brown 1\n", 289 | "3 Esther Blue 1\n", 290 | "8 Michelle Black 1\n", 291 | "\n", 292 | " Name variable\n", 293 | "1 Elizabeth Brown\n", 294 | "3 Esther Blue\n", 295 | "8 Michelle Black\n", 296 | "\n", 297 | " Name eye color\n", 298 | "1 Elizabeth Brown\n", 299 | "3 Esther Blue\n", 300 | "8 Michelle Black\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "eye_color_tidy = eye_color_tidy[eye_color_tidy.value == 1]\n", 306 | "print(eye_color_tidy)\n", 307 | "\n", 308 | "del eye_color_tidy['value']\n", 309 | "print()\n", 310 | "print(eye_color_tidy)\n", 311 | "\n", 312 | "eye_color_tidy.rename(columns = {'variable':'eye color'}, inplace = True)\n", 313 | "print()\n", 314 | "print(eye_color_tidy)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": { 321 | "collapsed": true 322 | }, 323 | "outputs": [], 324 | "source": [] 325 | }, 326 | { 
327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": { 330 | "collapsed": true 331 | }, 332 | "outputs": [], 333 | "source": [] 334 | } 335 | ], 336 | "metadata": { 337 | "anaconda-cloud": {}, 338 | "kernelspec": { 339 | "display_name": "Python [Root]", 340 | "language": "python", 341 | "name": "Python [Root]" 342 | }, 343 | "language_info": { 344 | "codemirror_mode": { 345 | "name": "ipython", 346 | "version": 3 347 | }, 348 | "file_extension": ".py", 349 | "mimetype": "text/x-python", 350 | "name": "python", 351 | "nbconvert_exporter": "python", 352 | "pygments_lexer": "ipython3", 353 | "version": "3.5.1" 354 | } 355 | }, 356 | "nbformat": 4, 357 | "nbformat_minor": 0 358 | } 359 | -------------------------------------------------------------------------------- /chapter1.md: -------------------------------------------------------------------------------- 1 | --- 2 | title : Tidy Data in Python 3 | description : It is often said that data scientists spend only 20% of their time analyzing their data, and 80% of their time cleaning it. Indeed, maintaining a tidy, easy-to-use dataset is crucial in our age of big data. In the paper Tidy Data, veteran statistician Hadley Wickham gives definitions of tidy and messy data so that all data scientists can keep their work organized. In this mini-course, you'll learn to transform messy datasets into tidy datasets using the pandas package in Python. Let's get started! 4 | 5 | attachments : 6 | 7 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2 key:7ad68bd87f 8 | ## Tidy Data and Messy Data 9 | 10 | What exactly marks the difference between *tidy* data and *messy* data? It is not only how organized and intuitive the datasets look to our human eyes, but also how easily and efficiently they can be processed by computers. In his seminal paper [Tidy Data](https://www.jstatsoft.org/article/view/v059i10), Hadley Wickham proposed three standards for tidy data: 11 | 12 | 1. 
Each variable forms a column 13 | 2. Each observation forms a row 14 | 3. Each type of observational unit forms a table. 15 | 16 | In this course, we'll focus on the first two rules and show you how we can use the Python package [pandas](http://pandas.pydata.org/) to deal with datasets violating them. To get started, execute `messy` in the IPython shell. This dataset, which appears in Wickham's paper, shows the number of people who choose either of two treatments in a hospital. Observe its structure in comparison with Wickham's rules. This dataset is *messy* because it violates rule #2: it combines Treatment A and Treatment B, two distinct observations, in a single row. 17 | 18 | Now let's look at two more datasets. Execute `df1` and `df2` in your IPython shell to check out two other preloaded datasets, both featured in DataCamp's [*Cleaning Data in R*](https://campus.datacamp.com/courses/cleaning-data-in-r) course. The former shows the type and number of pets owned by three co-workers, and the latter shows the average BMI in three countries over several years. Which one of these datasets is messy, and why? 19 | 20 | *** =instructions 21 | - df1 is messy because it violates rule #1. 22 | - df1 is messy because it violates rule #2. 23 | - df2 is messy because it violates rule #2. 24 | 25 | *** =hint 26 | What are the observations and variables in these two datasets? 27 | 28 | *** =pre_exercise_code 29 | ```{r} 30 | # The pre exercise code runs code to initialize the user's workspace. 
31 | import pandas as pd 32 | url1 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1273/datasets/df1.csv' 33 | url2 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1273/datasets/df2.csv' 34 | url3 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1273/datasets/messy.csv' 35 | 36 | df1 = pd.read_csv(url1, sep = ',') 37 | df2 = pd.read_csv(url2, sep = ',') 38 | messy = pd.read_csv(url3, sep=',') 39 | ``` 40 | 41 | *** =sct 42 | ```{r} 43 | # SCT written with pythonwhat: https://github.com/datacamp/pythonwhat/wiki 44 | 45 | msg_1 = "In `df1`, `dogs`, `cats`, `birds` are each a variable so it does not violate rule #1." 46 | msg_2 = "The three people in `df1` represent three different observations so it does not violate rule #2." 47 | msg_3 = "Exactly! See detailed explanation in next exercise." 48 | test_mc(3, [msg_1, msg_2, msg_3]) 49 | ``` 50 | 51 | --- type:NormalExercise lang:python xp:100 skills:2 key:431ad8bd98 52 | ## Using Melt to Tidy Data 53 | 54 | In `df2`, the years `1980`, `1981`, `1982`, and `1983` mark the years when BMI is observed. Thus, they represent four different observations and should be separated into four rows. A great tool to achieve this is the melt function in the pandas package. Its basic syntax is `pd.melt(df, id_vars = lst)`, where `df` is the name of the dataframe we're dealing with and `lst` is a list of all the columns that we want to keep as columns. All the other columns will be "melted" together in different rows. To get a more concrete idea, try `melt` yourself to *tidy* the dataset `df2`! 55 | 56 | *** =instructions 57 | - Import `pandas` using the alias `pd`. 58 | - Melt `df2`! We want to maintain the `Country` column and melt all the rest. 59 | - Click "Submit Answer" to print out the new melted DataFrame. 60 | 61 | *** =hint 62 | - To import package x with the alias y, use the command `import x as y`. 63 | - `id_vars` should be `['Country']`. 
64 | - You don't need to change the code we provided for you. 65 | 66 | *** =pre_exercise_code 67 | ```{python} 68 | import pandas as pd 69 | url2 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1273/datasets/df2.csv' 70 | df2 = pd.read_csv(url2, sep = ',') 71 | ``` 72 | 73 | *** =sample_code 74 | ```{python} 75 | # Import pandas as pd 76 | 77 | 78 | # Melt df2 into new dataframe: df2_melted 79 | df2_melted = ____ 80 | 81 | # print df2_melted 82 | print(df2_melted) 83 | ``` 84 | 85 | *** =solution 86 | ```{python} 87 | # Import pandas as pd 88 | import pandas as pd 89 | 90 | # Melt df2 into new dataframe: df2_melted 91 | df2_melted = pd.melt(df2, id_vars=['Country']) 92 | 93 | # print df2_melted 94 | print(df2_melted) 95 | ``` 96 | 97 | *** =sct 98 | ```{python} 99 | test_import("pandas") 100 | test_correct( 101 | lambda: test_object("df2_melted"), 102 | lambda: test_function("pandas.melt", 103 | not_called_msg="Make sure to call the function `pd.melt()`.", 104 | incorrect_msg="Did you pass the correct arguments to `pd.melt()`?") 105 | ) 106 | test_function("print", incorrect_msg="Don't change any code we provided!") 107 | success_msg("Great job!") 108 | ``` 109 | 110 | --- type:NormalExercise lang:python xp:100 skills:2 key:3be71779cd 111 | ## Renaming Columns 112 | 113 | See how easy that was? You tidied up your dataset with a single command! Now we just need a bit more fine-tuning. Change the column names with pandas' rename function. Its syntax is `df.rename(columns = d, inplace = False)`, where `d` is a dictionary where the keys are the columns you want to change, and the values are the new names for these columns. The code `inplace = False` means the result would be stored in a new DataFrame instead of the original one. 114 | 115 | *** =instructions 116 | - Rename the `variable` column of `df2_melted` to `Year` and the `value` column to `Income`. 117 | - Click "Submit Answer" to print out the new tidy DataFrame. 
118 | 119 | *** =hint 120 | - Here `d` should be `{'variable':'Year','value':'Income'}`. 121 | - You don't need to change the code we provided for you. 122 | 123 | *** =pre_exercise_code 124 | ```{python} 125 | #import pandas 126 | import pandas as pd 127 | url2 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1273/datasets/df2.csv' 128 | df2 = pd.read_csv(url2, sep = ',') 129 | df2_melted = pd.melt(df2, id_vars = ['Country']) 130 | ``` 131 | 132 | *** =sample_code 133 | ```{python} 134 | # Import pandas 135 | import pandas as pd 136 | 137 | # Rename the columns of df2_melted: df2_tidy 138 | df2_tidy = ____ 139 | 140 | # Print out df2_tidy 141 | print(df2_tidy) 142 | ``` 143 | 144 | *** =solution 145 | ```{python} 146 | # Import pandas 147 | import pandas as pd 148 | 149 | # Rename the columns of df2_melted: df2_tidy 150 | df2_tidy = df2_melted.rename(columns = {'variable': 'Year', 'value': 'Income'}, inplace = False) 151 | 152 | # Print out df2_tidy 153 | print(df2_tidy) 154 | ``` 155 | 156 | *** =sct 157 | ```{python} 158 | test_import("pandas") 159 | test_correct( 160 | lambda: test_object("df2_tidy"), 161 | lambda: test_function("df2_melted.rename", 162 | not_called_msg="Make sure to call the function `df2_melted.rename()`.", 163 | incorrect_msg="Did you pass the correct arguments to `df2_melted.rename()`?" 164 | ) 165 | ) 166 | test_function("print", incorrect_msg="Don't change any code we provided!") 167 | success_msg("Great job!") 168 | ``` 169 | 170 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2 key:d40684ea0d 171 | ## More messiness 172 | 173 | Great job! Now that you're familiar with messy and tidy data, let's take a look at another dataset. Execute `eyes` in your shell to print a dataset that was featured in DataCamp's [Cleaning Data in R course](https://campus.datacamp.com/courses/cleaning-data-in-r). This dataset is about the eye colors of three women and whether or not they wear glasses. 
What problem does this dataset have? 174 | 175 | *** =instructions 176 | - It violates rule #1 of tidy data: there are several columns that represent the same variable. 177 | - It violates rule #1 of tidy data: there are several variables represented in the same column. 178 | - It violates rule #2 of tidy data: there are several rows that represent the same observation. 179 | - It violates rule #2 of tidy data: there are several observations represented in the same row. 180 | 181 | *** =hint 182 | Think about what the chart wants to show and how the columns relate to it! 183 | 184 | *** =pre_exercise_code 185 | ```{r} 186 | # The pre exercise code runs code to initialize the user's workspace. 187 | # You can use it to load packages, initialize datasets and draw a plot in the viewer 188 | 189 | import pandas as pd 190 | url4 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1273/datasets/eyes.csv' 191 | eyes = pd.read_csv(url4,sep=',') 192 | ``` 193 | 194 | *** =sct 195 | ```{r} 196 | # SCT written with pythonwhat: https://github.com/datacamp/pythonwhat/wiki 197 | 198 | msg_2 = "No column represents more than one variable." 199 | msg_3 = "Each person is one observation and is correctly charted as one observation." 200 | msg_success = "Exactly!" 201 | test_mc(1, [msg_success, msg_2, msg_3, msg_3]) 202 | ``` 203 | 204 | --- type:NormalExercise lang:python xp:100 skills:2 key:5d0f6f3efd 205 | ## Deal with it! 206 | 207 | In the previous exercise, the three columns--`Black`, `Blue`, and `Brown`--represent the same variable: eye color. It would make much more sense to merge them into one column. Use `melt` to do it! 208 | 209 | *** =instructions 210 | - Use `melt` to leave `Name` and `Wear_Glasses` intact and combine everything else. 211 | - Rename the `variable` column to `Eye_Color`. 212 | - Hit "Submit Answer" to print out the resulting dataframe. 213 | 214 | *** =hint 215 | - The basic syntax for melt is `pd.melt(df, id_vars = lst)`. 
Here `lst` should be `['Name', 'Wear_Glasses']`. 216 | - The basic syntax for rename is `df.rename(columns = d, inplace = False)`, where `d` is a dictionary mapping old column names to new ones. 217 | - You don't need to change the code we provided for you. 218 | 219 | *** =pre_exercise_code 220 | ```{python} 221 | import pandas as pd 222 | url4 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1273/datasets/eyes.csv' 223 | eyes = pd.read_csv(url4,sep=',') 224 | ``` 225 | 226 | *** =sample_code 227 | ```{python} 228 | # Import pandas 229 | import pandas as pd 230 | 231 | # Melt the Black, Blue, and Brown columns of eyes: eyes_melted 232 | eyes_melted = ____ 233 | 234 | # Rename the variable column and save to eyes_renamed 235 | eyes_renamed = ____ 236 | 237 | # print out eyes_renamed 238 | print(eyes_renamed) 239 | ``` 240 | 241 | *** =solution 242 | ```{python} 243 | # Import pandas 244 | import pandas as pd 245 | 246 | # Melt the Black, Blue, and Brown columns of eyes: eyes_melted 247 | eyes_melted = pd.melt(eyes, id_vars=['Name', 'Wear_Glasses']) 248 | 249 | # Rename the variable column and save to eyes_renamed 250 | eyes_renamed = eyes_melted.rename(columns = {'variable': 'Eye_Color'}, inplace = False) 251 | 252 | # print out eyes_renamed 253 | print(eyes_renamed) 254 | ``` 255 | 256 | *** =sct 257 | ```{python} 258 | test_import("pandas") 259 | test_function("pandas.melt") 260 | test_correct( 261 | lambda: test_object("eyes_melted"), 262 | lambda: test_function("pandas.melt", 263 | not_called_msg="Make sure to call the function `pd.melt()`.", 264 | incorrect_msg="Did you pass the correct arguments to `pd.melt()`?") 265 | ) 266 | test_correct( 267 | lambda: test_object("eyes_renamed"), 268 | lambda: test_function("eyes_melted.rename", 269 | not_called_msg="Make sure to call the function `eyes_melted.rename()`.", 270 | incorrect_msg="Did you pass the correct arguments to `eyes_melted.rename()`?" 
271 | ) 272 | ) 273 | test_function("print", incorrect_msg="Don't change any code we provided!") 274 | success_msg("Great job!") 275 | ``` 276 | 277 | --- type:NormalExercise lang:python xp:100 skills:2 key:99639b8387 278 | ## Further Cleaning 279 | 280 | What did you notice in the last exercise? While the three columns melt into one, the dataset still has some problems. First of all, when we know Elizabeth has brown eyes, it's redundant to record that she doesn't have blue or black eyes. Therefore, what we want to do is to get rid of all rows whose value in the `value` column is 0. It is very easy to do this in pandas using the following command: 281 | 282 | ``` 283 | df1 = df2[df2.column == value] 284 | ``` 285 | 286 | where `column` is the name of the column we are examining and `value` is the value we want to keep. This step will give us one row for each girl that tells us only her correct eye color. Now the `value` column is no longer necessary, so let's delete it: 287 | 288 | ``` 289 | df.drop(lst, axis = 1) 290 | ``` 291 | 292 | Here `lst` is a list of the columns we want to get rid of, and `axis = 1` specifies that we want to drop columns instead of rows. 293 | 294 | *** =instructions 295 | - Filter the dataset to keep only the rows where `value` is 1. 296 | - Delete the `value` column. 297 | 298 | *** =hint 299 | - To filter the dataset, you should have `df.column == 1`. 300 | - To drop the `value` column, you should have the argument `(['value'], axis = 1)`. 
301 | 302 | *** =pre_exercise_code 303 | ```{python} 304 | import pandas as pd 305 | url4 = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1273/datasets/eyes.csv' 306 | eyes = pd.read_csv(url4,sep=',') 307 | eyes_renamed = pd.melt(eyes, id_vars = ['Name', 'Wear_Glasses']) 308 | eyes_renamed.rename(columns = {'variable':'Eye_Color'}, inplace = True) 309 | ``` 310 | 311 | *** =sample_code 312 | ```{python} 313 | # Import pandas 314 | import pandas as pd 315 | 316 | # Filter eyes_renamed and save to eyes_filtered 317 | eyes_filtered = ____ 318 | 319 | # Delete the `value` column and save to eyes_tidy 320 | eyes_tidy = ____ 321 | 322 | # print eyes_tidy 323 | print(eyes_tidy) 324 | ``` 325 | 326 | *** =solution 327 | ```{python} 328 | # Import pandas 329 | import pandas as pd 330 | 331 | # Filter eyes_renamed and save to eyes_filtered 332 | eyes_filtered = eyes_renamed[eyes_renamed.value == 1] 333 | 334 | # Delete the `value` column and save to eyes_tidy 335 | eyes_tidy = eyes_filtered.drop(['value'], axis=1) 336 | 337 | # print eyes_tidy 338 | print(eyes_tidy) 339 | ``` 340 | 341 | *** =sct 342 | ```{python} 343 | test_import("pandas") 344 | test_object("eyes_filtered") 345 | test_correct( 346 | lambda: test_object("eyes_tidy"), 347 | lambda: test_function("eyes_filtered.drop", 348 | not_called_msg="Make sure to call the function `eyes_filtered.drop()`.", 349 | incorrect_msg="Did you pass the correct arguments to `eyes_filtered.drop()`?" 
350 | ) 351 | ) 352 | test_function("print", incorrect_msg="Don't change any code we provided!") 353 | success_msg("Great job!") 354 | ``` 355 | -------------------------------------------------------------------------------- /course.yml: -------------------------------------------------------------------------------- 1 | title : Tidy Data in Python Mini-Course 2 | author_field : Vincent Lan 3 | description : Most of the world's data are not sorted in a clean and organized fashion; nor are they easy to process. As a data scientist, you need to know what the standards for tidy data are and how to create tidy datasets from messy ones. This mini-course will prepare you for these tasks. 4 | author_bio : Vincent Lan is a Statistics student at Harvard University and a course development intern at DataCamp. 5 | university : DataCamp 6 | difficulty_level : 2 7 | time_needed : 0.5 hour 8 | programming_language : python 9 | from : "python-base-prod:20" 10 | -------------------------------------------------------------------------------- /datasets/df1.csv: -------------------------------------------------------------------------------- 1 | owner,dogs,cats,birds 2 | Jason,2,4,3 3 | Lisa,7,10,9 4 | Terrence,8,5,1 5 | -------------------------------------------------------------------------------- /datasets/df2.csv: -------------------------------------------------------------------------------- 1 | "Country","Y1980","Y1981","Y1982","Y1983" 2 | "Afghanistan",21.48678,21.46552,21.45145,21.43822 3 | "Albania",25.22533,25.23981,25.25636,25.27176 4 | "Algeria",22.25703,22.34745,22.43647,22.52105 5 | -------------------------------------------------------------------------------- /datasets/eyes.csv: -------------------------------------------------------------------------------- 1 | Name,Brown,Blue,Black,Wear_Glasses 2 | Esther,0,1,0,FALSE 3 | Elizabeth,1,0,0,FALSE 4 | Michelle,0,0,1,TRUE 5 | -------------------------------------------------------------------------------- 
/datasets/lunch.csv: -------------------------------------------------------------------------------- 1 | "year","avg_free","avg_reduced","avg_full","avg_total","total_served","perc_free_red" 2 | 1969,2.9,0,16.5,19.4,3368.2,15.1 3 | 1970,4.6,0,17.8,22.4,3565.1,20.7 4 | 1971,5.8,0.5,17.8,24.1,3848.3,26.1 5 | 1972,7.3,0.5,16.6,24.4,3972.1,32.4 6 | 1973,8.1,0.5,16.1,24.7,4008.8,35 7 | 1974,8.6,0.5,15.5,24.6,3981.6,37.1 8 | 1975,9.4,0.6,14.9,24.9,4063,40.3 9 | 1976,10.2,0.8,14.6,25.6,4147.9,43.1 10 | 1977,10.5,1.3,14.5,26.2,4250,44.8 11 | 1978,10.3,1.5,14.9,26.7,4294.1,44.4 12 | 1979,10,1.7,15.3,27,4357.4,43.6 13 | 1980,10,1.9,14.7,26.6,4387,45.1 14 | 1981,10.6,1.9,13.3,25.8,4210.6,48.6 15 | 1982,9.8,1.6,11.5,22.9,3755,50.2 16 | 1983,10.3,1.5,11.2,23,3803.3,51.7 17 | 1984,10.3,1.5,11.5,23.4,3826.2,51 18 | 1985,9.9,1.6,12.1,23.6,3890.1,49.1 19 | 1986,10,1.6,12.2,23.7,3942.5,49.1 20 | 1987,10,1.6,12.4,23.9,3939.9,48.6 21 | 1988,9.8,1.6,12.8,24.2,4032.9,47.4 22 | 1989,9.7,1.6,12.9,24.2,4004.9,47.2 23 | 1990,9.8,1.7,12.6,24.1,4009,48.3 24 | 1991,10.3,1.8,12.2,24.2,4050.7,50.4 25 | 1992,11.2,1.7,11.7,24.6,4101.4,53.1 26 | 1993,11.7,1.7,11.4,24.9,4137.7,54.8 27 | 1994,12.2,1.8,11.3,25.3,4201.6,55.9 28 | 1995,12.4,1.9,11.4,25.7,4253.3,56.4 29 | 1996,12.6,2,11.3,25.9,4313.2,56.9 30 | 1997,12.9,2.1,11.3,26.3,4409,57.6 31 | 1998,13,2.2,11.4,26.6,4425,57.8 32 | 1999,13,2.4,11.6,27,4513.6,57.6 33 | 2000,13,2.5,11.9,27.3,4575,57.1 34 | 2001,12.9,2.6,12,27.5,4585.2,56.8 35 | 2002,13.3,2.6,12,28,4716.6,57.6 36 | 2003,13.7,2.7,11.9,28.4,4762.9,58.5 37 | 2004,14.1,2.8,12,29,4842.4,59.1 38 | 2005,14.6,2.9,12.2,29.6,4976.4,59.4 39 | 2006,14.8,2.9,12.4,30.1,5027.9,59.3 40 | 2007,15,3.1,12.6,30.6,5071.3,59.3 41 | 2008,15.4,3.1,12.5,31,5208.5,60.1 42 | 2009,16.3,3.2,11.9,31.3,5186.1,62.6 43 | 2010,17.6,3,11.1,31.8,5278.4,65.3 44 | 2011,18.4,2.7,10.8,31.8,5274.5,66.6 45 | 2012,18.7,2.7,10.2,31.7,5214.7,68.2 46 | 2013,18.9,2.6,9.2,30.7,5097.6,70.5 47 | 2014,19.2,2.5,8.8,30.5,5020.3,71.6 48 | 
-------------------------------------------------------------------------------- /datasets/messy.csv: -------------------------------------------------------------------------------- 1 | First,Last,Treatment A,Treatment B 2 | John,Smith,,2 3 | Jane,Doe,16.0,11 4 | Mary,Johnson,3.0,1 5 | -------------------------------------------------------------------------------- /img/author_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datacamp/community-courses-tidy-data-in-python-mini-course/186f84d57553d2b148abb1fd273d97bc17239e30/img/author_image.png -------------------------------------------------------------------------------- /img/shield_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datacamp/community-courses-tidy-data-in-python-mini-course/186f84d57553d2b148abb1fd273d97bc17239e30/img/shield_image.png -------------------------------------------------------------------------------- /requirements.sh: -------------------------------------------------------------------------------- 1 | pip3 install pandas==0.19.1 2 | --------------------------------------------------------------------------------