├── .gitignore ├── README.md ├── pandas_exercises.ipynb └── pandas_tricks.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pandas Tricks & Pitfalls 2 | 3 | Clone this repo and run `ipython notebook` on your computer. 4 | 5 | 6 | * **pandas_tricks.ipynb** - this is the main notebook, where I document some useful pandas features with lots of notes 7 | * **pandas_exercises.ipynb** - supplemental exercises. Not everything from the tricks notebook has exercises, I don't wanna inundate ya. 8 | 9 | # Additional Links 10 | * http://pandas.pydata.org/pandas-docs/stable/10min.html The official pandas introduction, I recommend that you read it through if you haven't already! (A second time wouldn't hurt, there's a lot covered) 11 | * http://pandas.pydata.org/pandas-docs/stable/ The pandas docs are generally good, but they're dense and take time to get through. 12 | * http://pbpython.com/excel-pandas-comp.html and http://pbpython.com/excel-pandas-comp-2.html provide a good overview of pandas tasks, especially for people with an Excel background. 13 | * http://www.swegler.com/becky/blog/2014/08/06/useful-pandas-snippets/ useful pandas snippets, includes a few things I didn't discuss here 14 | -------------------------------------------------------------------------------- /pandas_exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pandas practice exercises\n", 8 | "\n", 9 | "### These don't address everything in my pandas_tricks notebook, just the highlights.\n", 10 | "\n", 11 | "### You might want to open another blank notebook to practice the other stuff!" 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "%matplotlib inline\n", 23 | "import pandas as pd\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "\n", 26 | "df = pd.read_csv('mini_movie_data.csv')" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "# rename" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "df_rename = df.copy()\n", 45 | "df_rename.head()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "# rename df_rename columns:\n", 57 | "# bday -> birthday\n", 58 | "# male -> is_male\n", 59 | "# (PS don't rename 'movie' column we need it for later!)\n", 60 | "\n", 61 | "\n" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "# unique" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "# how many unique actors are in the data?\n", 80 | "# list 10 of the unique names.\n", 81 | "\n", 82 | "\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "# isin" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "# select all rows for the movies: 'Agent Cody Banks', 'Snowpiercer', 'Stomp the Yard'" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "# to_numeric" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": false 115 | 
}, 116 | "outputs": [], 117 | "source": [ 118 | "num_df = pd.DataFrame({'a':['1.3','4','22','3.14']})\n", 119 | "\n", 120 | "num_df" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "collapsed": false 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "# check the datatype of num_df's column a \n", 132 | "# (see the 3 approaches to checking datatype in the \"Working with Timestamps\" section)\n", 133 | "\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "# convert it to numeric\n", 145 | "\n" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "# divide the column by 2\n", 157 | "\n" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "# Working with timestamps" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "collapsed": false 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "date_index = pd.date_range(start='01/01/2016', end='02/14/2016', freq='3D')\n", 176 | "date_df = pd.DataFrame.from_dict({'date':date_index, 'blah':range(len(date_index))})\n", 177 | "\n", 178 | "date_df" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": { 185 | "collapsed": false 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "# Select the day from each date in date_df using .dt.day\n", 190 | "\n", 191 | "\n" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "# Select the weekday from each date in date_df using .dt.weekday\n", 203 | "\n", 204 | "\n" 205 | ] 206 | }, 207 | { 
208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": false 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "dayOfWeek={0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}\n", 216 | "\n", 217 | "# Using the dictionary above, convert those weekday numbers to human-readable weekdays.\n", 218 | "# Add this as a new column to date_df called 'weekday'\n", 219 | "\n", 220 | "# Hint: use <...>.map(dayOfWeek)\n", 221 | "\n", 222 | "\n" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "# resample" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "long_date_range = pd.date_range(start='1/1/2010', end='1/1/2016', freq='8D')\n", 241 | "many_dates = pd.DataFrame.from_dict({'date':long_date_range, 'spam':range(len(long_date_range))})\n", 242 | "\n", 243 | "many_dates.head()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": true 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "# set the index of the `many_dates` dataframe to the date column\n", 255 | "# remember, `resample` requires a DateTime index!\n", 256 | "\n", 257 | "\n" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "# sum all the 'spam' values for each year\n", 269 | "\n", 270 | "\n" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "collapsed": true 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "# sum all the 'spam' values for each 6-month period\n", 282 | "\n", 283 | "\n" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "# 
cut" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": { 297 | "collapsed": false 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "# Here's some fake data\n", 302 | "no_movies = 15\n", 303 | "ratings_df = pd.DataFrame.from_dict({\n", 304 | " 'rating_no':pd.np.random.rand(no_movies), \n", 305 | " 'movie':df.movie.sample(no_movies)})\n", 306 | "# fake gross based on fake rating\n", 307 | "ratings_df['gross'] = pd.np.round(ratings_df.rating_no*100000000, decimals=2)\n", 308 | "\n", 309 | "ratings_df.head()" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "collapsed": true 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "# Use cut to bin ratings into 7 categories.\n", 321 | "# use the labels: 'terrible','bad','meh','mediocre','ok','good','great'\n", 322 | "\n" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "collapsed": true 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "# Use cut to bin ratings into 10 categories, numbered 1 to 10.\n", 334 | "# assign those ratings to a new column.\n", 335 | "# group by that new column to find the mean gross of each bin.\n", 336 | "\n", 337 | "\n" 338 | ] 339 | } 340 | ], 341 | "metadata": { 342 | "kernelspec": { 343 | "display_name": "Python 2", 344 | "language": "python", 345 | "name": "python2" 346 | }, 347 | "language_info": { 348 | "codemirror_mode": { 349 | "name": "ipython", 350 | "version": 2 351 | }, 352 | "file_extension": ".py", 353 | "mimetype": "text/x-python", 354 | "name": "python", 355 | "nbconvert_exporter": "python", 356 | "pygments_lexer": "ipython2", 357 | "version": "2.7.11" 358 | } 359 | }, 360 | "nbformat": 4, 361 | "nbformat_minor": 0 362 | } 363 | -------------------------------------------------------------------------------- /pandas_tricks.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pandas tricks & pitfalls" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "%matplotlib inline\n", 19 | "import pandas as pd\n", 20 | "import matplotlib.pyplot as plt" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "df = pd.read_csv('mini_movie_data.csv')\n", 32 | "\n", 33 | "df.head()" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "collapsed": false, 41 | "scrolled": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "df.describe()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "# The `rename` function" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "# rename the 'movie' column to 'title'.\n", 64 | "# you can rename multiple columns by adding more key:value pairs to the dictionary\n", 65 | "df.rename(columns={'movie':'title'}, inplace=True)\n", 66 | "df.head()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "# unique" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "# how many unique studio names are there?\n", 85 | "print len(df.studio.unique())" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": false 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "print df.studio.unique()" 97 | ] 98 
| }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "# unique values will not be sorted, you have to do it yourself\n", 108 | "print sorted(df.studio.unique())" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "# Groupby objects" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": { 122 | "collapsed": false 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "actors = df.groupby('actor')\n", 127 | "# this is a groupby object. do not be scared. it is your friend.\n", 128 | "actors" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "You can see that the groupby object does not immediately reveal any information about itself.\n", 136 | "But it is easy to make it reveal its contents:" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "collapsed": false 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "# select the first row in each group\n", 148 | "# (I keep putting .head() just so the printed dataframe won't fill up your whole screen. 
It's not needed)\n", 149 | "actors.first().head()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "# select the last row of each group\n", 161 | "actors.last().head()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "# take the mean of all rows for each group.\n", 173 | "# columns which you can't take the mean of will automatically be dropped.\n", 174 | "actors.mean().head()" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "# Get a group by name:\n", 186 | "actors.get_group('Gary Oldman').head()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": { 193 | "collapsed": false 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "# calling size() on a groupby object will return the number of rows each group contains.\n", 198 | "# here, how many roles each actor has\n", 199 | "actors.size().head(10)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": { 206 | "collapsed": false 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "# agg() can take a list of functions. 
\n", 211 | "# It makes a new column and applies them to each group in a groupby\n", 212 | "actors['domestic_gross','worldwide_gross'].agg(['mean','count','std','min','max']).head(10)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "# isin" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "collapsed": false 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "# ASIDE: which female actors appear most often in the dataset?\n", 231 | "top_actresses = df[df.male==0].groupby('actor').size().sort_values(ascending=False).head()\n", 232 | "top_actresses" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "collapsed": false 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "# often we want to select all rows where a column contains any value in a list\n", 244 | "# eg, select all rows where df.actor is in our list of actors\n", 245 | "actor_list = ['Susan Sarandon','Julia Roberts']\n", 246 | "# This won't work:\n", 247 | "# df[df.actor in actor_list]" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": false 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "# instead, use pandas.DataFrame.isin:\n", 259 | "df[df.actor.isin(actor_list)].head()" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "# pd.to_numeric()\n", 267 | "\n", 268 | "Converts a series, array, or dataframe to a numeric datatype." 
269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "# example DataFrame of numbers-as-strings\n", 280 | "num_example = pd.DataFrame(data=zip(list('2049204795'),list('6185700963')), columns=['a','b'])\n", 281 | "num_example" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": { 288 | "collapsed": false 289 | }, 290 | "outputs": [], 291 | "source": [ 292 | "# if you add columns a and b, they're just concatenated together because they're strings!\n", 293 | "num_example.a + num_example.b" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "# apply pd.to_numeric across the whole dataframe to convert everything to numeric values\n", 305 | "num_numeric = num_example.apply(pd.to_numeric)\n", 306 | "num_numeric" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "collapsed": false 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "# now adding the columns actually gives you the sum\n", 318 | "num_numeric.a + num_numeric.b" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": { 325 | "collapsed": false 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "# this example illustrates 2 things:\n", 330 | "# 1) grouping based on a conditional statement (is an even number)\n", 331 | "# 2) iterating through groups in a groupby\n", 332 | "for name, group in num_numeric.groupby(num_numeric.a%2==0):\n", 333 | " print name, '\\n', group\n", 334 | " print '* * *'" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "# Working with Timestamps" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 
null, 347 | "metadata": { 348 | "collapsed": false 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "# recall what the actor info dataframe looks like\n", 353 | "df.head()" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "collapsed": false 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "# what is the data type (dtype) of the bday column?\n", 365 | "df.bday.dtype" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": { 372 | "collapsed": false 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "# we can also print an element of the column to look at it\n", 377 | "df.bday[0]" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": { 384 | "collapsed": false 385 | }, 386 | "outputs": [], 387 | "source": [ 388 | "# we can also check the type of the first element\n", 389 | "type(df.bday[0])" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "## pd.to_datetime" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": { 403 | "collapsed": true 404 | }, 405 | "outputs": [], 406 | "source": [ 407 | "# convert the columns of date-time strings to pandas Timestamp objects (similar to to_numeric)\n", 408 | "# we don't use .apply here because we only want to change these 2 specified columns\n", 409 | "for datetime_col in ['bday','release_date']:\n", 410 | " df[datetime_col] = pd.to_datetime(df[datetime_col]) " 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": { 417 | "collapsed": false 418 | }, 419 | "outputs": [], 420 | "source": [ 421 | "df.bday.dtype" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": { 428 | "collapsed": false 429 | }, 430 | "outputs": [], 431 | "source": [ 432 | "type(df.bday[0])" 433 | ] 434 | }, 
435 | { 436 | "cell_type": "markdown", 437 | "metadata": {}, 438 | "source": [ 439 | "## Instant conversion to day/month/year with \n", 440 | "### `pd.Series.dt.`" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": { 447 | "collapsed": false 448 | }, 449 | "outputs": [], 450 | "source": [ 451 | "print 'years', df.bday.dt.year.unique()" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": { 458 | "collapsed": false 459 | }, 460 | "outputs": [], 461 | "source": [ 462 | "# this doesn't work.\n", 463 | "# df[df.bday > 1995]" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": { 470 | "collapsed": false 471 | }, 472 | "outputs": [], 473 | "source": [ 474 | "# instead you could compare to a Timestamp or other datetime object\n", 475 | "df[df.bday > pd.to_datetime('1-1-1995')].head()" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": null, 481 | "metadata": { 482 | "collapsed": false 483 | }, 484 | "outputs": [], 485 | "source": [ 486 | "# or, use the .dt syntax:\n", 487 | "df[df.bday.dt.year > 1995].head()" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": { 494 | "collapsed": false 495 | }, 496 | "outputs": [], 497 | "source": [ 498 | "# Pitfall!\n", 499 | "# when you want to select using multiple conditions, watch out for this pandas pitfall\n", 500 | "# (this doesn't work:)\n", 501 | "# df[2000 > df.bday.dt.year > 1995].head()" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "metadata": { 508 | "collapsed": false 509 | }, 510 | "outputs": [], 511 | "source": [ 512 | "# Pitfall!\n", 513 | "# Instead, use the bitwise and (&) operator. 
However...\n", 514 | "# (this doesn't work either):\n", 515 | "# df[2000 > df.bday.dt.year & df.bday.dt.year > 1995].head()" 516 | ] 517 | }, 518 | { 519 | "cell_type": "markdown", 520 | "metadata": {}, 521 | "source": [ 522 | "### Since the '`&`' operator has really high precedence in order of operations, be sure to enclose each condition in *parentheses*.\n", 523 | "\n", 524 | "Eg: `2000 > df.bday.dt.year & df.bday.dt.year > 1995` is evaluated the same as \n", 525 | "\n", 526 | "`2000 > (df.bday.dt.year & df.bday.dt.year) > 1995`\n" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": { 533 | "collapsed": false 534 | }, 535 | "outputs": [], 536 | "source": [ 537 | "# select birthdays between 1995 and 2000, non-inclusive\n", 538 | "df[(2000 > df.bday.dt.year) & (df.bday.dt.year > 1995)].head()" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": { 545 | "collapsed": false 546 | }, 547 | "outputs": [], 548 | "source": [ 549 | "# example of .dt.month\n", 550 | "# Note: you rarely need to add columns like this!! 
You can use .dt directly for a groupby or for a selection\n", 551 | "df2 = df.copy()\n", 552 | "df2['release_month'] = df2.release_date.dt.month\n", 553 | "df2.head()" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": null, 559 | "metadata": { 560 | "collapsed": false 561 | }, 562 | "outputs": [], 563 | "source": [ 564 | "monthly_mean = df.groupby(df.release_date.dt.month).mean()\n", 565 | "monthly_mean" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": { 572 | "collapsed": false 573 | }, 574 | "outputs": [], 575 | "source": [ 576 | "monthly_mean[['domestic_gross','worldwide_gross']].plot.bar(title='Mean monthly gross')" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": { 583 | "collapsed": false 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "# you don't need to make a new column for a one-off.\n", 588 | "(monthly_mean.domestic_gross / monthly_mean.worldwide_gross).plot.bar(\n", 589 | " title='Mean Domestic/Worldwide Gross Ratio by month')" 590 | ] 591 | }, 592 | { 593 | "cell_type": "markdown", 594 | "metadata": {}, 595 | "source": [ 596 | "## But that's gross, we don't want month numbers on the x axis, but the month names instead\n", 597 | "\n", 598 | "`calendar` library to the rescue" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "metadata": { 605 | "collapsed": false 606 | }, 607 | "outputs": [], 608 | "source": [ 609 | "import calendar\n", 610 | "\n", 611 | "# we have the option of full name of month, or abbreviated name\n", 612 | "print calendar.month_name[1:4]\n", 613 | "print calendar.month_abbr[1:4]" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": null, 619 | "metadata": { 620 | "collapsed": false 621 | }, 622 | "outputs": [], 623 | "source": [ 624 | "# map over the the index of using calendar's month names\n", 625 | "monthly_mean.index = 
monthly_mean.index.map(lambda x: calendar.month_abbr[x])\n", 626 | "monthly_mean" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "metadata": { 633 | "collapsed": false 634 | }, 635 | "outputs": [], 636 | "source": [ 637 | "# now we have month abbreviations as x labels when we plot\n", 638 | "(monthly_mean.domestic_gross / monthly_mean.worldwide_gross).plot.bar(\n", 639 | " title='Mean Domestic/Worldwide Gross Ratio by month')" 640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "metadata": {}, 645 | "source": [ 646 | "# The `resample` method\n", 647 | "\n", 648 | "A convenient way to bin timeseries data\n", 649 | "\n", 650 | "**Warning:** resample only works with a Timestamp-indexed dataframe. You can always set your index to your datetime column of interest `df.set_index('datetime_column')` to make this work" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "metadata": { 657 | "collapsed": false 658 | }, 659 | "outputs": [], 660 | "source": [ 661 | "# let's look at movies of a given actor, by year\n", 662 | "actor_df = df[df.actor=='Samuel L. 
Jackson'].drop('male', axis=1)\n", 663 | "actor_df.sort_values('release_date').head()" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": null, 669 | "metadata": { 670 | "collapsed": false 671 | }, 672 | "outputs": [], 673 | "source": [ 674 | "# visualize what the data looks like now: it's irregular by year\n", 675 | "actor_df.plot('release_date','production_budget')" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": null, 681 | "metadata": { 682 | "collapsed": false 683 | }, 684 | "outputs": [], 685 | "source": [ 686 | "# take the mean of all the numerical columns\n", 687 | "actor_df.set_index('release_date').resample('AS', how='mean').head()" 688 | ] 689 | }, 690 | { 691 | "cell_type": "markdown", 692 | "metadata": {}, 693 | "source": [ 694 | "## note that by default, missing bins get replaced with a NaN row. This can be useful if you want to set a default value to the missing bins." 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "metadata": { 701 | "collapsed": false 702 | }, 703 | "outputs": [], 704 | "source": [ 705 | "# same as above, but fill all NaNs with 0\n", 706 | "actor_df.set_index('release_date').resample('AS', how='mean').fillna(0).head()" 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": null, 712 | "metadata": { 713 | "collapsed": false 714 | }, 715 | "outputs": [], 716 | "source": [ 717 | "# if we want 5-year bins instead, we can plug in a 5 to the resample \"rule\": '5AS'\n", 718 | "actor_df.set_index('release_date').resample('5AS', how='mean')" 719 | ] 720 | }, 721 | { 722 | "cell_type": "markdown", 723 | "metadata": {}, 724 | "source": [ 725 | "## resample resolutions available [(via SO answer)](http://stackoverflow.com/a/17001474):\n", 726 | "\n", 727 | " B business day frequency\n", 728 | " C custom business day frequency (experimental)\n", 729 | " D calendar day frequency\n", 730 | " W weekly frequency\n", 731 | " M 
month end frequency\n", 732 | " BM business month end frequency\n", 733 | " CBM custom business month end frequency\n", 734 | " MS month start frequency\n", 735 | " BMS business month start frequency\n", 736 | " CBMS custom business month start frequency\n", 737 | " Q quarter end frequency\n", 738 | " BQ business quarter end frequency\n", 739 | " QS quarter start frequency\n", 740 | " BQS business quarter start frequency\n", 741 | " A year end frequency\n", 742 | " BA business year end frequency\n", 743 | " AS year start frequency\n", 744 | " BAS business year start frequency\n", 745 | " BH business hour frequency\n", 746 | " H hourly frequency\n", 747 | " T minutely frequency\n", 748 | " S secondly frequency\n", 749 | " L milliseconds\n", 750 | " U microseconds\n", 751 | " N nanoseconds" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": null, 757 | "metadata": { 758 | "collapsed": false 759 | }, 760 | "outputs": [], 761 | "source": [ 762 | "# let's say we want the mean, and also the count.\n", 763 | "# we can pass a list of methods to the `how`\n", 764 | "yr_bins = actor_df.set_index('release_date').resample('5AS', how=['mean','count','sem'])\n", 765 | "yr_bins.head()" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": null, 771 | "metadata": { 772 | "collapsed": false 773 | }, 774 | "outputs": [], 775 | "source": [ 776 | "# or you can get very fancy and pass a dict of dicts\n", 777 | "# the first key references the DataFrame's original column name\n", 778 | "# the second key defines the name of a new column.\n", 779 | "yr_bins = actor_df.set_index('release_date').resample('5AS', how={\n", 780 | " 'production_budget':{'avg':'mean', 'ct':'count', 'stdEm':'sem'},\n", 781 | " 'domestic_gross':{'low':'min', 'high':'max'},\n", 782 | " 'worldwide_gross':{'total':'sum'}})\n", 783 | "yr_bins" 784 | ] 785 | }, 786 | { 787 | "cell_type": "markdown", 788 | "metadata": {}, 789 | "source": [ 790 | "# Special note: try not to 
use method names as column names. It will make indexing more annoying.\n", 791 | "## For example, a column named 'mean' will cause a collision when you call `df.mean`\n", 792 | "## The `mean` method will have precedence.\n", 793 | "\n", 794 | "You'd only be able to access the column like: `df['mean']`" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": null, 800 | "metadata": { 801 | "collapsed": true 802 | }, 803 | "outputs": [], 804 | "source": [ 805 | "# PS: 'sem' is standard error of the mean\n", 806 | "# pd.Series.sem?" 807 | ] 808 | }, 809 | { 810 | "cell_type": "markdown", 811 | "metadata": {}, 812 | "source": [ 813 | "# Multiindexing" 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "execution_count": null, 819 | "metadata": { 820 | "collapsed": false 821 | }, 822 | "outputs": [], 823 | "source": [ 824 | "yr_bins.production_budget" 825 | ] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": null, 830 | "metadata": { 831 | "collapsed": false 832 | }, 833 | "outputs": [], 834 | "source": [ 835 | "# chaining the dot column name syntax is fine\n", 836 | "yr_bins.production_budget.avg" 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": null, 842 | "metadata": { 843 | "collapsed": false 844 | }, 845 | "outputs": [], 846 | "source": [ 847 | "# you can also index both levels of the column index by name, as strings\n", 848 | "yr_bins['production_budget','avg']" 849 | ] 850 | }, 851 | { 852 | "cell_type": "markdown", 853 | "metadata": {}, 854 | "source": [ 855 | "## Flattening a multi-level column index\n", 856 | "\n", 857 | "### Use a list comprehension to rewrite the column names" 858 | ] 859 | }, 860 | { 861 | "cell_type": "code", 862 | "execution_count": null, 863 | "metadata": { 864 | "collapsed": false 865 | }, 866 | "outputs": [], 867 | "source": [ 868 | "print yr_bins.columns.values" 869 | ] 870 | }, 871 | { 872 | "cell_type": "code", 873 | "execution_count": null, 874 | "metadata": 
{ 875 | "collapsed": false 876 | }, 877 | "outputs": [], 878 | "source": [ 879 | "yr_bins_flat = yr_bins.copy()\n", 880 | "# use an underscore as a delimiter. But it's up to you.\n", 881 | "yr_bins_flat.columns = ['_'.join(col) for col in yr_bins.columns.values]\n", 882 | "\n", 883 | "yr_bins_flat" 884 | ] 885 | }, 886 | { 887 | "cell_type": "markdown", 888 | "metadata": {}, 889 | "source": [ 890 | "# `pd.cut()`: bins numeric values -> categorical values" 891 | ] 892 | }, 893 | { 894 | "cell_type": "code", 895 | "execution_count": null, 896 | "metadata": { 897 | "collapsed": false 898 | }, 899 | "outputs": [], 900 | "source": [ 901 | "# make some fake data\n", 902 | "no_movies = 10\n", 903 | "ratings_df = pd.DataFrame.from_dict({\n", 904 | " 'rating_no':pd.np.random.rand(no_movies), \n", 905 | " 'movie':df.title.sample(no_movies)})\n", 906 | "# fake gross based on fake rating\n", 907 | "ratings_df['gross'] = pd.np.round(ratings_df.rating_no*100000000, decimals=2)\n", 908 | "\n", 909 | "# save this unmodified version for later\n", 910 | "ratings_df_orig = ratings_df.copy()\n", 911 | "\n", 912 | "ratings_df" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": null, 918 | "metadata": { 919 | "collapsed": false 920 | }, 921 | "outputs": [], 922 | "source": [ 923 | "# cut numerical ratings into N bins\n", 924 | "\n", 925 | "# here's what the labels default to when you don't define your own labels\n", 926 | "ratings_df['rating_category_ugly'] = pd.cut(ratings_df.rating_no, bins=4)\n", 927 | "\n", 928 | "# you can substitute whatever labels you want\n", 929 | "ratings_df['rating_category'] = pd.cut(ratings_df.rating_no, bins=4, labels=['bad','mediocre','good','excellent'])\n", 930 | "\n", 931 | "ratings_df" 932 | ] 933 | }, 934 | { 935 | "cell_type": "code", 936 | "execution_count": null, 937 | "metadata": { 938 | "collapsed": false 939 | }, 940 | "outputs": [], 941 | "source": [ 942 | "# `pd.cut` gives us an excellent way to groupby based on 
bins.\n", 943 | "# Eg, we can use the new categorical ratings to find the mean gross for each rating bin\n", 944 | "print 'mean gross for each rating bin'\n", 945 | "ratings_df.groupby('rating_category').mean()" 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": null, 951 | "metadata": { 952 | "collapsed": false 953 | }, 954 | "outputs": [], 955 | "source": [ 956 | "# Even if we didn't care about assigning labels like 'bad', 'mediocre', etc to the rating numbers,\n", 957 | "# pd.cut is still very useful if we want to groupby on binned numerical data\n", 958 | "\n", 959 | "# We can do this as a one-liner, using the copy of the original ratings_df before we added those extra columns.\n", 960 | "# Let's do 5 bins to switch it up.\n", 961 | "ratings_df_orig.groupby(pd.cut(ratings_df_orig.rating_no, bins=5)).mean()" 962 | ] 963 | }, 964 | { 965 | "cell_type": "markdown", 966 | "metadata": {}, 967 | "source": [ 968 | "### Just like with `resample`, empty bins have *NaN* values." 
969 | ] 970 | }, 971 | { 972 | "cell_type": "code", 973 | "execution_count": null, 974 | "metadata": { 975 | "collapsed": false 976 | }, 977 | "outputs": [], 978 | "source": [ 979 | "ratings_df_orig.groupby(pd.cut(ratings_df_orig.rating_no, bins=5), as_index=False).mean()" 980 | ] 981 | }, 982 | { 983 | "cell_type": "markdown", 984 | "metadata": {}, 985 | "source": [ 986 | "# Bonus: Taking advantage of seaborn's groupby support" 987 | ] 988 | }, 989 | { 990 | "cell_type": "code", 991 | "execution_count": null, 992 | "metadata": { 993 | "collapsed": false 994 | }, 995 | "outputs": [], 996 | "source": [ 997 | "n_top = 15\n", 998 | "# we only want one row per movie, we don't care about actors\n", 999 | "by_movie_df = df.groupby('title').first()\n", 1000 | "by_movie_df.head()" 1001 | ] 1002 | }, 1003 | { 1004 | "cell_type": "code", 1005 | "execution_count": null, 1006 | "metadata": { 1007 | "collapsed": false 1008 | }, 1009 | "outputs": [], 1010 | "source": [ 1011 | "# select only the top N studios, by total production budget of all movies\n", 1012 | "top_studio_names = by_movie_df.groupby('studio').sum().sort_values(\n", 1013 | " 'production_budget', ascending=False).index[:n_top]\n", 1014 | "\n", 1015 | "top_studio_df = by_movie_df[by_movie_df.studio.isin(top_studio_names)]\n", 1016 | "\n", 1017 | "print top_studio_names\n", 1018 | "top_studio_df.head()" 1019 | ] 1020 | }, 1021 | { 1022 | "cell_type": "code", 1023 | "execution_count": null, 1024 | "metadata": { 1025 | "collapsed": false 1026 | }, 1027 | "outputs": [], 1028 | "source": [ 1029 | "import seaborn as sns\n", 1030 | "\n", 1031 | "# make the size of the figure bigger (width,height)\n", 1032 | "plt.figure(figsize=(14,8))\n", 1033 | "\n", 1034 | "# we pass the studio column to sns.violinplot\n", 1035 | "sns.violinplot(top_studio_df.production_budget, groupby=top_studio_df.studio)\n", 1036 | "plt.title('Production budget distributions for the top 10 studios');" 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": 
"code", 1041 | "execution_count": null, 1042 | "metadata": { 1043 | "collapsed": true 1044 | }, 1045 | "outputs": [], 1046 | "source": [] 1047 | } 1048 | ], 1049 | "metadata": { 1050 | "kernelspec": { 1051 | "display_name": "Python 2", 1052 | "language": "python", 1053 | "name": "python2" 1054 | }, 1055 | "language_info": { 1056 | "codemirror_mode": { 1057 | "name": "ipython", 1058 | "version": 2 1059 | }, 1060 | "file_extension": ".py", 1061 | "mimetype": "text/x-python", 1062 | "name": "python", 1063 | "nbconvert_exporter": "python", 1064 | "pygments_lexer": "ipython2", 1065 | "version": "2.7.11" 1066 | } 1067 | }, 1068 | "nbformat": 4, 1069 | "nbformat_minor": 0 1070 | } 1071 | --------------------------------------------------------------------------------