├── .gitignore ├── README.md ├── pandas_exercises.ipynb └── pandas_tricks.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pandas Tricks & Pitfalls 2 | 3 | Clone this repo and run `ipython notebook` on your computer. 4 | 5 | 6 | * **pandas_tricks.ipynb** - this is the main notebook, where I document some useful pandas features with lots of notes 7 | * **pandas_exercises.ipynb** - supplemental exercises. Not everything from the tricks notebook has exercises, I don't wanna inundate ya. 8 | 9 | # Additional Links 10 | * http://pandas.pydata.org/pandas-docs/stable/10min.html The official pandas introduction, I recommend that you read it through if you haven't already! (A second time wouldn't hurt, there's a lot covered) 11 | * http://pandas.pydata.org/pandas-docs/stable/ The pandas docs are generally good, but they're dense and take time to get through. 12 | * http://pbpython.com/excel-pandas-comp.html and http://pbpython.com/excel-pandas-comp-2.html provide a good overview of pandas tasks, especially for people with an Excel background. 13 | * http://www.swegler.com/becky/blog/2014/08/06/useful-pandas-snippets/ useful pandas snippets, includes a few things I didn't discuss here 14 | -------------------------------------------------------------------------------- /pandas_exercises.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pandas practice exercises\n", 8 | "\n", 9 | "### These don't address everything in my pandas_tricks notebook, just the highlights.\n", 10 | "\n", 11 | "### You might want to open another blank notebook to practice the other stuff!" 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "%matplotlib inline\n", 23 | "import pandas as pd\n", 24 | "import matplotlib.pyplot as plt\n", 25 | "\n", 26 | "df = pd.read_csv('mini_movie_data.csv')" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "# rename" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "df_rename = df.copy()\n", 45 | "df_rename.head()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "# rename df_rename columns:\n", 57 | "# bday -> birthday\n", 58 | "# male -> is_male\n", 59 | "# (PS don't rename 'movie' column we need it for later!)\n", 60 | "\n", 61 | "\n" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "# unique" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "# how many unique actors are in the data?\n", 80 | "# list 10 of the unique names.\n", 81 | "\n", 82 | "\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "# isin" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "# select all rows for the movies: 'Agent Cody Banks', 'Snowpiercer', 'Stomp the Yard'" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "# to_numeric" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": false 115 | 
}, 116 | "outputs": [], 117 | "source": [ 118 | "num_df = pd.DataFrame({'a':['1.3','4','22','3.14']})\n", 119 | "\n", 120 | "num_df" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": { 127 | "collapsed": false 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "# check the datatype of num_df's column a \n", 132 | "# (see the 3 approaches to checking datatype in the \"Working with Timestamps\" section)\n", 133 | "\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "# convert it to numeric\n", 145 | "\n" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "# divide the column by 2\n", 157 | "\n" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "# Working with timestamps" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "collapsed": false 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "date_index = pd.date_range(start='01/01/2016', end='02/14/2016', freq='3D')\n", 176 | "date_df = pd.DataFrame.from_dict({'date':date_index, 'blah':range(len(date_index))})\n", 177 | "\n", 178 | "date_df" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": { 185 | "collapsed": false 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "# Select the day from each date in date_df using .dt.day\n", 190 | "\n", 191 | "\n" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "# Select the weekday from each date in date_df using .dt.weekday\n", 203 | "\n", 204 | "\n" 205 | ] 206 | }, 207 | { 
208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": false 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "dayOfWeek={0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}\n", 216 | "\n", 217 | "# Using the dictionary above, convert those weekday numbers to human-readable weekdays.\n", 218 | "# Add this as a new column to date_df called 'weekday'\n", 219 | "\n", 220 | "# Hint: use <...>.map(dayOfWeek)\n", 221 | "\n", 222 | "\n" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "# resample" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "long_date_range = pd.date_range(start='1/1/2010', end='1/1/2016', freq='8D')\n", 241 | "many_dates = pd.DataFrame.from_dict({'date':long_date_range, 'spam':range(len(long_date_range))})\n", 242 | "\n", 243 | "many_dates.head()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": true 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "# set the index of the `many_dates` dataframe to the date column\n", 255 | "# remember, `resample` requires a DateTime index!\n", 256 | "\n", 257 | "\n" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "# sum all the 'spam' values for each year\n", 269 | "\n", 270 | "\n" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "collapsed": true 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "# sum all the 'spam' values for each 6-month period\n", 282 | "\n", 283 | "\n" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "# 
cut" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": { 297 | "collapsed": false 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "# Here's some fake data\n", 302 | "no_movies = 15\n", 303 | "ratings_df = pd.DataFrame.from_dict({\n", 304 | " 'rating_no':pd.np.random.rand(no_movies), \n", 305 | " 'movie':df.movie.sample(no_movies)})\n", 306 | "# fake gross based on fake rating\n", 307 | "ratings_df['gross'] = pd.np.round(ratings_df.rating_no*100000000, decimals=2)\n", 308 | "\n", 309 | "ratings_df.head()" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "collapsed": true 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "# Use cut to bin ratings into 7 categories.\n", 321 | "# use the labels: 'terrible','bad','meh','mediocre','ok','good','great'\n", 322 | "\n" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "collapsed": true 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "# Use cut to bin ratings into 10 categories, numbered 1 to 10.\n", 334 | "# assign those ratings to a new column.\n", 335 | "# group by that new column to find the mean gross of each bin.\n", 336 | "\n", 337 | "\n" 338 | ] 339 | } 340 | ], 341 | "metadata": { 342 | "kernelspec": { 343 | "display_name": "Python 2", 344 | "language": "python", 345 | "name": "python2" 346 | }, 347 | "language_info": { 348 | "codemirror_mode": { 349 | "name": "ipython", 350 | "version": 2 351 | }, 352 | "file_extension": ".py", 353 | "mimetype": "text/x-python", 354 | "name": "python", 355 | "nbconvert_exporter": "python", 356 | "pygments_lexer": "ipython2", 357 | "version": "2.7.11" 358 | } 359 | }, 360 | "nbformat": 4, 361 | "nbformat_minor": 0 362 | } 363 | -------------------------------------------------------------------------------- /pandas_tricks.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pandas tricks & pitfalls" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "%matplotlib inline\n", 19 | "import pandas as pd\n", 20 | "import matplotlib.pyplot as plt" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": false 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "df = pd.read_csv('mini_movie_data.csv')\n", 32 | "\n", 33 | "df.head()" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": { 40 | "collapsed": false, 41 | "scrolled": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "df.describe()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "# The `rename` function" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "# rename the 'movie' column to 'title'.\n", 64 | "# you can rename multiple columns by adding more key:value pairs to the dictionary\n", 65 | "df.rename(columns={'movie':'title'}, inplace=True)\n", 66 | "df.head()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "# unique" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "# how many unique studio names are there?\n", 85 | "print len(df.studio.unique())" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": false 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "print df.studio.unique()" 97 | ] 98 
| }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "# unique values will not be sorted, you have to do it yourself\n", 108 | "print sorted(df.studio.unique())" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "# Groupby objects" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": { 122 | "collapsed": false 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "actors = df.groupby('actor')\n", 127 | "# this is a groupby object. do not be scared. it is your friend.\n", 128 | "actors" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "You can see that the groupby object does not immediately reveal any information about itself.\n", 136 | "But it is easy to make it reveal its contents:" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "collapsed": false 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "# select the first row in each group\n", 148 | "# (I keep putting .head() just so the printed dataframe won't fill up your whole screen. 
It's not needed)\n", 149 | "actors.first().head()" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "# select the last row of each group\n", 161 | "actors.last().head()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "# take the mean of all rows for each group.\n", 173 | "# columns which you can't take the mean of will automatically be dropped.\n", 174 | "actors.mean().head()" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "# Get a group by name:\n", 186 | "actors.get_group('Gary Oldman').head()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": { 193 | "collapsed": false 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "# calling size() on a groupby object will return the number of rows each group contains.\n", 198 | "# here, how many roles each actor has\n", 199 | "actors.size().head(10)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": { 206 | "collapsed": false 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "# agg() can take a list of functions. 
\n", 211 | "# It makes a new column and applies them to each group in a groupby\n", 212 | "actors['domestic_gross','worldwide_gross'].agg(['mean','count','std','min','max']).head(10)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "# isin" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "collapsed": false 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "# ASIDE: which female actors appear most often in the dataset?\n", 231 | "top_actresses = df[df.male==0].groupby('actor').size().sort_values(ascending=False).head()\n", 232 | "top_actresses" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "collapsed": false 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "# often we want to select all rows where a column contains any value in a list\n", 244 | "# eg, select all rows where df.actor is in our list of actors\n", 245 | "actor_list = ['Susan Sarandon','Julia Roberts']\n", 246 | "# This won't work:\n", 247 | "# df[df.actor in actor_list]" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": false 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "# instead, use pandas.DataFrame.isin:\n", 259 | "df[df.actor.isin(actor_list)].head()" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "# pd.to_numeric()\n", 267 | "\n", 268 | "Converts a series, array, or dataframe to a numeric datatype." 
269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "# example DataFrame of numbers-as-strings\n", 280 | "num_example = pd.DataFrame(data=zip(list('2049204795'),list('6185700963')), columns=['a','b'])\n", 281 | "num_example" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": { 288 | "collapsed": false 289 | }, 290 | "outputs": [], 291 | "source": [ 292 | "# if you add columns a and b, they're just concatenated together because they're strings!\n", 293 | "num_example.a + num_example.b" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "# apply pd.to_numeric across the whole dataframe to convert everything to numeric values\n", 305 | "num_numeric = num_example.apply(pd.to_numeric)\n", 306 | "num_numeric" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "collapsed": false 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "# now adding the columns actually gives you the sum\n", 318 | "num_numeric.a + num_numeric.b" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "metadata": { 325 | "collapsed": false 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "# this example illustrates 2 things:\n", 330 | "# 1) grouping based on a conditional statement (is an even number)\n", 331 | "# 2) iterating through groups in a groupby\n", 332 | "for name, group in num_numeric.groupby(num_numeric.a%2==0):\n", 333 | " print name, '\\n', group\n", 334 | " print '* * *'" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "# Working with Timestamps" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 
null, 347 | "metadata": { 348 | "collapsed": false 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "# recall what the actor info dataframe looks like\n", 353 | "df.head()" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "collapsed": false 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "# what is the data type (dtype) of the bday column?\n", 365 | "df.bday.dtype" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": { 372 | "collapsed": false 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "# we can also print an element of the column to look at it\n", 377 | "df.bday[0]" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": { 384 | "collapsed": false 385 | }, 386 | "outputs": [], 387 | "source": [ 388 | "# we can also check the type of the first element\n", 389 | "type(df.bday[0])" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "## pd.to_datetime" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": { 403 | "collapsed": true 404 | }, 405 | "outputs": [], 406 | "source": [ 407 | "# convert the columns of date-time strings to pandas Timestamp objects (similar to to_numeric)\n", 408 | "# we don't use .apply here because we only want to change these 2 specified columns\n", 409 | "for datetime_col in ['bday','release_date']:\n", 410 | " df[datetime_col] = pd.to_datetime(df[datetime_col]) " 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": { 417 | "collapsed": false 418 | }, 419 | "outputs": [], 420 | "source": [ 421 | "df.bday.dtype" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": { 428 | "collapsed": false 429 | }, 430 | "outputs": [], 431 | "source": [ 432 | "type(df.bday[0])" 433 | ] 434 | }, 
435 | { 436 | "cell_type": "markdown", 437 | "metadata": {}, 438 | "source": [ 439 | "## Instant conversion to day/month/year with \n", 440 | "### `pd.Series.dt.`" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": { 447 | "collapsed": false 448 | }, 449 | "outputs": [], 450 | "source": [ 451 | "print 'years', df.bday.dt.year.unique()" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "metadata": { 458 | "collapsed": false 459 | }, 460 | "outputs": [], 461 | "source": [ 462 | "# this doesn't work.\n", 463 | "# df[df.bday > 1995]" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": { 470 | "collapsed": false 471 | }, 472 | "outputs": [], 473 | "source": [ 474 | "# instead you could compare to a Timestamp or other datetime object\n", 475 | "df[df.bday > pd.to_datetime('1-1-1995')].head()" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": null, 481 | "metadata": { 482 | "collapsed": false 483 | }, 484 | "outputs": [], 485 | "source": [ 486 | "# or, use the .dt syntax:\n", 487 | "df[df.bday.dt.year > 1995].head()" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": { 494 | "collapsed": false 495 | }, 496 | "outputs": [], 497 | "source": [ 498 | "# Pitfall!\n", 499 | "# when you want to select using multiple conditions, watch out for this pandas pitfall\n", 500 | "# (this doesn't work:)\n", 501 | "# df[2000 > df.bday.dt.year > 1995].head()" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "metadata": { 508 | "collapsed": false 509 | }, 510 | "outputs": [], 511 | "source": [ 512 | "# Pitfall!\n", 513 | "# Instead, use the bitwise and (&) operator. 
However...\n", 514 | "# (this doesn't work either):\n", 515 | "# df[2000 > df.bday.dt.year & df.bday.dt.year > 1995].head()" 516 | ] 517 | }, 518 | { 519 | "cell_type": "markdown", 520 | "metadata": {}, 521 | "source": [ 522 | "### Since the '`&`' operator has really high precedence in order of operations, be sure to enclose each condition in *parentheses*.\n", 523 | "\n", 524 | "Eg: `2000 > df.bday.dt.year & df.bday.dt.year > 1995` is evaluated the same as \n", 525 | "\n", 526 | "`2000 > (df.bday.dt.year & df.bday.dt.year) > 1995`\n" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": { 533 | "collapsed": false 534 | }, 535 | "outputs": [], 536 | "source": [ 537 | "# select birthdays between 1995 and 2000, non-inclusive\n", 538 | "df[(2000 > df.bday.dt.year) & (df.bday.dt.year > 1995)].head()" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": null, 544 | "metadata": { 545 | "collapsed": false 546 | }, 547 | "outputs": [], 548 | "source": [ 549 | "# example of .dt.month\n", 550 | "# Note: you rarely need to add columns like this!! 
You can use .dt directly for a groupby or for a selection\n", 551 | "df2 = df.copy()\n", 552 | "df2['release_month'] = df2.release_date.dt.month\n", 553 | "df2.head()" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": null, 559 | "metadata": { 560 | "collapsed": false 561 | }, 562 | "outputs": [], 563 | "source": [ 564 | "monthly_mean = df.groupby(df.release_date.dt.month).mean()\n", 565 | "monthly_mean" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": { 572 | "collapsed": false 573 | }, 574 | "outputs": [], 575 | "source": [ 576 | "monthly_mean[['domestic_gross','worldwide_gross']].plot.bar(title='Mean monthly gross')" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": { 583 | "collapsed": false 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "# you don't need to make a new column for a one-off.\n", 588 | "(monthly_mean.domestic_gross / monthly_mean.worldwide_gross).plot.bar(\n", 589 | " title='Mean Domestic/Worldwide Gross Ratio by month')" 590 | ] 591 | }, 592 | { 593 | "cell_type": "markdown", 594 | "metadata": {}, 595 | "source": [ 596 | "## But that's gross, we don't want month numbers on the x axis, but the month names instead\n", 597 | "\n", 598 | "`calendar` library to the rescue" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "metadata": { 605 | "collapsed": false 606 | }, 607 | "outputs": [], 608 | "source": [ 609 | "import calendar\n", 610 | "\n", 611 | "# we have the option of full name of month, or abbreviated name\n", 612 | "print calendar.month_name[1:4]\n", 613 | "print calendar.month_abbr[1:4]" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": null, 619 | "metadata": { 620 | "collapsed": false 621 | }, 622 | "outputs": [], 623 | "source": [ 624 | "# map over the the index of using calendar's month names\n", 625 | "monthly_mean.index = 
monthly_mean.index.map(lambda x: calendar.month_abbr[x])\n", 626 | "monthly_mean" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "metadata": { 633 | "collapsed": false 634 | }, 635 | "outputs": [], 636 | "source": [ 637 | "# now we have month abbreviations as x labels when we plot\n", 638 | "(monthly_mean.domestic_gross / monthly_mean.worldwide_gross).plot.bar(\n", 639 | " title='Mean Domestic/Worldwide Gross Ratio by month')" 640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "metadata": {}, 645 | "source": [ 646 | "# The `resample` method\n", 647 | "\n", 648 | "A convenient way to bin timeseries data\n", 649 | "\n", 650 | "**Warning:** resample only works with a Timestamp-indexed dataframe. You can always set your index to your datetime column of interest `df.set_index('datetime_column')` to make this work" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "metadata": { 657 | "collapsed": false 658 | }, 659 | "outputs": [], 660 | "source": [ 661 | "# let's look at movies of a given actor, by year\n", 662 | "actor_df = df[df.actor=='Samuel L. 
Jackson'].drop('male', axis=1)\n", 663 | "actor_df.sort_values('release_date').head()" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": null, 669 | "metadata": { 670 | "collapsed": false 671 | }, 672 | "outputs": [], 673 | "source": [ 674 | "# visualize what the data looks like now: it's irregular by year\n", 675 | "actor_df.plot('release_date','production_budget')" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": null, 681 | "metadata": { 682 | "collapsed": false 683 | }, 684 | "outputs": [], 685 | "source": [ 686 | "# take the mean of all the numerical columns\n", 687 | "actor_df.set_index('release_date').resample('AS', how='mean').head()" 688 | ] 689 | }, 690 | { 691 | "cell_type": "markdown", 692 | "metadata": {}, 693 | "source": [ 694 | "## note that by default, missing bins get replaced with a NaN row. This can be useful if you want to set a default value to the missing bins." 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "metadata": { 701 | "collapsed": false 702 | }, 703 | "outputs": [], 704 | "source": [ 705 | "# same as above, but fill all NaNs with 0\n", 706 | "actor_df.set_index('release_date').resample('AS', how='mean').fillna(0).head()" 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": null, 712 | "metadata": { 713 | "collapsed": false 714 | }, 715 | "outputs": [], 716 | "source": [ 717 | "# if we want 5-year bins instead, we can plug in a 5 to the resample \"rule\": '5AS'\n", 718 | "actor_df.set_index('release_date').resample('5AS', how='mean')" 719 | ] 720 | }, 721 | { 722 | "cell_type": "markdown", 723 | "metadata": {}, 724 | "source": [ 725 | "## resample resolutions available [(via SO answer)](http://stackoverflow.com/a/17001474):\n", 726 | "\n", 727 | " B business day frequency\n", 728 | " C custom business day frequency (experimental)\n", 729 | " D calendar day frequency\n", 730 | " W weekly frequency\n", 731 | " M 
month end frequency\n", 732 | " BM business month end frequency\n", 733 | " CBM custom business month end frequency\n", 734 | " MS month start frequency\n", 735 | " BMS business month start frequency\n", 736 | " CBMS custom business month start frequency\n", 737 | " Q quarter end frequency\n", 738 | " BQ business quarter end frequency\n", 739 | " QS quarter start frequency\n", 740 | " BQS business quarter start frequency\n", 741 | " A year end frequency\n", 742 | " BA business year end frequency\n", 743 | " AS year start frequency\n", 744 | " BAS business year start frequency\n", 745 | " BH business hour frequency\n", 746 | " H hourly frequency\n", 747 | " T minutely frequency\n", 748 | " S secondly frequency\n", 749 | " L milliseconds\n", 750 | " U microseconds\n", 751 | " N nanoseconds" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": null, 757 | "metadata": { 758 | "collapsed": false 759 | }, 760 | "outputs": [], 761 | "source": [ 762 | "# let's say we want the mean, and also the count.\n", 763 | "# we can pass a list of methods to the `how`\n", 764 | "yr_bins = actor_df.set_index('release_date').resample('5AS', how=['mean','count','sem'])\n", 765 | "yr_bins.head()" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": null, 771 | "metadata": { 772 | "collapsed": false 773 | }, 774 | "outputs": [], 775 | "source": [ 776 | "# or you can get very fancy and pass a dict of dicts\n", 777 | "# the first key references the DataFrame's original column name\n", 778 | "# the second key defines the name of a new column.\n", 779 | "yr_bins = actor_df.set_index('release_date').resample('5AS', how={\n", 780 | " 'production_budget':{'avg':'mean', 'ct':'count', 'stdEm':'sem'},\n", 781 | " 'domestic_gross':{'low':'min', 'high':'max'},\n", 782 | " 'worldwide_gross':{'total':'sum'}})\n", 783 | "yr_bins" 784 | ] 785 | }, 786 | { 787 | "cell_type": "markdown", 788 | "metadata": {}, 789 | "source": [ 790 | "# Special note: try not to 
use method names as column names. It will make indexing more annoying.\n", 791 | "## For example, a column named 'mean' will cause a collision when you call `df.mean`\n", 792 | "## The `mean` method will have precedence.\n", 793 | "\n", 794 | "You'd only be able to access the column like: `df['mean']`" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": null, 800 | "metadata": { 801 | "collapsed": true 802 | }, 803 | "outputs": [], 804 | "source": [ 805 | "# PS: 'sem' is standard error of the mean\n", 806 | "# pd.Series.sem?" 807 | ] 808 | }, 809 | { 810 | "cell_type": "markdown", 811 | "metadata": {}, 812 | "source": [ 813 | "# Multiindexing" 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "execution_count": null, 819 | "metadata": { 820 | "collapsed": false 821 | }, 822 | "outputs": [], 823 | "source": [ 824 | "yr_bins.production_budget" 825 | ] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": null, 830 | "metadata": { 831 | "collapsed": false 832 | }, 833 | "outputs": [], 834 | "source": [ 835 | "# chaining the dot column name syntax is fine\n", 836 | "yr_bins.production_budget.avg" 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": null, 842 | "metadata": { 843 | "collapsed": false 844 | }, 845 | "outputs": [], 846 | "source": [ 847 | "# you can also index both levels of the column index by name, as strings\n", 848 | "yr_bins['production_budget','avg']" 849 | ] 850 | }, 851 | { 852 | "cell_type": "markdown", 853 | "metadata": {}, 854 | "source": [ 855 | "## Flattening a multi-level column index\n", 856 | "\n", 857 | "### Use a list comprehension to rewrite the column names" 858 | ] 859 | }, 860 | { 861 | "cell_type": "code", 862 | "execution_count": null, 863 | "metadata": { 864 | "collapsed": false 865 | }, 866 | "outputs": [], 867 | "source": [ 868 | "print yr_bins.columns.values" 869 | ] 870 | }, 871 | { 872 | "cell_type": "code", 873 | "execution_count": null, 874 | "metadata": 
{ 875 | "collapsed": false 876 | }, 877 | "outputs": [], 878 | "source": [ 879 | "yr_bins_flat = yr_bins.copy()\n", 880 | "# use an underscore as a delimiter. But it's up to you.\n", 881 | "yr_bins_flat.columns = ['_'.join(col) for col in yr_bins.columns.values]\n", 882 | "\n", 883 | "yr_bins_flat" 884 | ] 885 | }, 886 | { 887 | "cell_type": "markdown", 888 | "metadata": {}, 889 | "source": [ 890 | "# `pd.cut()`: bins numeric values -> categorical values" 891 | ] 892 | }, 893 | { 894 | "cell_type": "code", 895 | "execution_count": null, 896 | "metadata": { 897 | "collapsed": false 898 | }, 899 | "outputs": [], 900 | "source": [ 901 | "# make some fake data\n", 902 | "no_movies = 10\n", 903 | "ratings_df = pd.DataFrame.from_dict({\n", 904 | " 'rating_no':pd.np.random.rand(no_movies), \n", 905 | " 'movie':df.title.sample(no_movies)})\n", 906 | "# fake gross based on fake rating\n", 907 | "ratings_df['gross'] = pd.np.round(ratings_df.rating_no*100000000, decimals=2)\n", 908 | "\n", 909 | "# save this unmodified version for later\n", 910 | "ratings_df_orig = ratings_df.copy()\n", 911 | "\n", 912 | "ratings_df" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": null, 918 | "metadata": { 919 | "collapsed": false 920 | }, 921 | "outputs": [], 922 | "source": [ 923 | "# cut numerical ratings into N bins\n", 924 | "\n", 925 | "# here's what the labels default to when you don't define your own labels\n", 926 | "ratings_df['rating_category_ugly'] = pd.cut(ratings_df.rating_no, bins=4)\n", 927 | "\n", 928 | "# you can substitute whatever labels you want\n", 929 | "ratings_df['rating_category'] = pd.cut(ratings_df.rating_no, bins=4, labels=['bad','mediocre','good','excellent'])\n", 930 | "\n", 931 | "ratings_df" 932 | ] 933 | }, 934 | { 935 | "cell_type": "code", 936 | "execution_count": null, 937 | "metadata": { 938 | "collapsed": false 939 | }, 940 | "outputs": [], 941 | "source": [ 942 | "# `pd.cut` gives us an excellent way to groupby based on 
bins.\n", 943 | "# Eg, we can use the new categorical ratings to find the mean gross for each rating bin\n", 944 | "print 'mean gross for each rating bin'\n", 945 | "ratings_df.groupby('rating_category').mean()" 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": null, 951 | "metadata": { 952 | "collapsed": false 953 | }, 954 | "outputs": [], 955 | "source": [ 956 | "# Even if we didn't care about assigning labels like 'bad', 'mediocre', etc to the rating numbers,\n", 957 | "# pd.cut is still very useful if we want to groupby on binned numerical data\n", 958 | "\n", 959 | "# We can do this as a one-liner, using the copy of the original ratings_df before we added those extra columns.\n", 960 | "# Let's do 5 bins to switch it up.\n", 961 | "ratings_df_orig.groupby(pd.cut(ratings_df_orig.rating_no, bins=5)).mean()" 962 | ] 963 | }, 964 | { 965 | "cell_type": "markdown", 966 | "metadata": {}, 967 | "source": [ 968 | "### Just like with `resample`, empty bins have *NaN* values." 
969 | ] 970 | }, 971 | { 972 | "cell_type": "code", 973 | "execution_count": null, 974 | "metadata": { 975 | "collapsed": false 976 | }, 977 | "outputs": [], 978 | "source": [ 979 | "ratings_df_orig.groupby(pd.cut(ratings_df_orig.rating_no, bins=5), as_index=False).mean()" 980 | ] 981 | }, 982 | { 983 | "cell_type": "markdown", 984 | "metadata": {}, 985 | "source": [ 986 | "# Bonus: Taking advantage of seaborn's groupby support" 987 | ] 988 | }, 989 | { 990 | "cell_type": "code", 991 | "execution_count": null, 992 | "metadata": { 993 | "collapsed": false 994 | }, 995 | "outputs": [], 996 | "source": [ 997 | "n_top = 15\n", 998 | "# we only want one row per movie, we don't care about actors\n", 999 | "by_movie_df = df.groupby('title').first()\n", 1000 | "by_movie_df.head()" 1001 | ] 1002 | }, 1003 | { 1004 | "cell_type": "code", 1005 | "execution_count": null, 1006 | "metadata": { 1007 | "collapsed": false 1008 | }, 1009 | "outputs": [], 1010 | "source": [ 1011 | "# select only the top N studios, by total production budget of all movies\n", 1012 | "top_studio_names = by_movie_df.groupby('studio').sum().sort_values(\n", 1013 | " 'production_budget', ascending=False).index[:n_top]\n", 1014 | "\n", 1015 | "top_studio_df = by_movie_df[by_movie_df.studio.isin(top_studio_names)]\n", 1016 | "\n", 1017 | "print top_studio_names\n", 1018 | "top_studio_df.head()" 1019 | ] 1020 | }, 1021 | { 1022 | "cell_type": "code", 1023 | "execution_count": null, 1024 | "metadata": { 1025 | "collapsed": false 1026 | }, 1027 | "outputs": [], 1028 | "source": [ 1029 | "import seaborn as sns\n", 1030 | "\n", 1031 | "# make the size of the figure bigger (width,height)\n", 1032 | "plt.figure(figsize=(14,8))\n", 1033 | "\n", 1034 | "# we pass the studio column to sns.violinplot\n", 1035 | "sns.violinplot(top_studio_df.production_budget, groupby=top_studio_df.studio)\n", 1036 | "plt.title('Production budget distributions for the top 10 studios');" 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": 
"code", 1041 | "execution_count": null, 1042 | "metadata": { 1043 | "collapsed": true 1044 | }, 1045 | "outputs": [], 1046 | "source": [] 1047 | } 1048 | ], 1049 | "metadata": { 1050 | "kernelspec": { 1051 | "display_name": "Python 2", 1052 | "language": "python", 1053 | "name": "python2" 1054 | }, 1055 | "language_info": { 1056 | "codemirror_mode": { 1057 | "name": "ipython", 1058 | "version": 2 1059 | }, 1060 | "file_extension": ".py", 1061 | "mimetype": "text/x-python", 1062 | "name": "python", 1063 | "nbconvert_exporter": "python", 1064 | "pygments_lexer": "ipython2", 1065 | "version": "2.7.11" 1066 | } 1067 | }, 1068 | "nbformat": 4, 1069 | "nbformat_minor": 0 1070 | } 1071 | --------------------------------------------------------------------------------