├── Week3 ├── Week3.py ├── Week3 slides.pdf ├── Assignment - Week 3 │ ├── scimagojr-3.xlsx │ ├── Energy Indicators.xls │ └── Assignment+3.ipynb └── Week+3.ipynb ├── Week4 ├── Week4 slides.pdf └── Week+4.ipynb ├── Week1 ├── Week1.py └── Week+1.ipynb ├── Week2 └── Week2.py └── README.md /Week3/Week3.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Week3/Week3 slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanniey/Coursera_Intro_to_Data_Science_with_Python/HEAD/Week3/Week3 slides.pdf -------------------------------------------------------------------------------- /Week4/Week4 slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanniey/Coursera_Intro_to_Data_Science_with_Python/HEAD/Week4/Week4 slides.pdf -------------------------------------------------------------------------------- /Week3/Assignment - Week 3/scimagojr-3.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanniey/Coursera_Intro_to_Data_Science_with_Python/HEAD/Week3/Assignment - Week 3/scimagojr-3.xlsx -------------------------------------------------------------------------------- /Week3/Assignment - Week 3/Energy Indicators.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanniey/Coursera_Intro_to_Data_Science_with_Python/HEAD/Week3/Assignment - Week 3/Energy Indicators.xls -------------------------------------------------------------------------------- /Week1/Week1.py: -------------------------------------------------------------------------------- 1 | people = ['Dr. Christopher Brooks', 'Dr. Kevyn Collins-Thompson', 'Dr. VG Vinod Vydiswaran', 'Dr. 
Daniel Romero'] 2 | 3 | titleName = [] 4 | def split_title_and_name(): 5 | for person in people: 6 | last = person.split(" ")[-1] 7 | title = person.split(" ")[0] 8 | titleName.append(title + " "+last) 9 | print(titleName) 10 | 11 | split_title_and_name() 12 | # list(map(split_title_and_name, people) 13 | -------------------------------------------------------------------------------- /Week2/Week2.py: -------------------------------------------------------------------------------- 1 | Week 2 Assignment 2 | 3 | Question 1 4 | Which country has won the most gold medals in summer games? 5 | This function should return a single string value. 6 | 7 | ``` 8 | def answer_one(): 9 | return df['Gold'].idxmax() 10 | 11 | answer_one() 12 | ``` 13 | 14 | Question 2¶ 15 | Which country had the biggest difference between their summer and winter gold medal counts? 16 | This function should return a single string value. 17 | ``` 18 | def answer_two(): 19 | max_diff=max(df['Gold']-df['Gold.1']) 20 | answer = df[(df['Gold']-df['Gold.1'])==max_diff].index.tolist() 21 | return answer[0] 22 | 23 | answer_two() 24 | ``` 25 | 26 | Question 3 27 | 28 | Which country has the biggest difference between their summer gold medal counts and winter gold medal counts relative to their total gold medal count? 29 | (Summer Gold−Winter Gold)/Total Gold 30 | 31 | Only include countries that have won at least 1 gold in both summer and winter. 32 | This function should return a single string value. 
33 | ``` 34 | def answer_three(): 35 | df_nozero = df[(df['Gold']>0) & (df['Gold.1']>0)] 36 | percentage = max(abs((df_nozero['Gold']-df_nozero['Gold.1'])/df_nozero['Gold.2'])) 37 | return df[((df['Gold']-df['Gold.1'])/df['Gold.2'])==percentage].index.tolist()[0] 38 | 39 | answer_three() 40 | ``` 41 | 42 | 43 | Question 4¶ 44 | Write a function that creates a Series called "Points" which is a weighted value where each gold medal (Gold.2) counts for 3 points, silver medals (Silver.2) for 2 points, and bronze medals (Bronze.2) for 1 point. The function should return only the column (a Series object) which you created. 45 | This function should return a Series named Points of length 146 46 | 47 | ``` 48 | def answer_four(): 49 | df['Points']= (df['Gold.2']*3+df['Silver.2']*2+df['Bronze.2']) 50 | return df['Points'] 51 | 52 | answer_four() 53 | ``` 54 | 55 | Question 5 56 | Question 5¶ 57 | Which state has the most counties in it? (hint: consider the sumlevel key carefully! You'll need this for future questions too...) 58 | This function should return a single string value. 59 | ``` 60 | 61 | def answer_five(): 62 | new_df = census_df[census_df['SUMLEV'] == 50] 63 | return new_df.groupby('STNAME').count()['SUMLEV'].idxmax() 64 | 65 | answer_five() 66 | ``` 67 | 68 | -------------------------------------------------------------------------------- /Week4/Week+4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "---\n", 8 | "\n", 9 | "_You are currently looking at **version 1.0** of this notebook. 
To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-data-analysis/resources/0dhYG) course resource._\n", 10 | "\n", 11 | "---" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Distributions in Pandas" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "import pandas as pd\n", 30 | "import numpy as np" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "np.random.binomial(1, 0.5)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "np.random.binomial(1000, 0.5)/1000" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "chance_of_tornado = 0.01/100\n", 64 | "np.random.binomial(100000, chance_of_tornado)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "chance_of_tornado = 0.01\n", 76 | "\n", 77 | "tornado_events = np.random.binomial(1, chance_of_tornado, 1000000)\n", 78 | " \n", 79 | "two_days_in_a_row = 0\n", 80 | "for j in range(1,len(tornado_events)-1):\n", 81 | " if tornado_events[j]==1 and tornado_events[j-1]==1:\n", 82 | " two_days_in_a_row+=1\n", 83 | "\n", 84 | "print('{} tornadoes back to back in {} years'.format(two_days_in_a_row, 1000000/365))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "collapsed": false 92 | }, 93 | "outputs": [], 94 | 
"source": [ 95 | "np.random.uniform(0, 1)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "np.random.normal(0.75)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "Formula for standard deviation\n", 114 | "$$\\sqrt{\\frac{1}{N} \\sum_{i=1}^N (x_i - \\overline{x})^2}$$" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "distribution = np.random.normal(0.75,size=1000)\n", 126 | "\n", 127 | "np.sqrt(np.sum((np.mean(distribution)-distribution)**2)/len(distribution))" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "collapsed": false, 135 | "scrolled": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "np.std(distribution)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "import scipy.stats as stats\n", 151 | "stats.kurtosis(distribution)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "stats.skew(distribution)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "chi_squared_df2 = np.random.chisquare(2, size=10000)\n", 174 | "stats.skew(chi_squared_df2)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [], 184 | "source": [ 185 | "chi_squared_df5 = np.random.chisquare(5, size=10000)\n", 186 | 
"stats.skew(chi_squared_df5)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": { 193 | "collapsed": false 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "%matplotlib inline\n", 198 | "import matplotlib\n", 199 | "import matplotlib.pyplot as plt\n", 200 | "\n", 201 | "output = plt.hist([chi_squared_df2,chi_squared_df5], bins=50, histtype='step', \n", 202 | " label=['2 degrees of freedom','5 degrees of freedom'])\n", 203 | "plt.legend(loc='upper right')\n" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "# Hypothesis Testing" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "df = pd.read_csv('grades.csv')" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "df.head()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "collapsed": false 240 | }, 241 | "outputs": [], 242 | "source": [ 243 | "len(df)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "early = df[df['assignment1_submission'] <= '2015-12-31']\n", 255 | "late = df[df['assignment1_submission'] > '2015-12-31']" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": { 262 | "collapsed": false 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "early.mean()" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "collapsed": false 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "late.mean()" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 
| "execution_count": null, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "from scipy import stats\n", 289 | "stats.ttest_ind?" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": { 296 | "collapsed": false 297 | }, 298 | "outputs": [], 299 | "source": [ 300 | "stats.ttest_ind(early['assignment1_grade'], late['assignment1_grade'])" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": { 307 | "collapsed": false 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "stats.ttest_ind(early['assignment2_grade'], late['assignment2_grade'])" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": { 318 | "collapsed": false 319 | }, 320 | "outputs": [], 321 | "source": [ 322 | "stats.ttest_ind(early['assignment3_grade'], late['assignment3_grade'])" 323 | ] 324 | } 325 | ], 326 | "metadata": { 327 | "kernelspec": { 328 | "display_name": "Python 3", 329 | "language": "python", 330 | "name": "python3" 331 | }, 332 | "language_info": { 333 | "codemirror_mode": { 334 | "name": "ipython", 335 | "version": 3 336 | }, 337 | "file_extension": ".py", 338 | "mimetype": "text/x-python", 339 | "name": "python", 340 | "nbconvert_exporter": "python", 341 | "pygments_lexer": "ipython3", 342 | "version": "3.5.2" 343 | } 344 | }, 345 | "nbformat": 4, 346 | "nbformat_minor": 0 347 | } 348 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Intro to Data Science in Python 2 | ## University of Michigan, Professor Christopher Brooks, Coursera course 3 | ### 11/2016 - Completed on 04/12/2016 4 | 5 | Summary: 6 | Despite the course name, this is an intermediate-level data science course with Python. 
Familiarity with Numpy and Pandas libraries is not required, but is highly recommended, as the course does get pretty intense really quickly (i.e. Week 2). To be honest, this is a solid course for someone who has a background with the Pandas and NumPy libraries. However, there is a big knowledge gap between the videos and the assignments, so it's challenging for beginners. 7 | 8 | 9 | 10 | Feedback: 11 | 12 | ![My feeling while taking this course...](https://imgs.xkcd.com/comics/python.png) 13 | 14 | > My feeling while taking this course... 15 | 16 | 04/12/2016: 17 | Finally finished this...was close to giving up on it SO MANY TIMES! 18 | 19 | 20 | ## Week 4 Statistical Analysis in Python and Project 21 | 22 | 23 | Binomial Distribution in numpy for coin flipping 24 | 25 | ``` 26 | np.random.binomial(1,0.5) 27 | ``` 28 | First term (1) is the number of trials (coin flips) per experiment, and second term (0.5) is the probability of success on each trial 29 | 30 | ``` 31 | np.random.binomial(1000, 0.5)/1000 32 | ``` 33 | Flip coins 1000 times, and divide the result by 1000 34 | 35 | Run 10,000 simulations of flipping coins 20 times and getting a number >= 15. 36 | 37 | ``` 38 | x = np.random.binomial(20, .5, 10000) 39 | print((x>=15).mean()) 40 | ``` 41 | Output: 42 | ``` 43 | 0.0219 44 | ``` 45 | 46 | Get the number of events given the no. of simulations. 47 | "How many tornados will take place based on 100,000 simulations, given that the chance of a tornado is 0.01%?" 48 | 49 | ``` 50 | chance_of_tornado = 0.01/100 51 | np.random.binomial(100000, chance_of_tornado) 52 | ``` 53 | Output: 54 | ``` 55 | 8 56 | ``` 57 | 58 | "Assume the chance of tornado is 1%. How many tornados will take place (what is the chance of tornados taking place) two days in a row based on 1000000 simulations?" 
59 | 60 | ``` 61 | chance_of_tornado = 0.01 62 | 63 | tornado_events = np.random.binomial(1, chance_of_tornado, 1000000) 64 | 65 | two_days_in_a_row = 0 66 | for j in range(1,len(tornado_events)-1): 67 | if tornado_events[j]==1 and tornado_events[j-1]==1: 68 | two_days_in_a_row+=1 69 | 70 | print('{} tornadoes back to back in {} years'.format(two_days_in_a_row, 1000000/365)) 71 | ``` 72 | Output: 73 | ``` 74 | 103 tornadoes back to back in 2739.72602739726 years 75 | ``` 76 | tornado_events[j]==1 means a day when a tornado took place. 77 | 78 | #### Standard deviation 79 | 80 | Draw 1000 samples of a normal distribution, with expected value of 0.75 and a standard deviation of 1. Result is ~ 68% of area. 81 | ``` 82 | distribution = np.random.normal(0.75,size=1000) 83 | 84 | np.sqrt(np.sum((np.mean(distribution)-distribution)**2)/len(distribution)) 85 | ``` 86 | The above code is equivalent to the np.std() function: 87 | ``` 88 | np.std(distribution) 89 | ``` 90 | 91 | #### Kurtosis (shape of tails) with stats module 92 | 93 | Positive value = more chubby than a normal distribution 94 | Negative value = more flat than a normal distribution 95 | 96 | ``` 97 | import scipy.stats as stats 98 | stats.kurtosis(distribution) 99 | 100 | ``` 101 | Output: 102 | ``` 103 | -0.21162400583818153 104 | ``` 105 | 106 | #### Skew with stats module 107 | If skew = 0, then there's no skew (i.e. 
the distribution is symmetric) 108 | 109 | ``` 110 | stats.skew(distribution) 111 | ``` 112 | Output: 113 | ``` 114 | 0.051147428570855365 115 | ``` 116 | 117 | 118 | #### Chi squared distribution (right-skewed) 119 | As the degree of freedom increases, the plot moves from left to center 120 | 121 | Degree of freedom = 2: 122 | ``` 123 | chi_squared_df2 = np.random.chisquare(2, size=10000) 124 | stats.skew(chi_squared_df2) 125 | ``` 126 | Output: 127 | ``` 128 | 1.9589902136938178 129 | ``` 130 | 131 | Degree of freedom = 5: 132 | ``` 133 | chi_squared_df5 = np.random.chisquare(5, size=10000) 134 | stats.skew(chi_squared_df5) 135 | ``` 136 | Output: 137 | ``` 138 | 1.3010399138921354 139 | ``` 140 | #### Bimodal distribution (having 2 peaks) 141 | 142 | #### Hypothesis Testing 143 | Alternative Hypothesis vs. Null Hypothesis 144 | Significance level (alpha), 145 | alpha = 0.05 or 5% 146 | 147 | #### t-test: compare the means of two different populations 148 | 149 | stats.ttest_ind(): compare 2 different samples to see if they have different means. In this case, we're using ttest_ind() to compare the average grade of assignment 1 between early users ('early' dataframe) and late users ('late' dataframe). 150 | 151 | Output is a tuple with a test statistic and a p-value. 152 | 153 | 154 | ``` 155 | import scipy.stats as stats 156 | 157 | early = df[df['assignment1_submission'] <= '2015-12-31'] 158 | late = df[df['assignment1_submission'] > '2015-12-31'] 159 | 160 | stats.ttest_ind(early['assignment1_grade'], late['assignment1_grade']) 161 | ``` 162 | Output: 163 | ``` 164 | Ttest_indResult(statistic=1.400549944897566, pvalue=0.16148283016060577) 165 | ``` 166 | 167 | If the p-value is >0.05 (the significance value/alpha we decided previously), then we cannot reject the null hypothesis. 
168 | 169 | Do the same test on assignment 2: 170 | ``` 171 | stats.ttest_ind(early['assignment2_grade'], late['assignment2_grade']) 172 | ``` 173 | Output: 174 | ``` 175 | Ttest_indResult(statistic=1.3239868220912567, pvalue=0.18563824610067967) 176 | In [ ]: 177 | ``` 178 | p-value is still >0.05, so we cannot reject the null hypothesis. 179 | --- 180 | 181 | ## Week 3 Advanced Python Pandas 182 | 183 | ![Finished Week 3's assignment](http://cdn.someecards.com/someecards/usercards/MjAxMi1mNWM4MDQ3MTJkODYzMzhi.png) 184 | 185 | > Finally finished Week 3's assignment. 186 | 187 | 11/27/2016 Update 188 | Finally finished this week's assignment! The first one took a long time. I had to relearn regular expressions because of it. Learned a lot about dataframes through the practices, so I'm happy about the progress eventually, but Jesus, that was a lot of work... 189 | 190 | Merging dataframes based on the same index. "NaN" is assigned when there's a missing value. 191 | 192 | #### iloc() and loc() 193 | iloc() for query based on location 194 | loc() for query based on label 195 | 196 | #### Outer vs inner join 197 | 198 | Outer Join 199 | ``` 200 | pd.merge(df1,df2,how='outer',left_index=True,right_index=True) 201 | ``` 202 | Inner Join 203 | ``` 204 | pd.merge(df1,df2,how='inner',left_index=True,right_index=True) 205 | ``` 206 | Left Join: keep all information from df1 207 | ``` 208 | pd.merge(df1,df2,how='left',left_index=True,right_index=True) 209 | ``` 210 | Right Join: keep all information from df2 211 | ``` 212 | pd.merge(df1,df2,how='right',left_index=True,right_index=True) 213 | ``` 214 | Join by Column names 215 | ``` 216 | pd.merge(df1,df2,how='left',left_on='Name',right_on='Name') 217 | ``` 218 | 219 | Chain indexing - not recommended 220 | ``` 221 | df.loc['Washtenaw']['Total Population'] 222 | ``` 223 | 224 | Method chaining 225 | ``` 226 | (df.where(df['SUMLEV']==50) 227 | .dropna() 228 | .set_index(['STNAME','CTYNAME']) 229 | 
.rename(columns={'ESTIMATESBASE2010': 'Estimates Base 2010'})) 230 | ``` 231 | Drop rows where 'Quantity' is 0, and rename the column 'Weight' to 'Weight(oz.)' 232 | ``` 233 | df = df[df.Quantity !=0].rename({'Weight':'Weight(oz.)'}) 234 | ``` 235 | Alternatively: 236 | ``` 237 | print(df.drop(df[df['Quantity'] == 0].index).rename(columns={'Weight': 'Weight (oz.)'})) 238 | ``` 239 | 240 | #### Apply() function which applies a function to all rows in a dataframe 241 | 242 | To apply to all columns in the same row(i.e.1 = across), use axis= 1 243 | To apply to all rows in the same column (i.e. 0 = down), use axis = 0 244 | 245 | ``` 246 | import numpy as np 247 | def min_max(row): 248 | data = row[['POPESTIMATE2010', 249 | 'POPESTIMATE2011', 250 | 'POPESTIMATE2012', 251 | 'POPESTIMATE2013', 252 | 'POPESTIMATE2014', 253 | 'POPESTIMATE2015']] 254 | return pd.Series({'min': np.min(data), 'max': np.max(data)}) 255 | 256 | df.apply(min_max, axis=1) 257 | ``` 258 | Adding the applied function to the existing dataframe (instead of creating a new one) 259 | ``` 260 | import numpy as np 261 | def min_max(row): 262 | data = row[['POPESTIMATE2010', 263 | 'POPESTIMATE2011', 264 | 'POPESTIMATE2012', 265 | 'POPESTIMATE2013', 266 | 'POPESTIMATE2014', 267 | 'POPESTIMATE2015']] 268 | row['max'] = np.max(data) 269 | row['min'] = np.min(data) 270 | return row 271 | df.apply(min_max, axis=1) 272 | ``` 273 | Use apply() with lambda function: 274 | create a function with the max of each row 275 | ``` 276 | rows = ['POPESTIMATE2010', 277 | 'POPESTIMATE2011', 278 | 'POPESTIMATE2012', 279 | 'POPESTIMATE2013', 280 | 'POPESTIMATE2014', 281 | 'POPESTIMATE2015'] 282 | df.apply(lambda x: np.max(x[rows]), axis=1) 283 | ``` 284 | 285 | #### Groupby() 286 | you can use a function to be the criteria for group_by() 287 | ``` 288 | df = df.set_index('STNAME') 289 | 290 | def fun(item): 291 | if item[0]<'M': 292 | return 0 293 | if item[0]<'Q': 294 | return 1 295 | return 2 296 | 297 | for group, frame 
in df.groupby(fun): 298 | print('There are ' + str(len(frame)) + ' records in group ' + str(group) + ' for processing.') 299 | 300 | ``` 301 | Calculate the average/sum of a certain group with groupby() and agg() 302 | ``` 303 | df.groupby('STNAME').agg({'CENSUS2010POP': np.average}) 304 | ``` 305 | ``` 306 | print(df.groupby('Category').agg('sum')) 307 | ``` 308 | 309 | #### Use apply() with groupby() 310 | ``` 311 | def totalweight(df, w, q): 312 | return sum(df[w] * df[q]) 313 | 314 | print(df.groupby('Category').apply(totalweight, 'Weight (oz.)', 'Quantity')) 315 | ``` 316 | 317 | #### Scales 318 | Use astype() to change the type of scales from one to another 319 | 320 | create a list and use astype() to indicate the order with ordered = True. This enables > or < to be used on strings. 321 | 322 | ``` 323 | df = pd.DataFrame(['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D'], 324 | index=['excellent', 'excellent', 'excellent', 'good', 'good', 'good', 'ok', 'ok', 'ok', 'poor', 'poor']) 325 | df.rename(columns={0: 'Grades'}, inplace=True) 326 | 327 | grades = df['Grades'].astype('category', 328 | categories=['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+'], 329 | ordered=True) 330 | grades.head() 331 | ``` 332 | output is: 333 | ``` 334 | excellent A+ 335 | excellent A 336 | excellent A- 337 | good B+ 338 | good B 339 | Name: Grades, dtype: category 340 | Categories (11, object): [D < D+ < C- < C ... 
B+ < A- < A < A+] 341 | 342 | ``` 343 | Use > or < functions on types, output: 344 | ``` 345 | excellent True 346 | excellent True 347 | excellent True 348 | good True 349 | good True 350 | good True 351 | ok True 352 | ok False 353 | ok False 354 | poor False 355 | poor False 356 | Name: Grades, dtype: bool 357 | ``` 358 | 359 | Change this series to categorical with ordering Low < Medium < High 360 | 361 | ``` 362 | s = pd.Series(['Low', 'Low', 'High', 'Medium', 'Low', 'High', 'Low']) 363 | 364 | s.astype('category', categories=['Low', 'Medium', 'High'], ordered=True) 365 | ``` 366 | 367 | Use get_dummies() to convert boolean values into 0s and 1s 368 | 369 | #### cut(): to cut data into bins (i.e. to divide them equally into 10 buckets) 370 | 371 | ``` 372 | df = pd.read_csv('census.csv') 373 | df = df[df['SUMLEV']==50] 374 | df = df.set_index('STNAME').groupby(level=0)['CENSUS2010POP'].agg({'avg': np.average}) 375 | pd.cut(df['avg'],10) 376 | ``` 377 | Cut a series into 3 equal-sized bins 378 | ``` 379 | s = pd.Series([168, 180, 174, 190, 170, 185, 179, 181, 175, 169, 182, 177, 180, 171]) 380 | 381 | 382 | pd.cut(s, 3) 383 | 384 | # You can also add labels for the sizes [Small < Medium < Large]. 385 | pd.cut(s, 3, labels=['Small', 'Medium', 'Large']) 386 | ``` 387 | 388 | #### Use pivot_table() to create Pivot Tables 389 | 390 | ``` 391 | df = pd.read_csv('cars.csv') 392 | df.pivot_table(values='(kW)', index='YEAR', columns='Make', aggfunc=np.mean) 393 | ``` 394 | 395 | Create a pivot table that shows mean price and mean ratings for every "Manufacturer"/"Bike Type" combination 396 | ``` 397 | print(pd.pivot_table(Bikes, index=['Manufacturer','Bike Type'])) 398 | 399 | import numpy as np 400 | print(Bikes.pivot_table(values ='Price',index = 'Manufacturer',columns = 'Bike Type',aggfunc=np.average)) 401 | ``` 402 | 403 | #### Date Functionality in Panda 404 | 1. Timestamp 405 | 2. DatetimeIndex (the index of 1) 406 | 3. Period 407 | 4. 
PeriodIndex (the index of 3) 408 | 409 | 1. Timestamp, exchangeable to Python's datetime 410 | ⋅⋅⋅``` 411 | ⋅⋅⋅pd.Timestamp('9/1/2016 10:05AM') 412 | ⋅⋅⋅``` 413 | 414 | 2. Period 415 | ``` 416 | pd.Period('1/2016') 417 | ``` 418 | 419 | 3. DatetimeIndex and PeriodIndex 420 | DatetimeIndex 421 | ``` 422 | t1 = pd.Series(list('abc'), [pd.Timestamp('2016-09-01'), pd.Timestamp('2016-09-02'), pd.Timestamp('2016-09-03')]) 423 | 424 | type(t1.index) 425 | 426 | ``` 427 | Output: 428 | ``` 429 | pandas.tseries.index.DatetimeIndex 430 | ``` 431 | PeriodIndex 432 | ``` 433 | t2 = pd.Series(list('def'), [pd.Period('2016-09'), pd.Period('2016-10'), pd.Period('2016-11')]) 434 | type(t2.index) 435 | ``` 436 | Output: 437 | ``` 438 | pandas.tseries.period.PeriodIndex 439 | ``` 440 | 441 | Coverts datetimes to the same format with to_datetime() 442 | 443 | ``` 444 | d1 = ['2 June 2013', 'Aug 29, 2014', '2015-06-26', '7/12/16'] 445 | ts3 = pd.DataFrame(np.random.randint(10, 100, (4,2)), index=d1, columns=list('ab')) 446 | ts3.index = pd.to_datetime(ts3.index) 447 | ``` 448 | 449 | use dayfirst = True to change the datetime into European format 450 | ``` 451 | pd.to_datetime('4.7.12', dayfirst=True) 452 | ``` 453 | #### Timedelta: show difference in times 454 | 455 | ``` 456 | pd.Timestamp('9/3/2016')-pd.Timestamp('9/1/2016') 457 | ``` 458 | Output: 459 | ``` 460 | Timedelta('2 days 00:00:00') 461 | ``` 462 | 463 | Calculate datetime with timedelta 464 | ``` 465 | pd.Timestamp('9/2/2016 8:10AM') + pd.Timedelta('12D 3H') 466 | ``` 467 | Output: 468 | ``` 469 | Timestamp('2016-09-14 11:10:00') 470 | ``` 471 | 472 | #### Date_range() 473 | Create a range of dates for bi-weekly on Sundays, starting with a specific date 474 | 475 | ``` 476 | dates = pd.date_range('10-01-2016', periods=9, freq='2W-SUN') 477 | ``` 478 | 479 | #### weekday_name(): check what day of the week it is 480 | ``` 481 | df.index.weekday_name 482 | ``` 483 | 484 | #### diff(): find difference between each day's 
value 485 | ``` 486 | df.diff() 487 | ``` 488 | 489 | #### resample(): frequency conversion. example: find mean count for each month, will show the data as of month end. 'M' stands for month 490 | ``` 491 | df.resample('M').mean() 492 | ``` 493 | 494 | Find values from a specific year, month or a range of dates 495 | 496 | ``` 497 | df['2017'] 498 | df['2016-12'] 499 | df['2016-12':] 500 | 501 | ``` 502 | #### asfreq(): change frequency from bi-weekly to weekly, and fill NaN value with last week's data point 503 | ``` 504 | df.asfreq('W', method='ffill') 505 | ``` 506 | #### matplotlib: visualising a timeseries 507 | 508 | ``` 509 | import matplotlib.pyplot as plt 510 | %matplotlib inline 511 | 512 | df.plot() 513 | ``` 514 | --- 515 | ## Week 2 Basic Data Processing with Pandas 516 | 517 | Dataframe 518 | 519 | ``` 520 | import pandas as pd 521 | purchase_1 = pd.Series({'Name': 'Chris', 522 | 'Item Purchased': 'Dog Food', 523 | 'Cost': 22.50}) 524 | purchase_2 = pd.Series({'Name': 'Kevyn', 525 | 'Item Purchased': 'Kitty Litter', 526 | 'Cost': 2.50}) 527 | purchase_3 = pd.Series({'Name': 'Vinod', 528 | 'Item Purchased': 'Bird Seed', 529 | 'Cost': 5.00}) 530 | df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=['Store 1', 'Store 1', 'Store 2']) 531 | df.head() 532 | ``` 533 | 534 | df.T.loc --> T transforms data 535 | 536 | iloc vs loc: iloc searches by index, loc searches by value 537 | 538 | Avoid chaining as it generally create a copy of the data, instead of simply viewing it. 539 | 540 | Deleting data with df.drop(). It creates a copy of the dataframe with the given rows removed. 
541 | 542 | ``` 543 | df.drop("Store 1") 544 | ``` 545 | 546 | Deleting data with the del statement 547 | 548 | ``` 549 | del copy_df['Name'] 550 | ``` 551 | 552 | apply 20% discount to cost 553 | 554 | ``` 555 | purchase_1 = pd.Series({'Name': 'Chris', 556 | 'Item Purchased': 'Dog Food', 557 | 'Cost': 22.50}) 558 | purchase_2 = pd.Series({'Name': 'Kevyn', 559 | 'Item Purchased': 'Kitty Litter', 560 | 'Cost': 2.50}) 561 | purchase_3 = pd.Series({'Name': 'Vinod', 562 | 'Item Purchased': 'Bird Seed', 563 | 'Cost': 5.00}) 564 | 565 | df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=['Store 1', 'Store 1', 'Store 2']) 566 | 567 | 568 | df['Cost'] *= 0.8 569 | print(df) 570 | ``` 571 | 572 | Pandas' read_csv() function, making first column the index 573 | 574 | ``` 575 | df = pd.read_csv('olympics.csv', index_col=0, skiprows=1) 576 | ``` 577 | 578 | Change column names with rename() method 579 | 580 | ``` 581 | for col in df.columns: 582 | if col[:2]=='01': 583 | df.rename(columns={col:'Gold' + col[4:]}, inplace=True) 584 | if col[:2]=='02': 585 | df.rename(columns={col:'Silver' + col[4:]}, inplace=True) 586 | if col[:2]=='03': 587 | df.rename(columns={col:'Bronze' + col[4:]}, inplace=True) 588 | if col[:1]=='№': 589 | df.rename(columns={col:'#' + col[1:]}, inplace=True) 590 | 591 | df.head() 592 | ``` 593 | 594 | Boolean masking: applying a boolean (True or False) filter/mask to a dataframe/array with where() function 595 | 596 | ``` 597 | only_gold = df.where(df['Gold']>0) 598 | only_gold.head() 599 | ``` 600 | 601 | Drop rows when there is no data with the dropna() function 602 | 603 | ``` 604 | only_gold = only_gold.dropna() 605 | ``` 606 | 607 | Chaining boolean masks 608 | 609 | ``` 610 | 611 | len(df[(df['Gold'] > 0) | (df['Gold.1'] > 0)]) 612 | 613 | df[(df['Gold.1'] > 0) & (df['Gold'] == 0)] 614 | 615 | ``` 616 | 617 | Return all names of people who spend more than $3.00 618 | ``` 619 | purchase_1 = pd.Series({'Name': 'Chris', 620 | 'Item Purchased': 'Dog 
Food', 621 | 'Cost': 22.50}) 622 | purchase_2 = pd.Series({'Name': 'Kevyn', 623 | 'Item Purchased': 'Kitty Litter', 624 | 'Cost': 2.50}) 625 | purchase_3 = pd.Series({'Name': 'Vinod', 626 | 'Item Purchased': 'Bird Seed', 627 | 'Cost': 5.00}) 628 | 629 | df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=['Store 1', 'Store 1', 'Store 2']) 630 | df['Name'][df['Cost']>3] 631 | ``` 632 | 633 | Set_index() function 634 | 635 | Reindex the purchase records Dataframe to be index hierarchically, first by store, then by person. Name these indexes "Location" and "Name". Then add a new entry to it with the value of: 636 | 637 | Name: "Kevyn", Item Purchased: "Kitty Food", Cost: 3.00 Location:"Store 2". 638 | 639 | ``` 640 | purchase_1 = pd.Series({'Name': 'Chris', 641 | 'Item Purchased': 'Dog Food', 642 | 'Cost': 22.50}) 643 | purchase_2 = pd.Series({'Name': 'Kevyn', 644 | 'Item Purchased': 'Kitty Litter', 645 | 'Cost': 2.50}) 646 | purchase_3 = pd.Series({'Name': 'Vinod', 647 | 'Item Purchased': 'Bird Seed', 648 | 'Cost': 5.00}) 649 | 650 | df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=['Store 1', 'Store 1', 'Store 2']) 651 | 652 | 653 | df = df.set_index([df.index, 'Name']) 654 | df.index.names = ['Location', 'Name'] 655 | df = df.append(pd.Series(data={'Cost': 3.00, 'Item Purchased': 'Kitty Food'}, name=('Store 2', 'Kevyn'))) 656 | ``` 657 | --- 658 | 659 | 660 | ## Week 1 661 | 662 | ####List Indexing and Slicing 663 | 664 | Example 1 665 | 666 | ``` 667 | people = ['Dr. Christopher Brooks', 'Dr. Kevyn Collins-Thompson', 'Dr. VG Vinod Vydiswaran', 'Dr. Daniel Romero'] 668 | 669 | titleName = [] 670 | def split_title_and_name(): 671 | for person in people: 672 | last = person.split(" ")[-1] 673 | title = person.split(" ")[0] 674 | titleName.append(title + " "+last) 675 | print(titleName) 676 | 677 | split_title_and_name() 678 | ``` 679 | 680 | 681 | Example 2 682 | 683 | ``` 684 | people = ['Dr. Christopher Brooks', 'Dr. 
Kevyn Collins-Thompson', 'Dr. VG Vinod Vydiswaran', 'Dr. Daniel Romero'] 685 | 686 | def split_title_and_name(person): 687 | return person.split(" ")[0] + " " + person.split(" ")[-1] 688 | 689 | list(map(split_title_and_name,people)) 690 | ``` 691 | 692 | Example 3 (official answer) 693 | 694 | ``` 695 | people = ['Dr. Christopher Brooks', 'Dr. Kevyn Collins-Thompson', 'Dr. VG Vinod Vydiswaran', 'Dr. Daniel Romero'] 696 | 697 | def split_title_and_name(person): 698 | title = person.split()[0] 699 | lastname = person.split()[-1] 700 | return '{} {}'.format(title, lastname) 701 | 702 | list(map(split_title_and_name, people)) 703 | ``` 704 | 705 | 706 | Lambda functions (for writing quick one-liner functions) 707 | 708 | ``` 709 | my_function = lambda a,b: a+b 710 | my_function(1,2) 711 | ``` 712 | 713 | list comprehension (list all even numbers in range 0 - 1000) 714 | 715 | ``` 716 | my_list = [number for number in range(0,1000) if number % 2==0] 717 | ``` 718 | 719 | 720 | 721 | ``` 722 | def times_tables(): 723 | lst = [] 724 | for i in range(10): 725 | for j in range (10): 726 | lst.append(i*j) 727 | return lst 728 | 729 | times_tables() == [j*i for i in range(10) for j in range(10)] 730 | ``` 731 | 732 | ``` 733 | lowercase = 'abcdefghijklmnopqrstuvwxyz' 734 | digits = '0123456789' 735 | 736 | correct_answer = [a+b+c+d for a in lowercase for b in lowercase for c in digits for d in digits] 737 | 738 | correct_answer[:50] # Display first 50 ids 739 | ``` -------------------------------------------------------------------------------- /Week3/Week+3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "---\n", 8 | "\n", 9 | "_You are currently looking at **version 1.0** of this notebook. 
To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-data-analysis/resources/0dhYG) course resource._\n", 10 | "\n", 11 | "---" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Merging Dataframes\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/html": [ 31 | "
\n", 32 | "\n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | "
CostItem PurchasedName
Store 122.5SpongeChris
Store 12.5Kitty LitterKevyn
Store 25.0SpoonFilip
\n", 62 | "
" 63 | ], 64 | "text/plain": [ 65 | " Cost Item Purchased Name\n", 66 | "Store 1 22.5 Sponge Chris\n", 67 | "Store 1 2.5 Kitty Litter Kevyn\n", 68 | "Store 2 5.0 Spoon Filip" 69 | ] 70 | }, 71 | "execution_count": 1, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "import pandas as pd\n", 78 | "\n", 79 | "df = pd.DataFrame([{'Name': 'Chris', 'Item Purchased': 'Sponge', 'Cost': 22.50},\n", 80 | " {'Name': 'Kevyn', 'Item Purchased': 'Kitty Litter', 'Cost': 2.50},\n", 81 | " {'Name': 'Filip', 'Item Purchased': 'Spoon', 'Cost': 5.00}],\n", 82 | " index=['Store 1', 'Store 1', 'Store 2'])\n", 83 | "df" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "df['Date'] = ['December 1', 'January 1', 'mid-May']\n", 95 | "df" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "df['Delivered'] = True\n", 107 | "df" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "df['Feedback'] = ['Positive', None, 'Negative']\n", 119 | "df" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "adf = df.reset_index()\n", 131 | "adf['Date'] = pd.Series({0: 'December 1', 2: 'mid-May'})\n", 132 | "adf" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "staff_df = pd.DataFrame([{'Name': 'Kelly', 'Role': 'Director of HR'},\n", 144 | " {'Name': 'Sally', 'Role': 'Course liasion'},\n", 145 | " {'Name': 'James', 'Role': 
'Grader'}])\n", 146 | "staff_df = staff_df.set_index('Name')\n", 147 | "student_df = pd.DataFrame([{'Name': 'James', 'School': 'Business'},\n", 148 | " {'Name': 'Mike', 'School': 'Law'},\n", 149 | " {'Name': 'Sally', 'School': 'Engineering'}])\n", 150 | "student_df = student_df.set_index('Name')\n", 151 | "print(staff_df.head())\n", 152 | "print()\n", 153 | "print(student_df.head())" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": false, 161 | "scrolled": true 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "pd.merge(staff_df, student_df, how='outer', left_index=True, right_index=True)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [], 175 | "source": [ 176 | "pd.merge(staff_df, student_df, how='inner', left_index=True, right_index=True)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "pd.merge(staff_df, student_df, how='left', left_index=True, right_index=True)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "collapsed": false 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "pd.merge(staff_df, student_df, how='right', left_index=True, right_index=True)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": false, 206 | "scrolled": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "staff_df = staff_df.reset_index()\n", 211 | "student_df = student_df.reset_index()\n", 212 | "pd.merge(staff_df, student_df, how='left', left_on='Name', right_on='Name')" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "collapsed": false 220 | }, 221 | "outputs": [], 
222 | "source": [ 223 | "staff_df = pd.DataFrame([{'Name': 'Kelly', 'Role': 'Director of HR', 'Location': 'State Street'},\n", 224 | " {'Name': 'Sally', 'Role': 'Course liasion', 'Location': 'Washington Avenue'},\n", 225 | " {'Name': 'James', 'Role': 'Grader', 'Location': 'Washington Avenue'}])\n", 226 | "student_df = pd.DataFrame([{'Name': 'James', 'School': 'Business', 'Location': '1024 Billiard Avenue'},\n", 227 | " {'Name': 'Mike', 'School': 'Law', 'Location': 'Fraternity House #22'},\n", 228 | " {'Name': 'Sally', 'School': 'Engineering', 'Location': '512 Wilson Crescent'}])\n", 229 | "pd.merge(staff_df, student_df, how='left', left_on='Name', right_on='Name')" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "staff_df = pd.DataFrame([{'First Name': 'Kelly', 'Last Name': 'Desjardins', 'Role': 'Director of HR'},\n", 241 | " {'First Name': 'Sally', 'Last Name': 'Brooks', 'Role': 'Course liasion'},\n", 242 | " {'First Name': 'James', 'Last Name': 'Wilde', 'Role': 'Grader'}])\n", 243 | "student_df = pd.DataFrame([{'First Name': 'James', 'Last Name': 'Hammond', 'School': 'Business'},\n", 244 | " {'First Name': 'Mike', 'Last Name': 'Smith', 'School': 'Law'},\n", 245 | " {'First Name': 'Sally', 'Last Name': 'Brooks', 'School': 'Engineering'}])\n", 246 | "staff_df\n", 247 | "student_df\n", 248 | "pd.merge(staff_df, student_df, how='inner', left_on=['First Name','Last Name'], right_on=['First Name','Last Name'])" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "# Idiomatic Pandas: Making Code Pandorable" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": { 262 | "collapsed": false 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "import pandas as pd\n", 267 | "df = pd.read_csv('census.csv')\n", 268 | "df" 269 | ] 270 | }, 271 | { 
272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "(df.where(df['SUMLEV']==50)\n", 280 | " .dropna()\n", 281 | " .set_index(['STNAME','CTYNAME'])\n", 282 | " .rename(columns={'ESTIMATESBASE2010': 'Estimates Base 2010'}))" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [], 292 | "source": [ 293 | "df = df[df['SUMLEV']==50]\n", 294 | "df.set_index(['STNAME','CTYNAME'], inplace=True)\n", 295 | "df.rename(columns={'ESTIMATESBASE2010': 'Estimates Base 2010'})" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "collapsed": false 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "import numpy as np\n", 307 | "def min_max(row):\n", 308 | " data = row[['POPESTIMATE2010',\n", 309 | " 'POPESTIMATE2011',\n", 310 | " 'POPESTIMATE2012',\n", 311 | " 'POPESTIMATE2013',\n", 312 | " 'POPESTIMATE2014',\n", 313 | " 'POPESTIMATE2015']]\n", 314 | " return pd.Series({'min': np.min(data), 'max': np.max(data)})" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": { 321 | "collapsed": false 322 | }, 323 | "outputs": [], 324 | "source": [ 325 | "df.apply(min_max, axis=1)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": { 332 | "collapsed": false 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "import numpy as np\n", 337 | "def min_max(row):\n", 338 | " data = row[['POPESTIMATE2010',\n", 339 | " 'POPESTIMATE2011',\n", 340 | " 'POPESTIMATE2012',\n", 341 | " 'POPESTIMATE2013',\n", 342 | " 'POPESTIMATE2014',\n", 343 | " 'POPESTIMATE2015']]\n", 344 | " row['max'] = np.max(data)\n", 345 | " row['min'] = np.min(data)\n", 346 | " return row\n", 347 | "df.apply(min_max, axis=1)" 348 | ] 349 | }, 350 | { 351 | "cell_type": 
"code", 352 | "execution_count": null, 353 | "metadata": { 354 | "collapsed": false 355 | }, 356 | "outputs": [], 357 | "source": [ 358 | "rows = ['POPESTIMATE2010',\n", 359 | " 'POPESTIMATE2011',\n", 360 | " 'POPESTIMATE2012',\n", 361 | " 'POPESTIMATE2013',\n", 362 | " 'POPESTIMATE2014',\n", 363 | " 'POPESTIMATE2015']\n", 364 | "df.apply(lambda x: np.max(x[rows]), axis=1)" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "# Group by" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": { 378 | "collapsed": false 379 | }, 380 | "outputs": [], 381 | "source": [ 382 | "import pandas as pd\n", 383 | "import numpy as np\n", 384 | "df = pd.read_csv('census.csv')\n", 385 | "df = df[df['SUMLEV']==50]\n", 386 | "df" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": { 393 | "collapsed": false 394 | }, 395 | "outputs": [], 396 | "source": [ 397 | "%%timeit -n 10\n", 398 | "for state in df['STNAME'].unique():\n", 399 | " avg = np.average(df.where(df['STNAME']==state).dropna()['CENSUS2010POP'])\n", 400 | " print('Counties in state ' + state + ' have an average population of ' + str(avg))" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": { 407 | "collapsed": false, 408 | "scrolled": true 409 | }, 410 | "outputs": [], 411 | "source": [ 412 | "%%timeit -n 10\n", 413 | "for group, frame in df.groupby('STNAME'):\n", 414 | " avg = np.average(frame['CENSUS2010POP'])\n", 415 | " print('Counties in state ' + group + ' have an average population of ' + str(avg))" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": { 422 | "collapsed": false 423 | }, 424 | "outputs": [], 425 | "source": [ 426 | "df.head()" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": { 433 | "collapsed": false 434 | 
}, 435 | "outputs": [], 436 | "source": [ 437 | "df = df.set_index('STNAME')\n", 438 | "\n", 439 | "def fun(item):\n", 440 | " if item[0]<'M':\n", 441 | " return 0\n", 442 | " if item[0]<'Q':\n", 443 | " return 1\n", 444 | " return 2\n", 445 | "\n", 446 | "for group, frame in df.groupby(fun):\n", 447 | " print('There are ' + str(len(frame)) + ' records in group ' + str(group) + ' for processing.')\n" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": { 454 | "collapsed": false 455 | }, 456 | "outputs": [], 457 | "source": [ 458 | "df = pd.read_csv('census.csv')\n", 459 | "df = df[df['SUMLEV']==50]" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": { 466 | "collapsed": false 467 | }, 468 | "outputs": [], 469 | "source": [ 470 | "df.groupby('STNAME').agg({'CENSUS2010POP': np.average})" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": { 477 | "collapsed": false 478 | }, 479 | "outputs": [], 480 | "source": [ 481 | "print(type(df.groupby(level=0)['POPESTIMATE2010','POPESTIMATE2011']))\n", 482 | "print(type(df.groupby(level=0)['POPESTIMATE2010']))" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "metadata": { 489 | "collapsed": false 490 | }, 491 | "outputs": [], 492 | "source": [ 493 | "(df.set_index('STNAME').groupby(level=0)['CENSUS2010POP']\n", 494 | " .agg({'avg': np.average, 'sum': np.sum}))" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "metadata": { 501 | "collapsed": false 502 | }, 503 | "outputs": [], 504 | "source": [ 505 | "(df.set_index('STNAME').groupby(level=0)['POPESTIMATE2010','POPESTIMATE2011']\n", 506 | " .agg({'avg': np.average, 'sum': np.sum}))" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": null, 512 | "metadata": { 513 | "collapsed": false 514 | }, 515 | "outputs": [], 516 | "source": 
[ 517 | "(df.set_index('STNAME').groupby(level=0)['POPESTIMATE2010','POPESTIMATE2011']\n", 518 | " .agg({'POPESTIMATE2010': np.average, 'POPESTIMATE2011': np.sum}))" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "# Scales" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "metadata": { 532 | "collapsed": false 533 | }, 534 | "outputs": [], 535 | "source": [ 536 | "df = pd.DataFrame(['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D'],\n", 537 | " index=['excellent', 'excellent', 'excellent', 'good', 'good', 'good', 'ok', 'ok', 'ok', 'poor', 'poor'])\n", 538 | "df.rename(columns={0: 'Grades'}, inplace=True)\n", 539 | "df" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": null, 545 | "metadata": { 546 | "collapsed": false 547 | }, 548 | "outputs": [], 549 | "source": [ 550 | "df['Grades'].astype('category').head()" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": { 557 | "collapsed": false 558 | }, 559 | "outputs": [], 560 | "source": [ 561 | "grades = df['Grades'].astype('category',\n", 562 | " categories=['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+'],\n", 563 | " ordered=True)\n", 564 | "grades.head()" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": { 571 | "collapsed": false 572 | }, 573 | "outputs": [], 574 | "source": [ 575 | "grades > 'C'" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": null, 581 | "metadata": { 582 | "collapsed": false 583 | }, 584 | "outputs": [], 585 | "source": [ 586 | "df = pd.read_csv('census.csv')\n", 587 | "df = df[df['SUMLEV']==50]\n", 588 | "df = df.set_index('STNAME').groupby(level=0)['CENSUS2010POP'].agg({'avg': np.average})\n", 589 | "pd.cut(df['avg'],10)" 590 | ] 591 | }, 592 | { 593 | "cell_type": "markdown", 594 | "metadata": {}, 595 | "source": [ 596 
| "# Pivot Tables" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": null, 602 | "metadata": { 603 | "collapsed": true 604 | }, 605 | "outputs": [], 606 | "source": [ 607 | "#http://open.canada.ca/data/en/dataset/98f1a129-f628-4ce4-b24d-6f16bf24dd64\n", 608 | "df = pd.read_csv('cars.csv')" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": null, 614 | "metadata": { 615 | "collapsed": false 616 | }, 617 | "outputs": [], 618 | "source": [ 619 | "df.head()" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "metadata": { 626 | "collapsed": false 627 | }, 628 | "outputs": [], 629 | "source": [ 630 | "df.pivot_table(values='(kW)', index='YEAR', columns='Make', aggfunc=np.mean)" 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": null, 636 | "metadata": { 637 | "collapsed": false 638 | }, 639 | "outputs": [], 640 | "source": [ 641 | "df.pivot_table(values='(kW)', index='YEAR', columns='Make', aggfunc=[np.mean,np.min], margins=True)" 642 | ] 643 | }, 644 | { 645 | "cell_type": "markdown", 646 | "metadata": {}, 647 | "source": [ 648 | "# Date Functionality in Pandas" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 2, 654 | "metadata": { 655 | "collapsed": true 656 | }, 657 | "outputs": [], 658 | "source": [ 659 | "import pandas as pd\n", 660 | "import numpy as np" 661 | ] 662 | }, 663 | { 664 | "cell_type": "markdown", 665 | "metadata": {}, 666 | "source": [ 667 | "### Timestamp" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 3, 673 | "metadata": { 674 | "collapsed": false 675 | }, 676 | "outputs": [ 677 | { 678 | "data": { 679 | "text/plain": [ 680 | "Timestamp('2016-09-01 10:05:00')" 681 | ] 682 | }, 683 | "execution_count": 3, 684 | "metadata": {}, 685 | "output_type": "execute_result" 686 | } 687 | ], 688 | "source": [ 689 | "pd.Timestamp('9/1/2016 10:05AM')" 690 | ] 691 | }, 692 | { 693 | 
"cell_type": "markdown", 694 | "metadata": {}, 695 | "source": [ 696 | "### Period" 697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": 4, 702 | "metadata": { 703 | "collapsed": false 704 | }, 705 | "outputs": [ 706 | { 707 | "data": { 708 | "text/plain": [ 709 | "Period('2016-01', 'M')" 710 | ] 711 | }, 712 | "execution_count": 4, 713 | "metadata": {}, 714 | "output_type": "execute_result" 715 | } 716 | ], 717 | "source": [ 718 | "pd.Period('1/2016')" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 5, 724 | "metadata": { 725 | "collapsed": false 726 | }, 727 | "outputs": [ 728 | { 729 | "data": { 730 | "text/plain": [ 731 | "Period('2016-03-05', 'D')" 732 | ] 733 | }, 734 | "execution_count": 5, 735 | "metadata": {}, 736 | "output_type": "execute_result" 737 | } 738 | ], 739 | "source": [ 740 | "pd.Period('3/5/2016')" 741 | ] 742 | }, 743 | { 744 | "cell_type": "markdown", 745 | "metadata": {}, 746 | "source": [ 747 | "### DatetimeIndex" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 6, 753 | "metadata": { 754 | "collapsed": false 755 | }, 756 | "outputs": [ 757 | { 758 | "data": { 759 | "text/plain": [ 760 | "2016-09-01 a\n", 761 | "2016-09-02 b\n", 762 | "2016-09-03 c\n", 763 | "dtype: object" 764 | ] 765 | }, 766 | "execution_count": 6, 767 | "metadata": {}, 768 | "output_type": "execute_result" 769 | } 770 | ], 771 | "source": [ 772 | "t1 = pd.Series(list('abc'), [pd.Timestamp('2016-09-01'), pd.Timestamp('2016-09-02'), pd.Timestamp('2016-09-03')])\n", 773 | "t1" 774 | ] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": 7, 779 | "metadata": { 780 | "collapsed": false 781 | }, 782 | "outputs": [ 783 | { 784 | "data": { 785 | "text/plain": [ 786 | "pandas.tseries.index.DatetimeIndex" 787 | ] 788 | }, 789 | "execution_count": 7, 790 | "metadata": {}, 791 | "output_type": "execute_result" 792 | } 793 | ], 794 | "source": [ 795 | "type(t1.index)" 796 | ] 797 | }, 
798 | { 799 | "cell_type": "markdown", 800 | "metadata": {}, 801 | "source": [ 802 | "### PeriodIndex" 803 | ] 804 | }, 805 | { 806 | "cell_type": "code", 807 | "execution_count": 8, 808 | "metadata": { 809 | "collapsed": false 810 | }, 811 | "outputs": [ 812 | { 813 | "data": { 814 | "text/plain": [ 815 | "2016-09 d\n", 816 | "2016-10 e\n", 817 | "2016-11 f\n", 818 | "Freq: M, dtype: object" 819 | ] 820 | }, 821 | "execution_count": 8, 822 | "metadata": {}, 823 | "output_type": "execute_result" 824 | } 825 | ], 826 | "source": [ 827 | "t2 = pd.Series(list('def'), [pd.Period('2016-09'), pd.Period('2016-10'), pd.Period('2016-11')])\n", 828 | "t2" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": 9, 834 | "metadata": { 835 | "collapsed": false 836 | }, 837 | "outputs": [ 838 | { 839 | "data": { 840 | "text/plain": [ 841 | "pandas.tseries.period.PeriodIndex" 842 | ] 843 | }, 844 | "execution_count": 9, 845 | "metadata": {}, 846 | "output_type": "execute_result" 847 | } 848 | ], 849 | "source": [ 850 | "type(t2.index)" 851 | ] 852 | }, 853 | { 854 | "cell_type": "markdown", 855 | "metadata": {}, 856 | "source": [ 857 | "### Converting to Datetime" 858 | ] 859 | }, 860 | { 861 | "cell_type": "code", 862 | "execution_count": 10, 863 | "metadata": { 864 | "collapsed": false 865 | }, 866 | "outputs": [ 867 | { 868 | "data": { 869 | "text/html": [ 870 | "
\n", 871 | "\n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | "
ab
2 June 20131646
Aug 29, 20141466
2015-06-265999
7/12/162717
\n", 902 | "
" 903 | ], 904 | "text/plain": [ 905 | " a b\n", 906 | "2 June 2013 16 46\n", 907 | "Aug 29, 2014 14 66\n", 908 | "2015-06-26 59 99\n", 909 | "7/12/16 27 17" 910 | ] 911 | }, 912 | "execution_count": 10, 913 | "metadata": {}, 914 | "output_type": "execute_result" 915 | } 916 | ], 917 | "source": [ 918 | "d1 = ['2 June 2013', 'Aug 29, 2014', '2015-06-26', '7/12/16']\n", 919 | "ts3 = pd.DataFrame(np.random.randint(10, 100, (4,2)), index=d1, columns=list('ab'))\n", 920 | "ts3" 921 | ] 922 | }, 923 | { 924 | "cell_type": "code", 925 | "execution_count": 11, 926 | "metadata": { 927 | "collapsed": false 928 | }, 929 | "outputs": [ 930 | { 931 | "data": { 932 | "text/html": [ 933 | "
\n", 934 | "\n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | "
ab
2013-06-021646
2014-08-291466
2015-06-265999
2016-07-122717
\n", 965 | "
" 966 | ], 967 | "text/plain": [ 968 | " a b\n", 969 | "2013-06-02 16 46\n", 970 | "2014-08-29 14 66\n", 971 | "2015-06-26 59 99\n", 972 | "2016-07-12 27 17" 973 | ] 974 | }, 975 | "execution_count": 11, 976 | "metadata": {}, 977 | "output_type": "execute_result" 978 | } 979 | ], 980 | "source": [ 981 | "ts3.index = pd.to_datetime(ts3.index)\n", 982 | "ts3" 983 | ] 984 | }, 985 | { 986 | "cell_type": "code", 987 | "execution_count": 12, 988 | "metadata": { 989 | "collapsed": false 990 | }, 991 | "outputs": [ 992 | { 993 | "data": { 994 | "text/plain": [ 995 | "Timestamp('2012-07-04 00:00:00')" 996 | ] 997 | }, 998 | "execution_count": 12, 999 | "metadata": {}, 1000 | "output_type": "execute_result" 1001 | } 1002 | ], 1003 | "source": [ 1004 | "pd.to_datetime('4.7.12', dayfirst=True)" 1005 | ] 1006 | }, 1007 | { 1008 | "cell_type": "markdown", 1009 | "metadata": {}, 1010 | "source": [ 1011 | "### Timedeltas" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": 13, 1017 | "metadata": { 1018 | "collapsed": false 1019 | }, 1020 | "outputs": [ 1021 | { 1022 | "data": { 1023 | "text/plain": [ 1024 | "Timedelta('2 days 00:00:00')" 1025 | ] 1026 | }, 1027 | "execution_count": 13, 1028 | "metadata": {}, 1029 | "output_type": "execute_result" 1030 | } 1031 | ], 1032 | "source": [ 1033 | "pd.Timestamp('9/3/2016')-pd.Timestamp('9/1/2016')" 1034 | ] 1035 | }, 1036 | { 1037 | "cell_type": "code", 1038 | "execution_count": 14, 1039 | "metadata": { 1040 | "collapsed": false 1041 | }, 1042 | "outputs": [ 1043 | { 1044 | "data": { 1045 | "text/plain": [ 1046 | "Timestamp('2016-09-14 11:10:00')" 1047 | ] 1048 | }, 1049 | "execution_count": 14, 1050 | "metadata": {}, 1051 | "output_type": "execute_result" 1052 | } 1053 | ], 1054 | "source": [ 1055 | "pd.Timestamp('9/2/2016 8:10AM') + pd.Timedelta('12D 3H')" 1056 | ] 1057 | }, 1058 | { 1059 | "cell_type": "markdown", 1060 | "metadata": {}, 1061 | "source": [ 1062 | "### Working with Dates in a Dataframe" 
1063 | ] 1064 | }, 1065 | { 1066 | "cell_type": "code", 1067 | "execution_count": 15, 1068 | "metadata": { 1069 | "collapsed": false 1070 | }, 1071 | "outputs": [ 1072 | { 1073 | "data": { 1074 | "text/plain": [ 1075 | "DatetimeIndex(['2016-10-02', '2016-10-16', '2016-10-30', '2016-11-13',\n", 1076 | " '2016-11-27', '2016-12-11', '2016-12-25', '2017-01-08',\n", 1077 | " '2017-01-22'],\n", 1078 | " dtype='datetime64[ns]', freq='2W-SUN')" 1079 | ] 1080 | }, 1081 | "execution_count": 15, 1082 | "metadata": {}, 1083 | "output_type": "execute_result" 1084 | } 1085 | ], 1086 | "source": [ 1087 | "dates = pd.date_range('10-01-2016', periods=9, freq='2W-SUN')\n", 1088 | "dates" 1089 | ] 1090 | }, 1091 | { 1092 | "cell_type": "code", 1093 | "execution_count": 16, 1094 | "metadata": { 1095 | "collapsed": false 1096 | }, 1097 | "outputs": [ 1098 | { 1099 | "data": { 1100 | "text/html": [ 1101 | "
\n", 1102 | "\n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | "
Count 1Count 2
2016-10-02104125
2016-10-16109122
2016-10-30111127
2016-11-13117126
2016-11-27114126
2016-12-11109121
2016-12-25105126
2017-01-08105125
2017-01-22101123
\n", 1158 | "
" 1159 | ], 1160 | "text/plain": [ 1161 | " Count 1 Count 2\n", 1162 | "2016-10-02 104 125\n", 1163 | "2016-10-16 109 122\n", 1164 | "2016-10-30 111 127\n", 1165 | "2016-11-13 117 126\n", 1166 | "2016-11-27 114 126\n", 1167 | "2016-12-11 109 121\n", 1168 | "2016-12-25 105 126\n", 1169 | "2017-01-08 105 125\n", 1170 | "2017-01-22 101 123" 1171 | ] 1172 | }, 1173 | "execution_count": 16, 1174 | "metadata": {}, 1175 | "output_type": "execute_result" 1176 | } 1177 | ], 1178 | "source": [ 1179 | "df = pd.DataFrame({'Count 1': 100 + np.random.randint(-5, 10, 9).cumsum(),\n", 1180 | " 'Count 2': 120 + np.random.randint(-5, 10, 9)}, index=dates)\n", 1181 | "df" 1182 | ] 1183 | }, 1184 | { 1185 | "cell_type": "code", 1186 | "execution_count": 17, 1187 | "metadata": { 1188 | "collapsed": false 1189 | }, 1190 | "outputs": [ 1191 | { 1192 | "data": { 1193 | "text/plain": [ 1194 | "array(['Sunday', 'Sunday', 'Sunday', 'Sunday', 'Sunday', 'Sunday',\n", 1195 | " 'Sunday', 'Sunday', 'Sunday'], dtype=object)" 1196 | ] 1197 | }, 1198 | "execution_count": 17, 1199 | "metadata": {}, 1200 | "output_type": "execute_result" 1201 | } 1202 | ], 1203 | "source": [ 1204 | "df.index.weekday_name" 1205 | ] 1206 | }, 1207 | { 1208 | "cell_type": "code", 1209 | "execution_count": 18, 1210 | "metadata": { 1211 | "collapsed": false 1212 | }, 1213 | "outputs": [ 1214 | { 1215 | "data": { 1216 | "text/html": [ 1217 | "
\n", 1218 | "\n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | "
Count 1Count 2
2016-10-02NaNNaN
2016-10-165.0-3.0
2016-10-302.05.0
2016-11-136.0-1.0
2016-11-27-3.00.0
2016-12-11-5.0-5.0
2016-12-25-4.05.0
2017-01-080.0-1.0
2017-01-22-4.0-2.0
\n", 1274 | "
" 1275 | ], 1276 | "text/plain": [ 1277 | " Count 1 Count 2\n", 1278 | "2016-10-02 NaN NaN\n", 1279 | "2016-10-16 5.0 -3.0\n", 1280 | "2016-10-30 2.0 5.0\n", 1281 | "2016-11-13 6.0 -1.0\n", 1282 | "2016-11-27 -3.0 0.0\n", 1283 | "2016-12-11 -5.0 -5.0\n", 1284 | "2016-12-25 -4.0 5.0\n", 1285 | "2017-01-08 0.0 -1.0\n", 1286 | "2017-01-22 -4.0 -2.0" 1287 | ] 1288 | }, 1289 | "execution_count": 18, 1290 | "metadata": {}, 1291 | "output_type": "execute_result" 1292 | } 1293 | ], 1294 | "source": [ 1295 | "df.diff()" 1296 | ] 1297 | }, 1298 | { 1299 | "cell_type": "code", 1300 | "execution_count": 19, 1301 | "metadata": { 1302 | "collapsed": false 1303 | }, 1304 | "outputs": [ 1305 | { 1306 | "data": { 1307 | "text/html": [ 1308 | "
\n", 1309 | "\n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | "
Count 1Count 2
2016-10-31108.0124.666667
2016-11-30115.5126.000000
2016-12-31107.0123.500000
2017-01-31103.0124.000000
\n", 1340 | "
" 1341 | ], 1342 | "text/plain": [ 1343 | " Count 1 Count 2\n", 1344 | "2016-10-31 108.0 124.666667\n", 1345 | "2016-11-30 115.5 126.000000\n", 1346 | "2016-12-31 107.0 123.500000\n", 1347 | "2017-01-31 103.0 124.000000" 1348 | ] 1349 | }, 1350 | "execution_count": 19, 1351 | "metadata": {}, 1352 | "output_type": "execute_result" 1353 | } 1354 | ], 1355 | "source": [ 1356 | "df.resample('M').mean()" 1357 | ] 1358 | }, 1359 | { 1360 | "cell_type": "code", 1361 | "execution_count": 20, 1362 | "metadata": { 1363 | "collapsed": false 1364 | }, 1365 | "outputs": [ 1366 | { 1367 | "data": { 1368 | "text/html": [ 1369 | "
\n", 1370 | "\n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | "
Count 1Count 2
2017-01-08105125
2017-01-22101123
\n", 1391 | "
" 1392 | ], 1393 | "text/plain": [ 1394 | " Count 1 Count 2\n", 1395 | "2017-01-08 105 125\n", 1396 | "2017-01-22 101 123" 1397 | ] 1398 | }, 1399 | "execution_count": 20, 1400 | "metadata": {}, 1401 | "output_type": "execute_result" 1402 | } 1403 | ], 1404 | "source": [ 1405 | "df['2017']" 1406 | ] 1407 | }, 1408 | { 1409 | "cell_type": "code", 1410 | "execution_count": 21, 1411 | "metadata": { 1412 | "collapsed": false 1413 | }, 1414 | "outputs": [ 1415 | { 1416 | "data": { 1417 | "text/html": [ 1418 | "
\n", 1419 | "\n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | "
Count 1Count 2
2016-12-11109121
2016-12-25105126
\n", 1440 | "
" 1441 | ], 1442 | "text/plain": [ 1443 | " Count 1 Count 2\n", 1444 | "2016-12-11 109 121\n", 1445 | "2016-12-25 105 126" 1446 | ] 1447 | }, 1448 | "execution_count": 21, 1449 | "metadata": {}, 1450 | "output_type": "execute_result" 1451 | } 1452 | ], 1453 | "source": [ 1454 | "df['2016-12']" 1455 | ] 1456 | }, 1457 | { 1458 | "cell_type": "code", 1459 | "execution_count": 22, 1460 | "metadata": { 1461 | "collapsed": false 1462 | }, 1463 | "outputs": [ 1464 | { 1465 | "data": { 1466 | "text/html": [ 1467 | "
\n", 1468 | "\n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | "
Count 1Count 2
2016-12-11109121
2016-12-25105126
2017-01-08105125
2017-01-22101123
\n", 1499 | "
" 1500 | ], 1501 | "text/plain": [ 1502 | " Count 1 Count 2\n", 1503 | "2016-12-11 109 121\n", 1504 | "2016-12-25 105 126\n", 1505 | "2017-01-08 105 125\n", 1506 | "2017-01-22 101 123" 1507 | ] 1508 | }, 1509 | "execution_count": 22, 1510 | "metadata": {}, 1511 | "output_type": "execute_result" 1512 | } 1513 | ], 1514 | "source": [ 1515 | "df['2016-12':]" 1516 | ] 1517 | }, 1518 | { 1519 | "cell_type": "code", 1520 | "execution_count": null, 1521 | "metadata": { 1522 | "collapsed": false 1523 | }, 1524 | "outputs": [], 1525 | "source": [ 1526 | "df.asfreq('W', method='ffill')" 1527 | ] 1528 | }, 1529 | { 1530 | "cell_type": "code", 1531 | "execution_count": null, 1532 | "metadata": { 1533 | "collapsed": false 1534 | }, 1535 | "outputs": [], 1536 | "source": [ 1537 | "import matplotlib.pyplot as plt\n", 1538 | "%matplotlib inline\n", 1539 | "\n", 1540 | "df.plot()" 1541 | ] 1542 | } 1543 | ], 1544 | "metadata": { 1545 | "celltoolbar": "Raw Cell Format", 1546 | "kernelspec": { 1547 | "display_name": "Python 3", 1548 | "language": "python", 1549 | "name": "python3" 1550 | }, 1551 | "language_info": { 1552 | "codemirror_mode": { 1553 | "name": "ipython", 1554 | "version": 3 1555 | }, 1556 | "file_extension": ".py", 1557 | "mimetype": "text/x-python", 1558 | "name": "python", 1559 | "nbconvert_exporter": "python", 1560 | "pygments_lexer": "ipython3", 1561 | "version": "3.5.2" 1562 | } 1563 | }, 1564 | "nbformat": 4, 1565 | "nbformat_minor": 0 1566 | } 1567 | -------------------------------------------------------------------------------- /Week3/Assignment - Week 3/Assignment+3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "---\n", 8 | "\n", 9 | "_You are currently looking at **version 1.4** of this notebook. 
To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-data-analysis/resources/0dhYG) course resource._\n", 10 | "\n", 11 | "---" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Assignment 3 - More Pandas\n", 19 | "All questions are weighted the same in this assignment. This assignment requires more individual learning then the last one did - you are encouraged to check out the [pandas documentation](http://pandas.pydata.org/pandas-docs/stable/) to find functions or methods you might not have used yet, or ask questions on [Stack Overflow](http://stackoverflow.com/) and tag them as pandas and python related. And of course, the discussion forums are open for interaction with your peers and the course staff." 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Question 1 (20%)\n", 27 | "Load the energy data from the file `Energy Indicators.xls`, which is a list of indicators of [energy supply and renewable electricity production](Energy%20Indicators.xls) from the [United Nations](http://unstats.un.org/unsd/environment/excel_file_tables/2013/Energy%20Indicators.xls) for the year 2013, and should be put into a DataFrame with the variable name of **energy**.\n", 28 | "\n", 29 | "Keep in mind that this is an Excel file, and not a comma separated values file. Also, make sure to exclude the footer and header information from the datafile. The first two columns are unneccessary, so you should get rid of them, and you should change the column labels so that the columns are:\n", 30 | "\n", 31 | "`['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable]`\n", 32 | "\n", 33 | "Convert `Energy Supply` to gigajoules (there are 1,000,000 gigajoules in a petajoule). For all countries which have missing data (e.g. 
data with \"...\") make sure this is reflected as `np.NaN` values.\n", 34 | "\n", 35 | "Rename the following list of countries (for use in later questions):\n", 36 | "\n", 37 | "```\"Republic of Korea\": \"South Korea\",\n", 38 | "\"United States of America\": \"United States\",\n", 39 | "\"United Kingdom of Great Britain and Northern Ireland\": \"United Kingdom\",\n", 40 | "\"China, Hong Kong Special Administrative Region\": \"Hong Kong\"```\n", 41 | "\n", 42 | "There are also several countries with numbers and/or parenthesis in their name. Be sure to remove these, e.g. `'Bolivia (Plurinational State of)'` should be `'Bolivia'`.\n", 43 | "\n", 44 | "
\n", 45 | "\n", 46 | "Next, load the GDP data from the file `world_bank.csv`, which is a csv containing countries' GDP from 1960 to 2015 from [World Bank](http://data.worldbank.org/indicator/NY.GDP.MKTP.CD). Call this DataFrame **GDP**. \n", 47 | "\n", 48 | "Make sure to skip the header, and rename the following list of countries:\n", 49 | "\n", 50 | "```\"Korea, Rep.\": \"South Korea\", \n", 51 | "\"Iran, Islamic Rep.\": \"Iran\",\n", 52 | "\"Hong Kong SAR, China\": \"Hong Kong\"```\n", 53 | "\n", 54 | "
\n", 55 | "\n", 56 | "Finally, load the [Sciamgo Journal and Country Rank data for Energy Engineering and Power Technology](http://www.scimagojr.com/countryrank.php?category=2102) from the file `scimagojr-3.xlsx`, which ranks countries based on their journal contributions in the aforementioned area. Call this DataFrame **ScimEn**.\n", 57 | "\n", 58 | "Join the three datasets: GDP, Energy, and ScimEn into a new dataset (using the intersection of country names). Use only the last 10 years (2006-2015) of GDP data and only the top 15 countries by Scimagojr 'Rank' (Rank 1 through 15). \n", 59 | "\n", 60 | "The index of this DataFrame should be the name of the country, and the columns should be ['Rank', 'Documents', 'Citable documents', 'Citations', 'Self-citations',\n", 61 | " 'Citations per document', 'H index', 'Energy Supply',\n", 62 | " 'Energy Supply per Capita', '% Renewable', '2006', '2007', '2008',\n", 63 | " '2009', '2010', '2011', '2012', '2013', '2014', '2015'].\n", 64 | "\n", 65 | "*This function should return a DataFrame with 20 columns and 15 entries.*" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 2, 71 | "metadata": { 72 | "collapsed": false, 73 | "scrolled": true 74 | }, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/html": [ 79 | "
\n", 80 | "\n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 
235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " 
\n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | "
RankDocumentsCitable documentsCitationsSelf-citationsCitations per documentH indexEnergy SupplyEnergy Supply per Capita% Renewable2006200720082009201020112012201320142015
Country
China11270501267675972374116834.701381271910000009319.75493.992331e+124.559041e+124.997775e+125.459247e+126.039659e+126.612490e+127.124978e+127.672448e+128.230121e+128.797999e+12
United States296661947477922742654368.202309083800000028611.5711.479230e+131.505540e+131.501149e+131.459484e+131.496437e+131.520402e+131.554216e+131.577367e+131.615662e+131.654857e+13
Japan33050430287223024615547.311341898400000014910.23285.496542e+125.617036e+125.558527e+125.251308e+125.498718e+125.473738e+125.569102e+125.644659e+125.642884e+125.669563e+12
United Kingdom42094420357206091378749.84139792000000012410.60052.419631e+122.482203e+122.470614e+122.367048e+122.403504e+122.450911e+122.479809e+122.533370e+122.605643e+122.666333e+12
Russian Federation5185341830134266124221.85573070900000021417.28871.385793e+121.504071e+121.583004e+121.459199e+121.524917e+121.589943e+121.645876e+121.666934e+121.678709e+121.616149e+12
Canada617899176202150034093012.011491043100000029661.94541.564469e+121.596740e+121.612713e+121.565145e+121.613406e+121.664087e+121.693133e+121.730688e+121.773486e+121.792609e+12
Germany71702716831140566274268.261261326100000016517.90153.332891e+123.441561e+123.478809e+123.283340e+123.417298e+123.542371e+123.556724e+123.567317e+123.624386e+123.685556e+12
India81500514841128763372098.58115331950000002614.96911.265894e+121.374865e+121.428361e+121.549483e+121.708459e+121.821872e+121.924235e+122.051982e+122.200617e+122.367206e+12
France91315312973130632286019.931141059700000016617.02032.607840e+122.669424e+122.674637e+122.595967e+122.646995e+122.702032e+122.706968e+122.722567e+122.729632e+122.761185e+12
South Korea101198311923114675225959.57104110070000002212.279359.410199e+119.924316e+111.020510e+121.027730e+121.094499e+121.134796e+121.160809e+121.194429e+121.234340e+121.266580e+12
Italy1110964107941118502666110.20106653000000010933.66722.202170e+122.234627e+122.211154e+122.089938e+122.125185e+122.137439e+122.077184e+122.040871e+122.033868e+122.049316e+12
Spain12942893301233362396413.08115492300000010637.96861.414823e+121.468146e+121.484530e+121.431475e+121.431673e+121.417355e+121.380216e+121.357139e+121.375605e+121.419821e+12
Iran138896881957470191256.467291720000001195.707723.895523e+114.250646e+114.289909e+114.389208e+114.677902e+114.853309e+114.532569e+114.445926e+114.639027e+11NaN
Australia1488318725907651560610.28107538600000023111.81081.021939e+121.060340e+121.099644e+121.119654e+121.142251e+121.169431e+121.211913e+121.241484e+121.272520e+121.301251e+12
Brazil158668859660702143967.0086121490000005969.6481.845080e+121.957118e+122.056809e+122.054215e+122.208872e+122.295245e+122.339209e+122.409740e+122.412231e+122.319423e+12
\n", 477 | "
" 478 | ], 479 | "text/plain": [ 480 | " Rank Documents Citable documents Citations \\\n", 481 | "Country \n", 482 | "China 1 127050 126767 597237 \n", 483 | "United States 2 96661 94747 792274 \n", 484 | "Japan 3 30504 30287 223024 \n", 485 | "United Kingdom 4 20944 20357 206091 \n", 486 | "Russian Federation 5 18534 18301 34266 \n", 487 | "Canada 6 17899 17620 215003 \n", 488 | "Germany 7 17027 16831 140566 \n", 489 | "India 8 15005 14841 128763 \n", 490 | "France 9 13153 12973 130632 \n", 491 | "South Korea 10 11983 11923 114675 \n", 492 | "Italy 11 10964 10794 111850 \n", 493 | "Spain 12 9428 9330 123336 \n", 494 | "Iran 13 8896 8819 57470 \n", 495 | "Australia 14 8831 8725 90765 \n", 496 | "Brazil 15 8668 8596 60702 \n", 497 | "\n", 498 | " Self-citations Citations per document H index \\\n", 499 | "Country \n", 500 | "China 411683 4.70 138 \n", 501 | "United States 265436 8.20 230 \n", 502 | "Japan 61554 7.31 134 \n", 503 | "United Kingdom 37874 9.84 139 \n", 504 | "Russian Federation 12422 1.85 57 \n", 505 | "Canada 40930 12.01 149 \n", 506 | "Germany 27426 8.26 126 \n", 507 | "India 37209 8.58 115 \n", 508 | "France 28601 9.93 114 \n", 509 | "South Korea 22595 9.57 104 \n", 510 | "Italy 26661 10.20 106 \n", 511 | "Spain 23964 13.08 115 \n", 512 | "Iran 19125 6.46 72 \n", 513 | "Australia 15606 10.28 107 \n", 514 | "Brazil 14396 7.00 86 \n", 515 | "\n", 516 | " Energy Supply Energy Supply per Capita % Renewable \\\n", 517 | "Country \n", 518 | "China 127191000000 93 19.7549 \n", 519 | "United States 90838000000 286 11.571 \n", 520 | "Japan 18984000000 149 10.2328 \n", 521 | "United Kingdom 7920000000 124 10.6005 \n", 522 | "Russian Federation 30709000000 214 17.2887 \n", 523 | "Canada 10431000000 296 61.9454 \n", 524 | "Germany 13261000000 165 17.9015 \n", 525 | "India 33195000000 26 14.9691 \n", 526 | "France 10597000000 166 17.0203 \n", 527 | "South Korea 11007000000 221 2.27935 \n", 528 | "Italy 6530000000 109 33.6672 \n", 529 | "Spain 4923000000 106 
37.9686 \n", 530 | "Iran 9172000000 119 5.70772 \n", 531 | "Australia 5386000000 231 11.8108 \n", 532 | "Brazil 12149000000 59 69.648 \n", 533 | "\n", 534 | " 2006 2007 2008 2009 \\\n", 535 | "Country \n", 536 | "China 3.992331e+12 4.559041e+12 4.997775e+12 5.459247e+12 \n", 537 | "United States 1.479230e+13 1.505540e+13 1.501149e+13 1.459484e+13 \n", 538 | "Japan 5.496542e+12 5.617036e+12 5.558527e+12 5.251308e+12 \n", 539 | "United Kingdom 2.419631e+12 2.482203e+12 2.470614e+12 2.367048e+12 \n", 540 | "Russian Federation 1.385793e+12 1.504071e+12 1.583004e+12 1.459199e+12 \n", 541 | "Canada 1.564469e+12 1.596740e+12 1.612713e+12 1.565145e+12 \n", 542 | "Germany 3.332891e+12 3.441561e+12 3.478809e+12 3.283340e+12 \n", 543 | "India 1.265894e+12 1.374865e+12 1.428361e+12 1.549483e+12 \n", 544 | "France 2.607840e+12 2.669424e+12 2.674637e+12 2.595967e+12 \n", 545 | "South Korea 9.410199e+11 9.924316e+11 1.020510e+12 1.027730e+12 \n", 546 | "Italy 2.202170e+12 2.234627e+12 2.211154e+12 2.089938e+12 \n", 547 | "Spain 1.414823e+12 1.468146e+12 1.484530e+12 1.431475e+12 \n", 548 | "Iran 3.895523e+11 4.250646e+11 4.289909e+11 4.389208e+11 \n", 549 | "Australia 1.021939e+12 1.060340e+12 1.099644e+12 1.119654e+12 \n", 550 | "Brazil 1.845080e+12 1.957118e+12 2.056809e+12 2.054215e+12 \n", 551 | "\n", 552 | " 2010 2011 2012 2013 \\\n", 553 | "Country \n", 554 | "China 6.039659e+12 6.612490e+12 7.124978e+12 7.672448e+12 \n", 555 | "United States 1.496437e+13 1.520402e+13 1.554216e+13 1.577367e+13 \n", 556 | "Japan 5.498718e+12 5.473738e+12 5.569102e+12 5.644659e+12 \n", 557 | "United Kingdom 2.403504e+12 2.450911e+12 2.479809e+12 2.533370e+12 \n", 558 | "Russian Federation 1.524917e+12 1.589943e+12 1.645876e+12 1.666934e+12 \n", 559 | "Canada 1.613406e+12 1.664087e+12 1.693133e+12 1.730688e+12 \n", 560 | "Germany 3.417298e+12 3.542371e+12 3.556724e+12 3.567317e+12 \n", 561 | "India 1.708459e+12 1.821872e+12 1.924235e+12 2.051982e+12 \n", 562 | "France 2.646995e+12 2.702032e+12 
2.706968e+12 2.722567e+12 \n", 563 | "South Korea 1.094499e+12 1.134796e+12 1.160809e+12 1.194429e+12 \n", 564 | "Italy 2.125185e+12 2.137439e+12 2.077184e+12 2.040871e+12 \n", 565 | "Spain 1.431673e+12 1.417355e+12 1.380216e+12 1.357139e+12 \n", 566 | "Iran 4.677902e+11 4.853309e+11 4.532569e+11 4.445926e+11 \n", 567 | "Australia 1.142251e+12 1.169431e+12 1.211913e+12 1.241484e+12 \n", 568 | "Brazil 2.208872e+12 2.295245e+12 2.339209e+12 2.409740e+12 \n", 569 | "\n", 570 | " 2014 2015 \n", 571 | "Country \n", 572 | "China 8.230121e+12 8.797999e+12 \n", 573 | "United States 1.615662e+13 1.654857e+13 \n", 574 | "Japan 5.642884e+12 5.669563e+12 \n", 575 | "United Kingdom 2.605643e+12 2.666333e+12 \n", 576 | "Russian Federation 1.678709e+12 1.616149e+12 \n", 577 | "Canada 1.773486e+12 1.792609e+12 \n", 578 | "Germany 3.624386e+12 3.685556e+12 \n", 579 | "India 2.200617e+12 2.367206e+12 \n", 580 | "France 2.729632e+12 2.761185e+12 \n", 581 | "South Korea 1.234340e+12 1.266580e+12 \n", 582 | "Italy 2.033868e+12 2.049316e+12 \n", 583 | "Spain 1.375605e+12 1.419821e+12 \n", 584 | "Iran 4.639027e+11 NaN \n", 585 | "Australia 1.272520e+12 1.301251e+12 \n", 586 | "Brazil 2.412231e+12 2.319423e+12 " 587 | ] 588 | }, 589 | "execution_count": 2, 590 | "metadata": {}, 591 | "output_type": "execute_result" 592 | } 593 | ], 594 | "source": [ 595 | "import pandas as pd\n", 596 | "import numpy as np\n", 597 | "\n", 598 | "# Load the Excel sheet and CSV file into Panda Dataframe. 
Skip header for GDP Dataframe.\n", 599 | "xls_file = pd.ExcelFile('Energy Indicators.xls')\n", 600 | "energy = xls_file.parse('Energy')\n", 601 | "GDP = pd.read_csv('world_bank.csv', header=None)\n", 602 | "ScimEn_file = pd.ExcelFile('scimagojr-3.xlsx')\n", 603 | "ScimEn = ScimEn_file.parse('Sheet1')\n", 604 | "\n", 605 | "# Drop first 2 columns\n", 606 | "energy = energy.drop(['Unnamed: 0','Unnamed: 1'],1)\n", 607 | "\n", 608 | "# Rename the columns\n", 609 | "energy.columns = ['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable']\n", 610 | "\n", 611 | "# Convert \"...\" to np.NaN\n", 612 | "# This section is not very DRY[\"Don't Repeat Yourself\"]. Need to think about ways to apply regex to column names, so that the same function applies to columns starting with \"Energy Supply\"\n", 613 | "energy['Energy Supply'].replace(regex=True,inplace=True,to_replace=r'\\W',value=np.nan)\n", 614 | "energy['Energy Supply per Capita'].replace(regex=True,inplace=True,to_replace=r'\\W',value=np.nan)\n", 615 | "\n", 616 | "# Convert 'Energy Supply' Column from petajoule to gigajoules (there are 1,000,000 gigajoules in a petajoule)\n", 617 | "energy['Energy Supply']*=1000000\n", 618 | "\n", 619 | "# Set Country column to be the index\n", 620 | "energy.set_index('Country')\n", 621 | "\n", 622 | "# Use Regular Expression to remove numbers and parenthesis(and the content inside) in country names. \\d stands for digits. \n", 623 | "# Remember to add a whitespace before the first escape before ()....couldn't find Bolivia otherwise. 
Struggled for a long time for this!\n", 624 | "energy['Country'].replace(regex=True,inplace=True,to_replace=r'\\d',value=r'')\n", 625 | "energy['Country'].replace(regex=True,inplace=True,to_replace=r' \\(.*\\)',value=r'')\n", 626 | "\n", 627 | "# Replace country names with new names\n", 628 | "energy.Country[energy.Country == \"Republic of Korea\"]=\"South Korea\"\n", 629 | "energy.Country[energy.Country == \"United States of America\"]=\"United States\"\n", 630 | "energy.Country[energy.Country == \"China, Hong Kong Special Administrative Region\"]=\"Hong Kong\"\n", 631 | "energy.Country[energy.Country == \"United Kingdom of Great Britain and Northern Ireland\"]=\"United Kingdom\"\n", 632 | "\n", 633 | "# Test to ensure that the replacement worked. Have turned them into comments.\n", 634 | "# energy.loc[energy.Country ==\"Bolivia\"]\n", 635 | "# energy.loc[energy.Country ==\"United Kingdom\"]\n", 636 | "\n", 637 | "\n", 638 | "# ----------------------------------------------\n", 639 | "\n", 640 | "# Drop the first few rows of GDP Dataframe to clean the data\n", 641 | "GDP.drop(GDP.index[:4],axis=0, inplace=True)\n", 642 | "\n", 643 | "# Drop the other columns so that we only keep the last 10 years' data (2006-2015)\n", 644 | "GDP.drop(GDP.columns[1:50], axis=1, inplace=True)\n", 645 | "\n", 646 | "# Convert the Column heads from number to strings (2006.0 -> Year 2006)\n", 647 | "# GDP[50:]=GDP[50:].astype(str)\n", 648 | "# GDP.rename(columns = lambda x: str(x), inplace=True)\n", 649 | "GDP.columns = ['Country','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015']\n", 650 | "GDP.columns.values\n", 651 | "\n", 652 | "\n", 653 | "\n", 654 | "# Use the first row as column headers\n", 655 | "# Drop redundant row\n", 656 | "GDP.columns = GDP.iloc[0]\n", 657 | "GDP.drop(GDP.index[0],axis=0, inplace=True)\n", 658 | "GDP.rename(columns={'Country Name': 'Country'}, inplace=True)\n", 659 | "GDP.set_index('Country')\n", 660 | "\n", 661 | "# Rename the 
countries. South Korea was actually named \"Korea, Rep.\" in the original spreadsheet. They gave the wrong instruction here...\n", 662 | "GDP['Country'].replace(regex=True,inplace=True,to_replace='Korea, Rep.',value='South Korea')\n", 663 | "GDP['Country'].replace(regex=True,inplace=True,to_replace='Iran, Islamic Rep.',value='Iran')\n", 664 | "GDP['Country'].replace(regex=True,inplace=True,to_replace='Hong Kong SAR, China',value='Hong Kong')\n", 665 | "\n", 666 | "# Remove the parenthesis\n", 667 | "GDP['Country'].replace(regex=True,inplace=True,to_replace=r' \\(.*\\)',value=r'')\n", 668 | "\n", 669 | "\n", 670 | "# Test to ensure that the replacement worked. Have turned it into comment.\n", 671 | "# GDP.loc[GDP[\"Country Name\"] ==\"South Korea\"]\n", 672 | "\n", 673 | "# ----------------------------------------------\n", 674 | "\n", 675 | "# Keep only the top 15 countries by Ranking in the ScimEn Dataframe\n", 676 | "ScimEn.drop(ScimEn.index[15:],axis=0, inplace=True)\n", 677 | "\n", 678 | "# Merge the 3 dataframes: energy, GDP, ScimEn\n", 679 | "new = pd.merge(pd.merge(ScimEn,energy,on='Country'),GDP,on='Country')\n", 680 | "\n", 681 | "# Set the Country column to be the index. Remember to use inplace=True. Otherwise it kept returning a list of integers...spend an hour on this. \n", 682 | "new.set_index('Country',inplace=True)\n", 683 | "\n", 684 | "# Count the number of rows and columns in the dataframe. This is a test, have commented it. \n", 685 | "# new.shape[0] #gives number of row count\n", 686 | "# new.shape[1] #gives number of col count\n", 687 | "\n", 688 | "# Hardcoding this because I couln't figure out how to convert Numbers to Strings in the header. 
Tried astype() and to_string, both didn't work....\n", 689 | "new.columns = ['Rank', 'Documents', 'Citable documents', 'Citations', 'Self-citations', 'Citations per document', 'H index', 'Energy Supply', 'Energy Supply per Capita', '% Renewable', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']\n", 690 | "\n", 691 | "def answer_one():\n", 692 | " return new\n", 693 | "\n", 694 | "answer_one()" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "metadata": { 701 | "collapsed": true 702 | }, 703 | "outputs": [], 704 | "source": [] 705 | }, 706 | { 707 | "cell_type": "markdown", 708 | "metadata": {}, 709 | "source": [ 710 | "### Question 2 (6.6%)\n", 711 | "The previous question joined three datasets then reduced this to just the top 15 entries. When you joined the datasets, but before you reduced this to the top 15 items, how many entries did you lose?\n", 712 | "\n", 713 | "*This function should return a single number.*" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": 58, 719 | "metadata": { 720 | "collapsed": false 721 | }, 722 | "outputs": [ 723 | { 724 | "data": { 725 | "text/html": [ 726 | "\n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " Everything but this!\n", 732 | "" 733 | ], 734 | "text/plain": [ 735 | "" 736 | ] 737 | }, 738 | "metadata": {}, 739 | "output_type": "display_data" 740 | } 741 | ], 742 | "source": [ 743 | "%%HTML\n", 744 | "\n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " Everything but this!\n", 750 | "" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": null, 756 | "metadata": { 757 | "collapsed": true 758 | }, 759 | "outputs": [], 760 | "source": [] 761 | }, 762 | { 763 | "cell_type": "code", 764 | "execution_count": null, 765 | "metadata": { 766 | "collapsed": false 767 | }, 768 | "outputs": [], 769 | "source": [ 770 | "def answer_two():\n", 771 | " return \"ANSWER\"" 772 | ] 773 | }, 
774 | { 775 | "cell_type": "markdown", 776 | "metadata": {}, 777 | "source": [ 778 | "
\n", 779 | "\n", 780 | "Answer the following questions in the context of only the top 15 countries by Scimagojr Rank (aka the DataFrame returned by `answer_one()`)" 781 | ] 782 | }, 783 | { 784 | "cell_type": "markdown", 785 | "metadata": {}, 786 | "source": [ 787 | "### Question 3 (6.6%)\n", 788 | "What is the average GDP over the last 10 years for each country?\n", 789 | "\n", 790 | "*This function should return a Series named `avgGDP` with 15 countries and their average GDP sorted in descending order.*" 791 | ] 792 | }, 793 | { 794 | "cell_type": "code", 795 | "execution_count": 84, 796 | "metadata": { 797 | "collapsed": false, 798 | "scrolled": true 799 | }, 800 | "outputs": [ 801 | { 802 | "name": "stderr", 803 | "output_type": "stream", 804 | "text": [ 805 | "/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:8: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)\n" 806 | ] 807 | }, 808 | { 809 | "data": { 810 | "text/plain": [ 811 | "Country\n", 812 | "United States 1.536434e+13\n", 813 | "China 6.348609e+12\n", 814 | "Japan 5.542208e+12\n", 815 | "Germany 3.493025e+12\n", 816 | "France 2.681725e+12\n", 817 | "United Kingdom 2.487907e+12\n", 818 | "Brazil 2.189794e+12\n", 819 | "Italy 2.120175e+12\n", 820 | "India 1.769297e+12\n", 821 | "Canada 1.660647e+12\n", 822 | "Russian Federation 1.565459e+12\n", 823 | "Spain 1.418078e+12\n", 824 | "Australia 1.164043e+12\n", 825 | "South Korea 1.106715e+12\n", 826 | "Iran 4.441558e+11\n", 827 | "Name: avgGDP, dtype: float64" 828 | ] 829 | }, 830 | "execution_count": 84, 831 | "metadata": {}, 832 | "output_type": "execute_result" 833 | } 834 | ], 835 | "source": [ 836 | "# Need to create a new dataframe for each question, otherwise the autograder would think that I'm creating new columns for the DF created in question 1, and will stop working...\n", 837 | "question3=new.copy()\n", 838 | "\n", 839 | "# The 2015 GDP data for Iran is NAN, so I had to use np.mean() instead of hard code it 
as df.sum()/10.... Probably better this way.\n", 840 | "question3[\"avgGDP\"]=question3[['2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']].mean(axis=1)\n", 841 | "\n", 842 | "# Sort by descending order\n", 843 | "question3.sort('avgGDP', ascending=False,inplace=True)\n", 844 | "\n", 845 | "# Create a new data series named \"avgGDP\"\n", 846 | "avgGDP_series= question3.ix[:,'avgGDP']\n", 847 | "\n", 848 | "def answer_three():\n", 849 | " Top15 = answer_one()\n", 850 | " return avgGDP_series\n", 851 | "answer_three()" 852 | ] 853 | }, 854 | { 855 | "cell_type": "markdown", 856 | "metadata": {}, 857 | "source": [ 858 | "### Question 4 (6.6%)\n", 859 | "By how much had the GDP changed over the 10 year span for the country with the 6th largest average GDP?\n", 860 | "\n", 861 | "*This function should return a single number.*" 862 | ] 863 | }, 864 | { 865 | "cell_type": "code", 866 | "execution_count": 148, 867 | "metadata": { 868 | "collapsed": false, 869 | "scrolled": true 870 | }, 871 | "outputs": [ 872 | { 873 | "data": { 874 | "text/plain": [ 875 | "246702696075.3999" 876 | ] 877 | }, 878 | "execution_count": 148, 879 | "metadata": {}, 880 | "output_type": "execute_result" 881 | } 882 | ], 883 | "source": [ 884 | "question4 = question3.copy()\n", 885 | "question4.drop(question4.columns[0:10],axis=1, inplace=True)\n", 886 | "\n", 887 | "diff=question4.iloc[5][9]-question4.iloc[5][0]\n", 888 | "\n", 889 | "def answer_four():\n", 890 | " Top15 = answer_one()\n", 891 | " return diff\n", 892 | "\n", 893 | "answer_four()" 894 | ] 895 | }, 896 | { 897 | "cell_type": "code", 898 | "execution_count": null, 899 | "metadata": { 900 | "collapsed": true 901 | }, 902 | "outputs": [], 903 | "source": [] 904 | }, 905 | { 906 | "cell_type": "markdown", 907 | "metadata": {}, 908 | "source": [ 909 | "### Question 5 (6.6%)\n", 910 | "What is the mean energy supply per capita?\n", 911 | "\n", 912 | "*This function should return a single number.*" 913 | ] 
914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": 157, 918 | "metadata": { 919 | "collapsed": false 920 | }, 921 | "outputs": [ 922 | { 923 | "data": { 924 | "text/plain": [ 925 | "157.6" 926 | ] 927 | }, 928 | "execution_count": 157, 929 | "metadata": {}, 930 | "output_type": "execute_result" 931 | } 932 | ], 933 | "source": [ 934 | "question5 = new.copy()\n", 935 | "\n", 936 | "# Use item() to convert a numpy.float object to normal Python object (float)\n", 937 | "def answer_five():\n", 938 | " return question5['Energy Supply per Capita'].mean().item()\n", 939 | "answer_five()" 940 | ] 941 | }, 942 | { 943 | "cell_type": "markdown", 944 | "metadata": {}, 945 | "source": [ 946 | "### Question 6 (6.6%)\n", 947 | "What country has the maximum % Renewable and what is the percentage?\n", 948 | "\n", 949 | "*This function should return a tuple with the name of the country and the percentage.*" 950 | ] 951 | }, 952 | { 953 | "cell_type": "code", 954 | "execution_count": 170, 955 | "metadata": { 956 | "collapsed": false 957 | }, 958 | "outputs": [ 959 | { 960 | "name": "stderr", 961 | "output_type": "stream", 962 | "text": [ 963 | "/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: sort(columns=....) 
is deprecated, use sort_values(by=.....)\n", 964 | " from ipykernel import kernelapp as app\n" 965 | ] 966 | }, 967 | { 968 | "data": { 969 | "text/plain": [ 970 | "('Brazil', 69.64803)" 971 | ] 972 | }, 973 | "execution_count": 170, 974 | "metadata": {}, 975 | "output_type": "execute_result" 976 | } 977 | ], 978 | "source": [ 979 | "question6= new.copy()\n", 980 | "\n", 981 | "# Sort by '% Renewable'\n", 982 | "question6.sort('% Renewable',ascending=False,inplace=True)\n", 983 | "\n", 984 | "# Return as a tuple\n", 985 | "def answer_six():\n", 986 | " return (question6.iloc[0].name,question6.iloc[0]['% Renewable'])\n", 987 | "answer_six()" 988 | ] 989 | }, 990 | { 991 | "cell_type": "markdown", 992 | "metadata": {}, 993 | "source": [ 994 | "### Question 7 (6.6%)\n", 995 | "Create a new column that is the ratio of Self-Citations to Total Citations. \n", 996 | "What is the maximum value for this new column, and what country has the highest ratio?\n", 997 | "\n", 998 | "*This function should return a tuple with the name of the country and the ratio.*" 999 | ] 1000 | }, 1001 | { 1002 | "cell_type": "code", 1003 | "execution_count": 212, 1004 | "metadata": { 1005 | "collapsed": false 1006 | }, 1007 | "outputs": [ 1008 | { 1009 | "data": { 1010 | "text/plain": [ 1011 | "('China', 0.68931261793894216)" 1012 | ] 1013 | }, 1014 | "execution_count": 212, 1015 | "metadata": {}, 1016 | "output_type": "execute_result" 1017 | } 1018 | ], 1019 | "source": [ 1020 | "question7=new.copy()\n", 1021 | "\n", 1022 | "question7['Citation Ratio']=question7['Self-citations']/question7['Citations']\n", 1023 | "\n", 1024 | "name_of_country = question7[question7['Citation Ratio']==question7['Citation Ratio'].max()].index.values.item()\n", 1025 | "value = question7['Citation Ratio'].max()\n", 1026 | "def answer_seven():\n", 1027 | " return (name_of_country,value)\n", 1028 | "answer_seven()" 1029 | ] 1030 | }, 1031 | { 1032 | "cell_type": "code", 1033 | "execution_count": null, 1034 | 
"metadata": { 1035 | "collapsed": true 1036 | }, 1037 | "outputs": [], 1038 | "source": [] 1039 | }, 1040 | { 1041 | "cell_type": "markdown", 1042 | "metadata": {}, 1043 | "source": [ 1044 | "### Question 8 (6.6%)\n", 1045 | "\n", 1046 | "Create a column that estimates the population using Energy Supply and Energy Supply per capita. \n", 1047 | "What is the third most populous country according to this estimate?\n", 1048 | "\n", 1049 | "*This function should return a single string value.*" 1050 | ] 1051 | }, 1052 | { 1053 | "cell_type": "code", 1054 | "execution_count": 3, 1055 | "metadata": { 1056 | "collapsed": false 1057 | }, 1058 | "outputs": [ 1059 | { 1060 | "name": "stderr", 1061 | "output_type": "stream", 1062 | "text": [ 1063 | "/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:6: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)\n" 1064 | ] 1065 | }, 1066 | { 1067 | "data": { 1068 | "text/plain": [ 1069 | "'United States'" 1070 | ] 1071 | }, 1072 | "execution_count": 3, 1073 | "metadata": {}, 1074 | "output_type": "execute_result" 1075 | } 1076 | ], 1077 | "source": [ 1078 | "question8 = new.copy()\n", 1079 | "# Create a new row for 'Population'\n", 1080 | "question8['Population']=question8['Energy Supply']/question8['Energy Supply per Capita']\n", 1081 | "\n", 1082 | "# Sort the dataframe by 'Population'\n", 1083 | "question8.sort('Population',ascending=False,inplace=True)\n", 1084 | "\n", 1085 | "# Find the 3rd most populous country, and return the index (country name) with index.values. 
Item() converts it into a Python string\n", 1086 | "third_populous_country = question8[question8['Population']==question8['Population'][2]].index.values.item()\n", 1087 | "def answer_eight():\n", 1088 | " return third_populous_country\n", 1089 | "\n", 1090 | "answer_eight()" 1091 | ] 1092 | }, 1093 | { 1094 | "cell_type": "code", 1095 | "execution_count": null, 1096 | "metadata": { 1097 | "collapsed": true 1098 | }, 1099 | "outputs": [], 1100 | "source": [] 1101 | }, 1102 | { 1103 | "cell_type": "markdown", 1104 | "metadata": {}, 1105 | "source": [ 1106 | "### Question 9\n", 1107 | "Create a column that estimates the number of citable documents per person. \n", 1108 | "What is the correlation between the number of citable documents per capita and the energy supply per capita? Use the `.corr()` method, (Pearson's correlation).\n", 1109 | "\n", 1110 | "*This function should return a single number.*\n", 1111 | "\n", 1112 | "*(Optional: Use the built-in function `plot9()` to visualize the relationship between Energy Supply per Capita vs. Citable docs per Capita)*" 1113 | ] 1114 | }, 1115 | { 1116 | "cell_type": "code", 1117 | "execution_count": 47, 1118 | "metadata": { 1119 | "collapsed": false 1120 | }, 1121 | "outputs": [], 1122 | "source": [ 1123 | "# Can't figure this out. Ditched it. 
\n", 1124 | "# I want chips.\n", 1125 | "# Going out now to buy chips...\n", 1126 | "\n", 1127 | "# question9=question8.copy()\n", 1128 | "\n", 1129 | "# question9['Citable document per Capita']=question9['Citable documents']/question9['Population']\n", 1130 | "# sub = question9[['Citable document per Capita','Energy Supply per Capita']]\n", 1131 | "\n", 1132 | "# # def answer_nine():\n", 1133 | "# # return \"ANSWER\"" 1134 | ] 1135 | }, 1136 | { 1137 | "cell_type": "code", 1138 | "execution_count": 214, 1139 | "metadata": { 1140 | "collapsed": false 1141 | }, 1142 | "outputs": [], 1143 | "source": [ 1144 | "# def plot9():\n", 1145 | "# import matplotlib as plt\n", 1146 | "# %matplotlib inline\n", 1147 | " \n", 1148 | "# Top15 = answer_one()\n", 1149 | "# Top15['PopEst'] = Top15['Energy Supply'] / Top15['Energy Supply per Capita']\n", 1150 | "# Top15['Citable docs per Capita'] = Top15['Citable documents'] / Top15['PopEst']\n", 1151 | "# Top15.plot(x='Citable docs per Capita', y='Energy Supply per Capita', kind='scatter', xlim=[0, 0.0006])" 1152 | ] 1153 | }, 1154 | { 1155 | "cell_type": "code", 1156 | "execution_count": null, 1157 | "metadata": { 1158 | "collapsed": true 1159 | }, 1160 | "outputs": [], 1161 | "source": [ 1162 | "#" 1163 | ] 1164 | }, 1165 | { 1166 | "cell_type": "code", 1167 | "execution_count": null, 1168 | "metadata": { 1169 | "collapsed": false 1170 | }, 1171 | "outputs": [], 1172 | "source": [ 1173 | "#plot9() # Be sure to comment out plot9() before submitting the assignment!" 
1174 | ] 1175 | }, 1176 | { 1177 | "cell_type": "markdown", 1178 | "metadata": {}, 1179 | "source": [ 1180 | "### Question 10 (6.6%)\n", 1181 | "Create a new column with a 1 if the country's % Renewable value is at or above the median for all countries in the top 15, and a 0 if the country's % Renewable value is below the median.\n", 1182 | "\n", 1183 | "*This function should return a series named `HighRenew` whose index is the country name sorted in ascending order of rank.*" 1184 | ] 1185 | }, 1186 | { 1187 | "cell_type": "code", 1188 | "execution_count": 74, 1189 | "metadata": { 1190 | "collapsed": false 1191 | }, 1192 | "outputs": [ 1193 | { 1194 | "name": "stderr", 1195 | "output_type": "stream", 1196 | "text": [ 1197 | "/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:7: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)\n" 1198 | ] 1199 | }, 1200 | { 1201 | "data": { 1202 | "text/plain": [ 1203 | "Country\n", 1204 | "China 1\n", 1205 | "United States 0\n", 1206 | "Japan 0\n", 1207 | "United Kingdom 0\n", 1208 | "Russian Federation 1\n", 1209 | "Canada 1\n", 1210 | "Germany 1\n", 1211 | "India 0\n", 1212 | "France 1\n", 1213 | "South Korea 0\n", 1214 | "Italy 1\n", 1215 | "Spain 1\n", 1216 | "Iran 0\n", 1217 | "Australia 0\n", 1218 | "Brazil 1\n", 1219 | "Name: HighRenew, dtype: int64" 1220 | ] 1221 | }, 1222 | "execution_count": 74, 1223 | "metadata": {}, 1224 | "output_type": "execute_result" 1225 | } 1226 | ], 1227 | "source": [ 1228 | "question10=new.copy()\n", 1229 | "\n", 1230 | "median = question10['% Renewable'].median()\n", 1231 | "\n", 1232 | "question10['HighRenew'] = np.where(question10['% Renewable'] >= median,1,0)\n", 1233 | "question10['HighRenew'] = question10['HighRenew'].replace(np.nan,0)\n", 1234 | "question10.sort('Rank',ascending=True, inplace=True)\n", 1235 | "HighRenew= question10.ix[:,'HighRenew']\n", 1236 | "\n", 1237 | "def answer_ten():\n", 1238 | " return HighRenew\n", 1239 | "\n", 1240 | 
"answer_ten()" 1241 | ] 1242 | }, 1243 | { 1244 | "cell_type": "markdown", 1245 | "metadata": {}, 1246 | "source": [ 1247 | "### Question 11 (6.6%)\n", 1248 | "Use the following dictionary to group the Countries by Continent, then create a dateframe that displays the sample size (the number of countries in each continent bin), and the sum, mean, and std deviation for the estimated population of each country.\n", 1249 | "\n", 1250 | "```python\n", 1251 | "ContinentDict = {'China':'Asia', \n", 1252 | " 'United States':'North America', \n", 1253 | " 'Japan':'Asia', \n", 1254 | " 'United Kingdom':'Europe', \n", 1255 | " 'Russian Federation':'Europe', \n", 1256 | " 'Canada':'North America', \n", 1257 | " 'Germany':'Europe', \n", 1258 | " 'India':'Asia',\n", 1259 | " 'France':'Europe', \n", 1260 | " 'South Korea':'Asia', \n", 1261 | " 'Italy':'Europe', \n", 1262 | " 'Spain':'Europe', \n", 1263 | " 'Iran':'Asia',\n", 1264 | " 'Australia':'Australia', \n", 1265 | " 'Brazil':'South America'}\n", 1266 | "```\n", 1267 | "\n", 1268 | "*This function should return a DataFrame with index named Continent `['Asia', 'Australia', 'Europe', 'North America', 'South America']` and columns `['size', 'sum', 'mean', 'std']`*" 1269 | ] 1270 | }, 1271 | { 1272 | "cell_type": "code", 1273 | "execution_count": 163, 1274 | "metadata": { 1275 | "collapsed": false 1276 | }, 1277 | "outputs": [ 1278 | { 1279 | "name": "stderr", 1280 | "output_type": "stream", 1281 | "text": [ 1282 | "/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:23: FutureWarning: convert_objects is deprecated. Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.\n" 1283 | ] 1284 | }, 1285 | { 1286 | "data": { 1287 | "text/html": [ 1288 | "
\n", 1289 | "\n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | "
sizesummeanstd
Continent
Asia52.898666e+095.797333e+086.790979e+08
Australia12.331602e+072.331602e+07NaN
Europe64.579297e+087.632161e+073.464767e+07
North America23.528552e+081.764276e+081.996696e+08
South America12.059153e+082.059153e+08NaN
\n", 1344 | "
" 1345 | ], 1346 | "text/plain": [ 1347 | " size sum mean std\n", 1348 | "Continent \n", 1349 | "Asia 5 2.898666e+09 5.797333e+08 6.790979e+08\n", 1350 | "Australia 1 2.331602e+07 2.331602e+07 NaN\n", 1351 | "Europe 6 4.579297e+08 7.632161e+07 3.464767e+07\n", 1352 | "North America 2 3.528552e+08 1.764276e+08 1.996696e+08\n", 1353 | "South America 1 2.059153e+08 2.059153e+08 NaN" 1354 | ] 1355 | }, 1356 | "execution_count": 163, 1357 | "metadata": {}, 1358 | "output_type": "execute_result" 1359 | } 1360 | ], 1361 | "source": [ 1362 | "question11 = question8.copy()\n", 1363 | "question11.drop(question11.columns[:-1],axis=1,inplace=True)\n", 1364 | "\n", 1365 | "ContinentDict = {'China':'Asia', \n", 1366 | " 'United States':'North America', \n", 1367 | " 'Japan':'Asia', \n", 1368 | " 'United Kingdom':'Europe', \n", 1369 | " 'Russian Federation':'Europe', \n", 1370 | " 'Canada':'North America', \n", 1371 | " 'Germany':'Europe', \n", 1372 | " 'India':'Asia',\n", 1373 | " 'France':'Europe', \n", 1374 | " 'South Korea':'Asia', \n", 1375 | " 'Italy':'Europe', \n", 1376 | " 'Spain':'Europe', \n", 1377 | " 'Iran':'Asia',\n", 1378 | " 'Australia':'Australia', \n", 1379 | " 'Brazil':'South America'}\n", 1380 | "question11.reset_index(inplace=1)\n", 1381 | "question11['Continent']=question11['Country'].map(ContinentDict)\n", 1382 | "\n", 1383 | "\n", 1384 | "question11['Population'] = question11['Population'].convert_objects(convert_numeric=True)\n", 1385 | "\n", 1386 | "PopSize=question11.groupby('Continent').agg({'Continent':np.count_nonzero})\n", 1387 | "PopSum=question11.groupby('Continent').agg({'Population':np.sum})\n", 1388 | "PopMean=question11.groupby('Continent').agg({'Population':np.average})\n", 1389 | "PopStd=question11.groupby('Continent').agg({'Population':np.std})\n", 1390 | "\n", 1391 | "PopTotal = pd.concat([PopSize,PopSum,PopMean,PopStd],axis=1)\n", 1392 | "PopTotal.columns = ['size', 'sum', 'mean', 'std']\n", 1393 | "\n", 1394 | "\n", 1395 | "def 
answer_eleven():\n", 1396 | " return PopTotal\n", 1397 | "answer_eleven()" 1398 | ] 1399 | }, 1400 | { 1401 | "cell_type": "markdown", 1402 | "metadata": {}, 1403 | "source": [ 1404 | "### Question 12 (6.6%)\n", 1405 | "Cut % Renewable into 5 bins. Group Top15 by the Continent, as well as these new % Renewable bins. How many countries are in each of these groups?\n", 1406 | "\n", 1407 | "*This function should return a Series with a MultiIndex of `Continent`, then the bins for `% Renewable`. Do not include groups with no countries.*" 1408 | ] 1409 | }, 1410 | { 1411 | "cell_type": "code", 1412 | "execution_count": null, 1413 | "metadata": { 1414 | "collapsed": false, 1415 | "scrolled": true 1416 | }, 1417 | "outputs": [], 1418 | "source": [ 1419 | "def answer_twelve():\n", 1420 | " Top15 = answer_one()\n", 1421 | " return \"ANSWER\"" 1422 | ] 1423 | }, 1424 | { 1425 | "cell_type": "markdown", 1426 | "metadata": {}, 1427 | "source": [ 1428 | "### Question 13 (6.6%)\n", 1429 | "Convert the Population Estimate series to a string with thousands separator (using commas). Do not round the results.\n", 1430 | "\n", 1431 | "e.g. 
317615384.61538464 -> 317,615,384.61538464\n", 1432 | "\n", 1433 | "*This function should return a Series `PopEst` whose index is the country name and whose values are the population estimate string.*" 1434 | ] 1435 | }, 1436 | { 1437 | "cell_type": "code", 1438 | "execution_count": 95, 1439 | "metadata": { 1440 | "collapsed": false, 1441 | "scrolled": true 1442 | }, 1443 | "outputs": [ 1444 | { 1445 | "data": { 1446 | "text/plain": [ 1447 | "Country\n", 1448 | "China 1,367,645,161.2903225\n", 1449 | "India 1,276,730,769.2307692\n", 1450 | "United States 317,615,384.61538464\n", 1451 | "Brazil 205,915,254.23728815\n", 1452 | "Russian Federation 143,500,000.0\n", 1453 | "Japan 127,409,395.97315437\n", 1454 | "Germany 80,369,696.96969697\n", 1455 | "Iran 77,075,630.25210084\n", 1456 | "United Kingdom 63,870,967.741935484\n", 1457 | "France 63,837,349.39759036\n", 1458 | "Italy 59,908,256.880733944\n", 1459 | "South Korea 49,805,429.864253394\n", 1460 | "Spain 46,443,396.2264151\n", 1461 | "Canada 35,239,864.86486486\n", 1462 | "Australia 23,316,017.316017315\n", 1463 | "Name: PopEst, dtype: object" 1464 | ] 1465 | }, 1466 | "execution_count": 95, 1467 | "metadata": {}, 1468 | "output_type": "execute_result" 1469 | } 1470 | ], 1471 | "source": [ 1472 | "question13=question8.copy()\n", 1473 | "\n", 1474 | "# Keep only the 'Population' Column\n", 1475 | "question13.drop(question13.columns[:-1],axis=1,inplace=True)\n", 1476 | "\n", 1477 | "# Use format() to add the thousands separator\n", 1478 | "question13['PopEst'] = question13['Population'].apply(lambda x : '{:,}'.format(x))\n", 1479 | "\n", 1480 | "# Create a data series for the output\n", 1481 | "question13_series= question13.ix[:,'PopEst']\n", 1482 | "\n", 1483 | "def answer_thirteen():\n", 1484 | " return question13_series\n", 1485 | "\n", 1486 | "answer_thirteen()" 1487 | ] 1488 | }, 1489 | { 1490 | "cell_type": "markdown", 1491 | "metadata": {}, 1492 | "source": [ 1493 | "### Optional\n", 1494 | "\n", 1495 | 
"Use the built in function `plot_optional()` to see an example visualization." 1496 | ] 1497 | }, 1498 | { 1499 | "cell_type": "code", 1500 | "execution_count": null, 1501 | "metadata": { 1502 | "collapsed": false, 1503 | "scrolled": true 1504 | }, 1505 | "outputs": [], 1506 | "source": [ 1507 | "def plot_optional():\n", 1508 | " import matplotlib as plt\n", 1509 | " %matplotlib inline\n", 1510 | " Top15 = answer_one()\n", 1511 | " ax = Top15.plot(x='Rank', y='% Renewable', kind='scatter', \n", 1512 | " c=['#e41a1c','#377eb8','#e41a1c','#4daf4a','#4daf4a','#377eb8','#4daf4a','#e41a1c',\n", 1513 | " '#4daf4a','#e41a1c','#4daf4a','#4daf4a','#e41a1c','#dede00','#ff7f00'], \n", 1514 | " xticks=range(1,16), s=6*Top15['2014']/10**10, alpha=.75, figsize=[16,6]);\n", 1515 | "\n", 1516 | " for i, txt in enumerate(Top15.index):\n", 1517 | " ax.annotate(txt, [Top15['Rank'][i], Top15['% Renewable'][i]], ha='center')\n", 1518 | "\n", 1519 | " print(\"This is an example of a visualization that can be created to help understand the data. \\\n", 1520 | "This is a bubble chart showing % Renewable vs. Rank. The size of the bubble corresponds to the countries' \\\n", 1521 | "2014 GDP, and the color corresponds to the continent.\")" 1522 | ] 1523 | }, 1524 | { 1525 | "cell_type": "code", 1526 | "execution_count": null, 1527 | "metadata": { 1528 | "collapsed": false 1529 | }, 1530 | "outputs": [], 1531 | "source": [ 1532 | "#plot_optional() # Be sure to comment out plot_optional() before submitting the assignment!" 
1533 | ] 1534 | } 1535 | ], 1536 | "metadata": { 1537 | "anaconda-cloud": {}, 1538 | "coursera": { 1539 | "course_slug": "python-data-analysis", 1540 | "graded_item_id": "zAr06", 1541 | "launcher_item_id": "KSSjT", 1542 | "part_id": "SL3fU" 1543 | }, 1544 | "kernelspec": { 1545 | "display_name": "Python 3", 1546 | "language": "python", 1547 | "name": "python3" 1548 | }, 1549 | "language_info": { 1550 | "codemirror_mode": { 1551 | "name": "ipython", 1552 | "version": 3 1553 | }, 1554 | "file_extension": ".py", 1555 | "mimetype": "text/x-python", 1556 | "name": "python", 1557 | "nbconvert_exporter": "python", 1558 | "pygments_lexer": "ipython3", 1559 | "version": "3.5.2" 1560 | } 1561 | }, 1562 | "nbformat": 4, 1563 | "nbformat_minor": 0 1564 | } 1565 | -------------------------------------------------------------------------------- /Week1/Week+1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "---\n", 8 | "\n", 9 | "_You are currently looking at **version 1.1** of this notebook. 
To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-data-analysis/resources/0dhYG) course resource._\n", 10 | "\n", 11 | "---" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# The Python Programming Language: Functions" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "3" 32 | ] 33 | }, 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "output_type": "execute_result" 37 | } 38 | ], 39 | "source": [ 40 | "x = 1\n", 41 | "y = 2\n", 42 | "x + y" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 4, 48 | "metadata": { 49 | "collapsed": false 50 | }, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "x\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "print (\"x\")" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "
\n", 69 | "`add_numbers` is a function that takes two numbers and adds them together." 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "def add_numbers(x, y):\n", 81 | " return x + y\n", 82 | "\n", 83 | "add_numbers(1, 2)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "
\n", 91 | "`add_numbers` updated to take an optional 3rd parameter. Using `print` allows printing of multiple expressions within a single cell." 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "def add_numbers(x,y,z=None):\n", 103 | " if (z==None):\n", 104 | " return x+y\n", 105 | " else:\n", 106 | " return x+y+z\n", 107 | "\n", 108 | "print(add_numbers(1, 2))\n", 109 | "print(add_numbers(1, 2, 3))" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "
\n", 117 | "`add_numbers` updated to take an optional flag parameter." 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 1, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "Flag is true!\n", 132 | "3\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "def add_numbers(x, y, z=None, flag=False):\n", 138 | " if (flag):\n", 139 | " print('Flag is true!')\n", 140 | " if (z==None):\n", 141 | " return x + y\n", 142 | " else:\n", 143 | " return x + y + z\n", 144 | " \n", 145 | "print(add_numbers(1, 2, flag=True))" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "
\n", 153 | "Assign function `add_numbers` to variable `a`." 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 5, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "3" 167 | ] 168 | }, 169 | "execution_count": 5, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "def add_numbers(x,y):\n", 176 | " return x+y\n", 177 | "\n", 178 | "a = add_numbers\n", 179 | "a(1,2)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": { 186 | "collapsed": true 187 | }, 188 | "outputs": [], 189 | "source": [] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "
\n", 196 | "# The Python Programming Language: Types and Sequences" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "
\n", 204 | "Use `type` to return the object's type." 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": false 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "type('This is a string')" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 6, 221 | "metadata": { 222 | "collapsed": false 223 | }, 224 | "outputs": [ 225 | { 226 | "data": { 227 | "text/plain": [ 228 | "NoneType" 229 | ] 230 | }, 231 | "execution_count": 6, 232 | "metadata": {}, 233 | "output_type": "execute_result" 234 | } 235 | ], 236 | "source": [ 237 | "type(None)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "collapsed": true 245 | }, 246 | "outputs": [], 247 | "source": [] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 7, 252 | "metadata": { 253 | "collapsed": false 254 | }, 255 | "outputs": [ 256 | { 257 | "data": { 258 | "text/plain": [ 259 | "int" 260 | ] 261 | }, 262 | "execution_count": 7, 263 | "metadata": {}, 264 | "output_type": "execute_result" 265 | } 266 | ], 267 | "source": [ 268 | "type(1)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": true 276 | }, 277 | "outputs": [], 278 | "source": [] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 8, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "float" 291 | ] 292 | }, 293 | "execution_count": 8, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "type(1.0)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": { 306 | "collapsed": true 307 | }, 308 | "outputs": [], 309 | "source": [] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 9, 314 | "metadata": { 315 | "collapsed": 
false 316 | }, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/plain": [ 321 | "function" 322 | ] 323 | }, 324 | "execution_count": 9, 325 | "metadata": {}, 326 | "output_type": "execute_result" 327 | } 328 | ], 329 | "source": [ 330 | "type(add_numbers)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": { 337 | "collapsed": true 338 | }, 339 | "outputs": [], 340 | "source": [] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "
\n", 347 | "Tuples are an immutable data structure (cannot be altered)." 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 10, 353 | "metadata": { 354 | "collapsed": false 355 | }, 356 | "outputs": [ 357 | { 358 | "data": { 359 | "text/plain": [ 360 | "tuple" 361 | ] 362 | }, 363 | "execution_count": 10, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "x = (1, 'a', 2, 'b')\n", 370 | "type(x)" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "collapsed": true 378 | }, 379 | "outputs": [], 380 | "source": [] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": {}, 385 | "source": [ 386 | "
\n", 387 | "Lists are a mutable data structure." 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 11, 393 | "metadata": { 394 | "collapsed": false 395 | }, 396 | "outputs": [ 397 | { 398 | "data": { 399 | "text/plain": [ 400 | "list" 401 | ] 402 | }, 403 | "execution_count": 11, 404 | "metadata": {}, 405 | "output_type": "execute_result" 406 | } 407 | ], 408 | "source": [ 409 | "x = [1, 'a', 2, 'b']\n", 410 | "type(x)" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": { 417 | "collapsed": true 418 | }, 419 | "outputs": [], 420 | "source": [] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "
\n", 427 | "Use `append` to append an object to a list." 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 12, 433 | "metadata": { 434 | "collapsed": false 435 | }, 436 | "outputs": [ 437 | { 438 | "name": "stdout", 439 | "output_type": "stream", 440 | "text": [ 441 | "[1, 'a', 2, 'b', 3.3]\n" 442 | ] 443 | } 444 | ], 445 | "source": [ 446 | "x.append(3.3)\n", 447 | "print(x)" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": { 454 | "collapsed": true 455 | }, 456 | "outputs": [], 457 | "source": [] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "
\n", 464 | "This is an example of how to loop through each item in the list." 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 13, 470 | "metadata": { 471 | "collapsed": false 472 | }, 473 | "outputs": [ 474 | { 475 | "name": "stdout", 476 | "output_type": "stream", 477 | "text": [ 478 | "1\n", 479 | "a\n", 480 | "2\n", 481 | "b\n", 482 | "3.3\n" 483 | ] 484 | } 485 | ], 486 | "source": [ 487 | "for item in x:\n", 488 | " print(item)" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": { 495 | "collapsed": true 496 | }, 497 | "outputs": [], 498 | "source": [] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": {}, 503 | "source": [ 504 | "
\n", 505 | "Or using the indexing operator:" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": 14, 511 | "metadata": { 512 | "collapsed": false 513 | }, 514 | "outputs": [ 515 | { 516 | "name": "stdout", 517 | "output_type": "stream", 518 | "text": [ 519 | "1\n", 520 | "a\n", 521 | "2\n", 522 | "b\n", 523 | "3.3\n" 524 | ] 525 | } 526 | ], 527 | "source": [ 528 | "i=0\n", 529 | "while( i != len(x) ):\n", 530 | " print(x[i])\n", 531 | " i = i + 1" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": null, 537 | "metadata": { 538 | "collapsed": true 539 | }, 540 | "outputs": [], 541 | "source": [] 542 | }, 543 | { 544 | "cell_type": "markdown", 545 | "metadata": {}, 546 | "source": [ 547 | "
\n", 548 | "Use `+` to concatenate lists." 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": 15, 554 | "metadata": { 555 | "collapsed": false 556 | }, 557 | "outputs": [ 558 | { 559 | "data": { 560 | "text/plain": [ 561 | "[1, 2, 3, 4]" 562 | ] 563 | }, 564 | "execution_count": 15, 565 | "metadata": {}, 566 | "output_type": "execute_result" 567 | } 568 | ], 569 | "source": [ 570 | "[1,2] + [3,4]" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": { 577 | "collapsed": true 578 | }, 579 | "outputs": [], 580 | "source": [] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": {}, 585 | "source": [ 586 | "
\n", 587 | "Use `*` to repeat lists." 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 16, 593 | "metadata": { 594 | "collapsed": false 595 | }, 596 | "outputs": [ 597 | { 598 | "data": { 599 | "text/plain": [ 600 | "[1, 1, 1]" 601 | ] 602 | }, 603 | "execution_count": 16, 604 | "metadata": {}, 605 | "output_type": "execute_result" 606 | } 607 | ], 608 | "source": [ 609 | "[1]*3" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": null, 615 | "metadata": { 616 | "collapsed": true 617 | }, 618 | "outputs": [], 619 | "source": [] 620 | }, 621 | { 622 | "cell_type": "markdown", 623 | "metadata": {}, 624 | "source": [ 625 | "
\n", 626 | "Use the `in` operator to check if something is inside a list." 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 17, 632 | "metadata": { 633 | "collapsed": false 634 | }, 635 | "outputs": [ 636 | { 637 | "data": { 638 | "text/plain": [ 639 | "True" 640 | ] 641 | }, 642 | "execution_count": 17, 643 | "metadata": {}, 644 | "output_type": "execute_result" 645 | } 646 | ], 647 | "source": [ 648 | "1 in [1, 2, 3]" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": null, 654 | "metadata": { 655 | "collapsed": true 656 | }, 657 | "outputs": [], 658 | "source": [] 659 | }, 660 | { 661 | "cell_type": "markdown", 662 | "metadata": {}, 663 | "source": [ 664 | "
\n", 665 | "Now let's look at strings. Use bracket notation to slice a string." 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": 18, 671 | "metadata": { 672 | "collapsed": false 673 | }, 674 | "outputs": [ 675 | { 676 | "name": "stdout", 677 | "output_type": "stream", 678 | "text": [ 679 | "T\n", 680 | "T\n", 681 | "Th\n" 682 | ] 683 | } 684 | ], 685 | "source": [ 686 | "x = 'This is a string'\n", 687 | "print(x[0]) #first character\n", 688 | "print(x[0:1]) #first character, but we have explicitly set the end character\n", 689 | "print(x[0:2]) #first two characters\n" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": null, 695 | "metadata": { 696 | "collapsed": true 697 | }, 698 | "outputs": [], 699 | "source": [] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "metadata": {}, 704 | "source": [ 705 | "
\n", 706 | "This will return the last element of the string." 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": 19, 712 | "metadata": { 713 | "collapsed": false 714 | }, 715 | "outputs": [ 716 | { 717 | "data": { 718 | "text/plain": [ 719 | "'g'" 720 | ] 721 | }, 722 | "execution_count": 19, 723 | "metadata": {}, 724 | "output_type": "execute_result" 725 | } 726 | ], 727 | "source": [ 728 | "x[-1]" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": null, 734 | "metadata": { 735 | "collapsed": true 736 | }, 737 | "outputs": [], 738 | "source": [] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": {}, 743 | "source": [ 744 | "
\n", 745 | "This will return the slice starting from the 4th element from the end and stopping before the 2nd element from the end." 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": 20, 751 | "metadata": { 752 | "collapsed": false 753 | }, 754 | "outputs": [ 755 | { 756 | "data": { 757 | "text/plain": [ 758 | "'ri'" 759 | ] 760 | }, 761 | "execution_count": 20, 762 | "metadata": {}, 763 | "output_type": "execute_result" 764 | } 765 | ], 766 | "source": [ 767 | "x[-4:-2]" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": null, 773 | "metadata": { 774 | "collapsed": true 775 | }, 776 | "outputs": [], 777 | "source": [] 778 | }, 779 | { 780 | "cell_type": "markdown", 781 | "metadata": {}, 782 | "source": [ 783 | "
\n", 784 | "This is a slice from the beginning of the string and stopping before the 3rd element." 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": 21, 790 | "metadata": { 791 | "collapsed": false 792 | }, 793 | "outputs": [ 794 | { 795 | "data": { 796 | "text/plain": [ 797 | "'Thi'" 798 | ] 799 | }, 800 | "execution_count": 21, 801 | "metadata": {}, 802 | "output_type": "execute_result" 803 | } 804 | ], 805 | "source": [ 806 | "x[:3]" 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": null, 812 | "metadata": { 813 | "collapsed": true 814 | }, 815 | "outputs": [], 816 | "source": [] 817 | }, 818 | { 819 | "cell_type": "markdown", 820 | "metadata": {}, 821 | "source": [ 822 | "
\n", 823 | "And this is a slice starting from the 3rd element of the string and going all the way to the end." 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": null, 829 | "metadata": { 830 | "collapsed": false 831 | }, 832 | "outputs": [], 833 | "source": [ 834 | "x[3:]" 835 | ] 836 | }, 837 | { 838 | "cell_type": "code", 839 | "execution_count": 1, 840 | "metadata": { 841 | "collapsed": false 842 | }, 843 | "outputs": [ 844 | { 845 | "name": "stdout", 846 | "output_type": "stream", 847 | "text": [ 848 | "Christopher Brooks\n", 849 | "ChristopherChristopherChristopher\n", 850 | "True\n" 851 | ] 852 | } 853 | ], 854 | "source": [ 855 | "firstname = 'Christopher'\n", 856 | "lastname = 'Brooks'\n", 857 | "\n", 858 | "print(firstname + ' ' + lastname)\n", 859 | "print(firstname*3)\n", 860 | "print('Chris' in firstname)\n" 861 | ] 862 | }, 863 | { 864 | "cell_type": "code", 865 | "execution_count": null, 866 | "metadata": { 867 | "collapsed": true 868 | }, 869 | "outputs": [], 870 | "source": [] 871 | }, 872 | { 873 | "cell_type": "markdown", 874 | "metadata": {}, 875 | "source": [ 876 | "
\n", 877 | "`split` returns a list of all the words in a string, or a list split on a specific character." 878 | ] 879 | }, 880 | { 881 | "cell_type": "code", 882 | "execution_count": 2, 883 | "metadata": { 884 | "collapsed": false 885 | }, 886 | "outputs": [ 887 | { 888 | "name": "stdout", 889 | "output_type": "stream", 890 | "text": [ 891 | "Christopher\n", 892 | "Brooks\n" 893 | ] 894 | } 895 | ], 896 | "source": [ 897 | "firstname = 'Christopher Arthur Hansen Brooks'.split(' ')[0] # [0] selects the first element of the list\n", 898 | "lastname = 'Christopher Arthur Hansen Brooks'.split(' ')[-1] # [-1] selects the last element of the list\n", 899 | "print(firstname)\n", 900 | "print(lastname)" 901 | ] 902 | }, 903 | { 904 | "cell_type": "code", 905 | "execution_count": null, 906 | "metadata": { 907 | "collapsed": true 908 | }, 909 | "outputs": [], 910 | "source": [] 911 | }, 912 | { 913 | "cell_type": "markdown", 914 | "metadata": {}, 915 | "source": [ 916 | "
\n", 917 | "Make sure you convert objects to strings before concatenating." 918 | ] 919 | }, 920 | { 921 | "cell_type": "code", 922 | "execution_count": null, 923 | "metadata": { 924 | "collapsed": false 925 | }, 926 | "outputs": [], 927 | "source": [ 928 | "'Chris' + 2" 929 | ] 930 | }, 931 | { 932 | "cell_type": "code", 933 | "execution_count": null, 934 | "metadata": { 935 | "collapsed": false 936 | }, 937 | "outputs": [], 938 | "source": [ 939 | "'Chris' + str(2)" 940 | ] 941 | }, 942 | { 943 | "cell_type": "markdown", 944 | "metadata": {}, 945 | "source": [ 946 | "
\n", 947 | "Dictionaries associate keys with values." 948 | ] 949 | }, 950 | { 951 | "cell_type": "code", 952 | "execution_count": null, 953 | "metadata": { 954 | "collapsed": false 955 | }, 956 | "outputs": [], 957 | "source": [ 958 | "x = {'Christopher Brooks': 'brooksch@umich.edu', 'Bill Gates': 'billg@microsoft.com'}\n", 959 | "x['Christopher Brooks'] # Retrieve a value by using the indexing operator\n" 960 | ] 961 | }, 962 | { 963 | "cell_type": "code", 964 | "execution_count": null, 965 | "metadata": { 966 | "collapsed": true 967 | }, 968 | "outputs": [], 969 | "source": [ 970 | "x['Kevyn Collins-Thompson'] = None\n", 971 | "x['Kevyn Collins-Thompson']" 972 | ] 973 | }, 974 | { 975 | "cell_type": "markdown", 976 | "metadata": {}, 977 | "source": [ 978 | "
\n", 979 | "Iterate over all of the keys:" 980 | ] 981 | }, 982 | { 983 | "cell_type": "code", 984 | "execution_count": null, 985 | "metadata": { 986 | "collapsed": false, 987 | "scrolled": true 988 | }, 989 | "outputs": [], 990 | "source": [ 991 | "for name in x:\n", 992 | " print(x[name])" 993 | ] 994 | }, 995 | { 996 | "cell_type": "markdown", 997 | "metadata": {}, 998 | "source": [ 999 | "
\n", 1000 | "Iterate over all of the values:" 1001 | ] 1002 | }, 1003 | { 1004 | "cell_type": "code", 1005 | "execution_count": null, 1006 | "metadata": { 1007 | "collapsed": false 1008 | }, 1009 | "outputs": [], 1010 | "source": [ 1011 | "for email in x.values():\n", 1012 | " print(email)" 1013 | ] 1014 | }, 1015 | { 1016 | "cell_type": "markdown", 1017 | "metadata": {}, 1018 | "source": [ 1019 | "
\n", 1020 | "Iterate over all of the items in the list:" 1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "code", 1025 | "execution_count": null, 1026 | "metadata": { 1027 | "collapsed": false 1028 | }, 1029 | "outputs": [], 1030 | "source": [ 1031 | "for name, email in x.items():\n", 1032 | " print(name)\n", 1033 | " print(email)" 1034 | ] 1035 | }, 1036 | { 1037 | "cell_type": "markdown", 1038 | "metadata": {}, 1039 | "source": [ 1040 | "
\n", 1041 | "You can unpack a sequence into different variables:" 1042 | ] 1043 | }, 1044 | { 1045 | "cell_type": "code", 1046 | "execution_count": null, 1047 | "metadata": { 1048 | "collapsed": true 1049 | }, 1050 | "outputs": [], 1051 | "source": [ 1052 | "x = ('Christopher', 'Brooks', 'brooksch@umich.edu')\n", 1053 | "fname, lname, email = x" 1054 | ] 1055 | }, 1056 | { 1057 | "cell_type": "code", 1058 | "execution_count": null, 1059 | "metadata": { 1060 | "collapsed": false 1061 | }, 1062 | "outputs": [], 1063 | "source": [ 1064 | "fname" 1065 | ] 1066 | }, 1067 | { 1068 | "cell_type": "code", 1069 | "execution_count": null, 1070 | "metadata": { 1071 | "collapsed": false 1072 | }, 1073 | "outputs": [], 1074 | "source": [ 1075 | "lname" 1076 | ] 1077 | }, 1078 | { 1079 | "cell_type": "markdown", 1080 | "metadata": {}, 1081 | "source": [ 1082 | "
\n", 1083 | "Make sure the number of values you are unpacking matches the number of variables being assigned." 1084 | ] 1085 | }, 1086 | { 1087 | "cell_type": "code", 1088 | "execution_count": null, 1089 | "metadata": { 1090 | "collapsed": false 1091 | }, 1092 | "outputs": [], 1093 | "source": [ 1094 | "x = ('Christopher', 'Brooks', 'brooksch@umich.edu', 'Ann Arbor')\n", 1095 | "fname, lname, email = x" 1096 | ] 1097 | }, 1098 | { 1099 | "cell_type": "markdown", 1100 | "metadata": {}, 1101 | "source": [ 1102 | "
\n", 1103 | "# The Python Programming Language: More on Strings" 1104 | ] 1105 | }, 1106 | { 1107 | "cell_type": "code", 1108 | "execution_count": null, 1109 | "metadata": { 1110 | "collapsed": false 1111 | }, 1112 | "outputs": [], 1113 | "source": [ 1114 | "print('Chris' + 2)" 1115 | ] 1116 | }, 1117 | { 1118 | "cell_type": "code", 1119 | "execution_count": null, 1120 | "metadata": { 1121 | "collapsed": false 1122 | }, 1123 | "outputs": [], 1124 | "source": [ 1125 | "print('Chris' + str(2))" 1126 | ] 1127 | }, 1128 | { 1129 | "cell_type": "markdown", 1130 | "metadata": {}, 1131 | "source": [ 1132 | "
\n", 1133 | "Python has a built in method for convenient string formatting." 1134 | ] 1135 | }, 1136 | { 1137 | "cell_type": "code", 1138 | "execution_count": null, 1139 | "metadata": { 1140 | "collapsed": false 1141 | }, 1142 | "outputs": [], 1143 | "source": [ 1144 | "sales_record = {\n", 1145 | "'price': 3.24,\n", 1146 | "'num_items': 4,\n", 1147 | "'person': 'Chris'}\n", 1148 | "\n", 1149 | "sales_statement = '{} bought {} item(s) at a price of {} each for a total of {}'\n", 1150 | "\n", 1151 | "print(sales_statement.format(sales_record['person'],\n", 1152 | " sales_record['num_items'],\n", 1153 | " sales_record['price'],\n", 1154 | " sales_record['num_items']*sales_record['price']))\n" 1155 | ] 1156 | }, 1157 | { 1158 | "cell_type": "markdown", 1159 | "metadata": {}, 1160 | "source": [ 1161 | "
\n", 1162 | "# Reading and Writing CSV files" 1163 | ] 1164 | }, 1165 | { 1166 | "cell_type": "markdown", 1167 | "metadata": {}, 1168 | "source": [ 1169 | "
\n", 1170 | "Let's import our datafile mpg.csv, which contains fuel economy data for 234 cars.\n", 1171 | "\n", 1172 | "* mpg : miles per gallon\n", 1173 | "* class : car classification\n", 1174 | "* cty : city mpg\n", 1175 | "* cyl : # of cylinders\n", 1176 | "* displ : engine displacement in liters\n", 1177 | "* drv : f = front-wheel drive, r = rear wheel drive, 4 = 4wd\n", 1178 | "* fl : fuel (e = ethanol E85, d = diesel, r = regular, p = premium, c = CNG)\n", 1179 | "* hwy : highway mpg\n", 1180 | "* manufacturer : automobile manufacturer\n", 1181 | "* model : model of car\n", 1182 | "* trans : type of transmission\n", 1183 | "* year : model year" 1184 | ] 1185 | }, 1186 | { 1187 | "cell_type": "code", 1188 | "execution_count": null, 1189 | "metadata": { 1190 | "collapsed": false, 1191 | "scrolled": true 1192 | }, 1193 | "outputs": [], 1194 | "source": [ 1195 | "import csv\n", 1196 | "\n", 1197 | "%precision 2\n", 1198 | "\n", 1199 | "with open('mpg.csv') as csvfile:\n", 1200 | " mpg = list(csv.DictReader(csvfile))\n", 1201 | " \n", 1202 | "mpg[:3] # The first three dictionaries in our list." 1203 | ] 1204 | }, 1205 | { 1206 | "cell_type": "markdown", 1207 | "metadata": {}, 1208 | "source": [ 1209 | "
\n", 1210 | "`csv.Dictreader` has read in each row of our csv file as a dictionary. `len` shows that our list is comprised of 234 dictionaries." 1211 | ] 1212 | }, 1213 | { 1214 | "cell_type": "code", 1215 | "execution_count": null, 1216 | "metadata": { 1217 | "collapsed": false 1218 | }, 1219 | "outputs": [], 1220 | "source": [ 1221 | "len(mpg)" 1222 | ] 1223 | }, 1224 | { 1225 | "cell_type": "markdown", 1226 | "metadata": {}, 1227 | "source": [ 1228 | "
\n", 1229 | "`keys` gives us the column names of our csv." 1230 | ] 1231 | }, 1232 | { 1233 | "cell_type": "code", 1234 | "execution_count": null, 1235 | "metadata": { 1236 | "collapsed": false 1237 | }, 1238 | "outputs": [], 1239 | "source": [ 1240 | "mpg[0].keys()" 1241 | ] 1242 | }, 1243 | { 1244 | "cell_type": "markdown", 1245 | "metadata": {}, 1246 | "source": [ 1247 | "
\n", 1248 | "This is how to find the average cty fuel economy across all cars. All values in the dictionaries are strings, so we need to convert to float." 1249 | ] 1250 | }, 1251 | { 1252 | "cell_type": "code", 1253 | "execution_count": null, 1254 | "metadata": { 1255 | "collapsed": false 1256 | }, 1257 | "outputs": [], 1258 | "source": [ 1259 | "sum(float(d['cty']) for d in mpg) / len(mpg)" 1260 | ] 1261 | }, 1262 | { 1263 | "cell_type": "markdown", 1264 | "metadata": {}, 1265 | "source": [ 1266 | "
\n", 1267 | "Similarly this is how to find the average hwy fuel economy across all cars." 1268 | ] 1269 | }, 1270 | { 1271 | "cell_type": "code", 1272 | "execution_count": null, 1273 | "metadata": { 1274 | "collapsed": false 1275 | }, 1276 | "outputs": [], 1277 | "source": [ 1278 | "sum(float(d['hwy']) for d in mpg) / len(mpg)" 1279 | ] 1280 | }, 1281 | { 1282 | "cell_type": "markdown", 1283 | "metadata": {}, 1284 | "source": [ 1285 | "
\n", 1286 | "Use `set` to return the unique values for the number of cylinders the cars in our dataset have." 1287 | ] 1288 | }, 1289 | { 1290 | "cell_type": "code", 1291 | "execution_count": null, 1292 | "metadata": { 1293 | "collapsed": false 1294 | }, 1295 | "outputs": [], 1296 | "source": [ 1297 | "cylinders = set(d['cyl'] for d in mpg)\n", 1298 | "cylinders" 1299 | ] 1300 | }, 1301 | { 1302 | "cell_type": "markdown", 1303 | "metadata": {}, 1304 | "source": [ 1305 | "
\n", 1306 | "Here's a more complex example where we are grouping the cars by number of cylinder, and finding the average cty mpg for each group." 1307 | ] 1308 | }, 1309 | { 1310 | "cell_type": "code", 1311 | "execution_count": null, 1312 | "metadata": { 1313 | "collapsed": false 1314 | }, 1315 | "outputs": [], 1316 | "source": [ 1317 | "CtyMpgByCyl = []\n", 1318 | "\n", 1319 | "for c in cylinders: # iterate over all the cylinder levels\n", 1320 | " summpg = 0\n", 1321 | " cyltypecount = 0\n", 1322 | " for d in mpg: # iterate over all dictionaries\n", 1323 | " if d['cyl'] == c: # if the cylinder level type matches,\n", 1324 | " summpg += float(d['cty']) # add the cty mpg\n", 1325 | " cyltypecount += 1 # increment the count\n", 1326 | " CtyMpgByCyl.append((c, summpg / cyltypecount)) # append the tuple ('cylinder', 'avg mpg')\n", 1327 | "\n", 1328 | "CtyMpgByCyl.sort(key=lambda x: x[0])\n", 1329 | "CtyMpgByCyl" 1330 | ] 1331 | }, 1332 | { 1333 | "cell_type": "markdown", 1334 | "metadata": {}, 1335 | "source": [ 1336 | "
\n", 1337 | "Use `set` to return the unique values for the class types in our dataset." 1338 | ] 1339 | }, 1340 | { 1341 | "cell_type": "code", 1342 | "execution_count": null, 1343 | "metadata": { 1344 | "collapsed": false 1345 | }, 1346 | "outputs": [], 1347 | "source": [ 1348 | "vehicleclass = set(d['class'] for d in mpg) # what are the class types\n", 1349 | "vehicleclass" 1350 | ] 1351 | }, 1352 | { 1353 | "cell_type": "markdown", 1354 | "metadata": {}, 1355 | "source": [ 1356 | "
\n", 1357 | "And here's an example of how to find the average hwy mpg for each class of vehicle in our dataset." 1358 | ] 1359 | }, 1360 | { 1361 | "cell_type": "code", 1362 | "execution_count": null, 1363 | "metadata": { 1364 | "collapsed": false 1365 | }, 1366 | "outputs": [], 1367 | "source": [ 1368 | "HwyMpgByClass = []\n", 1369 | "\n", 1370 | "for t in vehicleclass: # iterate over all the vehicle classes\n", 1371 | " summpg = 0\n", 1372 | " vclasscount = 0\n", 1373 | " for d in mpg: # iterate over all dictionaries\n", 1374 | " if d['class'] == t: # if the cylinder amount type matches,\n", 1375 | " summpg += float(d['hwy']) # add the hwy mpg\n", 1376 | " vclasscount += 1 # increment the count\n", 1377 | " HwyMpgByClass.append((t, summpg / vclasscount)) # append the tuple ('class', 'avg mpg')\n", 1378 | "\n", 1379 | "HwyMpgByClass.sort(key=lambda x: x[1])\n", 1380 | "HwyMpgByClass" 1381 | ] 1382 | }, 1383 | { 1384 | "cell_type": "markdown", 1385 | "metadata": {}, 1386 | "source": [ 1387 | "
\n", 1388 | "# The Python Programming Language: Dates and Times" 1389 | ] 1390 | }, 1391 | { 1392 | "cell_type": "code", 1393 | "execution_count": null, 1394 | "metadata": { 1395 | "collapsed": true 1396 | }, 1397 | "outputs": [], 1398 | "source": [ 1399 | "import datetime as dt\n", 1400 | "import time as tm" 1401 | ] 1402 | }, 1403 | { 1404 | "cell_type": "markdown", 1405 | "metadata": {}, 1406 | "source": [ 1407 | "
\n", 1408 | "`time` returns the current time in seconds since the Epoch. (January 1st, 1970)" 1409 | ] 1410 | }, 1411 | { 1412 | "cell_type": "code", 1413 | "execution_count": null, 1414 | "metadata": { 1415 | "collapsed": false 1416 | }, 1417 | "outputs": [], 1418 | "source": [ 1419 | "tm.time()" 1420 | ] 1421 | }, 1422 | { 1423 | "cell_type": "markdown", 1424 | "metadata": {}, 1425 | "source": [ 1426 | "
\n", 1427 | "Convert the timestamp to datetime." 1428 | ] 1429 | }, 1430 | { 1431 | "cell_type": "code", 1432 | "execution_count": null, 1433 | "metadata": { 1434 | "collapsed": false 1435 | }, 1436 | "outputs": [], 1437 | "source": [ 1438 | "dtnow = dt.datetime.fromtimestamp(tm.time())\n", 1439 | "dtnow" 1440 | ] 1441 | }, 1442 | { 1443 | "cell_type": "markdown", 1444 | "metadata": {}, 1445 | "source": [ 1446 | "
\n", 1447 | "Handy datetime attributes:" 1448 | ] 1449 | }, 1450 | { 1451 | "cell_type": "code", 1452 | "execution_count": null, 1453 | "metadata": { 1454 | "collapsed": false 1455 | }, 1456 | "outputs": [], 1457 | "source": [ 1458 | "dtnow.year, dtnow.month, dtnow.day, dtnow.hour, dtnow.minute, dtnow.second # get year, month, day, etc.from a datetime" 1459 | ] 1460 | }, 1461 | { 1462 | "cell_type": "markdown", 1463 | "metadata": {}, 1464 | "source": [ 1465 | "
\n", 1466 | "`timedelta` is a duration expressing the difference between two dates." 1467 | ] 1468 | }, 1469 | { 1470 | "cell_type": "code", 1471 | "execution_count": null, 1472 | "metadata": { 1473 | "collapsed": false 1474 | }, 1475 | "outputs": [], 1476 | "source": [ 1477 | "delta = dt.timedelta(days = 100) # create a timedelta of 100 days\n", 1478 | "delta" 1479 | ] 1480 | }, 1481 | { 1482 | "cell_type": "markdown", 1483 | "metadata": {}, 1484 | "source": [ 1485 | "
\n", 1486 | "`date.today` returns the current local date." 1487 | ] 1488 | }, 1489 | { 1490 | "cell_type": "code", 1491 | "execution_count": null, 1492 | "metadata": { 1493 | "collapsed": true 1494 | }, 1495 | "outputs": [], 1496 | "source": [ 1497 | "today = dt.date.today()" 1498 | ] 1499 | }, 1500 | { 1501 | "cell_type": "code", 1502 | "execution_count": null, 1503 | "metadata": { 1504 | "collapsed": false 1505 | }, 1506 | "outputs": [], 1507 | "source": [ 1508 | "today - delta # the date 100 days ago" 1509 | ] 1510 | }, 1511 | { 1512 | "cell_type": "code", 1513 | "execution_count": null, 1514 | "metadata": { 1515 | "collapsed": false 1516 | }, 1517 | "outputs": [], 1518 | "source": [ 1519 | "today > today-delta # compare dates" 1520 | ] 1521 | }, 1522 | { 1523 | "cell_type": "markdown", 1524 | "metadata": {}, 1525 | "source": [ 1526 | "
\n", 1527 | "# The Python Programming Language: Objects and map()" 1528 | ] 1529 | }, 1530 | { 1531 | "cell_type": "markdown", 1532 | "metadata": {}, 1533 | "source": [ 1534 | "
\n", 1535 | "An example of a class in python:" 1536 | ] 1537 | }, 1538 | { 1539 | "cell_type": "code", 1540 | "execution_count": null, 1541 | "metadata": { 1542 | "collapsed": true 1543 | }, 1544 | "outputs": [], 1545 | "source": [ 1546 | "class Person:\n", 1547 | " department = 'School of Information' #a class variable\n", 1548 | "\n", 1549 | " def set_name(self, new_name): #a method\n", 1550 | " self.name = new_name\n", 1551 | " def set_location(self, new_location):\n", 1552 | " self.location = new_location" 1553 | ] 1554 | }, 1555 | { 1556 | "cell_type": "code", 1557 | "execution_count": null, 1558 | "metadata": { 1559 | "collapsed": false 1560 | }, 1561 | "outputs": [], 1562 | "source": [ 1563 | "person = Person()\n", 1564 | "person.set_name('Christopher Brooks')\n", 1565 | "person.set_location('Ann Arbor, MI, USA')\n", 1566 | "print('{} live in {} and works in the department {}'.format(person.name, person.location, person.department))" 1567 | ] 1568 | }, 1569 | { 1570 | "cell_type": "markdown", 1571 | "metadata": {}, 1572 | "source": [ 1573 | "
\n", 1574 | "Here's an example of mapping the `min` function between two lists." 1575 | ] 1576 | }, 1577 | { 1578 | "cell_type": "code", 1579 | "execution_count": null, 1580 | "metadata": { 1581 | "collapsed": false 1582 | }, 1583 | "outputs": [], 1584 | "source": [ 1585 | "store1 = [10.00, 11.00, 12.34, 2.34]\n", 1586 | "store2 = [9.00, 11.10, 12.34, 2.01]\n", 1587 | "cheapest = map(min, store1, store2)\n", 1588 | "cheapest" 1589 | ] 1590 | }, 1591 | { 1592 | "cell_type": "markdown", 1593 | "metadata": {}, 1594 | "source": [ 1595 | "
\n", 1596 | "Now let's iterate through the map object to see the values." 1597 | ] 1598 | }, 1599 | { 1600 | "cell_type": "code", 1601 | "execution_count": null, 1602 | "metadata": { 1603 | "collapsed": false, 1604 | "scrolled": true 1605 | }, 1606 | "outputs": [], 1607 | "source": [ 1608 | "for item in cheapest:\n", 1609 | " print(item)" 1610 | ] 1611 | }, 1612 | { 1613 | "cell_type": "markdown", 1614 | "metadata": {}, 1615 | "source": [ 1616 | "
\n", 1617 | "# The Python Programming Language: Lambda and List Comprehensions" 1618 | ] 1619 | }, 1620 | { 1621 | "cell_type": "markdown", 1622 | "metadata": {}, 1623 | "source": [ 1624 | "
\n", 1625 | "Here's an example of lambda that takes in three parameters and adds the first two." 1626 | ] 1627 | }, 1628 | { 1629 | "cell_type": "code", 1630 | "execution_count": null, 1631 | "metadata": { 1632 | "collapsed": true 1633 | }, 1634 | "outputs": [], 1635 | "source": [ 1636 | "my_function = lambda a, b, c : a + b" 1637 | ] 1638 | }, 1639 | { 1640 | "cell_type": "code", 1641 | "execution_count": null, 1642 | "metadata": { 1643 | "collapsed": false 1644 | }, 1645 | "outputs": [], 1646 | "source": [ 1647 | "my_function(1, 2, 3)" 1648 | ] 1649 | }, 1650 | { 1651 | "cell_type": "markdown", 1652 | "metadata": {}, 1653 | "source": [ 1654 | "
\n", 1655 | "Let's iterate from 0 to 999 and return the even numbers." 1656 | ] 1657 | }, 1658 | { 1659 | "cell_type": "code", 1660 | "execution_count": null, 1661 | "metadata": { 1662 | "collapsed": false 1663 | }, 1664 | "outputs": [], 1665 | "source": [ 1666 | "my_list = []\n", 1667 | "for number in range(0, 1000):\n", 1668 | " if number % 2 == 0:\n", 1669 | " my_list.append(number)\n", 1670 | "my_list" 1671 | ] 1672 | }, 1673 | { 1674 | "cell_type": "markdown", 1675 | "metadata": {}, 1676 | "source": [ 1677 | "
\n", 1678 | "Now the same thing but with list comprehension." 1679 | ] 1680 | }, 1681 | { 1682 | "cell_type": "code", 1683 | "execution_count": null, 1684 | "metadata": { 1685 | "collapsed": false 1686 | }, 1687 | "outputs": [], 1688 | "source": [ 1689 | "my_list = [number for number in range(0,1000) if number % 2 == 0]\n", 1690 | "my_list" 1691 | ] 1692 | }, 1693 | { 1694 | "cell_type": "markdown", 1695 | "metadata": { 1696 | "collapsed": true 1697 | }, 1698 | "source": [ 1699 | "
\n", 1700 | "# The Python Programming Language: Numerical Python (NumPy)" 1701 | ] 1702 | }, 1703 | { 1704 | "cell_type": "code", 1705 | "execution_count": 3, 1706 | "metadata": { 1707 | "collapsed": true 1708 | }, 1709 | "outputs": [], 1710 | "source": [ 1711 | "import numpy as np" 1712 | ] 1713 | }, 1714 | { 1715 | "cell_type": "code", 1716 | "execution_count": null, 1717 | "metadata": { 1718 | "collapsed": true 1719 | }, 1720 | "outputs": [], 1721 | "source": [] 1722 | }, 1723 | { 1724 | "cell_type": "markdown", 1725 | "metadata": {}, 1726 | "source": [ 1727 | "
\n", 1728 | "## Creating Arrays" 1729 | ] 1730 | }, 1731 | { 1732 | "cell_type": "markdown", 1733 | "metadata": {}, 1734 | "source": [ 1735 | "Create a list and convert it to a numpy array" 1736 | ] 1737 | }, 1738 | { 1739 | "cell_type": "code", 1740 | "execution_count": 4, 1741 | "metadata": { 1742 | "collapsed": false 1743 | }, 1744 | "outputs": [ 1745 | { 1746 | "data": { 1747 | "text/plain": [ 1748 | "array([1, 2, 3])" 1749 | ] 1750 | }, 1751 | "execution_count": 4, 1752 | "metadata": {}, 1753 | "output_type": "execute_result" 1754 | } 1755 | ], 1756 | "source": [ 1757 | "mylist = [1, 2, 3]\n", 1758 | "x = np.array(mylist)\n", 1759 | "x" 1760 | ] 1761 | }, 1762 | { 1763 | "cell_type": "code", 1764 | "execution_count": null, 1765 | "metadata": { 1766 | "collapsed": true 1767 | }, 1768 | "outputs": [], 1769 | "source": [] 1770 | }, 1771 | { 1772 | "cell_type": "code", 1773 | "execution_count": null, 1774 | "metadata": { 1775 | "collapsed": true 1776 | }, 1777 | "outputs": [], 1778 | "source": [] 1779 | }, 1780 | { 1781 | "cell_type": "markdown", 1782 | "metadata": {}, 1783 | "source": [ 1784 | "
\n", 1785 | "Or just pass in a list directly" 1786 | ] 1787 | }, 1788 | { 1789 | "cell_type": "code", 1790 | "execution_count": null, 1791 | "metadata": { 1792 | "collapsed": false 1793 | }, 1794 | "outputs": [], 1795 | "source": [ 1796 | "y = np.array([4, 5, 6])\n", 1797 | "y" 1798 | ] 1799 | }, 1800 | { 1801 | "cell_type": "markdown", 1802 | "metadata": {}, 1803 | "source": [ 1804 | "
\n", 1805 | "Pass in a list of lists to create a multidimensional array." 1806 | ] 1807 | }, 1808 | { 1809 | "cell_type": "code", 1810 | "execution_count": null, 1811 | "metadata": { 1812 | "collapsed": false 1813 | }, 1814 | "outputs": [], 1815 | "source": [ 1816 | "m = np.array([[7, 8, 9], [10, 11, 12]])\n", 1817 | "m" 1818 | ] 1819 | }, 1820 | { 1821 | "cell_type": "markdown", 1822 | "metadata": {}, 1823 | "source": [ 1824 | "
\n", 1825 | "Use the shape method to find the dimensions of the array. (rows, columns)" 1826 | ] 1827 | }, 1828 | { 1829 | "cell_type": "code", 1830 | "execution_count": null, 1831 | "metadata": { 1832 | "collapsed": false 1833 | }, 1834 | "outputs": [], 1835 | "source": [ 1836 | "m.shape" 1837 | ] 1838 | }, 1839 | { 1840 | "cell_type": "markdown", 1841 | "metadata": {}, 1842 | "source": [ 1843 | "
\n", 1844 | "`arange` returns evenly spaced values within a given interval." 1845 | ] 1846 | }, 1847 | { 1848 | "cell_type": "code", 1849 | "execution_count": null, 1850 | "metadata": { 1851 | "collapsed": false 1852 | }, 1853 | "outputs": [], 1854 | "source": [ 1855 | "n = np.arange(0, 30, 2) # start at 0 count up by 2, stop before 30\n", 1856 | "n" 1857 | ] 1858 | }, 1859 | { 1860 | "cell_type": "markdown", 1861 | "metadata": {}, 1862 | "source": [ 1863 | "
\n", 1864 | "`reshape` returns an array with the same data with a new shape." 1865 | ] 1866 | }, 1867 | { 1868 | "cell_type": "code", 1869 | "execution_count": 1, 1870 | "metadata": { 1871 | "collapsed": false 1872 | }, 1873 | "outputs": [ 1874 | { 1875 | "ename": "NameError", 1876 | "evalue": "name 'n' is not defined", 1877 | "output_type": "error", 1878 | "traceback": [ 1879 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 1880 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 1881 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# reshape array to be 3x5\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1882 | "\u001b[0;31mNameError\u001b[0m: name 'n' is not defined" 1883 | ] 1884 | } 1885 | ], 1886 | "source": [ 1887 | "n = n.reshape(3, 5) # reshape array to be 3x5\n", 1888 | "n" 1889 | ] 1890 | }, 1891 | { 1892 | "cell_type": "code", 1893 | "execution_count": null, 1894 | "metadata": { 1895 | "collapsed": true 1896 | }, 1897 | "outputs": [], 1898 | "source": [] 1899 | }, 1900 | { 1901 | "cell_type": "markdown", 1902 | "metadata": {}, 1903 | "source": [ 1904 | "
\n", 1905 | "`linspace` returns evenly spaced numbers over a specified interval." 1906 | ] 1907 | }, 1908 | { 1909 | "cell_type": "code", 1910 | "execution_count": null, 1911 | "metadata": { 1912 | "collapsed": false 1913 | }, 1914 | "outputs": [], 1915 | "source": [ 1916 | "o = np.linspace(0, 4, 9) # return 9 evenly spaced values from 0 to 4\n", 1917 | "o" 1918 | ] 1919 | }, 1920 | { 1921 | "cell_type": "markdown", 1922 | "metadata": {}, 1923 | "source": [ 1924 | "
\n", 1925 | "`resize` changes the shape and size of array in-place." 1926 | ] 1927 | }, 1928 | { 1929 | "cell_type": "code", 1930 | "execution_count": null, 1931 | "metadata": { 1932 | "collapsed": false 1933 | }, 1934 | "outputs": [], 1935 | "source": [ 1936 | "o.resize(3, 3)\n", 1937 | "o" 1938 | ] 1939 | }, 1940 | { 1941 | "cell_type": "markdown", 1942 | "metadata": {}, 1943 | "source": [ 1944 | "
\n", 1945 | "`ones` returns a new array of given shape and type, filled with ones." 1946 | ] 1947 | }, 1948 | { 1949 | "cell_type": "code", 1950 | "execution_count": null, 1951 | "metadata": { 1952 | "collapsed": false 1953 | }, 1954 | "outputs": [], 1955 | "source": [ 1956 | "np.ones((3, 2))" 1957 | ] 1958 | }, 1959 | { 1960 | "cell_type": "markdown", 1961 | "metadata": {}, 1962 | "source": [ 1963 | "
\n", 1964 | "`zeros` returns a new array of given shape and type, filled with zeros." 1965 | ] 1966 | }, 1967 | { 1968 | "cell_type": "code", 1969 | "execution_count": null, 1970 | "metadata": { 1971 | "collapsed": false 1972 | }, 1973 | "outputs": [], 1974 | "source": [ 1975 | "np.zeros((2, 3))" 1976 | ] 1977 | }, 1978 | { 1979 | "cell_type": "markdown", 1980 | "metadata": {}, 1981 | "source": [ 1982 | "
\n", 1983 | "`eye` returns a 2-D array with ones on the diagonal and zeros elsewhere." 1984 | ] 1985 | }, 1986 | { 1987 | "cell_type": "code", 1988 | "execution_count": null, 1989 | "metadata": { 1990 | "collapsed": false 1991 | }, 1992 | "outputs": [], 1993 | "source": [ 1994 | "np.eye(3)" 1995 | ] 1996 | }, 1997 | { 1998 | "cell_type": "markdown", 1999 | "metadata": {}, 2000 | "source": [ 2001 | "
\n", 2002 | "`diag` extracts a diagonal or constructs a diagonal array." 2003 | ] 2004 | }, 2005 | { 2006 | "cell_type": "code", 2007 | "execution_count": null, 2008 | "metadata": { 2009 | "collapsed": false 2010 | }, 2011 | "outputs": [], 2012 | "source": [ 2013 | "np.diag(y)" 2014 | ] 2015 | }, 2016 | { 2017 | "cell_type": "markdown", 2018 | "metadata": {}, 2019 | "source": [ 2020 | "
\n", 2021 | "Create an array using repeating list (or see `np.tile`)" 2022 | ] 2023 | }, 2024 | { 2025 | "cell_type": "code", 2026 | "execution_count": null, 2027 | "metadata": { 2028 | "collapsed": false 2029 | }, 2030 | "outputs": [], 2031 | "source": [ 2032 | "np.array([1, 2, 3] * 3)" 2033 | ] 2034 | }, 2035 | { 2036 | "cell_type": "markdown", 2037 | "metadata": {}, 2038 | "source": [ 2039 | "
\n", 2040 | "Repeat elements of an array using `repeat`." 2041 | ] 2042 | }, 2043 | { 2044 | "cell_type": "code", 2045 | "execution_count": null, 2046 | "metadata": { 2047 | "collapsed": false 2048 | }, 2049 | "outputs": [], 2050 | "source": [ 2051 | "np.repeat([1, 2, 3], 3)" 2052 | ] 2053 | }, 2054 | { 2055 | "cell_type": "markdown", 2056 | "metadata": {}, 2057 | "source": [ 2058 | "
\n", 2059 | "#### Combining Arrays" 2060 | ] 2061 | }, 2062 | { 2063 | "cell_type": "code", 2064 | "execution_count": null, 2065 | "metadata": { 2066 | "collapsed": false 2067 | }, 2068 | "outputs": [], 2069 | "source": [ 2070 | "p = np.ones([2, 3], int)\n", 2071 | "p" 2072 | ] 2073 | }, 2074 | { 2075 | "cell_type": "markdown", 2076 | "metadata": {}, 2077 | "source": [ 2078 | "
\n", 2079 | "Use `vstack` to stack arrays in sequence vertically (row wise)." 2080 | ] 2081 | }, 2082 | { 2083 | "cell_type": "code", 2084 | "execution_count": null, 2085 | "metadata": { 2086 | "collapsed": false 2087 | }, 2088 | "outputs": [], 2089 | "source": [ 2090 | "np.vstack([p, 2*p])" 2091 | ] 2092 | }, 2093 | { 2094 | "cell_type": "markdown", 2095 | "metadata": {}, 2096 | "source": [ 2097 | "
\n", 2098 | "Use `hstack` to stack arrays in sequence horizontally (column wise)." 2099 | ] 2100 | }, 2101 | { 2102 | "cell_type": "code", 2103 | "execution_count": null, 2104 | "metadata": { 2105 | "collapsed": false 2106 | }, 2107 | "outputs": [], 2108 | "source": [ 2109 | "np.hstack([p, 2*p])" 2110 | ] 2111 | }, 2112 | { 2113 | "cell_type": "markdown", 2114 | "metadata": {}, 2115 | "source": [ 2116 | "
\n", 2117 | "## Operations" 2118 | ] 2119 | }, 2120 | { 2121 | "cell_type": "markdown", 2122 | "metadata": {}, 2123 | "source": [ 2124 | "Use `+`, `-`, `*`, `/` and `**` to perform element wise addition, subtraction, multiplication, division and power." 2125 | ] 2126 | }, 2127 | { 2128 | "cell_type": "code", 2129 | "execution_count": null, 2130 | "metadata": { 2131 | "collapsed": false 2132 | }, 2133 | "outputs": [], 2134 | "source": [ 2135 | "print(x + y) # elementwise addition [1 2 3] + [4 5 6] = [5 7 9]\n", 2136 | "print(x - y) # elementwise subtraction [1 2 3] - [4 5 6] = [-3 -3 -3]" 2137 | ] 2138 | }, 2139 | { 2140 | "cell_type": "code", 2141 | "execution_count": null, 2142 | "metadata": { 2143 | "collapsed": false 2144 | }, 2145 | "outputs": [], 2146 | "source": [ 2147 | "print(x * y) # elementwise multiplication [1 2 3] * [4 5 6] = [4 10 18]\n", 2148 | "print(x / y) # elementwise divison [1 2 3] / [4 5 6] = [0.25 0.4 0.5]" 2149 | ] 2150 | }, 2151 | { 2152 | "cell_type": "code", 2153 | "execution_count": null, 2154 | "metadata": { 2155 | "collapsed": false 2156 | }, 2157 | "outputs": [], 2158 | "source": [ 2159 | "print(x**2) # elementwise power [1 2 3] ^2 = [1 4 9]" 2160 | ] 2161 | }, 2162 | { 2163 | "cell_type": "markdown", 2164 | "metadata": {}, 2165 | "source": [ 2166 | "
\n", 2167 | "**Dot Product:** \n", 2168 | "\n", 2169 | "$ \\begin{bmatrix}x_1 \\ x_2 \\ x_3\\end{bmatrix}\n", 2170 | "\\cdot\n", 2171 | "\\begin{bmatrix}y_1 \\\\ y_2 \\\\ y_3\\end{bmatrix}\n", 2172 | "= x_1 y_1 + x_2 y_2 + x_3 y_3$" 2173 | ] 2174 | }, 2175 | { 2176 | "cell_type": "code", 2177 | "execution_count": null, 2178 | "metadata": { 2179 | "collapsed": false 2180 | }, 2181 | "outputs": [], 2182 | "source": [ 2183 | "x.dot(y) # dot product 1*4 + 2*5 + 3*6" 2184 | ] 2185 | }, 2186 | { 2187 | "cell_type": "code", 2188 | "execution_count": null, 2189 | "metadata": { 2190 | "collapsed": false 2191 | }, 2192 | "outputs": [], 2193 | "source": [ 2194 | "z = np.array([y, y**2])\n", 2195 | "print(len(z)) # number of rows of array" 2196 | ] 2197 | }, 2198 | { 2199 | "cell_type": "markdown", 2200 | "metadata": {}, 2201 | "source": [ 2202 | "
\n", 2203 | "Let's look at transposing arrays. Transposing permutes the dimensions of the array." 2204 | ] 2205 | }, 2206 | { 2207 | "cell_type": "code", 2208 | "execution_count": null, 2209 | "metadata": { 2210 | "collapsed": false 2211 | }, 2212 | "outputs": [], 2213 | "source": [ 2214 | "z = np.array([y, y**2])\n", 2215 | "z" 2216 | ] 2217 | }, 2218 | { 2219 | "cell_type": "markdown", 2220 | "metadata": {}, 2221 | "source": [ 2222 | "
\n", 2223 | "The shape of array `z` is `(2,3)` before transposing." 2224 | ] 2225 | }, 2226 | { 2227 | "cell_type": "code", 2228 | "execution_count": null, 2229 | "metadata": { 2230 | "collapsed": false 2231 | }, 2232 | "outputs": [], 2233 | "source": [ 2234 | "z.shape" 2235 | ] 2236 | }, 2237 | { 2238 | "cell_type": "markdown", 2239 | "metadata": {}, 2240 | "source": [ 2241 | "
\n", 2242 | "Use `.T` to get the transpose." 2243 | ] 2244 | }, 2245 | { 2246 | "cell_type": "code", 2247 | "execution_count": null, 2248 | "metadata": { 2249 | "collapsed": false 2250 | }, 2251 | "outputs": [], 2252 | "source": [ 2253 | "z.T" 2254 | ] 2255 | }, 2256 | { 2257 | "cell_type": "markdown", 2258 | "metadata": {}, 2259 | "source": [ 2260 | "
\n", 2261 | "The number of rows has swapped with the number of columns." 2262 | ] 2263 | }, 2264 | { 2265 | "cell_type": "code", 2266 | "execution_count": null, 2267 | "metadata": { 2268 | "collapsed": false 2269 | }, 2270 | "outputs": [], 2271 | "source": [ 2272 | "z.T.shape" 2273 | ] 2274 | }, 2275 | { 2276 | "cell_type": "markdown", 2277 | "metadata": {}, 2278 | "source": [ 2279 | "
\n", 2280 | "Use `.dtype` to see the data type of the elements in the array." 2281 | ] 2282 | }, 2283 | { 2284 | "cell_type": "code", 2285 | "execution_count": null, 2286 | "metadata": { 2287 | "collapsed": false 2288 | }, 2289 | "outputs": [], 2290 | "source": [ 2291 | "z.dtype" 2292 | ] 2293 | }, 2294 | { 2295 | "cell_type": "markdown", 2296 | "metadata": {}, 2297 | "source": [ 2298 | "
\n", 2299 | "Use `.astype` to cast to a specific type." 2300 | ] 2301 | }, 2302 | { 2303 | "cell_type": "code", 2304 | "execution_count": null, 2305 | "metadata": { 2306 | "collapsed": false 2307 | }, 2308 | "outputs": [], 2309 | "source": [ 2310 | "z = z.astype('f')\n", 2311 | "z.dtype" 2312 | ] 2313 | }, 2314 | { 2315 | "cell_type": "markdown", 2316 | "metadata": {}, 2317 | "source": [ 2318 | "
\n", 2319 | "## Math Functions" 2320 | ] 2321 | }, 2322 | { 2323 | "cell_type": "markdown", 2324 | "metadata": {}, 2325 | "source": [ 2326 | "Numpy has many built in math functions that can be performed on arrays." 2327 | ] 2328 | }, 2329 | { 2330 | "cell_type": "code", 2331 | "execution_count": null, 2332 | "metadata": { 2333 | "collapsed": true 2334 | }, 2335 | "outputs": [], 2336 | "source": [ 2337 | "a = np.array([-4, -2, 1, 3, 5])" 2338 | ] 2339 | }, 2340 | { 2341 | "cell_type": "code", 2342 | "execution_count": null, 2343 | "metadata": { 2344 | "collapsed": false 2345 | }, 2346 | "outputs": [], 2347 | "source": [ 2348 | "a.sum()" 2349 | ] 2350 | }, 2351 | { 2352 | "cell_type": "code", 2353 | "execution_count": null, 2354 | "metadata": { 2355 | "collapsed": false 2356 | }, 2357 | "outputs": [], 2358 | "source": [ 2359 | "a.max()" 2360 | ] 2361 | }, 2362 | { 2363 | "cell_type": "code", 2364 | "execution_count": null, 2365 | "metadata": { 2366 | "collapsed": false 2367 | }, 2368 | "outputs": [], 2369 | "source": [ 2370 | "a.min()" 2371 | ] 2372 | }, 2373 | { 2374 | "cell_type": "code", 2375 | "execution_count": null, 2376 | "metadata": { 2377 | "collapsed": false 2378 | }, 2379 | "outputs": [], 2380 | "source": [ 2381 | "a.mean()" 2382 | ] 2383 | }, 2384 | { 2385 | "cell_type": "code", 2386 | "execution_count": null, 2387 | "metadata": { 2388 | "collapsed": false 2389 | }, 2390 | "outputs": [], 2391 | "source": [ 2392 | "a.std()" 2393 | ] 2394 | }, 2395 | { 2396 | "cell_type": "markdown", 2397 | "metadata": {}, 2398 | "source": [ 2399 | "
\n", 2400 | "`argmax` and `argmin` return the index of the maximum and minimum values in the array." 2401 | ] 2402 | }, 2403 | { 2404 | "cell_type": "code", 2405 | "execution_count": null, 2406 | "metadata": { 2407 | "collapsed": false 2408 | }, 2409 | "outputs": [], 2410 | "source": [ 2411 | "a.argmax()" 2412 | ] 2413 | }, 2414 | { 2415 | "cell_type": "code", 2416 | "execution_count": null, 2417 | "metadata": { 2418 | "collapsed": false 2419 | }, 2420 | "outputs": [], 2421 | "source": [ 2422 | "a.argmin()" 2423 | ] 2424 | }, 2425 | { 2426 | "cell_type": "markdown", 2427 | "metadata": {}, 2428 | "source": [ 2429 | "
\n", 2430 | "## Indexing / Slicing" 2431 | ] 2432 | }, 2433 | { 2434 | "cell_type": "code", 2435 | "execution_count": null, 2436 | "metadata": { 2437 | "collapsed": false 2438 | }, 2439 | "outputs": [], 2440 | "source": [ 2441 | "s = np.arange(13)**2\n", 2442 | "s" 2443 | ] 2444 | }, 2445 | { 2446 | "cell_type": "markdown", 2447 | "metadata": {}, 2448 | "source": [ 2449 | "
\n", 2450 | "Use bracket notation to get the value at a specific index. Remember that indexing starts at 0." 2451 | ] 2452 | }, 2453 | { 2454 | "cell_type": "code", 2455 | "execution_count": null, 2456 | "metadata": { 2457 | "collapsed": false 2458 | }, 2459 | "outputs": [], 2460 | "source": [ 2461 | "s[0], s[4], s[-1]" 2462 | ] 2463 | }, 2464 | { 2465 | "cell_type": "markdown", 2466 | "metadata": {}, 2467 | "source": [ 2468 | "
\n", 2469 | "Use `:` to indicate a range. `array[start:stop]`\n", 2470 | "\n", 2471 | "\n", 2472 | "Leaving `start` or `stop` empty will default to the beginning/end of the array." 2473 | ] 2474 | }, 2475 | { 2476 | "cell_type": "code", 2477 | "execution_count": null, 2478 | "metadata": { 2479 | "collapsed": false 2480 | }, 2481 | "outputs": [], 2482 | "source": [ 2483 | "s[1:5]" 2484 | ] 2485 | }, 2486 | { 2487 | "cell_type": "markdown", 2488 | "metadata": {}, 2489 | "source": [ 2490 | "
\n", 2491 | "Use negatives to count from the back." 2492 | ] 2493 | }, 2494 | { 2495 | "cell_type": "code", 2496 | "execution_count": null, 2497 | "metadata": { 2498 | "collapsed": false 2499 | }, 2500 | "outputs": [], 2501 | "source": [ 2502 | "s[-4:]" 2503 | ] 2504 | }, 2505 | { 2506 | "cell_type": "markdown", 2507 | "metadata": {}, 2508 | "source": [ 2509 | "
\n", 2510 | "A second `:` can be used to indicate step-size. `array[start:stop:stepsize]`\n", 2511 | "\n", 2512 | "Here we are starting 5th element from the end, and counting backwards by 2 until the beginning of the array is reached." 2513 | ] 2514 | }, 2515 | { 2516 | "cell_type": "code", 2517 | "execution_count": null, 2518 | "metadata": { 2519 | "collapsed": false 2520 | }, 2521 | "outputs": [], 2522 | "source": [ 2523 | "s[-5::-2]" 2524 | ] 2525 | }, 2526 | { 2527 | "cell_type": "markdown", 2528 | "metadata": { 2529 | "collapsed": false 2530 | }, 2531 | "source": [ 2532 | "
\n", 2533 | "Let's look at a multidimensional array." 2534 | ] 2535 | }, 2536 | { 2537 | "cell_type": "code", 2538 | "execution_count": null, 2539 | "metadata": { 2540 | "collapsed": false 2541 | }, 2542 | "outputs": [], 2543 | "source": [ 2544 | "r = np.arange(36)\n", 2545 | "r.resize((6, 6))\n", 2546 | "r" 2547 | ] 2548 | }, 2549 | { 2550 | "cell_type": "markdown", 2551 | "metadata": {}, 2552 | "source": [ 2553 | "
\n", 2554 | "Use bracket notation to slice: `array[row, column]`" 2555 | ] 2556 | }, 2557 | { 2558 | "cell_type": "code", 2559 | "execution_count": null, 2560 | "metadata": { 2561 | "collapsed": false 2562 | }, 2563 | "outputs": [], 2564 | "source": [ 2565 | "r[2, 2]" 2566 | ] 2567 | }, 2568 | { 2569 | "cell_type": "markdown", 2570 | "metadata": {}, 2571 | "source": [ 2572 | "
\n", 2573 | "And use : to select a range of rows or columns" 2574 | ] 2575 | }, 2576 | { 2577 | "cell_type": "code", 2578 | "execution_count": null, 2579 | "metadata": { 2580 | "collapsed": false 2581 | }, 2582 | "outputs": [], 2583 | "source": [ 2584 | "r[3, 3:6]" 2585 | ] 2586 | }, 2587 | { 2588 | "cell_type": "markdown", 2589 | "metadata": {}, 2590 | "source": [ 2591 | "
\n", 2592 | "Here we are selecting all the rows up to (and not including) row 2, and all the columns up to (and not including) the last column." 2593 | ] 2594 | }, 2595 | { 2596 | "cell_type": "code", 2597 | "execution_count": null, 2598 | "metadata": { 2599 | "collapsed": false 2600 | }, 2601 | "outputs": [], 2602 | "source": [ 2603 | "r[:2, :-1]" 2604 | ] 2605 | }, 2606 | { 2607 | "cell_type": "markdown", 2608 | "metadata": {}, 2609 | "source": [ 2610 | "
\n", 2611 | "This is a slice of the last row, and only every other element." 2612 | ] 2613 | }, 2614 | { 2615 | "cell_type": "code", 2616 | "execution_count": null, 2617 | "metadata": { 2618 | "collapsed": false 2619 | }, 2620 | "outputs": [], 2621 | "source": [ 2622 | "r[-1, ::2]" 2623 | ] 2624 | }, 2625 | { 2626 | "cell_type": "markdown", 2627 | "metadata": {}, 2628 | "source": [ 2629 | "
\n", 2630 | "We can also perform conditional indexing. Here we are selecting values from the array that are greater than 30. (Also see `np.where`)" 2631 | ] 2632 | }, 2633 | { 2634 | "cell_type": "code", 2635 | "execution_count": null, 2636 | "metadata": { 2637 | "collapsed": false 2638 | }, 2639 | "outputs": [], 2640 | "source": [ 2641 | "r[r > 30]" 2642 | ] 2643 | }, 2644 | { 2645 | "cell_type": "markdown", 2646 | "metadata": {}, 2647 | "source": [ 2648 | "
\n", 2649 | "Here we are assigning all values in the array that are greater than 30 to the value of 30." 2650 | ] 2651 | }, 2652 | { 2653 | "cell_type": "code", 2654 | "execution_count": null, 2655 | "metadata": { 2656 | "collapsed": false 2657 | }, 2658 | "outputs": [], 2659 | "source": [ 2660 | "r[r > 30] = 30\n", 2661 | "r" 2662 | ] 2663 | }, 2664 | { 2665 | "cell_type": "markdown", 2666 | "metadata": {}, 2667 | "source": [ 2668 | "
\n", 2669 | "## Copying Data" 2670 | ] 2671 | }, 2672 | { 2673 | "cell_type": "markdown", 2674 | "metadata": {}, 2675 | "source": [ 2676 | "Be careful with copying and modifying arrays in NumPy!\n", 2677 | "\n", 2678 | "\n", 2679 | "`r2` is a slice of `r`" 2680 | ] 2681 | }, 2682 | { 2683 | "cell_type": "code", 2684 | "execution_count": null, 2685 | "metadata": { 2686 | "collapsed": false 2687 | }, 2688 | "outputs": [], 2689 | "source": [ 2690 | "r2 = r[:3,:3]\n", 2691 | "r2" 2692 | ] 2693 | }, 2694 | { 2695 | "cell_type": "markdown", 2696 | "metadata": {}, 2697 | "source": [ 2698 | "
\n", 2699 | "Set this slice's values to zero ([:] selects the entire array)" 2700 | ] 2701 | }, 2702 | { 2703 | "cell_type": "code", 2704 | "execution_count": null, 2705 | "metadata": { 2706 | "collapsed": false 2707 | }, 2708 | "outputs": [], 2709 | "source": [ 2710 | "r2[:] = 0\n", 2711 | "r2" 2712 | ] 2713 | }, 2714 | { 2715 | "cell_type": "markdown", 2716 | "metadata": {}, 2717 | "source": [ 2718 | "
\n", 2719 | "`r` has also been changed!" 2720 | ] 2721 | }, 2722 | { 2723 | "cell_type": "code", 2724 | "execution_count": null, 2725 | "metadata": { 2726 | "collapsed": false 2727 | }, 2728 | "outputs": [], 2729 | "source": [ 2730 | "r" 2731 | ] 2732 | }, 2733 | { 2734 | "cell_type": "markdown", 2735 | "metadata": {}, 2736 | "source": [ 2737 | "
\n", 2738 | "To avoid this, use `r.copy` to create a copy that will not affect the original array" 2739 | ] 2740 | }, 2741 | { 2742 | "cell_type": "code", 2743 | "execution_count": null, 2744 | "metadata": { 2745 | "collapsed": false 2746 | }, 2747 | "outputs": [], 2748 | "source": [ 2749 | "r_copy = r.copy()\n", 2750 | "r_copy" 2751 | ] 2752 | }, 2753 | { 2754 | "cell_type": "markdown", 2755 | "metadata": {}, 2756 | "source": [ 2757 | "
\n", 2758 | "Now when r_copy is modified, r will not be changed." 2759 | ] 2760 | }, 2761 | { 2762 | "cell_type": "code", 2763 | "execution_count": null, 2764 | "metadata": { 2765 | "collapsed": false 2766 | }, 2767 | "outputs": [], 2768 | "source": [ 2769 | "r_copy[:] = 10\n", 2770 | "print(r_copy, '\\n')\n", 2771 | "print(r)" 2772 | ] 2773 | }, 2774 | { 2775 | "cell_type": "markdown", 2776 | "metadata": {}, 2777 | "source": [ 2778 | "
\n", 2779 | "### Iterating Over Arrays" 2780 | ] 2781 | }, 2782 | { 2783 | "cell_type": "markdown", 2784 | "metadata": {}, 2785 | "source": [ 2786 | "Let's create a new 4 by 3 array of random numbers 0-9." 2787 | ] 2788 | }, 2789 | { 2790 | "cell_type": "code", 2791 | "execution_count": null, 2792 | "metadata": { 2793 | "collapsed": false 2794 | }, 2795 | "outputs": [], 2796 | "source": [ 2797 | "test = np.random.randint(0, 10, (4,3))\n", 2798 | "test" 2799 | ] 2800 | }, 2801 | { 2802 | "cell_type": "markdown", 2803 | "metadata": {}, 2804 | "source": [ 2805 | "
\n", 2806 | "Iterate by row:" 2807 | ] 2808 | }, 2809 | { 2810 | "cell_type": "code", 2811 | "execution_count": null, 2812 | "metadata": { 2813 | "collapsed": false 2814 | }, 2815 | "outputs": [], 2816 | "source": [ 2817 | "for row in test:\n", 2818 | " print(row)" 2819 | ] 2820 | }, 2821 | { 2822 | "cell_type": "markdown", 2823 | "metadata": {}, 2824 | "source": [ 2825 | "
\n", 2826 | "Iterate by index:" 2827 | ] 2828 | }, 2829 | { 2830 | "cell_type": "code", 2831 | "execution_count": null, 2832 | "metadata": { 2833 | "collapsed": false 2834 | }, 2835 | "outputs": [], 2836 | "source": [ 2837 | "for i in range(len(test)):\n", 2838 | " print(test[i])" 2839 | ] 2840 | }, 2841 | { 2842 | "cell_type": "markdown", 2843 | "metadata": {}, 2844 | "source": [ 2845 | "
\n", 2846 | "Iterate by row and index:" 2847 | ] 2848 | }, 2849 | { 2850 | "cell_type": "code", 2851 | "execution_count": null, 2852 | "metadata": { 2853 | "collapsed": false 2854 | }, 2855 | "outputs": [], 2856 | "source": [ 2857 | "for i, row in enumerate(test):\n", 2858 | " print('row', i, 'is', row)" 2859 | ] 2860 | }, 2861 | { 2862 | "cell_type": "markdown", 2863 | "metadata": {}, 2864 | "source": [ 2865 | "
\n", 2866 | "Use `zip` to iterate over multiple iterables." 2867 | ] 2868 | }, 2869 | { 2870 | "cell_type": "code", 2871 | "execution_count": null, 2872 | "metadata": { 2873 | "collapsed": false 2874 | }, 2875 | "outputs": [], 2876 | "source": [ 2877 | "test2 = test**2\n", 2878 | "test2" 2879 | ] 2880 | }, 2881 | { 2882 | "cell_type": "code", 2883 | "execution_count": null, 2884 | "metadata": { 2885 | "collapsed": false 2886 | }, 2887 | "outputs": [], 2888 | "source": [ 2889 | "for i, j in zip(test, test2):\n", 2890 | " print(i,'+',j,'=',i+j)" 2891 | ] 2892 | } 2893 | ], 2894 | "metadata": { 2895 | "kernelspec": { 2896 | "display_name": "Python 3", 2897 | "language": "python", 2898 | "name": "python3" 2899 | }, 2900 | "language_info": { 2901 | "codemirror_mode": { 2902 | "name": "ipython", 2903 | "version": 3 2904 | }, 2905 | "file_extension": ".py", 2906 | "mimetype": "text/x-python", 2907 | "name": "python", 2908 | "nbconvert_exporter": "python", 2909 | "pygments_lexer": "ipython3", 2910 | "version": "3.5.2" 2911 | } 2912 | }, 2913 | "nbformat": 4, 2914 | "nbformat_minor": 0 2915 | } 2916 | --------------------------------------------------------------------------------