├── Week3
├── Week3.py
├── Week3 slides.pdf
├── Assignment - Week 3
│ ├── scimagojr-3.xlsx
│ ├── Energy Indicators.xls
│ └── Assignment+3.ipynb
└── Week+3.ipynb
├── Week4
├── Week4 slides.pdf
└── Week+4.ipynb
├── Week1
├── Week1.py
└── Week+1.ipynb
├── Week2
└── Week2.py
└── README.md
/Week3/Week3.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Week3/Week3 slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yanniey/Coursera_Intro_to_Data_Science_with_Python/HEAD/Week3/Week3 slides.pdf
--------------------------------------------------------------------------------
/Week4/Week4 slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yanniey/Coursera_Intro_to_Data_Science_with_Python/HEAD/Week4/Week4 slides.pdf
--------------------------------------------------------------------------------
/Week3/Assignment - Week 3/scimagojr-3.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yanniey/Coursera_Intro_to_Data_Science_with_Python/HEAD/Week3/Assignment - Week 3/scimagojr-3.xlsx
--------------------------------------------------------------------------------
/Week3/Assignment - Week 3/Energy Indicators.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yanniey/Coursera_Intro_to_Data_Science_with_Python/HEAD/Week3/Assignment - Week 3/Energy Indicators.xls
--------------------------------------------------------------------------------
/Week1/Week1.py:
--------------------------------------------------------------------------------
people = ['Dr. Christopher Brooks', 'Dr. Kevyn Collins-Thompson', 'Dr. VG Vinod Vydiswaran', 'Dr. Daniel Romero']

# Kept as a module-level name for backward compatibility; it is rebuilt
# (not appended to) on every call so repeat calls no longer accumulate duplicates.
titleName = []


def split_title_and_name():
    """Build, print, and return the list of '<title> <lastname>' strings for ``people``.

    Fixes over the original:
    - rebuilds ``titleName`` in place instead of appending, so calling the
      function twice no longer duplicates entries;
    - returns the list so callers can use the result instead of relying on
      the printed side effect.
    """
    titleName[:] = [person.split(" ")[0] + " " + person.split(" ")[-1]
                    for person in people]
    print(titleName)
    return titleName


split_title_and_name()
# One-liner equivalent (the original commented attempt was missing a closing
# parenthesis and mapped a zero-argument function over `people`):
# list(map(lambda person: person.split(" ")[0] + " " + person.split(" ")[-1], people))
--------------------------------------------------------------------------------
/Week2/Week2.py:
--------------------------------------------------------------------------------
1 | Week 2 Assignment
2 |
3 | Question 1
4 | Which country has won the most gold medals in summer games?
5 | This function should return a single string value.
6 |
7 | ```
8 | def answer_one():
9 | return df['Gold'].idxmax()
10 |
11 | answer_one()
12 | ```
13 |
14 | Question 2
15 | Which country had the biggest difference between their summer and winter gold medal counts?
16 | This function should return a single string value.
17 | ```
18 | def answer_two():
19 | max_diff=max(df['Gold']-df['Gold.1'])
20 | answer = df[(df['Gold']-df['Gold.1'])==max_diff].index.tolist()
21 | return answer[0]
22 |
23 | answer_two()
24 | ```
25 |
26 | Question 3
27 |
28 | Which country has the biggest difference between their summer gold medal counts and winter gold medal counts relative to their total gold medal count?
29 | (Summer Gold−Winter Gold)/Total Gold
30 |
31 | Only include countries that have won at least 1 gold in both summer and winter.
32 | This function should return a single string value.
33 | ```
34 | def answer_three():
35 | df_nozero = df[(df['Gold']>0) & (df['Gold.1']>0)]
36 | percentage = max(abs((df_nozero['Gold']-df_nozero['Gold.1'])/df_nozero['Gold.2']))
37 | return df[((df['Gold']-df['Gold.1'])/df['Gold.2'])==percentage].index.tolist()[0]
38 |
39 | answer_three()
40 | ```
41 |
42 |
43 | Question 4
44 | Write a function that creates a Series called "Points" which is a weighted value where each gold medal (Gold.2) counts for 3 points, silver medals (Silver.2) for 2 points, and bronze medals (Bronze.2) for 1 point. The function should return only the column (a Series object) which you created.
45 | This function should return a Series named Points of length 146
46 |
47 | ```
48 | def answer_four():
49 | df['Points']= (df['Gold.2']*3+df['Silver.2']*2+df['Bronze.2'])
50 | return df['Points']
51 |
52 | answer_four()
53 | ```
54 |
55 | Question 5
57 | Which state has the most counties in it? (hint: consider the sumlevel key carefully! You'll need this for future questions too...)
58 | This function should return a single string value.
59 | ```
60 |
61 | def answer_five():
62 | new_df = census_df[census_df['SUMLEV'] == 50]
63 | return new_df.groupby('STNAME').count()['SUMLEV'].idxmax()
64 |
65 | answer_five()
66 | ```
67 |
68 |
--------------------------------------------------------------------------------
/Week4/Week+4.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "---\n",
8 | "\n",
9 | "_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-data-analysis/resources/0dhYG) course resource._\n",
10 | "\n",
11 | "---"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "# Distributions in Pandas"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {
25 | "collapsed": false
26 | },
27 | "outputs": [],
28 | "source": [
29 | "import pandas as pd\n",
30 | "import numpy as np"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {
37 | "collapsed": false
38 | },
39 | "outputs": [],
40 | "source": [
41 | "np.random.binomial(1, 0.5)"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {
48 | "collapsed": false
49 | },
50 | "outputs": [],
51 | "source": [
52 | "np.random.binomial(1000, 0.5)/1000"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": null,
58 | "metadata": {
59 | "collapsed": false
60 | },
61 | "outputs": [],
62 | "source": [
63 | "chance_of_tornado = 0.01/100\n",
64 | "np.random.binomial(100000, chance_of_tornado)"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {
71 | "collapsed": false
72 | },
73 | "outputs": [],
74 | "source": [
75 | "chance_of_tornado = 0.01\n",
76 | "\n",
77 | "tornado_events = np.random.binomial(1, chance_of_tornado, 1000000)\n",
78 | " \n",
79 | "two_days_in_a_row = 0\n",
80 | "for j in range(1,len(tornado_events)-1):\n",
81 | " if tornado_events[j]==1 and tornado_events[j-1]==1:\n",
82 | " two_days_in_a_row+=1\n",
83 | "\n",
84 | "print('{} tornadoes back to back in {} years'.format(two_days_in_a_row, 1000000/365))"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {
91 | "collapsed": false
92 | },
93 | "outputs": [],
94 | "source": [
95 | "np.random.uniform(0, 1)"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {
102 | "collapsed": false
103 | },
104 | "outputs": [],
105 | "source": [
106 | "np.random.normal(0.75)"
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "metadata": {},
112 | "source": [
113 | "Formula for standard deviation\n",
114 | "$$\\sqrt{\\frac{1}{N} \\sum_{i=1}^N (x_i - \\overline{x})^2}$$"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": null,
120 | "metadata": {
121 | "collapsed": false
122 | },
123 | "outputs": [],
124 | "source": [
125 | "distribution = np.random.normal(0.75,size=1000)\n",
126 | "\n",
127 | "np.sqrt(np.sum((np.mean(distribution)-distribution)**2)/len(distribution))"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": null,
133 | "metadata": {
134 | "collapsed": false,
135 | "scrolled": true
136 | },
137 | "outputs": [],
138 | "source": [
139 | "np.std(distribution)"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": null,
145 | "metadata": {
146 | "collapsed": false
147 | },
148 | "outputs": [],
149 | "source": [
150 | "import scipy.stats as stats\n",
151 | "stats.kurtosis(distribution)"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": null,
157 | "metadata": {
158 | "collapsed": false
159 | },
160 | "outputs": [],
161 | "source": [
162 | "stats.skew(distribution)"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {
169 | "collapsed": false
170 | },
171 | "outputs": [],
172 | "source": [
173 | "chi_squared_df2 = np.random.chisquare(2, size=10000)\n",
174 | "stats.skew(chi_squared_df2)"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {
181 | "collapsed": false
182 | },
183 | "outputs": [],
184 | "source": [
185 | "chi_squared_df5 = np.random.chisquare(5, size=10000)\n",
186 | "stats.skew(chi_squared_df5)"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": null,
192 | "metadata": {
193 | "collapsed": false
194 | },
195 | "outputs": [],
196 | "source": [
197 | "%matplotlib inline\n",
198 | "import matplotlib\n",
199 | "import matplotlib.pyplot as plt\n",
200 | "\n",
201 | "output = plt.hist([chi_squared_df2,chi_squared_df5], bins=50, histtype='step', \n",
202 | " label=['2 degrees of freedom','5 degrees of freedom'])\n",
203 | "plt.legend(loc='upper right')\n"
204 | ]
205 | },
206 | {
207 | "cell_type": "markdown",
208 | "metadata": {},
209 | "source": [
210 | "# Hypothesis Testing"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {
217 | "collapsed": false
218 | },
219 | "outputs": [],
220 | "source": [
221 | "df = pd.read_csv('grades.csv')"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": null,
227 | "metadata": {
228 | "collapsed": false
229 | },
230 | "outputs": [],
231 | "source": [
232 | "df.head()"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": null,
238 | "metadata": {
239 | "collapsed": false
240 | },
241 | "outputs": [],
242 | "source": [
243 | "len(df)"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": null,
249 | "metadata": {
250 | "collapsed": false
251 | },
252 | "outputs": [],
253 | "source": [
254 | "early = df[df['assignment1_submission'] <= '2015-12-31']\n",
255 | "late = df[df['assignment1_submission'] > '2015-12-31']"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": null,
261 | "metadata": {
262 | "collapsed": false
263 | },
264 | "outputs": [],
265 | "source": [
266 | "early.mean()"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": null,
272 | "metadata": {
273 | "collapsed": false
274 | },
275 | "outputs": [],
276 | "source": [
277 | "late.mean()"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {
284 | "collapsed": false
285 | },
286 | "outputs": [],
287 | "source": [
288 | "from scipy import stats\n",
289 | "stats.ttest_ind?"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {
296 | "collapsed": false
297 | },
298 | "outputs": [],
299 | "source": [
300 | "stats.ttest_ind(early['assignment1_grade'], late['assignment1_grade'])"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": null,
306 | "metadata": {
307 | "collapsed": false
308 | },
309 | "outputs": [],
310 | "source": [
311 | "stats.ttest_ind(early['assignment2_grade'], late['assignment2_grade'])"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": null,
317 | "metadata": {
318 | "collapsed": false
319 | },
320 | "outputs": [],
321 | "source": [
322 | "stats.ttest_ind(early['assignment3_grade'], late['assignment3_grade'])"
323 | ]
324 | }
325 | ],
326 | "metadata": {
327 | "kernelspec": {
328 | "display_name": "Python 3",
329 | "language": "python",
330 | "name": "python3"
331 | },
332 | "language_info": {
333 | "codemirror_mode": {
334 | "name": "ipython",
335 | "version": 3
336 | },
337 | "file_extension": ".py",
338 | "mimetype": "text/x-python",
339 | "name": "python",
340 | "nbconvert_exporter": "python",
341 | "pygments_lexer": "ipython3",
342 | "version": "3.5.2"
343 | }
344 | },
345 | "nbformat": 4,
346 | "nbformat_minor": 0
347 | }
348 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Intro to Data Science in Python
2 | ## University of Michigan, Professor Christopher Brooks, Coursera course
3 | ### 11/2016 - Completed on 04/12/2016
4 |
5 | Summary:
6 | Despite the course name, this is an intermediate-level data science course with Python. Familiarity with the NumPy and Pandas libraries is not required, but is highly recommended, as the course does get pretty intense really quickly (i.e. Week 2). To be honest, this is a solid course for someone who has a background with the Pandas and NumPy libraries. However, there is a big knowledge gap between the videos and the assignments, so it's challenging for beginners.
7 |
8 |
9 |
10 | Feedback:
11 |
12 | 
13 |
14 | > My feeling while taking this course...
15 |
16 | 04/12/2016:
17 | Finally finished this...was close to giving up on it SO MANY TIMES!
18 |
19 |
20 | ## Week 4 Statistical Analysis in Python and Project
21 |
22 |
23 | Binomial Distribution in numpy for coin flipping
24 |
25 | ```
26 | np.random.binomial(1,0.5)
27 | ```
28 | The first argument (1) is the number of trials per call, and the second argument (0.5) is the probability of success (getting a one) on each trial
29 |
30 | ```
31 | np.random.binomial(1000, 0.5)/1000
32 | ```
33 | Flip coins 1000 times, and divide the result by 1000
34 |
35 | Run 10,000 simulations of flipping a coin 20 times, and find the proportion of simulations with 15 or more heads.
36 |
37 | ```
38 | x = np.random.binomial(20, .5, 10000)
39 | print((x>=15).mean())
40 | ```
41 | Output:
42 | ```
43 | 0.0219
44 | ```
45 |
46 | Get the number of events given no. of simulation.
47 | "How many tornados will take place based on 100,000 simulations, given that the chance of a tornado is 0.01%?"
48 |
49 | ```
50 | chance_of_tornado = 0.01/100
51 | np.random.binomial(100000, chance_of_tornado)
52 | ```
53 | Output:
54 | ```
55 | 8
56 | ```
57 |
58 | "Assume the chance of tornado is 1%. How many tornados will take place (what is the chance of tornados taking place) two days in a row based on 1000000 simulations?"
59 |
60 | ```
61 | chance_of_tornado = 0.01
62 |
63 | tornado_events = np.random.binomial(1, chance_of_tornado, 1000000)
64 |
65 | two_days_in_a_row = 0
66 | for j in range(1,len(tornado_events)-1):
67 | if tornado_events[j]==1 and tornado_events[j-1]==1:
68 | two_days_in_a_row+=1
69 |
70 | print('{} tornadoes back to back in {} years'.format(two_days_in_a_row, 1000000/365))
71 | ```
72 | Output:
73 | ```
74 | 103 tornadoes back to back in 2739.72602739726 years
75 | ```
76 | tornado_events[j]== 1 means the day when tornado took place.
77 |
78 | #### Standard deviation
79 |
80 | Draw 1000 samples from a normal distribution with an expected value of 0.75 and a standard deviation of 1 (about 68% of the area lies within one standard deviation of the mean).
81 | ```
82 | distribution = np.random.normal(0.75,size=1000)
83 |
84 | np.sqrt(np.sum((np.mean(distribution)-distribution)**2)/len(distribution))
85 | ```
86 | The above code is equivalent to the np.std() function:
87 | ```
88 | np.std(distribution)
89 | ```
90 |
91 | #### Kurtosis (shape of tails) with stats module
92 |
93 | Positive value = more chubby than a normal distribution
94 | Negative value = more flat than a normal distribution
95 |
96 | ```
97 | import scipy.stats as stats
98 | stats.kurtosis(distribution)
99 |
100 | ```
101 | Output:
102 | ```
103 | -0.21162400583818153
104 | ```
105 |
106 | #### Skew with stats module
107 | If skew = 0, then there's no skew (i.e. the distribution is symmetric)
108 |
109 | ```
110 | stats.skew(distribution)
111 | ```
112 | Output:
113 | ```
114 | 0.051147428570855365
115 | ```
116 |
117 |
118 | #### Chi-squared distribution (right-skewed: positive skew, long right tail)
119 | As the degree of freedom increases, the plot moves from left to center
120 |
121 | Degree of freedom = 2:
122 | ```
123 | chi_squared_df2 = np.random.chisquare(2, size=10000)
124 | stats.skew(chi_squared_df2)
125 | ```
126 | Output:
127 | ```
128 | 1.9589902136938178
129 | ```
130 |
131 | Degree of freedom = 5:
132 | ```
133 | chi_squared_df5 = np.random.chisquare(5, size=10000)
134 | stats.skew(chi_squared_df5)
135 | ```
136 | Output:
137 | ```
138 | 1.3010399138921354
139 | ```
140 | #### Bimodal distribution (having 2 peaks)
141 |
142 | #### Hypothesis Testing
143 | Alternative Hypothesis vs. Null Hypothesis
144 | Significance level (alpha),
145 | alpha = 0.05 or 5%
146 |
147 | #### t-test: compare the means of two different populations
148 |
149 | stats.ttest_ind(): compare two independent samples to see if they have different means. In this case, we're using ttest_ind() to compare the average grade of assignment 1 between early users (the 'early' dataframe) and late users (the 'late' dataframe).
150 |
151 | Output is a tuple with a test statistic and a p-value.
152 |
153 |
154 | ```
155 | import scipy.stats as stats
156 |
157 | early = df[df['assignment1_submission'] <= '2015-12-31']
158 | late = df[df['assignment1_submission'] > '2015-12-31']
159 |
160 | stats.ttest_ind(early['assignment1_grade'], late['assignment1_grade'])
161 | ```
162 | Output:
163 | ```
164 | Ttest_indResult(statistic=1.400549944897566, pvalue=0.16148283016060577)
165 | ```
166 |
167 | If the p-value is >0.05(the significance value/alpha we decided previously), then we cannot reject the null hypothesis.
168 |
169 | Do the same test on assignment 2:
170 | ```
171 | stats.ttest_ind(early['assignment2_grade'], late['assignment2_grade'])
172 | ```
173 | Output:
174 | ```
175 | Ttest_indResult(statistic=1.3239868220912567, pvalue=0.18563824610067967)
176 | ```
178 | p-value is still >0.05, so we cannot reject the null hypothesis.
179 | ---
180 |
181 | ## Week 3 Advanced Python Pandas
182 |
183 | 
184 |
185 | > Finally finished Week 3's assignment.
186 |
187 | 11/27/2016 Update
188 | Finally finished this week's assignment! The first one took a long time. I had to relearn regular expressions because of it. Learned a lot about dataframes through the practices, so I'm happy about the progress eventually, but Jesus, that was a lot of work...
189 |
190 | Merging dataframes based on the same index. "NaN" is assigned when there's a missing value.
191 |
192 | #### iloc() and loc()
193 | iloc()for query based on location
194 | loc() for query based on label
195 |
196 | #### Outer vs inner join
197 |
198 | Outer Join
199 | ```
200 | pd.merge(df1,df2,how='outer',left_index=True,right_index=True)
201 | ```
202 | Inner Join
203 | ```
204 | pd.merge(df1,df2,how='inner',left_index=True,right_index=True)
205 | ```
206 | Left Join: keep all information from df1
207 | ```
208 | pd.merge(df1,df2,how='left',left_index=True,right_index=True)
209 | ```
210 | Right Join: keep all information from df2
211 | ```
212 | pd.merge(df1,df2,how='right',left_index=True,right_index=True)
213 | ```
214 | Join by Column names
215 | ```
216 | pd.merge(df1,df2,how='left',left_on='Name',right_on='Name')
217 | ```
218 |
219 | Chain indexing - not recommended
220 | ```
221 | df.loc['Washtenaw']['Total Population']
222 | ```
223 |
224 | Method chaining
225 | ```
226 | (df.where(df['SUMLEV']==50)
227 | .dropna()
228 | .set_index(['STNAME','CTYNAME'])
229 | .rename(columns={'ESTIMATESBASE2010': 'Estimates Base 2010'}))
230 | ```
231 | Drop rows where 'Quantity' is 0, and rename the column 'Weight' to 'Weight(oz.)'
232 | ```
233 | df = df[df.Quantity !=0].rename({'Weight':'Weight(oz.)'})
234 | ```
235 | Alternatively:
236 | ```
237 | print(df.drop(df[df['Quantity'] == 0].index).rename(columns={'Weight': 'Weight (oz.)'}))
238 | ```
239 |
240 | #### Apply() function which applies a function to all rows in a dataframe
241 |
242 | To apply to all columns in the same row(i.e.1 = across), use axis= 1
243 | To apply to all rows in the same column (i.e. 0 = down), use axis = 0
244 |
245 | ```
246 | import numpy as np
247 | def min_max(row):
248 | data = row[['POPESTIMATE2010',
249 | 'POPESTIMATE2011',
250 | 'POPESTIMATE2012',
251 | 'POPESTIMATE2013',
252 | 'POPESTIMATE2014',
253 | 'POPESTIMATE2015']]
254 | return pd.Series({'min': np.min(data), 'max': np.max(data)})
255 |
256 | df.apply(min_max, axis=1)
257 | ```
258 | Adding the applied function to the existing dataframe (instead of creating a new one)
259 | ```
260 | import numpy as np
261 | def min_max(row):
262 | data = row[['POPESTIMATE2010',
263 | 'POPESTIMATE2011',
264 | 'POPESTIMATE2012',
265 | 'POPESTIMATE2013',
266 | 'POPESTIMATE2014',
267 | 'POPESTIMATE2015']]
268 | row['max'] = np.max(data)
269 | row['min'] = np.min(data)
270 | return row
271 | df.apply(min_max, axis=1)
272 | ```
273 | Use apply() with lambda function:
274 | create a function with the max of each row
275 | ```
276 | rows = ['POPESTIMATE2010',
277 | 'POPESTIMATE2011',
278 | 'POPESTIMATE2012',
279 | 'POPESTIMATE2013',
280 | 'POPESTIMATE2014',
281 | 'POPESTIMATE2015']
282 | df.apply(lambda x: np.max(x[rows]), axis=1)
283 | ```
284 |
285 | #### Groupby()
286 | you can use a function to be the criteria for group_by()
287 | ```
288 | df = df.set_index('STNAME')
289 |
290 | def fun(item):
291 | if item[0]<'M':
292 | return 0
293 | if item[0]<'Q':
294 | return 1
295 | return 2
296 |
297 | for group, frame in df.groupby(fun):
298 | print('There are ' + str(len(frame)) + ' records in group ' + str(group) + ' for processing.')
299 |
300 | ```
301 | Calculate the average/sum of a certain group with groupby() and agg()
302 | ```
303 | df.groupby('STNAME').agg({'CENSUS2010POP': np.average})
304 | ```
305 | ```
306 | print(df.groupby('Category').agg('sum'))
307 | ```
308 |
309 | #### Use apply() with groupby()
310 | ```
311 | def totalweight(df, w, q):
312 | return sum(df[w] * df[q])
313 |
314 | print(df.groupby('Category').apply(totalweight, 'Weight (oz.)', 'Quantity'))
315 | ```
316 |
317 | #### Scales
318 | Use astype() to change the type of scales from one to another
319 |
320 | create a list and use astype() to indicate the order with ordered = True. This enables > or < to be used on strings.
321 |
322 | ```
323 | df = pd.DataFrame(['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D'],
324 | index=['excellent', 'excellent', 'excellent', 'good', 'good', 'good', 'ok', 'ok', 'ok', 'poor', 'poor'])
325 | df.rename(columns={0: 'Grades'}, inplace=True)
326 |
327 | grades = df['Grades'].astype('category',
328 | categories=['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+'],
329 | ordered=True)
330 | grades.head()
331 | ```
332 | output is:
333 | ```
334 | excellent A+
335 | excellent A
336 | excellent A-
337 | good B+
338 | good B
339 | Name: Grades, dtype: category
340 | Categories (11, object): [D < D+ < C- < C ... B+ < A- < A < A+]
341 |
342 | ```
343 | Use > or < functions on types, output:
344 | ```
345 | excellent True
346 | excellent True
347 | excellent True
348 | good True
349 | good True
350 | good True
351 | ok True
352 | ok False
353 | ok False
354 | poor False
355 | poor False
356 | Name: Grades, dtype: bool
357 | ```
358 |
359 | Change this series to categorical with ordering Low < Medium < High
360 |
361 | ```
362 | s = pd.Series(['Low', 'Low', 'High', 'Medium', 'Low', 'High', 'Low'])
363 |
364 | s.astype('category', categories=['Low', 'Medium', 'High'], ordered=True)
365 | ```
366 |
367 | Use get_dummies() to convert boolean values into 0s and 1s
368 |
369 | #### cut(): to cut data into bins (i.e. to divide them equally into 10 buckets)
370 |
371 | ```
372 | df = pd.read_csv('census.csv')
373 | df = df[df['SUMLEV']==50]
374 | df = df.set_index('STNAME').groupby(level=0)['CENSUS2010POP'].agg({'avg': np.average})
375 | pd.cut(df['avg'],10)
376 | ```
377 | Cut a series into 3 equal-sized bins
378 | ```
379 | s = pd.Series([168, 180, 174, 190, 170, 185, 179, 181, 175, 169, 182, 177, 180, 171])
380 |
381 |
382 | pd.cut(s, 3)
383 |
384 | # You can also add labels for the sizes [Small < Medium < Large].
385 | pd.cut(s, 3, labels=['Small', 'Medium', 'Large'])
386 | ```
387 |
388 | #### Use pivot_table() to create Pivot Tables
389 |
390 | ```
391 | df = pd.read_csv('cars.csv')
392 | df.pivot_table(values='(kW)', index='YEAR', columns='Make', aggfunc=np.mean)
393 | ```
394 |
395 | Create a pivot table that shows mean price and mean ratings for every "Manufacturer"/"Bike Type" combination
396 | ```
397 | print(pd.pivot_table(Bikes, index=['Manufacturer','Bike Type']))
398 |
399 | import numpy as np
400 | print(Bikes.pivot_table(values ='Price',index = 'Manufacturer',columns = 'Bike Type',aggfunc=np.average))
401 | ```
402 |
403 | #### Date Functionality in Panda
404 | 1. Timestamp
405 | 2. DatetimeIndex (the index of 1)
406 | 3. Period
407 | 4. PeriodIndex (the index of 3)
408 |
409 | 1. Timestamp, exchangeable to Python's datetime
410 |    ```
411 |    pd.Timestamp('9/1/2016 10:05AM')
412 |    ```
413 |
414 | 2. Period
415 | ```
416 | pd.Period('1/2016')
417 | ```
418 |
419 | 3. DatetimeIndex and PeriodIndex
420 | DatetimeIndex
421 | ```
422 | t1 = pd.Series(list('abc'), [pd.Timestamp('2016-09-01'), pd.Timestamp('2016-09-02'), pd.Timestamp('2016-09-03')])
423 |
424 | type(t1.index)
425 |
426 | ```
427 | Output:
428 | ```
429 | pandas.tseries.index.DatetimeIndex
430 | ```
431 | PeriodIndex
432 | ```
433 | t2 = pd.Series(list('def'), [pd.Period('2016-09'), pd.Period('2016-10'), pd.Period('2016-11')])
434 | type(t2.index)
435 | ```
436 | Output:
437 | ```
438 | pandas.tseries.period.PeriodIndex
439 | ```
440 |
441 | Converts datetimes to the same format with to_datetime()
442 |
443 | ```
444 | d1 = ['2 June 2013', 'Aug 29, 2014', '2015-06-26', '7/12/16']
445 | ts3 = pd.DataFrame(np.random.randint(10, 100, (4,2)), index=d1, columns=list('ab'))
446 | ts3.index = pd.to_datetime(ts3.index)
447 | ```
448 |
449 | use dayfirst = True to change the datetime into European format
450 | ```
451 | pd.to_datetime('4.7.12', dayfirst=True)
452 | ```
453 | #### Timedelta: show difference in times
454 |
455 | ```
456 | pd.Timestamp('9/3/2016')-pd.Timestamp('9/1/2016')
457 | ```
458 | Output:
459 | ```
460 | Timedelta('2 days 00:00:00')
461 | ```
462 |
463 | Calculate datetime with timedelta
464 | ```
465 | pd.Timestamp('9/2/2016 8:10AM') + pd.Timedelta('12D 3H')
466 | ```
467 | Output:
468 | ```
469 | Timestamp('2016-09-14 11:10:00')
470 | ```
471 |
472 | #### Date_range()
473 | Create a range of dates for bi-weekly on Sundays, starting with a specific date
474 |
475 | ```
476 | dates = pd.date_range('10-01-2016', periods=9, freq='2W-SUN')
477 | ```
478 |
479 | #### weekday_name(): check what day of the week it is
480 | ```
481 | df.index.weekday_name
482 | ```
483 |
484 | #### diff(): find difference between each day's value
485 | ```
486 | df.diff()
487 | ```
488 |
489 | #### resample(): frequency conversion. example: find mean count for each month, will show the data as of month end. 'M' stands for month
490 | ```
491 | df.resample('M').mean()
492 | ```
493 |
494 | Find values from a specific year, month or a range of dates
495 |
496 | ```
497 | df['2017']
498 | df['2016-12']
499 | df['2016-12':]
500 |
501 | ```
502 | #### asfreq(): change frequency from bi-weekly to weekly, and fill NaN value with last week's data point
503 | ```
504 | df.asfreq('W', method='ffill')
505 | ```
506 | #### matplotlib: visualising a timeseries
507 |
508 | ```
509 | import matplotlib.pyplot as plt
510 | %matplotlib inline
511 |
512 | df.plot()
513 | ```
514 | ---
515 | ## Week 2 Basic Data Processing with Pandas
516 |
517 | Dataframe
518 |
519 | ```
520 | import pandas as pd
521 | purchase_1 = pd.Series({'Name': 'Chris',
522 | 'Item Purchased': 'Dog Food',
523 | 'Cost': 22.50})
524 | purchase_2 = pd.Series({'Name': 'Kevyn',
525 | 'Item Purchased': 'Kitty Litter',
526 | 'Cost': 2.50})
527 | purchase_3 = pd.Series({'Name': 'Vinod',
528 | 'Item Purchased': 'Bird Seed',
529 | 'Cost': 5.00})
530 | df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=['Store 1', 'Store 1', 'Store 2'])
531 | df.head()
532 | ```
533 |
534 | df.T.loc --> T transforms data
535 |
536 | iloc vs loc: iloc selects by integer position, loc selects by label
537 |
538 | Avoid chaining as it generally create a copy of the data, instead of simply viewing it.
539 |
540 | Deleting data with df.drop(). It creates a copy of the dataframe with the given rows removed.
541 |
542 | ```
543 | df.drop("Store 1")
544 | ```
545 |
546 | Deleting data with del() function
547 |
548 | ```
549 | del copy_df['Name']
550 | ```
551 |
552 | apply 20% discount to cost
553 |
554 | ```
555 | purchase_1 = pd.Series({'Name': 'Chris',
556 | 'Item Purchased': 'Dog Food',
557 | 'Cost': 22.50})
558 | purchase_2 = pd.Series({'Name': 'Kevyn',
559 | 'Item Purchased': 'Kitty Litter',
560 | 'Cost': 2.50})
561 | purchase_3 = pd.Series({'Name': 'Vinod',
562 | 'Item Purchased': 'Bird Seed',
563 | 'Cost': 5.00})
564 |
565 | df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=['Store 1', 'Store 1', 'Store 2'])
566 |
567 |
568 | df['Cost'] *= 0.8
569 | print(df)
570 | ```
571 |
572 | Panda's read_csv() function, making first column the index
573 |
574 | ```
575 | df = pd.read_csv('olympics.csv', index_col=0, skiprows=1)
576 | ```
577 |
578 | Change column names with rename() method
579 |
580 | ```
581 | for col in df.columns:
582 | if col[:2]=='01':
583 | df.rename(columns={col:'Gold' + col[4:]}, inplace=True)
584 | if col[:2]=='02':
585 | df.rename(columns={col:'Silver' + col[4:]}, inplace=True)
586 | if col[:2]=='03':
587 | df.rename(columns={col:'Bronze' + col[4:]}, inplace=True)
588 | if col[:1]=='№':
589 | df.rename(columns={col:'#' + col[1:]}, inplace=True)
590 |
591 | df.head()
592 | ```
593 |
594 | Boolean masking: applying a boolean (True or False) filter/mask to a dataframe/array with where() function
595 |
596 | ```
597 | only_gold = df.where(df['Gold']>0)
598 | only_gold.head()
599 | ```
600 |
601 | Drop rows with missing data using the dropna() function
602 |
603 | ```
604 | only_gold = only_gold.dropna()
605 | ```
606 |
607 | Chaining boolean masks
608 |
609 | ```
610 |
611 | len(df[(df['Gold'] > 0) | (df['Gold.1'] > 0)])
612 |
613 | df[(df['Gold.1'] > 0) & (df['Gold'] == 0)]
614 |
615 | ```
616 |
617 | Return all of names of people who spend more than $3.00
618 | ```
619 | purchase_1 = pd.Series({'Name': 'Chris',
620 | 'Item Purchased': 'Dog Food',
621 | 'Cost': 22.50})
622 | purchase_2 = pd.Series({'Name': 'Kevyn',
623 | 'Item Purchased': 'Kitty Litter',
624 | 'Cost': 2.50})
625 | purchase_3 = pd.Series({'Name': 'Vinod',
626 | 'Item Purchased': 'Bird Seed',
627 | 'Cost': 5.00})
628 |
629 | df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=['Store 1', 'Store 1', 'Store 2'])
630 | df['Name'][df['Cost']>3]
631 | ```
632 |
633 | Set_index() function
634 |
635 | Reindex the purchase records Dataframe to be index hierarchically, first by store, then by person. Name these indexes "Location" and "Name". Then add a new entry to it with the value of:
636 |
637 | Name: "Kevyn", Item Purchased: "Kitty Food", Cost: 3.00 Location:"Store 2".
638 |
639 | ```
640 | purchase_1 = pd.Series({'Name': 'Chris',
641 | 'Item Purchased': 'Dog Food',
642 | 'Cost': 22.50})
643 | purchase_2 = pd.Series({'Name': 'Kevyn',
644 | 'Item Purchased': 'Kitty Litter',
645 | 'Cost': 2.50})
646 | purchase_3 = pd.Series({'Name': 'Vinod',
647 | 'Item Purchased': 'Bird Seed',
648 | 'Cost': 5.00})
649 |
650 | df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=['Store 1', 'Store 1', 'Store 2'])
651 |
652 |
653 | df = df.set_index([df.index, 'Name'])
654 | df.index.names = ['Location', 'Name']
655 | df = df.append(pd.Series(data={'Cost': 3.00, 'Item Purchased': 'Kitty Food'}, name=('Store 2', 'Kevyn')))
656 | ```
657 | ---
658 |
659 |
660 | ## Week 1
661 |
662 | #### List Indexing and Slicing
663 |
664 | Example 1
665 |
666 | ```
667 | people = ['Dr. Christopher Brooks', 'Dr. Kevyn Collins-Thompson', 'Dr. VG Vinod Vydiswaran', 'Dr. Daniel Romero']
668 |
669 | titleName = []
670 | def split_title_and_name():
671 | for person in people:
672 | last = person.split(" ")[-1]
673 | title = person.split(" ")[0]
674 | titleName.append(title + " "+last)
675 | print(titleName)
676 |
677 | split_title_and_name()
678 | ```
679 |
680 |
681 | Example 2
682 |
683 | ```
684 | people = ['Dr. Christopher Brooks', 'Dr. Kevyn Collins-Thompson', 'Dr. VG Vinod Vydiswaran', 'Dr. Daniel Romero']
685 |
686 | def split_title_and_name(person):
687 | return person.split(" ")[0] + " " + person.split(" ")[-1]
688 |
689 | list(map(split_title_and_name,people))
690 | ```
691 |
692 | Example 3 (official answer)
693 |
694 | ```
695 | people = ['Dr. Christopher Brooks', 'Dr. Kevyn Collins-Thompson', 'Dr. VG Vinod Vydiswaran', 'Dr. Daniel Romero']
696 |
697 | def split_title_and_name(person):
698 | title = person.split()[0]
699 | lastname = person.split()[-1]
700 | return '{} {}'.format(title, lastname)
701 |
702 | list(map(split_title_and_name, people))
703 | ```
704 |
705 |
706 | Lambda functions (for writing quick one-liner functions)
707 |
708 | ```
709 | my_function = lambda a,b: a+b
710 | my_function(1,2)
711 | ```
712 |
713 | list comprehension (list all even numbers in range 0 - 1000)
714 |
715 | ```
716 | my_list = [number for number in range(0,1000) if number % 2==0]
717 | ```
718 |
719 |
720 |
721 | ```
722 | def times_tables():
723 | lst = []
724 | for i in range(10):
725 | for j in range (10):
726 | lst.append(i*j)
727 | return lst
728 |
729 | times_tables() == [j*i for i in range(10) for j in range(10)]
730 | ```
731 |
732 | ```
733 | lowercase = 'abcdefghijklmnopqrstuvwxyz'
734 | digits = '0123456789'
735 |
736 | correct_answer = [a+b+c+d for a in lowercase for b in lowercase for c in digits for d in digits]
737 |
738 | correct_answer[:50] # Display first 50 ids
739 | ```
--------------------------------------------------------------------------------
/Week3/Week+3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "---\n",
8 | "\n",
9 | "_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-data-analysis/resources/0dhYG) course resource._\n",
10 | "\n",
11 | "---"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "# Merging Dataframes\n"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "metadata": {
25 | "collapsed": false
26 | },
27 | "outputs": [
28 | {
29 | "data": {
30 | "text/html": [
31 | "
\n",
32 | "
\n",
33 | " \n",
34 | " \n",
35 | " \n",
36 | " Cost \n",
37 | " Item Purchased \n",
38 | " Name \n",
39 | " \n",
40 | " \n",
41 | " \n",
42 | " \n",
43 | " Store 1 \n",
44 | " 22.5 \n",
45 | " Sponge \n",
46 | " Chris \n",
47 | " \n",
48 | " \n",
49 | " Store 1 \n",
50 | " 2.5 \n",
51 | " Kitty Litter \n",
52 | " Kevyn \n",
53 | " \n",
54 | " \n",
55 | " Store 2 \n",
56 | " 5.0 \n",
57 | " Spoon \n",
58 | " Filip \n",
59 | " \n",
60 | " \n",
61 | "
\n",
62 | "
"
63 | ],
64 | "text/plain": [
65 | " Cost Item Purchased Name\n",
66 | "Store 1 22.5 Sponge Chris\n",
67 | "Store 1 2.5 Kitty Litter Kevyn\n",
68 | "Store 2 5.0 Spoon Filip"
69 | ]
70 | },
71 | "execution_count": 1,
72 | "metadata": {},
73 | "output_type": "execute_result"
74 | }
75 | ],
76 | "source": [
77 | "import pandas as pd\n",
78 | "\n",
79 | "df = pd.DataFrame([{'Name': 'Chris', 'Item Purchased': 'Sponge', 'Cost': 22.50},\n",
80 | " {'Name': 'Kevyn', 'Item Purchased': 'Kitty Litter', 'Cost': 2.50},\n",
81 | " {'Name': 'Filip', 'Item Purchased': 'Spoon', 'Cost': 5.00}],\n",
82 | " index=['Store 1', 'Store 1', 'Store 2'])\n",
83 | "df"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [],
93 | "source": [
94 | "df['Date'] = ['December 1', 'January 1', 'mid-May']\n",
95 | "df"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": null,
101 | "metadata": {
102 | "collapsed": false
103 | },
104 | "outputs": [],
105 | "source": [
106 | "df['Delivered'] = True\n",
107 | "df"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {
114 | "collapsed": false
115 | },
116 | "outputs": [],
117 | "source": [
118 | "df['Feedback'] = ['Positive', None, 'Negative']\n",
119 | "df"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": null,
125 | "metadata": {
126 | "collapsed": false
127 | },
128 | "outputs": [],
129 | "source": [
130 | "adf = df.reset_index()\n",
131 | "adf['Date'] = pd.Series({0: 'December 1', 2: 'mid-May'})\n",
132 | "adf"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {
139 | "collapsed": false
140 | },
141 | "outputs": [],
142 | "source": [
143 | "staff_df = pd.DataFrame([{'Name': 'Kelly', 'Role': 'Director of HR'},\n",
144 | " {'Name': 'Sally', 'Role': 'Course liasion'},\n",
145 | " {'Name': 'James', 'Role': 'Grader'}])\n",
146 | "staff_df = staff_df.set_index('Name')\n",
147 | "student_df = pd.DataFrame([{'Name': 'James', 'School': 'Business'},\n",
148 | " {'Name': 'Mike', 'School': 'Law'},\n",
149 | " {'Name': 'Sally', 'School': 'Engineering'}])\n",
150 | "student_df = student_df.set_index('Name')\n",
151 | "print(staff_df.head())\n",
152 | "print()\n",
153 | "print(student_df.head())"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": null,
159 | "metadata": {
160 | "collapsed": false,
161 | "scrolled": true
162 | },
163 | "outputs": [],
164 | "source": [
165 | "pd.merge(staff_df, student_df, how='outer', left_index=True, right_index=True)"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": null,
171 | "metadata": {
172 | "collapsed": false
173 | },
174 | "outputs": [],
175 | "source": [
176 | "pd.merge(staff_df, student_df, how='inner', left_index=True, right_index=True)"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {
183 | "collapsed": false
184 | },
185 | "outputs": [],
186 | "source": [
187 | "pd.merge(staff_df, student_df, how='left', left_index=True, right_index=True)"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": null,
193 | "metadata": {
194 | "collapsed": false
195 | },
196 | "outputs": [],
197 | "source": [
198 | "pd.merge(staff_df, student_df, how='right', left_index=True, right_index=True)"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": null,
204 | "metadata": {
205 | "collapsed": false,
206 | "scrolled": true
207 | },
208 | "outputs": [],
209 | "source": [
210 | "staff_df = staff_df.reset_index()\n",
211 | "student_df = student_df.reset_index()\n",
212 | "pd.merge(staff_df, student_df, how='left', left_on='Name', right_on='Name')"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {
219 | "collapsed": false
220 | },
221 | "outputs": [],
222 | "source": [
223 | "staff_df = pd.DataFrame([{'Name': 'Kelly', 'Role': 'Director of HR', 'Location': 'State Street'},\n",
224 | " {'Name': 'Sally', 'Role': 'Course liasion', 'Location': 'Washington Avenue'},\n",
225 | " {'Name': 'James', 'Role': 'Grader', 'Location': 'Washington Avenue'}])\n",
226 | "student_df = pd.DataFrame([{'Name': 'James', 'School': 'Business', 'Location': '1024 Billiard Avenue'},\n",
227 | " {'Name': 'Mike', 'School': 'Law', 'Location': 'Fraternity House #22'},\n",
228 | " {'Name': 'Sally', 'School': 'Engineering', 'Location': '512 Wilson Crescent'}])\n",
229 | "pd.merge(staff_df, student_df, how='left', left_on='Name', right_on='Name')"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": null,
235 | "metadata": {
236 | "collapsed": false
237 | },
238 | "outputs": [],
239 | "source": [
240 | "staff_df = pd.DataFrame([{'First Name': 'Kelly', 'Last Name': 'Desjardins', 'Role': 'Director of HR'},\n",
241 | " {'First Name': 'Sally', 'Last Name': 'Brooks', 'Role': 'Course liasion'},\n",
242 | " {'First Name': 'James', 'Last Name': 'Wilde', 'Role': 'Grader'}])\n",
243 | "student_df = pd.DataFrame([{'First Name': 'James', 'Last Name': 'Hammond', 'School': 'Business'},\n",
244 | " {'First Name': 'Mike', 'Last Name': 'Smith', 'School': 'Law'},\n",
245 | " {'First Name': 'Sally', 'Last Name': 'Brooks', 'School': 'Engineering'}])\n",
246 | "staff_df\n",
247 | "student_df\n",
248 | "pd.merge(staff_df, student_df, how='inner', left_on=['First Name','Last Name'], right_on=['First Name','Last Name'])"
249 | ]
250 | },
251 | {
252 | "cell_type": "markdown",
253 | "metadata": {},
254 | "source": [
255 | "# Idiomatic Pandas: Making Code Pandorable"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": null,
261 | "metadata": {
262 | "collapsed": false
263 | },
264 | "outputs": [],
265 | "source": [
266 | "import pandas as pd\n",
267 | "df = pd.read_csv('census.csv')\n",
268 | "df"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": null,
274 | "metadata": {
275 | "collapsed": false
276 | },
277 | "outputs": [],
278 | "source": [
279 | "(df.where(df['SUMLEV']==50)\n",
280 | " .dropna()\n",
281 | " .set_index(['STNAME','CTYNAME'])\n",
282 | " .rename(columns={'ESTIMATESBASE2010': 'Estimates Base 2010'}))"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": null,
288 | "metadata": {
289 | "collapsed": false
290 | },
291 | "outputs": [],
292 | "source": [
293 | "df = df[df['SUMLEV']==50]\n",
294 | "df.set_index(['STNAME','CTYNAME'], inplace=True)\n",
295 | "df.rename(columns={'ESTIMATESBASE2010': 'Estimates Base 2010'})"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": null,
301 | "metadata": {
302 | "collapsed": false
303 | },
304 | "outputs": [],
305 | "source": [
306 | "import numpy as np\n",
307 | "def min_max(row):\n",
308 | " data = row[['POPESTIMATE2010',\n",
309 | " 'POPESTIMATE2011',\n",
310 | " 'POPESTIMATE2012',\n",
311 | " 'POPESTIMATE2013',\n",
312 | " 'POPESTIMATE2014',\n",
313 | " 'POPESTIMATE2015']]\n",
314 | " return pd.Series({'min': np.min(data), 'max': np.max(data)})"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": null,
320 | "metadata": {
321 | "collapsed": false
322 | },
323 | "outputs": [],
324 | "source": [
325 | "df.apply(min_max, axis=1)"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": null,
331 | "metadata": {
332 | "collapsed": false
333 | },
334 | "outputs": [],
335 | "source": [
336 | "import numpy as np\n",
337 | "def min_max(row):\n",
338 | " data = row[['POPESTIMATE2010',\n",
339 | " 'POPESTIMATE2011',\n",
340 | " 'POPESTIMATE2012',\n",
341 | " 'POPESTIMATE2013',\n",
342 | " 'POPESTIMATE2014',\n",
343 | " 'POPESTIMATE2015']]\n",
344 | " row['max'] = np.max(data)\n",
345 | " row['min'] = np.min(data)\n",
346 | " return row\n",
347 | "df.apply(min_max, axis=1)"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": null,
353 | "metadata": {
354 | "collapsed": false
355 | },
356 | "outputs": [],
357 | "source": [
358 | "rows = ['POPESTIMATE2010',\n",
359 | " 'POPESTIMATE2011',\n",
360 | " 'POPESTIMATE2012',\n",
361 | " 'POPESTIMATE2013',\n",
362 | " 'POPESTIMATE2014',\n",
363 | " 'POPESTIMATE2015']\n",
364 | "df.apply(lambda x: np.max(x[rows]), axis=1)"
365 | ]
366 | },
367 | {
368 | "cell_type": "markdown",
369 | "metadata": {},
370 | "source": [
371 | "# Group by"
372 | ]
373 | },
374 | {
375 | "cell_type": "code",
376 | "execution_count": null,
377 | "metadata": {
378 | "collapsed": false
379 | },
380 | "outputs": [],
381 | "source": [
382 | "import pandas as pd\n",
383 | "import numpy as np\n",
384 | "df = pd.read_csv('census.csv')\n",
385 | "df = df[df['SUMLEV']==50]\n",
386 | "df"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": null,
392 | "metadata": {
393 | "collapsed": false
394 | },
395 | "outputs": [],
396 | "source": [
397 | "%%timeit -n 10\n",
398 | "for state in df['STNAME'].unique():\n",
399 | " avg = np.average(df.where(df['STNAME']==state).dropna()['CENSUS2010POP'])\n",
400 | " print('Counties in state ' + state + ' have an average population of ' + str(avg))"
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": null,
406 | "metadata": {
407 | "collapsed": false,
408 | "scrolled": true
409 | },
410 | "outputs": [],
411 | "source": [
412 | "%%timeit -n 10\n",
413 | "for group, frame in df.groupby('STNAME'):\n",
414 | " avg = np.average(frame['CENSUS2010POP'])\n",
415 | " print('Counties in state ' + group + ' have an average population of ' + str(avg))"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": null,
421 | "metadata": {
422 | "collapsed": false
423 | },
424 | "outputs": [],
425 | "source": [
426 | "df.head()"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": null,
432 | "metadata": {
433 | "collapsed": false
434 | },
435 | "outputs": [],
436 | "source": [
437 | "df = df.set_index('STNAME')\n",
438 | "\n",
439 | "def fun(item):\n",
440 | " if item[0]<'M':\n",
441 | " return 0\n",
442 | " if item[0]<'Q':\n",
443 | " return 1\n",
444 | " return 2\n",
445 | "\n",
446 | "for group, frame in df.groupby(fun):\n",
447 | " print('There are ' + str(len(frame)) + ' records in group ' + str(group) + ' for processing.')\n"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": null,
453 | "metadata": {
454 | "collapsed": false
455 | },
456 | "outputs": [],
457 | "source": [
458 | "df = pd.read_csv('census.csv')\n",
459 | "df = df[df['SUMLEV']==50]"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": null,
465 | "metadata": {
466 | "collapsed": false
467 | },
468 | "outputs": [],
469 | "source": [
470 | "df.groupby('STNAME').agg({'CENSUS2010POP': np.average})"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": null,
476 | "metadata": {
477 | "collapsed": false
478 | },
479 | "outputs": [],
480 | "source": [
481 | "print(type(df.groupby(level=0)['POPESTIMATE2010','POPESTIMATE2011']))\n",
482 | "print(type(df.groupby(level=0)['POPESTIMATE2010']))"
483 | ]
484 | },
485 | {
486 | "cell_type": "code",
487 | "execution_count": null,
488 | "metadata": {
489 | "collapsed": false
490 | },
491 | "outputs": [],
492 | "source": [
493 | "(df.set_index('STNAME').groupby(level=0)['CENSUS2010POP']\n",
494 | " .agg({'avg': np.average, 'sum': np.sum}))"
495 | ]
496 | },
497 | {
498 | "cell_type": "code",
499 | "execution_count": null,
500 | "metadata": {
501 | "collapsed": false
502 | },
503 | "outputs": [],
504 | "source": [
505 | "(df.set_index('STNAME').groupby(level=0)['POPESTIMATE2010','POPESTIMATE2011']\n",
506 | " .agg({'avg': np.average, 'sum': np.sum}))"
507 | ]
508 | },
509 | {
510 | "cell_type": "code",
511 | "execution_count": null,
512 | "metadata": {
513 | "collapsed": false
514 | },
515 | "outputs": [],
516 | "source": [
517 | "(df.set_index('STNAME').groupby(level=0)['POPESTIMATE2010','POPESTIMATE2011']\n",
518 | " .agg({'POPESTIMATE2010': np.average, 'POPESTIMATE2011': np.sum}))"
519 | ]
520 | },
521 | {
522 | "cell_type": "markdown",
523 | "metadata": {},
524 | "source": [
525 | "# Scales"
526 | ]
527 | },
528 | {
529 | "cell_type": "code",
530 | "execution_count": null,
531 | "metadata": {
532 | "collapsed": false
533 | },
534 | "outputs": [],
535 | "source": [
536 | "df = pd.DataFrame(['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D'],\n",
537 | " index=['excellent', 'excellent', 'excellent', 'good', 'good', 'good', 'ok', 'ok', 'ok', 'poor', 'poor'])\n",
538 | "df.rename(columns={0: 'Grades'}, inplace=True)\n",
539 | "df"
540 | ]
541 | },
542 | {
543 | "cell_type": "code",
544 | "execution_count": null,
545 | "metadata": {
546 | "collapsed": false
547 | },
548 | "outputs": [],
549 | "source": [
550 | "df['Grades'].astype('category').head()"
551 | ]
552 | },
553 | {
554 | "cell_type": "code",
555 | "execution_count": null,
556 | "metadata": {
557 | "collapsed": false
558 | },
559 | "outputs": [],
560 | "source": [
561 | "grades = df['Grades'].astype('category',\n",
562 | " categories=['D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+'],\n",
563 | " ordered=True)\n",
564 | "grades.head()"
565 | ]
566 | },
567 | {
568 | "cell_type": "code",
569 | "execution_count": null,
570 | "metadata": {
571 | "collapsed": false
572 | },
573 | "outputs": [],
574 | "source": [
575 | "grades > 'C'"
576 | ]
577 | },
578 | {
579 | "cell_type": "code",
580 | "execution_count": null,
581 | "metadata": {
582 | "collapsed": false
583 | },
584 | "outputs": [],
585 | "source": [
586 | "df = pd.read_csv('census.csv')\n",
587 | "df = df[df['SUMLEV']==50]\n",
588 | "df = df.set_index('STNAME').groupby(level=0)['CENSUS2010POP'].agg({'avg': np.average})\n",
589 | "pd.cut(df['avg'],10)"
590 | ]
591 | },
592 | {
593 | "cell_type": "markdown",
594 | "metadata": {},
595 | "source": [
596 | "# Pivot Tables"
597 | ]
598 | },
599 | {
600 | "cell_type": "code",
601 | "execution_count": null,
602 | "metadata": {
603 | "collapsed": true
604 | },
605 | "outputs": [],
606 | "source": [
607 | "#http://open.canada.ca/data/en/dataset/98f1a129-f628-4ce4-b24d-6f16bf24dd64\n",
608 | "df = pd.read_csv('cars.csv')"
609 | ]
610 | },
611 | {
612 | "cell_type": "code",
613 | "execution_count": null,
614 | "metadata": {
615 | "collapsed": false
616 | },
617 | "outputs": [],
618 | "source": [
619 | "df.head()"
620 | ]
621 | },
622 | {
623 | "cell_type": "code",
624 | "execution_count": null,
625 | "metadata": {
626 | "collapsed": false
627 | },
628 | "outputs": [],
629 | "source": [
630 | "df.pivot_table(values='(kW)', index='YEAR', columns='Make', aggfunc=np.mean)"
631 | ]
632 | },
633 | {
634 | "cell_type": "code",
635 | "execution_count": null,
636 | "metadata": {
637 | "collapsed": false
638 | },
639 | "outputs": [],
640 | "source": [
641 | "df.pivot_table(values='(kW)', index='YEAR', columns='Make', aggfunc=[np.mean,np.min], margins=True)"
642 | ]
643 | },
644 | {
645 | "cell_type": "markdown",
646 | "metadata": {},
647 | "source": [
648 | "# Date Functionality in Pandas"
649 | ]
650 | },
651 | {
652 | "cell_type": "code",
653 | "execution_count": 2,
654 | "metadata": {
655 | "collapsed": true
656 | },
657 | "outputs": [],
658 | "source": [
659 | "import pandas as pd\n",
660 | "import numpy as np"
661 | ]
662 | },
663 | {
664 | "cell_type": "markdown",
665 | "metadata": {},
666 | "source": [
667 | "### Timestamp"
668 | ]
669 | },
670 | {
671 | "cell_type": "code",
672 | "execution_count": 3,
673 | "metadata": {
674 | "collapsed": false
675 | },
676 | "outputs": [
677 | {
678 | "data": {
679 | "text/plain": [
680 | "Timestamp('2016-09-01 10:05:00')"
681 | ]
682 | },
683 | "execution_count": 3,
684 | "metadata": {},
685 | "output_type": "execute_result"
686 | }
687 | ],
688 | "source": [
689 | "pd.Timestamp('9/1/2016 10:05AM')"
690 | ]
691 | },
692 | {
693 | "cell_type": "markdown",
694 | "metadata": {},
695 | "source": [
696 | "### Period"
697 | ]
698 | },
699 | {
700 | "cell_type": "code",
701 | "execution_count": 4,
702 | "metadata": {
703 | "collapsed": false
704 | },
705 | "outputs": [
706 | {
707 | "data": {
708 | "text/plain": [
709 | "Period('2016-01', 'M')"
710 | ]
711 | },
712 | "execution_count": 4,
713 | "metadata": {},
714 | "output_type": "execute_result"
715 | }
716 | ],
717 | "source": [
718 | "pd.Period('1/2016')"
719 | ]
720 | },
721 | {
722 | "cell_type": "code",
723 | "execution_count": 5,
724 | "metadata": {
725 | "collapsed": false
726 | },
727 | "outputs": [
728 | {
729 | "data": {
730 | "text/plain": [
731 | "Period('2016-03-05', 'D')"
732 | ]
733 | },
734 | "execution_count": 5,
735 | "metadata": {},
736 | "output_type": "execute_result"
737 | }
738 | ],
739 | "source": [
740 | "pd.Period('3/5/2016')"
741 | ]
742 | },
743 | {
744 | "cell_type": "markdown",
745 | "metadata": {},
746 | "source": [
747 | "### DatetimeIndex"
748 | ]
749 | },
750 | {
751 | "cell_type": "code",
752 | "execution_count": 6,
753 | "metadata": {
754 | "collapsed": false
755 | },
756 | "outputs": [
757 | {
758 | "data": {
759 | "text/plain": [
760 | "2016-09-01 a\n",
761 | "2016-09-02 b\n",
762 | "2016-09-03 c\n",
763 | "dtype: object"
764 | ]
765 | },
766 | "execution_count": 6,
767 | "metadata": {},
768 | "output_type": "execute_result"
769 | }
770 | ],
771 | "source": [
772 | "t1 = pd.Series(list('abc'), [pd.Timestamp('2016-09-01'), pd.Timestamp('2016-09-02'), pd.Timestamp('2016-09-03')])\n",
773 | "t1"
774 | ]
775 | },
776 | {
777 | "cell_type": "code",
778 | "execution_count": 7,
779 | "metadata": {
780 | "collapsed": false
781 | },
782 | "outputs": [
783 | {
784 | "data": {
785 | "text/plain": [
786 | "pandas.tseries.index.DatetimeIndex"
787 | ]
788 | },
789 | "execution_count": 7,
790 | "metadata": {},
791 | "output_type": "execute_result"
792 | }
793 | ],
794 | "source": [
795 | "type(t1.index)"
796 | ]
797 | },
798 | {
799 | "cell_type": "markdown",
800 | "metadata": {},
801 | "source": [
802 | "### PeriodIndex"
803 | ]
804 | },
805 | {
806 | "cell_type": "code",
807 | "execution_count": 8,
808 | "metadata": {
809 | "collapsed": false
810 | },
811 | "outputs": [
812 | {
813 | "data": {
814 | "text/plain": [
815 | "2016-09 d\n",
816 | "2016-10 e\n",
817 | "2016-11 f\n",
818 | "Freq: M, dtype: object"
819 | ]
820 | },
821 | "execution_count": 8,
822 | "metadata": {},
823 | "output_type": "execute_result"
824 | }
825 | ],
826 | "source": [
827 | "t2 = pd.Series(list('def'), [pd.Period('2016-09'), pd.Period('2016-10'), pd.Period('2016-11')])\n",
828 | "t2"
829 | ]
830 | },
831 | {
832 | "cell_type": "code",
833 | "execution_count": 9,
834 | "metadata": {
835 | "collapsed": false
836 | },
837 | "outputs": [
838 | {
839 | "data": {
840 | "text/plain": [
841 | "pandas.tseries.period.PeriodIndex"
842 | ]
843 | },
844 | "execution_count": 9,
845 | "metadata": {},
846 | "output_type": "execute_result"
847 | }
848 | ],
849 | "source": [
850 | "type(t2.index)"
851 | ]
852 | },
853 | {
854 | "cell_type": "markdown",
855 | "metadata": {},
856 | "source": [
857 | "### Converting to Datetime"
858 | ]
859 | },
860 | {
861 | "cell_type": "code",
862 | "execution_count": 10,
863 | "metadata": {
864 | "collapsed": false
865 | },
866 | "outputs": [
867 | {
868 | "data": {
869 | "text/html": [
870 | "\n",
871 | "
\n",
872 | " \n",
873 | " \n",
874 | " \n",
875 | " a \n",
876 | " b \n",
877 | " \n",
878 | " \n",
879 | " \n",
880 | " \n",
881 | " 2 June 2013 \n",
882 | " 16 \n",
883 | " 46 \n",
884 | " \n",
885 | " \n",
886 | " Aug 29, 2014 \n",
887 | " 14 \n",
888 | " 66 \n",
889 | " \n",
890 | " \n",
891 | " 2015-06-26 \n",
892 | " 59 \n",
893 | " 99 \n",
894 | " \n",
895 | " \n",
896 | " 7/12/16 \n",
897 | " 27 \n",
898 | " 17 \n",
899 | " \n",
900 | " \n",
901 | "
\n",
902 | "
"
903 | ],
904 | "text/plain": [
905 | " a b\n",
906 | "2 June 2013 16 46\n",
907 | "Aug 29, 2014 14 66\n",
908 | "2015-06-26 59 99\n",
909 | "7/12/16 27 17"
910 | ]
911 | },
912 | "execution_count": 10,
913 | "metadata": {},
914 | "output_type": "execute_result"
915 | }
916 | ],
917 | "source": [
918 | "d1 = ['2 June 2013', 'Aug 29, 2014', '2015-06-26', '7/12/16']\n",
919 | "ts3 = pd.DataFrame(np.random.randint(10, 100, (4,2)), index=d1, columns=list('ab'))\n",
920 | "ts3"
921 | ]
922 | },
923 | {
924 | "cell_type": "code",
925 | "execution_count": 11,
926 | "metadata": {
927 | "collapsed": false
928 | },
929 | "outputs": [
930 | {
931 | "data": {
932 | "text/html": [
933 | "\n",
934 | "
\n",
935 | " \n",
936 | " \n",
937 | " \n",
938 | " a \n",
939 | " b \n",
940 | " \n",
941 | " \n",
942 | " \n",
943 | " \n",
944 | " 2013-06-02 \n",
945 | " 16 \n",
946 | " 46 \n",
947 | " \n",
948 | " \n",
949 | " 2014-08-29 \n",
950 | " 14 \n",
951 | " 66 \n",
952 | " \n",
953 | " \n",
954 | " 2015-06-26 \n",
955 | " 59 \n",
956 | " 99 \n",
957 | " \n",
958 | " \n",
959 | " 2016-07-12 \n",
960 | " 27 \n",
961 | " 17 \n",
962 | " \n",
963 | " \n",
964 | "
\n",
965 | "
"
966 | ],
967 | "text/plain": [
968 | " a b\n",
969 | "2013-06-02 16 46\n",
970 | "2014-08-29 14 66\n",
971 | "2015-06-26 59 99\n",
972 | "2016-07-12 27 17"
973 | ]
974 | },
975 | "execution_count": 11,
976 | "metadata": {},
977 | "output_type": "execute_result"
978 | }
979 | ],
980 | "source": [
981 | "ts3.index = pd.to_datetime(ts3.index)\n",
982 | "ts3"
983 | ]
984 | },
985 | {
986 | "cell_type": "code",
987 | "execution_count": 12,
988 | "metadata": {
989 | "collapsed": false
990 | },
991 | "outputs": [
992 | {
993 | "data": {
994 | "text/plain": [
995 | "Timestamp('2012-07-04 00:00:00')"
996 | ]
997 | },
998 | "execution_count": 12,
999 | "metadata": {},
1000 | "output_type": "execute_result"
1001 | }
1002 | ],
1003 | "source": [
1004 | "pd.to_datetime('4.7.12', dayfirst=True)"
1005 | ]
1006 | },
1007 | {
1008 | "cell_type": "markdown",
1009 | "metadata": {},
1010 | "source": [
1011 | "### Timedeltas"
1012 | ]
1013 | },
1014 | {
1015 | "cell_type": "code",
1016 | "execution_count": 13,
1017 | "metadata": {
1018 | "collapsed": false
1019 | },
1020 | "outputs": [
1021 | {
1022 | "data": {
1023 | "text/plain": [
1024 | "Timedelta('2 days 00:00:00')"
1025 | ]
1026 | },
1027 | "execution_count": 13,
1028 | "metadata": {},
1029 | "output_type": "execute_result"
1030 | }
1031 | ],
1032 | "source": [
1033 | "pd.Timestamp('9/3/2016')-pd.Timestamp('9/1/2016')"
1034 | ]
1035 | },
1036 | {
1037 | "cell_type": "code",
1038 | "execution_count": 14,
1039 | "metadata": {
1040 | "collapsed": false
1041 | },
1042 | "outputs": [
1043 | {
1044 | "data": {
1045 | "text/plain": [
1046 | "Timestamp('2016-09-14 11:10:00')"
1047 | ]
1048 | },
1049 | "execution_count": 14,
1050 | "metadata": {},
1051 | "output_type": "execute_result"
1052 | }
1053 | ],
1054 | "source": [
1055 | "pd.Timestamp('9/2/2016 8:10AM') + pd.Timedelta('12D 3H')"
1056 | ]
1057 | },
1058 | {
1059 | "cell_type": "markdown",
1060 | "metadata": {},
1061 | "source": [
1062 | "### Working with Dates in a Dataframe"
1063 | ]
1064 | },
1065 | {
1066 | "cell_type": "code",
1067 | "execution_count": 15,
1068 | "metadata": {
1069 | "collapsed": false
1070 | },
1071 | "outputs": [
1072 | {
1073 | "data": {
1074 | "text/plain": [
1075 | "DatetimeIndex(['2016-10-02', '2016-10-16', '2016-10-30', '2016-11-13',\n",
1076 | " '2016-11-27', '2016-12-11', '2016-12-25', '2017-01-08',\n",
1077 | " '2017-01-22'],\n",
1078 | " dtype='datetime64[ns]', freq='2W-SUN')"
1079 | ]
1080 | },
1081 | "execution_count": 15,
1082 | "metadata": {},
1083 | "output_type": "execute_result"
1084 | }
1085 | ],
1086 | "source": [
1087 | "dates = pd.date_range('10-01-2016', periods=9, freq='2W-SUN')\n",
1088 | "dates"
1089 | ]
1090 | },
1091 | {
1092 | "cell_type": "code",
1093 | "execution_count": 16,
1094 | "metadata": {
1095 | "collapsed": false
1096 | },
1097 | "outputs": [
1098 | {
1099 | "data": {
1100 | "text/html": [
1101 | "\n",
1102 | "
\n",
1103 | " \n",
1104 | " \n",
1105 | " \n",
1106 | " Count 1 \n",
1107 | " Count 2 \n",
1108 | " \n",
1109 | " \n",
1110 | " \n",
1111 | " \n",
1112 | " 2016-10-02 \n",
1113 | " 104 \n",
1114 | " 125 \n",
1115 | " \n",
1116 | " \n",
1117 | " 2016-10-16 \n",
1118 | " 109 \n",
1119 | " 122 \n",
1120 | " \n",
1121 | " \n",
1122 | " 2016-10-30 \n",
1123 | " 111 \n",
1124 | " 127 \n",
1125 | " \n",
1126 | " \n",
1127 | " 2016-11-13 \n",
1128 | " 117 \n",
1129 | " 126 \n",
1130 | " \n",
1131 | " \n",
1132 | " 2016-11-27 \n",
1133 | " 114 \n",
1134 | " 126 \n",
1135 | " \n",
1136 | " \n",
1137 | " 2016-12-11 \n",
1138 | " 109 \n",
1139 | " 121 \n",
1140 | " \n",
1141 | " \n",
1142 | " 2016-12-25 \n",
1143 | " 105 \n",
1144 | " 126 \n",
1145 | " \n",
1146 | " \n",
1147 | " 2017-01-08 \n",
1148 | " 105 \n",
1149 | " 125 \n",
1150 | " \n",
1151 | " \n",
1152 | " 2017-01-22 \n",
1153 | " 101 \n",
1154 | " 123 \n",
1155 | " \n",
1156 | " \n",
1157 | "
\n",
1158 | "
"
1159 | ],
1160 | "text/plain": [
1161 | " Count 1 Count 2\n",
1162 | "2016-10-02 104 125\n",
1163 | "2016-10-16 109 122\n",
1164 | "2016-10-30 111 127\n",
1165 | "2016-11-13 117 126\n",
1166 | "2016-11-27 114 126\n",
1167 | "2016-12-11 109 121\n",
1168 | "2016-12-25 105 126\n",
1169 | "2017-01-08 105 125\n",
1170 | "2017-01-22 101 123"
1171 | ]
1172 | },
1173 | "execution_count": 16,
1174 | "metadata": {},
1175 | "output_type": "execute_result"
1176 | }
1177 | ],
1178 | "source": [
1179 | "df = pd.DataFrame({'Count 1': 100 + np.random.randint(-5, 10, 9).cumsum(),\n",
1180 | " 'Count 2': 120 + np.random.randint(-5, 10, 9)}, index=dates)\n",
1181 | "df"
1182 | ]
1183 | },
1184 | {
1185 | "cell_type": "code",
1186 | "execution_count": 17,
1187 | "metadata": {
1188 | "collapsed": false
1189 | },
1190 | "outputs": [
1191 | {
1192 | "data": {
1193 | "text/plain": [
1194 | "array(['Sunday', 'Sunday', 'Sunday', 'Sunday', 'Sunday', 'Sunday',\n",
1195 | " 'Sunday', 'Sunday', 'Sunday'], dtype=object)"
1196 | ]
1197 | },
1198 | "execution_count": 17,
1199 | "metadata": {},
1200 | "output_type": "execute_result"
1201 | }
1202 | ],
1203 | "source": [
1204 | "df.index.weekday_name"
1205 | ]
1206 | },
1207 | {
1208 | "cell_type": "code",
1209 | "execution_count": 18,
1210 | "metadata": {
1211 | "collapsed": false
1212 | },
1213 | "outputs": [
1214 | {
1215 | "data": {
1216 | "text/html": [
1217 | "\n",
1218 | "
\n",
1219 | " \n",
1220 | " \n",
1221 | " \n",
1222 | " Count 1 \n",
1223 | " Count 2 \n",
1224 | " \n",
1225 | " \n",
1226 | " \n",
1227 | " \n",
1228 | " 2016-10-02 \n",
1229 | " NaN \n",
1230 | " NaN \n",
1231 | " \n",
1232 | " \n",
1233 | " 2016-10-16 \n",
1234 | " 5.0 \n",
1235 | " -3.0 \n",
1236 | " \n",
1237 | " \n",
1238 | " 2016-10-30 \n",
1239 | " 2.0 \n",
1240 | " 5.0 \n",
1241 | " \n",
1242 | " \n",
1243 | " 2016-11-13 \n",
1244 | " 6.0 \n",
1245 | " -1.0 \n",
1246 | " \n",
1247 | " \n",
1248 | " 2016-11-27 \n",
1249 | " -3.0 \n",
1250 | " 0.0 \n",
1251 | " \n",
1252 | " \n",
1253 | " 2016-12-11 \n",
1254 | " -5.0 \n",
1255 | " -5.0 \n",
1256 | " \n",
1257 | " \n",
1258 | " 2016-12-25 \n",
1259 | " -4.0 \n",
1260 | " 5.0 \n",
1261 | " \n",
1262 | " \n",
1263 | " 2017-01-08 \n",
1264 | " 0.0 \n",
1265 | " -1.0 \n",
1266 | " \n",
1267 | " \n",
1268 | " 2017-01-22 \n",
1269 | " -4.0 \n",
1270 | " -2.0 \n",
1271 | " \n",
1272 | " \n",
1273 | "
\n",
1274 | "
"
1275 | ],
1276 | "text/plain": [
1277 | " Count 1 Count 2\n",
1278 | "2016-10-02 NaN NaN\n",
1279 | "2016-10-16 5.0 -3.0\n",
1280 | "2016-10-30 2.0 5.0\n",
1281 | "2016-11-13 6.0 -1.0\n",
1282 | "2016-11-27 -3.0 0.0\n",
1283 | "2016-12-11 -5.0 -5.0\n",
1284 | "2016-12-25 -4.0 5.0\n",
1285 | "2017-01-08 0.0 -1.0\n",
1286 | "2017-01-22 -4.0 -2.0"
1287 | ]
1288 | },
1289 | "execution_count": 18,
1290 | "metadata": {},
1291 | "output_type": "execute_result"
1292 | }
1293 | ],
1294 | "source": [
1295 | "df.diff()"
1296 | ]
1297 | },
1298 | {
1299 | "cell_type": "code",
1300 | "execution_count": 19,
1301 | "metadata": {
1302 | "collapsed": false
1303 | },
1304 | "outputs": [
1305 | {
1306 | "data": {
1307 | "text/html": [
1308 | "\n",
1309 | "
\n",
1310 | " \n",
1311 | " \n",
1312 | " \n",
1313 | " Count 1 \n",
1314 | " Count 2 \n",
1315 | " \n",
1316 | " \n",
1317 | " \n",
1318 | " \n",
1319 | " 2016-10-31 \n",
1320 | " 108.0 \n",
1321 | " 124.666667 \n",
1322 | " \n",
1323 | " \n",
1324 | " 2016-11-30 \n",
1325 | " 115.5 \n",
1326 | " 126.000000 \n",
1327 | " \n",
1328 | " \n",
1329 | " 2016-12-31 \n",
1330 | " 107.0 \n",
1331 | " 123.500000 \n",
1332 | " \n",
1333 | " \n",
1334 | " 2017-01-31 \n",
1335 | " 103.0 \n",
1336 | " 124.000000 \n",
1337 | " \n",
1338 | " \n",
1339 | "
\n",
1340 | "
"
1341 | ],
1342 | "text/plain": [
1343 | " Count 1 Count 2\n",
1344 | "2016-10-31 108.0 124.666667\n",
1345 | "2016-11-30 115.5 126.000000\n",
1346 | "2016-12-31 107.0 123.500000\n",
1347 | "2017-01-31 103.0 124.000000"
1348 | ]
1349 | },
1350 | "execution_count": 19,
1351 | "metadata": {},
1352 | "output_type": "execute_result"
1353 | }
1354 | ],
1355 | "source": [
1356 | "df.resample('M').mean()"
1357 | ]
1358 | },
1359 | {
1360 | "cell_type": "code",
1361 | "execution_count": 20,
1362 | "metadata": {
1363 | "collapsed": false
1364 | },
1365 | "outputs": [
1366 | {
1367 | "data": {
1368 | "text/html": [
1369 | "\n",
1370 | "
\n",
1371 | " \n",
1372 | " \n",
1373 | " \n",
1374 | " Count 1 \n",
1375 | " Count 2 \n",
1376 | " \n",
1377 | " \n",
1378 | " \n",
1379 | " \n",
1380 | " 2017-01-08 \n",
1381 | " 105 \n",
1382 | " 125 \n",
1383 | " \n",
1384 | " \n",
1385 | " 2017-01-22 \n",
1386 | " 101 \n",
1387 | " 123 \n",
1388 | " \n",
1389 | " \n",
1390 | "
\n",
1391 | "
"
1392 | ],
1393 | "text/plain": [
1394 | " Count 1 Count 2\n",
1395 | "2017-01-08 105 125\n",
1396 | "2017-01-22 101 123"
1397 | ]
1398 | },
1399 | "execution_count": 20,
1400 | "metadata": {},
1401 | "output_type": "execute_result"
1402 | }
1403 | ],
1404 | "source": [
1405 | "df['2017']"
1406 | ]
1407 | },
1408 | {
1409 | "cell_type": "code",
1410 | "execution_count": 21,
1411 | "metadata": {
1412 | "collapsed": false
1413 | },
1414 | "outputs": [
1415 | {
1416 | "data": {
1417 | "text/html": [
1418 | "\n",
1419 | "
\n",
1420 | " \n",
1421 | " \n",
1422 | " \n",
1423 | " Count 1 \n",
1424 | " Count 2 \n",
1425 | " \n",
1426 | " \n",
1427 | " \n",
1428 | " \n",
1429 | " 2016-12-11 \n",
1430 | " 109 \n",
1431 | " 121 \n",
1432 | " \n",
1433 | " \n",
1434 | " 2016-12-25 \n",
1435 | " 105 \n",
1436 | " 126 \n",
1437 | " \n",
1438 | " \n",
1439 | "
\n",
1440 | "
"
1441 | ],
1442 | "text/plain": [
1443 | " Count 1 Count 2\n",
1444 | "2016-12-11 109 121\n",
1445 | "2016-12-25 105 126"
1446 | ]
1447 | },
1448 | "execution_count": 21,
1449 | "metadata": {},
1450 | "output_type": "execute_result"
1451 | }
1452 | ],
1453 | "source": [
1454 | "df['2016-12']"
1455 | ]
1456 | },
1457 | {
1458 | "cell_type": "code",
1459 | "execution_count": 22,
1460 | "metadata": {
1461 | "collapsed": false
1462 | },
1463 | "outputs": [
1464 | {
1465 | "data": {
1466 | "text/html": [
1467 | "\n",
1468 | "
\n",
1469 | " \n",
1470 | " \n",
1471 | " \n",
1472 | " Count 1 \n",
1473 | " Count 2 \n",
1474 | " \n",
1475 | " \n",
1476 | " \n",
1477 | " \n",
1478 | " 2016-12-11 \n",
1479 | " 109 \n",
1480 | " 121 \n",
1481 | " \n",
1482 | " \n",
1483 | " 2016-12-25 \n",
1484 | " 105 \n",
1485 | " 126 \n",
1486 | " \n",
1487 | " \n",
1488 | " 2017-01-08 \n",
1489 | " 105 \n",
1490 | " 125 \n",
1491 | " \n",
1492 | " \n",
1493 | " 2017-01-22 \n",
1494 | " 101 \n",
1495 | " 123 \n",
1496 | " \n",
1497 | " \n",
1498 | "
\n",
1499 | "
"
1500 | ],
1501 | "text/plain": [
1502 | " Count 1 Count 2\n",
1503 | "2016-12-11 109 121\n",
1504 | "2016-12-25 105 126\n",
1505 | "2017-01-08 105 125\n",
1506 | "2017-01-22 101 123"
1507 | ]
1508 | },
1509 | "execution_count": 22,
1510 | "metadata": {},
1511 | "output_type": "execute_result"
1512 | }
1513 | ],
1514 | "source": [
1515 | "df['2016-12':]"
1516 | ]
1517 | },
1518 | {
1519 | "cell_type": "code",
1520 | "execution_count": null,
1521 | "metadata": {
1522 | "collapsed": false
1523 | },
1524 | "outputs": [],
1525 | "source": [
1526 | "df.asfreq('W', method='ffill')"
1527 | ]
1528 | },
1529 | {
1530 | "cell_type": "code",
1531 | "execution_count": null,
1532 | "metadata": {
1533 | "collapsed": false
1534 | },
1535 | "outputs": [],
1536 | "source": [
1537 | "import matplotlib.pyplot as plt\n",
1538 | "%matplotlib inline\n",
1539 | "\n",
1540 | "df.plot()"
1541 | ]
1542 | }
1543 | ],
1544 | "metadata": {
1545 | "celltoolbar": "Raw Cell Format",
1546 | "kernelspec": {
1547 | "display_name": "Python 3",
1548 | "language": "python",
1549 | "name": "python3"
1550 | },
1551 | "language_info": {
1552 | "codemirror_mode": {
1553 | "name": "ipython",
1554 | "version": 3
1555 | },
1556 | "file_extension": ".py",
1557 | "mimetype": "text/x-python",
1558 | "name": "python",
1559 | "nbconvert_exporter": "python",
1560 | "pygments_lexer": "ipython3",
1561 | "version": "3.5.2"
1562 | }
1563 | },
1564 | "nbformat": 4,
1565 | "nbformat_minor": 0
1566 | }
1567 |
--------------------------------------------------------------------------------
/Week3/Assignment - Week 3/Assignment+3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "---\n",
8 | "\n",
9 | "_You are currently looking at **version 1.4** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-data-analysis/resources/0dhYG) course resource._\n",
10 | "\n",
11 | "---"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "# Assignment 3 - More Pandas\n",
19 | "All questions are weighted the same in this assignment. This assignment requires more individual learning than the last one did - you are encouraged to check out the [pandas documentation](http://pandas.pydata.org/pandas-docs/stable/) to find functions or methods you might not have used yet, or ask questions on [Stack Overflow](http://stackoverflow.com/) and tag them as pandas and python related. And of course, the discussion forums are open for interaction with your peers and the course staff."
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "### Question 1 (20%)\n",
27 | "Load the energy data from the file `Energy Indicators.xls`, which is a list of indicators of [energy supply and renewable electricity production](Energy%20Indicators.xls) from the [United Nations](http://unstats.un.org/unsd/environment/excel_file_tables/2013/Energy%20Indicators.xls) for the year 2013, and should be put into a DataFrame with the variable name of **energy**.\n",
28 | "\n",
29 | "Keep in mind that this is an Excel file, and not a comma separated values file. Also, make sure to exclude the footer and header information from the datafile. The first two columns are unnecessary, so you should get rid of them, and you should change the column labels so that the columns are:\n",
30 | "\n",
31 | "`['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable']`\n",
32 | "\n",
33 | "Convert `Energy Supply` to gigajoules (there are 1,000,000 gigajoules in a petajoule). For all countries which have missing data (e.g. data with \"...\") make sure this is reflected as `np.NaN` values.\n",
34 | "\n",
35 | "Rename the following list of countries (for use in later questions):\n",
36 | "\n",
37 | "```\"Republic of Korea\": \"South Korea\",\n",
38 | "\"United States of America\": \"United States\",\n",
39 | "\"United Kingdom of Great Britain and Northern Ireland\": \"United Kingdom\",\n",
40 | "\"China, Hong Kong Special Administrative Region\": \"Hong Kong\"```\n",
41 | "\n",
42 | "There are also several countries with numbers and/or parenthesis in their name. Be sure to remove these, e.g. `'Bolivia (Plurinational State of)'` should be `'Bolivia'`.\n",
43 | "\n",
44 | " \n",
45 | "\n",
46 | "Next, load the GDP data from the file `world_bank.csv`, which is a csv containing countries' GDP from 1960 to 2015 from [World Bank](http://data.worldbank.org/indicator/NY.GDP.MKTP.CD). Call this DataFrame **GDP**. \n",
47 | "\n",
48 | "Make sure to skip the header, and rename the following list of countries:\n",
49 | "\n",
50 | "```\"Korea, Rep.\": \"South Korea\", \n",
51 | "\"Iran, Islamic Rep.\": \"Iran\",\n",
52 | "\"Hong Kong SAR, China\": \"Hong Kong\"```\n",
53 | "\n",
54 | " \n",
55 | "\n",
56 | "Finally, load the [Scimago Journal and Country Rank data for Energy Engineering and Power Technology](http://www.scimagojr.com/countryrank.php?category=2102) from the file `scimagojr-3.xlsx`, which ranks countries based on their journal contributions in the aforementioned area. Call this DataFrame **ScimEn**.\n",
57 | "\n",
58 | "Join the three datasets: GDP, Energy, and ScimEn into a new dataset (using the intersection of country names). Use only the last 10 years (2006-2015) of GDP data and only the top 15 countries by Scimagojr 'Rank' (Rank 1 through 15). \n",
59 | "\n",
60 | "The index of this DataFrame should be the name of the country, and the columns should be ['Rank', 'Documents', 'Citable documents', 'Citations', 'Self-citations',\n",
61 | " 'Citations per document', 'H index', 'Energy Supply',\n",
62 | " 'Energy Supply per Capita', '% Renewable', '2006', '2007', '2008',\n",
63 | " '2009', '2010', '2011', '2012', '2013', '2014', '2015'].\n",
64 | "\n",
65 | "*This function should return a DataFrame with 20 columns and 15 entries.*"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 2,
71 | "metadata": {
72 | "collapsed": false,
73 | "scrolled": true
74 | },
75 | "outputs": [
76 | {
77 | "data": {
78 | "text/html": [
79 | "\n",
80 | "
\n",
81 | " \n",
82 | " \n",
83 | " \n",
84 | " Rank \n",
85 | " Documents \n",
86 | " Citable documents \n",
87 | " Citations \n",
88 | " Self-citations \n",
89 | " Citations per document \n",
90 | " H index \n",
91 | " Energy Supply \n",
92 | " Energy Supply per Capita \n",
93 | " % Renewable \n",
94 | " 2006 \n",
95 | " 2007 \n",
96 | " 2008 \n",
97 | " 2009 \n",
98 | " 2010 \n",
99 | " 2011 \n",
100 | " 2012 \n",
101 | " 2013 \n",
102 | " 2014 \n",
103 | " 2015 \n",
104 | " \n",
105 | " \n",
106 | " Country \n",
107 | " \n",
108 | " \n",
109 | " \n",
110 | " \n",
111 | " \n",
112 | " \n",
113 | " \n",
114 | " \n",
115 | " \n",
116 | " \n",
117 | " \n",
118 | " \n",
119 | " \n",
120 | " \n",
121 | " \n",
122 | " \n",
123 | " \n",
124 | " \n",
125 | " \n",
126 | " \n",
127 | " \n",
128 | " \n",
129 | " \n",
130 | " \n",
131 | " China \n",
132 | " 1 \n",
133 | " 127050 \n",
134 | " 126767 \n",
135 | " 597237 \n",
136 | " 411683 \n",
137 | " 4.70 \n",
138 | " 138 \n",
139 | " 127191000000 \n",
140 | " 93 \n",
141 | " 19.7549 \n",
142 | " 3.992331e+12 \n",
143 | " 4.559041e+12 \n",
144 | " 4.997775e+12 \n",
145 | " 5.459247e+12 \n",
146 | " 6.039659e+12 \n",
147 | " 6.612490e+12 \n",
148 | " 7.124978e+12 \n",
149 | " 7.672448e+12 \n",
150 | " 8.230121e+12 \n",
151 | " 8.797999e+12 \n",
152 | " \n",
153 | " \n",
154 | " United States \n",
155 | " 2 \n",
156 | " 96661 \n",
157 | " 94747 \n",
158 | " 792274 \n",
159 | " 265436 \n",
160 | " 8.20 \n",
161 | " 230 \n",
162 | " 90838000000 \n",
163 | " 286 \n",
164 | " 11.571 \n",
165 | " 1.479230e+13 \n",
166 | " 1.505540e+13 \n",
167 | " 1.501149e+13 \n",
168 | " 1.459484e+13 \n",
169 | " 1.496437e+13 \n",
170 | " 1.520402e+13 \n",
171 | " 1.554216e+13 \n",
172 | " 1.577367e+13 \n",
173 | " 1.615662e+13 \n",
174 | " 1.654857e+13 \n",
175 | " \n",
176 | " \n",
177 | " Japan \n",
178 | " 3 \n",
179 | " 30504 \n",
180 | " 30287 \n",
181 | " 223024 \n",
182 | " 61554 \n",
183 | " 7.31 \n",
184 | " 134 \n",
185 | " 18984000000 \n",
186 | " 149 \n",
187 | " 10.2328 \n",
188 | " 5.496542e+12 \n",
189 | " 5.617036e+12 \n",
190 | " 5.558527e+12 \n",
191 | " 5.251308e+12 \n",
192 | " 5.498718e+12 \n",
193 | " 5.473738e+12 \n",
194 | " 5.569102e+12 \n",
195 | " 5.644659e+12 \n",
196 | " 5.642884e+12 \n",
197 | " 5.669563e+12 \n",
198 | " \n",
199 | " \n",
200 | " United Kingdom \n",
201 | " 4 \n",
202 | " 20944 \n",
203 | " 20357 \n",
204 | " 206091 \n",
205 | " 37874 \n",
206 | " 9.84 \n",
207 | " 139 \n",
208 | " 7920000000 \n",
209 | " 124 \n",
210 | " 10.6005 \n",
211 | " 2.419631e+12 \n",
212 | " 2.482203e+12 \n",
213 | " 2.470614e+12 \n",
214 | " 2.367048e+12 \n",
215 | " 2.403504e+12 \n",
216 | " 2.450911e+12 \n",
217 | " 2.479809e+12 \n",
218 | " 2.533370e+12 \n",
219 | " 2.605643e+12 \n",
220 | " 2.666333e+12 \n",
221 | " \n",
222 | " \n",
223 | " Russian Federation \n",
224 | " 5 \n",
225 | " 18534 \n",
226 | " 18301 \n",
227 | " 34266 \n",
228 | " 12422 \n",
229 | " 1.85 \n",
230 | " 57 \n",
231 | " 30709000000 \n",
232 | " 214 \n",
233 | " 17.2887 \n",
234 | " 1.385793e+12 \n",
235 | " 1.504071e+12 \n",
236 | " 1.583004e+12 \n",
237 | " 1.459199e+12 \n",
238 | " 1.524917e+12 \n",
239 | " 1.589943e+12 \n",
240 | " 1.645876e+12 \n",
241 | " 1.666934e+12 \n",
242 | " 1.678709e+12 \n",
243 | " 1.616149e+12 \n",
244 | " \n",
245 | " \n",
246 | " Canada \n",
247 | " 6 \n",
248 | " 17899 \n",
249 | " 17620 \n",
250 | " 215003 \n",
251 | " 40930 \n",
252 | " 12.01 \n",
253 | " 149 \n",
254 | " 10431000000 \n",
255 | " 296 \n",
256 | " 61.9454 \n",
257 | " 1.564469e+12 \n",
258 | " 1.596740e+12 \n",
259 | " 1.612713e+12 \n",
260 | " 1.565145e+12 \n",
261 | " 1.613406e+12 \n",
262 | " 1.664087e+12 \n",
263 | " 1.693133e+12 \n",
264 | " 1.730688e+12 \n",
265 | " 1.773486e+12 \n",
266 | " 1.792609e+12 \n",
267 | " \n",
268 | " \n",
269 | " Germany \n",
270 | " 7 \n",
271 | " 17027 \n",
272 | " 16831 \n",
273 | " 140566 \n",
274 | " 27426 \n",
275 | " 8.26 \n",
276 | " 126 \n",
277 | " 13261000000 \n",
278 | " 165 \n",
279 | " 17.9015 \n",
280 | " 3.332891e+12 \n",
281 | " 3.441561e+12 \n",
282 | " 3.478809e+12 \n",
283 | " 3.283340e+12 \n",
284 | " 3.417298e+12 \n",
285 | " 3.542371e+12 \n",
286 | " 3.556724e+12 \n",
287 | " 3.567317e+12 \n",
288 | " 3.624386e+12 \n",
289 | " 3.685556e+12 \n",
290 | " \n",
291 | " \n",
292 | " India \n",
293 | " 8 \n",
294 | " 15005 \n",
295 | " 14841 \n",
296 | " 128763 \n",
297 | " 37209 \n",
298 | " 8.58 \n",
299 | " 115 \n",
300 | " 33195000000 \n",
301 | " 26 \n",
302 | " 14.9691 \n",
303 | " 1.265894e+12 \n",
304 | " 1.374865e+12 \n",
305 | " 1.428361e+12 \n",
306 | " 1.549483e+12 \n",
307 | " 1.708459e+12 \n",
308 | " 1.821872e+12 \n",
309 | " 1.924235e+12 \n",
310 | " 2.051982e+12 \n",
311 | " 2.200617e+12 \n",
312 | " 2.367206e+12 \n",
313 | " \n",
314 | " \n",
315 | " France \n",
316 | " 9 \n",
317 | " 13153 \n",
318 | " 12973 \n",
319 | " 130632 \n",
320 | " 28601 \n",
321 | " 9.93 \n",
322 | " 114 \n",
323 | " 10597000000 \n",
324 | " 166 \n",
325 | " 17.0203 \n",
326 | " 2.607840e+12 \n",
327 | " 2.669424e+12 \n",
328 | " 2.674637e+12 \n",
329 | " 2.595967e+12 \n",
330 | " 2.646995e+12 \n",
331 | " 2.702032e+12 \n",
332 | " 2.706968e+12 \n",
333 | " 2.722567e+12 \n",
334 | " 2.729632e+12 \n",
335 | " 2.761185e+12 \n",
336 | " \n",
337 | " \n",
338 | " South Korea \n",
339 | " 10 \n",
340 | " 11983 \n",
341 | " 11923 \n",
342 | " 114675 \n",
343 | " 22595 \n",
344 | " 9.57 \n",
345 | " 104 \n",
346 | " 11007000000 \n",
347 | " 221 \n",
348 | " 2.27935 \n",
349 | " 9.410199e+11 \n",
350 | " 9.924316e+11 \n",
351 | " 1.020510e+12 \n",
352 | " 1.027730e+12 \n",
353 | " 1.094499e+12 \n",
354 | " 1.134796e+12 \n",
355 | " 1.160809e+12 \n",
356 | " 1.194429e+12 \n",
357 | " 1.234340e+12 \n",
358 | " 1.266580e+12 \n",
359 | " \n",
360 | " \n",
361 | " Italy \n",
362 | " 11 \n",
363 | " 10964 \n",
364 | " 10794 \n",
365 | " 111850 \n",
366 | " 26661 \n",
367 | " 10.20 \n",
368 | " 106 \n",
369 | " 6530000000 \n",
370 | " 109 \n",
371 | " 33.6672 \n",
372 | " 2.202170e+12 \n",
373 | " 2.234627e+12 \n",
374 | " 2.211154e+12 \n",
375 | " 2.089938e+12 \n",
376 | " 2.125185e+12 \n",
377 | " 2.137439e+12 \n",
378 | " 2.077184e+12 \n",
379 | " 2.040871e+12 \n",
380 | " 2.033868e+12 \n",
381 | " 2.049316e+12 \n",
382 | " \n",
383 | " \n",
384 | " Spain \n",
385 | " 12 \n",
386 | " 9428 \n",
387 | " 9330 \n",
388 | " 123336 \n",
389 | " 23964 \n",
390 | " 13.08 \n",
391 | " 115 \n",
392 | " 4923000000 \n",
393 | " 106 \n",
394 | " 37.9686 \n",
395 | " 1.414823e+12 \n",
396 | " 1.468146e+12 \n",
397 | " 1.484530e+12 \n",
398 | " 1.431475e+12 \n",
399 | " 1.431673e+12 \n",
400 | " 1.417355e+12 \n",
401 | " 1.380216e+12 \n",
402 | " 1.357139e+12 \n",
403 | " 1.375605e+12 \n",
404 | " 1.419821e+12 \n",
405 | " \n",
406 | " \n",
407 | " Iran \n",
408 | " 13 \n",
409 | " 8896 \n",
410 | " 8819 \n",
411 | " 57470 \n",
412 | " 19125 \n",
413 | " 6.46 \n",
414 | " 72 \n",
415 | " 9172000000 \n",
416 | " 119 \n",
417 | " 5.70772 \n",
418 | " 3.895523e+11 \n",
419 | " 4.250646e+11 \n",
420 | " 4.289909e+11 \n",
421 | " 4.389208e+11 \n",
422 | " 4.677902e+11 \n",
423 | " 4.853309e+11 \n",
424 | " 4.532569e+11 \n",
425 | " 4.445926e+11 \n",
426 | " 4.639027e+11 \n",
427 | " NaN \n",
428 | " \n",
429 | " \n",
430 | " Australia \n",
431 | " 14 \n",
432 | " 8831 \n",
433 | " 8725 \n",
434 | " 90765 \n",
435 | " 15606 \n",
436 | " 10.28 \n",
437 | " 107 \n",
438 | " 5386000000 \n",
439 | " 231 \n",
440 | " 11.8108 \n",
441 | " 1.021939e+12 \n",
442 | " 1.060340e+12 \n",
443 | " 1.099644e+12 \n",
444 | " 1.119654e+12 \n",
445 | " 1.142251e+12 \n",
446 | " 1.169431e+12 \n",
447 | " 1.211913e+12 \n",
448 | " 1.241484e+12 \n",
449 | " 1.272520e+12 \n",
450 | " 1.301251e+12 \n",
451 | " \n",
452 | " \n",
453 | " Brazil \n",
454 | " 15 \n",
455 | " 8668 \n",
456 | " 8596 \n",
457 | " 60702 \n",
458 | " 14396 \n",
459 | " 7.00 \n",
460 | " 86 \n",
461 | " 12149000000 \n",
462 | " 59 \n",
463 | " 69.648 \n",
464 | " 1.845080e+12 \n",
465 | " 1.957118e+12 \n",
466 | " 2.056809e+12 \n",
467 | " 2.054215e+12 \n",
468 | " 2.208872e+12 \n",
469 | " 2.295245e+12 \n",
470 | " 2.339209e+12 \n",
471 | " 2.409740e+12 \n",
472 | " 2.412231e+12 \n",
473 | " 2.319423e+12 \n",
474 | " \n",
475 | " \n",
476 | "
\n",
477 | "
"
478 | ],
479 | "text/plain": [
480 | " Rank Documents Citable documents Citations \\\n",
481 | "Country \n",
482 | "China 1 127050 126767 597237 \n",
483 | "United States 2 96661 94747 792274 \n",
484 | "Japan 3 30504 30287 223024 \n",
485 | "United Kingdom 4 20944 20357 206091 \n",
486 | "Russian Federation 5 18534 18301 34266 \n",
487 | "Canada 6 17899 17620 215003 \n",
488 | "Germany 7 17027 16831 140566 \n",
489 | "India 8 15005 14841 128763 \n",
490 | "France 9 13153 12973 130632 \n",
491 | "South Korea 10 11983 11923 114675 \n",
492 | "Italy 11 10964 10794 111850 \n",
493 | "Spain 12 9428 9330 123336 \n",
494 | "Iran 13 8896 8819 57470 \n",
495 | "Australia 14 8831 8725 90765 \n",
496 | "Brazil 15 8668 8596 60702 \n",
497 | "\n",
498 | " Self-citations Citations per document H index \\\n",
499 | "Country \n",
500 | "China 411683 4.70 138 \n",
501 | "United States 265436 8.20 230 \n",
502 | "Japan 61554 7.31 134 \n",
503 | "United Kingdom 37874 9.84 139 \n",
504 | "Russian Federation 12422 1.85 57 \n",
505 | "Canada 40930 12.01 149 \n",
506 | "Germany 27426 8.26 126 \n",
507 | "India 37209 8.58 115 \n",
508 | "France 28601 9.93 114 \n",
509 | "South Korea 22595 9.57 104 \n",
510 | "Italy 26661 10.20 106 \n",
511 | "Spain 23964 13.08 115 \n",
512 | "Iran 19125 6.46 72 \n",
513 | "Australia 15606 10.28 107 \n",
514 | "Brazil 14396 7.00 86 \n",
515 | "\n",
516 | " Energy Supply Energy Supply per Capita % Renewable \\\n",
517 | "Country \n",
518 | "China 127191000000 93 19.7549 \n",
519 | "United States 90838000000 286 11.571 \n",
520 | "Japan 18984000000 149 10.2328 \n",
521 | "United Kingdom 7920000000 124 10.6005 \n",
522 | "Russian Federation 30709000000 214 17.2887 \n",
523 | "Canada 10431000000 296 61.9454 \n",
524 | "Germany 13261000000 165 17.9015 \n",
525 | "India 33195000000 26 14.9691 \n",
526 | "France 10597000000 166 17.0203 \n",
527 | "South Korea 11007000000 221 2.27935 \n",
528 | "Italy 6530000000 109 33.6672 \n",
529 | "Spain 4923000000 106 37.9686 \n",
530 | "Iran 9172000000 119 5.70772 \n",
531 | "Australia 5386000000 231 11.8108 \n",
532 | "Brazil 12149000000 59 69.648 \n",
533 | "\n",
534 | " 2006 2007 2008 2009 \\\n",
535 | "Country \n",
536 | "China 3.992331e+12 4.559041e+12 4.997775e+12 5.459247e+12 \n",
537 | "United States 1.479230e+13 1.505540e+13 1.501149e+13 1.459484e+13 \n",
538 | "Japan 5.496542e+12 5.617036e+12 5.558527e+12 5.251308e+12 \n",
539 | "United Kingdom 2.419631e+12 2.482203e+12 2.470614e+12 2.367048e+12 \n",
540 | "Russian Federation 1.385793e+12 1.504071e+12 1.583004e+12 1.459199e+12 \n",
541 | "Canada 1.564469e+12 1.596740e+12 1.612713e+12 1.565145e+12 \n",
542 | "Germany 3.332891e+12 3.441561e+12 3.478809e+12 3.283340e+12 \n",
543 | "India 1.265894e+12 1.374865e+12 1.428361e+12 1.549483e+12 \n",
544 | "France 2.607840e+12 2.669424e+12 2.674637e+12 2.595967e+12 \n",
545 | "South Korea 9.410199e+11 9.924316e+11 1.020510e+12 1.027730e+12 \n",
546 | "Italy 2.202170e+12 2.234627e+12 2.211154e+12 2.089938e+12 \n",
547 | "Spain 1.414823e+12 1.468146e+12 1.484530e+12 1.431475e+12 \n",
548 | "Iran 3.895523e+11 4.250646e+11 4.289909e+11 4.389208e+11 \n",
549 | "Australia 1.021939e+12 1.060340e+12 1.099644e+12 1.119654e+12 \n",
550 | "Brazil 1.845080e+12 1.957118e+12 2.056809e+12 2.054215e+12 \n",
551 | "\n",
552 | " 2010 2011 2012 2013 \\\n",
553 | "Country \n",
554 | "China 6.039659e+12 6.612490e+12 7.124978e+12 7.672448e+12 \n",
555 | "United States 1.496437e+13 1.520402e+13 1.554216e+13 1.577367e+13 \n",
556 | "Japan 5.498718e+12 5.473738e+12 5.569102e+12 5.644659e+12 \n",
557 | "United Kingdom 2.403504e+12 2.450911e+12 2.479809e+12 2.533370e+12 \n",
558 | "Russian Federation 1.524917e+12 1.589943e+12 1.645876e+12 1.666934e+12 \n",
559 | "Canada 1.613406e+12 1.664087e+12 1.693133e+12 1.730688e+12 \n",
560 | "Germany 3.417298e+12 3.542371e+12 3.556724e+12 3.567317e+12 \n",
561 | "India 1.708459e+12 1.821872e+12 1.924235e+12 2.051982e+12 \n",
562 | "France 2.646995e+12 2.702032e+12 2.706968e+12 2.722567e+12 \n",
563 | "South Korea 1.094499e+12 1.134796e+12 1.160809e+12 1.194429e+12 \n",
564 | "Italy 2.125185e+12 2.137439e+12 2.077184e+12 2.040871e+12 \n",
565 | "Spain 1.431673e+12 1.417355e+12 1.380216e+12 1.357139e+12 \n",
566 | "Iran 4.677902e+11 4.853309e+11 4.532569e+11 4.445926e+11 \n",
567 | "Australia 1.142251e+12 1.169431e+12 1.211913e+12 1.241484e+12 \n",
568 | "Brazil 2.208872e+12 2.295245e+12 2.339209e+12 2.409740e+12 \n",
569 | "\n",
570 | " 2014 2015 \n",
571 | "Country \n",
572 | "China 8.230121e+12 8.797999e+12 \n",
573 | "United States 1.615662e+13 1.654857e+13 \n",
574 | "Japan 5.642884e+12 5.669563e+12 \n",
575 | "United Kingdom 2.605643e+12 2.666333e+12 \n",
576 | "Russian Federation 1.678709e+12 1.616149e+12 \n",
577 | "Canada 1.773486e+12 1.792609e+12 \n",
578 | "Germany 3.624386e+12 3.685556e+12 \n",
579 | "India 2.200617e+12 2.367206e+12 \n",
580 | "France 2.729632e+12 2.761185e+12 \n",
581 | "South Korea 1.234340e+12 1.266580e+12 \n",
582 | "Italy 2.033868e+12 2.049316e+12 \n",
583 | "Spain 1.375605e+12 1.419821e+12 \n",
584 | "Iran 4.639027e+11 NaN \n",
585 | "Australia 1.272520e+12 1.301251e+12 \n",
586 | "Brazil 2.412231e+12 2.319423e+12 "
587 | ]
588 | },
589 | "execution_count": 2,
590 | "metadata": {},
591 | "output_type": "execute_result"
592 | }
593 | ],
594 | "source": [
595 | "import pandas as pd\n",
596 | "import numpy as np\n",
597 | "\n",
598 | "# Load the Excel sheet and CSV file into Panda Dataframe. Skip header for GDP Dataframe.\n",
599 | "xls_file = pd.ExcelFile('Energy Indicators.xls')\n",
600 | "energy = xls_file.parse('Energy')\n",
601 | "GDP = pd.read_csv('world_bank.csv', header=None)\n",
602 | "ScimEn_file = pd.ExcelFile('scimagojr-3.xlsx')\n",
603 | "ScimEn = ScimEn_file.parse('Sheet1')\n",
604 | "\n",
605 | "# Drop first 2 columns\n",
606 | "energy = energy.drop(['Unnamed: 0','Unnamed: 1'],1)\n",
607 | "\n",
608 | "# Rename the columns\n",
609 | "energy.columns = ['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable']\n",
610 | "\n",
611 | "# Convert \"...\" to np.NaN\n",
612 | "# This section is not very DRY[\"Don't Repeat Yourself\"]. Need to think about ways to apply regex to column names, so that the same function applies to columns starting with \"Energy Supply\"\n",
613 | "energy['Energy Supply'].replace(regex=True,inplace=True,to_replace=r'\\W',value=np.nan)\n",
614 | "energy['Energy Supply per Capita'].replace(regex=True,inplace=True,to_replace=r'\\W',value=np.nan)\n",
615 | "\n",
616 | "# Convert 'Energy Supply' Column from petajoule to gigajoules (there are 1,000,000 gigajoules in a petajoule)\n",
617 | "energy['Energy Supply']*=1000000\n",
618 | "\n",
619 | "# Set Country column to be the index\n",
620 | "energy.set_index('Country')\n",
621 | "\n",
622 | "# Use Regular Expression to remove numbers and parenthesis(and the content inside) in country names. \\d stands for digits. \n",
623 | "# Remember to add a whitespace before the first escape before ()....couldn't find Bolivia otherwise. Struggled for a long time for this!\n",
624 | "energy['Country'].replace(regex=True,inplace=True,to_replace=r'\\d',value=r'')\n",
625 | "energy['Country'].replace(regex=True,inplace=True,to_replace=r' \\(.*\\)',value=r'')\n",
626 | "\n",
627 | "# Replace country names with new names\n",
628 | "energy.Country[energy.Country == \"Republic of Korea\"]=\"South Korea\"\n",
629 | "energy.Country[energy.Country == \"United States of America\"]=\"United States\"\n",
630 | "energy.Country[energy.Country == \"China, Hong Kong Special Administrative Region\"]=\"Hong Kong\"\n",
631 | "energy.Country[energy.Country == \"United Kingdom of Great Britain and Northern Ireland\"]=\"United Kingdom\"\n",
632 | "\n",
633 | "# Test to ensure that the replacement worked. Have turned them into comments.\n",
634 | "# energy.loc[energy.Country ==\"Bolivia\"]\n",
635 | "# energy.loc[energy.Country ==\"United Kingdom\"]\n",
636 | "\n",
637 | "\n",
638 | "# ----------------------------------------------\n",
639 | "\n",
640 | "# Drop the first few rows of GDP Dataframe to clean the data\n",
641 | "GDP.drop(GDP.index[:4],axis=0, inplace=True)\n",
642 | "\n",
643 | "# Drop the other columns so that we only keep the last 10 years' data (2006-2015)\n",
644 | "GDP.drop(GDP.columns[1:50], axis=1, inplace=True)\n",
645 | "\n",
646 | "# Convert the Column heads from number to strings (2006.0 -> Year 2006)\n",
647 | "# GDP[50:]=GDP[50:].astype(str)\n",
648 | "# GDP.rename(columns = lambda x: str(x), inplace=True)\n",
649 | "GDP.columns = ['Country','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015']\n",
650 | "GDP.columns.values\n",
651 | "\n",
652 | "\n",
653 | "\n",
654 | "# Use the first row as column headers\n",
655 | "# Drop redundant row\n",
656 | "GDP.columns = GDP.iloc[0]\n",
657 | "GDP.drop(GDP.index[0],axis=0, inplace=True)\n",
658 | "GDP.rename(columns={'Country Name': 'Country'}, inplace=True)\n",
659 | "GDP.set_index('Country')\n",
660 | "\n",
661 | "# Rename the countries. South Korea was actually named \"Korea, Rep.\" in the original spreadsheet. They gave the wrong instruction here...\n",
662 | "GDP['Country'].replace(regex=True,inplace=True,to_replace='Korea, Rep.',value='South Korea')\n",
663 | "GDP['Country'].replace(regex=True,inplace=True,to_replace='Iran, Islamic Rep.',value='Iran')\n",
664 | "GDP['Country'].replace(regex=True,inplace=True,to_replace='Hong Kong SAR, China',value='Hong Kong')\n",
665 | "\n",
666 | "# Remove the parenthesis\n",
667 | "GDP['Country'].replace(regex=True,inplace=True,to_replace=r' \\(.*\\)',value=r'')\n",
668 | "\n",
669 | "\n",
670 | "# Test to ensure that the replacement worked. Have turned it into comment.\n",
671 | "# GDP.loc[GDP[\"Country Name\"] ==\"South Korea\"]\n",
672 | "\n",
673 | "# ----------------------------------------------\n",
674 | "\n",
675 | "# Keep only the top 15 countries by Ranking in the ScimEn Dataframe\n",
676 | "ScimEn.drop(ScimEn.index[15:],axis=0, inplace=True)\n",
677 | "\n",
678 | "# Merge the 3 dataframes: energy, GDP, ScimEn\n",
679 | "new = pd.merge(pd.merge(ScimEn,energy,on='Country'),GDP,on='Country')\n",
680 | "\n",
681 | "# Set the Country column to be the index. Remember to use inplace=True. Otherwise it kept returning a list of integers...spent an hour on this. \n",
682 | "new.set_index('Country',inplace=True)\n",
683 | "\n",
684 | "# Count the number of rows and columns in the dataframe. This is a test, have commented it. \n",
685 | "# new.shape[0] #gives number of row count\n",
686 | "# new.shape[1] #gives number of col count\n",
687 | "\n",
688 | "# Hardcoding this because I couldn't figure out how to convert Numbers to Strings in the header. Tried astype() and to_string, both didn't work....\n",
689 | "new.columns = ['Rank', 'Documents', 'Citable documents', 'Citations', 'Self-citations', 'Citations per document', 'H index', 'Energy Supply', 'Energy Supply per Capita', '% Renewable', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']\n",
690 | "\n",
691 | "def answer_one():\n",
692 | " return new\n",
693 | "\n",
694 | "answer_one()"
695 | ]
696 | },
697 | {
698 | "cell_type": "code",
699 | "execution_count": null,
700 | "metadata": {
701 | "collapsed": true
702 | },
703 | "outputs": [],
704 | "source": []
705 | },
706 | {
707 | "cell_type": "markdown",
708 | "metadata": {},
709 | "source": [
710 | "### Question 2 (6.6%)\n",
711 | "The previous question joined three datasets then reduced this to just the top 15 entries. When you joined the datasets, but before you reduced this to the top 15 items, how many entries did you lose?\n",
712 | "\n",
713 | "*This function should return a single number.*"
714 | ]
715 | },
716 | {
717 | "cell_type": "code",
718 | "execution_count": 58,
719 | "metadata": {
720 | "collapsed": false
721 | },
722 | "outputs": [
723 | {
724 | "data": {
725 | "text/html": [
726 | "\n",
727 | " \n",
728 | " \n",
729 | " \n",
730 | " \n",
731 | " Everything but this! \n",
732 | " "
733 | ],
734 | "text/plain": [
735 | ""
736 | ]
737 | },
738 | "metadata": {},
739 | "output_type": "display_data"
740 | }
741 | ],
742 | "source": [
743 | "%%HTML\n",
744 | "\n",
745 | " \n",
746 | " \n",
747 | " \n",
748 | " \n",
749 | " Everything but this! \n",
750 | " "
751 | ]
752 | },
753 | {
754 | "cell_type": "code",
755 | "execution_count": null,
756 | "metadata": {
757 | "collapsed": true
758 | },
759 | "outputs": [],
760 | "source": []
761 | },
762 | {
763 | "cell_type": "code",
764 | "execution_count": null,
765 | "metadata": {
766 | "collapsed": false
767 | },
768 | "outputs": [],
769 | "source": [
770 | "def answer_two():\n",
771 | " return \"ANSWER\""
772 | ]
773 | },
774 | {
775 | "cell_type": "markdown",
776 | "metadata": {},
777 | "source": [
778 | " \n",
779 | "\n",
780 | "Answer the following questions in the context of only the top 15 countries by Scimagojr Rank (aka the DataFrame returned by `answer_one()`)"
781 | ]
782 | },
783 | {
784 | "cell_type": "markdown",
785 | "metadata": {},
786 | "source": [
787 | "### Question 3 (6.6%)\n",
788 | "What is the average GDP over the last 10 years for each country?\n",
789 | "\n",
790 | "*This function should return a Series named `avgGDP` with 15 countries and their average GDP sorted in descending order.*"
791 | ]
792 | },
793 | {
794 | "cell_type": "code",
795 | "execution_count": 84,
796 | "metadata": {
797 | "collapsed": false,
798 | "scrolled": true
799 | },
800 | "outputs": [
801 | {
802 | "name": "stderr",
803 | "output_type": "stream",
804 | "text": [
805 | "/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:8: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)\n"
806 | ]
807 | },
808 | {
809 | "data": {
810 | "text/plain": [
811 | "Country\n",
812 | "United States 1.536434e+13\n",
813 | "China 6.348609e+12\n",
814 | "Japan 5.542208e+12\n",
815 | "Germany 3.493025e+12\n",
816 | "France 2.681725e+12\n",
817 | "United Kingdom 2.487907e+12\n",
818 | "Brazil 2.189794e+12\n",
819 | "Italy 2.120175e+12\n",
820 | "India 1.769297e+12\n",
821 | "Canada 1.660647e+12\n",
822 | "Russian Federation 1.565459e+12\n",
823 | "Spain 1.418078e+12\n",
824 | "Australia 1.164043e+12\n",
825 | "South Korea 1.106715e+12\n",
826 | "Iran 4.441558e+11\n",
827 | "Name: avgGDP, dtype: float64"
828 | ]
829 | },
830 | "execution_count": 84,
831 | "metadata": {},
832 | "output_type": "execute_result"
833 | }
834 | ],
835 | "source": [
836 | "# Need to create a new dataframe for each question, otherwise the autograder would think that I'm creating new columns for the DF created in question 1, and will stop working...\n",
837 | "question3=new.copy()\n",
838 | "\n",
839 | "# The 2015 GDP data for Iran is NAN, so I had to use np.mean() instead of hard code it as df.sum()/10.... Probably better this way.\n",
840 | "question3[\"avgGDP\"]=question3[['2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015']].mean(axis=1)\n",
841 | "\n",
842 | "# Sort by descending order\n",
843 | "question3.sort('avgGDP', ascending=False,inplace=True)\n",
844 | "\n",
845 | "# Create a new data series named \"avgGDP\"\n",
846 | "avgGDP_series= question3.ix[:,'avgGDP']\n",
847 | "\n",
848 | "def answer_three():\n",
849 | " Top15 = answer_one()\n",
850 | " return avgGDP_series\n",
851 | "answer_three()"
852 | ]
853 | },
854 | {
855 | "cell_type": "markdown",
856 | "metadata": {},
857 | "source": [
858 | "### Question 4 (6.6%)\n",
859 | "By how much had the GDP changed over the 10 year span for the country with the 6th largest average GDP?\n",
860 | "\n",
861 | "*This function should return a single number.*"
862 | ]
863 | },
864 | {
865 | "cell_type": "code",
866 | "execution_count": 148,
867 | "metadata": {
868 | "collapsed": false,
869 | "scrolled": true
870 | },
871 | "outputs": [
872 | {
873 | "data": {
874 | "text/plain": [
875 | "246702696075.3999"
876 | ]
877 | },
878 | "execution_count": 148,
879 | "metadata": {},
880 | "output_type": "execute_result"
881 | }
882 | ],
883 | "source": [
884 | "question4 = question3.copy()\n",
885 | "question4.drop(question4.columns[0:10],axis=1, inplace=True)\n",
886 | "\n",
887 | "diff=question4.iloc[5][9]-question4.iloc[5][0]\n",
888 | "\n",
889 | "def answer_four():\n",
890 | " Top15 = answer_one()\n",
891 | " return diff\n",
892 | "\n",
893 | "answer_four()"
894 | ]
895 | },
896 | {
897 | "cell_type": "code",
898 | "execution_count": null,
899 | "metadata": {
900 | "collapsed": true
901 | },
902 | "outputs": [],
903 | "source": []
904 | },
905 | {
906 | "cell_type": "markdown",
907 | "metadata": {},
908 | "source": [
909 | "### Question 5 (6.6%)\n",
910 | "What is the mean energy supply per capita?\n",
911 | "\n",
912 | "*This function should return a single number.*"
913 | ]
914 | },
915 | {
916 | "cell_type": "code",
917 | "execution_count": 157,
918 | "metadata": {
919 | "collapsed": false
920 | },
921 | "outputs": [
922 | {
923 | "data": {
924 | "text/plain": [
925 | "157.6"
926 | ]
927 | },
928 | "execution_count": 157,
929 | "metadata": {},
930 | "output_type": "execute_result"
931 | }
932 | ],
933 | "source": [
934 | "question5 = new.copy()\n",
935 | "\n",
936 | "# Use item() to convert a numpy.float object to normal Python object (float)\n",
937 | "def answer_five():\n",
938 | " return question5['Energy Supply per Capita'].mean().item()\n",
939 | "answer_five()"
940 | ]
941 | },
942 | {
943 | "cell_type": "markdown",
944 | "metadata": {},
945 | "source": [
946 | "### Question 6 (6.6%)\n",
947 | "What country has the maximum % Renewable and what is the percentage?\n",
948 | "\n",
949 | "*This function should return a tuple with the name of the country and the percentage.*"
950 | ]
951 | },
952 | {
953 | "cell_type": "code",
954 | "execution_count": 170,
955 | "metadata": {
956 | "collapsed": false
957 | },
958 | "outputs": [
959 | {
960 | "name": "stderr",
961 | "output_type": "stream",
962 | "text": [
963 | "/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:2: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)\n",
964 | " from ipykernel import kernelapp as app\n"
965 | ]
966 | },
967 | {
968 | "data": {
969 | "text/plain": [
970 | "('Brazil', 69.64803)"
971 | ]
972 | },
973 | "execution_count": 170,
974 | "metadata": {},
975 | "output_type": "execute_result"
976 | }
977 | ],
978 | "source": [
979 | "question6= new.copy()\n",
980 | "\n",
981 | "# Sort by '% Renewable'\n",
982 | "question6.sort('% Renewable',ascending=False,inplace=True)\n",
983 | "\n",
984 | "# Return as a tuple\n",
985 | "def answer_six():\n",
986 | " return (question6.iloc[0].name,question6.iloc[0]['% Renewable'])\n",
987 | "answer_six()"
988 | ]
989 | },
990 | {
991 | "cell_type": "markdown",
992 | "metadata": {},
993 | "source": [
994 | "### Question 7 (6.6%)\n",
995 | "Create a new column that is the ratio of Self-Citations to Total Citations. \n",
996 | "What is the maximum value for this new column, and what country has the highest ratio?\n",
997 | "\n",
998 | "*This function should return a tuple with the name of the country and the ratio.*"
999 | ]
1000 | },
1001 | {
1002 | "cell_type": "code",
1003 | "execution_count": 212,
1004 | "metadata": {
1005 | "collapsed": false
1006 | },
1007 | "outputs": [
1008 | {
1009 | "data": {
1010 | "text/plain": [
1011 | "('China', 0.68931261793894216)"
1012 | ]
1013 | },
1014 | "execution_count": 212,
1015 | "metadata": {},
1016 | "output_type": "execute_result"
1017 | }
1018 | ],
1019 | "source": [
1020 | "question7=new.copy()\n",
1021 | "\n",
1022 | "question7['Citation Ratio']=question7['Self-citations']/question7['Citations']\n",
1023 | "\n",
1024 | "name_of_country = question7[question7['Citation Ratio']==question7['Citation Ratio'].max()].index.values.item()\n",
1025 | "value = question7['Citation Ratio'].max()\n",
1026 | "def answer_seven():\n",
1027 | " return (name_of_country,value)\n",
1028 | "answer_seven()"
1029 | ]
1030 | },
1031 | {
1032 | "cell_type": "code",
1033 | "execution_count": null,
1034 | "metadata": {
1035 | "collapsed": true
1036 | },
1037 | "outputs": [],
1038 | "source": []
1039 | },
1040 | {
1041 | "cell_type": "markdown",
1042 | "metadata": {},
1043 | "source": [
1044 | "### Question 8 (6.6%)\n",
1045 | "\n",
1046 | "Create a column that estimates the population using Energy Supply and Energy Supply per capita. \n",
1047 | "What is the third most populous country according to this estimate?\n",
1048 | "\n",
1049 | "*This function should return a single string value.*"
1050 | ]
1051 | },
1052 | {
1053 | "cell_type": "code",
1054 | "execution_count": 3,
1055 | "metadata": {
1056 | "collapsed": false
1057 | },
1058 | "outputs": [
1059 | {
1060 | "name": "stderr",
1061 | "output_type": "stream",
1062 | "text": [
1063 | "/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:6: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)\n"
1064 | ]
1065 | },
1066 | {
1067 | "data": {
1068 | "text/plain": [
1069 | "'United States'"
1070 | ]
1071 | },
1072 | "execution_count": 3,
1073 | "metadata": {},
1074 | "output_type": "execute_result"
1075 | }
1076 | ],
1077 | "source": [
1078 | "question8 = new.copy()\n",
1079 | "# Create a new column for 'Population'\n",
1080 | "question8['Population']=question8['Energy Supply']/question8['Energy Supply per Capita']\n",
1081 | "\n",
1082 | "# Sort the dataframe by 'Population'\n",
1083 | "question8.sort('Population',ascending=False,inplace=True)\n",
1084 | "\n",
1085 | "# Find the 3rd most populous country, and return the index (country name) with index.values. Item() converts it into a Python string\n",
1086 | "third_populous_country = question8[question8['Population']==question8['Population'][2]].index.values.item()\n",
1087 | "def answer_eight():\n",
1088 | " return third_populous_country\n",
1089 | "\n",
1090 | "answer_eight()"
1091 | ]
1092 | },
1093 | {
1094 | "cell_type": "code",
1095 | "execution_count": null,
1096 | "metadata": {
1097 | "collapsed": true
1098 | },
1099 | "outputs": [],
1100 | "source": []
1101 | },
1102 | {
1103 | "cell_type": "markdown",
1104 | "metadata": {},
1105 | "source": [
1106 | "### Question 9\n",
1107 | "Create a column that estimates the number of citable documents per person. \n",
1108 | "What is the correlation between the number of citable documents per capita and the energy supply per capita? Use the `.corr()` method, (Pearson's correlation).\n",
1109 | "\n",
1110 | "*This function should return a single number.*\n",
1111 | "\n",
1112 | "*(Optional: Use the built-in function `plot9()` to visualize the relationship between Energy Supply per Capita vs. Citable docs per Capita)*"
1113 | ]
1114 | },
1115 | {
1116 | "cell_type": "code",
1117 | "execution_count": 47,
1118 | "metadata": {
1119 | "collapsed": false
1120 | },
1121 | "outputs": [],
1122 | "source": [
1123 | "# Can't figure this out. Ditched it. \n",
1124 | "# I want chips.\n",
1125 | "# Going out now to buy chips...\n",
1126 | "\n",
1127 | "# question9=question8.copy()\n",
1128 | "\n",
1129 | "# question9['Citable document per Capita']=question9['Citable documents']/question9['Population']\n",
1130 | "# sub = question9[['Citable document per Capita','Energy Supply per Capita']]\n",
1131 | "\n",
1132 | "# # def answer_nine():\n",
1133 | "# # return \"ANSWER\""
1134 | ]
1135 | },
1136 | {
1137 | "cell_type": "code",
1138 | "execution_count": 214,
1139 | "metadata": {
1140 | "collapsed": false
1141 | },
1142 | "outputs": [],
1143 | "source": [
1144 | "# def plot9():\n",
1145 | "# import matplotlib as plt\n",
1146 | "# %matplotlib inline\n",
1147 | " \n",
1148 | "# Top15 = answer_one()\n",
1149 | "# Top15['PopEst'] = Top15['Energy Supply'] / Top15['Energy Supply per Capita']\n",
1150 | "# Top15['Citable docs per Capita'] = Top15['Citable documents'] / Top15['PopEst']\n",
1151 | "# Top15.plot(x='Citable docs per Capita', y='Energy Supply per Capita', kind='scatter', xlim=[0, 0.0006])"
1152 | ]
1153 | },
1154 | {
1155 | "cell_type": "code",
1156 | "execution_count": null,
1157 | "metadata": {
1158 | "collapsed": true
1159 | },
1160 | "outputs": [],
1161 | "source": [
1162 | "#"
1163 | ]
1164 | },
1165 | {
1166 | "cell_type": "code",
1167 | "execution_count": null,
1168 | "metadata": {
1169 | "collapsed": false
1170 | },
1171 | "outputs": [],
1172 | "source": [
1173 | "#plot9() # Be sure to comment out plot9() before submitting the assignment!"
1174 | ]
1175 | },
1176 | {
1177 | "cell_type": "markdown",
1178 | "metadata": {},
1179 | "source": [
1180 | "### Question 10 (6.6%)\n",
1181 | "Create a new column with a 1 if the country's % Renewable value is at or above the median for all countries in the top 15, and a 0 if the country's % Renewable value is below the median.\n",
1182 | "\n",
1183 | "*This function should return a series named `HighRenew` whose index is the country name sorted in ascending order of rank.*"
1184 | ]
1185 | },
1186 | {
1187 | "cell_type": "code",
1188 | "execution_count": 74,
1189 | "metadata": {
1190 | "collapsed": false
1191 | },
1192 | "outputs": [
1193 | {
1194 | "name": "stderr",
1195 | "output_type": "stream",
1196 | "text": [
1197 | "/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:7: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)\n"
1198 | ]
1199 | },
1200 | {
1201 | "data": {
1202 | "text/plain": [
1203 | "Country\n",
1204 | "China 1\n",
1205 | "United States 0\n",
1206 | "Japan 0\n",
1207 | "United Kingdom 0\n",
1208 | "Russian Federation 1\n",
1209 | "Canada 1\n",
1210 | "Germany 1\n",
1211 | "India 0\n",
1212 | "France 1\n",
1213 | "South Korea 0\n",
1214 | "Italy 1\n",
1215 | "Spain 1\n",
1216 | "Iran 0\n",
1217 | "Australia 0\n",
1218 | "Brazil 1\n",
1219 | "Name: HighRenew, dtype: int64"
1220 | ]
1221 | },
1222 | "execution_count": 74,
1223 | "metadata": {},
1224 | "output_type": "execute_result"
1225 | }
1226 | ],
1227 | "source": [
1228 | "question10=new.copy()\n",
1229 | "\n",
1230 | "median = question10['% Renewable'].median()\n",
1231 | "\n",
1232 | "question10['HighRenew'] = np.where(question10['% Renewable'] >= median,1,0)\n",
1233 | "question10['HighRenew'] = question10['HighRenew'].replace(np.nan,0)\n",
1234 | "question10.sort('Rank',ascending=True, inplace=True)\n",
1235 | "HighRenew= question10.ix[:,'HighRenew']\n",
1236 | "\n",
1237 | "def answer_ten():\n",
1238 | " return HighRenew\n",
1239 | "\n",
1240 | "answer_ten()"
1241 | ]
1242 | },
1243 | {
1244 | "cell_type": "markdown",
1245 | "metadata": {},
1246 | "source": [
1247 | "### Question 11 (6.6%)\n",
1248 | "Use the following dictionary to group the Countries by Continent, then create a dataframe that displays the sample size (the number of countries in each continent bin), and the sum, mean, and std deviation for the estimated population of each country.\n",
1249 | "\n",
1250 | "```python\n",
1251 | "ContinentDict = {'China':'Asia', \n",
1252 | " 'United States':'North America', \n",
1253 | " 'Japan':'Asia', \n",
1254 | " 'United Kingdom':'Europe', \n",
1255 | " 'Russian Federation':'Europe', \n",
1256 | " 'Canada':'North America', \n",
1257 | " 'Germany':'Europe', \n",
1258 | " 'India':'Asia',\n",
1259 | " 'France':'Europe', \n",
1260 | " 'South Korea':'Asia', \n",
1261 | " 'Italy':'Europe', \n",
1262 | " 'Spain':'Europe', \n",
1263 | " 'Iran':'Asia',\n",
1264 | " 'Australia':'Australia', \n",
1265 | " 'Brazil':'South America'}\n",
1266 | "```\n",
1267 | "\n",
1268 | "*This function should return a DataFrame with index named Continent `['Asia', 'Australia', 'Europe', 'North America', 'South America']` and columns `['size', 'sum', 'mean', 'std']`*"
1269 | ]
1270 | },
1271 | {
1272 | "cell_type": "code",
1273 | "execution_count": 163,
1274 | "metadata": {
1275 | "collapsed": false
1276 | },
1277 | "outputs": [
1278 | {
1279 | "name": "stderr",
1280 | "output_type": "stream",
1281 | "text": [
1282 | "/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:23: FutureWarning: convert_objects is deprecated. Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.\n"
1283 | ]
1284 | },
1285 | {
1286 | "data": {
1287 | "text/html": [
1288 | "\n",
1289 | "
\n",
1290 | " \n",
1291 | " \n",
1292 | " \n",
1293 | " size \n",
1294 | " sum \n",
1295 | " mean \n",
1296 | " std \n",
1297 | " \n",
1298 | " \n",
1299 | " Continent \n",
1300 | " \n",
1301 | " \n",
1302 | " \n",
1303 | " \n",
1304 | " \n",
1305 | " \n",
1306 | " \n",
1307 | " \n",
1308 | " Asia \n",
1309 | " 5 \n",
1310 | " 2.898666e+09 \n",
1311 | " 5.797333e+08 \n",
1312 | " 6.790979e+08 \n",
1313 | " \n",
1314 | " \n",
1315 | " Australia \n",
1316 | " 1 \n",
1317 | " 2.331602e+07 \n",
1318 | " 2.331602e+07 \n",
1319 | " NaN \n",
1320 | " \n",
1321 | " \n",
1322 | " Europe \n",
1323 | " 6 \n",
1324 | " 4.579297e+08 \n",
1325 | " 7.632161e+07 \n",
1326 | " 3.464767e+07 \n",
1327 | " \n",
1328 | " \n",
1329 | " North America \n",
1330 | " 2 \n",
1331 | " 3.528552e+08 \n",
1332 | " 1.764276e+08 \n",
1333 | " 1.996696e+08 \n",
1334 | " \n",
1335 | " \n",
1336 | " South America \n",
1337 | " 1 \n",
1338 | " 2.059153e+08 \n",
1339 | " 2.059153e+08 \n",
1340 | " NaN \n",
1341 | " \n",
1342 | " \n",
1343 | "
\n",
1344 | "
"
1345 | ],
1346 | "text/plain": [
1347 | " size sum mean std\n",
1348 | "Continent \n",
1349 | "Asia 5 2.898666e+09 5.797333e+08 6.790979e+08\n",
1350 | "Australia 1 2.331602e+07 2.331602e+07 NaN\n",
1351 | "Europe 6 4.579297e+08 7.632161e+07 3.464767e+07\n",
1352 | "North America 2 3.528552e+08 1.764276e+08 1.996696e+08\n",
1353 | "South America 1 2.059153e+08 2.059153e+08 NaN"
1354 | ]
1355 | },
1356 | "execution_count": 163,
1357 | "metadata": {},
1358 | "output_type": "execute_result"
1359 | }
1360 | ],
1361 | "source": [
1362 | "question11 = question8.copy()\n",
1363 | "question11.drop(question11.columns[:-1],axis=1,inplace=True)\n",
1364 | "\n",
1365 | "ContinentDict = {'China':'Asia', \n",
1366 | " 'United States':'North America', \n",
1367 | " 'Japan':'Asia', \n",
1368 | " 'United Kingdom':'Europe', \n",
1369 | " 'Russian Federation':'Europe', \n",
1370 | " 'Canada':'North America', \n",
1371 | " 'Germany':'Europe', \n",
1372 | " 'India':'Asia',\n",
1373 | " 'France':'Europe', \n",
1374 | " 'South Korea':'Asia', \n",
1375 | " 'Italy':'Europe', \n",
1376 | " 'Spain':'Europe', \n",
1377 | " 'Iran':'Asia',\n",
1378 | " 'Australia':'Australia', \n",
1379 | " 'Brazil':'South America'}\n",
1380 | "question11.reset_index(inplace=1)\n",
1381 | "question11['Continent']=question11['Country'].map(ContinentDict)\n",
1382 | "\n",
1383 | "\n",
1384 | "question11['Population'] = question11['Population'].convert_objects(convert_numeric=True)\n",
1385 | "\n",
1386 | "PopSize=question11.groupby('Continent').agg({'Continent':np.count_nonzero})\n",
1387 | "PopSum=question11.groupby('Continent').agg({'Population':np.sum})\n",
1388 | "PopMean=question11.groupby('Continent').agg({'Population':np.average})\n",
1389 | "PopStd=question11.groupby('Continent').agg({'Population':np.std})\n",
1390 | "\n",
1391 | "PopTotal = pd.concat([PopSize,PopSum,PopMean,PopStd],axis=1)\n",
1392 | "PopTotal.columns = ['size', 'sum', 'mean', 'std']\n",
1393 | "\n",
1394 | "\n",
1395 | "def answer_eleven():\n",
1396 | " return PopTotal\n",
1397 | "answer_eleven()"
1398 | ]
1399 | },
1400 | {
1401 | "cell_type": "markdown",
1402 | "metadata": {},
1403 | "source": [
1404 | "### Question 12 (6.6%)\n",
1405 | "Cut % Renewable into 5 bins. Group Top15 by the Continent, as well as these new % Renewable bins. How many countries are in each of these groups?\n",
1406 | "\n",
1407 | "*This function should return a Series with a MultiIndex of `Continent`, then the bins for `% Renewable`. Do not include groups with no countries.*"
1408 | ]
1409 | },
1410 | {
1411 | "cell_type": "code",
1412 | "execution_count": null,
1413 | "metadata": {
1414 | "collapsed": false,
1415 | "scrolled": true
1416 | },
1417 | "outputs": [],
1418 | "source": [
1419 | "def answer_twelve():\n",
1420 | " Top15 = answer_one()\n",
1421 | " return \"ANSWER\""
1422 | ]
1423 | },
1424 | {
1425 | "cell_type": "markdown",
1426 | "metadata": {},
1427 | "source": [
1428 | "### Question 13 (6.6%)\n",
1429 | "Convert the Population Estimate series to a string with thousands separator (using commas). Do not round the results.\n",
1430 | "\n",
1431 | "e.g. 317615384.61538464 -> 317,615,384.61538464\n",
1432 | "\n",
1433 | "*This function should return a Series `PopEst` whose index is the country name and whose values are the population estimate string.*"
1434 | ]
1435 | },
1436 | {
1437 | "cell_type": "code",
1438 | "execution_count": 95,
1439 | "metadata": {
1440 | "collapsed": false,
1441 | "scrolled": true
1442 | },
1443 | "outputs": [
1444 | {
1445 | "data": {
1446 | "text/plain": [
1447 | "Country\n",
1448 | "China 1,367,645,161.2903225\n",
1449 | "India 1,276,730,769.2307692\n",
1450 | "United States 317,615,384.61538464\n",
1451 | "Brazil 205,915,254.23728815\n",
1452 | "Russian Federation 143,500,000.0\n",
1453 | "Japan 127,409,395.97315437\n",
1454 | "Germany 80,369,696.96969697\n",
1455 | "Iran 77,075,630.25210084\n",
1456 | "United Kingdom 63,870,967.741935484\n",
1457 | "France 63,837,349.39759036\n",
1458 | "Italy 59,908,256.880733944\n",
1459 | "South Korea 49,805,429.864253394\n",
1460 | "Spain 46,443,396.2264151\n",
1461 | "Canada 35,239,864.86486486\n",
1462 | "Australia 23,316,017.316017315\n",
1463 | "Name: PopEst, dtype: object"
1464 | ]
1465 | },
1466 | "execution_count": 95,
1467 | "metadata": {},
1468 | "output_type": "execute_result"
1469 | }
1470 | ],
1471 | "source": [
1472 | "question13=question8.copy()\n",
1473 | "\n",
1474 | "# Keep only the 'Population' Column\n",
1475 | "question13.drop(question13.columns[:-1],axis=1,inplace=True)\n",
1476 | "\n",
1477 | "# Use format() to add the thousands separator\n",
1478 | "question13['PopEst'] = question13['Population'].apply(lambda x : '{:,}'.format(x))\n",
1479 | "\n",
1480 | "# Create a data series for the output\n",
1481 | "question13_series= question13.ix[:,'PopEst']\n",
1482 | "\n",
1483 | "def answer_thirteen():\n",
1484 | " return question13_series\n",
1485 | "\n",
1486 | "answer_thirteen()"
1487 | ]
1488 | },
1489 | {
1490 | "cell_type": "markdown",
1491 | "metadata": {},
1492 | "source": [
1493 | "### Optional\n",
1494 | "\n",
1495 | "Use the built-in function `plot_optional()` to see an example visualization."
1496 | ]
1497 | },
1498 | {
1499 | "cell_type": "code",
1500 | "execution_count": null,
1501 | "metadata": {
1502 | "collapsed": false,
1503 | "scrolled": true
1504 | },
1505 | "outputs": [],
1506 | "source": [
1507 | "def plot_optional():\n",
1508 | " import matplotlib as plt\n",
1509 | " %matplotlib inline\n",
1510 | " Top15 = answer_one()\n",
1511 | " ax = Top15.plot(x='Rank', y='% Renewable', kind='scatter', \n",
1512 | " c=['#e41a1c','#377eb8','#e41a1c','#4daf4a','#4daf4a','#377eb8','#4daf4a','#e41a1c',\n",
1513 | " '#4daf4a','#e41a1c','#4daf4a','#4daf4a','#e41a1c','#dede00','#ff7f00'], \n",
1514 | " xticks=range(1,16), s=6*Top15['2014']/10**10, alpha=.75, figsize=[16,6]);\n",
1515 | "\n",
1516 | " for i, txt in enumerate(Top15.index):\n",
1517 | " ax.annotate(txt, [Top15['Rank'][i], Top15['% Renewable'][i]], ha='center')\n",
1518 | "\n",
1519 | " print(\"This is an example of a visualization that can be created to help understand the data. \\\n",
1520 | "This is a bubble chart showing % Renewable vs. Rank. The size of the bubble corresponds to the countries' \\\n",
1521 | "2014 GDP, and the color corresponds to the continent.\")"
1522 | ]
1523 | },
1524 | {
1525 | "cell_type": "code",
1526 | "execution_count": null,
1527 | "metadata": {
1528 | "collapsed": false
1529 | },
1530 | "outputs": [],
1531 | "source": [
1532 | "#plot_optional() # Be sure to comment out plot_optional() before submitting the assignment!"
1533 | ]
1534 | }
1535 | ],
1536 | "metadata": {
1537 | "anaconda-cloud": {},
1538 | "coursera": {
1539 | "course_slug": "python-data-analysis",
1540 | "graded_item_id": "zAr06",
1541 | "launcher_item_id": "KSSjT",
1542 | "part_id": "SL3fU"
1543 | },
1544 | "kernelspec": {
1545 | "display_name": "Python 3",
1546 | "language": "python",
1547 | "name": "python3"
1548 | },
1549 | "language_info": {
1550 | "codemirror_mode": {
1551 | "name": "ipython",
1552 | "version": 3
1553 | },
1554 | "file_extension": ".py",
1555 | "mimetype": "text/x-python",
1556 | "name": "python",
1557 | "nbconvert_exporter": "python",
1558 | "pygments_lexer": "ipython3",
1559 | "version": "3.5.2"
1560 | }
1561 | },
1562 | "nbformat": 4,
1563 | "nbformat_minor": 0
1564 | }
1565 |
--------------------------------------------------------------------------------
/Week1/Week+1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "---\n",
8 | "\n",
9 | "_You are currently looking at **version 1.1** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-data-analysis/resources/0dhYG) course resource._\n",
10 | "\n",
11 | "---"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "# The Python Programming Language: Functions"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "metadata": {
25 | "collapsed": false
26 | },
27 | "outputs": [
28 | {
29 | "data": {
30 | "text/plain": [
31 | "3"
32 | ]
33 | },
34 | "execution_count": 2,
35 | "metadata": {},
36 | "output_type": "execute_result"
37 | }
38 | ],
39 | "source": [
40 | "x = 1\n",
41 | "y = 2\n",
42 | "x + y"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 4,
48 | "metadata": {
49 | "collapsed": false
50 | },
51 | "outputs": [
52 | {
53 | "name": "stdout",
54 | "output_type": "stream",
55 | "text": [
56 | "x\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "print (\"x\")"
62 | ]
63 | },
64 | {
65 | "cell_type": "markdown",
66 | "metadata": {},
67 | "source": [
68 | " \n",
69 | "`add_numbers` is a function that takes two numbers and adds them together."
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": null,
75 | "metadata": {
76 | "collapsed": false
77 | },
78 | "outputs": [],
79 | "source": [
80 | "def add_numbers(x, y):\n",
81 | " return x + y\n",
82 | "\n",
83 | "add_numbers(1, 2)"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | " \n",
91 | "`add_numbers` updated to take an optional 3rd parameter. Using `print` allows printing of multiple expressions within a single cell."
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {
98 | "collapsed": false
99 | },
100 | "outputs": [],
101 | "source": [
102 | "def add_numbers(x,y,z=None):\n",
103 | " if (z==None):\n",
104 | " return x+y\n",
105 | " else:\n",
106 | " return x+y+z\n",
107 | "\n",
108 | "print(add_numbers(1, 2))\n",
109 | "print(add_numbers(1, 2, 3))"
110 | ]
111 | },
112 | {
113 | "cell_type": "markdown",
114 | "metadata": {},
115 | "source": [
116 | " \n",
117 | "`add_numbers` updated to take an optional flag parameter."
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 1,
123 | "metadata": {
124 | "collapsed": false
125 | },
126 | "outputs": [
127 | {
128 | "name": "stdout",
129 | "output_type": "stream",
130 | "text": [
131 | "Flag is true!\n",
132 | "3\n"
133 | ]
134 | }
135 | ],
136 | "source": [
137 | "def add_numbers(x, y, z=None, flag=False):\n",
138 | " if (flag):\n",
139 | " print('Flag is true!')\n",
140 | " if (z==None):\n",
141 | " return x + y\n",
142 | " else:\n",
143 | " return x + y + z\n",
144 | " \n",
145 | "print(add_numbers(1, 2, flag=True))"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | " \n",
153 | "Assign function `add_numbers` to variable `a`."
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 5,
159 | "metadata": {
160 | "collapsed": false
161 | },
162 | "outputs": [
163 | {
164 | "data": {
165 | "text/plain": [
166 | "3"
167 | ]
168 | },
169 | "execution_count": 5,
170 | "metadata": {},
171 | "output_type": "execute_result"
172 | }
173 | ],
174 | "source": [
175 | "def add_numbers(x,y):\n",
176 | " return x+y\n",
177 | "\n",
178 | "a = add_numbers\n",
179 | "a(1,2)"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "metadata": {
186 | "collapsed": true
187 | },
188 | "outputs": [],
189 | "source": []
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | " \n",
196 | "# The Python Programming Language: Types and Sequences"
197 | ]
198 | },
199 | {
200 | "cell_type": "markdown",
201 | "metadata": {},
202 | "source": [
203 | " \n",
204 | "Use `type` to return the object's type."
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {
211 | "collapsed": false
212 | },
213 | "outputs": [],
214 | "source": [
215 | "type('This is a string')"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 6,
221 | "metadata": {
222 | "collapsed": false
223 | },
224 | "outputs": [
225 | {
226 | "data": {
227 | "text/plain": [
228 | "NoneType"
229 | ]
230 | },
231 | "execution_count": 6,
232 | "metadata": {},
233 | "output_type": "execute_result"
234 | }
235 | ],
236 | "source": [
237 | "type(None)"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {
244 | "collapsed": true
245 | },
246 | "outputs": [],
247 | "source": []
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 7,
252 | "metadata": {
253 | "collapsed": false
254 | },
255 | "outputs": [
256 | {
257 | "data": {
258 | "text/plain": [
259 | "int"
260 | ]
261 | },
262 | "execution_count": 7,
263 | "metadata": {},
264 | "output_type": "execute_result"
265 | }
266 | ],
267 | "source": [
268 | "type(1)"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": null,
274 | "metadata": {
275 | "collapsed": true
276 | },
277 | "outputs": [],
278 | "source": []
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": 8,
283 | "metadata": {
284 | "collapsed": false
285 | },
286 | "outputs": [
287 | {
288 | "data": {
289 | "text/plain": [
290 | "float"
291 | ]
292 | },
293 | "execution_count": 8,
294 | "metadata": {},
295 | "output_type": "execute_result"
296 | }
297 | ],
298 | "source": [
299 | "type(1.0)"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "metadata": {
306 | "collapsed": true
307 | },
308 | "outputs": [],
309 | "source": []
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": 9,
314 | "metadata": {
315 | "collapsed": false
316 | },
317 | "outputs": [
318 | {
319 | "data": {
320 | "text/plain": [
321 | "function"
322 | ]
323 | },
324 | "execution_count": 9,
325 | "metadata": {},
326 | "output_type": "execute_result"
327 | }
328 | ],
329 | "source": [
330 | "type(add_numbers)"
331 | ]
332 | },
333 | {
334 | "cell_type": "code",
335 | "execution_count": null,
336 | "metadata": {
337 | "collapsed": true
338 | },
339 | "outputs": [],
340 | "source": []
341 | },
342 | {
343 | "cell_type": "markdown",
344 | "metadata": {},
345 | "source": [
346 | " \n",
347 | "Tuples are an immutable data structure (cannot be altered)."
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 10,
353 | "metadata": {
354 | "collapsed": false
355 | },
356 | "outputs": [
357 | {
358 | "data": {
359 | "text/plain": [
360 | "tuple"
361 | ]
362 | },
363 | "execution_count": 10,
364 | "metadata": {},
365 | "output_type": "execute_result"
366 | }
367 | ],
368 | "source": [
369 | "x = (1, 'a', 2, 'b')\n",
370 | "type(x)"
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": null,
376 | "metadata": {
377 | "collapsed": true
378 | },
379 | "outputs": [],
380 | "source": []
381 | },
382 | {
383 | "cell_type": "markdown",
384 | "metadata": {},
385 | "source": [
386 | " \n",
387 | "Lists are a mutable data structure."
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": 11,
393 | "metadata": {
394 | "collapsed": false
395 | },
396 | "outputs": [
397 | {
398 | "data": {
399 | "text/plain": [
400 | "list"
401 | ]
402 | },
403 | "execution_count": 11,
404 | "metadata": {},
405 | "output_type": "execute_result"
406 | }
407 | ],
408 | "source": [
409 | "x = [1, 'a', 2, 'b']\n",
410 | "type(x)"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": null,
416 | "metadata": {
417 | "collapsed": true
418 | },
419 | "outputs": [],
420 | "source": []
421 | },
422 | {
423 | "cell_type": "markdown",
424 | "metadata": {},
425 | "source": [
426 | " \n",
427 | "Use `append` to append an object to a list."
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": 12,
433 | "metadata": {
434 | "collapsed": false
435 | },
436 | "outputs": [
437 | {
438 | "name": "stdout",
439 | "output_type": "stream",
440 | "text": [
441 | "[1, 'a', 2, 'b', 3.3]\n"
442 | ]
443 | }
444 | ],
445 | "source": [
446 | "x.append(3.3)\n",
447 | "print(x)"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": null,
453 | "metadata": {
454 | "collapsed": true
455 | },
456 | "outputs": [],
457 | "source": []
458 | },
459 | {
460 | "cell_type": "markdown",
461 | "metadata": {},
462 | "source": [
463 | " \n",
464 | "This is an example of how to loop through each item in the list."
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": 13,
470 | "metadata": {
471 | "collapsed": false
472 | },
473 | "outputs": [
474 | {
475 | "name": "stdout",
476 | "output_type": "stream",
477 | "text": [
478 | "1\n",
479 | "a\n",
480 | "2\n",
481 | "b\n",
482 | "3.3\n"
483 | ]
484 | }
485 | ],
486 | "source": [
487 | "for item in x:\n",
488 | " print(item)"
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": null,
494 | "metadata": {
495 | "collapsed": true
496 | },
497 | "outputs": [],
498 | "source": []
499 | },
500 | {
501 | "cell_type": "markdown",
502 | "metadata": {},
503 | "source": [
504 | " \n",
505 | "Or using the indexing operator:"
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": 14,
511 | "metadata": {
512 | "collapsed": false
513 | },
514 | "outputs": [
515 | {
516 | "name": "stdout",
517 | "output_type": "stream",
518 | "text": [
519 | "1\n",
520 | "a\n",
521 | "2\n",
522 | "b\n",
523 | "3.3\n"
524 | ]
525 | }
526 | ],
527 | "source": [
528 | "i=0\n",
529 | "while( i != len(x) ):\n",
530 | " print(x[i])\n",
531 | " i = i + 1"
532 | ]
533 | },
534 | {
535 | "cell_type": "code",
536 | "execution_count": null,
537 | "metadata": {
538 | "collapsed": true
539 | },
540 | "outputs": [],
541 | "source": []
542 | },
543 | {
544 | "cell_type": "markdown",
545 | "metadata": {},
546 | "source": [
547 | " \n",
548 | "Use `+` to concatenate lists."
549 | ]
550 | },
551 | {
552 | "cell_type": "code",
553 | "execution_count": 15,
554 | "metadata": {
555 | "collapsed": false
556 | },
557 | "outputs": [
558 | {
559 | "data": {
560 | "text/plain": [
561 | "[1, 2, 3, 4]"
562 | ]
563 | },
564 | "execution_count": 15,
565 | "metadata": {},
566 | "output_type": "execute_result"
567 | }
568 | ],
569 | "source": [
570 | "[1,2] + [3,4]"
571 | ]
572 | },
573 | {
574 | "cell_type": "code",
575 | "execution_count": null,
576 | "metadata": {
577 | "collapsed": true
578 | },
579 | "outputs": [],
580 | "source": []
581 | },
582 | {
583 | "cell_type": "markdown",
584 | "metadata": {},
585 | "source": [
586 | " \n",
587 | "Use `*` to repeat lists."
588 | ]
589 | },
590 | {
591 | "cell_type": "code",
592 | "execution_count": 16,
593 | "metadata": {
594 | "collapsed": false
595 | },
596 | "outputs": [
597 | {
598 | "data": {
599 | "text/plain": [
600 | "[1, 1, 1]"
601 | ]
602 | },
603 | "execution_count": 16,
604 | "metadata": {},
605 | "output_type": "execute_result"
606 | }
607 | ],
608 | "source": [
609 | "[1]*3"
610 | ]
611 | },
612 | {
613 | "cell_type": "code",
614 | "execution_count": null,
615 | "metadata": {
616 | "collapsed": true
617 | },
618 | "outputs": [],
619 | "source": []
620 | },
621 | {
622 | "cell_type": "markdown",
623 | "metadata": {},
624 | "source": [
625 | " \n",
626 | "Use the `in` operator to check if something is inside a list."
627 | ]
628 | },
629 | {
630 | "cell_type": "code",
631 | "execution_count": 17,
632 | "metadata": {
633 | "collapsed": false
634 | },
635 | "outputs": [
636 | {
637 | "data": {
638 | "text/plain": [
639 | "True"
640 | ]
641 | },
642 | "execution_count": 17,
643 | "metadata": {},
644 | "output_type": "execute_result"
645 | }
646 | ],
647 | "source": [
648 | "1 in [1, 2, 3]"
649 | ]
650 | },
651 | {
652 | "cell_type": "code",
653 | "execution_count": null,
654 | "metadata": {
655 | "collapsed": true
656 | },
657 | "outputs": [],
658 | "source": []
659 | },
660 | {
661 | "cell_type": "markdown",
662 | "metadata": {},
663 | "source": [
664 | " \n",
665 | "Now let's look at strings. Use bracket notation to slice a string."
666 | ]
667 | },
668 | {
669 | "cell_type": "code",
670 | "execution_count": 18,
671 | "metadata": {
672 | "collapsed": false
673 | },
674 | "outputs": [
675 | {
676 | "name": "stdout",
677 | "output_type": "stream",
678 | "text": [
679 | "T\n",
680 | "T\n",
681 | "Th\n"
682 | ]
683 | }
684 | ],
685 | "source": [
686 | "x = 'This is a string'\n",
687 | "print(x[0]) #first character\n",
688 | "print(x[0:1]) #first character, but we have explicitly set the end character\n",
689 | "print(x[0:2]) #first two characters\n"
690 | ]
691 | },
692 | {
693 | "cell_type": "code",
694 | "execution_count": null,
695 | "metadata": {
696 | "collapsed": true
697 | },
698 | "outputs": [],
699 | "source": []
700 | },
701 | {
702 | "cell_type": "markdown",
703 | "metadata": {},
704 | "source": [
705 | " \n",
706 | "This will return the last element of the string."
707 | ]
708 | },
709 | {
710 | "cell_type": "code",
711 | "execution_count": 19,
712 | "metadata": {
713 | "collapsed": false
714 | },
715 | "outputs": [
716 | {
717 | "data": {
718 | "text/plain": [
719 | "'g'"
720 | ]
721 | },
722 | "execution_count": 19,
723 | "metadata": {},
724 | "output_type": "execute_result"
725 | }
726 | ],
727 | "source": [
728 | "x[-1]"
729 | ]
730 | },
731 | {
732 | "cell_type": "code",
733 | "execution_count": null,
734 | "metadata": {
735 | "collapsed": true
736 | },
737 | "outputs": [],
738 | "source": []
739 | },
740 | {
741 | "cell_type": "markdown",
742 | "metadata": {},
743 | "source": [
744 | " \n",
745 | "This will return the slice starting from the 4th element from the end and stopping before the 2nd element from the end."
746 | ]
747 | },
748 | {
749 | "cell_type": "code",
750 | "execution_count": 20,
751 | "metadata": {
752 | "collapsed": false
753 | },
754 | "outputs": [
755 | {
756 | "data": {
757 | "text/plain": [
758 | "'ri'"
759 | ]
760 | },
761 | "execution_count": 20,
762 | "metadata": {},
763 | "output_type": "execute_result"
764 | }
765 | ],
766 | "source": [
767 | "x[-4:-2]"
768 | ]
769 | },
770 | {
771 | "cell_type": "code",
772 | "execution_count": null,
773 | "metadata": {
774 | "collapsed": true
775 | },
776 | "outputs": [],
777 | "source": []
778 | },
779 | {
780 | "cell_type": "markdown",
781 | "metadata": {},
782 | "source": [
783 | " \n",
784 | "This is a slice from the beginning of the string and stopping before the 3rd element."
785 | ]
786 | },
787 | {
788 | "cell_type": "code",
789 | "execution_count": 21,
790 | "metadata": {
791 | "collapsed": false
792 | },
793 | "outputs": [
794 | {
795 | "data": {
796 | "text/plain": [
797 | "'Thi'"
798 | ]
799 | },
800 | "execution_count": 21,
801 | "metadata": {},
802 | "output_type": "execute_result"
803 | }
804 | ],
805 | "source": [
806 | "x[:3]"
807 | ]
808 | },
809 | {
810 | "cell_type": "code",
811 | "execution_count": null,
812 | "metadata": {
813 | "collapsed": true
814 | },
815 | "outputs": [],
816 | "source": []
817 | },
818 | {
819 | "cell_type": "markdown",
820 | "metadata": {},
821 | "source": [
822 | " \n",
823 | "And this is a slice starting from the 3rd element of the string and going all the way to the end."
824 | ]
825 | },
826 | {
827 | "cell_type": "code",
828 | "execution_count": null,
829 | "metadata": {
830 | "collapsed": false
831 | },
832 | "outputs": [],
833 | "source": [
834 | "x[3:]"
835 | ]
836 | },
837 | {
838 | "cell_type": "code",
839 | "execution_count": 1,
840 | "metadata": {
841 | "collapsed": false
842 | },
843 | "outputs": [
844 | {
845 | "name": "stdout",
846 | "output_type": "stream",
847 | "text": [
848 | "Christopher Brooks\n",
849 | "ChristopherChristopherChristopher\n",
850 | "True\n"
851 | ]
852 | }
853 | ],
854 | "source": [
855 | "firstname = 'Christopher'\n",
856 | "lastname = 'Brooks'\n",
857 | "\n",
858 | "print(firstname + ' ' + lastname)\n",
859 | "print(firstname*3)\n",
860 | "print('Chris' in firstname)\n"
861 | ]
862 | },
863 | {
864 | "cell_type": "code",
865 | "execution_count": null,
866 | "metadata": {
867 | "collapsed": true
868 | },
869 | "outputs": [],
870 | "source": []
871 | },
872 | {
873 | "cell_type": "markdown",
874 | "metadata": {},
875 | "source": [
876 | " \n",
877 | "`split` returns a list of all the words in a string, or a list split on a specific character."
878 | ]
879 | },
880 | {
881 | "cell_type": "code",
882 | "execution_count": 2,
883 | "metadata": {
884 | "collapsed": false
885 | },
886 | "outputs": [
887 | {
888 | "name": "stdout",
889 | "output_type": "stream",
890 | "text": [
891 | "Christopher\n",
892 | "Brooks\n"
893 | ]
894 | }
895 | ],
896 | "source": [
897 | "firstname = 'Christopher Arthur Hansen Brooks'.split(' ')[0] # [0] selects the first element of the list\n",
898 | "lastname = 'Christopher Arthur Hansen Brooks'.split(' ')[-1] # [-1] selects the last element of the list\n",
899 | "print(firstname)\n",
900 | "print(lastname)"
901 | ]
902 | },
903 | {
904 | "cell_type": "code",
905 | "execution_count": null,
906 | "metadata": {
907 | "collapsed": true
908 | },
909 | "outputs": [],
910 | "source": []
911 | },
912 | {
913 | "cell_type": "markdown",
914 | "metadata": {},
915 | "source": [
916 | " \n",
917 | "Make sure you convert objects to strings before concatenating."
918 | ]
919 | },
920 | {
921 | "cell_type": "code",
922 | "execution_count": null,
923 | "metadata": {
924 | "collapsed": false
925 | },
926 | "outputs": [],
927 | "source": [
928 | "'Chris' + 2"
929 | ]
930 | },
931 | {
932 | "cell_type": "code",
933 | "execution_count": null,
934 | "metadata": {
935 | "collapsed": false
936 | },
937 | "outputs": [],
938 | "source": [
939 | "'Chris' + str(2)"
940 | ]
941 | },
942 | {
943 | "cell_type": "markdown",
944 | "metadata": {},
945 | "source": [
946 | " \n",
947 | "Dictionaries associate keys with values."
948 | ]
949 | },
950 | {
951 | "cell_type": "code",
952 | "execution_count": null,
953 | "metadata": {
954 | "collapsed": false
955 | },
956 | "outputs": [],
957 | "source": [
958 | "x = {'Christopher Brooks': 'brooksch@umich.edu', 'Bill Gates': 'billg@microsoft.com'}\n",
959 | "x['Christopher Brooks'] # Retrieve a value by using the indexing operator\n"
960 | ]
961 | },
962 | {
963 | "cell_type": "code",
964 | "execution_count": null,
965 | "metadata": {
966 | "collapsed": true
967 | },
968 | "outputs": [],
969 | "source": [
970 | "x['Kevyn Collins-Thompson'] = None\n",
971 | "x['Kevyn Collins-Thompson']"
972 | ]
973 | },
974 | {
975 | "cell_type": "markdown",
976 | "metadata": {},
977 | "source": [
978 | " \n",
979 | "Iterate over all of the keys:"
980 | ]
981 | },
982 | {
983 | "cell_type": "code",
984 | "execution_count": null,
985 | "metadata": {
986 | "collapsed": false,
987 | "scrolled": true
988 | },
989 | "outputs": [],
990 | "source": [
991 | "for name in x:\n",
992 | " print(x[name])"
993 | ]
994 | },
995 | {
996 | "cell_type": "markdown",
997 | "metadata": {},
998 | "source": [
999 | " \n",
1000 | "Iterate over all of the values:"
1001 | ]
1002 | },
1003 | {
1004 | "cell_type": "code",
1005 | "execution_count": null,
1006 | "metadata": {
1007 | "collapsed": false
1008 | },
1009 | "outputs": [],
1010 | "source": [
1011 | "for email in x.values():\n",
1012 | " print(email)"
1013 | ]
1014 | },
1015 | {
1016 | "cell_type": "markdown",
1017 | "metadata": {},
1018 | "source": [
1019 | " \n",
1020 | "Iterate over all of the items in the dictionary:"
1021 | ]
1022 | },
1023 | {
1024 | "cell_type": "code",
1025 | "execution_count": null,
1026 | "metadata": {
1027 | "collapsed": false
1028 | },
1029 | "outputs": [],
1030 | "source": [
1031 | "for name, email in x.items():\n",
1032 | " print(name)\n",
1033 | " print(email)"
1034 | ]
1035 | },
1036 | {
1037 | "cell_type": "markdown",
1038 | "metadata": {},
1039 | "source": [
1040 | " \n",
1041 | "You can unpack a sequence into different variables:"
1042 | ]
1043 | },
1044 | {
1045 | "cell_type": "code",
1046 | "execution_count": null,
1047 | "metadata": {
1048 | "collapsed": true
1049 | },
1050 | "outputs": [],
1051 | "source": [
1052 | "x = ('Christopher', 'Brooks', 'brooksch@umich.edu')\n",
1053 | "fname, lname, email = x"
1054 | ]
1055 | },
1056 | {
1057 | "cell_type": "code",
1058 | "execution_count": null,
1059 | "metadata": {
1060 | "collapsed": false
1061 | },
1062 | "outputs": [],
1063 | "source": [
1064 | "fname"
1065 | ]
1066 | },
1067 | {
1068 | "cell_type": "code",
1069 | "execution_count": null,
1070 | "metadata": {
1071 | "collapsed": false
1072 | },
1073 | "outputs": [],
1074 | "source": [
1075 | "lname"
1076 | ]
1077 | },
1078 | {
1079 | "cell_type": "markdown",
1080 | "metadata": {},
1081 | "source": [
1082 | " \n",
1083 | "Make sure the number of values you are unpacking matches the number of variables being assigned."
1084 | ]
1085 | },
1086 | {
1087 | "cell_type": "code",
1088 | "execution_count": null,
1089 | "metadata": {
1090 | "collapsed": false
1091 | },
1092 | "outputs": [],
1093 | "source": [
1094 | "x = ('Christopher', 'Brooks', 'brooksch@umich.edu', 'Ann Arbor')\n",
1095 | "fname, lname, email = x"
1096 | ]
1097 | },
1098 | {
1099 | "cell_type": "markdown",
1100 | "metadata": {},
1101 | "source": [
1102 | " \n",
1103 | "# The Python Programming Language: More on Strings"
1104 | ]
1105 | },
1106 | {
1107 | "cell_type": "code",
1108 | "execution_count": null,
1109 | "metadata": {
1110 | "collapsed": false
1111 | },
1112 | "outputs": [],
1113 | "source": [
1114 | "print('Chris' + 2)"
1115 | ]
1116 | },
1117 | {
1118 | "cell_type": "code",
1119 | "execution_count": null,
1120 | "metadata": {
1121 | "collapsed": false
1122 | },
1123 | "outputs": [],
1124 | "source": [
1125 | "print('Chris' + str(2))"
1126 | ]
1127 | },
1128 | {
1129 | "cell_type": "markdown",
1130 | "metadata": {},
1131 | "source": [
1132 | " \n",
1133 | "Python has a built in method for convenient string formatting."
1134 | ]
1135 | },
1136 | {
1137 | "cell_type": "code",
1138 | "execution_count": null,
1139 | "metadata": {
1140 | "collapsed": false
1141 | },
1142 | "outputs": [],
1143 | "source": [
1144 | "sales_record = {\n",
1145 | "'price': 3.24,\n",
1146 | "'num_items': 4,\n",
1147 | "'person': 'Chris'}\n",
1148 | "\n",
1149 | "sales_statement = '{} bought {} item(s) at a price of {} each for a total of {}'\n",
1150 | "\n",
1151 | "print(sales_statement.format(sales_record['person'],\n",
1152 | " sales_record['num_items'],\n",
1153 | " sales_record['price'],\n",
1154 | " sales_record['num_items']*sales_record['price']))\n"
1155 | ]
1156 | },
1157 | {
1158 | "cell_type": "markdown",
1159 | "metadata": {},
1160 | "source": [
1161 | " \n",
1162 | "# Reading and Writing CSV files"
1163 | ]
1164 | },
1165 | {
1166 | "cell_type": "markdown",
1167 | "metadata": {},
1168 | "source": [
1169 | " \n",
1170 | "Let's import our datafile mpg.csv, which contains fuel economy data for 234 cars.\n",
1171 | "\n",
1172 | "* mpg : miles per gallon\n",
1173 | "* class : car classification\n",
1174 | "* cty : city mpg\n",
1175 | "* cyl : # of cylinders\n",
1176 | "* displ : engine displacement in liters\n",
1177 | "* drv : f = front-wheel drive, r = rear wheel drive, 4 = 4wd\n",
1178 | "* fl : fuel (e = ethanol E85, d = diesel, r = regular, p = premium, c = CNG)\n",
1179 | "* hwy : highway mpg\n",
1180 | "* manufacturer : automobile manufacturer\n",
1181 | "* model : model of car\n",
1182 | "* trans : type of transmission\n",
1183 | "* year : model year"
1184 | ]
1185 | },
1186 | {
1187 | "cell_type": "code",
1188 | "execution_count": null,
1189 | "metadata": {
1190 | "collapsed": false,
1191 | "scrolled": true
1192 | },
1193 | "outputs": [],
1194 | "source": [
1195 | "import csv\n",
1196 | "\n",
1197 | "%precision 2\n",
1198 | "\n",
1199 | "with open('mpg.csv') as csvfile:\n",
1200 | " mpg = list(csv.DictReader(csvfile))\n",
1201 | " \n",
1202 | "mpg[:3] # The first three dictionaries in our list."
1203 | ]
1204 | },
1205 | {
1206 | "cell_type": "markdown",
1207 | "metadata": {},
1208 | "source": [
1209 | " \n",
1210 | "`csv.DictReader` has read in each row of our csv file as a dictionary. `len` shows that our list consists of 234 dictionaries."
1211 | ]
1212 | },
1213 | {
1214 | "cell_type": "code",
1215 | "execution_count": null,
1216 | "metadata": {
1217 | "collapsed": false
1218 | },
1219 | "outputs": [],
1220 | "source": [
1221 | "len(mpg)"
1222 | ]
1223 | },
1224 | {
1225 | "cell_type": "markdown",
1226 | "metadata": {},
1227 | "source": [
1228 | " \n",
1229 | "`keys` gives us the column names of our csv."
1230 | ]
1231 | },
1232 | {
1233 | "cell_type": "code",
1234 | "execution_count": null,
1235 | "metadata": {
1236 | "collapsed": false
1237 | },
1238 | "outputs": [],
1239 | "source": [
1240 | "mpg[0].keys()"
1241 | ]
1242 | },
1243 | {
1244 | "cell_type": "markdown",
1245 | "metadata": {},
1246 | "source": [
1247 | " \n",
1248 | "This is how to find the average cty fuel economy across all cars. All values in the dictionaries are strings, so we need to convert to float."
1249 | ]
1250 | },
1251 | {
1252 | "cell_type": "code",
1253 | "execution_count": null,
1254 | "metadata": {
1255 | "collapsed": false
1256 | },
1257 | "outputs": [],
1258 | "source": [
1259 | "sum(float(d['cty']) for d in mpg) / len(mpg)"
1260 | ]
1261 | },
1262 | {
1263 | "cell_type": "markdown",
1264 | "metadata": {},
1265 | "source": [
1266 | " \n",
1267 | "Similarly this is how to find the average hwy fuel economy across all cars."
1268 | ]
1269 | },
1270 | {
1271 | "cell_type": "code",
1272 | "execution_count": null,
1273 | "metadata": {
1274 | "collapsed": false
1275 | },
1276 | "outputs": [],
1277 | "source": [
1278 | "sum(float(d['hwy']) for d in mpg) / len(mpg)"
1279 | ]
1280 | },
1281 | {
1282 | "cell_type": "markdown",
1283 | "metadata": {},
1284 | "source": [
1285 | " \n",
1286 | "Use `set` to return the unique values for the number of cylinders the cars in our dataset have."
1287 | ]
1288 | },
1289 | {
1290 | "cell_type": "code",
1291 | "execution_count": null,
1292 | "metadata": {
1293 | "collapsed": false
1294 | },
1295 | "outputs": [],
1296 | "source": [
1297 | "cylinders = set(d['cyl'] for d in mpg)\n",
1298 | "cylinders"
1299 | ]
1300 | },
1301 | {
1302 | "cell_type": "markdown",
1303 | "metadata": {},
1304 | "source": [
1305 | " \n",
1306 | "Here's a more complex example where we are grouping the cars by number of cylinders, and finding the average cty mpg for each group."
1307 | ]
1308 | },
1309 | {
1310 | "cell_type": "code",
1311 | "execution_count": null,
1312 | "metadata": {
1313 | "collapsed": false
1314 | },
1315 | "outputs": [],
1316 | "source": [
1317 | "CtyMpgByCyl = []\n",
1318 | "\n",
1319 | "for c in cylinders: # iterate over all the cylinder levels\n",
1320 | " summpg = 0\n",
1321 | " cyltypecount = 0\n",
1322 | " for d in mpg: # iterate over all dictionaries\n",
1323 | " if d['cyl'] == c: # if the cylinder level type matches,\n",
1324 | " summpg += float(d['cty']) # add the cty mpg\n",
1325 | " cyltypecount += 1 # increment the count\n",
1326 | " CtyMpgByCyl.append((c, summpg / cyltypecount)) # append the tuple ('cylinder', 'avg mpg')\n",
1327 | "\n",
1328 | "CtyMpgByCyl.sort(key=lambda x: x[0])\n",
1329 | "CtyMpgByCyl"
1330 | ]
1331 | },
1332 | {
1333 | "cell_type": "markdown",
1334 | "metadata": {},
1335 | "source": [
1336 | " \n",
1337 | "Use `set` to return the unique values for the class types in our dataset."
1338 | ]
1339 | },
1340 | {
1341 | "cell_type": "code",
1342 | "execution_count": null,
1343 | "metadata": {
1344 | "collapsed": false
1345 | },
1346 | "outputs": [],
1347 | "source": [
1348 | "vehicleclass = set(d['class'] for d in mpg) # what are the class types\n",
1349 | "vehicleclass"
1350 | ]
1351 | },
1352 | {
1353 | "cell_type": "markdown",
1354 | "metadata": {},
1355 | "source": [
1356 | " \n",
1357 | "And here's an example of how to find the average hwy mpg for each class of vehicle in our dataset."
1358 | ]
1359 | },
1360 | {
1361 | "cell_type": "code",
1362 | "execution_count": null,
1363 | "metadata": {
1364 | "collapsed": false
1365 | },
1366 | "outputs": [],
1367 | "source": [
1368 | "HwyMpgByClass = []\n",
1369 | "\n",
1370 | "for t in vehicleclass: # iterate over all the vehicle classes\n",
1371 | " summpg = 0\n",
1372 | " vclasscount = 0\n",
1373 | " for d in mpg: # iterate over all dictionaries\n",
1374 | " if d['class'] == t: # if the cylinder amount type matches,\n",
1375 | " summpg += float(d['hwy']) # add the hwy mpg\n",
1376 | " vclasscount += 1 # increment the count\n",
1377 | " HwyMpgByClass.append((t, summpg / vclasscount)) # append the tuple ('class', 'avg mpg')\n",
1378 | "\n",
1379 | "HwyMpgByClass.sort(key=lambda x: x[1])\n",
1380 | "HwyMpgByClass"
1381 | ]
1382 | },
1383 | {
1384 | "cell_type": "markdown",
1385 | "metadata": {},
1386 | "source": [
1387 | " \n",
1388 | "# The Python Programming Language: Dates and Times"
1389 | ]
1390 | },
1391 | {
1392 | "cell_type": "code",
1393 | "execution_count": null,
1394 | "metadata": {
1395 | "collapsed": true
1396 | },
1397 | "outputs": [],
1398 | "source": [
1399 | "import datetime as dt\n",
1400 | "import time as tm"
1401 | ]
1402 | },
1403 | {
1404 | "cell_type": "markdown",
1405 | "metadata": {},
1406 | "source": [
1407 | " \n",
1408 | "`time` returns the current time in seconds since the Epoch. (January 1st, 1970)"
1409 | ]
1410 | },
1411 | {
1412 | "cell_type": "code",
1413 | "execution_count": null,
1414 | "metadata": {
1415 | "collapsed": false
1416 | },
1417 | "outputs": [],
1418 | "source": [
1419 | "tm.time()"
1420 | ]
1421 | },
1422 | {
1423 | "cell_type": "markdown",
1424 | "metadata": {},
1425 | "source": [
1426 | " \n",
1427 | "Convert the timestamp to datetime."
1428 | ]
1429 | },
1430 | {
1431 | "cell_type": "code",
1432 | "execution_count": null,
1433 | "metadata": {
1434 | "collapsed": false
1435 | },
1436 | "outputs": [],
1437 | "source": [
1438 | "dtnow = dt.datetime.fromtimestamp(tm.time())\n",
1439 | "dtnow"
1440 | ]
1441 | },
1442 | {
1443 | "cell_type": "markdown",
1444 | "metadata": {},
1445 | "source": [
1446 | " \n",
1447 | "Handy datetime attributes:"
1448 | ]
1449 | },
1450 | {
1451 | "cell_type": "code",
1452 | "execution_count": null,
1453 | "metadata": {
1454 | "collapsed": false
1455 | },
1456 | "outputs": [],
1457 | "source": [
1458 | "dtnow.year, dtnow.month, dtnow.day, dtnow.hour, dtnow.minute, dtnow.second # get year, month, day, etc.from a datetime"
1459 | ]
1460 | },
1461 | {
1462 | "cell_type": "markdown",
1463 | "metadata": {},
1464 | "source": [
1465 | " \n",
1466 | "`timedelta` is a duration expressing the difference between two dates."
1467 | ]
1468 | },
1469 | {
1470 | "cell_type": "code",
1471 | "execution_count": null,
1472 | "metadata": {
1473 | "collapsed": false
1474 | },
1475 | "outputs": [],
1476 | "source": [
1477 | "delta = dt.timedelta(days = 100) # create a timedelta of 100 days\n",
1478 | "delta"
1479 | ]
1480 | },
1481 | {
1482 | "cell_type": "markdown",
1483 | "metadata": {},
1484 | "source": [
1485 | " \n",
1486 | "`date.today` returns the current local date."
1487 | ]
1488 | },
1489 | {
1490 | "cell_type": "code",
1491 | "execution_count": null,
1492 | "metadata": {
1493 | "collapsed": true
1494 | },
1495 | "outputs": [],
1496 | "source": [
1497 | "today = dt.date.today()"
1498 | ]
1499 | },
1500 | {
1501 | "cell_type": "code",
1502 | "execution_count": null,
1503 | "metadata": {
1504 | "collapsed": false
1505 | },
1506 | "outputs": [],
1507 | "source": [
1508 | "today - delta # the date 100 days ago"
1509 | ]
1510 | },
1511 | {
1512 | "cell_type": "code",
1513 | "execution_count": null,
1514 | "metadata": {
1515 | "collapsed": false
1516 | },
1517 | "outputs": [],
1518 | "source": [
1519 | "today > today-delta # compare dates"
1520 | ]
1521 | },
1522 | {
1523 | "cell_type": "markdown",
1524 | "metadata": {},
1525 | "source": [
1526 | " \n",
1527 | "# The Python Programming Language: Objects and map()"
1528 | ]
1529 | },
1530 | {
1531 | "cell_type": "markdown",
1532 | "metadata": {},
1533 | "source": [
1534 | " \n",
1535 | "An example of a class in python:"
1536 | ]
1537 | },
1538 | {
1539 | "cell_type": "code",
1540 | "execution_count": null,
1541 | "metadata": {
1542 | "collapsed": true
1543 | },
1544 | "outputs": [],
1545 | "source": [
1546 | "class Person:\n",
1547 | " department = 'School of Information' #a class variable\n",
1548 | "\n",
1549 | " def set_name(self, new_name): #a method\n",
1550 | " self.name = new_name\n",
1551 | " def set_location(self, new_location):\n",
1552 | " self.location = new_location"
1553 | ]
1554 | },
1555 | {
1556 | "cell_type": "code",
1557 | "execution_count": null,
1558 | "metadata": {
1559 | "collapsed": false
1560 | },
1561 | "outputs": [],
1562 | "source": [
1563 | "person = Person()\n",
1564 | "person.set_name('Christopher Brooks')\n",
1565 | "person.set_location('Ann Arbor, MI, USA')\n",
1566 | "print('{} live in {} and works in the department {}'.format(person.name, person.location, person.department))"
1567 | ]
1568 | },
1569 | {
1570 | "cell_type": "markdown",
1571 | "metadata": {},
1572 | "source": [
1573 | " \n",
1574 | "Here's an example of mapping the `min` function between two lists."
1575 | ]
1576 | },
1577 | {
1578 | "cell_type": "code",
1579 | "execution_count": null,
1580 | "metadata": {
1581 | "collapsed": false
1582 | },
1583 | "outputs": [],
1584 | "source": [
1585 | "store1 = [10.00, 11.00, 12.34, 2.34]\n",
1586 | "store2 = [9.00, 11.10, 12.34, 2.01]\n",
1587 | "cheapest = map(min, store1, store2)\n",
1588 | "cheapest"
1589 | ]
1590 | },
1591 | {
1592 | "cell_type": "markdown",
1593 | "metadata": {},
1594 | "source": [
1595 | " \n",
1596 | "Now let's iterate through the map object to see the values."
1597 | ]
1598 | },
1599 | {
1600 | "cell_type": "code",
1601 | "execution_count": null,
1602 | "metadata": {
1603 | "collapsed": false,
1604 | "scrolled": true
1605 | },
1606 | "outputs": [],
1607 | "source": [
1608 | "for item in cheapest:\n",
1609 | " print(item)"
1610 | ]
1611 | },
1612 | {
1613 | "cell_type": "markdown",
1614 | "metadata": {},
1615 | "source": [
1616 | " \n",
1617 | "# The Python Programming Language: Lambda and List Comprehensions"
1618 | ]
1619 | },
1620 | {
1621 | "cell_type": "markdown",
1622 | "metadata": {},
1623 | "source": [
1624 | " \n",
1625 | "Here's an example of lambda that takes in three parameters and adds the first two."
1626 | ]
1627 | },
1628 | {
1629 | "cell_type": "code",
1630 | "execution_count": null,
1631 | "metadata": {
1632 | "collapsed": true
1633 | },
1634 | "outputs": [],
1635 | "source": [
1636 | "my_function = lambda a, b, c : a + b"
1637 | ]
1638 | },
1639 | {
1640 | "cell_type": "code",
1641 | "execution_count": null,
1642 | "metadata": {
1643 | "collapsed": false
1644 | },
1645 | "outputs": [],
1646 | "source": [
1647 | "my_function(1, 2, 3)"
1648 | ]
1649 | },
1650 | {
1651 | "cell_type": "markdown",
1652 | "metadata": {},
1653 | "source": [
1654 | " \n",
1655 | "Let's iterate from 0 to 999 and return the even numbers."
1656 | ]
1657 | },
1658 | {
1659 | "cell_type": "code",
1660 | "execution_count": null,
1661 | "metadata": {
1662 | "collapsed": false
1663 | },
1664 | "outputs": [],
1665 | "source": [
1666 | "my_list = []\n",
1667 | "for number in range(0, 1000):\n",
1668 | " if number % 2 == 0:\n",
1669 | " my_list.append(number)\n",
1670 | "my_list"
1671 | ]
1672 | },
1673 | {
1674 | "cell_type": "markdown",
1675 | "metadata": {},
1676 | "source": [
1677 | " \n",
1678 | "Now the same thing but with list comprehension."
1679 | ]
1680 | },
1681 | {
1682 | "cell_type": "code",
1683 | "execution_count": null,
1684 | "metadata": {
1685 | "collapsed": false
1686 | },
1687 | "outputs": [],
1688 | "source": [
1689 | "my_list = [number for number in range(0,1000) if number % 2 == 0]\n",
1690 | "my_list"
1691 | ]
1692 | },
1693 | {
1694 | "cell_type": "markdown",
1695 | "metadata": {
1696 | "collapsed": true
1697 | },
1698 | "source": [
1699 | " \n",
1700 | "# The Python Programming Language: Numerical Python (NumPy)"
1701 | ]
1702 | },
1703 | {
1704 | "cell_type": "code",
1705 | "execution_count": 3,
1706 | "metadata": {
1707 | "collapsed": true
1708 | },
1709 | "outputs": [],
1710 | "source": [
1711 | "import numpy as np"
1712 | ]
1713 | },
1714 | {
1715 | "cell_type": "code",
1716 | "execution_count": null,
1717 | "metadata": {
1718 | "collapsed": true
1719 | },
1720 | "outputs": [],
1721 | "source": []
1722 | },
1723 | {
1724 | "cell_type": "markdown",
1725 | "metadata": {},
1726 | "source": [
1727 | " \n",
1728 | "## Creating Arrays"
1729 | ]
1730 | },
1731 | {
1732 | "cell_type": "markdown",
1733 | "metadata": {},
1734 | "source": [
1735 | "Create a list and convert it to a numpy array"
1736 | ]
1737 | },
1738 | {
1739 | "cell_type": "code",
1740 | "execution_count": 4,
1741 | "metadata": {
1742 | "collapsed": false
1743 | },
1744 | "outputs": [
1745 | {
1746 | "data": {
1747 | "text/plain": [
1748 | "array([1, 2, 3])"
1749 | ]
1750 | },
1751 | "execution_count": 4,
1752 | "metadata": {},
1753 | "output_type": "execute_result"
1754 | }
1755 | ],
1756 | "source": [
1757 | "mylist = [1, 2, 3]\n",
1758 | "x = np.array(mylist)\n",
1759 | "x"
1760 | ]
1761 | },
1762 | {
1763 | "cell_type": "code",
1764 | "execution_count": null,
1765 | "metadata": {
1766 | "collapsed": true
1767 | },
1768 | "outputs": [],
1769 | "source": []
1770 | },
1771 | {
1772 | "cell_type": "code",
1773 | "execution_count": null,
1774 | "metadata": {
1775 | "collapsed": true
1776 | },
1777 | "outputs": [],
1778 | "source": []
1779 | },
1780 | {
1781 | "cell_type": "markdown",
1782 | "metadata": {},
1783 | "source": [
1784 | " \n",
1785 | "Or just pass in a list directly"
1786 | ]
1787 | },
1788 | {
1789 | "cell_type": "code",
1790 | "execution_count": null,
1791 | "metadata": {
1792 | "collapsed": false
1793 | },
1794 | "outputs": [],
1795 | "source": [
1796 | "y = np.array([4, 5, 6])\n",
1797 | "y"
1798 | ]
1799 | },
1800 | {
1801 | "cell_type": "markdown",
1802 | "metadata": {},
1803 | "source": [
1804 | " \n",
1805 | "Pass in a list of lists to create a multidimensional array."
1806 | ]
1807 | },
1808 | {
1809 | "cell_type": "code",
1810 | "execution_count": null,
1811 | "metadata": {
1812 | "collapsed": false
1813 | },
1814 | "outputs": [],
1815 | "source": [
1816 | "m = np.array([[7, 8, 9], [10, 11, 12]])\n",
1817 | "m"
1818 | ]
1819 | },
1820 | {
1821 | "cell_type": "markdown",
1822 | "metadata": {},
1823 | "source": [
1824 | " \n",
1825 | "Use the shape attribute to find the dimensions of the array. (rows, columns)"
1826 | ]
1827 | },
1828 | {
1829 | "cell_type": "code",
1830 | "execution_count": null,
1831 | "metadata": {
1832 | "collapsed": false
1833 | },
1834 | "outputs": [],
1835 | "source": [
1836 | "m.shape"
1837 | ]
1838 | },
1839 | {
1840 | "cell_type": "markdown",
1841 | "metadata": {},
1842 | "source": [
1843 | " \n",
1844 | "`arange` returns evenly spaced values within a given interval."
1845 | ]
1846 | },
1847 | {
1848 | "cell_type": "code",
1849 | "execution_count": null,
1850 | "metadata": {
1851 | "collapsed": false
1852 | },
1853 | "outputs": [],
1854 | "source": [
1855 | "n = np.arange(0, 30, 2) # start at 0 count up by 2, stop before 30\n",
1856 | "n"
1857 | ]
1858 | },
1859 | {
1860 | "cell_type": "markdown",
1861 | "metadata": {},
1862 | "source": [
1863 | " \n",
1864 | "`reshape` returns an array with the same data with a new shape."
1865 | ]
1866 | },
1867 | {
1868 | "cell_type": "code",
1869 | "execution_count": 1,
1870 | "metadata": {
1871 | "collapsed": false
1872 | },
1873 | "outputs": [
1874 | {
1875 | "ename": "NameError",
1876 | "evalue": "name 'n' is not defined",
1877 | "output_type": "error",
1878 | "traceback": [
1879 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1880 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
1881 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# reshape array to be 3x5\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1882 | "\u001b[0;31mNameError\u001b[0m: name 'n' is not defined"
1883 | ]
1884 | }
1885 | ],
1886 | "source": [
1887 | "n = n.reshape(3, 5) # reshape array to be 3x5\n",
1888 | "n"
1889 | ]
1890 | },
1891 | {
1892 | "cell_type": "code",
1893 | "execution_count": null,
1894 | "metadata": {
1895 | "collapsed": true
1896 | },
1897 | "outputs": [],
1898 | "source": []
1899 | },
1900 | {
1901 | "cell_type": "markdown",
1902 | "metadata": {},
1903 | "source": [
1904 | " \n",
1905 | "`linspace` returns evenly spaced numbers over a specified interval."
1906 | ]
1907 | },
1908 | {
1909 | "cell_type": "code",
1910 | "execution_count": null,
1911 | "metadata": {
1912 | "collapsed": false
1913 | },
1914 | "outputs": [],
1915 | "source": [
1916 | "o = np.linspace(0, 4, 9) # return 9 evenly spaced values from 0 to 4\n",
1917 | "o"
1918 | ]
1919 | },
1920 | {
1921 | "cell_type": "markdown",
1922 | "metadata": {},
1923 | "source": [
1924 | " \n",
1925 | "`resize` changes the shape and size of array in-place."
1926 | ]
1927 | },
1928 | {
1929 | "cell_type": "code",
1930 | "execution_count": null,
1931 | "metadata": {
1932 | "collapsed": false
1933 | },
1934 | "outputs": [],
1935 | "source": [
1936 | "o.resize(3, 3)\n",
1937 | "o"
1938 | ]
1939 | },
1940 | {
1941 | "cell_type": "markdown",
1942 | "metadata": {},
1943 | "source": [
1944 | " \n",
1945 | "`ones` returns a new array of given shape and type, filled with ones."
1946 | ]
1947 | },
1948 | {
1949 | "cell_type": "code",
1950 | "execution_count": null,
1951 | "metadata": {
1952 | "collapsed": false
1953 | },
1954 | "outputs": [],
1955 | "source": [
1956 | "np.ones((3, 2))"
1957 | ]
1958 | },
1959 | {
1960 | "cell_type": "markdown",
1961 | "metadata": {},
1962 | "source": [
1963 | " \n",
1964 | "`zeros` returns a new array of given shape and type, filled with zeros."
1965 | ]
1966 | },
1967 | {
1968 | "cell_type": "code",
1969 | "execution_count": null,
1970 | "metadata": {
1971 | "collapsed": false
1972 | },
1973 | "outputs": [],
1974 | "source": [
1975 | "np.zeros((2, 3))"
1976 | ]
1977 | },
1978 | {
1979 | "cell_type": "markdown",
1980 | "metadata": {},
1981 | "source": [
1982 | " \n",
1983 | "`eye` returns a 2-D array with ones on the diagonal and zeros elsewhere."
1984 | ]
1985 | },
1986 | {
1987 | "cell_type": "code",
1988 | "execution_count": null,
1989 | "metadata": {
1990 | "collapsed": false
1991 | },
1992 | "outputs": [],
1993 | "source": [
1994 | "np.eye(3)"
1995 | ]
1996 | },
1997 | {
1998 | "cell_type": "markdown",
1999 | "metadata": {},
2000 | "source": [
2001 | " \n",
2002 | "`diag` extracts a diagonal or constructs a diagonal array."
2003 | ]
2004 | },
2005 | {
2006 | "cell_type": "code",
2007 | "execution_count": null,
2008 | "metadata": {
2009 | "collapsed": false
2010 | },
2011 | "outputs": [],
2012 | "source": [
2013 | "np.diag(y)"
2014 | ]
2015 | },
2016 | {
2017 | "cell_type": "markdown",
2018 | "metadata": {},
2019 | "source": [
2020 | " \n",
2021 | "Create an array using a repeating list (or see `np.tile`)"
2022 | ]
2023 | },
2024 | {
2025 | "cell_type": "code",
2026 | "execution_count": null,
2027 | "metadata": {
2028 | "collapsed": false
2029 | },
2030 | "outputs": [],
2031 | "source": [
2032 | "np.array([1, 2, 3] * 3)"
2033 | ]
2034 | },
2035 | {
2036 | "cell_type": "markdown",
2037 | "metadata": {},
2038 | "source": [
2039 | " \n",
2040 | "Repeat elements of an array using `repeat`."
2041 | ]
2042 | },
2043 | {
2044 | "cell_type": "code",
2045 | "execution_count": null,
2046 | "metadata": {
2047 | "collapsed": false
2048 | },
2049 | "outputs": [],
2050 | "source": [
2051 | "np.repeat([1, 2, 3], 3)"
2052 | ]
2053 | },
2054 | {
2055 | "cell_type": "markdown",
2056 | "metadata": {},
2057 | "source": [
2058 | " \n",
2059 | "#### Combining Arrays"
2060 | ]
2061 | },
2062 | {
2063 | "cell_type": "code",
2064 | "execution_count": null,
2065 | "metadata": {
2066 | "collapsed": false
2067 | },
2068 | "outputs": [],
2069 | "source": [
2070 | "p = np.ones([2, 3], int)\n",
2071 | "p"
2072 | ]
2073 | },
2074 | {
2075 | "cell_type": "markdown",
2076 | "metadata": {},
2077 | "source": [
2078 | " \n",
2079 | "Use `vstack` to stack arrays in sequence vertically (row wise)."
2080 | ]
2081 | },
2082 | {
2083 | "cell_type": "code",
2084 | "execution_count": null,
2085 | "metadata": {
2086 | "collapsed": false
2087 | },
2088 | "outputs": [],
2089 | "source": [
2090 | "np.vstack([p, 2*p])"
2091 | ]
2092 | },
2093 | {
2094 | "cell_type": "markdown",
2095 | "metadata": {},
2096 | "source": [
2097 | " \n",
2098 | "Use `hstack` to stack arrays in sequence horizontally (column wise)."
2099 | ]
2100 | },
2101 | {
2102 | "cell_type": "code",
2103 | "execution_count": null,
2104 | "metadata": {
2105 | "collapsed": false
2106 | },
2107 | "outputs": [],
2108 | "source": [
2109 | "np.hstack([p, 2*p])"
2110 | ]
2111 | },
2112 | {
2113 | "cell_type": "markdown",
2114 | "metadata": {},
2115 | "source": [
2116 | " \n",
2117 | "## Operations"
2118 | ]
2119 | },
2120 | {
2121 | "cell_type": "markdown",
2122 | "metadata": {},
2123 | "source": [
2124 | "Use `+`, `-`, `*`, `/` and `**` to perform element wise addition, subtraction, multiplication, division and power."
2125 | ]
2126 | },
2127 | {
2128 | "cell_type": "code",
2129 | "execution_count": null,
2130 | "metadata": {
2131 | "collapsed": false
2132 | },
2133 | "outputs": [],
2134 | "source": [
2135 | "print(x + y) # elementwise addition [1 2 3] + [4 5 6] = [5 7 9]\n",
2136 | "print(x - y) # elementwise subtraction [1 2 3] - [4 5 6] = [-3 -3 -3]"
2137 | ]
2138 | },
2139 | {
2140 | "cell_type": "code",
2141 | "execution_count": null,
2142 | "metadata": {
2143 | "collapsed": false
2144 | },
2145 | "outputs": [],
2146 | "source": [
2147 | "print(x * y) # elementwise multiplication [1 2 3] * [4 5 6] = [4 10 18]\n",
2148 | "print(x / y) # elementwise division [1 2 3] / [4 5 6] = [0.25 0.4 0.5]"
2149 | ]
2150 | },
2151 | {
2152 | "cell_type": "code",
2153 | "execution_count": null,
2154 | "metadata": {
2155 | "collapsed": false
2156 | },
2157 | "outputs": [],
2158 | "source": [
2159 | "print(x**2) # elementwise power [1 2 3] ^2 = [1 4 9]"
2160 | ]
2161 | },
2162 | {
2163 | "cell_type": "markdown",
2164 | "metadata": {},
2165 | "source": [
2166 | " \n",
2167 | "**Dot Product:** \n",
2168 | "\n",
2169 | "$ \\begin{bmatrix}x_1 \\ x_2 \\ x_3\\end{bmatrix}\n",
2170 | "\\cdot\n",
2171 | "\\begin{bmatrix}y_1 \\\\ y_2 \\\\ y_3\\end{bmatrix}\n",
2172 | "= x_1 y_1 + x_2 y_2 + x_3 y_3$"
2173 | ]
2174 | },
2175 | {
2176 | "cell_type": "code",
2177 | "execution_count": null,
2178 | "metadata": {
2179 | "collapsed": false
2180 | },
2181 | "outputs": [],
2182 | "source": [
2183 | "x.dot(y) # dot product 1*4 + 2*5 + 3*6"
2184 | ]
2185 | },
2186 | {
2187 | "cell_type": "code",
2188 | "execution_count": null,
2189 | "metadata": {
2190 | "collapsed": false
2191 | },
2192 | "outputs": [],
2193 | "source": [
2194 | "z = np.array([y, y**2])\n",
2195 | "print(len(z)) # number of rows of array"
2196 | ]
2197 | },
2198 | {
2199 | "cell_type": "markdown",
2200 | "metadata": {},
2201 | "source": [
2202 | " \n",
2203 | "Let's look at transposing arrays. Transposing permutes the dimensions of the array."
2204 | ]
2205 | },
2206 | {
2207 | "cell_type": "code",
2208 | "execution_count": null,
2209 | "metadata": {
2210 | "collapsed": false
2211 | },
2212 | "outputs": [],
2213 | "source": [
2214 | "z = np.array([y, y**2])\n",
2215 | "z"
2216 | ]
2217 | },
2218 | {
2219 | "cell_type": "markdown",
2220 | "metadata": {},
2221 | "source": [
2222 | " \n",
2223 | "The shape of array `z` is `(2,3)` before transposing."
2224 | ]
2225 | },
2226 | {
2227 | "cell_type": "code",
2228 | "execution_count": null,
2229 | "metadata": {
2230 | "collapsed": false
2231 | },
2232 | "outputs": [],
2233 | "source": [
2234 | "z.shape"
2235 | ]
2236 | },
2237 | {
2238 | "cell_type": "markdown",
2239 | "metadata": {},
2240 | "source": [
2241 | " \n",
2242 | "Use `.T` to get the transpose."
2243 | ]
2244 | },
2245 | {
2246 | "cell_type": "code",
2247 | "execution_count": null,
2248 | "metadata": {
2249 | "collapsed": false
2250 | },
2251 | "outputs": [],
2252 | "source": [
2253 | "z.T"
2254 | ]
2255 | },
2256 | {
2257 | "cell_type": "markdown",
2258 | "metadata": {},
2259 | "source": [
2260 | " \n",
2261 | "The number of rows has swapped with the number of columns."
2262 | ]
2263 | },
2264 | {
2265 | "cell_type": "code",
2266 | "execution_count": null,
2267 | "metadata": {
2268 | "collapsed": false
2269 | },
2270 | "outputs": [],
2271 | "source": [
2272 | "z.T.shape"
2273 | ]
2274 | },
2275 | {
2276 | "cell_type": "markdown",
2277 | "metadata": {},
2278 | "source": [
2279 | " \n",
2280 | "Use `.dtype` to see the data type of the elements in the array."
2281 | ]
2282 | },
2283 | {
2284 | "cell_type": "code",
2285 | "execution_count": null,
2286 | "metadata": {
2287 | "collapsed": false
2288 | },
2289 | "outputs": [],
2290 | "source": [
2291 | "z.dtype"
2292 | ]
2293 | },
2294 | {
2295 | "cell_type": "markdown",
2296 | "metadata": {},
2297 | "source": [
2298 | " \n",
2299 | "Use `.astype` to cast to a specific type."
2300 | ]
2301 | },
2302 | {
2303 | "cell_type": "code",
2304 | "execution_count": null,
2305 | "metadata": {
2306 | "collapsed": false
2307 | },
2308 | "outputs": [],
2309 | "source": [
2310 | "z = z.astype('f')\n",
2311 | "z.dtype"
2312 | ]
2313 | },
2314 | {
2315 | "cell_type": "markdown",
2316 | "metadata": {},
2317 | "source": [
2318 | " \n",
2319 | "## Math Functions"
2320 | ]
2321 | },
2322 | {
2323 | "cell_type": "markdown",
2324 | "metadata": {},
2325 | "source": [
2326 | "Numpy has many built in math functions that can be performed on arrays."
2327 | ]
2328 | },
2329 | {
2330 | "cell_type": "code",
2331 | "execution_count": null,
2332 | "metadata": {
2333 | "collapsed": true
2334 | },
2335 | "outputs": [],
2336 | "source": [
2337 | "a = np.array([-4, -2, 1, 3, 5])"
2338 | ]
2339 | },
2340 | {
2341 | "cell_type": "code",
2342 | "execution_count": null,
2343 | "metadata": {
2344 | "collapsed": false
2345 | },
2346 | "outputs": [],
2347 | "source": [
2348 | "a.sum()"
2349 | ]
2350 | },
2351 | {
2352 | "cell_type": "code",
2353 | "execution_count": null,
2354 | "metadata": {
2355 | "collapsed": false
2356 | },
2357 | "outputs": [],
2358 | "source": [
2359 | "a.max()"
2360 | ]
2361 | },
2362 | {
2363 | "cell_type": "code",
2364 | "execution_count": null,
2365 | "metadata": {
2366 | "collapsed": false
2367 | },
2368 | "outputs": [],
2369 | "source": [
2370 | "a.min()"
2371 | ]
2372 | },
2373 | {
2374 | "cell_type": "code",
2375 | "execution_count": null,
2376 | "metadata": {
2377 | "collapsed": false
2378 | },
2379 | "outputs": [],
2380 | "source": [
2381 | "a.mean()"
2382 | ]
2383 | },
2384 | {
2385 | "cell_type": "code",
2386 | "execution_count": null,
2387 | "metadata": {
2388 | "collapsed": false
2389 | },
2390 | "outputs": [],
2391 | "source": [
2392 | "a.std()"
2393 | ]
2394 | },
2395 | {
2396 | "cell_type": "markdown",
2397 | "metadata": {},
2398 | "source": [
2399 | " \n",
2400 | "`argmax` and `argmin` return the index of the maximum and minimum values in the array."
2401 | ]
2402 | },
2403 | {
2404 | "cell_type": "code",
2405 | "execution_count": null,
2406 | "metadata": {
2407 | "collapsed": false
2408 | },
2409 | "outputs": [],
2410 | "source": [
2411 | "a.argmax()"
2412 | ]
2413 | },
2414 | {
2415 | "cell_type": "code",
2416 | "execution_count": null,
2417 | "metadata": {
2418 | "collapsed": false
2419 | },
2420 | "outputs": [],
2421 | "source": [
2422 | "a.argmin()"
2423 | ]
2424 | },
2425 | {
2426 | "cell_type": "markdown",
2427 | "metadata": {},
2428 | "source": [
2429 | " \n",
2430 | "## Indexing / Slicing"
2431 | ]
2432 | },
2433 | {
2434 | "cell_type": "code",
2435 | "execution_count": null,
2436 | "metadata": {
2437 | "collapsed": false
2438 | },
2439 | "outputs": [],
2440 | "source": [
2441 | "s = np.arange(13)**2\n",
2442 | "s"
2443 | ]
2444 | },
2445 | {
2446 | "cell_type": "markdown",
2447 | "metadata": {},
2448 | "source": [
2449 | " \n",
2450 | "Use bracket notation to get the value at a specific index. Remember that indexing starts at 0."
2451 | ]
2452 | },
2453 | {
2454 | "cell_type": "code",
2455 | "execution_count": null,
2456 | "metadata": {
2457 | "collapsed": false
2458 | },
2459 | "outputs": [],
2460 | "source": [
2461 | "s[0], s[4], s[-1]"
2462 | ]
2463 | },
2464 | {
2465 | "cell_type": "markdown",
2466 | "metadata": {},
2467 | "source": [
2468 | " \n",
2469 | "Use `:` to indicate a range. `array[start:stop]`\n",
2470 | "\n",
2471 | "\n",
2472 | "Leaving `start` or `stop` empty will default to the beginning/end of the array."
2473 | ]
2474 | },
2475 | {
2476 | "cell_type": "code",
2477 | "execution_count": null,
2478 | "metadata": {
2479 | "collapsed": false
2480 | },
2481 | "outputs": [],
2482 | "source": [
2483 | "s[1:5]"
2484 | ]
2485 | },
2486 | {
2487 | "cell_type": "markdown",
2488 | "metadata": {},
2489 | "source": [
2490 | " \n",
2491 | "Use negatives to count from the back."
2492 | ]
2493 | },
2494 | {
2495 | "cell_type": "code",
2496 | "execution_count": null,
2497 | "metadata": {
2498 | "collapsed": false
2499 | },
2500 | "outputs": [],
2501 | "source": [
2502 | "s[-4:]"
2503 | ]
2504 | },
2505 | {
2506 | "cell_type": "markdown",
2507 | "metadata": {},
2508 | "source": [
2509 | " \n",
2510 | "A second `:` can be used to indicate step-size. `array[start:stop:stepsize]`\n",
2511 | "\n",
2512 | "Here we are starting at the 5th element from the end, and counting backwards by 2 until the beginning of the array is reached."
2513 | ]
2514 | },
2515 | {
2516 | "cell_type": "code",
2517 | "execution_count": null,
2518 | "metadata": {
2519 | "collapsed": false
2520 | },
2521 | "outputs": [],
2522 | "source": [
2523 | "s[-5::-2]"
2524 | ]
2525 | },
2526 | {
2527 | "cell_type": "markdown",
2528 | "metadata": {
2529 | "collapsed": false
2530 | },
2531 | "source": [
2532 | " \n",
2533 | "Let's look at a multidimensional array."
2534 | ]
2535 | },
2536 | {
2537 | "cell_type": "code",
2538 | "execution_count": null,
2539 | "metadata": {
2540 | "collapsed": false
2541 | },
2542 | "outputs": [],
2543 | "source": [
2544 | "r = np.arange(36)\n",
2545 | "r.resize((6, 6))\n",
2546 | "r"
2547 | ]
2548 | },
2549 | {
2550 | "cell_type": "markdown",
2551 | "metadata": {},
2552 | "source": [
2553 | " \n",
2554 | "Use bracket notation to slice: `array[row, column]`"
2555 | ]
2556 | },
2557 | {
2558 | "cell_type": "code",
2559 | "execution_count": null,
2560 | "metadata": {
2561 | "collapsed": false
2562 | },
2563 | "outputs": [],
2564 | "source": [
2565 | "r[2, 2]"
2566 | ]
2567 | },
2568 | {
2569 | "cell_type": "markdown",
2570 | "metadata": {},
2571 | "source": [
2572 | " \n",
2573 | "And use : to select a range of rows or columns"
2574 | ]
2575 | },
2576 | {
2577 | "cell_type": "code",
2578 | "execution_count": null,
2579 | "metadata": {
2580 | "collapsed": false
2581 | },
2582 | "outputs": [],
2583 | "source": [
2584 | "r[3, 3:6]"
2585 | ]
2586 | },
2587 | {
2588 | "cell_type": "markdown",
2589 | "metadata": {},
2590 | "source": [
2591 | " \n",
2592 | "Here we are selecting all the rows up to (and not including) row 2, and all the columns up to (and not including) the last column."
2593 | ]
2594 | },
2595 | {
2596 | "cell_type": "code",
2597 | "execution_count": null,
2598 | "metadata": {
2599 | "collapsed": false
2600 | },
2601 | "outputs": [],
2602 | "source": [
2603 | "r[:2, :-1]"
2604 | ]
2605 | },
2606 | {
2607 | "cell_type": "markdown",
2608 | "metadata": {},
2609 | "source": [
2610 | " \n",
2611 | "This is a slice of the last row, and only every other element."
2612 | ]
2613 | },
2614 | {
2615 | "cell_type": "code",
2616 | "execution_count": null,
2617 | "metadata": {
2618 | "collapsed": false
2619 | },
2620 | "outputs": [],
2621 | "source": [
2622 | "r[-1, ::2]"
2623 | ]
2624 | },
2625 | {
2626 | "cell_type": "markdown",
2627 | "metadata": {},
2628 | "source": [
2629 | " \n",
2630 | "We can also perform conditional indexing. Here we are selecting values from the array that are greater than 30. (Also see `np.where`)"
2631 | ]
2632 | },
2633 | {
2634 | "cell_type": "code",
2635 | "execution_count": null,
2636 | "metadata": {
2637 | "collapsed": false
2638 | },
2639 | "outputs": [],
2640 | "source": [
2641 | "r[r > 30]"
2642 | ]
2643 | },
2644 | {
2645 | "cell_type": "markdown",
2646 | "metadata": {},
2647 | "source": [
2648 | " \n",
2649 | "Here we are assigning all values in the array that are greater than 30 to the value of 30."
2650 | ]
2651 | },
2652 | {
2653 | "cell_type": "code",
2654 | "execution_count": null,
2655 | "metadata": {
2656 | "collapsed": false
2657 | },
2658 | "outputs": [],
2659 | "source": [
2660 | "r[r > 30] = 30\n",
2661 | "r"
2662 | ]
2663 | },
2664 | {
2665 | "cell_type": "markdown",
2666 | "metadata": {},
2667 | "source": [
2668 | " \n",
2669 | "## Copying Data"
2670 | ]
2671 | },
2672 | {
2673 | "cell_type": "markdown",
2674 | "metadata": {},
2675 | "source": [
2676 | "Be careful with copying and modifying arrays in NumPy!\n",
2677 | "\n",
2678 | "\n",
2679 | "`r2` is a slice of `r`"
2680 | ]
2681 | },
2682 | {
2683 | "cell_type": "code",
2684 | "execution_count": null,
2685 | "metadata": {
2686 | "collapsed": false
2687 | },
2688 | "outputs": [],
2689 | "source": [
2690 | "r2 = r[:3,:3]\n",
2691 | "r2"
2692 | ]
2693 | },
2694 | {
2695 | "cell_type": "markdown",
2696 | "metadata": {},
2697 | "source": [
2698 | " \n",
2699 | "Set this slice's values to zero ([:] selects the entire array)"
2700 | ]
2701 | },
2702 | {
2703 | "cell_type": "code",
2704 | "execution_count": null,
2705 | "metadata": {
2706 | "collapsed": false
2707 | },
2708 | "outputs": [],
2709 | "source": [
2710 | "r2[:] = 0\n",
2711 | "r2"
2712 | ]
2713 | },
2714 | {
2715 | "cell_type": "markdown",
2716 | "metadata": {},
2717 | "source": [
2718 | " \n",
2719 | "`r` has also been changed!"
2720 | ]
2721 | },
2722 | {
2723 | "cell_type": "code",
2724 | "execution_count": null,
2725 | "metadata": {
2726 | "collapsed": false
2727 | },
2728 | "outputs": [],
2729 | "source": [
2730 | "r"
2731 | ]
2732 | },
2733 | {
2734 | "cell_type": "markdown",
2735 | "metadata": {},
2736 | "source": [
2737 | " \n",
2738 | "To avoid this, use `r.copy` to create a copy that will not affect the original array"
2739 | ]
2740 | },
2741 | {
2742 | "cell_type": "code",
2743 | "execution_count": null,
2744 | "metadata": {
2745 | "collapsed": false
2746 | },
2747 | "outputs": [],
2748 | "source": [
2749 | "r_copy = r.copy()\n",
2750 | "r_copy"
2751 | ]
2752 | },
2753 | {
2754 | "cell_type": "markdown",
2755 | "metadata": {},
2756 | "source": [
2757 | " \n",
2758 | "Now when r_copy is modified, r will not be changed."
2759 | ]
2760 | },
2761 | {
2762 | "cell_type": "code",
2763 | "execution_count": null,
2764 | "metadata": {
2765 | "collapsed": false
2766 | },
2767 | "outputs": [],
2768 | "source": [
2769 | "r_copy[:] = 10\n",
2770 | "print(r_copy, '\\n')\n",
2771 | "print(r)"
2772 | ]
2773 | },
2774 | {
2775 | "cell_type": "markdown",
2776 | "metadata": {},
2777 | "source": [
2778 | " \n",
2779 | "### Iterating Over Arrays"
2780 | ]
2781 | },
2782 | {
2783 | "cell_type": "markdown",
2784 | "metadata": {},
2785 | "source": [
2786 | "Let's create a new 4 by 3 array of random numbers 0-9."
2787 | ]
2788 | },
2789 | {
2790 | "cell_type": "code",
2791 | "execution_count": null,
2792 | "metadata": {
2793 | "collapsed": false
2794 | },
2795 | "outputs": [],
2796 | "source": [
2797 | "test = np.random.randint(0, 10, (4,3))\n",
2798 | "test"
2799 | ]
2800 | },
2801 | {
2802 | "cell_type": "markdown",
2803 | "metadata": {},
2804 | "source": [
2805 | " \n",
2806 | "Iterate by row:"
2807 | ]
2808 | },
2809 | {
2810 | "cell_type": "code",
2811 | "execution_count": null,
2812 | "metadata": {
2813 | "collapsed": false
2814 | },
2815 | "outputs": [],
2816 | "source": [
2817 | "for row in test:\n",
2818 | " print(row)"
2819 | ]
2820 | },
2821 | {
2822 | "cell_type": "markdown",
2823 | "metadata": {},
2824 | "source": [
2825 | " \n",
2826 | "Iterate by index:"
2827 | ]
2828 | },
2829 | {
2830 | "cell_type": "code",
2831 | "execution_count": null,
2832 | "metadata": {
2833 | "collapsed": false
2834 | },
2835 | "outputs": [],
2836 | "source": [
2837 | "for i in range(len(test)):\n",
2838 | " print(test[i])"
2839 | ]
2840 | },
2841 | {
2842 | "cell_type": "markdown",
2843 | "metadata": {},
2844 | "source": [
2845 | " \n",
2846 | "Iterate by row and index:"
2847 | ]
2848 | },
2849 | {
2850 | "cell_type": "code",
2851 | "execution_count": null,
2852 | "metadata": {
2853 | "collapsed": false
2854 | },
2855 | "outputs": [],
2856 | "source": [
2857 | "for i, row in enumerate(test):\n",
2858 | " print('row', i, 'is', row)"
2859 | ]
2860 | },
2861 | {
2862 | "cell_type": "markdown",
2863 | "metadata": {},
2864 | "source": [
2865 | " \n",
2866 | "Use `zip` to iterate over multiple iterables."
2867 | ]
2868 | },
2869 | {
2870 | "cell_type": "code",
2871 | "execution_count": null,
2872 | "metadata": {
2873 | "collapsed": false
2874 | },
2875 | "outputs": [],
2876 | "source": [
2877 | "test2 = test**2\n",
2878 | "test2"
2879 | ]
2880 | },
2881 | {
2882 | "cell_type": "code",
2883 | "execution_count": null,
2884 | "metadata": {
2885 | "collapsed": false
2886 | },
2887 | "outputs": [],
2888 | "source": [
2889 | "for i, j in zip(test, test2):\n",
2890 | " print(i,'+',j,'=',i+j)"
2891 | ]
2892 | }
2893 | ],
2894 | "metadata": {
2895 | "kernelspec": {
2896 | "display_name": "Python 3",
2897 | "language": "python",
2898 | "name": "python3"
2899 | },
2900 | "language_info": {
2901 | "codemirror_mode": {
2902 | "name": "ipython",
2903 | "version": 3
2904 | },
2905 | "file_extension": ".py",
2906 | "mimetype": "text/x-python",
2907 | "name": "python",
2908 | "nbconvert_exporter": "python",
2909 | "pygments_lexer": "ipython3",
2910 | "version": "3.5.2"
2911 | }
2912 | },
2913 | "nbformat": 4,
2914 | "nbformat_minor": 0
2915 | }
2916 |
--------------------------------------------------------------------------------