├── EDA
│   ├── EDA of Insurance Company's Dataset.ipynb
│   └── insurance-data.csv
├── ESTU
│   ├── EDA.ipynb
│   └── insurance.csv
├── Linear Algebra
│   └── Linear Algebra.ipynb
├── MSKU
│   ├── Demand_Prediction
│   │   ├── Demand_Prediction_LSTM_MSKU.ipynb
│   │   └── store_sharing.csv
│   ├── Methods of ML
│   │   ├── Classification_Algorithms_Final.ipynb
│   │   ├── KrediKartıVerileri.xlsx
│   │   └── Methods_of_ML_1.ipynb
│   └── Statistical Theory of ML
│       ├── Final.ipynb
│       ├── Statistical Theory Behind ML_1.ipynb
│       ├── arasınav_Ödev Veri Seti.xlsx
│       ├── finalVeriSeti.xlsx
│       └── yz.ipynb
├── Matplotlib
│   ├── DV_1.ipynb
│   ├── DV_1.py
│   ├── DV_2.ipynb
│   ├── DV_2.py
│   ├── Matplotlib_1.ipynb
│   ├── Matplotlib_1.py
│   ├── Matplotlib_Class.ipynb
│   ├── Matplotlib_Class.py
│   ├── Matplotlib_U.ipynb
│   └── Matplotlib_U.py
├── NumPy
│   ├── NumPy_1.ipynb
│   ├── NumPy_1.py
│   ├── NumPy_Class.ipynb
│   ├── NumPy_Class.py
│   ├── NumPy_U.ipynb
│   └── NumPy_U.py
├── Other
│   ├── Feature_Selection.ipynb
│   ├── PIWorks
│   │   ├── SARIMAX+Regression.ipynb
│   │   ├── SARIMAX.ipynb
│   │   └── municipality_bus_utilization.csv
│   ├── Regex.ipynb
│   ├── Regex.py
│   ├── Sklearn_Encoding.ipynb
│   └── car_price.csv
├── PI
│   └── PIWorks.ipynb
├── Pandas
│   ├── Pandas Built-in DV_1.ipynb
│   ├── Pandas Built-in DV_1.py
│   ├── Pandas Built-in DV_Class.ipynb
│   ├── Pandas Built-in DV_Class.py
│   ├── Pandas Built-in DV_U.ipynb
│   ├── Pandas Built-in DV_U.py
│   ├── Pandas-(Aggregation, Groupby, Operations).ipynb
│   ├── Pandas-(Aggregation, Groupby, Operations).py
│   ├── Pandas-(Missing Values, Outliers).ipynb
│   ├── Pandas-(Missing Values, Outliers).py
│   ├── Pandas_1.ipynb
│   ├── Pandas_1.py
│   ├── Pandas_2.ipynb
│   ├── Pandas_2.py
│   ├── Pandas_3.ipynb
│   ├── Pandas_3.py
│   ├── Pandas_Class.ipynb
│   ├── Pandas_Class.py
│   ├── Pandas_Class2.ipynb
│   ├── Pandas_Class2.py
│   ├── Pandas_U.ipynb
│   └── Pandas_U.py
├── README.md
├── Seaborn
│   ├── Seaborn Class.ipynb
│   ├── Seaborn Class.py
│   ├── Seaborn Class2.ipynb
│   ├── Seaborn Class2.py
│   ├── Seaborn1.ipynb
│   ├── Seaborn2.ipynb
│   ├── Seaborn3.ipynb
│   ├── Seaborn_1.ipynb
│   ├── Seaborn_1.py
│   ├── Seaborn_U.ipynb
│   └── Seaborn_U.py
└── Statistics
    ├── Statistics.ipynb
    ├── Statistics.py
    ├── Statistics_2.ipynb
    ├── Statistics_2.py
    ├── Statistics_3.ipynb
    ├── Statistics_3.py
    ├── Statistics_4.ipynb
    ├── Statistics_4.py
    ├── Statistics_5.ipynb
    ├── Statistics_5.py
    ├── Statistics_6.ipynb
    ├── Statistics_6.py
    ├── Statistics_Exercise_1.ipynb
    ├── Statistics_Exercise_1.py
    ├── Statistics_Exercise_2.ipynb
    ├── Statistics_Exercise_2.py
    ├── Statistics_Exercise_3.ipynb
    ├── Statistics_Exercise_3.py
    ├── Statistics_Exercise_4.ipynb
    ├── Statistics_Exercise_4.py
    ├── Statistics_Exercise_5.ipynb
    ├── Statistics_Exercise_5.py
    ├── Statistics_Exercise_6.ipynb
    └── Statistics_Exercise_6.py

--------------------------------------------------------------------------------
/MSKU/Methods of ML/KrediKartıVerileri.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hersany/DataScience/5d73888fcde2765730f0bfa33e143205e6b36742/MSKU/Methods of ML/KrediKartıVerileri.xlsx
--------------------------------------------------------------------------------
/MSKU/Statistical Theory of ML/arasınav_Ödev Veri Seti.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hersany/DataScience/5d73888fcde2765730f0bfa33e143205e6b36742/MSKU/Statistical Theory of ML/arasınav_Ödev Veri Seti.xlsx
--------------------------------------------------------------------------------
/MSKU/Statistical Theory of ML/finalVeriSeti.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hersany/DataScience/5d73888fcde2765730f0bfa33e143205e6b36742/MSKU/Statistical Theory of ML/finalVeriSeti.xlsx
--------------------------------------------------------------------------------
/Matplotlib/DV_1.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# # 1. Set darkgrid style from seaborn

# In[3]:

sns.set(style = 'darkgrid')

# In[ ]:

## areaplot

# In[2]:

df = pd.DataFrame({'buy': [1, 2, 4, 9, 11, 5], 'register': [4, 6, 5, 11, 13, 15],
                   'view': [25, 45, 24, 58, 75, 55]},
                  index=pd.date_range(start='2018/01/01', end='2018/07/01', freq='M'))
df.head()

# In[ ]:

df.shape

# In[7]:

df.plot()

# In[6]:

df.plot.area()

# # 2. Make a bar plot

# In[ ]:

## barplots

# In[9]:

income = [100, 80, 150, 48, 52, 69, 88]
expense = [30, 100, 100, 20, 75, 50, 28]
index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
df = pd.DataFrame({'income': income, 'expense': expense}, index=index)
df.head()

# # 3. Make a bar plot

# In[10]:

df.plot.bar()

# In[10]:

df.plot(kind = 'bar')

# In[ ]:

# # 4. Stack the bars

# In[16]:

df.plot.bar(stacked = True)

# In[ ]:

# # 5. Rotate the labels and set figsize

# In[17]:

games = ['Game-1', 'Game-2', 'Game-3', 'Game-4', 'Game-5', 'Game-6', 'Game-7']

# In[12]:

df.plot.bar(stacked = True, figsize = (9, 6))
plt.xticks(rotation = 0)
plt.show()

# In[ ]:

# # 6. Replace the labels by "Game-1", "Game-2", "Game-3", "Game-4", "Game-5", "Game-6", "Game-7"
# # 7. Unstack the bars, annotate the height of the bars on top of them

# In[27]:

games = ['Game-1', 'Game-2', 'Game-3', 'Game-4', 'Game-5', 'Game-6', 'Game-7']

# In[17]:

g = df.plot.bar(figsize = (9, 6))
plt.xticks(rotation = 0)
for p in g.patches:
    g.annotate((p.get_height()), (p.get_x() + 0.01, p.get_height() + 0.6), size = 10)
plt.show()

# In[40]:

g = df.plot.bar(figsize = (9, 6))
plt.xticks(rotation = 0)
g.set_xticklabels(games)
for p in g.patches:
    g.annotate((p.get_height()), (p.get_x() + 0.01, p.get_height() + 0.6), size = 10)
plt.show()

# In[ ]:

# In[ ]:

## histograms

# In[18]:

tips = sns.load_dataset("tips")

# In[19]:

tips.head()

# # 8. Histogram of the total_bill column

# In[52]:

sns.distplot(tips['total_bill'], kde = False)

# In[50]:

tips['total_bill'].hist()

# In[ ]:

# In[ ]:

## lineplots

# In[ ]:

tips.head()

# # 9. Plot the avg tip by size (lineplot)

# In[63]:

tips.groupby('size')['tip'].mean().plot()

# In[ ]:

# # 10. Set the linestyle as "--"

# In[66]:

tips.groupby('size')['tip'].mean().plot(ls = '--')

# In[ ]:

# In[ ]:

## Scatter Plots

# In[ ]:

tips.head()

# # 11. Make a scatter plot between tip and total_bill

# In[69]:

tips.plot.scatter('tip', 'total_bill')

# In[ ]:

# # 12. Set an additional dimension using size column

# In[72]:

tips.head()

# In[21]:

tips.plot.scatter('tip', 'total_bill', c = 'size', cmap = 'coolwarm')

# In[ ]:

# In[ ]:

## boxplots

# In[ ]:

tips.head()

# # 13. Make a box plot of total_bill column

# In[75]:

sns.boxplot(tips['total_bill'])

# In[77]:

tips.boxplot('total_bill')

# In[ ]:

# # 14. Separate the boxplot above using the size column

# In[82]:

tips.boxplot('total_bill', 'size')
plt.tight_layout()

# In[ ]:

# # 15. Make the same plot using seaborn

# In[83]:

sns.boxplot('size', 'total_bill', data = tips)

# In[ ]:

# # 16. Make a violinplot instead of boxplot and discuss the difference between boxplot and violinplot

# In[84]:

sns.violinplot('size', 'total_bill', data = tips)

# In[23]:

sns.violinplot('size', 'total_bill', data = tips)
sns.swarmplot('size', 'total_bill', data = tips, color = 'black')

# In[ ]:

# In[ ]:

--------------------------------------------------------------------------------
/Matplotlib/DV_2.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

# #### import the libraries

# In[11]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# #### load the tips dataset

# In[12]:

tips = sns.load_dataset('tips')
tips.head()

# In[4]:

# instruction: make a plot with seaborn that shows the distribution of total bill.
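# In[ ]:

# Added aside (not in the original notebook): distplot, used throughout the
# cells below, is deprecated since seaborn 0.11. A minimal sketch of the
# modern equivalents, assuming seaborn >= 0.11 is installed:
sns.histplot(tips['total_bill'], bins = 15)    # axes-level replacement
# sns.displot(tips['total_bill'], kde = True)  # figure-level replacement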

# ### 1-Distribution Plot

# #### DISTPLOT

# In[8]:

sns.distplot(tips['total_bill'], kde = False, hist_kws = dict(edgecolor = 'k', lw = 2), bins = 15)

# In[13]:

sns.distplot(tips['total_bill'], kde = False)

# In[5]:

# show rug, kde, and distplot in the same figure but on different axes.
# Explain the difference between matplotlib and seaborn in how they use axes.
#
# (with plain matplotlib we call methods such as ax.bar(...) on the axes object;
# with seaborn we pass the target axes inside the call, e.g. ax=axes[0, 0].)

# In[10]:

sns.set(style="darkgrid")
rs = np.random.RandomState(10)

# Set up the matplotlib figure
f, axes = plt.subplots(2, 2, figsize=(7, 7), sharex=True)

# Generate a random univariate dataset
# d = rs.normal(size=100)

# Plot a simple histogram with binsize determined automatically
sns.distplot(tips['total_bill'], kde=False, color="b", ax=axes[0, 0])

# Plot a kernel density estimate and rug plot
sns.distplot(tips['total_bill'], hist=False, rug=True, color="r", ax=axes[0, 1])

# Plot a filled kernel density estimate
sns.distplot(tips['total_bill'], hist=False, color="g", kde_kws={"shade": True}, ax=axes[1, 0])

# Plot a histogram and kernel density estimate
sns.distplot(tips['total_bill'], color="m", ax=axes[1, 1])

# plt.setp(axes, yticks=[])
plt.tight_layout()

# In[16]:

sns.set(style="white")

# Set up the matplotlib figure

# Generate a random univariate dataset

# Plot a simple histogram with binsize determined automatically

# Plot a kernel density estimate and rug plot

# Plot a filled kernel density estimate

# Plot a histogram and kernel density estimate

# ### 2-Categorical Plot

# In[6]:

# ins: make a plot that shows avg total bills for both genders.

# In[9]:

sns.barplot('sex', 'total_bill', data = tips)

# In[17]:

sns.barplot('sex', 'total_bill', 'day', data = tips)
plt.legend(loc = 3)

# In[8]:

# ins: make a plot that shows avg total bills for both genders as well as the avg total bills on different days.
# what are the black bars on the graphs? (ci)

# In[ ]:

# In[6]:

# #### B) COUNTPLOT

# In[ ]:

# ins: count the people in the dataset for each day, and order them.

# In[30]:

tips.groupby('day').count()['size'].sort_values(ascending = False).index

# In[31]:

sns.countplot(tips['day'], order = tips.groupby('day').count()['size'].sort_values(ascending = False).index)

# In[27]:

# #### C) BOXPLOT

# In[ ]:

# Show the total bills range according to days as well as according to smokers/non-smokers.

# In[32]:

sns.boxplot('day', 'total_bill', 'smoker', tips)

# In[8]:

# #### D) VIOLINPLOT

# In[ ]:

# make a violin plot of total bill separated by days on the x axis.
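# In[ ]:

# Added portability note (an assumption about newer environments, not from the
# original notebook): seaborn 0.12+ deprecates passing x and y positionally
# (only `data` may be positional), so calls like the one below warn or fail
# there. The keyword form works across versions:
sns.violinplot(data = tips, x = 'day', y = 'total_bill')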

# In[33]:

sns.violinplot('day', 'total_bill', data = tips)

# In[17]:

# #### G) CATPLOT (FORMER NAME: FACTOR PLOT)

# In[9]:

# try to make the same plots using catplot instead of bar, violin, and box plots.

# In[ ]:

sns.catplot()

# In[46]:

# kind options: bar, swarm, strip(default), box, violin, point and count.

# #### H) POINTPLOT

# In[10]:

# make a pointplot that shows avg total bills for both lunch and dinner.

# In[20]:

sns.pointplot('time', 'total_bill', data = tips)

# In[21]:

# # 3- Matrix and Grid Plots

# In[11]:

# make a heat map to show the corr matrix of the tips dataset.

# In[35]:

sns.heatmap(tips.corr(), annot = True, cmap = 'coolwarm')

# In[28]:

# In[12]:

# make a pair plot of the tips dataset and comment on it.

# In[45]:

sns.pairplot(tips);

--------------------------------------------------------------------------------
/Matplotlib/Matplotlib_1.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

# # Exercises
#
# Follow the instructions to recreate the plots using this data:
#
# ## Data

# In[17]:

import numpy as np
x = np.arange(0,100)
y = x*2
z = x**2

# ** Import matplotlib.pyplot as plt and set %matplotlib inline if you are using the jupyter notebook. What command do you use if you aren't using the jupyter notebook?**

# In[2]:

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# ## Exercise 1
#
# ** Follow along with these steps: **
# * ** Create a figure object called fig using plt.figure() **
# * ** Use add_axes to add an axis to the figure canvas at [0,0,1,1]. Call this new axis ax. **
# * ** Plot (x,y) on that axes and set the labels and titles to match the plot below:**

# In[18]:

fig = plt.figure()

ax = fig.add_axes([0, 0, 1, 1])

ax.plot(x, y)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('title')
plt.show()

# ## Exercise 2
# ** Create a figure object and put two axes on it, ax1 and ax2. Located at [0,0,1,1] and [0.2,0.5,.2,.2] respectively.**

# In[4]:

fig = plt.figure()

ax1 = fig.add_axes([0, 0, 1, 1])
ax2 = fig.add_axes([0.2, 0.5, 0.2, 0.2])

# ** Now plot (x,y) on both axes. And call your figure object to show it.**

# In[21]:

fig = plt.figure()

ax1 = fig.add_axes([0, 0, 1, 1])
ax1.plot(x, y)

ax2 = fig.add_axes([0.2, 0.5, 0.2, 0.2])
ax2.plot(x, y)
plt.show()

# ## Exercise 3
#
# ** Create the plot below by adding two axes to a figure object at [0,0,1,1] and [0.2,0.5,.4,.4]**

# In[6]:

fig = plt.figure()

ax1 = fig.add_axes([0, 0, 1, 1])
ax2 = fig.add_axes([0.2, 0.5, 0.4, 0.4])

# ** Now use x,y, and z arrays to recreate the plot below.
Notice the xlimits and y limits on the inserted plot:** 90 | 91 | # In[22]: 92 | 93 | 94 | fig = plt.figure() 95 | 96 | ax1 = fig.add_axes([0, 0, 1, 1]) 97 | ax1.plot(x, z, 'green') 98 | ax1.plot(x, y, 'blue') 99 | ax1.set_xlabel('x') 100 | ax1.set_ylabel('z') 101 | ax1.set_xlim(0) 102 | ax1.set_ylim(0) 103 | 104 | ax2 = fig.add_axes([0.2, 0.5, 0.4, 0.4]) 105 | ax2.plot(x, y) 106 | ax2.set_xlabel('x') 107 | ax2.set_ylabel('y') 108 | ax2.set_title('zoom') 109 | ax2.set_xlim([20.0, 22.0]) 110 | ax2.set_ylim([30, 50]) 111 | plt.show() 112 | 113 | 114 | # ## Exercise 4 115 | # 116 | # ** Use plt.subplots(nrows=1, ncols=2) to create the plot below.** 117 | 118 | # In[8]: 119 | 120 | 121 | fig, ax = plt.subplots(1, 2) 122 | 123 | 124 | # ** Now plot (x,y) and (x,z) on the axes. Play around with the linewidth and style** 125 | 126 | # In[9]: 127 | 128 | 129 | fig, ax = plt.subplots(1, 2) 130 | 131 | ax[0].plot(x, y, 'b', lw = 2.5, ls = '--') 132 | ax[0].set_xlim(0) 133 | ax[0].set_ylim(0) 134 | 135 | ax[1].plot(x, z, 'r', lw = 4) 136 | ax[1].set_xlim(0) 137 | ax[1].set_ylim(0) 138 | 139 | plt.tight_layout() 140 | 141 | 142 | # ** See if you can resize the plot by adding the figsize() argument in plt.subplots() are copying and pasting your previous code.** 143 | 144 | # In[10]: 145 | 146 | 147 | fig, ax = plt.subplots(1, 2, figsize = (10, 5)) 148 | 149 | ax[0].plot(x, y, 'b', lw = 2.5, ls = '--') 150 | ax[0].set_xlim(0) 151 | ax[0].set_ylim(0) 152 | 153 | ax[1].plot(x, z, 'r', lw = 4) 154 | ax[1].set_xlim(0) 155 | ax[1].set_ylim(0) 156 | 157 | plt.tight_layout() 158 | 159 | 160 | # # Great Job! 161 | -------------------------------------------------------------------------------- /Matplotlib/Matplotlib_Class.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[9]: 5 | 6 | 7 | import matplotlib.pyplot as plt 8 | 9 | 10 | # In[10]: 11 | 12 | 13 | age = [25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45] 14 | salary = [38496, 42000, 46752, 49320, 53200, 15 | 56000, 62316, 64928, 67317, 68748, 73752] 16 | 17 | 18 | # In[11]: 19 | 20 | 21 | # Functional 22 | 23 | 24 | # In[12]: 25 | 26 | 27 | plt.plot(age, salary); # ; = plt.show() 28 | 29 | 30 | # In[8]: 31 | 32 | 33 | plt.plot(age, salary) 34 | plt.xlabel('age') 35 | plt.ylabel('salary') 36 | plt.title('Salary by Age'); 37 | 38 | 39 | # In[14]: 40 | 41 | 42 | salary_2 = [45372, 48876, 53850, 57287, 63016, 43 | 65998, 70003, 70000, 71496, 75370, 83640] 44 | 45 | 46 | # In[10]: 47 | 48 | 49 | plt.plot(age, salary) 50 | plt.plot(age, salary_2) 51 | plt.xlabel('age') 52 | plt.ylabel('salary') 53 | plt.title('Salary by Age'); 54 | 55 | 56 | # In[13]: 57 | 58 | 59 | plt.plot(age, salary, label = 'Turkey') 60 | plt.plot(age, salary_2, label = 'Europe') 61 | plt.xlabel('age') 62 | plt.ylabel('salary') 63 | plt.title('Salary by Age') 64 | plt.legend(); 65 | 66 | 67 | # In[23]: 68 | 69 | 70 | plt.subplot(2, 1, 1) 71 | plt.plot(age, salary, 'r') 72 | 73 | plt.subplot(2, 1, 2) 74 | plt.plot(age, salary_2, 'b') 75 | 76 | plt.tight_layout() 77 | 78 | 79 | # In[15]: 80 | 81 | 82 | import pandas as pd 83 | df = pd.DataFrame(list(zip(age, salary, salary_2)), columns =['age', 'salary', 'salary_2']) 84 | df.head() 85 | 86 | 87 | # In[26]: 88 | 89 | 90 | df['salary'] 91 | 92 | 93 | # In[25]: 94 | 95 | 96 | plt.plot(df['salary']); # index and salary 97 | 98 | 99 | # In[17]: 100 | 101 | 102 | plt.plot(df['age'], df['salary']); # age and salary 103 | 104 | 105 | # In[28]: 106 | 107 | 108 | 
# OOP 109 | 110 | 111 | # In[20]: 112 | 113 | 114 | fig = plt.figure() 115 | 116 | ax = fig.add_axes([0, 0, 0.8, 0.8]) 117 | 118 | 119 | # In[32]: 120 | 121 | 122 | fig = plt.figure() 123 | 124 | ax = fig.add_axes([0, 0, 1, 1]) 125 | 126 | ax.plot(age, salary, 'r') 127 | ax.set_xlabel('Age') 128 | ax.set_ylabel('Salary') 129 | ax.set_title('Salary by Age'); 130 | 131 | 132 | # In[34]: 133 | 134 | 135 | fig = plt.figure() 136 | 137 | ax1 = fig.add_axes([0, 0, 0.8, 0.8]) 138 | 139 | ax1.plot(age, salary, 'r') 140 | ax1.set_xlabel('Age') 141 | ax1.set_ylabel('Salary') 142 | ax1.set_title('Salary by Age') 143 | 144 | ax2 = fig.add_axes([1, 0.1, 0.4, 0.4]) 145 | 146 | ax2.plot(age, salary_2, 'b') 147 | ax2.set_xlabel('Age') 148 | ax2.set_ylabel('Salary2') 149 | ax2.set_title('Salary2 by Age'); 150 | 151 | 152 | # In[46]: 153 | 154 | 155 | ax1 156 | 157 | 158 | # In[47]: 159 | 160 | 161 | ax2 162 | 163 | 164 | # In[48]: 165 | 166 | 167 | fig 168 | 169 | 170 | # In[49]: 171 | 172 | 173 | fig, ax = plt.subplots() 174 | 175 | 176 | # In[56]: 177 | 178 | 179 | fig, ax = plt.subplots() # default 1 row 1 column 180 | 181 | ax.plot(age, salary, 'r') 182 | ax.set_xlabel('Age') 183 | ax.set_ylabel('Salary') 184 | ax.set_title('Salary by Age') 185 | plt.tight_layout() 186 | 187 | 188 | # In[37]: 189 | 190 | 191 | fig, ax = plt.subplots(2, 1) 192 | 193 | ax[0].plot(age, salary, 'r') 194 | ax[0].set_xlabel('Age') 195 | ax[0].set_ylabel('Salary') 196 | ax[0].set_title('Salary by Age') 197 | 198 | ax[1].plot(age, salary_2, 'b') 199 | ax[1].set_xlabel('Age') 200 | ax[1].set_ylabel('Salary2') 201 | ax[1].set_title('Salary2 by Age') 202 | plt.tight_layout() 203 | 204 | 205 | # In[74]: 206 | 207 | 208 | fig, ax = plt.subplots(2, 2) 209 | plt.tight_layout() 210 | 211 | 212 | # In[73]: 213 | 214 | 215 | ax 216 | 217 | 218 | # In[75]: 219 | 220 | 221 | fig 222 | 223 | 224 | # In[79]: 225 | 226 | 227 | fig, ax = plt.subplots(1, 2) 228 | 229 | ax[0].plot(age, salary) 230 | ax[0].set_title('First Plot') 231 | ax[0].set_xlabel('Age') 232 | ax[0].set_ylabel('Salaries') 233 | 234 | ax[1].plot(age, salary_2) 235 | ax[1].set_title('Second Plot') 236 | ax[1].set_xlabel('Age') 237 | 238 | plt.tight_layout() 239 | 240 | 241 | # In[84]: 242 | 243 | 244 | fig, ax = plt.subplots(2, 2) 245 | 246 | ax[0, 0].plot(age, salary) 247 | ax[0, 0].set_title('First Plot') 248 | ax[0, 0].set_xlabel('Age') 249 | ax[0, 0].set_ylabel('Salaries') 250 | 251 | ax[0, 1].plot(age, salary) 252 | ax[0, 1].set_title('Second Plot') 253 | ax[0, 1].set_xlabel('Age') 254 | ax[0, 1].set_ylabel('Salaries') 255 | 256 | ax[1, 0].plot(age, salary_2) 257 | ax[1, 0].set_title('Third Plot') 258 | ax[1, 0].set_xlabel('Age') 259 | ax[1, 0].set_ylabel('Salaries') 260 | 261 | ax[1, 1].plot(age, salary_2) 262 | ax[1, 1].set_title('Fourth Plot') 263 | ax[1, 1].set_xlabel('Age') 264 | ax[1, 1].set_ylabel('Salaries') 265 | 266 | plt.tight_layout() 267 | 268 | 269 | # In[5]: 270 | 271 | 272 | import matplotlib.pyplot as plt 273 | 274 | 275 | # In[6]: 276 | 277 | 278 | age = [25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45] 279 | salary = [38496, 42000, 46752, 49320, 53200, 280 | 56000, 62316, 64928, 67317, 68748, 73752] 281 | salary_2 = [45372, 48876, 53850, 57287, 63016, 282 | 65998, 70003, 70000, 71496, 75370, 83640] 283 | 284 | 285 | # In[7]: 286 | 287 | 288 | import pandas as pd 289 | df = pd.DataFrame(list(zip(age, salary, salary_2)), columns =['age', 'salary', 'salary_2']) 290 | 291 | 292 | # In[9]: 293 | 294 | 295 | df.head() 296 | 297 | 298 | # In[10]: 299 | 300 | 301 | 
fig = plt.figure(figsize = (8, 4)) 302 | 303 | 304 | # In[11]: 305 | 306 | 307 | fig, ax = plt.subplots(figsize = (8, 4)) 308 | 309 | 310 | # In[13]: 311 | 312 | 313 | fig, ax = plt.subplots(figsize = (6, 3)) 314 | ax.plot(age, salary, 'r') 315 | ax.set_xlabel('age') 316 | ax.set_ylabel('salary') 317 | ax.set_title('title'); 318 | 319 | 320 | # In[39]: 321 | 322 | 323 | fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(6,3)) 324 | ax[0].plot(age, salary) 325 | ax[0].set_xlabel('age') 326 | ax[1].plot(age, salary_2) 327 | ax[0].set_title('First') 328 | ax[1].set_title('Second') 329 | ax[1].set_xlabel('age') 330 | plt.tight_layout() 331 | 332 | 333 | # In[15]: 334 | 335 | 336 | fig.savefig('myplot') 337 | 338 | 339 | # In[ ]: 340 | 341 | 342 | # legend, label, title 343 | 344 | 345 | # In[40]: 346 | 347 | 348 | fig, ax = plt.subplots(figsize=(6,3)) 349 | ax.plot(age, salary, label = 'salary_1') 350 | ax.set_xlabel('Age') 351 | ax.plot(age, salary_2, label = 'salary_2') 352 | ax.set_title('Title') 353 | ax.set_ylabel('Salaries') 354 | ax.legend(loc = 0) 355 | plt.tight_layout() 356 | 357 | 358 | # In[41]: 359 | 360 | 361 | # setting colors, linewidths, linetypes, marker 362 | 363 | 364 | # In[61]: 365 | 366 | 367 | fig, ax = plt.subplots(figsize=(6,3)) 368 | ax.plot(age, salary, 'y', lw = 2, ls = ':', marker = '*', ms = 10, mfc = 'b', alpha = 0.3); 369 | 370 | 371 | # In[62]: 372 | 373 | 374 | # plot range (xlim - ylim) 375 | 376 | 377 | # In[80]: 378 | 379 | 380 | fig, ax = plt.subplots(figsize=(6,3)) 381 | ax.plot(age, salary) 382 | ax.set_xlim([30, 40]) 383 | ax.set_ylim([45000, 60000]); 384 | 385 | 386 | # In[66]: 387 | 388 | 389 | # adding vertical-horizontal lines 390 | 391 | 392 | # In[56]: 393 | 394 | 395 | fig, ax = plt.subplots(figsize=(6,3)) 396 | ax.plot(age, salary) 397 | ax.set_xlim([30, 40]) 398 | ax.set_ylim([45000, 60000]) 399 | ax.axvline(35) 400 | ax.axhline(50000, color = 'red'); 401 | 402 | 403 | # In[60]: 404 | 405 | 406 | import numpy as np 407 | np.random.seed(5) 408 | x = np.arange(1, 101) 409 | y = 20 + 3 * x + np.random.normal(0, 60, 100) 410 | p = plt.plot(x, y, "o") 411 | plt.vlines(70,100,250) 412 | plt.hlines(100, 0, 100) 413 | 414 | 415 | # In[82]: 416 | 417 | 418 | # Plot types 419 | 420 | 421 | # In[83]: 422 | 423 | 424 | # Bar chart 425 | 426 | 427 | # In[84]: 428 | 429 | 430 | country = ['UK', 'USA', 'FRA', 'GER', 'NOR'] 431 | pci = [40000, 50000, 38000, 55000, 80000] 432 | 433 | 434 | # In[85]: 435 | 436 | 437 | fig, ax = plt.subplots() 438 | ax.bar(country, pci) 439 | 440 | 441 | # In[86]: 442 | 443 | 444 | labels = ['G1', 'G2', 'G3', 'G4', 'G5'] 445 | men_means = [20, 34, 30, 35, 27] 446 | women_means = [25, 32, 34, 20, 25] 447 | 448 | 449 | # In[89]: 450 | 451 | 452 | fig, ax = plt.subplots() 453 | ax.bar(labels, men_means) 454 | 455 | 456 | # In[94]: 457 | 458 | 459 | fig, ax = plt.subplots() 460 | ax.bar(labels, women_means, color = 'orange'); 461 | 462 | 463 | # In[95]: 464 | 465 | 466 | x = np.arange(len(labels)) 467 | width = 0.35 468 | 469 | 470 | # In[96]: 471 | 472 | 473 | fig, ax = plt.subplots() 474 | ax.bar(x - width/2, men_means, width, label='Men') 475 | ax.bar(x + width/2, women_means, width, label='Women') 476 | 477 | 478 | # In[98]: 479 | 480 | 481 | fig, ax = plt.subplots() 482 | ax.bar(x - width/2, men_means, width, label='Men') 483 | ax.bar(x + width/2, women_means, width, label='Women') 484 | ax.set_ylabel('Scores') 485 | ax.set_title('Scores by group and gender') 486 | ax.set_xticks(x) 487 | ax.set_xticklabels(labels) 488 | 
ax.legend() 489 | 490 | 491 | # In[105]: 492 | 493 | 494 | age = [25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45] 495 | 496 | 497 | fig, ax = plt.subplots(figsize = (8, 4)) 498 | 499 | ax.plot(age, salary) 500 | ax.set_xticks([25, 30, 35, 40, 45]); 501 | 502 | 503 | # In[107]: 504 | 505 | 506 | import pandas as pd 507 | df = pd.DataFrame(list(zip(labels, men_means, women_means)), columns =["labels", "men_means", "women_means"]) 508 | df.head() 509 | 510 | 511 | # In[110]: 512 | 513 | 514 | df.plot.bar(stacked = True) 515 | 516 | -------------------------------------------------------------------------------- /Matplotlib/Matplotlib_U.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import matplotlib.pyplot as plt 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | get_ipython().run_line_magic('matplotlib', 'inline') 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | import numpy as np 20 | 21 | 22 | # In[4]: 23 | 24 | 25 | x = np.linspace(0, 5, 11) 26 | 27 | 28 | # In[5]: 29 | 30 | 31 | y = x ** 2 32 | 33 | 34 | # In[6]: 35 | 36 | 37 | x 38 | 39 | 40 | # In[7]: 41 | 42 | 43 | y 44 | 45 | 46 | # In[8]: 47 | 48 | 49 | # Functional Method 50 | 51 | 52 | # In[9]: 53 | 54 | 55 | plt.plot(x, y) 56 | # plt.show() = print() for matplotlib 57 | 58 | 59 | # In[10]: 60 | 61 | 62 | plt.xlabel('X Label') 63 | plt.ylabel('Y Label') 64 | plt.title('Title') 65 | 66 | 67 | # In[11]: 68 | 69 | 70 | plt.plot(x, y) 71 | plt.xlabel('X Label') 72 | plt.ylabel('Y Label') 73 | plt.title('Title') 74 | 75 | 76 | # In[12]: 77 | 78 | 79 | plt.subplot(1, 2, 1) 80 | plt.plot(x, y, 'k') 81 | 82 | plt.subplot(1, 2, 2) 83 | plt.plot(y, x, 'b') 84 | 85 | 86 | # In[13]: 87 | 88 | 89 | # Object-Oriented Method 90 | 91 | 92 | # In[14]: 93 | 94 | 95 | fig = plt.figure() 96 | 97 | axes = fig.add_axes([0.1, 0.1, 0.8, 0.8]) # left bottom width height, 0-1, relation to black canvas 98 | 99 | axes.plot(x, y) 100 | axes.set_xlabel('X Label') 101 | axes.set_ylabel('Y Label') 102 | axes.set_title('Title') 103 | 104 | 105 | # In[15]: 106 | 107 | 108 | fig = plt.figure() 109 | 110 | axes1 = fig.add_axes([0.1, 0.1, 0.8, 0.8]) 111 | axes2 = fig.add_axes([0.2, 0.5, 0.4, 0.3]) 112 | 113 | axes1.plot(x, y) 114 | axes1.set_title('LARGER PLOT') 115 | 116 | axes2.plot(y, x) 117 | axes2.set_title('SMALLER PLOT') 118 | 119 | 120 | # In[16]: 121 | 122 | 123 | fig = plt.figure() 124 | plt.show() 125 | 126 | 127 | # In[17]: 128 | 129 | 130 | fig = plt.figure() 131 | 132 | axes1 = fig.add_axes([0.1, 0.1, 0.8, 0.8]) 133 | axes1.plot(x, y) 134 | 135 | 136 | # In[18]: 137 | 138 | 139 | fig, axes = plt.subplots(nrows = 1, ncols = 2) 140 | 141 | # axes.plot(x, y) 142 | 143 | 144 | # In[19]: 145 | 146 | 147 | fig, axes = plt.subplots(nrows = 3, ncols = 3) 148 | 149 | # axes.plot(x, y) 150 | plt.tight_layout() 151 | 152 | 153 | # In[20]: 154 | 155 | 156 | axes 157 | 158 | 159 | # In[21]: 160 | 161 | 162 | fig, axes = plt.subplots(nrows = 1, ncols = 2) 163 | 164 | for current_ax in axes: 165 | current_ax.plot(x,y) 166 | 167 | 168 | # In[22]: 169 | 170 | 171 | axes 172 | 173 | 174 | # In[23]: 175 | 176 | 177 | fig, axes = plt.subplots(nrows = 1, ncols = 2) 178 | 179 | axes[0].plot(x, y) 180 | 181 | 182 | # In[24]: 183 | 184 | 185 | fig, axes = plt.subplots(nrows = 1, ncols = 2) 186 | 187 | axes[0].plot(x, y) 188 | axes[0].set_title('First Plot') 189 | 190 | axes[1].plot(y, x) 191 | axes[1].set_title('Second Plot') 192 | 193 | plt.tight_layout() 194 | 195 | 196 | # In[25]: 197 | 198 
| 199 | # Figure size, DPI 200 | 201 | 202 | # In[26]: 203 | 204 | 205 | fig = plt.figure(figsize = (8, 2)) # figsize = width, height in inches 206 | 207 | ax = fig.add_axes([0, 0, 1, 1]) 208 | ax.plot(x, y) 209 | 210 | 211 | # In[27]: 212 | 213 | 214 | fig, axes = plt.subplots(nrows = 2, ncols = 1, figsize = (8, 2)) 215 | 216 | axes[0].plot(x, y) 217 | 218 | axes[1].plot(y, x) 219 | 220 | plt.tight_layout() 221 | 222 | 223 | # In[28]: 224 | 225 | 226 | fig 227 | 228 | 229 | # In[29]: 230 | 231 | 232 | fig.savefig('my_picture.png', dpi = 200, edgecolor = 'black', facecolor = 'w', transparent = True) 233 | # default dpi is 100 it is about pixels 234 | 235 | 236 | # In[30]: 237 | 238 | 239 | fig = plt.figure(figsize = (8, 2)) 240 | 241 | ax = fig.add_axes([0, 0, 1, 1]) 242 | ax.set_title('Title') 243 | ax.set_ylabel('Y') 244 | ax.set_xlabel('X') 245 | 246 | ax.plot(x, x ** 2, label = 'X Squared') 247 | ax.plot(x, x ** 3, label = 'X Cubed') 248 | ax.legend() # it uses/refers labels in .plot 249 | # ax.legend(loc=(0.1, 0.1)) 250 | 251 | 252 | # In[31]: 253 | 254 | 255 | # setting colors, line width, line types 256 | 257 | 258 | # In[32]: 259 | 260 | 261 | fig = plt.figure() 262 | 263 | ax = fig.add_axes([0, 0, 1, 1]) 264 | 265 | ax.plot(x, y, color = 'green') # RGB Hex Code google for custom colors #FF8C00 266 | plt.show() 267 | 268 | 269 | # In[33]: 270 | 271 | 272 | fig = plt.figure() 273 | 274 | ax = fig.add_axes([0, 0, 1, 1]) 275 | 276 | ax.plot(x, y, color = 'green', linewidth = 3) # default linewidth is 1, we can use lw instead of it. 277 | 278 | 279 | # In[34]: 280 | 281 | 282 | fig = plt.figure() 283 | 284 | ax = fig.add_axes([0, 0, 1, 1]) 285 | 286 | ax.plot(x, y, color = 'green', linewidth = 3, alpha = 0.3) # alpha for transparency dafault is 1 287 | 288 | 289 | # In[35]: 290 | 291 | 292 | fig = plt.figure() 293 | 294 | ax = fig.add_axes([0, 0, 1, 1]) 295 | 296 | ax.plot(x, y, color = 'green', lw = 3, linestyle = '-.') # default linestyle is solid, ls 297 | 298 | 299 | # In[36]: 300 | 301 | 302 | fig = plt.figure() 303 | 304 | ax = fig.add_axes([0, 0, 1, 1]) 305 | 306 | ax.plot(x, y, color = 'green', lw = 3, ls = '-', marker = 'o') # marker each value x/y 307 | 308 | 309 | # In[37]: 310 | 311 | 312 | fig = plt.figure() 313 | 314 | ax = fig.add_axes([0, 0, 1, 1]) 315 | 316 | ax.plot(x, y, color = 'green', lw = 3, ls = '-', marker = 'o', markersize = 15, 317 | markerfacecolor = 'red') 318 | 319 | 320 | # In[38]: 321 | 322 | 323 | fig = plt.figure() 324 | 325 | ax = fig.add_axes([0, 0, 1, 1]) 326 | 327 | ax.plot(x, y, color = 'green', lw = 3, ls = '-', marker = 'o', markersize = 15, 328 | markerfacecolor = 'red', markeredgewidth = 3, markeredgecolor = 'blue') 329 | 330 | 331 | # In[39]: 332 | 333 | 334 | # ylim xlim 335 | 336 | 337 | # In[40]: 338 | 339 | 340 | fig = plt.figure() 341 | 342 | ax = fig.add_axes([0, 0, 1, 1]) 343 | 344 | ax.plot(x, y, color = 'purple', lw = 2, ls = '--') 345 | 346 | ax.set_xlim([0 ,1]) 347 | ax.set_ylim([0, 2]) 348 | 349 | 350 | # In[ ]: 351 | 352 | 353 | 354 | 355 | -------------------------------------------------------------------------------- /NumPy/NumPy_1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import numpy as np 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | np.zeros(10) 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | np.ones(10) 20 | 21 | 22 | # In[5]: 23 | 24 | 25 | np.ones(10) * 5 26 | 27 | 28 | # In[6]: 29 | 30 | 31 | np.arange(10, 51) 32 | 33 | 
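# In[ ]:

# Added note: np.arange excludes its stop value, so arange(10, 51) ends at 50,
# and the step form below, arange(10, 51, 2), yields 10, 12, ..., 50.
len(np.arange(10, 51))   # 41 values: 10 through 50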
34 | # In[7]: 35 | 36 | 37 | np.arange(10, 51, 2) 38 | 39 | 40 | # In[8]: 41 | 42 | 43 | np.arange(9).reshape(3, 3) 44 | 45 | 46 | # In[9]: 47 | 48 | 49 | np.eye(3) 50 | 51 | 52 | # In[14]: 53 | 54 | 55 | from numpy.random import rand 56 | rand(1) 57 | 58 | 59 | # In[15]: 60 | 61 | 62 | from numpy.random import randn 63 | randn(25) 64 | 65 | 66 | # In[24]: 67 | 68 | 69 | np.arange(1, 101).reshape(10, 10) / 100 70 | 71 | 72 | # In[25]: 73 | 74 | 75 | np.linspace(0, 1, 20) 76 | 77 | 78 | # In[27]: 79 | 80 | 81 | mat = np.arange(1, 26).reshape(5, 5) 82 | mat 83 | 84 | 85 | # In[28]: 86 | 87 | 88 | mat[2:,1:] 89 | 90 | 91 | # In[29]: 92 | 93 | 94 | mat[3, 4] 95 | 96 | 97 | # In[31]: 98 | 99 | 100 | mat[:3,1:2] 101 | 102 | 103 | # In[53]: 104 | 105 | 106 | mat[4] 107 | 108 | 109 | # In[34]: 110 | 111 | 112 | mat[3:] 113 | 114 | 115 | # In[36]: 116 | 117 | 118 | np.sum(mat) 119 | 120 | 121 | # In[38]: 122 | 123 | 124 | np.std(mat) 125 | 126 | 127 | # In[51]: 128 | 129 | 130 | np.sum(mat, axis = 0) 131 | 132 | 133 | # In[1]: 134 | 135 | 136 | ########LAB######### 137 | 138 | 139 | # In[2]: 140 | 141 | 142 | import numpy as np 143 | 144 | 145 | # In[3]: 146 | 147 | 148 | a = np.array([[3, 1], [1, 2]]) 149 | b = np.array([9, 8]) 150 | 151 | 152 | # In[4]: 153 | 154 | 155 | a 156 | 157 | 158 | # In[5]: 159 | 160 | 161 | b 162 | 163 | 164 | # In[28]: 165 | 166 | 167 | np.linalg.multi_dot(b) 168 | 169 | 170 | # In[9]: 171 | 172 | 173 | x = np.arange(1, 11) 174 | 175 | 176 | # In[10]: 177 | 178 | 179 | x 180 | 181 | 182 | # In[11]: 183 | 184 | 185 | y = np.arange(-1, 1, 0.2) 186 | 187 | 188 | # In[12]: 189 | 190 | 191 | y 192 | 193 | 194 | # In[13]: 195 | 196 | 197 | np.linspace(0, 10, 25) 198 | 199 | 200 | # In[20]: 201 | 202 | 203 | np.logspace(0, 10, 10, base = 2) 204 | 205 | 206 | # In[30]: 207 | 208 | 209 | np.random.seed(0) 210 | np.random.rand(5, 5) # uniform distribution 211 | 212 | 213 | # In[31]: 214 | 215 | 216 | np.random.rand(5, 5) 217 | 218 | 219 | # In[32]: 220 | 221 | 222 | np.random.rand(5, 5) 223 | 224 | 225 | # In[33]: 226 | 227 | 228 | np.random.seed(0) 229 | np.random.rand(5, 5) 230 | 231 | 232 | # In[34]: 233 | 234 | 235 | np.random.randn(3, 3) # normal distribution 236 | 237 | 238 | # In[36]: 239 | 240 | 241 | np.diag([1, 2, 3, 4]) # diagonal matrix 242 | 243 | 244 | # In[37]: 245 | 246 | 247 | np.diag([1, 2, 3, 4], k = -1) # default k = 0 248 | 249 | 250 | # In[38]: 251 | 252 | 253 | np.diag([1, 2, 3, 4], k = 1) 254 | 255 | 256 | # In[41]: 257 | 258 | 259 | np.eye(4, k = -1) 260 | 261 | 262 | # In[42]: 263 | 264 | 265 | np.eye(4) 266 | 267 | 268 | # In[43]: 269 | 270 | 271 | d = np.array([i for i in range(5)]) 272 | 273 | 274 | # In[44]: 275 | 276 | 277 | d 278 | 279 | 280 | # In[45]: 281 | 282 | 283 | row_mask = np.array([True, False, True, False, False]) # np.arrays are homogeneous. 
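# In[ ]:

# Added note: a boolean mask must match the length of the axis it indexes;
# d has 5 elements, so row_mask holds exactly 5 booleans. d[row_mask] below
# keeps the positions where the mask is True, i.e. array([0, 2]).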
284 | 285 | 286 | # In[46]: 287 | 288 | 289 | d[row_mask] 290 | 291 | 292 | # In[47]: 293 | 294 | 295 | row1_mask = np.array([0, 0, 0, 1, 1], dtype = bool) 296 | 297 | 298 | # In[49]: 299 | 300 | 301 | d[row1_mask] 302 | 303 | 304 | # In[54]: 305 | 306 | 307 | x = np.arange(0, 10, 0.5) 308 | 309 | 310 | # In[55]: 311 | 312 | 313 | x 314 | 315 | 316 | # In[56]: 317 | 318 | 319 | mask = (5 < x) & (x < 7.5) 320 | 321 | 322 | # In[59]: 323 | 324 | 325 | mask 326 | 327 | 328 | # In[57]: 329 | 330 | 331 | x[mask] 332 | 333 | 334 | # In[58]: 335 | 336 | 337 | x[(5 < x) & (x < 7.5)] 338 | 339 | 340 | # In[62]: 341 | 342 | 343 | indices = np.where(mask) 344 | 345 | 346 | # In[63]: 347 | 348 | 349 | indices 350 | 351 | 352 | # In[64]: 353 | 354 | 355 | np.where(mask) 356 | 357 | 358 | # In[65]: 359 | 360 | 361 | x[indices] 362 | 363 | 364 | # In[67]: 365 | 366 | 367 | a = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [1, 2, 3, 4], [5, 6, 7, 8]]) 368 | 369 | 370 | # In[68]: 371 | 372 | 373 | a 374 | 375 | 376 | # In[70]: 377 | 378 | 379 | np.diag(a) 380 | 381 | 382 | # In[71]: 383 | 384 | 385 | np.diag(a, k = 1) 386 | 387 | 388 | # In[75]: 389 | 390 | 391 | np.diag(a, k = 3) 392 | 393 | 394 | # In[76]: 395 | 396 | 397 | arr = np.arange(-3, 3) 398 | 399 | 400 | # In[77]: 401 | 402 | 403 | arr 404 | 405 | 406 | # In[78]: 407 | 408 | 409 | arr[[1, 3, 5]] 410 | 411 | 412 | # In[79]: 413 | 414 | 415 | arr.take([1, 3, 5]) 416 | 417 | 418 | # In[85]: 419 | 420 | 421 | np.choose([1, 3, 5], arr) 422 | 423 | 424 | # In[ ]: 425 | 426 | 427 | 428 | 429 | -------------------------------------------------------------------------------- /NumPy/NumPy_Class.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import numpy as np 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | my_list = [1, 2, 3] 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | np.array(my_list) 20 | 21 | 22 | # In[5]: 23 | 24 | 25 | type(np.array(my_list)) 26 | 27 | 28 | # In[6]: 29 | 30 | 31 | a = [1, 2, 3, 4] 32 | b = [2, 3, 4, 5] 33 | 34 | 35 | # In[8]: 36 | 37 | 38 | np.array(a) * np.array(b) 39 | 40 | 41 | # In[9]: 42 | 43 | 44 | my_matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] 45 | 46 | 47 | # In[10]: 48 | 49 | 50 | type(my_matrix) 51 | 52 | 53 | # In[11]: 54 | 55 | 56 | np.array(my_matrix) 57 | 58 | 59 | # In[12]: 60 | 61 | 62 | type(np.array(my_matrix)) 63 | 64 | 65 | # In[13]: 66 | 67 | 68 | np.arange(0, 10) 69 | 70 | 71 | # In[16]: 72 | 73 | 74 | np.arange(0, 11, 2) 75 | 76 | 77 | # In[20]: 78 | 79 | 80 | np.arange(10) 81 | 82 | 83 | # In[21]: 84 | 85 | 86 | np.zeros(2) 87 | 88 | 89 | # In[37]: 90 | 91 | 92 | np.zeros((4, 4), dtype = bool) 93 | 94 | 95 | # In[43]: 96 | 97 | 98 | np.zeros((4, 4), dtype = str) 99 | 100 | 101 | # In[39]: 102 | 103 | 104 | np.ones((2, 2), dtype = int) 105 | 106 | 107 | # In[41]: 108 | 109 | 110 | np.ones(3) 111 | 112 | 113 | # In[44]: 114 | 115 | 116 | np.ones((4, 4), dtype = bool) 117 | 118 | 119 | # In[46]: 120 | 121 | 122 | np.full((3, 5), 7) 123 | 124 | 125 | # In[47]: 126 | 127 | 128 | np.full((3, 5), '7') 129 | 130 | 131 | # In[54]: 132 | 133 | 134 | np.linspace(0, 10, 3) 135 | 136 | 137 | # In[55]: 138 | 139 | 140 | np.linspace(0, 10) 141 | 142 | 143 | # In[56]: 144 | 145 | 146 | len(np.linspace(0, 10)) 147 | 148 | 149 | # In[57]: 150 | 151 | 152 | np.linspace(0, 10, dtype = int) 153 | 154 | 155 | # In[58]: 156 | 157 | 158 | set(np.linspace(0, 10, dtype = int)) 159 | 160 | 161 | # In[60]: 162 | 163 | 164 | np.linspace(0, 10, dtype 
= int).reshape(10, 5) 165 | 166 | 167 | # In[61]: 168 | 169 | 170 | np.eye(4) 171 | 172 | 173 | # In[62]: 174 | 175 | 176 | np.random.rand(5) # uniform distribution 177 | 178 | 179 | # In[64]: 180 | 181 | 182 | np.random.rand(3, 2) 183 | 184 | 185 | # In[1]: 186 | 187 | 188 | import matplotlib.pyplot as plt 189 | 190 | 191 | # In[4]: 192 | 193 | 194 | plt.hist(np.random.rand(5000)) # uniform distribution 195 | plt.show() 196 | 197 | 198 | # In[90]: 199 | 200 | 201 | plt.hist(np.random.rand(50000), bins = 75) 202 | plt.show() 203 | 204 | 205 | # In[80]: 206 | 207 | 208 | np.random.randn(5) # normal distribution 209 | 210 | 211 | # In[88]: 212 | 213 | 214 | np.random.randn(5, 5) 215 | 216 | 217 | # In[94]: 218 | 219 | 220 | plt.hist(np.random.randn(50000)) # normal distribution 221 | plt.show() 222 | 223 | 224 | # In[99]: 225 | 226 | 227 | np.random.randn(50000).mean() 228 | 229 | 230 | # In[5]: 231 | 232 | 233 | np.random.randn(50000).std() 234 | 235 | 236 | # In[5]: 237 | 238 | 239 | np.random.randint(1, 100) 240 | 241 | 242 | # In[7]: 243 | 244 | 245 | np.random.randint(100, size = 10) 246 | 247 | 248 | # In[8]: 249 | 250 | 251 | np.random.randint(1, 100, 10) 252 | 253 | 254 | # In[15]: 255 | 256 | 257 | np.random.randint(1, 100, (2, 2)) 258 | 259 | 260 | # In[7]: 261 | 262 | 263 | np.random.randint(1, [3, 50, 100]) 264 | 265 | 266 | # In[9]: 267 | 268 | 269 | np.random.randint(1, [3, 50, 100], (10, 3)) 270 | 271 | 272 | # In[22]: 273 | 274 | 275 | np.random.randint([3, 50, 100], [5, 60, 120]) 276 | 277 | 278 | # In[10]: 279 | 280 | 281 | np.random.randint([3, 50, 100], [5, 60, 120], (5, 3)) 282 | 283 | 284 | # In[23]: 285 | 286 | 287 | arr = np.arange(25) 288 | ranarr = np.random.randint(0, 50, 10) 289 | 290 | 291 | # In[24]: 292 | 293 | 294 | arr 295 | 296 | 297 | # In[25]: 298 | 299 | 300 | ranarr 301 | 302 | 303 | # In[26]: 304 | 305 | 306 | arr.reshape(5, 5) 307 | 308 | 309 | # In[28]: 310 | 311 | 312 | np.reshape(ranarr, (2, 5)) 313 | 314 | 315 | # In[31]: 316 | 317 | 318 | ranarr.max() 319 | 320 | 321 | # In[32]: 322 | 323 | 324 | ranarr.argmax() 325 | 326 | 327 | # In[33]: 328 | 329 | 330 | np.max(ranarr) 331 | 332 | 333 | # In[34]: 334 | 335 | 336 | ranarr.min() 337 | 338 | 339 | # In[35]: 340 | 341 | 342 | ranarr.argmin() 343 | 344 | 345 | # In[37]: 346 | 347 | 348 | arr.ndim 349 | 350 | 351 | # In[38]: 352 | 353 | 354 | arr.shape 355 | 356 | 357 | # In[41]: 358 | 359 | 360 | arr.reshape(5, 5).shape 361 | 362 | 363 | # In[39]: 364 | 365 | 366 | arr.size 367 | 368 | 369 | # In[40]: 370 | 371 | 372 | arr.dtype 373 | 374 | 375 | # In[11]: 376 | 377 | 378 | x = np.array([1, 2, 3]) 379 | y = np.array([4, 5, 6]) 380 | 381 | 382 | # In[13]: 383 | 384 | 385 | np.concatenate([x, y]) 386 | 387 | 388 | # In[45]: 389 | 390 | 391 | z = np.array([7, 8, 9]) 392 | 393 | 394 | # In[48]: 395 | 396 | 397 | np.concatenate([x, y, z]) 398 | 399 | 400 | # In[50]: 401 | 402 | 403 | a1 = np.concatenate([x, y]).reshape(2, 3) 404 | 405 | 406 | # In[51]: 407 | 408 | 409 | a1 410 | 411 | 412 | # In[54]: 413 | 414 | 415 | np.concatenate([a1, a1]) 416 | 417 | 418 | # In[55]: 419 | 420 | 421 | np.concatenate([a1, a1], axis = 1) 422 | 423 | 424 | # In[56]: 425 | 426 | 427 | x = np.array([1, 2, 3, 99, 99, 3, 2, 1]) 428 | 429 | 430 | # In[63]: 431 | 432 | 433 | np.split(x, [3, 5, 7]) 434 | 435 | 436 | # In[70]: 437 | 438 | 439 | a, b, c, d = np.split(x, [3, 5, 7]) 440 | 441 | 442 | # In[71]: 443 | 444 | 445 | a 446 | 447 | 448 | # In[72]: 449 | 450 | 451 | b 452 | 453 | 454 | # In[73]: 455 | 456 | 457 | c 458 | 
459 | 460 | # In[74]: 461 | 462 | 463 | d 464 | 465 | 466 | # In[75]: 467 | 468 | 469 | np.split(x, 4) 470 | 471 | 472 | # In[16]: 473 | 474 | 475 | y = np.arange(20).reshape(5, 4) 476 | 477 | 478 | # In[17]: 479 | 480 | 481 | y 482 | 483 | 484 | # In[18]: 485 | 486 | 487 | np.split(y, 5) 488 | 489 | 490 | # In[19]: 491 | 492 | 493 | np.split(y, 4, axis = 1) 494 | 495 | 496 | # In[20]: 497 | 498 | 499 | np.vsplit(y, [2,4]) 500 | 501 | 502 | # In[21]: 503 | 504 | 505 | np.vsplit(y, 5) 506 | 507 | 508 | # In[22]: 509 | 510 | 511 | np.split(y, [2, 4]) 512 | 513 | 514 | # In[25]: 515 | 516 | 517 | np.split(y, [2, 4], axis = 1) 518 | 519 | 520 | # In[118]: 521 | 522 | 523 | y 524 | 525 | 526 | # In[123]: 527 | 528 | 529 | np.hsplit(y, [3]) 530 | 531 | 532 | # In[125]: 533 | 534 | 535 | np.hsplit(y, 2) 536 | 537 | 538 | # In[127]: 539 | 540 | 541 | left, right = np.hsplit(y, 2) 542 | 543 | 544 | # In[128]: 545 | 546 | 547 | left 548 | 549 | 550 | # In[129]: 551 | 552 | 553 | right 554 | 555 | 556 | # In[132]: 557 | 558 | 559 | upper, lower = np.vsplit(y, [4]) 560 | 561 | 562 | # In[133]: 563 | 564 | 565 | upper 566 | 567 | 568 | # In[134]: 569 | 570 | 571 | lower 572 | 573 | 574 | # In[140]: 575 | 576 | 577 | v = np.array([2, 1, 4, 3, 5]) 578 | v 579 | 580 | 581 | # In[141]: 582 | 583 | 584 | np.sort(v) # we need to assign a new variable 585 | 586 | 587 | # In[142]: 588 | 589 | 590 | v 591 | 592 | 593 | # In[143]: 594 | 595 | 596 | v.sort() # changes 597 | 598 | 599 | # In[144]: 600 | 601 | 602 | v 603 | 604 | 605 | # In[148]: 606 | 607 | 608 | v2 = np.random.randint(5, 100, (3, 3)) 609 | 610 | 611 | # In[149]: 612 | 613 | 614 | v2 615 | 616 | 617 | # In[150]: 618 | 619 | 620 | np.sort(v2, axis = 0) 621 | 622 | 623 | # In[151]: 624 | 625 | 626 | np.sort(v2, axis = 1) 627 | 628 | 629 | # In[156]: 630 | 631 | 632 | np.sort(v2) 633 | 634 | 635 | # In[157]: 636 | 637 | 638 | arr = np.arange(0, 11) 639 | 640 | 641 | # In[158]: 642 | 643 | 644 | arr 645 | 646 | 647 | # In[159]: 648 | 649 | 650 | arr[2:4] 651 | 652 | 653 | # In[160]: 654 | 655 | 656 | arr[8] 657 | 658 | 659 | # In[161]: 660 | 661 | 662 | arr[-1] 663 | 664 | 665 | # In[163]: 666 | 667 | 668 | arr[::2] 669 | 670 | 671 | # In[164]: 672 | 673 | 674 | arr[0:5] = 100 675 | 676 | 677 | # In[165]: 678 | 679 | 680 | arr 681 | 682 | 683 | # In[166]: 684 | 685 | 686 | arr = np.arange(11) 687 | 688 | 689 | # In[167]: 690 | 691 | 692 | arr 693 | 694 | 695 | # In[172]: 696 | 697 | 698 | slice_of_arr = arr[0:6] 699 | 700 | 701 | # In[173]: 702 | 703 | 704 | slice_of_arr 705 | 706 | 707 | # In[176]: 708 | 709 | 710 | slice_of_arr[:] = 88 711 | 712 | 713 | # In[177]: 714 | 715 | 716 | arr 717 | 718 | 719 | # In[178]: 720 | 721 | 722 | slice_of_arr 723 | 724 | 725 | # In[179]: 726 | 727 | 728 | arr = np.arange(11) 729 | 730 | 731 | # In[180]: 732 | 733 | 734 | arr_2 = arr.copy() 735 | 736 | 737 | # In[181]: 738 | 739 | 740 | arr_2 741 | 742 | 743 | # In[182]: 744 | 745 | 746 | slice_of_arr = arr[0:6] 747 | 748 | 749 | # In[183]: 750 | 751 | 752 | slice_of_arr[:] = 77 753 | 754 | 755 | # In[184]: 756 | 757 | 758 | arr 759 | 760 | 761 | # In[185]: 762 | 763 | 764 | arr_2 765 | 766 | 767 | # In[27]: 768 | 769 | 770 | arr_2d = np.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]]) 771 | arr_2d 772 | 773 | 774 | # In[187]: 775 | 776 | 777 | arr_2d[1] 778 | 779 | 780 | # In[30]: 781 | 782 | 783 | arr_2d[1, 0] 784 | 785 | 786 | # In[192]: 787 | 788 | 789 | arr_2d[1, 0:1] 790 | 791 | 792 | # In[193]: 793 | 794 | 795 | arr_2d[:, 2] 796 | 797 | 798 | # In[194]: 799 
| 800 | 801 | arr_2d[:, 2:] 802 | 803 | 804 | # In[195]: 805 | 806 | 807 | arr_2d[:, 2] = 3 808 | 809 | 810 | # In[196]: 811 | 812 | 813 | arr_2d 814 | 815 | 816 | # In[200]: 817 | 818 | 819 | v = np.arange(0, 30, 3) 820 | 821 | 822 | # In[201]: 823 | 824 | 825 | v 826 | 827 | 828 | # In[202]: 829 | 830 | 831 | v[1] 832 | 833 | 834 | # In[206]: 835 | 836 | 837 | idx_list = [1, 3, 5] # fancy indexing 838 | 839 | 840 | # In[207]: 841 | 842 | 843 | v[idx_list] # fancy indexing 844 | 845 | 846 | # In[208]: 847 | 848 | 849 | v[[1, 3, 5]] # fancy indexing 850 | 851 | 852 | # In[210]: 853 | 854 | 855 | arr_2d = np.zeros((10, 10), dtype = int) 856 | 857 | 858 | # In[211]: 859 | 860 | 861 | arr_2d 862 | 863 | 864 | # In[212]: 865 | 866 | 867 | arr_2d.shape 868 | 869 | 870 | # In[213]: 871 | 872 | 873 | arr_length = arr_2d.shape[1] 874 | 875 | 876 | # In[215]: 877 | 878 | 879 | arr_length 880 | 881 | 882 | # In[216]: 883 | 884 | 885 | arr_2d[0] 886 | 887 | 888 | # In[217]: 889 | 890 | 891 | arr_2d[3] 892 | 893 | 894 | # In[218]: 895 | 896 | 897 | for i in range(arr_length): 898 | arr_2d[i] = i 899 | 900 | 901 | # In[219]: 902 | 903 | 904 | arr_2d 905 | 906 | 907 | # In[220]: 908 | 909 | 910 | arr_2d[[2, 4, 6, 8]] 911 | 912 | 913 | # In[221]: 914 | 915 | 916 | arr_2d[[6, 4, 2, 7]] 917 | 918 | 919 | # In[3]: 920 | 921 | 922 | jj = np.arange(1, 17).reshape(4, 4) 923 | 924 | 925 | # In[4]: 926 | 927 | 928 | jj 929 | 930 | 931 | # In[9]: 932 | 933 | 934 | jj[[1, 3], [2, 3]] # fancy indexing [axis-0], [axis-1] 935 | 936 | 937 | # In[226]: 938 | 939 | 940 | jj[[1, 2], [0, 3]] 941 | 942 | 943 | # In[227]: 944 | 945 | 946 | jj 947 | 948 | 949 | # In[228]: 950 | 951 | 952 | jj[1, [1, 3]] 953 | 954 | 955 | # In[230]: 956 | 957 | 958 | jj [[0, 3], 1] 959 | 960 | 961 | # In[232]: 962 | 963 | 964 | jj[0:, [1, 3]] 965 | 966 | 967 | # In[233]: 968 | 969 | 970 | arr = np.arange(1, 11) 971 | 972 | 973 | # In[234]: 974 | 975 | 976 | arr 977 | 978 | 979 | # In[235]: 980 | 981 | 982 | arr > 4 983 | 984 | 985 | # In[236]: 986 | 987 | 988 | arr[arr > 4] 989 | 990 | 991 | # In[242]: 992 | 993 | 994 | arr[(arr != 3) & (arr != 4)] 995 | 996 | 997 | # In[250]: 998 | 999 | 1000 | arr[arr % 2 == 0] 1001 | 1002 | 1003 | # In[251]: 1004 | 1005 | 1006 | arr = np.arange(11) 1007 | 1008 | 1009 | # In[252]: 1010 | 1011 | 1012 | arr + arr 1013 | 1014 | 1015 | # In[253]: 1016 | 1017 | 1018 | arr - arr 1019 | 1020 | 1021 | # In[254]: 1022 | 1023 | 1024 | arr * arr 1025 | 1026 | 1027 | # In[255]: 1028 | 1029 | 1030 | arr ** 2 1031 | 1032 | 1033 | # In[256]: 1034 | 1035 | 1036 | arr // arr 1037 | 1038 | 1039 | # In[257]: 1040 | 1041 | 1042 | arr / 0 1043 | 1044 | 1045 | # In[258]: 1046 | 1047 | 1048 | arr / 1 1049 | 1050 | 1051 | # In[259]: 1052 | 1053 | 1054 | arr + 3 1055 | 1056 | 1057 | # In[260]: 1058 | 1059 | 1060 | np.exp(arr) 1061 | 1062 | 1063 | # In[261]: 1064 | 1065 | 1066 | np.sin(arr) 1067 | 1068 | 1069 | # In[263]: 1070 | 1071 | 1072 | np.sin(np.pi/2) 1073 | 1074 | 1075 | # In[264]: 1076 | 1077 | 1078 | np.tan(np.pi/4) 1079 | 1080 | 1081 | # In[ ]: 1082 | 1083 | 1084 | 1085 | 1086 | -------------------------------------------------------------------------------- /NumPy/NumPy_U.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | pip show numpy 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | pip install numpy 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | my_list = [1, 2, 3] 20 | 21 | 22 | # In[4]: 23 | 24 | 25 | import numpy as np 26 | 27 
| 28 | # In[5]: 29 | 30 | 31 | arr = np.array(my_list) 32 | 33 | 34 | # In[6]: 35 | 36 | 37 | arr 38 | 39 | 40 | # In[7]: 41 | 42 | 43 | my_math = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] 44 | 45 | 46 | # In[8]: 47 | 48 | 49 | np.array(my_math) 50 | 51 | 52 | # In[9]: 53 | 54 | 55 | np.arange(11) # it is similar range() func. in python. (start,stop,step) 56 | 57 | 58 | # In[10]: 59 | 60 | 61 | np.arange(1, 11, 2) 62 | 63 | 64 | # In[11]: 65 | 66 | 67 | np.zeros(3) 68 | 69 | 70 | # In[12]: 71 | 72 | 73 | np.zeros((2, 5)) # 2 rows, 5 columns 74 | 75 | 76 | # In[13]: 77 | 78 | 79 | np.ones(4) # output has 1 bracket it is 1-D array 80 | 81 | 82 | # In[14]: 83 | 84 | 85 | np.ones((3, 2)) # output has 2 brackets it is 2-D array 86 | 87 | 88 | # In[15]: 89 | 90 | 91 | np.linspace(2, 3, 5) # (start, stop, number, some extra parameters...) 92 | 93 | 94 | # In[16]: 95 | 96 | 97 | np.linspace(0, 5, 10) 98 | 99 | 100 | # In[17]: 101 | 102 | 103 | np.linspace(0, 5, 10, retstep = True) 104 | 105 | 106 | # In[18]: 107 | 108 | 109 | np.eye(4) 110 | 111 | 112 | # In[19]: 113 | 114 | 115 | np.random.rand(5) # random between 0 and 1, uniform distribution 116 | 117 | 118 | # In[20]: 119 | 120 | 121 | np.random.rand(5, 5) 122 | 123 | 124 | # In[21]: 125 | 126 | 127 | np.random.randn(5) # random around 0, normal distribution 128 | 129 | 130 | # In[22]: 131 | 132 | 133 | np.random.randn(4, 4) 134 | 135 | 136 | # In[23]: 137 | 138 | 139 | np.random.randint(1,100) # 1 is inclusive, 100 is exclusive 140 | 141 | 142 | # In[24]: 143 | 144 | 145 | np.random.randint(1, 100, 10) 146 | 147 | 148 | # In[25]: 149 | 150 | 151 | arr = np.arange(25) 152 | print(arr) 153 | 154 | 155 | # In[26]: 156 | 157 | 158 | np.reshape(arr, (5, 5)) 159 | 160 | 161 | # In[27]: 162 | 163 | 164 | ranarr = np.random.randint(0, 50, 10) 165 | print(ranarr) 166 | 167 | 168 | # In[28]: 169 | 170 | 171 | arr.reshape(5, 10) 172 | 173 | 174 | # In[ ]: 175 | 176 | 177 | arr.reshape(5, 5) # we changed it as a 2-D with reshape method 178 | 179 | 180 | # In[29]: 181 | 182 | 183 | ranarr 184 | 185 | 186 | # In[30]: 187 | 188 | 189 | ranarr.max() 190 | 191 | 192 | # In[31]: 193 | 194 | 195 | ranarr.min() 196 | 197 | 198 | # In[32]: 199 | 200 | 201 | ranarr.argmax() # indexing max value in array 202 | 203 | 204 | # In[33]: 205 | 206 | 207 | ranarr.argmin() 208 | 209 | 210 | # In[34]: 211 | 212 | 213 | arr.shape # 1-D 214 | 215 | 216 | # In[35]: 217 | 218 | 219 | arr 220 | 221 | 222 | # In[36]: 223 | 224 | 225 | arr = arr.reshape(5, 5) 226 | 227 | 228 | # In[37]: 229 | 230 | 231 | arr.shape # 2-D 232 | 233 | 234 | # In[38]: 235 | 236 | 237 | arr 238 | 239 | 240 | # In[39]: 241 | 242 | 243 | arr.dtype # it gives actual data type 244 | 245 | 246 | # In[40]: 247 | 248 | 249 | from numpy.random import randint 250 | 251 | 252 | # In[41]: 253 | 254 | 255 | randint(2, 10) 256 | 257 | 258 | # In[42]: 259 | 260 | 261 | # NumPy Indexing and Selection 262 | 263 | 264 | # In[43]: 265 | 266 | 267 | import numpy as np 268 | 269 | 270 | # In[44]: 271 | 272 | 273 | arr = np.arange(11) 274 | 275 | 276 | # In[45]: 277 | 278 | 279 | arr 280 | 281 | 282 | # In[46]: 283 | 284 | 285 | arr[8] 286 | 287 | 288 | # In[47]: 289 | 290 | 291 | arr[1:5] 292 | 293 | 294 | # In[48]: 295 | 296 | 297 | arr[0:5] 298 | 299 | 300 | # In[49]: 301 | 302 | 303 | arr[:] 304 | 305 | 306 | # In[50]: 307 | 308 | 309 | arr[:6] 310 | 311 | 312 | # In[51]: 313 | 314 | 315 | arr[::-1] 316 | 317 | 318 | # In[52]: 319 | 320 | 321 | arr[::2] 322 | 323 | 324 | # In[53]: 325 | 326 | 327 | arr[0:5] = 100 # broadcast 
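# In[ ]:

# Added note: the scalar 100 above is broadcast across the whole slice, so
# arr[0:5] = 100 is shorthand for arr[0:5] = [100, 100, 100, 100, 100], and
# because slicing returns a view, arr itself is modified in place.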
328 | 329 | 330 | # In[54]: 331 | 332 | 333 | arr 334 | 335 | 336 | # In[55]: 337 | 338 | 339 | arr = np.arange(11) 340 | 341 | 342 | # In[56]: 343 | 344 | 345 | arr 346 | 347 | 348 | # In[57]: 349 | 350 | 351 | slice_of_arr = arr[0:6] # original array does not copied 352 | 353 | 354 | # In[58]: 355 | 356 | 357 | slice_of_arr 358 | 359 | 360 | # In[59]: 361 | 362 | 363 | slice_of_arr[:] = 99 364 | 365 | 366 | # In[60]: 367 | 368 | 369 | slice_of_arr 370 | 371 | 372 | # In[61]: 373 | 374 | 375 | arr # it changes too 376 | 377 | 378 | # In[62]: 379 | 380 | 381 | arr_copy = arr.copy() # we copied it now the origibal doesnt change 382 | 383 | 384 | # In[63]: 385 | 386 | 387 | arr 388 | 389 | 390 | # In[64]: 391 | 392 | 393 | arr_copy[:] = 100 394 | 395 | 396 | # In[65]: 397 | 398 | 399 | arr_copy 400 | 401 | 402 | # In[66]: 403 | 404 | 405 | arr 406 | 407 | 408 | # In[67]: 409 | 410 | 411 | arr_2d = np.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]]) 412 | 413 | 414 | # In[68]: 415 | 416 | 417 | arr_2d 418 | 419 | 420 | # In[69]: 421 | 422 | 423 | arr_2d[0, 0] 424 | 425 | 426 | # In[70]: 427 | 428 | 429 | arr_2d[0, 1] 430 | 431 | 432 | # In[71]: 433 | 434 | 435 | arr_2d[1, 2] 436 | 437 | 438 | # In[72]: 439 | 440 | 441 | arr_2d[0] 442 | 443 | 444 | # In[73]: 445 | 446 | 447 | arr_2d[1] 448 | 449 | 450 | # In[74]: 451 | 452 | 453 | arr_2d[:2] 454 | 455 | 456 | # In[75]: 457 | 458 | 459 | arr_2d[::2] 460 | 461 | 462 | # In[76]: 463 | 464 | 465 | arr_2d[:2,1:] 466 | 467 | 468 | # In[77]: 469 | 470 | 471 | arr_2d[1:,:2] 472 | 473 | 474 | # In[78]: 475 | 476 | 477 | arr = np.arange(1, 11) 478 | 479 | 480 | # In[79]: 481 | 482 | 483 | arr 484 | 485 | 486 | # In[80]: 487 | 488 | 489 | bool_arr = arr > 5 490 | 491 | 492 | # In[81]: 493 | 494 | 495 | bool_arr 496 | 497 | 498 | # In[82]: 499 | 500 | 501 | arr[bool_arr] 502 | 503 | 504 | # In[83]: 505 | 506 | 507 | arr[arr > 5] 508 | 509 | 510 | # In[84]: 511 | 512 | 513 | arr[arr <= 3] 514 | 515 | 516 | # In[85]: 517 | 518 | 519 | arr_2d = np.arange(50).reshape(5, 10) 520 | 521 | 522 | # In[86]: 523 | 524 | 525 | arr_2d 526 | 527 | 528 | # In[87]: 529 | 530 | 531 | arr_2d[1:3,3:5] 532 | 533 | 534 | # In[88]: 535 | 536 | 537 | # NumPy Operations 538 | 539 | 540 | # In[89]: 541 | 542 | 543 | import numpy as np 544 | 545 | 546 | # In[90]: 547 | 548 | 549 | arr = np.arange(11) 550 | 551 | 552 | # In[91]: 553 | 554 | 555 | arr 556 | 557 | 558 | # In[92]: 559 | 560 | 561 | arr + arr 562 | 563 | 564 | # In[93]: 565 | 566 | 567 | arr - arr 568 | 569 | 570 | # In[94]: 571 | 572 | 573 | arr * arr 574 | 575 | 576 | # In[95]: 577 | 578 | 579 | arr + 100 580 | 581 | 582 | # In[96]: 583 | 584 | 585 | arr * 2 586 | 587 | 588 | # In[97]: 589 | 590 | 591 | arr ** 2 592 | 593 | 594 | # In[98]: 595 | 596 | 597 | arr % 2 598 | 599 | 600 | # In[99]: 601 | 602 | 603 | arr / arr # 0 / 0 gives error normally. 
NumPy only issues a RuntimeWarning and returns nan for that element 604 | 605 | 606 | # In[100]: 607 | 608 | 609 | 1 / arr # 1 / 0 warns and returns inf 610 | 611 | 612 | # In[101]: 613 | 614 | 615 | np.sqrt(arr) 616 | 617 | 618 | # In[102]: 619 | 620 | 621 | np.exp(arr) 622 | 623 | 624 | # In[103]: 625 | 626 | 627 | np.max(arr) 628 | 629 | 630 | # In[104]: 631 | 632 | 633 | arr.max() 634 | 635 | 636 | # In[105]: 637 | 638 | 639 | np.sin(arr) 640 | 641 | 642 | # In[106]: 643 | 644 | 645 | arr 646 | 647 | 648 | # In[107]: 649 | 650 | 651 | np.log(arr) # log(0) warns and returns -inf 652 | 653 | 654 | # In[108]: 655 | 656 | 657 | import numpy as np 658 | 659 | 660 | # In[109]: 661 | 662 | 663 | arr = np.arange(10) 664 | arr 665 | 666 | 667 | # In[110]: 668 | 669 | 670 | print(arr) 671 | 672 | 673 | # In[111]: 674 | 675 | 676 | print(type(arr)) 677 | 678 | 679 | # In[112]: 680 | 681 | 682 | print(type(arr[0])) 683 | 684 | 685 | # In[113]: 686 | 687 | 688 | np.full((3, 2), 1) 689 | 690 | 691 | # In[114]: 692 | 693 | 694 | np.empty(2, dtype = int) # uninitialized memory, contents are arbitrary 695 | 696 | 697 | # In[115]: 698 | 699 | 700 | np.empty((2, 2)) 701 | 702 | 703 | # In[116]: 704 | 705 | 706 | np.random.seed(101) 707 | np.random.randint(10, size = 6) 708 | 709 | 710 | # In[117]: 711 | 712 | 713 | from skimage import io 714 | photo = io.imread('Sea.jpg') 715 | type(photo) 716 | 717 | 718 | # In[118]: 719 | 720 | 721 | photo.shape 722 | 723 | 724 | # In[119]: 725 | 726 | 727 | import matplotlib.pyplot as plt 728 | print(plt.imshow(photo)) 729 | 730 | 731 | # In[120]: 732 | 733 | 734 | plt.imshow(photo[::-1]) # flip vertically 735 | 736 | 737 | # In[121]: 738 | 739 | 740 | plt.imshow(photo[:, ::-1]) # flip horizontally 741 | 742 | 743 | # In[122]: 744 | 745 | 746 | plt.imshow(photo[:300,:600]) # crop the top-left 300 x 600 region 747 | 748 | 749 | # In[123]: 750 | 751 | 752 | plt.imshow(photo[::2, ::2]) # downsample by 2: keep every second row and column 753 | 754 | 755 | # In[124]: 756 | 757 | 758 | print(np.sum(photo)) 759 | 760 | 761 | # In[125]: 762 | 763 | 764 | print(np.mean(photo)) 765 | print(np.std(photo)) 766 | print(np.var(photo)) 767 | 768 | 769 | # In[126]: 770 | 771 | 772 | a = np.array([9, 5, 1, 7, 3]) 773 | 774 | 775 | # In[127]: 776 | 777 | 778 | a 779 | 780 | 781 | # In[128]: 782 | 783 | 784 | np.sort(a) 785 | 786 | 787 | # In[129]: 788 | 789 | 790 | a = np.array(42) 791 | b = np.array([1, 2, 3, 4, 5]) 792 | c = np.array([[1, 2, 3], [4, 5, 6]]) 793 | d = np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]]) 794 | 795 | print(a.ndim) 796 | print(b.ndim) 797 | print(c.ndim) 798 | print(d.ndim) 799 | 800 | 801 | # In[130]: 802 | 803 | 804 | arr = np.array([1, 2, 3, 4], ndmin=5) 805 | 806 | print(arr) 807 | print('number of dimensions :', arr.ndim) 808 | 809 | 810 | # In[131]: 811 | 812 | 813 | arr = np.array([1, 2, 3, 4]) 814 | 815 | print(type(arr)) 816 | print(arr.dtype) 817 | 818 | 819 | # In[132]: 820 | 821 | 822 | arr = np.array(['apple', 'banana', 'cherry']) 823 | 824 | print(arr.dtype) 825 | print(type(arr[0])) 826 | 827 | 828 | # In[133]: 829 | 830 | 831 | # data type codes in NumPy 832 | # i - integer 833 | # b - boolean 834 | # u - unsigned integer 835 | # f - float 836 | # c - complex float 837 | # m - timedelta 838 | # M - datetime 839 | # O - object 840 | # S - string 841 | # U - unicode string 842 | # V - fixed chunk of memory for another type (void) 843 | 844 | 845 | # In[134]: 846 | 847 | 848 | arr = np.array([1, 2, 3, 4], dtype='U') 849 | 850 | print(arr) 851 | print(arr.dtype) 852 | print(type(arr[0])) 853 | 854 | 855 | # In[135]: 856 | 857 | 858 | arr = np.array([1, 2, 3, 4], dtype='i4') 859 | 860 | print(arr) 861 | print(arr.dtype) 862 | 863 |
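# To round off the dtype cells above, a short sketch of explicit conversion with .astype(): 'i4' is int32, and astype() always returns a new array rather than changing the dtype in place. This is plain NumPy, independent of the notebook's state.
import numpy as np
vals = np.array([1.7, 2.2, 3.9])
print(vals.dtype)                # float64
ints = vals.astype('i4')         # truncates toward zero -> [1 2 3]
print(ints, ints.dtype)          # [1 2 3] int32
print(ints.astype('U'))          # back to unicode strings: ['1' '2' '3']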
-------------------------------------------------------------------------------- /Other/Regex.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import re 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | import pandas as pd 14 | 15 | 16 | # In[26]: 17 | 18 | 19 | text = re.search(r'\d', 'A2') # \d matches a digit; raw strings keep the backslash literal 20 | 21 | 22 | # In[27]: 23 | 24 | 25 | print(text) 26 | 27 | 28 | # In[6]: 29 | 30 | 31 | print(text.group()) 32 | 33 | 34 | # In[23]: 35 | 36 | 37 | text = re.search(r'\D', '22a') # \D matches a non-digit 38 | print(text.group()) 39 | 40 | 41 | # In[9]: 42 | 43 | 44 | text = 'My phone number is 505-555-5555' 45 | 46 | 47 | # In[19]: 48 | 49 | 50 | output = re.search(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)', text) 51 | 52 | 53 | # In[20]: 54 | 55 | 56 | print(output.group()) 57 | 58 | 59 | # In[36]: 60 | 61 | 62 | text = 'My phone number is 415-555-1212' 63 | 64 | 65 | # In[37]: 66 | 67 | 68 | output = re.search(r'(\d\d\d)-(\d\d\d-\d\d\d\d)', text) 69 | 70 | 71 | # In[42]: 72 | 73 | 74 | print(output.group(2)) # the parentheses define capture groups split at the first '-' 75 | 76 | 77 | # In[45]: 78 | 79 | 80 | print(output.group(1)) 81 | 82 | 83 | # In[46]: 84 | 85 | 86 | text = 'My phone number is (415) 555-1212' 87 | 88 | 89 | # In[49]: 90 | 91 | 92 | output = re.search(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)', text) 93 | 94 | 95 | # In[50]: 96 | 97 | 98 | print(output.group()) 99 | 100 | 101 | # In[51]: 102 | 103 | 104 | value = '0 1, t 10, o 100.' 105 | 106 | 107 | # In[56]: 108 | 109 | 110 | output = re.findall(r'\d', value) 111 | print(output) 112 | 113 | 114 | # In[58]: 115 | 116 | 117 | output = re.findall(r'\d\d', value) 118 | print(output) 119 | 120 | 121 | # In[59]: 122 | 123 | 124 | output = re.findall(r'\d{1,3}', value) 125 | print(output) 126 | 127 | 128 | # In[64]: 129 | 130 | 131 | phone = '2004-959-559 # This is Phone Number' 132 | output = re.sub(r'\D', '', phone) # replace everything that is NOT a digit with '' 133 | print(output) 134 | 135 | 136 | # In[63]: 137 | 138 | 139 | phone = '2004-959-559 # This is Phone Number' 140 | output = re.sub(r'\D', '.', phone) 141 | print(output) 142 | 143 | 144 | # In[73]: 145 | 146 | 147 | txt = 'hello world' 148 | 149 | output = re.findall('^he', txt) # ^ anchors the match to the start of the string 150 | 151 | print(output) 152 | 153 | 154 | # In[ ]: 155 | 156 | 157 | # Pandas 158 | 159 | 160 | # In[76]: 161 | 162 | 163 | s = pd.Series(['a3', 'b4', 'c5']) # extract the digits from a pandas series 164 | 165 | s.str.extract(r'(\d)') 166 | 167 | 168 | # In[77]: 169 | 170 | 171 | s = pd.Series(['a3', 'b4', 'c5']) # extract the first word character from a pandas series 172 | 173 | s.str.extract(r'(\w)') 174 | 175 | 176 | # In[78]: 177 | 178 | 179 | s = pd.Series(['a3f', 'b4f', 'c5f']) 180 | 181 | s.str.extract(r'(\w\d)') 182 | 183 | 184 | # In[81]: 185 | 186 | 187 | s = pd.Series(['40 l/100 km (comb)', 188 | '38 l/100 km (comb)', '6.4 l/100 km (comb)', 189 | '8.3 kg/100 km (comb)', '5.1 kg/100 km (comb)', 190 | '5.4 l/100 km (comb)', '6.7 l/100 km (comb)', 191 | '6.2 l/100 km (comb)', '7.3 l/100 km (comb)', 192 | '6.3 l/100 km (comb)', '5.7 l/100 km (comb)', 193 | '6.1 l/100 km (comb)', '6.8 l/100 km (comb)', 194 | '7.5 l/100 km (comb)', '7.4 l/100 km (comb)', 195 | '3.6 kg/100 km (comb)', '0 l/100 km (comb)', 196 | '7.8 l/100 km (comb)']) 197 | 198 | 199 | # In[95]: 200 | 201 | 202 | s.str.extract(r'(\d\d|\d\.\d|\d)') # the dot is escaped so it matches a literal decimal point 203 | 204 | 205 | # In[96]: 206 | 207 | 208 | s = pd.Series(['40 l/100 km (comb)', 209 | '38 l/100 km (comb)', '6.4 l/100 km (comb)', 210 | '8.3 kg/100 km (comb)', '5.1 kg/100 km (comb)', 211 | '5.4 l/100 km (comb)', '6.7 l/100 km (comb)', 212 | '6.2 l/100 km (comb)', '7.3 l/100 km (comb)', 213 | '6.3 l/100 km (comb)', '5.7 l/100 km (comb)', 214 | '6.1 l/100 km (comb)', '6.8 l/100 km (comb)', 215 | '7.5 l/100 km (comb)', '7.4 l/100 km (comb)', 216 | '3.6 kg/100 km (comb)', '0 l/100 km (comb)', 217 | '7.8 l/100 km (comb)']) 218 | 219 | 220 | # In[103]: 221 | 222 | 223 | s.str.extract(r'(\d\d|\d\.\d|\d).*(\d\d\d)') 224 | 225 | 226 | # In[105]: 227 | 228 | 229 | s = pd.Series(['06/2020\n\n4.9 l/100 km (comb)', 230 | '11/2020\n\n166 g CO2/km (comb)', 231 | '10/2019\n\n5.3 l/100 km (comb)', 232 | '05/2022\n\n6.3 l/100 km (comb)', 233 | '07/2019\n\n128 g CO2/km (comb)', 234 | '06/2022\n\n112 g CO2/km (comb)', 235 | '01/2022\n\n5.8 l/100 km (comb)', 236 | '11/2020\n\n106 g CO2/km (comb)', 237 | '04/2019\n\n105 g CO2/km (comb)', 238 | '08/2020\n\n133 g CO2/km (comb)', 239 | '04/2022\n\n133 g CO2/km (comb)']) 240 | 241 | 242 | # In[108]: 243 | 244 | 245 | s.str.extract(r'(\d\d).(\d{4})') # month and year; the unescaped dot matches the '/' separator 246 | 247 | 248 | # In[109]: 249 | 250 | 251 | s = pd.Series(['06/2020\n\n4.9 l/100 km (comb)', 252 | '11/2020\n\n166 g CO2/km (comb)', 253 | '10/2019\n\n5.3 l/100 km (comb)', 254 | '05/2022\n\n6.3 l/100 km (comb)', 255 | '07/2019\n\n128 g CO2/km (comb)', 256 | '06/2022\n\n112 g CO2/km (comb)', 257 | '01/2022\n\n5.8 l/100 km (comb)', 258 | '11/2020\n\n106 g CO2/km (comb)', 259 | '04/2019\n\n105 g CO2/km (comb)', 260 | '08/2020\n\n133 g CO2/km (comb)', 261 | '04/2022\n\n133 g CO2/km (comb)']) 262 | 263 | 264 | # In[113]: 265 | 266 | 267 | s.str.extract(r'(\d\d).(\d\d\d\d)\s\s(\d{3}|\d\.\d)') 268 | 269 | 270 | # In[ ]: 271 | 272 | 273 | 274 | 275 | -------------------------------------------------------------------------------- /Pandas/Pandas Built-in DV_1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Pandas Data Visualization Exercise 5 | # 6 | # This is just a quick exercise for you to review the various plots we showed earlier. Use **df3** to replicate the following plots. 7 | 8 | # In[1]: 9 | 10 | 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | df3 = pd.read_csv('df3') 14 | get_ipython().run_line_magic('matplotlib', 'inline') 15 | 16 | 17 | # In[2]: 18 | 19 | 20 | df3.info() 21 | 22 | 23 | # In[3]: 24 | 25 | 26 | df3.head() 27 | 28 | 29 | # ** Recreate this scatter plot of b vs a. Note the color and size of the points. Also note the figure size. See if you can figure out how to stretch it in a similar fashion. Remember back to your matplotlib lecture...** 30 | 31 | # In[39]: 32 | 33 | 34 | df3.plot.scatter('a', 'b', figsize = (12, 3), c = 'red', s = 50) 35 | 36 | 37 | # ** Create a histogram of the 'a' column.** 38 | 39 | # In[20]: 40 | 41 | 42 | df3['a'].hist() 43 | 44 | 45 | # ** These plots are okay, but they don't look very polished. Use style sheets to set the style to 'ggplot' and redo the histogram from above. Also figure out how to add more bins to it.** 46 | 47 | # In[43]: 48 | 49 | 50 | plt.style.use('ggplot') 51 | df3['a'].hist(bins = 20, alpha = 0.5) 52 | 53 | 54 | # ** Create a boxplot comparing the a and b columns.** 55 | 56 | # In[44]: 57 | 58 | 59 | df3[['a', 'b']].plot.box() 60 | 61 | 62 | # In[46]: 63 | 64 | 65 | df3[['a', 'b']].boxplot() 66 | 67 | 68 | # ** Create a kde plot of the 'd' column ** 69 | 70 | # In[26]: 71 | 72 | 73 | df3['d'].plot.kde() 74 | 75 | 76 | # ** Figure out how to increase the linewidth and make the linestyle dashed.
(Note: You would usually not dash a kde plot line)** 77 | 78 | # In[48]: 79 | 80 | 81 | df3['d'].plot.kde(lw = 5, ls = ':') 82 | 83 | 84 | # In[49]: 85 | 86 | 87 | df3['d'].plot.density(lw = 5, ls = ':') 88 | 89 | 90 | # ** Create an area plot of all the columns for just the rows up to 30. (hint: use .ix).** 91 | 92 | # In[50]: 93 | 94 | 95 | df3.loc[0:30].plot.area(alpha = 0.4) 96 | 97 | 98 | # Note, you may find this really hard, reference the solutions if you can't figure it out! 99 | # ** Notice how the legend in our previous figure overlapped some of actual diagram. Can you figure out how to display the legend outside of the plot as shown below?** 100 | # 101 | # ** Try searching Google for a good stackoverflow link on this topic. If you can't find it on your own - [use this one for a hint.](http://stackoverflow.com/questions/23556153/how-to-put-legend-outside-the-plot-with-pandas)** 102 | 103 | # In[56]: 104 | 105 | 106 | df3.loc[0:30].plot.area(alpha = 0.4) 107 | plt.legend(loc = 'center left', bbox_to_anchor = (1., 0.5)) 108 | 109 | 110 | # # Great Job! 111 | -------------------------------------------------------------------------------- /Pandas/Pandas Built-in DV_Class.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns 11 | 12 | 13 | # In[2]: 14 | 15 | 16 | df1 = pd.read_csv('df1', index_col = 0) 17 | df2 = pd.read_csv('df2') 18 | 19 | 20 | # In[3]: 21 | 22 | 23 | df1.head() 24 | 25 | 26 | # In[4]: 27 | 28 | 29 | df2.head() 30 | 31 | 32 | # In[5]: 33 | 34 | 35 | sns.set(style = 'darkgrid') 36 | 37 | 38 | # In[6]: 39 | 40 | 41 | df1['A'].hist() 42 | 43 | 44 | # In[4]: 45 | 46 | 47 | df = pd.DataFrame({'sales': [3, 2, 3, 9, 10, 6], 'signups': [5, 5, 6, 12, 14, 13], 48 | 'visits': [20, 42, 28, 62, 81, 50]}, 49 | index=pd.date_range(start='2018/01/01', end='2018/07/01', 50 | freq='M')) 51 | 52 | 53 | # In[5]: 54 | 55 | 56 | df.head() 57 | 58 | 59 | # In[6]: 60 | 61 | 62 | df.shape 63 | 64 | 65 | # In[7]: 66 | 67 | 68 | # area plot 69 | 70 | 71 | # In[8]: 72 | 73 | 74 | df.plot.area() # = df.plot(kind = 'area') 75 | 76 | 77 | # In[9]: 78 | 79 | 80 | df.plot.area(stacked = False) 81 | 82 | 83 | # In[12]: 84 | 85 | 86 | df.plot.area(figsize = (9, 5)) 87 | 88 | 89 | # In[17]: 90 | 91 | 92 | # barplots 93 | 94 | 95 | # In[16]: 96 | 97 | 98 | speed = [0.1, 17.5, 40, 48, 52, 69, 88] 99 | lifespan = [2, 8, 70, 1.5, 25, 12, 28] 100 | index = ['snail', 'pig', 'elephant','rabbit', 'giraffe', 'coyote', 'horse'] 101 | df = pd.DataFrame({'speed': speed,'lifespan': lifespan}, index=index) 102 | df.head() 103 | 104 | 105 | # In[18]: 106 | 107 | 108 | df.plot.bar() 109 | 110 | 111 | # In[41]: 112 | 113 | 114 | df.plot.bar(figsize = (9, 6), rot = 0) 115 | plt.axhline(50, color = 'green', ls = '--') 116 | 117 | 118 | # In[44]: 119 | 120 | 121 | labels=['Snail', 'Pig', 'Elephant','Rabbit', 'Giraffe', 'Coyote', 'Horse'] 122 | 123 | 124 | # In[46]: 125 | 126 | 127 | g = df.plot.bar(figsize = (9, 6), rot = 0) 128 | g.set_xticklabels(labels) 129 | for p in g.patches: 130 | g.annotate((p.get_height()), (p.get_x()+0.02, p.get_height()+0.5)) 131 | 132 | 133 | # In[47]: 134 | 135 | 136 | income = [100, 80, 150, 48, 52, 69, 88] 137 | expense = [30, 100, 100, 20, 75, 50, 28] 138 | index = ['snail', 'pig', 'elephant','rabbit', 'giraffe', 'coyote', 'horse'] 139 | df = pd.DataFrame({'income': 
income,'expense': expense}, index=index) 140 | df.head() 141 | 142 | 143 | # In[48]: 144 | 145 | 146 | df.plot.bar() 147 | 148 | 149 | # In[49]: 150 | 151 | 152 | df.plot.bar(stacked = True) 153 | 154 | 155 | # In[50]: 156 | 157 | 158 | df['profit_loss'] = df['income'] - df['expense'] 159 | 160 | 161 | # In[52]: 162 | 163 | 164 | df.plot.bar(figsize = (8, 4)) 165 | 166 | 167 | # In[53]: 168 | 169 | 170 | # histograms 171 | 172 | 173 | # In[54]: 174 | 175 | 176 | mpg = sns.load_dataset('mpg') 177 | 178 | 179 | # In[55]: 180 | 181 | 182 | mpg.head() 183 | 184 | 185 | # In[56]: 186 | 187 | 188 | mpg['horsepower'].plot.hist(bins = 20) 189 | 190 | 191 | # In[57]: 192 | 193 | 194 | mpg['horsepower'].plot(kind = 'hist', bins = 20) 195 | 196 | 197 | # In[59]: 198 | 199 | 200 | df1.head() 201 | 202 | 203 | # In[61]: 204 | 205 | 206 | df1['B'].plot() 207 | 208 | 209 | # In[62]: 210 | 211 | 212 | df1['B'].plot.line() 213 | 214 | 215 | # In[63]: 216 | 217 | 218 | df1.plot(y = 'B') 219 | 220 | 221 | # In[66]: 222 | 223 | 224 | mpg.groupby('model_year')['horsepower'].mean().plot() 225 | 226 | 227 | # In[67]: 228 | 229 | 230 | mpg.groupby('model_year')['horsepower'].mean().plot.line() 231 | 232 | 233 | # In[68]: 234 | 235 | 236 | mpg.groupby('model_year')['mpg'].mean().plot.line(ls = '--') 237 | 238 | 239 | # In[ ]: 240 | 241 | 242 | 243 | 244 | -------------------------------------------------------------------------------- /Pandas/Pandas Built-in DV_U.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[18]: 5 | 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns 11 | 12 | 13 | # In[8]: 14 | 15 | 16 | df1 = pd.read_csv('df1', index_col = 0) 17 | 18 | 19 | # In[24]: 20 | 21 | 22 | df1.head() 23 | 24 | 25 | # In[14]: 26 | 27 | 28 | df2 = pd.read_csv('df2') 29 | 30 | 31 | # In[15]: 32 | 33 | 34 | df2.head() 35 | 36 | 37 | # In[19]: 38 | 39 | 40 | df1['A'].hist() 41 | 42 | 43 | # In[20]: 44 | 45 | 46 | df1['A'].hist(bins = 30) 47 | 48 | 49 | # In[25]: 50 | 51 | 52 | df1['A'].plot(kind = 'hist') 53 | 54 | 55 | # In[26]: 56 | 57 | 58 | df1['A'].plot(kind = 'hist', bins = 30) 59 | 60 | 61 | # In[27]: 62 | 63 | 64 | df1['A'].plot.hist() 65 | 66 | 67 | # In[28]: 68 | 69 | 70 | df1['A'].plot.hist(bins = 30) 71 | 72 | 73 | # In[29]: 74 | 75 | 76 | df2.head() 77 | 78 | 79 | # In[30]: 80 | 81 | 82 | df2.plot.area() 83 | 84 | 85 | # In[32]: 86 | 87 | 88 | df2.plot.area(figsize = (10, 7), alpha = 0.4) 89 | 90 | 91 | # In[33]: 92 | 93 | 94 | df2.plot.bar() 95 | 96 | 97 | # In[36]: 98 | 99 | 100 | df2.plot(kind = 'bar', figsize = (10, 7)) 101 | 102 | 103 | # In[37]: 104 | 105 | 106 | df2 107 | 108 | 109 | # In[38]: 110 | 111 | 112 | df2.plot.bar(stacked = True) 113 | 114 | 115 | # In[41]: 116 | 117 | 118 | sns.set(style = 'darkgrid') 119 | df1['A'].plot.hist(bins = 40) 120 | 121 | 122 | # In[42]: 123 | 124 | 125 | df1.head() 126 | 127 | 128 | # In[56]: 129 | 130 | 131 | df1.plot.line(y = 'B', figsize = (12, 3)) 132 | 133 | 134 | # In[55]: 135 | 136 | 137 | df1.plot.line(y = 'B', figsize = (12, 3), lw = 1) 138 | 139 | 140 | # In[57]: 141 | 142 | 143 | df1.plot.scatter('A', 'B') 144 | 145 | 146 | # In[58]: 147 | 148 | 149 | df1.plot.scatter('A', 'B', c = 'C') 150 | 151 | 152 | # In[60]: 153 | 154 | 155 | df1.plot.scatter('A', 'B', c = 'C', cmap = 'coolwarm', figsize = (10, 7)) 156 | 157 | 158 | # In[62]: 159 | 160 | 161 | df1.plot.scatter('A', 'B', s = df1['C']*100) 162 
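# A self-contained sketch of the same scatter idea on synthetic data, since df1 comes from a local CSV that is not shipped here; the column names A/B/C below are placeholders, not the real file's schema.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
rng = np.random.default_rng(42)
demo = pd.DataFrame(rng.standard_normal((100, 3)), columns=['A', 'B', 'C'])
demo.plot.scatter('A', 'B',
                  c='C', cmap='coolwarm',     # the third column drives the color
                  s=demo['C'].abs() * 100)    # ...and the marker size (sizes must be non-negative)
plt.show()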
| 163 | 164 | # In[64]: 165 | 166 | 167 | df2.plot.box(figsize = (10, 7)) 168 | 169 | 170 | # In[65]: 171 | 172 | 173 | df= pd.DataFrame(np.random.randn(1000, 2), columns = ['a', 'b']) 174 | 175 | 176 | # In[67]: 177 | 178 | 179 | df.head() 180 | 181 | 182 | # In[70]: 183 | 184 | 185 | df.plot.hexbin('a', 'b', figsize = (10, 7), gridsize = 25) 186 | 187 | 188 | # In[71]: 189 | 190 | 191 | df.plot.hexbin('a', 'b', figsize = (10, 7), gridsize = 25, cmap = 'coolwarm') 192 | 193 | 194 | # In[72]: 195 | 196 | 197 | df2['a'] 198 | 199 | 200 | # In[73]: 201 | 202 | 203 | df2['a'].plot.kde() 204 | 205 | 206 | # In[74]: 207 | 208 | 209 | df2['a'].plot.density() 210 | 211 | 212 | # In[75]: 213 | 214 | 215 | df2.plot.density() 216 | 217 | 218 | # In[ ]: 219 | 220 | 221 | 222 | 223 | -------------------------------------------------------------------------------- /Pandas/Pandas-(Aggregation, Groupby, Operations).py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Aggregation & Groupby 5 | # 6 | # The ``groupby`` method allows you to group rows of data together and call aggregate functions 7 | 8 | # ### Basic aggregation methods: 9 | # 10 | # * ``count()`` Compute count of group 11 | # * ``mean()`` Compute mean of groups 12 | # * ``median()`` Compute median of groups 13 | # * ``min()`` Compute min of group values 14 | # * ``max()`` Compute max of group values 15 | # * ``std()`` Standard deviation of groups 16 | # * ``var()`` Compute variance of groups 17 | # * ``sum()`` Compute sum of group values 18 | # * ``describe()``Generates descriptive statistics 19 | 20 | # In[1]: 21 | 22 | 23 | import pandas as pd 24 | import numpy as np 25 | import seaborn as sns 26 | 27 | 28 | # In[2]: 29 | 30 | 31 | df = sns.load_dataset("planets") 32 | 33 | 34 | # In[3]: 35 | 36 | 37 | df 38 | 39 | 40 | # In[5]: 41 | 42 | 43 | df.head(2) 44 | 45 | 46 | # In[6]: 47 | 48 | 49 | df.shape 50 | 51 | 52 | # In[7]: 53 | 54 | 55 | df.info() 56 | 57 | 58 | # In[8]: 59 | 60 | 61 | df['mass'] 62 | 63 | 64 | # In[9]: 65 | 66 | 67 | df["mass"].mean() 68 | 69 | 70 | # In[10]: 71 | 72 | 73 | df["mass"].count() 74 | 75 | 76 | # In[11]: 77 | 78 | 79 | df["mass"].min() 80 | 81 | 82 | # In[12]: 83 | 84 | 85 | df["mass"].max() 86 | 87 | 88 | # In[13]: 89 | 90 | 91 | df["mass"].sum() 92 | 93 | 94 | # In[14]: 95 | 96 | 97 | df["mass"].std() 98 | 99 | 100 | # In[15]: 101 | 102 | 103 | df["mass"].var() 104 | 105 | 106 | # In[16]: 107 | 108 | 109 | df.describe() 110 | 111 | 112 | # In[18]: 113 | 114 | 115 | df.describe().T 116 | 117 | 118 | # In[17]: 119 | 120 | 121 | df.describe().transpose() 122 | 123 | 124 | # - # ``df.groupby()`` 125 | 126 | # In[19]: 127 | 128 | 129 | df.head() 130 | 131 | 132 | # In[20]: 133 | 134 | 135 | df.info() 136 | 137 | 138 | # In[21]: 139 | 140 | 141 | df['method'].unique() 142 | 143 | 144 | # In[22]: 145 | 146 | 147 | df['method'].nunique() 148 | 149 | 150 | # In[26]: 151 | 152 | 153 | df['mass'].value_counts(dropna = False) 154 | 155 | 156 | # In[24]: 157 | 158 | 159 | df["method"].value_counts() 160 | 161 | 162 | # In[27]: 163 | 164 | 165 | df.groupby("method") 166 | 167 | 168 | # In[28]: 169 | 170 | 171 | df.groupby("method").max() 172 | 173 | 174 | # In[29]: 175 | 176 | 177 | df.groupby("method").mean() 178 | 179 | 180 | # In[30]: 181 | 182 | 183 | df.groupby("method").mean()['distance'] 184 | 185 | 186 | # In[31]: 187 | 188 | 189 | df.groupby("method").mean()[['distance']] 190 | 191 | 192 | # In[33]: 193 | 194 | 195 | 
df.groupby("method").describe()['year'] 196 | 197 | 198 | # In[34]: 199 | 200 | 201 | df 202 | 203 | 204 | # In[35]: 205 | 206 | 207 | df.groupby('year')['distance'].sum() 208 | 209 | 210 | # In[36]: 211 | 212 | 213 | data = {'Company':['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'], 214 | 'Person':['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'], 215 | 'Sales':[200, 120, 340, 124, 243, 350]} 216 | 217 | 218 | # In[37]: 219 | 220 | 221 | df1 = pd.DataFrame(data) 222 | 223 | 224 | # In[38]: 225 | 226 | 227 | df1 228 | 229 | 230 | # In[39]: 231 | 232 | 233 | df1.groupby('Company')[['Sales']].mean() 234 | 235 | 236 | # In[40]: 237 | 238 | 239 | df1.groupby('Company').min() 240 | 241 | 242 | # In[41]: 243 | 244 | 245 | df1.groupby('Company').sum() 246 | 247 | 248 | # - # ``DataFrame`` Operations 249 | 250 | # - ### Common Operations 👈 251 | 252 | # There are lots of operations with pandas that will be really useful to you, but don't fall into any distinct category. Let's show **Common Operations** here in this lecture: 253 | 254 | # - Quick review and refresh 255 | 256 | # In[42]: 257 | 258 | 259 | df2 = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abc','def','ghi','xyz']}) 260 | df2.head() 261 | 262 | 263 | # ### Info on Unique Values 264 | 265 | # In[43]: 266 | 267 | 268 | df2["col2"].unique() 269 | 270 | 271 | # In[44]: 272 | 273 | 274 | df2["col2"].nunique() 275 | 276 | 277 | # In[45]: 278 | 279 | 280 | df2["col2"].value_counts() 281 | 282 | 283 | # In[46]: 284 | 285 | 286 | df['mass'].value_counts(dropna = False) 287 | 288 | 289 | # ### Selecting Data 290 | 291 | # In[47]: 292 | 293 | 294 | df2 295 | 296 | 297 | # In[48]: 298 | 299 | 300 | df2['col1'] > 2 301 | 302 | 303 | # In[49]: 304 | 305 | 306 | df2[df2['col1'] > 2] 307 | 308 | 309 | # In[50]: 310 | 311 | 312 | df2[(df2['col1'] > 2) & (df2['col2'] == 444)] 313 | 314 | 315 | # In[51]: 316 | 317 | 318 | df2[(df2['col1']>2) | (df2['col2']==444)] 319 | 320 | 321 | # **Get column and index names:** 322 | 323 | # In[52]: 324 | 325 | 326 | df2 327 | 328 | 329 | # In[53]: 330 | 331 | 332 | df2.columns 333 | 334 | 335 | # In[54]: 336 | 337 | 338 | df.columns 339 | 340 | 341 | # In[55]: 342 | 343 | 344 | df2.shape 345 | 346 | 347 | # In[56]: 348 | 349 | 350 | df2.index 351 | 352 | 353 | # In[ ]: 354 | 355 | 356 | 357 | 358 | 359 | # In[58]: 360 | 361 | 362 | df4 = df.groupby("method")["distance"].describe() 363 | 364 | 365 | # In[59]: 366 | 367 | 368 | df4 369 | 370 | 371 | # In[60]: 372 | 373 | 374 | df4.index 375 | 376 | 377 | # **Sorting and Ordering a DataFrame:** 378 | 379 | # In[61]: 380 | 381 | 382 | df2 383 | 384 | 385 | # In[62]: 386 | 387 | 388 | df2.sort_values(by = 'col2') 389 | 390 | 391 | # In[63]: 392 | 393 | 394 | df2.sort_values(by = 'col2', ascending = False, inplace = True) 395 | 396 | 397 | # In[64]: 398 | 399 | 400 | df2 401 | 402 | 403 | # - ### `.transform()` 404 | # - ### `.apply()` 405 | 406 | # ### ``.transform()`` 407 | 408 | # In[65]: 409 | 410 | 411 | df4 = pd.DataFrame({'groups': ['A', 'B', 'C', 'A', 'B', 'C'], 412 | 'var1': [10,23,33,22,11,99], 413 | 'var2': [100,253,333,262,111,969]}) 414 | df4 415 | 416 | 417 | # In[66]: 418 | 419 | 420 | df4["var1"]*9 421 | 422 | 423 | # In[67]: 424 | 425 | 426 | df_numeric = df4.iloc[:, 1:3] 427 | 428 | 429 | # In[68]: 430 | 431 | 432 | df_numeric 433 | 434 | 435 | # In[69]: 436 | 437 | 438 | df_numeric.transform(lambda x : (x-x.mean()) / x.std()) 439 | 440 | 441 | # In[70]: 442 | 443 | 444 | df_numeric.iloc[0,0] 445 | 446 | 447 | # In[71]: 448 | 449 | 
450 | (df_numeric.iloc[0,0] - df_numeric['var1'].mean()) / df_numeric['var1'].std() 451 | 452 | 453 | # In[72]: 454 | 455 | 456 | df_numeric.transform(lambda x : np.log10(x)) 457 | 458 | 459 | # In[73]: 460 | 461 | 462 | df_numeric.transform(np.log10) 463 | 464 | 465 | # ### ``.apply()`` 466 | 467 | # In[74]: 468 | 469 | 470 | df4 = pd.DataFrame({'groups': ['A', 'B', 'C', 'A', 'B', 'C'], 471 | 'var1': [10,23,33,22,11,99], 472 | 'var2': [100,253,333,262,111,969]}) 473 | df4 474 | 475 | 476 | # In[82]: 477 | 478 | 479 | df4.apply('mean') 480 | 481 | 482 | # In[76]: 483 | 484 | 485 | df4['var1'].sum() 486 | 487 | 488 | # In[77]: 489 | 490 | 491 | df4['groups'].sum() 492 | 493 | 494 | # In[84]: 495 | 496 | 497 | df_numeric 498 | 499 | 500 | # In[85]: 501 | 502 | 503 | df_numeric.apply(np.median) 504 | 505 | 506 | # In[ ]: 507 | 508 | 509 | df_numeric 510 | 511 | 512 | # In[87]: 513 | 514 | 515 | df_numeric.apply(np.mean, axis = 1) 516 | 517 | 518 | # In[88]: 519 | 520 | 521 | df4 522 | 523 | 524 | # In[89]: 525 | 526 | 527 | df4.groupby("groups").apply(np.mean) 528 | 529 | 530 | # In[90]: 531 | 532 | 533 | df4.groupby("groups").mean() 534 | 535 | 536 | # In[91]: 537 | 538 | 539 | df2 = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abcc','de','ghi','xyzzz']}) 540 | 541 | df2 542 | 543 | 544 | # In[92]: 545 | 546 | 547 | def times2(x): 548 | return x * 2 549 | 550 | 551 | # In[93]: 552 | 553 | 554 | df2["col1"].apply(times2) 555 | 556 | 557 | # In[94]: 558 | 559 | 560 | df2["col3"].apply(len) 561 | 562 | 563 | # ### `df.transform() vs df.apply()` 564 | 565 | # In[95]: 566 | 567 | 568 | df2 569 | 570 | 571 | # In[96]: 572 | 573 | 574 | df2.transform(len) 575 | 576 | 577 | # In[97]: 578 | 579 | 580 | df2["col3"].transform(len) 581 | 582 | 583 | # In[98]: 584 | 585 | 586 | df2.apply(len) 587 | 588 | 589 | # In[99]: 590 | 591 | 592 | df1 = pd.DataFrame([["a", 9, 25]] * 4, columns=["grp", 'P', 'Q']) 593 | df2 = pd.DataFrame([["b", 9, 25]] * 3, columns=["grp", 'P', 'Q']) 594 | df3 = pd.concat([df1, df2], ignore_index=True) 595 | df3 596 | 597 | 598 | # In[100]: 599 | 600 | 601 | df3.apply(lambda x : x + x) 602 | 603 | 604 | # In[101]: 605 | 606 | 607 | df3.transform(lambda y : y + y) 608 | 609 | 610 | # In[102]: 611 | 612 | 613 | df3 614 | 615 | 616 | # In[103]: 617 | 618 | 619 | df3.groupby("grp").apply(sum) 620 | 621 | 622 | # In[106]: 623 | 624 | 625 | df3.groupby("grp").transform(np.mean) 626 | 627 | 628 | # In[107]: 629 | 630 | 631 | df3.groupby("grp").sum() 632 | 633 | 634 | # In[108]: 635 | 636 | 637 | df3 638 | 639 | 640 | # In[109]: 641 | 642 | 643 | df3.groupby("grp").transform(len) 644 | 645 | 646 | # In[110]: 647 | 648 | 649 | df3.iloc[0:4] 650 | 651 | 652 | # In[111]: 653 | 654 | 655 | len(df3.iloc[0:4]) 656 | 657 | 658 | # In[112]: 659 | 660 | 661 | df3.groupby("grp").apply(len) 662 | 663 | 664 | # ### Pivot Tables 665 | 666 | # In[113]: 667 | 668 | 669 | titanic = sns.load_dataset("titanic") 670 | 671 | 672 | # In[114]: 673 | 674 | 675 | titanic.head() 676 | 677 | 678 | # In[115]: 679 | 680 | 681 | titanic.groupby("sex")[["survived"]].mean() 682 | 683 | 684 | # In[116]: 685 | 686 | 687 | titanic.groupby(["sex", "class"])[["survived"]].mean() 688 | 689 | 690 | # In[117]: 691 | 692 | 693 | titanic.groupby(["sex", "class"])[["survived"]].mean().T 694 | 695 | 696 | # In[118]: 697 | 698 | 699 | titanic.groupby(["sex", "class"])[["survived"]].mean().unstack() 700 | 701 | 702 | # In[ ]: 703 | 704 | 705 | 706 | 707 | 708 | # ### Using pivot table 709 | 710 | # - Create a 
spreadsheet-style pivot table as a ``DataFrame``. 711 | 712 | # In[120]: 713 | 714 | 715 | titanic.pivot_table(values = "survived", index = "sex", columns = "class", aggfunc = 'sum') 716 | 717 | 718 | # In[ ]: 719 | 720 | 721 | titanic.head(2) 722 | 723 | 724 | # In[ ]: 725 | 726 | 727 | titanic.pivot_table("age", index = "sex", columns = "class") 728 | 729 | 730 | # In[ ]: 731 | 732 | 733 | titanic.pivot_table("age", index = "class", columns = "sex") 734 | 735 | 736 | # In[121]: 737 | 738 | 739 | data = {'A':['foo', 'foo', 'foo', 'bar', 'bar', 'bar'], 740 | 'B':['one', 'one', 'two', 'two', 'one', 'one'], 741 | 'C':['x', 'y', 'x', 'y', 'x', 'y'], 742 | 'D':[1, 3, 2, 5, 4, 1]} 743 | 744 | df5 = pd.DataFrame(data) 745 | 746 | df5 747 | 748 | 749 | # In[122]: 750 | 751 | 752 | df5.pivot_table(values = "D", index = ["A", "B"], columns = "C") 753 | 754 | 755 | # # The End of the Session 756 | -------------------------------------------------------------------------------- /Pandas/Pandas-(Missing Values, Outliers).py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Missing Values & Outliers 5 | 6 | # - # Handling with Missing Values 7 | 8 | # In[1]: 9 | 10 | 11 | import pandas as pd 12 | import numpy as np 13 | 14 | 15 | # In[2]: 16 | 17 | 18 | df = pd.DataFrame({'A':[1, 2, np.nan], 19 | 'B':[5, np.nan, np.nan], 20 | 'C':[1, 2, 3]}) 21 | 22 | 23 | # In[3]: 24 | 25 | 26 | df 27 | 28 | 29 | # In[4]: 30 | 31 | 32 | df.dropna() 33 | 34 | 35 | # In[5]: 36 | 37 | 38 | df.dropna(axis = 1) 39 | 40 | 41 | # In[6]: 42 | 43 | 44 | df 45 | 46 | 47 | # In[8]: 48 | 49 | 50 | df.dropna(thresh = 1) 51 | 52 | 53 | # In[9]: 54 | 55 | 56 | df.fillna(value = "xxx") 57 | 58 | 59 | # In[10]: 60 | 61 | 62 | df['A'] 63 | 64 | 65 | # In[11]: 66 | 67 | 68 | df['A'].mean() 69 | 70 | 71 | # In[12]: 72 | 73 | 74 | df["A"].fillna(value = df["A"].mean()) 75 | 76 | 77 | # In[ ]: 78 | 79 | 80 | df 81 | 82 | 83 | # In[13]: 84 | 85 | 86 | V1 = np.array([2,3,5,np.NaN,7,1,np.NaN,10,14]) 87 | V2 = np.array([8,np.NaN,5,8,11,np.NaN,np.NaN,2,3]) 88 | V3 = np.array([np.NaN,13,5,6,13,7,np.NaN,3,30]) 89 | df = pd.DataFrame( 90 | {"Var1" : V1, 91 | "Var2" : V2, 92 | "Var3" : V3} 93 | ) 94 | 95 | df 96 | 97 | 98 | # In[14]: 99 | 100 | 101 | df.isnull() 102 | 103 | 104 | # In[16]: 105 | 106 | 107 | df.notnull() 108 | 109 | 110 | # In[15]: 111 | 112 | 113 | df.isnull().sum() 114 | 115 | 116 | # In[17]: 117 | 118 | 119 | df.notnull().sum() 120 | 121 | 122 | # In[18]: 123 | 124 | 125 | df.isnull().sum().sum() 126 | 127 | 128 | # In[19]: 129 | 130 | 131 | df 132 | 133 | 134 | # In[20]: 135 | 136 | 137 | df['Var1'].isnull() 138 | 139 | 140 | # In[21]: 141 | 142 | 143 | df['Var1'][df['Var1'].isnull()] 144 | 145 | 146 | # In[ ]: 147 | 148 | 149 | df 150 | 151 | 152 | # ### Missing Values Handling Methods 153 | 154 | # - #### Dropping 155 | 156 | # In[22]: 157 | 158 | 159 | df 160 | 161 | 162 | # In[23]: 163 | 164 | 165 | df.dropna() 166 | 167 | 168 | # In[24]: 169 | 170 | 171 | df.dropna(how = "all") 172 | 173 | 174 | # In[26]: 175 | 176 | 177 | df 178 | 179 | 180 | # In[25]: 181 | 182 | 183 | df.dropna(axis = 1) 184 | 185 | 186 | # In[27]: 187 | 188 | 189 | df 190 | 191 | 192 | # In[28]: 193 | 194 | 195 | df.dropna(axis = 1, how = "all") 196 | 197 | 198 | # In[29]: 199 | 200 | 201 | df["delete_me"] = np.nan 202 | 203 | 204 | # In[30]: 205 | 206 | 207 | df 208 | 209 | 210 | # In[31]: 211 | 212 | 213 | df.dropna(axis = 1, how = "all", inplace = True) 214 | 215 
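# Since thresh appeared earlier in this section, here is a tiny sketch of its meaning: thresh is the minimum number of NON-null values a row (or column, with axis=1) needs in order to be kept. The frame below is made up purely for illustration.
import numpy as np
import pandas as pd
demo = pd.DataFrame({'A': [1, np.nan, np.nan],
                     'B': [5, 6, np.nan],
                     'C': [9, 10, 11]})
print(demo.dropna(thresh=2))           # keeps rows with at least 2 real values
print(demo.dropna(axis=1, thresh=3))   # keeps only columns with 3 real values (here: C)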
| 216 | # In[32]: 217 | 218 | 219 | df 220 | 221 | 222 | # - #### Filling 223 | 224 | # In[33]: 225 | 226 | 227 | df 228 | 229 | 230 | # - Filling with a specific value 231 | 232 | # In[34]: 233 | 234 | 235 | df["Var1"] 236 | 237 | 238 | # In[35]: 239 | 240 | 241 | df["Var1"].fillna(0) 242 | 243 | 244 | # In[36]: 245 | 246 | 247 | df.fillna(value = 0) 248 | 249 | 250 | # - Filling with any Proper Value 251 | 252 | # In[37]: 253 | 254 | 255 | df 256 | 257 | 258 | # In[38]: 259 | 260 | 261 | df["Var1"].mean() 262 | 263 | 264 | # In[39]: 265 | 266 | 267 | df["Var1"].fillna(value = df["Var1"].mean()) 268 | 269 | 270 | # In[40]: 271 | 272 | 273 | df 274 | 275 | 276 | # In[44]: 277 | 278 | 279 | df['Var2'].mean() 280 | 281 | 282 | # In[55]: 283 | 284 | 285 | df.apply(lambda x : x.fillna(value = x.mean())) 286 | 287 | 288 | # In[47]: 289 | 290 | 291 | df.mean() 292 | 293 | 294 | # In[46]: 295 | 296 | 297 | df.fillna(df.mean()) 298 | 299 | 300 | # In[48]: 301 | 302 | 303 | df 304 | 305 | 306 | # In[49]: 307 | 308 | 309 | df.fillna({"Var1" : 6, "Var2": 6.16}) 310 | 311 | 312 | # In[56]: 313 | 314 | 315 | df["Var3"].fillna(df["Var3"].median()) 316 | 317 | 318 | # - Filling with any Proper Value Regarding to Group of the Categorical Variables 319 | 320 | # In[57]: 321 | 322 | 323 | Var1 = np.array([1,3,6,np.NaN,7,1,9,np.NaN,15]) 324 | Var2 = np.array([7,np.NaN,5,8,12,np.NaN,np.NaN,2,3]) 325 | Var3 = np.array([np.NaN,12,5,6,14,7,np.NaN,2,31]) 326 | Var4 = np.array(["IT","IT","IT","HR","HR","HR","HR","IT","IT"]) 327 | 328 | df = pd.DataFrame( 329 | {"salary" : Var1, 330 | "Var2" : Var2, 331 | "Var3" : Var3, 332 | "department" : Var4} 333 | ) 334 | 335 | df 336 | 337 | 338 | # In[58]: 339 | 340 | 341 | df.groupby("department")["salary"].mean() 342 | 343 | 344 | # In[59]: 345 | 346 | 347 | df.groupby("department")["salary"].transform(np.mean) 348 | 349 | 350 | # In[60]: 351 | 352 | 353 | df.groupby("department")["salary"].apply(np.mean) 354 | 355 | 356 | # In[61]: 357 | 358 | 359 | df["salary"].fillna(value = df.groupby("department")["salary"].transform(np.mean)) 360 | 361 | 362 | # In[62]: 363 | 364 | 365 | df["salary"].fillna(value = df.groupby("department")["salary"].apply(np.mean)) 366 | 367 | 368 | # In[63]: 369 | 370 | 371 | df.salary.fillna({0:1, 1:2, 2:3, 3:4, 4:5, 5:6, 6:7, 7:8, 8:9}) 372 | 373 | 374 | # - Filling the Missing Values of Categorical Values 375 | 376 | # In[64]: 377 | 378 | 379 | V1 = np.array([1,3,6,np.NaN,7,1,np.NaN,9,15]) 380 | V4 = np.array(["IT",np.nan,"HR","HR","HR","HR",np.nan,"IT","HR"], dtype=object) 381 | 382 | df = pd.DataFrame( 383 | {"salary" : V1, 384 | "department" : V4} 385 | ) 386 | 387 | df 388 | 389 | 390 | # In[65]: 391 | 392 | 393 | df["department"].mode()[0] 394 | 395 | 396 | # In[66]: 397 | 398 | 399 | df["department"].fillna(df["department"].mode()[0]) 400 | 401 | 402 | # In[67]: 403 | 404 | 405 | df 406 | 407 | 408 | # In[68]: 409 | 410 | 411 | df["department"].fillna(method = "bfill") 412 | 413 | 414 | # In[69]: 415 | 416 | 417 | df["department"].fillna(method = "ffill") 418 | 419 | 420 | # In[70]: 421 | 422 | 423 | df 424 | 425 | 426 | # In[71]: 427 | 428 | 429 | df.drop('department', axis = 1) 430 | 431 | 432 | # In[72]: 433 | 434 | 435 | df.drop(index = 1) 436 | 437 | 438 | # In[ ]: 439 | 440 | 441 | #df.farazi.fillna(method = "ffill", limit = 2) 442 | #df.farazi.fillna(method = "bfill", limit = 2) 443 | 444 | 445 | # In[ ]: 446 | 447 | 448 | #df.fillna(value = "unique1", limit=10, inplace=True) 449 | #df.fillna("unique2", limit=30, inplace=True) 450 | 
#df.fillna("unique3", limit=25, inplace=True) 451 | #df.fillna("unique4", limit=35, inplace=True) 452 | 453 | 454 | # In[73]: 455 | 456 | 457 | df = pd.DataFrame({"A":[None, 1, 2, 3, None, None], 458 | "B":[11, 5, None, None, None, 8], 459 | "C":[None, 5, 10, 11, None, 8]}) 460 | 461 | 462 | # In[74]: 463 | 464 | 465 | df 466 | 467 | 468 | # In[82]: 469 | 470 | 471 | df.fillna(method = "ffill", limit = 2) 472 | 473 | 474 | # - # Handling with Outliers 475 | 476 | # ## Catching and Detecting Outliers 477 | 478 | # In[83]: 479 | 480 | 481 | import seaborn as sns 482 | df = sns.load_dataset('diamonds') 483 | df = df.select_dtypes(include = ['float64', 'int64']) 484 | df = df.dropna() 485 | df.head() 486 | 487 | 488 | # In[84]: 489 | 490 | 491 | import matplotlib.pyplot as plt 492 | 493 | 494 | # In[85]: 495 | 496 | 497 | plt.figure(figsize=(20,15)) 498 | sns.boxplot(x = df['table']) 499 | 500 | 501 | # In[86]: 502 | 503 | 504 | df['table'].describe() 505 | 506 | 507 | # In[87]: 508 | 509 | 510 | df_table = df["table"] 511 | 512 | 513 | # In[88]: 514 | 515 | 516 | df_table.head() 517 | 518 | 519 | # In[89]: 520 | 521 | 522 | pd.DataFrame(df_table).info() 523 | 524 | 525 | # In[90]: 526 | 527 | 528 | len(df_table) 529 | 530 | 531 | # ## Tukey's Fences | Tukey's Rule 532 | 533 | # - First way of specifying ``Q1 & Q3`` is using the ``.quantile()`` method 534 | 535 | # In[91]: 536 | 537 | 538 | df_table.describe() 539 | 540 | 541 | # In[178]: 542 | 543 | 544 | df_table.quantile(0.25) 545 | 546 | 547 | # In[93]: 548 | 549 | 550 | Q1 = df_table.quantile(0.25) 551 | Q3 = df_table.quantile(0.75) 552 | IQR = Q3 - Q1 553 | 554 | 555 | # In[94]: 556 | 557 | 558 | Q1 559 | 560 | 561 | # In[95]: 562 | 563 | 564 | Q3 565 | 566 | 567 | # In[96]: 568 | 569 | 570 | IQR 571 | 572 | 573 | # - Second way of specifying ``Q1 & Q3`` is using the ``.describe()`` method 574 | 575 | # In[97]: 576 | 577 | 578 | lower_lim = Q1 - 1.5 * IQR 579 | upper_lim = Q3 + 1.5 * IQR 580 | 581 | 582 | # In[98]: 583 | 584 | 585 | lower_lim 586 | 587 | 588 | # In[99]: 589 | 590 | 591 | upper_lim 592 | 593 | 594 | # In[100]: 595 | 596 | 597 | (df_table < lower_lim) 598 | 599 | 600 | # In[101]: 601 | 602 | 603 | (df_table > upper_lim) 604 | 605 | 606 | # In[102]: 607 | 608 | 609 | outliers_15_low = (df_table < lower_lim) 610 | 611 | 612 | # In[103]: 613 | 614 | 615 | outliers_15_up = (df_table > upper_lim) 616 | 617 | 618 | # In[104]: 619 | 620 | 621 | df_table[outliers_15_low] 622 | 623 | 624 | # In[105]: 625 | 626 | 627 | len(df_table[outliers_15_low]) 628 | 629 | 630 | # In[106]: 631 | 632 | 633 | df_table[outliers_15_up] 634 | 635 | 636 | # In[107]: 637 | 638 | 639 | len(df_table) - (len(df_table[outliers_15_low]) + len(df_table[outliers_15_up])) 640 | 641 | 642 | # In[108]: 643 | 644 | 645 | df_table[(outliers_15_low | outliers_15_up)] 646 | 647 | 648 | # *** 649 | 650 | # In[109]: 651 | 652 | 653 | lower_lim = Q1 - 2.5 * IQR 654 | upper_lim = Q3 + 2.5 * IQR 655 | 656 | 657 | # In[110]: 658 | 659 | 660 | lower_lim 661 | 662 | 663 | # In[111]: 664 | 665 | 666 | upper_lim 667 | 668 | 669 | # In[112]: 670 | 671 | 672 | (df_table < lower_lim) | (df_table > upper_lim) 673 | 674 | 675 | # In[113]: 676 | 677 | 678 | outliers_25 = (df_table < lower_lim) | (df_table > upper_lim) 679 | 680 | 681 | # In[114]: 682 | 683 | 684 | df_table[outliers_25] 685 | 686 | 687 | # ### Removing the Outliers 688 | 689 | # In[121]: 690 | 691 | 692 | df_table[~(outliers_15_low | outliers_15_up)] 693 | 694 | 695 | # In[117]: 696 | 697 | 698 | df 699 | 700 | 701 
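# The fence arithmetic above is repeated for each multiplier, so here is a small reusable sketch; the helper name iqr_mask is ours, and k=1.5 / k=2.5 mirror the two fence widths used in this notebook.
def iqr_mask(s, k=1.5):
    # True where a value falls outside [Q1 - k*IQR, Q3 + k*IQR]
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    return (s < q1 - k * iqr) | (s > q3 + k * iqr)
# usage on the same column:
# outliers = iqr_mask(df['table'])   # Tukey's 1.5 * IQR fences
# clean_df = df[~outliers]           # drop the flagged rows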
| # In[123]: 702 | 703 | 704 | clean_df = df[~(outliers_15_low | outliers_15_up)] 705 | 706 | 707 | # In[119]: 708 | 709 | 710 | clean_df 711 | 712 | 713 | # ### Limitation and Transformation of the Outliers 714 | 715 | # - ### Limitation using ``.winsorize()`` method 716 | 717 | # In[124]: 718 | 719 | 720 | from scipy.stats.mstats import winsorize 721 | 722 | 723 | # In[127]: 724 | 725 | 726 | df 727 | 728 | 729 | # In[128]: 730 | 731 | 732 | df_table 733 | 734 | 735 | # In[129]: 736 | 737 | 738 | sns.boxplot(x = df_table) 739 | 740 | 741 | # In[130]: 742 | 743 | 744 | sns.distplot(df_table, bins = 15, kde = False) 745 | 746 | 747 | # In[176]: 748 | 749 | 750 | df_table.describe() 751 | 752 | 753 | # In[125]: 754 | 755 | 756 | df_table.quantile(0.01) 757 | 758 | 759 | # In[126]: 760 | 761 | 762 | df_table.quantile(0.98) 763 | 764 | 765 | # In[131]: 766 | 767 | 768 | df_table_win = winsorize(df_table, (0.01, 0.02)) 769 | 770 | 771 | # In[132]: 772 | 773 | 774 | df_table_win 775 | 776 | 777 | # In[133]: 778 | 779 | 780 | sns.boxplot(x = df_table_win) 781 | 782 | 783 | # In[134]: 784 | 785 | 786 | sns.distplot(df_table_win, bins = 10, kde =False) 787 | 788 | 789 | # In[135]: 790 | 791 | 792 | pd.DataFrame(df_table_win) 793 | 794 | 795 | # In[136]: 796 | 797 | 798 | pd.DataFrame(df_table_win)[0] 799 | 800 | 801 | # In[137]: 802 | 803 | 804 | df_table_win = pd.DataFrame(df_table_win)[0] 805 | 806 | 807 | # In[139]: 808 | 809 | 810 | df_table_win.describe() 811 | 812 | 813 | # In[138]: 814 | 815 | 816 | df_table.describe() 817 | 818 | 819 | # In[ ]: 820 | 821 | 822 | df_table.quantile(0.01) 823 | 824 | 825 | # In[140]: 826 | 827 | 828 | df_table.quantile(0.98) 829 | 830 | 831 | # In[ ]: 832 | 833 | 834 | df_table_win.describe() 835 | 836 | 837 | # In[141]: 838 | 839 | 840 | df_table.sort_values().head(20) 841 | 842 | 843 | # In[147]: 844 | 845 | 846 | df_table_win.sort_values().head(50) 847 | 848 | 849 | # In[143]: 850 | 851 | 852 | df_table_win[df_table_win == 53] 853 | 854 | 855 | # In[144]: 856 | 857 | 858 | df_table[df_table == 53] 859 | 860 | 861 | # In[145]: 862 | 863 | 864 | df_table_win[df_table_win == 63] 865 | 866 | 867 | # In[146]: 868 | 869 | 870 | df_table[df_table == 63] 871 | 872 | 873 | # In[149]: 874 | 875 | 876 | Q1 = 56.0 877 | Q3 = 59.0 878 | 879 | 880 | # In[150]: 881 | 882 | 883 | IQR = Q3 - Q1 884 | 885 | 886 | # In[151]: 887 | 888 | 889 | lower = Q1 - 1.5 * IQR 890 | upper = Q3 + 1.5 * IQR 891 | 892 | 893 | # In[152]: 894 | 895 | 896 | lower 897 | 898 | 899 | # In[153]: 900 | 901 | 902 | upper 903 | 904 | 905 | # In[154]: 906 | 907 | 908 | outliers_15 = (df_table_win < lower) | (df_table_win > upper) 909 | 910 | 911 | # In[155]: 912 | 913 | 914 | df_table[outliers_15] 915 | 916 | 917 | # In[156]: 918 | 919 | 920 | df["table_win"] = df_table_win 921 | 922 | 923 | # In[157]: 924 | 925 | 926 | df.head() 927 | 928 | 929 | # - ### ``log()`` Transformation 930 | 931 | # In[158]: 932 | 933 | 934 | df.info() 935 | 936 | 937 | # In[159]: 938 | 939 | 940 | df_carat = df["carat"] 941 | 942 | 943 | # In[160]: 944 | 945 | 946 | df_carat.shape 947 | 948 | 949 | # In[161]: 950 | 951 | 952 | df_carat.head() 953 | 954 | 955 | # In[162]: 956 | 957 | 958 | sns.boxplot(x = df_carat) 959 | 960 | 961 | # In[163]: 962 | 963 | 964 | sns.distplot(df_carat, bins = 15, kde = False) 965 | 966 | 967 | # In[164]: 968 | 969 | 970 | df_carat_log = np.log(df_carat) 971 | 972 | 973 | # In[166]: 974 | 975 | 976 | df_carat 977 | 978 | 979 | # In[165]: 980 | 981 | 982 | df_carat_log 983 | 984 | 985 | # 
In[167]: 986 | 987 | 988 | sns.boxplot(x = df_carat_log) 989 | 990 | 991 | # In[174]: 992 | 993 | 994 | sns.distplot(df_carat_log, bins = 11, kde = False) 995 | 996 | 997 | # In[ ]: 998 | 999 | 1000 | df["carat_log"] = np.log(df["carat"]) 1001 | 1002 | 1003 | # In[ ]: 1004 | 1005 | 1006 | df.head() 1007 | 1008 | -------------------------------------------------------------------------------- /Pandas/Pandas_1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # SF Salaries Exercise 5 | # 6 | # Welcome to a quick exercise for you to practice your pandas skills! We will be using the [SF Salaries Dataset](https://www.kaggle.com/kaggle/sf-salaries) from Kaggle! Just follow along and complete the tasks outlined in bold below. The tasks will get harder and harder as you go along. 7 | 8 | # ** Import pandas as pd.** 9 | 10 | # In[1]: 11 | 12 | 13 | import pandas as pd 14 | 15 | 16 | # ** Read Salaries.csv as a dataframe called sal.** 17 | 18 | # In[2]: 19 | 20 | 21 | sal = pd.read_csv('Salaries.csv') 22 | 23 | 24 | # ** Check the head of the DataFrame. ** 25 | 26 | # In[3]: 27 | 28 | 29 | sal.head() 30 | 31 | 32 | # ** Use the .info() method to find out how many entries there are.** 33 | 34 | # In[4]: 35 | 36 | 37 | sal.info() 38 | 39 | 40 | # **What is the average BasePay ?** 41 | 42 | # In[11]: 43 | 44 | 45 | sal['BasePay'].mean() 46 | 47 | 48 | # ** What is the highest amount of OvertimePay in the dataset ? ** 49 | 50 | # In[23]: 51 | 52 | 53 | sal['OvertimePay'].sort_values(ascending = False)[1] 54 | 55 | 56 | # In[75]: 57 | 58 | 59 | sal['OvertimePay'].max() 60 | 61 | 62 | # ** What is the job title of JOSEPH DRISCOLL ? Note: Use all caps, otherwise you may get an answer that doesn't match up (there is also a lowercase Joseph Driscoll). ** 63 | 64 | # In[16]: 65 | 66 | 67 | sal['JobTitle'][sal['EmployeeName'] == 'JOSEPH DRISCOLL'] 68 | 69 | 70 | # In[76]: 71 | 72 | 73 | sal[sal['EmployeeName'] == 'JOSEPH DRISCOLL']['JobTitle'] 74 | 75 | 76 | # ** How much does JOSEPH DRISCOLL make (including benefits)? ** 77 | 78 | # In[18]: 79 | 80 | 81 | sal['TotalPayBenefits'][sal['EmployeeName'] == 'JOSEPH DRISCOLL'] 82 | 83 | 84 | # In[77]: 85 | 86 | 87 | sal[sal['EmployeeName'] == 'JOSEPH DRISCOLL']['TotalPayBenefits'] 88 | 89 | 90 | # ** What is the name of highest paid person (including benefits)?** 91 | 92 | # In[82]: 93 | 94 | 95 | sal.sort_values(by = 'TotalPayBenefits', ascending = False).iloc[[0]] 96 | 97 | 98 | # In[80]: 99 | 100 | 101 | sal[sal['TotalPayBenefits'] == sal['TotalPayBenefits'].max()] 102 | 103 | 104 | # In[86]: 105 | 106 | 107 | sal.loc[[sal['TotalPayBenefits'].idxmax()]] # idxmax() is similar numpy argmax() 108 | 109 | 110 | # In[87]: 111 | 112 | 113 | sal.iloc[[sal['TotalPayBenefits'].argmax()]] 114 | 115 | 116 | # ** What is the name of lowest paid person (including benefits)? Do you notice something strange about how much he or she is paid?** 117 | 118 | # In[32]: 119 | 120 | 121 | sal.sort_values(by = 'TotalPayBenefits').iloc[[0]] 122 | 123 | 124 | # In[89]: 125 | 126 | 127 | sal[sal['TotalPayBenefits'] == sal['TotalPayBenefits'].min()] 128 | 129 | 130 | # In[93]: 131 | 132 | 133 | sal.loc[[sal['TotalPayBenefits'].idxmin()]] 134 | 135 | 136 | # In[95]: 137 | 138 | 139 | sal.iloc[[sal['TotalPayBenefits'].argmin()]] 140 | 141 | 142 | # ** What was the average (mean) BasePay of all employees per year? (2011-2014) ? 
** 143 | 144 | # In[40]: 145 | 146 | 147 | sal['Year'].unique() 148 | 149 | 150 | # In[113]: 151 | 152 | 153 | sal.groupby('Year').mean()['BasePay'] 154 | 155 | 156 | # In[116]: 157 | 158 | 159 | sal.groupby('Year').mean()['BasePay'][[2011, 2013]] 160 | 161 | 162 | # ** How many unique job titles are there? ** 163 | 164 | # In[45]: 165 | 166 | 167 | sal['JobTitle'].nunique() 168 | 169 | 170 | # In[117]: 171 | 172 | 173 | len(sal['JobTitle'].unique()) 174 | 175 | 176 | # ** What are the top 5 most common jobs? ** 177 | 178 | # In[48]: 179 | 180 | 181 | sal['JobTitle'].value_counts()[:5] 182 | 183 | 184 | # In[119]: 185 | 186 | 187 | sal['JobTitle'].value_counts().head(5) 188 | 189 | 190 | # In[120]: 191 | 192 | 193 | type(sal['JobTitle'].value_counts()) 194 | 195 | 196 | # ** How many Job Titles were represented by only one person in 2013? (e.g. Job Titles with only one occurence in 2013?) ** 197 | 198 | # In[126]: 199 | 200 | 201 | sal[['JobTitle']][sal['Year'] == 2013].nunique() 202 | 203 | 204 | # In[128]: 205 | 206 | 207 | sal[['JobTitle']][sal['Year'] == 2013].value_counts() == 1 208 | 209 | 210 | # In[129]: 211 | 212 | 213 | sum(sal[['JobTitle']][sal['Year'] == 2013].value_counts() == 1) 214 | 215 | 216 | # ** How many people have the word Chief in their job title? (This is pretty tricky) ** 217 | 218 | # In[151]: 219 | 220 | 221 | sum(sal['JobTitle'].apply(str.lower).str.contains('chief')) 222 | 223 | 224 | # In[159]: 225 | 226 | 227 | sal['JobTitle'][sal['JobTitle'].apply(str.lower).str.contains('chief')].value_counts().index 228 | 229 | 230 | # ** Bonus: Is there a correlation between length of the Job Title string and Salary? ** 231 | 232 | # In[71]: 233 | 234 | 235 | title_len = sal['JobTitle'].apply(len) 236 | 237 | 238 | # In[72]: 239 | 240 | 241 | import numpy as np 242 | 243 | 244 | # In[73]: 245 | 246 | 247 | np.corrcoef(title_len, sal['TotalPayBenefits']) 248 | 249 | 250 | # In[152]: 251 | 252 | 253 | sal['title_len'] = sal['JobTitle'].apply(len) 254 | 255 | 256 | # In[161]: 257 | 258 | 259 | sal[['TotalPayBenefits', 'title_len']].corr('pearson') 260 | 261 | 262 | # # Great Job! 263 | -------------------------------------------------------------------------------- /Pandas/Pandas_2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Ecommerce Purchases Exercise 5 | # 6 | # In this Exercise you will be given some Fake Data about some purchases done through Amazon! Just go ahead and follow the directions and try your best to answer the questions and complete the tasks. Feel free to reference the solutions. Most of the tasks can be solved in different ways. For the most part, the questions get progressively harder. 7 | # 8 | # Please excuse anything that doesn't make "Real-World" sense in the dataframe, all the data is fake and made-up. 9 | # 10 | # Also note that all of these questions can be answered with one line of code. 11 | # ____ 12 | # ** Import pandas and read in the Ecommerce Purchases csv file and set it to a DataFrame called ecom. ** 13 | 14 | # In[1]: 15 | 16 | 17 | import pandas as pd 18 | import numpy as np 19 | 20 | 21 | # In[2]: 22 | 23 | 24 | ecom = pd.read_csv('Ecommerce Purchases') 25 | 26 | 27 | # **Check the head of the DataFrame.** 28 | 29 | # In[3]: 30 | 31 | 32 | ecom.head() 33 | 34 | 35 | # ** How many rows and columns are there? ** 36 | 37 | # In[4]: 38 | 39 | 40 | ecom.info() 41 | 42 | 43 | # ** What is the average Purchase Price? 
** 44 | 45 | # In[5]: 46 | 47 | 48 | ecom['Purchase Price'].mean() 49 | 50 | 51 | # ** What were the highest and lowest purchase prices? ** 52 | 53 | # In[6]: 54 | 55 | 56 | ecom['Purchase Price'].max() 57 | 58 | 59 | # In[7]: 60 | 61 | 62 | ecom['Purchase Price'].min() 63 | 64 | 65 | # ** How many people have English 'en' as their Language of choice on the website? ** 66 | 67 | # In[8]: 68 | 69 | 70 | ecom['Language'].unique() 71 | 72 | 73 | # In[10]: 74 | 75 | 76 | ecom[ecom['Language'] == 'en'].count() 77 | 78 | 79 | # ** How many people have the job title of "Lawyer" ? ** 80 | # 81 | 82 | # In[13]: 83 | 84 | 85 | ecom[ecom['Job'] == 'Lawyer'].info() 86 | 87 | 88 | # ** How many people made the purchase during the AM and how many people made the purchase during PM ? ** 89 | # 90 | # **(Hint: Check out [value_counts()](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.value_counts.html) ) ** 91 | 92 | # In[14]: 93 | 94 | 95 | ecom['AM or PM'].value_counts() 96 | 97 | 98 | # ** What are the 5 most common Job Titles? ** 99 | 100 | # In[26]: 101 | 102 | 103 | ecom['Job'].value_counts().head() 104 | 105 | 106 | # ** Someone made a purchase that came from Lot: "90 WT" , what was the Purchase Price for this transaction? ** 107 | 108 | # In[27]: 109 | 110 | 111 | ecom.head() 112 | 113 | 114 | # In[28]: 115 | 116 | 117 | ecom[ecom['Lot'] == '90 WT']['Purchase Price'] 118 | 119 | 120 | # ** What is the email of the person with the following Credit Card Number: 4926535242672853 ** 121 | 122 | # In[30]: 123 | 124 | 125 | ecom[ecom['Credit Card'] == 4926535242672853]['Email'] 126 | 127 | 128 | # ** How many people have American Express as their Credit Card Provider *and* made a purchase above $95 ?** 129 | 130 | # In[31]: 131 | 132 | 133 | ecom[(ecom['CC Provider'] == 'American Express') & (ecom['Purchase Price'] > 95)].count() 134 | 135 | 136 | # ** Hard: How many people have a credit card that expires in 2025? ** 137 | 138 | # In[35]: 139 | 140 | 141 | ecom[ecom['CC Exp Date'].str.contains('25')]['CC Security Code'].count() 142 | 143 | 144 | # ** Hard: What are the top 5 most popular email providers/hosts (e.g. gmail.com, yahoo.com, etc...) ** 145 | 146 | # In[39]: 147 | 148 | 149 | ecom['Email'].apply(lambda email : email.split('@')[1]).value_counts().head() 150 | 151 | 152 | # # Great Job! 
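# One caveat on the expiry question above: .str.contains('25') matches '25' anywhere in the MM/YY string, so any stray '25' outside the year field would be counted too. Anchoring on the year part is safer; the four sample strings below are made up.
import pandas as pd
demo = pd.Series(['02/25', '11/20', '25/11', '12/25'])
print(demo.str.contains('25').sum())       # 3 -> also counts the malformed '25/11'
print(demo.str.endswith('/25').sum())      # 2 -> only true year-25 expiries
print(demo.str.match(r'\d{2}/25$').sum())  # 2 -> the same idea as an anchored regex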
153 | -------------------------------------------------------------------------------- /Pandas/Pandas_3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | 11 | # In[3]: 12 | 13 | 14 | df = pd.read_csv('movies_metadata.csv', error_bad_lines=False) 15 | 16 | 17 | # In[4]: 18 | 19 | 20 | df = pd.read_csv('movies_metadata.csv', error_bad_lines=False, encoding = 'latin-1') 21 | 22 | 23 | # In[9]: 24 | 25 | 26 | df.info() 27 | 28 | 29 | # In[10]: 30 | 31 | 32 | df.head() 33 | 34 | 35 | # In[13]: 36 | 37 | 38 | df.loc[2] 39 | 40 | 41 | # In[14]: 42 | 43 | 44 | df.shape 45 | 46 | 47 | # In[16]: 48 | 49 | 50 | df.columns 51 | 52 | 53 | # In[18]: 54 | 55 | 56 | df[['title', 'genres']] 57 | 58 | 59 | # In[15]: 60 | 61 | 62 | df[df['original_title'] == 'Grumpier Old Men'] 63 | 64 | 65 | # In[5]: 66 | 67 | 68 | df.iloc[4] 69 | 70 | 71 | # In[20]: 72 | 73 | 74 | df.info() 75 | 76 | 77 | # In[21]: 78 | 79 | 80 | df2 = df[['title', 'release_date', 'budget', 'revenue', 'runtime']] 81 | 82 | 83 | # In[22]: 84 | 85 | 86 | df2.head(2) 87 | 88 | 89 | # In[24]: 90 | 91 | 92 | df.head(10) 93 | 94 | 95 | # In[25]: 96 | 97 | 98 | df.sort_values(by = 'release_date') 99 | 100 | 101 | # In[34]: 102 | 103 | 104 | df['release_date'].dtype 105 | 106 | 107 | # In[37]: 108 | 109 | 110 | df[df['release_date'] > '1995-01-01'] 111 | 112 | 113 | # In[38]: 114 | 115 | 116 | df.columns 117 | 118 | 119 | # In[39]: 120 | 121 | 122 | df.sort_values('runtime', ascending = False) 123 | 124 | 125 | # In[40]: 126 | 127 | 128 | df.info() 129 | 130 | 131 | # In[44]: 132 | 133 | 134 | df['budget'].value_counts() 135 | 136 | 137 | # In[70]: 138 | 139 | 140 | df[(df['revenue'] >= 2000000) & (df['budget'] <= 1000000)] 141 | 142 | 143 | # In[50]: 144 | 145 | 146 | df['runtime'].max() 147 | 148 | 149 | # In[51]: 150 | 151 | 152 | df['runtime'].min() 153 | 154 | 155 | # In[52]: 156 | 157 | 158 | df.info() 159 | 160 | 161 | # In[53]: 162 | 163 | 164 | df['vote_count'].value_counts() 165 | 166 | 167 | # In[54]: 168 | 169 | 170 | df.describe() 171 | 172 | 173 | # In[6]: 174 | 175 | 176 | df['vote_count'].quantile(0.70) 177 | 178 | 179 | # In[8]: 180 | 181 | 182 | df[(df['runtime'] >= 30) & (df['runtime'] <= 360)]['title'] 183 | 184 | 185 | # In[73]: 186 | 187 | 188 | df.info() 189 | 190 | 191 | # In[74]: 192 | 193 | 194 | df[['title', 'vote_count']] 195 | 196 | -------------------------------------------------------------------------------- /Pandas/Pandas_Class2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | # missing values / outliers 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | import pandas as pd 14 | import numpy as np 15 | 16 | 17 | # In[3]: 18 | 19 | 20 | df = pd.DataFrame({'A':[1, 2, np.nan], 21 | 'B':[5, np.nan, np.nan], 22 | 'C':[1, 2, 3]}) 23 | 24 | 25 | # In[4]: 26 | 27 | 28 | df 29 | 30 | 31 | # In[5]: 32 | 33 | 34 | df.dropna() 35 | 36 | 37 | # In[6]: 38 | 39 | 40 | df.dropna(axis = 1) 41 | 42 | 43 | # In[7]: 44 | 45 | 46 | df.dropna(thresh = 2) 47 | 48 | 49 | # In[8]: 50 | 51 | 52 | df.fillna(value = 'milk') 53 | 54 | 55 | # In[9]: 56 | 57 | 58 | V1 = np.array([2,3,5,np.NaN,7,1,np.NaN,10,14]) 59 | V2 = np.array([8,np.NaN,5,8,11,np.NaN,np.NaN,2,3]) 60 | V3 = np.array([np.NaN,13,5,6,13,7,np.NaN,3,30]) 61 | df = pd.DataFrame( 62 | {"Var1" : V1, 63 | "Var2" : V2, 64 | "Var3" 
: V3} 65 | ) 66 | df 67 | 68 | 69 | # In[10]: 70 | 71 | 72 | df.isnull() 73 | 74 | 75 | # In[11]: 76 | 77 | 78 | df.notnull() 79 | 80 | 81 | # In[12]: 82 | 83 | 84 | df.isnull().sum() 85 | 86 | 87 | # In[13]: 88 | 89 | 90 | df.notnull().sum() 91 | 92 | 93 | # In[14]: 94 | 95 | 96 | df['Var1'].isnull() 97 | 98 | 99 | # In[15]: 100 | 101 | 102 | df[df['Var1'].isnull()] 103 | 104 | 105 | # In[16]: 106 | 107 | 108 | df.isnull() 109 | 110 | 111 | # In[17]: 112 | 113 | 114 | df 115 | 116 | 117 | # In[18]: 118 | 119 | 120 | df.isnull().any(axis=0) 121 | 122 | 123 | # In[19]: 124 | 125 | 126 | df.isnull().all(axis=1) 127 | 128 | 129 | # In[20]: 130 | 131 | 132 | df.isnull().any(axis=1) 133 | 134 | 135 | # In[21]: 136 | 137 | 138 | df[df.isnull().any(axis=1)] 139 | 140 | 141 | # In[22]: 142 | 143 | 144 | df[~df.isnull().any(axis=1)] # ~ means against 145 | 146 | 147 | # In[23]: 148 | 149 | 150 | # handle with missing values 151 | 152 | 153 | # In[24]: 154 | 155 | 156 | df.dropna() 157 | 158 | 159 | # In[25]: 160 | 161 | 162 | df.dropna(how = 'all') 163 | 164 | 165 | # In[26]: 166 | 167 | 168 | df.dropna(how = 'any') 169 | 170 | 171 | # In[27]: 172 | 173 | 174 | df['Var1'] 175 | 176 | 177 | # In[28]: 178 | 179 | 180 | df['Var1'].fillna(0) 181 | 182 | 183 | # In[29]: 184 | 185 | 186 | df['Var1'].fillna(df['Var1'].mean()) 187 | 188 | 189 | # In[30]: 190 | 191 | 192 | df.apply(lambda x : x.fillna(x.mean())) 193 | 194 | 195 | # In[31]: 196 | 197 | 198 | df.mean() 199 | 200 | 201 | # In[32]: 202 | 203 | 204 | df.fillna(df.mean()) 205 | 206 | 207 | # In[33]: 208 | 209 | 210 | df.fillna({'Var1' : 6, 'Var2' : 6.16}) 211 | 212 | 213 | # In[34]: 214 | 215 | 216 | #where 217 | 218 | 219 | # In[35]: 220 | 221 | 222 | df.where(pd.notnull(df), df.mean(), axis = 1) 223 | 224 | 225 | # In[36]: 226 | 227 | 228 | Var1 = np.array([1,3,6,np.NaN,7,1,9,np.NaN,15]) 229 | Var2 = np.array([7,np.NaN,5,8,12,np.NaN,np.NaN,2,3]) 230 | Var3 = np.array([np.NaN,12,5,6,14,7,np.NaN,2,31]) 231 | Var4 = np.array(["IT","IT","IT","HR","HR","HR","HR","IT","IT"]) 232 | df = pd.DataFrame( 233 | {"salary" : Var1, 234 | "Var2" : Var2, 235 | "Var3" : Var3, 236 | "department" : Var4} 237 | ) 238 | 239 | 240 | # In[37]: 241 | 242 | 243 | df 244 | 245 | 246 | # In[38]: 247 | 248 | 249 | df.groupby('department')['salary'].mean() 250 | 251 | 252 | # In[39]: 253 | 254 | 255 | df['salary'].fillna({0:1, 1:2, 2:3, 3:4, 4:5, 5:6, 6:7, 7:8, 8:9}) 256 | 257 | 258 | # In[40]: 259 | 260 | 261 | df 262 | 263 | 264 | # In[41]: 265 | 266 | 267 | df.groupby('department')['salary'].transform('mean') 268 | 269 | 270 | # In[42]: 271 | 272 | 273 | df.salary.fillna(df.groupby('department')['salary'].transform('mean')) 274 | 275 | 276 | # In[43]: 277 | 278 | 279 | V1 = np.array([1,3,6,np.NaN,7,1,np.NaN,9,15]) 280 | V4 = np.array(["IT",np.nan,"HR","HR","HR","HR",np.nan,"IT","HR"], dtype=object) 281 | df = pd.DataFrame( 282 | {"salary" : V1, 283 | "department" : V4} 284 | ) 285 | df 286 | 287 | 288 | # In[44]: 289 | 290 | 291 | df['department'].fillna(df['department'].mode()[0]) # fill missing values with mode of column 292 | 293 | 294 | # In[45]: 295 | 296 | 297 | df['department'].fillna(method = 'bfill') # back fill 298 | 299 | 300 | # In[46]: 301 | 302 | 303 | #df['department'].fillna(a, method = 'ffill', limit = 200) 304 | #df['department'].fillna(b, method = 'ffill', limit = 100) 305 | #df['department'].fillna(a, method = 'ffill', limit = 100) 306 | #df['department'].fillna(c, method = 'ffill', limit = 100) 307 | #df['department'].fillna(a, method = 'ffill', 
limit = 100) 308 | #df['department'].fillna(b, method = 'ffill', limit = 200) 309 | #df['department'].fillna(c, method = 'ffill', limit = 100) 310 | #df['department'].fillna(a, method = 'ffill', limit = 100) 311 | 312 | 313 | # In[47]: 314 | 315 | 316 | df['department'].fillna(method = 'ffill') # forward fill 317 | 318 | 319 | # In[48]: 320 | 321 | 322 | # outliers 323 | 324 | 325 | # In[49]: 326 | 327 | 328 | import seaborn as sns 329 | df = sns.load_dataset('diamonds') 330 | df = df.select_dtypes(include = ['float64', 'int64']) 331 | df = df.dropna() 332 | df.head() 333 | 334 | 335 | # In[50]: 336 | 337 | 338 | sns.boxplot(df['table']) 339 | 340 | 341 | # In[51]: 342 | 343 | 344 | df_table = df['table'] 345 | 346 | 347 | # In[52]: 348 | 349 | 350 | df_table.head() 351 | 352 | 353 | # In[53]: 354 | 355 | 356 | pd.DataFrame(df_table).info() 357 | 358 | 359 | # In[54]: 360 | 361 | 362 | q1 = df_table.quantile(0.25) 363 | q3 = df_table.quantile(0.75) 364 | iqr = q3 - q1 365 | 366 | 367 | # In[55]: 368 | 369 | 370 | q3 371 | 372 | 373 | # In[56]: 374 | 375 | 376 | q1 377 | 378 | 379 | # In[57]: 380 | 381 | 382 | iqr 383 | 384 | 385 | # In[58]: 386 | 387 | 388 | df.describe() 389 | 390 | 391 | # In[59]: 392 | 393 | 394 | lower_lim = q1 - 1.5 * iqr 395 | upper_lim = q3 + 1.5 * iqr 396 | 397 | 398 | # In[60]: 399 | 400 | 401 | lower_lim 402 | 403 | 404 | # In[61]: 405 | 406 | 407 | upper_lim 408 | 409 | 410 | # In[62]: 411 | 412 | 413 | outliers_15_low = df_table < lower_lim 414 | 415 | 416 | # In[63]: 417 | 418 | 419 | outliers_15_up = df_table > upper_lim 420 | 421 | 422 | # In[64]: 423 | 424 | 425 | df_table[outliers_15_low] 426 | 427 | 428 | # In[65]: 429 | 430 | 431 | df_table[outliers_15_up] 432 | 433 | 434 | # In[66]: 435 | 436 | 437 | df_table[outliers_15_low | outliers_15_up] 438 | 439 | 440 | # In[67]: 441 | 442 | 443 | lower_lim = q1 - 2.5 * iqr 444 | upper_lim = q3 + 2.5 * iqr 445 | 446 | 447 | # In[68]: 448 | 449 | 450 | df_table[(df_table < lower_lim) | (df_table > upper_lim)] 451 | 452 | 453 | # In[69]: 454 | 455 | 456 | #removing the outliers 457 | 458 | 459 | # In[70]: 460 | 461 | 462 | df_table[~(outliers_15_low | outliers_15_up)] 463 | 464 | 465 | # In[71]: 466 | 467 | 468 | clean_df = df[~(outliers_15_low | outliers_15_up)] 469 | 470 | 471 | # In[72]: 472 | 473 | 474 | clean_df # without ouliers (1.5) 475 | 476 | 477 | # In[73]: 478 | 479 | 480 | # limitation winsorize() method 481 | 482 | 483 | # In[74]: 484 | 485 | 486 | from scipy.stats.mstats import winsorize 487 | 488 | 489 | # In[75]: 490 | 491 | 492 | df 493 | 494 | 495 | # In[76]: 496 | 497 | 498 | df_table 499 | 500 | 501 | # In[77]: 502 | 503 | 504 | sns.boxplot(df['table']) 505 | 506 | 507 | # In[78]: 508 | 509 | 510 | sns.distplot(df['table'], kde = False, bins = 15) 511 | 512 | 513 | # In[79]: 514 | 515 | 516 | df_table_win = winsorize(df_table, (0.01, 0.02)) # % 1 from bottom % 2 from top 517 | 518 | 519 | # In[80]: 520 | 521 | 522 | df_table_win 523 | 524 | 525 | # In[81]: 526 | 527 | 528 | sns.distplot(df_table_win, kde = False, bins = 10) 529 | 530 | 531 | # In[82]: 532 | 533 | 534 | sns.boxplot(df_table_win) 535 | 536 | 537 | # In[83]: 538 | 539 | 540 | df['table'].describe() 541 | 542 | 543 | # In[86]: 544 | 545 | 546 | df_table_win = pd.DataFrame(df_table_win)[0] 547 | 548 | 549 | # In[87]: 550 | 551 | 552 | df_table_win.describe() 553 | 554 | 555 | # In[92]: 556 | 557 | 558 | df['table'].sort_values().head(20) 559 | 560 | 561 | # In[93]: 562 | 563 | 564 | df_table_win.sort_values().head(20) # scale values 
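# a quick cross-check of what winsorize() did (a sketch; it assumes the
# (0.01, 0.02) limits map to the 1st and 98th percentiles of df_table):
low_cut, high_cut = np.percentile(df_table, [1, 98])
manual_win = df_table.clip(lower = low_cut, upper = high_cut)
manual_win.describe() # should closely match df_table_win.describe()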
565 | 566 | 567 | # In[94]: 568 | 569 | 570 | df_table_win[11368] 571 | 572 | 573 | # In[95]: 574 | 575 | 576 | df['table'][11368] 577 | 578 | 579 | # In[96]: 580 | 581 | 582 | df_table_win[24815] 583 | 584 | 585 | # In[97]: 586 | 587 | 588 | df['table'][24815] 589 | 590 | 591 | # In[100]: 592 | 593 | 594 | df_table_win[df_table_win == 53] 595 | 596 | 597 | # In[99]: 598 | 599 | 600 | df_table[df_table == 53] 601 | 602 | 603 | # In[103]: 604 | 605 | 606 | df_table_win[df_table_win == 63] # 1180 - 563. because right skewed. 607 | # upper outliers are more than lower. 0.02 from uuper side. 608 | 609 | 610 | # In[104]: 611 | 612 | 613 | df_table[df_table == 63] 614 | 615 | 616 | # In[107]: 617 | 618 | 619 | q1 620 | 621 | 622 | # In[106]: 623 | 624 | 625 | q3 626 | 627 | 628 | # In[108]: 629 | 630 | 631 | iqr 632 | 633 | 634 | # In[111]: 635 | 636 | 637 | lower = q1 - 1.5 * iqr 638 | upper = q3 + 1.5 * iqr 639 | 640 | 641 | # In[112]: 642 | 643 | 644 | lower 645 | 646 | 647 | # In[113]: 648 | 649 | 650 | upper 651 | 652 | 653 | # In[114]: 654 | 655 | 656 | outliers_15 = (df_table_win < lower) | (df_table_win > upper) 657 | 658 | 659 | # In[116]: 660 | 661 | 662 | df_table_win[outliers_15] 663 | 664 | 665 | # In[117]: 666 | 667 | 668 | df_table[(df_table < lower) | (df_table > upper)] 669 | 670 | 671 | # In[131]: 672 | 673 | 674 | df['table_win'] = df_table_win 675 | 676 | 677 | # In[119]: 678 | 679 | 680 | # log() transformation 681 | 682 | 683 | # In[132]: 684 | 685 | 686 | df.info() 687 | 688 | 689 | # In[122]: 690 | 691 | 692 | df_carat = df['carat'] 693 | 694 | 695 | # In[123]: 696 | 697 | 698 | df_carat.head() 699 | 700 | 701 | # In[124]: 702 | 703 | 704 | sns.boxplot(df_carat) 705 | 706 | 707 | # In[127]: 708 | 709 | 710 | sns.distplot(df_carat, bins = 15, kde = False) 711 | 712 | 713 | # In[128]: 714 | 715 | 716 | df_carat_log = np.log(df_carat) 717 | 718 | 719 | # In[129]: 720 | 721 | 722 | sns.distplot(df_carat_log, bins = 15, kde = False) 723 | 724 | 725 | # In[130]: 726 | 727 | 728 | sns.boxplot(df_carat_log) 729 | 730 | 731 | # In[133]: 732 | 733 | 734 | df['carat_log'] = np.log(df['carat']) 735 | 736 | 737 | # In[134]: 738 | 739 | 740 | df.head() 741 | 742 | 743 | # In[1]: 744 | 745 | 746 | import pandas as pd 747 | 748 | 749 | # In[2]: 750 | 751 | 752 | df1 = pd.DataFrame({'lkey': ['x', 'y', 'z', 'c', 'z','x'], 753 | 'lvalue': [2, 3, 5, 7, 0, 99]}) 754 | df2 = pd.DataFrame({'rkey': ['x', 'x', 'z', 'z'], 755 | 'rvalue': [7, 8, 9, 10]}) 756 | 757 | 758 | # In[3]: 759 | 760 | 761 | df1 762 | 763 | 764 | # In[4]: 765 | 766 | 767 | df2 768 | 769 | 770 | # In[5]: 771 | 772 | 773 | pd.merge(df1, df2, left_on = 'lkey', right_on = 'rkey', how = 'left') 774 | 775 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DataScience 2 | There are basic/advanced codes, notes and exercises for data analysis and data visualization. 3 | There are notes about statistic and advanced implement of statistic at python. 4 | Pandas, NumPy, Seaborn, Matplotlib, SciPy, Researchpy, Regex. 
5 | -------------------------------------------------------------------------------- /Seaborn/Seaborn Class2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import seaborn as sns 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | from scipy import stats 12 | 13 | 14 | # In[2]: 15 | 16 | 17 | df = pd.read_csv('cleaned_autos.csv') 18 | 19 | 20 | # In[3]: 21 | 22 | 23 | df.info() 24 | 25 | 26 | # In[4]: 27 | 28 | 29 | pd.set_option('display.max_columns', 27) 30 | 31 | 32 | # In[5]: 33 | 34 | 35 | df.head() 36 | 37 | 38 | # In[6]: 39 | 40 | 41 | df['vehicleType'].unique() 42 | 43 | 44 | # In[7]: 45 | 46 | 47 | df.groupby('vehicleType')['price'].mean().plot.bar() 48 | 49 | 50 | # In[8]: 51 | 52 | 53 | # variation of the price range by vehicle types 54 | 55 | 56 | # In[9]: 57 | 58 | 59 | plt.subplots(figsize = (12, 6)) 60 | sns.boxplot('vehicleType', 'price', data = df) 61 | 62 | 63 | # In[10]: 64 | 65 | 66 | # demonstration of the mean prices by the vehicle type 67 | 68 | 69 | # In[11]: 70 | 71 | 72 | fig, ax = plt.subplots(figsize = (12, 6)) 73 | sns.set(style="darkgrid") 74 | sns.pointplot('vehicleType', 'price', data = df) 75 | ax.set_xticklabels(df['vehicleType'].unique(), rotation = 90); 76 | 77 | 78 | # In[12]: 79 | 80 | 81 | # total count of vehicles by type available on sale 82 | 83 | 84 | # In[13]: 85 | 86 | 87 | plt.subplots(figsize = (12, 6)) 88 | sns.countplot('vehicleType', data = df) 89 | 90 | 91 | # In[6]: 92 | 93 | 94 | df.head() 95 | 96 | 97 | # In[8]: 98 | 99 | 100 | # average price for vehicles based on the type of vehicle as well as on the type of gearbox 101 | 102 | 103 | # In[12]: 104 | 105 | 106 | plt.figure(figsize = (12, 6)) 107 | sns.barplot('vehicleType', 'price', 'gearbox', data = df) 108 | plt.show() 109 | 110 | 111 | # In[13]: 112 | 113 | 114 | # average price for vehicles by fuel type as well as on the type of gearbox 115 | 116 | 117 | # In[14]: 118 | 119 | 120 | plt.figure(figsize = (12, 6)) 121 | sns.barplot('fuelType', 'price', 'gearbox', data = df) 122 | plt.tight_layout() 123 | 124 | -------------------------------------------------------------------------------- /Seaborn/Seaborn_1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | # In[2]: 14 | 15 | 16 | titanic = sns.load_dataset('titanic') 17 | 18 | 19 | # In[3]: 20 | 21 | 22 | titanic.head() 23 | 24 | 25 | # In[4]: 26 | 27 | 28 | titanic.info() 29 | 30 | 31 | # In[60]: 32 | 33 | 34 | sns.set(style = "whitegrid") 35 | sns.jointplot('fare', 'age', data = titanic, xlim = (-100, 600)) 36 | plt.show() 37 | 38 | 39 | # In[59]: 40 | 41 | 42 | sns.distplot(titanic['fare'], kde = False, color = 'red', bins = 30) 43 | plt.xlim(0) 44 | plt.show() 45 | 46 | 47 | # In[57]: 48 | 49 | 50 | sns.boxplot('class', 'age', data = titanic, palette = 'rainbow') 51 | plt.show() 52 | 53 | 54 | # In[61]: 55 | 56 | 57 | sns.swarmplot('class', 'age', data = titanic, palette = 'Set2') 58 | plt.show() 59 | 60 | 61 | # In[22]: 62 | 63 | 64 | sns.countplot('sex', data = titanic) 65 | plt.show() 66 | 67 | 68 | # In[66]: 69 | 70 | 71 | plt.figure(figsize = (8, 5)) 72 | sns.heatmap(titanic.corr(), cmap = 'coolwarm', vmin = -0.8, vmax = 0.8) 73 | 
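# vmin/vmax fix the color limits symmetrically around 0, so coolwarm's
# neutral midpoint stays at zero correlation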
plt.title('titanic.corr()') 74 | plt.show() 75 | 76 | 77 | # In[64]: 78 | 79 | 80 | g = sns.FacetGrid(titanic, col = 'sex', aspect = 1, height = 5) 81 | g.map(sns.distplot, 'age', kde = False, bins = 10) 82 | plt.xlim(0) 83 | plt.tight_layout() 84 | 85 | 86 | # In[63]: 87 | 88 | 89 | g = sns.FacetGrid(titanic, col = 'sex', aspect = 1, height = 5) 90 | g.map(plt.hist, 'age', bins = 10) 91 | plt.xlim(0) 92 | plt.tight_layout() 93 | 94 | -------------------------------------------------------------------------------- /Seaborn/Seaborn_U.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import seaborn as sns 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | get_ipython().run_line_magic('matplotlib', 'inline') 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | # Distribution Plots 20 | 21 | 22 | # In[4]: 23 | 24 | 25 | import ssl 26 | ssl._create_default_https_context = ssl._create_unverified_context 27 | 28 | 29 | # In[5]: 30 | 31 | 32 | tips = sns.load_dataset('tips') 33 | 34 | 35 | # In[6]: 36 | 37 | 38 | tips.head() 39 | 40 | 41 | # In[7]: 42 | 43 | 44 | tips 45 | 46 | 47 | # In[10]: 48 | 49 | 50 | sns.distplot(tips['total_bill']) 51 | 52 | 53 | # In[10]: 54 | 55 | 56 | sns.distplot(tips['total_bill'], kde = False, bins = 20); 57 | 58 | 59 | # In[11]: 60 | 61 | 62 | sns.distplot(tips['total_bill'], kde = False, bins = 40); # histogram, there is a hist = True arg. 63 | 64 | 65 | # In[21]: 66 | 67 | 68 | sns.jointplot(x = tips['total_bill'], y = tips['tip'], data = tips) # scatter plot 69 | 70 | 71 | # In[22]: 72 | 73 | 74 | sns.jointplot(tips['total_bill'], tips['tip'], tips, kind = 'hex') # default kind value is scatter 75 | 76 | 77 | # In[23]: 78 | 79 | 80 | sns.jointplot(tips['total_bill'], tips['tip'], tips, kind = 'reg') 81 | 82 | 83 | # In[25]: 84 | 85 | 86 | tips.head() 87 | 88 | 89 | # In[24]: 90 | 91 | 92 | sns.pairplot(tips) # it is a collected form of jointplots of each numerical variables.scatter/strip 93 | 94 | 95 | # In[26]: 96 | 97 | 98 | sns.pairplot(tips, hue = 'sex') # we can add categorical variable with hue argument 99 | 100 | 101 | # In[27]: 102 | 103 | 104 | sns.pairplot(tips, hue = 'sex', palette = 'coolwarm') 105 | 106 | 107 | # In[28]: 108 | 109 | 110 | sns.rugplot(tips['total_bill']) # it gives distribution without bins (unlike histogram) 111 | # dash mark for every points with uniform distribution 112 | 113 | 114 | # In[32]: 115 | 116 | 117 | sns.distplot(tips['total_bill'], kde = False) 118 | 119 | 120 | # In[33]: 121 | 122 | 123 | sns.distplot(tips['total_bill']) # kde lin e; kernel density estimation 124 | # normal distribution over each point and collected form of norm.dist. 125 | 126 | 127 | # In[34]: 128 | 129 | 130 | sns.kdeplot(tips['total_bill']) 131 | 132 | 133 | # In[ ]: 134 | 135 | 136 | # Categorical Plots 137 | 138 | 139 | # In[5]: 140 | 141 | 142 | import seaborn as sns 143 | get_ipython().run_line_magic('matplotlib', 'inline') 144 | 145 | 146 | # In[6]: 147 | 148 | 149 | tips = sns.load_dataset('tips') 150 | 151 | 152 | # In[7]: 153 | 154 | 155 | tips.head() 156 | 157 | 158 | # In[17]: 159 | 160 | 161 | import numpy as np 162 | 163 | 164 | # In[16]: 165 | 166 | 167 | sns.barplot(x = 'sex', y = 'total_bill', data = tips) # x = categorical, y = numeric 168 | # it gives mean of total_bill 169 | 170 | 171 | # In[18]: 172 | 173 | 174 | sns.barplot(x = 'sex', y = 'total_bill', data = tips, estimator = np.sum) # we can get sum by estimator arg. 
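# estimator accepts any reducing callable -- e.g. the median (a sketch;
# np and tips are already defined above):
sns.barplot(x = 'sex', y = 'total_bill', data = tips, estimator = np.median)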
175 | 176 | 177 | # In[21]: 178 | 179 | 180 | sns.barplot(tips['sex'], tips['total_bill'], estimator = np.std) 181 | 182 | 183 | # In[20]: 184 | 185 | 186 | sns.countplot(tips['sex']) # it counts values 187 | 188 | 189 | # In[23]: 190 | 191 | 192 | sns.countplot('size', data = tips) # a bar chart of value counts 193 | 194 | 195 | # In[46]: 196 | 197 | 198 | sns.violinplot(tips['total_bill']) 199 | 200 | 201 | # In[37]: 202 | 203 | 204 | sns.boxplot(tips['total_bill']) 205 | 206 | 207 | # In[24]: 208 | 209 | 210 | sns.boxplot(tips['day'], tips['total_bill']) # it gives a total_bill boxplot per day 211 | 212 | 213 | # In[26]: 214 | 215 | 216 | sns.boxplot(tips['day'], tips['total_bill'], hue = tips['smoker']) # we can add another categorical variable with hue. 217 | 218 | 219 | # In[27]: 220 | 221 | 222 | sns.violinplot(tips['day'], tips['total_bill']) # KDE of the points; a violinplot is the KDE counterpart of a boxplot. 223 | 224 | 225 | # In[29]: 226 | 227 | 228 | sns.violinplot(tips['day'], tips['total_bill'], hue = tips['smoker']) 229 | 230 | 231 | # In[51]: 232 | 233 | 234 | sns.violinplot(tips['day'], tips['total_bill'], hue = tips['smoker'], split = True) 235 | 236 | 237 | # In[31]: 238 | 239 | 240 | sns.stripplot(x = 'day', y = 'total_bill', data = tips) # like a scatter plot, but with one numeric and one categorical variable 241 | # (a scatter plot uses two numeric variables) 242 | 243 | 244 | # In[33]: 245 | 246 | 247 | sns.stripplot(x = 'day', y = 'total_bill', data = tips, jitter = False) 248 | 249 | 250 | # In[49]: 251 | 252 | 253 | sns.stripplot(x = 'day', y = 'total_bill', data = tips, hue = 'sex', dodge = True) 254 | 255 | 256 | # In[52]: 257 | 258 | 259 | sns.stripplot(x = 'day', y = 'total_bill', data = tips) 260 | 261 | 262 | # In[8]: 263 | 264 | 265 | sns.swarmplot(x = 'day', y = 'total_bill', data = tips) # combines stripplot and violinplot 266 | # best suited to small data sets 267 | 268 | 269 | # In[64]: 270 | 271 | 272 | tips['day'][tips['day'] == 'Fri'].value_counts() 273 | 274 | 275 | # In[60]: 276 | 277 | 278 | sns.violinplot(x = 'day', y = 'total_bill', data = tips) 279 | sns.swarmplot(x = 'day', y = 'total_bill', data = tips, color = 'black') 280 | 281 | 282 | # In[67]: 283 | 284 | 285 | sns.barplot(x = 'day', y = 'total_bill', data = tips) 286 | 287 | 288 | # In[72]: 289 | 290 | 291 | sns.catplot(x = 'day', y = 'total_bill', data = tips, kind = 'bar') # catplot can draw any categorical plot via the kind argument 292 | 293 | 294 | # In[9]: 295 | 296 | 297 | # Matrix Plots 298 | # we need matrix form for heatmap.
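# i.e. rectangular data such as a correlation matrix (corr()) or a pivot
# table, where rows/columns are categories and the cell values drive the color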
299 | 300 | 301 | # In[53]: 302 | 303 | 304 | import seaborn as sns 305 | import matplotlib.pyplot as plt 306 | get_ipython().run_line_magic('matplotlib', 'inline') 307 | 308 | 309 | # In[11]: 310 | 311 | 312 | tips = sns.load_dataset('tips') 313 | 314 | 315 | # In[12]: 316 | 317 | 318 | flights = sns.load_dataset('flights') 319 | 320 | 321 | # In[13]: 322 | 323 | 324 | flights.head() 325 | 326 | 327 | # In[14]: 328 | 329 | 330 | tips.head() 331 | 332 | 333 | # In[15]: 334 | 335 | 336 | tips.corr() 337 | 338 | 339 | # In[22]: 340 | 341 | 342 | sns.heatmap(tips.corr(), annot = True, cmap = 'coolwarm') 343 | 344 | 345 | # In[37]: 346 | 347 | 348 | flights.head() 349 | 350 | 351 | # In[32]: 352 | 353 | 354 | flights.corr() 355 | 356 | 357 | # In[27]: 358 | 359 | 360 | sns.heatmap(flights.corr(), annot = True) 361 | 362 | 363 | # In[40]: 364 | 365 | 366 | fp = flights.pivot_table(values = 'passengers', index = 'month', columns = 'year') 367 | 368 | 369 | # In[41]: 370 | 371 | 372 | fp 373 | 374 | 375 | # In[42]: 376 | 377 | 378 | sns.heatmap(fp) 379 | 380 | 381 | # In[52]: 382 | 383 | 384 | sns.heatmap(fp, linecolor = 'white', linewidths = 1) 385 | 386 | 387 | # In[56]: 388 | 389 | 390 | plt.subplots(figsize=(10,5)) 391 | sns.heatmap(fp, cmap = 'coolwarm', linecolor = 'black', linewidths = 1) 392 | 393 | 394 | # In[58]: 395 | 396 | 397 | sns.clustermap(fp, cmap = 'coolwarm') # it doesn't order. it makes clustersgroups. 398 | #standard_scale = 1 399 | 400 | 401 | # In[59]: 402 | 403 | 404 | # Grids 405 | 406 | 407 | # In[61]: 408 | 409 | 410 | import seaborn as sns 411 | import matplotlib.pyplot as plt 412 | get_ipython().run_line_magic('matplotlib', 'inline') 413 | 414 | 415 | # In[62]: 416 | 417 | 418 | iris = sns.load_dataset('iris') 419 | 420 | 421 | # In[63]: 422 | 423 | 424 | iris.head() 425 | 426 | 427 | # In[64]: 428 | 429 | 430 | sns.pairplot(iris) 431 | 432 | 433 | # In[65]: 434 | 435 | 436 | sns.PairGrid(iris) 437 | 438 | 439 | # In[70]: 440 | 441 | 442 | g = sns.PairGrid(iris) 443 | g.map(plt.scatter) 444 | 445 | 446 | # In[71]: 447 | 448 | 449 | g = sns.PairGrid(iris) 450 | g.map_diag(sns.distplot) 451 | g.map_upper(plt.scatter) 452 | g.map_lower(sns.kdeplot) 453 | 454 | 455 | # In[72]: 456 | 457 | 458 | tips = sns.load_dataset('tips') 459 | 460 | 461 | # In[73]: 462 | 463 | 464 | tips.head() 465 | 466 | 467 | # In[76]: 468 | 469 | 470 | g = sns.FacetGrid(data = tips, col = 'time', row = 'smoker') 471 | 472 | 473 | # In[81]: 474 | 475 | 476 | g = sns.FacetGrid(data = tips, col = 'time', row = 'smoker') 477 | g.map(sns.distplot, 'total_bill') 478 | 479 | 480 | # In[83]: 481 | 482 | 483 | g = sns.FacetGrid(data = tips, col = 'time', row = 'smoker') 484 | g.map(plt.scatter, 'total_bill', 'tip') 485 | 486 | 487 | # In[ ]: 488 | 489 | 490 | 491 | 492 | -------------------------------------------------------------------------------- /Statistics/Statistics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | salary = [102, 33, 26, 27, 30, 25, 33, 33, 24] 8 | import numpy as np 9 | from scipy import stats 10 | 11 | 12 | # In[2]: 13 | 14 | 15 | mean_salary = np.mean(salary) 16 | print('Mean :', mean_salary) 17 | 18 | 19 | # In[6]: 20 | 21 | 22 | median_salary = np.median(salary) 23 | print('Median :', median_salary) 24 | 25 | 26 | # In[4]: 27 | 28 | 29 | stats.iqr(salary) 30 | 31 | 32 | # In[7]: 33 | 34 | 35 | mode_salary = stats.mode(salary) 36 | print('Mode :', mode_salary) 37 | 38 | 
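# note: stats.mode returns a ModeResult(mode, count) pair; the modal value
# itself is mode_salary[0] -- see the stats.mode(age)[0][0] indexing further below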
39 | # In[9]: 40 | 41 | 42 | print('Range :', (np.max(salary) - (np.min(salary)))) 43 | 44 | 45 | # In[10]: 46 | 47 | 48 | print('Variance :', (np.var(salary))) 49 | 50 | 51 | # In[11]: 52 | 53 | 54 | print('Std :', (np.std(salary))) 55 | 56 | 57 | # In[12]: 58 | 59 | 60 | a = [1, 10, 7, 12, 0, 30, 15, 22, 8, 2] 61 | print(np.std(a)) 62 | 63 | 64 | # In[3]: 65 | 66 | 67 | import numpy as np 68 | 69 | 70 | # In[4]: 71 | 72 | 73 | temp = [93, 84, 82, 78, 98, 70] 74 | number_of_people = [13, 10, 11, 8, 15, 9] 75 | 76 | 77 | # In[5]: 78 | 79 | 80 | print('Covariance :', np.cov(temp, number_of_people)) 81 | 82 | 83 | # In[6]: 84 | 85 | 86 | print('Correlation :', np.corrcoef(temp, number_of_people)) 87 | 88 | 89 | # In[1]: 90 | 91 | 92 | import numpy as np 93 | 94 | 95 | # In[17]: 96 | 97 | 98 | np.random.seed(101) 99 | population = np.random.randint(0, 80, 100000) 100 | 101 | 102 | # In[18]: 103 | 104 | 105 | population 106 | 107 | 108 | # In[19]: 109 | 110 | 111 | len(population) 112 | 113 | 114 | # In[20]: 115 | 116 | 117 | np.random.seed(101) 118 | sample = np.random.choice(population, 100) 119 | 120 | 121 | # In[21]: 122 | 123 | 124 | sample 125 | 126 | 127 | # In[11]: 128 | 129 | 130 | len(sample) 131 | 132 | 133 | # In[25]: 134 | 135 | 136 | population.mean() 137 | 138 | 139 | # In[26]: 140 | 141 | 142 | sample.mean() 143 | 144 | 145 | # In[24]: 146 | 147 | 148 | np.random.seed(101) 149 | for i in range(10): 150 | sample = np.random.choice(population, 100) 151 | print(sample.mean()) 152 | 153 | 154 | # In[28]: 155 | 156 | 157 | np.random.seed(101) 158 | sample_means = [] 159 | for i in range(10): 160 | sample = np.random.choice(population, 100) 161 | sample_means.append(sample.mean()) 162 | 163 | 164 | # In[29]: 165 | 166 | 167 | sample_means 168 | 169 | 170 | # In[30]: 171 | 172 | 173 | np.mean(sample_means) 174 | 175 | 176 | # In[31]: 177 | 178 | 179 | population.mean() 180 | 181 | 182 | # In[32]: 183 | 184 | 185 | from scipy.stats import kurtosis, skew 186 | 187 | 188 | # In[33]: 189 | 190 | 191 | get_ipython().run_line_magic('pip', 'install matplotlib') # bare "pip install" only works via IPython automagic 192 | 193 | 194 | # In[1]: 195 | 196 | 197 | import matplotlib.pyplot as plt 198 | 199 | 200 | # In[43]: 201 | 202 | 203 | np.random.seed(42) 204 | x = np.random.normal(0, 2, 100000) 205 | 206 | 207 | # In[44]: 208 | 209 | 210 | plt.hist(x, bins = 100); 211 | 212 | 213 | # In[45]: 214 | 215 | 216 | kurtosis(x) 217 | 218 | 219 | # In[46]: 220 | 221 | 222 | skew(x) 223 | 224 | 225 | # In[6]: 226 | 227 | 228 | import numpy as np 229 | from scipy import stats 230 | 231 | 232 | # In[49]: 233 | 234 | 235 | age = [20, 22, 25, 25, 27, 27, 29, 30, 31, 121] 236 | 237 | 238 | # In[50]: 239 | 240 | 241 | np.mean(age) 242 | 243 | 244 | # In[51]: 245 | 246 | 247 | np.median(age) 248 | 249 | 250 | # In[13]: 251 | 252 | 253 | stats.mode(age) 254 | 255 | 256 | # In[15]: 257 | 258 | 259 | stats.mode(age)[0] 260 | 261 | 262 | # In[16]: 263 | 264 | 265 | stats.mode(age)[0][0] 266 | 267 | 268 | # In[24]: 269 | 270 | 271 | age_2 = [20, 22, 25, 25, 27, 27, 29, 30, 31] 272 | 273 | 274 | # In[25]: 275 | 276 | 277 | np.mean(age_2) 278 | 279 | 280 | # In[26]: 281 | 282 | 283 | np.median(age_2) 284 | 285 | 286 | # In[27]: 287 | 288 | 289 | age_3 = [19, 20, 21, 22] 290 | 291 | 292 | # In[28]: 293 | 294 | 295 | stats.mode(age_3) # when no value repeats, it returns the smallest element with count 1
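# a cross-check (a sketch; pandas is assumed available here): unlike SciPy,
# Series.mode() returns every tied value rather than only the smallest
import pandas as pd
pd.Series(age_3).mode() # -> 19, 20, 21, 22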
296 | 297 | 298 | # In[29]: 299 | 300 | 301 | type(age) 302 | 303 | 304 | # In[30]: 305 | 306 | 307 | # mean in arrays 308 | age_new = np.array(age) 309 | 310 | 311 | # In[31]: 312 | 313 | 314 | age_new 315 | 316 | 317 | # In[32]: 318 | 319 | 320 | type(age_new) 321 | 322 | 323 | # In[33]: 324 | 325 | 326 | age_new.mean() 327 | 328 | 329 | # In[35]: 330 | 331 | 332 | a = np.array([[6, 8, 3, 0], 333 | [3, 2, 1, 7], 334 | [8, 1, 8, 4], 335 | [5, 3, 0, 5], 336 | [4, 7, 5, 9]]) 337 | stats.mode(a) 338 | 339 | 340 | # In[38]: 341 | 342 | 343 | stats.mode(a, axis = 1) 344 | 345 | 346 | # In[52]: 347 | 348 | 349 | # range, sd, variance 350 | age = [20, 22, 25, 25, 27, 27, 27, 29, 30, 31, 121] 351 | 352 | 353 | # In[73]: 354 | 355 | 356 | range = np.max(age) - np.min(age) 357 | print(range) 358 | 359 | 360 | # In[74]: 361 | 362 | 363 | np.ptp(age) # it gives range Peak To Peak ptp 364 | 365 | 366 | # In[54]: 367 | 368 | 369 | np.std(age) 370 | 371 | 372 | # In[55]: 373 | 374 | 375 | np.var(age) 376 | 377 | 378 | # In[56]: 379 | 380 | 381 | age_2 = [20, 22, 25, 25, 27, 27, 27, 29, 30, 31] 382 | 383 | 384 | # In[57]: 385 | 386 | 387 | np.std(age_2) 388 | 389 | 390 | # In[60]: 391 | 392 | 393 | range = np.max(age_2) - np.min(age_2) 394 | 395 | 396 | # In[61]: 397 | 398 | 399 | print(range) 400 | 401 | 402 | # In[62]: 403 | 404 | 405 | # IQR - iqr 406 | 407 | 408 | # In[63]: 409 | 410 | 411 | x=[8, 10, 5, 24, 8, 3, 11, 3, 40, 7, 6, 12, 4] 412 | 413 | 414 | # In[64]: 415 | 416 | 417 | q75, q25 = np.percentile(x, [75, 25]) 418 | 419 | 420 | # In[65]: 421 | 422 | 423 | q75 424 | 425 | 426 | # In[66]: 427 | 428 | 429 | q25 430 | 431 | 432 | # In[67]: 433 | 434 | 435 | sorted(x) 436 | 437 | 438 | # In[68]: 439 | 440 | 441 | iqr = q75-q25 442 | 443 | 444 | # In[69]: 445 | 446 | 447 | iqr 448 | 449 | 450 | # In[70]: 451 | 452 | 453 | stats.iqr(x) 454 | 455 | 456 | # In[71]: 457 | 458 | 459 | np.percentile(x, 75) 460 | 461 | 462 | # In[72]: 463 | 464 | 465 | np.percentile(x, 25) 466 | 467 | 468 | # In[1]: 469 | 470 | 471 | q = [62, 63, 64, 64, 70, 72, 76, 77, 81, 81] 472 | 473 | 474 | # In[3]: 475 | 476 | 477 | from scipy import stats 478 | import numpy as np 479 | 480 | 481 | # In[5]: 482 | 483 | 484 | np.percentile(q, 25) 485 | 486 | 487 | # In[6]: 488 | 489 | 490 | np.percentile(q, 75) 491 | 492 | 493 | # In[7]: 494 | 495 | 496 | stats.iqr(q) 497 | 498 | 499 | # In[8]: 500 | 501 | 502 | np.median(q) 503 | 504 | 505 | # In[9]: 506 | 507 | 508 | np.mean(q) 509 | 510 | 511 | # In[15]: 512 | 513 | 514 | stats.mode(q) 515 | 516 | 517 | # In[16]: 518 | 519 | 520 | # scatter plot 521 | 522 | 523 | # In[17]: 524 | 525 | 526 | # method matplotlib 527 | 528 | 529 | # In[1]: 530 | 531 | 532 | import numpy as np 533 | import matplotlib.pyplot as plt 534 | 535 | 536 | # In[2]: 537 | 538 | 539 | x = np.linspace(0, 5, 11) 540 | 541 | 542 | # In[3]: 543 | 544 | 545 | x 546 | 547 | 548 | # In[4]: 549 | 550 | 551 | y = x ** 2 552 | 553 | 554 | # In[5]: 555 | 556 | 557 | y 558 | 559 | 560 | # In[6]: 561 | 562 | 563 | plt.plot(x, y); # line plot 564 | 565 | 566 | # In[7]: 567 | 568 | 569 | plt.scatter(x, y); #scatter plot 570 | 571 | 572 | # In[16]: 573 | 574 | 575 | np.cov(x, y) 576 | 577 | 578 | # In[8]: 579 | 580 | 581 | # method seaborn 582 | 583 | 584 | # In[9]: 585 | 586 | 587 | import seaborn as sns 588 | 589 | 590 | # In[10]: 591 | 592 | 593 | sns.scatterplot(x, y); 594 | 595 | 596 | # In[14]: 597 | 598 | 599 | sns.jointplot(x, y, kind = 'scatter'); 600 | 601 | 602 | # In[15]: 603 | 604 | 605 | # method pandas 606 | 
607 | 608 | # In[11]: 609 | 610 | 611 | import pandas as pd 612 | 613 | 614 | # In[15]: 615 | 616 | 617 | lst = list(zip(x, y)) # materialize the zip; printing a bare iterator would exhaust it and leave an empty DataFrame 618 | print(lst) 619 | df = pd.DataFrame(lst, columns = ['x', 'y']) 620 | 621 | 622 | # In[18]: 623 | 624 | 625 | df.head() 626 | 627 | 628 | # In[21]: 629 | 630 | 631 | df.plot.scatter('x', 'y'); 632 | 633 | 634 | # In[22]: 635 | 636 | 637 | # method changing linestyle in line plot 638 | 639 | 640 | # In[23]: 641 | 642 | 643 | plt.plot(x, y); 644 | 645 | 646 | # In[24]: 647 | 648 | 649 | plt.plot(x, y, marker = 'o', linestyle = ' '); 650 | 651 | 652 | # In[28]: 653 | 654 | 655 | # boxplot 656 | 657 | 658 | # In[ ]: 659 | 660 | 661 | # boxplot with matplotlib 662 | 663 | 664 | # In[26]: 665 | 666 | 667 | x 668 | 669 | 670 | # In[27]: 671 | 672 | 673 | plt.boxplot(x); 674 | 675 | 676 | # In[30]: 677 | 678 | 679 | arr1 = np.random.randint(100, 200, 100) 680 | 681 | 682 | # In[31]: 683 | 684 | 685 | arr1 686 | 687 | 688 | # In[34]: 689 | 690 | 691 | plt.boxplot(arr1); 692 | 693 | 694 | # In[36]: 695 | 696 | 697 | arr1=np.random.randint(100,200,100) 698 | arr2=np.random.randint(1,50,5) 699 | arr3=np.random.randint(300,350,5) 700 | arr=np.append(arr1, arr2) 701 | arr=np.append(arr, arr3) 702 | plt.boxplot(arr); 703 | 704 | 705 | # In[37]: 706 | 707 | 708 | min(arr) 709 | 710 | 711 | # In[38]: 712 | 713 | 714 | max(arr) 715 | 716 | 717 | # In[39]: 718 | 719 | 720 | from scipy import stats 721 | 722 | 723 | # In[40]: 724 | 725 | 726 | stats.iqr(arr) 727 | 728 | 729 | # In[41]: 730 | 731 | 732 | np.percentile(arr, 25) 733 | 734 | 735 | # In[42]: 736 | 737 | 738 | np.percentile(arr, 75) 739 | 740 | 741 | # In[43]: 742 | 743 | 744 | np.median(arr) 745 | 746 | 747 | # In[45]: 748 | 749 | 750 | sns.boxplot(arr, orient = 'v'); 751 | 752 | 753 | # In[17]: 754 | 755 | 756 | # corr and cov 757 | 758 | 759 | # In[4]: 760 | 761 | 762 | import numpy as np 763 | import matplotlib.pyplot as plt 764 | import seaborn as sns 765 | 766 | 767 | # In[2]: 768 | 769 | 770 | temp=[93,84,82,78,98,70] 771 | number_of_people=[13,10, 11, 8, 15, 9] 772 | 773 | 774 | # In[3]: 775 | 776 | 777 | np.cov(temp, number_of_people)[0, 1] 778 | 779 | 780 | # In[4]: 781 | 782 | 783 | np.corrcoef(temp, number_of_people)[0, 1] 784 | 785 | 786 | # In[7]: 787 | 788 | 789 | df = sns.load_dataset('tips') 790 | 791 | 792 | # In[8]: 793 | 794 | 795 | df.head() 796 | 797 | 798 | # In[9]: 799 | 800 | 801 | df.dtypes 802 | 803 | 804 | # In[10]: 805 | 806 | 807 | df.corr('pearson') 808 | 809 | 810 | # In[11]: 811 | 812 | 813 | df.corr() 814 | 815 | 816 | # In[15]: 817 | 818 | 819 | np.corrcoef(df['total_bill'], df['tip'])[0, 1] 820 | 821 | 822 | # In[16]: 823 | 824 | 825 | df['total_bill'].corr(df['tip']) 826 | 827 | 828 | # In[21]: 829 | 830 | 831 | sns.heatmap(df.corr(), annot = True, cmap = 'RdYlGn'); # the tips data was loaded as df above 832 | 833 | 834 | # In[24]: 835 | 836 | 837 | mpg = sns.load_dataset('mpg') 838 | 839 | 840 | # In[25]: 841 | 842 | 843 | sns.pairplot(mpg); 844 | 845 | -------------------------------------------------------------------------------- /Statistics/Statistics_2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Probability 5 | 6 | # In[35]: 7 | 8 | 9 | # Heads or Tails 10 | 11 | 12 | # In[36]: 13 | 14 | 15 | import random 16 | 17 | 18 | # In[37]: 19 | 20 | 21 | coin = ('H', 'T') 22 | 23 | 24 | # In[38]: 25 | 26 | 27 | random.choice(coin) 28 | 29 | 30 | # In[39]: 31 | 32 | 33 | for i in range(5): 34 | result = random.choice(coin)
35 | print(result) 36 | 37 | 38 | # In[42]: 39 | 40 | 41 | results = {'H' : 0, 'T' : 0} 42 | 43 | for i in range(10): 44 | results[random.choice(list(results.keys()))] += 1 45 | 46 | print('P(Heads):', results['H'] / sum(results.values())) 47 | print('P(Tails):', results['T'] / sum(results.values())) 48 | 49 | 50 | # In[41]: 51 | 52 | 53 | results = {'H' : 0, 'T' : 0} 54 | 55 | for i in range(100000): # law of large estimates 56 | results[random.choice(list(results.keys()))] += 1 57 | 58 | print('P(Heads):', results['H'] / sum(results.values())) 59 | print('P(Tails):', results['T'] / sum(results.values())) 60 | 61 | 62 | # In[43]: 63 | 64 | 65 | # Rolling 2 Dice 66 | 67 | 68 | # In[44]: 69 | 70 | 71 | import numpy as np 72 | import seaborn as sns 73 | import matplotlib.pyplot as plt 74 | 75 | 76 | # In[45]: 77 | 78 | 79 | np.random.seed(51) 80 | 81 | 82 | # In[46]: 83 | 84 | 85 | d1 = np.array([1, 2, 3, 4, 5, 6]) 86 | d2 = np.array([1, 2, 3, 4, 5, 6]) 87 | 88 | 89 | # In[60]: 90 | 91 | 92 | dice_1 = [] 93 | dice_2 = [] 94 | 95 | sums = [] 96 | 97 | for i in range(1000): 98 | dice_1.append(np.random.choice(d1)) 99 | dice_2.append(np.random.choice(d2)) 100 | sums.append(dice_1[i] + dice_2[i]) 101 | 102 | #print(dice_1) 103 | #print(dice_2) 104 | #print(sums) 105 | 106 | 107 | # In[70]: 108 | 109 | 110 | fig, (ax1, ax2) = plt.subplots(ncols = 2, sharey = True, figsize = (12, 4)) 111 | sns.countplot(dice_1, ax = ax1) 112 | sns.countplot(dice_2, ax = ax2) 113 | 114 | 115 | # In[68]: 116 | 117 | 118 | sns.countplot(sums) 119 | 120 | 121 | # # Combinatoric Generators 122 | 123 | # In[72]: 124 | 125 | 126 | # Product 127 | 128 | 129 | # In[73]: 130 | 131 | 132 | import itertools as it 133 | 134 | 135 | # In[84]: 136 | 137 | 138 | cp = list(it.product('HT', repeat = 3)) # possible outcomes of H or T (2 ** 3) 139 | 140 | 141 | # In[85]: 142 | 143 | 144 | len(cp) 145 | 146 | 147 | # In[86]: 148 | 149 | 150 | cp 151 | 152 | 153 | # In[87]: 154 | 155 | 156 | cp2 = list(it.product('123456', 'HT')) # 2 x 6 = 12 possible outcomes 157 | 158 | 159 | # In[88]: 160 | 161 | 162 | cp2 163 | 164 | 165 | # In[89]: 166 | 167 | 168 | len(cp2) 169 | 170 | 171 | # # Permutations 172 | 173 | # In[1]: 174 | 175 | 176 | import math 177 | 178 | 179 | # In[2]: 180 | 181 | 182 | math.factorial(4) 183 | 184 | 185 | # In[3]: 186 | 187 | 188 | def permutation(n, r): 189 | return math.factorial(n) / math.factorial(n - r) 190 | 191 | 192 | # In[4]: 193 | 194 | 195 | permutation(4, 2) 196 | 197 | 198 | # In[5]: 199 | 200 | 201 | permutation(8, 4) 202 | 203 | 204 | # In[105]: 205 | 206 | 207 | import itertools as it 208 | 209 | 210 | # In[102]: 211 | 212 | 213 | cp3 = list(it.permutations('GRYB', 2)) # pick 2 colors of 4 colors (order is important) 214 | 215 | 216 | # In[103]: 217 | 218 | 219 | len(cp3) 220 | 221 | 222 | # In[104]: 223 | 224 | 225 | cp3 # sequence is important 226 | 227 | 228 | # # Combinations 229 | 230 | # In[6]: 231 | 232 | 233 | import itertools as it 234 | 235 | 236 | # In[106]: 237 | 238 | 239 | cp4 = list(it.combinations('GRYB', 2)) # pick 2 colors of 4 colors (order is not important) 240 | 241 | 242 | # In[107]: 243 | 244 | 245 | cp4 246 | 247 | 248 | # In[108]: 249 | 250 | 251 | len(cp4) 252 | 253 | 254 | # In[7]: 255 | 256 | 257 | def combination(n, r): 258 | return math.factorial(n) / (math.factorial(n - r) * math.factorial(r)) 259 | 260 | 261 | # In[8]: 262 | 263 | 264 | combination(4, 2) 265 | 266 | 267 | # In[9]: 268 | 269 | 270 | combination(20, 11) 271 | 272 | 273 | # In[119]: 274 | 275 | 276 | 
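# with replacement an item may also pair with itself, so the count is
# C(n + r - 1, r) = C(5, 2) = 10 for n = 4 colors taken r = 2 at a time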
cp5 = list(it.combinations_with_replacement('GRYB', 2)) 277 | 278 | 279 | # In[120]: 280 | 281 | 282 | cp5 283 | 284 | 285 | # In[121]: 286 | 287 | 288 | len(cp5) 289 | 290 | 291 | # In[ ]: 292 | 293 | 294 | 295 | 296 | -------------------------------------------------------------------------------- /Statistics/Statistics_3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Binomial Distribution 5 | 6 | # In[1]: 7 | 8 | 9 | import numpy as np 10 | from scipy import stats 11 | import matplotlib.pyplot as plt 12 | 13 | 14 | # In[34]: 15 | 16 | 17 | (n, p) = (2, 0.5) 18 | 19 | 20 | # In[35]: 21 | 22 | 23 | stats.binom(n, p) 24 | 25 | 26 | # In[36]: 27 | 28 | 29 | binomDist = stats.binom(n, p) 30 | 31 | 32 | # In[37]: 33 | 34 | 35 | binomDist.args 36 | 37 | 38 | # In[38]: 39 | 40 | 41 | binomDist.pmf(0) # it gives P(x=0) in n = 2, p = 0.5 42 | 43 | 44 | # In[39]: 45 | 46 | 47 | dist = [] 48 | 49 | print('r\tp(r)') 50 | for i in range(n + 1): 51 | dist.append(binomDist.pmf(i)) 52 | print(str(i) + '\t' + str(binomDist.pmf(i))) 53 | 54 | 55 | # In[12]: 56 | 57 | 58 | binomDist.pmf(2) 59 | 60 | 61 | # In[13]: 62 | 63 | 64 | binomDist.pmf(3) # because n = 2 65 | 66 | 67 | # In[21]: 68 | 69 | 70 | # other example 71 | 72 | 73 | # In[41]: 74 | 75 | 76 | (n, p) = (10, 0.2) 77 | 78 | 79 | # In[42]: 80 | 81 | 82 | binomDist = stats.binom(n, p) 83 | 84 | 85 | # In[43]: 86 | 87 | 88 | binomDist.args 89 | 90 | 91 | # In[44]: 92 | 93 | 94 | dist = [] 95 | 96 | print('r\tp(r)') 97 | for i in range(n + 1): 98 | dist.append(binomDist.pmf(i)) 99 | print(str(i) + '\t' + str(binomDist.pmf(i))) 100 | 101 | 102 | # In[45]: 103 | 104 | 105 | binomDist.pmf(7) 106 | 107 | 108 | # In[46]: 109 | 110 | 111 | binomDist.cdf(1) # pmf(0) + pmf(1) 112 | 113 | 114 | # In[47]: 115 | 116 | 117 | plt.bar(list(range(n + 1)), dist) 118 | 119 | 120 | # In[48]: 121 | 122 | 123 | mean, var = binomDist.stats() 124 | 125 | 126 | # In[49]: 127 | 128 | 129 | print('mean = ' + str(mean)) 130 | 131 | 132 | # print('var = ' + str(var)) 133 | 134 | # In[53]: 135 | 136 | 137 | binomDist.stats() # mean and var 138 | 139 | 140 | # In[54]: 141 | 142 | 143 | binomDist.median() 144 | 145 | 146 | # In[56]: 147 | 148 | 149 | binomDist.std() 150 | 151 | 152 | # In[72]: 153 | 154 | 155 | binomDist.rvs(100) 156 | 157 | 158 | # In[75]: 159 | 160 | 161 | stats.binom.cdf(2, 10, 0.2) 162 | 163 | 164 | # In[76]: 165 | 166 | 167 | binomDist.cdf(2) 168 | 169 | 170 | # ### Exercise 171 | # There was a probability of 0.8 success in any attempt to make a call. 172 | # Calculate the probability of having 7 successes in 10 attempts. 173 | 174 | # In[92]: 175 | 176 | 177 | stats.binom.pmf(7, 10, 0.8) 178 | 179 | 180 | # ### Exercise 181 | # A (blindfolded) marksman finds that on the average he hits the target 4 times out of 5. If he fires 4 shots, what is the probability of 182 | # (a) more than 2 hits? 183 | # (b) at least 3 misses? 184 | 185 | # In[ ]: 186 | 187 | 188 | 189 | 190 | 191 | # # Poisson Distribution 192 | 193 | # In[94]: 194 | 195 | 196 | stats.poisson.pmf(5, 6) # avg = 6, x = 5 197 | 198 | 199 | # In[95]: 200 | 201 | 202 | stats.poisson.cdf(5, 6) 203 | 204 | 205 | # ### Exercise 206 | # A bank is interested in studying the number of people who use the ATM located outside its office late at night. 207 | # On average, 1.6 customers walk up to the ATM during any 10 minute interval between 9pm and midnight. 208 | # What is lambda λ for this problem? 
209 | # What is the probability of exactly 3 customers using th ATM during any 10 minute interval? 210 | # What is the probability of 3 or fewer people? 211 | 212 | # In[98]: 213 | 214 | 215 | avg = 1.6 216 | x = 3 217 | 218 | 219 | # In[99]: 220 | 221 | 222 | stats.poisson.pmf(3, 1.6) 223 | 224 | 225 | # In[100]: 226 | 227 | 228 | stats.poisson.cdf(3, 1.6) 229 | 230 | 231 | # In[101]: 232 | 233 | 234 | poissonDist = stats.poisson(avg) 235 | 236 | 237 | # In[102]: 238 | 239 | 240 | dist = [] 241 | 242 | print('r\tp(r)') 243 | for i in range(10): 244 | dist.append(poissonDist.pmf(i)) 245 | print(str(i) + '\t' + str(poissonDist.pmf(i))) 246 | 247 | 248 | # ### Exercise 249 | # The Indiana Department of Transportation is concerned about the number of deer being struck by cars between Martinsville and Bloomington. They note the number of deer carcasses and other deer-related accidents over a 1-month period in a 2-mile intervals. 250 | # What is the probability of zero deer strike incidents during any 2-mile interval between Martinsville and Bloomington? 251 | # 0.08 strikes per/day 252 | 253 | # In[104]: 254 | 255 | 256 | stats.poisson.pmf(0, 0.08*30) 257 | 258 | 259 | # # Bernoulli Distribution 260 | 261 | # In[116]: 262 | 263 | 264 | p = 0.3 265 | 266 | 267 | # In[117]: 268 | 269 | 270 | bernDist = stats.bernoulli(p) 271 | 272 | 273 | # In[118]: 274 | 275 | 276 | bernDist.pmf(0) 277 | 278 | 279 | # In[119]: 280 | 281 | 282 | bernDist.pmf(1) 283 | 284 | 285 | # In[120]: 286 | 287 | 288 | bernDist.pmf(2) # because single trial. there is no other option. 289 | 290 | 291 | # In[121]: 292 | 293 | 294 | dist = [] 295 | 296 | print('r\tp(r)') 297 | for i in range(2): 298 | dist.append(bernDist.pmf(i)) 299 | print(str(i) + '\t' + str(bernDist.pmf(i))) 300 | 301 | 302 | # In[122]: 303 | 304 | 305 | plt.bar(list(range(2)), dist) 306 | plt.xticks(list(range(2)), ('0', '1')) 307 | plt.show() 308 | 309 | 310 | # In[123]: 311 | 312 | 313 | mean, var = bernDist.stats() 314 | 315 | 316 | # In[125]: 317 | 318 | 319 | str(mean) 320 | 321 | 322 | # In[126]: 323 | 324 | 325 | str(var) 326 | 327 | 328 | # In[127]: 329 | 330 | 331 | bernDist.median() 332 | 333 | 334 | # In[129]: 335 | 336 | 337 | bernDist.std() 338 | 339 | 340 | # In[ ]: 341 | 342 | 343 | 344 | 345 | -------------------------------------------------------------------------------- /Statistics/Statistics_4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import numpy as np 8 | from scipy import stats 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | # # Uniform Distribution 13 | 14 | # In[2]: 15 | 16 | 17 | (a, b) = (0, 30) 18 | 19 | 20 | # In[4]: 21 | 22 | 23 | unifDist = stats.uniform(0, 30) 24 | 25 | 26 | # In[10]: 27 | 28 | 29 | unifDist.args 30 | 31 | 32 | # In[6]: 33 | 34 | 35 | unifDist.pdf(10) 36 | 37 | 38 | # In[14]: 39 | 40 | 41 | unifDist.pdf(15) 42 | 43 | 44 | # In[15]: 45 | 46 | 47 | unifDist.cdf(15) 48 | 49 | 50 | # In[12]: 51 | 52 | 53 | unifDist.cdf(10) 54 | 55 | 56 | # In[20]: 57 | 58 | 59 | a = unifDist.rvs(1000) 60 | 61 | b = [] 62 | for i in a: 63 | b.append(unifDist.pdf(i)) 64 | 65 | 66 | # In[23]: 67 | 68 | 69 | plt.bar(a, b) 70 | 71 | 72 | # In[24]: 73 | 74 | 75 | mean, var = unifDist.stats() 76 | 77 | 78 | # In[25]: 79 | 80 | 81 | str(mean) 82 | 83 | 84 | # In[26]: 85 | 86 | 87 | str(var) 88 | 89 | 90 | # In[27]: 91 | 92 | 93 | unifDist.median() 94 | 95 | 96 | # In[28]: 97 | 98 | 99 | unifDist.std() 100 | 101 | 102 
| # # Normal Distribution 103 | 104 | # In[30]: 105 | 106 | 107 | (mu, sigma) = (0, 1) 108 | 109 | 110 | # In[31]: 111 | 112 | 113 | normDist = stats.norm(mu, sigma) 114 | 115 | 116 | # In[33]: 117 | 118 | 119 | normDist.args 120 | 121 | 122 | # In[55]: 123 | 124 | 125 | 1 - normDist.cdf(2) # P(Z>2) 126 | 127 | 128 | # In[52]: 129 | 130 | 131 | normDist.pdf(4) 132 | 133 | 134 | # In[62]: 135 | 136 | 137 | x = np.linspace(-5, 5, 1000) 138 | 139 | y = normDist.pdf(x) 140 | 141 | 142 | # In[63]: 143 | 144 | 145 | plt.plot(x, y) 146 | 147 | 148 | # In[64]: 149 | 150 | 151 | mean, var, skew, kurt = normDist.stats(moments = 'mvsk') 152 | 153 | 154 | # In[65]: 155 | 156 | 157 | str(mean) 158 | 159 | 160 | # In[66]: 161 | 162 | 163 | str(var) 164 | 165 | 166 | # In[67]: 167 | 168 | 169 | str(skew) 170 | 171 | 172 | # In[68]: 173 | 174 | 175 | str(kurt) 176 | 177 | 178 | # In[70]: 179 | 180 | 181 | normDist.median() 182 | 183 | 184 | # In[71]: 185 | 186 | 187 | normDist.std() 188 | 189 | 190 | # # t distribution 191 | 192 | # In[72]: 193 | 194 | 195 | stats.t.cdf(-0.7745966, df = 14) 196 | 197 | 198 | # In[79]: 199 | 200 | 201 | stats.t.cdf(0, df = 14) 202 | 203 | 204 | # In[74]: 205 | 206 | 207 | tDist = stats.t(df = 15) 208 | 209 | x = np.linspace(-5, 5, 100) 210 | 211 | y = tDist.pdf(x) 212 | 213 | 214 | # In[75]: 215 | 216 | 217 | plt.plot(x, y) 218 | 219 | 220 | # In[ ]: 221 | 222 | 223 | 224 | 225 | -------------------------------------------------------------------------------- /Statistics/Statistics_5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # CENTRAL LIMIT THEOREM 5 | 6 | # ## Sample Mean for a Uniform Distribution 7 | 8 | # In[1]: 9 | 10 | 11 | import random 12 | import math 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | from scipy import stats 16 | random.seed(54312) 17 | 18 | 19 | # In[10]: 20 | 21 | 22 | sample_size = 30 23 | 24 | sim_num = 10000 25 | 26 | 27 | # In[11]: 28 | 29 | 30 | mean_list = [] 31 | 32 | for i in range(sim_num): 33 | sample_list = [] 34 | for i in range(sample_size): 35 | sample_list.append(random.randint(0, 100)) 36 | sample_mean = sum(sample_list)/sample_size 37 | mean_list.append(sample_mean) 38 | 39 | 40 | # In[12]: 41 | 42 | 43 | mean_list 44 | 45 | 46 | # In[13]: 47 | 48 | 49 | sum(mean_list)/len(mean_list) 50 | 51 | 52 | # In[19]: 53 | 54 | 55 | plt.hist(mean_list, bins=100, density = True, color = 'r'); 56 | plt.grid() 57 | mu = 50 58 | sigma = math.sqrt((100**2)/12) / (math.sqrt(sample_size)) 59 | x = np.linspace(mu - 4* sigma, mu + 4 * sigma) 60 | plt.plot(x, stats.norm.pdf(x, mu, sigma)) 61 | 62 | 63 | # ## Sample Mean for a Exponential Distribution 64 | 65 | # In[32]: 66 | 67 | 68 | sample_size = 30 69 | 70 | sim_num = 10000 71 | 72 | 73 | # In[33]: 74 | 75 | 76 | mean_list = [] 77 | 78 | for i in range(sim_num): 79 | sample_list = [] 80 | for i in range(sample_size): 81 | sample_list.append(np.random.exponential(1)) 82 | sample_mean = sum(sample_list)/sample_size 83 | mean_list.append(sample_mean) 84 | 85 | 86 | # In[34]: 87 | 88 | 89 | plt.hist(mean_list, bins=100, density = True, color = 'r'); 90 | plt.grid() 91 | mu = 1 92 | sigma = 1 / (math.sqrt(sample_size)) 93 | x = np.linspace(mu - 4* sigma, mu + 4 * sigma) 94 | plt.plot(x, stats.norm.pdf(x, mu, sigma)) 95 | 96 | 97 | # ## CONFIDENCE INTERVAL 98 | 99 | # In[39]: 100 | 101 | 102 | import random 103 | import math 104 | import matplotlib.pyplot as plt 105 | import numpy as np 
106 | from scipy import stats 107 | random.seed(39809) 108 | 109 | 110 | # In[40]: 111 | 112 | 113 | sample_size = 30 114 | sample_list = [] 115 | 116 | for i in range(30): 117 | sample_list.append(random.randint(0, 10)) 118 | 119 | 120 | # In[41]: 121 | 122 | 123 | sample_mean = np.mean(sample_list) 124 | 125 | 126 | # In[42]: 127 | 128 | 129 | sample_mean 130 | 131 | 132 | # In[43]: 133 | 134 | 135 | n = len(sample_list) 136 | 137 | 138 | # In[44]: 139 | 140 | 141 | cl = 0.95 142 | 143 | std = 1 144 | 145 | 146 | # In[51]: 147 | 148 | 149 | critic_value = stats.norm.ppf(((1-0.95)/2) + 0.95) 150 | 151 | 152 | # In[53]: 153 | 154 | 155 | (((1-0.95)/2) + 0.95) # z table ppf value for 0.95 at t table 156 | 157 | 158 | # In[52]: 159 | 160 | 161 | critic_value 162 | 163 | 164 | # In[54]: 165 | 166 | 167 | lower_limit = sample_mean - (critic_value * (std/math.sqrt(n))) 168 | 169 | 170 | # In[56]: 171 | 172 | 173 | upper_limit = sample_mean + (critic_value * (std/math.sqrt(n))) 174 | 175 | 176 | # In[62]: 177 | 178 | 179 | print(f'Your {cl} z confidence interval is ({lower_limit:.2f}, {upper_limit:.2f})') 180 | 181 | 182 | # Exercise 183 | 184 | # In[63]: 185 | 186 | 187 | sample_list = [2, 3, 5, 6, 9] 188 | 189 | 190 | # In[65]: 191 | 192 | 193 | sample_mean = np.mean(sample_list) 194 | 195 | sample_mean 196 | 197 | 198 | # In[66]: 199 | 200 | 201 | std = 2.5 202 | 203 | 204 | # In[67]: 205 | 206 | 207 | n = len(sample_list) 208 | 209 | 210 | # In[68]: 211 | 212 | 213 | cl = 0.95 214 | 215 | 216 | # In[71]: 217 | 218 | 219 | critic_value = stats.norm.ppf(((1 - cl)/2) + cl) 220 | 221 | 222 | # In[72]: 223 | 224 | 225 | critic_value 226 | 227 | 228 | # In[73]: 229 | 230 | 231 | lower_limit = sample_mean - (critic_value * (std/math.sqrt(n))) 232 | 233 | 234 | # In[74]: 235 | 236 | 237 | upper_limit = sample_mean + (critic_value * (std/math.sqrt(n))) 238 | 239 | 240 | # In[75]: 241 | 242 | 243 | print(f'Your {cl} z confidence interval is ({lower_limit:.2f}, {upper_limit:.2f})') 244 | 245 | 246 | # In[84]: 247 | 248 | 249 | stats.norm.interval(cl, loc = sample_mean, scale = std/math.sqrt(n)) # using scipy 250 | 251 | 252 | # In[76]: 253 | 254 | 255 | critic_value = stats.norm.ppf(((1 - 0.99)/2) + 0.99) # interval gets larger beacuse CL gets higher 256 | 257 | 258 | # In[77]: 259 | 260 | 261 | lower_limit = sample_mean - (critic_value * (std/math.sqrt(n))) 262 | 263 | 264 | # In[78]: 265 | 266 | 267 | upper_limit = sample_mean + (critic_value * (std/math.sqrt(n))) 268 | 269 | 270 | # In[79]: 271 | 272 | 273 | print(f'Your {cl} z confidence interval is ({lower_limit:.2f}, {upper_limit:.2f})') # interval gets larger 274 | 275 | 276 | # In[85]: 277 | 278 | 279 | stats.norm.interval(0.99, loc = sample_mean, scale = std/math.sqrt(n)) 280 | 281 | 282 | # ## USING SCIPY 283 | 284 | # In[87]: 285 | 286 | 287 | stats.norm.interval(cl, loc = sample_mean, scale = std/math.sqrt(n)) # using scipy 288 | 289 | -------------------------------------------------------------------------------- /Statistics/Statistics_6.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from scipy import stats 10 | import math 11 | 12 | 13 | # ## One Sample t Test 14 | 15 | # #### Analyze if college students get 7.2 hours of sleep, on average, based on a sample of students, alpha = 0.05 16 | 17 | # In[2]: 18 | 19 | 20 | df = pd.read_csv('students.csv') 21 | 22 | 23 | # In[3]: 24 | 25 
| 26 | df.head() 27 | 28 | 29 | # In[4]: 30 | 31 | 32 | # H0: mu = 7.2 33 | # Ha: mu != 7.2 34 | # it is two tailed test 35 | 36 | 37 | # In[5]: 38 | 39 | 40 | onesample = stats.ttest_1samp(df['Sleep'], 7.2) # sample, pop_mean 41 | 42 | 43 | # In[6]: 44 | 45 | 46 | onesample.statistic 47 | 48 | 49 | # In[7]: 50 | 51 | 52 | onesample.pvalue # fail to reject H0 because p_value is higher than alpha (significance level) 53 | 54 | 55 | # In[8]: 56 | 57 | 58 | alpha = 0.05 59 | p_value = onesample.pvalue 60 | if p_value variances unknown and equal), small size 249 | 250 | 251 | # In[37]: 252 | 253 | 254 | ind_test_w_2gr = stats.ttest_ind(df['x1'], df['x2'], equal_var = True) 255 | 256 | 257 | # In[38]: 258 | 259 | 260 | ind_test_w_2gr.statistic 261 | 262 | 263 | # In[39]: 264 | 265 | 266 | p_value = ind_test_w_2gr.pvalue 267 | 268 | 269 | # In[40]: 270 | 271 | 272 | alpha = 0.05 273 | 274 | if p_value= 4 165 | 166 | 167 | # In[57]: 168 | 169 | 170 | stats.poisson.cdf(3, avg) # x < 4 171 | 172 | 173 | # In[59]: 174 | 175 | 176 | (1 - stats.poisson.cdf(1, avg)) - (1 - stats.poisson.cdf(3, avg)) # P(x ≥ 4 | x ≥ 2 ) 177 | 178 | 179 | # ## 4 180 | # Consider binomial experiment for n = 20, p = .05. 181 | 182 | # In[61]: 183 | 184 | 185 | n = 20 186 | p = 0.05 187 | 188 | 189 | # In[62]: 190 | 191 | 192 | binomDist = stats.binom(n, p) 193 | 194 | 195 | # ## 4.1 196 | # Calculate the binomial probabilities for Y = 0, 1, 2, 3, and 4. 197 | 198 | # In[72]: 199 | 200 | 201 | binomDist.pmf(0) 202 | 203 | 204 | # In[68]: 205 | 206 | 207 | binomDist.pmf(1) 208 | 209 | 210 | # In[69]: 211 | 212 | 213 | binomDist.pmf(2) 214 | 215 | 216 | # In[70]: 217 | 218 | 219 | binomDist.pmf(3) 220 | 221 | 222 | # In[71]: 223 | 224 | 225 | binomDist.pmf(4) 226 | 227 | 228 | # ## 4.1 229 | # Calculate the same probabilities by using the Poisson approximation with λ = np. Compare. 230 | 231 | # In[73]: 232 | 233 | 234 | avg = n*p # avg = 1 235 | 236 | 237 | # In[74]: 238 | 239 | 240 | stats.poisson.pmf(0, avg) 241 | 242 | 243 | # In[75]: 244 | 245 | 246 | stats.poisson.pmf(1, avg) 247 | 248 | 249 | # In[76]: 250 | 251 | 252 | stats.poisson.pmf(2, avg) 253 | 254 | 255 | # In[77]: 256 | 257 | 258 | stats.poisson.pmf(3, avg) 259 | 260 | 261 | # In[78]: 262 | 263 | 264 | stats.poisson.pmf(4, avg) 265 | 266 | 267 | # In[ ]: 268 | 269 | 270 | 271 | 272 | -------------------------------------------------------------------------------- /Statistics/Statistics_Exercise_3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "from scipy import stats\n", 11 | "import math" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## 1 \n", 19 | " Create a Standard Normal Distribution Table using Python scipy.stats. 
" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "(mu, sigma) = (0, 1)\n", 29 | "normDist = stats.norm(mu, sigma)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 64, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "z = np.round(np.linspace(0, 3, 310), 2) * -1" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 65, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "z = list(z)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 84, 53 | "metadata": { 54 | "scrolled": true 55 | }, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "0.5 0.496 0.492 0.488 0.484 0.4801 0.4761 0.4721 0.4681 0.4641 0.4602 \n", 62 | "\n", 63 | "0.4562 0.4522 0.4483 0.4443 0.4404 0.4364 0.4325 0.4325 0.4286 0.4247 \n", 64 | "\n", 65 | "0.4207 0.4168 0.4129 0.409 0.4052 0.4013 0.3974 0.3936 0.3897 0.3859 \n", 66 | "\n", 67 | "0.3821 0.3783 0.3745 0.3707 0.3669 0.3632 0.3594 0.3557 0.352 0.3483 \n", 68 | "\n", 69 | "0.3446 0.3409 0.3372 0.3336 0.33 0.3264 0.3228 0.3192 0.3156 0.3121 \n", 70 | "\n", 71 | "0.3085 0.3085 0.305 0.3015 0.2981 0.2946 0.2912 0.2877 0.2843 0.281 \n", 72 | "\n", 73 | "0.2776 0.2743 0.2709 0.2676 0.2643 0.2611 0.2578 0.2546 0.2514 0.2483 \n", 74 | "\n", 75 | "0.2451 0.242 0.2389 0.2358 0.2327 0.2296 0.2266 0.2236 0.2206 0.2177 \n", 76 | "\n", 77 | "0.2148 0.2119 0.209 0.2061 0.2033 0.2033 0.2005 0.1977 0.1949 0.1922 \n", 78 | "\n", 79 | "0.1894 0.1867 0.1841 0.1814 0.1788 0.1762 0.1736 0.1711 0.1685 0.166 \n", 80 | "\n", 81 | "0.1635 0.1611 0.1587 0.1562 0.1539 0.1515 0.1492 0.1469 0.1446 0.1423 \n", 82 | "\n", 83 | "0.1401 0.1379 0.1357 0.1335 0.1314 0.1292 0.1271 0.1251 0.123 0.121 \n", 84 | "\n", 85 | "0.121 0.119 0.117 0.1151 0.1131 0.1112 0.1093 0.1075 0.1056 0.1038 \n", 86 | "\n", 87 | "0.102 0.1003 0.0985 0.0968 0.0951 0.0934 0.0918 0.0901 0.0885 0.0869 \n", 88 | "\n", 89 | "0.0853 0.0838 0.0823 0.0808 0.0793 0.0778 0.0764 0.0749 0.0735 0.0721 \n", 90 | "\n", 91 | "0.0708 0.0694 0.0681 0.0668 0.0668 0.0655 0.0643 0.063 0.0618 0.0606 \n", 92 | "\n", 93 | "0.0594 0.0582 0.0571 0.0559 0.0548 0.0537 0.0526 0.0516 0.0505 0.0495 \n", 94 | "\n", 95 | "0.0485 0.0475 0.0465 0.0455 0.0446 0.0436 0.0427 0.0418 0.0409 0.0401 \n", 96 | "\n", 97 | "0.0392 0.0384 0.0375 0.0367 0.0359 0.0351 0.0344 0.0336 0.0336 0.0329 \n", 98 | "\n", 99 | "0.0322 0.0314 0.0307 0.0301 0.0294 0.0287 0.0281 0.0274 0.0268 0.0262 \n", 100 | "\n", 101 | "0.0256 0.025 0.0244 0.0239 0.0233 0.0228 0.0222 0.0217 0.0212 0.0207 \n", 102 | "\n", 103 | "0.0202 0.0197 0.0192 0.0188 0.0183 0.0179 0.0174 0.017 0.0166 0.0162 \n", 104 | "\n", 105 | "0.0158 0.0154 0.015 0.015 0.0146 0.0143 0.0139 0.0136 0.0132 0.0129 \n", 106 | "\n", 107 | "0.0125 0.0122 0.0119 0.0116 0.0113 0.011 0.0107 0.0104 0.0102 0.0099 \n", 108 | "\n", 109 | "0.0096 0.0094 0.0091 0.0089 0.0087 0.0084 0.0082 0.008 0.0078 0.0075 \n", 110 | "\n", 111 | "0.0073 0.0071 0.0069 0.0068 0.0066 0.0064 0.0062 0.0062 0.006 0.0059 \n", 112 | "\n", 113 | "0.0057 0.0055 0.0054 0.0052 0.0051 0.0049 0.0048 0.0047 0.0045 0.0044 \n", 114 | "\n", 115 | "0.0043 0.0041 0.004 0.0039 0.0038 0.0037 0.0036 0.0035 0.0034 0.0033 \n", 116 | "\n", 117 | "0.0032 0.0031 0.003 0.0029 0.0028 0.0027 0.0026 0.0026 0.0025 0.0024 \n", 118 | "\n", 119 | "0.0023 0.0023 0.0023 0.0022 0.0021 0.0021 0.002 0.0019 0.0019 0.0018 \n", 120 | "\n", 121 | "0.0018 0.0017 0.0016 0.0016 
0.0015 0.0015 0.0014 0.0014 0.0013 " 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "for(ii) in range(310):\n", 127 | " a = np.round(normDist.cdf(z[ii]), 4)\n", 128 | " print(a, end = ' ')\n", 129 | " if ii % 10 == 0 and ii != 0:\n", 130 | " print('\\n')" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## 2\n", 138 | "The cycle time for trucks hauling concrete to a highway construction site is uniformly distributed over the interval 50 to 70 minutes. What is the probability that the cycle time exceeds 65 minutes if it is known that the cycle time exceeds 55 minutes?" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 85, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "(a, b) = (50, 70)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 86, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "unifDist = stats.uniform(50, 70)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 94, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "# P(X > 65 | X > 55)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 98, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "0.21428571428571427" 177 | ] 178 | }, 179 | "execution_count": 98, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "unifDist.cdf(65)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 99, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "0.07142857142857142" 197 | ] 198 | }, 199 | "execution_count": 99, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "unifDist.cdf(55)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 100, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/plain": [ 216 | "0.3333333333333333" 217 | ] 218 | }, 219 | "execution_count": 100, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "unifDist.cdf(55) / unifDist.cdf(65)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## 3.1\n", 233 | "The width of bolts of fabric is normally distributed with mean 950 mm (millimeters) and standard deviation 10 mm.\n", 234 | "What is the probability that a randomly chosen bolt has a width of between 947 and 958mm?" 
235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 102, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "(mu, sigma) = (950, 10)\n", 244 | "normDist = stats.norm(mu, sigma)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 105, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "0.406056023605556" 256 | ] 257 | }, 258 | "execution_count": 105, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "normDist.cdf(958) - normDist.cdf(947) " 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "## 3.2\n", 272 | "The width of bolts of fabric is normally distributed with mean 950 mm (millimeters) and standard deviation 10 mm.\n", 273 | "What is the appropriate value for C such that a randomly chosen bolt has a width less than C with probability .8531?" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 109, 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "data": { 283 | "text/plain": [ 284 | "960.4982190962642" 285 | ] 286 | }, 287 | "execution_count": 109, 288 | "metadata": {}, 289 | "output_type": "execute_result" 290 | } 291 | ], 292 | "source": [ 293 | "normDist.ppf(0.8531)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "## 4\n", 301 | "The school board administered an IQ test to 20 randomly selected teachers. They found that the average IQ score was 114 with a standard deviation of 10. Assume that the cumulative probability is 0.90. What population mean would have produced this sample result?" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 113, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "# n = 20, df = 19, mu_sample = 114, std = 10" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 114, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/plain": [ 321 | "1.3277282090267986" 322 | ] 323 | }, 324 | "execution_count": 114, 325 | "metadata": {}, 326 | "output_type": "execute_result" 327 | } 328 | ], 329 | "source": [ 330 | "stats.t.ppf(0.90, 19)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 116, 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/plain": [ 341 | "111.03110946897203" 342 | ] 343 | }, 344 | "execution_count": 116, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "114 - (stats.t.ppf(0.90, 19) * (10/math.sqrt(20))) # = mu_population" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 119, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "1.3277282090267963" 362 | ] 363 | }, 364 | "execution_count": 119, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "(114 - 111.03110946897203)/(10/math.sqrt(20))" 371 | ] 372 | } 373 | ], 374 | "metadata": { 375 | "kernelspec": { 376 | "display_name": "Python 3", 377 | "language": "python", 378 | "name": "python3" 379 | }, 380 | "language_info": { 381 | "codemirror_mode": { 382 | "name": "ipython", 383 | "version": 3 384 | }, 385 | "file_extension": ".py", 386 | "mimetype": "text/x-python", 387 | "name": "python", 388 | "nbconvert_exporter": "python", 389 | "pygments_lexer": "ipython3", 390 | "version": "3.7.3" 391 | } 392 
| }, 393 | "nbformat": 4, 394 | "nbformat_minor": 2 395 | } 396 | -------------------------------------------------------------------------------- /Statistics/Statistics_Exercise_3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import numpy as np 8 | from scipy import stats 9 | import math 10 | 11 | 12 | # ## 1 13 | # Create a Standard Normal Distribution Table using Python scipy.stats. 14 | 15 | # In[3]: 16 | 17 | 18 | (mu, sigma) = (0, 1) 19 | normDist = stats.norm(mu, sigma) 20 | 21 | 22 | # In[64]: 23 | 24 | 25 | z = np.round(np.linspace(0, 3, 310), 2) * -1 26 | 27 | 28 | # In[65]: 29 | 30 | 31 | z = list(z) 32 | 33 | 34 | # In[84]: 35 | 36 | 37 | for ii in range(310): 38 | a = np.round(normDist.cdf(z[ii]), 4) 39 | print(a, end = ' ') 40 | if ii % 10 == 0 and ii != 0: 41 | print('\n') 42 | 43 | 44 | # ## 2 45 | # The cycle time for trucks hauling concrete to a highway construction site is uniformly distributed over the interval 50 to 70 minutes. What is the probability that the cycle time exceeds 65 minutes if it is known that the cycle time exceeds 55 minutes? 46 | 47 | # In[85]: 48 | 49 | 50 | (a, b) = (50, 70) 51 | 52 | 53 | # In[86]: 54 | 55 | 56 | unifDist = stats.uniform(50, 20) # scipy takes (loc, scale), so this is U[50, 70] 57 | 58 | 59 | # In[94]: 60 | 61 | 62 | # P(X > 65 | X > 55) = P(X > 65) / P(X > 55) 63 | 64 | 65 | # In[98]: 66 | 67 | 68 | unifDist.sf(65) # P(X > 65) = 0.25 69 | 70 | 71 | # In[99]: 72 | 73 | 74 | unifDist.sf(55) # P(X > 55) = 0.75 75 | 76 | 77 | # In[100]: 78 | 79 | 80 | unifDist.sf(65) / unifDist.sf(55) # = 0.333... 81 | 82 | 83 | # ## 3.1 84 | # The width of bolts of fabric is normally distributed with mean 950 mm (millimeters) and standard deviation 10 mm. 85 | # What is the probability that a randomly chosen bolt has a width of between 947 and 958 mm? 86 | 87 | # In[102]: 88 | 89 | 90 | (mu, sigma) = (950, 10) 91 | normDist = stats.norm(mu, sigma) 92 | 93 | 94 | # In[105]: 95 | 96 | 97 | normDist.cdf(958) - normDist.cdf(947) 98 | 99 | 100 | # ## 3.2 101 | # The width of bolts of fabric is normally distributed with mean 950 mm (millimeters) and standard deviation 10 mm. 102 | # What is the appropriate value for C such that a randomly chosen bolt has a width less than C with probability .8531? 103 | 104 | # In[109]: 105 | 106 | 107 | normDist.ppf(0.8531) 108 | 109 | 110 | # ## 4 111 | # The school board administered an IQ test to 20 randomly selected teachers. They found that the average IQ score was 114 with a standard deviation of 10. Assume that the cumulative probability is 0.90. What population mean would have produced this sample result?
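# Plan for Exercise 4 before the cells below (a hedged sketch of the algebra,
# using only quantities given in the problem): start from the one-sample t relation
#     t = (xbar - mu) / (s / sqrt(n)),  with df = n - 1 = 19,
# take t = t_{0.90, 19} (the point with cumulative probability 0.90), and solve for mu:
#     mu = xbar - t_{0.90, 19} * s / sqrt(n) = 114 - 1.3277 * (10 / sqrt(20)) ~ 111.03
# The cells below compute exactly these pieces and then verify the round trip.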
112 | 113 | # In[113]: 114 | 115 | 116 | # n = 20, df = 19, mu_sample = 114, std = 10 117 | 118 | 119 | # In[114]: 120 | 121 | 122 | stats.t.ppf(0.90, 19) 123 | 124 | 125 | # In[116]: 126 | 127 | 128 | 114 - (stats.t.ppf(0.90, 19) * (10/math.sqrt(20))) # = mu_population 129 | 130 | 131 | # In[119]: 132 | 133 | 134 | (114 - 111.03110946897203)/(10/math.sqrt(20)) 135 | 136 | -------------------------------------------------------------------------------- /Statistics/Statistics_Exercise_4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import math\n", 11 | "from scipy import stats" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## 1\n", 19 | "\n", 20 | "Suppose scores on exams in statistics are normally distributed with an unknown population mean and a population standard deviation of 3 points. A random sample of 36 scores is taken and gives a sample mean (sample mean score) of 68. Find a confidence interval estimate for the population mean exam score (the mean score on all exams).\n", 21 | "\n", 22 | "Find a 90% confidence interval for the true (population) mean of statistics exam scores." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "sample_mean = 68\n", 32 | "n = 36\n", 33 | "std = 3\n", 34 | "cl = 0.90" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 4, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "std_e = std/math.sqrt(n)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 6, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "ci = stats.norm.interval(cl, sample_mean, std_e)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 10, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "Your 0.9 z confidence interval is (67.17757318652427, 68.82242681347573).\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "print('Your {} z confidence interval is {}.'.format(cl, ci))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "## 2\n", 77 | "\n", 78 | "What is the normal body temperature for healthy humans? A random sample of 130 healthy human body temperatures provided by Allen Shoemaker7 yielded 98.25 degrees and standard deviation 0.73 degrees. \n", 79 | "\n", 80 | "Give a 99% confidence interval for the average body temperature of healthy people." 
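# How these intervals are built (a sketch, assuming scipy's two-sided interval
# convention): stats.norm.interval(cl, loc, scale) is a thin wrapper around
#     xbar +/- z_{(1+cl)/2} * sigma / sqrt(n).
# Hand cross-check of problem 1 above:
import math
from scipy import stats
z = stats.norm.ppf(0.95)                 # 90% two-sided -> z ~ 1.6449
half_width = z * 3 / math.sqrt(36)       # sigma = 3, n = 36
print(68 - half_width, 68 + half_width)  # ~ (67.1776, 68.8224), matching the output above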
81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 13, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "sample_mean = 98.25\n", 90 | "n = 130\n", 91 | "std = 0.73\n", 92 | "cl = 0.99" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 14, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "std_e = std/math.sqrt(n)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 15, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "ci = stats.norm.interval(cl, sample_mean, std_e)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 16, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "Your 0.99 z confidence interval is (98.08508192246582, 98.41491807753418).\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "print('Your {} z confidence interval is {}.'.format(cl, ci))" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## 3\n", 135 | "\n", 136 | "The administrators for a hospital wished to estimate the average number of days required for inpatient treatment of patients between the ages of 25 and 34. A random sample of 500 hospital patients between these ages produced a mean and standard deviation equal to 5.4 and 3.1 days, respectively.\n", 137 | "Construct a 95% confidence interval for the mean length of stay for the population of patients from which the sample was drawn." 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 18, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "sample_mean = 5.4\n", 147 | "n = 500\n", 148 | "std = 3.1\n", 149 | "cl = 0.95" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 19, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "std_e = std/math.sqrt(n)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 20, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "ci = stats.norm.interval(cl, sample_mean, std_e)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 21, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "Your 0.95 z confidence interval is (5.12827801242126, 5.67172198757874).\n" 180 | ] 181 | } 182 | ], 183 | "source": [ 184 | "print('Your {} z confidence interval is {}.'.format(cl, ci))" 185 | ] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "Python 3", 191 | "language": "python", 192 | "name": "python3" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.7.3" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 2 209 | } 210 | -------------------------------------------------------------------------------- /Statistics/Statistics_Exercise_4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import numpy as np 8 | import math 9 | from scipy import stats 10 | 11 | 12 | # ## 1 13 | # 14 | # Suppose scores on exams in statistics are normally distributed with an unknown population mean 
and a population standard deviation of 3 points. A random sample of 36 scores is taken and gives a sample mean (sample mean score) of 68. Find a confidence interval estimate for the population mean exam score (the mean score on all exams). 15 | # 16 | # Find a 90% confidence interval for the true (population) mean of statistics exam scores. 17 | 18 | # In[3]: 19 | 20 | 21 | sample_mean = 68 22 | n = 36 23 | std = 3 24 | cl = 0.90 25 | 26 | 27 | # In[4]: 28 | 29 | 30 | std_e = std/math.sqrt(n) 31 | 32 | 33 | # In[6]: 34 | 35 | 36 | ci = stats.norm.interval(cl, sample_mean, std_e) 37 | 38 | 39 | # In[10]: 40 | 41 | 42 | print('Your {} z confidence interval is {}.'.format(cl, ci)) 43 | 44 | 45 | # ## 2 46 | # 47 | # What is the normal body temperature for healthy humans? A random sample of 130 healthy human body temperatures provided by Allen Shoemaker7 yielded 98.25 degrees and standard deviation 0.73 degrees. 48 | # 49 | # Give a 99% confidence interval for the average body temperature of healthy people. 50 | 51 | # In[13]: 52 | 53 | 54 | sample_mean = 98.25 55 | n = 130 56 | std = 0.73 57 | cl = 0.99 58 | 59 | 60 | # In[14]: 61 | 62 | 63 | std_e = std/math.sqrt(n) 64 | 65 | 66 | # In[15]: 67 | 68 | 69 | ci = stats.norm.interval(cl, sample_mean, std_e) 70 | 71 | 72 | # In[16]: 73 | 74 | 75 | print('Your {} z confidence interval is {}.'.format(cl, ci)) 76 | 77 | 78 | # ## 3 79 | # 80 | # The administrators for a hospital wished to estimate the average number of days required for inpatient treatment of patients between the ages of 25 and 34. A random sample of 500 hospital patients between these ages produced a mean and standard deviation equal to 5.4 and 3.1 days, respectively. 81 | # Construct a 95% confidence interval for the mean length of stay for the population of patients from which the sample was drawn. 82 | 83 | # In[18]: 84 | 85 | 86 | sample_mean = 5.4 87 | n = 500 88 | std = 3.1 89 | cl = 0.95 90 | 91 | 92 | # In[19]: 93 | 94 | 95 | std_e = std/math.sqrt(n) 96 | 97 | 98 | # In[20]: 99 | 100 | 101 | ci = stats.norm.interval(cl, sample_mean, std_e) 102 | 103 | 104 | # In[21]: 105 | 106 | 107 | print('Your {} z confidence interval is {}.'.format(cl, ci)) 108 | 109 | -------------------------------------------------------------------------------- /Statistics/Statistics_Exercise_5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Finding the Confidence Interval of Polling Figures" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "You are running a political campaign and decide to run 30 focus groups with about 10 people in each group. You get the results and want to report to your candidate the number of people who would vote for them in a typical 10-person group. Since there is some variability in each focus group, you decide that the most accurate way is to give a 95% z-confidence interval. You assume from past experience that the standard deviation is 2.89." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "1.Import the random Python package and set the seed to 39809. 
This will ensure that we get the same results every time we run the program:" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import random\n", 31 | "import math\n", 32 | "import numpy as np\n", 33 | "from scipy import stats\n", 34 | "random.seed(39809)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "2.Initialize our sample list and collect our samples from our focus groups. Use random.randint" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "sample_size = 30\n", 51 | "sample_list = []\n", 52 | "\n", 53 | "for i in range(30):\n", 54 | " sample_list.append(random.randint(0, 10))\n", 55 | "sample_mean = np.mean(sample_list)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "3.Calculate 95% z-confidence interval." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "sample_mean\n", 72 | "n = 30\n", 73 | "std = 2.89\n", 74 | "cl = 0.95" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "std_e = std/math.sqrt(n)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "ci = stats.norm.interval(cl, sample_mean, std_e)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "(3.965845784931483, 6.034154215068517)" 104 | ] 105 | }, 106 | "execution_count": 6, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "ci" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "Your 0.95 z confidence interval is (3.965845784931483, 6.034154215068517).\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "print('Your {} z confidence interval is {}.'.format(cl, ci))" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "4.If you did everything correctly, then the following should be printed when you run your notebook:" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | " Your 0.95 z confidence interval is (3.965845784931483, 6.034154215068517)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "# Hypothesis Testing" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Your boss asks you to conduct a hypothesis test about the mean dwell time of a new type of UAV. Before you arrived, an experiment was conducted on n=5 UAVs (all of the new type) resulting in a sample mean dwell time of 10.4 hours. The goal is to conclusively demonstrate, if possible, that the data supports the manufacturer’s claim that the mean dwell time is greater than 10 hours. 
Given that it is reasonable to assume the dwell times are normally distributed, the sample standard deviation is s = 0.5 hours, and using a significance level of α = 0.01:" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "1.Write out the null and alternative hypotheses" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 8, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "# H0 : mu = 10\n", 174 | "# Ha : mu > 10" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "2.Calculate the test statistic" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 9, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "mu_sample = 10.4\n", 191 | "mu = 10\n", 192 | "s = 0.5\n", 193 | "n = 5\n", 194 | "alpha = 0.01" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 10, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "t_statistic = (mu_sample-mu)/(s/math.sqrt(n))" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 11, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "1.7888543819998335" 215 | ] 216 | }, 217 | "execution_count": 11, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "t_statistic" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "3.Find the p-value and state the outcome" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 12, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "df = n - 1" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 13, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "p_value = 1 - stats.t.cdf(t_statistic, df)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 14, 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "data": { 258 | "text/plain": [ 259 | "0.07407407407407385" 260 | ] 261 | }, 262 | "execution_count": 14, 263 | "metadata": {}, 264 | "output_type": "execute_result" 265 | } 266 | ], 267 | "source": [ 268 | "p_value" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 15, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "At 0.01 level of significance, we fail to reject the null hypothesis.\n" 281 | ] 282 | } 283 | ], 284 | "source": [ 285 | "if p_value 10 83 | 84 | 85 | # 2.Calculate the test statistic 86 | 87 | # In[9]: 88 | 89 | 90 | mu_sample = 10.4 91 | mu = 10 92 | s = 0.5 93 | n = 5 94 | alpha = 0.01 95 | 96 | 97 | # In[10]: 98 | 99 | 100 | t_statistic = (mu_sample-mu)/(s/math.sqrt(n)) 101 | 102 | 103 | # In[11]: 104 | 105 | 106 | t_statistic 107 | 108 | 109 | # 3.Find the p-value and state the outcome 110 | 111 | # In[12]: 112 | 113 | 114 | df = n - 1 115 | 116 | 117 | # In[13]: 118 | 119 | 120 | p_value = 1 - stats.t.cdf(t_statistic, df) 121 | 122 | 123 | # In[14]: 124 | 125 | 126 | p_value 127 | 128 | 129 | # In[15]: 130 | 131 | 132 | if p_value\n", 163 | "\n", 176 | "\n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " 
\n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | "
Soil1Soil2
01.4421.364
11.9431.878
21.1101.337
31.9121.828
41.5531.371
\n", 212 | "" 213 | ], 214 | "text/plain": [ 215 | " Soil1 Soil2\n", 216 | "0 1.442 1.364\n", 217 | "1 1.943 1.878\n", 218 | "2 1.110 1.337\n", 219 | "3 1.912 1.828\n", 220 | "4 1.553 1.371" 221 | ] 222 | }, 223 | "execution_count": 11, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "df.head()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 12, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "xbar1 = df['Soil1'].mean()\n", 239 | "xbar2 = df['Soil2'].mean()\n", 240 | "\n", 241 | "s1 = df['Soil1'].std()\n", 242 | "s2 = df['Soil2'].std()\n", 243 | "\n", 244 | "alpha = 0.01" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 13, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "t_statistic = ((xbar1 - xbar2) - 0) / math.sqrt(((s1 ** 2) / df['Soil1'].notna().sum()) + ((s2 ** 2) / df['Soil2'].notna().sum()))" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 14, 259 | "metadata": {}, 260 | "outputs": [ 261 | { 262 | "data": { 263 | "text/plain": [ 264 | "5.191460504717394" 265 | ] 266 | }, 267 | "execution_count": 14, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "t_statistic" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 15, 279 | "metadata": { 280 | "scrolled": false 281 | }, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "-2.575829303548901" 287 | ] 288 | }, 289 | "execution_count": 15, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "stats.norm.ppf(0.005) # 0.01 / 2" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 16, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "0.27468571428571464" 307 | ] 308 | }, 309 | "execution_count": 16, 310 | "metadata": {}, 311 | "output_type": "execute_result" 312 | } 313 | ], 314 | "source": [ 315 | "diff = xbar1 - xbar2\n", 316 | "diff" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 17, 322 | "metadata": {}, 323 | "outputs": [ 324 | { 325 | "data": { 326 | "text/plain": [ 327 | "0.05291106694081796" 328 | ] 329 | }, 330 | "execution_count": 17, 331 | "metadata": {}, 332 | "output_type": "execute_result" 333 | } 334 | ], 335 | "source": [ 336 | "std_e = math.sqrt(((s1 ** 2) / df['Soil1'].notna().sum()) + ((s2 ** 2) / df['Soil2'].notna().sum()))\n", 337 | "std_e" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 18, 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "data": { 347 | "text/plain": [ 348 | "0.410975590993911" 349 | ] 350 | }, 351 | "execution_count": 18, 352 | "metadata": {}, 353 | "output_type": "execute_result" 354 | } 355 | ], 356 | "source": [ 357 | "diff - stats.norm.ppf(0.005) * std_e" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 19, 363 | "metadata": {}, 364 | "outputs": [ 365 | { 366 | "data": { 367 | "text/plain": [ 368 | "0.13839583757751825" 369 | ] 370 | }, 371 | "execution_count": 19, 372 | "metadata": {}, 373 | "output_type": "execute_result" 374 | } 375 | ], 376 | "source": [ 377 | "diff + stats.norm.ppf(0.005) * std_e" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 20, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "# ci for xbar1 - xbar2 is 0.41 and 0.375 with %90 CL so we reject H0" 387 
| ] 388 | } 389 | ], 390 | "metadata": { 391 | "kernelspec": { 392 | "display_name": "Python 3", 393 | "language": "python", 394 | "name": "python3" 395 | }, 396 | "language_info": { 397 | "codemirror_mode": { 398 | "name": "ipython", 399 | "version": 3 400 | }, 401 | "file_extension": ".py", 402 | "mimetype": "text/x-python", 403 | "name": "python", 404 | "nbconvert_exporter": "python", 405 | "pygments_lexer": "ipython3", 406 | "version": "3.7.3" 407 | } 408 | }, 409 | "nbformat": 4, 410 | "nbformat_minor": 2 411 | } 412 | -------------------------------------------------------------------------------- /Statistics/Statistics_Exercise_6.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from scipy import stats 10 | import math 11 | 12 | 13 | # ## EXERCISE 1. 14 | # The hourly wages in a particular industry are normally distributed with mean $13.20 and standard deviation $2.50. A company in this industry employs 40 workers, paying them an average of $12.20 per hour. Can this company be accused of paying substandard wages? Use an α = .01 level test. 15 | 16 | # In[2]: 17 | 18 | 19 | # H0: mu = 13.20 20 | # Ha: mu < 13.20 21 | # This is a one-tailed (left-tailed) test 22 | 23 | 24 | # In[3]: 25 | 26 | 27 | mu = 13.20 28 | mu_sample = 12.20 29 | n = 40 30 | std = 2.5 31 | alpha = 0.01 32 | 33 | 34 | # In[4]: 35 | 36 | 37 | t_statistic = (mu_sample - mu)/(std/math.sqrt(n)) # sigma is known, so this is really a z statistic 38 | 39 | 40 | # In[5]: 41 | 42 | 43 | t_statistic 44 | 45 | 46 | # In[6]: 47 | 48 | 49 | p_value = stats.norm.cdf(t_statistic) 50 | 51 | 52 | # In[7]: 53 | 54 | 55 | p_value 56 | 57 | 58 | # In[8]: 59 | 60 | 61 | if p_value < alpha: 62 | print('At {} level of significance, we can reject the null hypothesis in favor of Ha.'.format(alpha)) 63 | else: 64 | print('At {} level of significance, we fail to reject the null hypothesis.'.format(alpha)) 65 | 66 | 67 | # ## EXERCISE 2. 68 | # Shear strength measurements derived from unconfined compression tests for two types of soils gave the results shown in the following document (measurements in tons per square foot). Do the soils appear to differ with respect to average shear strength, at the 1% significance level?
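# Plan for EXERCISE 2 (the cells below compute each piece by hand): with two
# independent samples, the large-sample test statistic is
#     z = (xbar1 - xbar2) / sqrt(s1**2/n1 + s2**2/n2),
# and the 99% CI for the mean difference is (xbar1 - xbar2) +/- z_{0.995} * SE,
# where z_{0.995} = -stats.norm.ppf(0.005) ~ 2.576. H0: mu1 = mu2 is rejected
# when 0 lies outside that interval (equivalently, when |z| > 2.576).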
69 | 70 | # In[9]: 71 | 72 | 73 | # H0: mu1 = mu2 74 | # Ha: mu1 != mu2 75 | 76 | 77 | # In[10]: 78 | 79 | 80 | df = pd.read_csv('soil.csv') 81 | 82 | 83 | # In[11]: 84 | 85 | 86 | df.head() 87 | 88 | 89 | # In[12]: 90 | 91 | 92 | xbar1 = df['Soil1'].mean() 93 | xbar2 = df['Soil2'].mean() 94 | 95 | s1 = df['Soil1'].std() 96 | s2 = df['Soil2'].std() 97 | 98 | alpha = 0.01 99 | 100 | 101 | # In[13]: 102 | 103 | 104 | t_statistic = ((xbar1 - xbar2) - 0) / math.sqrt(((s1 ** 2) / df['Soil1'].notna().sum()) + ((s2 ** 2) / df['Soil2'].notna().sum())) 105 | 106 | 107 | # In[14]: 108 | 109 | 110 | t_statistic 111 | 112 | 113 | # In[15]: 114 | 115 | 116 | stats.norm.ppf(0.005) # 0.01 / 2 117 | 118 | 119 | # In[16]: 120 | 121 | 122 | diff = xbar1 - xbar2 123 | diff 124 | 125 | 126 | # In[17]: 127 | 128 | 129 | std_e = math.sqrt(((s1 ** 2) / df['Soil1'].notna().sum()) + ((s2 ** 2) / df['Soil2'].notna().sum())) 130 | std_e 131 | 132 | 133 | # In[18]: 134 | 135 | 136 | diff - stats.norm.ppf(0.005) * std_e # upper bound: ppf(0.005) is negative 137 | 138 | 139 | # In[19]: 140 | 141 | 142 | diff + stats.norm.ppf(0.005) * std_e # lower bound 143 | 144 | 145 | # In[20]: 146 | 147 | 148 | # The 99% CI for xbar1 - xbar2 is (0.138, 0.411); it excludes 0, so we reject H0 at the 1% significance level 149 | 150 | --------------------------------------------------------------------------------