├── EDA
│   ├── EDA of Insurance Company's Dataset.ipynb
│   └── insurance-data.csv
├── ESTU
│   ├── EDA.ipynb
│   └── insurance.csv
├── Linear Algebra
│   └── Linear Algebra.ipynb
├── MSKU
│   ├── Demand_Prediction
│   │   ├── Demand_Prediction_LSTM_MSKU.ipynb
│   │   └── store_sharing.csv
│   ├── Methods of ML
│   │   ├── Classification_Algorithms_Final.ipynb
│   │   ├── KrediKartıVerileri.xlsx
│   │   └── Methods_of_ML_1.ipynb
│   └── Statistical Theory of ML
│       ├── Final.ipynb
│       ├── Statistical Theory Behind ML_1.ipynb
│       ├── arasınav_Ödev Veri Seti.xlsx
│       ├── finalVeriSeti.xlsx
│       └── yz.ipynb
├── Matplotlib
│   ├── DV_1.ipynb
│   ├── DV_1.py
│   ├── DV_2.ipynb
│   ├── DV_2.py
│   ├── Matplotlib_1.ipynb
│   ├── Matplotlib_1.py
│   ├── Matplotlib_Class.ipynb
│   ├── Matplotlib_Class.py
│   ├── Matplotlib_U.ipynb
│   └── Matplotlib_U.py
├── NumPy
│   ├── NumPy_1.ipynb
│   ├── NumPy_1.py
│   ├── NumPy_Class.ipynb
│   ├── NumPy_Class.py
│   ├── NumPy_U.ipynb
│   └── NumPy_U.py
├── Other
│   ├── Feature_Selection.ipynb
│   ├── PIWorks
│   │   ├── SARIMAX+Regression.ipynb
│   │   ├── SARIMAX.ipynb
│   │   └── municipality_bus_utilization.csv
│   ├── Regex.ipynb
│   ├── Regex.py
│   ├── Sklearn_Encoding.ipynb
│   └── car_price.csv
├── PI
│   └── PIWorks.ipynb
├── Pandas
│   ├── Pandas Built-in DV_1.ipynb
│   ├── Pandas Built-in DV_1.py
│   ├── Pandas Built-in DV_Class.ipynb
│   ├── Pandas Built-in DV_Class.py
│   ├── Pandas Built-in DV_U.ipynb
│   ├── Pandas Built-in DV_U.py
│   ├── Pandas-(Aggregation, Groupby, Operations).ipynb
│   ├── Pandas-(Aggregation, Groupby, Operations).py
│   ├── Pandas-(Missing Values, Outliers).ipynb
│   ├── Pandas-(Missing Values, Outliers).py
│   ├── Pandas_1.ipynb
│   ├── Pandas_1.py
│   ├── Pandas_2.ipynb
│   ├── Pandas_2.py
│   ├── Pandas_3.ipynb
│   ├── Pandas_3.py
│   ├── Pandas_Class.ipynb
│   ├── Pandas_Class.py
│   ├── Pandas_Class2.ipynb
│   ├── Pandas_Class2.py
│   ├── Pandas_U.ipynb
│   └── Pandas_U.py
├── README.md
├── Seaborn
│   ├── Seaborn Class.ipynb
│   ├── Seaborn Class.py
│   ├── Seaborn Class2.ipynb
│   ├── Seaborn Class2.py
│   ├── Seaborn1.ipynb
│   ├── Seaborn2.ipynb
│   ├── Seaborn3.ipynb
│   ├── Seaborn_1.ipynb
│   ├── Seaborn_1.py
│   ├── Seaborn_U.ipynb
│   └── Seaborn_U.py
└── Statistics
    ├── Statistics.ipynb
    ├── Statistics.py
    ├── Statistics_2.ipynb
    ├── Statistics_2.py
    ├── Statistics_3.ipynb
    ├── Statistics_3.py
    ├── Statistics_4.ipynb
    ├── Statistics_4.py
    ├── Statistics_5.ipynb
    ├── Statistics_5.py
    ├── Statistics_6.ipynb
    ├── Statistics_6.py
    ├── Statistics_Exercise_1.ipynb
    ├── Statistics_Exercise_1.py
    ├── Statistics_Exercise_2.ipynb
    ├── Statistics_Exercise_2.py
    ├── Statistics_Exercise_3.ipynb
    ├── Statistics_Exercise_3.py
    ├── Statistics_Exercise_4.ipynb
    ├── Statistics_Exercise_4.py
    ├── Statistics_Exercise_5.ipynb
    ├── Statistics_Exercise_5.py
    ├── Statistics_Exercise_6.ipynb
    └── Statistics_Exercise_6.py

--------------------------------------------------------------------------------
/MSKU/Methods of ML/KrediKartıVerileri.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hersany/DataScience/5d73888fcde2765730f0bfa33e143205e6b36742/MSKU/Methods of ML/KrediKartıVerileri.xlsx
--------------------------------------------------------------------------------
/MSKU/Statistical Theory of ML/arasınav_Ödev Veri Seti.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hersany/DataScience/5d73888fcde2765730f0bfa33e143205e6b36742/MSKU/Statistical Theory of ML/arasınav_Ödev Veri Seti.xlsx
--------------------------------------------------------------------------------
/MSKU/Statistical Theory of ML/finalVeriSeti.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hersany/DataScience/5d73888fcde2765730f0bfa33e143205e6b36742/MSKU/Statistical Theory of ML/finalVeriSeti.xlsx
--------------------------------------------------------------------------------
/Matplotlib/DV_1.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

# In[1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# # 1. Set darkgrid style from seaborn

# In[3]:

sns.set(style = 'darkgrid')

# In[ ]:

## areaplot

# In[2]:

df = pd.DataFrame({'buy': [1, 2, 4, 9, 11, 5], 'register': [4, 6, 5, 11, 13, 15],
                   'view': [25, 45, 24, 58, 75, 55]},
                  index=pd.date_range(start='2018/01/01', end='2018/07/01', freq='M'))
df.head()

# In[ ]:

df.shape

# In[7]:

df.plot()

# In[6]:

df.plot.area()

# # 2. Make a bar plot

# In[ ]:

## barplots

# In[9]:

income = [100, 80, 150, 48, 52, 69, 88]
expense = [30, 100, 100, 20, 75, 50, 28]
index = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
df = pd.DataFrame({'income': income, 'expense': expense}, index=index)
df.head()

# # 3. Make a bar plot

# In[10]:

df.plot.bar()

# In[10]:

df.plot(kind = 'bar')

# In[ ]:

# # 4. Stack the bars

# In[16]:

df.plot.bar(stacked = True)

# In[ ]:

# # 5. Rotate the labels and set figsize

# In[17]:

games = ['Game-1', 'Game-2', 'Game-3', 'Game-4', 'Game-5', 'Game-6', 'Game-7']

# In[12]:

df.plot.bar(stacked = True, figsize = (9, 6))
plt.xticks(rotation = 0)
plt.show()

# In[ ]:

# # 6. Replace the labels by "Game-1", "Game-2", "Game-3", "Game-4", "Game-5", "Game-6", "Game-7"
# # 7. Unstack the bars, annotate the height of the bars on top of them

# In[27]:

games = ['Game-1', 'Game-2', 'Game-3', 'Game-4', 'Game-5', 'Game-6', 'Game-7']

# In[17]:

g = df.plot.bar(figsize = (9, 6))
plt.xticks(rotation = 0)
for p in g.patches:
    g.annotate((p.get_height()), (p.get_x() + 0.01, p.get_height() + 0.6), size = 10)
plt.show()

# In[40]:

g = df.plot.bar(figsize = (9, 6))
plt.xticks(rotation = 0)
g.set_xticklabels(games)
for p in g.patches:
    g.annotate((p.get_height()), (p.get_x() + 0.01, p.get_height() + 0.6), size = 10)
plt.show()

# In[ ]:

# In[ ]:

## histograms

# In[18]:

tips = sns.load_dataset("tips")

# In[19]:

tips.head()

# # 8. Histogram of the total_bill column

# In[52]:

sns.distplot(tips['total_bill'], kde = False)

# In[50]:

tips['total_bill'].hist()

# In[ ]:

# In[ ]:

## lineplots

# In[ ]:

tips.head()

# # 9. Plot the avg tip by size (lineplot)

# In[63]:

tips.groupby('size')['tip'].mean().plot()

# In[ ]:

# # 10. Set the linestyle as "--"

# In[66]:

tips.groupby('size')['tip'].mean().plot(ls = '--')

# In[ ]:

# In[ ]:

## Scatter Plots

# In[ ]:

tips.head()

# # 11. Make a scatter plot between tip and total_bill

# In[69]:

tips.plot.scatter('tip', 'total_bill')

# In[ ]:

# # 12. Set an additional dimension using size column

# In[72]:

tips.head()

# In[21]:

tips.plot.scatter('tip', 'total_bill', c = 'size', cmap = 'coolwarm')

# In[ ]:

# In[ ]:

## boxplots

# In[ ]:

tips.head()

# # 13. Make a box plot of total_bill column

# In[75]:

sns.boxplot(tips['total_bill'])

# In[77]:

tips.boxplot('total_bill')

# In[ ]:

# # 14. Separate the boxplot above using the size column

# In[82]:

tips.boxplot('total_bill', 'size')
plt.tight_layout()

# In[ ]:

# # 15. Make the same plot using seaborn

# In[83]:

sns.boxplot('size', 'total_bill', data = tips)

# In[ ]:

# # 16. Make a violinplot instead of boxplot and discuss the difference between boxplot and violinplot

# In[84]:

sns.violinplot('size', 'total_bill', data = tips)

# In[23]:

sns.violinplot('size', 'total_bill', data = tips)
sns.swarmplot('size', 'total_bill', data = tips, color = 'black')

# In[ ]:

# In[ ]:

--------------------------------------------------------------------------------
/Matplotlib/DV_2.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

# #### import the libraries

# In[11]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# #### load the tips dataset

# In[12]:

tips = sns.load_dataset('tips')
tips.head()

# In[4]:

# instruction: make a plot with seaborn that shows the distribution of total bill.
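# In[ ]:

# Added aside (not in the original notebook): distplot, used throughout the
# cells below, is deprecated since seaborn 0.11. A minimal sketch of the
# modern equivalents, assuming seaborn >= 0.11 is installed:
sns.histplot(tips['total_bill'], bins = 15)    # axes-level replacement
# sns.displot(tips['total_bill'], kde = True)  # figure-level replacement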

# ### 1-Distribution Plot

# #### DISTPLOT

# In[8]:

sns.distplot(tips['total_bill'], kde = False, hist_kws = dict(edgecolor = 'k', lw = 2), bins = 15)

# In[13]:

sns.distplot(tips['total_bill'], kde = False)

# In[5]:

# show rug, kde, and distplot in the same figure but on different axes.
# Explain the difference between matplotlib and seaborn in how they use axes.
#
# (with plain matplotlib we call methods such as ax.bar(...) on the axes object;
# with seaborn we pass the target axes inside the call, e.g. ax=axes[0, 0].)

# In[10]:

sns.set(style="darkgrid")
rs = np.random.RandomState(10)

# Set up the matplotlib figure
f, axes = plt.subplots(2, 2, figsize=(7, 7), sharex=True)

# Generate a random univariate dataset
# d = rs.normal(size=100)

# Plot a simple histogram with binsize determined automatically
sns.distplot(tips['total_bill'], kde=False, color="b", ax=axes[0, 0])

# Plot a kernel density estimate and rug plot
sns.distplot(tips['total_bill'], hist=False, rug=True, color="r", ax=axes[0, 1])

# Plot a filled kernel density estimate
sns.distplot(tips['total_bill'], hist=False, color="g", kde_kws={"shade": True}, ax=axes[1, 0])

# Plot a histogram and kernel density estimate
sns.distplot(tips['total_bill'], color="m", ax=axes[1, 1])

# plt.setp(axes, yticks=[])
plt.tight_layout()

# In[16]:

sns.set(style="white")

# Set up the matplotlib figure

# Generate a random univariate dataset

# Plot a simple histogram with binsize determined automatically

# Plot a kernel density estimate and rug plot

# Plot a filled kernel density estimate

# Plot a histogram and kernel density estimate

# ### 2-Categorical Plot

# In[6]:

# ins: make a plot that shows avg total bills for both genders.

# In[9]:

sns.barplot('sex', 'total_bill', data = tips)

# In[17]:

sns.barplot('sex', 'total_bill', 'day', data = tips)
plt.legend(loc = 3)

# In[8]:

# ins: make a plot that shows avg total bills for both genders as well as the avg total bills on different days.
# what are the black bars on the graphs? (ci)

# In[ ]:

# In[6]:

# #### B) COUNTPLOT

# In[ ]:

# ins: count the people in the dataset for each day, and order them.

# In[30]:

tips.groupby('day').count()['size'].sort_values(ascending = False).index

# In[31]:

sns.countplot(tips['day'], order = tips.groupby('day').count()['size'].sort_values(ascending = False).index)

# In[27]:

# #### C) BOXPLOT

# In[ ]:

# Show the total bills range according to days as well as according to smokers/non-smokers.

# In[32]:

sns.boxplot('day', 'total_bill', 'smoker', tips)

# In[8]:

# #### D) VIOLINPLOT

# In[ ]:

# make a violin plot of total bill separated by days on the x axis.
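# In[ ]:

# Added portability note (an assumption about newer environments, not from the
# original notebook): seaborn 0.12+ deprecates passing x and y positionally
# (only `data` may be positional), so calls like the one below warn or fail
# there. The keyword form works across versions:
sns.violinplot(data = tips, x = 'day', y = 'total_bill')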

# In[33]:

sns.violinplot('day', 'total_bill', data = tips)

# In[17]:

# #### G) CATPLOT (FORMER NAME: FACTOR PLOT)

# In[9]:

# try to make the same plots using catplot instead of bar, violin, and box plots.

# In[ ]:

sns.catplot()

# In[46]:

# kind options: bar, swarm, strip(default), box, violin, point and count.

# #### H) POINTPLOT

# In[10]:

# make a pointplot that shows avg total bills for both lunch and dinner.

# In[20]:

sns.pointplot('time', 'total_bill', data = tips)

# In[21]:

# # 3- Matrix and Grid Plots

# In[11]:

# make a heat map to show the corr matrix of the tips dataset.

# In[35]:

sns.heatmap(tips.corr(), annot = True, cmap = 'coolwarm')

# In[28]:

# In[12]:

# make a pair plot of the tips dataset and comment on it.

# In[45]:

sns.pairplot(tips);

--------------------------------------------------------------------------------
/Matplotlib/Matplotlib_1.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# coding: utf-8

# # Exercises
#
# Follow the instructions to recreate the plots using this data:
#
# ## Data

# In[17]:

import numpy as np
x = np.arange(0,100)
y = x*2
z = x**2

# ** Import matplotlib.pyplot as plt and set %matplotlib inline if you are using the jupyter notebook. What command do you use if you aren't using the jupyter notebook?**

# In[2]:

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# ## Exercise 1
#
# ** Follow along with these steps: **
# * ** Create a figure object called fig using plt.figure() **
# * ** Use add_axes to add an axis to the figure canvas at [0,0,1,1]. Call this new axis ax. **
# * ** Plot (x,y) on that axes and set the labels and titles to match the plot below:**

# In[18]:

fig = plt.figure()

ax = fig.add_axes([0, 0, 1, 1])

ax.plot(x, y)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('title')
plt.show()

# ## Exercise 2
# ** Create a figure object and put two axes on it, ax1 and ax2. Located at [0,0,1,1] and [0.2,0.5,.2,.2] respectively.**

# In[4]:

fig = plt.figure()

ax1 = fig.add_axes([0, 0, 1, 1])
ax2 = fig.add_axes([0.2, 0.5, 0.2, 0.2])

# ** Now plot (x,y) on both axes. And call your figure object to show it.**

# In[21]:

fig = plt.figure()

ax1 = fig.add_axes([0, 0, 1, 1])
ax1.plot(x, y)

ax2 = fig.add_axes([0.2, 0.5, 0.2, 0.2])
ax2.plot(x, y)
plt.show()

# ## Exercise 3
#
# ** Create the plot below by adding two axes to a figure object at [0,0,1,1] and [0.2,0.5,.4,.4]**

# In[6]:

fig = plt.figure()

ax1 = fig.add_axes([0, 0, 1, 1])
ax2 = fig.add_axes([0.2, 0.5, 0.4, 0.4])

# ** Now use x,y, and z arrays to recreate the plot below.
Notice the xlimits and y limits on the inserted plot:** 90 | 91 | # In[22]: 92 | 93 | 94 | fig = plt.figure() 95 | 96 | ax1 = fig.add_axes([0, 0, 1, 1]) 97 | ax1.plot(x, z, 'green') 98 | ax1.plot(x, y, 'blue') 99 | ax1.set_xlabel('x') 100 | ax1.set_ylabel('z') 101 | ax1.set_xlim(0) 102 | ax1.set_ylim(0) 103 | 104 | ax2 = fig.add_axes([0.2, 0.5, 0.4, 0.4]) 105 | ax2.plot(x, y) 106 | ax2.set_xlabel('x') 107 | ax2.set_ylabel('y') 108 | ax2.set_title('zoom') 109 | ax2.set_xlim([20.0, 22.0]) 110 | ax2.set_ylim([30, 50]) 111 | plt.show() 112 | 113 | 114 | # ## Exercise 4 115 | # 116 | # ** Use plt.subplots(nrows=1, ncols=2) to create the plot below.** 117 | 118 | # In[8]: 119 | 120 | 121 | fig, ax = plt.subplots(1, 2) 122 | 123 | 124 | # ** Now plot (x,y) and (x,z) on the axes. Play around with the linewidth and style** 125 | 126 | # In[9]: 127 | 128 | 129 | fig, ax = plt.subplots(1, 2) 130 | 131 | ax[0].plot(x, y, 'b', lw = 2.5, ls = '--') 132 | ax[0].set_xlim(0) 133 | ax[0].set_ylim(0) 134 | 135 | ax[1].plot(x, z, 'r', lw = 4) 136 | ax[1].set_xlim(0) 137 | ax[1].set_ylim(0) 138 | 139 | plt.tight_layout() 140 | 141 | 142 | # ** See if you can resize the plot by adding the figsize() argument in plt.subplots() are copying and pasting your previous code.** 143 | 144 | # In[10]: 145 | 146 | 147 | fig, ax = plt.subplots(1, 2, figsize = (10, 5)) 148 | 149 | ax[0].plot(x, y, 'b', lw = 2.5, ls = '--') 150 | ax[0].set_xlim(0) 151 | ax[0].set_ylim(0) 152 | 153 | ax[1].plot(x, z, 'r', lw = 4) 154 | ax[1].set_xlim(0) 155 | ax[1].set_ylim(0) 156 | 157 | plt.tight_layout() 158 | 159 | 160 | # # Great Job! 161 | -------------------------------------------------------------------------------- /Matplotlib/Matplotlib_Class.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[9]: 5 | 6 | 7 | import matplotlib.pyplot as plt 8 | 9 | 10 | # In[10]: 11 | 12 | 13 | age = [25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45] 14 | salary = [38496, 42000, 46752, 49320, 53200, 15 | 56000, 62316, 64928, 67317, 68748, 73752] 16 | 17 | 18 | # In[11]: 19 | 20 | 21 | # Functional 22 | 23 | 24 | # In[12]: 25 | 26 | 27 | plt.plot(age, salary); # ; = plt.show() 28 | 29 | 30 | # In[8]: 31 | 32 | 33 | plt.plot(age, salary) 34 | plt.xlabel('age') 35 | plt.ylabel('salary') 36 | plt.title('Salary by Age'); 37 | 38 | 39 | # In[14]: 40 | 41 | 42 | salary_2 = [45372, 48876, 53850, 57287, 63016, 43 | 65998, 70003, 70000, 71496, 75370, 83640] 44 | 45 | 46 | # In[10]: 47 | 48 | 49 | plt.plot(age, salary) 50 | plt.plot(age, salary_2) 51 | plt.xlabel('age') 52 | plt.ylabel('salary') 53 | plt.title('Salary by Age'); 54 | 55 | 56 | # In[13]: 57 | 58 | 59 | plt.plot(age, salary, label = 'Turkey') 60 | plt.plot(age, salary_2, label = 'Europe') 61 | plt.xlabel('age') 62 | plt.ylabel('salary') 63 | plt.title('Salary by Age') 64 | plt.legend(); 65 | 66 | 67 | # In[23]: 68 | 69 | 70 | plt.subplot(2, 1, 1) 71 | plt.plot(age, salary, 'r') 72 | 73 | plt.subplot(2, 1, 2) 74 | plt.plot(age, salary_2, 'b') 75 | 76 | plt.tight_layout() 77 | 78 | 79 | # In[15]: 80 | 81 | 82 | import pandas as pd 83 | df = pd.DataFrame(list(zip(age, salary, salary_2)), columns =['age', 'salary', 'salary_2']) 84 | df.head() 85 | 86 | 87 | # In[26]: 88 | 89 | 90 | df['salary'] 91 | 92 | 93 | # In[25]: 94 | 95 | 96 | plt.plot(df['salary']); # index and salary 97 | 98 | 99 | # In[17]: 100 | 101 | 102 | plt.plot(df['age'], df['salary']); # age and salary 103 | 104 | 105 | # In[28]: 106 | 107 | 108 | 
# OOP 109 | 110 | 111 | # In[20]: 112 | 113 | 114 | fig = plt.figure() 115 | 116 | ax = fig.add_axes([0, 0, 0.8, 0.8]) 117 | 118 | 119 | # In[32]: 120 | 121 | 122 | fig = plt.figure() 123 | 124 | ax = fig.add_axes([0, 0, 1, 1]) 125 | 126 | ax.plot(age, salary, 'r') 127 | ax.set_xlabel('Age') 128 | ax.set_ylabel('Salary') 129 | ax.set_title('Salary by Age'); 130 | 131 | 132 | # In[34]: 133 | 134 | 135 | fig = plt.figure() 136 | 137 | ax1 = fig.add_axes([0, 0, 0.8, 0.8]) 138 | 139 | ax1.plot(age, salary, 'r') 140 | ax1.set_xlabel('Age') 141 | ax1.set_ylabel('Salary') 142 | ax1.set_title('Salary by Age') 143 | 144 | ax2 = fig.add_axes([1, 0.1, 0.4, 0.4]) 145 | 146 | ax2.plot(age, salary_2, 'b') 147 | ax2.set_xlabel('Age') 148 | ax2.set_ylabel('Salary2') 149 | ax2.set_title('Salary2 by Age'); 150 | 151 | 152 | # In[46]: 153 | 154 | 155 | ax1 156 | 157 | 158 | # In[47]: 159 | 160 | 161 | ax2 162 | 163 | 164 | # In[48]: 165 | 166 | 167 | fig 168 | 169 | 170 | # In[49]: 171 | 172 | 173 | fig, ax = plt.subplots() 174 | 175 | 176 | # In[56]: 177 | 178 | 179 | fig, ax = plt.subplots() # default 1 row 1 column 180 | 181 | ax.plot(age, salary, 'r') 182 | ax.set_xlabel('Age') 183 | ax.set_ylabel('Salary') 184 | ax.set_title('Salary by Age') 185 | plt.tight_layout() 186 | 187 | 188 | # In[37]: 189 | 190 | 191 | fig, ax = plt.subplots(2, 1) 192 | 193 | ax[0].plot(age, salary, 'r') 194 | ax[0].set_xlabel('Age') 195 | ax[0].set_ylabel('Salary') 196 | ax[0].set_title('Salary by Age') 197 | 198 | ax[1].plot(age, salary_2, 'b') 199 | ax[1].set_xlabel('Age') 200 | ax[1].set_ylabel('Salary2') 201 | ax[1].set_title('Salary2 by Age') 202 | plt.tight_layout() 203 | 204 | 205 | # In[74]: 206 | 207 | 208 | fig, ax = plt.subplots(2, 2) 209 | plt.tight_layout() 210 | 211 | 212 | # In[73]: 213 | 214 | 215 | ax 216 | 217 | 218 | # In[75]: 219 | 220 | 221 | fig 222 | 223 | 224 | # In[79]: 225 | 226 | 227 | fig, ax = plt.subplots(1, 2) 228 | 229 | ax[0].plot(age, salary) 230 | ax[0].set_title('First Plot') 231 | ax[0].set_xlabel('Age') 232 | ax[0].set_ylabel('Salaries') 233 | 234 | ax[1].plot(age, salary_2) 235 | ax[1].set_title('Second Plot') 236 | ax[1].set_xlabel('Age') 237 | 238 | plt.tight_layout() 239 | 240 | 241 | # In[84]: 242 | 243 | 244 | fig, ax = plt.subplots(2, 2) 245 | 246 | ax[0, 0].plot(age, salary) 247 | ax[0, 0].set_title('First Plot') 248 | ax[0, 0].set_xlabel('Age') 249 | ax[0, 0].set_ylabel('Salaries') 250 | 251 | ax[0, 1].plot(age, salary) 252 | ax[0, 1].set_title('Second Plot') 253 | ax[0, 1].set_xlabel('Age') 254 | ax[0, 1].set_ylabel('Salaries') 255 | 256 | ax[1, 0].plot(age, salary_2) 257 | ax[1, 0].set_title('Third Plot') 258 | ax[1, 0].set_xlabel('Age') 259 | ax[1, 0].set_ylabel('Salaries') 260 | 261 | ax[1, 1].plot(age, salary_2) 262 | ax[1, 1].set_title('Fourth Plot') 263 | ax[1, 1].set_xlabel('Age') 264 | ax[1, 1].set_ylabel('Salaries') 265 | 266 | plt.tight_layout() 267 | 268 | 269 | # In[5]: 270 | 271 | 272 | import matplotlib.pyplot as plt 273 | 274 | 275 | # In[6]: 276 | 277 | 278 | age = [25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45] 279 | salary = [38496, 42000, 46752, 49320, 53200, 280 | 56000, 62316, 64928, 67317, 68748, 73752] 281 | salary_2 = [45372, 48876, 53850, 57287, 63016, 282 | 65998, 70003, 70000, 71496, 75370, 83640] 283 | 284 | 285 | # In[7]: 286 | 287 | 288 | import pandas as pd 289 | df = pd.DataFrame(list(zip(age, salary, salary_2)), columns =['age', 'salary', 'salary_2']) 290 | 291 | 292 | # In[9]: 293 | 294 | 295 | df.head() 296 | 297 | 298 | # In[10]: 299 | 300 | 301 | 
fig = plt.figure(figsize = (8, 4)) 302 | 303 | 304 | # In[11]: 305 | 306 | 307 | fig, ax = plt.subplots(figsize = (8, 4)) 308 | 309 | 310 | # In[13]: 311 | 312 | 313 | fig, ax = plt.subplots(figsize = (6, 3)) 314 | ax.plot(age, salary, 'r') 315 | ax.set_xlabel('age') 316 | ax.set_ylabel('salary') 317 | ax.set_title('title'); 318 | 319 | 320 | # In[39]: 321 | 322 | 323 | fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(6,3)) 324 | ax[0].plot(age, salary) 325 | ax[0].set_xlabel('age') 326 | ax[1].plot(age, salary_2) 327 | ax[0].set_title('First') 328 | ax[1].set_title('Second') 329 | ax[1].set_xlabel('age') 330 | plt.tight_layout() 331 | 332 | 333 | # In[15]: 334 | 335 | 336 | fig.savefig('myplot') 337 | 338 | 339 | # In[ ]: 340 | 341 | 342 | # legend, label, title 343 | 344 | 345 | # In[40]: 346 | 347 | 348 | fig, ax = plt.subplots(figsize=(6,3)) 349 | ax.plot(age, salary, label = 'salary_1') 350 | ax.set_xlabel('Age') 351 | ax.plot(age, salary_2, label = 'salary_2') 352 | ax.set_title('Title') 353 | ax.set_ylabel('Salaries') 354 | ax.legend(loc = 0) 355 | plt.tight_layout() 356 | 357 | 358 | # In[41]: 359 | 360 | 361 | # setting colors, linewidths, linetypes, marker 362 | 363 | 364 | # In[61]: 365 | 366 | 367 | fig, ax = plt.subplots(figsize=(6,3)) 368 | ax.plot(age, salary, 'y', lw = 2, ls = ':', marker = '*', ms = 10, mfc = 'b', alpha = 0.3); 369 | 370 | 371 | # In[62]: 372 | 373 | 374 | # plot range (xlim - ylim) 375 | 376 | 377 | # In[80]: 378 | 379 | 380 | fig, ax = plt.subplots(figsize=(6,3)) 381 | ax.plot(age, salary) 382 | ax.set_xlim([30, 40]) 383 | ax.set_ylim([45000, 60000]); 384 | 385 | 386 | # In[66]: 387 | 388 | 389 | # adding vertical-horizontal lines 390 | 391 | 392 | # In[56]: 393 | 394 | 395 | fig, ax = plt.subplots(figsize=(6,3)) 396 | ax.plot(age, salary) 397 | ax.set_xlim([30, 40]) 398 | ax.set_ylim([45000, 60000]) 399 | ax.axvline(35) 400 | ax.axhline(50000, color = 'red'); 401 | 402 | 403 | # In[60]: 404 | 405 | 406 | import numpy as np 407 | np.random.seed(5) 408 | x = np.arange(1, 101) 409 | y = 20 + 3 * x + np.random.normal(0, 60, 100) 410 | p = plt.plot(x, y, "o") 411 | plt.vlines(70,100,250) 412 | plt.hlines(100, 0, 100) 413 | 414 | 415 | # In[82]: 416 | 417 | 418 | # Plot types 419 | 420 | 421 | # In[83]: 422 | 423 | 424 | # Bar chart 425 | 426 | 427 | # In[84]: 428 | 429 | 430 | country = ['UK', 'USA', 'FRA', 'GER', 'NOR'] 431 | pci = [40000, 50000, 38000, 55000, 80000] 432 | 433 | 434 | # In[85]: 435 | 436 | 437 | fig, ax = plt.subplots() 438 | ax.bar(country, pci) 439 | 440 | 441 | # In[86]: 442 | 443 | 444 | labels = ['G1', 'G2', 'G3', 'G4', 'G5'] 445 | men_means = [20, 34, 30, 35, 27] 446 | women_means = [25, 32, 34, 20, 25] 447 | 448 | 449 | # In[89]: 450 | 451 | 452 | fig, ax = plt.subplots() 453 | ax.bar(labels, men_means) 454 | 455 | 456 | # In[94]: 457 | 458 | 459 | fig, ax = plt.subplots() 460 | ax.bar(labels, women_means, color = 'orange'); 461 | 462 | 463 | # In[95]: 464 | 465 | 466 | x = np.arange(len(labels)) 467 | width = 0.35 468 | 469 | 470 | # In[96]: 471 | 472 | 473 | fig, ax = plt.subplots() 474 | ax.bar(x - width/2, men_means, width, label='Men') 475 | ax.bar(x + width/2, women_means, width, label='Women') 476 | 477 | 478 | # In[98]: 479 | 480 | 481 | fig, ax = plt.subplots() 482 | ax.bar(x - width/2, men_means, width, label='Men') 483 | ax.bar(x + width/2, women_means, width, label='Women') 484 | ax.set_ylabel('Scores') 485 | ax.set_title('Scores by group and gender') 486 | ax.set_xticks(x) 487 | ax.set_xticklabels(labels) 488 | 
ax.legend() 489 | 490 | 491 | # In[105]: 492 | 493 | 494 | age = [25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45] 495 | 496 | 497 | fig, ax = plt.subplots(figsize = (8, 4)) 498 | 499 | ax.plot(age, salary) 500 | ax.set_xticks([25, 30, 35, 40, 45]); 501 | 502 | 503 | # In[107]: 504 | 505 | 506 | import pandas as pd 507 | df = pd.DataFrame(list(zip(labels, men_means, women_means)), columns =["labels", "men_means", "women_means"]) 508 | df.head() 509 | 510 | 511 | # In[110]: 512 | 513 | 514 | df.plot.bar(stacked = True) 515 | 516 | -------------------------------------------------------------------------------- /Matplotlib/Matplotlib_U.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import matplotlib.pyplot as plt 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | get_ipython().run_line_magic('matplotlib', 'inline') 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | import numpy as np 20 | 21 | 22 | # In[4]: 23 | 24 | 25 | x = np.linspace(0, 5, 11) 26 | 27 | 28 | # In[5]: 29 | 30 | 31 | y = x ** 2 32 | 33 | 34 | # In[6]: 35 | 36 | 37 | x 38 | 39 | 40 | # In[7]: 41 | 42 | 43 | y 44 | 45 | 46 | # In[8]: 47 | 48 | 49 | # Functional Method 50 | 51 | 52 | # In[9]: 53 | 54 | 55 | plt.plot(x, y) 56 | # plt.show() = print() for matplotlib 57 | 58 | 59 | # In[10]: 60 | 61 | 62 | plt.xlabel('X Label') 63 | plt.ylabel('Y Label') 64 | plt.title('Title') 65 | 66 | 67 | # In[11]: 68 | 69 | 70 | plt.plot(x, y) 71 | plt.xlabel('X Label') 72 | plt.ylabel('Y Label') 73 | plt.title('Title') 74 | 75 | 76 | # In[12]: 77 | 78 | 79 | plt.subplot(1, 2, 1) 80 | plt.plot(x, y, 'k') 81 | 82 | plt.subplot(1, 2, 2) 83 | plt.plot(y, x, 'b') 84 | 85 | 86 | # In[13]: 87 | 88 | 89 | # Object-Oriented Method 90 | 91 | 92 | # In[14]: 93 | 94 | 95 | fig = plt.figure() 96 | 97 | axes = fig.add_axes([0.1, 0.1, 0.8, 0.8]) # left bottom width height, 0-1, relation to black canvas 98 | 99 | axes.plot(x, y) 100 | axes.set_xlabel('X Label') 101 | axes.set_ylabel('Y Label') 102 | axes.set_title('Title') 103 | 104 | 105 | # In[15]: 106 | 107 | 108 | fig = plt.figure() 109 | 110 | axes1 = fig.add_axes([0.1, 0.1, 0.8, 0.8]) 111 | axes2 = fig.add_axes([0.2, 0.5, 0.4, 0.3]) 112 | 113 | axes1.plot(x, y) 114 | axes1.set_title('LARGER PLOT') 115 | 116 | axes2.plot(y, x) 117 | axes2.set_title('SMALLER PLOT') 118 | 119 | 120 | # In[16]: 121 | 122 | 123 | fig = plt.figure() 124 | plt.show() 125 | 126 | 127 | # In[17]: 128 | 129 | 130 | fig = plt.figure() 131 | 132 | axes1 = fig.add_axes([0.1, 0.1, 0.8, 0.8]) 133 | axes1.plot(x, y) 134 | 135 | 136 | # In[18]: 137 | 138 | 139 | fig, axes = plt.subplots(nrows = 1, ncols = 2) 140 | 141 | # axes.plot(x, y) 142 | 143 | 144 | # In[19]: 145 | 146 | 147 | fig, axes = plt.subplots(nrows = 3, ncols = 3) 148 | 149 | # axes.plot(x, y) 150 | plt.tight_layout() 151 | 152 | 153 | # In[20]: 154 | 155 | 156 | axes 157 | 158 | 159 | # In[21]: 160 | 161 | 162 | fig, axes = plt.subplots(nrows = 1, ncols = 2) 163 | 164 | for current_ax in axes: 165 | current_ax.plot(x,y) 166 | 167 | 168 | # In[22]: 169 | 170 | 171 | axes 172 | 173 | 174 | # In[23]: 175 | 176 | 177 | fig, axes = plt.subplots(nrows = 1, ncols = 2) 178 | 179 | axes[0].plot(x, y) 180 | 181 | 182 | # In[24]: 183 | 184 | 185 | fig, axes = plt.subplots(nrows = 1, ncols = 2) 186 | 187 | axes[0].plot(x, y) 188 | axes[0].set_title('First Plot') 189 | 190 | axes[1].plot(y, x) 191 | axes[1].set_title('Second Plot') 192 | 193 | plt.tight_layout() 194 | 195 | 196 | # In[25]: 197 | 198 
| 199 | # Figure size, DPI 200 | 201 | 202 | # In[26]: 203 | 204 | 205 | fig = plt.figure(figsize = (8, 2)) # figsize = width, height in inches 206 | 207 | ax = fig.add_axes([0, 0, 1, 1]) 208 | ax.plot(x, y) 209 | 210 | 211 | # In[27]: 212 | 213 | 214 | fig, axes = plt.subplots(nrows = 2, ncols = 1, figsize = (8, 2)) 215 | 216 | axes[0].plot(x, y) 217 | 218 | axes[1].plot(y, x) 219 | 220 | plt.tight_layout() 221 | 222 | 223 | # In[28]: 224 | 225 | 226 | fig 227 | 228 | 229 | # In[29]: 230 | 231 | 232 | fig.savefig('my_picture.png', dpi = 200, edgecolor = 'black', facecolor = 'w', transparent = True) 233 | # default dpi is 100 it is about pixels 234 | 235 | 236 | # In[30]: 237 | 238 | 239 | fig = plt.figure(figsize = (8, 2)) 240 | 241 | ax = fig.add_axes([0, 0, 1, 1]) 242 | ax.set_title('Title') 243 | ax.set_ylabel('Y') 244 | ax.set_xlabel('X') 245 | 246 | ax.plot(x, x ** 2, label = 'X Squared') 247 | ax.plot(x, x ** 3, label = 'X Cubed') 248 | ax.legend() # it uses/refers labels in .plot 249 | # ax.legend(loc=(0.1, 0.1)) 250 | 251 | 252 | # In[31]: 253 | 254 | 255 | # setting colors, line width, line types 256 | 257 | 258 | # In[32]: 259 | 260 | 261 | fig = plt.figure() 262 | 263 | ax = fig.add_axes([0, 0, 1, 1]) 264 | 265 | ax.plot(x, y, color = 'green') # RGB Hex Code google for custom colors #FF8C00 266 | plt.show() 267 | 268 | 269 | # In[33]: 270 | 271 | 272 | fig = plt.figure() 273 | 274 | ax = fig.add_axes([0, 0, 1, 1]) 275 | 276 | ax.plot(x, y, color = 'green', linewidth = 3) # default linewidth is 1, we can use lw instead of it. 277 | 278 | 279 | # In[34]: 280 | 281 | 282 | fig = plt.figure() 283 | 284 | ax = fig.add_axes([0, 0, 1, 1]) 285 | 286 | ax.plot(x, y, color = 'green', linewidth = 3, alpha = 0.3) # alpha for transparency dafault is 1 287 | 288 | 289 | # In[35]: 290 | 291 | 292 | fig = plt.figure() 293 | 294 | ax = fig.add_axes([0, 0, 1, 1]) 295 | 296 | ax.plot(x, y, color = 'green', lw = 3, linestyle = '-.') # default linestyle is solid, ls 297 | 298 | 299 | # In[36]: 300 | 301 | 302 | fig = plt.figure() 303 | 304 | ax = fig.add_axes([0, 0, 1, 1]) 305 | 306 | ax.plot(x, y, color = 'green', lw = 3, ls = '-', marker = 'o') # marker each value x/y 307 | 308 | 309 | # In[37]: 310 | 311 | 312 | fig = plt.figure() 313 | 314 | ax = fig.add_axes([0, 0, 1, 1]) 315 | 316 | ax.plot(x, y, color = 'green', lw = 3, ls = '-', marker = 'o', markersize = 15, 317 | markerfacecolor = 'red') 318 | 319 | 320 | # In[38]: 321 | 322 | 323 | fig = plt.figure() 324 | 325 | ax = fig.add_axes([0, 0, 1, 1]) 326 | 327 | ax.plot(x, y, color = 'green', lw = 3, ls = '-', marker = 'o', markersize = 15, 328 | markerfacecolor = 'red', markeredgewidth = 3, markeredgecolor = 'blue') 329 | 330 | 331 | # In[39]: 332 | 333 | 334 | # ylim xlim 335 | 336 | 337 | # In[40]: 338 | 339 | 340 | fig = plt.figure() 341 | 342 | ax = fig.add_axes([0, 0, 1, 1]) 343 | 344 | ax.plot(x, y, color = 'purple', lw = 2, ls = '--') 345 | 346 | ax.set_xlim([0 ,1]) 347 | ax.set_ylim([0, 2]) 348 | 349 | 350 | # In[ ]: 351 | 352 | 353 | 354 | 355 | -------------------------------------------------------------------------------- /NumPy/NumPy_1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import numpy as np 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | np.zeros(10) 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | np.ones(10) 20 | 21 | 22 | # In[5]: 23 | 24 | 25 | np.ones(10) * 5 26 | 27 | 28 | # In[6]: 29 | 30 | 31 | np.arange(10, 51) 32 | 33 | 
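# In[ ]:

# Added note: np.arange excludes its stop value, so arange(10, 51) ends at 50,
# and the step form below, arange(10, 51, 2), yields 10, 12, ..., 50.
len(np.arange(10, 51))   # 41 values: 10 through 50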
34 | # In[7]: 35 | 36 | 37 | np.arange(10, 51, 2) 38 | 39 | 40 | # In[8]: 41 | 42 | 43 | np.arange(9).reshape(3, 3) 44 | 45 | 46 | # In[9]: 47 | 48 | 49 | np.eye(3) 50 | 51 | 52 | # In[14]: 53 | 54 | 55 | from numpy.random import rand 56 | rand(1) 57 | 58 | 59 | # In[15]: 60 | 61 | 62 | from numpy.random import randn 63 | randn(25) 64 | 65 | 66 | # In[24]: 67 | 68 | 69 | np.arange(1, 101).reshape(10, 10) / 100 70 | 71 | 72 | # In[25]: 73 | 74 | 75 | np.linspace(0, 1, 20) 76 | 77 | 78 | # In[27]: 79 | 80 | 81 | mat = np.arange(1, 26).reshape(5, 5) 82 | mat 83 | 84 | 85 | # In[28]: 86 | 87 | 88 | mat[2:,1:] 89 | 90 | 91 | # In[29]: 92 | 93 | 94 | mat[3, 4] 95 | 96 | 97 | # In[31]: 98 | 99 | 100 | mat[:3,1:2] 101 | 102 | 103 | # In[53]: 104 | 105 | 106 | mat[4] 107 | 108 | 109 | # In[34]: 110 | 111 | 112 | mat[3:] 113 | 114 | 115 | # In[36]: 116 | 117 | 118 | np.sum(mat) 119 | 120 | 121 | # In[38]: 122 | 123 | 124 | np.std(mat) 125 | 126 | 127 | # In[51]: 128 | 129 | 130 | np.sum(mat, axis = 0) 131 | 132 | 133 | # In[1]: 134 | 135 | 136 | ########LAB######### 137 | 138 | 139 | # In[2]: 140 | 141 | 142 | import numpy as np 143 | 144 | 145 | # In[3]: 146 | 147 | 148 | a = np.array([[3, 1], [1, 2]]) 149 | b = np.array([9, 8]) 150 | 151 | 152 | # In[4]: 153 | 154 | 155 | a 156 | 157 | 158 | # In[5]: 159 | 160 | 161 | b 162 | 163 | 164 | # In[28]: 165 | 166 | 167 | np.linalg.multi_dot(b) 168 | 169 | 170 | # In[9]: 171 | 172 | 173 | x = np.arange(1, 11) 174 | 175 | 176 | # In[10]: 177 | 178 | 179 | x 180 | 181 | 182 | # In[11]: 183 | 184 | 185 | y = np.arange(-1, 1, 0.2) 186 | 187 | 188 | # In[12]: 189 | 190 | 191 | y 192 | 193 | 194 | # In[13]: 195 | 196 | 197 | np.linspace(0, 10, 25) 198 | 199 | 200 | # In[20]: 201 | 202 | 203 | np.logspace(0, 10, 10, base = 2) 204 | 205 | 206 | # In[30]: 207 | 208 | 209 | np.random.seed(0) 210 | np.random.rand(5, 5) # uniform distribution 211 | 212 | 213 | # In[31]: 214 | 215 | 216 | np.random.rand(5, 5) 217 | 218 | 219 | # In[32]: 220 | 221 | 222 | np.random.rand(5, 5) 223 | 224 | 225 | # In[33]: 226 | 227 | 228 | np.random.seed(0) 229 | np.random.rand(5, 5) 230 | 231 | 232 | # In[34]: 233 | 234 | 235 | np.random.randn(3, 3) # normal distribution 236 | 237 | 238 | # In[36]: 239 | 240 | 241 | np.diag([1, 2, 3, 4]) # diagonal matrix 242 | 243 | 244 | # In[37]: 245 | 246 | 247 | np.diag([1, 2, 3, 4], k = -1) # default k = 0 248 | 249 | 250 | # In[38]: 251 | 252 | 253 | np.diag([1, 2, 3, 4], k = 1) 254 | 255 | 256 | # In[41]: 257 | 258 | 259 | np.eye(4, k = -1) 260 | 261 | 262 | # In[42]: 263 | 264 | 265 | np.eye(4) 266 | 267 | 268 | # In[43]: 269 | 270 | 271 | d = np.array([i for i in range(5)]) 272 | 273 | 274 | # In[44]: 275 | 276 | 277 | d 278 | 279 | 280 | # In[45]: 281 | 282 | 283 | row_mask = np.array([True, False, True, False, False]) # np.arrays are homogeneous. 
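# In[ ]:

# Added note: a boolean mask must match the length of the axis it indexes;
# d has 5 elements, so row_mask holds exactly 5 booleans. d[row_mask] below
# keeps the positions where the mask is True, i.e. array([0, 2]).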
284 | 285 | 286 | # In[46]: 287 | 288 | 289 | d[row_mask] 290 | 291 | 292 | # In[47]: 293 | 294 | 295 | row1_mask = np.array([0, 0, 0, 1, 1], dtype = bool) 296 | 297 | 298 | # In[49]: 299 | 300 | 301 | d[row1_mask] 302 | 303 | 304 | # In[54]: 305 | 306 | 307 | x = np.arange(0, 10, 0.5) 308 | 309 | 310 | # In[55]: 311 | 312 | 313 | x 314 | 315 | 316 | # In[56]: 317 | 318 | 319 | mask = (5 < x) & (x < 7.5) 320 | 321 | 322 | # In[59]: 323 | 324 | 325 | mask 326 | 327 | 328 | # In[57]: 329 | 330 | 331 | x[mask] 332 | 333 | 334 | # In[58]: 335 | 336 | 337 | x[(5 < x) & (x < 7.5)] 338 | 339 | 340 | # In[62]: 341 | 342 | 343 | indices = np.where(mask) 344 | 345 | 346 | # In[63]: 347 | 348 | 349 | indices 350 | 351 | 352 | # In[64]: 353 | 354 | 355 | np.where(mask) 356 | 357 | 358 | # In[65]: 359 | 360 | 361 | x[indices] 362 | 363 | 364 | # In[67]: 365 | 366 | 367 | a = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [1, 2, 3, 4], [5, 6, 7, 8]]) 368 | 369 | 370 | # In[68]: 371 | 372 | 373 | a 374 | 375 | 376 | # In[70]: 377 | 378 | 379 | np.diag(a) 380 | 381 | 382 | # In[71]: 383 | 384 | 385 | np.diag(a, k = 1) 386 | 387 | 388 | # In[75]: 389 | 390 | 391 | np.diag(a, k = 3) 392 | 393 | 394 | # In[76]: 395 | 396 | 397 | arr = np.arange(-3, 3) 398 | 399 | 400 | # In[77]: 401 | 402 | 403 | arr 404 | 405 | 406 | # In[78]: 407 | 408 | 409 | arr[[1, 3, 5]] 410 | 411 | 412 | # In[79]: 413 | 414 | 415 | arr.take([1, 3, 5]) 416 | 417 | 418 | # In[85]: 419 | 420 | 421 | np.choose([1, 3, 5], arr) 422 | 423 | 424 | # In[ ]: 425 | 426 | 427 | 428 | 429 | -------------------------------------------------------------------------------- /NumPy/NumPy_Class.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import numpy as np 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | my_list = [1, 2, 3] 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | np.array(my_list) 20 | 21 | 22 | # In[5]: 23 | 24 | 25 | type(np.array(my_list)) 26 | 27 | 28 | # In[6]: 29 | 30 | 31 | a = [1, 2, 3, 4] 32 | b = [2, 3, 4, 5] 33 | 34 | 35 | # In[8]: 36 | 37 | 38 | np.array(a) * np.array(b) 39 | 40 | 41 | # In[9]: 42 | 43 | 44 | my_matrix = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] 45 | 46 | 47 | # In[10]: 48 | 49 | 50 | type(my_matrix) 51 | 52 | 53 | # In[11]: 54 | 55 | 56 | np.array(my_matrix) 57 | 58 | 59 | # In[12]: 60 | 61 | 62 | type(np.array(my_matrix)) 63 | 64 | 65 | # In[13]: 66 | 67 | 68 | np.arange(0, 10) 69 | 70 | 71 | # In[16]: 72 | 73 | 74 | np.arange(0, 11, 2) 75 | 76 | 77 | # In[20]: 78 | 79 | 80 | np.arange(10) 81 | 82 | 83 | # In[21]: 84 | 85 | 86 | np.zeros(2) 87 | 88 | 89 | # In[37]: 90 | 91 | 92 | np.zeros((4, 4), dtype = bool) 93 | 94 | 95 | # In[43]: 96 | 97 | 98 | np.zeros((4, 4), dtype = str) 99 | 100 | 101 | # In[39]: 102 | 103 | 104 | np.ones((2, 2), dtype = int) 105 | 106 | 107 | # In[41]: 108 | 109 | 110 | np.ones(3) 111 | 112 | 113 | # In[44]: 114 | 115 | 116 | np.ones((4, 4), dtype = bool) 117 | 118 | 119 | # In[46]: 120 | 121 | 122 | np.full((3, 5), 7) 123 | 124 | 125 | # In[47]: 126 | 127 | 128 | np.full((3, 5), '7') 129 | 130 | 131 | # In[54]: 132 | 133 | 134 | np.linspace(0, 10, 3) 135 | 136 | 137 | # In[55]: 138 | 139 | 140 | np.linspace(0, 10) 141 | 142 | 143 | # In[56]: 144 | 145 | 146 | len(np.linspace(0, 10)) 147 | 148 | 149 | # In[57]: 150 | 151 | 152 | np.linspace(0, 10, dtype = int) 153 | 154 | 155 | # In[58]: 156 | 157 | 158 | set(np.linspace(0, 10, dtype = int)) 159 | 160 | 161 | # In[60]: 162 | 163 | 164 | np.linspace(0, 10, dtype 
= int).reshape(10, 5) 165 | 166 | 167 | # In[61]: 168 | 169 | 170 | np.eye(4) 171 | 172 | 173 | # In[62]: 174 | 175 | 176 | np.random.rand(5) # uniform distribution 177 | 178 | 179 | # In[64]: 180 | 181 | 182 | np.random.rand(3, 2) 183 | 184 | 185 | # In[1]: 186 | 187 | 188 | import matplotlib.pyplot as plt 189 | 190 | 191 | # In[4]: 192 | 193 | 194 | plt.hist(np.random.rand(5000)) # uniform distribution 195 | plt.show() 196 | 197 | 198 | # In[90]: 199 | 200 | 201 | plt.hist(np.random.rand(50000), bins = 75) 202 | plt.show() 203 | 204 | 205 | # In[80]: 206 | 207 | 208 | np.random.randn(5) # normal distribution 209 | 210 | 211 | # In[88]: 212 | 213 | 214 | np.random.randn(5, 5) 215 | 216 | 217 | # In[94]: 218 | 219 | 220 | plt.hist(np.random.randn(50000)) # normal distribution 221 | plt.show() 222 | 223 | 224 | # In[99]: 225 | 226 | 227 | np.random.randn(50000).mean() 228 | 229 | 230 | # In[5]: 231 | 232 | 233 | np.random.randn(50000).std() 234 | 235 | 236 | # In[5]: 237 | 238 | 239 | np.random.randint(1, 100) 240 | 241 | 242 | # In[7]: 243 | 244 | 245 | np.random.randint(100, size = 10) 246 | 247 | 248 | # In[8]: 249 | 250 | 251 | np.random.randint(1, 100, 10) 252 | 253 | 254 | # In[15]: 255 | 256 | 257 | np.random.randint(1, 100, (2, 2)) 258 | 259 | 260 | # In[7]: 261 | 262 | 263 | np.random.randint(1, [3, 50, 100]) 264 | 265 | 266 | # In[9]: 267 | 268 | 269 | np.random.randint(1, [3, 50, 100], (10, 3)) 270 | 271 | 272 | # In[22]: 273 | 274 | 275 | np.random.randint([3, 50, 100], [5, 60, 120]) 276 | 277 | 278 | # In[10]: 279 | 280 | 281 | np.random.randint([3, 50, 100], [5, 60, 120], (5, 3)) 282 | 283 | 284 | # In[23]: 285 | 286 | 287 | arr = np.arange(25) 288 | ranarr = np.random.randint(0, 50, 10) 289 | 290 | 291 | # In[24]: 292 | 293 | 294 | arr 295 | 296 | 297 | # In[25]: 298 | 299 | 300 | ranarr 301 | 302 | 303 | # In[26]: 304 | 305 | 306 | arr.reshape(5, 5) 307 | 308 | 309 | # In[28]: 310 | 311 | 312 | np.reshape(ranarr, (2, 5)) 313 | 314 | 315 | # In[31]: 316 | 317 | 318 | ranarr.max() 319 | 320 | 321 | # In[32]: 322 | 323 | 324 | ranarr.argmax() 325 | 326 | 327 | # In[33]: 328 | 329 | 330 | np.max(ranarr) 331 | 332 | 333 | # In[34]: 334 | 335 | 336 | ranarr.min() 337 | 338 | 339 | # In[35]: 340 | 341 | 342 | ranarr.argmin() 343 | 344 | 345 | # In[37]: 346 | 347 | 348 | arr.ndim 349 | 350 | 351 | # In[38]: 352 | 353 | 354 | arr.shape 355 | 356 | 357 | # In[41]: 358 | 359 | 360 | arr.reshape(5, 5).shape 361 | 362 | 363 | # In[39]: 364 | 365 | 366 | arr.size 367 | 368 | 369 | # In[40]: 370 | 371 | 372 | arr.dtype 373 | 374 | 375 | # In[11]: 376 | 377 | 378 | x = np.array([1, 2, 3]) 379 | y = np.array([4, 5, 6]) 380 | 381 | 382 | # In[13]: 383 | 384 | 385 | np.concatenate([x, y]) 386 | 387 | 388 | # In[45]: 389 | 390 | 391 | z = np.array([7, 8, 9]) 392 | 393 | 394 | # In[48]: 395 | 396 | 397 | np.concatenate([x, y, z]) 398 | 399 | 400 | # In[50]: 401 | 402 | 403 | a1 = np.concatenate([x, y]).reshape(2, 3) 404 | 405 | 406 | # In[51]: 407 | 408 | 409 | a1 410 | 411 | 412 | # In[54]: 413 | 414 | 415 | np.concatenate([a1, a1]) 416 | 417 | 418 | # In[55]: 419 | 420 | 421 | np.concatenate([a1, a1], axis = 1) 422 | 423 | 424 | # In[56]: 425 | 426 | 427 | x = np.array([1, 2, 3, 99, 99, 3, 2, 1]) 428 | 429 | 430 | # In[63]: 431 | 432 | 433 | np.split(x, [3, 5, 7]) 434 | 435 | 436 | # In[70]: 437 | 438 | 439 | a, b, c, d = np.split(x, [3, 5, 7]) 440 | 441 | 442 | # In[71]: 443 | 444 | 445 | a 446 | 447 | 448 | # In[72]: 449 | 450 | 451 | b 452 | 453 | 454 | # In[73]: 455 | 456 | 457 | c 458 | 
459 | 460 | # In[74]: 461 | 462 | 463 | d 464 | 465 | 466 | # In[75]: 467 | 468 | 469 | np.split(x, 4) 470 | 471 | 472 | # In[16]: 473 | 474 | 475 | y = np.arange(20).reshape(5, 4) 476 | 477 | 478 | # In[17]: 479 | 480 | 481 | y 482 | 483 | 484 | # In[18]: 485 | 486 | 487 | np.split(y, 5) 488 | 489 | 490 | # In[19]: 491 | 492 | 493 | np.split(y, 4, axis = 1) 494 | 495 | 496 | # In[20]: 497 | 498 | 499 | np.vsplit(y, [2,4]) 500 | 501 | 502 | # In[21]: 503 | 504 | 505 | np.vsplit(y, 5) 506 | 507 | 508 | # In[22]: 509 | 510 | 511 | np.split(y, [2, 4]) 512 | 513 | 514 | # In[25]: 515 | 516 | 517 | np.split(y, [2, 4], axis = 1) 518 | 519 | 520 | # In[118]: 521 | 522 | 523 | y 524 | 525 | 526 | # In[123]: 527 | 528 | 529 | np.hsplit(y, [3]) 530 | 531 | 532 | # In[125]: 533 | 534 | 535 | np.hsplit(y, 2) 536 | 537 | 538 | # In[127]: 539 | 540 | 541 | left, right = np.hsplit(y, 2) 542 | 543 | 544 | # In[128]: 545 | 546 | 547 | left 548 | 549 | 550 | # In[129]: 551 | 552 | 553 | right 554 | 555 | 556 | # In[132]: 557 | 558 | 559 | upper, lower = np.vsplit(y, [4]) 560 | 561 | 562 | # In[133]: 563 | 564 | 565 | upper 566 | 567 | 568 | # In[134]: 569 | 570 | 571 | lower 572 | 573 | 574 | # In[140]: 575 | 576 | 577 | v = np.array([2, 1, 4, 3, 5]) 578 | v 579 | 580 | 581 | # In[141]: 582 | 583 | 584 | np.sort(v) # we need to assign a new variable 585 | 586 | 587 | # In[142]: 588 | 589 | 590 | v 591 | 592 | 593 | # In[143]: 594 | 595 | 596 | v.sort() # changes 597 | 598 | 599 | # In[144]: 600 | 601 | 602 | v 603 | 604 | 605 | # In[148]: 606 | 607 | 608 | v2 = np.random.randint(5, 100, (3, 3)) 609 | 610 | 611 | # In[149]: 612 | 613 | 614 | v2 615 | 616 | 617 | # In[150]: 618 | 619 | 620 | np.sort(v2, axis = 0) 621 | 622 | 623 | # In[151]: 624 | 625 | 626 | np.sort(v2, axis = 1) 627 | 628 | 629 | # In[156]: 630 | 631 | 632 | np.sort(v2) 633 | 634 | 635 | # In[157]: 636 | 637 | 638 | arr = np.arange(0, 11) 639 | 640 | 641 | # In[158]: 642 | 643 | 644 | arr 645 | 646 | 647 | # In[159]: 648 | 649 | 650 | arr[2:4] 651 | 652 | 653 | # In[160]: 654 | 655 | 656 | arr[8] 657 | 658 | 659 | # In[161]: 660 | 661 | 662 | arr[-1] 663 | 664 | 665 | # In[163]: 666 | 667 | 668 | arr[::2] 669 | 670 | 671 | # In[164]: 672 | 673 | 674 | arr[0:5] = 100 675 | 676 | 677 | # In[165]: 678 | 679 | 680 | arr 681 | 682 | 683 | # In[166]: 684 | 685 | 686 | arr = np.arange(11) 687 | 688 | 689 | # In[167]: 690 | 691 | 692 | arr 693 | 694 | 695 | # In[172]: 696 | 697 | 698 | slice_of_arr = arr[0:6] 699 | 700 | 701 | # In[173]: 702 | 703 | 704 | slice_of_arr 705 | 706 | 707 | # In[176]: 708 | 709 | 710 | slice_of_arr[:] = 88 711 | 712 | 713 | # In[177]: 714 | 715 | 716 | arr 717 | 718 | 719 | # In[178]: 720 | 721 | 722 | slice_of_arr 723 | 724 | 725 | # In[179]: 726 | 727 | 728 | arr = np.arange(11) 729 | 730 | 731 | # In[180]: 732 | 733 | 734 | arr_2 = arr.copy() 735 | 736 | 737 | # In[181]: 738 | 739 | 740 | arr_2 741 | 742 | 743 | # In[182]: 744 | 745 | 746 | slice_of_arr = arr[0:6] 747 | 748 | 749 | # In[183]: 750 | 751 | 752 | slice_of_arr[:] = 77 753 | 754 | 755 | # In[184]: 756 | 757 | 758 | arr 759 | 760 | 761 | # In[185]: 762 | 763 | 764 | arr_2 765 | 766 | 767 | # In[27]: 768 | 769 | 770 | arr_2d = np.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]]) 771 | arr_2d 772 | 773 | 774 | # In[187]: 775 | 776 | 777 | arr_2d[1] 778 | 779 | 780 | # In[30]: 781 | 782 | 783 | arr_2d[1, 0] 784 | 785 | 786 | # In[192]: 787 | 788 | 789 | arr_2d[1, 0:1] 790 | 791 | 792 | # In[193]: 793 | 794 | 795 | arr_2d[:, 2] 796 | 797 | 798 | # In[194]: 799 
| 800 | 801 | arr_2d[:, 2:] 802 | 803 | 804 | # In[195]: 805 | 806 | 807 | arr_2d[:, 2] = 3 808 | 809 | 810 | # In[196]: 811 | 812 | 813 | arr_2d 814 | 815 | 816 | # In[200]: 817 | 818 | 819 | v = np.arange(0, 30, 3) 820 | 821 | 822 | # In[201]: 823 | 824 | 825 | v 826 | 827 | 828 | # In[202]: 829 | 830 | 831 | v[1] 832 | 833 | 834 | # In[206]: 835 | 836 | 837 | idx_list = [1, 3, 5] # fancy indexing 838 | 839 | 840 | # In[207]: 841 | 842 | 843 | v[idx_list] # fancy indexing 844 | 845 | 846 | # In[208]: 847 | 848 | 849 | v[[1, 3, 5]] # fancy indexing 850 | 851 | 852 | # In[210]: 853 | 854 | 855 | arr_2d = np.zeros((10, 10), dtype = int) 856 | 857 | 858 | # In[211]: 859 | 860 | 861 | arr_2d 862 | 863 | 864 | # In[212]: 865 | 866 | 867 | arr_2d.shape 868 | 869 | 870 | # In[213]: 871 | 872 | 873 | arr_length = arr_2d.shape[1] 874 | 875 | 876 | # In[215]: 877 | 878 | 879 | arr_length 880 | 881 | 882 | # In[216]: 883 | 884 | 885 | arr_2d[0] 886 | 887 | 888 | # In[217]: 889 | 890 | 891 | arr_2d[3] 892 | 893 | 894 | # In[218]: 895 | 896 | 897 | for i in range(arr_length): 898 | arr_2d[i] = i 899 | 900 | 901 | # In[219]: 902 | 903 | 904 | arr_2d 905 | 906 | 907 | # In[220]: 908 | 909 | 910 | arr_2d[[2, 4, 6, 8]] 911 | 912 | 913 | # In[221]: 914 | 915 | 916 | arr_2d[[6, 4, 2, 7]] 917 | 918 | 919 | # In[3]: 920 | 921 | 922 | jj = np.arange(1, 17).reshape(4, 4) 923 | 924 | 925 | # In[4]: 926 | 927 | 928 | jj 929 | 930 | 931 | # In[9]: 932 | 933 | 934 | jj[[1, 3], [2, 3]] # fancy indexing [axis-0], [axis-1] 935 | 936 | 937 | # In[226]: 938 | 939 | 940 | jj[[1, 2], [0, 3]] 941 | 942 | 943 | # In[227]: 944 | 945 | 946 | jj 947 | 948 | 949 | # In[228]: 950 | 951 | 952 | jj[1, [1, 3]] 953 | 954 | 955 | # In[230]: 956 | 957 | 958 | jj [[0, 3], 1] 959 | 960 | 961 | # In[232]: 962 | 963 | 964 | jj[0:, [1, 3]] 965 | 966 | 967 | # In[233]: 968 | 969 | 970 | arr = np.arange(1, 11) 971 | 972 | 973 | # In[234]: 974 | 975 | 976 | arr 977 | 978 | 979 | # In[235]: 980 | 981 | 982 | arr > 4 983 | 984 | 985 | # In[236]: 986 | 987 | 988 | arr[arr > 4] 989 | 990 | 991 | # In[242]: 992 | 993 | 994 | arr[(arr != 3) & (arr != 4)] 995 | 996 | 997 | # In[250]: 998 | 999 | 1000 | arr[arr % 2 == 0] 1001 | 1002 | 1003 | # In[251]: 1004 | 1005 | 1006 | arr = np.arange(11) 1007 | 1008 | 1009 | # In[252]: 1010 | 1011 | 1012 | arr + arr 1013 | 1014 | 1015 | # In[253]: 1016 | 1017 | 1018 | arr - arr 1019 | 1020 | 1021 | # In[254]: 1022 | 1023 | 1024 | arr * arr 1025 | 1026 | 1027 | # In[255]: 1028 | 1029 | 1030 | arr ** 2 1031 | 1032 | 1033 | # In[256]: 1034 | 1035 | 1036 | arr // arr 1037 | 1038 | 1039 | # In[257]: 1040 | 1041 | 1042 | arr / 0 1043 | 1044 | 1045 | # In[258]: 1046 | 1047 | 1048 | arr / 1 1049 | 1050 | 1051 | # In[259]: 1052 | 1053 | 1054 | arr + 3 1055 | 1056 | 1057 | # In[260]: 1058 | 1059 | 1060 | np.exp(arr) 1061 | 1062 | 1063 | # In[261]: 1064 | 1065 | 1066 | np.sin(arr) 1067 | 1068 | 1069 | # In[263]: 1070 | 1071 | 1072 | np.sin(np.pi/2) 1073 | 1074 | 1075 | # In[264]: 1076 | 1077 | 1078 | np.tan(np.pi/4) 1079 | 1080 | 1081 | # In[ ]: 1082 | 1083 | 1084 | 1085 | 1086 | -------------------------------------------------------------------------------- /NumPy/NumPy_U.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | pip show numpy 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | pip install numpy 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | my_list = [1, 2, 3] 20 | 21 | 22 | # In[4]: 23 | 24 | 25 | import numpy as np 26 | 27 
| 28 | # In[5]: 29 | 30 | 31 | arr = np.array(my_list) 32 | 33 | 34 | # In[6]: 35 | 36 | 37 | arr 38 | 39 | 40 | # In[7]: 41 | 42 | 43 | my_math = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] 44 | 45 | 46 | # In[8]: 47 | 48 | 49 | np.array(my_math) 50 | 51 | 52 | # In[9]: 53 | 54 | 55 | np.arange(11) # it is similar range() func. in python. (start,stop,step) 56 | 57 | 58 | # In[10]: 59 | 60 | 61 | np.arange(1, 11, 2) 62 | 63 | 64 | # In[11]: 65 | 66 | 67 | np.zeros(3) 68 | 69 | 70 | # In[12]: 71 | 72 | 73 | np.zeros((2, 5)) # 2 rows, 5 columns 74 | 75 | 76 | # In[13]: 77 | 78 | 79 | np.ones(4) # output has 1 bracket it is 1-D array 80 | 81 | 82 | # In[14]: 83 | 84 | 85 | np.ones((3, 2)) # output has 2 brackets it is 2-D array 86 | 87 | 88 | # In[15]: 89 | 90 | 91 | np.linspace(2, 3, 5) # (start, stop, number, some extra parameters...) 92 | 93 | 94 | # In[16]: 95 | 96 | 97 | np.linspace(0, 5, 10) 98 | 99 | 100 | # In[17]: 101 | 102 | 103 | np.linspace(0, 5, 10, retstep = True) 104 | 105 | 106 | # In[18]: 107 | 108 | 109 | np.eye(4) 110 | 111 | 112 | # In[19]: 113 | 114 | 115 | np.random.rand(5) # random between 0 and 1, uniform distribution 116 | 117 | 118 | # In[20]: 119 | 120 | 121 | np.random.rand(5, 5) 122 | 123 | 124 | # In[21]: 125 | 126 | 127 | np.random.randn(5) # random around 0, normal distribution 128 | 129 | 130 | # In[22]: 131 | 132 | 133 | np.random.randn(4, 4) 134 | 135 | 136 | # In[23]: 137 | 138 | 139 | np.random.randint(1,100) # 1 is inclusive, 100 is exclusive 140 | 141 | 142 | # In[24]: 143 | 144 | 145 | np.random.randint(1, 100, 10) 146 | 147 | 148 | # In[25]: 149 | 150 | 151 | arr = np.arange(25) 152 | print(arr) 153 | 154 | 155 | # In[26]: 156 | 157 | 158 | np.reshape(arr, (5, 5)) 159 | 160 | 161 | # In[27]: 162 | 163 | 164 | ranarr = np.random.randint(0, 50, 10) 165 | print(ranarr) 166 | 167 | 168 | # In[28]: 169 | 170 | 171 | arr.reshape(5, 10) 172 | 173 | 174 | # In[ ]: 175 | 176 | 177 | arr.reshape(5, 5) # we changed it as a 2-D with reshape method 178 | 179 | 180 | # In[29]: 181 | 182 | 183 | ranarr 184 | 185 | 186 | # In[30]: 187 | 188 | 189 | ranarr.max() 190 | 191 | 192 | # In[31]: 193 | 194 | 195 | ranarr.min() 196 | 197 | 198 | # In[32]: 199 | 200 | 201 | ranarr.argmax() # indexing max value in array 202 | 203 | 204 | # In[33]: 205 | 206 | 207 | ranarr.argmin() 208 | 209 | 210 | # In[34]: 211 | 212 | 213 | arr.shape # 1-D 214 | 215 | 216 | # In[35]: 217 | 218 | 219 | arr 220 | 221 | 222 | # In[36]: 223 | 224 | 225 | arr = arr.reshape(5, 5) 226 | 227 | 228 | # In[37]: 229 | 230 | 231 | arr.shape # 2-D 232 | 233 | 234 | # In[38]: 235 | 236 | 237 | arr 238 | 239 | 240 | # In[39]: 241 | 242 | 243 | arr.dtype # it gives actual data type 244 | 245 | 246 | # In[40]: 247 | 248 | 249 | from numpy.random import randint 250 | 251 | 252 | # In[41]: 253 | 254 | 255 | randint(2, 10) 256 | 257 | 258 | # In[42]: 259 | 260 | 261 | # NumPy Indexing and Selection 262 | 263 | 264 | # In[43]: 265 | 266 | 267 | import numpy as np 268 | 269 | 270 | # In[44]: 271 | 272 | 273 | arr = np.arange(11) 274 | 275 | 276 | # In[45]: 277 | 278 | 279 | arr 280 | 281 | 282 | # In[46]: 283 | 284 | 285 | arr[8] 286 | 287 | 288 | # In[47]: 289 | 290 | 291 | arr[1:5] 292 | 293 | 294 | # In[48]: 295 | 296 | 297 | arr[0:5] 298 | 299 | 300 | # In[49]: 301 | 302 | 303 | arr[:] 304 | 305 | 306 | # In[50]: 307 | 308 | 309 | arr[:6] 310 | 311 | 312 | # In[51]: 313 | 314 | 315 | arr[::-1] 316 | 317 | 318 | # In[52]: 319 | 320 | 321 | arr[::2] 322 | 323 | 324 | # In[53]: 325 | 326 | 327 | arr[0:5] = 100 # broadcast 
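# In[ ]:

# Added note: the scalar 100 above is broadcast across the whole slice, so
# arr[0:5] = 100 is shorthand for arr[0:5] = [100, 100, 100, 100, 100], and
# because slicing returns a view, arr itself is modified in place.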
328 | 329 | 330 | # In[54]: 331 | 332 | 333 | arr 334 | 335 | 336 | # In[55]: 337 | 338 | 339 | arr = np.arange(11) 340 | 341 | 342 | # In[56]: 343 | 344 | 345 | arr 346 | 347 | 348 | # In[57]: 349 | 350 | 351 | slice_of_arr = arr[0:6] # original array does not copied 352 | 353 | 354 | # In[58]: 355 | 356 | 357 | slice_of_arr 358 | 359 | 360 | # In[59]: 361 | 362 | 363 | slice_of_arr[:] = 99 364 | 365 | 366 | # In[60]: 367 | 368 | 369 | slice_of_arr 370 | 371 | 372 | # In[61]: 373 | 374 | 375 | arr # it changes too 376 | 377 | 378 | # In[62]: 379 | 380 | 381 | arr_copy = arr.copy() # we copied it now the origibal doesnt change 382 | 383 | 384 | # In[63]: 385 | 386 | 387 | arr 388 | 389 | 390 | # In[64]: 391 | 392 | 393 | arr_copy[:] = 100 394 | 395 | 396 | # In[65]: 397 | 398 | 399 | arr_copy 400 | 401 | 402 | # In[66]: 403 | 404 | 405 | arr 406 | 407 | 408 | # In[67]: 409 | 410 | 411 | arr_2d = np.array([[5, 10, 15], [20, 25, 30], [35, 40, 45]]) 412 | 413 | 414 | # In[68]: 415 | 416 | 417 | arr_2d 418 | 419 | 420 | # In[69]: 421 | 422 | 423 | arr_2d[0, 0] 424 | 425 | 426 | # In[70]: 427 | 428 | 429 | arr_2d[0, 1] 430 | 431 | 432 | # In[71]: 433 | 434 | 435 | arr_2d[1, 2] 436 | 437 | 438 | # In[72]: 439 | 440 | 441 | arr_2d[0] 442 | 443 | 444 | # In[73]: 445 | 446 | 447 | arr_2d[1] 448 | 449 | 450 | # In[74]: 451 | 452 | 453 | arr_2d[:2] 454 | 455 | 456 | # In[75]: 457 | 458 | 459 | arr_2d[::2] 460 | 461 | 462 | # In[76]: 463 | 464 | 465 | arr_2d[:2,1:] 466 | 467 | 468 | # In[77]: 469 | 470 | 471 | arr_2d[1:,:2] 472 | 473 | 474 | # In[78]: 475 | 476 | 477 | arr = np.arange(1, 11) 478 | 479 | 480 | # In[79]: 481 | 482 | 483 | arr 484 | 485 | 486 | # In[80]: 487 | 488 | 489 | bool_arr = arr > 5 490 | 491 | 492 | # In[81]: 493 | 494 | 495 | bool_arr 496 | 497 | 498 | # In[82]: 499 | 500 | 501 | arr[bool_arr] 502 | 503 | 504 | # In[83]: 505 | 506 | 507 | arr[arr > 5] 508 | 509 | 510 | # In[84]: 511 | 512 | 513 | arr[arr <= 3] 514 | 515 | 516 | # In[85]: 517 | 518 | 519 | arr_2d = np.arange(50).reshape(5, 10) 520 | 521 | 522 | # In[86]: 523 | 524 | 525 | arr_2d 526 | 527 | 528 | # In[87]: 529 | 530 | 531 | arr_2d[1:3,3:5] 532 | 533 | 534 | # In[88]: 535 | 536 | 537 | # NumPy Operations 538 | 539 | 540 | # In[89]: 541 | 542 | 543 | import numpy as np 544 | 545 | 546 | # In[90]: 547 | 548 | 549 | arr = np.arange(11) 550 | 551 | 552 | # In[91]: 553 | 554 | 555 | arr 556 | 557 | 558 | # In[92]: 559 | 560 | 561 | arr + arr 562 | 563 | 564 | # In[93]: 565 | 566 | 567 | arr - arr 568 | 569 | 570 | # In[94]: 571 | 572 | 573 | arr * arr 574 | 575 | 576 | # In[95]: 577 | 578 | 579 | arr + 100 580 | 581 | 582 | # In[96]: 583 | 584 | 585 | arr * 2 586 | 587 | 588 | # In[97]: 589 | 590 | 591 | arr ** 2 592 | 593 | 594 | # In[98]: 595 | 596 | 597 | arr % 2 598 | 599 | 600 | # In[99]: 601 | 602 | 603 | arr / arr # 0 / 0 gives error normally. 
NumPy only issues a RuntimeWarning and returns nan for that element 604 | 605 | 606 | # In[100]: 607 | 608 | 609 | 1 / arr # 1 / 0 warns and returns inf 610 | 611 | 612 | # In[101]: 613 | 614 | 615 | np.sqrt(arr) 616 | 617 | 618 | # In[102]: 619 | 620 | 621 | np.exp(arr) 622 | 623 | 624 | # In[103]: 625 | 626 | 627 | np.max(arr) 628 | 629 | 630 | # In[104]: 631 | 632 | 633 | arr.max() 634 | 635 | 636 | # In[105]: 637 | 638 | 639 | np.sin(arr) 640 | 641 | 642 | # In[106]: 643 | 644 | 645 | arr 646 | 647 | 648 | # In[107]: 649 | 650 | 651 | np.log(arr) # log(0) warns and returns -inf 652 | 653 | 654 | # In[108]: 655 | 656 | 657 | import numpy as np 658 | 659 | 660 | # In[109]: 661 | 662 | 663 | arr = np.arange(10) 664 | arr 665 | 666 | 667 | # In[110]: 668 | 669 | 670 | print(arr) 671 | 672 | 673 | # In[111]: 674 | 675 | 676 | print(type(arr)) 677 | 678 | 679 | # In[112]: 680 | 681 | 682 | print(type(arr[0])) 683 | 684 | 685 | # In[113]: 686 | 687 | 688 | np.full((3, 2), 1) 689 | 690 | 691 | # In[114]: 692 | 693 | 694 | np.empty(2, dtype = int) # uninitialized memory, contents are arbitrary 695 | 696 | 697 | # In[115]: 698 | 699 | 700 | np.empty((2, 2)) 701 | 702 | 703 | # In[116]: 704 | 705 | 706 | np.random.seed(101) 707 | np.random.randint(10, size = 6) 708 | 709 | 710 | # In[117]: 711 | 712 | 713 | from skimage import io 714 | photo = io.imread('Sea.jpg') 715 | type(photo) 716 | 717 | 718 | # In[118]: 719 | 720 | 721 | photo.shape 722 | 723 | 724 | # In[119]: 725 | 726 | 727 | import matplotlib.pyplot as plt 728 | print(plt.imshow(photo)) 729 | 730 | 731 | # In[120]: 732 | 733 | 734 | plt.imshow(photo[::-1]) # flip vertically 735 | 736 | 737 | # In[121]: 738 | 739 | 740 | plt.imshow(photo[:, ::-1]) # flip horizontally 741 | 742 | 743 | # In[122]: 744 | 745 | 746 | plt.imshow(photo[:300,:600]) # crop the top-left 300 x 600 region 747 | 748 | 749 | # In[123]: 750 | 751 | 752 | plt.imshow(photo[::2, ::2]) # downsample by 2: keep every second row and column 753 | 754 | 755 | # In[124]: 756 | 757 | 758 | print(np.sum(photo)) 759 | 760 | 761 | # In[125]: 762 | 763 | 764 | print(np.mean(photo)) 765 | print(np.std(photo)) 766 | print(np.var(photo)) 767 | 768 | 769 | # In[126]: 770 | 771 | 772 | a = np.array([9, 5, 1, 7, 3]) 773 | 774 | 775 | # In[127]: 776 | 777 | 778 | a 779 | 780 | 781 | # In[128]: 782 | 783 | 784 | np.sort(a) 785 | 786 | 787 | # In[129]: 788 | 789 | 790 | a = np.array(42) 791 | b = np.array([1, 2, 3, 4, 5]) 792 | c = np.array([[1, 2, 3], [4, 5, 6]]) 793 | d = np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]]) 794 | 795 | print(a.ndim) 796 | print(b.ndim) 797 | print(c.ndim) 798 | print(d.ndim) 799 | 800 | 801 | # In[130]: 802 | 803 | 804 | arr = np.array([1, 2, 3, 4], ndmin=5) 805 | 806 | print(arr) 807 | print('number of dimensions :', arr.ndim) 808 | 809 | 810 | # In[131]: 811 | 812 | 813 | arr = np.array([1, 2, 3, 4]) 814 | 815 | print(type(arr)) 816 | print(arr.dtype) 817 | 818 | 819 | # In[132]: 820 | 821 | 822 | arr = np.array(['apple', 'banana', 'cherry']) 823 | 824 | print(arr.dtype) 825 | print(type(arr[0])) 826 | 827 | 828 | # In[133]: 829 | 830 | 831 | # data type codes in NumPy 832 | # i - integer 833 | # b - boolean 834 | # u - unsigned integer 835 | # f - float 836 | # c - complex float 837 | # m - timedelta 838 | # M - datetime 839 | # O - object 840 | # S - string 841 | # U - unicode string 842 | # V - fixed chunk of memory for another type (void) 843 | 844 | 845 | # In[134]: 846 | 847 | 848 | arr = np.array([1, 2, 3, 4], dtype='U') 849 | 850 | print(arr) 851 | print(arr.dtype) 852 | print(type(arr[0])) 853 | 854 | 855 | # In[135]: 856 | 857 | 858 | arr = np.array([1, 2, 3, 4], dtype='i4') 859 | 860 | print(arr) 861 | print(arr.dtype) 862 | 863 |
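# To round off the dtype cells above, a short sketch of explicit conversion with .astype(): 'i4' is int32, and astype() always returns a new array rather than changing the dtype in place. This is plain NumPy, independent of the notebook's state.
import numpy as np
vals = np.array([1.7, 2.2, 3.9])
print(vals.dtype)                # float64
ints = vals.astype('i4')         # truncates toward zero -> [1 2 3]
print(ints, ints.dtype)          # [1 2 3] int32
print(ints.astype('U'))          # back to unicode strings: ['1' '2' '3']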
-------------------------------------------------------------------------------- /Other/Regex.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import re 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | import pandas as pd 14 | 15 | 16 | # In[26]: 17 | 18 | 19 | text = re.search(r'\d', 'A2') # \d matches a digit; raw strings keep the backslash literal 20 | 21 | 22 | # In[27]: 23 | 24 | 25 | print(text) 26 | 27 | 28 | # In[6]: 29 | 30 | 31 | print(text.group()) 32 | 33 | 34 | # In[23]: 35 | 36 | 37 | text = re.search(r'\D', '22a') # \D matches a non-digit 38 | print(text.group()) 39 | 40 | 41 | # In[9]: 42 | 43 | 44 | text = 'My phone number is 505-555-5555' 45 | 46 | 47 | # In[19]: 48 | 49 | 50 | output = re.search(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)', text) 51 | 52 | 53 | # In[20]: 54 | 55 | 56 | print(output.group()) 57 | 58 | 59 | # In[36]: 60 | 61 | 62 | text = 'My phone number is 415-555-1212' 63 | 64 | 65 | # In[37]: 66 | 67 | 68 | output = re.search(r'(\d\d\d)-(\d\d\d-\d\d\d\d)', text) 69 | 70 | 71 | # In[42]: 72 | 73 | 74 | print(output.group(2)) # the parentheses define capture groups split at the first '-' 75 | 76 | 77 | # In[45]: 78 | 79 | 80 | print(output.group(1)) 81 | 82 | 83 | # In[46]: 84 | 85 | 86 | text = 'My phone number is (415) 555-1212' 87 | 88 | 89 | # In[49]: 90 | 91 | 92 | output = re.search(r'(\(\d\d\d\)) (\d\d\d-\d\d\d\d)', text) 93 | 94 | 95 | # In[50]: 96 | 97 | 98 | print(output.group()) 99 | 100 | 101 | # In[51]: 102 | 103 | 104 | value = '0 1, t 10, o 100.' 105 | 106 | 107 | # In[56]: 108 | 109 | 110 | output = re.findall(r'\d', value) 111 | print(output) 112 | 113 | 114 | # In[58]: 115 | 116 | 117 | output = re.findall(r'\d\d', value) 118 | print(output) 119 | 120 | 121 | # In[59]: 122 | 123 | 124 | output = re.findall(r'\d{1,3}', value) 125 | print(output) 126 | 127 | 128 | # In[64]: 129 | 130 | 131 | phone = '2004-959-559 # This is Phone Number' 132 | output = re.sub(r'\D', '', phone) # replace everything that is NOT a digit with '' 133 | print(output) 134 | 135 | 136 | # In[63]: 137 | 138 | 139 | phone = '2004-959-559 # This is Phone Number' 140 | output = re.sub(r'\D', '.', phone) 141 | print(output) 142 | 143 | 144 | # In[73]: 145 | 146 | 147 | txt = 'hello world' 148 | 149 | output = re.findall('^he', txt) # ^ anchors the match to the start of the string 150 | 151 | print(output) 152 | 153 | 154 | # In[ ]: 155 | 156 | 157 | # Pandas 158 | 159 | 160 | # In[76]: 161 | 162 | 163 | s = pd.Series(['a3', 'b4', 'c5']) # extract the digits from a pandas series 164 | 165 | s.str.extract(r'(\d)') 166 | 167 | 168 | # In[77]: 169 | 170 | 171 | s = pd.Series(['a3', 'b4', 'c5']) # extract the first word character from a pandas series 172 | 173 | s.str.extract(r'(\w)') 174 | 175 | 176 | # In[78]: 177 | 178 | 179 | s = pd.Series(['a3f', 'b4f', 'c5f']) 180 | 181 | s.str.extract(r'(\w\d)') 182 | 183 | 184 | # In[81]: 185 | 186 | 187 | s = pd.Series(['40 l/100 km (comb)', 188 | '38 l/100 km (comb)', '6.4 l/100 km (comb)', 189 | '8.3 kg/100 km (comb)', '5.1 kg/100 km (comb)', 190 | '5.4 l/100 km (comb)', '6.7 l/100 km (comb)', 191 | '6.2 l/100 km (comb)', '7.3 l/100 km (comb)', 192 | '6.3 l/100 km (comb)', '5.7 l/100 km (comb)', 193 | '6.1 l/100 km (comb)', '6.8 l/100 km (comb)', 194 | '7.5 l/100 km (comb)', '7.4 l/100 km (comb)', 195 | '3.6 kg/100 km (comb)', '0 l/100 km (comb)', 196 | '7.8 l/100 km (comb)']) 197 | 198 | 199 | # In[95]: 200 | 201 | 202 | s.str.extract(r'(\d\d|\d\.\d|\d)') # the dot is escaped so it matches a literal decimal point 203 | 204 | 205 | # In[96]: 206 | 207 | 208 | s = pd.Series(['40 l/100 km (comb)', 209 | '38 l/100 km (comb)', '6.4 l/100 km (comb)', 210 | '8.3 kg/100 km (comb)', '5.1 kg/100 km (comb)', 211 | '5.4 l/100 km (comb)', '6.7 l/100 km (comb)', 212 | '6.2 l/100 km (comb)', '7.3 l/100 km (comb)', 213 | '6.3 l/100 km (comb)', '5.7 l/100 km (comb)', 214 | '6.1 l/100 km (comb)', '6.8 l/100 km (comb)', 215 | '7.5 l/100 km (comb)', '7.4 l/100 km (comb)', 216 | '3.6 kg/100 km (comb)', '0 l/100 km (comb)', 217 | '7.8 l/100 km (comb)']) 218 | 219 | 220 | # In[103]: 221 | 222 | 223 | s.str.extract(r'(\d\d|\d\.\d|\d).*(\d\d\d)') 224 | 225 | 226 | # In[105]: 227 | 228 | 229 | s = pd.Series(['06/2020\n\n4.9 l/100 km (comb)', 230 | '11/2020\n\n166 g CO2/km (comb)', 231 | '10/2019\n\n5.3 l/100 km (comb)', 232 | '05/2022\n\n6.3 l/100 km (comb)', 233 | '07/2019\n\n128 g CO2/km (comb)', 234 | '06/2022\n\n112 g CO2/km (comb)', 235 | '01/2022\n\n5.8 l/100 km (comb)', 236 | '11/2020\n\n106 g CO2/km (comb)', 237 | '04/2019\n\n105 g CO2/km (comb)', 238 | '08/2020\n\n133 g CO2/km (comb)', 239 | '04/2022\n\n133 g CO2/km (comb)']) 240 | 241 | 242 | # In[108]: 243 | 244 | 245 | s.str.extract(r'(\d\d).(\d{4})') # month and year; the unescaped dot matches the '/' separator 246 | 247 | 248 | # In[109]: 249 | 250 | 251 | s = pd.Series(['06/2020\n\n4.9 l/100 km (comb)', 252 | '11/2020\n\n166 g CO2/km (comb)', 253 | '10/2019\n\n5.3 l/100 km (comb)', 254 | '05/2022\n\n6.3 l/100 km (comb)', 255 | '07/2019\n\n128 g CO2/km (comb)', 256 | '06/2022\n\n112 g CO2/km (comb)', 257 | '01/2022\n\n5.8 l/100 km (comb)', 258 | '11/2020\n\n106 g CO2/km (comb)', 259 | '04/2019\n\n105 g CO2/km (comb)', 260 | '08/2020\n\n133 g CO2/km (comb)', 261 | '04/2022\n\n133 g CO2/km (comb)']) 262 | 263 | 264 | # In[113]: 265 | 266 | 267 | s.str.extract(r'(\d\d).(\d\d\d\d)\s\s(\d{3}|\d\.\d)') 268 | 269 | 270 | # In[ ]: 271 | 272 | 273 | 274 | 275 | -------------------------------------------------------------------------------- /Pandas/Pandas Built-in DV_1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Pandas Data Visualization Exercise 5 | # 6 | # This is just a quick exercise for you to review the various plots we showed earlier. Use **df3** to replicate the following plots. 7 | 8 | # In[1]: 9 | 10 | 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | df3 = pd.read_csv('df3') 14 | get_ipython().run_line_magic('matplotlib', 'inline') 15 | 16 | 17 | # In[2]: 18 | 19 | 20 | df3.info() 21 | 22 | 23 | # In[3]: 24 | 25 | 26 | df3.head() 27 | 28 | 29 | # ** Recreate this scatter plot of b vs a. Note the color and size of the points. Also note the figure size. See if you can figure out how to stretch it in a similar fashion. Remember back to your matplotlib lecture...** 30 | 31 | # In[39]: 32 | 33 | 34 | df3.plot.scatter('a', 'b', figsize = (12, 3), c = 'red', s = 50) 35 | 36 | 37 | # ** Create a histogram of the 'a' column.** 38 | 39 | # In[20]: 40 | 41 | 42 | df3['a'].hist() 43 | 44 | 45 | # ** These plots are okay, but they don't look very polished. Use style sheets to set the style to 'ggplot' and redo the histogram from above. Also figure out how to add more bins to it.** 46 | 47 | # In[43]: 48 | 49 | 50 | plt.style.use('ggplot') 51 | df3['a'].hist(bins = 20, alpha = 0.5) 52 | 53 | 54 | # ** Create a boxplot comparing the a and b columns.** 55 | 56 | # In[44]: 57 | 58 | 59 | df3[['a', 'b']].plot.box() 60 | 61 | 62 | # In[46]: 63 | 64 | 65 | df3[['a', 'b']].boxplot() 66 | 67 | 68 | # ** Create a kde plot of the 'd' column ** 69 | 70 | # In[26]: 71 | 72 | 73 | df3['d'].plot.kde() 74 | 75 | 76 | # ** Figure out how to increase the linewidth and make the linestyle dashed.
(Note: You would usually not dash a kde plot line)** 77 | 78 | # In[48]: 79 | 80 | 81 | df3['d'].plot.kde(lw = 5, ls = ':') 82 | 83 | 84 | # In[49]: 85 | 86 | 87 | df3['d'].plot.density(lw = 5, ls = ':') 88 | 89 | 90 | # ** Create an area plot of all the columns for just the rows up to 30. (hint: use .ix).** 91 | 92 | # In[50]: 93 | 94 | 95 | df3.loc[0:30].plot.area(alpha = 0.4) 96 | 97 | 98 | # Note, you may find this really hard, reference the solutions if you can't figure it out! 99 | # ** Notice how the legend in our previous figure overlapped some of actual diagram. Can you figure out how to display the legend outside of the plot as shown below?** 100 | # 101 | # ** Try searching Google for a good stackoverflow link on this topic. If you can't find it on your own - [use this one for a hint.](http://stackoverflow.com/questions/23556153/how-to-put-legend-outside-the-plot-with-pandas)** 102 | 103 | # In[56]: 104 | 105 | 106 | df3.loc[0:30].plot.area(alpha = 0.4) 107 | plt.legend(loc = 'center left', bbox_to_anchor = (1., 0.5)) 108 | 109 | 110 | # # Great Job! 111 | -------------------------------------------------------------------------------- /Pandas/Pandas Built-in DV_Class.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns 11 | 12 | 13 | # In[2]: 14 | 15 | 16 | df1 = pd.read_csv('df1', index_col = 0) 17 | df2 = pd.read_csv('df2') 18 | 19 | 20 | # In[3]: 21 | 22 | 23 | df1.head() 24 | 25 | 26 | # In[4]: 27 | 28 | 29 | df2.head() 30 | 31 | 32 | # In[5]: 33 | 34 | 35 | sns.set(style = 'darkgrid') 36 | 37 | 38 | # In[6]: 39 | 40 | 41 | df1['A'].hist() 42 | 43 | 44 | # In[4]: 45 | 46 | 47 | df = pd.DataFrame({'sales': [3, 2, 3, 9, 10, 6], 'signups': [5, 5, 6, 12, 14, 13], 48 | 'visits': [20, 42, 28, 62, 81, 50]}, 49 | index=pd.date_range(start='2018/01/01', end='2018/07/01', 50 | freq='M')) 51 | 52 | 53 | # In[5]: 54 | 55 | 56 | df.head() 57 | 58 | 59 | # In[6]: 60 | 61 | 62 | df.shape 63 | 64 | 65 | # In[7]: 66 | 67 | 68 | # area plot 69 | 70 | 71 | # In[8]: 72 | 73 | 74 | df.plot.area() # = df.plot(kind = 'area') 75 | 76 | 77 | # In[9]: 78 | 79 | 80 | df.plot.area(stacked = False) 81 | 82 | 83 | # In[12]: 84 | 85 | 86 | df.plot.area(figsize = (9, 5)) 87 | 88 | 89 | # In[17]: 90 | 91 | 92 | # barplots 93 | 94 | 95 | # In[16]: 96 | 97 | 98 | speed = [0.1, 17.5, 40, 48, 52, 69, 88] 99 | lifespan = [2, 8, 70, 1.5, 25, 12, 28] 100 | index = ['snail', 'pig', 'elephant','rabbit', 'giraffe', 'coyote', 'horse'] 101 | df = pd.DataFrame({'speed': speed,'lifespan': lifespan}, index=index) 102 | df.head() 103 | 104 | 105 | # In[18]: 106 | 107 | 108 | df.plot.bar() 109 | 110 | 111 | # In[41]: 112 | 113 | 114 | df.plot.bar(figsize = (9, 6), rot = 0) 115 | plt.axhline(50, color = 'green', ls = '--') 116 | 117 | 118 | # In[44]: 119 | 120 | 121 | labels=['Snail', 'Pig', 'Elephant','Rabbit', 'Giraffe', 'Coyote', 'Horse'] 122 | 123 | 124 | # In[46]: 125 | 126 | 127 | g = df.plot.bar(figsize = (9, 6), rot = 0) 128 | g.set_xticklabels(labels) 129 | for p in g.patches: 130 | g.annotate((p.get_height()), (p.get_x()+0.02, p.get_height()+0.5)) 131 | 132 | 133 | # In[47]: 134 | 135 | 136 | income = [100, 80, 150, 48, 52, 69, 88] 137 | expense = [30, 100, 100, 20, 75, 50, 28] 138 | index = ['snail', 'pig', 'elephant','rabbit', 'giraffe', 'coyote', 'horse'] 139 | df = pd.DataFrame({'income': 
income,'expense': expense}, index=index) 140 | df.head() 141 | 142 | 143 | # In[48]: 144 | 145 | 146 | df.plot.bar() 147 | 148 | 149 | # In[49]: 150 | 151 | 152 | df.plot.bar(stacked = True) 153 | 154 | 155 | # In[50]: 156 | 157 | 158 | df['profit_loss'] = df['income'] - df['expense'] 159 | 160 | 161 | # In[52]: 162 | 163 | 164 | df.plot.bar(figsize = (8, 4)) 165 | 166 | 167 | # In[53]: 168 | 169 | 170 | # histograms 171 | 172 | 173 | # In[54]: 174 | 175 | 176 | mpg = sns.load_dataset('mpg') 177 | 178 | 179 | # In[55]: 180 | 181 | 182 | mpg.head() 183 | 184 | 185 | # In[56]: 186 | 187 | 188 | mpg['horsepower'].plot.hist(bins = 20) 189 | 190 | 191 | # In[57]: 192 | 193 | 194 | mpg['horsepower'].plot(kind = 'hist', bins = 20) 195 | 196 | 197 | # In[59]: 198 | 199 | 200 | df1.head() 201 | 202 | 203 | # In[61]: 204 | 205 | 206 | df1['B'].plot() 207 | 208 | 209 | # In[62]: 210 | 211 | 212 | df1['B'].plot.line() 213 | 214 | 215 | # In[63]: 216 | 217 | 218 | df1.plot(y = 'B') 219 | 220 | 221 | # In[66]: 222 | 223 | 224 | mpg.groupby('model_year')['horsepower'].mean().plot() 225 | 226 | 227 | # In[67]: 228 | 229 | 230 | mpg.groupby('model_year')['horsepower'].mean().plot.line() 231 | 232 | 233 | # In[68]: 234 | 235 | 236 | mpg.groupby('model_year')['mpg'].mean().plot.line(ls = '--') 237 | 238 | 239 | # In[ ]: 240 | 241 | 242 | 243 | 244 | -------------------------------------------------------------------------------- /Pandas/Pandas Built-in DV_U.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[18]: 5 | 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns 11 | 12 | 13 | # In[8]: 14 | 15 | 16 | df1 = pd.read_csv('df1', index_col = 0) 17 | 18 | 19 | # In[24]: 20 | 21 | 22 | df1.head() 23 | 24 | 25 | # In[14]: 26 | 27 | 28 | df2 = pd.read_csv('df2') 29 | 30 | 31 | # In[15]: 32 | 33 | 34 | df2.head() 35 | 36 | 37 | # In[19]: 38 | 39 | 40 | df1['A'].hist() 41 | 42 | 43 | # In[20]: 44 | 45 | 46 | df1['A'].hist(bins = 30) 47 | 48 | 49 | # In[25]: 50 | 51 | 52 | df1['A'].plot(kind = 'hist') 53 | 54 | 55 | # In[26]: 56 | 57 | 58 | df1['A'].plot(kind = 'hist', bins = 30) 59 | 60 | 61 | # In[27]: 62 | 63 | 64 | df1['A'].plot.hist() 65 | 66 | 67 | # In[28]: 68 | 69 | 70 | df1['A'].plot.hist(bins = 30) 71 | 72 | 73 | # In[29]: 74 | 75 | 76 | df2.head() 77 | 78 | 79 | # In[30]: 80 | 81 | 82 | df2.plot.area() 83 | 84 | 85 | # In[32]: 86 | 87 | 88 | df2.plot.area(figsize = (10, 7), alpha = 0.4) 89 | 90 | 91 | # In[33]: 92 | 93 | 94 | df2.plot.bar() 95 | 96 | 97 | # In[36]: 98 | 99 | 100 | df2.plot(kind = 'bar', figsize = (10, 7)) 101 | 102 | 103 | # In[37]: 104 | 105 | 106 | df2 107 | 108 | 109 | # In[38]: 110 | 111 | 112 | df2.plot.bar(stacked = True) 113 | 114 | 115 | # In[41]: 116 | 117 | 118 | sns.set(style = 'darkgrid') 119 | df1['A'].plot.hist(bins = 40) 120 | 121 | 122 | # In[42]: 123 | 124 | 125 | df1.head() 126 | 127 | 128 | # In[56]: 129 | 130 | 131 | df1.plot.line(y = 'B', figsize = (12, 3)) 132 | 133 | 134 | # In[55]: 135 | 136 | 137 | df1.plot.line(y = 'B', figsize = (12, 3), lw = 1) 138 | 139 | 140 | # In[57]: 141 | 142 | 143 | df1.plot.scatter('A', 'B') 144 | 145 | 146 | # In[58]: 147 | 148 | 149 | df1.plot.scatter('A', 'B', c = 'C') 150 | 151 | 152 | # In[60]: 153 | 154 | 155 | df1.plot.scatter('A', 'B', c = 'C', cmap = 'coolwarm', figsize = (10, 7)) 156 | 157 | 158 | # In[62]: 159 | 160 | 161 | df1.plot.scatter('A', 'B', s = df1['C']*100) 162 
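# A self-contained sketch of the same scatter idea on synthetic data, since df1 comes from a local CSV that is not shipped here; the column names A/B/C below are placeholders, not the real file's schema.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
rng = np.random.default_rng(42)
demo = pd.DataFrame(rng.standard_normal((100, 3)), columns=['A', 'B', 'C'])
demo.plot.scatter('A', 'B',
                  c='C', cmap='coolwarm',     # the third column drives the color
                  s=demo['C'].abs() * 100)    # ...and the marker size (sizes must be non-negative)
plt.show()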
| 163 | 164 | # In[64]: 165 | 166 | 167 | df2.plot.box(figsize = (10, 7)) 168 | 169 | 170 | # In[65]: 171 | 172 | 173 | df= pd.DataFrame(np.random.randn(1000, 2), columns = ['a', 'b']) 174 | 175 | 176 | # In[67]: 177 | 178 | 179 | df.head() 180 | 181 | 182 | # In[70]: 183 | 184 | 185 | df.plot.hexbin('a', 'b', figsize = (10, 7), gridsize = 25) 186 | 187 | 188 | # In[71]: 189 | 190 | 191 | df.plot.hexbin('a', 'b', figsize = (10, 7), gridsize = 25, cmap = 'coolwarm') 192 | 193 | 194 | # In[72]: 195 | 196 | 197 | df2['a'] 198 | 199 | 200 | # In[73]: 201 | 202 | 203 | df2['a'].plot.kde() 204 | 205 | 206 | # In[74]: 207 | 208 | 209 | df2['a'].plot.density() 210 | 211 | 212 | # In[75]: 213 | 214 | 215 | df2.plot.density() 216 | 217 | 218 | # In[ ]: 219 | 220 | 221 | 222 | 223 | -------------------------------------------------------------------------------- /Pandas/Pandas-(Aggregation, Groupby, Operations).py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Aggregation & Groupby 5 | # 6 | # The ``groupby`` method allows you to group rows of data together and call aggregate functions 7 | 8 | # ### Basic aggregation methods: 9 | # 10 | # * ``count()`` Compute count of group 11 | # * ``mean()`` Compute mean of groups 12 | # * ``median()`` Compute median of groups 13 | # * ``min()`` Compute min of group values 14 | # * ``max()`` Compute max of group values 15 | # * ``std()`` Standard deviation of groups 16 | # * ``var()`` Compute variance of groups 17 | # * ``sum()`` Compute sum of group values 18 | # * ``describe()``Generates descriptive statistics 19 | 20 | # In[1]: 21 | 22 | 23 | import pandas as pd 24 | import numpy as np 25 | import seaborn as sns 26 | 27 | 28 | # In[2]: 29 | 30 | 31 | df = sns.load_dataset("planets") 32 | 33 | 34 | # In[3]: 35 | 36 | 37 | df 38 | 39 | 40 | # In[5]: 41 | 42 | 43 | df.head(2) 44 | 45 | 46 | # In[6]: 47 | 48 | 49 | df.shape 50 | 51 | 52 | # In[7]: 53 | 54 | 55 | df.info() 56 | 57 | 58 | # In[8]: 59 | 60 | 61 | df['mass'] 62 | 63 | 64 | # In[9]: 65 | 66 | 67 | df["mass"].mean() 68 | 69 | 70 | # In[10]: 71 | 72 | 73 | df["mass"].count() 74 | 75 | 76 | # In[11]: 77 | 78 | 79 | df["mass"].min() 80 | 81 | 82 | # In[12]: 83 | 84 | 85 | df["mass"].max() 86 | 87 | 88 | # In[13]: 89 | 90 | 91 | df["mass"].sum() 92 | 93 | 94 | # In[14]: 95 | 96 | 97 | df["mass"].std() 98 | 99 | 100 | # In[15]: 101 | 102 | 103 | df["mass"].var() 104 | 105 | 106 | # In[16]: 107 | 108 | 109 | df.describe() 110 | 111 | 112 | # In[18]: 113 | 114 | 115 | df.describe().T 116 | 117 | 118 | # In[17]: 119 | 120 | 121 | df.describe().transpose() 122 | 123 | 124 | # - # ``df.groupby()`` 125 | 126 | # In[19]: 127 | 128 | 129 | df.head() 130 | 131 | 132 | # In[20]: 133 | 134 | 135 | df.info() 136 | 137 | 138 | # In[21]: 139 | 140 | 141 | df['method'].unique() 142 | 143 | 144 | # In[22]: 145 | 146 | 147 | df['method'].nunique() 148 | 149 | 150 | # In[26]: 151 | 152 | 153 | df['mass'].value_counts(dropna = False) 154 | 155 | 156 | # In[24]: 157 | 158 | 159 | df["method"].value_counts() 160 | 161 | 162 | # In[27]: 163 | 164 | 165 | df.groupby("method") 166 | 167 | 168 | # In[28]: 169 | 170 | 171 | df.groupby("method").max() 172 | 173 | 174 | # In[29]: 175 | 176 | 177 | df.groupby("method").mean() 178 | 179 | 180 | # In[30]: 181 | 182 | 183 | df.groupby("method").mean()['distance'] 184 | 185 | 186 | # In[31]: 187 | 188 | 189 | df.groupby("method").mean()[['distance']] 190 | 191 | 192 | # In[33]: 193 | 194 | 195 | 
df.groupby("method").describe()['year'] 196 | 197 | 198 | # In[34]: 199 | 200 | 201 | df 202 | 203 | 204 | # In[35]: 205 | 206 | 207 | df.groupby('year')['distance'].sum() 208 | 209 | 210 | # In[36]: 211 | 212 | 213 | data = {'Company':['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'], 214 | 'Person':['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'], 215 | 'Sales':[200, 120, 340, 124, 243, 350]} 216 | 217 | 218 | # In[37]: 219 | 220 | 221 | df1 = pd.DataFrame(data) 222 | 223 | 224 | # In[38]: 225 | 226 | 227 | df1 228 | 229 | 230 | # In[39]: 231 | 232 | 233 | df1.groupby('Company')[['Sales']].mean() 234 | 235 | 236 | # In[40]: 237 | 238 | 239 | df1.groupby('Company').min() 240 | 241 | 242 | # In[41]: 243 | 244 | 245 | df1.groupby('Company').sum() 246 | 247 | 248 | # - # ``DataFrame`` Operations 249 | 250 | # - ### Common Operations 👈 251 | 252 | # There are lots of operations with pandas that will be really useful to you, but don't fall into any distinct category. Let's show **Common Operations** here in this lecture: 253 | 254 | # - Quick review and refresh 255 | 256 | # In[42]: 257 | 258 | 259 | df2 = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abc','def','ghi','xyz']}) 260 | df2.head() 261 | 262 | 263 | # ### Info on Unique Values 264 | 265 | # In[43]: 266 | 267 | 268 | df2["col2"].unique() 269 | 270 | 271 | # In[44]: 272 | 273 | 274 | df2["col2"].nunique() 275 | 276 | 277 | # In[45]: 278 | 279 | 280 | df2["col2"].value_counts() 281 | 282 | 283 | # In[46]: 284 | 285 | 286 | df['mass'].value_counts(dropna = False) 287 | 288 | 289 | # ### Selecting Data 290 | 291 | # In[47]: 292 | 293 | 294 | df2 295 | 296 | 297 | # In[48]: 298 | 299 | 300 | df2['col1'] > 2 301 | 302 | 303 | # In[49]: 304 | 305 | 306 | df2[df2['col1'] > 2] 307 | 308 | 309 | # In[50]: 310 | 311 | 312 | df2[(df2['col1'] > 2) & (df2['col2'] == 444)] 313 | 314 | 315 | # In[51]: 316 | 317 | 318 | df2[(df2['col1']>2) | (df2['col2']==444)] 319 | 320 | 321 | # **Get column and index names:** 322 | 323 | # In[52]: 324 | 325 | 326 | df2 327 | 328 | 329 | # In[53]: 330 | 331 | 332 | df2.columns 333 | 334 | 335 | # In[54]: 336 | 337 | 338 | df.columns 339 | 340 | 341 | # In[55]: 342 | 343 | 344 | df2.shape 345 | 346 | 347 | # In[56]: 348 | 349 | 350 | df2.index 351 | 352 | 353 | # In[ ]: 354 | 355 | 356 | 357 | 358 | 359 | # In[58]: 360 | 361 | 362 | df4 = df.groupby("method")["distance"].describe() 363 | 364 | 365 | # In[59]: 366 | 367 | 368 | df4 369 | 370 | 371 | # In[60]: 372 | 373 | 374 | df4.index 375 | 376 | 377 | # **Sorting and Ordering a DataFrame:** 378 | 379 | # In[61]: 380 | 381 | 382 | df2 383 | 384 | 385 | # In[62]: 386 | 387 | 388 | df2.sort_values(by = 'col2') 389 | 390 | 391 | # In[63]: 392 | 393 | 394 | df2.sort_values(by = 'col2', ascending = False, inplace = True) 395 | 396 | 397 | # In[64]: 398 | 399 | 400 | df2 401 | 402 | 403 | # - ### `.transform()` 404 | # - ### `.apply()` 405 | 406 | # ### ``.transform()`` 407 | 408 | # In[65]: 409 | 410 | 411 | df4 = pd.DataFrame({'groups': ['A', 'B', 'C', 'A', 'B', 'C'], 412 | 'var1': [10,23,33,22,11,99], 413 | 'var2': [100,253,333,262,111,969]}) 414 | df4 415 | 416 | 417 | # In[66]: 418 | 419 | 420 | df4["var1"]*9 421 | 422 | 423 | # In[67]: 424 | 425 | 426 | df_numeric = df4.iloc[:, 1:3] 427 | 428 | 429 | # In[68]: 430 | 431 | 432 | df_numeric 433 | 434 | 435 | # In[69]: 436 | 437 | 438 | df_numeric.transform(lambda x : (x-x.mean()) / x.std()) 439 | 440 | 441 | # In[70]: 442 | 443 | 444 | df_numeric.iloc[0,0] 445 | 446 | 447 | # In[71]: 448 | 449 | 
450 | (df_numeric.iloc[0,0] - df_numeric['var1'].mean()) / df_numeric['var1'].std() 451 | 452 | 453 | # In[72]: 454 | 455 | 456 | df_numeric.transform(lambda x : np.log10(x)) 457 | 458 | 459 | # In[73]: 460 | 461 | 462 | df_numeric.transform(np.log10) 463 | 464 | 465 | # ### ``.apply()`` 466 | 467 | # In[74]: 468 | 469 | 470 | df4 = pd.DataFrame({'groups': ['A', 'B', 'C', 'A', 'B', 'C'], 471 | 'var1': [10,23,33,22,11,99], 472 | 'var2': [100,253,333,262,111,969]}) 473 | df4 474 | 475 | 476 | # In[82]: 477 | 478 | 479 | df4.apply('mean') 480 | 481 | 482 | # In[76]: 483 | 484 | 485 | df4['var1'].sum() 486 | 487 | 488 | # In[77]: 489 | 490 | 491 | df4['groups'].sum() 492 | 493 | 494 | # In[84]: 495 | 496 | 497 | df_numeric 498 | 499 | 500 | # In[85]: 501 | 502 | 503 | df_numeric.apply(np.median) 504 | 505 | 506 | # In[ ]: 507 | 508 | 509 | df_numeric 510 | 511 | 512 | # In[87]: 513 | 514 | 515 | df_numeric.apply(np.mean, axis = 1) 516 | 517 | 518 | # In[88]: 519 | 520 | 521 | df4 522 | 523 | 524 | # In[89]: 525 | 526 | 527 | df4.groupby("groups").apply(np.mean) 528 | 529 | 530 | # In[90]: 531 | 532 | 533 | df4.groupby("groups").mean() 534 | 535 | 536 | # In[91]: 537 | 538 | 539 | df2 = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abcc','de','ghi','xyzzz']}) 540 | 541 | df2 542 | 543 | 544 | # In[92]: 545 | 546 | 547 | def times2(x): 548 | return x * 2 549 | 550 | 551 | # In[93]: 552 | 553 | 554 | df2["col1"].apply(times2) 555 | 556 | 557 | # In[94]: 558 | 559 | 560 | df2["col3"].apply(len) 561 | 562 | 563 | # ### `df.transform() vs df.apply()` 564 | 565 | # In[95]: 566 | 567 | 568 | df2 569 | 570 | 571 | # In[96]: 572 | 573 | 574 | df2.transform(len) 575 | 576 | 577 | # In[97]: 578 | 579 | 580 | df2["col3"].transform(len) 581 | 582 | 583 | # In[98]: 584 | 585 | 586 | df2.apply(len) 587 | 588 | 589 | # In[99]: 590 | 591 | 592 | df1 = pd.DataFrame([["a", 9, 25]] * 4, columns=["grp", 'P', 'Q']) 593 | df2 = pd.DataFrame([["b", 9, 25]] * 3, columns=["grp", 'P', 'Q']) 594 | df3 = pd.concat([df1, df2], ignore_index=True) 595 | df3 596 | 597 | 598 | # In[100]: 599 | 600 | 601 | df3.apply(lambda x : x + x) 602 | 603 | 604 | # In[101]: 605 | 606 | 607 | df3.transform(lambda y : y + y) 608 | 609 | 610 | # In[102]: 611 | 612 | 613 | df3 614 | 615 | 616 | # In[103]: 617 | 618 | 619 | df3.groupby("grp").apply(sum) 620 | 621 | 622 | # In[106]: 623 | 624 | 625 | df3.groupby("grp").transform(np.mean) 626 | 627 | 628 | # In[107]: 629 | 630 | 631 | df3.groupby("grp").sum() 632 | 633 | 634 | # In[108]: 635 | 636 | 637 | df3 638 | 639 | 640 | # In[109]: 641 | 642 | 643 | df3.groupby("grp").transform(len) 644 | 645 | 646 | # In[110]: 647 | 648 | 649 | df3.iloc[0:4] 650 | 651 | 652 | # In[111]: 653 | 654 | 655 | len(df3.iloc[0:4]) 656 | 657 | 658 | # In[112]: 659 | 660 | 661 | df3.groupby("grp").apply(len) 662 | 663 | 664 | # ### Pivot Tables 665 | 666 | # In[113]: 667 | 668 | 669 | titanic = sns.load_dataset("titanic") 670 | 671 | 672 | # In[114]: 673 | 674 | 675 | titanic.head() 676 | 677 | 678 | # In[115]: 679 | 680 | 681 | titanic.groupby("sex")[["survived"]].mean() 682 | 683 | 684 | # In[116]: 685 | 686 | 687 | titanic.groupby(["sex", "class"])[["survived"]].mean() 688 | 689 | 690 | # In[117]: 691 | 692 | 693 | titanic.groupby(["sex", "class"])[["survived"]].mean().T 694 | 695 | 696 | # In[118]: 697 | 698 | 699 | titanic.groupby(["sex", "class"])[["survived"]].mean().unstack() 700 | 701 | 702 | # In[ ]: 703 | 704 | 705 | 706 | 707 | 708 | # ### Using pivot table 709 | 710 | # - Create a 
spreadsheet-style pivot table as a ``DataFrame``. 711 | 712 | # In[120]: 713 | 714 | 715 | titanic.pivot_table(values = "survived", index = "sex", columns = "class", aggfunc = 'sum') 716 | 717 | 718 | # In[ ]: 719 | 720 | 721 | titanic.head(2) 722 | 723 | 724 | # In[ ]: 725 | 726 | 727 | titanic.pivot_table("age", index = "sex", columns = "class") 728 | 729 | 730 | # In[ ]: 731 | 732 | 733 | titanic.pivot_table("age", index = "class", columns = "sex") 734 | 735 | 736 | # In[121]: 737 | 738 | 739 | data = {'A':['foo', 'foo', 'foo', 'bar', 'bar', 'bar'], 740 | 'B':['one', 'one', 'two', 'two', 'one', 'one'], 741 | 'C':['x', 'y', 'x', 'y', 'x', 'y'], 742 | 'D':[1, 3, 2, 5, 4, 1]} 743 | 744 | df5 = pd.DataFrame(data) 745 | 746 | df5 747 | 748 | 749 | # In[122]: 750 | 751 | 752 | df5.pivot_table(values = "D", index = ["A", "B"], columns = "C") 753 | 754 | 755 | # # The End of the Session 756 | -------------------------------------------------------------------------------- /Pandas/Pandas-(Missing Values, Outliers).py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Missing Values & Outliers 5 | 6 | # - # Handling with Missing Values 7 | 8 | # In[1]: 9 | 10 | 11 | import pandas as pd 12 | import numpy as np 13 | 14 | 15 | # In[2]: 16 | 17 | 18 | df = pd.DataFrame({'A':[1, 2, np.nan], 19 | 'B':[5, np.nan, np.nan], 20 | 'C':[1, 2, 3]}) 21 | 22 | 23 | # In[3]: 24 | 25 | 26 | df 27 | 28 | 29 | # In[4]: 30 | 31 | 32 | df.dropna() 33 | 34 | 35 | # In[5]: 36 | 37 | 38 | df.dropna(axis = 1) 39 | 40 | 41 | # In[6]: 42 | 43 | 44 | df 45 | 46 | 47 | # In[8]: 48 | 49 | 50 | df.dropna(thresh = 1) 51 | 52 | 53 | # In[9]: 54 | 55 | 56 | df.fillna(value = "xxx") 57 | 58 | 59 | # In[10]: 60 | 61 | 62 | df['A'] 63 | 64 | 65 | # In[11]: 66 | 67 | 68 | df['A'].mean() 69 | 70 | 71 | # In[12]: 72 | 73 | 74 | df["A"].fillna(value = df["A"].mean()) 75 | 76 | 77 | # In[ ]: 78 | 79 | 80 | df 81 | 82 | 83 | # In[13]: 84 | 85 | 86 | V1 = np.array([2,3,5,np.NaN,7,1,np.NaN,10,14]) 87 | V2 = np.array([8,np.NaN,5,8,11,np.NaN,np.NaN,2,3]) 88 | V3 = np.array([np.NaN,13,5,6,13,7,np.NaN,3,30]) 89 | df = pd.DataFrame( 90 | {"Var1" : V1, 91 | "Var2" : V2, 92 | "Var3" : V3} 93 | ) 94 | 95 | df 96 | 97 | 98 | # In[14]: 99 | 100 | 101 | df.isnull() 102 | 103 | 104 | # In[16]: 105 | 106 | 107 | df.notnull() 108 | 109 | 110 | # In[15]: 111 | 112 | 113 | df.isnull().sum() 114 | 115 | 116 | # In[17]: 117 | 118 | 119 | df.notnull().sum() 120 | 121 | 122 | # In[18]: 123 | 124 | 125 | df.isnull().sum().sum() 126 | 127 | 128 | # In[19]: 129 | 130 | 131 | df 132 | 133 | 134 | # In[20]: 135 | 136 | 137 | df['Var1'].isnull() 138 | 139 | 140 | # In[21]: 141 | 142 | 143 | df['Var1'][df['Var1'].isnull()] 144 | 145 | 146 | # In[ ]: 147 | 148 | 149 | df 150 | 151 | 152 | # ### Missing Values Handling Methods 153 | 154 | # - #### Dropping 155 | 156 | # In[22]: 157 | 158 | 159 | df 160 | 161 | 162 | # In[23]: 163 | 164 | 165 | df.dropna() 166 | 167 | 168 | # In[24]: 169 | 170 | 171 | df.dropna(how = "all") 172 | 173 | 174 | # In[26]: 175 | 176 | 177 | df 178 | 179 | 180 | # In[25]: 181 | 182 | 183 | df.dropna(axis = 1) 184 | 185 | 186 | # In[27]: 187 | 188 | 189 | df 190 | 191 | 192 | # In[28]: 193 | 194 | 195 | df.dropna(axis = 1, how = "all") 196 | 197 | 198 | # In[29]: 199 | 200 | 201 | df["delete_me"] = np.nan 202 | 203 | 204 | # In[30]: 205 | 206 | 207 | df 208 | 209 | 210 | # In[31]: 211 | 212 | 213 | df.dropna(axis = 1, how = "all", inplace = True) 214 | 215 
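# Since thresh appeared earlier in this section, here is a tiny sketch of its meaning: thresh is the minimum number of NON-null values a row (or column, with axis=1) needs in order to be kept. The frame below is made up purely for illustration.
import numpy as np
import pandas as pd
demo = pd.DataFrame({'A': [1, np.nan, np.nan],
                     'B': [5, 6, np.nan],
                     'C': [9, 10, 11]})
print(demo.dropna(thresh=2))           # keeps rows with at least 2 real values
print(demo.dropna(axis=1, thresh=3))   # keeps only columns with 3 real values (here: C)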
| 216 | # In[32]: 217 | 218 | 219 | df 220 | 221 | 222 | # - #### Filling 223 | 224 | # In[33]: 225 | 226 | 227 | df 228 | 229 | 230 | # - Filling with a specific value 231 | 232 | # In[34]: 233 | 234 | 235 | df["Var1"] 236 | 237 | 238 | # In[35]: 239 | 240 | 241 | df["Var1"].fillna(0) 242 | 243 | 244 | # In[36]: 245 | 246 | 247 | df.fillna(value = 0) 248 | 249 | 250 | # - Filling with any Proper Value 251 | 252 | # In[37]: 253 | 254 | 255 | df 256 | 257 | 258 | # In[38]: 259 | 260 | 261 | df["Var1"].mean() 262 | 263 | 264 | # In[39]: 265 | 266 | 267 | df["Var1"].fillna(value = df["Var1"].mean()) 268 | 269 | 270 | # In[40]: 271 | 272 | 273 | df 274 | 275 | 276 | # In[44]: 277 | 278 | 279 | df['Var2'].mean() 280 | 281 | 282 | # In[55]: 283 | 284 | 285 | df.apply(lambda x : x.fillna(value = x.mean())) 286 | 287 | 288 | # In[47]: 289 | 290 | 291 | df.mean() 292 | 293 | 294 | # In[46]: 295 | 296 | 297 | df.fillna(df.mean()) 298 | 299 | 300 | # In[48]: 301 | 302 | 303 | df 304 | 305 | 306 | # In[49]: 307 | 308 | 309 | df.fillna({"Var1" : 6, "Var2": 6.16}) 310 | 311 | 312 | # In[56]: 313 | 314 | 315 | df["Var3"].fillna(df["Var3"].median()) 316 | 317 | 318 | # - Filling with any Proper Value Regarding to Group of the Categorical Variables 319 | 320 | # In[57]: 321 | 322 | 323 | Var1 = np.array([1,3,6,np.NaN,7,1,9,np.NaN,15]) 324 | Var2 = np.array([7,np.NaN,5,8,12,np.NaN,np.NaN,2,3]) 325 | Var3 = np.array([np.NaN,12,5,6,14,7,np.NaN,2,31]) 326 | Var4 = np.array(["IT","IT","IT","HR","HR","HR","HR","IT","IT"]) 327 | 328 | df = pd.DataFrame( 329 | {"salary" : Var1, 330 | "Var2" : Var2, 331 | "Var3" : Var3, 332 | "department" : Var4} 333 | ) 334 | 335 | df 336 | 337 | 338 | # In[58]: 339 | 340 | 341 | df.groupby("department")["salary"].mean() 342 | 343 | 344 | # In[59]: 345 | 346 | 347 | df.groupby("department")["salary"].transform(np.mean) 348 | 349 | 350 | # In[60]: 351 | 352 | 353 | df.groupby("department")["salary"].apply(np.mean) 354 | 355 | 356 | # In[61]: 357 | 358 | 359 | df["salary"].fillna(value = df.groupby("department")["salary"].transform(np.mean)) 360 | 361 | 362 | # In[62]: 363 | 364 | 365 | df["salary"].fillna(value = df.groupby("department")["salary"].apply(np.mean)) 366 | 367 | 368 | # In[63]: 369 | 370 | 371 | df.salary.fillna({0:1, 1:2, 2:3, 3:4, 4:5, 5:6, 6:7, 7:8, 8:9}) 372 | 373 | 374 | # - Filling the Missing Values of Categorical Values 375 | 376 | # In[64]: 377 | 378 | 379 | V1 = np.array([1,3,6,np.NaN,7,1,np.NaN,9,15]) 380 | V4 = np.array(["IT",np.nan,"HR","HR","HR","HR",np.nan,"IT","HR"], dtype=object) 381 | 382 | df = pd.DataFrame( 383 | {"salary" : V1, 384 | "department" : V4} 385 | ) 386 | 387 | df 388 | 389 | 390 | # In[65]: 391 | 392 | 393 | df["department"].mode()[0] 394 | 395 | 396 | # In[66]: 397 | 398 | 399 | df["department"].fillna(df["department"].mode()[0]) 400 | 401 | 402 | # In[67]: 403 | 404 | 405 | df 406 | 407 | 408 | # In[68]: 409 | 410 | 411 | df["department"].fillna(method = "bfill") 412 | 413 | 414 | # In[69]: 415 | 416 | 417 | df["department"].fillna(method = "ffill") 418 | 419 | 420 | # In[70]: 421 | 422 | 423 | df 424 | 425 | 426 | # In[71]: 427 | 428 | 429 | df.drop('department', axis = 1) 430 | 431 | 432 | # In[72]: 433 | 434 | 435 | df.drop(index = 1) 436 | 437 | 438 | # In[ ]: 439 | 440 | 441 | #df.farazi.fillna(method = "ffill", limit = 2) 442 | #df.farazi.fillna(method = "bfill", limit = 2) 443 | 444 | 445 | # In[ ]: 446 | 447 | 448 | #df.fillna(value = "unique1", limit=10, inplace=True) 449 | #df.fillna("unique2", limit=30, inplace=True) 450 | 
#df.fillna("unique3", limit=25, inplace=True) 451 | #df.fillna("unique4", limit=35, inplace=True) 452 | 453 | 454 | # In[73]: 455 | 456 | 457 | df = pd.DataFrame({"A":[None, 1, 2, 3, None, None], 458 | "B":[11, 5, None, None, None, 8], 459 | "C":[None, 5, 10, 11, None, 8]}) 460 | 461 | 462 | # In[74]: 463 | 464 | 465 | df 466 | 467 | 468 | # In[82]: 469 | 470 | 471 | df.fillna(method = "ffill", limit = 2) 472 | 473 | 474 | # - # Handling with Outliers 475 | 476 | # ## Catching and Detecting Outliers 477 | 478 | # In[83]: 479 | 480 | 481 | import seaborn as sns 482 | df = sns.load_dataset('diamonds') 483 | df = df.select_dtypes(include = ['float64', 'int64']) 484 | df = df.dropna() 485 | df.head() 486 | 487 | 488 | # In[84]: 489 | 490 | 491 | import matplotlib.pyplot as plt 492 | 493 | 494 | # In[85]: 495 | 496 | 497 | plt.figure(figsize=(20,15)) 498 | sns.boxplot(x = df['table']) 499 | 500 | 501 | # In[86]: 502 | 503 | 504 | df['table'].describe() 505 | 506 | 507 | # In[87]: 508 | 509 | 510 | df_table = df["table"] 511 | 512 | 513 | # In[88]: 514 | 515 | 516 | df_table.head() 517 | 518 | 519 | # In[89]: 520 | 521 | 522 | pd.DataFrame(df_table).info() 523 | 524 | 525 | # In[90]: 526 | 527 | 528 | len(df_table) 529 | 530 | 531 | # ## Tukey's Fences | Tukey's Rule 532 | 533 | # - First way of specifying ``Q1 & Q3`` is using the ``.quantile()`` method 534 | 535 | # In[91]: 536 | 537 | 538 | df_table.describe() 539 | 540 | 541 | # In[178]: 542 | 543 | 544 | df_table.quantile(0.25) 545 | 546 | 547 | # In[93]: 548 | 549 | 550 | Q1 = df_table.quantile(0.25) 551 | Q3 = df_table.quantile(0.75) 552 | IQR = Q3 - Q1 553 | 554 | 555 | # In[94]: 556 | 557 | 558 | Q1 559 | 560 | 561 | # In[95]: 562 | 563 | 564 | Q3 565 | 566 | 567 | # In[96]: 568 | 569 | 570 | IQR 571 | 572 | 573 | # - Second way of specifying ``Q1 & Q3`` is using the ``.describe()`` method 574 | 575 | # In[97]: 576 | 577 | 578 | lower_lim = Q1 - 1.5 * IQR 579 | upper_lim = Q3 + 1.5 * IQR 580 | 581 | 582 | # In[98]: 583 | 584 | 585 | lower_lim 586 | 587 | 588 | # In[99]: 589 | 590 | 591 | upper_lim 592 | 593 | 594 | # In[100]: 595 | 596 | 597 | (df_table < lower_lim) 598 | 599 | 600 | # In[101]: 601 | 602 | 603 | (df_table > upper_lim) 604 | 605 | 606 | # In[102]: 607 | 608 | 609 | outliers_15_low = (df_table < lower_lim) 610 | 611 | 612 | # In[103]: 613 | 614 | 615 | outliers_15_up = (df_table > upper_lim) 616 | 617 | 618 | # In[104]: 619 | 620 | 621 | df_table[outliers_15_low] 622 | 623 | 624 | # In[105]: 625 | 626 | 627 | len(df_table[outliers_15_low]) 628 | 629 | 630 | # In[106]: 631 | 632 | 633 | df_table[outliers_15_up] 634 | 635 | 636 | # In[107]: 637 | 638 | 639 | len(df_table) - (len(df_table[outliers_15_low]) + len(df_table[outliers_15_up])) 640 | 641 | 642 | # In[108]: 643 | 644 | 645 | df_table[(outliers_15_low | outliers_15_up)] 646 | 647 | 648 | # *** 649 | 650 | # In[109]: 651 | 652 | 653 | lower_lim = Q1 - 2.5 * IQR 654 | upper_lim = Q3 + 2.5 * IQR 655 | 656 | 657 | # In[110]: 658 | 659 | 660 | lower_lim 661 | 662 | 663 | # In[111]: 664 | 665 | 666 | upper_lim 667 | 668 | 669 | # In[112]: 670 | 671 | 672 | (df_table < lower_lim) | (df_table > upper_lim) 673 | 674 | 675 | # In[113]: 676 | 677 | 678 | outliers_25 = (df_table < lower_lim) | (df_table > upper_lim) 679 | 680 | 681 | # In[114]: 682 | 683 | 684 | df_table[outliers_25] 685 | 686 | 687 | # ### Removing the Outliers 688 | 689 | # In[121]: 690 | 691 | 692 | df_table[~(outliers_15_low | outliers_15_up)] 693 | 694 | 695 | # In[117]: 696 | 697 | 698 | df 699 | 700 | 701 
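# The fence arithmetic above is repeated for each multiplier, so here is a small reusable sketch; the helper name iqr_mask is ours, and k=1.5 / k=2.5 mirror the two fence widths used in this notebook.
def iqr_mask(s, k=1.5):
    # True where a value falls outside [Q1 - k*IQR, Q3 + k*IQR]
    q1, q3 = s.quantile(0.25), s.quantile(0.75)
    iqr = q3 - q1
    return (s < q1 - k * iqr) | (s > q3 + k * iqr)
# usage on the same column:
# outliers = iqr_mask(df['table'])   # Tukey's 1.5 * IQR fences
# clean_df = df[~outliers]           # drop the flagged rows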
| # In[123]: 702 | 703 | 704 | clean_df = df[~(outliers_15_low | outliers_15_up)] 705 | 706 | 707 | # In[119]: 708 | 709 | 710 | clean_df 711 | 712 | 713 | # ### Limitation and Transformation of the Outliers 714 | 715 | # - ### Limitation using ``.winsorize()`` method 716 | 717 | # In[124]: 718 | 719 | 720 | from scipy.stats.mstats import winsorize 721 | 722 | 723 | # In[127]: 724 | 725 | 726 | df 727 | 728 | 729 | # In[128]: 730 | 731 | 732 | df_table 733 | 734 | 735 | # In[129]: 736 | 737 | 738 | sns.boxplot(x = df_table) 739 | 740 | 741 | # In[130]: 742 | 743 | 744 | sns.distplot(df_table, bins = 15, kde = False) 745 | 746 | 747 | # In[176]: 748 | 749 | 750 | df_table.describe() 751 | 752 | 753 | # In[125]: 754 | 755 | 756 | df_table.quantile(0.01) 757 | 758 | 759 | # In[126]: 760 | 761 | 762 | df_table.quantile(0.98) 763 | 764 | 765 | # In[131]: 766 | 767 | 768 | df_table_win = winsorize(df_table, (0.01, 0.02)) 769 | 770 | 771 | # In[132]: 772 | 773 | 774 | df_table_win 775 | 776 | 777 | # In[133]: 778 | 779 | 780 | sns.boxplot(x = df_table_win) 781 | 782 | 783 | # In[134]: 784 | 785 | 786 | sns.distplot(df_table_win, bins = 10, kde =False) 787 | 788 | 789 | # In[135]: 790 | 791 | 792 | pd.DataFrame(df_table_win) 793 | 794 | 795 | # In[136]: 796 | 797 | 798 | pd.DataFrame(df_table_win)[0] 799 | 800 | 801 | # In[137]: 802 | 803 | 804 | df_table_win = pd.DataFrame(df_table_win)[0] 805 | 806 | 807 | # In[139]: 808 | 809 | 810 | df_table_win.describe() 811 | 812 | 813 | # In[138]: 814 | 815 | 816 | df_table.describe() 817 | 818 | 819 | # In[ ]: 820 | 821 | 822 | df_table.quantile(0.01) 823 | 824 | 825 | # In[140]: 826 | 827 | 828 | df_table.quantile(0.98) 829 | 830 | 831 | # In[ ]: 832 | 833 | 834 | df_table_win.describe() 835 | 836 | 837 | # In[141]: 838 | 839 | 840 | df_table.sort_values().head(20) 841 | 842 | 843 | # In[147]: 844 | 845 | 846 | df_table_win.sort_values().head(50) 847 | 848 | 849 | # In[143]: 850 | 851 | 852 | df_table_win[df_table_win == 53] 853 | 854 | 855 | # In[144]: 856 | 857 | 858 | df_table[df_table == 53] 859 | 860 | 861 | # In[145]: 862 | 863 | 864 | df_table_win[df_table_win == 63] 865 | 866 | 867 | # In[146]: 868 | 869 | 870 | df_table[df_table == 63] 871 | 872 | 873 | # In[149]: 874 | 875 | 876 | Q1 = 56.0 877 | Q3 = 59.0 878 | 879 | 880 | # In[150]: 881 | 882 | 883 | IQR = Q3 - Q1 884 | 885 | 886 | # In[151]: 887 | 888 | 889 | lower = Q1 - 1.5 * IQR 890 | upper = Q3 + 1.5 * IQR 891 | 892 | 893 | # In[152]: 894 | 895 | 896 | lower 897 | 898 | 899 | # In[153]: 900 | 901 | 902 | upper 903 | 904 | 905 | # In[154]: 906 | 907 | 908 | outliers_15 = (df_table_win < lower) | (df_table_win > upper) 909 | 910 | 911 | # In[155]: 912 | 913 | 914 | df_table[outliers_15] 915 | 916 | 917 | # In[156]: 918 | 919 | 920 | df["table_win"] = df_table_win 921 | 922 | 923 | # In[157]: 924 | 925 | 926 | df.head() 927 | 928 | 929 | # - ### ``log()`` Transformation 930 | 931 | # In[158]: 932 | 933 | 934 | df.info() 935 | 936 | 937 | # In[159]: 938 | 939 | 940 | df_carat = df["carat"] 941 | 942 | 943 | # In[160]: 944 | 945 | 946 | df_carat.shape 947 | 948 | 949 | # In[161]: 950 | 951 | 952 | df_carat.head() 953 | 954 | 955 | # In[162]: 956 | 957 | 958 | sns.boxplot(x = df_carat) 959 | 960 | 961 | # In[163]: 962 | 963 | 964 | sns.distplot(df_carat, bins = 15, kde = False) 965 | 966 | 967 | # In[164]: 968 | 969 | 970 | df_carat_log = np.log(df_carat) 971 | 972 | 973 | # In[166]: 974 | 975 | 976 | df_carat 977 | 978 | 979 | # In[165]: 980 | 981 | 982 | df_carat_log 983 | 984 | 985 | # 
In[167]: 986 | 987 | 988 | sns.boxplot(x = df_carat_log) 989 | 990 | 991 | # In[174]: 992 | 993 | 994 | sns.distplot(df_carat_log, bins = 11, kde = False) 995 | 996 | 997 | # In[ ]: 998 | 999 | 1000 | df["carat_log"] = np.log(df["carat"]) 1001 | 1002 | 1003 | # In[ ]: 1004 | 1005 | 1006 | df.head() 1007 | 1008 | -------------------------------------------------------------------------------- /Pandas/Pandas_1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # SF Salaries Exercise 5 | # 6 | # Welcome to a quick exercise for you to practice your pandas skills! We will be using the [SF Salaries Dataset](https://www.kaggle.com/kaggle/sf-salaries) from Kaggle! Just follow along and complete the tasks outlined in bold below. The tasks will get harder and harder as you go along. 7 | 8 | # ** Import pandas as pd.** 9 | 10 | # In[1]: 11 | 12 | 13 | import pandas as pd 14 | 15 | 16 | # ** Read Salaries.csv as a dataframe called sal.** 17 | 18 | # In[2]: 19 | 20 | 21 | sal = pd.read_csv('Salaries.csv') 22 | 23 | 24 | # ** Check the head of the DataFrame. ** 25 | 26 | # In[3]: 27 | 28 | 29 | sal.head() 30 | 31 | 32 | # ** Use the .info() method to find out how many entries there are.** 33 | 34 | # In[4]: 35 | 36 | 37 | sal.info() 38 | 39 | 40 | # **What is the average BasePay ?** 41 | 42 | # In[11]: 43 | 44 | 45 | sal['BasePay'].mean() 46 | 47 | 48 | # ** What is the highest amount of OvertimePay in the dataset ? ** 49 | 50 | # In[23]: 51 | 52 | 53 | sal['OvertimePay'].sort_values(ascending = False)[1] 54 | 55 | 56 | # In[75]: 57 | 58 | 59 | sal['OvertimePay'].max() 60 | 61 | 62 | # ** What is the job title of JOSEPH DRISCOLL ? Note: Use all caps, otherwise you may get an answer that doesn't match up (there is also a lowercase Joseph Driscoll). ** 63 | 64 | # In[16]: 65 | 66 | 67 | sal['JobTitle'][sal['EmployeeName'] == 'JOSEPH DRISCOLL'] 68 | 69 | 70 | # In[76]: 71 | 72 | 73 | sal[sal['EmployeeName'] == 'JOSEPH DRISCOLL']['JobTitle'] 74 | 75 | 76 | # ** How much does JOSEPH DRISCOLL make (including benefits)? ** 77 | 78 | # In[18]: 79 | 80 | 81 | sal['TotalPayBenefits'][sal['EmployeeName'] == 'JOSEPH DRISCOLL'] 82 | 83 | 84 | # In[77]: 85 | 86 | 87 | sal[sal['EmployeeName'] == 'JOSEPH DRISCOLL']['TotalPayBenefits'] 88 | 89 | 90 | # ** What is the name of highest paid person (including benefits)?** 91 | 92 | # In[82]: 93 | 94 | 95 | sal.sort_values(by = 'TotalPayBenefits', ascending = False).iloc[[0]] 96 | 97 | 98 | # In[80]: 99 | 100 | 101 | sal[sal['TotalPayBenefits'] == sal['TotalPayBenefits'].max()] 102 | 103 | 104 | # In[86]: 105 | 106 | 107 | sal.loc[[sal['TotalPayBenefits'].idxmax()]] # idxmax() is similar numpy argmax() 108 | 109 | 110 | # In[87]: 111 | 112 | 113 | sal.iloc[[sal['TotalPayBenefits'].argmax()]] 114 | 115 | 116 | # ** What is the name of lowest paid person (including benefits)? Do you notice something strange about how much he or she is paid?** 117 | 118 | # In[32]: 119 | 120 | 121 | sal.sort_values(by = 'TotalPayBenefits').iloc[[0]] 122 | 123 | 124 | # In[89]: 125 | 126 | 127 | sal[sal['TotalPayBenefits'] == sal['TotalPayBenefits'].min()] 128 | 129 | 130 | # In[93]: 131 | 132 | 133 | sal.loc[[sal['TotalPayBenefits'].idxmin()]] 134 | 135 | 136 | # In[95]: 137 | 138 | 139 | sal.iloc[[sal['TotalPayBenefits'].argmin()]] 140 | 141 | 142 | # ** What was the average (mean) BasePay of all employees per year? (2011-2014) ? 
** 143 | 144 | # In[40]: 145 | 146 | 147 | sal['Year'].unique() 148 | 149 | 150 | # In[113]: 151 | 152 | 153 | sal.groupby('Year').mean()['BasePay'] 154 | 155 | 156 | # In[116]: 157 | 158 | 159 | sal.groupby('Year').mean()['BasePay'][[2011, 2013]] 160 | 161 | 162 | # ** How many unique job titles are there? ** 163 | 164 | # In[45]: 165 | 166 | 167 | sal['JobTitle'].nunique() 168 | 169 | 170 | # In[117]: 171 | 172 | 173 | len(sal['JobTitle'].unique()) 174 | 175 | 176 | # ** What are the top 5 most common jobs? ** 177 | 178 | # In[48]: 179 | 180 | 181 | sal['JobTitle'].value_counts()[:5] 182 | 183 | 184 | # In[119]: 185 | 186 | 187 | sal['JobTitle'].value_counts().head(5) 188 | 189 | 190 | # In[120]: 191 | 192 | 193 | type(sal['JobTitle'].value_counts()) 194 | 195 | 196 | # ** How many Job Titles were represented by only one person in 2013? (e.g. Job Titles with only one occurence in 2013?) ** 197 | 198 | # In[126]: 199 | 200 | 201 | sal[['JobTitle']][sal['Year'] == 2013].nunique() 202 | 203 | 204 | # In[128]: 205 | 206 | 207 | sal[['JobTitle']][sal['Year'] == 2013].value_counts() == 1 208 | 209 | 210 | # In[129]: 211 | 212 | 213 | sum(sal[['JobTitle']][sal['Year'] == 2013].value_counts() == 1) 214 | 215 | 216 | # ** How many people have the word Chief in their job title? (This is pretty tricky) ** 217 | 218 | # In[151]: 219 | 220 | 221 | sum(sal['JobTitle'].apply(str.lower).str.contains('chief')) 222 | 223 | 224 | # In[159]: 225 | 226 | 227 | sal['JobTitle'][sal['JobTitle'].apply(str.lower).str.contains('chief')].value_counts().index 228 | 229 | 230 | # ** Bonus: Is there a correlation between length of the Job Title string and Salary? ** 231 | 232 | # In[71]: 233 | 234 | 235 | title_len = sal['JobTitle'].apply(len) 236 | 237 | 238 | # In[72]: 239 | 240 | 241 | import numpy as np 242 | 243 | 244 | # In[73]: 245 | 246 | 247 | np.corrcoef(title_len, sal['TotalPayBenefits']) 248 | 249 | 250 | # In[152]: 251 | 252 | 253 | sal['title_len'] = sal['JobTitle'].apply(len) 254 | 255 | 256 | # In[161]: 257 | 258 | 259 | sal[['TotalPayBenefits', 'title_len']].corr('pearson') 260 | 261 | 262 | # # Great Job! 263 | -------------------------------------------------------------------------------- /Pandas/Pandas_2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Ecommerce Purchases Exercise 5 | # 6 | # In this Exercise you will be given some Fake Data about some purchases done through Amazon! Just go ahead and follow the directions and try your best to answer the questions and complete the tasks. Feel free to reference the solutions. Most of the tasks can be solved in different ways. For the most part, the questions get progressively harder. 7 | # 8 | # Please excuse anything that doesn't make "Real-World" sense in the dataframe, all the data is fake and made-up. 9 | # 10 | # Also note that all of these questions can be answered with one line of code. 11 | # ____ 12 | # ** Import pandas and read in the Ecommerce Purchases csv file and set it to a DataFrame called ecom. ** 13 | 14 | # In[1]: 15 | 16 | 17 | import pandas as pd 18 | import numpy as np 19 | 20 | 21 | # In[2]: 22 | 23 | 24 | ecom = pd.read_csv('Ecommerce Purchases') 25 | 26 | 27 | # **Check the head of the DataFrame.** 28 | 29 | # In[3]: 30 | 31 | 32 | ecom.head() 33 | 34 | 35 | # ** How many rows and columns are there? ** 36 | 37 | # In[4]: 38 | 39 | 40 | ecom.info() 41 | 42 | 43 | # ** What is the average Purchase Price? 
** 44 | 45 | # In[5]: 46 | 47 | 48 | ecom['Purchase Price'].mean() 49 | 50 | 51 | # ** What were the highest and lowest purchase prices? ** 52 | 53 | # In[6]: 54 | 55 | 56 | ecom['Purchase Price'].max() 57 | 58 | 59 | # In[7]: 60 | 61 | 62 | ecom['Purchase Price'].min() 63 | 64 | 65 | # ** How many people have English 'en' as their Language of choice on the website? ** 66 | 67 | # In[8]: 68 | 69 | 70 | ecom['Language'].unique() 71 | 72 | 73 | # In[10]: 74 | 75 | 76 | ecom[ecom['Language'] == 'en'].count() 77 | 78 | 79 | # ** How many people have the job title of "Lawyer" ? ** 80 | # 81 | 82 | # In[13]: 83 | 84 | 85 | ecom[ecom['Job'] == 'Lawyer'].info() 86 | 87 | 88 | # ** How many people made the purchase during the AM and how many people made the purchase during PM ? ** 89 | # 90 | # **(Hint: Check out [value_counts()](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.value_counts.html) ) ** 91 | 92 | # In[14]: 93 | 94 | 95 | ecom['AM or PM'].value_counts() 96 | 97 | 98 | # ** What are the 5 most common Job Titles? ** 99 | 100 | # In[26]: 101 | 102 | 103 | ecom['Job'].value_counts().head() 104 | 105 | 106 | # ** Someone made a purchase that came from Lot: "90 WT" , what was the Purchase Price for this transaction? ** 107 | 108 | # In[27]: 109 | 110 | 111 | ecom.head() 112 | 113 | 114 | # In[28]: 115 | 116 | 117 | ecom[ecom['Lot'] == '90 WT']['Purchase Price'] 118 | 119 | 120 | # ** What is the email of the person with the following Credit Card Number: 4926535242672853 ** 121 | 122 | # In[30]: 123 | 124 | 125 | ecom[ecom['Credit Card'] == 4926535242672853]['Email'] 126 | 127 | 128 | # ** How many people have American Express as their Credit Card Provider *and* made a purchase above $95 ?** 129 | 130 | # In[31]: 131 | 132 | 133 | ecom[(ecom['CC Provider'] == 'American Express') & (ecom['Purchase Price'] > 95)].count() 134 | 135 | 136 | # ** Hard: How many people have a credit card that expires in 2025? ** 137 | 138 | # In[35]: 139 | 140 | 141 | ecom[ecom['CC Exp Date'].str.contains('25')]['CC Security Code'].count() 142 | 143 | 144 | # ** Hard: What are the top 5 most popular email providers/hosts (e.g. gmail.com, yahoo.com, etc...) ** 145 | 146 | # In[39]: 147 | 148 | 149 | ecom['Email'].apply(lambda email : email.split('@')[1]).value_counts().head() 150 | 151 | 152 | # # Great Job! 
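# One caveat on the expiry question above: .str.contains('25') matches '25' anywhere in the MM/YY string, so any stray '25' outside the year field would be counted too. Anchoring on the year part is safer; the four sample strings below are made up.
import pandas as pd
demo = pd.Series(['02/25', '11/20', '25/11', '12/25'])
print(demo.str.contains('25').sum())       # 3 -> also counts the malformed '25/11'
print(demo.str.endswith('/25').sum())      # 2 -> only true year-25 expiries
print(demo.str.match(r'\d{2}/25$').sum())  # 2 -> the same idea as an anchored regex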
153 | -------------------------------------------------------------------------------- /Pandas/Pandas_3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | 10 | 11 | # In[3]: 12 | 13 | 14 | df = pd.read_csv('movies_metadata.csv', error_bad_lines=False) 15 | 16 | 17 | # In[4]: 18 | 19 | 20 | df = pd.read_csv('movies_metadata.csv', error_bad_lines=False, encoding = 'latin-1') 21 | 22 | 23 | # In[9]: 24 | 25 | 26 | df.info() 27 | 28 | 29 | # In[10]: 30 | 31 | 32 | df.head() 33 | 34 | 35 | # In[13]: 36 | 37 | 38 | df.loc[2] 39 | 40 | 41 | # In[14]: 42 | 43 | 44 | df.shape 45 | 46 | 47 | # In[16]: 48 | 49 | 50 | df.columns 51 | 52 | 53 | # In[18]: 54 | 55 | 56 | df[['title', 'genres']] 57 | 58 | 59 | # In[15]: 60 | 61 | 62 | df[df['original_title'] == 'Grumpier Old Men'] 63 | 64 | 65 | # In[5]: 66 | 67 | 68 | df.iloc[4] 69 | 70 | 71 | # In[20]: 72 | 73 | 74 | df.info() 75 | 76 | 77 | # In[21]: 78 | 79 | 80 | df2 = df[['title', 'release_date', 'budget', 'revenue', 'runtime']] 81 | 82 | 83 | # In[22]: 84 | 85 | 86 | df2.head(2) 87 | 88 | 89 | # In[24]: 90 | 91 | 92 | df.head(10) 93 | 94 | 95 | # In[25]: 96 | 97 | 98 | df.sort_values(by = 'release_date') 99 | 100 | 101 | # In[34]: 102 | 103 | 104 | df['release_date'].dtype 105 | 106 | 107 | # In[37]: 108 | 109 | 110 | df[df['release_date'] > '1995-01-01'] 111 | 112 | 113 | # In[38]: 114 | 115 | 116 | df.columns 117 | 118 | 119 | # In[39]: 120 | 121 | 122 | df.sort_values('runtime', ascending = False) 123 | 124 | 125 | # In[40]: 126 | 127 | 128 | df.info() 129 | 130 | 131 | # In[44]: 132 | 133 | 134 | df['budget'].value_counts() 135 | 136 | 137 | # In[70]: 138 | 139 | 140 | df[(df['revenue'] >= 2000000) & (df['budget'] <= 1000000)] 141 | 142 | 143 | # In[50]: 144 | 145 | 146 | df['runtime'].max() 147 | 148 | 149 | # In[51]: 150 | 151 | 152 | df['runtime'].min() 153 | 154 | 155 | # In[52]: 156 | 157 | 158 | df.info() 159 | 160 | 161 | # In[53]: 162 | 163 | 164 | df['vote_count'].value_counts() 165 | 166 | 167 | # In[54]: 168 | 169 | 170 | df.describe() 171 | 172 | 173 | # In[6]: 174 | 175 | 176 | df['vote_count'].quantile(0.70) 177 | 178 | 179 | # In[8]: 180 | 181 | 182 | df[(df['runtime'] >= 30) & (df['runtime'] <= 360)]['title'] 183 | 184 | 185 | # In[73]: 186 | 187 | 188 | df.info() 189 | 190 | 191 | # In[74]: 192 | 193 | 194 | df[['title', 'vote_count']] 195 | 196 | -------------------------------------------------------------------------------- /Pandas/Pandas_Class2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | # missing values / outliers 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | import pandas as pd 14 | import numpy as np 15 | 16 | 17 | # In[3]: 18 | 19 | 20 | df = pd.DataFrame({'A':[1, 2, np.nan], 21 | 'B':[5, np.nan, np.nan], 22 | 'C':[1, 2, 3]}) 23 | 24 | 25 | # In[4]: 26 | 27 | 28 | df 29 | 30 | 31 | # In[5]: 32 | 33 | 34 | df.dropna() 35 | 36 | 37 | # In[6]: 38 | 39 | 40 | df.dropna(axis = 1) 41 | 42 | 43 | # In[7]: 44 | 45 | 46 | df.dropna(thresh = 2) 47 | 48 | 49 | # In[8]: 50 | 51 | 52 | df.fillna(value = 'milk') 53 | 54 | 55 | # In[9]: 56 | 57 | 58 | V1 = np.array([2,3,5,np.NaN,7,1,np.NaN,10,14]) 59 | V2 = np.array([8,np.NaN,5,8,11,np.NaN,np.NaN,2,3]) 60 | V3 = np.array([np.NaN,13,5,6,13,7,np.NaN,3,30]) 61 | df = pd.DataFrame( 62 | {"Var1" : V1, 63 | "Var2" : V2, 64 | "Var3" 
: V3} 65 | ) 66 | df 67 | 68 | 69 | # In[10]: 70 | 71 | 72 | df.isnull() 73 | 74 | 75 | # In[11]: 76 | 77 | 78 | df.notnull() 79 | 80 | 81 | # In[12]: 82 | 83 | 84 | df.isnull().sum() 85 | 86 | 87 | # In[13]: 88 | 89 | 90 | df.notnull().sum() 91 | 92 | 93 | # In[14]: 94 | 95 | 96 | df['Var1'].isnull() 97 | 98 | 99 | # In[15]: 100 | 101 | 102 | df[df['Var1'].isnull()] 103 | 104 | 105 | # In[16]: 106 | 107 | 108 | df.isnull() 109 | 110 | 111 | # In[17]: 112 | 113 | 114 | df 115 | 116 | 117 | # In[18]: 118 | 119 | 120 | df.isnull().any(axis=0) 121 | 122 | 123 | # In[19]: 124 | 125 | 126 | df.isnull().all(axis=1) 127 | 128 | 129 | # In[20]: 130 | 131 | 132 | df.isnull().any(axis=1) 133 | 134 | 135 | # In[21]: 136 | 137 | 138 | df[df.isnull().any(axis=1)] 139 | 140 | 141 | # In[22]: 142 | 143 | 144 | df[~df.isnull().any(axis=1)] # ~ means against 145 | 146 | 147 | # In[23]: 148 | 149 | 150 | # handle with missing values 151 | 152 | 153 | # In[24]: 154 | 155 | 156 | df.dropna() 157 | 158 | 159 | # In[25]: 160 | 161 | 162 | df.dropna(how = 'all') 163 | 164 | 165 | # In[26]: 166 | 167 | 168 | df.dropna(how = 'any') 169 | 170 | 171 | # In[27]: 172 | 173 | 174 | df['Var1'] 175 | 176 | 177 | # In[28]: 178 | 179 | 180 | df['Var1'].fillna(0) 181 | 182 | 183 | # In[29]: 184 | 185 | 186 | df['Var1'].fillna(df['Var1'].mean()) 187 | 188 | 189 | # In[30]: 190 | 191 | 192 | df.apply(lambda x : x.fillna(x.mean())) 193 | 194 | 195 | # In[31]: 196 | 197 | 198 | df.mean() 199 | 200 | 201 | # In[32]: 202 | 203 | 204 | df.fillna(df.mean()) 205 | 206 | 207 | # In[33]: 208 | 209 | 210 | df.fillna({'Var1' : 6, 'Var2' : 6.16}) 211 | 212 | 213 | # In[34]: 214 | 215 | 216 | #where 217 | 218 | 219 | # In[35]: 220 | 221 | 222 | df.where(pd.notnull(df), df.mean(), axis = 1) 223 | 224 | 225 | # In[36]: 226 | 227 | 228 | Var1 = np.array([1,3,6,np.NaN,7,1,9,np.NaN,15]) 229 | Var2 = np.array([7,np.NaN,5,8,12,np.NaN,np.NaN,2,3]) 230 | Var3 = np.array([np.NaN,12,5,6,14,7,np.NaN,2,31]) 231 | Var4 = np.array(["IT","IT","IT","HR","HR","HR","HR","IT","IT"]) 232 | df = pd.DataFrame( 233 | {"salary" : Var1, 234 | "Var2" : Var2, 235 | "Var3" : Var3, 236 | "department" : Var4} 237 | ) 238 | 239 | 240 | # In[37]: 241 | 242 | 243 | df 244 | 245 | 246 | # In[38]: 247 | 248 | 249 | df.groupby('department')['salary'].mean() 250 | 251 | 252 | # In[39]: 253 | 254 | 255 | df['salary'].fillna({0:1, 1:2, 2:3, 3:4, 4:5, 5:6, 6:7, 7:8, 8:9}) 256 | 257 | 258 | # In[40]: 259 | 260 | 261 | df 262 | 263 | 264 | # In[41]: 265 | 266 | 267 | df.groupby('department')['salary'].transform('mean') 268 | 269 | 270 | # In[42]: 271 | 272 | 273 | df.salary.fillna(df.groupby('department')['salary'].transform('mean')) 274 | 275 | 276 | # In[43]: 277 | 278 | 279 | V1 = np.array([1,3,6,np.NaN,7,1,np.NaN,9,15]) 280 | V4 = np.array(["IT",np.nan,"HR","HR","HR","HR",np.nan,"IT","HR"], dtype=object) 281 | df = pd.DataFrame( 282 | {"salary" : V1, 283 | "department" : V4} 284 | ) 285 | df 286 | 287 | 288 | # In[44]: 289 | 290 | 291 | df['department'].fillna(df['department'].mode()[0]) # fill missing values with mode of column 292 | 293 | 294 | # In[45]: 295 | 296 | 297 | df['department'].fillna(method = 'bfill') # back fill 298 | 299 | 300 | # In[46]: 301 | 302 | 303 | #df['department'].fillna(a, method = 'ffill', limit = 200) 304 | #df['department'].fillna(b, method = 'ffill', limit = 100) 305 | #df['department'].fillna(a, method = 'ffill', limit = 100) 306 | #df['department'].fillna(c, method = 'ffill', limit = 100) 307 | #df['department'].fillna(a, method = 'ffill', 
limit = 100) 308 | #df['department'].fillna(b, method = 'ffill', limit = 200) 309 | #df['department'].fillna(c, method = 'ffill', limit = 100) 310 | #df['department'].fillna(a, method = 'ffill', limit = 100) 311 | 312 | 313 | # In[47]: 314 | 315 | 316 | df['department'].fillna(method = 'ffill') # forward fill 317 | 318 | 319 | # In[48]: 320 | 321 | 322 | # outliers 323 | 324 | 325 | # In[49]: 326 | 327 | 328 | import seaborn as sns 329 | df = sns.load_dataset('diamonds') 330 | df = df.select_dtypes(include = ['float64', 'int64']) 331 | df = df.dropna() 332 | df.head() 333 | 334 | 335 | # In[50]: 336 | 337 | 338 | sns.boxplot(df['table']) 339 | 340 | 341 | # In[51]: 342 | 343 | 344 | df_table = df['table'] 345 | 346 | 347 | # In[52]: 348 | 349 | 350 | df_table.head() 351 | 352 | 353 | # In[53]: 354 | 355 | 356 | pd.DataFrame(df_table).info() 357 | 358 | 359 | # In[54]: 360 | 361 | 362 | q1 = df_table.quantile(0.25) 363 | q3 = df_table.quantile(0.75) 364 | iqr = q3 - q1 365 | 366 | 367 | # In[55]: 368 | 369 | 370 | q3 371 | 372 | 373 | # In[56]: 374 | 375 | 376 | q1 377 | 378 | 379 | # In[57]: 380 | 381 | 382 | iqr 383 | 384 | 385 | # In[58]: 386 | 387 | 388 | df.describe() 389 | 390 | 391 | # In[59]: 392 | 393 | 394 | lower_lim = q1 - 1.5 * iqr 395 | upper_lim = q3 + 1.5 * iqr 396 | 397 | 398 | # In[60]: 399 | 400 | 401 | lower_lim 402 | 403 | 404 | # In[61]: 405 | 406 | 407 | upper_lim 408 | 409 | 410 | # In[62]: 411 | 412 | 413 | outliers_15_low = df_table < lower_lim 414 | 415 | 416 | # In[63]: 417 | 418 | 419 | outliers_15_up = df_table > upper_lim 420 | 421 | 422 | # In[64]: 423 | 424 | 425 | df_table[outliers_15_low] 426 | 427 | 428 | # In[65]: 429 | 430 | 431 | df_table[outliers_15_up] 432 | 433 | 434 | # In[66]: 435 | 436 | 437 | df_table[outliers_15_low | outliers_15_up] 438 | 439 | 440 | # In[67]: 441 | 442 | 443 | lower_lim = q1 - 2.5 * iqr 444 | upper_lim = q3 + 2.5 * iqr 445 | 446 | 447 | # In[68]: 448 | 449 | 450 | df_table[(df_table < lower_lim) | (df_table > upper_lim)] 451 | 452 | 453 | # In[69]: 454 | 455 | 456 | #removing the outliers 457 | 458 | 459 | # In[70]: 460 | 461 | 462 | df_table[~(outliers_15_low | outliers_15_up)] 463 | 464 | 465 | # In[71]: 466 | 467 | 468 | clean_df = df[~(outliers_15_low | outliers_15_up)] 469 | 470 | 471 | # In[72]: 472 | 473 | 474 | clean_df # without ouliers (1.5) 475 | 476 | 477 | # In[73]: 478 | 479 | 480 | # limitation winsorize() method 481 | 482 | 483 | # In[74]: 484 | 485 | 486 | from scipy.stats.mstats import winsorize 487 | 488 | 489 | # In[75]: 490 | 491 | 492 | df 493 | 494 | 495 | # In[76]: 496 | 497 | 498 | df_table 499 | 500 | 501 | # In[77]: 502 | 503 | 504 | sns.boxplot(df['table']) 505 | 506 | 507 | # In[78]: 508 | 509 | 510 | sns.distplot(df['table'], kde = False, bins = 15) 511 | 512 | 513 | # In[79]: 514 | 515 | 516 | df_table_win = winsorize(df_table, (0.01, 0.02)) # % 1 from bottom % 2 from top 517 | 518 | 519 | # In[80]: 520 | 521 | 522 | df_table_win 523 | 524 | 525 | # In[81]: 526 | 527 | 528 | sns.distplot(df_table_win, kde = False, bins = 10) 529 | 530 | 531 | # In[82]: 532 | 533 | 534 | sns.boxplot(df_table_win) 535 | 536 | 537 | # In[83]: 538 | 539 | 540 | df['table'].describe() 541 | 542 | 543 | # In[86]: 544 | 545 | 546 | df_table_win = pd.DataFrame(df_table_win)[0] 547 | 548 | 549 | # In[87]: 550 | 551 | 552 | df_table_win.describe() 553 | 554 | 555 | # In[92]: 556 | 557 | 558 | df['table'].sort_values().head(20) 559 | 560 | 561 | # In[93]: 562 | 563 | 564 | df_table_win.sort_values().head(20) # scale values 
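# a quick cross-check of what winsorize() did (a sketch; it assumes the
# (0.01, 0.02) limits map to the 1st and 98th percentiles of df_table):
low_cut, high_cut = np.percentile(df_table, [1, 98])
manual_win = df_table.clip(lower = low_cut, upper = high_cut)
manual_win.describe() # should closely match df_table_win.describe()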
565 | 566 | 567 | # In[94]: 568 | 569 | 570 | df_table_win[11368] 571 | 572 | 573 | # In[95]: 574 | 575 | 576 | df['table'][11368] 577 | 578 | 579 | # In[96]: 580 | 581 | 582 | df_table_win[24815] 583 | 584 | 585 | # In[97]: 586 | 587 | 588 | df['table'][24815] 589 | 590 | 591 | # In[100]: 592 | 593 | 594 | df_table_win[df_table_win == 53] 595 | 596 | 597 | # In[99]: 598 | 599 | 600 | df_table[df_table == 53] 601 | 602 | 603 | # In[103]: 604 | 605 | 606 | df_table_win[df_table_win == 63] # 1180 - 563. because right skewed. 607 | # upper outliers are more than lower. 0.02 from uuper side. 608 | 609 | 610 | # In[104]: 611 | 612 | 613 | df_table[df_table == 63] 614 | 615 | 616 | # In[107]: 617 | 618 | 619 | q1 620 | 621 | 622 | # In[106]: 623 | 624 | 625 | q3 626 | 627 | 628 | # In[108]: 629 | 630 | 631 | iqr 632 | 633 | 634 | # In[111]: 635 | 636 | 637 | lower = q1 - 1.5 * iqr 638 | upper = q3 + 1.5 * iqr 639 | 640 | 641 | # In[112]: 642 | 643 | 644 | lower 645 | 646 | 647 | # In[113]: 648 | 649 | 650 | upper 651 | 652 | 653 | # In[114]: 654 | 655 | 656 | outliers_15 = (df_table_win < lower) | (df_table_win > upper) 657 | 658 | 659 | # In[116]: 660 | 661 | 662 | df_table_win[outliers_15] 663 | 664 | 665 | # In[117]: 666 | 667 | 668 | df_table[(df_table < lower) | (df_table > upper)] 669 | 670 | 671 | # In[131]: 672 | 673 | 674 | df['table_win'] = df_table_win 675 | 676 | 677 | # In[119]: 678 | 679 | 680 | # log() transformation 681 | 682 | 683 | # In[132]: 684 | 685 | 686 | df.info() 687 | 688 | 689 | # In[122]: 690 | 691 | 692 | df_carat = df['carat'] 693 | 694 | 695 | # In[123]: 696 | 697 | 698 | df_carat.head() 699 | 700 | 701 | # In[124]: 702 | 703 | 704 | sns.boxplot(df_carat) 705 | 706 | 707 | # In[127]: 708 | 709 | 710 | sns.distplot(df_carat, bins = 15, kde = False) 711 | 712 | 713 | # In[128]: 714 | 715 | 716 | df_carat_log = np.log(df_carat) 717 | 718 | 719 | # In[129]: 720 | 721 | 722 | sns.distplot(df_carat_log, bins = 15, kde = False) 723 | 724 | 725 | # In[130]: 726 | 727 | 728 | sns.boxplot(df_carat_log) 729 | 730 | 731 | # In[133]: 732 | 733 | 734 | df['carat_log'] = np.log(df['carat']) 735 | 736 | 737 | # In[134]: 738 | 739 | 740 | df.head() 741 | 742 | 743 | # In[1]: 744 | 745 | 746 | import pandas as pd 747 | 748 | 749 | # In[2]: 750 | 751 | 752 | df1 = pd.DataFrame({'lkey': ['x', 'y', 'z', 'c', 'z','x'], 753 | 'lvalue': [2, 3, 5, 7, 0, 99]}) 754 | df2 = pd.DataFrame({'rkey': ['x', 'x', 'z', 'z'], 755 | 'rvalue': [7, 8, 9, 10]}) 756 | 757 | 758 | # In[3]: 759 | 760 | 761 | df1 762 | 763 | 764 | # In[4]: 765 | 766 | 767 | df2 768 | 769 | 770 | # In[5]: 771 | 772 | 773 | pd.merge(df1, df2, left_on = 'lkey', right_on = 'rkey', how = 'left') 774 | 775 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DataScience 2 | There are basic/advanced codes, notes and exercises for data analysis and data visualization. 3 | There are notes about statistic and advanced implement of statistic at python. 4 | Pandas, NumPy, Seaborn, Matplotlib, SciPy, Researchpy, Regex. 
5 | -------------------------------------------------------------------------------- /Seaborn/Seaborn Class2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import seaborn as sns 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | from scipy import stats 12 | 13 | 14 | # In[2]: 15 | 16 | 17 | df = pd.read_csv('cleaned_autos.csv') 18 | 19 | 20 | # In[3]: 21 | 22 | 23 | df.info() 24 | 25 | 26 | # In[4]: 27 | 28 | 29 | pd.set_option('display.max_columns', 27) 30 | 31 | 32 | # In[5]: 33 | 34 | 35 | df.head() 36 | 37 | 38 | # In[6]: 39 | 40 | 41 | df['vehicleType'].unique() 42 | 43 | 44 | # In[7]: 45 | 46 | 47 | df.groupby('vehicleType')['price'].mean().plot.bar() 48 | 49 | 50 | # In[8]: 51 | 52 | 53 | # variation of the price range by vehicle types 54 | 55 | 56 | # In[9]: 57 | 58 | 59 | plt.subplots(figsize = (12, 6)) 60 | sns.boxplot('vehicleType', 'price', data = df) 61 | 62 | 63 | # In[10]: 64 | 65 | 66 | # demonstration of the mean prices by the vehicle type 67 | 68 | 69 | # In[11]: 70 | 71 | 72 | fig, ax = plt.subplots(figsize = (12, 6)) 73 | sns.set(style="darkgrid") 74 | sns.pointplot('vehicleType', 'price', data = df) 75 | ax.set_xticklabels(df['vehicleType'].unique(), rotation = 90); 76 | 77 | 78 | # In[12]: 79 | 80 | 81 | # total count of vehicles by type available on sale 82 | 83 | 84 | # In[13]: 85 | 86 | 87 | plt.subplots(figsize = (12, 6)) 88 | sns.countplot('vehicleType', data = df) 89 | 90 | 91 | # In[6]: 92 | 93 | 94 | df.head() 95 | 96 | 97 | # In[8]: 98 | 99 | 100 | # average price for vehicles based on the type of vehicle as well as on the type of gearbox 101 | 102 | 103 | # In[12]: 104 | 105 | 106 | plt.figure(figsize = (12, 6)) 107 | sns.barplot('vehicleType', 'price', 'gearbox', data = df) 108 | plt.show() 109 | 110 | 111 | # In[13]: 112 | 113 | 114 | # average price for vehicles by fuel type as well as on the type of gearbox 115 | 116 | 117 | # In[14]: 118 | 119 | 120 | plt.figure(figsize = (12, 6)) 121 | sns.barplot('fuelType', 'price', 'gearbox', data = df) 122 | plt.tight_layout() 123 | 124 | -------------------------------------------------------------------------------- /Seaborn/Seaborn_1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | # In[2]: 14 | 15 | 16 | titanic = sns.load_dataset('titanic') 17 | 18 | 19 | # In[3]: 20 | 21 | 22 | titanic.head() 23 | 24 | 25 | # In[4]: 26 | 27 | 28 | titanic.info() 29 | 30 | 31 | # In[60]: 32 | 33 | 34 | sns.set(style = "whitegrid") 35 | sns.jointplot('fare', 'age', data = titanic, xlim = (-100, 600)) 36 | plt.show() 37 | 38 | 39 | # In[59]: 40 | 41 | 42 | sns.distplot(titanic['fare'], kde = False, color = 'red', bins = 30) 43 | plt.xlim(0) 44 | plt.show() 45 | 46 | 47 | # In[57]: 48 | 49 | 50 | sns.boxplot('class', 'age', data = titanic, palette = 'rainbow') 51 | plt.show() 52 | 53 | 54 | # In[61]: 55 | 56 | 57 | sns.swarmplot('class', 'age', data = titanic, palette = 'Set2') 58 | plt.show() 59 | 60 | 61 | # In[22]: 62 | 63 | 64 | sns.countplot('sex', data = titanic) 65 | plt.show() 66 | 67 | 68 | # In[66]: 69 | 70 | 71 | plt.figure(figsize = (8, 5)) 72 | sns.heatmap(titanic.corr(), cmap = 'coolwarm', vmin = -0.8, vmax = 0.8) 73 | 
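# vmin/vmax fix the color limits symmetrically around 0, so coolwarm's
# neutral midpoint stays at zero correlation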
plt.title('titanic.corr()') 74 | plt.show() 75 | 76 | 77 | # In[64]: 78 | 79 | 80 | g = sns.FacetGrid(titanic, col = 'sex', aspect = 1, height = 5) 81 | g.map(sns.distplot, 'age', kde = False, bins = 10) 82 | plt.xlim(0) 83 | plt.tight_layout() 84 | 85 | 86 | # In[63]: 87 | 88 | 89 | g = sns.FacetGrid(titanic, col = 'sex', aspect = 1, height = 5) 90 | g.map(plt.hist, 'age', bins = 10) 91 | plt.xlim(0) 92 | plt.tight_layout() 93 | 94 | -------------------------------------------------------------------------------- /Seaborn/Seaborn_U.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import seaborn as sns 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | get_ipython().run_line_magic('matplotlib', 'inline') 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | # Distribution Plots 20 | 21 | 22 | # In[4]: 23 | 24 | 25 | import ssl 26 | ssl._create_default_https_context = ssl._create_unverified_context 27 | 28 | 29 | # In[5]: 30 | 31 | 32 | tips = sns.load_dataset('tips') 33 | 34 | 35 | # In[6]: 36 | 37 | 38 | tips.head() 39 | 40 | 41 | # In[7]: 42 | 43 | 44 | tips 45 | 46 | 47 | # In[10]: 48 | 49 | 50 | sns.distplot(tips['total_bill']) 51 | 52 | 53 | # In[10]: 54 | 55 | 56 | sns.distplot(tips['total_bill'], kde = False, bins = 20); 57 | 58 | 59 | # In[11]: 60 | 61 | 62 | sns.distplot(tips['total_bill'], kde = False, bins = 40); # histogram, there is a hist = True arg. 63 | 64 | 65 | # In[21]: 66 | 67 | 68 | sns.jointplot(x = tips['total_bill'], y = tips['tip'], data = tips) # scatter plot 69 | 70 | 71 | # In[22]: 72 | 73 | 74 | sns.jointplot(tips['total_bill'], tips['tip'], tips, kind = 'hex') # default kind value is scatter 75 | 76 | 77 | # In[23]: 78 | 79 | 80 | sns.jointplot(tips['total_bill'], tips['tip'], tips, kind = 'reg') 81 | 82 | 83 | # In[25]: 84 | 85 | 86 | tips.head() 87 | 88 | 89 | # In[24]: 90 | 91 | 92 | sns.pairplot(tips) # it is a collected form of jointplots of each numerical variables.scatter/strip 93 | 94 | 95 | # In[26]: 96 | 97 | 98 | sns.pairplot(tips, hue = 'sex') # we can add categorical variable with hue argument 99 | 100 | 101 | # In[27]: 102 | 103 | 104 | sns.pairplot(tips, hue = 'sex', palette = 'coolwarm') 105 | 106 | 107 | # In[28]: 108 | 109 | 110 | sns.rugplot(tips['total_bill']) # it gives distribution without bins (unlike histogram) 111 | # dash mark for every points with uniform distribution 112 | 113 | 114 | # In[32]: 115 | 116 | 117 | sns.distplot(tips['total_bill'], kde = False) 118 | 119 | 120 | # In[33]: 121 | 122 | 123 | sns.distplot(tips['total_bill']) # kde lin e; kernel density estimation 124 | # normal distribution over each point and collected form of norm.dist. 125 | 126 | 127 | # In[34]: 128 | 129 | 130 | sns.kdeplot(tips['total_bill']) 131 | 132 | 133 | # In[ ]: 134 | 135 | 136 | # Categorical Plots 137 | 138 | 139 | # In[5]: 140 | 141 | 142 | import seaborn as sns 143 | get_ipython().run_line_magic('matplotlib', 'inline') 144 | 145 | 146 | # In[6]: 147 | 148 | 149 | tips = sns.load_dataset('tips') 150 | 151 | 152 | # In[7]: 153 | 154 | 155 | tips.head() 156 | 157 | 158 | # In[17]: 159 | 160 | 161 | import numpy as np 162 | 163 | 164 | # In[16]: 165 | 166 | 167 | sns.barplot(x = 'sex', y = 'total_bill', data = tips) # x = categorical, y = numeric 168 | # it gives mean of total_bill 169 | 170 | 171 | # In[18]: 172 | 173 | 174 | sns.barplot(x = 'sex', y = 'total_bill', data = tips, estimator = np.sum) # we can get sum by estimator arg. 
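# estimator accepts any reducing callable -- e.g. the median (a sketch;
# np and tips are already defined above):
sns.barplot(x = 'sex', y = 'total_bill', data = tips, estimator = np.median)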
175 | 176 | 177 | # In[21]: 178 | 179 | 180 | sns.barplot(tips['sex'], tips['total_bill'], estimator = np.std) 181 | 182 | 183 | # In[20]: 184 | 185 | 186 | sns.countplot(tips['sex']) # it counts values 187 | 188 | 189 | # In[23]: 190 | 191 | 192 | sns.countplot('size', data = tips) # a bar chart of value counts 193 | 194 | 195 | # In[46]: 196 | 197 | 198 | sns.violinplot(tips['total_bill']) 199 | 200 | 201 | # In[37]: 202 | 203 | 204 | sns.boxplot(tips['total_bill']) 205 | 206 | 207 | # In[24]: 208 | 209 | 210 | sns.boxplot(tips['day'], tips['total_bill']) # it gives a total_bill boxplot per day 211 | 212 | 213 | # In[26]: 214 | 215 | 216 | sns.boxplot(tips['day'], tips['total_bill'], hue = tips['smoker']) # we can add another categorical variable with hue. 217 | 218 | 219 | # In[27]: 220 | 221 | 222 | sns.violinplot(tips['day'], tips['total_bill']) # KDE of the points; a violinplot is the KDE counterpart of a boxplot. 223 | 224 | 225 | # In[29]: 226 | 227 | 228 | sns.violinplot(tips['day'], tips['total_bill'], hue = tips['smoker']) 229 | 230 | 231 | # In[51]: 232 | 233 | 234 | sns.violinplot(tips['day'], tips['total_bill'], hue = tips['smoker'], split = True) 235 | 236 | 237 | # In[31]: 238 | 239 | 240 | sns.stripplot(x = 'day', y = 'total_bill', data = tips) # like a scatter plot, but with one numeric and one categorical variable 241 | # (a scatter plot uses two numeric variables) 242 | 243 | 244 | # In[33]: 245 | 246 | 247 | sns.stripplot(x = 'day', y = 'total_bill', data = tips, jitter = False) 248 | 249 | 250 | # In[49]: 251 | 252 | 253 | sns.stripplot(x = 'day', y = 'total_bill', data = tips, hue = 'sex', dodge = True) 254 | 255 | 256 | # In[52]: 257 | 258 | 259 | sns.stripplot(x = 'day', y = 'total_bill', data = tips) 260 | 261 | 262 | # In[8]: 263 | 264 | 265 | sns.swarmplot(x = 'day', y = 'total_bill', data = tips) # combines stripplot and violinplot 266 | # best suited to small data sets 267 | 268 | 269 | # In[64]: 270 | 271 | 272 | tips['day'][tips['day'] == 'Fri'].value_counts() 273 | 274 | 275 | # In[60]: 276 | 277 | 278 | sns.violinplot(x = 'day', y = 'total_bill', data = tips) 279 | sns.swarmplot(x = 'day', y = 'total_bill', data = tips, color = 'black') 280 | 281 | 282 | # In[67]: 283 | 284 | 285 | sns.barplot(x = 'day', y = 'total_bill', data = tips) 286 | 287 | 288 | # In[72]: 289 | 290 | 291 | sns.catplot(x = 'day', y = 'total_bill', data = tips, kind = 'bar') # catplot can draw any categorical plot via the kind argument 292 | 293 | 294 | # In[9]: 295 | 296 | 297 | # Matrix Plots 298 | # we need matrix form for heatmap.
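# i.e. rectangular data such as a correlation matrix (corr()) or a pivot
# table, where rows/columns are categories and the cell values drive the color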
299 | 300 | 301 | # In[53]: 302 | 303 | 304 | import seaborn as sns 305 | import matplotlib.pyplot as plt 306 | get_ipython().run_line_magic('matplotlib', 'inline') 307 | 308 | 309 | # In[11]: 310 | 311 | 312 | tips = sns.load_dataset('tips') 313 | 314 | 315 | # In[12]: 316 | 317 | 318 | flights = sns.load_dataset('flights') 319 | 320 | 321 | # In[13]: 322 | 323 | 324 | flights.head() 325 | 326 | 327 | # In[14]: 328 | 329 | 330 | tips.head() 331 | 332 | 333 | # In[15]: 334 | 335 | 336 | tips.corr() 337 | 338 | 339 | # In[22]: 340 | 341 | 342 | sns.heatmap(tips.corr(), annot = True, cmap = 'coolwarm') 343 | 344 | 345 | # In[37]: 346 | 347 | 348 | flights.head() 349 | 350 | 351 | # In[32]: 352 | 353 | 354 | flights.corr() 355 | 356 | 357 | # In[27]: 358 | 359 | 360 | sns.heatmap(flights.corr(), annot = True) 361 | 362 | 363 | # In[40]: 364 | 365 | 366 | fp = flights.pivot_table(values = 'passengers', index = 'month', columns = 'year') 367 | 368 | 369 | # In[41]: 370 | 371 | 372 | fp 373 | 374 | 375 | # In[42]: 376 | 377 | 378 | sns.heatmap(fp) 379 | 380 | 381 | # In[52]: 382 | 383 | 384 | sns.heatmap(fp, linecolor = 'white', linewidths = 1) 385 | 386 | 387 | # In[56]: 388 | 389 | 390 | plt.subplots(figsize=(10,5)) 391 | sns.heatmap(fp, cmap = 'coolwarm', linecolor = 'black', linewidths = 1) 392 | 393 | 394 | # In[58]: 395 | 396 | 397 | sns.clustermap(fp, cmap = 'coolwarm') # it doesn't order. it makes clustersgroups. 398 | #standard_scale = 1 399 | 400 | 401 | # In[59]: 402 | 403 | 404 | # Grids 405 | 406 | 407 | # In[61]: 408 | 409 | 410 | import seaborn as sns 411 | import matplotlib.pyplot as plt 412 | get_ipython().run_line_magic('matplotlib', 'inline') 413 | 414 | 415 | # In[62]: 416 | 417 | 418 | iris = sns.load_dataset('iris') 419 | 420 | 421 | # In[63]: 422 | 423 | 424 | iris.head() 425 | 426 | 427 | # In[64]: 428 | 429 | 430 | sns.pairplot(iris) 431 | 432 | 433 | # In[65]: 434 | 435 | 436 | sns.PairGrid(iris) 437 | 438 | 439 | # In[70]: 440 | 441 | 442 | g = sns.PairGrid(iris) 443 | g.map(plt.scatter) 444 | 445 | 446 | # In[71]: 447 | 448 | 449 | g = sns.PairGrid(iris) 450 | g.map_diag(sns.distplot) 451 | g.map_upper(plt.scatter) 452 | g.map_lower(sns.kdeplot) 453 | 454 | 455 | # In[72]: 456 | 457 | 458 | tips = sns.load_dataset('tips') 459 | 460 | 461 | # In[73]: 462 | 463 | 464 | tips.head() 465 | 466 | 467 | # In[76]: 468 | 469 | 470 | g = sns.FacetGrid(data = tips, col = 'time', row = 'smoker') 471 | 472 | 473 | # In[81]: 474 | 475 | 476 | g = sns.FacetGrid(data = tips, col = 'time', row = 'smoker') 477 | g.map(sns.distplot, 'total_bill') 478 | 479 | 480 | # In[83]: 481 | 482 | 483 | g = sns.FacetGrid(data = tips, col = 'time', row = 'smoker') 484 | g.map(plt.scatter, 'total_bill', 'tip') 485 | 486 | 487 | # In[ ]: 488 | 489 | 490 | 491 | 492 | -------------------------------------------------------------------------------- /Statistics/Statistics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | salary = [102, 33, 26, 27, 30, 25, 33, 33, 24] 8 | import numpy as np 9 | from scipy import stats 10 | 11 | 12 | # In[2]: 13 | 14 | 15 | mean_salary = np.mean(salary) 16 | print('Mean :', mean_salary) 17 | 18 | 19 | # In[6]: 20 | 21 | 22 | median_salary = np.median(salary) 23 | print('Median :', median_salary) 24 | 25 | 26 | # In[4]: 27 | 28 | 29 | stats.iqr(salary) 30 | 31 | 32 | # In[7]: 33 | 34 | 35 | mode_salary = stats.mode(salary) 36 | print('Mode :', mode_salary) 37 | 38 | 
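# note: stats.mode returns a ModeResult(mode, count) pair; the modal value
# itself is mode_salary[0] -- see the stats.mode(age)[0][0] indexing further below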
39 | # In[9]: 40 | 41 | 42 | print('Range :', (np.max(salary) - (np.min(salary)))) 43 | 44 | 45 | # In[10]: 46 | 47 | 48 | print('Variance :', (np.var(salary))) 49 | 50 | 51 | # In[11]: 52 | 53 | 54 | print('Std :', (np.std(salary))) 55 | 56 | 57 | # In[12]: 58 | 59 | 60 | a = [1, 10, 7, 12, 0, 30, 15, 22, 8, 2] 61 | print(np.std(a)) 62 | 63 | 64 | # In[3]: 65 | 66 | 67 | import numpy as np 68 | 69 | 70 | # In[4]: 71 | 72 | 73 | temp = [93, 84, 82, 78, 98, 70] 74 | number_of_people = [13, 10, 11, 8, 15, 9] 75 | 76 | 77 | # In[5]: 78 | 79 | 80 | print('Covariance :', np.cov(temp, number_of_people)) 81 | 82 | 83 | # In[6]: 84 | 85 | 86 | print('Correlation :', np.corrcoef(temp, number_of_people)) 87 | 88 | 89 | # In[1]: 90 | 91 | 92 | import numpy as np 93 | 94 | 95 | # In[17]: 96 | 97 | 98 | np.random.seed(101) 99 | population = np.random.randint(0, 80, 100000) 100 | 101 | 102 | # In[18]: 103 | 104 | 105 | population 106 | 107 | 108 | # In[19]: 109 | 110 | 111 | len(population) 112 | 113 | 114 | # In[20]: 115 | 116 | 117 | np.random.seed(101) 118 | sample = np.random.choice(population, 100) 119 | 120 | 121 | # In[21]: 122 | 123 | 124 | sample 125 | 126 | 127 | # In[11]: 128 | 129 | 130 | len(sample) 131 | 132 | 133 | # In[25]: 134 | 135 | 136 | population.mean() 137 | 138 | 139 | # In[26]: 140 | 141 | 142 | sample.mean() 143 | 144 | 145 | # In[24]: 146 | 147 | 148 | np.random.seed(101) 149 | for i in range(10): 150 | sample = np.random.choice(population, 100) 151 | print(sample.mean()) 152 | 153 | 154 | # In[28]: 155 | 156 | 157 | np.random.seed(101) 158 | sample_means = [] 159 | for i in range(10): 160 | sample = np.random.choice(population, 100) 161 | sample_means.append(sample.mean()) 162 | 163 | 164 | # In[29]: 165 | 166 | 167 | sample_means 168 | 169 | 170 | # In[30]: 171 | 172 | 173 | np.mean(sample_means) 174 | 175 | 176 | # In[31]: 177 | 178 | 179 | population.mean() 180 | 181 | 182 | # In[32]: 183 | 184 | 185 | from scipy.stats import kurtosis, skew 186 | 187 | 188 | # In[33]: 189 | 190 | 191 | get_ipython().run_line_magic('pip', 'install matplotlib') # bare "pip install" only works via IPython automagic 192 | 193 | 194 | # In[1]: 195 | 196 | 197 | import matplotlib.pyplot as plt 198 | 199 | 200 | # In[43]: 201 | 202 | 203 | np.random.seed(42) 204 | x = np.random.normal(0, 2, 100000) 205 | 206 | 207 | # In[44]: 208 | 209 | 210 | plt.hist(x, bins = 100); 211 | 212 | 213 | # In[45]: 214 | 215 | 216 | kurtosis(x) 217 | 218 | 219 | # In[46]: 220 | 221 | 222 | skew(x) 223 | 224 | 225 | # In[6]: 226 | 227 | 228 | import numpy as np 229 | from scipy import stats 230 | 231 | 232 | # In[49]: 233 | 234 | 235 | age = [20, 22, 25, 25, 27, 27, 29, 30, 31, 121] 236 | 237 | 238 | # In[50]: 239 | 240 | 241 | np.mean(age) 242 | 243 | 244 | # In[51]: 245 | 246 | 247 | np.median(age) 248 | 249 | 250 | # In[13]: 251 | 252 | 253 | stats.mode(age) 254 | 255 | 256 | # In[15]: 257 | 258 | 259 | stats.mode(age)[0] 260 | 261 | 262 | # In[16]: 263 | 264 | 265 | stats.mode(age)[0][0] 266 | 267 | 268 | # In[24]: 269 | 270 | 271 | age_2 = [20, 22, 25, 25, 27, 27, 29, 30, 31] 272 | 273 | 274 | # In[25]: 275 | 276 | 277 | np.mean(age_2) 278 | 279 | 280 | # In[26]: 281 | 282 | 283 | np.median(age_2) 284 | 285 | 286 | # In[27]: 287 | 288 | 289 | age_3 = [19, 20, 21, 22] 290 | 291 | 292 | # In[28]: 293 | 294 | 295 | stats.mode(age_3) # when no value repeats, it returns the smallest element with count 1
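# a cross-check (a sketch; pandas is assumed available here): unlike SciPy,
# Series.mode() returns every tied value rather than only the smallest
import pandas as pd
pd.Series(age_3).mode() # -> 19, 20, 21, 22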
296 | 297 | 298 | # In[29]: 299 | 300 | 301 | type(age) 302 | 303 | 304 | # In[30]: 305 | 306 | 307 | # mean in arrays 308 | age_new = np.array(age) 309 | 310 | 311 | # In[31]: 312 | 313 | 314 | age_new 315 | 316 | 317 | # In[32]: 318 | 319 | 320 | type(age_new) 321 | 322 | 323 | # In[33]: 324 | 325 | 326 | age_new.mean() 327 | 328 | 329 | # In[35]: 330 | 331 | 332 | a = np.array([[6, 8, 3, 0], 333 | [3, 2, 1, 7], 334 | [8, 1, 8, 4], 335 | [5, 3, 0, 5], 336 | [4, 7, 5, 9]]) 337 | stats.mode(a) 338 | 339 | 340 | # In[38]: 341 | 342 | 343 | stats.mode(a, axis = 1) 344 | 345 | 346 | # In[52]: 347 | 348 | 349 | # range, sd, variance 350 | age = [20, 22, 25, 25, 27, 27, 27, 29, 30, 31, 121] 351 | 352 | 353 | # In[73]: 354 | 355 | 356 | range = np.max(age) - np.min(age) 357 | print(range) 358 | 359 | 360 | # In[74]: 361 | 362 | 363 | np.ptp(age) # it gives range Peak To Peak ptp 364 | 365 | 366 | # In[54]: 367 | 368 | 369 | np.std(age) 370 | 371 | 372 | # In[55]: 373 | 374 | 375 | np.var(age) 376 | 377 | 378 | # In[56]: 379 | 380 | 381 | age_2 = [20, 22, 25, 25, 27, 27, 27, 29, 30, 31] 382 | 383 | 384 | # In[57]: 385 | 386 | 387 | np.std(age_2) 388 | 389 | 390 | # In[60]: 391 | 392 | 393 | range = np.max(age_2) - np.min(age_2) 394 | 395 | 396 | # In[61]: 397 | 398 | 399 | print(range) 400 | 401 | 402 | # In[62]: 403 | 404 | 405 | # IQR - iqr 406 | 407 | 408 | # In[63]: 409 | 410 | 411 | x=[8, 10, 5, 24, 8, 3, 11, 3, 40, 7, 6, 12, 4] 412 | 413 | 414 | # In[64]: 415 | 416 | 417 | q75, q25 = np.percentile(x, [75, 25]) 418 | 419 | 420 | # In[65]: 421 | 422 | 423 | q75 424 | 425 | 426 | # In[66]: 427 | 428 | 429 | q25 430 | 431 | 432 | # In[67]: 433 | 434 | 435 | sorted(x) 436 | 437 | 438 | # In[68]: 439 | 440 | 441 | iqr = q75-q25 442 | 443 | 444 | # In[69]: 445 | 446 | 447 | iqr 448 | 449 | 450 | # In[70]: 451 | 452 | 453 | stats.iqr(x) 454 | 455 | 456 | # In[71]: 457 | 458 | 459 | np.percentile(x, 75) 460 | 461 | 462 | # In[72]: 463 | 464 | 465 | np.percentile(x, 25) 466 | 467 | 468 | # In[1]: 469 | 470 | 471 | q = [62, 63, 64, 64, 70, 72, 76, 77, 81, 81] 472 | 473 | 474 | # In[3]: 475 | 476 | 477 | from scipy import stats 478 | import numpy as np 479 | 480 | 481 | # In[5]: 482 | 483 | 484 | np.percentile(q, 25) 485 | 486 | 487 | # In[6]: 488 | 489 | 490 | np.percentile(q, 75) 491 | 492 | 493 | # In[7]: 494 | 495 | 496 | stats.iqr(q) 497 | 498 | 499 | # In[8]: 500 | 501 | 502 | np.median(q) 503 | 504 | 505 | # In[9]: 506 | 507 | 508 | np.mean(q) 509 | 510 | 511 | # In[15]: 512 | 513 | 514 | stats.mode(q) 515 | 516 | 517 | # In[16]: 518 | 519 | 520 | # scatter plot 521 | 522 | 523 | # In[17]: 524 | 525 | 526 | # method matplotlib 527 | 528 | 529 | # In[1]: 530 | 531 | 532 | import numpy as np 533 | import matplotlib.pyplot as plt 534 | 535 | 536 | # In[2]: 537 | 538 | 539 | x = np.linspace(0, 5, 11) 540 | 541 | 542 | # In[3]: 543 | 544 | 545 | x 546 | 547 | 548 | # In[4]: 549 | 550 | 551 | y = x ** 2 552 | 553 | 554 | # In[5]: 555 | 556 | 557 | y 558 | 559 | 560 | # In[6]: 561 | 562 | 563 | plt.plot(x, y); # line plot 564 | 565 | 566 | # In[7]: 567 | 568 | 569 | plt.scatter(x, y); #scatter plot 570 | 571 | 572 | # In[16]: 573 | 574 | 575 | np.cov(x, y) 576 | 577 | 578 | # In[8]: 579 | 580 | 581 | # method seaborn 582 | 583 | 584 | # In[9]: 585 | 586 | 587 | import seaborn as sns 588 | 589 | 590 | # In[10]: 591 | 592 | 593 | sns.scatterplot(x, y); 594 | 595 | 596 | # In[14]: 597 | 598 | 599 | sns.jointplot(x, y, kind = 'scatter'); 600 | 601 | 602 | # In[15]: 603 | 604 | 605 | # method pandas 606 | 
607 | 608 | # In[11]: 609 | 610 | 611 | import pandas as pd 612 | 613 | 614 | # In[15]: 615 | 616 | 617 | lst = list(zip(x, y)) # materialize the zip; printing a bare iterator would exhaust it and leave an empty DataFrame 618 | print(lst) 619 | df = pd.DataFrame(lst, columns = ['x', 'y']) 620 | 621 | 622 | # In[18]: 623 | 624 | 625 | df.head() 626 | 627 | 628 | # In[21]: 629 | 630 | 631 | df.plot.scatter('x', 'y'); 632 | 633 | 634 | # In[22]: 635 | 636 | 637 | # method changing linestyle in line plot 638 | 639 | 640 | # In[23]: 641 | 642 | 643 | plt.plot(x, y); 644 | 645 | 646 | # In[24]: 647 | 648 | 649 | plt.plot(x, y, marker = 'o', linestyle = ' '); 650 | 651 | 652 | # In[28]: 653 | 654 | 655 | # boxplot 656 | 657 | 658 | # In[ ]: 659 | 660 | 661 | # boxplot with matplotlib 662 | 663 | 664 | # In[26]: 665 | 666 | 667 | x 668 | 669 | 670 | # In[27]: 671 | 672 | 673 | plt.boxplot(x); 674 | 675 | 676 | # In[30]: 677 | 678 | 679 | arr1 = np.random.randint(100, 200, 100) 680 | 681 | 682 | # In[31]: 683 | 684 | 685 | arr1 686 | 687 | 688 | # In[34]: 689 | 690 | 691 | plt.boxplot(arr1); 692 | 693 | 694 | # In[36]: 695 | 696 | 697 | arr1=np.random.randint(100,200,100) 698 | arr2=np.random.randint(1,50,5) 699 | arr3=np.random.randint(300,350,5) 700 | arr=np.append(arr1, arr2) 701 | arr=np.append(arr, arr3) 702 | plt.boxplot(arr); 703 | 704 | 705 | # In[37]: 706 | 707 | 708 | min(arr) 709 | 710 | 711 | # In[38]: 712 | 713 | 714 | max(arr) 715 | 716 | 717 | # In[39]: 718 | 719 | 720 | from scipy import stats 721 | 722 | 723 | # In[40]: 724 | 725 | 726 | stats.iqr(arr) 727 | 728 | 729 | # In[41]: 730 | 731 | 732 | np.percentile(arr, 25) 733 | 734 | 735 | # In[42]: 736 | 737 | 738 | np.percentile(arr, 75) 739 | 740 | 741 | # In[43]: 742 | 743 | 744 | np.median(arr) 745 | 746 | 747 | # In[45]: 748 | 749 | 750 | sns.boxplot(arr, orient = 'v'); 751 | 752 | 753 | # In[17]: 754 | 755 | 756 | # corr and cov 757 | 758 | 759 | # In[4]: 760 | 761 | 762 | import numpy as np 763 | import matplotlib.pyplot as plt 764 | import seaborn as sns 765 | 766 | 767 | # In[2]: 768 | 769 | 770 | temp=[93,84,82,78,98,70] 771 | number_of_people=[13,10, 11, 8, 15, 9] 772 | 773 | 774 | # In[3]: 775 | 776 | 777 | np.cov(temp, number_of_people)[0, 1] 778 | 779 | 780 | # In[4]: 781 | 782 | 783 | np.corrcoef(temp, number_of_people)[0, 1] 784 | 785 | 786 | # In[7]: 787 | 788 | 789 | df = sns.load_dataset('tips') 790 | 791 | 792 | # In[8]: 793 | 794 | 795 | df.head() 796 | 797 | 798 | # In[9]: 799 | 800 | 801 | df.dtypes 802 | 803 | 804 | # In[10]: 805 | 806 | 807 | df.corr('pearson') 808 | 809 | 810 | # In[11]: 811 | 812 | 813 | df.corr() 814 | 815 | 816 | # In[15]: 817 | 818 | 819 | np.corrcoef(df['total_bill'], df['tip'])[0, 1] 820 | 821 | 822 | # In[16]: 823 | 824 | 825 | df['total_bill'].corr(df['tip']) 826 | 827 | 828 | # In[21]: 829 | 830 | 831 | sns.heatmap(df.corr(), annot = True, cmap = 'RdYlGn'); # the tips data was loaded as df above 832 | 833 | 834 | # In[24]: 835 | 836 | 837 | mpg = sns.load_dataset('mpg') 838 | 839 | 840 | # In[25]: 841 | 842 | 843 | sns.pairplot(mpg); 844 | 845 | -------------------------------------------------------------------------------- /Statistics/Statistics_2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Probability 5 | 6 | # In[35]: 7 | 8 | 9 | # Heads or Tails 10 | 11 | 12 | # In[36]: 13 | 14 | 15 | import random 16 | 17 | 18 | # In[37]: 19 | 20 | 21 | coin = ('H', 'T') 22 | 23 | 24 | # In[38]: 25 | 26 | 27 | random.choice(coin) 28 | 29 | 30 | # In[39]: 31 | 32 | 33 | for i in range(5): 34 | result = random.choice(coin)
35 | print(result) 36 | 37 | 38 | # In[42]: 39 | 40 | 41 | results = {'H' : 0, 'T' : 0} 42 | 43 | for i in range(10): 44 | results[random.choice(list(results.keys()))] += 1 45 | 46 | print('P(Heads):', results['H'] / sum(results.values())) 47 | print('P(Tails):', results['T'] / sum(results.values())) 48 | 49 | 50 | # In[41]: 51 | 52 | 53 | results = {'H' : 0, 'T' : 0} 54 | 55 | for i in range(100000): # law of large estimates 56 | results[random.choice(list(results.keys()))] += 1 57 | 58 | print('P(Heads):', results['H'] / sum(results.values())) 59 | print('P(Tails):', results['T'] / sum(results.values())) 60 | 61 | 62 | # In[43]: 63 | 64 | 65 | # Rolling 2 Dice 66 | 67 | 68 | # In[44]: 69 | 70 | 71 | import numpy as np 72 | import seaborn as sns 73 | import matplotlib.pyplot as plt 74 | 75 | 76 | # In[45]: 77 | 78 | 79 | np.random.seed(51) 80 | 81 | 82 | # In[46]: 83 | 84 | 85 | d1 = np.array([1, 2, 3, 4, 5, 6]) 86 | d2 = np.array([1, 2, 3, 4, 5, 6]) 87 | 88 | 89 | # In[60]: 90 | 91 | 92 | dice_1 = [] 93 | dice_2 = [] 94 | 95 | sums = [] 96 | 97 | for i in range(1000): 98 | dice_1.append(np.random.choice(d1)) 99 | dice_2.append(np.random.choice(d2)) 100 | sums.append(dice_1[i] + dice_2[i]) 101 | 102 | #print(dice_1) 103 | #print(dice_2) 104 | #print(sums) 105 | 106 | 107 | # In[70]: 108 | 109 | 110 | fig, (ax1, ax2) = plt.subplots(ncols = 2, sharey = True, figsize = (12, 4)) 111 | sns.countplot(dice_1, ax = ax1) 112 | sns.countplot(dice_2, ax = ax2) 113 | 114 | 115 | # In[68]: 116 | 117 | 118 | sns.countplot(sums) 119 | 120 | 121 | # # Combinatoric Generators 122 | 123 | # In[72]: 124 | 125 | 126 | # Product 127 | 128 | 129 | # In[73]: 130 | 131 | 132 | import itertools as it 133 | 134 | 135 | # In[84]: 136 | 137 | 138 | cp = list(it.product('HT', repeat = 3)) # possible outcomes of H or T (2 ** 3) 139 | 140 | 141 | # In[85]: 142 | 143 | 144 | len(cp) 145 | 146 | 147 | # In[86]: 148 | 149 | 150 | cp 151 | 152 | 153 | # In[87]: 154 | 155 | 156 | cp2 = list(it.product('123456', 'HT')) # 2 x 6 = 12 possible outcomes 157 | 158 | 159 | # In[88]: 160 | 161 | 162 | cp2 163 | 164 | 165 | # In[89]: 166 | 167 | 168 | len(cp2) 169 | 170 | 171 | # # Permutations 172 | 173 | # In[1]: 174 | 175 | 176 | import math 177 | 178 | 179 | # In[2]: 180 | 181 | 182 | math.factorial(4) 183 | 184 | 185 | # In[3]: 186 | 187 | 188 | def permutation(n, r): 189 | return math.factorial(n) / math.factorial(n - r) 190 | 191 | 192 | # In[4]: 193 | 194 | 195 | permutation(4, 2) 196 | 197 | 198 | # In[5]: 199 | 200 | 201 | permutation(8, 4) 202 | 203 | 204 | # In[105]: 205 | 206 | 207 | import itertools as it 208 | 209 | 210 | # In[102]: 211 | 212 | 213 | cp3 = list(it.permutations('GRYB', 2)) # pick 2 colors of 4 colors (order is important) 214 | 215 | 216 | # In[103]: 217 | 218 | 219 | len(cp3) 220 | 221 | 222 | # In[104]: 223 | 224 | 225 | cp3 # sequence is important 226 | 227 | 228 | # # Combinations 229 | 230 | # In[6]: 231 | 232 | 233 | import itertools as it 234 | 235 | 236 | # In[106]: 237 | 238 | 239 | cp4 = list(it.combinations('GRYB', 2)) # pick 2 colors of 4 colors (order is not important) 240 | 241 | 242 | # In[107]: 243 | 244 | 245 | cp4 246 | 247 | 248 | # In[108]: 249 | 250 | 251 | len(cp4) 252 | 253 | 254 | # In[7]: 255 | 256 | 257 | def combination(n, r): 258 | return math.factorial(n) / (math.factorial(n - r) * math.factorial(r)) 259 | 260 | 261 | # In[8]: 262 | 263 | 264 | combination(4, 2) 265 | 266 | 267 | # In[9]: 268 | 269 | 270 | combination(20, 11) 271 | 272 | 273 | # In[119]: 274 | 275 | 276 | 
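# with replacement an item may also pair with itself, so the count is
# C(n + r - 1, r) = C(5, 2) = 10 for n = 4 colors taken r = 2 at a time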
cp5 = list(it.combinations_with_replacement('GRYB', 2)) 277 | 278 | 279 | # In[120]: 280 | 281 | 282 | cp5 283 | 284 | 285 | # In[121]: 286 | 287 | 288 | len(cp5) 289 | 290 | 291 | # In[ ]: 292 | 293 | 294 | 295 | 296 | -------------------------------------------------------------------------------- /Statistics/Statistics_3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Binomial Distribution 5 | 6 | # In[1]: 7 | 8 | 9 | import numpy as np 10 | from scipy import stats 11 | import matplotlib.pyplot as plt 12 | 13 | 14 | # In[34]: 15 | 16 | 17 | (n, p) = (2, 0.5) 18 | 19 | 20 | # In[35]: 21 | 22 | 23 | stats.binom(n, p) 24 | 25 | 26 | # In[36]: 27 | 28 | 29 | binomDist = stats.binom(n, p) 30 | 31 | 32 | # In[37]: 33 | 34 | 35 | binomDist.args 36 | 37 | 38 | # In[38]: 39 | 40 | 41 | binomDist.pmf(0) # it gives P(x=0) in n = 2, p = 0.5 42 | 43 | 44 | # In[39]: 45 | 46 | 47 | dist = [] 48 | 49 | print('r\tp(r)') 50 | for i in range(n + 1): 51 | dist.append(binomDist.pmf(i)) 52 | print(str(i) + '\t' + str(binomDist.pmf(i))) 53 | 54 | 55 | # In[12]: 56 | 57 | 58 | binomDist.pmf(2) 59 | 60 | 61 | # In[13]: 62 | 63 | 64 | binomDist.pmf(3) # because n = 2 65 | 66 | 67 | # In[21]: 68 | 69 | 70 | # other example 71 | 72 | 73 | # In[41]: 74 | 75 | 76 | (n, p) = (10, 0.2) 77 | 78 | 79 | # In[42]: 80 | 81 | 82 | binomDist = stats.binom(n, p) 83 | 84 | 85 | # In[43]: 86 | 87 | 88 | binomDist.args 89 | 90 | 91 | # In[44]: 92 | 93 | 94 | dist = [] 95 | 96 | print('r\tp(r)') 97 | for i in range(n + 1): 98 | dist.append(binomDist.pmf(i)) 99 | print(str(i) + '\t' + str(binomDist.pmf(i))) 100 | 101 | 102 | # In[45]: 103 | 104 | 105 | binomDist.pmf(7) 106 | 107 | 108 | # In[46]: 109 | 110 | 111 | binomDist.cdf(1) # pmf(0) + pmf(1) 112 | 113 | 114 | # In[47]: 115 | 116 | 117 | plt.bar(list(range(n + 1)), dist) 118 | 119 | 120 | # In[48]: 121 | 122 | 123 | mean, var = binomDist.stats() 124 | 125 | 126 | # In[49]: 127 | 128 | 129 | print('mean = ' + str(mean)) 130 | 131 | 132 | # print('var = ' + str(var)) 133 | 134 | # In[53]: 135 | 136 | 137 | binomDist.stats() # mean and var 138 | 139 | 140 | # In[54]: 141 | 142 | 143 | binomDist.median() 144 | 145 | 146 | # In[56]: 147 | 148 | 149 | binomDist.std() 150 | 151 | 152 | # In[72]: 153 | 154 | 155 | binomDist.rvs(100) 156 | 157 | 158 | # In[75]: 159 | 160 | 161 | stats.binom.cdf(2, 10, 0.2) 162 | 163 | 164 | # In[76]: 165 | 166 | 167 | binomDist.cdf(2) 168 | 169 | 170 | # ### Exercise 171 | # There was a probability of 0.8 success in any attempt to make a call. 172 | # Calculate the probability of having 7 successes in 10 attempts. 173 | 174 | # In[92]: 175 | 176 | 177 | stats.binom.pmf(7, 10, 0.8) 178 | 179 | 180 | # ### Exercise 181 | # A (blindfolded) marksman finds that on the average he hits the target 4 times out of 5. If he fires 4 shots, what is the probability of 182 | # (a) more than 2 hits? 183 | # (b) at least 3 misses? 184 | 185 | # In[ ]: 186 | 187 | 188 | 189 | 190 | 191 | # # Poisson Distribution 192 | 193 | # In[94]: 194 | 195 | 196 | stats.poisson.pmf(5, 6) # avg = 6, x = 5 197 | 198 | 199 | # In[95]: 200 | 201 | 202 | stats.poisson.cdf(5, 6) 203 | 204 | 205 | # ### Exercise 206 | # A bank is interested in studying the number of people who use the ATM located outside its office late at night. 207 | # On average, 1.6 customers walk up to the ATM during any 10 minute interval between 9pm and midnight. 208 | # What is lambda λ for this problem? 
209 | # What is the probability of exactly 3 customers using th ATM during any 10 minute interval? 210 | # What is the probability of 3 or fewer people? 211 | 212 | # In[98]: 213 | 214 | 215 | avg = 1.6 216 | x = 3 217 | 218 | 219 | # In[99]: 220 | 221 | 222 | stats.poisson.pmf(3, 1.6) 223 | 224 | 225 | # In[100]: 226 | 227 | 228 | stats.poisson.cdf(3, 1.6) 229 | 230 | 231 | # In[101]: 232 | 233 | 234 | poissonDist = stats.poisson(avg) 235 | 236 | 237 | # In[102]: 238 | 239 | 240 | dist = [] 241 | 242 | print('r\tp(r)') 243 | for i in range(10): 244 | dist.append(poissonDist.pmf(i)) 245 | print(str(i) + '\t' + str(poissonDist.pmf(i))) 246 | 247 | 248 | # ### Exercise 249 | # The Indiana Department of Transportation is concerned about the number of deer being struck by cars between Martinsville and Bloomington. They note the number of deer carcasses and other deer-related accidents over a 1-month period in a 2-mile intervals. 250 | # What is the probability of zero deer strike incidents during any 2-mile interval between Martinsville and Bloomington? 251 | # 0.08 strikes per/day 252 | 253 | # In[104]: 254 | 255 | 256 | stats.poisson.pmf(0, 0.08*30) 257 | 258 | 259 | # # Bernoulli Distribution 260 | 261 | # In[116]: 262 | 263 | 264 | p = 0.3 265 | 266 | 267 | # In[117]: 268 | 269 | 270 | bernDist = stats.bernoulli(p) 271 | 272 | 273 | # In[118]: 274 | 275 | 276 | bernDist.pmf(0) 277 | 278 | 279 | # In[119]: 280 | 281 | 282 | bernDist.pmf(1) 283 | 284 | 285 | # In[120]: 286 | 287 | 288 | bernDist.pmf(2) # because single trial. there is no other option. 289 | 290 | 291 | # In[121]: 292 | 293 | 294 | dist = [] 295 | 296 | print('r\tp(r)') 297 | for i in range(2): 298 | dist.append(bernDist.pmf(i)) 299 | print(str(i) + '\t' + str(bernDist.pmf(i))) 300 | 301 | 302 | # In[122]: 303 | 304 | 305 | plt.bar(list(range(2)), dist) 306 | plt.xticks(list(range(2)), ('0', '1')) 307 | plt.show() 308 | 309 | 310 | # In[123]: 311 | 312 | 313 | mean, var = bernDist.stats() 314 | 315 | 316 | # In[125]: 317 | 318 | 319 | str(mean) 320 | 321 | 322 | # In[126]: 323 | 324 | 325 | str(var) 326 | 327 | 328 | # In[127]: 329 | 330 | 331 | bernDist.median() 332 | 333 | 334 | # In[129]: 335 | 336 | 337 | bernDist.std() 338 | 339 | 340 | # In[ ]: 341 | 342 | 343 | 344 | 345 | -------------------------------------------------------------------------------- /Statistics/Statistics_4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import numpy as np 8 | from scipy import stats 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | # # Uniform Distribution 13 | 14 | # In[2]: 15 | 16 | 17 | (a, b) = (0, 30) 18 | 19 | 20 | # In[4]: 21 | 22 | 23 | unifDist = stats.uniform(0, 30) 24 | 25 | 26 | # In[10]: 27 | 28 | 29 | unifDist.args 30 | 31 | 32 | # In[6]: 33 | 34 | 35 | unifDist.pdf(10) 36 | 37 | 38 | # In[14]: 39 | 40 | 41 | unifDist.pdf(15) 42 | 43 | 44 | # In[15]: 45 | 46 | 47 | unifDist.cdf(15) 48 | 49 | 50 | # In[12]: 51 | 52 | 53 | unifDist.cdf(10) 54 | 55 | 56 | # In[20]: 57 | 58 | 59 | a = unifDist.rvs(1000) 60 | 61 | b = [] 62 | for i in a: 63 | b.append(unifDist.pdf(i)) 64 | 65 | 66 | # In[23]: 67 | 68 | 69 | plt.bar(a, b) 70 | 71 | 72 | # In[24]: 73 | 74 | 75 | mean, var = unifDist.stats() 76 | 77 | 78 | # In[25]: 79 | 80 | 81 | str(mean) 82 | 83 | 84 | # In[26]: 85 | 86 | 87 | str(var) 88 | 89 | 90 | # In[27]: 91 | 92 | 93 | unifDist.median() 94 | 95 | 96 | # In[28]: 97 | 98 | 99 | unifDist.std() 100 | 101 | 102 
| # # Normal Distribution 103 | 104 | # In[30]: 105 | 106 | 107 | (mu, sigma) = (0, 1) 108 | 109 | 110 | # In[31]: 111 | 112 | 113 | normDist = stats.norm(mu, sigma) 114 | 115 | 116 | # In[33]: 117 | 118 | 119 | normDist.args 120 | 121 | 122 | # In[55]: 123 | 124 | 125 | 1 - normDist.cdf(2) # P(Z>2) 126 | 127 | 128 | # In[52]: 129 | 130 | 131 | normDist.pdf(4) 132 | 133 | 134 | # In[62]: 135 | 136 | 137 | x = np.linspace(-5, 5, 1000) 138 | 139 | y = normDist.pdf(x) 140 | 141 | 142 | # In[63]: 143 | 144 | 145 | plt.plot(x, y) 146 | 147 | 148 | # In[64]: 149 | 150 | 151 | mean, var, skew, kurt = normDist.stats(moments = 'mvsk') 152 | 153 | 154 | # In[65]: 155 | 156 | 157 | str(mean) 158 | 159 | 160 | # In[66]: 161 | 162 | 163 | str(var) 164 | 165 | 166 | # In[67]: 167 | 168 | 169 | str(skew) 170 | 171 | 172 | # In[68]: 173 | 174 | 175 | str(kurt) 176 | 177 | 178 | # In[70]: 179 | 180 | 181 | normDist.median() 182 | 183 | 184 | # In[71]: 185 | 186 | 187 | normDist.std() 188 | 189 | 190 | # # t distribution 191 | 192 | # In[72]: 193 | 194 | 195 | stats.t.cdf(-0.7745966, df = 14) 196 | 197 | 198 | # In[79]: 199 | 200 | 201 | stats.t.cdf(0, df = 14) 202 | 203 | 204 | # In[74]: 205 | 206 | 207 | tDist = stats.t(df = 15) 208 | 209 | x = np.linspace(-5, 5, 100) 210 | 211 | y = tDist.pdf(x) 212 | 213 | 214 | # In[75]: 215 | 216 | 217 | plt.plot(x, y) 218 | 219 | 220 | # In[ ]: 221 | 222 | 223 | 224 | 225 | -------------------------------------------------------------------------------- /Statistics/Statistics_5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # CENTRAL LIMIT THEOREM 5 | 6 | # ## Sample Mean for a Uniform Distribution 7 | 8 | # In[1]: 9 | 10 | 11 | import random 12 | import math 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | from scipy import stats 16 | random.seed(54312) 17 | 18 | 19 | # In[10]: 20 | 21 | 22 | sample_size = 30 23 | 24 | sim_num = 10000 25 | 26 | 27 | # In[11]: 28 | 29 | 30 | mean_list = [] 31 | 32 | for i in range(sim_num): 33 | sample_list = [] 34 | for i in range(sample_size): 35 | sample_list.append(random.randint(0, 100)) 36 | sample_mean = sum(sample_list)/sample_size 37 | mean_list.append(sample_mean) 38 | 39 | 40 | # In[12]: 41 | 42 | 43 | mean_list 44 | 45 | 46 | # In[13]: 47 | 48 | 49 | sum(mean_list)/len(mean_list) 50 | 51 | 52 | # In[19]: 53 | 54 | 55 | plt.hist(mean_list, bins=100, density = True, color = 'r'); 56 | plt.grid() 57 | mu = 50 58 | sigma = math.sqrt((100**2)/12) / (math.sqrt(sample_size)) 59 | x = np.linspace(mu - 4* sigma, mu + 4 * sigma) 60 | plt.plot(x, stats.norm.pdf(x, mu, sigma)) 61 | 62 | 63 | # ## Sample Mean for a Exponential Distribution 64 | 65 | # In[32]: 66 | 67 | 68 | sample_size = 30 69 | 70 | sim_num = 10000 71 | 72 | 73 | # In[33]: 74 | 75 | 76 | mean_list = [] 77 | 78 | for i in range(sim_num): 79 | sample_list = [] 80 | for i in range(sample_size): 81 | sample_list.append(np.random.exponential(1)) 82 | sample_mean = sum(sample_list)/sample_size 83 | mean_list.append(sample_mean) 84 | 85 | 86 | # In[34]: 87 | 88 | 89 | plt.hist(mean_list, bins=100, density = True, color = 'r'); 90 | plt.grid() 91 | mu = 1 92 | sigma = 1 / (math.sqrt(sample_size)) 93 | x = np.linspace(mu - 4* sigma, mu + 4 * sigma) 94 | plt.plot(x, stats.norm.pdf(x, mu, sigma)) 95 | 96 | 97 | # ## CONFIDENCE INTERVAL 98 | 99 | # In[39]: 100 | 101 | 102 | import random 103 | import math 104 | import matplotlib.pyplot as plt 105 | import numpy as np 
106 | from scipy import stats 107 | random.seed(39809) 108 | 109 | 110 | # In[40]: 111 | 112 | 113 | sample_size = 30 114 | sample_list = [] 115 | 116 | for i in range(30): 117 | sample_list.append(random.randint(0, 10)) 118 | 119 | 120 | # In[41]: 121 | 122 | 123 | sample_mean = np.mean(sample_list) 124 | 125 | 126 | # In[42]: 127 | 128 | 129 | sample_mean 130 | 131 | 132 | # In[43]: 133 | 134 | 135 | n = len(sample_list) 136 | 137 | 138 | # In[44]: 139 | 140 | 141 | cl = 0.95 142 | 143 | std = 1 144 | 145 | 146 | # In[51]: 147 | 148 | 149 | critic_value = stats.norm.ppf(((1-0.95)/2) + 0.95) 150 | 151 | 152 | # In[53]: 153 | 154 | 155 | (((1-0.95)/2) + 0.95) # z table ppf value for 0.95 at t table 156 | 157 | 158 | # In[52]: 159 | 160 | 161 | critic_value 162 | 163 | 164 | # In[54]: 165 | 166 | 167 | lower_limit = sample_mean - (critic_value * (std/math.sqrt(n))) 168 | 169 | 170 | # In[56]: 171 | 172 | 173 | upper_limit = sample_mean + (critic_value * (std/math.sqrt(n))) 174 | 175 | 176 | # In[62]: 177 | 178 | 179 | print(f'Your {cl} z confidence interval is ({lower_limit:.2f}, {upper_limit:.2f})') 180 | 181 | 182 | # Exercise 183 | 184 | # In[63]: 185 | 186 | 187 | sample_list = [2, 3, 5, 6, 9] 188 | 189 | 190 | # In[65]: 191 | 192 | 193 | sample_mean = np.mean(sample_list) 194 | 195 | sample_mean 196 | 197 | 198 | # In[66]: 199 | 200 | 201 | std = 2.5 202 | 203 | 204 | # In[67]: 205 | 206 | 207 | n = len(sample_list) 208 | 209 | 210 | # In[68]: 211 | 212 | 213 | cl = 0.95 214 | 215 | 216 | # In[71]: 217 | 218 | 219 | critic_value = stats.norm.ppf(((1 - cl)/2) + cl) 220 | 221 | 222 | # In[72]: 223 | 224 | 225 | critic_value 226 | 227 | 228 | # In[73]: 229 | 230 | 231 | lower_limit = sample_mean - (critic_value * (std/math.sqrt(n))) 232 | 233 | 234 | # In[74]: 235 | 236 | 237 | upper_limit = sample_mean + (critic_value * (std/math.sqrt(n))) 238 | 239 | 240 | # In[75]: 241 | 242 | 243 | print(f'Your {cl} z confidence interval is ({lower_limit:.2f}, {upper_limit:.2f})') 244 | 245 | 246 | # In[84]: 247 | 248 | 249 | stats.norm.interval(cl, loc = sample_mean, scale = std/math.sqrt(n)) # using scipy 250 | 251 | 252 | # In[76]: 253 | 254 | 255 | critic_value = stats.norm.ppf(((1 - 0.99)/2) + 0.99) # interval gets larger beacuse CL gets higher 256 | 257 | 258 | # In[77]: 259 | 260 | 261 | lower_limit = sample_mean - (critic_value * (std/math.sqrt(n))) 262 | 263 | 264 | # In[78]: 265 | 266 | 267 | upper_limit = sample_mean + (critic_value * (std/math.sqrt(n))) 268 | 269 | 270 | # In[79]: 271 | 272 | 273 | print(f'Your {cl} z confidence interval is ({lower_limit:.2f}, {upper_limit:.2f})') # interval gets larger 274 | 275 | 276 | # In[85]: 277 | 278 | 279 | stats.norm.interval(0.99, loc = sample_mean, scale = std/math.sqrt(n)) 280 | 281 | 282 | # ## USING SCIPY 283 | 284 | # In[87]: 285 | 286 | 287 | stats.norm.interval(cl, loc = sample_mean, scale = std/math.sqrt(n)) # using scipy 288 | 289 | -------------------------------------------------------------------------------- /Statistics/Statistics_6.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from scipy import stats 10 | import math 11 | 12 | 13 | # ## One Sample t Test 14 | 15 | # #### Analyze if college students get 7.2 hours of sleep, on average, based on a sample of students, alpha = 0.05 16 | 17 | # In[2]: 18 | 19 | 20 | df = pd.read_csv('students.csv') 21 | 22 | 23 | # In[3]: 24 | 25 
| 26 | df.head() 27 | 28 | 29 | # In[4]: 30 | 31 | 32 | # H0: mu = 7.2 33 | # Ha: mu != 7.2 34 | # it is two tailed test 35 | 36 | 37 | # In[5]: 38 | 39 | 40 | onesample = stats.ttest_1samp(df['Sleep'], 7.2) # sample, pop_mean 41 | 42 | 43 | # In[6]: 44 | 45 | 46 | onesample.statistic 47 | 48 | 49 | # In[7]: 50 | 51 | 52 | onesample.pvalue # fail to reject H0 because p_value is higher than alpha (significance level) 53 | 54 | 55 | # In[8]: 56 | 57 | 58 | alpha = 0.05 59 | p_value = onesample.pvalue 60 | if p_value variances unknown and equal), small size 249 | 250 | 251 | # In[37]: 252 | 253 | 254 | ind_test_w_2gr = stats.ttest_ind(df['x1'], df['x2'], equal_var = True) 255 | 256 | 257 | # In[38]: 258 | 259 | 260 | ind_test_w_2gr.statistic 261 | 262 | 263 | # In[39]: 264 | 265 | 266 | p_value = ind_test_w_2gr.pvalue 267 | 268 | 269 | # In[40]: 270 | 271 | 272 | alpha = 0.05 273 | 274 | if p_value= 4 165 | 166 | 167 | # In[57]: 168 | 169 | 170 | stats.poisson.cdf(3, avg) # x < 4 171 | 172 | 173 | # In[59]: 174 | 175 | 176 | (1 - stats.poisson.cdf(1, avg)) - (1 - stats.poisson.cdf(3, avg)) # P(x ≥ 4 | x ≥ 2 ) 177 | 178 | 179 | # ## 4 180 | # Consider binomial experiment for n = 20, p = .05. 181 | 182 | # In[61]: 183 | 184 | 185 | n = 20 186 | p = 0.05 187 | 188 | 189 | # In[62]: 190 | 191 | 192 | binomDist = stats.binom(n, p) 193 | 194 | 195 | # ## 4.1 196 | # Calculate the binomial probabilities for Y = 0, 1, 2, 3, and 4. 197 | 198 | # In[72]: 199 | 200 | 201 | binomDist.pmf(0) 202 | 203 | 204 | # In[68]: 205 | 206 | 207 | binomDist.pmf(1) 208 | 209 | 210 | # In[69]: 211 | 212 | 213 | binomDist.pmf(2) 214 | 215 | 216 | # In[70]: 217 | 218 | 219 | binomDist.pmf(3) 220 | 221 | 222 | # In[71]: 223 | 224 | 225 | binomDist.pmf(4) 226 | 227 | 228 | # ## 4.1 229 | # Calculate the same probabilities by using the Poisson approximation with λ = np. Compare. 230 | 231 | # In[73]: 232 | 233 | 234 | avg = n*p # avg = 1 235 | 236 | 237 | # In[74]: 238 | 239 | 240 | stats.poisson.pmf(0, avg) 241 | 242 | 243 | # In[75]: 244 | 245 | 246 | stats.poisson.pmf(1, avg) 247 | 248 | 249 | # In[76]: 250 | 251 | 252 | stats.poisson.pmf(2, avg) 253 | 254 | 255 | # In[77]: 256 | 257 | 258 | stats.poisson.pmf(3, avg) 259 | 260 | 261 | # In[78]: 262 | 263 | 264 | stats.poisson.pmf(4, avg) 265 | 266 | 267 | # In[ ]: 268 | 269 | 270 | 271 | 272 | -------------------------------------------------------------------------------- /Statistics/Statistics_Exercise_3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "from scipy import stats\n", 11 | "import math" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## 1 \n", 19 | " Create a Standard Normal Distribution Table using Python scipy.stats. 
" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "(mu, sigma) = (0, 1)\n", 29 | "normDist = stats.norm(mu, sigma)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 64, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "z = np.round(np.linspace(0, 3, 310), 2) * -1" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 65, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "z = list(z)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 84, 53 | "metadata": { 54 | "scrolled": true 55 | }, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "0.5 0.496 0.492 0.488 0.484 0.4801 0.4761 0.4721 0.4681 0.4641 0.4602 \n", 62 | "\n", 63 | "0.4562 0.4522 0.4483 0.4443 0.4404 0.4364 0.4325 0.4325 0.4286 0.4247 \n", 64 | "\n", 65 | "0.4207 0.4168 0.4129 0.409 0.4052 0.4013 0.3974 0.3936 0.3897 0.3859 \n", 66 | "\n", 67 | "0.3821 0.3783 0.3745 0.3707 0.3669 0.3632 0.3594 0.3557 0.352 0.3483 \n", 68 | "\n", 69 | "0.3446 0.3409 0.3372 0.3336 0.33 0.3264 0.3228 0.3192 0.3156 0.3121 \n", 70 | "\n", 71 | "0.3085 0.3085 0.305 0.3015 0.2981 0.2946 0.2912 0.2877 0.2843 0.281 \n", 72 | "\n", 73 | "0.2776 0.2743 0.2709 0.2676 0.2643 0.2611 0.2578 0.2546 0.2514 0.2483 \n", 74 | "\n", 75 | "0.2451 0.242 0.2389 0.2358 0.2327 0.2296 0.2266 0.2236 0.2206 0.2177 \n", 76 | "\n", 77 | "0.2148 0.2119 0.209 0.2061 0.2033 0.2033 0.2005 0.1977 0.1949 0.1922 \n", 78 | "\n", 79 | "0.1894 0.1867 0.1841 0.1814 0.1788 0.1762 0.1736 0.1711 0.1685 0.166 \n", 80 | "\n", 81 | "0.1635 0.1611 0.1587 0.1562 0.1539 0.1515 0.1492 0.1469 0.1446 0.1423 \n", 82 | "\n", 83 | "0.1401 0.1379 0.1357 0.1335 0.1314 0.1292 0.1271 0.1251 0.123 0.121 \n", 84 | "\n", 85 | "0.121 0.119 0.117 0.1151 0.1131 0.1112 0.1093 0.1075 0.1056 0.1038 \n", 86 | "\n", 87 | "0.102 0.1003 0.0985 0.0968 0.0951 0.0934 0.0918 0.0901 0.0885 0.0869 \n", 88 | "\n", 89 | "0.0853 0.0838 0.0823 0.0808 0.0793 0.0778 0.0764 0.0749 0.0735 0.0721 \n", 90 | "\n", 91 | "0.0708 0.0694 0.0681 0.0668 0.0668 0.0655 0.0643 0.063 0.0618 0.0606 \n", 92 | "\n", 93 | "0.0594 0.0582 0.0571 0.0559 0.0548 0.0537 0.0526 0.0516 0.0505 0.0495 \n", 94 | "\n", 95 | "0.0485 0.0475 0.0465 0.0455 0.0446 0.0436 0.0427 0.0418 0.0409 0.0401 \n", 96 | "\n", 97 | "0.0392 0.0384 0.0375 0.0367 0.0359 0.0351 0.0344 0.0336 0.0336 0.0329 \n", 98 | "\n", 99 | "0.0322 0.0314 0.0307 0.0301 0.0294 0.0287 0.0281 0.0274 0.0268 0.0262 \n", 100 | "\n", 101 | "0.0256 0.025 0.0244 0.0239 0.0233 0.0228 0.0222 0.0217 0.0212 0.0207 \n", 102 | "\n", 103 | "0.0202 0.0197 0.0192 0.0188 0.0183 0.0179 0.0174 0.017 0.0166 0.0162 \n", 104 | "\n", 105 | "0.0158 0.0154 0.015 0.015 0.0146 0.0143 0.0139 0.0136 0.0132 0.0129 \n", 106 | "\n", 107 | "0.0125 0.0122 0.0119 0.0116 0.0113 0.011 0.0107 0.0104 0.0102 0.0099 \n", 108 | "\n", 109 | "0.0096 0.0094 0.0091 0.0089 0.0087 0.0084 0.0082 0.008 0.0078 0.0075 \n", 110 | "\n", 111 | "0.0073 0.0071 0.0069 0.0068 0.0066 0.0064 0.0062 0.0062 0.006 0.0059 \n", 112 | "\n", 113 | "0.0057 0.0055 0.0054 0.0052 0.0051 0.0049 0.0048 0.0047 0.0045 0.0044 \n", 114 | "\n", 115 | "0.0043 0.0041 0.004 0.0039 0.0038 0.0037 0.0036 0.0035 0.0034 0.0033 \n", 116 | "\n", 117 | "0.0032 0.0031 0.003 0.0029 0.0028 0.0027 0.0026 0.0026 0.0025 0.0024 \n", 118 | "\n", 119 | "0.0023 0.0023 0.0023 0.0022 0.0021 0.0021 0.002 0.0019 0.0019 0.0018 \n", 120 | "\n", 121 | "0.0018 0.0017 0.0016 0.0016 
0.0015 0.0015 0.0014 0.0014 0.0013 " 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "for(ii) in range(310):\n", 127 | " a = np.round(normDist.cdf(z[ii]), 4)\n", 128 | " print(a, end = ' ')\n", 129 | " if ii % 10 == 0 and ii != 0:\n", 130 | " print('\\n')" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## 2\n", 138 | "The cycle time for trucks hauling concrete to a highway construction site is uniformly distributed over the interval 50 to 70 minutes. What is the probability that the cycle time exceeds 65 minutes if it is known that the cycle time exceeds 55 minutes?" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 85, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "(a, b) = (50, 70)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 86, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "unifDist = stats.uniform(50, 70)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 94, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "# P(X > 65 | X > 55)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 98, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "0.21428571428571427" 177 | ] 178 | }, 179 | "execution_count": 98, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "unifDist.cdf(65)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 99, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "0.07142857142857142" 197 | ] 198 | }, 199 | "execution_count": 99, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "unifDist.cdf(55)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 100, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/plain": [ 216 | "0.3333333333333333" 217 | ] 218 | }, 219 | "execution_count": 100, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "unifDist.cdf(55) / unifDist.cdf(65)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## 3.1\n", 233 | "The width of bolts of fabric is normally distributed with mean 950 mm (millimeters) and standard deviation 10 mm.\n", 234 | "What is the probability that a randomly chosen bolt has a width of between 947 and 958mm?" 
235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 102, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "(mu, sigma) = (950, 10)\n", 244 | "normDist = stats.norm(mu, sigma)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 105, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "0.406056023605556" 256 | ] 257 | }, 258 | "execution_count": 105, 259 | "metadata": {}, 260 | "output_type": "execute_result" 261 | } 262 | ], 263 | "source": [ 264 | "normDist.cdf(958) - normDist.cdf(947) " 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "## 3.2\n", 272 | "The width of bolts of fabric is normally distributed with mean 950 mm (millimeters) and standard deviation 10 mm.\n", 273 | "What is the appropriate value for C such that a randomly chosen bolt has a width less than C with probability .8531?" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 109, 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "data": { 283 | "text/plain": [ 284 | "960.4982190962642" 285 | ] 286 | }, 287 | "execution_count": 109, 288 | "metadata": {}, 289 | "output_type": "execute_result" 290 | } 291 | ], 292 | "source": [ 293 | "normDist.ppf(0.8531)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "## 4\n", 301 | "The school board administered an IQ test to 20 randomly selected teachers. They found that the average IQ score was 114 with a standard deviation of 10. Assume that the cumulative probability is 0.90. What population mean would have produced this sample result?" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 113, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "# n = 20, df = 19, mu_sample = 114, std = 10" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 114, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/plain": [ 321 | "1.3277282090267986" 322 | ] 323 | }, 324 | "execution_count": 114, 325 | "metadata": {}, 326 | "output_type": "execute_result" 327 | } 328 | ], 329 | "source": [ 330 | "stats.t.ppf(0.90, 19)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 116, 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/plain": [ 341 | "111.03110946897203" 342 | ] 343 | }, 344 | "execution_count": 116, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "114 - (stats.t.ppf(0.90, 19) * (10/math.sqrt(20))) # = mu_population" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 119, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "1.3277282090267963" 362 | ] 363 | }, 364 | "execution_count": 119, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "(114 - 111.03110946897203)/(10/math.sqrt(20))" 371 | ] 372 | } 373 | ], 374 | "metadata": { 375 | "kernelspec": { 376 | "display_name": "Python 3", 377 | "language": "python", 378 | "name": "python3" 379 | }, 380 | "language_info": { 381 | "codemirror_mode": { 382 | "name": "ipython", 383 | "version": 3 384 | }, 385 | "file_extension": ".py", 386 | "mimetype": "text/x-python", 387 | "name": "python", 388 | "nbconvert_exporter": "python", 389 | "pygments_lexer": "ipython3", 390 | "version": "3.7.3" 391 | } 392 
| }, 393 | "nbformat": 4, 394 | "nbformat_minor": 2 395 | } 396 | -------------------------------------------------------------------------------- /Statistics/Statistics_Exercise_3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import numpy as np 8 | from scipy import stats 9 | import math 10 | 11 | 12 | # ## 1 13 | # Create a Standard Normal Distribution Table using Python scipy.stats. 14 | 15 | # In[3]: 16 | 17 | 18 | (mu, sigma) = (0, 1) 19 | normDist = stats.norm(mu, sigma) 20 | 21 | 22 | # In[64]: 23 | 24 | 25 | z = np.round(np.linspace(0, 3, 310), 2) * -1 26 | 27 | 28 | # In[65]: 29 | 30 | 31 | z = list(z) 32 | 33 | 34 | # In[84]: 35 | 36 | 37 | for ii in range(310): 38 | a = np.round(normDist.cdf(z[ii]), 4) 39 | print(a, end = ' ') 40 | if ii % 10 == 0 and ii != 0: 41 | print('\n') 42 | 43 | 44 | # ## 2 45 | # The cycle time for trucks hauling concrete to a highway construction site is uniformly distributed over the interval 50 to 70 minutes. What is the probability that the cycle time exceeds 65 minutes if it is known that the cycle time exceeds 55 minutes? 46 | 47 | # In[85]: 48 | 49 | 50 | (a, b) = (50, 70) 51 | 52 | 53 | # In[86]: 54 | 55 | 56 | unifDist = stats.uniform(50, 20) # scipy takes (loc, scale), so this is U[50, 70] 57 | 58 | 59 | # In[94]: 60 | 61 | 62 | # P(X > 65 | X > 55) = P(X > 65) / P(X > 55) 63 | 64 | 65 | # In[98]: 66 | 67 | 68 | unifDist.sf(65) # P(X > 65) = 0.25 69 | 70 | 71 | # In[99]: 72 | 73 | 74 | unifDist.sf(55) # P(X > 55) = 0.75 75 | 76 | 77 | # In[100]: 78 | 79 | 80 | unifDist.sf(65) / unifDist.sf(55) # = 0.333... 81 | 82 | 83 | # ## 3.1 84 | # The width of bolts of fabric is normally distributed with mean 950 mm (millimeters) and standard deviation 10 mm. 85 | # What is the probability that a randomly chosen bolt has a width of between 947 and 958 mm? 86 | 87 | # In[102]: 88 | 89 | 90 | (mu, sigma) = (950, 10) 91 | normDist = stats.norm(mu, sigma) 92 | 93 | 94 | # In[105]: 95 | 96 | 97 | normDist.cdf(958) - normDist.cdf(947) 98 | 99 | 100 | # ## 3.2 101 | # The width of bolts of fabric is normally distributed with mean 950 mm (millimeters) and standard deviation 10 mm. 102 | # What is the appropriate value for C such that a randomly chosen bolt has a width less than C with probability .8531? 103 | 104 | # In[109]: 105 | 106 | 107 | normDist.ppf(0.8531) 108 | 109 | 110 | # ## 4 111 | # The school board administered an IQ test to 20 randomly selected teachers. They found that the average IQ score was 114 with a standard deviation of 10. Assume that the cumulative probability is 0.90. What population mean would have produced this sample result?
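# Plan for Exercise 4 before the cells below (a hedged sketch of the algebra,
# using only quantities given in the problem): start from the one-sample t relation
#     t = (xbar - mu) / (s / sqrt(n)),  with df = n - 1 = 19,
# take t = t_{0.90, 19} (the point with cumulative probability 0.90), and solve for mu:
#     mu = xbar - t_{0.90, 19} * s / sqrt(n) = 114 - 1.3277 * (10 / sqrt(20)) ~ 111.03
# The cells below compute exactly these pieces and then verify the round trip.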
112 | 113 | # In[113]: 114 | 115 | 116 | # n = 20, df = 19, mu_sample = 114, std = 10 117 | 118 | 119 | # In[114]: 120 | 121 | 122 | stats.t.ppf(0.90, 19) 123 | 124 | 125 | # In[116]: 126 | 127 | 128 | 114 - (stats.t.ppf(0.90, 19) * (10/math.sqrt(20))) # = mu_population 129 | 130 | 131 | # In[119]: 132 | 133 | 134 | (114 - 111.03110946897203)/(10/math.sqrt(20)) 135 | 136 | -------------------------------------------------------------------------------- /Statistics/Statistics_Exercise_4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import math\n", 11 | "from scipy import stats" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## 1\n", 19 | "\n", 20 | "Suppose scores on exams in statistics are normally distributed with an unknown population mean and a population standard deviation of 3 points. A random sample of 36 scores is taken and gives a sample mean (sample mean score) of 68. Find a confidence interval estimate for the population mean exam score (the mean score on all exams).\n", 21 | "\n", 22 | "Find a 90% confidence interval for the true (population) mean of statistics exam scores." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 3, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "sample_mean = 68\n", 32 | "n = 36\n", 33 | "std = 3\n", 34 | "cl = 0.90" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 4, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "std_e = std/math.sqrt(n)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 6, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "ci = stats.norm.interval(cl, sample_mean, std_e)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 10, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "Your 0.9 z confidence interval is (67.17757318652427, 68.82242681347573).\n" 65 | ] 66 | } 67 | ], 68 | "source": [ 69 | "print('Your {} z confidence interval is {}.'.format(cl, ci))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "## 2\n", 77 | "\n", 78 | "What is the normal body temperature for healthy humans? A random sample of 130 healthy human body temperatures provided by Allen Shoemaker7 yielded 98.25 degrees and standard deviation 0.73 degrees. \n", 79 | "\n", 80 | "Give a 99% confidence interval for the average body temperature of healthy people." 
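# How these intervals are built (a sketch, assuming scipy's two-sided interval
# convention): stats.norm.interval(cl, loc, scale) is a thin wrapper around
#     xbar +/- z_{(1+cl)/2} * sigma / sqrt(n).
# Hand cross-check of problem 1 above:
import math
from scipy import stats
z = stats.norm.ppf(0.95)                 # 90% two-sided -> z ~ 1.6449
half_width = z * 3 / math.sqrt(36)       # sigma = 3, n = 36
print(68 - half_width, 68 + half_width)  # ~ (67.1776, 68.8224), matching the output above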
81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 13, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "sample_mean = 98.25\n", 90 | "n = 130\n", 91 | "std = 0.73\n", 92 | "cl = 0.99" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 14, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "std_e = std/math.sqrt(n)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 15, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "ci = stats.norm.interval(cl, sample_mean, std_e)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 16, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "Your 0.99 z confidence interval is (98.08508192246582, 98.41491807753418).\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "print('Your {} z confidence interval is {}.'.format(cl, ci))" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## 3\n", 135 | "\n", 136 | "The administrators for a hospital wished to estimate the average number of days required for inpatient treatment of patients between the ages of 25 and 34. A random sample of 500 hospital patients between these ages produced a mean and standard deviation equal to 5.4 and 3.1 days, respectively.\n", 137 | "Construct a 95% confidence interval for the mean length of stay for the population of patients from which the sample was drawn." 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 18, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "sample_mean = 5.4\n", 147 | "n = 500\n", 148 | "std = 3.1\n", 149 | "cl = 0.95" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 19, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "std_e = std/math.sqrt(n)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 20, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "ci = stats.norm.interval(cl, sample_mean, std_e)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 21, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "Your 0.95 z confidence interval is (5.12827801242126, 5.67172198757874).\n" 180 | ] 181 | } 182 | ], 183 | "source": [ 184 | "print('Your {} z confidence interval is {}.'.format(cl, ci))" 185 | ] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "Python 3", 191 | "language": "python", 192 | "name": "python3" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.7.3" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 2 209 | } 210 | -------------------------------------------------------------------------------- /Statistics/Statistics_Exercise_4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import numpy as np 8 | import math 9 | from scipy import stats 10 | 11 | 12 | # ## 1 13 | # 14 | # Suppose scores on exams in statistics are normally distributed with an unknown population mean 
and a population standard deviation of 3 points. A random sample of 36 scores is taken and gives a sample mean (sample mean score) of 68. Find a confidence interval estimate for the population mean exam score (the mean score on all exams). 15 | # 16 | # Find a 90% confidence interval for the true (population) mean of statistics exam scores. 17 | 18 | # In[3]: 19 | 20 | 21 | sample_mean = 68 22 | n = 36 23 | std = 3 24 | cl = 0.90 25 | 26 | 27 | # In[4]: 28 | 29 | 30 | std_e = std/math.sqrt(n) 31 | 32 | 33 | # In[6]: 34 | 35 | 36 | ci = stats.norm.interval(cl, sample_mean, std_e) 37 | 38 | 39 | # In[10]: 40 | 41 | 42 | print('Your {} z confidence interval is {}.'.format(cl, ci)) 43 | 44 | 45 | # ## 2 46 | # 47 | # What is the normal body temperature for healthy humans? A random sample of 130 healthy human body temperatures provided by Allen Shoemaker7 yielded 98.25 degrees and standard deviation 0.73 degrees. 48 | # 49 | # Give a 99% confidence interval for the average body temperature of healthy people. 50 | 51 | # In[13]: 52 | 53 | 54 | sample_mean = 98.25 55 | n = 130 56 | std = 0.73 57 | cl = 0.99 58 | 59 | 60 | # In[14]: 61 | 62 | 63 | std_e = std/math.sqrt(n) 64 | 65 | 66 | # In[15]: 67 | 68 | 69 | ci = stats.norm.interval(cl, sample_mean, std_e) 70 | 71 | 72 | # In[16]: 73 | 74 | 75 | print('Your {} z confidence interval is {}.'.format(cl, ci)) 76 | 77 | 78 | # ## 3 79 | # 80 | # The administrators for a hospital wished to estimate the average number of days required for inpatient treatment of patients between the ages of 25 and 34. A random sample of 500 hospital patients between these ages produced a mean and standard deviation equal to 5.4 and 3.1 days, respectively. 81 | # Construct a 95% confidence interval for the mean length of stay for the population of patients from which the sample was drawn. 82 | 83 | # In[18]: 84 | 85 | 86 | sample_mean = 5.4 87 | n = 500 88 | std = 3.1 89 | cl = 0.95 90 | 91 | 92 | # In[19]: 93 | 94 | 95 | std_e = std/math.sqrt(n) 96 | 97 | 98 | # In[20]: 99 | 100 | 101 | ci = stats.norm.interval(cl, sample_mean, std_e) 102 | 103 | 104 | # In[21]: 105 | 106 | 107 | print('Your {} z confidence interval is {}.'.format(cl, ci)) 108 | 109 | -------------------------------------------------------------------------------- /Statistics/Statistics_Exercise_5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Finding the Confidence Interval of Polling Figures" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "You are running a political campaign and decide to run 30 focus groups with about 10 people in each group. You get the results and want to report to your candidate the number of people who would vote for them in a typical 10-person group. Since there is some variability in each focus group, you decide that the most accurate way is to give a 95% z-confidence interval. You assume from past experience that the standard deviation is 2.89." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "1.Import the random Python package and set the seed to 39809. 
This will ensure that we get the same results every time we run the program:" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import random\n", 31 | "import math\n", 32 | "import numpy as np\n", 33 | "from scipy import stats\n", 34 | "random.seed(39809)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "2.Initialize our sample list and collect our samples from our focus groups. Use random.randint" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "sample_size = 30\n", 51 | "sample_list = []\n", 52 | "\n", 53 | "for i in range(30):\n", 54 | " sample_list.append(random.randint(0, 10))\n", 55 | "sample_mean = np.mean(sample_list)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "3.Calculate 95% z-confidence interval." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "sample_mean\n", 72 | "n = 30\n", 73 | "std = 2.89\n", 74 | "cl = 0.95" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "std_e = std/math.sqrt(n)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "ci = stats.norm.interval(cl, sample_mean, std_e)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "(3.965845784931483, 6.034154215068517)" 104 | ] 105 | }, 106 | "execution_count": 6, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "ci" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "Your 0.95 z confidence interval is (3.965845784931483, 6.034154215068517).\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "print('Your {} z confidence interval is {}.'.format(cl, ci))" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "4.If you did everything correctly, then the following should be printed when you run your notebook:" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | " Your 0.95 z confidence interval is (3.965845784931483, 6.034154215068517)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "# Hypothesis Testing" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "Your boss asks you to conduct a hypothesis test about the mean dwell time of a new type of UAV. Before you arrived, an experiment was conducted on n=5 UAVs (all of the new type) resulting in a sample mean dwell time of 10.4 hours. The goal is to conclusively demonstrate, if possible, that the data supports the manufacturer’s claim that the mean dwell time is greater than 10 hours. 
Given that it is reasonable to assume the dwell times are normally distributed, the sample standard deviation is s = 0.5 hours, and using a significance level of α = 0.01:" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "1.Write out the null and alternative hypotheses" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 8, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "# H0 : mu = 10\n", 174 | "# Ha : mu > 10" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "2.Calculate the test statistic" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 9, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "mu_sample = 10.4\n", 191 | "mu = 10\n", 192 | "s = 0.5\n", 193 | "n = 5\n", 194 | "alpha = 0.01" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 10, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "t_statistic = (mu_sample-mu)/(s/math.sqrt(n))" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 11, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "1.7888543819998335" 215 | ] 216 | }, 217 | "execution_count": 11, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "t_statistic" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "3.Find the p-value and state the outcome" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 12, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "df = n - 1" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 13, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "p_value = 1 - stats.t.cdf(t_statistic, df)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 14, 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "data": { 258 | "text/plain": [ 259 | "0.07407407407407385" 260 | ] 261 | }, 262 | "execution_count": 14, 263 | "metadata": {}, 264 | "output_type": "execute_result" 265 | } 266 | ], 267 | "source": [ 268 | "p_value" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 15, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "At 0.01 level of significance, we fail to reject the null hypothesis.\n" 281 | ] 282 | } 283 | ], 284 | "source": [ 285 | "if p_value 10 83 | 84 | 85 | # 2.Calculate the test statistic 86 | 87 | # In[9]: 88 | 89 | 90 | mu_sample = 10.4 91 | mu = 10 92 | s = 0.5 93 | n = 5 94 | alpha = 0.01 95 | 96 | 97 | # In[10]: 98 | 99 | 100 | t_statistic = (mu_sample-mu)/(s/math.sqrt(n)) 101 | 102 | 103 | # In[11]: 104 | 105 | 106 | t_statistic 107 | 108 | 109 | # 3.Find the p-value and state the outcome 110 | 111 | # In[12]: 112 | 113 | 114 | df = n - 1 115 | 116 | 117 | # In[13]: 118 | 119 | 120 | p_value = 1 - stats.t.cdf(t_statistic, df) 121 | 122 | 123 | # In[14]: 124 | 125 | 126 | p_value 127 | 128 | 129 | # In[15]: 130 | 131 | 132 | if p_value\n", 163 | "\n", 176 | "\n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " 
\n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | "
Soil1Soil2
01.4421.364
11.9431.878
21.1101.337
31.9121.828
41.5531.371
\n", 212 | "" 213 | ], 214 | "text/plain": [ 215 | " Soil1 Soil2\n", 216 | "0 1.442 1.364\n", 217 | "1 1.943 1.878\n", 218 | "2 1.110 1.337\n", 219 | "3 1.912 1.828\n", 220 | "4 1.553 1.371" 221 | ] 222 | }, 223 | "execution_count": 11, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "df.head()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 12, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "xbar1 = df['Soil1'].mean()\n", 239 | "xbar2 = df['Soil2'].mean()\n", 240 | "\n", 241 | "s1 = df['Soil1'].std()\n", 242 | "s2 = df['Soil2'].std()\n", 243 | "\n", 244 | "alpha = 0.01" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 13, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "t_statistic = ((xbar1 - xbar2) - 0) / math.sqrt(((s1 ** 2) / df['Soil1'].notna().sum()) + ((s2 ** 2) / df['Soil2'].notna().sum()))" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 14, 259 | "metadata": {}, 260 | "outputs": [ 261 | { 262 | "data": { 263 | "text/plain": [ 264 | "5.191460504717394" 265 | ] 266 | }, 267 | "execution_count": 14, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "t_statistic" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 15, 279 | "metadata": { 280 | "scrolled": false 281 | }, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/plain": [ 286 | "-2.575829303548901" 287 | ] 288 | }, 289 | "execution_count": 15, 290 | "metadata": {}, 291 | "output_type": "execute_result" 292 | } 293 | ], 294 | "source": [ 295 | "stats.norm.ppf(0.005) # 0.01 / 2" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 16, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "0.27468571428571464" 307 | ] 308 | }, 309 | "execution_count": 16, 310 | "metadata": {}, 311 | "output_type": "execute_result" 312 | } 313 | ], 314 | "source": [ 315 | "diff = xbar1 - xbar2\n", 316 | "diff" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 17, 322 | "metadata": {}, 323 | "outputs": [ 324 | { 325 | "data": { 326 | "text/plain": [ 327 | "0.05291106694081796" 328 | ] 329 | }, 330 | "execution_count": 17, 331 | "metadata": {}, 332 | "output_type": "execute_result" 333 | } 334 | ], 335 | "source": [ 336 | "std_e = math.sqrt(((s1 ** 2) / df['Soil1'].notna().sum()) + ((s2 ** 2) / df['Soil2'].notna().sum()))\n", 337 | "std_e" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 18, 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "data": { 347 | "text/plain": [ 348 | "0.410975590993911" 349 | ] 350 | }, 351 | "execution_count": 18, 352 | "metadata": {}, 353 | "output_type": "execute_result" 354 | } 355 | ], 356 | "source": [ 357 | "diff - stats.norm.ppf(0.005) * std_e" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 19, 363 | "metadata": {}, 364 | "outputs": [ 365 | { 366 | "data": { 367 | "text/plain": [ 368 | "0.13839583757751825" 369 | ] 370 | }, 371 | "execution_count": 19, 372 | "metadata": {}, 373 | "output_type": "execute_result" 374 | } 375 | ], 376 | "source": [ 377 | "diff + stats.norm.ppf(0.005) * std_e" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 20, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "# ci for xbar1 - xbar2 is 0.41 and 0.375 with %90 CL so we reject H0" 387 
| ] 388 | } 389 | ], 390 | "metadata": { 391 | "kernelspec": { 392 | "display_name": "Python 3", 393 | "language": "python", 394 | "name": "python3" 395 | }, 396 | "language_info": { 397 | "codemirror_mode": { 398 | "name": "ipython", 399 | "version": 3 400 | }, 401 | "file_extension": ".py", 402 | "mimetype": "text/x-python", 403 | "name": "python", 404 | "nbconvert_exporter": "python", 405 | "pygments_lexer": "ipython3", 406 | "version": "3.7.3" 407 | } 408 | }, 409 | "nbformat": 4, 410 | "nbformat_minor": 2 411 | } 412 | -------------------------------------------------------------------------------- /Statistics/Statistics_Exercise_6.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import pandas as pd 8 | import numpy as np 9 | from scipy import stats 10 | import math 11 | 12 | 13 | # ## EXERCISE 1. 14 | # The hourly wages in a particular industry are normally distributed with mean $13.20 and standard deviation $2.50. A company in this industry employs 40 workers, paying them an average of $12.20 per hour. Can this company be accused of paying substandard wages? Use an α = .01 level test. 15 | 16 | # In[2]: 17 | 18 | 19 | # H0: mu = 13.20 20 | # Ha: mu < 13.20 21 | # This is a one-tailed (left-tailed) test 22 | 23 | 24 | # In[3]: 25 | 26 | 27 | mu = 13.20 28 | mu_sample = 12.20 29 | n = 40 30 | std = 2.5 31 | alpha = 0.01 32 | 33 | 34 | # In[4]: 35 | 36 | 37 | t_statistic = (mu_sample - mu)/(std/math.sqrt(n)) # sigma is known, so this is really a z statistic 38 | 39 | 40 | # In[5]: 41 | 42 | 43 | t_statistic 44 | 45 | 46 | # In[6]: 47 | 48 | 49 | p_value = stats.norm.cdf(t_statistic) 50 | 51 | 52 | # In[7]: 53 | 54 | 55 | p_value 56 | 57 | 58 | # In[8]: 59 | 60 | 61 | if p_value < alpha: 62 | print('At {} level of significance, we can reject the null hypothesis in favor of Ha.'.format(alpha)) 63 | else: 64 | print('At {} level of significance, we fail to reject the null hypothesis.'.format(alpha)) 65 | 66 | 67 | # ## EXERCISE 2. 68 | # Shear strength measurements derived from unconfined compression tests for two types of soils gave the results shown in the following document (measurements in tons per square foot). Do the soils appear to differ with respect to average shear strength, at the 1% significance level?
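# Plan for EXERCISE 2 (the cells below compute each piece by hand): with two
# independent samples, the large-sample test statistic is
#     z = (xbar1 - xbar2) / sqrt(s1**2/n1 + s2**2/n2),
# and the 99% CI for the mean difference is (xbar1 - xbar2) +/- z_{0.995} * SE,
# where z_{0.995} = -stats.norm.ppf(0.005) ~ 2.576. H0: mu1 = mu2 is rejected
# when 0 lies outside that interval (equivalently, when |z| > 2.576).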
69 | 70 | # In[9]: 71 | 72 | 73 | # H0: mu1 = mu2 74 | # Ha: mu1 != mu2 75 | 76 | 77 | # In[10]: 78 | 79 | 80 | df = pd.read_csv('soil.csv') 81 | 82 | 83 | # In[11]: 84 | 85 | 86 | df.head() 87 | 88 | 89 | # In[12]: 90 | 91 | 92 | xbar1 = df['Soil1'].mean() 93 | xbar2 = df['Soil2'].mean() 94 | 95 | s1 = df['Soil1'].std() 96 | s2 = df['Soil2'].std() 97 | 98 | alpha = 0.01 99 | 100 | 101 | # In[13]: 102 | 103 | 104 | t_statistic = ((xbar1 - xbar2) - 0) / math.sqrt(((s1 ** 2) / df['Soil1'].notna().sum()) + ((s2 ** 2) / df['Soil2'].notna().sum())) 105 | 106 | 107 | # In[14]: 108 | 109 | 110 | t_statistic 111 | 112 | 113 | # In[15]: 114 | 115 | 116 | stats.norm.ppf(0.005) # 0.01 / 2 117 | 118 | 119 | # In[16]: 120 | 121 | 122 | diff = xbar1 - xbar2 123 | diff 124 | 125 | 126 | # In[17]: 127 | 128 | 129 | std_e = math.sqrt(((s1 ** 2) / df['Soil1'].notna().sum()) + ((s2 ** 2) / df['Soil2'].notna().sum())) 130 | std_e 131 | 132 | 133 | # In[18]: 134 | 135 | 136 | diff - stats.norm.ppf(0.005) * std_e # upper bound: ppf(0.005) is negative 137 | 138 | 139 | # In[19]: 140 | 141 | 142 | diff + stats.norm.ppf(0.005) * std_e # lower bound 143 | 144 | 145 | # In[20]: 146 | 147 | 148 | # The 99% CI for xbar1 - xbar2 is (0.138, 0.411); it excludes 0, so we reject H0 at the 1% significance level 149 | 150 | --------------------------------------------------------------------------------