├── README.md
├── 50_startups Success Rate Prediction
├── 50_Startups.csv
└── Multiple_Linear_Regression.ipynb
├── House_Price_Prediction Detailed Analysis
└── Feature_Selection.ipynb
└── Fraud Detection Using ML
└── fraud_detection.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # Machine Learning Projects
2 |
3 | Welcome to my Machine Learning Projects repository! This repository contains a collection of machine learning projects that cover various topics.
4 |
--------------------------------------------------------------------------------
/50_startups Success Rate Prediction/50_Startups.csv:
--------------------------------------------------------------------------------
1 | R&D Spend,Administration,Marketing Spend,State,Profit
2 | 165349.2,136897.8,471784.1,New York,192261.83
3 | 162597.7,151377.59,443898.53,California,191792.06
4 | 153441.51,101145.55,407934.54,Florida,191050.39
5 | 144372.41,118671.85,383199.62,New York,182901.99
6 | 142107.34,91391.77,366168.42,Florida,166187.94
7 | 131876.9,99814.71,362861.36,New York,156991.12
8 | 134615.46,147198.87,127716.82,California,156122.51
9 | 130298.13,145530.06,323876.68,Florida,155752.6
10 | 120542.52,148718.95,311613.29,New York,152211.77
11 | 123334.88,108679.17,304981.62,California,149759.96
12 | 101913.08,110594.11,229160.95,Florida,146121.95
13 | 100671.96,91790.61,249744.55,California,144259.4
14 | 93863.75,127320.38,249839.44,Florida,141585.52
15 | 91992.39,135495.07,252664.93,California,134307.35
16 | 119943.24,156547.42,256512.92,Florida,132602.65
17 | 114523.61,122616.84,261776.23,New York,129917.04
18 | 78013.11,121597.55,264346.06,California,126992.93
19 | 94657.16,145077.58,282574.31,New York,125370.37
20 | 91749.16,114175.79,294919.57,Florida,124266.9
21 | 86419.7,153514.11,0,New York,122776.86
22 | 76253.86,113867.3,298664.47,California,118474.03
23 | 78389.47,153773.43,299737.29,New York,111313.02
24 | 73994.56,122782.75,303319.26,Florida,110352.25
25 | 67532.53,105751.03,304768.73,Florida,108733.99
26 | 77044.01,99281.34,140574.81,New York,108552.04
27 | 64664.71,139553.16,137962.62,California,107404.34
28 | 75328.87,144135.98,134050.07,Florida,105733.54
29 | 72107.6,127864.55,353183.81,New York,105008.31
30 | 66051.52,182645.56,118148.2,Florida,103282.38
31 | 65605.48,153032.06,107138.38,New York,101004.64
32 | 61994.48,115641.28,91131.24,Florida,99937.59
33 | 61136.38,152701.92,88218.23,New York,97483.56
34 | 63408.86,129219.61,46085.25,California,97427.84
35 | 55493.95,103057.49,214634.81,Florida,96778.92
36 | 46426.07,157693.92,210797.67,California,96712.8
37 | 46014.02,85047.44,205517.64,New York,96479.51
38 | 28663.76,127056.21,201126.82,Florida,90708.19
39 | 44069.95,51283.14,197029.42,California,89949.14
40 | 20229.59,65947.93,185265.1,New York,81229.06
41 | 38558.51,82982.09,174999.3,California,81005.76
42 | 28754.33,118546.05,172795.67,California,78239.91
43 | 27892.92,84710.77,164470.71,Florida,77798.83
44 | 23640.93,96189.63,148001.11,California,71498.49
45 | 15505.73,127382.3,35534.17,New York,69758.98
46 | 22177.74,154806.14,28334.72,California,65200.33
47 | 1000.23,124153.04,1903.93,New York,64926.08
48 | 1315.46,115816.21,297114.46,Florida,49490.75
49 | 0,135426.92,0,California,42559.73
50 | 542.05,51743.15,0,New York,35673.41
51 | 0,116983.8,45173.06,California,14681.4
--------------------------------------------------------------------------------
/50_startups Success Rate Prediction/Multiple_Linear_Regression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import matplotlib.pyplot as plt\n",
11 | "import pandas as pd"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "df = pd.read_csv('50_Startups.csv')  # dataset file shipped next to this notebook in the repository"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 3,
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "data": {
30 | "text/html": [
31 | "
\n",
32 | "\n",
45 | "
\n",
46 | " \n",
47 | " \n",
48 | " | \n",
49 | " R&D Spend | \n",
50 | " Administration | \n",
51 | " Marketing Spend | \n",
52 | " State | \n",
53 | " Profit | \n",
54 | "
\n",
55 | " \n",
56 | " \n",
57 | " \n",
58 | " | 0 | \n",
59 | " 165349.20 | \n",
60 | " 136897.80 | \n",
61 | " 471784.10 | \n",
62 | " New York | \n",
63 | " 192261.83 | \n",
64 | "
\n",
65 | " \n",
66 | " | 1 | \n",
67 | " 162597.70 | \n",
68 | " 151377.59 | \n",
69 | " 443898.53 | \n",
70 | " California | \n",
71 | " 191792.06 | \n",
72 | "
\n",
73 | " \n",
74 | " | 2 | \n",
75 | " 153441.51 | \n",
76 | " 101145.55 | \n",
77 | " 407934.54 | \n",
78 | " Florida | \n",
79 | " 191050.39 | \n",
80 | "
\n",
81 | " \n",
82 | " | 3 | \n",
83 | " 144372.41 | \n",
84 | " 118671.85 | \n",
85 | " 383199.62 | \n",
86 | " New York | \n",
87 | " 182901.99 | \n",
88 | "
\n",
89 | " \n",
90 | " | 4 | \n",
91 | " 142107.34 | \n",
92 | " 91391.77 | \n",
93 | " 366168.42 | \n",
94 | " Florida | \n",
95 | " 166187.94 | \n",
96 | "
\n",
97 | " \n",
98 | "
\n",
99 | "
"
100 | ],
101 | "text/plain": [
102 | " R&D Spend Administration Marketing Spend State Profit\n",
103 | "0 165349.20 136897.80 471784.10 New York 192261.83\n",
104 | "1 162597.70 151377.59 443898.53 California 191792.06\n",
105 | "2 153441.51 101145.55 407934.54 Florida 191050.39\n",
106 | "3 144372.41 118671.85 383199.62 New York 182901.99\n",
107 | "4 142107.34 91391.77 366168.42 Florida 166187.94"
108 | ]
109 | },
110 | "execution_count": 3,
111 | "metadata": {},
112 | "output_type": "execute_result"
113 | }
114 | ],
115 | "source": [
116 | "df.head()"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 4,
122 | "metadata": {},
123 | "outputs": [
124 | {
125 | "data": {
126 | "text/plain": [
127 | "(50, 5)"
128 | ]
129 | },
130 | "execution_count": 4,
131 | "metadata": {},
132 | "output_type": "execute_result"
133 | }
134 | ],
135 | "source": [
136 | "df.shape"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 5,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "X = df.iloc[:, :-1] # independent data\n",
146 | "y = df.iloc[:, -1] # dependent data"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 6,
152 | "metadata": {},
153 | "outputs": [
154 | {
155 | "data": {
156 | "text/plain": [
157 | "New York 17\n",
158 | "California 17\n",
159 | "Florida 16\n",
160 | "Name: State, dtype: int64"
161 | ]
162 | },
163 | "execution_count": 6,
164 | "metadata": {},
165 | "output_type": "execute_result"
166 | }
167 | ],
168 | "source": [
169 | "df['State'].value_counts() ## count the rows per state"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 7,
175 | "metadata": {},
176 | "outputs": [
177 | {
178 | "data": {
179 | "text/plain": [
180 | "R&D Spend 0\n",
181 | "Administration 0\n",
182 | "Marketing Spend 0\n",
183 | "State 0\n",
184 | "Profit 0\n",
185 | "dtype: int64"
186 | ]
187 | },
188 | "execution_count": 7,
189 | "metadata": {},
190 | "output_type": "execute_result"
191 | }
192 | ],
193 | "source": [
194 | "df.isnull().sum() ## checking null values"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 9,
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "states = pd.get_dummies(X['State'], drop_first=True) # Converting State column into one-hot-encoding"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 12,
209 | "metadata": {},
210 | "outputs": [],
211 | "source": [
212 | "X = X.drop('State', axis=1) # dropping state column"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": 14,
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "X = pd.concat([X, states], axis=1)"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 15,
227 | "metadata": {},
228 | "outputs": [
229 | {
230 | "data": {
231 | "text/html": [
232 | "\n",
233 | "\n",
246 | "
\n",
247 | " \n",
248 | " \n",
249 | " | \n",
250 | " R&D Spend | \n",
251 | " Administration | \n",
252 | " Marketing Spend | \n",
253 | " Florida | \n",
254 | " New York | \n",
255 | "
\n",
256 | " \n",
257 | " \n",
258 | " \n",
259 | " | 0 | \n",
260 | " 165349.20 | \n",
261 | " 136897.80 | \n",
262 | " 471784.10 | \n",
263 | " 0 | \n",
264 | " 1 | \n",
265 | "
\n",
266 | " \n",
267 | " | 1 | \n",
268 | " 162597.70 | \n",
269 | " 151377.59 | \n",
270 | " 443898.53 | \n",
271 | " 0 | \n",
272 | " 0 | \n",
273 | "
\n",
274 | " \n",
275 | " | 2 | \n",
276 | " 153441.51 | \n",
277 | " 101145.55 | \n",
278 | " 407934.54 | \n",
279 | " 1 | \n",
280 | " 0 | \n",
281 | "
\n",
282 | " \n",
283 | " | 3 | \n",
284 | " 144372.41 | \n",
285 | " 118671.85 | \n",
286 | " 383199.62 | \n",
287 | " 0 | \n",
288 | " 1 | \n",
289 | "
\n",
290 | " \n",
291 | " | 4 | \n",
292 | " 142107.34 | \n",
293 | " 91391.77 | \n",
294 | " 366168.42 | \n",
295 | " 1 | \n",
296 | " 0 | \n",
297 | "
\n",
298 | " \n",
299 | "
\n",
300 | "
"
301 | ],
302 | "text/plain": [
303 | " R&D Spend Administration Marketing Spend Florida New York\n",
304 | "0 165349.20 136897.80 471784.10 0 1\n",
305 | "1 162597.70 151377.59 443898.53 0 0\n",
306 | "2 153441.51 101145.55 407934.54 1 0\n",
307 | "3 144372.41 118671.85 383199.62 0 1\n",
308 | "4 142107.34 91391.77 366168.42 1 0"
309 | ]
310 | },
311 | "execution_count": 15,
312 | "metadata": {},
313 | "output_type": "execute_result"
314 | }
315 | ],
316 | "source": [
317 | "X.head()"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": 16,
323 | "metadata": {},
324 | "outputs": [],
325 | "source": [
326 | "from sklearn.model_selection import train_test_split"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 17,
332 | "metadata": {},
333 | "outputs": [],
334 | "source": [
335 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": 18,
341 | "metadata": {},
342 | "outputs": [],
343 | "source": [
344 | "from sklearn.linear_model import LinearRegression"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": 19,
350 | "metadata": {},
351 | "outputs": [],
352 | "source": [
353 | "regressor = LinearRegression()"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": 20,
359 | "metadata": {},
360 | "outputs": [
361 | {
362 | "data": {
363 | "text/plain": [
364 | "LinearRegression()"
365 | ]
366 | },
367 | "execution_count": 20,
368 | "metadata": {},
369 | "output_type": "execute_result"
370 | }
371 | ],
372 | "source": [
373 | "regressor.fit(X_train, y_train)"
374 | ]
375 | },
376 | {
377 | "cell_type": "code",
378 | "execution_count": 21,
379 | "metadata": {},
380 | "outputs": [],
381 | "source": [
382 | "y_pred = regressor.predict(X_test)"
383 | ]
384 | },
385 | {
386 | "cell_type": "code",
387 | "execution_count": 24,
388 | "metadata": {},
389 | "outputs": [
390 | {
391 | "name": "stdout",
392 | "output_type": "stream",
393 | "text": [
394 | "[103015.20159796 132582.27760816 132447.73845174 71976.09851258\n",
395 | " 178537.48221055]\n",
396 | "28 103282.38\n",
397 | "11 144259.40\n",
398 | "10 146121.95\n",
399 | "41 77798.83\n",
400 | "2 191050.39\n",
401 | "27 105008.31\n",
402 | "38 81229.06\n",
403 | "31 97483.56\n",
404 | "22 110352.25\n",
405 | "4 166187.94\n",
406 | "Name: Profit, dtype: float64\n"
407 | ]
408 | }
409 | ],
410 | "source": [
411 | "print(y_pred[:5])\n",
412 | "print(y_test[:])"
413 | ]
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": 28,
418 | "metadata": {},
419 | "outputs": [],
420 | "source": [
421 | "from sklearn.metrics import r2_score"
422 | ]
423 | },
424 | {
425 | "cell_type": "code",
426 | "execution_count": 29,
427 | "metadata": {},
428 | "outputs": [],
429 | "source": [
430 | "score = r2_score(y_test, y_pred)"
431 | ]
432 | },
433 | {
434 | "cell_type": "code",
435 | "execution_count": 30,
436 | "metadata": {},
437 | "outputs": [
438 | {
439 | "name": "stdout",
440 | "output_type": "stream",
441 | "text": [
442 | "0.9347068473282423\n"
443 | ]
444 | }
445 | ],
446 | "source": [
447 | "print(score)"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": null,
453 | "metadata": {},
454 | "outputs": [],
455 | "source": []
456 | }
457 | ],
458 | "metadata": {
459 | "kernelspec": {
460 | "display_name": "Python 3",
461 | "language": "python",
462 | "name": "python3"
463 | },
464 | "language_info": {
465 | "codemirror_mode": {
466 | "name": "ipython",
467 | "version": 3
468 | },
469 | "file_extension": ".py",
470 | "mimetype": "text/x-python",
471 | "name": "python",
472 | "nbconvert_exporter": "python",
473 | "pygments_lexer": "ipython3",
474 | "version": "3.8.5"
475 | }
476 | },
477 | "nbformat": 4,
478 | "nbformat_minor": 4
479 | }
480 |
--------------------------------------------------------------------------------
/House_Price_Prediction Detailed Analysis/Feature_Selection.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "%matplotlib inline\n",
14 | "\n",
15 | "# for feature selection\n",
16 | "from sklearn.linear_model import Lasso\n",
17 | "from sklearn.feature_selection import SelectFromModel\n",
18 | "\n",
19 | "# to visualize all the columns of the dataframe\n",
20 | "pd.pandas.set_option('display.max_columns', None)"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "dataset = pd.read_csv('train.csv')"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "metadata": {},
36 | "outputs": [
37 | {
38 | "data": {
39 | "text/html": [
40 | "\n",
41 | "\n",
54 | "
\n",
55 | " \n",
56 | " \n",
57 | " | \n",
58 | " Unnamed: 0 | \n",
59 | " Id | \n",
60 | " SalePrice | \n",
61 | " MSSubClass | \n",
62 | " MSZoning | \n",
63 | " LotFrontage | \n",
64 | " LotArea | \n",
65 | " Street | \n",
66 | " Alley | \n",
67 | " LotShape | \n",
68 | " LandContour | \n",
69 | " Utilities | \n",
70 | " LotConfig | \n",
71 | " LandSlope | \n",
72 | " Neighborhood | \n",
73 | " Condition1 | \n",
74 | " Condition2 | \n",
75 | " BldgType | \n",
76 | " HouseStyle | \n",
77 | " OverallQual | \n",
78 | " OverallCond | \n",
79 | " YearBuilt | \n",
80 | " YearRemodAdd | \n",
81 | " RoofStyle | \n",
82 | " RoofMatl | \n",
83 | " Exterior1st | \n",
84 | " Exterior2nd | \n",
85 | " MasVnrType | \n",
86 | " MasVnrArea | \n",
87 | " ExterQual | \n",
88 | " ExterCond | \n",
89 | " Foundation | \n",
90 | " BsmtQual | \n",
91 | " BsmtCond | \n",
92 | " BsmtExposure | \n",
93 | " BsmtFinType1 | \n",
94 | " BsmtFinSF1 | \n",
95 | " BsmtFinType2 | \n",
96 | " BsmtFinSF2 | \n",
97 | " BsmtUnfSF | \n",
98 | " TotalBsmtSF | \n",
99 | " Heating | \n",
100 | " HeatingQC | \n",
101 | " CentralAir | \n",
102 | " Electrical | \n",
103 | " 1stFlrSF | \n",
104 | " 2ndFlrSF | \n",
105 | " LowQualFinSF | \n",
106 | " GrLivArea | \n",
107 | " BsmtFullBath | \n",
108 | " BsmtHalfBath | \n",
109 | " FullBath | \n",
110 | " HalfBath | \n",
111 | " BedroomAbvGr | \n",
112 | " KitchenAbvGr | \n",
113 | " KitchenQual | \n",
114 | " TotRmsAbvGrd | \n",
115 | " Functional | \n",
116 | " Fireplaces | \n",
117 | " FireplaceQu | \n",
118 | " GarageType | \n",
119 | " GarageYrBlt | \n",
120 | " GarageFinish | \n",
121 | " GarageCars | \n",
122 | " GarageArea | \n",
123 | " GarageQual | \n",
124 | " GarageCond | \n",
125 | " PavedDrive | \n",
126 | " WoodDeckSF | \n",
127 | " OpenPorchSF | \n",
128 | " EnclosedPorch | \n",
129 | " 3SsnPorch | \n",
130 | " ScreenPorch | \n",
131 | " PoolArea | \n",
132 | " PoolQC | \n",
133 | " Fence | \n",
134 | " MiscFeature | \n",
135 | " MiscVal | \n",
136 | " MoSold | \n",
137 | " YrSold | \n",
138 | " SaleType | \n",
139 | " SaleCondition | \n",
140 | " LotFrontagenan | \n",
141 | " MasVnrAreanan | \n",
142 | " GarageYrBltnan | \n",
143 | "
\n",
144 | " \n",
145 | " \n",
146 | " \n",
147 | " | 0 | \n",
148 | " 0 | \n",
149 | " 1 | \n",
150 | " 12.247694 | \n",
151 | " 0.235294 | \n",
152 | " 0.5 | \n",
153 | " 0.418208 | \n",
154 | " 0.366344 | \n",
155 | " 0.0 | \n",
156 | " 0.5 | \n",
157 | " 1.0 | \n",
158 | " 1.0 | \n",
159 | " 0.0 | \n",
160 | " 0.75 | \n",
161 | " 0.0 | \n",
162 | " 0.181818 | \n",
163 | " 0.4 | \n",
164 | " 0.0 | \n",
165 | " 0.0 | \n",
166 | " 0.4 | \n",
167 | " 0.666667 | \n",
168 | " 0.500 | \n",
169 | " 0.036765 | \n",
170 | " 0.098361 | \n",
171 | " 0.0 | \n",
172 | " 0.0 | \n",
173 | " 0.8 | \n",
174 | " 0.8 | \n",
175 | " 0.25 | \n",
176 | " 0.12250 | \n",
177 | " 0.333333 | \n",
178 | " 1.0 | \n",
179 | " 0.50 | \n",
180 | " 0.5 | \n",
181 | " 1.00 | \n",
182 | " 1.00 | \n",
183 | " 0.333333 | \n",
184 | " 0.125089 | \n",
185 | " 1.0 | \n",
186 | " 0.0 | \n",
187 | " 0.064212 | \n",
188 | " 0.140098 | \n",
189 | " 0.0 | \n",
190 | " 0.0 | \n",
191 | " 1.0 | \n",
192 | " 1.0 | \n",
193 | " 0.356155 | \n",
194 | " 0.413559 | \n",
195 | " 0.0 | \n",
196 | " 0.577712 | \n",
197 | " 0.333333 | \n",
198 | " 0.0 | \n",
199 | " 0.666667 | \n",
200 | " 0.5 | \n",
201 | " 0.375 | \n",
202 | " 0.333333 | \n",
203 | " 0.666667 | \n",
204 | " 0.500000 | \n",
205 | " 1.0 | \n",
206 | " 0.000000 | \n",
207 | " 0.6 | \n",
208 | " 0.0 | \n",
209 | " 0.046729 | \n",
210 | " 0.666667 | \n",
211 | " 0.50 | \n",
212 | " 0.386460 | \n",
213 | " 1.0 | \n",
214 | " 1.0 | \n",
215 | " 1.0 | \n",
216 | " 0.000000 | \n",
217 | " 0.111517 | \n",
218 | " 0.000000 | \n",
219 | " 0.0 | \n",
220 | " 0.0 | \n",
221 | " 0.0 | \n",
222 | " 0.0 | \n",
223 | " 0.5 | \n",
224 | " 0.0 | \n",
225 | " 0.0 | \n",
226 | " 0.090909 | \n",
227 | " 0.50 | \n",
228 | " 1.0 | \n",
229 | " 0.5 | \n",
230 | " 0.0 | \n",
231 | " 0.0 | \n",
232 | " 0.0 | \n",
233 | "
\n",
234 | " \n",
235 | " | 1 | \n",
236 | " 1 | \n",
237 | " 2 | \n",
238 | " 12.109011 | \n",
239 | " 0.000000 | \n",
240 | " 0.5 | \n",
241 | " 0.495064 | \n",
242 | " 0.391317 | \n",
243 | " 0.0 | \n",
244 | " 0.5 | \n",
245 | " 1.0 | \n",
246 | " 1.0 | \n",
247 | " 0.0 | \n",
248 | " 0.50 | \n",
249 | " 0.0 | \n",
250 | " 0.727273 | \n",
251 | " 0.2 | \n",
252 | " 0.0 | \n",
253 | " 0.0 | \n",
254 | " 0.2 | \n",
255 | " 0.555556 | \n",
256 | " 0.875 | \n",
257 | " 0.227941 | \n",
258 | " 0.524590 | \n",
259 | " 0.0 | \n",
260 | " 0.0 | \n",
261 | " 0.4 | \n",
262 | " 0.4 | \n",
263 | " 0.50 | \n",
264 | " 0.00000 | \n",
265 | " 1.000000 | \n",
266 | " 1.0 | \n",
267 | " 0.25 | \n",
268 | " 0.5 | \n",
269 | " 1.00 | \n",
270 | " 0.25 | \n",
271 | " 0.000000 | \n",
272 | " 0.173281 | \n",
273 | " 1.0 | \n",
274 | " 0.0 | \n",
275 | " 0.121575 | \n",
276 | " 0.206547 | \n",
277 | " 0.0 | \n",
278 | " 0.0 | \n",
279 | " 1.0 | \n",
280 | " 1.0 | \n",
281 | " 0.503056 | \n",
282 | " 0.000000 | \n",
283 | " 0.0 | \n",
284 | " 0.470245 | \n",
285 | " 0.000000 | \n",
286 | " 0.5 | \n",
287 | " 0.666667 | \n",
288 | " 0.0 | \n",
289 | " 0.375 | \n",
290 | " 0.333333 | \n",
291 | " 1.000000 | \n",
292 | " 0.333333 | \n",
293 | " 1.0 | \n",
294 | " 0.333333 | \n",
295 | " 1.0 | \n",
296 | " 0.0 | \n",
297 | " 0.289720 | \n",
298 | " 0.666667 | \n",
299 | " 0.50 | \n",
300 | " 0.324401 | \n",
301 | " 1.0 | \n",
302 | " 1.0 | \n",
303 | " 1.0 | \n",
304 | " 0.347725 | \n",
305 | " 0.000000 | \n",
306 | " 0.000000 | \n",
307 | " 0.0 | \n",
308 | " 0.0 | \n",
309 | " 0.0 | \n",
310 | " 0.0 | \n",
311 | " 0.5 | \n",
312 | " 0.0 | \n",
313 | " 0.0 | \n",
314 | " 0.363636 | \n",
315 | " 0.25 | \n",
316 | " 1.0 | \n",
317 | " 0.5 | \n",
318 | " 0.0 | \n",
319 | " 0.0 | \n",
320 | " 0.0 | \n",
321 | "
\n",
322 | " \n",
323 | " | 2 | \n",
324 | " 2 | \n",
325 | " 3 | \n",
326 | " 12.317167 | \n",
327 | " 0.235294 | \n",
328 | " 0.5 | \n",
329 | " 0.434909 | \n",
330 | " 0.422359 | \n",
331 | " 0.0 | \n",
332 | " 0.5 | \n",
333 | " 0.0 | \n",
334 | " 1.0 | \n",
335 | " 0.0 | \n",
336 | " 0.75 | \n",
337 | " 0.0 | \n",
338 | " 0.181818 | \n",
339 | " 0.4 | \n",
340 | " 0.0 | \n",
341 | " 0.0 | \n",
342 | " 0.4 | \n",
343 | " 0.666667 | \n",
344 | " 0.500 | \n",
345 | " 0.051471 | \n",
346 | " 0.114754 | \n",
347 | " 0.0 | \n",
348 | " 0.0 | \n",
349 | " 0.8 | \n",
350 | " 0.8 | \n",
351 | " 0.25 | \n",
352 | " 0.10125 | \n",
353 | " 0.333333 | \n",
354 | " 1.0 | \n",
355 | " 0.50 | \n",
356 | " 0.5 | \n",
357 | " 1.00 | \n",
358 | " 0.75 | \n",
359 | " 0.333333 | \n",
360 | " 0.086109 | \n",
361 | " 1.0 | \n",
362 | " 0.0 | \n",
363 | " 0.185788 | \n",
364 | " 0.150573 | \n",
365 | " 0.0 | \n",
366 | " 0.0 | \n",
367 | " 1.0 | \n",
368 | " 1.0 | \n",
369 | " 0.383441 | \n",
370 | " 0.419370 | \n",
371 | " 0.0 | \n",
372 | " 0.593095 | \n",
373 | " 0.333333 | \n",
374 | " 0.0 | \n",
375 | " 0.666667 | \n",
376 | " 0.5 | \n",
377 | " 0.375 | \n",
378 | " 0.333333 | \n",
379 | " 0.666667 | \n",
380 | " 0.333333 | \n",
381 | " 1.0 | \n",
382 | " 0.333333 | \n",
383 | " 1.0 | \n",
384 | " 0.0 | \n",
385 | " 0.065421 | \n",
386 | " 0.666667 | \n",
387 | " 0.50 | \n",
388 | " 0.428773 | \n",
389 | " 1.0 | \n",
390 | " 1.0 | \n",
391 | " 1.0 | \n",
392 | " 0.000000 | \n",
393 | " 0.076782 | \n",
394 | " 0.000000 | \n",
395 | " 0.0 | \n",
396 | " 0.0 | \n",
397 | " 0.0 | \n",
398 | " 0.0 | \n",
399 | " 0.5 | \n",
400 | " 0.0 | \n",
401 | " 0.0 | \n",
402 | " 0.727273 | \n",
403 | " 0.50 | \n",
404 | " 1.0 | \n",
405 | " 0.5 | \n",
406 | " 0.0 | \n",
407 | " 0.0 | \n",
408 | " 0.0 | \n",
409 | "
\n",
410 | " \n",
411 | " | 3 | \n",
412 | " 3 | \n",
413 | " 4 | \n",
414 | " 11.849398 | \n",
415 | " 0.294118 | \n",
416 | " 0.5 | \n",
417 | " 0.388581 | \n",
418 | " 0.390295 | \n",
419 | " 0.0 | \n",
420 | " 0.5 | \n",
421 | " 0.0 | \n",
422 | " 1.0 | \n",
423 | " 0.0 | \n",
424 | " 0.00 | \n",
425 | " 0.0 | \n",
426 | " 0.227273 | \n",
427 | " 0.4 | \n",
428 | " 0.0 | \n",
429 | " 0.0 | \n",
430 | " 0.4 | \n",
431 | " 0.666667 | \n",
432 | " 0.500 | \n",
433 | " 0.669118 | \n",
434 | " 0.606557 | \n",
435 | " 0.0 | \n",
436 | " 0.0 | \n",
437 | " 0.9 | \n",
438 | " 1.0 | \n",
439 | " 0.50 | \n",
440 | " 0.00000 | \n",
441 | " 1.000000 | \n",
442 | " 1.0 | \n",
443 | " 0.00 | \n",
444 | " 1.0 | \n",
445 | " 0.25 | \n",
446 | " 1.00 | \n",
447 | " 0.000000 | \n",
448 | " 0.038271 | \n",
449 | " 1.0 | \n",
450 | " 0.0 | \n",
451 | " 0.231164 | \n",
452 | " 0.123732 | \n",
453 | " 0.0 | \n",
454 | " 0.5 | \n",
455 | " 1.0 | \n",
456 | " 1.0 | \n",
457 | " 0.399941 | \n",
458 | " 0.366102 | \n",
459 | " 0.0 | \n",
460 | " 0.579157 | \n",
461 | " 0.333333 | \n",
462 | " 0.0 | \n",
463 | " 0.333333 | \n",
464 | " 0.0 | \n",
465 | " 0.375 | \n",
466 | " 0.333333 | \n",
467 | " 0.666667 | \n",
468 | " 0.416667 | \n",
469 | " 1.0 | \n",
470 | " 0.333333 | \n",
471 | " 0.4 | \n",
472 | " 0.6 | \n",
473 | " 0.074766 | \n",
474 | " 1.000000 | \n",
475 | " 0.75 | \n",
476 | " 0.452750 | \n",
477 | " 1.0 | \n",
478 | " 1.0 | \n",
479 | " 1.0 | \n",
480 | " 0.000000 | \n",
481 | " 0.063985 | \n",
482 | " 0.492754 | \n",
483 | " 0.0 | \n",
484 | " 0.0 | \n",
485 | " 0.0 | \n",
486 | " 0.0 | \n",
487 | " 0.5 | \n",
488 | " 0.0 | \n",
489 | " 0.0 | \n",
490 | " 0.090909 | \n",
491 | " 0.00 | \n",
492 | " 1.0 | \n",
493 | " 0.0 | \n",
494 | " 0.0 | \n",
495 | " 0.0 | \n",
496 | " 0.0 | \n",
497 | "
\n",
498 | " \n",
499 | " | 4 | \n",
500 | " 4 | \n",
501 | " 5 | \n",
502 | " 12.429216 | \n",
503 | " 0.235294 | \n",
504 | " 0.5 | \n",
505 | " 0.513123 | \n",
506 | " 0.468761 | \n",
507 | " 0.0 | \n",
508 | " 0.5 | \n",
509 | " 0.0 | \n",
510 | " 1.0 | \n",
511 | " 0.0 | \n",
512 | " 0.50 | \n",
513 | " 0.0 | \n",
514 | " 0.590909 | \n",
515 | " 0.4 | \n",
516 | " 0.0 | \n",
517 | " 0.0 | \n",
518 | " 0.4 | \n",
519 | " 0.777778 | \n",
520 | " 0.500 | \n",
521 | " 0.058824 | \n",
522 | " 0.147541 | \n",
523 | " 0.0 | \n",
524 | " 0.0 | \n",
525 | " 0.8 | \n",
526 | " 0.8 | \n",
527 | " 0.25 | \n",
528 | " 0.21875 | \n",
529 | " 0.333333 | \n",
530 | " 1.0 | \n",
531 | " 0.50 | \n",
532 | " 0.5 | \n",
533 | " 1.00 | \n",
534 | " 0.00 | \n",
535 | " 0.333333 | \n",
536 | " 0.116052 | \n",
537 | " 1.0 | \n",
538 | " 0.0 | \n",
539 | " 0.209760 | \n",
540 | " 0.187398 | \n",
541 | " 0.0 | \n",
542 | " 0.0 | \n",
543 | " 1.0 | \n",
544 | " 1.0 | \n",
545 | " 0.466237 | \n",
546 | " 0.509927 | \n",
547 | " 0.0 | \n",
548 | " 0.666523 | \n",
549 | " 0.333333 | \n",
550 | " 0.0 | \n",
551 | " 0.666667 | \n",
552 | " 0.5 | \n",
553 | " 0.500 | \n",
554 | " 0.333333 | \n",
555 | " 0.666667 | \n",
556 | " 0.583333 | \n",
557 | " 1.0 | \n",
558 | " 0.333333 | \n",
559 | " 1.0 | \n",
560 | " 0.0 | \n",
561 | " 0.074766 | \n",
562 | " 0.666667 | \n",
563 | " 0.75 | \n",
564 | " 0.589563 | \n",
565 | " 1.0 | \n",
566 | " 1.0 | \n",
567 | " 1.0 | \n",
568 | " 0.224037 | \n",
569 | " 0.153565 | \n",
570 | " 0.000000 | \n",
571 | " 0.0 | \n",
572 | " 0.0 | \n",
573 | " 0.0 | \n",
574 | " 0.0 | \n",
575 | " 0.5 | \n",
576 | " 0.0 | \n",
577 | " 0.0 | \n",
578 | " 1.000000 | \n",
579 | " 0.50 | \n",
580 | " 1.0 | \n",
581 | " 0.5 | \n",
582 | " 0.0 | \n",
583 | " 0.0 | \n",
584 | " 0.0 | \n",
585 | "
\n",
586 | " \n",
587 | "
\n",
588 | "
"
589 | ],
590 | "text/plain": [
591 | " Unnamed: 0 Id SalePrice MSSubClass MSZoning LotFrontage LotArea \\\n",
592 | "0 0 1 12.247694 0.235294 0.5 0.418208 0.366344 \n",
593 | "1 1 2 12.109011 0.000000 0.5 0.495064 0.391317 \n",
594 | "2 2 3 12.317167 0.235294 0.5 0.434909 0.422359 \n",
595 | "3 3 4 11.849398 0.294118 0.5 0.388581 0.390295 \n",
596 | "4 4 5 12.429216 0.235294 0.5 0.513123 0.468761 \n",
597 | "\n",
598 | " Street Alley LotShape LandContour Utilities LotConfig LandSlope \\\n",
599 | "0 0.0 0.5 1.0 1.0 0.0 0.75 0.0 \n",
600 | "1 0.0 0.5 1.0 1.0 0.0 0.50 0.0 \n",
601 | "2 0.0 0.5 0.0 1.0 0.0 0.75 0.0 \n",
602 | "3 0.0 0.5 0.0 1.0 0.0 0.00 0.0 \n",
603 | "4 0.0 0.5 0.0 1.0 0.0 0.50 0.0 \n",
604 | "\n",
605 | " Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual \\\n",
606 | "0 0.181818 0.4 0.0 0.0 0.4 0.666667 \n",
607 | "1 0.727273 0.2 0.0 0.0 0.2 0.555556 \n",
608 | "2 0.181818 0.4 0.0 0.0 0.4 0.666667 \n",
609 | "3 0.227273 0.4 0.0 0.0 0.4 0.666667 \n",
610 | "4 0.590909 0.4 0.0 0.0 0.4 0.777778 \n",
611 | "\n",
612 | " OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st \\\n",
613 | "0 0.500 0.036765 0.098361 0.0 0.0 0.8 \n",
614 | "1 0.875 0.227941 0.524590 0.0 0.0 0.4 \n",
615 | "2 0.500 0.051471 0.114754 0.0 0.0 0.8 \n",
616 | "3 0.500 0.669118 0.606557 0.0 0.0 0.9 \n",
617 | "4 0.500 0.058824 0.147541 0.0 0.0 0.8 \n",
618 | "\n",
619 | " Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation \\\n",
620 | "0 0.8 0.25 0.12250 0.333333 1.0 0.50 \n",
621 | "1 0.4 0.50 0.00000 1.000000 1.0 0.25 \n",
622 | "2 0.8 0.25 0.10125 0.333333 1.0 0.50 \n",
623 | "3 1.0 0.50 0.00000 1.000000 1.0 0.00 \n",
624 | "4 0.8 0.25 0.21875 0.333333 1.0 0.50 \n",
625 | "\n",
626 | " BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 \\\n",
627 | "0 0.5 1.00 1.00 0.333333 0.125089 1.0 \n",
628 | "1 0.5 1.00 0.25 0.000000 0.173281 1.0 \n",
629 | "2 0.5 1.00 0.75 0.333333 0.086109 1.0 \n",
630 | "3 1.0 0.25 1.00 0.000000 0.038271 1.0 \n",
631 | "4 0.5 1.00 0.00 0.333333 0.116052 1.0 \n",
632 | "\n",
633 | " BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir \\\n",
634 | "0 0.0 0.064212 0.140098 0.0 0.0 1.0 \n",
635 | "1 0.0 0.121575 0.206547 0.0 0.0 1.0 \n",
636 | "2 0.0 0.185788 0.150573 0.0 0.0 1.0 \n",
637 | "3 0.0 0.231164 0.123732 0.0 0.5 1.0 \n",
638 | "4 0.0 0.209760 0.187398 0.0 0.0 1.0 \n",
639 | "\n",
640 | " Electrical 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath \\\n",
641 | "0 1.0 0.356155 0.413559 0.0 0.577712 0.333333 \n",
642 | "1 1.0 0.503056 0.000000 0.0 0.470245 0.000000 \n",
643 | "2 1.0 0.383441 0.419370 0.0 0.593095 0.333333 \n",
644 | "3 1.0 0.399941 0.366102 0.0 0.579157 0.333333 \n",
645 | "4 1.0 0.466237 0.509927 0.0 0.666523 0.333333 \n",
646 | "\n",
647 | " BsmtHalfBath FullBath HalfBath BedroomAbvGr KitchenAbvGr KitchenQual \\\n",
648 | "0 0.0 0.666667 0.5 0.375 0.333333 0.666667 \n",
649 | "1 0.5 0.666667 0.0 0.375 0.333333 1.000000 \n",
650 | "2 0.0 0.666667 0.5 0.375 0.333333 0.666667 \n",
651 | "3 0.0 0.333333 0.0 0.375 0.333333 0.666667 \n",
652 | "4 0.0 0.666667 0.5 0.500 0.333333 0.666667 \n",
653 | "\n",
654 | " TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType GarageYrBlt \\\n",
655 | "0 0.500000 1.0 0.000000 0.6 0.0 0.046729 \n",
656 | "1 0.333333 1.0 0.333333 1.0 0.0 0.289720 \n",
657 | "2 0.333333 1.0 0.333333 1.0 0.0 0.065421 \n",
658 | "3 0.416667 1.0 0.333333 0.4 0.6 0.074766 \n",
659 | "4 0.583333 1.0 0.333333 1.0 0.0 0.074766 \n",
660 | "\n",
661 | " GarageFinish GarageCars GarageArea GarageQual GarageCond PavedDrive \\\n",
662 | "0 0.666667 0.50 0.386460 1.0 1.0 1.0 \n",
663 | "1 0.666667 0.50 0.324401 1.0 1.0 1.0 \n",
664 | "2 0.666667 0.50 0.428773 1.0 1.0 1.0 \n",
665 | "3 1.000000 0.75 0.452750 1.0 1.0 1.0 \n",
666 | "4 0.666667 0.75 0.589563 1.0 1.0 1.0 \n",
667 | "\n",
668 | " WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch ScreenPorch PoolArea \\\n",
669 | "0 0.000000 0.111517 0.000000 0.0 0.0 0.0 \n",
670 | "1 0.347725 0.000000 0.000000 0.0 0.0 0.0 \n",
671 | "2 0.000000 0.076782 0.000000 0.0 0.0 0.0 \n",
672 | "3 0.000000 0.063985 0.492754 0.0 0.0 0.0 \n",
673 | "4 0.224037 0.153565 0.000000 0.0 0.0 0.0 \n",
674 | "\n",
675 | " PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType \\\n",
676 | "0 0.0 0.5 0.0 0.0 0.090909 0.50 1.0 \n",
677 | "1 0.0 0.5 0.0 0.0 0.363636 0.25 1.0 \n",
678 | "2 0.0 0.5 0.0 0.0 0.727273 0.50 1.0 \n",
679 | "3 0.0 0.5 0.0 0.0 0.090909 0.00 1.0 \n",
680 | "4 0.0 0.5 0.0 0.0 1.000000 0.50 1.0 \n",
681 | "\n",
682 | " SaleCondition LotFrontagenan MasVnrAreanan GarageYrBltnan \n",
683 | "0 0.5 0.0 0.0 0.0 \n",
684 | "1 0.5 0.0 0.0 0.0 \n",
685 | "2 0.5 0.0 0.0 0.0 \n",
686 | "3 0.0 0.0 0.0 0.0 \n",
687 | "4 0.5 0.0 0.0 0.0 "
688 | ]
689 | },
690 | "execution_count": 3,
691 | "metadata": {},
692 | "output_type": "execute_result"
693 | }
694 | ],
695 | "source": [
696 | "dataset.head()"
697 | ]
698 | },
699 | {
700 | "cell_type": "code",
701 | "execution_count": 4,
702 | "metadata": {},
703 | "outputs": [],
704 | "source": [
705 | "# capture the dependent dataset\n",
706 | "y_train = dataset[['SalePrice']]"
707 | ]
708 | },
709 | {
710 | "cell_type": "code",
711 | "execution_count": 5,
712 | "metadata": {},
713 | "outputs": [],
714 | "source": [
715 | "# drop the target and the non-predictive identifier columns (saved row index and Id)\n",
716 | "X_train = dataset.drop(['Unnamed: 0', 'Id', 'SalePrice'], axis=1)"
717 | ]
718 | },
719 | {
720 | "cell_type": "code",
721 | "execution_count": 6,
722 | "metadata": {},
723 | "outputs": [
724 | {
725 | "data": {
726 | "text/plain": [
727 | "SelectFromModel(estimator=Lasso(alpha=0.05, random_state=0))"
728 | ]
729 | },
730 | "execution_count": 6,
731 | "metadata": {},
732 | "output_type": "execute_result"
733 | }
734 | ],
735 | "source": [
736 | "# Apply Feature Selection \n",
737 | "# First, specify the Lasso regression model and\n",
738 | "# choose a suitable alpha (the strength of the L1 penalty).\n",
739 | "# The bigger the alpha, the fewer features will be selected.\n",
740 | "\n",
741 | "# Then use the SelectFromModel object from sklearn, which\n",
742 | "# selects the features whose coefficients are non-zero\n",
743 | "\n",
744 | "feature_sel_model = SelectFromModel(Lasso(alpha=0.05, random_state=0))\n",
745 | "feature_sel_model.fit(X_train, y_train)"
746 | ]
747 | },
748 | {
749 | "cell_type": "code",
750 | "execution_count": 7,
751 | "metadata": {},
752 | "outputs": [
753 | {
754 | "data": {
755 | "text/plain": [
756 | "array([ True, False, False, False, False, False, False, False, False,\n",
757 | " False, False, False, False, False, False, False, False, False,\n",
758 | " False, False, True, False, False, False, False, False, False,\n",
759 | " True, False, False, True, False, False, False, False, False,\n",
760 | " False, False, False, False, False, False, False, False, False,\n",
761 | " False, False, False, False, False, False, False, False, False,\n",
762 | " False, False, False, False, False, False, False, False, False,\n",
763 | " False, False, False, False, False, False, False, False, False,\n",
764 | " False, False, False, False, False, False, False, False, False,\n",
765 | " False, False])"
766 | ]
767 | },
768 | "execution_count": 7,
769 | "metadata": {},
770 | "output_type": "execute_result"
771 | }
772 | ],
773 | "source": [
774 | "feature_sel_model.get_support()"
775 | ]
776 | },
777 | {
778 | "cell_type": "code",
779 | "execution_count": 14,
780 | "metadata": {},
781 | "outputs": [
782 | {
783 | "name": "stdout",
784 | "output_type": "stream",
785 | "text": [
786 | "Total features: 83\n",
787 | "selected featrues: 4\n",
788 | "features with cofficients shrank to zero: 79\n"
789 | ]
790 | }
791 | ],
792 | "source": [
793 | "# Let's print the number of total and selected features\n",
794 | "# this is how we can make a list of the selected features\n",
795 | "\n",
796 | "selected_feat = X_train.columns[(feature_sel_model.get_support())]\n",
797 | "\n",
798 | "# let's print some stats\n",
799 | "print('Total features: {}'.format(X_train.shape[1]))\n",
800 | "print(\"selected featrues: {}\".format(len(selected_feat)))\n",
801 | "print('features with cofficients shrank to zero: {}'.format(\n",
802 | " np.sum(feature_sel_model.estimator_.coef_ == 0)))"
803 | ]
804 | },
805 | {
806 | "cell_type": "code",
807 | "execution_count": 16,
808 | "metadata": {},
809 | "outputs": [],
810 | "source": [
811 | "X_train = X_train[selected_feat]"
812 | ]
813 | },
814 | {
815 | "cell_type": "code",
816 | "execution_count": 17,
817 | "metadata": {},
818 | "outputs": [
819 | {
820 | "data": {
821 | "text/html": [
822 | "\n",
823 | "\n",
836 | "
\n",
837 | " \n",
838 | " \n",
839 | " | \n",
840 | " Unnamed: 0 | \n",
841 | " YearRemodAdd | \n",
842 | " ExterQual | \n",
843 | " BsmtQual | \n",
844 | "
\n",
845 | " \n",
846 | " \n",
847 | " \n",
848 | " | 0 | \n",
849 | " 0 | \n",
850 | " 0.098361 | \n",
851 | " 0.333333 | \n",
852 | " 0.5 | \n",
853 | "
\n",
854 | " \n",
855 | " | 1 | \n",
856 | " 1 | \n",
857 | " 0.524590 | \n",
858 | " 1.000000 | \n",
859 | " 0.5 | \n",
860 | "
\n",
861 | " \n",
862 | " | 2 | \n",
863 | " 2 | \n",
864 | " 0.114754 | \n",
865 | " 0.333333 | \n",
866 | " 0.5 | \n",
867 | "
\n",
868 | " \n",
869 | " | 3 | \n",
870 | " 3 | \n",
871 | " 0.606557 | \n",
872 | " 1.000000 | \n",
873 | " 1.0 | \n",
874 | "
\n",
875 | " \n",
876 | " | 4 | \n",
877 | " 4 | \n",
878 | " 0.147541 | \n",
879 | " 0.333333 | \n",
880 | " 0.5 | \n",
881 | "
\n",
882 | " \n",
883 | "
\n",
884 | "
"
885 | ],
886 | "text/plain": [
887 | " Unnamed: 0 YearRemodAdd ExterQual BsmtQual\n",
888 | "0 0 0.098361 0.333333 0.5\n",
889 | "1 1 0.524590 1.000000 0.5\n",
890 | "2 2 0.114754 0.333333 0.5\n",
891 | "3 3 0.606557 1.000000 1.0\n",
892 | "4 4 0.147541 0.333333 0.5"
893 | ]
894 | },
895 | "execution_count": 17,
896 | "metadata": {},
897 | "output_type": "execute_result"
898 | }
899 | ],
900 | "source": [
901 | "X_train.head()"
902 | ]
903 | },
904 | {
905 | "cell_type": "code",
906 | "execution_count": null,
907 | "metadata": {},
908 | "outputs": [],
909 | "source": []
910 | }
911 | ],
912 | "metadata": {
913 | "kernelspec": {
914 | "display_name": "Python 3",
915 | "language": "python",
916 | "name": "python3"
917 | },
918 | "language_info": {
919 | "codemirror_mode": {
920 | "name": "ipython",
921 | "version": 3
922 | },
923 | "file_extension": ".py",
924 | "mimetype": "text/x-python",
925 | "name": "python",
926 | "nbconvert_exporter": "python",
927 | "pygments_lexer": "ipython3",
928 | "version": "3.8.5"
929 | }
930 | },
931 | "nbformat": 4,
932 | "nbformat_minor": 4
933 | }
934 |
--------------------------------------------------------------------------------
/Fraud Detection Using ML/fraud_detection.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np \n",
10 | "import pandas as pd \n",
11 | "import seaborn as sns \n",
12 | "import matplotlib.pyplot as plt \n",
13 | "%matplotlib inline\n",
14 | "from sklearn.model_selection import train_test_split \n",
15 | "from sklearn.preprocessing import StandardScaler\n",
16 | "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n",
17 | "from sklearn.linear_model import LogisticRegression \n",
18 | "\n",
19 | "sns.set_style('darkgrid')"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 35,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "## Dataset \n",
29 | "df = pd.read_csv('payment_fraud.csv')"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 36,
35 | "metadata": {},
36 | "outputs": [
37 | {
38 | "data": {
39 | "text/html": [
40 | "\n",
41 | "\n",
54 | "
\n",
55 | " \n",
56 | " \n",
57 | " | \n",
58 | " accountAgeDays | \n",
59 | " numItems | \n",
60 | " localTime | \n",
61 | " paymentMethod | \n",
62 | " paymentMethodAgeDays | \n",
63 | " label | \n",
64 | "
\n",
65 | " \n",
66 | " \n",
67 | " \n",
68 | " | 0 | \n",
69 | " 29 | \n",
70 | " 1 | \n",
71 | " 4.745402 | \n",
72 | " paypal | \n",
73 | " 28.204861 | \n",
74 | " 0 | \n",
75 | "
\n",
76 | " \n",
77 | " | 1 | \n",
78 | " 725 | \n",
79 | " 1 | \n",
80 | " 4.742303 | \n",
81 | " storecredit | \n",
82 | " 0.000000 | \n",
83 | " 0 | \n",
84 | "
\n",
85 | " \n",
86 | " | 2 | \n",
87 | " 845 | \n",
88 | " 1 | \n",
89 | " 4.921318 | \n",
90 | " creditcard | \n",
91 | " 0.000000 | \n",
92 | " 0 | \n",
93 | "
\n",
94 | " \n",
95 | " | 3 | \n",
96 | " 503 | \n",
97 | " 1 | \n",
98 | " 4.886641 | \n",
99 | " creditcard | \n",
100 | " 0.000000 | \n",
101 | " 0 | \n",
102 | "
\n",
103 | " \n",
104 | " | 4 | \n",
105 | " 2000 | \n",
106 | " 1 | \n",
107 | " 5.040929 | \n",
108 | " creditcard | \n",
109 | " 0.000000 | \n",
110 | " 0 | \n",
111 | "
\n",
112 | " \n",
113 | "
\n",
114 | "
"
115 | ],
116 | "text/plain": [
117 | " accountAgeDays numItems localTime paymentMethod paymentMethodAgeDays \\\n",
118 | "0 29 1 4.745402 paypal 28.204861 \n",
119 | "1 725 1 4.742303 storecredit 0.000000 \n",
120 | "2 845 1 4.921318 creditcard 0.000000 \n",
121 | "3 503 1 4.886641 creditcard 0.000000 \n",
122 | "4 2000 1 5.040929 creditcard 0.000000 \n",
123 | "\n",
124 | " label \n",
125 | "0 0 \n",
126 | "1 0 \n",
127 | "2 0 \n",
128 | "3 0 \n",
129 | "4 0 "
130 | ]
131 | },
132 | "execution_count": 36,
133 | "metadata": {},
134 | "output_type": "execute_result"
135 | }
136 | ],
137 | "source": [
138 | "df.head()"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 37,
144 | "metadata": {},
145 | "outputs": [
146 | {
147 | "data": {
148 | "text/plain": [
149 | "accountAgeDays 0\n",
150 | "numItems 0\n",
151 | "localTime 0\n",
152 | "paymentMethod 0\n",
153 | "paymentMethodAgeDays 0\n",
154 | "label 0\n",
155 | "dtype: int64"
156 | ]
157 | },
158 | "execution_count": 37,
159 | "metadata": {},
160 | "output_type": "execute_result"
161 | }
162 | ],
163 | "source": [
164 | "df.isnull().sum() ## checking for null values "
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 39,
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "data": {
174 | "text/plain": [
175 | "0 38661\n",
176 | "1 560\n",
177 | "Name: label, dtype: int64"
178 | ]
179 | },
180 | "execution_count": 39,
181 | "metadata": {},
182 | "output_type": "execute_result"
183 | }
184 | ],
185 | "source": [
186 | "df.label.value_counts() ## count the number of 0's and 1's"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 40,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "## converting the paymentMethod column into integer labels (label encoding)\n",
196 | "paymthd_label = {v:k for k, v in enumerate(df.paymentMethod.unique())}\n",
197 | "\n",
198 | "df.paymentMethod = df.paymentMethod.map(paymthd_label)"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 41,
204 | "metadata": {},
205 | "outputs": [
206 | {
207 | "data": {
208 | "text/html": [
209 | "\n",
210 | "\n",
223 | "
\n",
224 | " \n",
225 | " \n",
226 | " | \n",
227 | " accountAgeDays | \n",
228 | " numItems | \n",
229 | " localTime | \n",
230 | " paymentMethod | \n",
231 | " paymentMethodAgeDays | \n",
232 | " label | \n",
233 | "
\n",
234 | " \n",
235 | " \n",
236 | " \n",
237 | " | 0 | \n",
238 | " 29 | \n",
239 | " 1 | \n",
240 | " 4.745402 | \n",
241 | " 0 | \n",
242 | " 28.204861 | \n",
243 | " 0 | \n",
244 | "
\n",
245 | " \n",
246 | " | 1 | \n",
247 | " 725 | \n",
248 | " 1 | \n",
249 | " 4.742303 | \n",
250 | " 1 | \n",
251 | " 0.000000 | \n",
252 | " 0 | \n",
253 | "
\n",
254 | " \n",
255 | " | 2 | \n",
256 | " 845 | \n",
257 | " 1 | \n",
258 | " 4.921318 | \n",
259 | " 2 | \n",
260 | " 0.000000 | \n",
261 | " 0 | \n",
262 | "
\n",
263 | " \n",
264 | " | 3 | \n",
265 | " 503 | \n",
266 | " 1 | \n",
267 | " 4.886641 | \n",
268 | " 2 | \n",
269 | " 0.000000 | \n",
270 | " 0 | \n",
271 | "
\n",
272 | " \n",
273 | " | 4 | \n",
274 | " 2000 | \n",
275 | " 1 | \n",
276 | " 5.040929 | \n",
277 | " 2 | \n",
278 | " 0.000000 | \n",
279 | " 0 | \n",
280 | "
\n",
281 | " \n",
282 | "
\n",
283 | "
"
284 | ],
285 | "text/plain": [
286 | " accountAgeDays numItems localTime paymentMethod paymentMethodAgeDays \\\n",
287 | "0 29 1 4.745402 0 28.204861 \n",
288 | "1 725 1 4.742303 1 0.000000 \n",
289 | "2 845 1 4.921318 2 0.000000 \n",
290 | "3 503 1 4.886641 2 0.000000 \n",
291 | "4 2000 1 5.040929 2 0.000000 \n",
292 | "\n",
293 | " label \n",
294 | "0 0 \n",
295 | "1 0 \n",
296 | "2 0 \n",
297 | "3 0 \n",
298 | "4 0 "
299 | ]
300 | },
301 | "execution_count": 41,
302 | "metadata": {},
303 | "output_type": "execute_result"
304 | }
305 | ],
306 | "source": [
307 | "df.head()"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 42,
313 | "metadata": {},
314 | "outputs": [
315 | {
316 | "data": {
317 | "image/png": "",
318 | "text/plain": [
319 | ""
320 | ]
321 | },
322 | "metadata": {},
323 | "output_type": "display_data"
324 | }
325 | ],
326 | "source": [
327 | "## corr(): gives the pairwise correlation between the features\n",
328 | "plt.figure(figsize=(10, 10))\n",
329 | "sns.heatmap(df.corr(), annot=True);"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": 43,
335 | "metadata": {},
336 | "outputs": [
337 | {
338 | "data": {
339 | "text/html": [
340 | "\n",
341 | "\n",
354 | "
\n",
355 | " \n",
356 | " \n",
357 | " | \n",
358 | " accountAgeDays | \n",
359 | " numItems | \n",
360 | " localTime | \n",
361 | " paymentMethod | \n",
362 | " paymentMethodAgeDays | \n",
363 | " label | \n",
364 | "
\n",
365 | " \n",
366 | " \n",
367 | " \n",
368 | " | count | \n",
369 | " 39221.000000 | \n",
370 | " 39221.000000 | \n",
371 | " 39221.000000 | \n",
372 | " 39221.000000 | \n",
373 | " 39221.000000 | \n",
374 | " 39221.000000 | \n",
375 | "
\n",
376 | " \n",
377 | " | mean | \n",
378 | " 857.563984 | \n",
379 | " 1.084751 | \n",
380 | " 4.748232 | \n",
381 | " 1.476811 | \n",
382 | " 122.641326 | \n",
383 | " 0.014278 | \n",
384 | "
\n",
385 | " \n",
386 | " | std | \n",
387 | " 804.788212 | \n",
388 | " 0.566899 | \n",
389 | " 0.389360 | \n",
390 | " 0.850805 | \n",
391 | " 283.569177 | \n",
392 | " 0.118636 | \n",
393 | "
\n",
394 | " \n",
395 | " | min | \n",
396 | " 1.000000 | \n",
397 | " 1.000000 | \n",
398 | " 0.421214 | \n",
399 | " 0.000000 | \n",
400 | " 0.000000 | \n",
401 | " 0.000000 | \n",
402 | "
\n",
403 | " \n",
404 | " | 25% | \n",
405 | " 72.000000 | \n",
406 | " 1.000000 | \n",
407 | " 4.742303 | \n",
408 | " 1.000000 | \n",
409 | " 0.000000 | \n",
410 | " 0.000000 | \n",
411 | "
\n",
412 | " \n",
413 | " | 50% | \n",
414 | " 603.000000 | \n",
415 | " 1.000000 | \n",
416 | " 4.886641 | \n",
417 | " 2.000000 | \n",
418 | " 0.012500 | \n",
419 | " 0.000000 | \n",
420 | "
\n",
421 | " \n",
422 | " | 75% | \n",
423 | " 1804.000000 | \n",
424 | " 1.000000 | \n",
425 | " 4.962055 | \n",
426 | " 2.000000 | \n",
427 | " 87.510417 | \n",
428 | " 0.000000 | \n",
429 | "
\n",
430 | " \n",
431 | " | max | \n",
432 | " 2000.000000 | \n",
433 | " 29.000000 | \n",
434 | " 5.040929 | \n",
435 | " 2.000000 | \n",
436 | " 1999.580556 | \n",
437 | " 1.000000 | \n",
438 | "
\n",
439 | " \n",
440 | "
\n",
441 | "
"
442 | ],
443 | "text/plain": [
444 | " accountAgeDays numItems localTime paymentMethod \\\n",
445 | "count 39221.000000 39221.000000 39221.000000 39221.000000 \n",
446 | "mean 857.563984 1.084751 4.748232 1.476811 \n",
447 | "std 804.788212 0.566899 0.389360 0.850805 \n",
448 | "min 1.000000 1.000000 0.421214 0.000000 \n",
449 | "25% 72.000000 1.000000 4.742303 1.000000 \n",
450 | "50% 603.000000 1.000000 4.886641 2.000000 \n",
451 | "75% 1804.000000 1.000000 4.962055 2.000000 \n",
452 | "max 2000.000000 29.000000 5.040929 2.000000 \n",
453 | "\n",
454 | " paymentMethodAgeDays label \n",
455 | "count 39221.000000 39221.000000 \n",
456 | "mean 122.641326 0.014278 \n",
457 | "std 283.569177 0.118636 \n",
458 | "min 0.000000 0.000000 \n",
459 | "25% 0.000000 0.000000 \n",
460 | "50% 0.012500 0.000000 \n",
461 | "75% 87.510417 0.000000 \n",
462 | "max 1999.580556 1.000000 "
463 | ]
464 | },
465 | "execution_count": 43,
466 | "metadata": {},
467 | "output_type": "execute_result"
468 | }
469 | ],
470 | "source": [
471 | "df.describe()"
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": 44,
477 | "metadata": {},
478 | "outputs": [],
479 | "source": [
480 | "## independent and dependent features\n",
481 | "X = df.iloc[:, :-1].values\n",
482 | "y = df.iloc[:, -1].values"
483 | ]
484 | },
485 | {
486 | "cell_type": "code",
487 | "execution_count": 45,
488 | "metadata": {},
489 | "outputs": [],
490 | "source": [
491 | "## scaling \n",
492 | "\n",
493 | "sc = StandardScaler()\n",
494 | "X = sc.fit_transform(X)"
495 | ]
496 | },
497 | {
498 | "cell_type": "code",
499 | "execution_count": 46,
500 | "metadata": {},
501 | "outputs": [],
502 | "source": [
503 | "## train test split \n",
504 | "\n",
505 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)"
506 | ]
507 | },
508 | {
509 | "cell_type": "code",
510 | "execution_count": 48,
511 | "metadata": {},
512 | "outputs": [
513 | {
514 | "name": "stdout",
515 | "output_type": "stream",
516 | "text": [
517 | "X_train shape: (29415, 5)\n",
518 | "X_test shape: (9806, 5)\n",
519 | "y_train shape: (29415,)\n",
520 | "y_test shape: (9806,)\n"
521 | ]
522 | }
523 | ],
524 | "source": [
525 | "print(\"X_train shape: \", X_train.shape)\n",
526 | "print(\"X_test shape: \", X_test.shape)\n",
527 | "print(\"y_train shape: \", y_train.shape)\n",
528 | "print(\"y_test shape: \", y_test.shape)"
529 | ]
530 | },
531 | {
532 | "cell_type": "code",
533 | "execution_count": 49,
534 | "metadata": {},
535 | "outputs": [
536 | {
537 | "data": {
538 | "text/plain": [
539 | "LogisticRegression()"
540 | ]
541 | },
542 | "execution_count": 49,
543 | "metadata": {},
544 | "output_type": "execute_result"
545 | }
546 | ],
547 | "source": [
548 | "## logisticRegression Model\n",
549 | "lg = LogisticRegression()\n",
550 | "\n",
551 | "## training\n",
552 | "lg.fit(X_train, y_train)"
553 | ]
554 | },
555 | {
556 | "cell_type": "code",
557 | "execution_count": 50,
558 | "metadata": {},
559 | "outputs": [],
560 | "source": [
561 | "## prediction \n",
562 | "pred = lg.predict(X_test)"
563 | ]
564 | },
565 | {
566 | "cell_type": "code",
567 | "execution_count": null,
568 | "metadata": {},
569 | "outputs": [],
570 | "source": []
571 | },
572 | {
573 | "cell_type": "code",
574 | "execution_count": null,
575 | "metadata": {},
576 | "outputs": [],
577 | "source": []
578 | }
579 | ],
580 | "metadata": {
581 | "kernelspec": {
582 | "display_name": "Python 3",
583 | "language": "python",
584 | "name": "python3"
585 | },
586 | "language_info": {
587 | "codemirror_mode": {
588 | "name": "ipython",
589 | "version": 3
590 | },
591 | "file_extension": ".py",
592 | "mimetype": "text/x-python",
593 | "name": "python",
594 | "nbconvert_exporter": "python",
595 | "pygments_lexer": "ipython3",
596 | "version": "3.8.5"
597 | }
598 | },
599 | "nbformat": 4,
600 | "nbformat_minor": 4
601 | }
602 |
--------------------------------------------------------------------------------