├── AmesHousing.tsv
├── House_Sale_Price_Prediction.ipynb
└── README.md
/House_Sale_Price_Prediction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "collapsed": true
7 | },
8 | "source": [
9 | "# Predicting House Sale Prices"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "## Import Necessary Libraries"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 350,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "import pandas as pd\n",
26 | "import numpy as np\n",
27 | "import matplotlib.pyplot as plt\n",
28 | "from sklearn.model_selection import KFold\n",
29 | "from sklearn.metrics import mean_squared_error\n",
30 | "from sklearn import linear_model"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "## Housing Dataset"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "Here is the housing data for the city of Ames, Iowa, United States from 2006 to 2010. "
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 351,
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "data": {
54 | "text/html": [
55 | "
\n",
56 | "\n",
69 | "
\n",
70 | " \n",
71 | " \n",
72 | " | \n",
73 | " Order | \n",
74 | " PID | \n",
75 | " MS SubClass | \n",
76 | " MS Zoning | \n",
77 | " Lot Frontage | \n",
78 | " Lot Area | \n",
79 | " Street | \n",
80 | " Alley | \n",
81 | " Lot Shape | \n",
82 | " Land Contour | \n",
83 | " ... | \n",
84 | " Pool Area | \n",
85 | " Pool QC | \n",
86 | " Fence | \n",
87 | " Misc Feature | \n",
88 | " Misc Val | \n",
89 | " Mo Sold | \n",
90 | " Yr Sold | \n",
91 | " Sale Type | \n",
92 | " Sale Condition | \n",
93 | " SalePrice | \n",
94 | "
\n",
95 | " \n",
96 | " \n",
97 | " \n",
98 | " | 0 | \n",
99 | " 1 | \n",
100 | " 526301100 | \n",
101 | " 20 | \n",
102 | " RL | \n",
103 | " 141.0 | \n",
104 | " 31770 | \n",
105 | " Pave | \n",
106 | " NaN | \n",
107 | " IR1 | \n",
108 | " Lvl | \n",
109 | " ... | \n",
110 | " 0 | \n",
111 | " NaN | \n",
112 | " NaN | \n",
113 | " NaN | \n",
114 | " 0 | \n",
115 | " 5 | \n",
116 | " 2010 | \n",
117 | " WD | \n",
118 | " Normal | \n",
119 | " 215000 | \n",
120 | "
\n",
121 | " \n",
122 | " | 1 | \n",
123 | " 2 | \n",
124 | " 526350040 | \n",
125 | " 20 | \n",
126 | " RH | \n",
127 | " 80.0 | \n",
128 | " 11622 | \n",
129 | " Pave | \n",
130 | " NaN | \n",
131 | " Reg | \n",
132 | " Lvl | \n",
133 | " ... | \n",
134 | " 0 | \n",
135 | " NaN | \n",
136 | " MnPrv | \n",
137 | " NaN | \n",
138 | " 0 | \n",
139 | " 6 | \n",
140 | " 2010 | \n",
141 | " WD | \n",
142 | " Normal | \n",
143 | " 105000 | \n",
144 | "
\n",
145 | " \n",
146 | " | 2 | \n",
147 | " 3 | \n",
148 | " 526351010 | \n",
149 | " 20 | \n",
150 | " RL | \n",
151 | " 81.0 | \n",
152 | " 14267 | \n",
153 | " Pave | \n",
154 | " NaN | \n",
155 | " IR1 | \n",
156 | " Lvl | \n",
157 | " ... | \n",
158 | " 0 | \n",
159 | " NaN | \n",
160 | " NaN | \n",
161 | " Gar2 | \n",
162 | " 12500 | \n",
163 | " 6 | \n",
164 | " 2010 | \n",
165 | " WD | \n",
166 | " Normal | \n",
167 | " 172000 | \n",
168 | "
\n",
169 | " \n",
170 | " | 3 | \n",
171 | " 4 | \n",
172 | " 526353030 | \n",
173 | " 20 | \n",
174 | " RL | \n",
175 | " 93.0 | \n",
176 | " 11160 | \n",
177 | " Pave | \n",
178 | " NaN | \n",
179 | " Reg | \n",
180 | " Lvl | \n",
181 | " ... | \n",
182 | " 0 | \n",
183 | " NaN | \n",
184 | " NaN | \n",
185 | " NaN | \n",
186 | " 0 | \n",
187 | " 4 | \n",
188 | " 2010 | \n",
189 | " WD | \n",
190 | " Normal | \n",
191 | " 244000 | \n",
192 | "
\n",
193 | " \n",
194 | " | 4 | \n",
195 | " 5 | \n",
196 | " 527105010 | \n",
197 | " 60 | \n",
198 | " RL | \n",
199 | " 74.0 | \n",
200 | " 13830 | \n",
201 | " Pave | \n",
202 | " NaN | \n",
203 | " IR1 | \n",
204 | " Lvl | \n",
205 | " ... | \n",
206 | " 0 | \n",
207 | " NaN | \n",
208 | " MnPrv | \n",
209 | " NaN | \n",
210 | " 0 | \n",
211 | " 3 | \n",
212 | " 2010 | \n",
213 | " WD | \n",
214 | " Normal | \n",
215 | " 189900 | \n",
216 | "
\n",
217 | " \n",
218 | " | ... | \n",
219 | " ... | \n",
220 | " ... | \n",
221 | " ... | \n",
222 | " ... | \n",
223 | " ... | \n",
224 | " ... | \n",
225 | " ... | \n",
226 | " ... | \n",
227 | " ... | \n",
228 | " ... | \n",
229 | " ... | \n",
230 | " ... | \n",
231 | " ... | \n",
232 | " ... | \n",
233 | " ... | \n",
234 | " ... | \n",
235 | " ... | \n",
236 | " ... | \n",
237 | " ... | \n",
238 | " ... | \n",
239 | " ... | \n",
240 | "
\n",
241 | " \n",
242 | " | 2925 | \n",
243 | " 2926 | \n",
244 | " 923275080 | \n",
245 | " 80 | \n",
246 | " RL | \n",
247 | " 37.0 | \n",
248 | " 7937 | \n",
249 | " Pave | \n",
250 | " NaN | \n",
251 | " IR1 | \n",
252 | " Lvl | \n",
253 | " ... | \n",
254 | " 0 | \n",
255 | " NaN | \n",
256 | " GdPrv | \n",
257 | " NaN | \n",
258 | " 0 | \n",
259 | " 3 | \n",
260 | " 2006 | \n",
261 | " WD | \n",
262 | " Normal | \n",
263 | " 142500 | \n",
264 | "
\n",
265 | " \n",
266 | " | 2926 | \n",
267 | " 2927 | \n",
268 | " 923276100 | \n",
269 | " 20 | \n",
270 | " RL | \n",
271 | " NaN | \n",
272 | " 8885 | \n",
273 | " Pave | \n",
274 | " NaN | \n",
275 | " IR1 | \n",
276 | " Low | \n",
277 | " ... | \n",
278 | " 0 | \n",
279 | " NaN | \n",
280 | " MnPrv | \n",
281 | " NaN | \n",
282 | " 0 | \n",
283 | " 6 | \n",
284 | " 2006 | \n",
285 | " WD | \n",
286 | " Normal | \n",
287 | " 131000 | \n",
288 | "
\n",
289 | " \n",
290 | " | 2927 | \n",
291 | " 2928 | \n",
292 | " 923400125 | \n",
293 | " 85 | \n",
294 | " RL | \n",
295 | " 62.0 | \n",
296 | " 10441 | \n",
297 | " Pave | \n",
298 | " NaN | \n",
299 | " Reg | \n",
300 | " Lvl | \n",
301 | " ... | \n",
302 | " 0 | \n",
303 | " NaN | \n",
304 | " MnPrv | \n",
305 | " Shed | \n",
306 | " 700 | \n",
307 | " 7 | \n",
308 | " 2006 | \n",
309 | " WD | \n",
310 | " Normal | \n",
311 | " 132000 | \n",
312 | "
\n",
313 | " \n",
314 | " | 2928 | \n",
315 | " 2929 | \n",
316 | " 924100070 | \n",
317 | " 20 | \n",
318 | " RL | \n",
319 | " 77.0 | \n",
320 | " 10010 | \n",
321 | " Pave | \n",
322 | " NaN | \n",
323 | " Reg | \n",
324 | " Lvl | \n",
325 | " ... | \n",
326 | " 0 | \n",
327 | " NaN | \n",
328 | " NaN | \n",
329 | " NaN | \n",
330 | " 0 | \n",
331 | " 4 | \n",
332 | " 2006 | \n",
333 | " WD | \n",
334 | " Normal | \n",
335 | " 170000 | \n",
336 | "
\n",
337 | " \n",
338 | " | 2929 | \n",
339 | " 2930 | \n",
340 | " 924151050 | \n",
341 | " 60 | \n",
342 | " RL | \n",
343 | " 74.0 | \n",
344 | " 9627 | \n",
345 | " Pave | \n",
346 | " NaN | \n",
347 | " Reg | \n",
348 | " Lvl | \n",
349 | " ... | \n",
350 | " 0 | \n",
351 | " NaN | \n",
352 | " NaN | \n",
353 | " NaN | \n",
354 | " 0 | \n",
355 | " 11 | \n",
356 | " 2006 | \n",
357 | " WD | \n",
358 | " Normal | \n",
359 | " 188000 | \n",
360 | "
\n",
361 | " \n",
362 | "
\n",
363 | "
2930 rows × 82 columns
\n",
364 | "
"
365 | ],
366 | "text/plain": [
367 | " Order PID MS SubClass MS Zoning Lot Frontage Lot Area Street \\\n",
368 | "0 1 526301100 20 RL 141.0 31770 Pave \n",
369 | "1 2 526350040 20 RH 80.0 11622 Pave \n",
370 | "2 3 526351010 20 RL 81.0 14267 Pave \n",
371 | "3 4 526353030 20 RL 93.0 11160 Pave \n",
372 | "4 5 527105010 60 RL 74.0 13830 Pave \n",
373 | "... ... ... ... ... ... ... ... \n",
374 | "2925 2926 923275080 80 RL 37.0 7937 Pave \n",
375 | "2926 2927 923276100 20 RL NaN 8885 Pave \n",
376 | "2927 2928 923400125 85 RL 62.0 10441 Pave \n",
377 | "2928 2929 924100070 20 RL 77.0 10010 Pave \n",
378 | "2929 2930 924151050 60 RL 74.0 9627 Pave \n",
379 | "\n",
380 | " Alley Lot Shape Land Contour ... Pool Area Pool QC Fence Misc Feature \\\n",
381 | "0 NaN IR1 Lvl ... 0 NaN NaN NaN \n",
382 | "1 NaN Reg Lvl ... 0 NaN MnPrv NaN \n",
383 | "2 NaN IR1 Lvl ... 0 NaN NaN Gar2 \n",
384 | "3 NaN Reg Lvl ... 0 NaN NaN NaN \n",
385 | "4 NaN IR1 Lvl ... 0 NaN MnPrv NaN \n",
386 | "... ... ... ... ... ... ... ... ... \n",
387 | "2925 NaN IR1 Lvl ... 0 NaN GdPrv NaN \n",
388 | "2926 NaN IR1 Low ... 0 NaN MnPrv NaN \n",
389 | "2927 NaN Reg Lvl ... 0 NaN MnPrv Shed \n",
390 | "2928 NaN Reg Lvl ... 0 NaN NaN NaN \n",
391 | "2929 NaN Reg Lvl ... 0 NaN NaN NaN \n",
392 | "\n",
393 | " Misc Val Mo Sold Yr Sold Sale Type Sale Condition SalePrice \n",
394 | "0 0 5 2010 WD Normal 215000 \n",
395 | "1 0 6 2010 WD Normal 105000 \n",
396 | "2 12500 6 2010 WD Normal 172000 \n",
397 | "3 0 4 2010 WD Normal 244000 \n",
398 | "4 0 3 2010 WD Normal 189900 \n",
399 | "... ... ... ... ... ... ... \n",
400 | "2925 0 3 2006 WD Normal 142500 \n",
401 | "2926 0 6 2006 WD Normal 131000 \n",
402 | "2927 700 7 2006 WD Normal 132000 \n",
403 | "2928 0 4 2006 WD Normal 170000 \n",
404 | "2929 0 11 2006 WD Normal 188000 \n",
405 | "\n",
406 | "[2930 rows x 82 columns]"
407 | ]
408 | },
409 | "execution_count": 351,
410 | "metadata": {},
411 | "output_type": "execute_result"
412 | }
413 | ],
414 | "source": [
415 | "df = pd.read_csv(\"AmesHousing.tsv\", sep=\"\\t\")\n",
416 | "df"
417 | ]
418 | },
419 | {
420 | "cell_type": "markdown",
421 | "metadata": {},
422 | "source": [
423 | "## Feature Engineering, Data Exploration, and Data Cleaning"
424 | ]
425 | },
426 | {
427 | "cell_type": "markdown",
428 | "metadata": {},
429 | "source": [
430 | "Steps:\n",
431 | "1. See which of the features have missing values. For the features that has more than 5% of values missing, drop the features.\n",
432 | "2. For the text features, drop those that have missing values.\n",
433 | "3. For the numeric features that have up to 5% missing values, replace the missing values with the most occurring value (i.e mode).\n",
434 | "4. Remove the columns that leak about the final sale and columns useless for building models."
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 359,
440 | "metadata": {},
441 | "outputs": [],
442 | "source": [
443 | "def transform_features(df):\n",
444 | " \n",
445 | " feature_data = {\"Feature\": df.isnull().sum().keys().tolist(), \"Missing Values\": df.isnull().sum().values.tolist()}\n",
446 | " df_features = pd.DataFrame(feature_data)\n",
447 | " rows,cols = df.shape\n",
448 | " df_features['Percent Missing'] = 100*df_features['Missing Values']/rows\n",
449 | " features = df_features[df_features['Percent Missing'] > 5]['Feature'].tolist()\n",
450 | " df = df.drop(features, axis=1)\n",
451 | " \n",
452 | " text_feature_data = {\"Feature\": df.select_dtypes(include=['object']).isnull().sum().keys().tolist(), \"Missing Values\": df.select_dtypes(include=['object']).isnull().sum().values.tolist()}\n",
453 | " df_text_features = pd.DataFrame(text_feature_data)\n",
454 | " text_features = df_text_features[df_text_features['Missing Values'] > 0]['Feature'].tolist()\n",
455 | " df = df.drop(text_features, axis=1)\n",
456 | " \n",
457 | " numeric_dtypes_missing_values = df.select_dtypes(include=['int64','float64']).isnull().sum()\n",
458 | " numeric_features = numeric_dtypes_missing_values[numeric_dtypes_missing_values > 0]\n",
459 | " df = df.fillna(value=numeric_features)\n",
460 | " \n",
461 | " df['Years Since Remod'] = df['Yr Sold'] - df['Year Remod/Add']\n",
462 | " df['Years Before Sale'] = df['Yr Sold'] - df['Year Built']\n",
463 | " df = df.drop(['Year Built', 'Year Remod/Add'], axis=1)\n",
464 | " df = df.drop([1702,2180,2181], axis=0)\n",
465 | " df = df.drop([\"Order\",\"PID\",\"Mo Sold\", \"Sale Condition\", \"Sale Type\", \"Yr Sold\"], axis=1)\n",
466 | " \n",
467 | " return df"
468 | ]
469 | },
470 | {
471 | "cell_type": "markdown",
472 | "metadata": {},
473 | "source": [
474 | "## Feature Selection"
475 | ]
476 | },
477 | {
478 | "cell_type": "markdown",
479 | "metadata": {},
480 | "source": [
481 | "Steps:\n",
482 | "1. Generate a correlation heatmap matrix of the numerical features in the training data set with respect to SalePrice.\n",
483 | "2. Determine which numerical features should be converted to categorical features as their numbers don't have any semantic meaning. Aim for the features with few unique values with a significant majority (95%) of the values belonging to a specific category."
484 | ]
485 | },
486 | {
487 | "cell_type": "code",
488 | "execution_count": 360,
489 | "metadata": {},
490 | "outputs": [],
491 | "source": [
492 | "def select_features(df, coeff_threshold=0.4, uniq_threshold=10):\n",
493 | " numerical_df = transform_df.select_dtypes(include=['int64', 'float64'])\n",
494 | " abs_corr_coeffs = numerical_df.corr()['SalePrice'].abs().sort_values()\n",
495 | " df = df.drop(abs_corr_coeffs[abs_corr_coeffs < coeff_threshold].index, axis=1)\n",
496 | " \n",
497 | " nominal_features = [\"PID\", \"MS SubClass\", \"MS Zoning\", \"Street\", \"Alley\", \"Land Contour\", \"Lot Config\", \"Neighborhood\", \n",
498 | " \"Condition 1\", \"Condition 2\", \"Bldg Type\", \"House Style\", \"Roof Style\", \"Roof Matl\", \"Exterior 1st\", \n",
499 | " \"Exterior 2nd\", \"Mas Vnr Type\", \"Foundation\", \"Heating\", \"Central Air\", \"Garage Type\", \n",
500 | " \"Misc Feature\", \"Sale Type\", \"Sale Condition\"]\n",
501 | " \n",
502 | " transform_cat_cols = []\n",
503 | " for col in nominal_features:\n",
504 | " if col in df.columns:\n",
505 | " transform_cat_cols.append(col)\n",
506 | "\n",
507 | " uniqueness_counts = df[transform_cat_cols].apply(lambda col: len(col.value_counts())).sort_values()\n",
508 | " drop_nonuniq_cols = uniqueness_counts[uniqueness_counts > 10].index\n",
509 | " df = df.drop(drop_nonuniq_cols, axis=1)\n",
510 | " \n",
511 | " text_cols = df.select_dtypes(include=['object'])\n",
512 | " for col in text_cols:\n",
513 | " df[col] = df[col].astype('category')\n",
514 | " df = pd.concat([df, pd.get_dummies(df.select_dtypes(include=['category']))], axis=1).drop(text_cols,axis=1)\n",
515 | " \n",
516 | " return df"
517 | ]
518 | },
519 | {
520 | "cell_type": "markdown",
521 | "metadata": {},
522 | "source": [
523 | "## Train and Test the Model"
524 | ]
525 | },
526 | {
527 | "cell_type": "markdown",
528 | "metadata": {},
529 | "source": [
530 | "The function train_test can perform validation (k=0) or K-Fold cross validation (k > 0). "
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": 361,
536 | "metadata": {},
537 | "outputs": [],
538 | "source": [
539 | "def train_test(df, k=0):\n",
540 | " if k < 0:\n",
541 | " raise Exception(\"Value for k is negative!\")\n",
542 | " \n",
543 | " numeric_df = df.select_dtypes(include=['integer', 'float'])\n",
544 | " features = numeric_df.columns.drop(\"SalePrice\")\n",
545 | " lr = linear_model.LinearRegression()\n",
546 | " \n",
547 | " if k==0:\n",
548 | " row,col = df.shape\n",
549 | " train_test_cutoff = row*0.7\n",
550 | " train = numeric_df.iloc[:int(train_test_cutoff)]\n",
551 | " test = numeric_df.iloc[int(train_test_cutoff):]\n",
552 | " lr.fit(train[features], train[\"SalePrice\"])\n",
553 | " predictions = lr.predict(test[features])\n",
554 | " mse = mean_squared_error(test[\"SalePrice\"], predictions)\n",
555 | " \n",
556 | " return np.sqrt(mse)\n",
557 | " \n",
558 | " else:\n",
559 | " kf = KFold(n_splits=k, shuffle=True)\n",
560 | " rmse_values = []\n",
561 | " for train_index, test_index, in kf.split(df):\n",
562 | " train = df.iloc[train_index]\n",
563 | " test = df.iloc[test_index]\n",
564 | " lr.fit(train[features], train[\"SalePrice\"])\n",
565 | " predictions = lr.predict(test[features])\n",
566 | " mse = mean_squared_error(test[\"SalePrice\"], predictions)\n",
567 | " rmse = np.sqrt(mse)\n",
568 | " rmse_values.append(rmse)\n",
569 | " print(rmse_values)\n",
570 | " avg_rmse = np.mean(rmse_values)\n",
571 | " return avg_rmse\n",
572 | " "
573 | ]
574 | },
575 | {
576 | "cell_type": "markdown",
577 | "metadata": {},
578 | "source": [
579 | "## Complete Automated ML Pipeline"
580 | ]
581 | },
582 | {
583 | "cell_type": "code",
584 | "execution_count": 362,
585 | "metadata": {},
586 | "outputs": [
587 | {
588 | "name": "stdout",
589 | "output_type": "stream",
590 | "text": [
591 | "[24344.031226139265, 29103.803299408657, 35693.48786672571, 26965.662593750727]\n"
592 | ]
593 | },
594 | {
595 | "data": {
596 | "text/plain": [
597 | "29026.74624650609"
598 | ]
599 | },
600 | "execution_count": 362,
601 | "metadata": {},
602 | "output_type": "execute_result"
603 | }
604 | ],
605 | "source": [
606 | "transform_df = transform_features(df)\n",
607 | "filtered_df = select_features(transform_df)\n",
608 | "rmse = train_test(filtered_df, k=4)\n",
609 | "rmse"
610 | ]
611 | }
619 | ],
620 | "metadata": {
621 | "anaconda-cloud": {},
622 | "kernelspec": {
623 | "display_name": "Python 3",
624 | "language": "python",
625 | "name": "python3"
626 | },
627 | "language_info": {
628 | "codemirror_mode": {
629 | "name": "ipython",
630 | "version": 3
631 | },
632 | "file_extension": ".py",
633 | "mimetype": "text/x-python",
634 | "name": "python",
635 | "nbconvert_exporter": "python",
636 | "pygments_lexer": "ipython3",
637 | "version": "3.6.8"
638 | }
639 | },
640 | "nbformat": 4,
641 | "nbformat_minor": 2
642 | }
643 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Project: House Sale Price Prediction
2 |
3 | ## Project Goal
4 | Investigate the Ames housing dataset and build an ML pipeline that automates feature engineering, feature selection, and model training and testing for predicting house sale prices.
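
The pipeline chains three functions defined in the notebook; the sketch below mirrors the notebook's final cell:

```python
import pandas as pd

df = pd.read_csv("AmesHousing.tsv", sep="\t")

transform_df = transform_features(df)        # feature engineering and cleaning
filtered_df = select_features(transform_df)  # correlation- and cardinality-based selection
rmse = train_test(filtered_df, k=4)          # average RMSE over 4-fold cross-validation
```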
5 |
6 | ## Dataset Information
7 | AmesHousing.tsv describes the sale of individual residential properties in Ames, Iowa, from 2006 to 2010. The dataset contains 2,930 observations and 80 explanatory variables (23 nominal, 23 ordinal, 14 discrete, and 20 continuous) involved in assessing home values.
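
A quick sanity check of the shape (the 80 explanatory variables plus the Order and PID identifiers give 82 columns):

```python
import pandas as pd

df = pd.read_csv("AmesHousing.tsv", sep="\t")
print(df.shape)  # (2930, 82)
```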
8 |
9 | ## Tech Stack
10 | - Python
11 | - Jupyter Notebook
12 | - NumPy
13 | - Pandas
14 | - Matplotlib
15 | - Scikit-learn
16 |
17 | ## Featured ML Techniques
18 | - K-Fold Cross-Validation
19 | - Linear Regression
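
As a minimal, self-contained sketch of how the two combine (`kfold_rmse` is an illustrative helper, not code from this repo, though the notebook's `train_test` function follows the same pattern):

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

def kfold_rmse(X, y, k=4):
    """Average RMSE of a linear regression across k folds."""
    kf = KFold(n_splits=k, shuffle=True)
    rmses = []
    for train_idx, test_idx in kf.split(X):
        lr = LinearRegression()
        lr.fit(X[train_idx], y[train_idx])   # fit on k-1 folds
        preds = lr.predict(X[test_idx])      # score on the held-out fold
        rmses.append(np.sqrt(mean_squared_error(y[test_idx], preds)))
    return np.mean(rmses)
```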
--------------------------------------------------------------------------------