├── README.md
└── Pune house price prediction.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # Machine-Learning-Project- Pune House Price Prediction
2 | This repository contains a house price prediction project for Pune. Datasets are provided in each of the folders above, and solutions to the problem statements are also provided.
3 |
4 | Please ⭐ the repository if it helped you in any way.
5 |
--------------------------------------------------------------------------------
/Pune house price prediction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 244,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# Importing essential libraries\n",
10 | "import numpy as np\n",
11 | "import pandas as pd\n",
12 | "from matplotlib import pyplot as plt\n",
13 | "from matplotlib import rcParams as rcP\n",
14 | "%matplotlib inline"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 245,
20 | "metadata": {},
21 | "outputs": [
22 | {
23 | "data": {
24 | "text/html": [
25 | "
\n",
26 | "\n",
39 | "
\n",
40 | " \n",
41 | " \n",
42 | " | \n",
43 | " area_type | \n",
44 | " availability | \n",
45 | " size | \n",
46 | " society | \n",
47 | " total_sqft | \n",
48 | " bath | \n",
49 | " balcony | \n",
50 | " price | \n",
51 | " site_location | \n",
52 | "
\n",
53 | " \n",
54 | " \n",
55 | " \n",
56 | " | 0 | \n",
57 | " Super built-up Area | \n",
58 | " 19-Dec | \n",
59 | " 2 BHK | \n",
60 | " Coomee | \n",
61 | " 1056 | \n",
62 | " 2.0 | \n",
63 | " 1.0 | \n",
64 | " 39.07 | \n",
65 | " Alandi Road | \n",
66 | "
\n",
67 | " \n",
68 | " | 1 | \n",
69 | " Plot Area | \n",
70 | " Ready To Move | \n",
71 | " 4 Bedroom | \n",
72 | " Theanmp | \n",
73 | " 2600 | \n",
74 | " 5.0 | \n",
75 | " 3.0 | \n",
76 | " 120.00 | \n",
77 | " Ambegaon Budruk | \n",
78 | "
\n",
79 | " \n",
80 | " | 2 | \n",
81 | " Built-up Area | \n",
82 | " Ready To Move | \n",
83 | " 3 BHK | \n",
84 | " NaN | \n",
85 | " 1440 | \n",
86 | " 2.0 | \n",
87 | " 3.0 | \n",
88 | " 62.00 | \n",
89 | " Anandnagar | \n",
90 | "
\n",
91 | " \n",
92 | " | 3 | \n",
93 | " Super built-up Area | \n",
94 | " Ready To Move | \n",
95 | " 3 BHK | \n",
96 | " Soiewre | \n",
97 | " 1521 | \n",
98 | " 3.0 | \n",
99 | " 1.0 | \n",
100 | " 95.00 | \n",
101 | " Aundh | \n",
102 | "
\n",
103 | " \n",
104 | " | 4 | \n",
105 | " Super built-up Area | \n",
106 | " Ready To Move | \n",
107 | " 2 BHK | \n",
108 | " NaN | \n",
109 | " 1200 | \n",
110 | " 2.0 | \n",
111 | " 1.0 | \n",
112 | " 51.00 | \n",
113 | " Aundh Road | \n",
114 | "
\n",
115 | " \n",
116 | "
\n",
117 | "
"
118 | ],
119 | "text/plain": [
120 | " area_type availability size society total_sqft bath \\\n",
121 | "0 Super built-up Area 19-Dec 2 BHK Coomee 1056 2.0 \n",
122 | "1 Plot Area Ready To Move 4 Bedroom Theanmp 2600 5.0 \n",
123 | "2 Built-up Area Ready To Move 3 BHK NaN 1440 2.0 \n",
124 | "3 Super built-up Area Ready To Move 3 BHK Soiewre 1521 3.0 \n",
125 | "4 Super built-up Area Ready To Move 2 BHK NaN 1200 2.0 \n",
126 | "\n",
127 | " balcony price site_location \n",
128 | "0 1.0 39.07 Alandi Road \n",
129 | "1 3.0 120.00 Ambegaon Budruk \n",
130 | "2 3.0 62.00 Anandnagar \n",
131 | "3 1.0 95.00 Aundh \n",
132 | "4 1.0 51.00 Aundh Road "
133 | ]
134 | },
135 | "execution_count": 245,
136 | "metadata": {},
137 | "output_type": "execute_result"
138 | }
139 | ],
140 | "source": [
141 | "# Loading the dataset\n",
142 | "df = pd.read_csv('pune_House_Data.csv')\n",
143 | "df.head()"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 246,
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "data": {
153 | "text/plain": [
154 | "(13320, 9)"
155 | ]
156 | },
157 | "execution_count": 246,
158 | "metadata": {},
159 | "output_type": "execute_result"
160 | }
161 | ],
162 | "source": [
163 | "# Exploring the dataset\n",
164 | "df.shape"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 247,
170 | "metadata": {
171 | "scrolled": true
172 | },
173 | "outputs": [
174 | {
175 | "data": {
176 | "text/plain": [
177 | "area_type\n",
178 | "Built-up Area 2418\n",
179 | "Carpet Area 87\n",
180 | "Plot Area 2025\n",
181 | "Super built-up Area 8790\n",
182 | "Name: area_type, dtype: int64"
183 | ]
184 | },
185 | "execution_count": 247,
186 | "metadata": {},
187 | "output_type": "execute_result"
188 | }
189 | ],
190 | "source": [
191 | "# Exploring the dataset\n",
192 | "df.groupby('area_type')['area_type'].agg('count')"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 248,
198 | "metadata": {
199 | "scrolled": true
200 | },
201 | "outputs": [
202 | {
203 | "data": {
204 | "text/plain": [
205 | "availability\n",
206 | "14-Jul 1\n",
207 | "14-Nov 1\n",
208 | "15-Aug 1\n",
209 | "15-Dec 1\n",
210 | "15-Jun 1\n",
211 | " ... \n",
212 | "22-Mar 3\n",
213 | "22-May 10\n",
214 | "22-Nov 2\n",
215 | "Immediate Possession 16\n",
216 | "Ready To Move 10581\n",
217 | "Name: availability, Length: 81, dtype: int64"
218 | ]
219 | },
220 | "execution_count": 248,
221 | "metadata": {},
222 | "output_type": "execute_result"
223 | }
224 | ],
225 | "source": [
226 | "# Exploring the dataset\n",
227 | "df.groupby('availability')['availability'].agg('count')"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 249,
233 | "metadata": {
234 | "scrolled": true
235 | },
236 | "outputs": [
237 | {
238 | "data": {
239 | "text/plain": [
240 | "size\n",
241 | "1 BHK 538\n",
242 | "1 Bedroom 105\n",
243 | "1 RK 13\n",
244 | "10 BHK 2\n",
245 | "10 Bedroom 12\n",
246 | "11 BHK 2\n",
247 | "11 Bedroom 2\n",
248 | "12 Bedroom 1\n",
249 | "13 BHK 1\n",
250 | "14 BHK 1\n",
251 | "16 BHK 1\n",
252 | "18 Bedroom 1\n",
253 | "19 BHK 1\n",
254 | "2 BHK 5199\n",
255 | "2 Bedroom 329\n",
256 | "27 BHK 1\n",
257 | "3 BHK 4310\n",
258 | "3 Bedroom 547\n",
259 | "4 BHK 591\n",
260 | "4 Bedroom 826\n",
261 | "43 Bedroom 1\n",
262 | "5 BHK 59\n",
263 | "5 Bedroom 297\n",
264 | "6 BHK 30\n",
265 | "6 Bedroom 191\n",
266 | "7 BHK 17\n",
267 | "7 Bedroom 83\n",
268 | "8 BHK 5\n",
269 | "8 Bedroom 84\n",
270 | "9 BHK 8\n",
271 | "9 Bedroom 46\n",
272 | "Name: size, dtype: int64"
273 | ]
274 | },
275 | "execution_count": 249,
276 | "metadata": {},
277 | "output_type": "execute_result"
278 | }
279 | ],
280 | "source": [
281 | "# Exploring the dataset\n",
282 | "df.groupby('size')['size'].agg('count')"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": 250,
288 | "metadata": {},
289 | "outputs": [
290 | {
291 | "data": {
292 | "text/plain": [
293 | "site_location\n",
294 | "Alandi Road 139\n",
295 | "Ambegaon Budruk 139\n",
296 | "Anandnagar 139\n",
297 | "Aundh 139\n",
298 | "Aundh Road 139\n",
299 | " ... \n",
300 | "Wakadewadi 138\n",
301 | "Wanowrie 138\n",
302 | "Warje 138\n",
303 | "Yerawada 138\n",
304 | "other 1\n",
305 | "Name: site_location, Length: 97, dtype: int64"
306 | ]
307 | },
308 | "execution_count": 250,
309 | "metadata": {},
310 | "output_type": "execute_result"
311 | }
312 | ],
313 | "source": [
314 | "# Exploring the dataset\n",
315 | "df.groupby('site_location')['site_location'].agg('count')"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 251,
321 | "metadata": {},
322 | "outputs": [
323 | {
324 | "data": {
325 | "text/html": [
326 | "\n",
327 | "\n",
340 | "
\n",
341 | " \n",
342 | " \n",
343 | " | \n",
344 | " area_type | \n",
345 | " availability | \n",
346 | " size | \n",
347 | " total_sqft | \n",
348 | " bath | \n",
349 | " balcony | \n",
350 | " price | \n",
351 | " site_location | \n",
352 | "
\n",
353 | " \n",
354 | " \n",
355 | " \n",
356 | " | 0 | \n",
357 | " Super built-up Area | \n",
358 | " 19-Dec | \n",
359 | " 2 BHK | \n",
360 | " 1056 | \n",
361 | " 2.0 | \n",
362 | " 1.0 | \n",
363 | " 39.07 | \n",
364 | " Alandi Road | \n",
365 | "
\n",
366 | " \n",
367 | " | 1 | \n",
368 | " Plot Area | \n",
369 | " Ready To Move | \n",
370 | " 4 Bedroom | \n",
371 | " 2600 | \n",
372 | " 5.0 | \n",
373 | " 3.0 | \n",
374 | " 120.00 | \n",
375 | " Ambegaon Budruk | \n",
376 | "
\n",
377 | " \n",
378 | " | 2 | \n",
379 | " Built-up Area | \n",
380 | " Ready To Move | \n",
381 | " 3 BHK | \n",
382 | " 1440 | \n",
383 | " 2.0 | \n",
384 | " 3.0 | \n",
385 | " 62.00 | \n",
386 | " Anandnagar | \n",
387 | "
\n",
388 | " \n",
389 | " | 3 | \n",
390 | " Super built-up Area | \n",
391 | " Ready To Move | \n",
392 | " 3 BHK | \n",
393 | " 1521 | \n",
394 | " 3.0 | \n",
395 | " 1.0 | \n",
396 | " 95.00 | \n",
397 | " Aundh | \n",
398 | "
\n",
399 | " \n",
400 | " | 4 | \n",
401 | " Super built-up Area | \n",
402 | " Ready To Move | \n",
403 | " 2 BHK | \n",
404 | " 1200 | \n",
405 | " 2.0 | \n",
406 | " 1.0 | \n",
407 | " 51.00 | \n",
408 | " Aundh Road | \n",
409 | "
\n",
410 | " \n",
411 | "
\n",
412 | "
"
413 | ],
414 | "text/plain": [
415 | " area_type availability size total_sqft bath balcony \\\n",
416 | "0 Super built-up Area 19-Dec 2 BHK 1056 2.0 1.0 \n",
417 | "1 Plot Area Ready To Move 4 Bedroom 2600 5.0 3.0 \n",
418 | "2 Built-up Area Ready To Move 3 BHK 1440 2.0 3.0 \n",
419 | "3 Super built-up Area Ready To Move 3 BHK 1521 3.0 1.0 \n",
420 | "4 Super built-up Area Ready To Move 2 BHK 1200 2.0 1.0 \n",
421 | "\n",
422 | " price site_location \n",
423 | "0 39.07 Alandi Road \n",
424 | "1 120.00 Ambegaon Budruk \n",
425 | "2 62.00 Anandnagar \n",
426 | "3 95.00 Aundh \n",
427 | "4 51.00 Aundh Road "
428 | ]
429 | },
430 | "execution_count": 251,
431 | "metadata": {},
432 | "output_type": "execute_result"
433 | }
434 | ],
435 | "source": [
436 | "# Removing the 'society' column\n",
437 | "df = df.drop('society', axis='columns')\n",
438 | "df.head()"
439 | ]
440 | },
441 | {
442 | "cell_type": "markdown",
443 | "metadata": {},
444 | "source": [
445 | "## Data Cleaning Process"
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": 252,
451 | "metadata": {},
452 | "outputs": [
453 | {
454 | "data": {
455 | "text/plain": [
456 | "area_type 0\n",
457 | "availability 0\n",
458 | "size 16\n",
459 | "total_sqft 0\n",
460 | "bath 73\n",
461 | "balcony 609\n",
462 | "price 0\n",
463 | "site_location 1\n",
464 | "dtype: int64"
465 | ]
466 | },
467 | "execution_count": 252,
468 | "metadata": {},
469 | "output_type": "execute_result"
470 | }
471 | ],
472 | "source": [
473 | "# Data Cleaning\n",
474 | "# Checking the null values in the dataset\n",
475 | "df.isnull().sum()"
476 | ]
477 | },
478 | {
479 | "cell_type": "code",
480 | "execution_count": 253,
481 | "metadata": {},
482 | "outputs": [],
483 | "source": [
484 | "# Filling the missing balcony and bath values with each column's median (rounded down)\n",
485 | "from math import floor\n",
486 | "\n",
487 | "balcony_median = float(floor(df.balcony.median()))\n",
488 | "bath_median = float(floor(df.bath.median()))\n",
489 | "\n",
490 | "df.balcony = df.balcony.fillna(balcony_median)\n",
491 | "df.bath = df.bath.fillna(bath_median)"
492 | ]
493 | },
494 | {
495 | "cell_type": "code",
496 | "execution_count": 254,
497 | "metadata": {},
498 | "outputs": [
499 | {
500 | "data": {
501 | "text/plain": [
502 | "area_type 0\n",
503 | "availability 0\n",
504 | "size 16\n",
505 | "total_sqft 0\n",
506 | "bath 0\n",
507 | "balcony 0\n",
508 | "price 0\n",
509 | "site_location 1\n",
510 | "dtype: int64"
511 | ]
512 | },
513 | "execution_count": 254,
514 | "metadata": {},
515 | "output_type": "execute_result"
516 | }
517 | ],
518 | "source": [
519 | "# Checking the null values in the dataset again\n",
520 | "df.isnull().sum()"
521 | ]
522 | },
523 | {
524 | "cell_type": "code",
525 | "execution_count": 255,
526 | "metadata": {},
527 | "outputs": [
528 | {
529 | "data": {
530 | "text/plain": [
531 | "area_type 0\n",
532 | "availability 0\n",
533 | "size 0\n",
534 | "total_sqft 0\n",
535 | "bath 0\n",
536 | "balcony 0\n",
537 | "price 0\n",
538 | "site_location 0\n",
539 | "dtype: int64"
540 | ]
541 | },
542 | "execution_count": 255,
543 | "metadata": {},
544 | "output_type": "execute_result"
545 | }
546 | ],
547 | "source": [
548 | "# Dropping the rows with null values because they are very few compared to the size of the dataset.\n",
549 | "df = df.dropna()\n",
550 | "df.isnull().sum()"
551 | ]
552 | },
553 | {
554 | "cell_type": "code",
555 | "execution_count": 256,
556 | "metadata": {
557 | "scrolled": true
558 | },
559 | "outputs": [
560 | {
561 | "data": {
562 | "text/plain": [
563 | "bhk\n",
564 | "1 656\n",
565 | "2 5527\n",
566 | "3 4857\n",
567 | "4 1417\n",
568 | "5 356\n",
569 | "6 221\n",
570 | "7 100\n",
571 | "8 89\n",
572 | "9 54\n",
573 | "10 14\n",
574 | "11 4\n",
575 | "12 1\n",
576 | "13 1\n",
577 | "14 1\n",
578 | "16 1\n",
579 | "18 1\n",
580 | "19 1\n",
581 | "27 1\n",
582 | "43 1\n",
583 | "Name: bhk, dtype: int64"
584 | ]
585 | },
586 | "execution_count": 256,
587 | "metadata": {},
588 | "output_type": "execute_result"
589 | }
590 | ],
591 | "source": [
592 | "# Converting the size column to bhk\n",
593 | "df['bhk'] = df['size'].apply(lambda x: int(x.split(' ')[0]))\n",
594 | "df = df.drop('size', axis='columns')\n",
595 | "df.groupby('bhk')['bhk'].agg('count')"
596 | ]
597 | },
598 | {
599 | "cell_type": "code",
600 | "execution_count": 257,
601 | "metadata": {
602 | "scrolled": true
603 | },
604 | "outputs": [
605 | {
606 | "data": {
607 | "text/plain": [
608 | "array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],\n",
609 | " dtype=object)"
610 | ]
611 | },
612 | "execution_count": 257,
613 | "metadata": {},
614 | "output_type": "execute_result"
615 | }
616 | ],
617 | "source": [
618 | "# Exploring the total_sqft column\n",
619 | "df.total_sqft.unique()"
620 | ]
621 | },
622 | {
623 | "cell_type": "code",
624 | "execution_count": 258,
625 | "metadata": {},
626 | "outputs": [],
627 | "source": [
628 | "# total_sqft mixes plain numbers with ranges such as '1133 - 1384'; isFloat(x) returns True only when float(x) succeeds\n",
629 | "def isFloat(x):\n",
630 | "    try:\n",
631 | "        float(x)\n",
632 | "    except (TypeError, ValueError):  # only conversion failures; never swallow KeyboardInterrupt/SystemExit\n",
633 | "        return False\n",
634 | "    return True"
635 | ]
636 | },
637 | {
638 | "cell_type": "code",
639 | "execution_count": 259,
640 | "metadata": {},
641 | "outputs": [
642 | {
643 | "data": {
644 | "text/html": [
645 | "\n",
646 | "\n",
659 | "
\n",
660 | " \n",
661 | " \n",
662 | " | \n",
663 | " area_type | \n",
664 | " availability | \n",
665 | " total_sqft | \n",
666 | " bath | \n",
667 | " balcony | \n",
668 | " price | \n",
669 | " site_location | \n",
670 | " bhk | \n",
671 | "
\n",
672 | " \n",
673 | " \n",
674 | " \n",
675 | " | 30 | \n",
676 | " Super built-up Area | \n",
677 | " 19-Dec | \n",
678 | " 2100 - 2850 | \n",
679 | " 4.0 | \n",
680 | " 0.0 | \n",
681 | " 186.000 | \n",
682 | " Gultekdi | \n",
683 | " 4 | \n",
684 | "
\n",
685 | " \n",
686 | " | 56 | \n",
687 | " Built-up Area | \n",
688 | " 20-Feb | \n",
689 | " 3010 - 3410 | \n",
690 | " 2.0 | \n",
691 | " 2.0 | \n",
692 | " 192.000 | \n",
693 | " Model colony | \n",
694 | " 4 | \n",
695 | "
\n",
696 | " \n",
697 | " | 81 | \n",
698 | " Built-up Area | \n",
699 | " 18-Oct | \n",
700 | " 2957 - 3450 | \n",
701 | " 2.0 | \n",
702 | " 2.0 | \n",
703 | " 224.500 | \n",
704 | " Shukrawar Peth | \n",
705 | " 4 | \n",
706 | "
\n",
707 | " \n",
708 | " | 122 | \n",
709 | " Super built-up Area | \n",
710 | " 18-Mar | \n",
711 | " 3067 - 8156 | \n",
712 | " 4.0 | \n",
713 | " 0.0 | \n",
714 | " 477.000 | \n",
715 | " Ganeshkhind | \n",
716 | " 4 | \n",
717 | "
\n",
718 | " \n",
719 | " | 137 | \n",
720 | " Super built-up Area | \n",
721 | " 19-Mar | \n",
722 | " 1042 - 1105 | \n",
723 | " 2.0 | \n",
724 | " 0.0 | \n",
725 | " 54.005 | \n",
726 | " Khadaki | \n",
727 | " 2 | \n",
728 | "
\n",
729 | " \n",
730 | " | ... | \n",
731 | " ... | \n",
732 | " ... | \n",
733 | " ... | \n",
734 | " ... | \n",
735 | " ... | \n",
736 | " ... | \n",
737 | " ... | \n",
738 | " ... | \n",
739 | "
\n",
740 | " \n",
741 | " | 12990 | \n",
742 | " Super built-up Area | \n",
743 | " 18-May | \n",
744 | " 1804 - 2273 | \n",
745 | " 3.0 | \n",
746 | " 0.0 | \n",
747 | " 122.000 | \n",
748 | " Gokhale Nagar | \n",
749 | " 3 | \n",
750 | "
\n",
751 | " \n",
752 | " | 13059 | \n",
753 | " Super built-up Area | \n",
754 | " Ready To Move | \n",
755 | " 1200 - 1470 | \n",
756 | " 2.0 | \n",
757 | " 0.0 | \n",
758 | " 72.760 | \n",
759 | " Anandnagar | \n",
760 | " 2 | \n",
761 | "
\n",
762 | " \n",
763 | " | 13240 | \n",
764 | " Super built-up Area | \n",
765 | " Ready To Move | \n",
766 | " 1020 - 1130 | \n",
767 | " 2.0 | \n",
768 | " 2.0 | \n",
769 | " 52.570 | \n",
770 | " Vadgaon Budruk | \n",
771 | " 1 | \n",
772 | "
\n",
773 | " \n",
774 | " | 13265 | \n",
775 | " Super built-up Area | \n",
776 | " 20-Sep | \n",
777 | " 1133 - 1384 | \n",
778 | " 2.0 | \n",
779 | " 0.0 | \n",
780 | " 59.135 | \n",
781 | " Dapodi | \n",
782 | " 2 | \n",
783 | "
\n",
784 | " \n",
785 | " | 13299 | \n",
786 | " Super built-up Area | \n",
787 | " 18-Dec | \n",
788 | " 2830 - 2882 | \n",
789 | " 5.0 | \n",
790 | " 0.0 | \n",
791 | " 154.500 | \n",
792 | " Laxmi Road | \n",
793 | " 4 | \n",
794 | "
\n",
795 | " \n",
796 | "
\n",
797 | "
239 rows × 8 columns
\n",
798 | "
"
799 | ],
800 | "text/plain": [
801 | " area_type availability total_sqft bath balcony \\\n",
802 | "30 Super built-up Area 19-Dec 2100 - 2850 4.0 0.0 \n",
803 | "56 Built-up Area 20-Feb 3010 - 3410 2.0 2.0 \n",
804 | "81 Built-up Area 18-Oct 2957 - 3450 2.0 2.0 \n",
805 | "122 Super built-up Area 18-Mar 3067 - 8156 4.0 0.0 \n",
806 | "137 Super built-up Area 19-Mar 1042 - 1105 2.0 0.0 \n",
807 | "... ... ... ... ... ... \n",
808 | "12990 Super built-up Area 18-May 1804 - 2273 3.0 0.0 \n",
809 | "13059 Super built-up Area Ready To Move 1200 - 1470 2.0 0.0 \n",
810 | "13240 Super built-up Area Ready To Move 1020 - 1130 2.0 2.0 \n",
811 | "13265 Super built-up Area 20-Sep 1133 - 1384 2.0 0.0 \n",
812 | "13299 Super built-up Area 18-Dec 2830 - 2882 5.0 0.0 \n",
813 | "\n",
814 | " price site_location bhk \n",
815 | "30 186.000 Gultekdi 4 \n",
816 | "56 192.000 Model colony 4 \n",
817 | "81 224.500 Shukrawar Peth 4 \n",
818 | "122 477.000 Ganeshkhind 4 \n",
819 | "137 54.005 Khadaki 2 \n",
820 | "... ... ... ... \n",
821 | "12990 122.000 Gokhale Nagar 3 \n",
822 | "13059 72.760 Anandnagar 2 \n",
823 | "13240 52.570 Vadgaon Budruk 1 \n",
824 | "13265 59.135 Dapodi 2 \n",
825 | "13299 154.500 Laxmi Road 4 \n",
826 | "\n",
827 | "[239 rows x 8 columns]"
828 | ]
829 | },
830 | "execution_count": 259,
831 | "metadata": {},
832 | "output_type": "execute_result"
833 | }
834 | ],
835 | "source": [
836 | "# Displaying all the rows whose total_sqft cannot be converted to a number\n",
837 | "df[~df['total_sqft'].apply(isFloat)]"
838 | ]
839 | },
840 | {
841 | "cell_type": "code",
842 | "execution_count": 260,
843 | "metadata": {},
844 | "outputs": [],
845 | "source": [
846 | "# Converts a total_sqft string to a float: a range 'a - b' becomes the midpoint of a and b, and any value that cannot be parsed becomes None\n",
847 | "def convert_sqft_to_num(x):\n",
848 | "    tokens = x.split('-')\n",
849 | "    try:  # covers the range endpoints too, so one malformed range cannot abort the whole .apply()\n",
850 | "        if len(tokens) == 2:\n",
851 | "            return (float(tokens[0]) + float(tokens[1])) / 2\n",
852 | "        return float(x)\n",
853 | "    except ValueError:\n",
854 | "        return None"
855 | ]
856 | },
857 | {
858 | "cell_type": "code",
859 | "execution_count": 261,
860 | "metadata": {
861 | "scrolled": false
862 | },
863 | "outputs": [
864 | {
865 | "data": {
866 | "text/html": [
867 | "\n",
868 | "\n",
881 | "
\n",
882 | " \n",
883 | " \n",
884 | " | \n",
885 | " area_type | \n",
886 | " availability | \n",
887 | " bath | \n",
888 | " balcony | \n",
889 | " price | \n",
890 | " site_location | \n",
891 | " bhk | \n",
892 | " new_total_sqft | \n",
893 | "
\n",
894 | " \n",
895 | " \n",
896 | " \n",
897 | " | 0 | \n",
898 | " Super built-up Area | \n",
899 | " 19-Dec | \n",
900 | " 2.0 | \n",
901 | " 1.0 | \n",
902 | " 39.07 | \n",
903 | " Alandi Road | \n",
904 | " 2 | \n",
905 | " 1056.0 | \n",
906 | "
\n",
907 | " \n",
908 | " | 1 | \n",
909 | " Plot Area | \n",
910 | " Ready To Move | \n",
911 | " 5.0 | \n",
912 | " 3.0 | \n",
913 | " 120.00 | \n",
914 | " Ambegaon Budruk | \n",
915 | " 4 | \n",
916 | " 2600.0 | \n",
917 | "
\n",
918 | " \n",
919 | " | 2 | \n",
920 | " Built-up Area | \n",
921 | " Ready To Move | \n",
922 | " 2.0 | \n",
923 | " 3.0 | \n",
924 | " 62.00 | \n",
925 | " Anandnagar | \n",
926 | " 3 | \n",
927 | " 1440.0 | \n",
928 | "
\n",
929 | " \n",
930 | " | 3 | \n",
931 | " Super built-up Area | \n",
932 | " Ready To Move | \n",
933 | " 3.0 | \n",
934 | " 1.0 | \n",
935 | " 95.00 | \n",
936 | " Aundh | \n",
937 | " 3 | \n",
938 | " 1521.0 | \n",
939 | "
\n",
940 | " \n",
941 | " | 4 | \n",
942 | " Super built-up Area | \n",
943 | " Ready To Move | \n",
944 | " 2.0 | \n",
945 | " 1.0 | \n",
946 | " 51.00 | \n",
947 | " Aundh Road | \n",
948 | " 2 | \n",
949 | " 1200.0 | \n",
950 | "
\n",
951 | " \n",
952 | "
\n",
953 | "
"
954 | ],
955 | "text/plain": [
956 | " area_type availability bath balcony price \\\n",
957 | "0 Super built-up Area 19-Dec 2.0 1.0 39.07 \n",
958 | "1 Plot Area Ready To Move 5.0 3.0 120.00 \n",
959 | "2 Built-up Area Ready To Move 2.0 3.0 62.00 \n",
960 | "3 Super built-up Area Ready To Move 3.0 1.0 95.00 \n",
961 | "4 Super built-up Area Ready To Move 2.0 1.0 51.00 \n",
962 | "\n",
963 | " site_location bhk new_total_sqft \n",
964 | "0 Alandi Road 2 1056.0 \n",
965 | "1 Ambegaon Budruk 4 2600.0 \n",
966 | "2 Anandnagar 3 1440.0 \n",
967 | "3 Aundh 3 1521.0 \n",
968 | "4 Aundh Road 2 1200.0 "
969 | ]
970 | },
971 | "execution_count": 261,
972 | "metadata": {},
973 | "output_type": "execute_result"
974 | }
975 | ],
976 | "source": [
977 | "df['new_total_sqft'] = df.total_sqft.apply(convert_sqft_to_num)\n",
978 | "df = df.drop('total_sqft', axis='columns')\n",
979 | "df.head()"
980 | ]
981 | },
982 | {
983 | "cell_type": "code",
984 | "execution_count": 262,
985 | "metadata": {},
986 | "outputs": [
987 | {
988 | "data": {
989 | "text/plain": [
990 | "area_type 0\n",
991 | "availability 0\n",
992 | "bath 0\n",
993 | "balcony 0\n",
994 | "price 0\n",
995 | "site_location 0\n",
996 | "bhk 0\n",
997 | "new_total_sqft 46\n",
998 | "dtype: int64"
999 | ]
1000 | },
1001 | "execution_count": 262,
1002 | "metadata": {},
1003 | "output_type": "execute_result"
1004 | }
1005 | ],
1006 | "source": [
1007 | "# Counting the missing values per column (new_total_sqft is None where the conversion failed)\n",
1008 | "df.isna().sum()"
1009 | ]
1010 | },
1011 | {
1012 | "cell_type": "code",
1013 | "execution_count": 263,
1014 | "metadata": {},
1015 | "outputs": [
1016 | {
1017 | "data": {
1018 | "text/plain": [
1019 | "area_type 0\n",
1020 | "availability 0\n",
1021 | "bath 0\n",
1022 | "balcony 0\n",
1023 | "price 0\n",
1024 | "site_location 0\n",
1025 | "bhk 0\n",
1026 | "new_total_sqft 0\n",
1027 | "dtype: int64"
1028 | ]
1029 | },
1030 | "execution_count": 263,
1031 | "metadata": {},
1032 | "output_type": "execute_result"
1033 | }
1034 | ],
1035 | "source": [
1036 | "# Removing the rows that have a None value in the new_total_sqft column\n",
1037 | "df = df.dropna()\n",
1038 | "df.isna().sum()"
1039 | ]
1040 | },
1041 | {
1042 | "cell_type": "markdown",
1043 | "metadata": {},
1044 | "source": [
1045 | "## Feature Engineering"
1046 | ]
1047 | },
1048 | {
1049 | "cell_type": "code",
1050 | "execution_count": 264,
1051 | "metadata": {
1052 | "scrolled": true
1053 | },
1054 | "outputs": [
1055 | {
1056 | "data": {
1057 | "text/html": [
1058 | "\n",
1059 | "\n",
1072 | "
\n",
1073 | " \n",
1074 | " \n",
1075 | " | \n",
1076 | " area_type | \n",
1077 | " availability | \n",
1078 | " bath | \n",
1079 | " balcony | \n",
1080 | " price | \n",
1081 | " site_location | \n",
1082 | " bhk | \n",
1083 | " new_total_sqft | \n",
1084 | " price_per_sqft | \n",
1085 | "
\n",
1086 | " \n",
1087 | " \n",
1088 | " \n",
1089 | " | 0 | \n",
1090 | " Super built-up Area | \n",
1091 | " 19-Dec | \n",
1092 | " 2.0 | \n",
1093 | " 1.0 | \n",
1094 | " 39.07 | \n",
1095 | " Alandi Road | \n",
1096 | " 2 | \n",
1097 | " 1056.0 | \n",
1098 | " 3699.810606 | \n",
1099 | "
\n",
1100 | " \n",
1101 | " | 1 | \n",
1102 | " Plot Area | \n",
1103 | " Ready To Move | \n",
1104 | " 5.0 | \n",
1105 | " 3.0 | \n",
1106 | " 120.00 | \n",
1107 | " Ambegaon Budruk | \n",
1108 | " 4 | \n",
1109 | " 2600.0 | \n",
1110 | " 4615.384615 | \n",
1111 | "
\n",
1112 | " \n",
1113 | " | 2 | \n",
1114 | " Built-up Area | \n",
1115 | " Ready To Move | \n",
1116 | " 2.0 | \n",
1117 | " 3.0 | \n",
1118 | " 62.00 | \n",
1119 | " Anandnagar | \n",
1120 | " 3 | \n",
1121 | " 1440.0 | \n",
1122 | " 4305.555556 | \n",
1123 | "
\n",
1124 | " \n",
1125 | " | 3 | \n",
1126 | " Super built-up Area | \n",
1127 | " Ready To Move | \n",
1128 | " 3.0 | \n",
1129 | " 1.0 | \n",
1130 | " 95.00 | \n",
1131 | " Aundh | \n",
1132 | " 3 | \n",
1133 | " 1521.0 | \n",
1134 | " 6245.890861 | \n",
1135 | "
\n",
1136 | " \n",
1137 | " | 4 | \n",
1138 | " Super built-up Area | \n",
1139 | " Ready To Move | \n",
1140 | " 2.0 | \n",
1141 | " 1.0 | \n",
1142 | " 51.00 | \n",
1143 | " Aundh Road | \n",
1144 | " 2 | \n",
1145 | " 1200.0 | \n",
1146 | " 4250.000000 | \n",
1147 | "
\n",
1148 | " \n",
1149 | "
\n",
1150 | "
"
1151 | ],
1152 | "text/plain": [
1153 | " area_type availability bath balcony price \\\n",
1154 | "0 Super built-up Area 19-Dec 2.0 1.0 39.07 \n",
1155 | "1 Plot Area Ready To Move 5.0 3.0 120.00 \n",
1156 | "2 Built-up Area Ready To Move 2.0 3.0 62.00 \n",
1157 | "3 Super built-up Area Ready To Move 3.0 1.0 95.00 \n",
1158 | "4 Super built-up Area Ready To Move 2.0 1.0 51.00 \n",
1159 | "\n",
1160 | " site_location bhk new_total_sqft price_per_sqft \n",
1161 | "0 Alandi Road 2 1056.0 3699.810606 \n",
1162 | "1 Ambegaon Budruk 4 2600.0 4615.384615 \n",
1163 | "2 Anandnagar 3 1440.0 4305.555556 \n",
1164 | "3 Aundh 3 1521.0 6245.890861 \n",
1165 | "4 Aundh Road 2 1200.0 4250.000000 "
1166 | ]
1167 | },
1168 | "execution_count": 264,
1169 | "metadata": {},
1170 | "output_type": "execute_result"
1171 | }
1172 | ],
1173 | "source": [
1174 | "# Adding a new column of price_per_sqft\n",
1175 | "df1 = df.copy()\n",
1176 | "\n",
1177 | "# In our dataset the price column is in Lakhs\n",
1178 | "df1['price_per_sqft'] = (df1['price']*100000)/df1['new_total_sqft']\n",
1179 | "df1.head()"
1180 | ]
1181 | },
1182 | {
1183 | "cell_type": "code",
1184 | "execution_count": 265,
1185 | "metadata": {
1186 | "scrolled": true
1187 | },
1188 | "outputs": [
1189 | {
1190 | "name": "stdout",
1191 | "output_type": "stream",
1192 | "text": [
1193 | "97\n"
1194 | ]
1195 | }
1196 | ],
1197 | "source": [
1198 | "# Counting the unique values of the 'site_location' column\n",
1199 | "locations = list(df['site_location'].unique())\n",
1200 | "print(len(locations))"
1201 | ]
1202 | },
1203 | {
1204 | "cell_type": "code",
1205 | "execution_count": 266,
1206 | "metadata": {
1207 | "scrolled": false
1208 | },
1209 | "outputs": [
1210 | {
1211 | "data": {
1212 | "text/plain": [
1213 | "site_location\n",
1214 | "Pune Railway Station 139\n",
1215 | "Paud Road 139\n",
1216 | "Ganesh Peth 139\n",
1217 | "Mangalwar peth 139\n",
1218 | "Manik Bagh 139\n",
1219 | " ... \n",
1220 | "Nagar Road 136\n",
1221 | "Narayangaon 136\n",
1222 | "Fatima Nagar 136\n",
1223 | "Camp 136\n",
1224 | "other 1\n",
1225 | "Name: site_location, Length: 97, dtype: int64"
1226 | ]
1227 | },
1228 | "execution_count": 266,
1229 | "metadata": {},
1230 | "output_type": "execute_result"
1231 | }
1232 | ],
1233 | "source": [
1234 | "# Removing leading and trailing whitespace from the location names\n",
1235 | "df1.site_location = df1.site_location.apply(lambda x: x.strip())\n",
1236 | "\n",
1237 | "# Counting the rows for each unique value in the 'site_location' column\n",
1238 | "location_stats = df1.groupby('site_location')['site_location'].agg('count').sort_values(ascending=False)\n",
1239 | "location_stats"
1240 | ]
1241 | },
1242 | {
1243 | "cell_type": "code",
1244 | "execution_count": 267,
1245 | "metadata": {
1246 | "scrolled": false
1247 | },
1248 | "outputs": [
1249 | {
1250 | "name": "stdout",
1251 | "output_type": "stream",
1252 | "text": [
1253 | "1 97\n"
1254 | ]
1255 | }
1256 | ],
1257 | "source": [
1258 | "# Counting the locations with 10 or fewer rows, next to the total number of unique locations\n",
1259 | "print(len(location_stats[location_stats<=10]), len(df1.site_location.unique()))"
1260 | ]
1261 | },
1262 | {
1263 | "cell_type": "code",
1264 | "execution_count": 268,
1265 | "metadata": {},
1266 | "outputs": [
1267 | {
1268 | "data": {
1269 | "text/html": [
1270 | "\n",
1271 | "\n",
1284 | "
\n",
1285 | " \n",
1286 | " \n",
1287 | " | \n",
1288 | " area_type | \n",
1289 | " availability | \n",
1290 | " bath | \n",
1291 | " balcony | \n",
1292 | " price | \n",
1293 | " site_location | \n",
1294 | " bhk | \n",
1295 | " new_total_sqft | \n",
1296 | " price_per_sqft | \n",
1297 | "
\n",
1298 | " \n",
1299 | " \n",
1300 | " \n",
1301 | " | 0 | \n",
1302 | " Super built-up Area | \n",
1303 | " 19-Dec | \n",
1304 | " 2.0 | \n",
1305 | " 1.0 | \n",
1306 | " 39.07 | \n",
1307 | " Alandi Road | \n",
1308 | " 2 | \n",
1309 | " 1056.0 | \n",
1310 | " 3699.810606 | \n",
1311 | "
\n",
1312 | " \n",
1313 | " | 1 | \n",
1314 | " Plot Area | \n",
1315 | " Ready To Move | \n",
1316 | " 5.0 | \n",
1317 | " 3.0 | \n",
1318 | " 120.00 | \n",
1319 | " Ambegaon Budruk | \n",
1320 | " 4 | \n",
1321 | " 2600.0 | \n",
1322 | " 4615.384615 | \n",
1323 | "
\n",
1324 | " \n",
1325 | " | 2 | \n",
1326 | " Built-up Area | \n",
1327 | " Ready To Move | \n",
1328 | " 2.0 | \n",
1329 | " 3.0 | \n",
1330 | " 62.00 | \n",
1331 | " Anandnagar | \n",
1332 | " 3 | \n",
1333 | " 1440.0 | \n",
1334 | " 4305.555556 | \n",
1335 | "
\n",
1336 | " \n",
1337 | " | 3 | \n",
1338 | " Super built-up Area | \n",
1339 | " Ready To Move | \n",
1340 | " 3.0 | \n",
1341 | " 1.0 | \n",
1342 | " 95.00 | \n",
1343 | " Aundh | \n",
1344 | " 3 | \n",
1345 | " 1521.0 | \n",
1346 | " 6245.890861 | \n",
1347 | "
\n",
1348 | " \n",
1349 | " | 4 | \n",
1350 | " Super built-up Area | \n",
1351 | " Ready To Move | \n",
1352 | " 2.0 | \n",
1353 | " 1.0 | \n",
1354 | " 51.00 | \n",
1355 | " Aundh Road | \n",
1356 | " 2 | \n",
1357 | " 1200.0 | \n",
1358 | " 4250.000000 | \n",
1359 | "
\n",
1360 | " \n",
1361 | "
\n",
1362 | "
"
1363 | ],
1364 | "text/plain": [
1365 | " area_type availability bath balcony price \\\n",
1366 | "0 Super built-up Area 19-Dec 2.0 1.0 39.07 \n",
1367 | "1 Plot Area Ready To Move 5.0 3.0 120.00 \n",
1368 | "2 Built-up Area Ready To Move 2.0 3.0 62.00 \n",
1369 | "3 Super built-up Area Ready To Move 3.0 1.0 95.00 \n",
1370 | "4 Super built-up Area Ready To Move 2.0 1.0 51.00 \n",
1371 | "\n",
1372 | " site_location bhk new_total_sqft price_per_sqft \n",
1373 | "0 Alandi Road 2 1056.0 3699.810606 \n",
1374 | "1 Ambegaon Budruk 4 2600.0 4615.384615 \n",
1375 | "2 Anandnagar 3 1440.0 4305.555556 \n",
1376 | "3 Aundh 3 1521.0 6245.890861 \n",
1377 | "4 Aundh Road 2 1200.0 4250.000000 "
1378 | ]
1379 | },
1380 | "execution_count": 268,
1381 | "metadata": {},
1382 | "output_type": "execute_result"
1383 | }
1384 | ],
1385 | "source": [
1386 | "df1.head()"
1387 | ]
1388 | },
1389 | {
1390 | "cell_type": "code",
1391 | "execution_count": 269,
1392 | "metadata": {
1393 | "scrolled": true
1394 | },
1395 | "outputs": [
1396 | {
1397 | "data": {
1398 | "text/plain": [
1399 | "97"
1400 | ]
1401 | },
1402 | "execution_count": 269,
1403 | "metadata": {},
1404 | "output_type": "execute_result"
1405 | }
1406 | ],
1407 | "source": [
1408 | "# Relabelling every location that occurs 10 times or fewer as 'other' (location names are the index of location_stats)\n",
1409 | "locations_less_than_10 = location_stats[location_stats<=10]\n",
1410 | "\n",
1411 | "df1.site_location = df1.site_location.apply(lambda x: 'other' if x in locations_less_than_10.index else x)\n",
1412 | "len(df1.site_location.unique())"
1413 | ]
1414 | },
1415 | {
1416 | "cell_type": "code",
1417 | "execution_count": 270,
1418 | "metadata": {},
1419 | "outputs": [
1420 | {
1421 | "data": {
1422 | "text/plain": [
1423 | "availability\n",
1424 | "Ready To Move 10541\n",
1425 | "18-Dec 306\n",
1426 | "18-May 294\n",
1427 | "18-Apr 271\n",
1428 | "18-Aug 199\n",
1429 | " ... \n",
1430 | "15-Jun 1\n",
1431 | "15-Dec 1\n",
1432 | "15-Aug 1\n",
1433 | "14-Nov 1\n",
1434 | "14-Jul 1\n",
1435 | "Name: availability, Length: 80, dtype: int64"
1436 | ]
1437 | },
1438 | "execution_count": 270,
1439 | "metadata": {},
1440 | "output_type": "execute_result"
1441 | }
1442 | ],
1443 | "source": [
1444 | "# Checking the unique values in the 'availability' column\n",
1445 | "df1.groupby('availability')['availability'].agg('count').sort_values(ascending=False)"
1446 | ]
1447 | },
1448 | {
1449 | "cell_type": "code",
1450 | "execution_count": 271,
1451 | "metadata": {},
1452 | "outputs": [
1453 | {
1454 | "data": {
1455 | "text/plain": [
1456 | "2"
1457 | ]
1458 | },
1459 | "execution_count": 271,
1460 | "metadata": {},
1461 | "output_type": "execute_result"
1462 | }
1463 | ],
1464 | "source": [
1465 | "# Labelling every availability value that occurs fewer than 10000 times (i.e. the dates) as 'Not Ready'\n",
1466 | "dates = df1.groupby('availability')['availability'].agg('count').sort_values(ascending=False)\n",
1467 | "\n",
1468 | "dates_not_ready = dates[dates<10000]\n",
1469 | "df1.availability = df1.availability.apply(lambda x: 'Not Ready' if x in dates_not_ready else x)\n",
1470 | "\n",
1471 | "len(df1.availability.unique())"
1472 | ]
1473 | },
1474 | {
1475 | "cell_type": "code",
1476 | "execution_count": 272,
1477 | "metadata": {},
1478 | "outputs": [
1479 | {
1480 | "data": {
1481 | "text/html": [
1482 | "\n",
1483 | "\n",
1496 | "
\n",
1497 | " \n",
1498 | " \n",
1499 | " | \n",
1500 | " area_type | \n",
1501 | " availability | \n",
1502 | " bath | \n",
1503 | " balcony | \n",
1504 | " price | \n",
1505 | " site_location | \n",
1506 | " bhk | \n",
1507 | " new_total_sqft | \n",
1508 | " price_per_sqft | \n",
1509 | "
\n",
1510 | " \n",
1511 | " \n",
1512 | " \n",
1513 | " | 0 | \n",
1514 | " Super built-up Area | \n",
1515 | " Not Ready | \n",
1516 | " 2.0 | \n",
1517 | " 1.0 | \n",
1518 | " 39.07 | \n",
1519 | " Alandi Road | \n",
1520 | " 2 | \n",
1521 | " 1056.0 | \n",
1522 | " 3699.810606 | \n",
1523 | "
\n",
1524 | " \n",
1525 | " | 1 | \n",
1526 | " Plot Area | \n",
1527 | " Ready To Move | \n",
1528 | " 5.0 | \n",
1529 | " 3.0 | \n",
1530 | " 120.00 | \n",
1531 | " Ambegaon Budruk | \n",
1532 | " 4 | \n",
1533 | " 2600.0 | \n",
1534 | " 4615.384615 | \n",
1535 | "
\n",
1536 | " \n",
1537 | " | 2 | \n",
1538 | " Built-up Area | \n",
1539 | " Ready To Move | \n",
1540 | " 2.0 | \n",
1541 | " 3.0 | \n",
1542 | " 62.00 | \n",
1543 | " Anandnagar | \n",
1544 | " 3 | \n",
1545 | " 1440.0 | \n",
1546 | " 4305.555556 | \n",
1547 | "
\n",
1548 | " \n",
1549 | " | 3 | \n",
1550 | " Super built-up Area | \n",
1551 | " Ready To Move | \n",
1552 | " 3.0 | \n",
1553 | " 1.0 | \n",
1554 | " 95.00 | \n",
1555 | " Aundh | \n",
1556 | " 3 | \n",
1557 | " 1521.0 | \n",
1558 | " 6245.890861 | \n",
1559 | "
\n",
1560 | " \n",
1561 | " | 4 | \n",
1562 | " Super built-up Area | \n",
1563 | " Ready To Move | \n",
1564 | " 2.0 | \n",
1565 | " 1.0 | \n",
1566 | " 51.00 | \n",
1567 | " Aundh Road | \n",
1568 | " 2 | \n",
1569 | " 1200.0 | \n",
1570 | " 4250.000000 | \n",
1571 | "
\n",
1572 | " \n",
1573 | "
\n",
1574 | "
"
1575 | ],
1576 | "text/plain": [
1577 | " area_type availability bath balcony price \\\n",
1578 | "0 Super built-up Area Not Ready 2.0 1.0 39.07 \n",
1579 | "1 Plot Area Ready To Move 5.0 3.0 120.00 \n",
1580 | "2 Built-up Area Ready To Move 2.0 3.0 62.00 \n",
1581 | "3 Super built-up Area Ready To Move 3.0 1.0 95.00 \n",
1582 | "4 Super built-up Area Ready To Move 2.0 1.0 51.00 \n",
1583 | "\n",
1584 | " site_location bhk new_total_sqft price_per_sqft \n",
1585 | "0 Alandi Road 2 1056.0 3699.810606 \n",
1586 | "1 Ambegaon Budruk 4 2600.0 4615.384615 \n",
1587 | "2 Anandnagar 3 1440.0 4305.555556 \n",
1588 | "3 Aundh 3 1521.0 6245.890861 \n",
1589 | "4 Aundh Road 2 1200.0 4250.000000 "
1590 | ]
1591 | },
1592 | "execution_count": 272,
1593 | "metadata": {},
1594 | "output_type": "execute_result"
1595 | }
1596 | ],
1597 | "source": [
1598 | "df1.head()"
1599 | ]
1600 | },
1601 | {
1602 | "cell_type": "code",
1603 | "execution_count": 273,
1604 | "metadata": {},
1605 | "outputs": [
1606 | {
1607 | "data": {
1608 | "text/plain": [
1609 | "area_type\n",
1610 | "Super built-up Area 8778\n",
1611 | "Built-up Area 2402\n",
1612 | "Plot Area 1991\n",
1613 | "Carpet Area 86\n",
1614 | "Name: area_type, dtype: int64"
1615 | ]
1616 | },
1617 | "execution_count": 273,
1618 | "metadata": {},
1619 | "output_type": "execute_result"
1620 | }
1621 | ],
1622 | "source": [
1623 | "# Checking the unique values in 'area_type' column\n",
1624 | "df1.groupby('area_type')['area_type'].agg('count').sort_values(ascending=False)\n",
1625 | "\n",
1626 | "# Since the column has only a few unique values, we don't perform any operation"
1627 | ]
1628 | },
1629 | {
1630 | "cell_type": "markdown",
1631 | "metadata": {},
1632 | "source": [
1633 | "## Removing Outliers"
1634 | ]
1635 | },
1636 | {
1637 | "cell_type": "code",
1638 | "execution_count": 274,
1639 | "metadata": {},
1640 | "outputs": [
1641 | {
1642 | "name": "stdout",
1643 | "output_type": "stream",
1644 | "text": [
1645 | "12513 13257\n"
1646 | ]
1647 | }
1648 | ],
1649 | "source": [
1650 | "# Removing the rows that have less than 300 sqft per room (new_total_sqft / bhk)\n",
1651 | "\n",
1652 | "df2 = df1[~(df1.new_total_sqft/df1.bhk<300)]\n",
1653 | "print(len(df2), len(df1))"
1654 | ]
1655 | },
1656 | {
1657 | "cell_type": "code",
1658 | "execution_count": 275,
1659 | "metadata": {},
1660 | "outputs": [
1661 | {
1662 | "data": {
1663 | "text/plain": [
1664 | "count 12513.000000\n",
1665 | "mean 6307.567166\n",
1666 | "std 4160.879784\n",
1667 | "min 267.829813\n",
1668 | "25% 4211.469534\n",
1669 | "50% 5295.138889\n",
1670 | "75% 6916.666667\n",
1671 | "max 176470.588235\n",
1672 | "Name: price_per_sqft, dtype: float64"
1673 | ]
1674 | },
1675 | "execution_count": 275,
1676 | "metadata": {},
1677 | "output_type": "execute_result"
1678 | }
1679 | ],
1680 | "source": [
1681 | "df2.price_per_sqft.describe()"
1682 | ]
1683 | },
1684 | {
1685 | "cell_type": "code",
1686 | "execution_count": 276,
1687 | "metadata": {},
1688 | "outputs": [
1689 | {
1690 | "name": "stdout",
1691 | "output_type": "stream",
1692 | "text": [
1693 | "12513 10937\n"
1694 | ]
1695 | }
1696 | ],
1697 | "source": [
1698 | "# Since the 'price_per_sqft' column has a wide range (min = Rs. 267/sqft, max = Rs. 176470/sqft), we remove the extreme ends per location, keeping only the rows within one standard deviation of that location's mean\n",
1699 | "def remove_pps_outliers(df):\n",
1700 | " \n",
1701 | " df_out = pd.DataFrame()\n",
1702 | " \n",
1703 | " for key, sub_df in df.groupby('site_location'):\n",
1704 | " m = np.mean(sub_df.price_per_sqft)\n",
1705 | " sd = np.std(sub_df.price_per_sqft)\n",
1706 | " reduce_df = sub_df[(sub_df.price_per_sqft>(m-sd)) & (sub_df.price_per_sqft<(m+sd))]\n",
1707 | " df_out = pd.concat([df_out, reduce_df], ignore_index=True)\n",
1708 | " \n",
1709 | " return df_out\n",
1710 | "\n",
1711 | "df3 = remove_pps_outliers(df2)\n",
1712 | "print(len(df2), len(df3))"
1713 | ]
1714 | },
1715 | {
1716 | "cell_type": "code",
1717 | "execution_count": 277,
1718 | "metadata": {},
1719 | "outputs": [
1720 | {
1721 | "data": {
1722 | "image/png": "\n",
1723 | "text/plain": [
1724 | ""
1725 | ]
1726 | },
1727 | "metadata": {
1728 | "needs_background": "light"
1729 | },
1730 | "output_type": "display_data"
1731 | }
1732 | ],
1733 | "source": [
1734 | "def plot_scatter_chart(df, site_location):\n",
1735 | " bhk2 = df[(df.site_location == site_location) & (df.bhk == 2)]\n",
1736 | " bhk3 = df[(df.site_location == site_location) & (df.bhk == 3)]\n",
1737 | " rcP['figure.figsize'] = (15,10)\n",
1738 | " plt.scatter(bhk2.new_total_sqft, bhk2.price, color='blue', label='2 BHK', s=50)\n",
1739 | " plt.scatter(bhk3.new_total_sqft, bhk3.price, color='green', marker='+', label='3 BHK', s=50)\n",
1740 | " plt.xlabel('Total Square Feet Area')\n",
1741 | " plt.ylabel('Price (in Lakhs)')\n",
1742 | " plt.title(site_location)\n",
1743 | " plt.legend()\n",
1744 | " \n",
1745 | "plot_scatter_chart(df3, 'Hadapsar')"
1746 | ]
1747 | },
1748 | {
1749 | "cell_type": "code",
1750 | "execution_count": 278,
1751 | "metadata": {},
1752 | "outputs": [
1753 | {
1754 | "name": "stdout",
1755 | "output_type": "stream",
1756 | "text": [
1757 | "10937 7459\n"
1758 | ]
1759 | }
1760 | ],
1761 | "source": [
1762 | "# Here we observe that some 3 BHK properties cost the same as 2 BHK properties in the 'Hadapsar' location, hence removing such outliers is necessary\n",
1763 | "def remove_bhk_outliers(df):\n",
1764 | " exclude_indices = np.array([])\n",
1765 | " \n",
1766 | " for site_location, site_location_df in df.groupby('site_location'):\n",
1767 | " bhk_stats = {}\n",
1768 | " \n",
1769 | " for bhk, bhk_df in site_location_df.groupby('bhk'):\n",
1770 | " bhk_stats[bhk] = {\n",
1771 | " 'mean': np.mean(bhk_df.price_per_sqft),\n",
1772 | " 'std': np.std(bhk_df.price_per_sqft),\n",
1773 | " 'count': bhk_df.shape[0]\n",
1774 | " }\n",
1775 | " \n",
1776 | " for bhk, bhk_df in site_location_df.groupby('bhk'):\n",
1777 | " stats = bhk_stats.get(bhk-1)\n",
1778 | " if stats and stats['count']>5:\n",
1779 | " exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)\n",
1780 | " \n",
1781 | " return df.drop(exclude_indices, axis='index')\n",
1782 | "\n",
1783 | "df4 = remove_bhk_outliers(df3)\n",
1784 | "print(len(df3), len(df4))"
1785 | ]
1786 | },
1787 | {
1788 | "cell_type": "code",
1789 | "execution_count": 279,
1790 | "metadata": {},
1791 | "outputs": [
1792 | {
1793 | "data": {
1794 | "image/png": "\n",
1795 | "text/plain": [
1796 | ""
1797 | ]
1798 | },
1799 | "metadata": {
1800 | "needs_background": "light"
1801 | },
1802 | "output_type": "display_data"
1803 | }
1804 | ],
1805 | "source": [
1806 | "plot_scatter_chart(df4, 'Hadapsar')"
1807 | ]
1808 | },
1809 | {
1810 | "cell_type": "code",
1811 | "execution_count": 280,
1812 | "metadata": {},
1813 | "outputs": [
1814 | {
1815 | "data": {
1816 | "text/plain": [
1817 | "Text(0, 0.5, 'Count')"
1818 | ]
1819 | },
1820 | "execution_count": 280,
1821 | "metadata": {},
1822 | "output_type": "execute_result"
1823 | },
1824 | {
1825 | "data": {
1826 | "image/png": "\n",
1827 | "text/plain": [
1828 | ""
1829 | ]
1830 | },
1831 | "metadata": {
1832 | "needs_background": "light"
1833 | },
1834 | "output_type": "display_data"
1835 | }
1836 | ],
1837 | "source": [
1838 | "plt.hist(df4.price_per_sqft, rwidth=0.5)\n",
1839 | "plt.xlabel('Price Per Square Feet')\n",
1840 | "plt.ylabel('Count')"
1841 | ]
1842 | },
1843 | {
1844 | "cell_type": "code",
1845 | "execution_count": 281,
1846 | "metadata": {},
1847 | "outputs": [
1848 | {
1849 | "data": {
1850 | "text/plain": [
1851 | "Text(0, 0.5, 'Count')"
1852 | ]
1853 | },
1854 | "execution_count": 281,
1855 | "metadata": {},
1856 | "output_type": "execute_result"
1857 | },
1858 | {
1859 | "data": {
1860 | "image/png": "\n",
1861 | "text/plain": [
1862 | ""
1863 | ]
1864 | },
1865 | "metadata": {
1866 | "needs_background": "light"
1867 | },
1868 | "output_type": "display_data"
1869 | }
1870 | ],
1871 | "source": [
1872 | "plt.hist(df4.bath, rwidth=0.5)\n",
1873 | "plt.xlabel('Number of Bathrooms')\n",
1874 | "plt.ylabel('Count')"
1875 | ]
1876 | },
1877 | {
1878 | "cell_type": "code",
1879 | "execution_count": 282,
1880 | "metadata": {
1881 | "scrolled": true
1882 | },
1883 | "outputs": [
1884 | {
1885 | "name": "stdout",
1886 | "output_type": "stream",
1887 | "text": [
1888 | "7459 7395\n"
1889 | ]
1890 | }
1891 | ],
1892 | "source": [
1893 | "# Removing the rows that have 'bath' greater than or equal to 'bhk'+2\n",
1894 | "df5 = df4[df4.bath<(df4.bhk+2)]\n",
1895 | "print(len(df4), len(df5))"
1896 | ]
1897 | },
1898 | {
1899 | "cell_type": "code",
1900 | "execution_count": 283,
1901 | "metadata": {},
1902 | "outputs": [
1903 | {
1904 | "data": {
1905 | "text/html": [
1906 | "\n",
1907 | "\n",
1920 | "
\n",
1921 | " \n",
1922 | " \n",
1923 | " | \n",
1924 | " area_type | \n",
1925 | " availability | \n",
1926 | " bath | \n",
1927 | " balcony | \n",
1928 | " price | \n",
1929 | " site_location | \n",
1930 | " bhk | \n",
1931 | " new_total_sqft | \n",
1932 | " price_per_sqft | \n",
1933 | "
\n",
1934 | " \n",
1935 | " \n",
1936 | " \n",
1937 | " | 10923 | \n",
1938 | " Super built-up Area | \n",
1939 | " Not Ready | \n",
1940 | " 2.0 | \n",
1941 | " 1.0 | \n",
1942 | " 67.0 | \n",
1943 | " Yerawada | \n",
1944 | " 2 | \n",
1945 | " 1165.0 | \n",
1946 | " 5751.072961 | \n",
1947 | "
\n",
1948 | " \n",
1949 | " | 10928 | \n",
1950 | " Built-up Area | \n",
1951 | " Ready To Move | \n",
1952 | " 6.0 | \n",
1953 | " 2.0 | \n",
1954 | " 115.0 | \n",
1955 | " Yerawada | \n",
1956 | " 6 | \n",
1957 | " 3500.0 | \n",
1958 | " 3285.714286 | \n",
1959 | "
\n",
1960 | " \n",
1961 | " | 10931 | \n",
1962 | " Built-up Area | \n",
1963 | " Not Ready | \n",
1964 | " 2.0 | \n",
1965 | " 2.0 | \n",
1966 | " 353.0 | \n",
1967 | " Yerawada | \n",
1968 | " 5 | \n",
1969 | " 3210.0 | \n",
1970 | " 10996.884735 | \n",
1971 | "
\n",
1972 | " \n",
1973 | " | 10932 | \n",
1974 | " Super built-up Area | \n",
1975 | " Ready To Move | \n",
1976 | " 2.0 | \n",
1977 | " 2.0 | \n",
1978 | " 54.0 | \n",
1979 | " Yerawada | \n",
1980 | " 2 | \n",
1981 | " 1050.0 | \n",
1982 | " 5142.857143 | \n",
1983 | "
\n",
1984 | " \n",
1985 | " | 10936 | \n",
1986 | " Super built-up Area | \n",
1987 | " Not Ready | \n",
1988 | " 2.0 | \n",
1989 | " 1.0 | \n",
1990 | " 70.0 | \n",
1991 | " Yerawada | \n",
1992 | " 2 | \n",
1993 | " 1132.0 | \n",
1994 | " 6183.745583 | \n",
1995 | "
\n",
1996 | " \n",
1997 | "
\n",
1998 | "
"
1999 | ],
2000 | "text/plain": [
2001 | " area_type availability bath balcony price \\\n",
2002 | "10923 Super built-up Area Not Ready 2.0 1.0 67.0 \n",
2003 | "10928 Built-up Area Ready To Move 6.0 2.0 115.0 \n",
2004 | "10931 Built-up Area Not Ready 2.0 2.0 353.0 \n",
2005 | "10932 Super built-up Area Ready To Move 2.0 2.0 54.0 \n",
2006 | "10936 Super built-up Area Not Ready 2.0 1.0 70.0 \n",
2007 | "\n",
2008 | " site_location bhk new_total_sqft price_per_sqft \n",
2009 | "10923 Yerawada 2 1165.0 5751.072961 \n",
2010 | "10928 Yerawada 6 3500.0 3285.714286 \n",
2011 | "10931 Yerawada 5 3210.0 10996.884735 \n",
2012 | "10932 Yerawada 2 1050.0 5142.857143 \n",
2013 | "10936 Yerawada 2 1132.0 6183.745583 "
2014 | ]
2015 | },
2016 | "execution_count": 283,
2017 | "metadata": {},
2018 | "output_type": "execute_result"
2019 | }
2020 | ],
2021 | "source": [
2022 | "df5.tail()"
2023 | ]
2024 | },
2025 | {
2026 | "cell_type": "markdown",
2027 | "metadata": {},
2028 | "source": [
2029 | "## Model Building"
2030 | ]
2031 | },
2032 | {
2033 | "cell_type": "code",
2034 | "execution_count": 284,
2035 | "metadata": {},
2036 | "outputs": [],
2037 | "source": [
2038 | "# Removing the unnecessary columns (columns that were added only for removing the outliers)\n",
2039 | "df6 = df5.copy()\n",
2040 | "df6 = df6.drop('price_per_sqft', axis='columns')"
2041 | ]
2042 | },
2043 | {
2044 | "cell_type": "code",
2045 | "execution_count": 285,
2046 | "metadata": {},
2047 | "outputs": [
2048 | {
2049 | "data": {
2050 | "text/html": [
2051 | "\n",
2052 | "\n",
2065 | "
\n",
2066 | " \n",
2067 | " \n",
2068 | " | \n",
2069 | " area_type | \n",
2070 | " availability | \n",
2071 | " bath | \n",
2072 | " balcony | \n",
2073 | " price | \n",
2074 | " site_location | \n",
2075 | " bhk | \n",
2076 | " new_total_sqft | \n",
2077 | "
\n",
2078 | " \n",
2079 | " \n",
2080 | " \n",
2081 | " | 0 | \n",
2082 | " Super built-up Area | \n",
2083 | " Not Ready | \n",
2084 | " 2.0 | \n",
2085 | " 1.0 | \n",
2086 | " 39.07 | \n",
2087 | " Alandi Road | \n",
2088 | " 2 | \n",
2089 | " 1056.0 | \n",
2090 | "
\n",
2091 | " \n",
2092 | " | 1 | \n",
2093 | " Plot Area | \n",
2094 | " Ready To Move | \n",
2095 | " 4.0 | \n",
2096 | " 1.0 | \n",
2097 | " 245.00 | \n",
2098 | " Alandi Road | \n",
2099 | " 4 | \n",
2100 | " 2894.0 | \n",
2101 | "
\n",
2102 | " \n",
2103 | " | 2 | \n",
2104 | " Super built-up Area | \n",
2105 | " Ready To Move | \n",
2106 | " 2.0 | \n",
2107 | " 2.0 | \n",
2108 | " 50.00 | \n",
2109 | " Alandi Road | \n",
2110 | " 2 | \n",
2111 | " 1084.0 | \n",
2112 | "
\n",
2113 | " \n",
2114 | " | 3 | \n",
2115 | " Super built-up Area | \n",
2116 | " Ready To Move | \n",
2117 | " 2.0 | \n",
2118 | " 2.0 | \n",
2119 | " 80.00 | \n",
2120 | " Alandi Road | \n",
2121 | " 2 | \n",
2122 | " 1230.0 | \n",
2123 | "
\n",
2124 | " \n",
2125 | " | 4 | \n",
2126 | " Super built-up Area | \n",
2127 | " Ready To Move | \n",
2128 | " 3.0 | \n",
2129 | " 2.0 | \n",
2130 | " 130.00 | \n",
2131 | " Alandi Road | \n",
2132 | " 3 | \n",
2133 | " 1750.0 | \n",
2134 | "
\n",
2135 | " \n",
2136 | "
\n",
2137 | "
"
2138 | ],
2139 | "text/plain": [
2140 | " area_type availability bath balcony price site_location \\\n",
2141 | "0 Super built-up Area Not Ready 2.0 1.0 39.07 Alandi Road \n",
2142 | "1 Plot Area Ready To Move 4.0 1.0 245.00 Alandi Road \n",
2143 | "2 Super built-up Area Ready To Move 2.0 2.0 50.00 Alandi Road \n",
2144 | "3 Super built-up Area Ready To Move 2.0 2.0 80.00 Alandi Road \n",
2145 | "4 Super built-up Area Ready To Move 3.0 2.0 130.00 Alandi Road \n",
2146 | "\n",
2147 | " bhk new_total_sqft \n",
2148 | "0 2 1056.0 \n",
2149 | "1 4 2894.0 \n",
2150 | "2 2 1084.0 \n",
2151 | "3 2 1230.0 \n",
2152 | "4 3 1750.0 "
2153 | ]
2154 | },
2155 | "execution_count": 285,
2156 | "metadata": {},
2157 | "output_type": "execute_result"
2158 | }
2159 | ],
2160 | "source": [
2161 | "df6.head()"
2162 | ]
2163 | },
2164 | {
2165 | "cell_type": "code",
2166 | "execution_count": 286,
2167 | "metadata": {},
2168 | "outputs": [],
2169 | "source": [
2170 | "# Converting the categorical 'site_location' column into numerical dummy columns using the get_dummies method\n",
2171 | "dummy_cols = pd.get_dummies(df6.site_location)\n",
2172 | "df6 = pd.concat([df6,dummy_cols], axis='columns')"
2173 | ]
2174 | },
2175 | {
2176 | "cell_type": "code",
2177 | "execution_count": 287,
2178 | "metadata": {},
2179 | "outputs": [],
2180 | "source": [
2181 | "# Converting the categorical 'availability' column into numerical dummy columns using the get_dummies method ('Not Ready' is dropped as the baseline)\n",
2182 | "dummy_cols = pd.get_dummies(df6.availability).drop('Not Ready', axis='columns')\n",
2183 | "df6 = pd.concat([df6,dummy_cols], axis='columns')"
2184 | ]
2185 | },
2186 | {
2187 | "cell_type": "code",
2188 | "execution_count": 288,
2189 | "metadata": {},
2190 | "outputs": [],
2191 | "source": [
2192 | "# Converting the categorical 'area_type' column into numerical dummy columns using the get_dummies method ('Super built-up Area' is dropped as the baseline)\n",
2193 | "dummy_cols = pd.get_dummies(df6.area_type).drop('Super built-up Area', axis='columns')\n",
2194 | "df6 = pd.concat([df6,dummy_cols], axis='columns')"
2195 | ]
2196 | },
2197 | {
2198 | "cell_type": "code",
2199 | "execution_count": 289,
2200 | "metadata": {
2201 | "scrolled": false
2202 | },
2203 | "outputs": [
2204 | {
2205 | "data": {
2206 | "text/html": [
2207 | "\n",
2208 | "\n",
2221 | "
\n",
2222 | " \n",
2223 | " \n",
2224 | " | \n",
2225 | " bath | \n",
2226 | " balcony | \n",
2227 | " price | \n",
2228 | " bhk | \n",
2229 | " new_total_sqft | \n",
2230 | " Alandi Road | \n",
2231 | " Ambegaon Budruk | \n",
2232 | " Anandnagar | \n",
2233 | " Aundh | \n",
2234 | " Aundh Road | \n",
2235 | " ... | \n",
2236 | " Wadgaon Sheri | \n",
2237 | " Wagholi | \n",
2238 | " Wakadewadi | \n",
2239 | " Wanowrie | \n",
2240 | " Warje | \n",
2241 | " Yerawada | \n",
2242 | " Ready To Move | \n",
2243 | " Built-up Area | \n",
2244 | " Carpet Area | \n",
2245 | " Plot Area | \n",
2246 | "
\n",
2247 | " \n",
2248 | " \n",
2249 | " \n",
2250 | " | 0 | \n",
2251 | " 2.0 | \n",
2252 | " 1.0 | \n",
2253 | " 39.07 | \n",
2254 | " 2 | \n",
2255 | " 1056.0 | \n",
2256 | " 1 | \n",
2257 | " 0 | \n",
2258 | " 0 | \n",
2259 | " 0 | \n",
2260 | " 0 | \n",
2261 | " ... | \n",
2262 | " 0 | \n",
2263 | " 0 | \n",
2264 | " 0 | \n",
2265 | " 0 | \n",
2266 | " 0 | \n",
2267 | " 0 | \n",
2268 | " 0 | \n",
2269 | " 0 | \n",
2270 | " 0 | \n",
2271 | " 0 | \n",
2272 | "
\n",
2273 | " \n",
2274 | " | 1 | \n",
2275 | " 4.0 | \n",
2276 | " 1.0 | \n",
2277 | " 245.00 | \n",
2278 | " 4 | \n",
2279 | " 2894.0 | \n",
2280 | " 1 | \n",
2281 | " 0 | \n",
2282 | " 0 | \n",
2283 | " 0 | \n",
2284 | " 0 | \n",
2285 | " ... | \n",
2286 | " 0 | \n",
2287 | " 0 | \n",
2288 | " 0 | \n",
2289 | " 0 | \n",
2290 | " 0 | \n",
2291 | " 0 | \n",
2292 | " 1 | \n",
2293 | " 0 | \n",
2294 | " 0 | \n",
2295 | " 1 | \n",
2296 | "
\n",
2297 | " \n",
2298 | " | 2 | \n",
2299 | " 2.0 | \n",
2300 | " 2.0 | \n",
2301 | " 50.00 | \n",
2302 | " 2 | \n",
2303 | " 1084.0 | \n",
2304 | " 1 | \n",
2305 | " 0 | \n",
2306 | " 0 | \n",
2307 | " 0 | \n",
2308 | " 0 | \n",
2309 | " ... | \n",
2310 | " 0 | \n",
2311 | " 0 | \n",
2312 | " 0 | \n",
2313 | " 0 | \n",
2314 | " 0 | \n",
2315 | " 0 | \n",
2316 | " 1 | \n",
2317 | " 0 | \n",
2318 | " 0 | \n",
2319 | " 0 | \n",
2320 | "
\n",
2321 | " \n",
2322 | " | 3 | \n",
2323 | " 2.0 | \n",
2324 | " 2.0 | \n",
2325 | " 80.00 | \n",
2326 | " 2 | \n",
2327 | " 1230.0 | \n",
2328 | " 1 | \n",
2329 | " 0 | \n",
2330 | " 0 | \n",
2331 | " 0 | \n",
2332 | " 0 | \n",
2333 | " ... | \n",
2334 | " 0 | \n",
2335 | " 0 | \n",
2336 | " 0 | \n",
2337 | " 0 | \n",
2338 | " 0 | \n",
2339 | " 0 | \n",
2340 | " 1 | \n",
2341 | " 0 | \n",
2342 | " 0 | \n",
2343 | " 0 | \n",
2344 | "
\n",
2345 | " \n",
2346 | " | 4 | \n",
2347 | " 3.0 | \n",
2348 | " 2.0 | \n",
2349 | " 130.00 | \n",
2350 | " 3 | \n",
2351 | " 1750.0 | \n",
2352 | " 1 | \n",
2353 | " 0 | \n",
2354 | " 0 | \n",
2355 | " 0 | \n",
2356 | " 0 | \n",
2357 | " ... | \n",
2358 | " 0 | \n",
2359 | " 0 | \n",
2360 | " 0 | \n",
2361 | " 0 | \n",
2362 | " 0 | \n",
2363 | " 0 | \n",
2364 | " 1 | \n",
2365 | " 0 | \n",
2366 | " 0 | \n",
2367 | " 0 | \n",
2368 | "
\n",
2369 | " \n",
2370 | " | 5 | \n",
2371 | " 2.0 | \n",
2372 | " 1.0 | \n",
2373 | " 41.00 | \n",
2374 | " 2 | \n",
2375 | " 995.0 | \n",
2376 | " 1 | \n",
2377 | " 0 | \n",
2378 | " 0 | \n",
2379 | " 0 | \n",
2380 | " 0 | \n",
2381 | " ... | \n",
2382 | " 0 | \n",
2383 | " 0 | \n",
2384 | " 0 | \n",
2385 | " 0 | \n",
2386 | " 0 | \n",
2387 | " 0 | \n",
2388 | " 1 | \n",
2389 | " 1 | \n",
2390 | " 0 | \n",
2391 | " 0 | \n",
2392 | "
\n",
2393 | " \n",
2394 | " | 8 | \n",
2395 | " 2.0 | \n",
2396 | " 1.0 | \n",
2397 | " 95.00 | \n",
2398 | " 2 | \n",
2399 | " 1360.0 | \n",
2400 | " 1 | \n",
2401 | " 0 | \n",
2402 | " 0 | \n",
2403 | " 0 | \n",
2404 | " 0 | \n",
2405 | " ... | \n",
2406 | " 0 | \n",
2407 | " 0 | \n",
2408 | " 0 | \n",
2409 | " 0 | \n",
2410 | " 0 | \n",
2411 | " 0 | \n",
2412 | " 1 | \n",
2413 | " 0 | \n",
2414 | " 0 | \n",
2415 | " 0 | \n",
2416 | "
\n",
2417 | " \n",
2418 | " | 9 | \n",
2419 | " 2.0 | \n",
2420 | " 2.0 | \n",
2421 | " 50.00 | \n",
2422 | " 2 | \n",
2423 | " 1040.0 | \n",
2424 | " 1 | \n",
2425 | " 0 | \n",
2426 | " 0 | \n",
2427 | " 0 | \n",
2428 | " 0 | \n",
2429 | " ... | \n",
2430 | " 0 | \n",
2431 | " 0 | \n",
2432 | " 0 | \n",
2433 | " 0 | \n",
2434 | " 0 | \n",
2435 | " 0 | \n",
2436 | " 1 | \n",
2437 | " 1 | \n",
2438 | " 0 | \n",
2439 | " 0 | \n",
2440 | "
\n",
2441 | " \n",
2442 | " | 10 | \n",
2443 | " 3.0 | \n",
2444 | " 2.0 | \n",
2445 | " 86.06 | \n",
2446 | " 3 | \n",
2447 | " 1655.0 | \n",
2448 | " 1 | \n",
2449 | " 0 | \n",
2450 | " 0 | \n",
2451 | " 0 | \n",
2452 | " 0 | \n",
2453 | " ... | \n",
2454 | " 0 | \n",
2455 | " 0 | \n",
2456 | " 0 | \n",
2457 | " 0 | \n",
2458 | " 0 | \n",
2459 | " 0 | \n",
2460 | " 1 | \n",
2461 | " 0 | \n",
2462 | " 0 | \n",
2463 | " 0 | \n",
2464 | "
\n",
2465 | " \n",
2466 | " | 12 | \n",
2467 | " 5.0 | \n",
2468 | " 3.0 | \n",
2469 | " 198.00 | \n",
2470 | " 4 | \n",
2471 | " 2790.0 | \n",
2472 | " 1 | \n",
2473 | " 0 | \n",
2474 | " 0 | \n",
2475 | " 0 | \n",
2476 | " 0 | \n",
2477 | " ... | \n",
2478 | " 0 | \n",
2479 | " 0 | \n",
2480 | " 0 | \n",
2481 | " 0 | \n",
2482 | " 0 | \n",
2483 | " 0 | \n",
2484 | " 0 | \n",
2485 | " 0 | \n",
2486 | " 0 | \n",
2487 | " 0 | \n",
2488 | "
\n",
2489 | " \n",
2490 | "
\n",
2491 | "
10 rows × 105 columns
\n",
2492 | "
"
2493 | ],
2494 | "text/plain": [
2495 | " bath balcony price bhk new_total_sqft Alandi Road Ambegaon Budruk \\\n",
2496 | "0 2.0 1.0 39.07 2 1056.0 1 0 \n",
2497 | "1 4.0 1.0 245.00 4 2894.0 1 0 \n",
2498 | "2 2.0 2.0 50.00 2 1084.0 1 0 \n",
2499 | "3 2.0 2.0 80.00 2 1230.0 1 0 \n",
2500 | "4 3.0 2.0 130.00 3 1750.0 1 0 \n",
2501 | "5 2.0 1.0 41.00 2 995.0 1 0 \n",
2502 | "8 2.0 1.0 95.00 2 1360.0 1 0 \n",
2503 | "9 2.0 2.0 50.00 2 1040.0 1 0 \n",
2504 | "10 3.0 2.0 86.06 3 1655.0 1 0 \n",
2505 | "12 5.0 3.0 198.00 4 2790.0 1 0 \n",
2506 | "\n",
2507 | " Anandnagar Aundh Aundh Road ... Wadgaon Sheri Wagholi Wakadewadi \\\n",
2508 | "0 0 0 0 ... 0 0 0 \n",
2509 | "1 0 0 0 ... 0 0 0 \n",
2510 | "2 0 0 0 ... 0 0 0 \n",
2511 | "3 0 0 0 ... 0 0 0 \n",
2512 | "4 0 0 0 ... 0 0 0 \n",
2513 | "5 0 0 0 ... 0 0 0 \n",
2514 | "8 0 0 0 ... 0 0 0 \n",
2515 | "9 0 0 0 ... 0 0 0 \n",
2516 | "10 0 0 0 ... 0 0 0 \n",
2517 | "12 0 0 0 ... 0 0 0 \n",
2518 | "\n",
2519 | " Wanowrie Warje Yerawada Ready To Move Built-up Area Carpet Area \\\n",
2520 | "0 0 0 0 0 0 0 \n",
2521 | "1 0 0 0 1 0 0 \n",
2522 | "2 0 0 0 1 0 0 \n",
2523 | "3 0 0 0 1 0 0 \n",
2524 | "4 0 0 0 1 0 0 \n",
2525 | "5 0 0 0 1 1 0 \n",
2526 | "8 0 0 0 1 0 0 \n",
2527 | "9 0 0 0 1 1 0 \n",
2528 | "10 0 0 0 1 0 0 \n",
2529 | "12 0 0 0 0 0 0 \n",
2530 | "\n",
2531 | " Plot Area \n",
2532 | "0 0 \n",
2533 | "1 1 \n",
2534 | "2 0 \n",
2535 | "3 0 \n",
2536 | "4 0 \n",
2537 | "5 0 \n",
2538 | "8 0 \n",
2539 | "9 0 \n",
2540 | "10 0 \n",
2541 | "12 0 \n",
2542 | "\n",
2543 | "[10 rows x 105 columns]"
2544 | ]
2545 | },
2546 | "execution_count": 289,
2547 | "metadata": {},
2548 | "output_type": "execute_result"
2549 | }
2550 | ],
2551 | "source": [
2552 | "df6.drop(['area_type','availability','site_location'], axis='columns', inplace=True)\n",
2553 | "df6.head(10)"
2554 | ]
2555 | },
2556 | {
2557 | "cell_type": "code",
2558 | "execution_count": 290,
2559 | "metadata": {},
2560 | "outputs": [
2561 | {
2562 | "data": {
2563 | "text/plain": [
2564 | "(7395, 105)"
2565 | ]
2566 | },
2567 | "execution_count": 290,
2568 | "metadata": {},
2569 | "output_type": "execute_result"
2570 | }
2571 | ],
2572 | "source": [
2573 | "# Size of the dataset\n",
2574 | "df6.shape"
2575 | ]
2576 | },
2577 | {
2578 | "cell_type": "code",
2579 | "execution_count": 291,
2580 | "metadata": {},
2581 | "outputs": [],
2582 | "source": [
2583 | "# Splitting the dataset into features and label\n",
2584 | "X = df6.drop('price', axis='columns')\n",
2585 | "y = df6['price']"
2586 | ]
2587 | },
2588 | {
2589 | "cell_type": "code",
2590 | "execution_count": 292,
2591 | "metadata": {},
2592 | "outputs": [],
2593 | "source": [
2594 | "# Using GridSearchCV to find the best algorithm for this problem\n",
2595 | "from sklearn.model_selection import GridSearchCV\n",
2596 | "from sklearn.model_selection import ShuffleSplit\n",
2597 | "from sklearn.linear_model import LinearRegression\n",
2598 | "from sklearn.linear_model import Lasso\n",
2599 | "from sklearn.tree import DecisionTreeRegressor"
2600 | ]
2601 | },
2602 | {
2603 | "cell_type": "code",
2604 | "execution_count": 293,
2605 | "metadata": {},
2606 | "outputs": [
2607 | {
2608 | "data": {
2609 | "text/html": [
2610 | "\n",
2611 | "\n",
2624 | "
\n",
2625 | " \n",
2626 | " \n",
2627 | " | \n",
2628 | " model | \n",
2629 | " best_parameters | \n",
2630 | " accuracy | \n",
2631 | "
\n",
2632 | " \n",
2633 | " \n",
2634 | " \n",
2635 | " | 0 | \n",
2636 | " linear_regression | \n",
2637 | " {'normalize': True} | \n",
2638 | " 0.835475 | \n",
2639 | "
\n",
2640 | " \n",
2641 | " | 1 | \n",
2642 | " lasso | \n",
2643 | " {'alpha': 2, 'selection': 'random'} | \n",
2644 | " 0.829241 | \n",
2645 | "
\n",
2646 | " \n",
2647 | " | 2 | \n",
2648 | " decision_tree | \n",
2649 | " {'criterion': 'mse', 'splitter': 'best'} | \n",
2650 | " 0.781004 | \n",
2651 | "
\n",
2652 | " \n",
2653 | "
\n",
2654 | "
"
2655 | ],
2656 | "text/plain": [
2657 | " model best_parameters accuracy\n",
2658 | "0 linear_regression {'normalize': True} 0.835475\n",
2659 | "1 lasso {'alpha': 2, 'selection': 'random'} 0.829241\n",
2660 | "2 decision_tree {'criterion': 'mse', 'splitter': 'best'} 0.781004"
2661 | ]
2662 | },
2663 | "execution_count": 293,
2664 | "metadata": {},
2665 | "output_type": "execute_result"
2666 | }
2667 | ],
2668 | "source": [
2669 | "# Creating a function for GridSearchCV\n",
2670 | "\n",
2671 | "def find_best_model(X, y):\n",
2672 | " models = {\n",
2673 | " 'linear_regression': {\n",
2674 | " 'model': LinearRegression(),\n",
2675 | " 'parameters': {\n",
2676 | " 'normalize': [True,False]\n",
2677 | " }\n",
2678 | " },\n",
2679 | " \n",
2680 | " 'lasso': {\n",
2681 | " 'model': Lasso(),\n",
2682 | " 'parameters': {\n",
2683 | " 'alpha': [1,2],\n",
2684 | " 'selection': ['random', 'cyclic']\n",
2685 | " }\n",
2686 | " },\n",
2687 | " \n",
2688 | " 'decision_tree': {\n",
2689 | " 'model': DecisionTreeRegressor(),\n",
2690 | " 'parameters': {\n",
2691 | " 'criterion': ['mse', 'friedman_mse'],\n",
2692 | " 'splitter': ['best', 'random']\n",
2693 | " }\n",
2694 | " }\n",
2695 | " }\n",
2696 | " \n",
2697 | " scores = []\n",
2698 | " cv_X_y = ShuffleSplit(n_splits=5, test_size=0.20, random_state=0)\n",
2699 | " \n",
2700 | " for model_name, model_params in models.items():\n",
2701 | " gs = GridSearchCV(model_params['model'], model_params['parameters'], cv=cv_X_y, return_train_score=False)\n",
2702 | " gs.fit(X, y)\n",
2703 | " scores.append({\n",
2704 | " 'model': model_name,\n",
2705 | " 'best_parameters': gs.best_params_,\n",
2706 | " 'accuracy': gs.best_score_\n",
2707 | " })\n",
2708 | " \n",
2709 | " return pd.DataFrame(scores, columns=['model', 'best_parameters', 'accuracy'])\n",
2710 | "\n",
2711 | "find_best_model(X, y)"
2712 | ]
2713 | },
2714 | {
2715 | "cell_type": "markdown",
2716 | "metadata": {},
2717 | "source": [
2718 | "#### Since Linear Regression has the highest cross-validation score (R-squared, labelled 'accuracy' in the table above), the model selected for this problem is Linear Regression"
2719 | ]
2720 | },
2721 | {
2722 | "cell_type": "code",
2723 | "execution_count": 294,
2724 | "metadata": {},
2725 | "outputs": [],
2726 | "source": [
2727 | "# Splitting the dataset into train and test set\n",
2728 | "from sklearn.model_selection import train_test_split\n",
2729 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=20)"
2730 | ]
2731 | },
2732 | {
2733 | "cell_type": "code",
2734 | "execution_count": 295,
2735 | "metadata": {},
2736 | "outputs": [
2737 | {
2738 | "data": {
2739 | "text/plain": [
2740 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)"
2741 | ]
2742 | },
2743 | "execution_count": 295,
2744 | "metadata": {},
2745 | "output_type": "execute_result"
2746 | }
2747 | ],
2748 | "source": [
2749 | "# Creating Linear Regression Model\n",
2750 | "from sklearn.linear_model import LinearRegression\n",
2751 | "model = LinearRegression(normalize=True)\n",
2752 | "model.fit(X_train, y_train)"
2753 | ]
2754 | },
2755 | {
2756 | "cell_type": "code",
2757 | "execution_count": 296,
2758 | "metadata": {},
2759 | "outputs": [
2760 | {
2761 | "data": {
2762 | "text/plain": [
2763 | "0.8180571987758956"
2764 | ]
2765 | },
2766 | "execution_count": 296,
2767 | "metadata": {},
2768 | "output_type": "execute_result"
2769 | }
2770 | ],
2771 | "source": [
2772 | "model.score(X_test, y_test)"
2773 | ]
2774 | },
2775 | {
2776 | "cell_type": "markdown",
2777 | "metadata": {},
2778 | "source": [
2779 | "#### Predicting the values using our trained model"
2780 | ]
2781 | },
2782 | {
2783 | "cell_type": "code",
2784 | "execution_count": 297,
2785 | "metadata": {},
2786 | "outputs": [
2787 | {
2788 | "data": {
2789 | "text/plain": [
2790 | "Index(['bath', 'balcony', 'bhk', 'new_total_sqft', 'Alandi Road',\n",
2791 | " 'Ambegaon Budruk', 'Anandnagar', 'Aundh', 'Aundh Road', 'Balaji Nagar',\n",
2792 | " ...\n",
2793 | " 'Wadgaon Sheri', 'Wagholi', 'Wakadewadi', 'Wanowrie', 'Warje',\n",
2794 | " 'Yerawada', 'Ready To Move', 'Built-up Area', 'Carpet Area',\n",
2795 | " 'Plot Area'],\n",
2796 | " dtype='object', length=104)"
2797 | ]
2798 | },
2799 | "execution_count": 297,
2800 | "metadata": {},
2801 | "output_type": "execute_result"
2802 | }
2803 | ],
2804 | "source": [
2805 | "X.columns"
2806 | ]
2807 | },
2808 | {
2809 | "cell_type": "code",
2810 | "execution_count": 298,
2811 | "metadata": {},
2812 | "outputs": [
2813 | {
2814 | "data": {
2815 | "text/plain": [
2816 | "9"
2817 | ]
2818 | },
2819 | "execution_count": 298,
2820 | "metadata": {},
2821 | "output_type": "execute_result"
2822 | }
2823 | ],
2824 | "source": [
2825 | "# Positional index of the 'Balaji Nagar' location column in X\n",
2826 | "np.flatnonzero(X.columns == 'Balaji Nagar')[0]"
2827 | ]
2828 | },
2829 | {
2830 | "cell_type": "code",
2831 | "execution_count": 299,
2832 | "metadata": {},
2833 | "outputs": [
2834 | {
2835 | "data": {
2836 | "text/plain": [
2837 | "101"
2838 | ]
2839 | },
2840 | "execution_count": 299,
2841 | "metadata": {},
2842 | "output_type": "execute_result"
2843 | }
2844 | ],
2845 | "source": [
2846 | "# Positional index of the 'Built-up Area' area_type column in X\n",
2847 | "np.flatnonzero(X.columns == 'Built-up Area')[0]"
2848 | ]
2849 | },
2850 | {
2851 | "cell_type": "code",
2852 | "execution_count": 300,
2853 | "metadata": {},
2854 | "outputs": [
2855 | {
2856 | "data": {
2857 | "text/plain": [
2858 | "100"
2859 | ]
2860 | },
2861 | "execution_count": 300,
2862 | "metadata": {},
2863 | "output_type": "execute_result"
2864 | }
2865 | ],
2866 | "source": [
2867 | "# Positional index of the 'Ready To Move' availability column in X\n",
2868 | "np.flatnonzero(X.columns == 'Ready To Move')[0]"
2869 | ]
2870 | },
2871 | {
2872 | "cell_type": "code",
2873 | "execution_count": 301,
2874 | "metadata": {},
2875 | "outputs": [],
2876 | "source": [
2877 | "# Creating a function to predict values\n",
2878 | "def prediction(location, bhk, bath, balcony, sqft, area_type, availability):\n",
2879 | "    \"\"\"Predict the price of one house with the fitted global `model`.\n",
2880 | "\n",
2881 | "    `location`, `area_type` and `availability` must each be a one-hot column of `X` or the\n",
2882 | "    matching baseline ('other', 'Super built-up Area', 'Not Ready'), which is left as all zeros.\n",
2883 | "    `bath`, `balcony`, `bhk` and `sqft` fill the 'bath', 'balcony', 'bhk' and 'new_total_sqft' columns.\n",
2884 | "    Raises ValueError if a category is neither its baseline nor a column of `X`.\n",
2885 | "    Returns the first element of `model.predict` for that single row.\n",
2886 | "    \"\"\"\n",
2887 | "    # Fill the row by column name so it cannot drift out of step with the column order of X\n",
2888 | "    row = pd.Series(0.0, index=X.columns)\n",
2889 | "    row['bath'] = bath\n",
2890 | "    row['balcony'] = balcony\n",
2891 | "    row['bhk'] = bhk\n",
2892 | "    row['new_total_sqft'] = sqft\n",
2893 | "\n",
2894 | "    # Pair each categorical input with its baseline level, which gets no column lookup\n",
2895 | "    baselines = ('other', 'Super built-up Area', 'Not Ready')\n",
2896 | "    for value, baseline in zip((location, area_type, availability), baselines):\n",
2897 | "        if value == baseline:\n",
2898 | "            continue\n",
2899 | "        if value not in X.columns:\n",
2900 | "            raise ValueError('Unknown category {!r}: not a column of X'.format(value))\n",
2901 | "        row[value] = 1\n",
2902 | "\n",
2903 | "    # A one-row DataFrame keeps the feature names the model was fitted with\n",
2904 | "    return model.predict(row.to_frame().T)[0]"
2905 | ]
2906 | },
2907 | {
2908 | "cell_type": "code",
2909 | "execution_count": 302,
2910 | "metadata": {},
2911 | "outputs": [
2912 | {
2913 | "data": {
2914 | "text/plain": [
2915 | "52.17049124040433"
2916 | ]
2917 | },
2918 | "execution_count": 302,
2919 | "metadata": {},
2920 | "output_type": "execute_result"
2921 | }
2922 | ],
2923 | "source": [
2924 | "# Prediction 1\n",
2925 | "# Keyword arguments spell out which value feeds which input.\n",
2926 | "prediction(location='Balaji Nagar', bhk=2, bath=2, balcony=2, sqft=1000, area_type='Built-up Area', availability='Ready To Move')"
2927 | ]
2928 | },
2929 | {
2930 | "cell_type": "code",
2931 | "execution_count": 303,
2932 | "metadata": {},
2933 | "outputs": [
2934 | {
2935 | "data": {
2936 | "text/plain": [
2937 | "52.09543340931981"
2938 | ]
2939 | },
2940 | "execution_count": 303,
2941 | "metadata": {},
2942 | "output_type": "execute_result"
2943 | }
2944 | ],
2945 | "source": [
2946 | "# Prediction 2\n",
2947 | "# Keyword arguments spell out which value feeds which input.\n",
2948 | "prediction(location='Hadapsar', bhk=2, bath=2, balcony=2, sqft=1000, area_type='Super built-up Area', availability='Ready To Move')"
2949 | ]
2950 | },
2951 | {
2952 | "cell_type": "code",
2953 | "execution_count": 304,
2954 | "metadata": {},
2955 | "outputs": [
2956 | {
2957 | "data": {
2958 | "text/plain": [
2959 | "140.1821349541965"
2960 | ]
2961 | },
2962 | "execution_count": 304,
2963 | "metadata": {},
2964 | "output_type": "execute_result"
2965 | }
2966 | ],
2967 | "source": [
2968 | "# Prediction 3\n",
2969 | "# Keyword arguments spell out which value feeds which input.\n",
2970 | "prediction(location='Camp', bhk=2, bath=3, balcony=2, sqft=2000, area_type='Plot Area', availability='Not Ready')"
2971 | ]
2972 | },
2973 | {
2974 | "cell_type": "code",
2975 | "execution_count": 305,
2976 | "metadata": {},
2977 | "outputs": [
2978 | {
2979 | "data": {
2980 | "text/plain": [
2981 | "140.2386444110721"
2982 | ]
2983 | },
2984 | "execution_count": 305,
2985 | "metadata": {},
2986 | "output_type": "execute_result"
2987 | }
2988 | ],
2989 | "source": [
2990 | "# Prediction 4\n",
2991 | "# Keyword arguments spell out which value feeds which input.\n",
2992 | "prediction(location='Baner', bhk=2, bath=3, balcony=2, sqft=2000, area_type='Plot Area', availability='Not Ready')"
2993 | ]
2994 | }
2995 | ],
2996 | "metadata": {
2997 | "kernelspec": {
2998 | "display_name": "Python 3",
2999 | "language": "python",
3000 | "name": "python3"
3001 | },
3002 | "language_info": {
3003 | "codemirror_mode": {
3004 | "name": "ipython",
3005 | "version": 3
3006 | },
3007 | "file_extension": ".py",
3008 | "mimetype": "text/x-python",
3009 | "name": "python",
3010 | "nbconvert_exporter": "python",
3011 | "pygments_lexer": "ipython3",
3012 | "version": "3.7.4"
3013 | }
3014 | },
3015 | "nbformat": 4,
3016 | "nbformat_minor": 2
3017 | }
3018 |
--------------------------------------------------------------------------------