├── README.md └── Pune house price prediction.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Machine-Learning-Project- Pune House Price Prediction 2 | This repository contains a house price prediction project for Pune. Datasets are provided in each of the folders above, and the solutions to the problem statements are also provided. 3 | 4 | Please do ⭐ the repository if it helped you in any way. 5 | -------------------------------------------------------------------------------- /Pune house price prediction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 244, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Importing essential libraries\n", 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from matplotlib import pyplot as plt\n", 13 | "from matplotlib import rcParams as rcP\n", 14 | "%matplotlib inline" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 245, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/html": [ 25 | "
\n", 26 | "\n", 39 | "\n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | "
area_typeavailabilitysizesocietytotal_sqftbathbalconypricesite_location
0Super built-up Area19-Dec2 BHKCoomee10562.01.039.07Alandi Road
1Plot AreaReady To Move4 BedroomTheanmp26005.03.0120.00Ambegaon Budruk
2Built-up AreaReady To Move3 BHKNaN14402.03.062.00Anandnagar
3Super built-up AreaReady To Move3 BHKSoiewre15213.01.095.00Aundh
4Super built-up AreaReady To Move2 BHKNaN12002.01.051.00Aundh Road
\n", 117 | "
" 118 | ], 119 | "text/plain": [ 120 | " area_type availability size society total_sqft bath \\\n", 121 | "0 Super built-up Area 19-Dec 2 BHK Coomee 1056 2.0 \n", 122 | "1 Plot Area Ready To Move 4 Bedroom Theanmp 2600 5.0 \n", 123 | "2 Built-up Area Ready To Move 3 BHK NaN 1440 2.0 \n", 124 | "3 Super built-up Area Ready To Move 3 BHK Soiewre 1521 3.0 \n", 125 | "4 Super built-up Area Ready To Move 2 BHK NaN 1200 2.0 \n", 126 | "\n", 127 | " balcony price site_location \n", 128 | "0 1.0 39.07 Alandi Road \n", 129 | "1 3.0 120.00 Ambegaon Budruk \n", 130 | "2 3.0 62.00 Anandnagar \n", 131 | "3 1.0 95.00 Aundh \n", 132 | "4 1.0 51.00 Aundh Road " 133 | ] 134 | }, 135 | "execution_count": 245, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "# Loading the dataset\n", 142 | "df = pd.read_csv('pune_House_Data.csv')\n", 143 | "df.head()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 246, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "(13320, 9)" 155 | ] 156 | }, 157 | "execution_count": 246, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "# Exploring the dataset\n", 164 | "df.shape" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 247, 170 | "metadata": { 171 | "scrolled": true 172 | }, 173 | "outputs": [ 174 | { 175 | "data": { 176 | "text/plain": [ 177 | "area_type\n", 178 | "Built-up Area 2418\n", 179 | "Carpet Area 87\n", 180 | "Plot Area 2025\n", 181 | "Super built-up Area 8790\n", 182 | "Name: area_type, dtype: int64" 183 | ] 184 | }, 185 | "execution_count": 247, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "# Exploring the dataset\n", 192 | "df.groupby('area_type')['area_type'].agg('count')" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 248, 198 | "metadata": { 199 | 
"scrolled": true 200 | }, 201 | "outputs": [ 202 | { 203 | "data": { 204 | "text/plain": [ 205 | "availability\n", 206 | "14-Jul 1\n", 207 | "14-Nov 1\n", 208 | "15-Aug 1\n", 209 | "15-Dec 1\n", 210 | "15-Jun 1\n", 211 | " ... \n", 212 | "22-Mar 3\n", 213 | "22-May 10\n", 214 | "22-Nov 2\n", 215 | "Immediate Possession 16\n", 216 | "Ready To Move 10581\n", 217 | "Name: availability, Length: 81, dtype: int64" 218 | ] 219 | }, 220 | "execution_count": 248, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "# Exploring the dataset\n", 227 | "df.groupby('availability')['availability'].agg('count')" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 249, 233 | "metadata": { 234 | "scrolled": true 235 | }, 236 | "outputs": [ 237 | { 238 | "data": { 239 | "text/plain": [ 240 | "size\n", 241 | "1 BHK 538\n", 242 | "1 Bedroom 105\n", 243 | "1 RK 13\n", 244 | "10 BHK 2\n", 245 | "10 Bedroom 12\n", 246 | "11 BHK 2\n", 247 | "11 Bedroom 2\n", 248 | "12 Bedroom 1\n", 249 | "13 BHK 1\n", 250 | "14 BHK 1\n", 251 | "16 BHK 1\n", 252 | "18 Bedroom 1\n", 253 | "19 BHK 1\n", 254 | "2 BHK 5199\n", 255 | "2 Bedroom 329\n", 256 | "27 BHK 1\n", 257 | "3 BHK 4310\n", 258 | "3 Bedroom 547\n", 259 | "4 BHK 591\n", 260 | "4 Bedroom 826\n", 261 | "43 Bedroom 1\n", 262 | "5 BHK 59\n", 263 | "5 Bedroom 297\n", 264 | "6 BHK 30\n", 265 | "6 Bedroom 191\n", 266 | "7 BHK 17\n", 267 | "7 Bedroom 83\n", 268 | "8 BHK 5\n", 269 | "8 Bedroom 84\n", 270 | "9 BHK 8\n", 271 | "9 Bedroom 46\n", 272 | "Name: size, dtype: int64" 273 | ] 274 | }, 275 | "execution_count": 249, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | } 279 | ], 280 | "source": [ 281 | "# Exploring the dataset\n", 282 | "df.groupby('size')['size'].agg('count')" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 250, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/plain": [ 293 | 
"site_location\n", 294 | "Alandi Road 139\n", 295 | "Ambegaon Budruk 139\n", 296 | "Anandnagar 139\n", 297 | "Aundh 139\n", 298 | "Aundh Road 139\n", 299 | " ... \n", 300 | "Wakadewadi 138\n", 301 | "Wanowrie 138\n", 302 | "Warje 138\n", 303 | "Yerawada 138\n", 304 | "other 1\n", 305 | "Name: site_location, Length: 97, dtype: int64" 306 | ] 307 | }, 308 | "execution_count": 250, 309 | "metadata": {}, 310 | "output_type": "execute_result" 311 | } 312 | ], 313 | "source": [ 314 | "# Exploring the dataset\n", 315 | "df.groupby('site_location')['site_location'].agg('count')" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 251, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "text/html": [ 326 | "
\n", 327 | "\n", 340 | "\n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | "
area_typeavailabilitysizetotal_sqftbathbalconypricesite_location
0Super built-up Area19-Dec2 BHK10562.01.039.07Alandi Road
1Plot AreaReady To Move4 Bedroom26005.03.0120.00Ambegaon Budruk
2Built-up AreaReady To Move3 BHK14402.03.062.00Anandnagar
3Super built-up AreaReady To Move3 BHK15213.01.095.00Aundh
4Super built-up AreaReady To Move2 BHK12002.01.051.00Aundh Road
\n", 412 | "
" 413 | ], 414 | "text/plain": [ 415 | " area_type availability size total_sqft bath balcony \\\n", 416 | "0 Super built-up Area 19-Dec 2 BHK 1056 2.0 1.0 \n", 417 | "1 Plot Area Ready To Move 4 Bedroom 2600 5.0 3.0 \n", 418 | "2 Built-up Area Ready To Move 3 BHK 1440 2.0 3.0 \n", 419 | "3 Super built-up Area Ready To Move 3 BHK 1521 3.0 1.0 \n", 420 | "4 Super built-up Area Ready To Move 2 BHK 1200 2.0 1.0 \n", 421 | "\n", 422 | " price site_location \n", 423 | "0 39.07 Alandi Road \n", 424 | "1 120.00 Ambegaon Budruk \n", 425 | "2 62.00 Anandnagar \n", 426 | "3 95.00 Aundh \n", 427 | "4 51.00 Aundh Road " 428 | ] 429 | }, 430 | "execution_count": 251, 431 | "metadata": {}, 432 | "output_type": "execute_result" 433 | } 434 | ], 435 | "source": [ 436 | "# Removing the columns of society\n", 437 | "df = df.drop('society', axis='columns')\n", 438 | "df.head()" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "## Data Cleaning Process" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 252, 451 | "metadata": {}, 452 | "outputs": [ 453 | { 454 | "data": { 455 | "text/plain": [ 456 | "area_type 0\n", 457 | "availability 0\n", 458 | "size 16\n", 459 | "total_sqft 0\n", 460 | "bath 73\n", 461 | "balcony 609\n", 462 | "price 0\n", 463 | "site_location 1\n", 464 | "dtype: int64" 465 | ] 466 | }, 467 | "execution_count": 252, 468 | "metadata": {}, 469 | "output_type": "execute_result" 470 | } 471 | ], 472 | "source": [ 473 | "# Data Cleaning\n", 474 | "# Checking the null values in the dataset\n", 475 | "df.isnull().sum()" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 253, 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "# Applying median to the balcony and bath column\n", 485 | "from math import floor\n", 486 | "\n", 487 | "balcony_median = float(floor(df.balcony.median()))\n", 488 | "bath_median = float(floor(df.bath.median()))\n", 489 | "\n", 
490 | "df.balcony = df.balcony.fillna(balcony_median)\n", 491 | "df.bath = df.bath.fillna(bath_median)" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 254, 497 | "metadata": {}, 498 | "outputs": [ 499 | { 500 | "data": { 501 | "text/plain": [ 502 | "area_type 0\n", 503 | "availability 0\n", 504 | "size 16\n", 505 | "total_sqft 0\n", 506 | "bath 0\n", 507 | "balcony 0\n", 508 | "price 0\n", 509 | "site_location 1\n", 510 | "dtype: int64" 511 | ] 512 | }, 513 | "execution_count": 254, 514 | "metadata": {}, 515 | "output_type": "execute_result" 516 | } 517 | ], 518 | "source": [ 519 | "# Checking the null values in the dataset again\n", 520 | "df.isnull().sum()" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 255, 526 | "metadata": {}, 527 | "outputs": [ 528 | { 529 | "data": { 530 | "text/plain": [ 531 | "area_type 0\n", 532 | "availability 0\n", 533 | "size 0\n", 534 | "total_sqft 0\n", 535 | "bath 0\n", 536 | "balcony 0\n", 537 | "price 0\n", 538 | "site_location 0\n", 539 | "dtype: int64" 540 | ] 541 | }, 542 | "execution_count": 255, 543 | "metadata": {}, 544 | "output_type": "execute_result" 545 | } 546 | ], 547 | "source": [ 548 | "# Dropping the rows with null values because the dataset is huge as compared to null values.\n", 549 | "df = df.dropna()\n", 550 | "df.isnull().sum()" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 256, 556 | "metadata": { 557 | "scrolled": true 558 | }, 559 | "outputs": [ 560 | { 561 | "data": { 562 | "text/plain": [ 563 | "bhk\n", 564 | "1 656\n", 565 | "2 5527\n", 566 | "3 4857\n", 567 | "4 1417\n", 568 | "5 356\n", 569 | "6 221\n", 570 | "7 100\n", 571 | "8 89\n", 572 | "9 54\n", 573 | "10 14\n", 574 | "11 4\n", 575 | "12 1\n", 576 | "13 1\n", 577 | "14 1\n", 578 | "16 1\n", 579 | "18 1\n", 580 | "19 1\n", 581 | "27 1\n", 582 | "43 1\n", 583 | "Name: bhk, dtype: int64" 584 | ] 585 | }, 586 | "execution_count": 256, 587 | "metadata": {}, 588 | 
"output_type": "execute_result" 589 | } 590 | ], 591 | "source": [ 592 | "# Converting the size column to bhk\n", 593 | "df['bhk'] = df['size'].apply(lambda x: int(x.split(' ')[0]))\n", 594 | "df = df.drop('size', axis='columns')\n", 595 | "df.groupby('bhk')['bhk'].agg('count')" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 257, 601 | "metadata": { 602 | "scrolled": true 603 | }, 604 | "outputs": [ 605 | { 606 | "data": { 607 | "text/plain": [ 608 | "array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],\n", 609 | " dtype=object)" 610 | ] 611 | }, 612 | "execution_count": 257, 613 | "metadata": {}, 614 | "output_type": "execute_result" 615 | } 616 | ], 617 | "source": [ 618 | "# Exploring the total_sqft column\n", 619 | "df.total_sqft.unique()" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": 258, 625 | "metadata": {}, 626 | "outputs": [], 627 | "source": [ 628 | "# Since the total_sqft contains range values such as 1133-1384, lets filter out these values\n", 629 | "def isFloat(x):\n", 630 | " try:\n", 631 | " float(x)\n", 632 | " except:\n", 633 | " return False\n", 634 | " return True" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 259, 640 | "metadata": {}, 641 | "outputs": [ 642 | { 643 | "data": { 644 | "text/html": [ 645 | "
\n", 646 | "\n", 659 | "\n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | "
area_typeavailabilitytotal_sqftbathbalconypricesite_locationbhk
30Super built-up Area19-Dec2100 - 28504.00.0186.000Gultekdi4
56Built-up Area20-Feb3010 - 34102.02.0192.000Model colony4
81Built-up Area18-Oct2957 - 34502.02.0224.500Shukrawar Peth4
122Super built-up Area18-Mar3067 - 81564.00.0477.000Ganeshkhind4
137Super built-up Area19-Mar1042 - 11052.00.054.005Khadaki2
...........................
12990Super built-up Area18-May1804 - 22733.00.0122.000Gokhale Nagar3
13059Super built-up AreaReady To Move1200 - 14702.00.072.760Anandnagar2
13240Super built-up AreaReady To Move1020 - 11302.02.052.570Vadgaon Budruk1
13265Super built-up Area20-Sep1133 - 13842.00.059.135Dapodi2
13299Super built-up Area18-Dec2830 - 28825.00.0154.500Laxmi Road4
\n", 797 | "

239 rows × 8 columns

\n", 798 | "
" 799 | ], 800 | "text/plain": [ 801 | " area_type availability total_sqft bath balcony \\\n", 802 | "30 Super built-up Area 19-Dec 2100 - 2850 4.0 0.0 \n", 803 | "56 Built-up Area 20-Feb 3010 - 3410 2.0 2.0 \n", 804 | "81 Built-up Area 18-Oct 2957 - 3450 2.0 2.0 \n", 805 | "122 Super built-up Area 18-Mar 3067 - 8156 4.0 0.0 \n", 806 | "137 Super built-up Area 19-Mar 1042 - 1105 2.0 0.0 \n", 807 | "... ... ... ... ... ... \n", 808 | "12990 Super built-up Area 18-May 1804 - 2273 3.0 0.0 \n", 809 | "13059 Super built-up Area Ready To Move 1200 - 1470 2.0 0.0 \n", 810 | "13240 Super built-up Area Ready To Move 1020 - 1130 2.0 2.0 \n", 811 | "13265 Super built-up Area 20-Sep 1133 - 1384 2.0 0.0 \n", 812 | "13299 Super built-up Area 18-Dec 2830 - 2882 5.0 0.0 \n", 813 | "\n", 814 | " price site_location bhk \n", 815 | "30 186.000 Gultekdi 4 \n", 816 | "56 192.000 Model colony 4 \n", 817 | "81 224.500 Shukrawar Peth 4 \n", 818 | "122 477.000 Ganeshkhind 4 \n", 819 | "137 54.005 Khadaki 2 \n", 820 | "... ... ... ... 
\n", 821 | "12990 122.000 Gokhale Nagar 3 \n", 822 | "13059 72.760 Anandnagar 2 \n", 823 | "13240 52.570 Vadgaon Budruk 1 \n", 824 | "13265 59.135 Dapodi 2 \n", 825 | "13299 154.500 Laxmi Road 4 \n", 826 | "\n", 827 | "[239 rows x 8 columns]" 828 | ] 829 | }, 830 | "execution_count": 259, 831 | "metadata": {}, 832 | "output_type": "execute_result" 833 | } 834 | ], 835 | "source": [ 836 | "# Displaying all the rows whose total_sqft cannot be parsed as a float\n", 837 | "df[~df['total_sqft'].apply(isFloat)]" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": 260, 843 | "metadata": {}, 844 | "outputs": [], 845 | "source": [ 846 | "# Converting range values to the average of their two endpoints; values that cannot be parsed as a float become None\n", 847 | "def convert_sqft_to_num(x):\n", 848 | " tokens = x.split('-')\n", 849 | " if len(tokens) == 2:\n", 850 | " return (float(tokens[0])+float(tokens[1]))/2\n", 851 | " try:\n", 852 | " return float(x)\n", 853 | " except:\n", 854 | " return None" 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": 261, 860 | "metadata": { 861 | "scrolled": false 862 | }, 863 | "outputs": [ 864 | { 865 | "data": { 866 | "text/html": [ 867 | "
\n", 868 | "\n", 881 | "\n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | "
area_typeavailabilitybathbalconypricesite_locationbhknew_total_sqft
0Super built-up Area19-Dec2.01.039.07Alandi Road21056.0
1Plot AreaReady To Move5.03.0120.00Ambegaon Budruk42600.0
2Built-up AreaReady To Move2.03.062.00Anandnagar31440.0
3Super built-up AreaReady To Move3.01.095.00Aundh31521.0
4Super built-up AreaReady To Move2.01.051.00Aundh Road21200.0
\n", 953 | "
" 954 | ], 955 | "text/plain": [ 956 | " area_type availability bath balcony price \\\n", 957 | "0 Super built-up Area 19-Dec 2.0 1.0 39.07 \n", 958 | "1 Plot Area Ready To Move 5.0 3.0 120.00 \n", 959 | "2 Built-up Area Ready To Move 2.0 3.0 62.00 \n", 960 | "3 Super built-up Area Ready To Move 3.0 1.0 95.00 \n", 961 | "4 Super built-up Area Ready To Move 2.0 1.0 51.00 \n", 962 | "\n", 963 | " site_location bhk new_total_sqft \n", 964 | "0 Alandi Road 2 1056.0 \n", 965 | "1 Ambegaon Budruk 4 2600.0 \n", 966 | "2 Anandnagar 3 1440.0 \n", 967 | "3 Aundh 3 1521.0 \n", 968 | "4 Aundh Road 2 1200.0 " 969 | ] 970 | }, 971 | "execution_count": 261, 972 | "metadata": {}, 973 | "output_type": "execute_result" 974 | } 975 | ], 976 | "source": [ 977 | "df['new_total_sqft'] = df.total_sqft.apply(convert_sqft_to_num)\n", 978 | "df = df.drop('total_sqft', axis='columns')\n", 979 | "df.head()" 980 | ] 981 | }, 982 | { 983 | "cell_type": "code", 984 | "execution_count": 262, 985 | "metadata": {}, 986 | "outputs": [ 987 | { 988 | "data": { 989 | "text/plain": [ 990 | "area_type 0\n", 991 | "availability 0\n", 992 | "bath 0\n", 993 | "balcony 0\n", 994 | "price 0\n", 995 | "site_location 0\n", 996 | "bhk 0\n", 997 | "new_total_sqft 46\n", 998 | "dtype: int64" 999 | ] 1000 | }, 1001 | "execution_count": 262, 1002 | "metadata": {}, 1003 | "output_type": "execute_result" 1004 | } 1005 | ], 1006 | "source": [ 1007 | "# Counting the missing values per column; new_total_sqft holds None wherever the conversion failed\n", 1008 | "df.isna().sum()" 1009 | ] 1010 | }, 1011 | { 1012 | "cell_type": "code", 1013 | "execution_count": 263, 1014 | "metadata": {}, 1015 | "outputs": [ 1016 | { 1017 | "data": { 1018 | "text/plain": [ 1019 | "area_type 0\n", 1020 | "availability 0\n", 1021 | "bath 0\n", 1022 | "balcony 0\n", 1023 | "price 0\n", 1024 | "site_location 0\n", 1025 | "bhk 0\n", 1026 | "new_total_sqft 0\n", 1027 | "dtype: int64" 1028 | ] 1029 | }, 1030 | "execution_count": 263, 1031 | "metadata": {}, 1032 | "output_type": 
"execute_result" 1033 | } 1034 | ], 1035 | "source": [ 1036 | "# Removing the rows that have None values in the new_total_sqft column\n", 1037 | "df = df.dropna()\n", 1038 | "df.isna().sum()" 1039 | ] 1040 | }, 1041 | { 1042 | "cell_type": "markdown", 1043 | "metadata": {}, 1044 | "source": [ 1045 | "## Feature Engineering" 1046 | ] 1047 | }, 1048 | { 1049 | "cell_type": "code", 1050 | "execution_count": 264, 1051 | "metadata": { 1052 | "scrolled": true 1053 | }, 1054 | "outputs": [ 1055 | { 1056 | "data": { 1057 | "text/html": [ 1058 | "
\n", 1059 | "\n", 1072 | "\n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | "
area_typeavailabilitybathbalconypricesite_locationbhknew_total_sqftprice_per_sqft
0Super built-up Area19-Dec2.01.039.07Alandi Road21056.03699.810606
1Plot AreaReady To Move5.03.0120.00Ambegaon Budruk42600.04615.384615
2Built-up AreaReady To Move2.03.062.00Anandnagar31440.04305.555556
3Super built-up AreaReady To Move3.01.095.00Aundh31521.06245.890861
4Super built-up AreaReady To Move2.01.051.00Aundh Road21200.04250.000000
\n", 1150 | "
" 1151 | ], 1152 | "text/plain": [ 1153 | " area_type availability bath balcony price \\\n", 1154 | "0 Super built-up Area 19-Dec 2.0 1.0 39.07 \n", 1155 | "1 Plot Area Ready To Move 5.0 3.0 120.00 \n", 1156 | "2 Built-up Area Ready To Move 2.0 3.0 62.00 \n", 1157 | "3 Super built-up Area Ready To Move 3.0 1.0 95.00 \n", 1158 | "4 Super built-up Area Ready To Move 2.0 1.0 51.00 \n", 1159 | "\n", 1160 | " site_location bhk new_total_sqft price_per_sqft \n", 1161 | "0 Alandi Road 2 1056.0 3699.810606 \n", 1162 | "1 Ambegaon Budruk 4 2600.0 4615.384615 \n", 1163 | "2 Anandnagar 3 1440.0 4305.555556 \n", 1164 | "3 Aundh 3 1521.0 6245.890861 \n", 1165 | "4 Aundh Road 2 1200.0 4250.000000 " 1166 | ] 1167 | }, 1168 | "execution_count": 264, 1169 | "metadata": {}, 1170 | "output_type": "execute_result" 1171 | } 1172 | ], 1173 | "source": [ 1174 | "# Adding a new column of price_per_sqft\n", 1175 | "df1 = df.copy()\n", 1176 | "\n", 1177 | "# In our dataset the price column is in Lakhs\n", 1178 | "df1['price_per_sqft'] = (df1['price']*100000)/df1['new_total_sqft']\n", 1179 | "df1.head()" 1180 | ] 1181 | }, 1182 | { 1183 | "cell_type": "code", 1184 | "execution_count": 265, 1185 | "metadata": { 1186 | "scrolled": true 1187 | }, 1188 | "outputs": [ 1189 | { 1190 | "name": "stdout", 1191 | "output_type": "stream", 1192 | "text": [ 1193 | "97\n" 1194 | ] 1195 | } 1196 | ], 1197 | "source": [ 1198 | "# Checking unique values of 'location' column\n", 1199 | "locations = list(df['site_location'].unique())\n", 1200 | "print(len(locations))" 1201 | ] 1202 | }, 1203 | { 1204 | "cell_type": "code", 1205 | "execution_count": 266, 1206 | "metadata": { 1207 | "scrolled": false 1208 | }, 1209 | "outputs": [ 1210 | { 1211 | "data": { 1212 | "text/plain": [ 1213 | "site_location\n", 1214 | "Pune Railway Station 139\n", 1215 | "Paud Road 139\n", 1216 | "Ganesh Peth 139\n", 1217 | "Mangalwar peth 139\n", 1218 | "Manik Bagh 139\n", 1219 | " ... 
\n", 1220 | "Nagar Road 136\n", 1221 | "Narayangaon 136\n", 1222 | "Fatima Nagar 136\n", 1223 | "Camp 136\n", 1224 | "other 1\n", 1225 | "Name: site_location, Length: 97, dtype: int64" 1226 | ] 1227 | }, 1228 | "execution_count": 266, 1229 | "metadata": {}, 1230 | "output_type": "execute_result" 1231 | } 1232 | ], 1233 | "source": [ 1234 | "# Removing leading and trailing whitespace from the location names\n", 1235 | "df1.site_location = df1.site_location.apply(lambda x: x.strip())\n", 1236 | "\n", 1237 | "# Calculating the number of rows for each unique value in the 'site_location' column\n", 1238 | "location_stats = df1.groupby('site_location')['site_location'].agg('count').sort_values(ascending=False)\n", 1239 | "location_stats" 1240 | ] 1241 | }, 1242 | { 1243 | "cell_type": "code", 1244 | "execution_count": 267, 1245 | "metadata": { 1246 | "scrolled": false 1247 | }, 1248 | "outputs": [ 1249 | { 1250 | "name": "stdout", 1251 | "output_type": "stream", 1252 | "text": [ 1253 | "1 97\n" 1254 | ] 1255 | } 1256 | ], 1257 | "source": [ 1258 | "# Checking locations with 10 or fewer values\n", 1259 | "print(len(location_stats[location_stats<=10]), len(df1.site_location.unique()))" 1260 | ] 1261 | }, 1262 | { 1263 | "cell_type": "code", 1264 | "execution_count": 268, 1265 | "metadata": {}, 1266 | "outputs": [ 1267 | { 1268 | "data": { 1269 | "text/html": [ 1270 | "
\n", 1271 | "\n", 1284 | "\n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | "
area_typeavailabilitybathbalconypricesite_locationbhknew_total_sqftprice_per_sqft
0Super built-up Area19-Dec2.01.039.07Alandi Road21056.03699.810606
1Plot AreaReady To Move5.03.0120.00Ambegaon Budruk42600.04615.384615
2Built-up AreaReady To Move2.03.062.00Anandnagar31440.04305.555556
3Super built-up AreaReady To Move3.01.095.00Aundh31521.06245.890861
4Super built-up AreaReady To Move2.01.051.00Aundh Road21200.04250.000000
\n", 1362 | "
" 1363 | ], 1364 | "text/plain": [ 1365 | " area_type availability bath balcony price \\\n", 1366 | "0 Super built-up Area 19-Dec 2.0 1.0 39.07 \n", 1367 | "1 Plot Area Ready To Move 5.0 3.0 120.00 \n", 1368 | "2 Built-up Area Ready To Move 2.0 3.0 62.00 \n", 1369 | "3 Super built-up Area Ready To Move 3.0 1.0 95.00 \n", 1370 | "4 Super built-up Area Ready To Move 2.0 1.0 51.00 \n", 1371 | "\n", 1372 | " site_location bhk new_total_sqft price_per_sqft \n", 1373 | "0 Alandi Road 2 1056.0 3699.810606 \n", 1374 | "1 Ambegaon Budruk 4 2600.0 4615.384615 \n", 1375 | "2 Anandnagar 3 1440.0 4305.555556 \n", 1376 | "3 Aundh 3 1521.0 6245.890861 \n", 1377 | "4 Aundh Road 2 1200.0 4250.000000 " 1378 | ] 1379 | }, 1380 | "execution_count": 268, 1381 | "metadata": {}, 1382 | "output_type": "execute_result" 1383 | } 1384 | ], 1385 | "source": [ 1386 | "df1.head()" 1387 | ] 1388 | }, 1389 | { 1390 | "cell_type": "code", 1391 | "execution_count": 269, 1392 | "metadata": { 1393 | "scrolled": true 1394 | }, 1395 | "outputs": [ 1396 | { 1397 | "data": { 1398 | "text/plain": [ 1399 | "97" 1400 | ] 1401 | }, 1402 | "execution_count": 269, 1403 | "metadata": {}, 1404 | "output_type": "execute_result" 1405 | } 1406 | ], 1407 | "source": [ 1408 | "# Labelling the locations with less than or equal to 10 occurences to 'other'\n", 1409 | "locations_less_than_10 = location_stats[location_stats<=10]\n", 1410 | "\n", 1411 | "df1.site_location = df1.site_location.apply(lambda x: 'other' if x in locations_less_than_10 else x)\n", 1412 | "len(df1.site_location.unique())" 1413 | ] 1414 | }, 1415 | { 1416 | "cell_type": "code", 1417 | "execution_count": 270, 1418 | "metadata": {}, 1419 | "outputs": [ 1420 | { 1421 | "data": { 1422 | "text/plain": [ 1423 | "availability\n", 1424 | "Ready To Move 10541\n", 1425 | "18-Dec 306\n", 1426 | "18-May 294\n", 1427 | "18-Apr 271\n", 1428 | "18-Aug 199\n", 1429 | " ... 
\n", 1430 | "15-Jun 1\n", 1431 | "15-Dec 1\n", 1432 | "15-Aug 1\n", 1433 | "14-Nov 1\n", 1434 | "14-Jul 1\n", 1435 | "Name: availability, Length: 80, dtype: int64" 1436 | ] 1437 | }, 1438 | "execution_count": 270, 1439 | "metadata": {}, 1440 | "output_type": "execute_result" 1441 | } 1442 | ], 1443 | "source": [ 1444 | "# Checking the unique values in 'availability column'\n", 1445 | "df1.groupby('availability')['availability'].agg('count').sort_values(ascending=False)" 1446 | ] 1447 | }, 1448 | { 1449 | "cell_type": "code", 1450 | "execution_count": 271, 1451 | "metadata": {}, 1452 | "outputs": [ 1453 | { 1454 | "data": { 1455 | "text/plain": [ 1456 | "2" 1457 | ] 1458 | }, 1459 | "execution_count": 271, 1460 | "metadata": {}, 1461 | "output_type": "execute_result" 1462 | } 1463 | ], 1464 | "source": [ 1465 | "# Labelling the dates into Not Ready\n", 1466 | "dates = df1.groupby('availability')['availability'].agg('count').sort_values(ascending=False)\n", 1467 | "\n", 1468 | "dates_not_ready = dates[dates<10000]\n", 1469 | "df1.availability = df1.availability.apply(lambda x: 'Not Ready' if x in dates_not_ready else x)\n", 1470 | "\n", 1471 | "len(df1.availability.unique())" 1472 | ] 1473 | }, 1474 | { 1475 | "cell_type": "code", 1476 | "execution_count": 272, 1477 | "metadata": {}, 1478 | "outputs": [ 1479 | { 1480 | "data": { 1481 | "text/html": [ 1482 | "
\n", 1483 | "\n", 1496 | "\n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | " \n", 1559 | " \n", 1560 | " \n", 1561 | " \n", 1562 | " \n", 1563 | " \n", 1564 | " \n", 1565 | " \n", 1566 | " \n", 1567 | " \n", 1568 | " \n", 1569 | " \n", 1570 | " \n", 1571 | " \n", 1572 | " \n", 1573 | "
area_typeavailabilitybathbalconypricesite_locationbhknew_total_sqftprice_per_sqft
0Super built-up AreaNot Ready2.01.039.07Alandi Road21056.03699.810606
1Plot AreaReady To Move5.03.0120.00Ambegaon Budruk42600.04615.384615
2Built-up AreaReady To Move2.03.062.00Anandnagar31440.04305.555556
3Super built-up AreaReady To Move3.01.095.00Aundh31521.06245.890861
4Super built-up AreaReady To Move2.01.051.00Aundh Road21200.04250.000000
\n", 1574 | "
" 1575 | ], 1576 | "text/plain": [ 1577 | " area_type availability bath balcony price \\\n", 1578 | "0 Super built-up Area Not Ready 2.0 1.0 39.07 \n", 1579 | "1 Plot Area Ready To Move 5.0 3.0 120.00 \n", 1580 | "2 Built-up Area Ready To Move 2.0 3.0 62.00 \n", 1581 | "3 Super built-up Area Ready To Move 3.0 1.0 95.00 \n", 1582 | "4 Super built-up Area Ready To Move 2.0 1.0 51.00 \n", 1583 | "\n", 1584 | " site_location bhk new_total_sqft price_per_sqft \n", 1585 | "0 Alandi Road 2 1056.0 3699.810606 \n", 1586 | "1 Ambegaon Budruk 4 2600.0 4615.384615 \n", 1587 | "2 Anandnagar 3 1440.0 4305.555556 \n", 1588 | "3 Aundh 3 1521.0 6245.890861 \n", 1589 | "4 Aundh Road 2 1200.0 4250.000000 " 1590 | ] 1591 | }, 1592 | "execution_count": 272, 1593 | "metadata": {}, 1594 | "output_type": "execute_result" 1595 | } 1596 | ], 1597 | "source": [ 1598 | "df1.head()" 1599 | ] 1600 | }, 1601 | { 1602 | "cell_type": "code", 1603 | "execution_count": 273, 1604 | "metadata": {}, 1605 | "outputs": [ 1606 | { 1607 | "data": { 1608 | "text/plain": [ 1609 | "area_type\n", 1610 | "Super built-up Area 8778\n", 1611 | "Built-up Area 2402\n", 1612 | "Plot Area 1991\n", 1613 | "Carpet Area 86\n", 1614 | "Name: area_type, dtype: int64" 1615 | ] 1616 | }, 1617 | "execution_count": 273, 1618 | "metadata": {}, 1619 | "output_type": "execute_result" 1620 | } 1621 | ], 1622 | "source": [ 1623 | "# Checking the unique values in 'area_type' column\n", 1624 | "df1.groupby('area_type')['area_type'].agg('count').sort_values(ascending=False)\n", 1625 | "\n", 1626 | "# Since the column has only few unique values, we don't perform any operation" 1627 | ] 1628 | }, 1629 | { 1630 | "cell_type": "markdown", 1631 | "metadata": {}, 1632 | "source": [ 1633 | "## Removing Outliers" 1634 | ] 1635 | }, 1636 | { 1637 | "cell_type": "code", 1638 | "execution_count": 274, 1639 | "metadata": {}, 1640 | "outputs": [ 1641 | { 1642 | "name": "stdout", 1643 | "output_type": "stream", 1644 | "text": [ 1645 | "12513 
13257\n" 1646 | ] 1647 | } 1648 | ], 1649 | "source": [ 1650 | "# Removing the rows that have 1 Room for less than 300sqft\n", 1651 | "\n", 1652 | "df2 = df1[~(df1.new_total_sqft/df1.bhk<300)]\n", 1653 | "print(len(df2), len(df1))" 1654 | ] 1655 | }, 1656 | { 1657 | "cell_type": "code", 1658 | "execution_count": 275, 1659 | "metadata": {}, 1660 | "outputs": [ 1661 | { 1662 | "data": { 1663 | "text/plain": [ 1664 | "count 12513.000000\n", 1665 | "mean 6307.567166\n", 1666 | "std 4160.879784\n", 1667 | "min 267.829813\n", 1668 | "25% 4211.469534\n", 1669 | "50% 5295.138889\n", 1670 | "75% 6916.666667\n", 1671 | "max 176470.588235\n", 1672 | "Name: price_per_sqft, dtype: float64" 1673 | ] 1674 | }, 1675 | "execution_count": 275, 1676 | "metadata": {}, 1677 | "output_type": "execute_result" 1678 | } 1679 | ], 1680 | "source": [ 1681 | "df2.price_per_sqft.describe()" 1682 | ] 1683 | }, 1684 | { 1685 | "cell_type": "code", 1686 | "execution_count": 276, 1687 | "metadata": {}, 1688 | "outputs": [ 1689 | { 1690 | "name": "stdout", 1691 | "output_type": "stream", 1692 | "text": [ 1693 | "12513 10937\n" 1694 | ] 1695 | } 1696 | ], 1697 | "source": [ 1698 | "# Since there is a wide range for 'price_per_sqft' column with min = Rs.267/sqft till max = Rs. 
176470/sqft, we remove the extreme ends using the SD\n", 1699 | "def remove_pps_outliers(df):\n", 1700 | " \n", 1701 | " df_out = pd.DataFrame()\n", 1702 | " \n", 1703 | " for key, sub_df in df.groupby('site_location'):\n", 1704 | " m = np.mean(sub_df.price_per_sqft)\n", 1705 | " sd = np.std(sub_df.price_per_sqft)\n", 1706 | " reduce_df = sub_df[(sub_df.price_per_sqft>(m-sd)) & (sub_df.price_per_sqft<(m+sd))]\n", 1707 | " df_out = pd.concat([df_out, reduce_df], ignore_index=True)\n", 1708 | " \n", 1709 | " return df_out\n", 1710 | "\n", 1711 | "df3 = remove_pps_outliers(df2)\n", 1712 | "print(len(df2), len(df3))" 1713 | ] 1714 | }, 1715 | { 1716 | "cell_type": "code", 1717 | "execution_count": 277, 1718 | "metadata": {}, 1719 | "outputs": [ 1720 | { 1721 | "data": { 1722 | "image/png": "\n", 1723 | "text/plain": [ 1724 | "
" 1725 | ] 1726 | }, 1727 | "metadata": { 1728 | "needs_background": "light" 1729 | }, 1730 | "output_type": "display_data" 1731 | } 1732 | ], 1733 | "source": [ 1734 | "def plot_scatter_chart(df, site_location):\n", 1735 | " bhk2 = df[(df.site_location == site_location) & (df.bhk == 2)]\n", 1736 | " bhk3 = df[(df.site_location == site_location) & (df.bhk == 3)]\n", 1737 | " rcP['figure.figsize'] = (15,10)\n", 1738 | " plt.scatter(bhk2.new_total_sqft, bhk2.price, color='blue', label='2 BHK', s=50)\n", 1739 | " plt.scatter(bhk3.new_total_sqft, bhk3.price, color='green', marker='+', label='3 BHK', s=50)\n", 1740 | " plt.xlabel('Total Square Feet Area')\n", 1741 | " plt.ylabel('Price (in Lakhs)')\n", 1742 | " plt.title(site_location)\n", 1743 | " plt.legend()\n", 1744 | " \n", 1745 | "plot_scatter_chart(df3, 'Hadapsar')" 1746 | ] 1747 | }, 1748 | { 1749 | "cell_type": "code", 1750 | "execution_count": 278, 1751 | "metadata": {}, 1752 | "outputs": [ 1753 | { 1754 | "name": "stdout", 1755 | "output_type": "stream", 1756 | "text": [ 1757 | "10937 7459\n" 1758 | ] 1759 | } 1760 | ], 1761 | "source": [ 1762 | "# Here we observe that 3 BHK cost the same as 2 BHK in the 'Hadapsar' location, hence removing such outliers is necessary\n", 1763 | "def remove_bhk_outliers(df):\n", 1764 | " exclude_indices = np.array([])\n", 1765 | " \n", 1766 | " for site_location, site_location_df in df.groupby('site_location'):\n", 1767 | " bhk_stats = {}\n", 1768 | " \n", 1769 | " for bhk, bhk_df in site_location_df.groupby('bhk'):\n", 1770 | " bhk_stats[bhk] = {\n", 1771 | " 'mean': np.mean(bhk_df.price_per_sqft),\n", 1772 | " 'std': np.std(bhk_df.price_per_sqft),\n", 1773 | " 'count': bhk_df.shape[0]\n", 1774 | " }\n", 1775 | " \n", 1776 | " for bhk, bhk_df in site_location_df.groupby('bhk'):\n", 1777 | " stats = bhk_stats.get(bhk-1)\n", 1778 | " if stats and stats['count']>5:\n", 1779 | " exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)\n",
1780 | " \n", 1781 | " return df.drop(exclude_indices, axis='index')\n", 1782 | "\n", 1783 | "df4 = remove_bhk_outliers(df3)\n", 1784 | "print(len(df3), len(df4))" 1785 | ] 1786 | }, 1787 | { 1788 | "cell_type": "code", 1789 | "execution_count": 279, 1790 | "metadata": {}, 1791 | "outputs": [ 1792 | { 1793 | "data": { 1794 | "image/png": "\n", 1795 | "text/plain": [ 1796 | "
" 1797 | ] 1798 | }, 1799 | "metadata": { 1800 | "needs_background": "light" 1801 | }, 1802 | "output_type": "display_data" 1803 | } 1804 | ], 1805 | "source": [ 1806 | "plot_scatter_chart(df4, 'Hadapsar')" 1807 | ] 1808 | }, 1809 | { 1810 | "cell_type": "code", 1811 | "execution_count": 280, 1812 | "metadata": {}, 1813 | "outputs": [ 1814 | { 1815 | "data": { 1816 | "text/plain": [ 1817 | "Text(0, 0.5, 'Count')" 1818 | ] 1819 | }, 1820 | "execution_count": 280, 1821 | "metadata": {}, 1822 | "output_type": "execute_result" 1823 | }, 1824 | { 1825 | "data": { 1826 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAA4EAAAJNCAYAAACP93C3AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3dfbBkd13n8c+XDAQUMMEMbEziTsDRJVoacQgRdCuCQngog1WgsSiJGDc+AOvzbpBaWXWpiorisiIYIRpYJEREiRCNMbKIrpBMIAkJATOSQMZEMhgIUVw08bt/9Blpwr137gy35z78Xq+qrtv9O6e7f31PnZl5zzn33OruAAAAMIb7rfcEAAAAOHxEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEC2rfcEFuGYY47pHTt2rPc0AAAA1sXVV1/98e7evtSyLRmBO3bsyO7du9d7GgAAAOuiqj6y3DKngwIAAAxEBAIAAAxEBAIAAAxEBAIAAAxEBAIAAAxEBAIAAAxEBAIAAAxEBAIAAAxEBAIAAAxEBAIAAAxEBAIAAAxEBAIAAAxEBAIAAAxEBAIAAAxkYRFYVQ+sqiur6tqquqGqfnYaP7Gq3lNVN1XVm6rqAdP4kdPjPdPyHXOv9aJp/ENV9ZRFzRkAAGCrW+SRwM8keWJ3f12Sk5OcXlWnJvmFJC/v7p1JPpHk7Gn9s5N8oru/IsnLp/VSVSclOTPJVyc5PcmvV9URC5w3AADAlrWwCOyZf5ge3n+6dZInJnnzNH5hkmdO98+YHmda/qSqqmn8ou7+THffnGRPklMWNW8AAICtbKE/E1hVR1TVNUnuSHJ5kr9J8snuvmdaZW+S46b7xyW5NUmm5Xcl+dL58SWeAwAAwEFYaAR2973dfXKS4zM7evfopVabvtYyy5Yb/xxVdU5V7a6q3fv27TvUKQMAAGxph+XqoN39yST/J8mpSY6qqm3TouOT3Dbd35vkhCSZln9Jkjvnx5d4zvx7nN/du7p71/bt2xfxMQAAADa9RV4ddHtVHTXdf1CSb01yY5J3JHnWtNpZSd463b9kepxp+Z91d0/jZ05XDz0xyc4kVy5q3gAAAFvZtgOvcsiOTXLhdCXP+yW5uLvfVlUfSHJRVf2PJO9L8tpp/dcmeX1V7cnsCOCZSdLdN1TVxUk+kOSeJM/v7nsXOG8AAIAtq2YH27aWXbt29e7du9d7GgAAAOuiqq7u7l1LLTssPxMIAADAxrDI00GBDWbHuW9f7yms6Jbznr7eUwAA2PIcCQQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAAB
iICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABiICAQAABjIwiKwqk6oqndU1Y1VdUNV/cg0/t+r6m+r6prp9rS557yoqvZU1Yeq6ilz46dPY3uq6txFzRkAAGCr27bA174nyU9093ur6iFJrq6qy6dlL+/ul82vXFUnJTkzyVcn+bIkf1pVXzktfmWSb0uyN8lVVXVJd39ggXMHAADYkhYWgd19e5Lbp/t3V9WNSY5b4SlnJLmouz+T5Oaq2pPklGnZnu7+cJJU1UXTuiIQAADgIB2Wnwmsqh1Jvj7Je6ahF1TVdVV1QVUdPY0dl+TWuaftncaWGwcAAOAgLTwCq+rBSX4vyY9296eSvCrJo5KcnNmRwl/ev+oST+8Vxu/7PudU1e6q2r1v3741mTsAAMBWs9AIrKr7ZxaAb+jutyRJd3+su+/t7n9N8pv57Cmfe5OcMPf045PctsL45+ju87t7V3fv2r59+9p/GAAAgC1gkVcHrSSvTXJjd//K3Pixc6t9R5Lrp/uXJDmzqo6sqhOT7ExyZZKrkuysqhOr6gGZXTzmkkXNGwAAYCtb5NVBn5Dke5K8v6qumcZ+Osl3V9XJmZ3SeUuSH0iS7r6hqi7O7IIv9yR5fnffmyRV9YIklyU5IskF3X3DAucNAACwZS3y6qB/kaV/nu/SFZ7z0iQvXWL80pWeBwAAwOoclquDAgAAsDGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIGIQAAAgIEsLAKr6oSqekdV3VhVN1TVj0zjD6uqy6vqpunr0dN4VdUrqmpPVV1XVY+Ze62zpvVvqqqzFjVnAACArW6RRwLvSfIT3f3oJKcmeX5VnZTk3CRXdPfOJFdMj5PkqUl2TrdzkrwqmUVjkpckeVySU5K8ZH84AgAAcHAWFoHdfXt3v3e6f3eSG5Mcl+SMJBdOq12Y5JnT/TOSvK5n3p3kqKo6Ns
lTklze3Xd29yeSXJ7k9EXNGwAAYCs7LD8TWFU7knx9kvckeUR3357MQjHJw6fVjkty69zT9k5jy40DAABwkBYegVX14CS/l+RHu/tTK626xFivMH7f9zmnqnZX1e59+/Yd2mQBAAC2uIVGYFXdP7MAfEN3v2Ua/th0mmemr3dM43uTnDD39OOT3LbC+Ofo7vO7e1d379q+ffvafhAAAIAtYpFXB60kr01yY3f/ytyiS5Lsv8LnWUneOjf+3OkqoacmuWs6XfSyJE+uqqOnC8I8eRoDAADgIG1b4Gs/Icn3JHl/VV0zjf10kvOSXFxVZyf5aJJnT8suTfK0JHuSfDrJ85Kku++sqp9PctW03s91950LnDcAAMCWtbAI7O6/yNI/z5ckT1pi/U7y/GVe64IkF6zd7AAAAMZ0WK4OCgAAwMYgAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAYiAgEAAAaybb0nAAdjx7lvX+8prOiW856+3lMAAIAVORIIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwEBEIAAAwkFVFYFU9YTVjAAAAbGyrPRL4v1Y5BgAAwAa2baWFVfWNSR6fZHtV/fjcoocmOWKREwMAAGDtrRiBSR6Q5MHTeg+ZG/9UkmctalIAAAAsxooR2N3vTPLOqvrt7v7IYZoTAAAAC3KgI4H7HVlV5yfZMf+c7n7iIiYFsBHtOPft6z2FFd1y3tPXewoAwCaw2gj83SSvTvKaJPcubjoAAAAs0moj8J7uftVCZwIAAMDCrfZXRPxhVf1wVR1bVQ/bf1vozAAAAFhzqz0SeNb09afmxjrJI9d2OgAAACzSqiKwu09c9EQAAABYvFVFYFU9d6nx7n7d2k4HAACARVrt6aCPnbv/wCRPSvLeJCIQAABgE1nt6aAvnH9cVV+S5PULmREAAAALs9qrg97Xp5PsXMuJAAAAsHir/ZnAP8zsaqBJckSSRye5eFGTAgAAYDFW+zOBL5u7f0+Sj3T33gXMBwAAgAVa1emg3f3OJB9M8pAkRyf550VOCgAAgMVYVQRW1XcmuTLJs5N8Z5L3VNWzFjkxAAAA1t5qLwzz4iSP7e6zuvu5SU5J8t9WekJVXVBVd1TV9XNj/72q/raqrpluT5tb9qKq2lNVH6qqp8yNnz6N7amqcw/u4wEAADBvtRF4v+6+Y+7x36/iub+d5PQlxl/e3SdPt0uTpKpOSnJmkq+envPrVXVEVR2R5JVJnprkpCTfPa0LAADAIVjthWH+uKouS/LG6fF3Jbl0pSd0959X1Y5Vvv4ZSS7q7s8kubmq9mR2tDFJ9nT3h5Okqi6a1v3AKl8XAACAOSsezauqr6iqJ3T3TyX5jSRfm+TrkvxVkvMP8T1fUFXXTaeLHj2NHZfk1rl19k5jy40DAABwCA50SuevJrk7Sbr7Ld394939Y5kdBfzVQ3i/VyV5VJKTk9ye5Jen8Vpi3V5h/PNU1TlVtbuqdu/bt+8QpgYAALD1HSgCd3T3dfcd7O7dSXYc7Jt198e6+97u/tckv5nPnvK5N8kJc6sen+S2FcaXeu3zu3tXd+/avn37wU4NAABgCAeKwAeusOxBB/tmVXXs3MPvSLL/yqGXJDmzqo6sqh
OT7MzsV1JclWRnVZ1YVQ/I7OIxlxzs+wIAADBzoAvDXFVV/6m7f3N+sKrOTnL1Sk+sqjcmOS3JMVW1N8lLkpxWVSdndkrnLUl+IEm6+4aqujizC77ck+T53X3v9DovSHJZkiOSXNDdNxzUJwQAAODfHCgCfzTJ71fVc/LZ6NuV5AGZHclbVnd/9xLDr11h/ZcmeekS45fmAFciBQAAYHVWjMDu/liSx1fVtyT5mmn47d39ZwufGQAAAGtuVb8nsLvfkeQdC54LAAAAC3agC8MAAACwhYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgYhAAACAgSwsAqvqgqq6o6qunxt7WFVdXlU3TV+Pnsarql5RVXuq6rqqeszcc86a1r+pqs5a1HwBAABGsMgjgb+d5PT7jJ2b5Iru3pnkiulxkjw1yc7pdk6SVyWzaEzykiSPS3JKkpfsD0cAAAAO3sIisLv/PMmd9xk+I8mF0/0Lkzxzbvx1PfPuJEdV1bFJnpLk8u6+s7s/keTyfH5YAgAAsEqH+2cCH9HdtyfJ9PXh0/hxSW6dW2/vNLbcOAAAAIdgo1wYppYY6xXGP/8Fqs6pqt1VtXvfvn1rOjkAAICt4nBH4Mem0zwzfb1jGt+b5IS59Y5PctsK45+nu8/v7l3dvWv79u1rPnEAAICt4HBH4CVJ9l/h86wkb50bf+50ldBTk9w1nS56WZInV9XR0wVhnjyNAQAAcAi2LeqFq+qNSU5LckxV7c3sKp/nJbm4qs5O8tEkz55WvzTJ05LsSfLpJM9Lku6+s6p+PslV03o/1933vdgMAAAAq7SwCOzu715m0ZOWWLeTPH+Z17kgyQVrODUAAIBhbZQLwwAAAHAYiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiE
AAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBiEAAAICBrEsEVtUtVfX+qrqmqnZPYw+rqsur6qbp69HTeFXVK6pqT1VdV1WPWY85AwAAbAXreSTwW7r75O7eNT0+N8kV3b0zyRXT4yR5apKd0+2cJK867DMFAADYIjbS6aBnJLlwun9hkmfOjb+uZ96d5KiqOnY9JggAALDZrVcEdpI/qaqrq+qcaewR3X17kkxfHz6NH5fk1rnn7p3GAAAAOEjb1ul9n9Ddt1XVw5NcXlUfXGHdWmKsP2+lWUyekyRf/uVfvjazBAAA2GLW5Uhgd982fb0jye8nOSXJx/af5jl9vWNafW+SE+aefnyS25Z4zfO7e1d379q+ffsipw8AALBpHfYjgVX1xUnu1913T/efnOTnklyS5Kwk501f3zo95ZIkL6iqi5I8Lsld+08bBWDz2HHu29d7Ciu65bynr/cUAOCwWI/TQR+R5Perav/7/053/3FVXZXk4qo6O8lHkzx7Wv/SJE9LsifJp5M87/BPGQAAYGs47BHY3R9O8nVLjP99kictMd5Jnn8YpgYAALDlbaRfEQEAAMCCiUAAAICBiEAAAICBiEAAAICBiEAAAICBrMeviBjWRv8dWYnfkwUAAFudI4EAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAADEYEAAAAD2bbeEwAADmzHuW9f7yms6Jbznr7eUwBglRwJBAAAGIgIBAAAGIgIBAAAGIgIBAAAGIgIBAAAGIgIBAAAGIgIBAAAGIgIBAAAGIgIBAAAGIgIBAAAGIgIBAAAGIgIBAAAGIgIBAAAGIgIBAAAGMimicCqOr2qPlRVe6rq3PWeDwAAwGa0bb0nsBpVdUSSVyb5tiR7k1xVVZd09wfWd2YAwGaw49y3r/cUVnTLeU9f7ykAA9kUEZjklCR7uvvDSVJVFyU5I4kIBABYMBENW8tmicDjktw693hvkset01wAAIDDxH9CrL3q7vWewwFV1bOTPKW7v396/D1JTunuF86tc06Sc6aHX5XkQ2vw1sck+fgavA5rz7bZ2Gyfjcu22bhsm43Lttm4bJuNy7ZZf/++u7cvtWCzHAncm+SEucfHJ7ltfoXuPj/J+Wv5plW1u7t3reVrsjZsm43N9tm4bJuNy7bZuGybjcu22bhsm41ts1wd9KokO6vqxKp6QJIzk1yyznMCAADYdDbFkcDuvqeqXpDksiRHJLmgu29Y52kBAABsOpsiApOkuy9Nculhfts1Pb2UNWXbbGy2z8Zl22xcts3GZdtsXLbNxmXbbGCb4sIwAAAArI3N8jOBAAAArIHhIrCqTqiqd1TVjVV1Q1X9yDT+sKq6vKpumr4ePY1XVb2iqvZU1XVV9Zi51zprWv+mqjprvT7TVlJVR1TV+6rqbdPjE6vqPdP3+E3ThYFSVUdOj/dMy3fMvcaLpvEPVdVT1ueTbD1VdVRVvbmqPjjtP99ov9kYqurHpj/Prq+qN1bVA+0766OqLqiqO6rq+rmxNdtPquobqur903NeUVV1eD/h5rXMtvml6c+066rq96vqqLllS+4PVXX6NL
anqs6dG19yn+PAlto2c8t+sqq6qo6ZHttvDqPltk1VvXDaD26oql+cG7ffbBbdPdQtybFJHjPdf0iSv05yUpJfTHLuNH5ukl+Y7j8tyR8lqSSnJnnPNP6wJB+evh493T96vT/fZr8l+fEkv5PkbdPji5OcOd1/dZIfmu7/cJJXT/fPTPKm6f5JSa5NcmSSE5P8TZIj1vtzbYVbkguTfP90/wFJjrLfrP8tyXFJbk7yoOnxxUm+176zbtvjPyZ5TJLr58bWbD9JcmWSb5ye80dJnrren3mz3JbZNk9Osm26/wtz22bJ/WG6/U2SR05/Dl6b5KTpOUvuc26Htm2m8RMyuyjgR5IcM43Zb9Z52yT5liR/muTI6fHDp6/2m010G+5IYHff3t3vne7fneTGzP4RdUZm/8jN9PWZ0/0zkryuZ96d5KiqOjbJU5Jc3t13dvcnklye5PTD+FG2nKo6PsnTk7xmelxJnpjkzdMq990u+7fXm5M8aVr/jCQXdfdnuvvmJHuSnHJ4PsHWVVUPzewvgtcmSXf/c3d/MvabjWJbkgdV1bYkX5Tk9th31kV3/3mSO+8zvCb7ybTsod39Vz37F9Pr5l6LA1hq23T3n3T3PdPDd2f2e4iT5feHU5Ls6e4Pd/c/J7koyRkH+PuKA1hmv0mSlyf5L0nmL2BhvzmMltk2P5TkvO7+zLTOHdO4/WYTGS4C502nQX19kvckeUR3357MQjHJw6fVjkty69zT9k5jy41z6H41sz/s/3V6/KVJPjn3F/T89/jfvv/T8rum9W2XxXhkkn1Jfqtmp+u+pqq+OPabddfdf5vkZUk+mln83ZXk6th3NpK12k+Om+7fd5y18X2ZHSVKDn7brPT3FYegqr49yd9297X3WWS/WX9fmeSbp9M431lVj53G7TebyLARWFUPTvJ7SX60uz+10qpLjPUK4xyCqnpGkju6++r54SVW7QMss10WY1tmp4O8qru/Psk/ZnZa23Jsn8Nk+vmyMzI79ebLknxxkqcusap9Z+M52G1hGy1IVb04yT1J3rB/aInVbJvDpKq+KMmLk/zMUouXGLNtDq9tmZ1ye2qSn0py8XRUz7bZRIaMwKq6f2YB+Ibufss0/LHplIFMX/cf2t6b2Tnp+x2f5LYVxjk0T0jy7VV1S2anCTwxsyODR02nuCWf+z3+t+//tPxLMjtdwXZZjL1J9nb3e6bHb84sCu036+9bk9zc3fu6+1+SvCXJ42Pf2UjWaj/Zm8+erjg/zhdguoDIM5I8ZzpdMDn4bfPxLL/PcfAeldl/bF07/bvg+CTvrap/F/vNRrA3yVumU3KvzOwMrmNiv9lUhovA6X8qXpvkxu7+lblFlyTZfyWps5K8dW78udPVqE5Nctd0Os9lSZ5cVUdP/xP/5GmMQ9DdL+ru47t7R2YXq/iz7n5Oknckeda02n23y/7t9axp/Z7Gz6zZFRBPTLIzsx8I5wvQ3X+X5Naq+qpp6ElJPhD7zUbw0SSnVtUXTX++7d829p2NY032k2nZ3VV16rStnzv3WhyCqjo9yX9N8u3d/em5RcvtD1cl2Tld0fABmf19dcm0Dy23z3GQuvv93f3w7t4x/btgb2YX9fu72G82gj/I7D/rU1VfmdnFXj4e+83mcqhXlNmstyTflNmh5uuSXDPdnpbZeclXJLlp+vqwaf1K8srMrmr0/iS75tCdo/AAAAV3SURBVF7r+zL7odc9SZ633p9tq9ySnJbPXh30kZn9AbInye/ms1eieuD0eM+0/JFzz3/xtL0+FFcAW8vtcnKS3dO+8weZnQpiv9kAtyQ/m+SDSa5P8vrMrsxm31mfbfHGzH42818y+4fr2Wu5nyTZNW3nv0nya0lqvT/zZrkts232ZPazSvv/PfDqufWX3B+mfzP89bTsxXPjS+5zboe2be6z/JZ89uqg9pt13jaZRd//nr6n703yxLn17Teb5FbTBgAAAGAAw50OCgAAMDIRCAAAMBARCA
AAMBARCAAAMBARCAAAMBARCMC6q6p7q+qaqrq+qn63qr5omfUuraqjvsD3Oq2q7qqq91XVjVX1ki/w9b6vqt5fVddN8z/jC3m9tVJV31tV+6bv6zVV9bpDfJ3Tqurxaz0/ANbPtvWeAAAk+afuPjlJquoNSX4wya/sXzj9kufq7qet0fu9q7ufUVVfnOSaqnpbd199oCdV1RHdfe/c4+Mz+71Yj+nuu6rqwUm2r9EcVzWHA3hTd7/gC3zL05L8Q5L/+wW+DgAbhCOBAGw070ryFVW1YzpS9+uZ/ULiE6rqlqo6Jkmq6rnT0bdrq+r109j2qvq9qrpquj1hpTfq7n9McnWSR1XVEVX1S9PzrquqH5he87SqekdV/U5mv5x63sOT3J1ZJKW7/6G7b56e9w3T3P5qet3rp/Hvrapf2/8CVfW2qjptuv+qqtpdVTdU1c/OrXNLVf1MVf1FkmdX1aOq6o+r6uqqeldV/YfVfnOXe+5S37uq2pFZkP/YdDTxm1f7PgBsXI4EArBhVNW2JE9N8sfT0FcleV53//C0fP96X53ZEbgndPfHq+ph0/r/M8nLu/svqurLk1yW5NErvN+XJjk1yc8nOTvJXd392Ko6MslfVtWfTKuekuRr9gfenGuTfCzJzVV1RZK3dPcfTst+K8kLu/udVfVLq/wWvLi776yqI5JcUVVf293XTcv+X3d/0zTvK5L8YHffVFWPS/LrSZ64xOt9V1V90/7vTXf/VpLzl3nu533vuvvRVfXqJP/Q3S9b5WcAYIMTgQBsBA+qqmum++9K8tokX5bkI9397iXWf2KSN3f3x5Oku++cxr81yUn7YzHJQ6vqId19932e/81V9b4k/5rkvO7ef+Tta6vqWdM6X5JkZ5J/TnLlEgGY7r63qk5P8tgkT0ry8qr6hiQvT3JUd79zWvX1mcXtgXxnVZ2T2d/PxyY5Kcn+CHxTkkynnD4+ye/Ofc4jl3m9zzkd9ADPXfJ7t4o5A7DJiEAANoJ/+5nA/aYY+cdl1q8kvcT4/ZJ8Y3f/0wHe713d/YwlXvOF3X3ZfeZx2grzSHd3kiuTXFlVl2d2BPBXl5lfktyTz/1xjAdO73Nikp9M8tju/kRV/fb+ZZP9c7hfkk/e9/u1Sis9d8nv3VwUArBF+JlAADajKzI7avalSTJ3OuifJJk/8nUwoXRZkh+qqvtPz/3K6cIxy6qqL6uqx8wNnZzZ0ctPJrlr7lTM58ytc0uSk6vqflV1QmanmibJQzMLvbuq6hFZ5shhd38qs9NPnz3Noarq61bzAQ/w3OW+d3cncUQQYAsRgQBsOt19Q5KXJnlnVV2bz15J9D8n2TVd2OUDmV3UZLVek+QDSd47XcTlN3LgM2bun+RlVfXB6XTW70ryI9Oy5yV5ZVX9VZL5o2t/meTmzC4y87LMLnqT7r42yfuS3JDkgmm95TwnydnTZ78hycH8Worlnrvc9+4Pk3yHC8MAbB01O4sFAFiU6Sqbb+vur1nnqQCAI4EAAAAjcSQQAABgII4EAgAADEQEAgAADEQEAgAADEQEAgAADEQEAgAADEQEAgAADOT/A2kwLhWzbwQVAAAAAElFTkSuQmCC\n", 1827 | "text/plain": [ 1828 | "
" 1829 | ] 1830 | }, 1831 | "metadata": { 1832 | "needs_background": "light" 1833 | }, 1834 | "output_type": "display_data" 1835 | } 1836 | ], 1837 | "source": [ 1838 | "plt.hist(df4.price_per_sqft, rwidth=0.5)\n", 1839 | "plt.xlabel('Price Per Square Feet')\n", 1840 | "plt.ylabel('Count')" 1841 | ] 1842 | }, 1843 | { 1844 | "cell_type": "code", 1845 | "execution_count": 281, 1846 | "metadata": {}, 1847 | "outputs": [ 1848 | { 1849 | "data": { 1850 | "text/plain": [ 1851 | "Text(0, 0.5, 'Count')" 1852 | ] 1853 | }, 1854 | "execution_count": 281, 1855 | "metadata": {}, 1856 | "output_type": "execute_result" 1857 | }, 1858 | { 1859 | "data": { 1860 | "image/png": "\n", 1861 | "text/plain": [ 1862 | "
" 1863 | ] 1864 | }, 1865 | "metadata": { 1866 | "needs_background": "light" 1867 | }, 1868 | "output_type": "display_data" 1869 | } 1870 | ], 1871 | "source": [ 1872 | "plt.hist(df4.bath, rwidth=0.5)\n", 1873 | "plt.xlabel('Number of Bathrooms')\n", 1874 | "plt.ylabel('Count')" 1875 | ] 1876 | }, 1877 | { 1878 | "cell_type": "code", 1879 | "execution_count": 282, 1880 | "metadata": { 1881 | "scrolled": true 1882 | }, 1883 | "outputs": [ 1884 | { 1885 | "name": "stdout", 1886 | "output_type": "stream", 1887 | "text": [ 1888 | "7459 7395\n" 1889 | ] 1890 | } 1891 | ], 1892 | "source": [ 1893 | "# Removing the rows that have 'bath' greater than or equal to 'bhk'+2\n", 1894 | "df5 = df4[df4.bath<(df4.bhk+2)]\n", 1895 | "print(len(df4), len(df5))" 1896 | ] 1897 | }, 1898 | { 1899 | "cell_type": "code", 1900 | "execution_count": 283, 1901 | "metadata": {}, 1902 | "outputs": [ 1903 | { 1904 | "data": { 1905 | "text/html": [ 1906 | "
\n", 1907 | "\n", 1920 | "\n", 1921 | " \n", 1922 | " \n", 1923 | " \n", 1924 | " \n", 1925 | " \n", 1926 | " \n", 1927 | " \n", 1928 | " \n", 1929 | " \n", 1930 | " \n", 1931 | " \n", 1932 | " \n", 1933 | " \n", 1934 | " \n", 1935 | " \n", 1936 | " \n", 1937 | " \n", 1938 | " \n", 1939 | " \n", 1940 | " \n", 1941 | " \n", 1942 | " \n", 1943 | " \n", 1944 | " \n", 1945 | " \n", 1946 | " \n", 1947 | " \n", 1948 | " \n", 1949 | " \n", 1950 | " \n", 1951 | " \n", 1952 | " \n", 1953 | " \n", 1954 | " \n", 1955 | " \n", 1956 | " \n", 1957 | " \n", 1958 | " \n", 1959 | " \n", 1960 | " \n", 1961 | " \n", 1962 | " \n", 1963 | " \n", 1964 | " \n", 1965 | " \n", 1966 | " \n", 1967 | " \n", 1968 | " \n", 1969 | " \n", 1970 | " \n", 1971 | " \n", 1972 | " \n", 1973 | " \n", 1974 | " \n", 1975 | " \n", 1976 | " \n", 1977 | " \n", 1978 | " \n", 1979 | " \n", 1980 | " \n", 1981 | " \n", 1982 | " \n", 1983 | " \n", 1984 | " \n", 1985 | " \n", 1986 | " \n", 1987 | " \n", 1988 | " \n", 1989 | " \n", 1990 | " \n", 1991 | " \n", 1992 | " \n", 1993 | " \n", 1994 | " \n", 1995 | " \n", 1996 | " \n", 1997 | "
area_typeavailabilitybathbalconypricesite_locationbhknew_total_sqftprice_per_sqft
10923Super built-up AreaNot Ready2.01.067.0Yerawada21165.05751.072961
10928Built-up AreaReady To Move6.02.0115.0Yerawada63500.03285.714286
10931Built-up AreaNot Ready2.02.0353.0Yerawada53210.010996.884735
10932Super built-up AreaReady To Move2.02.054.0Yerawada21050.05142.857143
10936Super built-up AreaNot Ready2.01.070.0Yerawada21132.06183.745583
\n", 1998 | "
" 1999 | ], 2000 | "text/plain": [ 2001 | " area_type availability bath balcony price \\\n", 2002 | "10923 Super built-up Area Not Ready 2.0 1.0 67.0 \n", 2003 | "10928 Built-up Area Ready To Move 6.0 2.0 115.0 \n", 2004 | "10931 Built-up Area Not Ready 2.0 2.0 353.0 \n", 2005 | "10932 Super built-up Area Ready To Move 2.0 2.0 54.0 \n", 2006 | "10936 Super built-up Area Not Ready 2.0 1.0 70.0 \n", 2007 | "\n", 2008 | " site_location bhk new_total_sqft price_per_sqft \n", 2009 | "10923 Yerawada 2 1165.0 5751.072961 \n", 2010 | "10928 Yerawada 6 3500.0 3285.714286 \n", 2011 | "10931 Yerawada 5 3210.0 10996.884735 \n", 2012 | "10932 Yerawada 2 1050.0 5142.857143 \n", 2013 | "10936 Yerawada 2 1132.0 6183.745583 " 2014 | ] 2015 | }, 2016 | "execution_count": 283, 2017 | "metadata": {}, 2018 | "output_type": "execute_result" 2019 | } 2020 | ], 2021 | "source": [ 2022 | "df5.tail()" 2023 | ] 2024 | }, 2025 | { 2026 | "cell_type": "markdown", 2027 | "metadata": {}, 2028 | "source": [ 2029 | "## Model Building" 2030 | ] 2031 | }, 2032 | { 2033 | "cell_type": "code", 2034 | "execution_count": 284, 2035 | "metadata": {}, 2036 | "outputs": [], 2037 | "source": [ 2038 | "# Removing the unnecessary columns (columns that were added only for removing the outliers)\n", 2039 | "df6 = df5.copy()\n", 2040 | "df6 = df6.drop('price_per_sqft', axis='columns')" 2041 | ] 2042 | }, 2043 | { 2044 | "cell_type": "code", 2045 | "execution_count": 285, 2046 | "metadata": {}, 2047 | "outputs": [ 2048 | { 2049 | "data": { 2050 | "text/html": [ 2051 | "
\n", 2052 | "\n", 2065 | "\n", 2066 | " \n", 2067 | " \n", 2068 | " \n", 2069 | " \n", 2070 | " \n", 2071 | " \n", 2072 | " \n", 2073 | " \n", 2074 | " \n", 2075 | " \n", 2076 | " \n", 2077 | " \n", 2078 | " \n", 2079 | " \n", 2080 | " \n", 2081 | " \n", 2082 | " \n", 2083 | " \n", 2084 | " \n", 2085 | " \n", 2086 | " \n", 2087 | " \n", 2088 | " \n", 2089 | " \n", 2090 | " \n", 2091 | " \n", 2092 | " \n", 2093 | " \n", 2094 | " \n", 2095 | " \n", 2096 | " \n", 2097 | " \n", 2098 | " \n", 2099 | " \n", 2100 | " \n", 2101 | " \n", 2102 | " \n", 2103 | " \n", 2104 | " \n", 2105 | " \n", 2106 | " \n", 2107 | " \n", 2108 | " \n", 2109 | " \n", 2110 | " \n", 2111 | " \n", 2112 | " \n", 2113 | " \n", 2114 | " \n", 2115 | " \n", 2116 | " \n", 2117 | " \n", 2118 | " \n", 2119 | " \n", 2120 | " \n", 2121 | " \n", 2122 | " \n", 2123 | " \n", 2124 | " \n", 2125 | " \n", 2126 | " \n", 2127 | " \n", 2128 | " \n", 2129 | " \n", 2130 | " \n", 2131 | " \n", 2132 | " \n", 2133 | " \n", 2134 | " \n", 2135 | " \n", 2136 | "
area_typeavailabilitybathbalconypricesite_locationbhknew_total_sqft
0Super built-up AreaNot Ready2.01.039.07Alandi Road21056.0
1Plot AreaReady To Move4.01.0245.00Alandi Road42894.0
2Super built-up AreaReady To Move2.02.050.00Alandi Road21084.0
3Super built-up AreaReady To Move2.02.080.00Alandi Road21230.0
4Super built-up AreaReady To Move3.02.0130.00Alandi Road31750.0
\n", 2137 | "
" 2138 | ], 2139 | "text/plain": [ 2140 | " area_type availability bath balcony price site_location \\\n", 2141 | "0 Super built-up Area Not Ready 2.0 1.0 39.07 Alandi Road \n", 2142 | "1 Plot Area Ready To Move 4.0 1.0 245.00 Alandi Road \n", 2143 | "2 Super built-up Area Ready To Move 2.0 2.0 50.00 Alandi Road \n", 2144 | "3 Super built-up Area Ready To Move 2.0 2.0 80.00 Alandi Road \n", 2145 | "4 Super built-up Area Ready To Move 3.0 2.0 130.00 Alandi Road \n", 2146 | "\n", 2147 | " bhk new_total_sqft \n", 2148 | "0 2 1056.0 \n", 2149 | "1 4 2894.0 \n", 2150 | "2 2 1084.0 \n", 2151 | "3 2 1230.0 \n", 2152 | "4 3 1750.0 " 2153 | ] 2154 | }, 2155 | "execution_count": 285, 2156 | "metadata": {}, 2157 | "output_type": "execute_result" 2158 | } 2159 | ], 2160 | "source": [ 2161 | "df6.head()" 2162 | ] 2163 | }, 2164 | { 2165 | "cell_type": "code", 2166 | "execution_count": 286, 2167 | "metadata": {}, 2168 | "outputs": [], 2169 | "source": [ 2170 | "# Converting the categorical_value into numerical_values using get_dummies method\n", 2171 | "dummy_cols = pd.get_dummies(df6.site_location)\n", 2172 | "df6 = pd.concat([df6,dummy_cols], axis='columns')" 2173 | ] 2174 | }, 2175 | { 2176 | "cell_type": "code", 2177 | "execution_count": 287, 2178 | "metadata": {}, 2179 | "outputs": [], 2180 | "source": [ 2181 | "# Converting the categorical_value into numerical_values using get_dummies method\n", 2182 | "dummy_cols = pd.get_dummies(df6.availability).drop('Not Ready', axis='columns')\n", 2183 | "df6 = pd.concat([df6,dummy_cols], axis='columns')" 2184 | ] 2185 | }, 2186 | { 2187 | "cell_type": "code", 2188 | "execution_count": 288, 2189 | "metadata": {}, 2190 | "outputs": [], 2191 | "source": [ 2192 | "# Converting the categorical_value into numerical_values using get_dummies method\n", 2193 | "dummy_cols = pd.get_dummies(df6.area_type).drop('Super built-up Area', axis='columns')\n", 2194 | "df6 = pd.concat([df6,dummy_cols], axis='columns')" 2195 | ] 2196 | }, 2197 | { 2198 | 
"cell_type": "code", 2199 | "execution_count": 289, 2200 | "metadata": { 2201 | "scrolled": false 2202 | }, 2203 | "outputs": [ 2204 | { 2205 | "data": { 2206 | "text/html": [ 2207 | "
\n", 2208 | "\n", 2221 | "\n", 2222 | " \n", 2223 | " \n", 2224 | " \n", 2225 | " \n", 2226 | " \n", 2227 | " \n", 2228 | " \n", 2229 | " \n", 2230 | " \n", 2231 | " \n", 2232 | " \n", 2233 | " \n", 2234 | " \n", 2235 | " \n", 2236 | " \n", 2237 | " \n", 2238 | " \n", 2239 | " \n", 2240 | " \n", 2241 | " \n", 2242 | " \n", 2243 | " \n", 2244 | " \n", 2245 | " \n", 2246 | " \n", 2247 | " \n", 2248 | " \n", 2249 | " \n", 2250 | " \n", 2251 | " \n", 2252 | " \n", 2253 | " \n", 2254 | " \n", 2255 | " \n", 2256 | " \n", 2257 | " \n", 2258 | " \n", 2259 | " \n", 2260 | " \n", 2261 | " \n", 2262 | " \n", 2263 | " \n", 2264 | " \n", 2265 | " \n", 2266 | " \n", 2267 | " \n", 2268 | " \n", 2269 | " \n", 2270 | " \n", 2271 | " \n", 2272 | " \n", 2273 | " \n", 2274 | " \n", 2275 | " \n", 2276 | " \n", 2277 | " \n", 2278 | " \n", 2279 | " \n", 2280 | " \n", 2281 | " \n", 2282 | " \n", 2283 | " \n", 2284 | " \n", 2285 | " \n", 2286 | " \n", 2287 | " \n", 2288 | " \n", 2289 | " \n", 2290 | " \n", 2291 | " \n", 2292 | " \n", 2293 | " \n", 2294 | " \n", 2295 | " \n", 2296 | " \n", 2297 | " \n", 2298 | " \n", 2299 | " \n", 2300 | " \n", 2301 | " \n", 2302 | " \n", 2303 | " \n", 2304 | " \n", 2305 | " \n", 2306 | " \n", 2307 | " \n", 2308 | " \n", 2309 | " \n", 2310 | " \n", 2311 | " \n", 2312 | " \n", 2313 | " \n", 2314 | " \n", 2315 | " \n", 2316 | " \n", 2317 | " \n", 2318 | " \n", 2319 | " \n", 2320 | " \n", 2321 | " \n", 2322 | " \n", 2323 | " \n", 2324 | " \n", 2325 | " \n", 2326 | " \n", 2327 | " \n", 2328 | " \n", 2329 | " \n", 2330 | " \n", 2331 | " \n", 2332 | " \n", 2333 | " \n", 2334 | " \n", 2335 | " \n", 2336 | " \n", 2337 | " \n", 2338 | " \n", 2339 | " \n", 2340 | " \n", 2341 | " \n", 2342 | " \n", 2343 | " \n", 2344 | " \n", 2345 | " \n", 2346 | " \n", 2347 | " \n", 2348 | " \n", 2349 | " \n", 2350 | " \n", 2351 | " \n", 2352 | " \n", 2353 | " \n", 2354 | " \n", 2355 | " \n", 2356 | " \n", 2357 | " \n", 2358 | " \n", 2359 | " \n", 2360 | " \n", 2361 | " \n", 2362 | " 
\n", 2363 | " \n", 2364 | " \n", 2365 | " \n", 2366 | " \n", 2367 | " \n", 2368 | " \n", 2369 | " \n", 2370 | " \n", 2371 | " \n", 2372 | " \n", 2373 | " \n", 2374 | " \n", 2375 | " \n", 2376 | " \n", 2377 | " \n", 2378 | " \n", 2379 | " \n", 2380 | " \n", 2381 | " \n", 2382 | " \n", 2383 | " \n", 2384 | " \n", 2385 | " \n", 2386 | " \n", 2387 | " \n", 2388 | " \n", 2389 | " \n", 2390 | " \n", 2391 | " \n", 2392 | " \n", 2393 | " \n", 2394 | " \n", 2395 | " \n", 2396 | " \n", 2397 | " \n", 2398 | " \n", 2399 | " \n", 2400 | " \n", 2401 | " \n", 2402 | " \n", 2403 | " \n", 2404 | " \n", 2405 | " \n", 2406 | " \n", 2407 | " \n", 2408 | " \n", 2409 | " \n", 2410 | " \n", 2411 | " \n", 2412 | " \n", 2413 | " \n", 2414 | " \n", 2415 | " \n", 2416 | " \n", 2417 | " \n", 2418 | " \n", 2419 | " \n", 2420 | " \n", 2421 | " \n", 2422 | " \n", 2423 | " \n", 2424 | " \n", 2425 | " \n", 2426 | " \n", 2427 | " \n", 2428 | " \n", 2429 | " \n", 2430 | " \n", 2431 | " \n", 2432 | " \n", 2433 | " \n", 2434 | " \n", 2435 | " \n", 2436 | " \n", 2437 | " \n", 2438 | " \n", 2439 | " \n", 2440 | " \n", 2441 | " \n", 2442 | " \n", 2443 | " \n", 2444 | " \n", 2445 | " \n", 2446 | " \n", 2447 | " \n", 2448 | " \n", 2449 | " \n", 2450 | " \n", 2451 | " \n", 2452 | " \n", 2453 | " \n", 2454 | " \n", 2455 | " \n", 2456 | " \n", 2457 | " \n", 2458 | " \n", 2459 | " \n", 2460 | " \n", 2461 | " \n", 2462 | " \n", 2463 | " \n", 2464 | " \n", 2465 | " \n", 2466 | " \n", 2467 | " \n", 2468 | " \n", 2469 | " \n", 2470 | " \n", 2471 | " \n", 2472 | " \n", 2473 | " \n", 2474 | " \n", 2475 | " \n", 2476 | " \n", 2477 | " \n", 2478 | " \n", 2479 | " \n", 2480 | " \n", 2481 | " \n", 2482 | " \n", 2483 | " \n", 2484 | " \n", 2485 | " \n", 2486 | " \n", 2487 | " \n", 2488 | " \n", 2489 | " \n", 2490 | "
bathbalconypricebhknew_total_sqftAlandi RoadAmbegaon BudrukAnandnagarAundhAundh Road...Wadgaon SheriWagholiWakadewadiWanowrieWarjeYerawadaReady To MoveBuilt-up AreaCarpet AreaPlot Area
02.01.039.0721056.010000...0000000000
14.01.0245.0042894.010000...0000001001
22.02.050.0021084.010000...0000001000
32.02.080.0021230.010000...0000001000
43.02.0130.0031750.010000...0000001000
52.01.041.002995.010000...0000001100
82.01.095.0021360.010000...0000001000
92.02.050.0021040.010000...0000001100
103.02.086.0631655.010000...0000001000
125.03.0198.0042790.010000...0000000000
\n", 2491 | "

10 rows × 105 columns

\n", 2492 | "
" 2493 | ], 2494 | "text/plain": [ 2495 | " bath balcony price bhk new_total_sqft Alandi Road Ambegaon Budruk \\\n", 2496 | "0 2.0 1.0 39.07 2 1056.0 1 0 \n", 2497 | "1 4.0 1.0 245.00 4 2894.0 1 0 \n", 2498 | "2 2.0 2.0 50.00 2 1084.0 1 0 \n", 2499 | "3 2.0 2.0 80.00 2 1230.0 1 0 \n", 2500 | "4 3.0 2.0 130.00 3 1750.0 1 0 \n", 2501 | "5 2.0 1.0 41.00 2 995.0 1 0 \n", 2502 | "8 2.0 1.0 95.00 2 1360.0 1 0 \n", 2503 | "9 2.0 2.0 50.00 2 1040.0 1 0 \n", 2504 | "10 3.0 2.0 86.06 3 1655.0 1 0 \n", 2505 | "12 5.0 3.0 198.00 4 2790.0 1 0 \n", 2506 | "\n", 2507 | " Anandnagar Aundh Aundh Road ... Wadgaon Sheri Wagholi Wakadewadi \\\n", 2508 | "0 0 0 0 ... 0 0 0 \n", 2509 | "1 0 0 0 ... 0 0 0 \n", 2510 | "2 0 0 0 ... 0 0 0 \n", 2511 | "3 0 0 0 ... 0 0 0 \n", 2512 | "4 0 0 0 ... 0 0 0 \n", 2513 | "5 0 0 0 ... 0 0 0 \n", 2514 | "8 0 0 0 ... 0 0 0 \n", 2515 | "9 0 0 0 ... 0 0 0 \n", 2516 | "10 0 0 0 ... 0 0 0 \n", 2517 | "12 0 0 0 ... 0 0 0 \n", 2518 | "\n", 2519 | " Wanowrie Warje Yerawada Ready To Move Built-up Area Carpet Area \\\n", 2520 | "0 0 0 0 0 0 0 \n", 2521 | "1 0 0 0 1 0 0 \n", 2522 | "2 0 0 0 1 0 0 \n", 2523 | "3 0 0 0 1 0 0 \n", 2524 | "4 0 0 0 1 0 0 \n", 2525 | "5 0 0 0 1 1 0 \n", 2526 | "8 0 0 0 1 0 0 \n", 2527 | "9 0 0 0 1 1 0 \n", 2528 | "10 0 0 0 1 0 0 \n", 2529 | "12 0 0 0 0 0 0 \n", 2530 | "\n", 2531 | " Plot Area \n", 2532 | "0 0 \n", 2533 | "1 1 \n", 2534 | "2 0 \n", 2535 | "3 0 \n", 2536 | "4 0 \n", 2537 | "5 0 \n", 2538 | "8 0 \n", 2539 | "9 0 \n", 2540 | "10 0 \n", 2541 | "12 0 \n", 2542 | "\n", 2543 | "[10 rows x 105 columns]" 2544 | ] 2545 | }, 2546 | "execution_count": 289, 2547 | "metadata": {}, 2548 | "output_type": "execute_result" 2549 | } 2550 | ], 2551 | "source": [ 2552 | "df6.drop(['area_type','availability','site_location'], axis='columns', inplace=True)\n", 2553 | "df6.head(10)" 2554 | ] 2555 | }, 2556 | { 2557 | "cell_type": "code", 2558 | "execution_count": 290, 2559 | "metadata": {}, 2560 | "outputs": [ 2561 | { 2562 | "data": { 2563 | 
"text/plain": [ 2564 | "(7395, 105)" 2565 | ] 2566 | }, 2567 | "execution_count": 290, 2568 | "metadata": {}, 2569 | "output_type": "execute_result" 2570 | } 2571 | ], 2572 | "source": [ 2573 | "# Size of the dataset\n", 2574 | "df6.shape" 2575 | ] 2576 | }, 2577 | { 2578 | "cell_type": "code", 2579 | "execution_count": 291, 2580 | "metadata": {}, 2581 | "outputs": [], 2582 | "source": [ 2583 | "# Splitting the dataset into features and label\n", 2584 | "X = df6.drop('price', axis='columns')\n", 2585 | "y = df6['price']" 2586 | ] 2587 | }, 2588 | { 2589 | "cell_type": "code", 2590 | "execution_count": 292, 2591 | "metadata": {}, 2592 | "outputs": [], 2593 | "source": [ 2594 | "# Using GridSearchCV to find the best algorithm for this problem\n", 2595 | "from sklearn.model_selection import GridSearchCV\n", 2596 | "from sklearn.model_selection import ShuffleSplit\n", 2597 | "from sklearn.linear_model import LinearRegression\n", 2598 | "from sklearn.linear_model import Lasso\n", 2599 | "from sklearn.tree import DecisionTreeRegressor" 2600 | ] 2601 | }, 2602 | { 2603 | "cell_type": "code", 2604 | "execution_count": 293, 2605 | "metadata": {}, 2606 | "outputs": [ 2607 | { 2608 | "data": { 2609 | "text/html": [ 2610 | "
\n", 2611 | "\n", 2624 | "\n", 2625 | " \n", 2626 | " \n", 2627 | " \n", 2628 | " \n", 2629 | " \n", 2630 | " \n", 2631 | " \n", 2632 | " \n", 2633 | " \n", 2634 | " \n", 2635 | " \n", 2636 | " \n", 2637 | " \n", 2638 | " \n", 2639 | " \n", 2640 | " \n", 2641 | " \n", 2642 | " \n", 2643 | " \n", 2644 | " \n", 2645 | " \n", 2646 | " \n", 2647 | " \n", 2648 | " \n", 2649 | " \n", 2650 | " \n", 2651 | " \n", 2652 | " \n", 2653 | "
modelbest_parametersaccuracy
0linear_regression{'normalize': True}0.835475
1lasso{'alpha': 2, 'selection': 'random'}0.829241
2decision_tree{'criterion': 'mse', 'splitter': 'best'}0.781004
\n", 2654 | "
" 2655 | ], 2656 | "text/plain": [ 2657 | " model best_parameters accuracy\n", 2658 | "0 linear_regression {'normalize': True} 0.835475\n", 2659 | "1 lasso {'alpha': 2, 'selection': 'random'} 0.829241\n", 2660 | "2 decision_tree {'criterion': 'mse', 'splitter': 'best'} 0.781004" 2661 | ] 2662 | }, 2663 | "execution_count": 293, 2664 | "metadata": {}, 2665 | "output_type": "execute_result" 2666 | } 2667 | ], 2668 | "source": [ 2669 | "# Creating a function for GridSearchCV\n", 2670 | "\n", 2671 | "def find_best_model(X, y):\n", 2672 | " models = {\n", 2673 | " 'linear_regression': {\n", 2674 | " 'model': LinearRegression(),\n", 2675 | " 'parameters': {\n", 2676 | " 'normalize': [True,False]\n", 2677 | " }\n", 2678 | " },\n", 2679 | " \n", 2680 | " 'lasso': {\n", 2681 | " 'model': Lasso(),\n", 2682 | " 'parameters': {\n", 2683 | " 'alpha': [1,2],\n", 2684 | " 'selection': ['random', 'cyclic']\n", 2685 | " }\n", 2686 | " },\n", 2687 | " \n", 2688 | " 'decision_tree': {\n", 2689 | " 'model': DecisionTreeRegressor(),\n", 2690 | " 'parameters': {\n", 2691 | " 'criterion': ['mse', 'friedman_mse'],\n", 2692 | " 'splitter': ['best', 'random']\n", 2693 | " }\n", 2694 | " }\n", 2695 | " }\n", 2696 | " \n", 2697 | " scores = []\n", 2698 | " cv_X_y = ShuffleSplit(n_splits=5, test_size=0.20, random_state=0)\n", 2699 | " \n", 2700 | " for model_name, model_params in models.items():\n", 2701 | " gs = GridSearchCV(model_params['model'], model_params['parameters'], cv=cv_X_y, return_train_score=False)\n", 2702 | " gs.fit(X, y)\n", 2703 | " scores.append({\n", 2704 | " 'model': model_name,\n", 2705 | " 'best_parameters': gs.best_params_,\n", 2706 | " 'accuracy': gs.best_score_\n", 2707 | " })\n", 2708 | " \n", 2709 | " return pd.DataFrame(scores, columns=['model', 'best_parameters', 'accuracy'])\n", 2710 | "\n", 2711 | "find_best_model(X, y)" 2712 | ] 2713 | }, 2714 | { 2715 | "cell_type": "markdown", 2716 | "metadata": {}, 2717 | "source": [ 2718 | "#### Since the Linear Regression 
has the highest accuracy, the model selected for this problem is Linear Regression" 2719 | ] 2720 | }, 2721 | { 2722 | "cell_type": "code", 2723 | "execution_count": 294, 2724 | "metadata": {}, 2725 | "outputs": [], 2726 | "source": [ 2727 | "# Splitting the dataset into train and test set\n", 2728 | "from sklearn.model_selection import train_test_split\n", 2729 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=20)" 2730 | ] 2731 | }, 2732 | { 2733 | "cell_type": "code", 2734 | "execution_count": 295, 2735 | "metadata": {}, 2736 | "outputs": [ 2737 | { 2738 | "data": { 2739 | "text/plain": [ 2740 | "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)" 2741 | ] 2742 | }, 2743 | "execution_count": 295, 2744 | "metadata": {}, 2745 | "output_type": "execute_result" 2746 | } 2747 | ], 2748 | "source": [ 2749 | "# Creating Linear Regression Model\n", 2750 | "from sklearn.linear_model import LinearRegression\n", 2751 | "model = LinearRegression(normalize=True)\n", 2752 | "model.fit(X_train, y_train)" 2753 | ] 2754 | }, 2755 | { 2756 | "cell_type": "code", 2757 | "execution_count": 296, 2758 | "metadata": {}, 2759 | "outputs": [ 2760 | { 2761 | "data": { 2762 | "text/plain": [ 2763 | "0.8180571987758956" 2764 | ] 2765 | }, 2766 | "execution_count": 296, 2767 | "metadata": {}, 2768 | "output_type": "execute_result" 2769 | } 2770 | ], 2771 | "source": [ 2772 | "model.score(X_test, y_test)" 2773 | ] 2774 | }, 2775 | { 2776 | "cell_type": "markdown", 2777 | "metadata": {}, 2778 | "source": [ 2779 | "#### Predicting the values using our trained model" 2780 | ] 2781 | }, 2782 | { 2783 | "cell_type": "code", 2784 | "execution_count": 297, 2785 | "metadata": {}, 2786 | "outputs": [ 2787 | { 2788 | "data": { 2789 | "text/plain": [ 2790 | "Index(['bath', 'balcony', 'bhk', 'new_total_sqft', 'Alandi Road',\n", 2791 | " 'Ambegaon Budruk', 'Anandnagar', 'Aundh', 'Aundh Road', 'Balaji Nagar',\n", 2792 | " ...\n", 2793 | " 
'Wadgaon Sheri', 'Wagholi', 'Wakadewadi', 'Wanowrie', 'Warje',\n", 2794 | " 'Yerawada', 'Ready To Move', 'Built-up Area', 'Carpet Area',\n", 2795 | " 'Plot Area'],\n", 2796 | " dtype='object', length=104)" 2797 | ] 2798 | }, 2799 | "execution_count": 297, 2800 | "metadata": {}, 2801 | "output_type": "execute_result" 2802 | } 2803 | ], 2804 | "source": [ 2805 | "X.columns" 2806 | ] 2807 | }, 2808 | { 2809 | "cell_type": "code", 2810 | "execution_count": 298, 2811 | "metadata": {}, 2812 | "outputs": [ 2813 | { 2814 | "data": { 2815 | "text/plain": [ 2816 | "9" 2817 | ] 2818 | }, 2819 | "execution_count": 298, 2820 | "metadata": {}, 2821 | "output_type": "execute_result" 2822 | } 2823 | ], 2824 | "source": [ 2825 | "# For finding the appropriate location\n", 2826 | "np.where(X.columns=='Balaji Nagar')[0][0]" 2827 | ] 2828 | }, 2829 | { 2830 | "cell_type": "code", 2831 | "execution_count": 299, 2832 | "metadata": {}, 2833 | "outputs": [ 2834 | { 2835 | "data": { 2836 | "text/plain": [ 2837 | "101" 2838 | ] 2839 | }, 2840 | "execution_count": 299, 2841 | "metadata": {}, 2842 | "output_type": "execute_result" 2843 | } 2844 | ], 2845 | "source": [ 2846 | "# For finding the appropriate area_type\n", 2847 | "np.where(X.columns=='Built-up Area')[0][0]" 2848 | ] 2849 | }, 2850 | { 2851 | "cell_type": "code", 2852 | "execution_count": 300, 2853 | "metadata": {}, 2854 | "outputs": [ 2855 | { 2856 | "data": { 2857 | "text/plain": [ 2858 | "100" 2859 | ] 2860 | }, 2861 | "execution_count": 300, 2862 | "metadata": {}, 2863 | "output_type": "execute_result" 2864 | } 2865 | ], 2866 | "source": [ 2867 | "# For finding the appropriate availability\n", 2868 | "np.where(X.columns=='Ready To Move')[0][0]" 2869 | ] 2870 | }, 2871 | { 2872 | "cell_type": "code", 2873 | "execution_count": 301, 2874 | "metadata": {}, 2875 | "outputs": [], 2876 | "source": [ 2877 | "# Creating a function to predict values\n", 2878 | "def prediction(location, bhk, bath, balcony, sqft, area_type, 
availability):\n", 2879 | " \n", 2880 | " loc_index, area_index, avail_index = -1,-1,-1\n", 2881 | " \n", 2882 | " if location!='other':\n", 2883 | " loc_index = int(np.where(X.columns==location)[0][0])\n", 2884 | " \n", 2885 | " if area_type!='Super built-up Area':\n", 2886 | " area_index = np.where(X.columns==area_type)[0][0]\n", 2887 | " \n", 2888 | " if availability!='Not Ready': \n", 2889 | " avail_index = np.where(X.columns==availability)[0][0]\n", 2890 | " \n", 2891 | " x = np.zeros(len(X.columns))\n", 2892 | " x[0] = bath\n", 2893 | " x[1] = balcony\n", 2894 | " x[2] = bhk\n", 2895 | " x[3] = sqft\n", 2896 | " \n", 2897 | " if loc_index >= 0:\n", 2898 | " x[loc_index] = 1\n", 2899 | " if area_index >= 0:\n", 2900 | " x[area_index] = 1\n", 2901 | " if avail_index >= 0:\n", 2902 | " x[avail_index] = 1\n", 2903 | " \n", 2904 | " return model.predict([x])[0]" 2905 | ] 2906 | }, 2907 | { 2908 | "cell_type": "code", 2909 | "execution_count": 302, 2910 | "metadata": {}, 2911 | "outputs": [ 2912 | { 2913 | "data": { 2914 | "text/plain": [ 2915 | "52.17049124040433" 2916 | ] 2917 | }, 2918 | "execution_count": 302, 2919 | "metadata": {}, 2920 | "output_type": "execute_result" 2921 | } 2922 | ], 2923 | "source": [ 2924 | "# Prediction 1\n", 2925 | "# Input in the form : Location, BHK, Bath, Balcony, Sqft, area_type, availability.\n", 2926 | "prediction('Balaji Nagar', 2, 2, 2, 1000, 'Built-up Area', 'Ready To Move')" 2927 | ] 2928 | }, 2929 | { 2930 | "cell_type": "code", 2931 | "execution_count": 303, 2932 | "metadata": {}, 2933 | "outputs": [ 2934 | { 2935 | "data": { 2936 | "text/plain": [ 2937 | "52.09543340931981" 2938 | ] 2939 | }, 2940 | "execution_count": 303, 2941 | "metadata": {}, 2942 | "output_type": "execute_result" 2943 | } 2944 | ], 2945 | "source": [ 2946 | "# Prediction 2\n", 2947 | "# Input in the form : Location, BHK, Bath, Balcony, Sqft, area_type, availability.\n", 2948 | "prediction('Hadapsar', 2, 2, 2, 1000, 'Super built-up Area', 'Ready To 
Move')" 2949 | ] 2950 | }, 2951 | { 2952 | "cell_type": "code", 2953 | "execution_count": 304, 2954 | "metadata": {}, 2955 | "outputs": [ 2956 | { 2957 | "data": { 2958 | "text/plain": [ 2959 | "140.1821349541965" 2960 | ] 2961 | }, 2962 | "execution_count": 304, 2963 | "metadata": {}, 2964 | "output_type": "execute_result" 2965 | } 2966 | ], 2967 | "source": [ 2968 | "# Prediction 3\n", 2969 | "# Input in the form : Location, BHK, Bath, Balcony, Sqft, area_type, availability.\n", 2970 | "prediction('Camp', 2, 3, 2, 2000, 'Plot Area', 'Not Ready')" 2971 | ] 2972 | }, 2973 | { 2974 | "cell_type": "code", 2975 | "execution_count": 305, 2976 | "metadata": {}, 2977 | "outputs": [ 2978 | { 2979 | "data": { 2980 | "text/plain": [ 2981 | "140.2386444110721" 2982 | ] 2983 | }, 2984 | "execution_count": 305, 2985 | "metadata": {}, 2986 | "output_type": "execute_result" 2987 | } 2988 | ], 2989 | "source": [ 2990 | "# Prediction 4\n", 2991 | "# Input in the form : Location, BHK, Bath, Balcony, Sqft, area_type, availability.\n", 2992 | "prediction('Baner', 2, 3, 2, 2000, 'Plot Area', 'Not Ready')" 2993 | ] 2994 | } 2995 | ], 2996 | "metadata": { 2997 | "kernelspec": { 2998 | "display_name": "Python 3", 2999 | "language": "python", 3000 | "name": "python3" 3001 | }, 3002 | "language_info": { 3003 | "codemirror_mode": { 3004 | "name": "ipython", 3005 | "version": 3 3006 | }, 3007 | "file_extension": ".py", 3008 | "mimetype": "text/x-python", 3009 | "name": "python", 3010 | "nbconvert_exporter": "python", 3011 | "pygments_lexer": "ipython3", 3012 | "version": "3.7.4" 3013 | } 3014 | }, 3015 | "nbformat": 4, 3016 | "nbformat_minor": 2 3017 | } 3018 | --------------------------------------------------------------------------------