└── Task1.ipynb /Task1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyN6n0+/ccm+qWvvyeTSUyIs", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "8ifQa8b7h07u" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "import pandas as pd\n", 38 | "import numpy as np\n", 39 | "from sklearn.model_selection import train_test_split, GridSearchCV\n", 40 | "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", 41 | "from sklearn.compose import ColumnTransformer\n", 42 | "from sklearn.pipeline import Pipeline\n", 43 | "from sklearn.impute import SimpleImputer\n", 44 | "from sklearn.ensemble import RandomForestRegressor\n", 45 | "from sklearn.linear_model import LinearRegression\n", 46 | "from sklearn.metrics import mean_squared_error, r2_score\n", 47 | "import joblib\n" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "source": [ 53 | "np.random.seed(42)\n", 54 | "n_samples = 1000\n", 55 | "square_feet = np.random.randint(500, 5000, size=n_samples)\n", 56 | "num_bedrooms = np.random.randint(1, 6, size=n_samples)\n", 57 | "num_bathrooms = np.random.randint(1, 4, size=n_samples)\n", 58 | "lot_size = np.random.randint(1000, 10000, size=n_samples)\n", 59 | "year_built = np.random.randint(1900, 2021, size=n_samples)\n", 60 | "garage_size = np.random.randint(0, 4, size=n_samples)\n", 61 | "neighborhoods = ['A', 'B', 'C', 'D']\n", 62 | "neighborhood = np.random.choice(neighborhoods, size=n_samples)\n", 63 | "styles = ['Ranch', 'Colonial', 'Victorian', 'Modern']\n", 64 | "home_style = np.random.choice(styles, size=n_samples)\n", 65 | "home_value = (square_feet * 100 + num_bedrooms * 10000 + num_bathrooms * 5000 +\n", 66 | " lot_size * 10 + garage_size * 2000 + (year_built - 1900) * 300 +\n", 67 | " np.random.normal(0, 10000, size=n_samples))\n", 68 | "data = pd.DataFrame({\n", 69 | " 'square_feet': square_feet,\n", 70 | " 'num_bedrooms': num_bedrooms,\n", 71 | " 'num_bathrooms': num_bathrooms,\n", 72 | " 'lot_size': lot_size,\n", 73 | " 'year_built': year_built,\n", 74 | " 'garage_size': garage_size,\n", 75 | " 'neighborhood': neighborhood,\n", 76 | " 'home_style': home_style,\n", 77 | " 'home_value': home_value\n", 78 | "})" 79 | ], 80 | "metadata": { 81 | "id": "RPgaIFHeh_u0" 82 | }, 83 | "execution_count": null, 84 | "outputs": [] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "source": [ 89 | "# Define the target variable and features\n", 90 | "target = 'home_value'\n", 91 | "features = data.drop(columns=[target]).columns\n" 92 | ], 93 | "metadata": { 94 | "id": "wQZGN6rNiLgJ" 95 | }, 96 | "execution_count": null, 97 | "outputs": [] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "source": [ 102 | "# Separate features and target\n", 103 | "X = data[features]\n", 104 | "y = data[target]" 105 | ], 106 | "metadata": { 107 | "id": "Lq3R19aVGs2p" 108 | }, 109 | "execution_count": null, 110 | "outputs": [] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "source": [ 115 | "# Preprocess the data\n", 116 | "numeric_features = 
X.select_dtypes(include=['int64', 'float64']).columns\n", 117 | "categorical_features = X.select_dtypes(include=['object']).columns\n", 118 | "\n", 119 | "numeric_transformer = Pipeline(steps=[\n", 120 | " ('imputer', SimpleImputer(strategy='median')),\n", 121 | " ('scaler', StandardScaler())])\n", 122 | "\n", 123 | "categorical_transformer = Pipeline(steps=[\n", 124 | " ('imputer', SimpleImputer(strategy='most_frequent')),\n", 125 | " ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n", 126 | "\n", 127 | "preprocessor = ColumnTransformer(\n", 128 | " transformers=[\n", 129 | " ('num', numeric_transformer, numeric_features),\n", 130 | " ('cat', categorical_transformer, categorical_features)])" 131 | ], 132 | "metadata": { 133 | "id": "8t61bZV2G0Zc" 134 | }, 135 | "execution_count": null, 136 | "outputs": [] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "source": [ 141 | "# Split the data into training and testing sets\n", 142 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n" 143 | ], 144 | "metadata": { 145 | "id": "27s_In1SG7No" 146 | }, 147 | "execution_count": null, 148 | "outputs": [] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "source": [ 153 | "# Create a pipeline that includes preprocessing and the model\n", 154 | "pipeline = Pipeline(steps=[('preprocessor', preprocessor),\n", 155 | " ('regressor', RandomForestRegressor())])" 156 | ], 157 | "metadata": { 158 | "id": "2nAnvTQQG8oN" 159 | }, 160 | "execution_count": null, 161 | "outputs": [] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "source": [ 166 | "# Define the parameter grid for GridSearchCV\n", 167 | "param_grid = {\n", 168 | " 'regressor__n_estimators': [100, 200, 300],\n", 169 | " 'regressor__max_depth': [None, 10, 20, 30],\n", 170 | " 'regressor__min_samples_split': [2, 5, 10]\n", 171 | "}\n" 172 | ], 173 | "metadata": { 174 | "id": "n1l3BNbhHBCB" 175 | }, 176 | "execution_count": null, 177 | "outputs": [] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "source": [ 182 | "# Use GridSearchCV to optimize hyperparameters\n", 183 | "grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)\n", 184 | "grid_search.fit(X_train, y_train)" 185 | ], 186 | "metadata": { 187 | "colab": { 188 | "base_uri": "https://localhost:8080/", 189 | "height": 251 190 | }, 191 | "id": "Q3Lxq6StHF95", 192 | "outputId": "f4cef710-e3ce-4d39-de63-ba18fbe6a2f5" 193 | }, 194 | "execution_count": null, 195 | "outputs": [ 196 | { 197 | "output_type": "execute_result", 198 | "data": { 199 | "text/plain": [ 200 | "GridSearchCV(cv=5,\n", 201 | " estimator=Pipeline(steps=[('preprocessor',\n", 202 | " ColumnTransformer(transformers=[('num',\n", 203 | " Pipeline(steps=[('imputer',\n", 204 | " SimpleImputer(strategy='median')),\n", 205 | " ('scaler',\n", 206 | " StandardScaler())]),\n", 207 | " Index(['square_feet', 'num_bedrooms', 'num_bathrooms', 'lot_size',\n", 208 | " 'year_built', 'garage_size'],\n", 209 | " dtype='object')),\n", 210 | " ('cat',\n", 211 | " Pipeline(steps=[('imputer',\n", 212 | " SimpleImputer(strategy='most_frequent')),\n", 213 | " ('onehot',\n", 214 | " OneHotEncoder(handle_unknown='ignore'))]),\n", 215 | " Index(['neighborhood', 'home_style'], dtype='object'))])),\n", 216 | " ('regressor', RandomForestRegressor())]),\n", 217 | " n_jobs=-1,\n", 218 | " param_grid={'regressor__max_depth': [None, 10, 20, 30],\n", 219 | " 'regressor__min_samples_split': [2, 5, 10],\n", 220 | " 'regressor__n_estimators': [100, 200, 300]},\n", 221 | " 
scoring='neg_mean_squared_error')" 222 | ], 223 | "text/html": [ 224 | "
[sklearn HTML estimator widget for GridSearchCV stripped during export; it duplicated the text/plain representation above]
" 296 | ] 297 | }, 298 | "metadata": {}, 299 | "execution_count": 9 300 | } 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "source": [ 306 | "\n", 307 | "# Get the best model\n", 308 | "best_model = grid_search.best_estimator_" 309 | ], 310 | "metadata": { 311 | "id": "s0lLZHKvHL9f" 312 | }, 313 | "execution_count": null, 314 | "outputs": [] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "source": [ 319 | "# Evaluate the model on the test set\n", 320 | "y_pred = best_model.predict(X_test)\n", 321 | "mse = mean_squared_error(y_test, y_pred)\n", 322 | "r2 = r2_score(y_test, y_pred)\n", 323 | "\n", 324 | "print(f'Mean Squared Error: {mse}')\n", 325 | "print(f'R-squared: {r2}')" 326 | ], 327 | "metadata": { 328 | "colab": { 329 | "base_uri": "https://localhost:8080/" 330 | }, 331 | "id": "WeONZnLjHrdi", 332 | "outputId": "ca5fbf28-e4a9-424a-bd5a-44f9691913f1" 333 | }, 334 | "execution_count": null, 335 | "outputs": [ 336 | { 337 | "output_type": "stream", 338 | "name": "stdout", 339 | "text": [ 340 | "Mean Squared Error: 273480135.4046684\n", 341 | "R-squared: 0.9845607574776657\n" 342 | ] 343 | } 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "source": [ 349 | "# Fine-tune the model if necessary (example with Linear Regression)\n", 350 | "pipeline_lr = Pipeline(steps=[('preprocessor', preprocessor),\n", 351 | " ('regressor', LinearRegression())])" 352 | ], 353 | "metadata": { 354 | "id": "PiHFswlYHvrS" 355 | }, 356 | "execution_count": null, 357 | "outputs": [] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "source": [ 362 | "# Train the linear regression model\n", 363 | "pipeline_lr.fit(X_train, y_train)" 364 | ], 365 | "metadata": { 366 | "colab": { 367 | "base_uri": "https://localhost:8080/", 368 | "height": 225 369 | }, 370 | "id": "VeTlK74WH94-", 371 | "outputId": "9cf63745-8010-4a3a-a5b1-12c558130497" 372 | }, 373 | "execution_count": null, 374 | "outputs": [ 375 | { 376 | "output_type": "execute_result", 377 | "data": { 378 | "text/plain": [ 379 | "Pipeline(steps=[('preprocessor',\n", 380 | " ColumnTransformer(transformers=[('num',\n", 381 | " Pipeline(steps=[('imputer',\n", 382 | " SimpleImputer(strategy='median')),\n", 383 | " ('scaler',\n", 384 | " StandardScaler())]),\n", 385 | " Index(['square_feet', 'num_bedrooms', 'num_bathrooms', 'lot_size',\n", 386 | " 'year_built', 'garage_size'],\n", 387 | " dtype='object')),\n", 388 | " ('cat',\n", 389 | " Pipeline(steps=[('imputer',\n", 390 | " SimpleImputer(strategy='most_frequent')),\n", 391 | " ('onehot',\n", 392 | " OneHotEncoder(handle_unknown='ignore'))]),\n", 393 | " Index(['neighborhood', 'home_style'], dtype='object'))])),\n", 394 | " ('regressor', LinearRegression())])" 395 | ], 396 | "text/html": [ 397 | "
[sklearn HTML estimator widget for Pipeline stripped during export; it duplicated the text/plain representation above]
" 442 | ] 443 | }, 444 | "metadata": {}, 445 | "execution_count": 14 446 | } 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "source": [ 452 | "# Evaluate the linear regression model\n", 453 | "y_pred_lr = pipeline_lr.predict(X_test)\n", 454 | "mse_lr = mean_squared_error(y_test, y_pred_lr)\n", 455 | "r2_lr = r2_score(y_test, y_pred_lr)\n", 456 | "\n", 457 | "print(f'Linear Regression Mean Squared Error: {mse_lr}')\n", 458 | "print(f'Linear Regression R-squared: {r2_lr}')\n", 459 | "# Save the best model to a file for future use\n", 460 | "joblib.dump(best_model, 'best_home_value_model.pkl')" 461 | ], 462 | "metadata": { 463 | "colab": { 464 | "base_uri": "https://localhost:8080/" 465 | }, 466 | "id": "3fSVnk5PIDHm", 467 | "outputId": "6a6d9318-2166-46c0-9fb6-c96f83e16742" 468 | }, 469 | "execution_count": null, 470 | "outputs": [ 471 | { 472 | "output_type": "stream", 473 | "name": "stdout", 474 | "text": [ 475 | "Linear Regression Mean Squared Error: 96366997.81335427\n", 476 | "Linear Regression R-squared: 0.9945596288074522\n" 477 | ] 478 | } 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "source": [ 484 | "# Save the best model to a file for future use\n", 485 | "joblib.dump(best_model, 'best_home_value_model.pkl')" 486 | ], 487 | "metadata": { 488 | "colab": { 489 | "base_uri": "https://localhost:8080/" 490 | }, 491 | "id": "ifxvppJIII00", 492 | "outputId": "7ddcf96f-b555-4c00-8b2e-e37c1c5ff1b5" 493 | }, 494 | "execution_count": null, 495 | "outputs": [ 496 | { 497 | "output_type": "execute_result", 498 | "data": { 499 | "text/plain": [ 500 | "['best_home_value_model.pkl']" 501 | ] 502 | }, 503 | "metadata": {}, 504 | "execution_count": 17 505 | } 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "source": [ 511 | "\n", 512 | "# Documentation\n", 513 | "documentation = f\"\"\"\n", 514 | "# Home Value Prediction Model\n", 515 | "\n", 516 | "## Model Architecture\n", 517 | "The model uses a RandomForestRegressor for predicting home values. It includes data preprocessing steps such as handling missing values, encoding categorical variables, and standardizing numerical features.\n", 518 | "\n", 519 | "## Training Process\n", 520 | "1. Load and preprocess the data.\n", 521 | "2. Split the data into training and testing sets.\n", 522 | "3. Train the model using GridSearchCV to find the best hyperparameters.\n", 523 | "4. Evaluate the model using metrics like Mean Squared Error and R-squared.\n", 524 | "5. Fine-tune the model if necessary.\n", 525 | "\n", 526 | "## Usage Instructions\n", 527 | "1. Load the saved model using joblib:\n", 528 | " ```python\n", 529 | " import joblib\n", 530 | " model = joblib.load('best_home_value_model.pkl')\n", 531 | " ```\n", 532 | "2. Prepare the input data for prediction:\n", 533 | " ```python\n", 534 | " # Ensure your input data has the same structure as the training data\n", 535 | " input_data = pd.DataFrame([...]) # Replace with your actual input data\n", 536 | " ```\n", 537 | "3. Make predictions:\n", 538 | " ```python\n", 539 | " predictions = model.predict(input_data)\n", 540 | " ```\n", 541 | "\n", 542 | "## Hyperparameter Optimization\n", 543 | "The hyperparameters of the RandomForestRegressor were optimized using GridSearchCV with the following parameter grid:\n", 544 | "- `n_estimators`: [100, 200, 300]\n", 545 | "- `max_depth`: [None, 10, 20, 30]\n", 546 | "- `min_samples_split`: [2, 5, 10]\n", 547 | "\n", 548 | "## Evaluation\n", 549 | "The model was evaluated using Mean Squared Error and R-squared on the test set. 
The results are as follows:\n", 550 | "- Mean Squared Error: {mse}\n", 551 | "- R-squared: {r2}\n", 552 | "\n", 553 | "For the Linear Regression model:\n", 554 | "- Mean Squared Error: {mse_lr}\n", 555 | "- R-squared: {r2_lr}\n", 556 | "\"\"\"\n", 557 | "\n", 558 | "# Print the documentation\n", 559 | "print(documentation)" 560 | ], 561 | "metadata": { 562 | "colab": { 563 | "base_uri": "https://localhost:8080/" 564 | }, 565 | "id": "hWi5hvdQIib0", 566 | "outputId": "ce50504b-fc27-4657-9c89-6bb9691abd70" 567 | }, 568 | "execution_count": null, 569 | "outputs": [ 570 | { 571 | "output_type": "stream", 572 | "name": "stdout", 573 | "text": [ 574 | "\n", 575 | "# Home Value Prediction Model\n", 576 | "\n", 577 | "## Model Architecture\n", 578 | "The model uses a RandomForestRegressor for predicting home values. It includes data preprocessing steps such as handling missing values, encoding categorical variables, and standardizing numerical features.\n", 579 | "\n", 580 | "## Training Process\n", 581 | "1. Load and preprocess the data.\n", 582 | "2. Split the data into training and testing sets.\n", 583 | "3. Train the model using GridSearchCV to find the best hyperparameters.\n", 584 | "4. Evaluate the model using metrics like Mean Squared Error and R-squared.\n", 585 | "5. Fine-tune the model if necessary.\n", 586 | "\n", 587 | "## Usage Instructions\n", 588 | "1. Load the saved model using joblib:\n", 589 | " ```python\n", 590 | " import joblib\n", 591 | " model = joblib.load('best_home_value_model.pkl')\n", 592 | " ```\n", 593 | "2. Prepare the input data for prediction:\n", 594 | " ```python\n", 595 | " # Ensure your input data has the same structure as the training data\n", 596 | " input_data = pd.DataFrame([...]) # Replace with your actual input data\n", 597 | " ```\n", 598 | "3. Make predictions:\n", 599 | " ```python\n", 600 | " predictions = model.predict(input_data)\n", 601 | " ```\n", 602 | "\n", 603 | "## Hyperparameter Optimization\n", 604 | "The hyperparameters of the RandomForestRegressor were optimized using GridSearchCV with the following parameter grid:\n", 605 | "- `n_estimators`: [100, 200, 300]\n", 606 | "- `max_depth`: [None, 10, 20, 30]\n", 607 | "- `min_samples_split`: [2, 5, 10]\n", 608 | "\n", 609 | "## Evaluation\n", 610 | "The model was evaluated using Mean Squared Error and R-squared on the test set. The results are as follows:\n", 611 | "- Mean Squared Error: 273480135.4046684\n", 612 | "- R-squared: 0.9845607574776657\n", 613 | "\n", 614 | "For the Linear Regression model:\n", 615 | "- Mean Squared Error: 96366997.81335427\n", 616 | "- R-squared: 0.9945596288074522\n", 617 | "\n" 618 | ] 619 | } 620 | ] 621 | } 622 | ] 623 | } --------------------------------------------------------------------------------
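Example usage of the exported model (a minimal sketch, assuming the notebook above has been run end to end so that `best_home_value_model.pkl` exists; the feature names and example values simply mirror the synthetic training schema):

```python
import joblib
import pandas as pd

# Load the full pipeline (imputation, scaling, one-hot encoding, tuned RandomForestRegressor)
model = joblib.load('best_home_value_model.pkl')

# One hypothetical house; the columns must match the training features exactly
new_house = pd.DataFrame([{
    'square_feet': 2400,
    'num_bedrooms': 4,
    'num_bathrooms': 2,
    'lot_size': 6500,
    'year_built': 1995,
    'garage_size': 2,
    'neighborhood': 'B',
    'home_style': 'Colonial',
}])

predicted_value = model.predict(new_house)[0]
print(f'Predicted home value: ${predicted_value:,.0f}')
```

Because the preprocessing steps live inside the saved pipeline, raw feature values can be passed in directly; no manual scaling or encoding is needed at prediction time.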