└── Task1.ipynb
/Task1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyN6n0+/ccm+qWvvyeTSUyIs",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | ""
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": null,
32 | "metadata": {
33 | "id": "8ifQa8b7h07u"
34 | },
35 | "outputs": [],
36 | "source": [
37 | "import pandas as pd\n",
38 | "import numpy as np\n",
39 | "from sklearn.model_selection import train_test_split, GridSearchCV\n",
40 | "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
41 | "from sklearn.compose import ColumnTransformer\n",
42 | "from sklearn.pipeline import Pipeline\n",
43 | "from sklearn.impute import SimpleImputer\n",
44 | "from sklearn.ensemble import RandomForestRegressor\n",
45 | "from sklearn.linear_model import LinearRegression\n",
46 | "from sklearn.metrics import mean_squared_error, r2_score\n",
47 | "import joblib\n"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "source": [
53 | "# Synthetic housing dataset. Every draw below uses NumPy's global RNG, so the\n",
54 | "# order of the calls determines the generated values; keep it unchanged.\n",
55 | "np.random.seed(42)\n",
56 | "n_samples = 1000\n",
57 | "# Integer features (randint upper bounds are exclusive)\n",
58 | "square_feet = np.random.randint(500, 5000, n_samples)\n",
59 | "num_bedrooms = np.random.randint(1, 6, n_samples)\n",
60 | "num_bathrooms = np.random.randint(1, 4, n_samples)\n",
61 | "lot_size = np.random.randint(1000, 10000, n_samples)\n",
62 | "year_built = np.random.randint(1900, 2021, n_samples)\n",
63 | "garage_size = np.random.randint(0, 4, n_samples)\n",
64 | "# Categorical features, sampled uniformly from the listed labels\n",
65 | "neighborhoods = ['A', 'B', 'C', 'D']\n",
66 | "neighborhood = np.random.choice(neighborhoods, n_samples)\n",
67 | "styles = ['Ranch', 'Colonial', 'Victorian', 'Modern']\n",
68 | "home_style = np.random.choice(styles, n_samples)\n",
69 | "# Target: linear in the numeric features plus Gaussian noise (sd 10000).\n",
70 | "# neighborhood and home_style do not enter the formula.\n",
71 | "home_value = (100 * square_feet + 10000 * num_bedrooms + 5000 * num_bathrooms +\n",
72 | "              10 * lot_size + 2000 * garage_size + 300 * (year_built - 1900) +\n",
73 | "              np.random.normal(0, 10000, n_samples))\n",
74 | "data = pd.DataFrame(dict(\n",
75 | "    square_feet=square_feet, num_bedrooms=num_bedrooms, num_bathrooms=num_bathrooms,\n",
76 | "    lot_size=lot_size, year_built=year_built, garage_size=garage_size,\n",
77 | "    neighborhood=neighborhood, home_style=home_style, home_value=home_value,\n",
78 | "))"
79 | ],
80 | "metadata": {
81 | "id": "RPgaIFHeh_u0"
82 | },
83 | "execution_count": null,
84 | "outputs": []
85 | },
86 | {
87 | "cell_type": "code",
88 | "source": [
89 | "# Name of the target column; every other column of `data` is a model feature\n",
90 | "target = 'home_value'\n",
91 | "features = data.columns.drop(target)\n"
92 | ],
93 | "metadata": {
94 | "id": "wQZGN6rNiLgJ"
95 | },
96 | "execution_count": null,
97 | "outputs": []
98 | },
99 | {
100 | "cell_type": "code",
101 | "source": [
102 | "# Split the frame into the feature matrix X and the target vector y\n",
103 | "X, y = data[features], data[target]\n",
104 | "assert target not in X.columns"
105 | ],
106 | "metadata": {
107 | "id": "Lq3R19aVGs2p"
108 | },
109 | "execution_count": null,
110 | "outputs": []
111 | },
112 | {
113 | "cell_type": "code",
114 | "source": [
115 | "# Preprocess the data. Columns are selected by dtype family, not exact width:\n",
116 | "# an int32/float32 column would match neither 'int64'/'float64' nor 'object'\n",
117 | "# and would then be silently dropped by ColumnTransformer (remainder='drop').\n",
118 | "numeric_features = X.select_dtypes(include='number').columns\n",
119 | "categorical_features = X.select_dtypes(exclude='number').columns\n",
120 | "# Numeric columns: median imputation, then standardisation\n",
121 | "numeric_transformer = Pipeline(steps=[\n",
122 | "    ('imputer', SimpleImputer(strategy='median')),\n",
123 | "    ('scaler', StandardScaler())])\n",
124 | "# Non-numeric columns: most-frequent imputation, then one-hot encoding;\n",
125 | "# categories unseen during fit are encoded as all zeros instead of raising.\n",
126 | "categorical_transformer = Pipeline(steps=[\n",
127 | "    ('imputer', SimpleImputer(strategy='most_frequent')),\n",
128 | "    ('onehot', OneHotEncoder(handle_unknown='ignore'))])\n",
129 | "preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features),\n",
130 | "                                               ('cat', categorical_transformer, categorical_features)])"
131 | ],
132 | "metadata": {
133 | "id": "8t61bZV2G0Zc"
134 | },
135 | "execution_count": null,
136 | "outputs": []
137 | },
138 | {
139 | "cell_type": "code",
140 | "source": [
141 | "# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable\n",
142 | "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)\n"
143 | ],
144 | "metadata": {
145 | "id": "27s_In1SG7No"
146 | },
147 | "execution_count": null,
148 | "outputs": []
149 | },
150 | {
151 | "cell_type": "code",
152 | "source": [
153 | "# Preprocessing + model in one pipeline; random_state pins the forest so reruns reproduce the same fit\n",
154 | "pipeline = Pipeline(steps=[('preprocessor', preprocessor),\n",
155 | "                           ('regressor', RandomForestRegressor(random_state=42))])"
156 | ],
157 | "metadata": {
158 | "id": "2nAnvTQQG8oN"
159 | },
160 | "execution_count": null,
161 | "outputs": []
162 | },
163 | {
164 | "cell_type": "code",
165 | "source": [
166 | "# Search space for GridSearchCV: 3 x 4 x 3 = 36 settings of the pipeline's 'regressor' step\n",
167 | "param_grid = dict(\n",
168 | "    regressor__n_estimators=[100, 200, 300],\n",
169 | "    regressor__max_depth=[None, 10, 20, 30],\n",
170 | "    regressor__min_samples_split=[2, 5, 10],\n",
171 | ")\n"
172 | ],
173 | "metadata": {
174 | "id": "n1l3BNbhHBCB"
175 | },
176 | "execution_count": null,
177 | "outputs": []
178 | },
179 | {
180 | "cell_type": "code",
181 | "source": [
182 | "# Exhaustive 5-fold cross-validated search over param_grid, scored by negated MSE, on all cores\n",
183 | "grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)\n",
184 | "grid_search.fit(X_train, y_train)"
185 | ],
186 | "metadata": {
187 | "colab": {
188 | "base_uri": "https://localhost:8080/",
189 | "height": 251
190 | },
191 | "id": "Q3Lxq6StHF95",
192 | "outputId": "f4cef710-e3ce-4d39-de63-ba18fbe6a2f5"
193 | },
194 | "execution_count": null,
195 | "outputs": [
196 | {
197 | "output_type": "execute_result",
198 | "data": {
199 | "text/plain": [
200 | "GridSearchCV(cv=5,\n",
201 | " estimator=Pipeline(steps=[('preprocessor',\n",
202 | " ColumnTransformer(transformers=[('num',\n",
203 | " Pipeline(steps=[('imputer',\n",
204 | " SimpleImputer(strategy='median')),\n",
205 | " ('scaler',\n",
206 | " StandardScaler())]),\n",
207 | " Index(['square_feet', 'num_bedrooms', 'num_bathrooms', 'lot_size',\n",
208 | " 'year_built', 'garage_size'],\n",
209 | " dtype='object')),\n",
210 | " ('cat',\n",
211 | " Pipeline(steps=[('imputer',\n",
212 | " SimpleImputer(strategy='most_frequent')),\n",
213 | " ('onehot',\n",
214 | " OneHotEncoder(handle_unknown='ignore'))]),\n",
215 | " Index(['neighborhood', 'home_style'], dtype='object'))])),\n",
216 | " ('regressor', RandomForestRegressor())]),\n",
217 | " n_jobs=-1,\n",
218 | " param_grid={'regressor__max_depth': [None, 10, 20, 30],\n",
219 | " 'regressor__min_samples_split': [2, 5, 10],\n",
220 | " 'regressor__n_estimators': [100, 200, 300]},\n",
221 | " scoring='neg_mean_squared_error')"
222 | ],
223 | "text/html": [
224 | "
GridSearchCV(cv=5,\n",
225 | " estimator=Pipeline(steps=[('preprocessor',\n",
226 | " ColumnTransformer(transformers=[('num',\n",
227 | " Pipeline(steps=[('imputer',\n",
228 | " SimpleImputer(strategy='median')),\n",
229 | " ('scaler',\n",
230 | " StandardScaler())]),\n",
231 | " Index(['square_feet', 'num_bedrooms', 'num_bathrooms', 'lot_size',\n",
232 | " 'year_built', 'garage_size'],\n",
233 | " dtype='object')),\n",
234 | " ('cat',\n",
235 | " Pipeline(steps=[('imputer',\n",
236 | " SimpleImputer(strategy='most_frequent')),\n",
237 | " ('onehot',\n",
238 | " OneHotEncoder(handle_unknown='ignore'))]),\n",
239 | " Index(['neighborhood', 'home_style'], dtype='object'))])),\n",
240 | " ('regressor', RandomForestRegressor())]),\n",
241 | " n_jobs=-1,\n",
242 | " param_grid={'regressor__max_depth': [None, 10, 20, 30],\n",
243 | " 'regressor__min_samples_split': [2, 5, 10],\n",
244 | " 'regressor__n_estimators': [100, 200, 300]},\n",
245 | " scoring='neg_mean_squared_error')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=5,\n",
246 | " estimator=Pipeline(steps=[('preprocessor',\n",
247 | " ColumnTransformer(transformers=[('num',\n",
248 | " Pipeline(steps=[('imputer',\n",
249 | " SimpleImputer(strategy='median')),\n",
250 | " ('scaler',\n",
251 | " StandardScaler())]),\n",
252 | " Index(['square_feet', 'num_bedrooms', 'num_bathrooms', 'lot_size',\n",
253 | " 'year_built', 'garage_size'],\n",
254 | " dtype='object')),\n",
255 | " ('cat',\n",
256 | " Pipeline(steps=[('imputer',\n",
257 | " SimpleImputer(strategy='most_frequent')),\n",
258 | " ('onehot',\n",
259 | " OneHotEncoder(handle_unknown='ignore'))]),\n",
260 | " Index(['neighborhood', 'home_style'], dtype='object'))])),\n",
261 | " ('regressor', RandomForestRegressor())]),\n",
262 | " n_jobs=-1,\n",
263 | " param_grid={'regressor__max_depth': [None, 10, 20, 30],\n",
264 | " 'regressor__min_samples_split': [2, 5, 10],\n",
265 | " 'regressor__n_estimators': [100, 200, 300]},\n",
266 | " scoring='neg_mean_squared_error')Pipeline(steps=[('preprocessor',\n",
267 | " ColumnTransformer(transformers=[('num',\n",
268 | " Pipeline(steps=[('imputer',\n",
269 | " SimpleImputer(strategy='median')),\n",
270 | " ('scaler',\n",
271 | " StandardScaler())]),\n",
272 | " Index(['square_feet', 'num_bedrooms', 'num_bathrooms', 'lot_size',\n",
273 | " 'year_built', 'garage_size'],\n",
274 | " dtype='object')),\n",
275 | " ('cat',\n",
276 | " Pipeline(steps=[('imputer',\n",
277 | " SimpleImputer(strategy='most_frequent')),\n",
278 | " ('onehot',\n",
279 | " OneHotEncoder(handle_unknown='ignore'))]),\n",
280 | " Index(['neighborhood', 'home_style'], dtype='object'))])),\n",
281 | " ('regressor', RandomForestRegressor())])ColumnTransformer(transformers=[('num',\n",
282 | " Pipeline(steps=[('imputer',\n",
283 | " SimpleImputer(strategy='median')),\n",
284 | " ('scaler', StandardScaler())]),\n",
285 | " Index(['square_feet', 'num_bedrooms', 'num_bathrooms', 'lot_size',\n",
286 | " 'year_built', 'garage_size'],\n",
287 | " dtype='object')),\n",
288 | " ('cat',\n",
289 | " Pipeline(steps=[('imputer',\n",
290 | " SimpleImputer(strategy='most_frequent')),\n",
291 | " ('onehot',\n",
292 | " OneHotEncoder(handle_unknown='ignore'))]),\n",
293 | " Index(['neighborhood', 'home_style'], dtype='object'))])Index(['square_feet', 'num_bedrooms', 'num_bathrooms', 'lot_size',\n", 294 | " 'year_built', 'garage_size'],\n", 295 | " dtype='object')
SimpleImputer(strategy='median')
StandardScaler()
Index(['neighborhood', 'home_style'], dtype='object')
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore')
RandomForestRegressor()
Pipeline(steps=[('preprocessor',\n",
398 | " ColumnTransformer(transformers=[('num',\n",
399 | " Pipeline(steps=[('imputer',\n",
400 | " SimpleImputer(strategy='median')),\n",
401 | " ('scaler',\n",
402 | " StandardScaler())]),\n",
403 | " Index(['square_feet', 'num_bedrooms', 'num_bathrooms', 'lot_size',\n",
404 | " 'year_built', 'garage_size'],\n",
405 | " dtype='object')),\n",
406 | " ('cat',\n",
407 | " Pipeline(steps=[('imputer',\n",
408 | " SimpleImputer(strategy='most_frequent')),\n",
409 | " ('onehot',\n",
410 | " OneHotEncoder(handle_unknown='ignore'))]),\n",
411 | " Index(['neighborhood', 'home_style'], dtype='object'))])),\n",
412 | " ('regressor', LinearRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('preprocessor',\n",
413 | " ColumnTransformer(transformers=[('num',\n",
414 | " Pipeline(steps=[('imputer',\n",
415 | " SimpleImputer(strategy='median')),\n",
416 | " ('scaler',\n",
417 | " StandardScaler())]),\n",
418 | " Index(['square_feet', 'num_bedrooms', 'num_bathrooms', 'lot_size',\n",
419 | " 'year_built', 'garage_size'],\n",
420 | " dtype='object')),\n",
421 | " ('cat',\n",
422 | " Pipeline(steps=[('imputer',\n",
423 | " SimpleImputer(strategy='most_frequent')),\n",
424 | " ('onehot',\n",
425 | " OneHotEncoder(handle_unknown='ignore'))]),\n",
426 | " Index(['neighborhood', 'home_style'], dtype='object'))])),\n",
427 | " ('regressor', LinearRegression())])ColumnTransformer(transformers=[('num',\n",
428 | " Pipeline(steps=[('imputer',\n",
429 | " SimpleImputer(strategy='median')),\n",
430 | " ('scaler', StandardScaler())]),\n",
431 | " Index(['square_feet', 'num_bedrooms', 'num_bathrooms', 'lot_size',\n",
432 | " 'year_built', 'garage_size'],\n",
433 | " dtype='object')),\n",
434 | " ('cat',\n",
435 | " Pipeline(steps=[('imputer',\n",
436 | " SimpleImputer(strategy='most_frequent')),\n",
437 | " ('onehot',\n",
438 | " OneHotEncoder(handle_unknown='ignore'))]),\n",
439 | " Index(['neighborhood', 'home_style'], dtype='object'))])Index(['square_feet', 'num_bedrooms', 'num_bathrooms', 'lot_size',\n", 440 | " 'year_built', 'garage_size'],\n", 441 | " dtype='object')
SimpleImputer(strategy='median')
StandardScaler()
Index(['neighborhood', 'home_style'], dtype='object')
SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore')
LinearRegression()