├── README.md
├── requirements.txt
└── Predicting used car prices.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # Car-Price-Prediction
2 | I'll use various machine learning algorithms to predict the price of used cars.
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | appnope==0.1.0
2 | attrs==19.3.0
3 | backcall==0.1.0
4 | bleach==3.1.0
5 | cycler==0.10.0
6 | decorator==4.4.1
7 | defusedxml==0.6.0
8 | entrypoints==0.3
9 | importlib-metadata==0.23
10 | ipykernel==5.1.3
11 | ipython==7.9.0
12 | ipython-genutils==0.2.0
13 | ipywidgets==7.5.1
14 | jedi==0.15.1
15 | Jinja2==2.10.3
16 | joblib==0.14.0
17 | jsonschema==3.1.1
18 | jupyter==1.0.0
19 | jupyter-client==5.3.4
20 | jupyter-console==6.0.0
21 | jupyter-core==4.6.1
22 | kiwisolver==1.1.0
23 | MarkupSafe==1.1.1
24 | matplotlib==3.1.1
25 | mistune==0.8.4
26 | more-itertools==7.2.0
27 | nbconvert==5.6.1
28 | nbformat==4.4.0
29 | notebook==6.0.2
30 | numpy==1.17.4
31 | pandas==0.25.3
32 | pandocfilters==1.4.2
33 | parso==0.5.1
34 | pexpect==4.7.0
35 | pickleshare==0.7.5
36 | prometheus-client==0.7.1
37 | prompt-toolkit==2.0.10
38 | ptyprocess==0.6.0
39 | Pygments==2.4.2
40 | pyparsing==2.4.5
41 | pyrsistent==0.15.5
42 | python-dateutil==2.8.1
43 | pytz==2019.3
44 | pyzmq==18.1.1
45 | qtconsole==4.5.5
46 | scikit-learn==0.21.3
47 | scipy==1.3.2
48 | seaborn==0.9.0
49 | Send2Trash==1.5.0
50 | six==1.13.0
51 | sklearn==0.0
52 | terminado==0.8.3
53 | testpath==0.4.4
54 | tornado==6.0.3
55 | traitlets==4.3.3
56 | wcwidth==0.1.7
57 | webencodings==0.5.1
58 | widgetsnbextension==3.5.1
59 | zipp==0.6.0
60 |
--------------------------------------------------------------------------------
/Predicting used car prices.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Predicting used car prices\n",
8 | "\n",
9 | "In this notebook, I'll work with the [Kaggle](https://www.kaggle.com/avikasliwal/used-cars-price-prediction) dataset about used cars and their prices. The notebook first includes exploration of the dataset followed by prediction of prices."
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "## Import libraries\n",
17 | "\n",
18 | "I'll import `datetime` to compute each car's age from its year, `numpy` to work with arrays, `pandas` to read and manipulate the dataset, `matplotlib` and `seaborn` for plotting, and `sklearn` for splitting the data, scaling it, training the models and scoring them."
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "import datetime\n",
28 | "\n",
29 | "import numpy as np\n",
30 | "import pandas as pd\n",
31 | "\n",
32 | "import matplotlib.pyplot as plt\n",
33 | "import seaborn as sns\n",
34 | "%matplotlib inline\n",
35 | "\n",
36 | "from sklearn.model_selection import train_test_split\n",
37 | "from sklearn.linear_model import LinearRegression\n",
38 | "from sklearn.ensemble import RandomForestRegressor\n",
39 | "from sklearn.preprocessing import StandardScaler\n",
40 | "from sklearn.metrics import r2_score"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {},
46 | "source": [
47 | "## Read dataset\n",
48 | "\n",
49 | "I'll read the dataset and get information about it."
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 2,
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "data": {
59 | "text/html": [
60 | "
\n",
61 | "\n",
74 | "
\n",
75 | " \n",
76 | " \n",
77 | " | \n",
78 | " Unnamed: 0 | \n",
79 | " Name | \n",
80 | " Location | \n",
81 | " Year | \n",
82 | " Kilometers_Driven | \n",
83 | " Fuel_Type | \n",
84 | " Transmission | \n",
85 | " Owner_Type | \n",
86 | " Mileage | \n",
87 | " Engine | \n",
88 | " Power | \n",
89 | " Seats | \n",
90 | " New_Price | \n",
91 | " Price | \n",
92 | "
\n",
93 | " \n",
94 | " \n",
95 | " \n",
96 | " | 0 | \n",
97 | " 0 | \n",
98 | " Maruti Wagon R LXI CNG | \n",
99 | " Mumbai | \n",
100 | " 2010 | \n",
101 | " 72000 | \n",
102 | " CNG | \n",
103 | " Manual | \n",
104 | " First | \n",
105 | " 26.6 km/kg | \n",
106 | " 998 CC | \n",
107 | " 58.16 bhp | \n",
108 | " 5.0 | \n",
109 | " NaN | \n",
110 | " 1.75 | \n",
111 | "
\n",
112 | " \n",
113 | " | 1 | \n",
114 | " 1 | \n",
115 | " Hyundai Creta 1.6 CRDi SX Option | \n",
116 | " Pune | \n",
117 | " 2015 | \n",
118 | " 41000 | \n",
119 | " Diesel | \n",
120 | " Manual | \n",
121 | " First | \n",
122 | " 19.67 kmpl | \n",
123 | " 1582 CC | \n",
124 | " 126.2 bhp | \n",
125 | " 5.0 | \n",
126 | " NaN | \n",
127 | " 12.50 | \n",
128 | "
\n",
129 | " \n",
130 | " | 2 | \n",
131 | " 2 | \n",
132 | " Honda Jazz V | \n",
133 | " Chennai | \n",
134 | " 2011 | \n",
135 | " 46000 | \n",
136 | " Petrol | \n",
137 | " Manual | \n",
138 | " First | \n",
139 | " 18.2 kmpl | \n",
140 | " 1199 CC | \n",
141 | " 88.7 bhp | \n",
142 | " 5.0 | \n",
143 | " 8.61 Lakh | \n",
144 | " 4.50 | \n",
145 | "
\n",
146 | " \n",
147 | " | 3 | \n",
148 | " 3 | \n",
149 | " Maruti Ertiga VDI | \n",
150 | " Chennai | \n",
151 | " 2012 | \n",
152 | " 87000 | \n",
153 | " Diesel | \n",
154 | " Manual | \n",
155 | " First | \n",
156 | " 20.77 kmpl | \n",
157 | " 1248 CC | \n",
158 | " 88.76 bhp | \n",
159 | " 7.0 | \n",
160 | " NaN | \n",
161 | " 6.00 | \n",
162 | "
\n",
163 | " \n",
164 | " | 4 | \n",
165 | " 4 | \n",
166 | " Audi A4 New 2.0 TDI Multitronic | \n",
167 | " Coimbatore | \n",
168 | " 2013 | \n",
169 | " 40670 | \n",
170 | " Diesel | \n",
171 | " Automatic | \n",
172 | " Second | \n",
173 | " 15.2 kmpl | \n",
174 | " 1968 CC | \n",
175 | " 140.8 bhp | \n",
176 | " 5.0 | \n",
177 | " NaN | \n",
178 | " 17.74 | \n",
179 | "
\n",
180 | " \n",
181 | "
\n",
182 | "
"
183 | ],
184 | "text/plain": [
185 | " Unnamed: 0 Name Location Year \\\n",
186 | "0 0 Maruti Wagon R LXI CNG Mumbai 2010 \n",
187 | "1 1 Hyundai Creta 1.6 CRDi SX Option Pune 2015 \n",
188 | "2 2 Honda Jazz V Chennai 2011 \n",
189 | "3 3 Maruti Ertiga VDI Chennai 2012 \n",
190 | "4 4 Audi A4 New 2.0 TDI Multitronic Coimbatore 2013 \n",
191 | "\n",
192 | " Kilometers_Driven Fuel_Type Transmission Owner_Type Mileage Engine \\\n",
193 | "0 72000 CNG Manual First 26.6 km/kg 998 CC \n",
194 | "1 41000 Diesel Manual First 19.67 kmpl 1582 CC \n",
195 | "2 46000 Petrol Manual First 18.2 kmpl 1199 CC \n",
196 | "3 87000 Diesel Manual First 20.77 kmpl 1248 CC \n",
197 | "4 40670 Diesel Automatic Second 15.2 kmpl 1968 CC \n",
198 | "\n",
199 | " Power Seats New_Price Price \n",
200 | "0 58.16 bhp 5.0 NaN 1.75 \n",
201 | "1 126.2 bhp 5.0 NaN 12.50 \n",
202 | "2 88.7 bhp 5.0 8.61 Lakh 4.50 \n",
203 | "3 88.76 bhp 7.0 NaN 6.00 \n",
204 | "4 140.8 bhp 5.0 NaN 17.74 "
205 | ]
206 | },
207 | "execution_count": 2,
208 | "metadata": {},
209 | "output_type": "execute_result"
210 | }
211 | ],
212 | "source": [
213 | "dataset = pd.read_csv(filepath_or_buffer = \"data/dataset.csv\")  # path is relative to the notebook's working directory\n",
214 | "dataset.head()  # head() shows the first 5 rows by default"
215 | ]
216 | },
217 | {
218 | "cell_type": "markdown",
219 | "metadata": {},
220 | "source": [
221 | "Let's first split the dataset into train and test datasets."
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 3,
227 | "metadata": {},
228 | "outputs": [],
229 | "source": [
230 | "X_train, X_test, y_train, y_test = train_test_split(\n",
231 | "    dataset.drop(columns = \"Price\"),  # features: every column except the target\n",
232 | "    dataset[\"Price\"],  # target: the last column of the dataset\n",
233 | "    test_size = 0.3, random_state = 42)"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 4,
239 | "metadata": {},
240 | "outputs": [
241 | {
242 | "name": "stdout",
243 | "output_type": "stream",
244 | "text": [
245 | "\n",
246 | "Int64Index: 4213 entries, 4201 to 860\n",
247 | "Data columns (total 13 columns):\n",
248 | "Unnamed: 0 4213 non-null int64\n",
249 | "Name 4213 non-null object\n",
250 | "Location 4213 non-null object\n",
251 | "Year 4213 non-null int64\n",
252 | "Kilometers_Driven 4213 non-null int64\n",
253 | "Fuel_Type 4213 non-null object\n",
254 | "Transmission 4213 non-null object\n",
255 | "Owner_Type 4213 non-null object\n",
256 | "Mileage 4212 non-null object\n",
257 | "Engine 4189 non-null object\n",
258 | "Power 4189 non-null object\n",
259 | "Seats 4185 non-null float64\n",
260 | "New_Price 580 non-null object\n",
261 | "dtypes: float64(1), int64(3), object(9)\n",
262 | "memory usage: 460.8+ KB\n"
263 | ]
264 | }
265 | ],
266 | "source": [
267 | "X_train.info()"
268 | ]
269 | },
270 | {
271 | "cell_type": "markdown",
272 | "metadata": {},
273 | "source": [
274 | "## Exploratory Data Analysis\n",
275 | "\n",
276 | "Let's explore the various columns and draw information about how useful each column is. I'll also modify the test data based on training data."
277 | ]
278 | },
279 | {
280 | "cell_type": "markdown",
281 | "metadata": {},
282 | "source": [
283 | "### Index\n",
284 | "\n",
285 | "The first column is the index for each data point and hence we can simply remove it."
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 5,
291 | "metadata": {},
292 | "outputs": [],
293 | "source": [
294 | "X_train = X_train.drop(columns = \"Unnamed: 0\", errors = \"ignore\")  # drop the saved row index by name, so re-running this cell cannot strip a real feature\n",
295 | "X_test = X_test.drop(columns = \"Unnamed: 0\", errors = \"ignore\")  # same for the test split"
296 | ]
297 | },
298 | {
299 | "cell_type": "markdown",
300 | "metadata": {},
301 | "source": [
302 | "### Name\n",
303 | "\n",
304 | "Let's explore the various cars in the dataset."
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 6,
310 | "metadata": {},
311 | "outputs": [
312 | {
313 | "data": {
314 | "text/plain": [
315 | "Mahindra XUV500 W8 2WD 35\n",
316 | "Maruti Swift VDI 31\n",
317 | "Maruti Ritz VDi 26\n",
318 | "Hyundai i10 Sportz 25\n",
319 | "Maruti Swift Dzire VDI 24\n",
320 | " ..\n",
321 | "BMW X5 xDrive 30d M Sport 1\n",
322 | "Mini Countryman Cooper D 1\n",
323 | "Mitsubishi Lancer 1.5 SFXi 1\n",
324 | "Hyundai i10 Magna Optional 1.1L 1\n",
325 | "Mercedes-Benz CLA 200 CDI Style 1\n",
326 | "Name: Name, Length: 1592, dtype: int64"
327 | ]
328 | },
329 | "execution_count": 6,
330 | "metadata": {},
331 | "output_type": "execute_result"
332 | }
333 | ],
334 | "source": [
335 | "X_train[\"Name\"].value_counts()"
336 | ]
337 | },
338 | {
339 | "cell_type": "markdown",
340 | "metadata": {},
341 | "source": [
342 | "As it appears, there are several cars in the dataset, some of them with a count higher than 1.\n",
343 | "Sometimes the resale value of a car also depends on manufacturer of car and hence, I'll extract the manufacturer from this column and add it to the dataset."
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "execution_count": 7,
349 | "metadata": {},
350 | "outputs": [],
351 | "source": [
352 | "make_train = X_train.loc[:, \"Name\"].str.split(pat = \" \", expand = True)  # one column per space-separated token of the name\n",
353 | "make_test = X_test.loc[:, \"Name\"].str.split(pat = \" \", expand = True)"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": 8,
359 | "metadata": {},
360 | "outputs": [],
361 | "source": [
362 | "X_train[\"Manufacturer\"] = make_train.iloc[:, 0]  # the first token of the name is used as the manufacturer\n",
363 | "X_test[\"Manufacturer\"] = make_test.iloc[:, 0]"
364 | ]
365 | },
366 | {
367 | "cell_type": "markdown",
368 | "metadata": {},
369 | "source": [
370 | "Let's also confirm that there are no null values and identify all unique values."
371 | ]
372 | },
373 | {
374 | "cell_type": "code",
375 | "execution_count": 9,
376 | "metadata": {},
377 | "outputs": [
378 | {
379 | "data": {
380 | "text/plain": [
381 | "Text(0, 0.5, 'Count of cars')"
382 | ]
383 | },
384 | "execution_count": 9,
385 | "metadata": {},
386 | "output_type": "execute_result"
387 | },
388 | {
389 | "data": {
390 | "image/png": "\n",
391 | "text/plain": [
392 | ""
393 | ]
394 | },
395 | "metadata": {
396 | "needs_background": "light"
397 | },
398 | "output_type": "display_data"
399 | }
400 | ],
401 | "source": [
402 | "plt.figure(figsize = (12, 8))\n",
403 | "plot = sns.countplot(x = 'Manufacturer', data = X_train)  # one bar per manufacturer in the training data\n",
404 | "plt.xticks(rotation = 90)\n",
405 | "for bar in plot.patches:\n",
406 | "    count = bar.get_height()\n",
407 | "    plot.annotate(f\"{count:.0f}\",  # format explicitly so the label is a whole number even when the height is a float\n",
408 | "                  (bar.get_x() + bar.get_width() / 2.0, count),\n",
409 | "                  ha = 'center',\n",
410 | "                  va = 'center',\n",
411 | "                  xytext = (0, 5),  # lift the label 5 points above the top of the bar\n",
412 | "                  textcoords = 'offset points')\n",
413 | "# The trailing ';' on the last call keeps its Text(...) repr out of the cell output.\n",
414 | "plt.title(\"Count of cars based on manufacturers\")\n",
415 | "plt.xlabel(\"Manufacturer\")\n",
416 | "plt.ylabel(\"Count of cars\");"
417 | ]
418 | },
419 | {
420 | "cell_type": "markdown",
421 | "metadata": {},
422 | "source": [
423 | "The largest number of cars in the training data come from the manufacturer **Maruti**, and there are no null values.\n",
424 | "\n",
425 | "I'll also drop the `Name` column."
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": 10,
431 | "metadata": {},
432 | "outputs": [],
433 | "source": [
434 | "X_train = X_train.drop(columns = \"Name\")  # the manufacturer has been extracted, so the raw name is removed\n",
435 | "X_test = X_test.drop(columns = \"Name\")"
436 | ]
437 | },
438 | {
439 | "cell_type": "markdown",
440 | "metadata": {},
441 | "source": [
442 | "### Location\n",
443 | "\n",
444 | "I'll assume that location is not a determinant of a car's price, so I'll remove this column."
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": 11,
450 | "metadata": {},
451 | "outputs": [],
452 | "source": [
453 | "X_train = X_train.drop(columns = \"Location\")  # location is not used as a feature\n",
454 | "X_test = X_test.drop(columns = \"Location\")"
455 | ]
456 | },
457 | {
458 | "cell_type": "markdown",
459 | "metadata": {},
460 | "source": [
461 | "### Year\n",
462 | "\n",
463 | "The year by itself is not very informative; what matters for the resale price is how old the car is, so I'll replace `Year` with the car's age in years (the current year minus `Year`)."
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": 12,
469 | "metadata": {},
470 | "outputs": [],
471 | "source": [
472 | "curr_time = datetime.datetime.now()\n",
473 | "X_train['Year'] = curr_time.year - X_train['Year']  # age in years, relative to the year in which the notebook is run\n",
474 | "X_test['Year'] = curr_time.year - X_test['Year']"
475 | ]
476 | },
477 | {
478 | "cell_type": "markdown",
479 | "metadata": {},
480 | "source": [
481 | "### Fuel_Type, Transmission, and Owner_Type\n",
482 | "\n",
483 | "All these columns are categorical columns which should be converted to dummy variables before being used."
484 | ]
485 | },
486 | {
487 | "cell_type": "markdown",
488 | "metadata": {},
489 | "source": [
490 | "### Kilometers_Driven\n",
491 | "\n",
492 | "`Kilometers_Driven` is a numerical column with a certain range of values."
493 | ]
494 | },
495 | {
496 | "cell_type": "code",
497 | "execution_count": 13,
498 | "metadata": {},
499 | "outputs": [
500 | {
501 | "data": {
502 | "text/plain": [
503 | "4201 77000\n",
504 | "4383 19947\n",
505 | "1779 70963\n",
506 | "4020 115195\n",
507 | "3248 58752\n",
508 | " ... \n",
509 | "3772 27000\n",
510 | "5191 9000\n",
511 | "5226 140000\n",
512 | "5390 76414\n",
513 | "860 98000\n",
514 | "Name: Kilometers_Driven, Length: 4213, dtype: int64"
515 | ]
516 | },
517 | "execution_count": 13,
518 | "metadata": {},
519 | "output_type": "execute_result"
520 | }
521 | ],
522 | "source": [
523 | "X_train[\"Kilometers_Driven\"]"
524 | ]
525 | },
526 | {
527 | "cell_type": "markdown",
528 | "metadata": {},
529 | "source": [
530 | "The values span a very wide range, and the large magnitudes could dominate the prediction, so it is important to scale this column."
531 | ]
532 | },
533 | {
534 | "cell_type": "markdown",
535 | "metadata": {},
536 | "source": [
537 | "### Mileage\n",
538 | "\n",
539 | "This column defines the mileage of the car. We need to extract the numerical value out of each string and save it."
540 | ]
541 | },
542 | {
543 | "cell_type": "code",
544 | "execution_count": 14,
545 | "metadata": {},
546 | "outputs": [],
547 | "source": [
548 | "mileage_train = X_train[\"Mileage\"].str.split(pat = \" \", expand = True)  # column 0 holds the number, e.g. \"26.6\" from \"26.6 km/kg\"\n",
549 | "X_train[\"Mileage\"] = pd.to_numeric(mileage_train.iloc[:, 0], errors = 'coerce')  # entries that cannot be parsed become NaN\n",
550 | "\n",
551 | "mileage_test = X_test[\"Mileage\"].str.split(pat = \" \", expand = True)\n",
552 | "X_test[\"Mileage\"] = pd.to_numeric(mileage_test.iloc[:, 0], errors = 'coerce')"
553 | ]
554 | },
555 | {
556 | "cell_type": "markdown",
557 | "metadata": {},
558 | "source": [
559 | "Let's check for missing values."
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": 15,
565 | "metadata": {},
566 | "outputs": [
567 | {
568 | "name": "stdout",
569 | "output_type": "stream",
570 | "text": [
571 | "1\n",
572 | "1\n"
573 | ]
574 | }
575 | ],
576 | "source": [
577 | "print(X_train[\"Mileage\"].isnull().sum())  # number of missing mileage values in the training data\n",
578 | "print(X_test[\"Mileage\"].isnull().sum())  # number of missing mileage values in the test data"
579 | ]
580 | },
581 | {
582 | "cell_type": "markdown",
583 | "metadata": {},
584 | "source": [
585 | "There is one missing value in each. I'll replace the missing value with the mean value of the column based on the training data."
586 | ]
587 | },
588 | {
589 | "cell_type": "code",
590 | "execution_count": 16,
591 | "metadata": {},
592 | "outputs": [],
593 | "source": [
594 | "X_train[\"Mileage\"] = X_train[\"Mileage\"].fillna(X_train[\"Mileage\"].mean())  # assign back: an in-place fillna on a column selection is not guaranteed to modify the frame\n",
595 | "X_test[\"Mileage\"] = X_test[\"Mileage\"].fillna(X_train[\"Mileage\"].mean())  # the test data is imputed with the training mean"
596 | ]
597 | },
598 | {
599 | "cell_type": "markdown",
600 | "metadata": {},
601 | "source": [
602 | "### Engine, Power and Seats\n",
603 | "\n",
604 | "The `Engine` values are defined in CC so I need to remove `CC` from the data. Similarly, `Power` has bhp, so I'll remove `bhp` from it. Also, as there are missing values in `Engine`, `Power` and `Seats`, I'll again replace them with the mean."
605 | ]
606 | },
607 | {
608 | "cell_type": "code",
609 | "execution_count": 17,
610 | "metadata": {},
611 | "outputs": [],
612 | "source": [
613 | "# Keep only the leading number of strings such as \"998 CC\" and \"58.16 bhp\"; anything that cannot be parsed becomes NaN.\n",
614 | "for frame in (X_train, X_test):\n",
615 | "    for column in (\"Engine\", \"Power\"):\n",
616 | "        leading_token = frame[column].str.split(\" \", expand = True)[0]\n",
617 | "        frame[column] = pd.to_numeric(leading_token, errors = 'coerce')"
622 | ]
623 | },
624 | {
625 | "cell_type": "code",
626 | "execution_count": 18,
627 | "metadata": {},
628 | "outputs": [],
629 | "source": [
630 | "for column in (\"Engine\", \"Power\", \"Seats\"):\n",
631 | "    train_mean = X_train[column].mean()  # the statistic comes from the training data only\n",
632 | "    # Assign the result back: an in-place fillna on a column selection is not guaranteed to modify the frame.\n",
633 | "    X_train[column] = X_train[column].fillna(train_mean)\n",
634 | "    X_test[column] = X_test[column].fillna(train_mean)"
638 | ]
639 | },
640 | {
641 | "cell_type": "markdown",
642 | "metadata": {},
643 | "source": [
644 | "### New Price\n",
645 | "\n",
646 | "As most of the values are missing, I'll drop this column altogether."
647 | ]
648 | },
649 | {
650 | "cell_type": "code",
651 | "execution_count": 19,
652 | "metadata": {},
653 | "outputs": [],
654 | "source": [
655 | "X_train = X_train.drop(columns = [\"New_Price\"])  # mostly missing (580 non-null of 4213 training rows)\n",
656 | "X_test = X_test.drop(columns = [\"New_Price\"])"
657 | ]
658 | },
659 | {
660 | "cell_type": "markdown",
661 | "metadata": {},
662 | "source": [
663 | "## Data Processing\n",
664 | "\n",
665 | "Now that we have worked with the training data, let's create dummy columns for categorical columns before we begin training."
666 | ]
667 | },
668 | {
669 | "cell_type": "code",
670 | "execution_count": 20,
671 | "metadata": {},
672 | "outputs": [],
673 | "source": [
674 | "X_train = pd.get_dummies(\n",
675 | "    data = X_train, drop_first = True,  # the first level of each feature is dropped and acts as the baseline\n",
676 | "    columns = [\"Manufacturer\", \"Fuel_Type\", \"Transmission\", \"Owner_Type\"])"
677 | ]
678 | },
679 | {
680 | "cell_type": "code",
681 | "execution_count": 21,
682 | "metadata": {},
683 | "outputs": [],
684 | "source": [
685 | "X_test = pd.get_dummies(X_test,  # keep every level here: the baseline must be the level that *train* dropped, not the first level present in test\n",
686 | "                        columns = [\"Manufacturer\", \"Fuel_Type\", \"Transmission\", \"Owner_Type\"],\n",
687 | "                        drop_first = False)  # the following cell selects exactly the training columns, which removes the baseline columns again"
688 | ]
689 | },
690 | {
691 | "cell_type": "markdown",
692 | "metadata": {},
693 | "source": [
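694 | "The set of dummy columns may differ between the training and test data, so I'll add any column that is missing from the test data (filled with zeros), drop any column that exists only in the test data, and put the test columns in the same order as the training columns."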
695 | ]
696 | },
697 | {
698 | "cell_type": "code",
699 | "execution_count": 22,
700 | "metadata": {},
701 | "outputs": [],
702 | "source": [
703 | "# Align test to the training layout: absent dummy columns become all-zero, test-only columns are dropped.\n",
704 | "X_test = X_test.reindex(columns = X_train.columns, fill_value = 0)"
707 | ]
708 | },
709 | {
710 | "cell_type": "markdown",
711 | "metadata": {},
712 | "source": [
713 | "Finally, as the last step of data processing, I'll scale the data."
714 | ]
715 | },
716 | {
717 | "cell_type": "code",
718 | "execution_count": 23,
719 | "metadata": {},
720 | "outputs": [],
721 | "source": [
722 | "standardScaler = StandardScaler()\n",
723 | "X_train = standardScaler.fit_transform(X_train)  # learn the scaling statistics on train and scale it in one step\n",
724 | "X_test = standardScaler.transform(X_test)  # reuse the training statistics for the test data"
726 | ]
727 | },
728 | {
729 | "cell_type": "markdown",
730 | "metadata": {},
731 | "source": [
732 | "## Training and predicting\n",
733 | "\n",
734 | "I'll create a **Linear Regression** model and a **Random Forest** model to train on the data and use it for future predictions."
735 | ]
736 | },
737 | {
738 | "cell_type": "code",
739 | "execution_count": 24,
740 | "metadata": {},
741 | "outputs": [
742 | {
743 | "data": {
744 | "text/plain": [
745 | "0.7008908549416721"
746 | ]
747 | },
748 | "execution_count": 24,
749 | "metadata": {},
750 | "output_type": "execute_result"
751 | }
752 | ],
753 | "source": [
754 | "linearRegression = LinearRegression().fit(X_train, y_train)  # fit() returns the fitted estimator itself\n",
755 | "y_pred = linearRegression.predict(X_test)\n",
756 | "r2_score(y_true = y_test, y_pred = y_pred)"
758 | ]
759 | },
760 | {
761 | "cell_type": "code",
762 | "execution_count": 25,
763 | "metadata": {},
764 | "outputs": [
765 | {
766 | "data": {
767 | "text/plain": [
768 | "0.8860868487769373"
769 | ]
770 | },
771 | "execution_count": 25,
772 | "metadata": {},
773 | "output_type": "execute_result"
774 | }
775 | ],
776 | "source": [
777 | "rf = RandomForestRegressor(n_estimators = 100, random_state = 42)  # seeded, like the train/test split, so the score is reproducible\n",
778 | "rf.fit(X_train, y_train)\n",
779 | "y_pred = rf.predict(X_test)\n",
780 | "r2_score(y_test, y_pred)"
781 | ]
782 | },
783 | {
784 | "cell_type": "markdown",
785 | "metadata": {},
786 | "source": [
787 | "The **Random Forest** model performed best, with an R² score of about **0.89** on the test data, compared with about **0.70** for **Linear Regression**."
788 | ]
789 | }
790 | ],
791 | "metadata": {
792 | "kernelspec": {
793 | "display_name": "Python 3",
794 | "language": "python",
795 | "name": "python3"
796 | },
797 | "language_info": {
798 | "codemirror_mode": {
799 | "name": "ipython",
800 | "version": 3
801 | },
802 | "file_extension": ".py",
803 | "mimetype": "text/x-python",
804 | "name": "python",
805 | "nbconvert_exporter": "python",
806 | "pygments_lexer": "ipython3",
807 | "version": "3.7.4"
808 | }
809 | },
810 | "nbformat": 4,
811 | "nbformat_minor": 2
812 | }
813 |
--------------------------------------------------------------------------------