├── README.md └── Task_1_Data_Analytics.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Prediction-using-Supervised-ML 2 | Data Science and Business Analytics Task-1 (Predict the percentage of a student based on the no. of study hours) 3 | Using a simple linear regression model to forecast the marks of a student based on the number of hours studied per day. 4 | Tool(s) Used - Python (Jupyter Notebook) 5 | -------------------------------------------------------------------------------- /Task_1_Data_Analytics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# **THE SPARKS FOUNDATION**\n", 8 | "\n", 9 | "## **TASK 1 - Prediction using Supervised ML**\n", 10 | "To Predict the percentage of marks of the students based on the number of hours they studied\n", 11 | "\n", 12 | "### *Author - Krishna Bansal*" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 51, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "# importing the required libraries\n", 22 | "import pandas as pd\n", 23 | "import numpy as np\n", 24 | "import matplotlib.pyplot as plt \n", 25 | "import seaborn as sns\n", 26 | "from sklearn.model_selection import train_test_split\n", 27 | "from sklearn.linear_model import LinearRegression\n", 28 | "from sklearn.metrics import mean_absolute_error" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 52, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/html": [ 39 | "
\n", 40 | "\n", 53 | "\n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | "
HoursScores
02.521
15.147
23.227
38.575
43.530
\n", 89 | "
" 90 | ], 91 | "text/plain": [ 92 | " Hours Scores\n", 93 | "0 2.5 21\n", 94 | "1 5.1 47\n", 95 | "2 3.2 27\n", 96 | "3 8.5 75\n", 97 | "4 3.5 30" 98 | ] 99 | }, 100 | "execution_count": 52, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "# Reading the Data \n", 107 | "data = pd.read_csv('http://bit.ly/w-data')\n", 108 | "data.head(5)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 53, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "data": { 118 | "text/plain": [ 119 | "False" 120 | ] 121 | }, 122 | "execution_count": 53, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "# Check whether any cell in the dataset is null (True would mean missing values exist)\n", 129 | "bool(data.isnull().values.any())" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "**There are no null values in the dataset, so we can now visualize the data.**" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 54, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "data": { 146 | "image/png": "\n", 147 | "text/plain": [ 148 | "
" 149 | ] 150 | }, 151 | "metadata": {}, 152 | "output_type": "display_data" 153 | } 154 | ], 155 | "source": [ 156 | "sns.set_style('darkgrid')\n", 157 | "sns.scatterplot(y= data['Scores'], x= data['Hours'])\n", 158 | "plt.title('Marks Vs Study Hours',size=20)\n", 159 | "plt.ylabel('Marks Percentage', size=12)\n", 160 | "plt.xlabel('Hours Studied', size=12)\n", 161 | "plt.show()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "**From the above scatter plot there looks to be correlation between the 'Marks Percentage' and 'Hours Studied', Lets plot a regression line to confirm the correlation.**" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 55, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "image/png": "\n", 179 | "text/plain": [ 180 | "
" 181 | ] 182 | }, 183 | "metadata": {}, 184 | "output_type": "display_data" 185 | }, 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | " Hours Scores\n", 191 | "Hours 1.000000 0.976191\n", 192 | "Scores 0.976191 1.000000\n" 193 | ] 194 | } 195 | ], 196 | "source": [ 197 | "sns.regplot(x= data['Hours'], y= data['Scores'])\n", 198 | "plt.title('Regression Plot',size=20)\n", 199 | "plt.ylabel('Marks Percentage', size=12)\n", 200 | "plt.xlabel('Hours Studied', size=12)\n", 201 | "plt.show()\n", 202 | "print(data.corr())" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "**It is confirmed that the variables are positively correlated.**" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "## Training the Model\n", 217 | "### 1) Splitting the Data" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 56, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "# Defining X and y from the Data\n", 227 | "X = data.iloc[:, :-1].values \n", 228 | "y = data.iloc[:, 1].values\n", 229 | "\n", 230 | "# Spliting the Data in two\n", 231 | "train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "### 2) Fitting the Data into the model" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 57, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "name": "stdout", 248 | "output_type": "stream", 249 | "text": [ 250 | "---------Model Trained---------\n" 251 | ] 252 | } 253 | ], 254 | "source": [ 255 | "regression = LinearRegression()\n", 256 | "regression.fit(train_X, train_y)\n", 257 | "print(\"---------Model Trained---------\")" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "## Predicting the Percentage of Marks" 
265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 58, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/html": [ 275 | "
\n", 276 | "\n", 289 | "\n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | "
HoursPredicted Marks
01.516.844722
13.233.745575
27.475.500624
32.526.786400
45.960.588106
53.839.710582
61.920.821393
\n", 335 | "
" 336 | ], 337 | "text/plain": [ 338 | " Hours Predicted Marks\n", 339 | "0 1.5 16.844722\n", 340 | "1 3.2 33.745575\n", 341 | "2 7.4 75.500624\n", 342 | "3 2.5 26.786400\n", 343 | "4 5.9 60.588106\n", 344 | "5 3.8 39.710582\n", 345 | "6 1.9 20.821393" 346 | ] 347 | }, 348 | "execution_count": 58, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "pred_y = regression.predict(val_X)  # predicted marks for the held-out study hours\n", 355 | "prediction = pd.DataFrame({'Hours': val_X[:, 0], 'Predicted Marks': pred_y})  # val_X is 2-D with a single column\n", 356 | "prediction" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "## Comparing the Predicted Marks with the Actual Marks" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 59, 369 | "metadata": {}, 370 | "outputs": [ 371 | { 372 | "data": { 373 | "text/html": [ 374 | "
\n", 375 | "\n", 388 | "\n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | "
Actual MarksPredicted Marks
02016.844722
12733.745575
26975.500624
33026.786400
46260.588106
53539.710582
62420.821393
\n", 434 | "
" 435 | ], 436 | "text/plain": [ 437 | " Actual Marks Predicted Marks\n", 438 | "0 20 16.844722\n", 439 | "1 27 33.745575\n", 440 | "2 69 75.500624\n", 441 | "3 30 26.786400\n", 442 | "4 62 60.588106\n", 443 | "5 35 39.710582\n", 444 | "6 24 20.821393" 445 | ] 446 | }, 447 | "execution_count": 59, 448 | "metadata": {}, 449 | "output_type": "execute_result" 450 | } 451 | ], 452 | "source": [ 453 | "compare_scores = pd.DataFrame({'Actual Marks': val_y, 'Predicted Marks': pred_y})\n", 454 | "compare_scores" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": {}, 460 | "source": [ 461 | "## Visually Comparing the Predicted Marks with the Actual Marks" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 60, 467 | "metadata": {}, 468 | "outputs": [ 469 | { 470 | "data": { 471 | "image/png": "\n", 472 | "text/plain": [ 473 | "
" 474 | ] 475 | }, 476 | "metadata": {}, 477 | "output_type": "display_data" 478 | } 479 | ], 480 | "source": [ 481 | "plt.scatter(x=val_X, y=val_y, color='blue')\n", 482 | "plt.plot(val_X, pred_y, color='Black')\n", 483 | "plt.title('Actual vs Predicted', size=20)\n", 484 | "plt.ylabel('Marks Percentage', size=12)\n", 485 | "plt.xlabel('Hours Studied', size=12)\n", 486 | "plt.show()" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "## Evaluating the Model" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 61, 499 | "metadata": {}, 500 | "outputs": [ 501 | { 502 | "name": "stdout", 503 | "output_type": "stream", 504 | "text": [ 505 | "Mean absolute error: 4.130879918502486\n" 506 | ] 507 | } 508 | ], 509 | "source": [ 510 | "# Calculating the mean absolute error of the predictions on the validation set\n", 511 | "print('Mean absolute error: ',mean_absolute_error(val_y,pred_y))" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "**The small mean absolute error (about 4.13 marks on the validation set) indicates that the predictions of the model are, on average, close to the actual marks.**" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "## What will be the predicted score of a student if he/she studies for 9.25 hrs/ day?" 
526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": 62, 531 | "metadata": {}, 532 | "outputs": [ 533 | { 534 | "name": "stdout", 535 | "output_type": "stream", 536 | "text": [ 537 | "Score = 93.893\n" 538 | ] 539 | } 540 | ], 541 | "source": [ 542 | "hours = [9.25]\n", 543 | "answer = regression.predict([hours])\n", 544 | "print(\"Score = {}\".format(round(answer[0],3)))" 545 | ] 546 | }, 547 | { 548 | "cell_type": "markdown", 549 | "metadata": {}, 550 | "source": [ 551 | "**According to the regression model if a student studies for 9.25 hours a day he/she is likely to score 93.89 marks.**" 552 | ] 553 | } 554 | ], 555 | "metadata": { 556 | "kernelspec": { 557 | "display_name": "Python 3", 558 | "language": "python", 559 | "name": "python3" 560 | }, 561 | "language_info": { 562 | "codemirror_mode": { 563 | "name": "ipython", 564 | "version": 3 565 | }, 566 | "file_extension": ".py", 567 | "mimetype": "text/x-python", 568 | "name": "python", 569 | "nbconvert_exporter": "python", 570 | "pygments_lexer": "ipython3", 571 | "version": "3.8.3" 572 | } 573 | }, 574 | "nbformat": 4, 575 | "nbformat_minor": 4 576 | } 577 | --------------------------------------------------------------------------------