├── .gitignore ├── LICENSE ├── PaperPenWorkOut ├── chanakya.pdf ├── classify_05.csv ├── classify_25.csv ├── creditRisk.csv ├── regression_05.csv └── regression_25.csv ├── README.md ├── bank-marketing ├── data │ ├── test.csv │ └── train.csv └── notebook │ ├── 1. Bank Marketing - Logistic Regression.ipynb │ ├── 2. Bank Marketing - Decision Tree.ipynb │ ├── 3. Alternate Encoding Mechanism.ipynb │ ├── 4. Model Complexity.ipynb │ └── img │ ├── Precisionrecall.png │ ├── accuracy_metrics.png │ ├── biasvariance_0.png │ ├── biasvariance_1.png │ ├── biasvariance_2.png │ ├── biasvariance_3.png │ ├── biasvariance_4.png │ ├── logit.png │ ├── overfitting_1.jpg │ ├── overfitting_2.png │ └── tpr_fpr.png ├── cars ├── Acquire.ipynb ├── Explore.ipynb ├── Linear Regression.xlsx ├── Model.ipynb ├── Refine.ipynb └── data │ ├── cars.csv │ └── cars.tidy.csv ├── cheatsheets ├── data-wrangling-cheatsheet.pdf └── ggplot2-cheatsheet-2.0.pdf ├── curriculum.md ├── img ├── ISLR.jpeg ├── acquire.jpg ├── amit.png ├── approach.jpg ├── art.jpeg ├── bank.jpg ├── bargava.jpg ├── book.png ├── books.jpg ├── break.jpg ├── cars.jpg ├── clay.jpeg ├── confusion-matrix.jpg ├── corr.svg ├── craft.jpeg ├── data_analysis.png ├── datascienceinR.png ├── diamond-clarity.png ├── diamond-colors.png ├── diamonds.jpg ├── estimating_coefficients.png ├── explore.jpg ├── frame.jpg ├── glass.jpg ├── harddisk.jpg ├── hari.jpg ├── hasgeek.png ├── insight.jpg ├── kaggle.png ├── kaggle_short.png ├── lens.jpeg ├── list.jpg ├── model.jpg ├── nischal.jpg ├── numbers.jpg ├── onion-image.jpg ├── onion.jpg ├── onion.png ├── onion_small.png ├── onion_tables.png ├── overview.jpg ├── pair.jpg ├── peeling_the_onion_small.png ├── postit.jpg ├── problems.png ├── r2.gif ├── r_squared.png ├── raghottam.jpg ├── refine.jpg ├── retail.jpg ├── science.jpeg ├── see.jpeg ├── shrayas.jpg ├── single.jpeg ├── skills.png ├── slope_intercept.png ├── speak.jpeg ├── sports.jpg ├── stars.jpg ├── subsetcolumns.png ├── subsetrows.png ├── table.jpg ├── 
think.jpg ├── thinking.jpg ├── thinkstats.jpg ├── time.jpg ├── tool.jpg ├── travel.jpg ├── unnati.png ├── var1.jpg ├── var2.jpg ├── var3.jpg ├── welcome.jpg ├── wesmckinney.jpg ├── wine.jpg ├── workshop.jpg └── zainab.jpg ├── installation_guide_linux.md ├── installation_guide_osx.md ├── installation_guide_windows.md ├── intro ├── intro-to-r.ipynb ├── intro_viz.ipynb └── small_cars.csv ├── introduction.md ├── onion ├── 1-Frame.ipynb ├── 2-Acquire.ipynb ├── 3-Refine.ipynb ├── 4-Explore.ipynb ├── 5-Model.ipynb ├── 6-Insight.ipynb ├── MonthWiseMarketArrivals.csv ├── MonthWiseMarketArrivals.html ├── MonthWiseMarketArrivalsAll.csv ├── MonthWiseMarketArrivalsJan2016.html ├── MonthWiseMarketArrivalsJan2016All.csv ├── MonthWiseMarketArrivals_Clean.csv ├── city_geocode.csv ├── img │ ├── Cov_nonstationary.png │ ├── Mean_nonstationary.png │ ├── Var_nonstationary.png │ ├── left_merge.png │ ├── mutate.png │ ├── pivot.png │ ├── splitapplycombine.png │ ├── subsetcolumns.png │ ├── subsetrows.png │ ├── tallformat.png │ └── wideformat.png ├── my_db.sqlite3 └── state_geocode.csv ├── overview-slides.pdf ├── overview.md ├── schedule.md └── wine └── winequality-red.csv /.gitignore: -------------------------------------------------------------------------------- 1 | # IPY 2 | .ipynb_checkpoints/ 3 | 4 | # R 5 | .RData 6 | .Rhistory 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Unnati 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject 
to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /PaperPenWorkOut/chanakya.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/PaperPenWorkOut/chanakya.pdf -------------------------------------------------------------------------------- /PaperPenWorkOut/classify_05.csv: -------------------------------------------------------------------------------- 1 | X1,X2,Label -17,1,1 -1,9,1 -21,0,1 -1,13,1 -12,-12,0 5,-11,0 20,-15,0 -1,11,1 -17,8,1 -11,-10,0 14,-8,0 -9,-9,0 26,12,1 -8,-5,0 18,-14,0 -9,-10,0 22,-13,0 -6,10,1 -25,3,1 0,18,1 -------------------------------------------------------------------------------- /PaperPenWorkOut/classify_25.csv: -------------------------------------------------------------------------------- 1 | X1,X2,Label -2,-24,0 10,10,1 -23,-5,0 11,-10,0 29,0,1 -22,11,1 11,14,1 -9,11,0 -15,6,1 3,-18,0 8,-8,0 16,10,1 -17,9,0 34,1,1 14,-12,0 7,-12,1 -16,-9,0 16,-7,0 -8,15,0 16,-5,0 -------------------------------------------------------------------------------- /PaperPenWorkOut/creditRisk.csv: -------------------------------------------------------------------------------- 1 | Income,Credit History,Risk 0,Unknown,High 
0,Bad,High 0,Good,High 14000,Unknown,High 14000,Bad,High 14000,Good,High 16000,Unknown,Moderate 16000,Bad,High 16000,Good,Moderate 34000,Unknown,Moderate 34000,Bad,High 34000,Good,Moderate 36000,Unknown,Low 36000,Bad,Moderate 36000,Good,Low 70000,Unknown,Low 70000,Bad,Moderate 70000,Good,Low -------------------------------------------------------------------------------- /PaperPenWorkOut/regression_05.csv: -------------------------------------------------------------------------------- 1 | x,y 1,11 2,10 3,19 4,26 5,22 6,27 7,36 8,37 9,40 10,45 11,50 12,51 13,54 14,61 15,68 16,70 17,76 18,77 19,80 20,88 -------------------------------------------------------------------------------- /PaperPenWorkOut/regression_25.csv: -------------------------------------------------------------------------------- 1 | x,y 1,81 2,72 3,17 4,104 5,84 6,49 7,32 8,105 9,66 10,113 11,98 12,50 13,126 14,153 15,90 16,74 17,132 18,141 19,81 20,87 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction to Data Science in R 2 | 3 | This is a repository for the data science workshop in R conducted by Unnati | Hasgeek 4 | 5 | - [Curriculum](curriculum.md) - The scope of the workshop. 6 | - Installation - To get yourself ready for the workshop. 7 | - [Installation for Windows](installation_guide_windows.md) 8 | - [Installation for OSX](installation_guide_osx.md) 9 | - [Installation for Linux ](installation_guide_linux.md) 10 | - [Schedule](schedule.md) - The broad schedule for the workshop 11 | - [Introduction](introduction.md) - The overall introduction to the workshop. 12 | - [Overview](overview.md) - The overview presentation for the workshop. ([Overview Slides](/overview-slides.pdf)) 13 | - [Intro to Data Structures in R](intro/intro-to-r.ipynb) - Get started with R. 
14 | - [Intro to Visualisation in R](intro/intro_viz.ipynb) - Get started with visualisation in R. 15 | 16 | Case Studies 17 | - [Case #1 - Peeling the Onion](/onion) - Price & Quantity of Onion across India. 18 | - [Frame the Problem](/onion/1-Frame.ipynb) 19 | - [Acquire the Data](/onion/2-Acquire.ipynb) 20 | - [Refine the Data](/onion/3-Refine.ipynb) 21 | - [Explore the Data](/onion/4-Explore.ipynb) 22 | - [Model the Solution](/onion/5-Model.ipynb) 23 | - [Insight Communication](/onion/6-Insight.ipynb) 24 | - [Case #2 - Cars: Kitna Deti Hain](/cars) - Price & Mileage for Cars in India 25 | - [Frame the Problem](/cars/Frame.ipynb) 26 | - [Acquire the Data](/cars/Acquire.ipynb) 27 | - [Refine the Data](/cars/Refine.ipynb) 28 | - [Explore the Data](/cars/Explore.ipynb) 29 | - [Model the Solution](/cars/Model.ipynb) 30 | - [Case #3 - Bank Marketing](/bank-marketing/) - Bank Marketing 31 | - [Model - Logistic Regression](/bank-marketing/notebook/1.%20Bank%20Marketing%20-%20Logistic%20Regression.ipynb) 32 | - [Model - Decision Trees](/bank-marketing/notebook/2.%20Bank%20Marketing%20-%20Decision%20Tree.ipynb) 33 | - [Model Complexity](/bank-marketing/notebook/4.%20Model%20Complexity.ipynb) 34 | 35 | You may also be interested in checking out the [Introduction to Data Science in Python](https://github.com/unnati-xyz/intro-python-data-science) repo. 36 | -------------------------------------------------------------------------------- /bank-marketing/notebook/3. 
Alternate Encoding Mechanism.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## One Hot Encoding" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "dd <- read.table(text=\"\n", 19 | " RACE AGE.BELOW.21 CLASS\n", 20 | " HISPANIC 0 A\n", 21 | " ASIAN 1 A\n", 22 | " HISPANIC 1 D\n", 23 | " CAUCASIAN 1 B\",\n", 24 | " header=TRUE)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/html": [ 37 | "\n", 38 | "\n", 39 | "\n", 40 | "\t\n", 41 | "\t\n", 42 | "\t\n", 43 | "\t\n", 44 | "\n", 45 | "
RACEAGE.BELOW.21CLASS
1HISPANIC0A
2ASIAN1A
3HISPANIC1D
4CAUCASIAN1B
\n" 46 | ], 47 | "text/latex": [ 48 | "\\begin{tabular}{r|lll}\n", 49 | " & RACE & AGE.BELOW.21 & CLASS\\\\\n", 50 | "\\hline\n", 51 | "\t1 & HISPANIC & 0 & A\\\\\n", 52 | "\t2 & ASIAN & 1 & A\\\\\n", 53 | "\t3 & HISPANIC & 1 & D\\\\\n", 54 | "\t4 & CAUCASIAN & 1 & B\\\\\n", 55 | "\\end{tabular}\n" 56 | ], 57 | "text/plain": [ 58 | " RACE AGE.BELOW.21 CLASS\n", 59 | "1 HISPANIC 0 A\n", 60 | "2 ASIAN 1 A\n", 61 | "3 HISPANIC 1 D\n", 62 | "4 CAUCASIAN 1 B" 63 | ] 64 | }, 65 | "execution_count": 2, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "dd" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 6, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/html": [ 84 | "\n", 85 | "\n", 86 | "\n", 87 | "\t\n", 88 | "\t\n", 89 | "\t\n", 90 | "\t\n", 91 | "\n", 92 | "
RACEASIANRACECAUCASIANRACEHISPANICAGE.BELOW.21CLASS
10010A
21001A
30011D
40101B
\n" 93 | ], 94 | "text/latex": [ 95 | "\\begin{tabular}{r|lllll}\n", 96 | " & RACEASIAN & RACECAUCASIAN & RACEHISPANIC & AGE.BELOW.21 & CLASS\\\\\n", 97 | "\\hline\n", 98 | "\t1 & 0 & 0 & 1 & 0 & A\\\\\n", 99 | "\t2 & 1 & 0 & 0 & 1 & A\\\\\n", 100 | "\t3 & 0 & 0 & 1 & 1 & D\\\\\n", 101 | "\t4 & 0 & 1 & 0 & 1 & B\\\\\n", 102 | "\\end{tabular}\n" 103 | ], 104 | "text/plain": [ 105 | " RACEASIAN RACECAUCASIAN RACEHISPANIC AGE.BELOW.21 CLASS\n", 106 | "1 0 0 1 0 A\n", 107 | "2 1 0 0 1 A\n", 108 | "3 0 0 1 1 D\n", 109 | "4 0 1 0 1 B" 110 | ] 111 | }, 112 | "execution_count": 6, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "with(dd,\n", 119 | " data.frame(model.matrix(~RACE-1,dd),\n", 120 | " AGE.BELOW.21,CLASS))" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "### Including all levels" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 8, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/html": [ 140 | "\n", 141 | "\n", 142 | "\n", 143 | "\t\n", 144 | "\t\n", 145 | "\t\n", 146 | "\t\n", 147 | "\n", 148 | "
RACEASIANRACECAUCASIANRACEHISPANICCLASSACLASSBCLASSD
1001100
2100100
3001001
4010010
\n" 149 | ], 150 | "text/latex": [ 151 | "\\begin{tabular}{r|llllll}\n", 152 | " & RACEASIAN & RACECAUCASIAN & RACEHISPANIC & CLASSA & CLASSB & CLASSD\\\\\n", 153 | "\\hline\n", 154 | "\t1 & 0 & 0 & 1 & 1 & 0 & 0\\\\\n", 155 | "\t2 & 1 & 0 & 0 & 1 & 0 & 0\\\\\n", 156 | "\t3 & 0 & 0 & 1 & 0 & 0 & 1\\\\\n", 157 | "\t4 & 0 & 1 & 0 & 0 & 1 & 0\\\\\n", 158 | "\\end{tabular}\n" 159 | ], 160 | "text/markdown": [ 161 | "1. 0\n", 162 | "2. 1\n", 163 | "3. 0\n", 164 | "4. 0\n", 165 | "5. 0\n", 166 | "6. 0\n", 167 | "7. 0\n", 168 | "8. 1\n", 169 | "9. 1\n", 170 | "10. 0\n", 171 | "11. 1\n", 172 | "12. 0\n", 173 | "13. 1\n", 174 | "14. 1\n", 175 | "15. 0\n", 176 | "16. 0\n", 177 | "17. 0\n", 178 | "18. 0\n", 179 | "19. 0\n", 180 | "20. 1\n", 181 | "21. 0\n", 182 | "22. 0\n", 183 | "23. 1\n", 184 | "24. 0\n", 185 | "\n", 186 | "\n" 187 | ], 188 | "text/plain": [ 189 | " RACEASIAN RACECAUCASIAN RACEHISPANIC CLASSA CLASSB CLASSD\n", 190 | "1 0 0 1 1 0 0\n", 191 | "2 1 0 0 1 0 0\n", 192 | "3 0 0 1 0 0 1\n", 193 | "4 0 1 0 0 1 0" 194 | ] 195 | }, 196 | "execution_count": 8, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "cbind(with(dd, model.matrix(~ RACE + 0)), with(dd, model.matrix(~ CLASS + 0)))" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "### One more approach" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 9, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [ 219 | { 220 | "name": "stderr", 221 | "output_type": "stream", 222 | "text": [ 223 | "Loading required package: lattice\n", 224 | "Loading required package: ggplot2\n", 225 | "Warning message:\n", 226 | ": package ‘ggplot2’ was built under R version 3.2.4" 227 | ] 228 | } 229 | ], 230 | "source": [ 231 | "library(caret)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 12, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | 
"outputs": [], 241 | "source": [ 242 | "trainDummy <- dummyVars(AGE.BELOW.21 ~. , data=dd)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 14, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/html": [ 255 | "\n", 256 | "\n", 257 | "\n", 258 | "\t\n", 259 | "\t\n", 260 | "\t\n", 261 | "\t\n", 262 | "\n", 263 | "
RACE.ASIANRACE.CAUCASIANRACE.HISPANICCLASS.ACLASS.BCLASS.D
1001100
2100100
3001001
4010010
\n" 264 | ], 265 | "text/latex": [ 266 | "\\begin{tabular}{r|llllll}\n", 267 | " & RACE.ASIAN & RACE.CAUCASIAN & RACE.HISPANIC & CLASS.A & CLASS.B & CLASS.D\\\\\n", 268 | "\\hline\n", 269 | "\t1 & 0 & 0 & 1 & 1 & 0 & 0\\\\\n", 270 | "\t2 & 1 & 0 & 0 & 1 & 0 & 0\\\\\n", 271 | "\t3 & 0 & 0 & 1 & 0 & 0 & 1\\\\\n", 272 | "\t4 & 0 & 1 & 0 & 0 & 1 & 0\\\\\n", 273 | "\\end{tabular}\n" 274 | ], 275 | "text/markdown": [ 276 | "1. 0\n", 277 | "2. 1\n", 278 | "3. 0\n", 279 | "4. 0\n", 280 | "5. 0\n", 281 | "6. 0\n", 282 | "7. 0\n", 283 | "8. 1\n", 284 | "9. 1\n", 285 | "10. 0\n", 286 | "11. 1\n", 287 | "12. 0\n", 288 | "13. 1\n", 289 | "14. 1\n", 290 | "15. 0\n", 291 | "16. 0\n", 292 | "17. 0\n", 293 | "18. 0\n", 294 | "19. 0\n", 295 | "20. 1\n", 296 | "21. 0\n", 297 | "22. 0\n", 298 | "23. 1\n", 299 | "24. 0\n", 300 | "\n", 301 | "\n" 302 | ], 303 | "text/plain": [ 304 | " RACE.ASIAN RACE.CAUCASIAN RACE.HISPANIC CLASS.A CLASS.B CLASS.D\n", 305 | "1 0 0 1 1 0 0\n", 306 | "2 1 0 0 1 0 0\n", 307 | "3 0 0 1 0 0 1\n", 308 | "4 0 1 0 0 1 0" 309 | ] 310 | }, 311 | "execution_count": 14, 312 | "metadata": {}, 313 | "output_type": "execute_result" 314 | } 315 | ], 316 | "source": [ 317 | "predict(trainDummy, dd)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "**Exercise** \n", 325 | "\n", 326 | "1. Do one hot encoding on the bank marketing dataset.\n", 327 | "2. Run logistic regression on that\n", 328 | "3. Compute accuracy metrics. Do you see any difference?\n", 329 | "4. Does one hot encoding impact decision tree? 
Discuss" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": { 336 | "collapsed": false 337 | }, 338 | "outputs": [], 339 | "source": [] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": { 345 | "collapsed": true 346 | }, 347 | "outputs": [], 348 | "source": [] 349 | } 350 | ], 351 | "metadata": { 352 | "kernelspec": { 353 | "display_name": "R", 354 | "language": "R", 355 | "name": "ir" 356 | }, 357 | "language_info": { 358 | "codemirror_mode": "r", 359 | "file_extension": ".r", 360 | "mimetype": "text/x-r-source", 361 | "name": "R", 362 | "pygments_lexer": "r", 363 | "version": "3.2.3" 364 | } 365 | }, 366 | "nbformat": 4, 367 | "nbformat_minor": 0 368 | } 369 | -------------------------------------------------------------------------------- /bank-marketing/notebook/4. Model Complexity.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Model Complexity and Bias - Variance\n", 8 | "\n", 9 | "\n", 10 | "\n", 11 | "\n", 12 | "# Model Complexity \n", 13 | "\n", 14 | "\n", 15 | "# Bias - Variance\n", 16 | "\n", 17 | "\n", 18 | "\n", 19 | "# Bias- Variance - Another pic \n", 20 | "\n", 21 | "\n", 22 | "\n", 23 | "# Bias - Variance - One more example \n", 24 | "\n", 25 | "\n", 26 | "\n", 27 | "# Overfitting\n", 28 | "\n", 29 | "\n", 30 | "\n", 31 | "# Overfitting - Another example\n", 32 | "\n", 33 | "" 34 | ] 35 | } 36 | ], 37 | "metadata": { 38 | "kernelspec": { 39 | "display_name": "R", 40 | "language": "R", 41 | "name": "ir" 42 | }, 43 | "language_info": { 44 | "codemirror_mode": "r", 45 | "file_extension": ".r", 46 | "mimetype": "text/x-r-source", 47 | "name": "R", 48 | "pygments_lexer": "r", 49 | "version": "3.2.3" 50 | } 51 | }, 52 | "nbformat": 4, 53 | "nbformat_minor": 0 54 | } 55 | 
-------------------------------------------------------------------------------- /bank-marketing/notebook/img/Precisionrecall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/bank-marketing/notebook/img/Precisionrecall.png -------------------------------------------------------------------------------- /bank-marketing/notebook/img/accuracy_metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/bank-marketing/notebook/img/accuracy_metrics.png -------------------------------------------------------------------------------- /bank-marketing/notebook/img/biasvariance_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/bank-marketing/notebook/img/biasvariance_0.png -------------------------------------------------------------------------------- /bank-marketing/notebook/img/biasvariance_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/bank-marketing/notebook/img/biasvariance_1.png -------------------------------------------------------------------------------- /bank-marketing/notebook/img/biasvariance_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/bank-marketing/notebook/img/biasvariance_2.png -------------------------------------------------------------------------------- /bank-marketing/notebook/img/biasvariance_3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/bank-marketing/notebook/img/biasvariance_3.png -------------------------------------------------------------------------------- /bank-marketing/notebook/img/biasvariance_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/bank-marketing/notebook/img/biasvariance_4.png -------------------------------------------------------------------------------- /bank-marketing/notebook/img/logit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/bank-marketing/notebook/img/logit.png -------------------------------------------------------------------------------- /bank-marketing/notebook/img/overfitting_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/bank-marketing/notebook/img/overfitting_1.jpg -------------------------------------------------------------------------------- /bank-marketing/notebook/img/overfitting_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/bank-marketing/notebook/img/overfitting_2.png -------------------------------------------------------------------------------- /bank-marketing/notebook/img/tpr_fpr.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/bank-marketing/notebook/img/tpr_fpr.png -------------------------------------------------------------------------------- /cars/Explore.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Cars\n", 8 | "\n", 9 | "\n", 10 | "### We have the dataset of cars available in India, let's compare prices, specs, fuel economy etc. across car types. \n", 11 | "\n", 12 | "\n", 13 | "### Does better mileage mean lower price?\n", 14 | "### What are the features that drive the mileage of cars?" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "library(ggplot2)\n", 26 | "library(dplyr)\n", 27 | "library(tidyr)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": true 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "options(repr.plot.width = 10, repr.plot.height = 6)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "### Read CSV" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "df <- read.csv(\"data/cars.tidy.csv\", stringsAsFactor = FALSE)\n", 57 | "df$price_in_1000 = df$price / 1000\n", 58 | "colnames(df)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "### Filter out all cars priced at 17 lakhs or more" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "df <- filter(df, price < 1700000)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | 
"execution_count": null, 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "head(df, 3)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "length(unique(df$name))" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "str(df)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "## Single variable visualization\n", 117 | "### Engine Capacity" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "ggplot(df, aes(engine)) + geom_histogram(binwidth = 300)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Seating Capacity" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": { 142 | "collapsed": false 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "ggplot(df, aes(seats)) + geom_bar(stat = \"count\")" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "collapsed": false, 154 | "scrolled": false 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "unique(df$type)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "colnames(df)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "## Multi variable visualization\n", 177 | "### Mileage in city vs Engine capacity" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": 
{ 184 | "collapsed": false, 185 | "scrolled": false 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "ggplot(df,\n", 190 | " aes(mileage_city, engine, color=fuel)) + geom_point()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "### Visualize the mileage of petrol cars" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": { 204 | "collapsed": false 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "ggplot(filter(df, fuel == ' Petrol'),\n", 209 | " aes(mileage_highway, engine, color=fuel)) + geom_point()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "### Mileage vs Price vs Engine capacity" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": { 223 | "collapsed": false 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "ggplot(df,\n", 228 | " aes(mileage_city, price_in_1000, color=engine)) + geom_point() + ylim(c(0,1500))" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "Price vs Mileage" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": { 242 | "collapsed": false 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "ggplot(df, aes(price, mileage_city)) + geom_point()" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": { 253 | "collapsed": true 254 | }, 255 | "outputs": [], 256 | "source": [] 257 | } 258 | ], 259 | "metadata": { 260 | "kernelspec": { 261 | "display_name": "R", 262 | "language": "R", 263 | "name": "ir" 264 | }, 265 | "language_info": { 266 | "codemirror_mode": "r", 267 | "file_extension": ".r", 268 | "mimetype": "text/x-r-source", 269 | "name": "R", 270 | "pygments_lexer": "r", 271 | "version": "3.2.2" 272 | } 273 | }, 274 | "nbformat": 4, 275 | "nbformat_minor": 0 276 | } 277 | 
-------------------------------------------------------------------------------- /cars/Linear Regression.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/cars/Linear Regression.xlsx -------------------------------------------------------------------------------- /cars/Model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Cars\n", 8 | "## Model" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": { 15 | "collapsed": false 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "library(ggplot2)\n", 20 | "library(dplyr)\n", 21 | "library(tidyr)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "options(repr.plot.width = 10, repr.plot.height = 6)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "df <- read.csv(\"data/cars.tidy.csv\", stringsAsFactor = FALSE)\n", 44 | "df$price_in_1000 <- df$price / 1000" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Correlation of features\n", 52 | "### Mileage in city vs price\n" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "cor(df$mileage_city, df$price_in_1000)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "ggplot(df) + aes(mileage_city, price_in_1000) + geom_point()" 75 | ] 76 | 
}, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "### Mileage on highway vs Price" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "cor(df$mileage_highway, df$price_in_1000)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### Engine capacity vs Mileage in city" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "cor(df$engine, df$mileage_city)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "### Price vs Mileage in city" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "cor(df$price_in_1000, df$mileage_city)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "### Gears & Engine capacity vs Mileage on Highway" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": { 142 | "collapsed": false 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "cor(c(df$engine * df$gears), df$mileage_highway)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "df$mileage_city <- as.numeric(df$mileage_city)\n", 158 | "head(df$mileage_city)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "# Linear Regression" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "\n", 173 | "#### Simple Linear Regression\n", 174 | "\n", 175 | "Simple linear 
regression is an approach for predicting a quantitative response using a single feature (or \"predictor\" or \"input variable\"). It takes the following form:\n", 176 | "\n", 177 | " y=β0+β1x\n", 178 | "\n", 179 | "What does each term represent?\n", 180 | "\n", 181 | "* y is the response\n", 182 | "* x is the feature\n", 183 | "* β0 is the intercept\n", 184 | "* β1 is the coefficient for x\n", 185 | "\n", 186 | "Together, β0 and β1 are called the model coefficients. To create your model, you must \"learn\" the values of these coefficients. And once we've learned these coefficients, we can use the model to predict Price!\n" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "#### Estimating (\"Learning\") Model Coefficients\n", 194 | "Generally speaking, coefficients are estimated using the least squares criterion, which means we find the line (mathematically) which minimizes the sum of squared residuals (or \"sum of squared errors\"):\n", 195 | "\n", 196 | "![](../img/estimating_coefficients.png)\n", 197 | "\n", 198 | "**What elements are present in the diagram?**\n", 199 | "\n", 200 | "* The black dots are the observed values of x and y.\n", 201 | "* The blue line is our least squares line.\n", 202 | "* The red lines are the residuals, which are the distances between the observed values and the least squares line.\n", 203 | "\n", 204 | "**How do the model coefficients relate to the least squares line?**\n", 205 | "* β0 is the intercept (the value of y when x=0)\n", 206 | "* β1 is the slope (the change in y divided by change in x)\n", 207 | "Here is a graphical depiction of those calculations:\n", 208 | "\n", 209 | "![](../img/slope_intercept.png)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "model <- lm(price_in_1000 ~ mileage_city, data = df)\n", 221 | "model" 222 | ] 223 | 
}, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "### Interpreting Model Coefficients\n", 229 | "\n", 230 | "How do we interpret the mileage coefficient (β1)?\n", 231 | "A one-unit increase in mileage is associated with a 1153 decrease in price.\n", 232 | "\n", 233 | "Note that if an increase in mileage was associated with an increase in price, β1 would be positive.\n", 234 | "\n", 235 | "#### Using the Model for Prediction\n", 236 | "\n", 237 | " y=β0+β1x" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "collapsed": false 245 | }, 246 | "outputs": [], 247 | "source": [ 248 | "summary(model)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "## Understanding the Output\n", 256 | "\n", 257 | "\n", 258 | "\t\n", 259 | "\t\t\n", 260 | "\t\t\n", 261 | "\t\t\n", 262 | "\t\n", 263 | "\t\n", 264 | "\t\t\n", 265 | "\t\t\n", 266 | "\t\t\n", 267 | "\t\n", 268 | "\t\n", 269 | "\t\t\n", 270 | "\t\t\n", 271 | "\t\t\n", 272 | "\t\n", 273 | "\t\n", 274 | "\t\t\n", 275 | "\t\t\n", 276 | "\t\t\n", 277 | "\t\n", 278 | "\t\n", 279 | "\t\t\n", 280 | "\t\t\n", 281 | "\t\t\n", 282 | "\t\n", 283 | "\t\n", 284 | "\t\t\n", 285 | "\t\t\n", 286 | "\t\t\n", 287 | "\t\n", 288 | "\t\n", 289 | "\t\t\n", 290 | "\t\t\n", 291 | "\t\t\n", 292 | "\t\n", 293 | "\t\n", 294 | "\t\t\n", 295 | "\t\t\n", 296 | "\t\t\n", 297 | "\t\n", 298 | "\t\n", 299 | "\t\t\n", 300 | "\t\t\n", 301 | "\t\t\n", 302 | "\t\n", 303 | "\t\n", 304 | "\t\t\n", 305 | "\t\t\n", 306 | "\t\t\n", 307 | "\t\n", 308 | "\t\n", 309 | "\t\t\n", 310 | "\t\t\n", 311 | "\t\t\n", 312 | "\t\n", 313 | "
#NameDescription
1ResidualsThe residuals are the difference between the actual values of the variable you're predicting and predicted values from your regression--y - ŷ. For most regressions you want your residuals to look like a normal distribution when plotted. If our residuals are normally distributed, this indicates the mean of the difference between our predictions and the actual values is close to 0 (good) and that when we miss, we're missing both short and long of the actual value, and the likelihood of a miss being far from the actual value gets smaller as the distance from the actual value gets larger.

Think of it like a dartboard. A good model is going to hit the bullseye some of the time (but not every time). When it doesn't hit the bullseye, it's missing in all of the other buckets evenly (i.e. not just missing in the 16 bin) and it also misses closer to the bullseye as opposed to on the outer edges of the dartboard.
2Significance StarsThe stars are shorthand for significance levels, with the number of asterisks displayed according to the p-value computed. *** for high significance and * for low significance.
3Estimated CoefficientThe estimated coefficient is the value of slope calculated by the regression. It might seem a little confusing that the Intercept also has a value, but just think of it as a slope that is always multiplied by 1. This number will obviously vary based on the magnitude of the variable you're inputting into the regression, but it's always good to spot check this number to make sure it seems reasonable.
4Standard Error of the Coefficient EstimateMeasure of the variability in the estimate for the coefficient. Lower means better but this number is relative to the value of the coefficient. As a rule of thumb, you'd like this value to be at least an order of magnitude less than the coefficient estimate.
5t-value of the Coefficient EstimateScore that measures whether or not the coefficient for this variable is meaningful for the model. You probably won't use this value itself, but know that it is used to calculate the p-value and the significance levels.
6Variable p-valueRoughly, the probability of seeing an estimate at least this extreme if the variable were NOT relevant. You want this number to be as small as possible. If the number is really small, R will display it in scientific notation.
7Significance LegendThe more punctuation there is next to your variables, the better.

Blank=bad, Dots=pretty good, Stars=good, More Stars=very good
8Residual Std Error / Degrees of FreedomThe Residual Std Error is just the standard deviation of your residuals. You'd like this number to be proportional to the quartiles of the residuals in #1. For a normal distribution, the 1st and 3rd quartiles should be 1.5 +/- the std error.

The Degrees of Freedom is the difference between the number of observations included in your training sample and the number of variables used in your model (intercept counts as a variable).
9R-squaredMetric for evaluating the goodness of fit of your model. Higher is better with 1 being the best. Corresponds with the amount of variability in what you're predicting that is explained by the model.
WARNING: While a high R-squared indicates good correlation, correlation does not always imply causation.
10F-statistic & resulting p-valuePerforms an F-test on the model. This takes the parameters of our model (in our case we only have 1) and compares it to a model that has fewer parameters. In theory the model with more parameters should fit better. If the model with more parameters (your model) doesn't perform better than the model with fewer parameters, the F-test will have a high p-value (i.e. the extra parameters probably do NOT give a significant boost). If the model with more parameters is better than the model with fewer parameters, you will have a lower p-value.

\tThe DF, or degrees of freedom, pertains to how many variables are in the model.
" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "metadata": {}, 319 | "source": [ 320 | "### How Well Does the Model Fit the data?\n", 321 | "\n", 322 | "The most common way to evaluate the overall fit of a linear model is by the R-squared value. R-squared is the proportion of variance explained, meaning the proportion of variance in the observed data that is explained by the model, or the reduction in error over the null model. (The null model just predicts the mean of the observed response, and thus it has an intercept and no slope.)\n", 323 | "\n", 324 | "R-squared is between 0 and 1, and higher is better because it means that more variance is explained by the model. Here's an example of what R-squared \"looks like\":\n", 325 | "\n", 326 | "![](../img/r_squared.png)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "#### Goodness of fit - R2 score\n", 334 | "\n", 335 | "![](../img/r2.gif)\n" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": { 342 | "collapsed": false 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "model$fitted.values" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": { 353 | "collapsed": false 354 | }, 355 | "outputs": [], 356 | "source": [ 357 | "resid <- data.frame(model$residuals, model$fitted.values)" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": false 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "head(resid)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": { 375 | "collapsed": false 376 | }, 377 | "outputs": [], 378 | "source": [ 379 | "ggplot(resid) + aes(y = model.residuals, x = model.fitted.values) + geom_point() + stat_smooth()" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": {}, 385 | "source": [ 386 
| "#### Exercise plot log price vs engine capacity and build a model with it" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": { 393 | "collapsed": true 394 | }, 395 | "outputs": [], 396 | "source": [] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "## Using multiple variable for regression\n", 403 | "### Gears & Engine capacity vs Mileage on highway" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": { 410 | "collapsed": false 411 | }, 412 | "outputs": [], 413 | "source": [ 414 | "ggplot(df) + aes(y = mileage_highway, x= engine, color = gears) + geom_point()" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "### Gears & Engine vs Mileage on highway" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": { 428 | "collapsed": false 429 | }, 430 | "outputs": [], 431 | "source": [ 432 | "model <- lm(mileage_highway ~ gears + engine, data = df)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "metadata": { 439 | "collapsed": false 440 | }, 441 | "outputs": [], 442 | "source": [ 443 | "summary(model)" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": null, 449 | "metadata": { 450 | "collapsed": false 451 | }, 452 | "outputs": [], 453 | "source": [ 454 | "model <- lm(mileage_highway ~ gears + engine - 1, data = df)\n", 455 | "summary(model)" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "metadata": { 462 | "collapsed": true 463 | }, 464 | "outputs": [], 465 | "source": [] 466 | } 467 | ], 468 | "metadata": { 469 | "kernelspec": { 470 | "display_name": "R", 471 | "language": "R", 472 | "name": "ir" 473 | }, 474 | "language_info": { 475 | "codemirror_mode": "r", 476 | "file_extension": ".r", 477 | "mimetype": 
"text/x-r-source", 478 | "name": "R", 479 | "pygments_lexer": "r", 480 | "version": "3.2.2" 481 | } 482 | }, 483 | "nbformat": 4, 484 | "nbformat_minor": 0 485 | } 486 | -------------------------------------------------------------------------------- /cars/data/cars.tidy.csv: -------------------------------------------------------------------------------- 1 | "name","model","price","type","engine","fuel","weight","clearance","bhp","rpm","mileage_city","mileage_highway","cylinders","seats","transmission_type","brand","gears","transmission","fuel_type" 2 | "Ashok Leyland Stile","Ashok Leyland Stile LE 8-STR (Diesel)",749990,"MPV",1461," Diesel"," 2000",180," 75 "," 3300",16.2,20.7," 4"," 8"," 5 Speed Manual","Ashok",5,"Manual",1 3 | "Aston Martin Rapide","Aston Martin Rapide LUXE (Petrol)",3.5e+07,"Sedan",5935," Petrol",NA,120," 350 "," 6000",11.83425,7," 4"," 4"," 6 Speed Automatic","Aston",6,"Automatic",0 4 | "Aston Martin Rapide S","Aston Martin Rapide S (Petrol)",4.4e+07,"Sedan",5935," Petrol"," 1990",110," 550 "," 6750",5.6,11.9," 12"," 4"," 6 Speed Manual","Aston",6,"Manual",0 5 | "Aston Martin V12 Vantage","Aston Martin V12 Vantage Coupe (Petrol)",3.5e+07,"Coupe",5935," Petrol",NA,120," 510 "," 6500",7,9," 12"," 2"," 6 Speed Manual","Aston",6,"Manual",0 6 | "Aston Martin V8 Vantage","Aston Martin V8 Vantage Coupe (Petrol)",13500000,"Coupe",4735," Petrol",NA,120," 313 "," 7000",5,8," 8"," 2"," 6 Speed Manual","Aston",6,"Manual",0 7 | "Aston Martin Vanquish","Aston Martin Vanquish V12 (Petrol)",38500000,"Sedan",5935," Petrol",NA,147," 564 "," 6750",4,8," 4"," 2"," 6 Speed Automatic","Aston",6,"Automatic",0 8 | "Audi A3 Cabriolet","Audi A3 Attraction 35 TDI (Diesel)",2295000,"Sedan",1968," Diesel"," 1890",NA," 143 "," 3500-4000",18.08,20.38," 4"," 5"," 6 Speed Automatic","Audi",6,"Automatic",1 9 | "Audi A4","Audi A4 2.0 TDI (Diesel)",2899000,"Sedan",1968," Diesel"," 2065",128," 241 "," 4000",11.83425,16.55," 4"," 5"," 7 Speed Automatic","Audi",7,"Automatic",1 10 
| "Audi A6","Audi A6 2.0 TDI (Diesel)",4356000,"Sedan",1968," Diesel"," 2165",165," 174 "," 3750",11.83425,17.68," 4"," 5"," 8 Speed Automatic","Audi",8,"Automatic",1 11 | "Audi A7","Audi A7 Sportback 3.0 TDI (Diesel)",8588000,"Sedan",2967," Diesel"," 2395",160," 241.3 "," 4000",11.83425,14.4," 6"," 5"," 6 Speed Automatic","Audi",6,"Automatic",1 12 | "Audi A8 L","Audi A8 L 3.0 TDI quattro (Diesel)",10700000,"Sedan",2967," Diesel"," 2540",165," 247 "," 4000",11.83425,15.5," 6"," 4"," 8 Speed Automatic","Audi",8,"Automatic",1 13 | "Audi Q3","Audi Q3 30 TDI S Edition (Diesel)",2899000,"SUV",1968," Diesel"," 2030",170," 138.13 "," 4200",14.25,17.32," 4"," 5"," 6 Speed Manual","Audi",6,"Manual",1 14 | "Audi Q5","Audi Q5 2.0 TFSI Quattro (Petrol)",4539000,"SUV",1984," Petrol"," 2365",200," 221.27 "," 4500-6200",11.83425,11.81," 4"," 5"," 8 Speed Automatic","Audi",8,"Automatic",0 15 | "Audi Q7","Audi Q7 3.0 TFSI quattro (Petrol)",5577000,"SUV",2995," Petrol"," 2995",240," 329 "," 5500",11.83425,14," 6"," 6"," 8 Speed Automatic","Audi",8,"Automatic",0 16 | "Audi R8","Audi R8 4.2 FSI quattro (Petrol)",15991000,"Sedan",4163," Petrol"," 1885",160," 423.8 "," 7900",11.83425,8.06," 8"," 2"," 7 Speed Automatic","Audi",7,"Automatic",0 17 | "Audi RS5","Audi RS5 Coupe (Petrol)",10127000,"Coupe",4163," Petrol",NA,NA," 438 "," 8250",5.6,9," 8"," 5"," 7 Speed Automatic","Audi",7,"Automatic",0 18 | "Audi RS6","Audi RS6 Avant",13500000,"Luxury",3993," Petrol"," 2580",114," 552.5 "," 5700-6600",6.4,10.41," 8"," 5"," 8 Speed Automatic","Audi",8,"Automatic",0 19 | "Audi RS7","Audi RS7 Sportback (Petrol)",1.4e+07,"Sedan",3993," Petrol"," 2505",109," 560 "," 5700-6600",7.2,13.3," 8"," 4"," 8 Speed Automatic","Audi",8,"Automatic",0 20 | "Audi S6","Audi S6 4.0 TFSI (Petrol)",8995000,"Sedan",3993," Petrol",NA,125," 420 "," 5500",7,10.3," 8"," 5"," 7 Speed Automatic","Audi",7,"Automatic",0 21 | "Audi TT","Audi TT 45 TFSI (Petrol)",6034000,"Coupe",1984," Petrol"," 1735",135," 230 "," 
4500-6200",5.7,8.3," 4"," 4"," 6 Speed Automatic","Audi",6,"Automatic",0 22 | "Bentley Continental Flying Spur","Bentley Continental Flying Spur W12 (Petrol)",3.1e+07,"Sedan",5998," Petrol",NA,142," 552 "," 6100",2,4," 12"," 5"," 8 Speed Automatic","Bentley",8,"Automatic",0 23 | "Bentley Continental GT","Bentley Continental GT (Petrol)",18700000,"Coupe",5998," Petrol",NA,142," 567 "," 6000",5.3,8.6," 12"," 5"," 8 Speed Automatic","Bentley",8,"Automatic",0 24 | "Bentley Continental GTC","Bentley Continental GTC Convertible (Petrol)",2.1e+07,"Sedan",5998," Petrol",NA,142," 567 "," 6000",5.5,8.5," 12"," 5"," 8 Speed Automatic","Bentley",8,"Automatic",0 25 | "Bentley Mulsanne","Bentley Mulsanne V8 (Petrol)",26900000,"Coupe",5998," Petrol",NA,142," 500 "," 6000",6.2,9.5," 8"," 5"," 8 Speed Automatic","Bentley",8,"Automatic",0 26 | "BMW 1 Series","BMW 1 Series",2e+06,"Sedan",1995," Diesel",NA,157," 143 "," 5000",11.83425,20.58," 4",NA," 8 Speed Automatic","BMW",8,"Automatic",1 27 | "BMW 3 Series","BMW 3 Series 320d (Diesel)",3340000,"Sedan",1995," Diesel",NA,140," 184 "," 4000",14.44,18.88," 4"," 5"," 8 Speed Automatic","BMW",8,"Automatic",1 28 | "BMW 3 Series GT","BMW 3 Series GT 320d Sport Line(Diesel)",3990000,"Sedan",1995," Diesel",NA,165," 184 "," 4000",15.54,19.59," 4"," 5"," 8 Speed Automatic","BMW",8,"Automatic",1 29 | "BMW 5 Series","BMW 5 Series 520 Luxury Line (Diesel)",4890000,"Sedan",1995," Diesel",NA,158," 184 "," 5000",8.4,11.5," 4"," 5"," 8 Speed Automatic","BMW",8,"Automatic",1 30 | "BMW 5 Series GT","BMW 5 Series GT 530d LE (Diesel)",8870000,"Sedan",2993,NA,NA,131," 245 "," 4000",11.83425,15.3514948453608,NA,NA,NA,"BMW",4,"Manual",0 31 | "BMW 6 Series","BMW 6 Series Gran Coupe 640d (Diesel)",11490000,"Coupe",2993," Diesel"," 2410",124," 313 "," 4400",14.54,17.54," 6"," 4"," 8 Speed Automatic","BMW",8,"Automatic",1 32 | "BMW 7 Series","BMW 7 Series 730Ld (Diesel)",10290000,"Sedan",2993," Diesel",NA,142," 258 "," 4000",11.83425,12," 6"," 5"," 8 Speed 
Automatic","BMW",8,"Automatic",1 33 | "BMW i8","BMW i8 (Petrol)",22900000,"Coupe",1499," Petrol"," 1490",117," 228 bhp "," 5800 RPM",28,29," 3"," 2"," 6 Speed Automatic","BMW",6,"Automatic",0 34 | "BMW M3","BMW M3 Sedan (Petrol)",11980000,"Sedan",2979," Petrol"," 2100",NA," 431 "," 5500-7300",7.32,10.75," 6"," 5"," 7 Speed Automatic","BMW",7,"Automatic",0 35 | "BMW M5","BMW M5 Sedan (Petrol)",11700000,"Sedan",4395," Petrol",NA,142," 560 "," 6000",7.14,13.25," 8"," 5"," 7 Speed Automatic","BMW",7,"Automatic",0 36 | "BMW M6 Gran Coupe","BMW M6 Gran Coupe (Petrol)",17500000,"Coupe",4395," Petrol",NA,NA," 560 "," 6000-7000",11.83425,10.1," 8"," 5"," 7 Speed Automatic","BMW",7,"Automatic",0 37 | "BMW X1","BMW X1 sDrive20d (Diesel)",3075000,"SUV",1995," Diesel",NA,179," 184 "," 4000",13,17.5," 4"," 5"," 8 Speed Automatic","BMW",8,"Automatic",1 38 | "BMW X3","BMW X3 xDrive20d Expedition (Diesel)",4490000,"SUV",1995," Diesel",NA,210," 190 "," 4000",15.77,18.56," 4"," 5"," 8 Speed Automatic","BMW",8,"Automatic",1 39 | "BMW X5","BMW X5 xDrive30d (Diesel)",7090000,"SUV",2993," Diesel",NA,209," 258 "," 4000",11.9,15.35," 6"," 5"," 8 Speed Automatic","BMW",8,"Automatic",1 40 | "BMW X6","BMW X6 xDrive 40d (Diesel)",9670000,"SUV",2993,NA,NA,212," 173 "," 4000",11.83425,15.3514948453608,NA,NA,NA,"BMW",4,"Manual",0 41 | "BMW Z4","BMW Z4 Roadster sDrive35i (Petrol)",7090000,"Convertible",2979," Petrol"," 1855",212," 306 "," 5800",7.16,10.37," 6"," 2"," 7 Speed Automatic","BMW",7,"Automatic",0 42 | "Car Specification","New Renault Koleos 2WD MT (Diesel)",2233000,"SUV",1995," Diesel",NA,206," 148 "," 4000",11.83425,17.15," 4"," 5"," 6 Speed Manual","Car",6,"Manual",1 43 | "Chevrolet Captiva","Chevrolet Captiva – LT/Xtreme",2354229,"SUV",2231," Diesel",NA,177," 184 "," 3800",11.83425,14.6," 4"," 7"," 5 Speed Manual","Chevrolet",5,"Manual",1 44 | "Chevrolet Cruze","Chevrolet Cruze LT (Diesel)",1379222,"Sedan",1998," Diesel",NA,165," 163.73 "," 3800",11.83425,17.3," 4"," 5"," 6 Speed 
Manual","Chevrolet",6,"Manual",1 45 | "Chevrolet Enjoy","Chevrolet Enjoy 1.4 LS 8 Seater (Petrol)",611960,"MPV",1399," Petrol"," 1860",161," 102.58 "," 6000",11.83425,13.7," 4"," 8"," 5 Speed Manual","Chevrolet",5,"Manual",0 46 | "Chevrolet Sail","Chevrolet Sail 1.2 Base (Petrol)",550677,"Sedan",1199," Petrol"," 1450",174," 82.45 "," 6000",15.9,18.2," 4"," 5"," 5 Speed Manual","Chevrolet",5,"Manual",0 47 | "Chevrolet Sail Hatchback (U-VA)","Chevrolet Sail Hatchback 1.2 Base (Petrol)",467689,"Hatchback‎",1199," Petrol",NA,174," 82.45 "," 6000",15,18.2," 4"," 5"," 5 Speed Manual","Chevrolet",5,"Manual",0 48 | "Chevrolet Spark","Chevrolet Spark 1.0 (Petrol)",344710,"Hatchback",995," Petrol",NA,170," 62.14 "," 5400",11.83425,16.2," 4"," 5"," 5 Speed Manual","Chevrolet",5,"Manual",0 49 | "Chevrolet Tavera","Chevrolet Tavera Neo 3 10 Seats BSIII (Diesel)",727353,"MUV",2499," Diesel",NA,184," 72.30 "," 3900",11.83425,13.58," 4"," 10"," 5 Speed Manual","Chevrolet",5,"Manual",1 50 | "Datsun GO Plus","Datsun Go Plus D (Petrol)",379642,"MPV",1198," Petrol",NA,170," 67 "," 5000",16.3,20.6," 3"," 7"," 5 Speed Manual","Datsun",5,"Manual",0 51 | "Ferrari 458 Italia","Ferrari 458 Italia Coupe (Petrol)",25200000,"Coupe",4499," Petrol",NA,145," 419 "," 9000",6,9," 8"," 2"," 7 Speed Automatic","Ferrari",7,"Automatic",0 52 | "Ferrari 599 GTB Fiorano","Ferrari 599 GTB Fiorano Coupe (Petrol)",33700000,"Coupe",5999," Petrol",NA,145," 612 "," 7600",6,9," 4"," 2"," 6 Speed Automatic","Ferrari",6,"Automatic",0 53 | "Ferrari California","Ferrari California Convertible (Petrol)",2.2e+07,"Sedan",4297," Petrol",NA,145," 360 "," 7750",6,9," 8"," 2"," 7 Speed Automatic","Ferrari",7,"Automatic",0 54 | "Ferrari FF","Ferrari FF 6.3L V12 (Petrol)",3.4e+07,"Coupe",6262," Petrol",NA,145," 661 "," 8000",2,4," 4"," 5"," 7 Speed Automatic","Ferrari",7,"Automatic",0 55 | "Fiat Avventura","Fiat Avventura Active (Petrol)",629187,"SUV",1368," Petrol",NA,205," 89 "," 6000",11.83425,14.4," 4"," 5"," 5 Speed 
Manual","Fiat",5,"Manual",0 56 | "Fiat Linea Classic","Fiat Linea Classic 1.4L (Petrol)",612439,"Sedan",1368," Petrol",NA,185," 88.77 "," 6000",11.4,14.9," 4"," 5"," 5 Speed Manual","Fiat",5,"Manual",0 57 | "Fiat New Linea","Fiat Linea Active 1.4 (Petrol)",700480,"Sedan",1368," Petrol",NA,185," 112.4 "," 5000",11.3,15.7," 4"," 5"," 5 Speed Manual","Fiat",5,"Manual",0 58 | "Fiat Punto Evo","Fiat Punto EVO Active (Petrol)",498577,"Hatchback",1172," Petrol",NA,195," 67.07 "," 6000",11.83425,15.8," 4"," 5"," 5 Speed Manual","Fiat",5,"Manual",0 59 | "Fiat Viaggio","Fiat Viaggio – Expected Specification",6e+05,"Sedan",1368," Petrol",NA,165," 147 "," 5500",11.83425,15.3514948453608,NA,NA," 5 Speed Manual","Fiat",5,"Manual",0 60 | "Force One","Force One EX 7-STR (Diesel)",926805,"SUV",2650," Diesel",NA,205," 80.88 "," 3200",14,17," 4"," 7"," 5 Speed Manual","Force",5,"Manual",1 61 | "Ford Classic","Ford Classic 1.6 Duratec LXi (Petrol)",506000,"Sedan",1596," Petrol",NA,168," 99.6 "," 6500",11.83425,14.09," 4"," 5"," 5 Speed Manual","Ford",5,"Manual",0 62 | "Ford EcoSport","Ford EcoSport Ambiente 1.5 Ti-VCT (Petrol)",675000,"SUV",1499," Petrol",NA,200," 110 "," 6300",11.83425,15.8," 4"," 5"," 5 Speed Manual","Ford",5,"Manual",0 63 | "Ford Endeavour","New Ford Endeavour 2.5L 2WD (Diesel)",2215500,"SUV",2499," Diesel",NA,210," 141.04 "," 3500",8.9,12.67," 4"," 7"," 5 Speed Manual","Ford",5,"Manual",1 64 | "Ford Fiesta","Ford Fiesta 1.5D Ambiente (Diesel)",850000,"Sedan",1498," Diesel",NA,NA," 89.85 "," 3750",20.2,23.5," 4"," 5"," 5 Speed Manual","Ford",5,"Manual",1 65 | "Ford Figo 2014","New Ford Figo LXi 1.2 (Petrol)",413700,"Hatchback",1196," Petrol",NA,168," 70.03 "," 6250",11.83425,15.33," 4"," 5"," 5 Speed Manual","Ford",5,"Manual",0 66 | "Gumpert Apollo","Gumpert Apollo STD (Petrol)",5e+07,"Sport Coupe",4163," Petrol",NA,NA," 650 "," 6500",11.83425,15.3514948453608," 8"," 2"," 6 Speed Automatic","Gumpert",6,"Automatic",0 67 | "Honda Amaze","Honda Amaze 1.2 E MT i-VTEC 
(Petrol)",518900,"Sedan",1198," Petrol",NA,165," 86.79 "," 6000",15,18," 4"," 5"," 5 Speed Manual","Honda",5,"Manual",0 68 | "Honda Brio","Honda Brio E MT (Petrol)",421400,"Hatchback",1198," Petrol",NA,165," 86.79 "," 6000",16.2,19.4," 4"," 5"," 5 Speed Manual","Honda",5,"Manual",0 69 | "Honda City","Honda City VX Opt (Petrol)",1064000,"Sedan",1497," Petrol",NA,165," 117.3 "," 6600",14.1,17.1," 4"," 5"," 5 Speed Manual","Honda",5,"Manual",0 70 | "Honda CR-V 2014","Honda CR-V 2.0L 2WD MT (Petrol)",2190000,"SUV",1997," Petrol",NA,170," 156 "," 6500",10.9,13.7," 4"," 5"," 6 Speed Manual","Honda",6,"Manual",0 71 | "Honda Freed","Honda Freed 1.5 (Petrol)",750000,"MPV",1497," Petrol",NA,165," 116.38 "," 6600",11.83425,16.2," 4",NA," 5 Speed Automatic","Honda",5,"Automatic",0 72 | "Honda Jazz","Honda jazz – Expected Specification",756455,"Hatchback",1198," Petrol",NA,135," 9066 "," 6200",13,16," 4",NA," 5 Speed Manual","Honda",5,"Manual",0 73 | "Honda Mobilio","Honda Mobilio E i-VTEC (Petrol)",674700,"MPV",1497," Petrol",NA,189," 117.37 "," 6600",11.83425,17.3," 4"," 7"," 5 Speed Manual","Honda",5,"Manual",0 74 | "Hyundai Elantra","Hyundai Elantra 1.8 S (Petrol)",1413196,"Sedan",1797," Petrol",NA,167," 147.45 "," 6500",11.83425,16.3," 4"," 5"," 6 Speed Manual","Hyundai",6,"Manual",0 75 | "Hyundai EON","Hyundai Eon D-LITE (Petrol)",302157,"Hatchback",814," Petrol",NA,170," 55.23 "," 5500",11.83425,21.1," 3"," 5"," 5 Speed Manual","Hyundai",5,"Manual",0 76 | "Hyundai Grand i10","Hyundai Grand i10 Era 1.2 Kappa VTVT (Petrol)",463027,"Hatchback",1197," Petrol",NA,165," 81.86 "," 6000",15.9,18.9," 4"," 5"," 5 Speed Manual","Hyundai",5,"Manual",0 77 | "Hyundai i10","Hyundai Next Gen i10 1.1 iRDE2 (Petrol)",418200,"Hatchback",1086," Petrol",NA,165," 68.05 "," 5500",11.83425,19.81," 4"," 5"," 5 Speed Manual","Hyundai",5,"Manual",0 78 | "Hyundai i20","Hyundai Elite i20 1.2 Era (Petrol)",522974,"Hatchback",1197," Petrol",NA,170," 81.86 "," 6000",13.3,18.6," 4"," 5"," 5 Speed 
Manual","Hyundai",5,"Manual",0 79 | "Hyundai i20 Active","Hyundai i20 Active Base 1.2 (Petrol)",638586,"Hatchback",1197," Petrol",NA,190," 81.9 "," 6000",13.5,17.9," 4"," 5"," 5 Speed Manual","Hyundai",5,"Manual",0 80 | "Hyundai New Santa FE","Hyundai New Santa Fe 2WD MT (Diesel)",2684430,"SUV",2199," Diesel",NA,185," 194.3 "," 3800",11.83425,14," 4"," 7"," 6 Speed Manual","Hyundai",6,"Manual",1 81 | "Hyundai Sonata","Hyundai Sonata 2.4 GDi MT (Petrol)",1920584,"Sedan",2359," Petrol",NA,155," 198.25 "," 6300",10,13.08," 4"," 5"," 6 Speed Manual","Hyundai",6,"Manual",0 82 | "Hyundai Xcent","Hyundai Xcent Base 1.2 (Petrol)",496012,"Sedan",1197," Petrol",NA,165," 81.86 "," 6000",15.7,19.1," 4"," 5"," 5 Speed Manual","Hyundai",5,"Manual",0 83 | "Isuzu MU 7","Isuzu MU 7 2WD (Diesel)",2195000,"SUV",2999," Diesel",NA,210," 160.8 "," 3600",9.8,10.3," 4"," 7"," 5 Speed Manual","Isuzu",5,"Manual",1 84 | "Jaguar F-Type","Jaguar F-Type V6 S Convertible (Petrol)",14937004,"Convertible",2995," Petrol",NA,NA," 280 "," 6500",11.83425,15.3514948453608," 6"," 2"," 8 Speed Automatic","Jaguar",8,"Automatic",0 85 | "Jaguar F-Type Coupe","Jaguar F-Type Coupe (Petrol)",12200000,"Coupe",2995," Petrol",NA,NA," 335.34 "," 6500",11.83425,14.7," 6"," 2"," 8 Speed Automatic","Jaguar",8,"Automatic",0 86 | "Jaguar XF","Jaguar XF 2.2 Executive Edition (Diesel)",4512000,"Sedan",2179," Diesel"," 2320",100," 190 "," 3000",11.83425,16.3," 4"," 5"," 8 Speed Automatic","Jaguar",8,"Automatic",1 87 | "Jaguar XJ","Jaguar XJ 2.0L LWB (Petrol)",9375000,"Sedan",1999," Petrol"," 2265",NA," 236.71 "," 5500",5.8,9.4," 4"," 5"," 8 Speed Automatic","Jaguar",8,"Automatic",0 88 | "Lamborghini Huracan","Lamborghini Huracan LP 610-4 (Petrol)",34300000,"Supercar",5204," Petrol",NA,NA," 602 "," 8250",4,6.4," 10"," 2"," 7 Speed Automatic","Lamborghini",7,"Automatic",0 89 | "Land Rover Discovery 4","Land Rover Discovery 4 3.0L TDV6 SE (Diesel)",10800000,"SUV",2993," Diesel",NA,210," 245 "," 4000",7.7,11.8," 6"," 7"," 8 
Speed Automatic","Land",8,"Automatic",1 90 | "Land Rover Freelander 2","Land Rover Freelander 2 SE (Diesel)",3919000,"SUV",2179," Diesel",NA,210," 147 "," 4000",11.83425,12.39," 4"," 5"," 6 Speed Automatic","Land",6,"Automatic",1 91 | "Land Rover Range Rover","Land Rover Range Rover 3.0 V6 Diesel Vogue (Diesel)",18300000,"SUV",2993," Diesel",NA,295.5," 335 "," 4000",11.83425,11.6," 6"," 7"," 8 Speed Automatic","Land",8,"Automatic",1 92 | "Land Rover Range Rover Evoque","Land Rover Range Rover Evoque Pure SD4 (Diesel)",5499000,"SUV",2179," Diesel",NA,227," 190 "," 3500",11.83425,13.32," 4"," 5"," 6 Speed Automatic","Land",6,"Automatic",1 93 | "Land Rover Range Rover Sport","Land Rover Range Rover Sport SE (Diesel)",12504562,"SUV",2993," Diesel",NA,172," 242 "," 4000",11.83425,14.07," 6"," 5"," 8 Speed Automatic","Land",8,"Automatic",1 94 | "Mahindra Bolero","Mahindra Bolero SLE (Diesel)",731476,"SUV",2523," Diesel",NA,180," 63 "," 3200",11.83425,15.96," 4"," 7"," 5 Speed Manual","Mahindra",5,"Manual",1 95 | "Mahindra e2o","Mahindra e2o T2",592649,"Hatchback",3,NA,NA,180," 25.5 "," 3750",11.83425,15.3514948453608,NA," 4",NA,"Mahindra",4,"Manual",0 96 | "Mahindra Quanto","Mahindra Quanto C2 (Diesel)",641670,"SUV",1493," Diesel",NA,180," 100 "," 3750",11.83425,17.1," 3"," 7"," 5 Speed Manual","Mahindra",5,"Manual",1 97 | "Mahindra Scorpio","Mahindra Scorpio S2 (Diesel)",869253,"SUV",2523," Diesel"," 2510",180," 75 "," 3200",12,14," 4"," 7 & 9"," 5 Speed Manual","Mahindra",5,"Manual",1 98 | "Mahindra Thar","Mahindra Thar DI 2WD (Diesel)",475817,"SUV",2523," Diesel",NA,187," 63 "," 3800",11.83425,13," 4"," 7"," 5 Speed Manual","Mahindra",5,"Manual",1 99 | "Mahindra Verito","Mahindra Verito 1.5 D2 BS-III (Diesel)",673293,"Sedan",1461," Diesel"," 1630",172," 65 "," 4000",11.83425,21.1," 4"," 5"," 5 Speed Manual","Mahindra",5,"Manual",1 100 | "Mahindra Verito Vibe","Mahindra Verito Vibe (CS) 1.5 D2 (Diesel)",600962,"Hatchback",1461," Diesel"," 1650",172," 64 "," 
4000",11.83425,20.8," 4"," 5"," 5 Speed Manual","Mahindra",5,"Manual",1 101 | "Mahindra XUV500","Mahindra XUV500 W4 (Diesel)",1120668,"SUV",2179," Diesel"," 2450",160," 140 "," 3750",12,15.1," 4"," 7"," 6 Speed Manual","Mahindra",6,"Manual",1 102 | "Mahindra Xylo","Mahindra Xylo D2 BS-III (Diesel)",807513,"SUV",2489," Diesel",NA,186," 95 "," 3600",11.83425,13.9," 4"," 7 & 8"," 5 Speed Manual","Mahindra",5,"Manual",1 103 | "Maruti Suzuki Alto","Maruti Suzuki New Alto K10 LX (Petrol)",315218,"Hatchback",998," Petrol"," 1210",160," 67.1 "," 6000",19,24.07," 3"," 5"," 5 Speed Manual","Maruti",5,"Manual",0 104 | "Maruti Suzuki Alto 800","Maruti Suzuki Alto 800 Std (Petrol)",247936,"Hatchback",796," Petrol"," 1185",160," 47.3 "," 6000",17,22.74," 3"," 5"," 5 Speed Manual","Maruti",5,"Manual",0 105 | "Maruti Suzuki Celerio","Maruti Suzuki Celerio LXi MT (Petrol)",392017,"Hatchback",998," Petrol"," 1250",165," 67.07 "," 6000",19,23.1," 3"," 5"," 5 Speed Manual","Maruti",5,"Manual",0 106 | "Maruti Suzuki Ciaz","Maruti Suzuki Ciaz VXi (Petrol)",725458,"Sedan",1373," Petrol"," 1490",170," 91.2 "," 6000",16.08,20.73," 4"," 5"," 5 Speed Manual","Maruti",5,"Manual",0 107 | "Maruti Suzuki EECO","Maruti Suzuki Eeco 5 STR (Petrol)",311222,"MUV",1196," Petrol",NA,160," 73 "," 6000",11.8,15.1," 4"," 5"," 5 Speed Manual","Maruti",5,"Manual",0 108 | "Maruti Suzuki Ertiga","Maruti Suzuki Ertiga LXi (Petrol)",602995,"MUV",1373," Petrol"," 1760",185," 93.7 "," 6000",12.3,16.02," 4"," 7"," 5 Speed Manual","Maruti",5,"Manual",0 109 | "Maruti Suzuki Estilo","Maruti Suzuki Estilo LX BS-IV (Petrol)",337832,"Hatchback",998," Petrol",NA,165," 67 "," 6200",15,19," 3"," 5"," 5 Speed Manual","Maruti",5,"Manual",0 110 | "Maruti Suzuki Grand Vitara","Maruti Suzuki Grand Vitara 2.4 MT (Petrol)",2139904,"SUV",2393," Petrol"," 2100",200," 163.5 "," 6000",7.2,10.4," 4"," 5"," 5 Speed Manual","Maruti",5,"Manual",0 111 | "Maruti Suzuki Gypsy King","Maruti Suzuki Gypsy King ST BS-IV 
(Petrol)",600400,"SUV",1298," Petrol",NA,210," 80 "," 6000",11.83425,11.96," 4"," 8"," 5 Speed Manual","Maruti",5,"Manual",0 112 | "Maruti Suzuki Kizashi","Maruti Suzuki Kizashi MT (Petrol)",1720172,"Sedan",2393," Petrol",NA,155," 175 "," 6500",9.2,12.45," 4"," 5"," 6 Speed Manual","Maruti",6,"Manual",0 113 | "Maruti Suzuki Omni","Maruti Suzuki Omni 5 STR BS-IV (Petrol)",251591,"MUV",796," Petrol",NA,165," 33 "," 5000",11.83425,14.07," 3"," 5"," 4 Speed Manual","Maruti",4,"Manual",0 114 | "Maruti Suzuki Ritz","Maruti Suzuki Ritz Lxi BS-IV (Petrol)",442275,"Hatchback",1248," Petrol",NA,170," 86 "," 6000",14.7,18.5," 4"," 5"," 5 Speed Manual","Maruti",5,"Manual",0 115 | "Maruti Suzuki Swift","Maruti Suzuki Swift LXi (Petrol)",461883,"Hatchback",1197," Petrol"," 1415",170," 83.15 "," 6000",15.6,20.4," 4"," 5"," 5 Speed Manual","Maruti",5,"Manual",0 116 | "Maruti Suzuki Swift DZire","Maruti Suzuki Swift DZire LXI (Petrol)",507883,"Sedan",1197," Petrol"," 1415",170," 85.8 "," 6000",16.3,19.1," 4"," 5"," 5 Speed Manual","Maruti",5,"Manual",0 117 | "Maruti Suzuki SX4","Maruti Suzuki SX4 Vxi (Petrol)",715138,"Sedan",1586," Petrol",NA,170," 103 "," 5600",12,16.51," 4"," 5"," 5 Speed Manual","Maruti",5,"Manual",0 118 | "Maruti Suzuki Wagon-R","Maruti Wagon R LX (Petrol)",362917,"Hatchback",998," Petrol"," 1350",165," 67.07 "," 6200",17.08,20.5," 3"," 5"," 5 Speed Manual","Maruti",5,"Manual",0 119 | "Maruti Suzuki WagonR Stingray","Maruti Suzuki WagonR Stingray LXI (Petrol)",417033,"Hatchback",998," Petrol"," 1350",165," 67.7 "," 6200",17.08,20.51," 3"," 5"," 5 Speed Manual","Maruti",5,"Manual",0 120 | "Mercedes-Benz A-Class","Mercedes-Benz A-Class A180 CDI (Diesel)",2329000,"SUV",2143," Diesel"," 2000",160," 107.28 "," 3200-4400",17.9,20.6," 4"," 5"," 7 Speed Automatic","Mercedes-Benz",7,"Automatic",1 121 | "Mercedes-Benz B-class","Mercedes-Benz B-Class B180 Sport (Petrol)",2795000,"SUV",1595," Petrol"," 1950",134," 122 "," 5000",7.9,11.9," 4"," 5"," 7 Speed 
Automatic","Mercedes-Benz",7,"Automatic",0 122 | "Mercedes Benz C Class","Mercedes-Benz C-Class C 220 CDI Style (Diesel)",3990000,"Sedan",2143," Diesel"," 2135",NA," 168 "," 3000 – 4200",11.83425,19.27," 4"," 5"," 7 Speed Automatic","Mercedes",7,"Automatic",1 123 | "Mercedes-Benz CLA","Mercedes-Benz CLA 200 CDI Style (Diesel)",3150000,"Sedan",1991," Diesel"," 2005",NA," 135 "," 3600 4400",13,17.9," 4"," 5"," 7 Speed Automatic","Mercedes-Benz",7,"Automatic",1 124 | "Mercedes-Benz CLA 45 AMG","Mercedes-Benz CLA 45 AMG (Petrol)",6850000,"Saloon Sedan",1991," Petrol",NA,NA," 355.37 "," 6000",11.83425,15.3514948453608,NA,NA," 7 Speed Automatic","Mercedes-Benz",7,"Automatic",0 125 | "Mercedes-Benz CLS","Mercedes-Benz CLS-Class 250 CDI",7650000,"Coupe",2143," Diesel"," 2270",118," 204 "," 380",6.13,9.26," 4"," 4"," 7 Speed Automatic","Mercedes-Benz",7,"Automatic",1 126 | "Mercedes-Benz E-Class","Mercedes-Benz E-Class E 200 CGI (Petrol)",4850000,"Sedan",1991," Petrol"," 2200",120," 246 "," 5500",8,12," 4"," 5"," 7 Speed Automatic","Mercedes-Benz",7,"Automatic",0 127 | "Mercedes-Benz G-Class","Mercedes-Benz G-Class G 63 AMG (Petrol)",13977500,"SUV",5461," Petrol",NA,205," 536 "," 5500",11.83425,9.5," 8"," 5"," 7 Speed Automatic","Mercedes-Benz",7,"Automatic",0 128 | "Mercedes-Benz GL","Mercedes-Benz GL 350 CDI (Diesel)",7758000,"SUV",2987," Diesel"," 3250",201," 258 "," 3600",8,12," 6"," 7"," 7 Speed Automatic","Mercedes-Benz",7,"Automatic",1 129 | "Mercedes-Benz GLA","Mercedes-Benz GLA 200 CDI Style (Diesel)",3275000,"SUV",2143," Diesel"," 2115",183," 134.10 "," 3600 - 4400",13.4,17.7," 4"," 5"," 7 Speed Automatic","Mercedes-Benz",7,"Automatic",1 130 | "Mercedes-Benz M-Class","Mercedes-Benz M-Class ML 250 CDI BlueEfficiency (Diesel)",5490000,"SUV",2143," Diesel"," 2950",223," 203.2 "," 4200",11.83425,14," 4"," 5"," 7 Speed Automatic","Mercedes-Benz",7,"Automatic",1 131 | "Mercedes-Benz R-Class","Mercedes-Benz R-Class R350 4MATIC (Petrol)",7168000,"SUV",3496," 
Diesel",NA,201," 272 "," 6000",11.83425,8.4," 6"," 7"," 7 Speed Automatic","Mercedes-Benz",7,"Automatic",1 132 | "Mercedes-Benz S-Class","Mercedes Benz S Class S 350 CDI (Diesel)",10700000,"Sedan",2987," Diesel"," 2580",146," 254.79 "," 3600",9,13.5," 6"," 5"," 7 Speed Automatic","Mercedes-Benz",7,"Automatic",1 133 | "Mercedes-Benz SL","Mercedes-Benz SL 350 (Petrol)",9850000,"Sedan",3498," Petrol",NA,135," 316 "," 6500",11.83425,8.1," 6"," 2"," 7 Speed Automatic","Mercedes-Benz",7,"Automatic",0 134 | "Mercedes-Benz SLK-Class","Mercedes-Benz SLK-Class SLK 350 Blue Efficiency (Petrol)",8378000,"Sedan",3498," Petrol"," 1855",135," 306 "," 6500",6.5,11.11," 6"," 2"," 7 Speed Automatic","Mercedes-Benz",7,"Automatic",0 135 | "Mercedes-Benz SLS","Mercedes-Benz SLS AMG Coupe (Petrol)",25200000,"Sedan",6208," Petrol"," 1935",135," 420 "," 6800",11.83425,10.7," 8"," 2"," 7 Speed Automatic","Mercedes-Benz",7,"Automatic",0 136 | "Mini Cooper","Mini Cooper D 3 Door (Diesel)",3185000,"Hatchback",1496," Diesel",NA,NA," 114 "," 4000",19.03,21.15," 3"," 4"," 6 Speed Automatic","Mini",6,"Automatic",1 137 | "Mini Cooper Convertible","Mini Cooper Convertible 1.6 (Petrol)",3320000,"Hatchback",1598," Petrol",NA,135," 122 "," 6000",12.3,15.5," 2"," 4"," 6 Speed Automatic","Mini",6,"Automatic",0 138 | "Mini Cooper S","Mini Cooper S 1.6 (Petrol)",3420000,"Hatchback",1598," Petrol",NA,135," 184 "," 5500",9,15.62," 4"," 4"," 6 Speed Automatic","Mini",6,"Automatic",0 139 | "Mitsubishi Pajero Sport","Mitsubishi Pajero Sport 2.5 MT",2350000,"SUV",2477," Diesel"," 2710",215," 178 "," 4000",11.83425,13.5," 4"," 7"," 6 Speed Manual","Mitsubishi",6,"Manual",1 140 | "New 4S Fluidic Hyundai Verna","Hyundai 1.4L Gamma VTVT Base 4S Fluidic Verna (Petrol)",773903,"Sedan",1396," Petrol",NA,165," 105.5 "," 6300",14.21,17.43," 4"," 5"," 5 Speed Manual","New",5,"Manual",0 141 | "New Chevrolet Beat","New Chevrolet Beat PS (Petrol)",420923,"Hatchback",1199," Petrol",NA,165," 79.4 "," 6200",11.83425,18.6," 
4"," 5"," 5 Speed Manual","New",5,"Manual",0 142 | "New Skoda Superb","New Skoda Superb Ambition 1.8 TSI MT (Petrol)",1862276,"Sedan",1798," Petrol"," 2074",159," 157.81 "," 4500-6200",10.7,13.1," 4"," 5"," 6 Speed Manual","New",6,"Manual",0 143 | "Nissan Datsun GO","Datsun GO D (Petrol)",312270,"Hatchback",1198," Petrol",NA,170," 67.07 "," 5000",17,20.6," 3"," 5"," 5 Speed Manual","Nissan",5,"Manual",0 144 | "Nissan Evalia","Nissan Evalia XE (Diesel)",849999,"MPV",1461," Diesel"," 2000",180," 84.8 "," 3750",11.83425,19.3," 4"," 7"," 5 Speed Manual","Nissan",5,"Manual",1 145 | "Nissan Micra","Nissan Micra Active XL (Petrol)",412819,"Hatchback",1198," Petrol",NA,154," 67 "," 5000",16.24,19.49," 3"," 5"," 5 Speed Manual","Nissan",5,"Manual",0 146 | "Nissan Qashqai","Nissan Qashqai – Expected Specification",2190000,"SUV",1598," Petrol",NA,NA," 116 "," 6000",10,15," 4"," 7"," 5 Speed Automatic","Nissan",5,"Automatic",0 147 | "Nissan Sunny","Nissan New Sunny XE (Petrol)",699000,"Sedan",1498," Petrol",NA,165," 97.6 "," 6000",13.45,16.95," 4"," 5"," 5 Speed Manual","Nissan",5,"Manual",0 148 | "Nissan Teana","Nissan Teana 250 XL (Petrol)",2114086,"Sedan",2496," Petrol"," 2030",145," 179.5 "," 6000",8.03,11.7," 6"," 5"," Automatic","Nissan",5,"Automatic",0 149 | "Nissan Terrano","Nissan Terrano XE (Diesel)",962875,"SUV",1461," Diesel"," 1764",205," 84 "," 3750",17.1,20.5," 4"," 5"," 5 Speed Manual","Nissan",5,"Manual",1 150 | "Porsche 911","Porsche 911 Targa 4 (Petrol)",15900000,"Coupe",3436," Petrol"," 1945",NA," 350 "," 7400",6.9,8.7," 6"," 5"," 7 Speed Automatic","Porsche",7,"Automatic",0 151 | "Porsche Boxster","Porsche Boxster S (Petrol)",9202000,"Supercar",3436," Petrol"," 1665",NA," 315 "," 6700",8.19,14.49," 6"," 2"," 6 Speed Manual","Porsche",6,"Manual",0 152 | "Porsche Cayenne","Porsche Cayenne Diesel",10400000,"SUV",2967," Diesel"," 2870",215," 245 ","3800-4400",12.82,16.12," 6"," 5"," 8 Speed Automatic","Porsche",8,"Automatic",1 153 | "Porsche Cayman","Porsche 
Cayman S",9450000,"Coupe",3436," Petrol"," 1665",NA," 325 "," 7400",8.19,14.9," 6"," 2"," 6 Speed Manual","Porsche",6,"Manual",0 154 | "Renault Duster","Renault Duster RxE (Petrol)",830009,"SUV",1598," Petrol"," 1755",205," 103.8 "," 5850",10.5,13.05," 4"," 5"," 5 Speed Manual","Renault",5,"Manual",0 155 | "Renault Fluence","Renault Fluence 1.5 E2 (Diesel)",1399000,"Sedan",1461," Diesel",NA,168," 108.4 "," 4000",17.2,20.4," 4"," 5"," 6 Speed Manual","Renault",6,"Manual",1 156 | "Renault Lodgy","Renault Lodgy 85PS STD",819000,"MPV",1461," Diesel",NA,174," 83.8 "," 4000",16.8,21.04," 4"," 8"," 5 Speed Manual","Renault",5,"Manual",1 157 | "Renault Pulse","Renault Pulse RxE (Petrol)",446100,"Hatchback",1198," Petrol",NA,154," 74 "," 6000",15,18," 4"," 5"," 5 Speed Manual","Renault",5,"Manual",0 158 | "Renault Scala","Renault Scala RxE (Petrol)",724500,"Sedan",1498," Petrol",NA,161," 98 "," 6000",13.5,16.95," 4"," 5"," 5 Speed Manual","Renault",5,"Manual",0 159 | "Rolls Royce Ghost V Specification","Rolls Royce Ghost V Specification (Petrol)",46600000,"Sedan",6592," Petrol",NA,170," 593 "," 5250",11.83425,10.2," 12",NA," 8 Speed Automatic","Rolls",8,"Automatic",0 160 | "Rolls-Royce Wraith","Rolls Royce Wraith Coupe (Petrol)",4.6e+07,"Coupe",6592," Petrol",NA,NA," 624 "," 5600",11.83425,9,NA,NA," 8 Speed Automatic","Rolls-Royce",8,"Automatic",0 161 | "San Motors Storm","San Motors Storm 1.2 (Petrol)",595000,"Sedan",1149," Petrol"," 1510",135," 59 "," 5250",11.9,16," 4"," 2"," 5 Speed Manual","San",5,"Manual",0 162 | "Skoda D Rapid","Skoda D Rapid – 1.6 TDI CR/77 kW",850608,"Sedan",1598," Diesel",NA,168," 77 "," 4400",11.83425,20.5,NA,NA," 5 Speed Manual","Skoda",5,"Manual",1 163 | "Skoda Fabia","Skoda Fabia – 1.2 MPI/55 kW (Petrol)",502768,"Hatchback",1198," Petrol"," 1550",158," 75 "," 5400",11.83425,16.4," 3"," 5"," 5 Speed Manual","Skoda",5,"Manual",0 164 | "Skoda New Octavia","Skoda New Octavia Active 1.4 TSI (Petrol)",1395000,"Sedan",1395," Petrol"," 1820",155," 
138.8 "," 4500-6000",11.83425,16.8," 4"," 5"," 6 Speed Manual","Skoda",6,"Manual",0 165 | "Skoda Rapid","Skoda Rapid 1.6 MPI Active (Petrol)",755500,"Sedan",1598," Petrol"," 1674",168," 103.5 "," 5250",12,15," 4"," 5"," 5 Speed Manual","Skoda",5,"Manual",0 166 | "Skoda Yeti Facelift","Skoda Yeti Ambition 2.0 TDi CR 2WD (Diesel)",2010000,"SUV",1968," Diesel"," 1960",180," 108.5 "," 4200",14.3,17.72," 4"," 5"," 5 Speed Manual","Skoda",5,"Manual",1 167 | "Ssangyong Korando","Ssangyong Korando – Expected Specification",1615000,"SUV",1998," Diesel",NA,NA," 174 "," 4000",11.83425,7.5,NA,NA," 6 Speed Manual","Ssangyong",6,"Manual",1 168 | "Ssangyong Rexton","SsangYong Rexton W RX 5 MT (Diesel)",1987337,"SUV",2696," Diesel"," 2760",181," 162 "," 4000",11.4,12.4," 5"," 7"," 5 Speed Manual","Ssangyong",5,"Manual",1 169 | "Tata Aria Facelift","Tata Aria Pure LX 2WD (Diesel)",1043434,"SUV",2179," Diesel"," 2850",200," 147.95 "," 4000",12.08,15.05," 4"," 7"," 5 Speed Manual","Tata",5,"Manual",1 170 | "Tata Bolt","Tata Bolt Revotron 1.2T XE (Petrol)",444993,"Hatchback",1198," Petrol",NA,165," 89 "," 5000",11.83425,15.3514948453608," 4"," 5"," 5 Speed Manual","Tata",5,"Manual",0 171 | "Tata Indica eV2","Tata Indica eV2 emax GLS (CNG)",403555,"Hatchback",1193," CNG",NA,165," 55.2 "," 5200",11.83425,23.7," 4"," 5"," 5 Speed Manual","Tata",5,"Manual",2 172 | "Tata Indica Vista","Tata Indica Vista LS TDI BS-III (Diesel)",478000,"Hatchback",1405," Diesel",NA,165," 70 "," 4500",11.83425,19.1," 4"," 5"," 5 Speed Manual","Tata",5,"Manual",1 173 | "Tata Indigo eCS","Tata Indigo eCS GLX (Petrol)",499000,"Sedan",1193," Petrol",NA,165," 65 "," 5000",11.83425,14," 4"," 5"," 5 Speed Manual","Tata",5,"Manual",0 174 | "Tata Manza","Tata Manza LS (Diesel)",616357,"Sedan",1248," Diesel"," 1650-1660",165," 88 "," 4000",11.83425,21.2," 4"," 5"," 5 Speed Manual","Tata",5,"Manual",1 175 | "Tata Movus","Tata Movus CX 7 STR Side Facing-BS4",698272,"MUV",2179," Diesel"," 2535",180," 118.36 "," 
4000",11.38,15.16," 4"," 7"," 5 Speed Manual","Tata",5,"Manual",1 176 | "Tata Nano","Tata GenX Nano XE",199000,"Hatchback",624," Petrol",NA,180," 37.5 "," 5250 - 5750",20.4,23.9," 2"," 4"," 4 Speed Manual","Tata",4,"Manual",0 177 | "Tata Safari DICOR","Tata Safari 2WD LX DICOR BS-III (Diesel)",825622,"SUV",2179," Diesel",NA,205," 140 "," 4000",11.83425,13.93," 4"," 7"," 5 Speed Manual","Tata",5,"Manual",1 178 | "Tata Safari Storme","Tata Safari Storme LX 2WD (Diesel)",999000,"SUV",2179," Diesel"," 2555",200," 138 "," 4000",10.8,14.1," 4"," 7"," 5 Speed Manual","Tata",5,"Manual",1 179 | "Tata Sumo Gold","Tata Sumo Gold CX BS III (Diesel)",622730,"SUV",2956," Diesel",NA,190," 69 "," 3000",11.83425,14.7," 4"," 7 & 9"," 5 Speed Manual","Tata",5,"Manual",1 180 | "Tata Sumo Grande","Tata Sumo Grande MK II LX BS-IV (Diesel)",762204,"MUV",2179," Diesel"," 2625",180," 118 "," 4000",11.83425,13.55," 4"," 7 & 8"," 5 Speed Manual","Tata",5,"Manual",1 181 | "Tata Venture","Tata Venture LX (Diesel)",456715,"Van",1405," Diesel",NA,160," 71 "," 4500",11.83425,15.42," 4"," 8"," 6 Speed Manual","Tata",6,"Manual",1 182 | "Tata Winger","Tata Winger Platinum BS-III (Diesel)",760000,"Van",1948,NA,NA,165," 91 "," 4300",11.83425,15.3514948453608,NA,NA,NA,"Tata",4,"Manual",0 183 | "Tata Xenon","Tata Xenon XT EX 2WD (Diesel)",1000850,"SUV",2179," Diesel",NA,200," 140 "," 4000",11.83425,13.49," 4"," 6"," 5 Speed Manual","Tata",5,"Manual",1 184 | "Tata Zest","Tata Zest XE (Petrol)",481191,"Sedan",1193," Petrol",NA,175," 88.8 "," 5000",13.2,17.6," 4"," 5"," 5 Speed Manual","Tata",5,"Manual",0 185 | "Toyota Corolla Altis 2014","New Toyota Corolla Altis JS (Petrol)",1306324,"Sedan",1798," Petrol"," 1640 - 1670",175," 138.08 "," 6400",9.5,14.53," 4"," 5"," 6 Speed Manual","Toyota",6,"Manual",0 186 | "Toyota Etios","Toyota Etios J PS (Petrol)",603081,"Sedan",1496," Petrol",NA,170," 88.77 "," 5600",13.5,16.78," 4"," 5"," 5 Speed Manual","Toyota",5,"Manual",0 187 | "Toyota Etios Cross","Toyota Etios 
Cross 1.2 G (Petrol)",623000,"SUV",1197," Petrol",NA,170," 78.90 "," 5600`",13.3,17.71," 4"," 5"," 5 Speed Manual","Toyota",5,"Manual",0 188 | "Toyota Etios Liva","Toyota Etios Liva J PS (Petrol)",500145,"Hatchback",1197," Petrol",NA,170," 78.87 "," 5600",15.1,17.71," 4"," 5"," 5 Speed Manual","Toyota",5,"Manual",0 189 | "Toyota Etios Xclusive","Toyota Etios Xclusive (Petrol)",598139,"Sedan",1496," Petrol",NA,174," 88 "," 5600",13.5,16.78,NA," 5"," 5 Speed Manual","Toyota",5,"Manual",0 190 | "Toyota Fortuner","Toyota Fortuner 2WD Manual (Diesel)",2417200,"SUV",2982," Diesel"," 2510",220," 169 "," 3600",11.83425,12.55," 4"," 7"," 5 Speed Manual","Toyota",5,"Manual",1 191 | "Volkswagen Cross Polo","Volkswagen Cross Polo 1.2 MPI (Petrol)",699000,"Hatchback",1198," Petrol"," 1520",165," 74 "," 5400",12.2,16.5," 3"," 5"," 5 Speed Manual","Volkswagen",5,"Manual",0 192 | "Volkswagen New JETTA","Volkswagen Jetta 1.4L TSI MT Trendline (Petrol)",1370000,"Sedan",1390," Petrol",NA,159," 120 "," 5000",11.83425,14.69," 4"," 5"," 6 Speed Manual","Volkswagen",6,"Manual",0 193 | "Volkswagen New Phaeton","Volkswagen New Phaeton 3.6L (Petrol)",7729000,"Sedan",3597," Petrol",NA,128," 238 "," 4000",1.2,12.8," 6"," 5"," 6 Speed Automatic","Volkswagen",6,"Automatic",0 194 | "Volkswagen Passat","Volkswagen Passat Trendline MT (Diesel)",2206000,"Sedan",1968," Diesel"," 2180",150," 168 "," 4200",15.16,18.78," 4"," 5"," 6 Speed Manual","Volkswagen",6,"Manual",1 195 | "Volkswagen Polo","Volkswagen Polo 1.2L Trendline (Petrol)",535400,"Hatchback",1198," Petrol"," 1520",165," 73.97 "," 5400",11.83425,16.47," 3"," 5"," 5 Speed Manual","Volkswagen",5,"Manual",0 196 | "Volkswagen Polo GT","Volkswagen Polo GT TDI (Diesel)",799800,"Hatchback",1598," Diesel",NA,168," 104.5 "," 4400",13.7,19.7," 4"," 5"," 5 Speed Manual","Volkswagen",5,"Manual",1 197 | "Volkswagen Taigun","Volkswagen Taigun -Expected Specification",1178000,"SUV",1198," Petrol",NA,167," 108 "," 5000",11.83425,21," 3",NA," 6 Speed 
Manual","Volkswagen",6,"Manual",0 198 | "Volkswagen Up","Volkswagen Up – Expected Specification",360000,"Hatchback",1199," Petrol",NA,170," 74 "," 4200",11.83425,21," 3"," 5"," 5 Speed Manual","Volkswagen",5,"Manual",0 199 | "Volkswagen Vento","Volkswagen Vento Trendline (Petrol)",785000,"Sedan",1598," Petrol"," 1680",163," 103.56 "," 5250",12.02,16.09," 4"," 5"," 5 Speed Manual","Volkswagen",5,"Manual",0 200 | "Volvo S60","Volvo S60 Kinetic D4 (Diesel)",3190000,"Sedan",1984," Diesel",NA,136," 163 "," 3500",11.83425,18.5," 5"," 5"," 6 Speed Automatic","Volvo",6,"Automatic",1 201 | "Volvo S80","Volvo S80 Summum D4 (Diesel)",4135000,"Sedan",1984," Diesel"," 2190",151," 163 "," 3500",11.83425,16.66," 5"," 5"," 6 Speed Automatic","Volvo",6,"Automatic",1 202 | "Volvo V40","Volvo V40 D3 Kinetic (Diesel)",2475000,"Hatchback",1969," Diesel"," 1980",145," 150 "," 3750",23.6,27.8," 4"," 5"," 6 Speed Automatic","Volvo",6,"Automatic",1 203 | "Volvo V40 Cross Country","Volvo V40 D3 Cross Country (Diesel)",3180000,"Hatchback",1984," Diesel",NA,145," 150 "," 3500",11.83425,16.8," 5"," 5"," 6 Speed Automatic","Volvo",6,"Automatic",1 204 | "Volvo XC90","Volvo XC 90 D5 Momentum(Diesel)",6490000,"SUV",1969," Diesel",NA,238," 225 "," 3900",8.4,11.1," 4"," 7"," 8 Speed Automatic","Volvo",8,"Automatic",1 205 | -------------------------------------------------------------------------------- /cheatsheets/data-wrangling-cheatsheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/cheatsheets/data-wrangling-cheatsheet.pdf -------------------------------------------------------------------------------- /cheatsheets/ggplot2-cheatsheet-2.0.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/cheatsheets/ggplot2-cheatsheet-2.0.pdf -------------------------------------------------------------------------------- /curriculum.md: -------------------------------------------------------------------------------- 1 | # Curriculum 2 | 3 | The broad curriculum elements for the workshop are listed below. We will showcase **some of them** in this workshop. 4 | 5 | 1. **Introduction** - “I think, therefore I am” 6 | - What is data analysis? 7 | - What type of questions can be answered? 8 | - Frame/Acquire/Refine/Explore/Model/Insight framework 9 | 10 | 2. **Acquire** - "Data is the new oil" 11 | - Sources of Data - Downloaded from an internal system, Obtained from client, or other 3rd party, Extracted from a web-based API, Scraped from a website / pdfs, or Gathered manually and recorded 12 | - Acquire data from a csv file or a database 13 | - Acquire data from a 3rd party client (e.g. Twitter) 14 | 15 | 3. **Refine** - "Data is messy" 16 | - Concept of Tidy Data - Why is it important? 17 | - Missing e.g. Check for missing or incomplete data 18 | - Quality e.g. Check for duplicates, accuracy, unusual data 19 | - Parse e.g. extract year from date 20 | - Merge e.g. first name and surname for full name 21 | - Convert e.g. free text to coded value 22 | - Derive e.g. gender from title 23 | - Calculate e.g. percentages, proportion 24 | - Remove e.g. remove redundant data 25 | - Aggregate e.g. rollup by year, cluster by area 26 | - Filter e.g. exclude based on location 27 | - Sample e.g. extract a representative sample 28 | - Summary e.g. show summary stats like mean 29 | - Basic statistics: variance, standard deviation, covariance, correlation 30 | 31 | 4. **Explore** - "I don't know what I don't know" 32 | - Why do visual exploration?
33 | - Understand Data Structure & Types 34 | - Explore single variable graphs - Quantitative, Categorical 35 | - Explore dual variable graphs - Q & Q, Q & C, C & C 36 | - Explore multi-dimensional variable graphs 37 | 38 | 5. **Model** - "All models are wrong, some of them are useful" 39 | - Introduction to Machine Learning 40 | - The power and limits of models 41 | - Tradeoff between Prediction Accuracy and Model Interpretability 42 | - Assessing Model Accuracy 43 | - For Regression problems - RMSE 44 | - For classification problems - Precision, Recall, AUC/ROC, F-Score, Misclassification rate 45 | - Bias-Variance tradeoff 46 | - Overfitting 47 | - Linear Regression 48 | - Logistic Regression 49 | - Classification model 50 | - Decision Trees 51 | - Visualizing decision trees 52 | 53 | 6. **Insight** - “The goal is to turn data into insight” 54 | - Why do we need to communicate insight? 55 | - Types of communication - Exploration vs. Explanation 56 | - Explanation: Telling a story with data 57 | - Exploration: Building an interface for people to find stories 58 | -------------------------------------------------------------------------------- /img/ISLR.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/ISLR.jpeg -------------------------------------------------------------------------------- /img/acquire.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/acquire.jpg -------------------------------------------------------------------------------- /img/amit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/amit.png
-------------------------------------------------------------------------------- /img/approach.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/approach.jpg -------------------------------------------------------------------------------- /img/art.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/art.jpeg -------------------------------------------------------------------------------- /img/bank.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/bank.jpg -------------------------------------------------------------------------------- /img/bargava.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/bargava.jpg -------------------------------------------------------------------------------- /img/book.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/book.png -------------------------------------------------------------------------------- /img/books.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/books.jpg -------------------------------------------------------------------------------- /img/break.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/break.jpg -------------------------------------------------------------------------------- /img/cars.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/cars.jpg -------------------------------------------------------------------------------- /img/clay.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/clay.jpeg -------------------------------------------------------------------------------- /img/confusion-matrix.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/confusion-matrix.jpg -------------------------------------------------------------------------------- /img/craft.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/craft.jpeg -------------------------------------------------------------------------------- /img/data_analysis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/data_analysis.png -------------------------------------------------------------------------------- /img/datascienceinR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/datascienceinR.png 
-------------------------------------------------------------------------------- /img/diamond-clarity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/diamond-clarity.png -------------------------------------------------------------------------------- /img/diamond-colors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/diamond-colors.png -------------------------------------------------------------------------------- /img/diamonds.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/diamonds.jpg -------------------------------------------------------------------------------- /img/estimating_coefficients.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/estimating_coefficients.png -------------------------------------------------------------------------------- /img/explore.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/explore.jpg -------------------------------------------------------------------------------- /img/frame.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/frame.jpg -------------------------------------------------------------------------------- /img/glass.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/glass.jpg -------------------------------------------------------------------------------- /img/harddisk.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/harddisk.jpg -------------------------------------------------------------------------------- /img/hari.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/hari.jpg -------------------------------------------------------------------------------- /img/hasgeek.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/hasgeek.png -------------------------------------------------------------------------------- /img/insight.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/insight.jpg -------------------------------------------------------------------------------- /img/kaggle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/kaggle.png -------------------------------------------------------------------------------- /img/kaggle_short.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/kaggle_short.png -------------------------------------------------------------------------------- /img/lens.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/lens.jpeg -------------------------------------------------------------------------------- /img/list.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/list.jpg -------------------------------------------------------------------------------- /img/model.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/model.jpg -------------------------------------------------------------------------------- /img/nischal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/nischal.jpg -------------------------------------------------------------------------------- /img/numbers.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/numbers.jpg -------------------------------------------------------------------------------- /img/onion-image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/onion-image.jpg 
-------------------------------------------------------------------------------- /img/onion.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/onion.jpg -------------------------------------------------------------------------------- /img/onion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/onion.png -------------------------------------------------------------------------------- /img/onion_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/onion_small.png -------------------------------------------------------------------------------- /img/onion_tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/onion_tables.png -------------------------------------------------------------------------------- /img/overview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/overview.jpg -------------------------------------------------------------------------------- /img/pair.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/pair.jpg -------------------------------------------------------------------------------- /img/peeling_the_onion_small.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/peeling_the_onion_small.png -------------------------------------------------------------------------------- /img/postit.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/postit.jpg -------------------------------------------------------------------------------- /img/problems.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/problems.png -------------------------------------------------------------------------------- /img/r2.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/r2.gif -------------------------------------------------------------------------------- /img/r_squared.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/r_squared.png -------------------------------------------------------------------------------- /img/raghottam.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/raghottam.jpg -------------------------------------------------------------------------------- /img/refine.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/refine.jpg -------------------------------------------------------------------------------- /img/retail.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/retail.jpg -------------------------------------------------------------------------------- /img/science.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/science.jpeg -------------------------------------------------------------------------------- /img/see.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/see.jpeg -------------------------------------------------------------------------------- /img/shrayas.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/shrayas.jpg -------------------------------------------------------------------------------- /img/single.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/single.jpeg -------------------------------------------------------------------------------- /img/skills.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/skills.png 
-------------------------------------------------------------------------------- /img/slope_intercept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/slope_intercept.png -------------------------------------------------------------------------------- /img/speak.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/speak.jpeg -------------------------------------------------------------------------------- /img/sports.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/sports.jpg -------------------------------------------------------------------------------- /img/stars.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/stars.jpg -------------------------------------------------------------------------------- /img/subsetcolumns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/subsetcolumns.png -------------------------------------------------------------------------------- /img/subsetrows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/subsetrows.png -------------------------------------------------------------------------------- /img/table.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/table.jpg -------------------------------------------------------------------------------- /img/think.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/think.jpg -------------------------------------------------------------------------------- /img/thinking.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/thinking.jpg -------------------------------------------------------------------------------- /img/thinkstats.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/thinkstats.jpg -------------------------------------------------------------------------------- /img/time.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/time.jpg -------------------------------------------------------------------------------- /img/tool.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/tool.jpg -------------------------------------------------------------------------------- /img/travel.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/travel.jpg -------------------------------------------------------------------------------- /img/unnati.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/unnati.png -------------------------------------------------------------------------------- /img/var1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/var1.jpg -------------------------------------------------------------------------------- /img/var2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/var2.jpg -------------------------------------------------------------------------------- /img/var3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/var3.jpg -------------------------------------------------------------------------------- /img/welcome.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/welcome.jpg -------------------------------------------------------------------------------- /img/wesmckinney.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/wesmckinney.jpg 
-------------------------------------------------------------------------------- /img/wine.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/wine.jpg -------------------------------------------------------------------------------- /img/workshop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/workshop.jpg -------------------------------------------------------------------------------- /img/zainab.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/unnati-xyz/intro-R-data-science/069b83b98cb39d3f9d3675a8686bda053d3b2c8a/img/zainab.jpg -------------------------------------------------------------------------------- /installation_guide_linux.md: -------------------------------------------------------------------------------- 1 | # Linux Installation Guide 2 | 3 | 1. Download and install miniconda (please be careful to choose the right installer which suits your system architecture) - [http://conda.pydata.org/miniconda.html](http://conda.pydata.org/miniconda.html) 4 | 5 | 2. From the command prompt install `r-essentials` via the `r` channel 6 | 7 | conda install -c r r-essentials 8 | 9 | 3. From command prompt run `R` 10 | 11 | 4. In the R shell that has opened, run the command 12 | 13 | install.packages('rvest',repos = "http://ftp.iitm.ac.in/cran") 14 | 15 | 5. 
Navigate to the repository for Introduction to Data Science in R and run 16 | 17 | jupyter notebook 18 | 19 | **You should be able to create R notebooks from Jupyter now.** 20 | -------------------------------------------------------------------------------- /installation_guide_osx.md: -------------------------------------------------------------------------------- 1 | # Mac Installation Guide for Introduction to Data Science in R 2 | 3 | 1. Download and install brew from - [http://brew.sh/](http://brew.sh/) 4 | 5 | 2. Install r 6 | 7 | brew cask install r 8 | 9 | 3. Install zeromq 10 | 11 | brew install zeromq 12 | 13 | 4. Download and install miniconda2 from [here](http://conda.pydata.org/miniconda.html) 14 | 15 | 5. Install jupyter 16 | 17 | conda install jupyter 18 | 19 | OR 20 | 21 | in case you want to use python pip 22 | 23 | pip install jupyter 24 | 25 | 6. From command prompt run `R` 26 | 27 | 7. In the R shell that has opened run the command 28 | 29 | install.packages(c('rzmq','repr','IRkernel','IRdisplay'),repos = c('http://irkernel.github.io/', getOption('repos')), type='source') 30 | 31 | 8. On successful completion of the previous command, in the R shell itself, make `IRkernel` available to jupyter by running the following command 32 | 33 | IRkernel::installspec(user = FALSE) 34 | 35 | 9. Navigate to the repository for Introduction to Data Science in R and run 36 | 37 | jupyter notebook 38 | 39 | **You should be able to create R notebooks from Jupyter now.** 40 | -------------------------------------------------------------------------------- /installation_guide_windows.md: -------------------------------------------------------------------------------- 1 | # Windows Installation Guide 2 | 3 | 1. Download and install miniconda (please be careful to choose the right installer which suits your system architecture) - [http://conda.pydata.org/miniconda.html](http://conda.pydata.org/miniconda.html) 4 | 5 | 2.
From the command prompt install `r-essentials` via the `r` channel 6 | 7 | conda install -c r r-essentials 8 | 9 | 3. From command prompt run `R` 10 | 11 | 4. In the R shell that has opened, run the command 12 | 13 | install.packages('rvest',repos = "http://ftp.iitm.ac.in/cran") 14 | 15 | 16 | 5. Navigate to the repository for Introduction to Data Science in R and run 17 | 18 | jupyter notebook 19 | 20 | **You should be able to create R notebooks from Jupyter now.** 21 | -------------------------------------------------------------------------------- /intro/small_cars.csv: -------------------------------------------------------------------------------- 1 | "name","model","url","price","type","ABS","Acceleration (0-100 kmph)","Air Conditioner","Audio Controls on Streeing Wheel","Audio System (with remote)","Bluetooth Connectivity","Body Coloured Bumpers","Boot Space (litres)","Brakes Front","Brakes Rear","CD Player","Central Locking","Clean Air Filter","Cruise Control","Door Ajar Warning","Driver Seatbelt Warning","Dual SRS Airbags (D+P)","EBD","Engine Size (cc)","Engine Type","Front Track","Front Tyres","Fuel Supply System","Fuel Tank Capacity (litres)","Fuel Type","Full Wheel Caps","Gross Weight (kg)","Ground Clearance (mm)","Heater","Immobiliser","Kerb Weight (kg)","Keyless Entry","Max Power (bhp @ rpm)","Max Torque (Nm @ rpm)","Mileage in City (kmpl)","Mileage on Highway (kmpl)","No. 
of Cylinders","Overall Length(mm) x Width(mm) x Height(mm)","Panoramic Sunroof","Passenger Airbags","Power Steering","Power Windows (Front)","Power Windows (Rear)","Rain Sensing Wipers","Rear Track","Rear Tyres","Seat Capacity","Speakers","Tilt Function","Top Speed (kmph)","Traction Control","Transmission Type","Tubeless Tyres","Turning Circle Radius (metres)","USB & Auxiliary Input","Wheelbase (mm)","brand" 2 | "Ashok Leyland Stile","Ashok Leyland Stile LE 8-STR (Diesel)","http://carzoom.in/car-specification/ashok-leyland-stile-le-8-str-diesel/",749990,"MPV"," No"," 18.7"," Manual"," No"," No"," No"," Yes"," 500"," Disc"," Drum"," No"," No"," Yes"," No"," No"," No"," No"," No"," 1461"," Turbo-intercooled, Common Rail Diesel Engine",NA," 165 R14"," CRDi"," 50"," Diesel"," Yes"," 2000"," 180"," Yes"," Yes"," 1426"," No"," 75 @ 3300"," 185 @ 1750-2750"," 16.2"," 20.7"," 4"," 4400 x 1700 x 1860"," No"," No"," Yes"," No"," No"," No",NA," 165 R14"," 8"," No"," Yes"," 140"," No"," 5 Speed Manual"," Yes"," 5.2"," No"," 2725","Ashok" 3 | "Ashok Leyland Stile","Ashok Leyland Stile LS 8-STR (Diesel)","http://carzoom.in/car-specification/ashok-leyland-stile-ls-8-str-diesel/",799990,"MPV"," No"," 18.7"," Manual"," No"," No"," No"," Yes"," 500"," Disc"," Drum"," No"," Yes"," No"," No"," No"," No"," No"," No"," 1461"," Turbo-intercooled, Common Rail Diesel Engine",NA," 165 R14"," CRDi"," 50"," Diesel"," No"," 2000"," 180"," Yes"," Yes"," 1426"," No"," 75 @ 3300"," 185 @ 1750-2750"," 16.2"," 20.7"," 4"," 4400 x 1700 x 1860"," No"," No"," Yes"," Yes"," No"," No",NA," 165 R14"," 8"," No"," Yes"," 140"," No"," 5 Speed Manual"," Yes"," 5.2"," No"," 2725","Ashok" 4 | "Ashok Leyland Stile","Ashok Leyland Stile LX 8-STR (Diesel)","http://carzoom.in/car-specification/ashok-leyland-stile-lx-8-str-diesel/",829990,"MPV"," No"," 18.7"," Manual"," No"," No"," No"," Yes"," 500"," Disc"," Drum"," No"," Yes"," Yes"," No"," No"," No"," No"," No"," 1461"," Turbo-intercooled, Common Rail Diesel 
Engine",NA," 165 R14"," CRDi"," 50"," Diesel"," No"," 2000"," 180"," Yes"," Yes"," 1426"," No"," 75 @ 3300"," 185 @ 1750-2750"," 16.2"," 20.7"," 4"," 4400 x 1700 x 1860"," No"," No"," Yes"," Yes"," No"," No",NA," 165 R14"," 8"," No"," Yes"," 140"," No"," 5 Speed Manual"," Yes"," 5.2"," No"," 2725","Ashok" 5 | "Ashok Leyland Stile","Ashok Leyland Stile LS 7-STR (Diesel)","http://carzoom.in/car-specification/ashok-leyland-stile-ls-7-str-diesel/",849990,"MPV"," No"," 18.7"," Manual"," No"," No"," No"," Yes"," 500"," Disc"," Drum"," No"," Yes"," Yes"," No"," No"," No"," No"," No"," 1461"," Turbo-intercooled, Common Rail Diesel Engine",NA," 165 R14"," CRDi"," 50"," Diesel"," No"," 2000"," 180"," Yes"," Yes"," 1426"," No"," 75 @ 3300"," 185 @ 1750-2750"," 16.2"," 20.7"," 4"," 4400 x 1700 x 1860"," No"," No"," Yes"," Yes"," No"," No",NA," 165 R14"," 7"," No"," Yes"," 140"," No"," 5 Speed Manual"," Yes"," 5.2"," No"," 2725","Ashok" 6 | -------------------------------------------------------------------------------- /introduction.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | > “I think, therefore I am” 4 | 5 | - What is data analysis? 6 | - What type of questions can be answered? 7 | - Frame/Acquire/Refine/Explore/Model/Insight framework 8 | 9 | ## Data Science as an Art 10 | 11 | > "Science is knowledge which we understand so well that we can teach it to a computer. Everything else is art" - Donald Knuth 12 | 13 | - We need to know the science, we need to learn the art. 14 | - Analogous examples - Creating a hit song, Diagnosing a medical problem. 15 | - Business problems are 'wicked in nature' - multiple stakeholder, different problem definition, different solutions, interdependence, constraints, amplifying loops 16 | 17 | 18 | ![](img/problems.png) 19 | 20 | > "Data analysis is hard, and part of the problem is that few people can explain how to do it. 
It’s not that there aren’t any people doing data analysis on a regular basis. It’s that the people who are really good at it have yet to enlighten us about the thought process that goes on in their heads." - Roger Peng 21 | 22 | ![](img/data_analysis.png) 23 | 24 | 25 | ## Types of Question 26 | 27 | > "Doing data analysis requires quite a bit of thinking and we believe that when you’ve completed a good data analysis, you’ve spent more time thinking than doing." - Roger Peng 28 | 29 | 1. **Descriptive** - "seeks to summarize a characteristic of a set of data" 30 | 2. **Exploratory** - "analyze the data to see if there are patterns, trends, or relationships between variables" (hypothesis generating) 31 | 3. **Inferential** - "a restatement of this proposed hypothesis as a question and would be answered by analyzing a different set of data" (hypothesis testing) 32 | 4. **Predictive** - "determine the impact on one factor based on other factor in a population - to make a prediction" 33 | 5. **Causal** - "asks whether changing one factor will change another factor in a population - to establish a causal link" 34 | 6. **Mechanistic** - "establish *how* the change in one factor results in change in another factor in a population - to determine the exact mechanism" 35 | 36 | 37 | ## Hypothesis driven Approach 38 | Hypothesis is an educated guess / hunch. 39 | 40 | Hypothesis generation asks the question "what if"; hypothesis testing follows it up by saying "if x, then y" with relevant data and analysis. If we keep doing this, then we can keep improving the hypothesis. It is a process of "iteration and learning". The definition of the problem and the solution are not separate, and we keep refining, reshaping and sharpening both of them. 41 | 42 | Hypothesis testing is based on abductive reasoning. When you have Induction - you start with data, working backward to form a rule... you look at a set of data and notice that when price increases, demand falls.
When you have deduction, you start with a rule and make a prediction of what you will observe - when price increases, demand falls. Abduction, however, reasons from effect to cause - if demand is down, it might be because the price is up. 43 | - Induction - shows that something actually is operative. 44 | - Deduction - proves that something must be. 45 | - Abduction - only suggests that something may be. 46 | 47 | Now why is abduction important? Since the possibilities for both problems and solutions are unbounded, good hypothesis generation is critical. Because the solution is an invented choice, rather than a discovered truth, its contestability requires persuasive argumentation. 48 | 49 | 50 | ## Making the Case 51 | 52 | "Making the case" is important, and a compelling case comes from a data-based hypothesis. Explaining 'what is' is an essential step in building confidence in the recommendation. Learning and changing mental models is needed for implementation and acceptance. 53 | -------------------------------------------------------------------------------- /onion/1-Frame.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 1. Frame the Problem\n", 8 | "\n", 9 | "In late 2010, Onion prices shot through the roof and caused a grave crisis. Apparently the crisis was caused by lack of rainfall in major onion producing regions - Maharashtra and Karnataka and led to large scale hoarding by the traders.
The crisis caused political tension in the country and was described as \"a grave concern\" by the then Prime Minister Manmohan Singh.\n", 10 | "\n", 11 | "\n", 12 | "- BBC Article in Dec 2010 - [Stink over onion crisis is enough to make you cry](http://www.bbc.co.uk/blogs/thereporters/soutikbiswas/2010/12/indias_onion_crisis.html)\n", 13 | "- Hindu OpEd in Dec 2010 - [The political price of onions](http://www.thehindu.com/opinion/editorial/article977100.ece)\n", 14 | "\n", 15 | "![](../img/peeling_the_onion_small.png)\n", 16 | "\n", 17 | "So what are the types of questions on onion prices that you would like to ask? \n", 18 | "\n", 19 | "\n", 20 | "## Types of Question\n", 21 | "\n", 22 | "> \"Doing data analysis requires quite a bit of thinking and we believe that when you’ve completed a good data analysis, you’ve spent more time thinking than doing.\" - Roger Peng\n", 23 | "\n", 24 | "1. **Descriptive** - \"seeks to summarize a characteristic of a set of data\"\n", 25 | "2. **Exploratory** - \"analyze the data to see if there are patterns, trends, or relationships between variables\" (hypothesis generating) \n", 26 | "3. **Inferential** - \"a restatement of this proposed hypothesis as a question and would be answered by analyzing a different set of data\" (hypothesis testing)\n", 27 | "4. **Predictive** - \"determine the impact on one factor based on other factor in a population - to make a prediction\"\n", 28 | "5. **Causal** - \"asks whether changing one factor will change another factor in a population - to establish a causal link\" \n", 29 | "6.
**Mechanistic** - \"establish *how* the change in one factor results in change in another factor in a population - to determine the exact mechanism\"\n", 30 | "\n", 31 | "\n", 32 | "### Descriptive \n", 33 | "- Which states have the highest onion production and sales?\n", 34 | "- Which cities (mandis) have the highest sales?\n", 35 | "- What is the average price for Onion across a year in Bangalore?\n", 36 | "- ...\n", 37 | "\n", 38 | "### Exploratory & Inferential \n", 39 | "- Is there a large difference between High and Low prices of Onion in a day?\n", 40 | "- What is the trend of onion price across days or months in Bangalore?\n", 41 | "- How is the price of onion correlated with volume of onion?\n", 42 | "- How is the export volume of onion correlated to domestic production volume?\n", 43 | "- ...\n", 44 | "\n", 45 | "### Predictive \n", 46 | "- What is the price of onion likely to be next day?\n", 47 | "- What is the price of onion likely to be next month?\n", 48 | "- What will be the sales quantity of onion tomorrow in Delhi?\n", 49 | "- ...\n", 50 | "\n", 51 | "### Causal\n", 52 | "- Does the change in production of onion have an impact on the onion prices? \n", 53 | "- Does the change in rainfall in monsoon have an impact on onion prices?\n", 54 | "- ...\n", 55 | "\n", 56 | "### Mechanistic\n", 57 | "- How does change in onion production impact the price of onion?\n", 58 | "- How do onion export volumes impact the prices of onion in local markets in India?\n", 59 | "- ...\n", 60 | "\n", 61 | "\n", 62 | "## Questions we will attempt to answer.\n", 63 | "\n", 64 | "### 1. Descriptive: How big is the Bangalore onion market compared to other cities in India?\n", 65 | "\n", 66 | "### 2. Exploratory / Inferential: Has the price variation in Onion prices in Bangalore really gone up over the years?\n", 67 | "\n", 68 | "### 3.
Exploratory / Inferential: How is the quantity and price variation in Onion prices in Bangalore really gone up over the years?\n", 69 | "\n", 70 | "### 4. Predictive: Can we predict the price of onion in Bangalore for next month?" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [] 81 | } 82 | ], 83 | "metadata": { 84 | "kernelspec": { 85 | "display_name": "R", 86 | "language": "R", 87 | "name": "ir" 88 | }, 89 | "language_info": { 90 | "codemirror_mode": "r", 91 | "file_extension": ".r", 92 | "mimetype": "text/x-r-source", 93 | "name": "R", 94 | "pygments_lexer": "r", 95 | "version": "3.2.4" 96 | } 97 | }, 98 | "nbformat": 4, 99 | "nbformat_minor": 0 100 | } 101 | -------------------------------------------------------------------------------- /onion/2-Acquire.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 2. Acquire the Data\n", 8 | "\n", 9 | "\n", 10 | "## Finding Data Sources\n", 11 | "\n", 12 | "There are three places to get onion price and quantity information by market. \n", 13 | "\n", 14 | "1. **[Agmarknet](http://agmarknet.nic.in/)** - This is the website run by the Directorate of Marketing & Inspection (DMI), Ministry of Agriculture, Government of India and provides daily price and arrival data for all agricultural commodities at national and state level. Unfortunately, the link to get Market-wise Daily Report for Specific Commodity (Onion for us) leads to a multipage aspx entry form to get data for each date. So it is likely to require an involved scraper to get the data. Too much effort - Move on. Here is the best link to go to get what is available - http://agmarknet.nic.in/agnew/NationalBEnglish/SpecificCommodityWeeklyReport.aspx?ss=1\n", 15 | "\n", 16 | "\n", 17 | "2.
**[Data.gov.in](https://data.gov.in/)** - This is normally a good place to get government data in a machine readable form like csv or xml. The Variety-wise Daily Market Prices Data of Onion is available for each year as an XML but unfortunately it does not include quantity information that is needed. It would be good to have both price and quantity - so even though this is easy, let's see if we can get both from a different source. Here is the best link to go to get what is available - https://data.gov.in/catalog/variety-wise-daily-market-prices-data-onion#web_catalog_tabs_block_10\n", 18 | "\n", 19 | "\n", 20 | "3. **[NHRDF](http://nhrdf.org/en-us/)** - This is the website of National Horticultural Research & Development Foundation and maintains a database on Market Arrivals and Price, Area and Production and Export Data for three commodities - Garlic, Onion and Potatoes. We are in luck! It also has data from 1996 onwards and has only got one form to fill to get the data in a tabular form. Further it also has production and export data. Excellent. Let's use this. Here is the best link to go to to get all that is available - http://nhrdf.org/en-us/DatabaseReports\n", 21 | "\n", 22 | "\n", 23 | "## Scraping the Data\n", 24 | "\n", 25 | "\n", 26 | "### Ways to Scrape Data\n", 27 | "Now we can do this in two different levels of sophistication\n", 28 | "\n", 29 | "1. **Automate the form filling process**: The form on this page looks simple. But viewing source in the browser shows there is a form to fill with hidden fields and we will need to access it as a browser to get the session fields and then submit the form. This is a little bit more complicated than simply scraping a table on a webpage\n", 30 | "\n", 31 | "2. **Manually fill the form**: What if we manually fill the form with the desired form fields and then save the page as an HTML file? Then we can read this file and just scrape the table from it.
Let's go with the simple way for now.\n", 32 | "\n", 33 | "\n", 34 | "### Scraping - Manual Form Filling\n", 35 | "\n", 36 | "So let us fill the form to get a small subset of data and test our scraping process. We will start by getting the [Monthwise Market Arrivals](http://nhrdf.org/en-us/MonthWiseMarketArrivals). \n", 37 | "\n", 38 | "- Crop Name: Onion\n", 39 | "- Month: January\n", 40 | "- Market: All\n", 41 | "- Year: 2016\n", 42 | "\n", 43 | "The saved webpage is available at [MonthWiseMarketArrivalsJan2016.html](MonthWiseMarketArrivalsJan2016.html)\n", 44 | "\n", 45 | "### Understand the HTML Structure\n", 46 | "\n", 47 | "We need to scrape data from this html page... So let us try to understand the structure of the page.\n", 48 | "\n", 49 | "1. You can view the source of the page - typically Right Click and View Source on any browser and that would give you the source HTML for any page.\n", 50 | "\n", 51 | "2. You can open the developer tools in your browser and investigate the structure as you mouse over the page \n", 52 | "\n", 53 | "3. We can use a tool like [Selector Gadget](http://selectorgadget.com/) to understand the ids and classes used in the web page\n", 54 | "\n", 55 | "Our data is under the **`<table>`** tag " 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### Exercise - Finding the Table" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Find the number of tables in the HTML Structure of [MonthWiseMarketArrivalsJan2016.html](MonthWiseMarketArrivalsJan2016.html)."
70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 34, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/html": [ 82 | "''" 83 | ], 84 | "text/latex": [ 85 | "''" 86 | ], 87 | "text/markdown": [ 88 | "''" 89 | ], 90 | "text/plain": [ 91 | "[1] \"\"" 92 | ] 93 | }, 94 | "execution_count": 34, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "Find the exact table and #id attribute for the table" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": { 112 | "collapsed": true 113 | }, 114 | "outputs": [], 115 | "source": [] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "### Manual Scraping" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 30, 127 | "metadata": { 128 | "collapsed": false 129 | }, 130 | "outputs": [ 131 | { 132 | "name": "stderr", 133 | "output_type": "stream", 134 | "text": [ 135 | "also installing the dependencies 'curl', 'openssl', 'XML', 'xml2', 'httr', 'selectr'\n", 136 | "\n", 137 | "Warning message:\n", 138 | "In install.packages(\"rvest\", repos = \"http://ftp.iitm.ac.in/cran/\"): installation of package 'curl' had non-zero exit statusWarning message:\n", 139 | "In install.packages(\"rvest\", repos = \"http://ftp.iitm.ac.in/cran/\"): installation of package 'openssl' had non-zero exit statusWarning message:\n", 140 | "In install.packages(\"rvest\", repos = \"http://ftp.iitm.ac.in/cran/\"): installation of package 'XML' had non-zero exit statusWarning message:\n", 141 | "In install.packages(\"rvest\", repos = \"http://ftp.iitm.ac.in/cran/\"): installation of package 'xml2' had non-zero exit statusWarning message:\n", 142 | "In install.packages(\"rvest\", repos = \"http://ftp.iitm.ac.in/cran/\"): installation of
package 'httr' had non-zero exit statusWarning message:\n", 143 | "In install.packages(\"rvest\", repos = \"http://ftp.iitm.ac.in/cran/\"): installation of package 'selectr' had non-zero exit statusWarning message:\n", 144 | "In install.packages(\"rvest\", repos = \"http://ftp.iitm.ac.in/cran/\"): installation of package 'rvest' had non-zero exit status" 145 | ] 146 | }, 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "\n", 152 | "The downloaded source packages are in\n", 153 | "\t'/private/var/folders/04/r20f0_4n2m7cv23lr8t97wp00000gn/T/Rtmpi3LZwX/downloaded_packages'\n" 154 | ] 155 | }, 156 | { 157 | "name": "stderr", 158 | "output_type": "stream", 159 | "text": [ 160 | "Updating HTML index of packages in '.Library'\n", 161 | "Making 'packages.html' ... done\n" 162 | ] 163 | } 164 | ], 165 | "source": [ 166 | "install.packages(\"rvest\", repos='http://ftp.iitm.ac.in/cran/')" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 2, 172 | "metadata": { 173 | "collapsed": false 174 | }, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/html": [ 179 | "'/Users/amitkaps/Dropbox/github/intro-R-data-science/onion'" 180 | ], 181 | "text/latex": [ 182 | "'/Users/amitkaps/Dropbox/github/intro-R-data-science/onion'" 183 | ], 184 | "text/markdown": [ 185 | "'/Users/amitkaps/Dropbox/github/intro-R-data-science/onion'" 186 | ], 187 | "text/plain": [ 188 | "[1] \"/Users/amitkaps/Dropbox/github/intro-R-data-science/onion\"" 189 | ] 190 | }, 191 | "execution_count": 2, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "getwd()" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 1, 203 | "metadata": { 204 | "collapsed": false 205 | }, 206 | "outputs": [ 207 | { 208 | "name": "stderr", 209 | "output_type": "stream", 210 | "text": [ 211 | "Loading required package: xml2\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "library(rvest)" 217 | 
] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 10, 222 | "metadata": { 223 | "collapsed": false 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "pg.out <- read_html('MonthWiseMarketArrivalsJan2016.html')" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 28, 233 | "metadata": { 234 | "collapsed": false 235 | }, 236 | "outputs": [ 237 | { 238 | "data": { 239 | "text/plain": [ 240 | "{xml_document}\n", 241 | "\n", 242 | "[1] \\n