├── .RData ├── .Rhistory ├── CART.R ├── Charts & Plots.R ├── DataSets(Excel & Csv files) ├── BicycleRidership.xlsx ├── Breakfast_Cereals.xlsx ├── Cutoffdata.xlsx ├── E-Commerce.xlsx ├── Financial_Reporting.xlsx ├── FlightDetails.xlsx ├── InternetCorruption.xlsx ├── LiftPrediction.xlsx ├── Mypromooffers.xlsx ├── Promoffers-9 Variables.xlsx ├── Promoffers.xlsx ├── SedanCar.xlsx └── UsedCars.xlsx ├── Dimensionality_Reduction.R ├── Installation.R ├── KNN.R ├── Logistic Regression.R ├── Multiple Linear Regression.R ├── Naive Bayes.R ├── Partitioning&Regression.R ├── Performance_Metrices.R ├── PredictionMetrics.R ├── RIntro.R ├── Simple Line Plotting.R ├── Specialized Visualization Techniques.R ├── Visualization Techniques.R ├── Welch's T-Test.R ├── cereal.Rdata ├── cereal_data_set.zip └── cereal_data_set ├── cereal.csv ├── cereal.txt ├── cereal.xls ├── cereal2.xls └── cereal_source.txt /.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/.RData -------------------------------------------------------------------------------- /.Rhistory: -------------------------------------------------------------------------------- 1 | install.packages(c("xlsx","rminer","MASS","leaps","class","e1071", 2 | "rpart","rgart.plot","neuralnet","nnet","devtools", 3 | "caret","arules","arulesviz","cluster","xts","matrixcalc", 4 | "forecast","igraph","treemap","rworldmap","ggmap"), 5 | dependencies = T) 6 | df2= read.xlsx("SedanCar.xlsx",1, header = T) 7 | library(xlsx) 8 | df= read.xlsx(file.choose(),1,header = T) 9 | df= read.xlsx(file.choose(),1,header = T) 10 | library(xlsx) 11 | df= read.xlsx(file.choose(),1,header = T) 12 | View(df) 13 | df1= read.xlsx("C:/Users/Harshit/Desktop/Business Analytics in R/SedanCar.xlsx", 14 | 1, header = T) 15 | View(df1) 16 | setwd("C:/Users/Harshit/Desktop/Business Analytics in R/") 17 | df2= read.xlsx("SedanCar.xlsx",1, header = T) 18 | v= c(1,2,3) 19 | v[0] 20 | v[1] 21 | v[0]=6 22 | v[0] 23 | v 24 | v[-1] 25 | v[-2] 26 | v[1:3] 27 | v[1:2] 28 | v[1] 29 | v[3] 30 | sum(v1) 31 | sum(v) 32 | library(matrixcalc) 33 | library(matrixcalc) 34 | df$Ownership 35 | df$Ownership 36 | is.vector(df$Ownership) 37 | is.vector(df$Annual_Income) 38 | df[3,] 39 | 3,1 40 | df[3,1] 41 | df[1,3] 42 | df[1,] 43 | is.factor(df$Ownership) 44 | library(xlsx) 45 | df= read.xlsx(file.choose(),1,header = T) 46 | View(df) 47 | df= df[,!apply(is.na(df),2,all)] 48 | head(df) 49 | summary(df) 50 | plot(df$Annual_Income, df$Household_Area, las= 1, 51 | slab= "Annual Income (a.'lakhs)", ylab= "Houshold Area (00s fts)", 52 | xlim= c(2,12), ylim= c(13,25), pch=c(21,19)[as.numeric(df$Ownership)]) 53 | plot(df$Annual_Income, df$Household_Area, las= 1, 54 | xlab= "Annual Income (a.'lakhs)", ylab= "Houshold Area (00s fts)", 55 | xlim= c(2,12), ylim= c(13,25), pch=c(21,19)[as.numeric(df$Ownership)]) 56 | legend("bottomright",inset= 0.005,c("Owner","Nonowner"), 57 | pch=c(19,21),cex= 0.7,x.intersp = 0.5, y.intersp = 0.5) 58 | df[df$Annual_Income>5 & df$Annual_Income<8.5 & df$Household_Area>18 59 | & df$Household_Area<20, c(1,2)] 60 | df[df$Annual_Income>5 & df$Annual_Income<8.5 & df$Household_Area>18 61 | & df$Household_Area<20, c(1,2)] 62 | abline(h=18.8, col=3) 63 | segments(7,0,7,18.8,col = 3) 64 | segments(5.8,18.8,5.8,26,col = 3) 65 | df[df$Annual_Income>6 & df$Annual_Income<8.5 & df$Household_Area>18 66 | & df$Household_Area<21, c(1,2)] 67 | 
segments(5.8,19.5,13,19.5,col = 3) 68 | df[df$Annual_Income>6 & df$Annual_Income<8.5 & df$Household_Area>18 69 | & df$Household_Area<21, c(1,2)] 70 | segments(5.8,19.5,13,19.5,col = 3) 71 | df[df$Annual_Income<7 & df$Household_Area>17 & df$Household_Area<19, c(1,2)] 72 | segments(0,18.2,7,18.2,col = 3) 73 | x=6 74 | if(x>7){ 75 | x=x+1 76 | }else if(x>8){ 77 | x=x+2 78 | }else { 79 | x=x+3} 80 | print(x) 81 | n = 5 82 | sum = 1 83 | while(n!=0) 84 | { 85 | sum = sum*n 86 | print(sum) 87 | n = n - 1 88 | if(sum > 50) 89 | { 90 | print("It’s gonna rain") 91 | } 92 | else 93 | { 94 | print("It’s not gonna rain") 95 | } 96 | } 97 | n=100 98 | sum=0 99 | for(i in seq(1,n,1)){ 100 | sum=sum+i 101 | print(c(i,sum)) 102 | if(sum>15) 103 | break 104 | } 105 | x1<-matrix(1:9,3,3) 106 | x2<-matrix(11:19,3,3) 107 | m = rbind(apply(x1,1,sum),apply(x2,1,sum)) 108 | y = apply(m,1,mean) 109 | print(y) 110 | x = c(1:4) 111 | y = c(6,7) 112 | print(x + y) 113 | x <- c("a",1, 3>2) 114 | print(as.logical(x)) 115 | vec1 = c(1,2,3) 116 | vec2 = c("R","Scilab","Java") 117 | vec3 = c("For prototyping", "For prototyping", "For Scaleup") 118 | mylist= cbind(vec1,vec2,vec3) 119 | mylist[2][3]="matlab" 120 | mylist[2,3]="matlab" 121 | mylist 122 | mylist= list(vec1,vec2,vec3) 123 | mylist 124 | mylist[2][3]="matlab" 125 | mylist[2,3]= "matlab" 126 | mylist[[2]][3] = "matlab" 127 | mylist 128 | list(mylist, list(vec4, c(10,11,12))) 129 | list(mylist, list(c(10,11,12))) 130 | list(vec4 = c("10","11","12"), mylist) 131 | A =matrix(c(1:9), nrow = 3, ncol = 3, byrow = T) 132 | A 133 | A[2,] 134 | A[2,3] 135 | diag(A) = c(99,100,101) 136 | A 137 | x <- matrix(1:4, 2, 2) 138 | y <- matrix(rep(10, 4), 2, 2) 139 | print(x * y) 140 | x 141 | y 142 | circle_mimo= function(dia) 143 | { 144 | area=(pi*dia^2)/4 145 | circum=pi*dia 146 | result=c("area"=area, "circum"=circum) 147 | return(result) 148 | } 149 | print(circle_mimo(4)) 150 | x <- 1 151 | f <- function() { 152 | y <- 2 153 | return(c(x, y)) 154 | } 155 | f() 156 | func <- function(){ 157 | X<-3 158 | Y<-x+3 159 | return(c(X,Y)) 160 | } 161 | print(Y) 162 | vec1 = c(1,2,3) 163 | vec2 = c("R","Scilab","Java") 164 | vec3 = c("For prototyping", "For prototyping", "For Scaleup") 165 | df= data.frame(vec1 = c(1,2,3),vec2 = c("R","Scilab","Java"),vec3 = c("For prototyping", "For prototyping", "For Scaleup") 166 | ) 167 | df 168 | print(df[1:2,]) 169 | rbind(df,data.frame(vec1=4,vec2="C",vec3="For Scaleup")) 170 | cbind(df,data.frame(vec4 = c(10,20,30,40))) 171 | cbind(data.frame(vec4 = c(10,20,30,40)),df) 172 | cbind(data.frame(vec4 = c(10,20,30,40),df)) 173 | cbind(df,data.frame(vec4 = c(10,20,30,40))) 174 | cbind(df.data.frame(vec4 = c(10,20,30,40))) 175 | df 176 | df= rbind(df,data.frame(vec1=4,vec2="C",vec3="For Scaleup")) 177 | cbind(df,data.frame(vec4 = c(10,20,30,40))) 178 | df= cbind(df,data.frame(vec4 = c(10,20,30,40))) 179 | df 180 | pd =data.frame ("Name"=c("Senthil","Senthil","Sam", "Sam"), "Month"=c("Jan","Feb","Jan","Feb"), "BS" = c(141.2,139.3,135.2,160.1), 181 | "BP" = c(90,78,80,81)) 182 | pd 183 | library(reshape2) 184 | pd_new = recast(pd,id.var=c("Name","Month"),variable+Month~Name) 185 | print(pd_new) 186 | subset[pd, pd$BS > 140] 187 | subset[pd$BS > 140] 188 | subset(pd$BS > 140,pd) 189 | subset(pd, pd$BS > 140) 190 | a = data.frame(x1= c("A","B","C"), x2=1:3) 191 | b = data.frame(x1= c("A","B","D"), x2=c("Yes","No","Yes")) 192 | a 193 | b 194 | left_join(a,b,by='x1') 195 | install.packages("dplyr",dependencies = T) 196 | left_join(a,b,by='x1') 197 | 
left_join(a,b) 198 | p<-left_join(a,b) 199 | library(dplyr) 200 | p<-left_join(a,b) 201 | left_join(a,b,by='x1') 202 | left_join(b,a,by='x1') 203 | list(mylist, list(vec4, c(10,11,12))) 204 | list(mylist, list(vec4= c(10,11,12))) 205 | library(xlsx) 206 | #Classification Trees 207 | #Sedancar.xlsx 208 | df= read.xlsx(file.choose(),1,header = T) 209 | df= df[,!apply(is.na(df),2,all)] 210 | str(df) 211 | df$Household_Area 212 | df$columns 213 | columns(df) 214 | help(apply) 215 | par(mar=c(5.1,5.1,5.1,5.1)) 216 | plot(df$Annual_Income, df$Household_Area, las=1, 217 | xlab= "Annual Income", ylab= "Household Area", 218 | xlim = c(2,12), ylim= c(13,25), pch= c(21,19)[as.numeric(df$Ownership)]) 219 | plot(x=df$Annual_Income,y= df$Household_Area, las=1, 220 | xlab= "Annual Income", ylab= "Household Area", 221 | xlim = c(2,12), ylim= c(13,25), pch= c(21,19)[as.numeric(df$Ownership)]) 222 | a= mat(1:9,3,3) 223 | a= matrix(1:9,3,3) 224 | a 225 | a= matrix(1:9,3,3,T) 226 | a 227 | a 228 | as.data.frame(a) 229 | kt= apply(a,2,sum) 230 | kt 231 | kt= apply(a,1,sum) 232 | kt= apply(a,1,all) 233 | kt 234 | legend("bottomright", inset= 0.005, c("owner","Non-owner"), 235 | pch= c(19,21), cex= 0.7, x.intersp = 0.5, y.intersp = 0.5) 236 | help(legend) 237 | #First split 238 | abline(h=18.8) 239 | #First split 240 | abline(h=18.8,v=3) 241 | #First split 242 | abline(h=18.8) 243 | plot(x=df$Annual_Income,y= df$Household_Area, las=1, 244 | xlab= "Annual Income", ylab= "Household Area", 245 | xlim = c(2,12), ylim= c(13,25), pch= c(21,19)[as.numeric(df$Ownership)]) 246 | legend("bottomright", inset= 0.005, c("owner","Non-owner"), 247 | pch= c(19,21), cex= 0.7, x.intersp = 0.5, y.intersp = 0.5) 248 | #First split 249 | abline(h=18.8) 250 | df.sort() 251 | df 252 | df.sort(Annual_Income) 253 | df.sort('Annual_Income') 254 | sort(df$Annual_Income) 255 | head(sort(df$Annual_Income),-1)+ diff(sort(df$Annual_Income)/2) 256 | sort(df$Annual_Income) 257 | sort(df$Annual_Income) 258 | sort(df$Annual_Income) 259 | diff(sort(df$Annual_Income)) 260 | df$Annual_Income 261 | help(diff) 262 | length(diff(sort(df$Annual_Income))) 263 | #For Categorical variablles 264 | # set of categories are divided into two subsets 265 | p1= seq(0,1,0.1) 266 | p1 267 | gini= NULL 268 | for(i in 1:length(p1)) { 269 | gini[i]=1-(p1[i]^2 + (1-p1[i])^2) 270 | } 271 | gini 272 | plot(p1,gini, ylab= "Gini index",type= "l") 273 | entropy= NULL 274 | for(i in 1:length(p1)) { 275 | entropy[i]= -(p1[i]*log2(p1[i])+ (1-p1[i])*log2(1-p1[i])) 276 | } 277 | plot(spline(p1,entropy), type= "l", xlab= "p1", ylab= "Entropy Measure") 278 | help("spline") 279 | plot(spline(p1,gini), ylab= "Gini index",type= "l") 280 | plot(p1,gini, ylab= "Gini index",type= "l") 281 | #First split in sedanCar example 282 | summary(df$Ownership) 283 | giorg= 1-(10/20)^2-(10/20)^2 284 | emorg= -(10/20)*log2(10/20)- (10/20)*log2(10/20) 285 | giorg 286 | emorg= -(10/20)*log2(10/20)- (10/20)*log2(10/20) 287 | emorg 288 | #upper rectangle 289 | giniurec= 1- (7/10)^2- (3/10)^2 290 | emurec= -(7/10)*log2(7/10)- (3/10)*log2(3/10) 291 | ginilrec= giniurec 292 | emlrec= emurec 293 | giniurec 294 | emurec 295 | plot(x=df$Annual_Income,y= df$Household_Area, las=1, 296 | xlab= "Annual Income", ylab= "Household Area", 297 | xlim = c(2,12), ylim= c(13,25), pch= c(21,19)[as.numeric(df$Ownership)]) 298 | legend("bottomright", inset= 0.005, c("owner","Non-owner"), 299 | pch= c(19,21), cex= 0.7, x.intersp = 0.5, y.intersp = 0.5) 300 | par(mar=c(5.1,5.1,5.1,5.1)) 301 | plot(x=df$Annual_Income,y= 
df$Household_Area, las=1, 302 | xlab= "Annual Income", ylab= "Household Area", 303 | xlim = c(2,12), ylim= c(13,25), pch= c(21,19)[as.numeric(df$Ownership)]) 304 | #First split 305 | abline(h=18.8) 306 | #Second Split 307 | segments(7,0,7,18.8) 308 | #Final Stage 309 | segments(5.8,18.8,5.8,26) 310 | segments(5.8,19.5,13,19.5) 311 | segments(0,18.2,7,18.2) 312 | library(rpart) 313 | library(rpart) 314 | mod = rpart(Ownership~. , method= "class", data= df, 315 | control= rpart.control(cp= 0, minsplit = 2, minbucket= 1, 316 | maxcompete = 0, maxsurrogate = 0 ,xval= 0),parms= list(split= "gini")) 317 | par(mar= c(0,0,0,0), oma= c(0,0,0,0), xpd= NA) 318 | plot(mod, uniform=T, branch= 0.3, compress = T, 319 | margin = 0.1, nspace=1) 320 | help("rpart") 321 | par(mar= c(0,0,0,0), oma= c(0,0,0,0), xpd= NA) 322 | plot(mod, uniform=T, branch= 0.3, compress = T, 323 | margin = 0.1, nspace=1) 324 | text(mod,splits= T, use.n = T, all= F, minlength = 0, 325 | cex= 0.8) 326 | help(plt) 327 | help(plot) 328 | abline(v=5.95) 329 | install.packages("rpart.plot") 330 | prp(mod,type=1, extra=1 , under= T, varlen= 0, cex= 0.7, 331 | compress = T, Margin = 0 , digits = 0 , 332 | split.cex = 0.8, under.cex = 0.8) 333 | prp(mod,type=1, extra=1 , under= T, varlen= 0, cex= 0.7, 334 | compress = T, Margin = 0 , digits = 0 , 335 | split.cex = 0.8, under.cex = 0.8) 336 | help(prp) 337 | help("prp"") 338 | a 339 | as 340 | exit 341 | ; 342 | . 343 | / 344 | \ 345 | `` 346 | `` 347 | ~ 348 | ~~~~ 349 | 3 350 | = 351 | } 352 | [] 353 | ) 354 | 0 355 | "" 356 | help(rpart.plot) 357 | help("rpart.plot") 358 | library(rpart.plot) 359 | help("rpart.plot") 360 | prp(mod,type=1, extra=1 , under= T, varlen= 0, cex= 0.7,compress = T, Margin = 0 , digits = 0 , 361 | split.cex = 0.8, under.cex = 0.8) 362 | #Node numbering 363 | prp(mod,type=1, extra=1 , under= T, varlen= 0, cex= 0.7, 364 | compress = T, Margin = 0 , digits = 0 , 365 | split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6) 366 | #First split 367 | modsub= snip.rpart(mod,toss=c(6:7, 12:13, 24:25)) 368 | prp(modsub,type=1, extra=1 , under= T, varlen= 0, cex= 0.7, 369 | compress = T, Margin = 0 , digits = 0 , 370 | split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6) 371 | #First 3 splits 372 | modsub1= snip.rpart(mod,toss=c(3,6:712:13, 24:25)) 373 | prp(modsub1,type=1, extra=1 , under= T, varlen= 0, cex= 0.7, 374 | compress = T, Margin = 0 , digits = 0 , 375 | split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6) 376 | #First 3 splits 377 | modsub1= snip.rpart(mod,toss=c(3,6:7,12:13, 24:25)) 378 | prp(modsub1,type=1, extra=1 , under= T, varlen= 0, cex= 0.7, 379 | compress = T, Margin = 0 , digits = 0 , 380 | split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6) 381 | #First 3 splits 382 | modsub1= snip.rpart(mod,toss=c(3,6:7,12:13, 24:25)) 383 | prp(modsub1,type=1, extra=1 , under= T, varlen= 0, cex= 0.7, 384 | compress = T, Margin = 0 , digits = 0 , 385 | split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6) 386 | prp(modsub1,type=1, extra=1 , under= T, varlen= 0, cex= 0.7, 387 | compress = T, Margin = 0 , digits = 0 , 388 | split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6) 389 | help("prp") 390 | summary(mod) 391 | summary(mod) 392 | -------------------------------------------------------------------------------- /CART.R: -------------------------------------------------------------------------------- 1 | library(xlsx) 2 | 3 | #Classification Trees 4 | #Sedancar.xlsx 5 | df= read.xlsx(file.choose(),1,header = T) 6 | df= df[,!apply(is.na(df),2,all)] 7 | str(df) 8 | 9 
data.frame("Household Number"= 1:20, "Annual Income (in lakhs)"= df$Annual_Income,
           "House Area (in 00s sq ft)"= df$Household_Area,
           "Ownership of Sedan Car"= df$Ownership,
           check.names = F)

par(mar=c(5.1,5.1,5.1,5.1))
plot(x=df$Annual_Income, y= df$Household_Area, las=1,
     xlab= "Annual Income", ylab= "Household Area",
     xlim = c(2,12), ylim= c(13,25), pch= c(21,19)[as.numeric(df$Ownership)])
legend("bottomright", inset= 0.005, c("Owner","Non-owner"),
       pch= c(19,21), cex= 0.7, x.intersp = 0.5, y.intersp = 0.5)

#First split
abline(h=18.8)

#Possible set of split values
# For numerical variables:
# midpoints between pairs of consecutive values of a variable,
# ranked by the impurity (heterogeneity) reduction
# in the resulting rectangular parts

sort(df$Annual_Income)
head(sort(df$Annual_Income),-1) + diff(sort(df$Annual_Income))/2
sort(df$Household_Area)
head(sort(df$Household_Area),-1) + diff(sort(df$Household_Area))/2

#For categorical variables
# the set of categories is divided into two subsets
p1= seq(0,1,0.1)
gini= NULL
for(i in 1:length(p1)) {
  gini[i]= 1-(p1[i]^2 + (1-p1[i])^2)
}
plot(p1, gini, ylab= "Gini index", type= "l")

entropy= NULL
for(i in 1:length(p1)) {
  entropy[i]= -(p1[i]*log2(p1[i]) + (1-p1[i])*log2(1-p1[i]))
}
plot(spline(p1,entropy), type= "l", xlab= "p1", ylab= "Entropy Measure")

#First split in the SedanCar example
summary(df$Ownership)
giorg= 1-(10/20)^2-(10/20)^2
emorg= -(10/20)*log2(10/20) - (10/20)*log2(10/20)

#upper rectangle
giniurec= 1- (7/10)^2- (3/10)^2
emurec= -(7/10)*log2(7/10) - (3/10)*log2(3/10)
ginilrec= giniurec # the upper and lower rectangles have symmetric proportions
emlrec= emurec

ginisplit1= (10/20)*giniurec + (10/20)*ginilrec
emsplit1= (10/20)*emlrec + (10/20)*emurec

# impurity reduction achieved by the split (parent minus weighted children)
ginidelta= giorg - ginisplit1
emdelta= emorg - emsplit1
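# A compact restatement of the impurity arithmetic above (a sketch; the
# function name `impurity` and argument `counts` are ours, not part of rpart).
# p*log2(p) is treated as 0 when p= 0, so the formulas match the plots above.
impurity= function(counts, type= c("gini","entropy")) {
  type= match.arg(type)
  p= counts/sum(counts)
  p= p[p > 0]
  if(type == "gini") 1 - sum(p^2) else -sum(p*log2(p))
}
impurity(c(10,10))                                   # root node: 0.5
(10/20)*impurity(c(7,3)) + (10/20)*impurity(c(3,7))  # weighted first split: 0.42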
#Second Split
segments(7,0,7,18.8)

#Final Stage
segments(5.8,18.8,5.8,26)
segments(5.8,19.5,13,19.5)
segments(0,18.2,7,18.2)

library(rpart)
#method = "class" for a classification tree
#method = "anova" for a regression tree

mod = rpart(Ownership~. , method= "class", data= df,
            control= rpart.control(cp= 0, minsplit = 2, minbucket= 1,
                                   maxcompete = 0, maxsurrogate = 0, xval= 0),
            parms= list(split= "gini"))
par(mar= c(0,0,0,0), oma= c(0,0,0,0), xpd= NA)
plot(mod, uniform=T, branch= 0.3, compress = T,
     margin = 0.1, nspace=1)
text(mod, splits= T, use.n = T, all= F, minlength = 0,
     cex= 0.8)

install.packages("rpart.plot")
library(rpart.plot)
prp(mod, type=1, extra=1, under= T, varlen= 0, cex= 0.7, compress = T,
    Margin = 0, digits = 0, split.cex = 0.8, under.cex = 0.8)

#Node numbering
prp(mod, type=1, extra=1, under= T, varlen= 0, cex= 0.7,
    compress = T, Margin = 0, digits = 0,
    split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6)

#First split
modsub= snip.rpart(mod, toss=c(6:7, 12:13, 24:25))
prp(modsub, type=1, extra=1, under= T, varlen= 0, cex= 0.7,
    compress = T, Margin = 0, digits = 0,
    split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6)
#First 3 splits
modsub1= snip.rpart(mod, toss=c(3,6:7,12:13, 24:25))
prp(modsub1, type=1, extra=1, under= T, varlen= 0, cex= 0.7,
    compress = T, Margin = 0, digits = 0,
    split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6)

summary(mod)

#################
###promooffers###

df= read.xlsx(file.choose(),1,header = T)
df= df[,!apply(is.na(df),2,all)]
str(df)

tPIN= table(as.factor(df$Pin.Code))
PINnames= dimnames(tPIN)[[1]]

C_PINcode= NULL
for(x in PINnames) {
  C_PINcode= c(C_PINcode, length(which(as.character(df$Pin.Code)==x & df$Promoffer==1)))
}
barplot(C_PINcode, names.arg = PINnames, xlab= "PIN Code", las=3,
        ylab= "Promotional offers Accepted", ylim=c(0,20), cex.names= 0.6)

table(as.factor(C_PINcode))

for(x in PINnames) {
  index= which(as.character(df$Pin.Code)==x)
  df[index,]$Pin.Code= rep(C_PINcode[which(PINnames==x)], length(index))
}

df$Pin.Code= as.factor(df$Pin.Code)
df$Education= as.factor(df$Education)
df$Promoffer= as.factor(df$Promoffer)
df$Online= as.factor(df$Online)

str(df)

mod = rpart(Promoffer~. , method= "class", data= df,
            control= rpart.control(cp= 0, minsplit = 2, minbucket= 1,
                                   maxcompete = 0, maxsurrogate = 0, xval= 0),
            parms= list(split= "gini"))

mod_predict= predict(mod, df, type= "class")
table("Actual value"=df$Promoffer, "Predicted value"=mod_predict)
mean(mod_predict==df$Promoffer)

toss1= as.integer(row.names(mod$frame))
x= mod$frame$var

### REGRESSION TREES ###
###usedcars dataset###

df= read.xlsx(file.choose(),1,header = T)
df= df[,!apply(is.na(df),2,all)]
str(df)

Age= 2017-df$Mfg_Year
df= cbind(df,Age)

dfb= df
df= df[,-c(1,2,3,11)]

str(df)
df$Transmission= as.factor(df$Transmission)
str(df)

## Partitioning (60%:40%)
partidx= sample(1:nrow(df), 0.6*nrow(df), replace= F)
dftrain= df[partidx,]
dftest= df[-partidx,]
library(rpart)
mod = rpart(Price~. , method= "anova", data= dftrain,
            control= rpart.control(cp= 0, minsplit = 2, minbucket= 1,
                                   maxcompete = 0, maxsurrogate = 0, xval= 0))
# (parms= list(split=) only applies to classification trees, so it is not
# passed to this "anova" regression tree)

# No of decision nodes
nrow(mod$splits)

# No of terminal nodes
nrow(mod$frame)-nrow(mod$splits)

toss1= as.integer(row.names(mod$frame)); toss1

DFP= data.frame("toss"= toss1, "Svar"=mod$frame$var,
                "CP"=mod$frame$complexity); DFP

# drop terminal nodes, which rpart labels "<leaf>" in frame$var
DFP1= DFP[DFP$Svar!="<leaf>",]; DFP1

DFP2= DFP1[order(DFP1$CP, -DFP1$toss, decreasing = T),]; DFP2

rownames(DFP2)= 1:nrow(DFP2); DFP2

toss2= DFP2$toss
--------------------------------------------------------------------------------
/Charts & Plots.R:
--------------------------------------------------------------------------------
library(xlsx)

#BicycleRidership.xlsx
df= read.xlsx(file.choose(), 1, header = T)
df= df[,!apply(is.na(df), 2,all)]
df= df[,1:2]
head(df)

#Line Graph
tsv= ts(df$Riders, start= c(2004,1), end=c(2017,3), frequency = 12)
plot(tsv, xlab= "year", ylab="Riders", las=1) # las= styling for axis labels

at1= seq(as.Date("2004-01-01"), as.Date("2017-03-01"), by="2 years")
labels1= format(at1,"%b-%Y")
at2= format(at1,"%Y")

par()$mar
par(mar=c(8,4,4,2)+0.1)

plot(tsv, xlab="", ylab="", xaxt="n", yaxt="n")
axis(1, at=as.numeric(at2), labels = labels1, las=2) # axis positions must be numeric
axis(2, las=2)
mtext(side=1, text="Month-Year", line= 5.0)
mtext(side=2, text="Riders", line= 3.3)

graphics.off()
par()$mar

#UsedCars.xlsx
df1= read.xlsx(file.choose(),1, header= T)
df1= df1[,!apply(is.na(df1), 2,all)]

Age= 2017- df1$Mfg_Year
df1= cbind(df1,Age)
df1= df1[,-c(1,2,3)]

head(df1)
str(df1)
df1$Transmission= as.factor(df1$Transmission)
df1$C_Price= as.factor(df1$C_Price)
str(df1)
summary(df1)

#Scatter plots
range(df1$KM)
range(df1$Price)
plot(df1$KM, df1$Price, xlim= c(18,180), ylim= c(1,75), xlab= "KM", ylab = "Price")

df1= df1[df1$Price<70,]
dfb= df1
df1= df1[-23,]

range(df1$KM)
range(df1$Price)
plot(df1$KM, df1$Price, xlim= c(18,180), ylim= c(1,15), xlab= "KM", ylab = "Price")

#Bar Chart
avgPrice= c(mean(df1[which(df1$Transmission=='0'),]$Price),
            mean(df1[which(df1$Transmission=='1'),]$Price))
Trans= c("0","1")

range(avgPrice)

barplot(avgPrice, names.arg = Trans, xlab= "Transmission",
        ylab="Average-Price", ylim= c(0,6))

pAll= c((length(which(df1$Transmission=='0'))/length(df1$Transmission))*100,
        (length(which(df1$Transmission=='1'))/length(df1$Transmission))*100)

barplot(pAll, names.arg = Trans, xlab= "Transmission",
        ylab="% of all records", ylim= c(0,100))

#Histograms
range(df1$KM)
range(df1$Price)
hist(df1$Price, main="", xlim=c(-5,20), ylim= c(0,50), xlab="Price")

#Boxplot
boxplot(df1$Price~df1$Transmission, ylim= c(0,15), xlab="Transmission",
        ylab="Price")
means= by(df1$Price, df1$Transmission, mean)
points(1:2, means, pch=19)

range(df1$KM)
--------------------------------------------------------------------------------
/DataSets(Excel & Csv files)/BicycleRidership.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/BicycleRidership.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/Breakfast_Cereals.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/Breakfast_Cereals.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/Cutoffdata.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/Cutoffdata.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/E-Commerce.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/E-Commerce.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/Financial_Reporting.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/Financial_Reporting.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/FlightDetails.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/FlightDetails.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/InternetCorruption.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/InternetCorruption.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/LiftPrediction.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/LiftPrediction.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/Mypromooffers.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/Mypromooffers.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/Promoffers-9 Variables.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/Promoffers-9 Variables.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/Promoffers.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/Promoffers.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/SedanCar.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/SedanCar.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/UsedCars.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/UsedCars.xlsx -------------------------------------------------------------------------------- /Dimensionality_Reduction.R: -------------------------------------------------------------------------------- 1 | library(xlsx) 2 | 3 | df1= read.xlsx(file.choose(),1, header= T) 4 | df1= df1[,!apply(is.na(df1), 2,all)] 5 | 6 | Age= 2017- df1$Mfg_Year 7 | df1= cbind(df1,Age) 8 | dfb= df1 9 | df1= df1[,-c(1,2,3)] 10 | 11 | head(df1) 12 | str(df1) 13 | 14 | #Summary Statistics 15 | countblank= function(x) sum(x=="") 16 | 17 | dfsum= data.frame(Average= sapply(df1[,-1],mean),Median= sapply(df1[,-1],median), 18 | Min= sapply(df1[,-1],min),Max= sapply(df1[,-1],max), 19 | Std= sapply(df1[,-1],sd),Count= sapply(df1[,-1],length), 20 | Countblank=sapply(df1[,-1],countblank)) 21 | round(dfsum,digits = 2) 22 | 23 | M= cor(df1[,-c(1,5,8)]);M 24 | M[upper.tri(M)]=NA;M 25 | print(round(M,digits = 2),na.print = "") 26 | symnum(M) 27 | 28 | #Reducing Categories 29 | Age_groups= levels(as.factor(df1$Age)) 30 | Age_groups2= as.numeric(Age_groups) 31 | C_PricebyAge1= NULL 32 | C_PricebyAge2= NULL 33 | #Group1 has less than Rs400000 cost 34 | # Rest lies in Group2 35 | for(x in Age_groups2) { 36 | C_PricebyAge1= c(C_PricebyAge1, 37 | 100* sum(df1$Age==x & df1$C_Price==0)/sum(df1$Age==x)) 38 | C_PricebyAge2= c(C_PricebyAge2, 39 | 100* sum(df1$Age==x & df1$C_Price==1)/sum(df1$Age==x)) 40 | } 41 | C_PricebyAge= matrix(c(C_PricebyAge1, C_PricebyAge2),nrow = 2, 42 | ncol= length(Age_groups), byrow= T) 43 | #palette(c("purple","green")) 44 | barplot(C_PricebyAge, names.arg = Age_groups, xlab= "Age", 45 | legend.text = c("0","1"), args.legend = list(x="topright"), 46 | main= "Distribution of C_Price by Age",col = c("blue","green"), 47 | ylim = c(0,100), xlim = c(0,12)) 48 | 49 | Sales= c(45,50, 55,100,51,56,61,125,60,65,70,145,68,74,79,165) 50 | 51 | tsv = ts(Sales, start = c(2012,1),end= c(2015,4), frequency = 4) 52 | 53 | plot(tsv, xlab= "Quarter", ylab= "Sales(in crores)", las=2 , ylim=c(0,180)) 54 | 55 | #BreakfastCereals.xlsx 56 | df2= read.xlsx(file.choose(),1,header = T) 57 | df2= df2[, !apply(is.na(df2), 2,all)] 58 | 59 | df2=cereal 60 | dim(df2) 61 | df2$vitamins= as.factor(df2$vitamins) 62 | df2$vitamins= as.numeric(df2$vitamins) 63 | 64 | df2$mfr= as.factor(df2$mfr) 65 | 
df2$mfr= as.numeric(df2$mfr)

sum= NULL
for(x in 1:dim(df2)[1]) {
  csum= 0
  for(y in df2[x,-c(1,9,11)]) {
    csum= csum+y
  }
  sum= c(sum,csum)  # row totals (equivalently: rowSums(df2[,-c(1,9,11)]))
}
df2$weight= sum

df3= as.data.frame(lapply(df2[,-c(1,9,11,12)], function(x){x=100*(x/df2$weight)}))
df3= cbind(df3, df2[,c(1,9,11)])

range(df3$potassium)
range(df3$fibre)

plot(df3$potassium, df3$fibre, xlab="POTASSIUM", ylab="FIBRE")

v1= var(df3$potassium)
v2= var(df3$fibre)
c12= cov(df3$potassium, df3$fibre)
matrix(c(v1,c12,c12,v2),2,2,T)

cor(df3$potassium, df3$fibre)

v1+v2
100*v1/(v1+v2)
100*v2/(v1+v2)

#Principal Component Analysis
dfpca= df3[,c(8,5)]
mod= prcomp(dfpca)

#adding PC directions to the plot
slp= with(mod, rotation[2,1]/rotation[1,1])
int= with(mod, center[2]-slp*center[1])

#First principal component
abline(coef= c(int,slp))
mod$rotation

slp1= -1/slp
int1= with(mod, center[2]-slp1*center[1])

#Second principal component
abline(coef= c(int1,slp1))
mod$rotation

head(mod$x)
dfpca[1,]
First= mod$rotation[1,1]*(dfpca[1,1]-mean(dfpca[,1]))+
  mod$rotation[1,2]*(dfpca[1,2]-mean(dfpca[,2])); First

vz1= var(mod$x[,1])
vz2= var(mod$x[,2])
vz1+vz2
100*vz1/(vz1+vz2)
100*vz2/(vz1+vz2)
--------------------------------------------------------------------------------
/Installation.R:
--------------------------------------------------------------------------------
install.packages(c("xlsx","rminer","MASS","leaps","class","e1071",
                   "rpart","rpart.plot","neuralnet","nnet","devtools",
                   "caret","arules","arulesviz","cluster","xts","matrixcalc",
                   "forecast","igraph","treemap","rworldmap","ggmap"),
                 dependencies = T)
install.packages("dplyr", dependencies = T)
--------------------------------------------------------------------------------
/KNN.R:
--------------------------------------------------------------------------------
library(xlsx)  # needed for read.xlsx below
df= read.xlsx(file.choose(),1,header = T)
df= df[,!apply(is.na(df),2,all)]

#Normalization
dfb= df
df[,1:2]= scale(df[,1:2], center = T, scale = T)

partidx= sample(1:nrow(df),15, replace = F)
dftrain= df[partidx,]
dftest= df[-partidx,]

#Modeling
library(class)
# Building '4NN'
mod= knn(train= dftrain[,1:2], test= dftest[,1:2],
         cl= dftrain$Ownership, k=4)
summary(mod)

#Classification Matrix
table("Actual value"= dftest$Ownership, "Predicted value"= mod)

mean(mod!=dftest$Ownership)

#Choosing k
modtrain = NULL
modtest = NULL
errtrain = NULL
errtest = NULL

dftrain= as.data.frame(dftrain)
dftest= as.data.frame(dftest)

for(i in 1:15) {
  modtrain= knn(train = dftrain[,1:2], test=dftrain[,1:2],
                cl= dftrain[,3], k=i)
  modtest= knn(train = dftrain[,1:2], test=dftest[,1:2],
               cl= dftrain[,3], k=i)
  errtrain[i]= 100*mean(modtrain!=dftrain$Ownership)
  errtest[i]= 100*mean(modtest!=dftest$Ownership)
}

dfp = data.frame("valueofk"=1:15, "ErrorTraining"=errtrain,
                 "ErrorValidation"=errtest)
round(dfp, digits = 2)
range(dfp$ErrorValidation)
plot(dfp$valueofk, dfp$ErrorValidation, las=1, type="l",
     xlab="value of k", ylab= "Validation Error",
     xlim= c(0,16), ylim=c(0,65))
lines(dfp$valueofk, dfp$ErrorTraining)

#Best k
min(errtest)
bestk= dfp[which(errtest==min(errtest)),1]
#or
bestk= dfp[which.min(errtest),1]
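# A complementary way to choose k (a sketch, not in the original script):
# leave-one-out cross-validation on the training partition via class::knn.cv,
# instead of a single training/validation split; errloocv is our own name.
errloocv= NULL
for(i in 1:15) {
  modcv= knn.cv(train= dftrain[,1:2], cl= dftrain$Ownership, k= i)
  errloocv[i]= 100*mean(modcv != dftrain$Ownership)
}
which.min(errloocv)  # value of k suggested by leave-one-out CV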
#Predicting the class of new observations
#(Annual_Income= 6, Household_Area= 20) and (5, 15): the training data were
#standardized above, so new cases must be scaled with the same means and
#standard deviations (taken from the unscaled copy dfb)
ctr= colMeans(dfb[,1:2])
sds= apply(dfb[,1:2], 2, sd)
new1= scale(matrix(c(6,20),1,2), center= ctr, scale= sds)
new2= scale(matrix(c(5,15),1,2), center= ctr, scale= sds)
modnew1= knn(train = dftrain[,1:2], test = new1,
             cl= dftrain$Ownership, k=bestk)
modnew2= knn(train = dftrain[,1:2], test = new2,
             cl= dftrain$Ownership, k=bestk)
--------------------------------------------------------------------------------
/Logistic Regression.R:
--------------------------------------------------------------------------------
#### Probability, odds, and logit
## odds= p/(1-p)
curve(p/(1-p), from=0, to=1, type= "l", xname = "p", las= 1,
      xlab= "Probability of success", ylab= "odds", xaxt= "n")
## logit= log(odds)= log(p/(1-p))
curve(log(p/(1-p)), from=0, to=1, type= "l", xname = "p", las= 1,
      xlab= "Probability of success", ylab= "logit", xaxt= "n")
axis(1, pos=0)

df= read.xlsx(file.choose(),1,header = T)
df= df[,!apply(is.na(df),2,all)]
str(df)

dfb= df
df= df[,-5]
df$Promoffer= as.factor(df$Promoffer)
df$Online= as.factor(df$Online)

## Partitioning (60%:40%)
partidx= sample(1:nrow(df), 0.6*nrow(df), replace= F)
dftrain= df[partidx,]
dftest= df[-partidx,]

mod= glm(Promoffer ~ Income, family = binomial(link="logit"), data= dftrain)
summary(mod)

b0= unname(mod$coefficients[1])
b1= unname(mod$coefficients[2])

# P(Promoffer= Yes | Income= x) = 1/(1+e^-(b0+b1*x))

range(dftrain$Income)
plot(dftrain$Income, as.numeric(as.character(dftrain$Promoffer)),
     type="p", xlab = "Income", ylab= "Promoffer")
curve(1/(1+exp(-(mod$coefficients[[1]]+mod$coefficients[[2]]*x))),
      xlim= c(0,250), type= "l", xname= "x", add = T)

mod1= glm(Promoffer ~ ., family = binomial(link="logit"), data= dftrain)
summary(mod1)

#P= odds/(1+odds)
curve(odds/(1+odds), from =0, to=100, type="l", xname= "odds",
      xlab= "Odds", ylab= "Probability of Success")

#P= exp(logit)/(1+exp(logit))
curve(exp(logit)/(1+exp(logit)), from =-100, to=100, type="l", xname= "logit",
      xlab= "logit", ylab= "Probability of Success")

modtest= predict(mod1, dftest[,-c(3)], type= "response")
### "response" returns probabilities

modtestl= predict(mod1, dftest[,-c(3)], type= "link")
### "link" returns logit values

modtestc= ifelse(modtest>0.5,1,0)

table("Actual value"=dftest$Promoffer, "Predicted"=modtestc)

mean(modtestc == dftest$Promoffer)
mean(modtestc != dftest$Promoffer)

head(data.frame(
  "Predicted class"= modtestc,
  "Actual class"= dftest$Promoffer,
  "Prob for 1(success)"= modtest,
  "Log odds"= modtestl,
  dftest[,-3], check.names = F
))
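# Interpreting the coefficients (a brief sketch): exponentiating a logistic
# coefficient gives the multiplicative change in the odds of success per unit
# increase in the corresponding predictor.
exp(coef(mod))   # simple model: odds multiplier per unit of Income
exp(coef(mod1))  # full model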
#Cumulative Lift Curve
dflift= data.frame("Probability of class 1"=modtest,
                   "Actual class"= as.numeric(as.character(dftest$Promoffer)),
                   check.names = F)

dflift= dflift[order(dflift[,1], decreasing = T),]
CumActualClass= cumsum(dflift[,2])
dflift= cbind(dflift, CumActualClass)
head(dflift)

plot(1:nrow(dflift), dflift$CumActualClass, "l",
     xlab = "# cases", ylab="Cumulative", xlim= c(0,2100),
     ylim = c(0,210))
legend(800,70, inset=0.005,
       c("Cumulative Promoffer when sorted using predicted values",
         "Cumulative Promoffer using average"),
       lty= c(1,2), bty= "n", cex= 0.7, x.intersp=0.3, y.intersp= 0.5)

################ Flight Details ################
df= read.xlsx(file.choose(),1,header = T)
df= df[,!apply(is.na(df),2,all)]
str(df)

dfb= df
df$STD= strptime(format(df$STD, "%H:%M:%S"), "%H:%M:%S")
df$ATD= strptime(format(df$ATD, "%H:%M:%S"), "%H:%M:%S")
df$STA= strptime(format(df$STA, "%H:%M:%S"), "%H:%M:%S")
df$ATA= strptime(format(df$ATA, "%H:%M:%S"), "%H:%M:%S")

breaks= seq(strptime("00:00:00","%H:%M:%S"), strptime("24:00:00","%H:%M:%S"),
            by= "6 hours")
labelsv= c("0-6","6-12","12-18","18-24")
DEPT= cut(df$ATD, breaks= breaks, right= F, labels = labelsv)

df1= cbind(df, DEPT)

df1$Day= as.factor(df1$Day)
levels(df1$Day)
levels(df1$Day)= c("Sunday","Monday")
df1$FLTIME= as.difftime(as.character(df1$FLTIME))

str(df1)
head(df1)

dfb1= df1
df1= df1[,-c(1,3,5:8)]
str(df1)
head(df1)
--------------------------------------------------------------------------------
/Multiple Linear Regression.R:
--------------------------------------------------------------------------------
library(xlsx)
df= read.xlsx(file.choose(),1,T)
df= df[,!apply(is.na(df),2,all)]
head(df)

Age= 2017- df$Mfg_Year
df= cbind(df, Age)

dfb= df
df= df[,-c(1,2,3,11)]
df$Transmission= as.factor(df$Transmission)

#Partitioning (60%:40%)
partidx= sample(1:nrow(df), 0.6*nrow(df), replace=F)
dftrain= df[partidx,]
dftest= df[-partidx,]

mod= lm(Price ~ ., dftrain)
summary(mod)
#anova(mod)

#Goodness of fit
gf= c(mod$df.residual, summary(mod)$r.squared, summary(mod)$sigma,
      anova(mod)["Residuals","Sum Sq"])
gf= as.data.frame(gf, optional = T)
rownames(gf)= c("Residual df","Multiple R-Squared","Std. Dev. Estimate",
                "Residual SS")

modtest= predict(mod, dftest[,-4])
Residuals= dftest$Price-modtest
head(data.frame("Actual Value"=dftest$Price, "Predicted Value"=modtest,
                Residuals, check.names = F))

library(rminer)
M= mmetric(dftest$Price, modtest, c("SSE","RMSE","ME"))

boxplot(Residuals, main= "Box Plot of residuals", ylab= "Residual",
        ylim= c(-6,7), las= 1)
quantile(Residuals, probs=c(0.25,0.75))

hist(df$Price, main= "", xlab="Price")

#Normal Probability Plot
qqnorm(df$Price)
qqline(df$Price)

#########
library(xlsx)
df= read.xlsx(file.choose(),1,T)
df= df[,!apply(is.na(df),2,all)]
head(df)

Age= 2017- df$Mfg_Year
df= cbind(df, Age)

dfb= df
df= df[,-c(1,2,3,11)]
df$Transmission= as.factor(df$Transmission)

plot(df$KM, df$Price, xlim=c(18,180), ylim = c(1,75),
     xlab= "KM", ylab="Price")
df= df[-c(13,23,29,73),]

plot(df$KM, df$Price, xlim=c(25,115), ylim = c(1,14),
     xlab= "KM", ylab="Price")

#Partitioning (60%:40%)
partidx= sample(1:nrow(df), 0.6*nrow(df), replace=F)
dftrain= df[partidx,]
dftest= df[-partidx,]

#Variable Selection
#Exhaustive Search
library(leaps)
mod3= regsubsets(Price ~., data= dftrain, nbest = 1, nvmax = NULL,
                 force.in = NULL, force.out = NULL,
                 method = "exhaustive", intercept = T)
mod3summ= summary(mod3)

countspch= function(x) sum(x=="*")
om= as.integer(apply(mod3summ$outmat,2,countspch))
data.frame("Coeff"=as.integer(apply(mod3summ$outmat,1,countspch)),
           "RSS"=mod3summ$rss,
           "Cp"=round(mod3summ$cp, digits = 2),
           "R-sq"=round(mod3summ$rsq, digits = 2),
           "Adj.R-sq"=round(mod3summ$adjr2, digits = 2),
           mod3summ$outmat[,order(-om)])

#Coefficients of subset models
coef(mod3,1:8)
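# Choosing among the subset models (a sketch): common rules are the size that
# maximizes adjusted R-squared or the size whose Mallows' Cp is smallest.
which.max(mod3summ$adjr2)  # subset size with the best adjusted R-squared
which.min(mod3summ$cp)     # subset size minimizing Cp
coef(mod3, which.max(mod3summ$adjr2))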
#Partial Iterative Searching:
#Forward Selection
mod4= regsubsets(Price ~., data= dftrain, nbest = 1, nvmax = NULL,
                 force.in = NULL, force.out = NULL,
                 method = "forward", intercept = T)
mod4summ= summary(mod4)

countspch= function(x) sum(x=="*")
om1= as.integer(apply(mod4summ$outmat,2,countspch))
data.frame("Coeff"=as.integer(apply(mod4summ$outmat,1,countspch)),
           "RSS"=mod4summ$rss,
           "Cp"=round(mod4summ$cp, digits = 2),
           "R-sq"=round(mod4summ$rsq, digits = 2),
           "Adj.R-sq"=round(mod4summ$adjr2, digits = 2),
           mod4summ$outmat[,order(-om1)])

coef(mod4,1:8)

#Backward Elimination
mod5= regsubsets(Price ~., data= dftrain, nbest = 1, nvmax = NULL,
                 force.in = NULL, force.out = NULL,
                 method = "backward", intercept = T)
mod5summ= summary(mod5)

countspch= function(x) sum(x=="*")
om2= as.integer(apply(mod5summ$outmat,2,countspch))
data.frame("Coeff"=as.integer(apply(mod5summ$outmat,1,countspch)),
           "RSS"=mod5summ$rss,
           "Cp"=round(mod5summ$cp, digits = 2),
           "R-sq"=round(mod5summ$rsq, digits = 2),
           "Adj.R-sq"=round(mod5summ$adjr2, digits = 2),
           mod5summ$outmat[,order(-om2)])

coef(mod5,1:8)

#Sequential Replacement
mod6= regsubsets(Price ~., data= dftrain, nbest = 1, nvmax = NULL,
                 force.in = NULL, force.out = NULL,
                 method = "seqrep", intercept = T)
mod6summ= summary(mod6)

countspch= function(x) sum(x=="*")
om3= as.integer(apply(mod6summ$outmat,2,countspch))
data.frame("Coeff"=as.integer(apply(mod6summ$outmat,1,countspch)),
           "RSS"=mod6summ$rss,
           "Cp"=round(mod6summ$cp, digits = 2),
           "R-sq"=round(mod6summ$rsq, digits = 2),
           "Adj.R-sq"=round(mod6summ$adjr2, digits = 2),
           mod6summ$outmat[,order(-om3)])

coef(mod6,1:8)

#Stepwise Regression
mod7= step(lm(Price~., data = dftrain), direction = "both")
#options(op)
--------------------------------------------------------------------------------
/Naive Bayes.R:
--------------------------------------------------------------------------------
library(xlsx)

# FlightDetails.xlsx
df= read.xlsx(file.choose(),1,header = T)
df= df[,!apply(is.na(df),2,all)]
df= df[!apply(is.na(df),1,all),]
head(df)
str(df)

dfb= df
df= dfb
# Correct arrival and departure times
df$STD= format(df$STD, "%H:%M:%S")
df$STD= as.POSIXlt(paste(df$Date,df$STD), format = "%Y-%m-%d %H:%M:%S")
df$ATD= format(df$ATD, "%H:%M:%S")
df$ATD= as.POSIXlt(paste(df$Date,df$ATD), format = "%Y-%m-%d %H:%M:%S")
df$STA= format(df$STA, "%H:%M:%S")
df$STA= as.POSIXlt(paste(df$Date,df$STA), format = "%Y-%m-%d %H:%M:%S")
df$ATA= format(df$ATA, "%H:%M:%S")
df$ATA= as.POSIXlt(paste(df$Date,df$ATA), format = "%Y-%m-%d %H:%M:%S")

head(df)
str(df)

dfb2= df

df= dfb
df$STD= strptime(format(df$STD, "%H:%M:%S"),"%H:%M:%S")
df$ATD= strptime(format(df$ATD, "%H:%M:%S"),"%H:%M:%S")
df$STA= strptime(format(df$STA, "%H:%M:%S"),"%H:%M:%S")
df$ATA= strptime(format(df$ATA, "%H:%M:%S"),"%H:%M:%S")

head(df)
str(df)

#Break departure time into appropriate time intervals
range(df$ATD)
breaks = seq(strptime("00:00:00", "%H:%M:%S"), strptime("24:00:00","%H:%M:%S"),
             by = "6 hours")
labelsv= c("0-6","6-12","12-18","18-24")
DEPT= cut(df$ATD, breaks= breaks, right= F, labels = labelsv)

df= cbind(df, DEPT)

df$Day= as.factor(df$Day)
levels(df$Day)
levels(df$Day)= c("Sunday","Monday")

head(df)
str(df)

dfb3= df
df= df[,-c(1,3,5,8)]
str(df)
head(df)

partidx= sample(1:nrow(df), 0.6*nrow(df), replace = F)
dftrain= df[partidx,]
dftest= df[-partidx,]

library(e1071)
mod= naiveBayes(Flight.Status ~ ., dftrain)
attributes(mod)

mod$apriori
mod$tables
path= ""
write.xlsx(dftrain, paste0(path, "FlightDetails.xlsx"))  # paste0 joins path and file name

mod$tables$Flight.Carrier
mod$tables$Flight.Carrier[1,3]
mod$tables$Flight.Carrier["ontime","Indigo"]

# Find exact matches, as complete (exact) Bayes would require
dftrain[which(dftrain$Flight.Carrier=="Indigo" &
              dftrain$SRC=="MAA" &
              dftrain$DEST=="IXC" &
              dftrain$Day=="Monday" &
              dftrain$DEPT=="0-6"),]

#NAIVE BAYES formula (numerators):
# P(delayed|Example)
p1= (mod$apriori[["delayed"]]/nrow(dftrain))*
  (mod$tables$Flight.Carrier["delayed","Indigo"])*
  (mod$tables$SRC["delayed","MAA"])*
  (mod$tables$DEST["delayed","IXC"])*
  (mod$tables$DEPT["delayed","0-6"])*
  (mod$tables$Day["delayed","Monday"])
print(p1, digits=4)

# P(ontime|Example)
p2= (mod$apriori[["ontime"]]/nrow(dftrain))*
  (mod$tables$Flight.Carrier["ontime","Indigo"])*
  (mod$tables$SRC["ontime","MAA"])*
  (mod$tables$DEST["ontime","IXC"])*
  (mod$tables$DEPT["ontime","0-6"])*
  (mod$tables$Day["ontime","Monday"])
print(p2, digits=4)
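# Cross-check of the hand computation (a sketch): scoring one synthetic row
# with the same attribute values should reproduce the normalized posteriors
# p1/(p1+p2) and p2/(p1+p2) computed below, up to predict()'s default
# threshold for zero cells. newobs is our own name; it is built from a
# training row so the factor levels match the fitted model.
newobs= dftrain[1, names(dftrain) != "Flight.Status"]
newobs$Flight.Carrier= "Indigo"; newobs$SRC= "MAA"; newobs$DEST= "IXC"
newobs$Day= "Monday"; newobs$DEPT= "0-6"
predict(mod, newobs, type= "raw")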
# Actual probabilities
# P(delayed|Example)
p1/(p1+p2)
# P(ontime|Example)
p2/(p1+p2)

#Scoring test partition
modtest= predict(mod, dftest[,-5], type="class")
modtestp= predict(mod, dftest[,-5], type="raw")

table("Actual class"=dftest$Flight.Status, "Predicted class"=modtest)
head(data.frame("Predicted class"=modtest,
                "Actual class"=dftest$Flight.Status,
                "Prob for 1(success)"=modtestp[,"delayed"],
                dftest[-5]))

#Classification accuracy
mean(modtest==dftest$Flight.Status)
#Misclassification rate
mean(modtest!=dftest$Flight.Status)

#Scoring training partition
modtrain= predict(mod, dftrain[,-5])
table("Actual class"=dftrain$Flight.Status, "Predicted class"=modtrain)

#Classification accuracy
mean(modtrain==dftrain$Flight.Status)
#Misclassification rate
mean(modtrain!=dftrain$Flight.Status)

#Cumulative Lift Curve
cases = 1:nrow(dftest)
modtestn= dftest$Flight.Status
levels(modtestn)= c(1,0) # c("delayed","ontime")
modtestn= as.numeric(as.character(modtestn))
dfl= data.frame("prob"=modtestp[,"delayed"], "actual class"=modtestn)
dfl= dfl[order(-dfl$prob),]

cumAC= NULL
cumAC[1]= dfl$actual.class[1]
for(i in 2:nrow(dfl)) {
  cumAC[i]= cumAC[i-1] + dfl$actual.class[i]
}

plot(cases, cumAC, type="l",
     xlab="# cases", ylab= "Cumulative", xlim=c(0,50), ylim = c(0,20))
segments(0,0,nrow(dfl),cumAC[nrow(dfl)], lty= 3)
legend(25,5, inset= 0.005,
       c("Cumulative 1's sorted by predicted values",
         "Cumulative 1's using average"),
       lty= c(1,2), cex= 0.7, x.intersp = 0.3, y.intersp = 0.3)
--------------------------------------------------------------------------------
/Partitioning&Regression.R:
--------------------------------------------------------------------------------
dfh= data.frame("Promotions"=c(2.00,3.50,6.00,6.50,7.50,8.00,9.00),
                "sales"=c(5.00,8.00,5.50,14.00,13.50,14.50,13.50)); dfh

summary(dfh)

plot(dfh$Promotions, dfh$sales, las=1,
     xlab= "Promotions (in crores)", ylab="Sales (in crores)",
     xlim= c(0,10), ylim= c(0,16))

lines(spline(dfh$Promotions, dfh$sales, method= "fmm"))
lines(smooth.spline(dfh$Promotions, dfh$sales))

library(xlsx)

df= read.xlsx(file.choose(),1,header = T)
df= df[,!apply(is.na(df),2,all)]

names(df)

df[1:9,]

# sorting for outlier detection
head(data.frame("KM"=df$KM, "SR_Price"=df$SR_Price,
                "Mfg_year"=df$Mfg_Year)[order(-df$KM),])

Age= 2017- df$Mfg_Year
df= cbind(df,Age)
df1= df[,-c(1,2,3)]
head(df1)

set.seed(12345)

partidx= sample(1:nrow(df1), 0.5*nrow(df1), replace = F)
df1train= df1[partidx,]
df1test= df1[-partidx,]

mod= lm(Price ~ ., df1train)
summary(mod)
Residualtrain= df1train$Price- mod$fitted.values
head(data.frame("Actual value"= df1train$Price,
                "Predicted value"= mod$fitted.values,
                Residualtrain))
modtest = predict(mod, df1test[,-c(4)])
Residualtest= df1test$Price- modtest
head(data.frame("Actual value"=df1test$Price, "Predicted value"=modtest,
                Residualtest))

install.packages("rminer", dependencies = T)
library(rminer)
mmetric(df1train$Price, mod$fitted.values, c("SSE","RMSE","ME"))
mmetric(df1test$Price, modtest, c("SSE","RMSE","ME"))
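# For reference (a sketch): the same three quantities computed directly from
# the test residuals, assuming ME is the plain mean residual.
c(SSE= sum(Residualtest^2),
  RMSE= sqrt(mean(Residualtest^2)),
  ME= mean(Residualtest))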
--------------------------------------------------------------------------------
/Performance_Metrices.R:
--------------------------------------------------------------------------------
library(xlsx)

df= read.xlsx(file.choose(),1, header= T)
df= df[,!apply(is.na(df), 2,all)]

plot(df$Annual_Income, df$Household_Area, las=1, xlab= "Annual Income",
     ylab= "Household Area", xlim= c(2,12), ylim = c(13,25),
     pch= c(21,19)[as.numeric(df$Ownership)])

legend("bottomright", inset= 0.005, c("Owner", "Non-owner"), pch= c(19,21),
       cex= 0.7, x.intersp = 0.5, y.intersp = 0.5)

#Promoffers.xlsx
df1= read.xlsx(file.choose(),1, header= T)
df1= df1[,!apply(is.na(df1), 2,all)]

palette()
palette(c("gray","black"))

plot(df1$Income, df1$Spending, xlim=c(0,225), ylim=c(0,11),
     xlab="Income", ylab="Spending", col= as.factor(df1$Promoffer),
     pch=19, cex=0.8, panel.first = grid())

plot(jitter(df1$Income,1), df1$Spending, xlim=c(0,225), ylim=c(0,11),
     xlab="Income", ylab="Spending", col= as.factor(df1$Promoffer),
     pch=20, cex=0.8, panel.first = grid())
par(mar=c(4,4,1,1), oma=c(1,1,1,1))

plot(jitter(df1$Income,1), df1$Spending, log= "xy",
     xlab="Income", ylab="Spending", col= as.factor(df1$Promoffer),
     pch=20, cex=0.7, panel.first = grid())
palette("default")

#Classification Matrix
cm= matrix(c(400,50,25,2525), 2,2,T, list(c("1","0"),c("1","0")))

err= (cm['0','1']+cm['1','0'])/sum(cm)
accuracy= (cm['1','1']+cm['0','0'])/sum(cm)
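# Two related rates from the same matrix (a sketch), treating "1" as the
# positive class; rows of cm are actual values, columns are predictions.
sensitivity= cm['1','1']/(cm['1','1']+cm['1','0'])  # true positive rate
specificity= cm['0','0']/(cm['0','0']+cm['0','1'])  # true negative rate
c(sensitivity, specificity)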
#ROC Curve
df2= read.xlsx(file.choose(),1, header= T)
df2= df2[,!apply(is.na(df2), 2,all)]
df2= df2[!apply(is.na(df2),1,all),]

data.frame("cutoffvalue"=df2$cutoff, "OneMinusSpecificity"=1-df2$specificity,
           "Sensitivity"=df2$senstivity)[order(df2$cutoff, decreasing = T),]
plot(1-df2$specificity, df2$senstivity, type= "p",
     xlab = "1-Specificity", ylab= "Sensitivity", pch=19)

plot(1-df2$specificity, df2$senstivity, type= "s",
     xlab = "1-Specificity", ylab= "Sensitivity", pch=19)
segments(0,0,1,1, lty= 3)
legend("right", inset= 0.005, c("Random", "ROC"), lty= c(2,1), bty="n",
       cex= 0.7, x.intersp = 0.3, y.intersp = 0.3)

#Cumulative Lift Curve / Gains Chart
df3= read.xlsx(file.choose(),1, header= T)
df3= df3[,!apply(is.na(df3), 2,all)]
df3= df3[!apply(is.na(df3),1,all),]

range(df3$Cumulative.Actual.Class)
range(df3$Serial.no.)
plot(df3$Serial.no., df3$Cumulative.Actual.Class, type = "l",
     xlab= "# Cases", ylab= "Cumulative", xlim= c(0,30), ylim= c(0,14))
segments(0,0,24,12, lty= 3)
segments(1,1,12,12, lty= 4, col= "red")
segments(12,12,24,12, lty= 4, col= "red")
legend(22,10, inset= 0.005, c("Cumulative 1's sorted by predicted values",
                              "Cumulative 1's using random selection"),
       lty= c(1,2), bty="n", cex= 0.7, x.intersp = 0.3, y.intersp = 0.3)

#Decile Chart
decilecases= round(seq(0.1,1,0.1)*length(df3$Serial.no.))
decile= NULL
decilemean= NULL
globalmean= length(which(df3$Actual.Class==1))/length(df3$Actual.Class)
j=0
for(i in decilecases) {
  j= j+1
  decilemean[j]= df3$Cumulative.Actual.Class[i]/i
  decile[j]= decilemean[j]/globalmean
}
barplot(decile, names.arg = as.factor(seq(1,10,1)), xlab="Deciles",
        ylab= "Decile mean/Global mean", ylim = c(0,2.5))

#Cumulative lift curve (gains chart) incorporating costs
#Cutoffdata.xlsx
df4= read.xlsx(file.choose(), 5, colIndex = 1:5, T)
df4= df4[,!apply(is.na(df4), 2,all)]
head(df4)

range(df4$Cumulative.Cost.)
range(df4$Serial.no.)
plot(df4$Serial.no., df4$Cumulative.Cost., type= "l", xlab = "# cases",
     ylab="Cumulative costs", xlim= c(0,25), ylim=c(5,140))
segments(0,0,24,132, lty = 3)
legend(22,10, inset= 0.005, c("Cumulative costs sorted by predicted values",
                              "reference line"),
       lty= c(1,2), bty="n", cex= 0.7, x.intersp = 0.3, y.intersp = 0.3)
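# Area under the ROC curve plotted above (a sketch), via the trapezoidal rule
# over the (1-specificity, sensitivity) points; assumes the cutoffs cover the
# whole range so the points span 0 to 1.
rocx= 1-df2$specificity
rocord= order(rocx)
sum(diff(rocx[rocord])*
      (head(df2$senstivity[rocord],-1)+tail(df2$senstivity[rocord],-1))/2)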
--------------------------------------------------------------------------------
/PredictionMetrics.R:
--------------------------------------------------------------------------------
library(xlsx)
df= read.xlsx(file.choose(),1,T)
df= df[,!apply(is.na(df),2,all)]
head(df)

plot(df$Serial.No, df$Cumulative.value, type = "l",
     xlab= "# cases", ylab= "Cumulative value",
     xlim= c(0,25), ylim= c(40,550))
segments(0,0,20,544, lty=3)
legend(12,200, inset=0.005,
       c("Cumulative value sorted by predicted value",
         "reference line"),
       lty= c(1,2), bty= "n", cex= 0.7, x.intersp= 0.3, y.intersp= 0.3)
--------------------------------------------------------------------------------
/RIntro.R:
--------------------------------------------------------------------------------
library(xlsx)

df= read.xlsx(file.choose(),1,header = T)

df1= read.xlsx("C:/Users/Harshit/Desktop/Business Analytics in R/SedanCar.xlsx",
               1, header = T)

setwd("C:/Users/Harshit/Desktop/Business Analytics in R/")

df2= read.xlsx("SedanCar.xlsx",1, header = T)

library(matrixcalc)
#matrix.inverse(Mat2)
--------------------------------------------------------------------------------
/Simple Line Plotting.R:
--------------------------------------------------------------------------------
library(xlsx)

df= read.xlsx(file.choose(),1,header = T)
df= df[,!apply(is.na(df),2,all)]
head(df)
summary(df)

plot(df$Annual_Income, df$Household_Area, las= 1,
     xlab= "Annual Income (in lakhs)", ylab= "Household Area (00s sq ft)",
     xlim= c(2,12), ylim= c(13,25), pch=c(21,19)[as.numeric(df$Ownership)])
legend("bottomright", inset= 0.005, c("Owner","Non-owner"),
       pch=c(19,21), cex= 0.7, x.intersp = 0.5, y.intersp = 0.5)
df[df$Annual_Income>5 & df$Annual_Income<8.5 & df$Household_Area>18
   & df$Household_Area<20, c(1,2)]
abline(h=18.8, col=3)
segments(7,0,7,18.8, col = 3)
segments(5.8,18.8,5.8,26, col = 3)

df[df$Annual_Income>6 & df$Annual_Income<8.5 & df$Household_Area>18
   & df$Household_Area<21, c(1,2)]
segments(5.8,19.5,13,19.5, col = 3)

df[df$Annual_Income<7 & df$Household_Area>17 & df$Household_Area<19, c(1,2)]
segments(0,18.2,7,18.2, col = 3)
--------------------------------------------------------------------------------
/Specialized Visualization Techniques.R:
--------------------------------------------------------------------------------
library(xlsx)

df1= read.xlsx(file.choose(),1, header= T)
df1= df1[,!apply(is.na(df1), 2,all)]

Age= 2017- df1$Mfg_Year
df1= cbind(df1,Age)
dfb= df1
df1= df1[,-c(1,2,3)]

head(df1)
str(df1)
df1$Transmission= as.factor(df1$Transmission)
df1$C_Price= as.factor(df1$C_Price)
str(df1)
summary(df1)

df1= df1[df1$Price<70,]
dfb= df1
df1= df1[-23,]

head(df1)
range(df1$KM)
boxplot(df1$Price~df1$Transmission, ylim= c(0,15), xlab="Transmission",
        ylab="Price")
means= by(df1$Price, df1$Transmission, mean)
points(1:2, means, pch=3)

boxplot(df1$KM~df1$C_Price, ylim= c(25,180), xlab="C_Price",
        ylab="KM")
means1= by(df1$KM, df1$C_Price, mean)
points(1:2, means1, pch=3)

boxplot(df1$Age~df1$C_Price, ylim= c(0,12), xlab="C_Price",
        ylab="Age")
means2= by(df1$Age, df1$C_Price, mean)
points(1:2, means2, pch=3)

boxplot(df1$SR_Price~df1$C_Price, ylim= c(0,25), xlab="C_Price",
        ylab="SR_Price")
means3= by(df1$SR_Price, df1$C_Price, mean)
points(1:2, means3, pch=3)

#Heatmaps
#correlation matrix
M= cor(df1[,-c(1,5,8)])
symnum(M)
M[upper.tri(M)]= NA

#correlation table heatmap
heatmap(M, Rowv = NA, symm= T, col= grey.colors(100, start= 0.8, end=0.2),
        scale= "none", margins= c(8,4))

#Missing value heatmap
heatmap(head(as.matrix(df1[,-c(1,5,8)])), Rowv = NA, Colv = NA,
        col= grey.colors(1000, start= 0.8, end=0.0), scale= "column",
        margins= c(8,4))

heatmap(as.matrix(df1[,-c(1,5,8)]), Rowv = NA, Colv = NA,
        col= grey.colors(1000, start= 0.8, end=0.0), scale= "column",
        margins= c(8,4))

#Multidimensional Visualization
palette()
palette(rainbow(6))
palette("default")

range(df1$Age)
plot(df1$Age, df1$KM, xlim=c(0,12), xlab= "Age", ylab= "KM", col= df1$C_Price)

#separate panel for each group
Age_groups= levels(as.factor(df1$Age))
Age_groups2= as.numeric(Age_groups)
avgPrice1= NULL
avgPrice2= NULL
for(x in Age_groups2){
  avgPrice1= c(avgPrice1, mean(df1[which(df1$Age == x & df1$Transmission == 0),]$Price))
  avgPrice2= c(avgPrice2, mean(df1[which(df1$Age == x & df1$Transmission == 1),]$Price))
}
# empty Age/Transmission groups produce NaN means; zero them out with is.nan()
avgPrice1[is.nan(avgPrice1)]= 0
avgPrice2[is.nan(avgPrice2)]= 0
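# The loop above can be collapsed with tapply (a sketch; avgP is our own
# name). Empty Age/Transmission combinations come back as NA here and are
# zeroed the same way.
avgP= tapply(df1$Price, list(df1$Transmission, df1$Age), mean)
avgP[is.na(avgP)]= 0
avgP  # rows "0"/"1" = Transmission, columns = Age groups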
98 | # MULTIPANEL PLOT VISUALIZATION 99 | pairs(~ SR_Price + KM + Price + Age, data= df1) 100 | 101 | par(mfrow=c(2,2),cex= 0.6, mar= c(3,3,0,0),oma= c(1,1,1,1)) 102 | 103 | plot(df1$KM,df1$Price, xlim=c(0,180), ylim= c(0,15),xlab="",ylab="") 104 | mtext("KM",side=1, line= 2.2, cex= 0.7) 105 | mtext("Price",side=2, line= 2.2, cex= 0.7) 106 | 107 | plot(df1$KM,df1$Price,log="xy" ,xlim=c(10,1000), ylim= c(0.1,100), 108 | xaxt="n",yaxt="n",xlab="",ylab="") 109 | axis(1,at=c(10,100,1000),labels=c("10","100","1000")) 110 | axis(2,at=c(0.1,1,10,100),labels=c("0.1","1","10","100")) 111 | mtext("KM",side=1, line=2, cex=0.6) 112 | mtext("Price",side=2, line=2, cex=0.6) 113 | 114 | range(dfb$Price) 115 | boxplot(dfb$Price~dfb$Transmission, ylim=c(0,75),xlab="",ylab="") 116 | mtext("Trans",side=1, line=2, cex=0.6) 117 | mtext("Price",side=2, line=2, cex=0.6) 118 | 119 | boxplot(dfb$Price~dfb$Transmission, log="y",ylim=c(0.1,100),xlab="",ylab="") 120 | mtext("Trans",side=1, line=2, cex=0.6) 121 | mtext("Price",side=2, line=2, cex=0.6) 122 | 123 | #AGGREGATIONS , ATTACHING A CURVE , ZOOMING IN 124 | par(mfrow=c(2,2),cex= 0.6, mar= c(2.7,2.5,1,0.5),oma= c(0,0,0,0)) 125 | df= read.xlsx(file.choose(), 1 ,header = T) 126 | df= df[,!apply(is.na(df), 2,all)] 127 | tsv= ts(df$Riders, start= c(2004,1), frequency= 12) #tsv is plotted below but was never created in the original script; assuming a monthly Riders series starting Jan 2004 128 | at1= seq(as.Date("2004-01-01"), as.Date("2017-03-01"),by="2 years") 129 | labels1=format(at1,"%b-%Y") 130 | at2= as.numeric(format(at1,"%Y")) 131 | plot(tsv,xaxt="n",yaxt="n") 132 | axis(1,at=at2, labels=labels1,cex.axis=0.8) 133 | axis(2,cex.axis=0.8) 134 | mtext(side=1,text="Month" ,line=2, cex=0.6) 135 | mtext(side=2,text="Riders", line=2, cex=0.6) 136 | title(main="Overlaying a quadratic curve on Raw Series",adj=0, cex.main=0.9) 137 | 138 | lines(lowess(tsv), col="red") 139 | 140 | t=seq(1,length(df$Month), by=1) 141 | tsq= t*t 142 | points(time(tsv), predict(lm(df$Riders~t+tsq)), col="green") 143 | abline(v=at2, h=axTicks(2), col="gray", lty=3) 144 | 145 | rideBym=NULL #sum Riders by calendar month: 13 full years inside the loop, plus Jan-Mar 2017 afterwards 146 | for(it in 1:12){ 147 | rideBym[it]=0 148 | } 149 | 150 | i=1 151 | while(i<=145){ 152 | rideBym[1]=rideBym[1]+df$Riders[i] 153 | rideBym[2]=rideBym[2]+df$Riders[i+1] 154 | rideBym[3]=rideBym[3]+df$Riders[i+2] 155 | rideBym[4]=rideBym[4]+df$Riders[i+3] 156 | rideBym[5]=rideBym[5]+df$Riders[i+4] 157 | rideBym[6]=rideBym[6]+df$Riders[i+5] 158 | rideBym[7]=rideBym[7]+df$Riders[i+6] 159 | rideBym[8]=rideBym[8]+df$Riders[i+7] 160 | rideBym[9]=rideBym[9]+df$Riders[i+8] 161 | rideBym[10]=rideBym[10]+df$Riders[i+9] 162 | rideBym[11]=rideBym[11]+df$Riders[i+10] 163 | rideBym[12]=rideBym[12]+df$Riders[i+11] 164 | i=i+12 165 | } 166 | rideBym[1]=rideBym[1]+df$Riders[i] 167 | rideBym[2]=rideBym[2]+df$Riders[i+1] 168 | rideBym[3]=rideBym[3]+df$Riders[i+2] 169 | 170 | avgBym= c(rideBym[1]/14, rideBym[2]/14,rideBym[3]/14,rideBym[4]/13,rideBym[5]/13, 171 | rideBym[6]/13, rideBym[7]/13,rideBym[8]/13,rideBym[9]/13,rideBym[10]/13, 172 | rideBym[11]/13, rideBym[12]/13) 173 | 174 | tsv1= ts(avgBym, start=1 , end=12, frequency = 1) 175 | plot(tsv1, xaxt="n", yaxt="n") 176 | #tsv1 runs over times 1:12, so plain month positions and month.abb label the axis 177 | at3= 1:12 178 | axis(1,at= at3, labels = month.abb,las=3,cex.axis=0.8) 179 | axis(2,cex.axis=0.8) 180 | mtext(side=1,text="Month" ,line=2, cex=0.6) 181 | mtext(side=2,text="AvgRiders", line=2, cex=0.6) 182 | title(main="Aggregation by Month",adj=0, cex.main=0.9) 183 | 184 | abline(v= at3, h=axTicks(2), col="gray", lty=3) 185 | 
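# A compact alternative to the ride-by-month loop above (a sketch, assuming
# tsv is the monthly series built earlier): cycle() tags each observation
# with its month number 1-12, so tapply() averages per calendar month and
# handles the unequal counts (14 Januaries vs 13 Decembers) automatically.
avgBym2= tapply(as.numeric(tsv), cycle(tsv), mean)
# all.equal(as.numeric(avgBym2), avgBym) should be TRUE.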
186 | tsvz= window(tsv, start=c(2004,1), end=c(2005,12)) 187 | plot(tsvz, xaxt="n", yaxt="n") 188 | at5=NULL 189 | i=1 190 | while(i<=24){ 191 | at5=c(at5,time(tsvz)[i]) 192 | i=i+4 193 | } 194 | at6= seq(as.Date("2004-01-01"),as.Date("2005-12-01"),by="4 months") 195 | axis(1,at= at5, labels = format(at6,"%d/%m/%Y"),cex.axis=0.8) 196 | axis(2,cex.axis=0.8) 197 | mtext(side=1,text="Month" ,line=2, cex=0.6) 198 | mtext(side=2,text="AvgRiders", line=2, cex=0.6) 199 | title(main="Zooming into first 2 years",adj=0, cex.main=0.9) 200 | 201 | abline(v=at5, h=axTicks(2), col="gray", lty=3) 202 | 203 | plot(aggregate(tsv, FUN=mean),xaxt="n",yaxt="n",cex.axis=0.6) 204 | axis(1,cex.axis=0.8) 205 | axis(2,cex.axis=0.8) 206 | mtext(side=1,text="Year" ,line=2, cex=0.6) 207 | mtext(side=2,text="AvgRiders", line=2, cex=0.6) 208 | title(main="Aggregation for year",adj=0, cex.main=0.9) 209 | grid() 210 | -------------------------------------------------------------------------------- /Visualization Techniques.R: -------------------------------------------------------------------------------- 1 | library(xlsx) 2 | 3 | df1= read.xlsx(file.choose(),1, header= T) 4 | df1= df1[,!apply(is.na(df1), 2,all)] 5 | 6 | Age= 2017- df1$Mfg_Year 7 | df1= cbind(df1,Age) 8 | dffb= df1 9 | df1= df1[,-c(1,2,3)] 10 | 11 | head(df1) 12 | str(df1) 13 | df1$Transmission= as.factor(df1$Transmission) 14 | df1$C_Price= as.factor(df1$C_Price) 15 | str(df1) 16 | summary(df1) 17 | dfb=df1 18 | df1=df1[-23,] 19 | 20 | dffb[dffb$Price>70,] 21 | dffb[dffb$Price>12,] 22 | dffb[dffb$KM>150,] 23 | 24 | dffb= dffb[-c(13,23,29,65,73),] 25 | range(dffb$KM) 26 | range(dffb$Price) 27 | plot(dffb$KM,dffb$Price, xlim= c(25,120), ylim=c(1,9),xlab="KM",ylab="Price", panel.first = grid()) 28 | #dffb$Model= as.factor(dffb$Model) 29 | #dffb$Model= as.numeric(dffb$Model) 30 | text(dffb$KM,dffb$Price , dffb$Model, adj= c(-0.4,-0.4), cex= 0.5) 31 | #dffb$Model= as.factor(dffb$Model) 32 | 33 | df3= read.xlsx(file.choose(),1, header= T) 34 | df3= df3[,!apply(is.na(df3), 2,all)] 35 | 36 | palette() 37 | palette(c("gray","black")) 38 | 39 | plot(df3$Income, df3$Spending, xlim=c(0,225), ylim=c(0,11), 40 | xlab="Income", ylab="Spending", col= as.factor(df3$Promoffer), 41 | pch=19, cex=0.8, panel.first = grid()) 42 | 43 | plot(jitter(df3$Income,1), df3$Spending, xlim=c(0,225), ylim=c(0,11), 44 | xlab="Income", ylab="Spending", col= as.factor(df3$Promoffer), 45 | pch=20, cex=0.8, panel.first = grid()) 46 | par(mar=c(4,4,1,1), oma=c(1,1,1,1)) 47 | 48 | plot(jitter(df3$Income,1), df3$Spending, log= "xy", 49 | xlab="Income", ylab="Spending", col= as.factor(df3$Promoffer), 50 | pch=20, cex=0.7, panel.first = grid()) 51 | palette("default") 52 | 53 | #MULTIVARIATE PLOT 54 | #PARALLEL COORDINATES PLOT 55 | library(MASS) 56 | par(mfrow=c(2,1), cex=0.6, mar= c(3,3,0,0), oma=c(1,1,1,1)) 57 | df4= df1 58 | levels(df4$Fuel_type)=1:length(levels(df4$Fuel_type)) 59 | df4=as.data.frame(lapply(df4,FUN=as.numeric)) 60 | 61 | parcoord(df4[which(df4$C_Price=='1'),-c(4,8)]) 62 | axis(2,at=axTicks(2), labels=c("0%","20%","40%","60%","80%","100%")) 63 | grid() 64 | parcoord(df4[which(df4$C_Price=='2'),-c(4,8)],col="gray") 65 | axis(2,at=axTicks(2), labels=c("0%","20%","40%","60%","80%","100%")) 66 | grid() 67 | 
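# A single-panel variant (sketch): parcoord() also takes a col vector, so
# both C_Price classes can be overlaid in one panel instead of the two above.
parcoord(df4[,-c(4,8)], col= c("black","gray")[df4$C_Price])
grid()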
68 | #Specialized Visualization 69 | #Network Data 70 | #Network Graph 71 | #Two mode or bipartite graphs 72 | #Example for association rules 73 | 74 | item1= sample(LETTERS[1:10], size=50 ,T) 75 | pool= letters[1:10] 76 | item2=NULL 77 | for(i in 1:50) { 78 | item2=c(item2,sample(pool[-which(pool==tolower(item1[i]))],size=1,replace=T)) 79 | } 80 | df5= data.frame(item1,item2) 81 | 82 | library(igraph) 83 | g= graph_from_data_frame(df5,directed = F) 84 | 85 | V(g)$label= V(g)$name 86 | V(g)[1:10]$type=1 87 | V(g)[11:20]$type=2 88 | 89 | V(g)$color= "gray" 90 | E(g)$color= "black" 91 | V(g)$shape= "circle" 92 | 93 | V(g)$x= c(runif(10,0,5),runif(10,10,15)) 94 | V(g)$y= c(seq(10,1,by=-1),seq(10,1,by=-1)) 95 | 96 | 97 | 98 | E(g)$weight= count.multiple(g) 99 | g1= simplify(g, remove.multiple = T, edge.attr.comb = list(weight="first")) #keep one copy of the multiplicity; the default combiner would sum it across parallel edges 100 | E(g1)$width= 0.5*E(g1)$weight 101 | 102 | 103 | size= NULL 104 | for(i in V(g1)$name){ 105 | size=c(size,length(E(g1)[from(V(g1)[i])])) 106 | } 107 | V(g1)$size= 4*size 108 | par(mar= rep(.1,4)) 109 | 110 | V(g1)$color= "gray" 111 | E(g1)$color= "black" 112 | 113 | plot(g1) 114 | 115 | #Hierarchical Data 116 | #Treemaps 117 | df6= read.xlsx(file.choose(),1, header= T) 118 | df6= df6[,!apply(is.na(df6), 2,all)] 119 | 120 | library(treemap) 121 | rec.size= ifelse(df6$price>=5000, 5000+df6$price/10, df6$price) #ifelse() needs yes and no values; assuming the intent was to damp prices above 5000 so large items do not swamp the treemap 122 | df6= cbind(df6,rec.size) 123 | 124 | par(mar= rep(.1,4)) 125 | 126 | treemap(df6,index= c("item.category","subcategory","brand"), 127 | vSize= "rec.size", vColor="rating", 128 | type= "value", fun.aggregate = "mean", 129 | palette = gray(0:4/4), fontsize.labels = c(11,9,6), 130 | title= "", position.legend = "none") 131 | 132 | #Geographical data 133 | #Map chart 134 | df7= read.xlsx(file.choose(),1, header= T) 135 | df7= df7[,!apply(is.na(df7), 2,all)] 136 | library(rworldmap) 137 | 138 | mapDevice(rows= 2, columns= 1) 139 | datamap= joinCountryData2Map(df7, nameJoinColumn = "Country", joinCode = "NAME") 140 | mapCountryData(datamap, nameColumnToPlot = "Inclusive.Internet.Index", 141 | catMethod = "pretty", colourPalette = gray(7:0/7), 142 | addLegend = F) 143 | mapCountryData(datamap, nameColumnToPlot = "Corruptions.Perceptions.Index", 144 | catMethod = "pretty", colourPalette = gray(7:0/7), 145 | addLegend = F) 146 | -------------------------------------------------------------------------------- /Welch's T-Test.R: -------------------------------------------------------------------------------- 1 | library(xlsx) 2 | 3 | df= read.xlsx(file.choose(),1,header = T) 4 | df= df[,!apply(is.na(df),2,all)] 5 | head(df) 6 | summary(df) 7 | 8 | cov(df$Annual_Income, df$Household_Area) 9 | 10 | cor(df$Annual_Income, df$Household_Area) 11 | 12 | mean(df$Annual_Income) 13 | 14 | median(df$Annual_Income) 15 | 16 | IQR(df$Annual_Income) 17 | 18 | sd(df$Annual_Income) 19 | 20 | var(df$Annual_Income) 21 | 22 | apply(df[,c(1,2)],MARGIN = 2,FUN = sd) 23 | 24 | mmdiff= function(df){ 25 | apply(df,MARGIN = 2,function(x){max(x)-min(x)}) 26 | } 27 | mmdiff(df[,c(1,2)]) 28 | 29 | x= rnorm(100) 30 | y=x+ rnorm(100, mean=0, sd=0.6) 31 | 32 | df1= as.data.frame(cbind(x,y)) 33 | head(df1) 34 | summary(df1) 35 | 36 | plot(df1$x, df1$y, las=1, main= "Scatterplot of x and y", 37 | xlab= "x", ylab="y", 38 | xlim=c(-3,3),ylim=c(-4,4)) 39 | 40 | x1= rnorm(20,mean=50,5) 41 | y1= rnorm(30,mean=60,5) 42 | 43 | t.test(x1,y1,var.equal = T) 44 | 45 | qt(p=0.05/2,df= 48, lower.tail = F) #two-sided critical value for the pooled test: df = 20+30-2 = 48 46 | 47 | #welch's t-test 48 | t.test(x1,y1,var.equal = F) 49 | 50 | Ads= sample(c("AD1","AD2","NoAD"),size=100,replace=T) 51 | purchase= ifelse(Ads=='AD1', rnorm(100,mean=500,sd=80), 52 | ifelse(Ads=='AD2', rnorm(100,mean=600,sd=80), 53 | rnorm(100,mean=200,sd=80))) 54 | df2= data.frame(Ads= as.factor(Ads),purchase) 55 | head(df2) 56 | summary(df2$Ads) 57 | summary(df2[df2$Ads=='AD1',2]) 58 | 
summary(df2[df2$Ads=='AD2',2]) 59 | summary(df2[df2$Ads=='NoAD',2]) 60 | 61 | mod= aov(purchase~Ads, data= df2) 62 | summary(mod) 63 | -------------------------------------------------------------------------------- /cereal.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/cereal.Rdata -------------------------------------------------------------------------------- /cereal_data_set.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/cereal_data_set.zip -------------------------------------------------------------------------------- /cereal_data_set/cereal.csv: -------------------------------------------------------------------------------- 1 | 100%_Bran N C 70 4 1 130 10 5 6 3 280 25 1 0.33 2 | 100%_Natural_Bran Q C 120 3 5 15 2 8 8 3 135 0 1 -1 3 | All-Bran K C 70 4 1 260 9 7 5 3 320 25 1 0.33 4 | All-Bran_with_Extra_Fiber K C 50 4 0 140 14 8 0 3 330 25 1 0.5 5 | Almond_Delight R C 110 2 2 200 1 14 8 3 -1 25 1 0.75 6 | Apple_Cinnamon_Cheerios G C 110 2 2 180 1.5 10.5 10 1 70 25 1 0.75 7 | Apple_Jacks K C 110 2 0 125 1 11 14 2 30 25 1 1 8 | Basic_4 G C 130 3 2 210 2 18 8 3 100 25 1.33 0.75 9 | Bran_Chex R C 90 2 1 200 4 15 6 1 125 25 1 0.67 10 | Bran_Flakes P C 90 3 0 210 5 13 5 3 190 25 1 0.67 11 | Cap'n'Crunch Q C 120 1 2 220 0 12 12 2 35 25 1 0.75 12 | Cheerios G C 110 6 2 290 2 17 1 1 105 25 1 1.25 13 | Cinnamon_Toast_Crunch G C 120 1 3 210 0 13 9 2 45 25 1 0.75 14 | Clusters G C 110 3 2 140 2 13 7 3 105 25 1 0.5 15 | Cocoa_Puffs G C 110 1 1 180 0 12 13 2 55 25 1 1 16 | Corn_Chex R C 110 2 0 280 0 22 3 1 25 25 1 1 17 | Corn_Flakes K C 100 2 0 290 1 21 2 1 35 25 1 1 18 | Corn_Pops K C 110 1 0 90 1 13 12 2 20 25 1 1 19 | Count_Chocula G C 110 1 1 180 0 12 13 2 65 25 1 1 20 | Cracklin'_Oat_Bran K C 110 3 3 140 4 10 7 3 160 25 1 0.5 21 | Cream_of_Wheat_(Quick) N H 100 3 0 80 1 21 0 2 -1 0 1 1 22 | Crispix K C 110 2 0 220 1 21 3 3 30 25 1 1 23 | Crispy_Wheat_&_Raisins G C 100 2 1 140 2 11 10 3 120 25 1 0.75 24 | Double_Chex R C 100 2 0 190 1 18 5 3 80 25 1 0.75 25 | Froot_Loops K C 110 2 1 125 1 11 13 2 30 25 1 1 26 | Frosted_Flakes K C 110 1 0 200 1 14 11 1 25 25 1 0.75 27 | Frosted_Mini-Wheats K C 100 3 0 0 3 14 7 2 100 25 1 0.8 28 | Fruit_&_Fibre_Dates,_Walnuts,_and_Oats P C 120 3 2 160 5 12 10 3 200 25 1.25 0.67 29 | Fruitful_Bran K C 120 3 0 240 5 14 12 3 190 25 1.33 0.67 30 | Fruity_Pebbles P C 110 1 1 135 0 13 12 2 25 25 1 0.75 31 | Golden_Crisp P C 100 2 0 45 0 11 15 1 40 25 1 0.88 32 | Golden_Grahams G C 110 1 1 280 0 15 9 2 45 25 1 0.75 33 | Grape_Nuts_Flakes P C 100 3 1 140 3 15 5 3 85 25 1 0.88 34 | Grape-Nuts P C 110 3 0 170 3 17 3 3 90 25 1 0.25 35 | Great_Grains_Pecan P C 120 3 3 75 3 13 4 3 100 25 1 0.33 36 | Honey_Graham_Ohs Q C 120 1 2 220 1 12 11 2 45 25 1 1 37 | Honey_Nut_Cheerios G C 110 3 1 250 1.5 11.5 10 1 90 25 1 0.75 38 | Honey-comb P C 110 1 0 180 0 14 11 1 35 25 1 1.33 39 | Just_Right_Crunchy__Nuggets K C 110 2 1 170 1 17 6 3 60 100 1 -1 40 | Just_Right_Fruit_&_Nut K C 140 3 1 170 2 20 9 3 95 100 1.3 0.75 41 | Kix G C 110 2 1 260 0 21 3 2 40 25 1 1.5 42 | Life Q C 100 4 2 150 2 12 6 2 95 25 1 0.67 43 | Lucky_Charms G C 110 2 1 180 0 12 12 2 55 25 1 1 44 | Maypo A H 100 4 1 0 0 16 3 2 95 25 1 -1 45 | Muesli_Raisins,_Dates,_&_Almonds R C 150 4 3 95 3 16 11 3 
170 25 -1 -1 46 | Muesli_Raisins,_Peaches,_&_Pecans R C 150 4 3 150 3 16 11 3 170 25 -1 -1 47 | Mueslix_Crispy_Blend K C 160 3 2 150 3 17 13 3 160 25 1.5 0.67 48 | Multi-Grain_Cheerios G C 100 2 1 220 2 15 6 1 90 25 1 1 49 | Nut&Honey_Crunch K C 120 2 1 190 0 15 9 2 40 25 1 0.67 50 | Nutri-Grain_Almond-Raisin K C 140 3 2 220 3 21 7 3 130 25 1.33 0.67 51 | Nutri-grain_Wheat K C 90 3 0 170 3 18 2 3 90 25 1 -1 52 | Oatmeal_Raisin_Crisp G C 130 3 2 170 1.5 13.5 10 3 120 25 1.25 0.5 53 | Post_Nat._Raisin_Bran P C 120 3 1 200 6 11 14 3 260 25 1.33 0.67 54 | Product_19 K C 100 3 0 320 1 20 3 3 45 100 1 1 55 | Puffed_Rice Q C 50 1 0 0 0 13 0 3 15 0 0.5 1 56 | Puffed_Wheat Q C 50 2 0 0 1 10 0 3 50 0 0.5 -1 57 | Quaker_Oat_Squares Q C 100 4 1 135 2 14 6 3 110 25 1 0.5 58 | Quaker_Oatmeal Q H 100 5 2 0 2.7 -1 -1 1 110 0 1 0.67 59 | Raisin_Bran K C 120 3 1 210 5 14 12 2 240 25 1.33 0.75 60 | Raisin_Nut_Bran G C 100 3 2 140 2.5 10.5 8 3 140 25 1 0.5 61 | Raisin_Squares K C 90 2 0 0 2 15 6 3 110 25 1 0.5 62 | Rice_Chex R C 110 1 0 240 0 23 2 1 30 25 1 1.13 63 | Rice_Krispies K C 110 2 0 290 0 22 3 1 35 25 1 1 64 | Shredded_Wheat N C 80 2 0 0 3 16 0 1 95 0 0.83 -1 65 | Shredded_Wheat_'n'Bran N C 90 3 0 0 4 19 0 1 140 0 1 0.67 66 | Shredded_Wheat_spoon_size N C 90 3 0 0 3 20 0 1 120 0 1 0.67 67 | Smacks K C 110 2 1 70 1 9 15 2 40 25 1 0.75 68 | Special_K K C 110 6 0 230 1 16 3 1 55 25 1 1 69 | Strawberry_Fruit_Wheats N C 90 2 0 15 3 15 5 2 90 25 1 -1 70 | Total_Corn_Flakes G C 110 2 1 200 0 21 3 3 35 100 1 1 71 | Total_Raisin_Bran G C 140 3 1 190 4 15 14 3 230 100 1.5 1 72 | Total_Whole_Grain G C 100 3 1 200 3 16 3 3 110 100 1 1 73 | Triples G C 110 2 1 250 0 21 3 3 60 25 1 0.75 74 | Trix G C 110 1 1 140 0 13 12 2 25 25 1 1 75 | Wheat_Chex R C 100 3 1 230 3 17 3 1 115 25 1 0.67 76 | Wheaties G C 100 3 1 200 3 17 3 1 110 25 1 1 77 | Wheaties_Honey_Gold G C 110 2 1 200 1 16 8 1 60 25 1 0.75 78 | -------------------------------------------------------------------------------- /cereal_data_set/cereal.txt: -------------------------------------------------------------------------------- 1 | 70 4 1 130 10 5 6 280 25 1 0.33 2 | 120 3 5 15 2 8 8 135 0 1 -1 3 | 70 4 1 260 9 7 5 320 25 1 0.33 4 | 50 4 0 140 14 8 0 330 25 1 0.5 5 | 110 2 2 200 1 14 8 -1 25 1 0.75 6 | 110 2 2 180 1.5 10.5 10 70 25 1 0.75 7 | 110 2 0 125 1 11 14 30 25 1 1 8 | 130 3 2 210 2 18 8 100 25 1.33 0.75 9 | 90 2 1 200 4 15 6 125 25 1 0.67 10 | 90 3 0 210 5 13 5 190 25 1 0.67 11 | 120 1 2 220 0 12 12 35 25 1 0.75 12 | 110 6 2 290 2 17 1 105 25 1 1.25 13 | 120 1 3 210 0 13 9 45 25 1 0.75 14 | 110 3 2 140 2 13 7 105 25 1 0.5 15 | 110 1 1 180 0 12 13 55 25 1 1 16 | 110 2 0 280 0 22 3 25 25 1 1 17 | 100 2 0 290 1 21 2 35 25 1 1 18 | 110 1 0 90 1 13 12 20 25 1 1 19 | 110 1 1 180 0 12 13 65 25 1 1 20 | 110 3 3 140 4 10 7 160 25 1 0.5 21 | 100 3 0 80 1 21 0 -1 0 1 1 22 | 110 2 0 220 1 21 3 30 25 1 1 23 | 100 2 1 140 2 11 10 120 25 1 0.75 24 | 100 2 0 190 1 18 5 80 25 1 0.75 25 | 110 2 1 125 1 11 13 30 25 1 1 26 | 110 1 0 200 1 14 11 25 25 1 0.75 27 | 100 3 0 0 3 14 7 100 25 1 0.8 28 | 120 3 2 160 5 12 10 200 25 1.25 0.67 29 | 120 3 0 240 5 14 12 190 25 1.33 0.67 30 | 110 1 1 135 0 13 12 25 25 1 0.75 31 | 100 2 0 45 0 11 15 40 25 1 0.88 32 | 110 1 1 280 0 15 9 45 25 1 0.75 33 | 100 3 1 140 3 15 5 85 25 1 0.88 34 | 110 3 0 170 3 17 3 90 25 1 0.25 35 | 120 3 3 75 3 13 4 100 25 1 0.33 36 | 120 1 2 220 1 12 11 45 25 1 1 37 | 110 3 1 250 1.5 11.5 10 90 25 1 0.75 38 | 110 1 0 180 0 14 11 35 25 1 1.33 39 | 110 2 1 170 1 17 6 60 100 1 -1 40 | 140 3 1 170 2 20 9 
95 100 1.3 0.75 41 | 110 2 1 260 0 21 3 40 25 1 1.5 42 | 100 4 2 150 2 12 6 95 25 1 0.67 43 | 110 2 1 180 0 12 12 55 25 1 1 44 | 100 4 1 0 0 16 3 95 25 1 -1 45 | 150 4 3 95 3 16 11 170 25 -1 -1 46 | 150 4 3 150 3 16 11 170 25 -1 -1 47 | 160 3 2 150 3 17 13 160 25 1.5 0.67 48 | 100 2 1 220 2 15 6 90 25 1 1 49 | 120 2 1 190 0 15 9 40 25 1 0.67 50 | 140 3 2 220 3 21 7 130 25 1.33 0.67 51 | 90 3 0 170 3 18 2 90 25 1 -1 52 | 130 3 2 170 1.5 13.5 10 120 25 1.25 0.5 53 | 120 3 1 200 6 11 14 260 25 1.33 0.67 54 | 100 3 0 320 1 20 3 45 100 1 1 55 | 50 1 0 0 0 13 0 15 0 0.5 1 56 | 50 2 0 0 1 10 0 50 0 0.5 -1 57 | 100 4 1 135 2 14 6 110 25 1 0.5 58 | 100 5 2 0 2.7 -1 -1 110 0 1 0.67 59 | 120 3 1 210 5 14 12 240 25 1.33 0.75 60 | 100 3 2 140 2.5 10.5 8 140 25 1 0.5 61 | 90 2 0 0 2 15 6 110 25 1 0.5 62 | 110 1 0 240 0 23 2 30 25 1 1.13 63 | 110 2 0 290 0 22 3 35 25 1 1 64 | 80 2 0 0 3 16 0 95 0 0.83 -1 65 | 90 3 0 0 4 19 0 140 0 1 0.67 66 | 90 3 0 0 3 20 0 120 0 1 0.67 67 | 110 2 1 70 1 9 15 40 25 1 0.75 68 | 110 6 0 230 1 16 3 55 25 1 1 69 | 90 2 0 15 3 15 5 90 25 1 -1 70 | 110 2 1 200 0 21 3 35 100 1 1 71 | 140 3 1 190 4 15 14 230 100 1.5 1 72 | 100 3 1 200 3 16 3 110 100 1 1 73 | 110 2 1 250 0 21 3 60 25 1 0.75 74 | 110 1 1 140 0 13 12 25 25 1 1 75 | 100 3 1 230 3 17 3 115 25 1 0.67 76 | 100 3 1 200 3 17 3 110 25 1 1 77 | 110 2 1 200 1 16 8 60 25 1 0.75 78 | -------------------------------------------------------------------------------- /cereal_data_set/cereal.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/cereal_data_set/cereal.xls -------------------------------------------------------------------------------- /cereal_data_set/cereal2.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/cereal_data_set/cereal2.xls -------------------------------------------------------------------------------- /cereal_data_set/cereal_source.txt: -------------------------------------------------------------------------------- 1 | 2 | StatLib---1993 Graphics Exposition 3 | 4 | 5 | "Serial Correlation or Cereal Correlation ??" 6 | 7 | Call for Poster Presentations for the 1993 Statistical Graphics Exposition 8 | 9 | 10 | REVISED README FILE 11 | 12 | (new breakfast cereal data and new information about the data) 13 | 14 | Every two years the Section on Statistical Graphics sponsors a special 15 | exposition where one or more data sets are made available, analyzed by 16 | anyone interested and presented in a special poster session at the 17 | Annual Meeting. 18 | 19 | For the 1993 Statistical Graphics Exposition, there are two datasets to 20 | analyze, one synthesized, one real: 21 | 22 | OSCILLATOR TIME SERIES - a synthesized univariate time series with 1024 23 | observations. These data are similar to those which might be found in a 24 | university or industrial laboratory setting, or possibly from a process 25 | monitor on a plant floor. They show obvious structure, but there is more 26 | than one feature present, and good graphics are key to uncovering the 27 | features. The objective is to find ALL the features. At the Exposition 28 | next year, the algorithm and coefficients by which the dataset was 29 | constructed will be presented, along with the stages of analysis which 30 | would uncover the features. 
Some questions to consider: 31 | 32 | * What graphics are helpful in selecting the right analytical tools? 33 | * What combinations of graphics are essential to finding all the 34 | features? 35 | * For what features are the traditional graphics and analytical 36 | tools weak? 37 | * Are there graphics that you can retrospectively develop which more 38 | clearly reveal the features which were hard to uncover? 39 | 40 | The oscillator data are available in an ASCII file, one 41 | observation per record. To obtain the data, send an email message to 42 | statlib@lib.stat.cmu.edu containing the single line: 43 | send oscillator from 1993.expo 44 | 45 | BREAKFAST CEREAL DATA (REVISED) - a multivariate dataset describing 46 | seventy-seven commonly available breakfast cereals, based on the 47 | information now available on the newly-mandated FDA food label. What 48 | are you getting when you eat a bowl of cereal? Can you get a lot of 49 | fiber without a lot of calories? Can you describe what cereals are 50 | displayed on high, low, and middle shelves? The good news is that none 51 | of the cereals for which we collected data had any cholesterol, and 52 | manufacturers rarely use artificial sweeteners and colors, nowadays. 53 | However, there is still a lot of data for the consumer to understand 54 | while choosing a good breakfast cereal. 55 | 56 | Two new variables have been added to the data (end of each record): 57 | 58 | weight (in ounces) of one serving (serving size) [weight] cups per 59 | serving [cups] 60 | 61 | Otherwise, the data are the same, except for minor typo corrections. The 62 | addition of these variables was suggested by Abbe Herzig of Consumers 63 | Union. Cereals vary considerably in their densities and listed serving 64 | sizes. Thus, the serving sizes listed on cereal labels (in weight units) 65 | translate into different amounts of nutrients in your bowl. Most people 66 | simply fill a cereal bowl (resulting in constant volume, but not 67 | weight). The new variables help standardize serving sizes, which 68 | provides other ways to differentiate and group cereals. 69 | 70 | Here are some facts about nutrition that might help you in your 71 | analysis. Nutritional recommendations are drawn from the references at 72 | the end of this document: 73 | 74 | * Adults should consume between 20 and 35 grams of dietary fiber per 75 | day. 76 | * The recommended daily intake (RDI) for calories is 2200 for women 77 | and 2900 for men. 78 | * Calories come in three food components. There are 9 calories per 79 | gram of fat, and 4 calories per gram of carbohydrate and protein. 80 | * Overall, in your diet, no more than 10% of your calories should be 81 | consumed from simple carbohydrates (sugars), and no more than 30% 82 | should come from fat. The RDI of protein is 50 grams for women and 83 | 63 grams for men. The balance of calories should be consumed in 84 | the form of complex carbohydrates (starches). 85 | * The average adult with no defined risk factors or other dietary 86 | restrictions should consume between 1800 and 2400 mg of sodium per 87 | day. 88 | * The type and amount of milk added to cereal can make a significant 89 | difference in the fat and protein content of your breakfast. 90 | 
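For example, a cereal with 110 calories, 1 gram of fat, and 12 grams of
sugars gets about 8% of its calories from fat (9/110) but about 44% from
sugars (48/110), well over the 10% guideline for simple carbohydrates.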
91 | One possible task is to develop a graphic that would allow the consumer 92 | to quickly compare a particular cereal to other possible choices. Some 93 | additional questions to consider, and try to answer with effective 94 | graphics: 95 | 96 | * Can you find the correlations you might expect? Are there any 97 | surprising correlations? 98 | * What is the true "dimensionality" of the data? 99 | * Are there any cereals which are virtually identical? 100 | * Is there any way to discriminate among the major manufacturers by 101 | cereal characteristics, or do they each have a "balanced 102 | portfolio" of cereals? 103 | * Do the nutritional claims made in cereal advertisements stand the 104 | scrutiny of data analysis? 105 | * Are there cereals which are clearly nutritionally superior, or 106 | inferior? Are there clusters of cereals? 107 | * Is a ranking or scoring scheme possible or reasonable, and if so, 108 | are there cereals which are nutritionally superior or inferior 109 | under all reasonable weighting schemes? 110 | 111 | The variables of the dataset are listed below, in order. For 112 | convenience, we suggest that you use the variable name supplied in 113 | square brackets. 114 | 115 | Breakfast cereal variables: cereal name [name] manufacturer (e.g., 116 | Kellogg's) [mfr] type (cold/hot) [type] calories (number) [calories] 117 | protein(g) [protein] fat(g) [fat] sodium(mg) [sodium] dietary fiber(g) 118 | [fiber] complex carbohydrates(g) [carbo] sugars(g) [sugars] display 119 | shelf (1, 2, or 3, counting from the floor) [shelf] potassium(mg) 120 | [potass] vitamins & minerals (0, 25, or 100, respectively indicating 121 | 'none added'; 'enriched, often to 25% FDA recommended'; '100% of FDA 122 | recommended') [vitamins] weight (in ounces) of one serving (serving 123 | size) [weight] cups per serving [cups] 124 | 125 | Manufacturers are represented by their first initial: A=American Home 126 | Food Products, G=General Mills, K=Kelloggs, N=Nabisco, P=Post, Q=Quaker 127 | Oats, R=Ralston Purina. 128 | 129 | The breakfast cereal data are available in an ASCII file, one 130 | cereal per record, with underscores in place of the spaces in the cereal 131 | name, and spaces separating the different variables. The value -1 132 | indicates missing data. To obtain the data, send an email message to: 133 | statlib@lib.stat.cmu.edu containing the single line: 134 | 135 | send cereal from 1993.expo 136 | 137 | Work alone or put together a team of data analysts to look at one or 138 | both of these two data sets! Try to answer the questions posed here or 139 | conduct an exploratory analysis to find and answer your own questions. 140 | 141 | To participate in the Exposition, you must submit a contributed paper 142 | abstract for inclusion in the formal ASA Contributed Paper Program. This 143 | reserves a poster session slot for you. Your abstract, on the official 144 | ASA abstract form, is due by the contributed paper deadline, February 1, 145 | 1993. 146 | 147 | If you do not have electronic mail access, try to get the data files 148 | from someone who already has them. If you cannot obtain the data via 149 | electronic mail, contact David Coleman, AMCT-D, Alcoa Technology Center, 150 | Alcoa Center, PA 15069, or e-mail COLEMAN1@ncf.al.alcoa.com 151 | 152 | 153 | References: 154 | 155 | National Research Council, 1989a. "Diet and Health: Implications for 156 | Reducing Chronic Disease Risk". National Academy Press, Washington, D.C. 157 | 158 | National Research Council, 1989b. "Recommended Dietary Allowances, 10th 159 | Ed." National Academy Press, Washington, D.C. 160 | 161 | National Cancer Institute, 1987. 
"Diet, Nutrition, and Cancer 162 | Prevention: A Guide to Food Choices," NIH Publ. No. 87-2878. National 163 | Institutes of Health, Public Health Service, U.S. Department of Health 164 | and Human Service, U.S. Government Printing Office, Washington, D.C. 165 | 166 | --------------------------------------------------------------------------------