├── .RData ├── .Rhistory ├── CART.R ├── Charts & Plots.R ├── DataSets(Excel & Csv files) ├── BicycleRidership.xlsx ├── Breakfast_Cereals.xlsx ├── Cutoffdata.xlsx ├── E-Commerce.xlsx ├── Financial_Reporting.xlsx ├── FlightDetails.xlsx ├── InternetCorruption.xlsx ├── LiftPrediction.xlsx ├── Mypromooffers.xlsx ├── Promoffers-9 Variables.xlsx ├── Promoffers.xlsx ├── SedanCar.xlsx └── UsedCars.xlsx ├── Dimensionality_Reduction.R ├── Installation.R ├── KNN.R ├── Logistic Regression.R ├── Multiple Linear Regression.R ├── Naive Bayes.R ├── Partitioning&Regression.R ├── Performance_Metrices.R ├── PredictionMetrics.R ├── RIntro.R ├── Simple Line Plotting.R ├── Specialized Visualization Techniques.R ├── Visualization Techniques.R ├── Welch's T-Test.R ├── cereal.Rdata ├── cereal_data_set.zip └── cereal_data_set ├── cereal.csv ├── cereal.txt ├── cereal.xls ├── cereal2.xls └── cereal_source.txt /.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/.RData -------------------------------------------------------------------------------- /.Rhistory: -------------------------------------------------------------------------------- 1 | install.packages(c("xlsx","rminer","MASS","leaps","class","e1071", 2 | "rpart","rgart.plot","neuralnet","nnet","devtools", 3 | "caret","arules","arulesviz","cluster","xts","matrixcalc", 4 | "forecast","igraph","treemap","rworldmap","ggmap"), 5 | dependencies = T) 6 | df2= read.xlsx("SedanCar.xlsx",1, header = T) 7 | library(xlsx) 8 | df= read.xlsx(file.choose(),1,header = T) 9 | df= read.xlsx(file.choose(),1,header = T) 10 | library(xlsx) 11 | df= read.xlsx(file.choose(),1,header = T) 12 | View(df) 13 | df1= read.xlsx("C:/Users/Harshit/Desktop/Business Analytics in R/SedanCar.xlsx", 14 | 1, header = T) 15 | View(df1) 16 | setwd("C:/Users/Harshit/Desktop/Business Analytics in R/") 17 | df2= read.xlsx("SedanCar.xlsx",1, header = T) 18 | v= c(1,2,3) 19 | v[0] 20 | v[1] 21 | v[0]=6 22 | v[0] 23 | v 24 | v[-1] 25 | v[-2] 26 | v[1:3] 27 | v[1:2] 28 | v[1] 29 | v[3] 30 | sum(v1) 31 | sum(v) 32 | library(matrixcalc) 33 | library(matrixcalc) 34 | df$Ownership 35 | df$Ownership 36 | is.vector(df$Ownership) 37 | is.vector(df$Annual_Income) 38 | df[3,] 39 | 3,1 40 | df[3,1] 41 | df[1,3] 42 | df[1,] 43 | is.factor(df$Ownership) 44 | library(xlsx) 45 | df= read.xlsx(file.choose(),1,header = T) 46 | View(df) 47 | df= df[,!apply(is.na(df),2,all)] 48 | head(df) 49 | summary(df) 50 | plot(df$Annual_Income, df$Household_Area, las= 1, 51 | slab= "Annual Income (a.'lakhs)", ylab= "Houshold Area (00s fts)", 52 | xlim= c(2,12), ylim= c(13,25), pch=c(21,19)[as.numeric(df$Ownership)]) 53 | plot(df$Annual_Income, df$Household_Area, las= 1, 54 | xlab= "Annual Income (a.'lakhs)", ylab= "Houshold Area (00s fts)", 55 | xlim= c(2,12), ylim= c(13,25), pch=c(21,19)[as.numeric(df$Ownership)]) 56 | legend("bottomright",inset= 0.005,c("Owner","Nonowner"), 57 | pch=c(19,21),cex= 0.7,x.intersp = 0.5, y.intersp = 0.5) 58 | df[df$Annual_Income>5 & df$Annual_Income<8.5 & df$Household_Area>18 59 | & df$Household_Area<20, c(1,2)] 60 | df[df$Annual_Income>5 & df$Annual_Income<8.5 & df$Household_Area>18 61 | & df$Household_Area<20, c(1,2)] 62 | abline(h=18.8, col=3) 63 | segments(7,0,7,18.8,col = 3) 64 | segments(5.8,18.8,5.8,26,col = 3) 65 | df[df$Annual_Income>6 & df$Annual_Income<8.5 & df$Household_Area>18 66 | & df$Household_Area<21, c(1,2)] 67 | 
segments(5.8,19.5,13,19.5,col = 3) 68 | df[df$Annual_Income>6 & df$Annual_Income<8.5 & df$Household_Area>18 69 | & df$Household_Area<21, c(1,2)] 70 | segments(5.8,19.5,13,19.5,col = 3) 71 | df[df$Annual_Income<7 & df$Household_Area>17 & df$Household_Area<19, c(1,2)] 72 | segments(0,18.2,7,18.2,col = 3) 73 | x=6 74 | if(x>7){ 75 | x=x+1 76 | }else if(x>8){ 77 | x=x+2 78 | }else { 79 | x=x+3} 80 | print(x) 81 | n = 5 82 | sum = 1 83 | while(n!=0) 84 | { 85 | sum = sum*n 86 | print(sum) 87 | n = n - 1 88 | if(sum > 50) 89 | { 90 | print("It’s gonna rain") 91 | } 92 | else 93 | { 94 | print("It’s not gonna rain") 95 | } 96 | } 97 | n=100 98 | sum=0 99 | for(i in seq(1,n,1)){ 100 | sum=sum+i 101 | print(c(i,sum)) 102 | if(sum>15) 103 | break 104 | } 105 | x1<-matrix(1:9,3,3) 106 | x2<-matrix(11:19,3,3) 107 | m = rbind(apply(x1,1,sum),apply(x2,1,sum)) 108 | y = apply(m,1,mean) 109 | print(y) 110 | x = c(1:4) 111 | y = c(6,7) 112 | print(x + y) 113 | x <- c("a",1, 3>2) 114 | print(as.logical(x)) 115 | vec1 = c(1,2,3) 116 | vec2 = c("R","Scilab","Java") 117 | vec3 = c("For prototyping", "For prototyping", "For Scaleup") 118 | mylist= cbind(vec1,vec2,vec3) 119 | mylist[2][3]="matlab" 120 | mylist[2,3]="matlab" 121 | mylist 122 | mylist= list(vec1,vec2,vec3) 123 | mylist 124 | mylist[2][3]="matlab" 125 | mylist[2,3]= "matlab" 126 | mylist[[2]][3] = "matlab" 127 | mylist 128 | list(mylist, list(vec4, c(10,11,12))) 129 | list(mylist, list(c(10,11,12))) 130 | list(vec4 = c("10","11","12"), mylist) 131 | A =matrix(c(1:9), nrow = 3, ncol = 3, byrow = T) 132 | A 133 | A[2,] 134 | A[2,3] 135 | diag(A) = c(99,100,101) 136 | A 137 | x <- matrix(1:4, 2, 2) 138 | y <- matrix(rep(10, 4), 2, 2) 139 | print(x * y) 140 | x 141 | y 142 | circle_mimo= function(dia) 143 | { 144 | area=(pi*dia^2)/4 145 | circum=pi*dia 146 | result=c("area"=area, "circum"=circum) 147 | return(result) 148 | } 149 | print(circle_mimo(4)) 150 | x <- 1 151 | f <- function() { 152 | y <- 2 153 | return(c(x, y)) 154 | } 155 | f() 156 | func <- function(){ 157 | X<-3 158 | Y<-x+3 159 | return(c(X,Y)) 160 | } 161 | print(Y) 162 | vec1 = c(1,2,3) 163 | vec2 = c("R","Scilab","Java") 164 | vec3 = c("For prototyping", "For prototyping", "For Scaleup") 165 | df= data.frame(vec1 = c(1,2,3),vec2 = c("R","Scilab","Java"),vec3 = c("For prototyping", "For prototyping", "For Scaleup") 166 | ) 167 | df 168 | print(df[1:2,]) 169 | rbind(df,data.frame(vec1=4,vec2="C",vec3="For Scaleup")) 170 | cbind(df,data.frame(vec4 = c(10,20,30,40))) 171 | cbind(data.frame(vec4 = c(10,20,30,40)),df) 172 | cbind(data.frame(vec4 = c(10,20,30,40),df)) 173 | cbind(df,data.frame(vec4 = c(10,20,30,40))) 174 | cbind(df.data.frame(vec4 = c(10,20,30,40))) 175 | df 176 | df= rbind(df,data.frame(vec1=4,vec2="C",vec3="For Scaleup")) 177 | cbind(df,data.frame(vec4 = c(10,20,30,40))) 178 | df= cbind(df,data.frame(vec4 = c(10,20,30,40))) 179 | df 180 | pd =data.frame ("Name"=c("Senthil","Senthil","Sam", "Sam"), "Month"=c("Jan","Feb","Jan","Feb"), "BS" = c(141.2,139.3,135.2,160.1), 181 | "BP" = c(90,78,80,81)) 182 | pd 183 | library(reshape2) 184 | pd_new = recast(pd,id.var=c("Name","Month"),variable+Month~Name) 185 | print(pd_new) 186 | subset[pd, pd$BS > 140] 187 | subset[pd$BS > 140] 188 | subset(pd$BS > 140,pd) 189 | subset(pd, pd$BS > 140) 190 | a = data.frame(x1= c("A","B","C"), x2=1:3) 191 | b = data.frame(x1= c("A","B","D"), x2=c("Yes","No","Yes")) 192 | a 193 | b 194 | left_join(a,b,by='x1') 195 | install.packages("dplyr",dependencies = T) 196 | left_join(a,b,by='x1') 197 | 
left_join(a,b) 198 | p<-left_join(a,b) 199 | library(dplyr) 200 | p<-left_join(a,b) 201 | left_join(a,b,by='x1') 202 | left_join(b,a,by='x1') 203 | list(mylist, list(vec4, c(10,11,12))) 204 | list(mylist, list(vec4= c(10,11,12))) 205 | library(xlsx) 206 | #Classification Trees 207 | #Sedancar.xlsx 208 | df= read.xlsx(file.choose(),1,header = T) 209 | df= df[,!apply(is.na(df),2,all)] 210 | str(df) 211 | df$Household_Area 212 | df$columns 213 | columns(df) 214 | help(apply) 215 | par(mar=c(5.1,5.1,5.1,5.1)) 216 | plot(df$Annual_Income, df$Household_Area, las=1, 217 | xlab= "Annual Income", ylab= "Household Area", 218 | xlim = c(2,12), ylim= c(13,25), pch= c(21,19)[as.numeric(df$Ownership)]) 219 | plot(x=df$Annual_Income,y= df$Household_Area, las=1, 220 | xlab= "Annual Income", ylab= "Household Area", 221 | xlim = c(2,12), ylim= c(13,25), pch= c(21,19)[as.numeric(df$Ownership)]) 222 | a= mat(1:9,3,3) 223 | a= matrix(1:9,3,3) 224 | a 225 | a= matrix(1:9,3,3,T) 226 | a 227 | a 228 | as.data.frame(a) 229 | kt= apply(a,2,sum) 230 | kt 231 | kt= apply(a,1,sum) 232 | kt= apply(a,1,all) 233 | kt 234 | legend("bottomright", inset= 0.005, c("owner","Non-owner"), 235 | pch= c(19,21), cex= 0.7, x.intersp = 0.5, y.intersp = 0.5) 236 | help(legend) 237 | #First split 238 | abline(h=18.8) 239 | #First split 240 | abline(h=18.8,v=3) 241 | #First split 242 | abline(h=18.8) 243 | plot(x=df$Annual_Income,y= df$Household_Area, las=1, 244 | xlab= "Annual Income", ylab= "Household Area", 245 | xlim = c(2,12), ylim= c(13,25), pch= c(21,19)[as.numeric(df$Ownership)]) 246 | legend("bottomright", inset= 0.005, c("owner","Non-owner"), 247 | pch= c(19,21), cex= 0.7, x.intersp = 0.5, y.intersp = 0.5) 248 | #First split 249 | abline(h=18.8) 250 | df.sort() 251 | df 252 | df.sort(Annual_Income) 253 | df.sort('Annual_Income') 254 | sort(df$Annual_Income) 255 | head(sort(df$Annual_Income),-1)+ diff(sort(df$Annual_Income)/2) 256 | sort(df$Annual_Income) 257 | sort(df$Annual_Income) 258 | sort(df$Annual_Income) 259 | diff(sort(df$Annual_Income)) 260 | df$Annual_Income 261 | help(diff) 262 | length(diff(sort(df$Annual_Income))) 263 | #For Categorical variablles 264 | # set of categories are divided into two subsets 265 | p1= seq(0,1,0.1) 266 | p1 267 | gini= NULL 268 | for(i in 1:length(p1)) { 269 | gini[i]=1-(p1[i]^2 + (1-p1[i])^2) 270 | } 271 | gini 272 | plot(p1,gini, ylab= "Gini index",type= "l") 273 | entropy= NULL 274 | for(i in 1:length(p1)) { 275 | entropy[i]= -(p1[i]*log2(p1[i])+ (1-p1[i])*log2(1-p1[i])) 276 | } 277 | plot(spline(p1,entropy), type= "l", xlab= "p1", ylab= "Entropy Measure") 278 | help("spline") 279 | plot(spline(p1,gini), ylab= "Gini index",type= "l") 280 | plot(p1,gini, ylab= "Gini index",type= "l") 281 | #First split in sedanCar example 282 | summary(df$Ownership) 283 | giorg= 1-(10/20)^2-(10/20)^2 284 | emorg= -(10/20)*log2(10/20)- (10/20)*log2(10/20) 285 | giorg 286 | emorg= -(10/20)*log2(10/20)- (10/20)*log2(10/20) 287 | emorg 288 | #upper rectangle 289 | giniurec= 1- (7/10)^2- (3/10)^2 290 | emurec= -(7/10)*log2(7/10)- (3/10)*log2(3/10) 291 | ginilrec= giniurec 292 | emlrec= emurec 293 | giniurec 294 | emurec 295 | plot(x=df$Annual_Income,y= df$Household_Area, las=1, 296 | xlab= "Annual Income", ylab= "Household Area", 297 | xlim = c(2,12), ylim= c(13,25), pch= c(21,19)[as.numeric(df$Ownership)]) 298 | legend("bottomright", inset= 0.005, c("owner","Non-owner"), 299 | pch= c(19,21), cex= 0.7, x.intersp = 0.5, y.intersp = 0.5) 300 | par(mar=c(5.1,5.1,5.1,5.1)) 301 | plot(x=df$Annual_Income,y= 
df$Household_Area, las=1, 302 | xlab= "Annual Income", ylab= "Household Area", 303 | xlim = c(2,12), ylim= c(13,25), pch= c(21,19)[as.numeric(df$Ownership)]) 304 | #First split 305 | abline(h=18.8) 306 | #Second Split 307 | segments(7,0,7,18.8) 308 | #Final Stage 309 | segments(5.8,18.8,5.8,26) 310 | segments(5.8,19.5,13,19.5) 311 | segments(0,18.2,7,18.2) 312 | library(rpart) 313 | library(rpart) 314 | mod = rpart(Ownership~. , method= "class", data= df, 315 | control= rpart.control(cp= 0, minsplit = 2, minbucket= 1, 316 | maxcompete = 0, maxsurrogate = 0 ,xval= 0),parms= list(split= "gini")) 317 | par(mar= c(0,0,0,0), oma= c(0,0,0,0), xpd= NA) 318 | plot(mod, uniform=T, branch= 0.3, compress = T, 319 | margin = 0.1, nspace=1) 320 | help("rpart") 321 | par(mar= c(0,0,0,0), oma= c(0,0,0,0), xpd= NA) 322 | plot(mod, uniform=T, branch= 0.3, compress = T, 323 | margin = 0.1, nspace=1) 324 | text(mod,splits= T, use.n = T, all= F, minlength = 0, 325 | cex= 0.8) 326 | help(plt) 327 | help(plot) 328 | abline(v=5.95) 329 | install.packages("rpart.plot") 330 | prp(mod,type=1, extra=1 , under= T, varlen= 0, cex= 0.7, 331 | compress = T, Margin = 0 , digits = 0 , 332 | split.cex = 0.8, under.cex = 0.8) 333 | prp(mod,type=1, extra=1 , under= T, varlen= 0, cex= 0.7, 334 | compress = T, Margin = 0 , digits = 0 , 335 | split.cex = 0.8, under.cex = 0.8) 336 | help(prp) 337 | help("prp"") 338 | a 339 | as 340 | exit 341 | ; 342 | . 343 | / 344 | \ 345 | `` 346 | `` 347 | ~ 348 | ~~~~ 349 | 3 350 | = 351 | } 352 | [] 353 | ) 354 | 0 355 | "" 356 | help(rpart.plot) 357 | help("rpart.plot") 358 | library(rpart.plot) 359 | help("rpart.plot") 360 | prp(mod,type=1, extra=1 , under= T, varlen= 0, cex= 0.7,compress = T, Margin = 0 , digits = 0 , 361 | split.cex = 0.8, under.cex = 0.8) 362 | #Node numbering 363 | prp(mod,type=1, extra=1 , under= T, varlen= 0, cex= 0.7, 364 | compress = T, Margin = 0 , digits = 0 , 365 | split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6) 366 | #First split 367 | modsub= snip.rpart(mod,toss=c(6:7, 12:13, 24:25)) 368 | prp(modsub,type=1, extra=1 , under= T, varlen= 0, cex= 0.7, 369 | compress = T, Margin = 0 , digits = 0 , 370 | split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6) 371 | #First 3 splits 372 | modsub1= snip.rpart(mod,toss=c(3,6:712:13, 24:25)) 373 | prp(modsub1,type=1, extra=1 , under= T, varlen= 0, cex= 0.7, 374 | compress = T, Margin = 0 , digits = 0 , 375 | split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6) 376 | #First 3 splits 377 | modsub1= snip.rpart(mod,toss=c(3,6:7,12:13, 24:25)) 378 | prp(modsub1,type=1, extra=1 , under= T, varlen= 0, cex= 0.7, 379 | compress = T, Margin = 0 , digits = 0 , 380 | split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6) 381 | #First 3 splits 382 | modsub1= snip.rpart(mod,toss=c(3,6:7,12:13, 24:25)) 383 | prp(modsub1,type=1, extra=1 , under= T, varlen= 0, cex= 0.7, 384 | compress = T, Margin = 0 , digits = 0 , 385 | split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6) 386 | prp(modsub1,type=1, extra=1 , under= T, varlen= 0, cex= 0.7, 387 | compress = T, Margin = 0 , digits = 0 , 388 | split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6) 389 | help("prp") 390 | summary(mod) 391 | summary(mod) 392 | -------------------------------------------------------------------------------- /CART.R: -------------------------------------------------------------------------------- 1 | library(xlsx) 2 | 3 | #Classification Trees 4 | #Sedancar.xlsx 5 | df= read.xlsx(file.choose(),1,header = T) 6 | df= df[,!apply(is.na(df),2,all)] 7 | str(df) 8 | 9 
data.frame("Household Number"= 1:20, "Annual Income (in lakhs)"= df$Annual_Income,
           "House Area (in 00s sq ft)"= df$Household_Area,
           "Ownership of Sedan Car"= df$Ownership,
           check.names = F)

par(mar=c(5.1,5.1,5.1,5.1))
plot(x=df$Annual_Income, y= df$Household_Area, las=1,
     xlab= "Annual Income", ylab= "Household Area",
     xlim = c(2,12), ylim= c(13,25), pch= c(21,19)[as.numeric(df$Ownership)])
legend("bottomright", inset= 0.005, c("Owner","Non-owner"),
       pch= c(19,21), cex= 0.7, x.intersp = 0.5, y.intersp = 0.5)

#First split
abline(h=18.8)

#Possible set of split values
# For numerical variables:
# midpoints between pairs of consecutive values of a variable,
# ranked by the impurity (heterogeneity) reduction
# in the resulting rectangular parts

sort(df$Annual_Income)
head(sort(df$Annual_Income),-1) + diff(sort(df$Annual_Income))/2
sort(df$Household_Area)
head(sort(df$Household_Area),-1) + diff(sort(df$Household_Area))/2

#For categorical variables
# the set of categories is divided into two subsets
p1= seq(0,1,0.1)
gini= NULL
for(i in 1:length(p1)) {
  gini[i]= 1-(p1[i]^2 + (1-p1[i])^2)
}
plot(p1, gini, ylab= "Gini index", type= "l")

entropy= NULL
for(i in 1:length(p1)) {
  entropy[i]= -(p1[i]*log2(p1[i]) + (1-p1[i])*log2(1-p1[i]))
}
plot(spline(p1,entropy), type= "l", xlab= "p1", ylab= "Entropy Measure")

#First split in the SedanCar example
summary(df$Ownership)
giorg= 1-(10/20)^2-(10/20)^2
emorg= -(10/20)*log2(10/20) - (10/20)*log2(10/20)

#upper rectangle
giniurec= 1- (7/10)^2- (3/10)^2
emurec= -(7/10)*log2(7/10) - (3/10)*log2(3/10)
ginilrec= giniurec # the upper and lower rectangles have symmetric proportions
emlrec= emurec

ginisplit1= (10/20)*giniurec + (10/20)*ginilrec
emsplit1= (10/20)*emlrec + (10/20)*emurec

# impurity reduction achieved by the split (parent minus weighted children)
ginidelta= giorg - ginisplit1
emdelta= emorg - emsplit1
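# A compact restatement of the impurity arithmetic above (a sketch; the
# function name `impurity` and argument `counts` are ours, not part of rpart).
# p*log2(p) is treated as 0 when p= 0, so the formulas match the plots above.
impurity= function(counts, type= c("gini","entropy")) {
  type= match.arg(type)
  p= counts/sum(counts)
  p= p[p > 0]
  if(type == "gini") 1 - sum(p^2) else -sum(p*log2(p))
}
impurity(c(10,10))                                   # root node: 0.5
(10/20)*impurity(c(7,3)) + (10/20)*impurity(c(3,7))  # weighted first split: 0.42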
#Second Split
segments(7,0,7,18.8)

#Final Stage
segments(5.8,18.8,5.8,26)
segments(5.8,19.5,13,19.5)
segments(0,18.2,7,18.2)

library(rpart)
#method = "class" for a classification tree
#method = "anova" for a regression tree

mod = rpart(Ownership~. , method= "class", data= df,
            control= rpart.control(cp= 0, minsplit = 2, minbucket= 1,
                                   maxcompete = 0, maxsurrogate = 0, xval= 0),
            parms= list(split= "gini"))
par(mar= c(0,0,0,0), oma= c(0,0,0,0), xpd= NA)
plot(mod, uniform=T, branch= 0.3, compress = T,
     margin = 0.1, nspace=1)
text(mod, splits= T, use.n = T, all= F, minlength = 0,
     cex= 0.8)

install.packages("rpart.plot")
library(rpart.plot)
prp(mod, type=1, extra=1, under= T, varlen= 0, cex= 0.7, compress = T,
    Margin = 0, digits = 0, split.cex = 0.8, under.cex = 0.8)

#Node numbering
prp(mod, type=1, extra=1, under= T, varlen= 0, cex= 0.7,
    compress = T, Margin = 0, digits = 0,
    split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6)

#First split
modsub= snip.rpart(mod, toss=c(6:7, 12:13, 24:25))
prp(modsub, type=1, extra=1, under= T, varlen= 0, cex= 0.7,
    compress = T, Margin = 0, digits = 0,
    split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6)
#First 3 splits
modsub1= snip.rpart(mod, toss=c(3,6:7,12:13, 24:25))
prp(modsub1, type=1, extra=1, under= T, varlen= 0, cex= 0.7,
    compress = T, Margin = 0, digits = 0,
    split.cex = 0.8, under.cex = 0.8, nn=T, nn.cex= 0.6)

summary(mod)

#################
###promooffers###

df= read.xlsx(file.choose(),1,header = T)
df= df[,!apply(is.na(df),2,all)]
str(df)

tPIN= table(as.factor(df$Pin.Code))
PINnames= dimnames(tPIN)[[1]]

C_PINcode= NULL
for(x in PINnames) {
  C_PINcode= c(C_PINcode, length(which(as.character(df$Pin.Code)==x & df$Promoffer==1)))
}
barplot(C_PINcode, names.arg = PINnames, xlab= "PIN Code", las=3,
        ylab= "Promotional offers Accepted", ylim=c(0,20), cex.names= 0.6)

table(as.factor(C_PINcode))

for(x in PINnames) {
  index= which(as.character(df$Pin.Code)==x)
  df[index,]$Pin.Code= rep(C_PINcode[which(PINnames==x)], length(index))
}

df$Pin.Code= as.factor(df$Pin.Code)
df$Education= as.factor(df$Education)
df$Promoffer= as.factor(df$Promoffer)
df$Online= as.factor(df$Online)

str(df)

mod = rpart(Promoffer~. , method= "class", data= df,
            control= rpart.control(cp= 0, minsplit = 2, minbucket= 1,
                                   maxcompete = 0, maxsurrogate = 0, xval= 0),
            parms= list(split= "gini"))

mod_predict= predict(mod, df, type= "class")
table("Actual value"=df$Promoffer, "Predicted value"=mod_predict)
mean(mod_predict==df$Promoffer)

toss1= as.integer(row.names(mod$frame))
x= mod$frame$var

### REGRESSION TREES ###
###usedcars dataset###

df= read.xlsx(file.choose(),1,header = T)
df= df[,!apply(is.na(df),2,all)]
str(df)

Age= 2017-df$Mfg_Year
df= cbind(df,Age)

dfb= df
df= df[,-c(1,2,3,11)]

str(df)
df$Transmission= as.factor(df$Transmission)
str(df)

## Partitioning (60%:40%)
partidx= sample(1:nrow(df), 0.6*nrow(df), replace= F)
dftrain= df[partidx,]
dftest= df[-partidx,]
library(rpart)
mod = rpart(Price~. , method= "anova", data= dftrain,
            control= rpart.control(cp= 0, minsplit = 2, minbucket= 1,
                                   maxcompete = 0, maxsurrogate = 0, xval= 0))
# (parms= list(split=) only applies to classification trees, so it is not
# passed to this "anova" regression tree)

# No of decision nodes
nrow(mod$splits)

# No of terminal nodes
nrow(mod$frame)-nrow(mod$splits)

toss1= as.integer(row.names(mod$frame)); toss1

DFP= data.frame("toss"= toss1, "Svar"=mod$frame$var,
                "CP"=mod$frame$complexity); DFP

# drop terminal nodes, which rpart labels "<leaf>" in frame$var
DFP1= DFP[DFP$Svar!="<leaf>",]; DFP1

DFP2= DFP1[order(DFP1$CP, -DFP1$toss, decreasing = T),]; DFP2

rownames(DFP2)= 1:nrow(DFP2); DFP2

toss2= DFP2$toss
--------------------------------------------------------------------------------
/Charts & Plots.R:
--------------------------------------------------------------------------------
library(xlsx)

#BicycleRidership.xlsx
df= read.xlsx(file.choose(), 1, header = T)
df= df[,!apply(is.na(df), 2,all)]
df= df[,1:2]
head(df)

#Line Graph
tsv= ts(df$Riders, start= c(2004,1), end=c(2017,3), frequency = 12)
plot(tsv, xlab= "year", ylab="Riders", las=1) # las= styling for axis labels

at1= seq(as.Date("2004-01-01"), as.Date("2017-03-01"), by="2 years")
labels1= format(at1,"%b-%Y")
at2= format(at1,"%Y")

par()$mar
par(mar=c(8,4,4,2)+0.1)

plot(tsv, xlab="", ylab="", xaxt="n", yaxt="n")
axis(1, at=as.numeric(at2), labels = labels1, las=2) # axis positions must be numeric
axis(2, las=2)
mtext(side=1, text="Month-Year", line= 5.0)
mtext(side=2, text="Riders", line= 3.3)

graphics.off()
par()$mar

#UsedCars.xlsx
df1= read.xlsx(file.choose(),1, header= T)
df1= df1[,!apply(is.na(df1), 2,all)]

Age= 2017- df1$Mfg_Year
df1= cbind(df1,Age)
df1= df1[,-c(1,2,3)]

head(df1)
str(df1)
df1$Transmission= as.factor(df1$Transmission)
df1$C_Price= as.factor(df1$C_Price)
str(df1)
summary(df1)

#Scatter plots
range(df1$KM)
range(df1$Price)
plot(df1$KM, df1$Price, xlim= c(18,180), ylim= c(1,75), xlab= "KM", ylab = "Price")

df1= df1[df1$Price<70,]
dfb= df1
df1= df1[-23,]

range(df1$KM)
range(df1$Price)
plot(df1$KM, df1$Price, xlim= c(18,180), ylim= c(1,15), xlab= "KM", ylab = "Price")

#Bar Chart
avgPrice= c(mean(df1[which(df1$Transmission=='0'),]$Price),
            mean(df1[which(df1$Transmission=='1'),]$Price))
Trans= c("0","1")

range(avgPrice)

barplot(avgPrice, names.arg = Trans, xlab= "Transmission",
        ylab="Average-Price", ylim= c(0,6))

pAll= c((length(which(df1$Transmission=='0'))/length(df1$Transmission))*100,
        (length(which(df1$Transmission=='1'))/length(df1$Transmission))*100)

barplot(pAll, names.arg = Trans, xlab= "Transmission",
        ylab="% of all records", ylim= c(0,100))

#Histograms
range(df1$KM)
range(df1$Price)
hist(df1$Price, main="", xlim=c(-5,20), ylim= c(0,50), xlab="Price")

#Boxplot
boxplot(df1$Price~df1$Transmission, ylim= c(0,15), xlab="Transmission",
        ylab="Price")
means= by(df1$Price, df1$Transmission, mean)
points(1:2, means, pch=19)

range(df1$KM)
--------------------------------------------------------------------------------
/DataSets(Excel & Csv files)/BicycleRidership.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/BicycleRidership.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/Breakfast_Cereals.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/Breakfast_Cereals.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/Cutoffdata.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/Cutoffdata.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/E-Commerce.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/E-Commerce.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/Financial_Reporting.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/Financial_Reporting.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/FlightDetails.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/FlightDetails.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/InternetCorruption.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/InternetCorruption.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/LiftPrediction.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/LiftPrediction.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/Mypromooffers.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/Mypromooffers.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/Promoffers-9 Variables.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/Promoffers-9 Variables.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/Promoffers.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/Promoffers.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/SedanCar.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/SedanCar.xlsx -------------------------------------------------------------------------------- /DataSets(Excel & Csv files)/UsedCars.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/DataSets(Excel & Csv files)/UsedCars.xlsx -------------------------------------------------------------------------------- /Dimensionality_Reduction.R: -------------------------------------------------------------------------------- 1 | library(xlsx) 2 | 3 | df1= read.xlsx(file.choose(),1, header= T) 4 | df1= df1[,!apply(is.na(df1), 2,all)] 5 | 6 | Age= 2017- df1$Mfg_Year 7 | df1= cbind(df1,Age) 8 | dfb= df1 9 | df1= df1[,-c(1,2,3)] 10 | 11 | head(df1) 12 | str(df1) 13 | 14 | #Summary Statistics 15 | countblank= function(x) sum(x=="") 16 | 17 | dfsum= data.frame(Average= sapply(df1[,-1],mean),Median= sapply(df1[,-1],median), 18 | Min= sapply(df1[,-1],min),Max= sapply(df1[,-1],max), 19 | Std= sapply(df1[,-1],sd),Count= sapply(df1[,-1],length), 20 | Countblank=sapply(df1[,-1],countblank)) 21 | round(dfsum,digits = 2) 22 | 23 | M= cor(df1[,-c(1,5,8)]);M 24 | M[upper.tri(M)]=NA;M 25 | print(round(M,digits = 2),na.print = "") 26 | symnum(M) 27 | 28 | #Reducing Categories 29 | Age_groups= levels(as.factor(df1$Age)) 30 | Age_groups2= as.numeric(Age_groups) 31 | C_PricebyAge1= NULL 32 | C_PricebyAge2= NULL 33 | #Group1 has less than Rs400000 cost 34 | # Rest lies in Group2 35 | for(x in Age_groups2) { 36 | C_PricebyAge1= c(C_PricebyAge1, 37 | 100* sum(df1$Age==x & df1$C_Price==0)/sum(df1$Age==x)) 38 | C_PricebyAge2= c(C_PricebyAge2, 39 | 100* sum(df1$Age==x & df1$C_Price==1)/sum(df1$Age==x)) 40 | } 41 | C_PricebyAge= matrix(c(C_PricebyAge1, C_PricebyAge2),nrow = 2, 42 | ncol= length(Age_groups), byrow= T) 43 | #palette(c("purple","green")) 44 | barplot(C_PricebyAge, names.arg = Age_groups, xlab= "Age", 45 | legend.text = c("0","1"), args.legend = list(x="topright"), 46 | main= "Distribution of C_Price by Age",col = c("blue","green"), 47 | ylim = c(0,100), xlim = c(0,12)) 48 | 49 | Sales= c(45,50, 55,100,51,56,61,125,60,65,70,145,68,74,79,165) 50 | 51 | tsv = ts(Sales, start = c(2012,1),end= c(2015,4), frequency = 4) 52 | 53 | plot(tsv, xlab= "Quarter", ylab= "Sales(in crores)", las=2 , ylim=c(0,180)) 54 | 55 | #BreakfastCereals.xlsx 56 | df2= read.xlsx(file.choose(),1,header = T) 57 | df2= df2[, !apply(is.na(df2), 2,all)] 58 | 59 | df2=cereal 60 | dim(df2) 61 | df2$vitamins= as.factor(df2$vitamins) 62 | df2$vitamins= as.numeric(df2$vitamins) 63 | 64 | df2$mfr= as.factor(df2$mfr) 65 | 
df2$mfr= as.numeric(df2$mfr)

sum= NULL
for(x in 1:dim(df2)[1]) {
  csum= 0
  for(y in df2[x,-c(1,9,11)]) {
    csum= csum+y
  }
  sum= c(sum,csum)  # row totals (equivalently: rowSums(df2[,-c(1,9,11)]))
}
df2$weight= sum

df3= as.data.frame(lapply(df2[,-c(1,9,11,12)], function(x){x=100*(x/df2$weight)}))
df3= cbind(df3, df2[,c(1,9,11)])

range(df3$potassium)
range(df3$fibre)

plot(df3$potassium, df3$fibre, xlab="POTASSIUM", ylab="FIBRE")

v1= var(df3$potassium)
v2= var(df3$fibre)
c12= cov(df3$potassium, df3$fibre)
matrix(c(v1,c12,c12,v2),2,2,T)

cor(df3$potassium, df3$fibre)

v1+v2
100*v1/(v1+v2)
100*v2/(v1+v2)

#Principal Component Analysis
dfpca= df3[,c(8,5)]
mod= prcomp(dfpca)

#adding PC directions to the plot
slp= with(mod, rotation[2,1]/rotation[1,1])
int= with(mod, center[2]-slp*center[1])

#First principal component
abline(coef= c(int,slp))
mod$rotation

slp1= -1/slp
int1= with(mod, center[2]-slp1*center[1])

#Second principal component
abline(coef= c(int1,slp1))
mod$rotation

head(mod$x)
dfpca[1,]
First= mod$rotation[1,1]*(dfpca[1,1]-mean(dfpca[,1]))+
  mod$rotation[1,2]*(dfpca[1,2]-mean(dfpca[,2])); First

vz1= var(mod$x[,1])
vz2= var(mod$x[,2])
vz1+vz2
100*vz1/(vz1+vz2)
100*vz2/(vz1+vz2)
--------------------------------------------------------------------------------
/Installation.R:
--------------------------------------------------------------------------------
install.packages(c("xlsx","rminer","MASS","leaps","class","e1071",
                   "rpart","rpart.plot","neuralnet","nnet","devtools",
                   "caret","arules","arulesviz","cluster","xts","matrixcalc",
                   "forecast","igraph","treemap","rworldmap","ggmap"),
                 dependencies = T)
install.packages("dplyr", dependencies = T)
--------------------------------------------------------------------------------
/KNN.R:
--------------------------------------------------------------------------------
library(xlsx)  # needed for read.xlsx below
df= read.xlsx(file.choose(),1,header = T)
df= df[,!apply(is.na(df),2,all)]

#Normalization
dfb= df
df[,1:2]= scale(df[,1:2], center = T, scale = T)

partidx= sample(1:nrow(df),15, replace = F)
dftrain= df[partidx,]
dftest= df[-partidx,]

#Modeling
library(class)
# Building '4NN'
mod= knn(train= dftrain[,1:2], test= dftest[,1:2],
         cl= dftrain$Ownership, k=4)
summary(mod)

#Classification Matrix
table("Actual value"= dftest$Ownership, "Predicted value"= mod)

mean(mod!=dftest$Ownership)

#Choosing k
modtrain = NULL
modtest = NULL
errtrain = NULL
errtest = NULL

dftrain= as.data.frame(dftrain)
dftest= as.data.frame(dftest)

for(i in 1:15) {
  modtrain= knn(train = dftrain[,1:2], test=dftrain[,1:2],
                cl= dftrain[,3], k=i)
  modtest= knn(train = dftrain[,1:2], test=dftest[,1:2],
               cl= dftrain[,3], k=i)
  errtrain[i]= 100*mean(modtrain!=dftrain$Ownership)
  errtest[i]= 100*mean(modtest!=dftest$Ownership)
}

dfp = data.frame("valueofk"=1:15, "ErrorTraining"=errtrain,
                 "ErrorValidation"=errtest)
round(dfp, digits = 2)
range(dfp$ErrorValidation)
plot(dfp$valueofk, dfp$ErrorValidation, las=1, type="l",
     xlab="value of k", ylab= "Validation Error",
     xlim= c(0,16), ylim=c(0,65))
lines(dfp$valueofk, dfp$ErrorTraining)

#Best k
min(errtest)
bestk= dfp[which(errtest==min(errtest)),1]
#or
bestk= dfp[which.min(errtest),1]
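# A complementary way to choose k (a sketch, not in the original script):
# leave-one-out cross-validation on the training partition via class::knn.cv,
# instead of a single training/validation split; errloocv is our own name.
errloocv= NULL
for(i in 1:15) {
  modcv= knn.cv(train= dftrain[,1:2], cl= dftrain$Ownership, k= i)
  errloocv[i]= 100*mean(modcv != dftrain$Ownership)
}
which.min(errloocv)  # value of k suggested by leave-one-out CV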
#Predicting the class of new observations
#(Annual_Income= 6, Household_Area= 20) and (5, 15): the training data were
#standardized above, so new cases must be scaled with the same means and
#standard deviations (taken from the unscaled copy dfb)
ctr= colMeans(dfb[,1:2])
sds= apply(dfb[,1:2], 2, sd)
new1= scale(matrix(c(6,20),1,2), center= ctr, scale= sds)
new2= scale(matrix(c(5,15),1,2), center= ctr, scale= sds)
modnew1= knn(train = dftrain[,1:2], test = new1,
             cl= dftrain$Ownership, k=bestk)
modnew2= knn(train = dftrain[,1:2], test = new2,
             cl= dftrain$Ownership, k=bestk)
--------------------------------------------------------------------------------
/Logistic Regression.R:
--------------------------------------------------------------------------------
#### Probability, odds, and logit
## odds= p/(1-p)
curve(p/(1-p), from=0, to=1, type= "l", xname = "p", las= 1,
      xlab= "Probability of success", ylab= "odds", xaxt= "n")
## logit= log(odds)= log(p/(1-p))
curve(log(p/(1-p)), from=0, to=1, type= "l", xname = "p", las= 1,
      xlab= "Probability of success", ylab= "logit", xaxt= "n")
axis(1, pos=0)

df= read.xlsx(file.choose(),1,header = T)
df= df[,!apply(is.na(df),2,all)]
str(df)

dfb= df
df= df[,-5]
df$Promoffer= as.factor(df$Promoffer)
df$Online= as.factor(df$Online)

## Partitioning (60%:40%)
partidx= sample(1:nrow(df), 0.6*nrow(df), replace= F)
dftrain= df[partidx,]
dftest= df[-partidx,]

mod= glm(Promoffer ~ Income, family = binomial(link="logit"), data= dftrain)
summary(mod)

b0= unname(mod$coefficients[1])
b1= unname(mod$coefficients[2])

# P(Promoffer= Yes | Income= x) = 1/(1+e^-(b0+b1*x))

range(dftrain$Income)
plot(dftrain$Income, as.numeric(as.character(dftrain$Promoffer)),
     type="p", xlab = "Income", ylab= "Promoffer")
curve(1/(1+exp(-(mod$coefficients[[1]]+mod$coefficients[[2]]*x))),
      xlim= c(0,250), type= "l", xname= "x", add = T)

mod1= glm(Promoffer ~ ., family = binomial(link="logit"), data= dftrain)
summary(mod1)

#P= odds/(1+odds)
curve(odds/(1+odds), from =0, to=100, type="l", xname= "odds",
      xlab= "Odds", ylab= "Probability of Success")

#P= exp(logit)/(1+exp(logit))
curve(exp(logit)/(1+exp(logit)), from =-100, to=100, type="l", xname= "logit",
      xlab= "logit", ylab= "Probability of Success")

modtest= predict(mod1, dftest[,-c(3)], type= "response")
### "response" returns probabilities

modtestl= predict(mod1, dftest[,-c(3)], type= "link")
### "link" returns logit values

modtestc= ifelse(modtest>0.5,1,0)

table("Actual value"=dftest$Promoffer, "Predicted"=modtestc)

mean(modtestc == dftest$Promoffer)
mean(modtestc != dftest$Promoffer)

head(data.frame(
  "Predicted class"= modtestc,
  "Actual class"= dftest$Promoffer,
  "Prob for 1(success)"= modtest,
  "Log odds"= modtestl,
  dftest[,-3], check.names = F
))
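# Interpreting the coefficients (a brief sketch): exponentiating a logistic
# coefficient gives the multiplicative change in the odds of success per unit
# increase in the corresponding predictor.
exp(coef(mod))   # simple model: odds multiplier per unit of Income
exp(coef(mod1))  # full model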
#Cumulative Lift Curve
dflift= data.frame("Probability of class 1"=modtest,
                   "Actual class"= as.numeric(as.character(dftest$Promoffer)),
                   check.names = F)

dflift= dflift[order(dflift[,1], decreasing = T),]
CumActualClass= cumsum(dflift[,2])
dflift= cbind(dflift, CumActualClass)
head(dflift)

plot(1:nrow(dflift), dflift$CumActualClass, "l",
     xlab = "# cases", ylab="Cumulative", xlim= c(0,2100),
     ylim = c(0,210))
legend(800,70, inset=0.005,
       c("Cumulative Promoffer when sorted using predicted values",
         "Cumulative Promoffer using average"),
       lty= c(1,2), bty= "n", cex= 0.7, x.intersp=0.3, y.intersp= 0.5)

################ Flight Details ################
df= read.xlsx(file.choose(),1,header = T)
df= df[,!apply(is.na(df),2,all)]
str(df)

dfb= df
df$STD= strptime(format(df$STD, "%H:%M:%S"), "%H:%M:%S")
df$ATD= strptime(format(df$ATD, "%H:%M:%S"), "%H:%M:%S")
df$STA= strptime(format(df$STA, "%H:%M:%S"), "%H:%M:%S")
df$ATA= strptime(format(df$ATA, "%H:%M:%S"), "%H:%M:%S")

breaks= seq(strptime("00:00:00","%H:%M:%S"), strptime("24:00:00","%H:%M:%S"),
            by= "6 hours")
labelsv= c("0-6","6-12","12-18","18-24")
DEPT= cut(df$ATD, breaks= breaks, right= F, labels = labelsv)

df1= cbind(df, DEPT)

df1$Day= as.factor(df1$Day)
levels(df1$Day)
levels(df1$Day)= c("Sunday","Monday")
df1$FLTIME= as.difftime(as.character(df1$FLTIME))

str(df1)
head(df1)

dfb1= df1
df1= df1[,-c(1,3,5:8)]
str(df1)
head(df1)
--------------------------------------------------------------------------------
/Multiple Linear Regression.R:
--------------------------------------------------------------------------------
library(xlsx)
df= read.xlsx(file.choose(),1,T)
df= df[,!apply(is.na(df),2,all)]
head(df)

Age= 2017- df$Mfg_Year
df= cbind(df, Age)

dfb= df
df= df[,-c(1,2,3,11)]
df$Transmission= as.factor(df$Transmission)

#Partitioning (60%:40%)
partidx= sample(1:nrow(df), 0.6*nrow(df), replace=F)
dftrain= df[partidx,]
dftest= df[-partidx,]

mod= lm(Price ~ ., dftrain)
summary(mod)
#anova(mod)

#Goodness of fit
gf= c(mod$df.residual, summary(mod)$r.squared, summary(mod)$sigma,
      anova(mod)["Residuals","Sum Sq"])
gf= as.data.frame(gf, optional = T)
rownames(gf)= c("Residual df","Multiple R-Squared","Std. Dev. Estimate",
                "Residual SS")

modtest= predict(mod, dftest[,-4])
Residuals= dftest$Price-modtest
head(data.frame("Actual Value"=dftest$Price, "Predicted Value"=modtest,
                Residuals, check.names = F))

library(rminer)
M= mmetric(dftest$Price, modtest, c("SSE","RMSE","ME"))

boxplot(Residuals, main= "Box Plot of residuals", ylab= "Residual",
        ylim= c(-6,7), las= 1)
quantile(Residuals, probs=c(0.25,0.75))

hist(df$Price, main= "", xlab="Price")

#Normal Probability Plot
qqnorm(df$Price)
qqline(df$Price)

#########
library(xlsx)
df= read.xlsx(file.choose(),1,T)
df= df[,!apply(is.na(df),2,all)]
head(df)

Age= 2017- df$Mfg_Year
df= cbind(df, Age)

dfb= df
df= df[,-c(1,2,3,11)]
df$Transmission= as.factor(df$Transmission)

plot(df$KM, df$Price, xlim=c(18,180), ylim = c(1,75),
     xlab= "KM", ylab="Price")
df= df[-c(13,23,29,73),]

plot(df$KM, df$Price, xlim=c(25,115), ylim = c(1,14),
     xlab= "KM", ylab="Price")

#Partitioning (60%:40%)
partidx= sample(1:nrow(df), 0.6*nrow(df), replace=F)
dftrain= df[partidx,]
dftest= df[-partidx,]

#Variable Selection
#Exhaustive Search
library(leaps)
mod3= regsubsets(Price ~., data= dftrain, nbest = 1, nvmax = NULL,
                 force.in = NULL, force.out = NULL,
                 method = "exhaustive", intercept = T)
mod3summ= summary(mod3)

countspch= function(x) sum(x=="*")
om= as.integer(apply(mod3summ$outmat,2,countspch))
data.frame("Coeff"=as.integer(apply(mod3summ$outmat,1,countspch)),
           "RSS"=mod3summ$rss,
           "Cp"=round(mod3summ$cp, digits = 2),
           "R-sq"=round(mod3summ$rsq, digits = 2),
           "Adj.R-sq"=round(mod3summ$adjr2, digits = 2),
           mod3summ$outmat[,order(-om)])

#Coefficients of subset models
coef(mod3,1:8)
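# Choosing among the subset models (a sketch): common rules are the size that
# maximizes adjusted R-squared or the size whose Mallows' Cp is smallest.
which.max(mod3summ$adjr2)  # subset size with the best adjusted R-squared
which.min(mod3summ$cp)     # subset size minimizing Cp
coef(mod3, which.max(mod3summ$adjr2))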
#Partial Iterative Searching:
#Forward Selection
mod4= regsubsets(Price ~., data= dftrain, nbest = 1, nvmax = NULL,
                 force.in = NULL, force.out = NULL,
                 method = "forward", intercept = T)
mod4summ= summary(mod4)

countspch= function(x) sum(x=="*")
om1= as.integer(apply(mod4summ$outmat,2,countspch))
data.frame("Coeff"=as.integer(apply(mod4summ$outmat,1,countspch)),
           "RSS"=mod4summ$rss,
           "Cp"=round(mod4summ$cp, digits = 2),
           "R-sq"=round(mod4summ$rsq, digits = 2),
           "Adj.R-sq"=round(mod4summ$adjr2, digits = 2),
           mod4summ$outmat[,order(-om1)])

coef(mod4,1:8)

#Backward Elimination
mod5= regsubsets(Price ~., data= dftrain, nbest = 1, nvmax = NULL,
                 force.in = NULL, force.out = NULL,
                 method = "backward", intercept = T)
mod5summ= summary(mod5)

countspch= function(x) sum(x=="*")
om2= as.integer(apply(mod5summ$outmat,2,countspch))
data.frame("Coeff"=as.integer(apply(mod5summ$outmat,1,countspch)),
           "RSS"=mod5summ$rss,
           "Cp"=round(mod5summ$cp, digits = 2),
           "R-sq"=round(mod5summ$rsq, digits = 2),
           "Adj.R-sq"=round(mod5summ$adjr2, digits = 2),
           mod5summ$outmat[,order(-om2)])

coef(mod5,1:8)

#Sequential Replacement
mod6= regsubsets(Price ~., data= dftrain, nbest = 1, nvmax = NULL,
                 force.in = NULL, force.out = NULL,
                 method = "seqrep", intercept = T)
mod6summ= summary(mod6)

countspch= function(x) sum(x=="*")
om3= as.integer(apply(mod6summ$outmat,2,countspch))
data.frame("Coeff"=as.integer(apply(mod6summ$outmat,1,countspch)),
           "RSS"=mod6summ$rss,
           "Cp"=round(mod6summ$cp, digits = 2),
           "R-sq"=round(mod6summ$rsq, digits = 2),
           "Adj.R-sq"=round(mod6summ$adjr2, digits = 2),
           mod6summ$outmat[,order(-om3)])

coef(mod6,1:8)

#Stepwise Regression
mod7= step(lm(Price~., data = dftrain), direction = "both")
#options(op)
--------------------------------------------------------------------------------
/Naive Bayes.R:
--------------------------------------------------------------------------------
library(xlsx)

# FlightDetails.xlsx
df= read.xlsx(file.choose(),1,header = T)
df= df[,!apply(is.na(df),2,all)]
df= df[!apply(is.na(df),1,all),]
head(df)
str(df)

dfb= df
df= dfb
# Correct arrival and departure times
df$STD= format(df$STD, "%H:%M:%S")
df$STD= as.POSIXlt(paste(df$Date,df$STD), format = "%Y-%m-%d %H:%M:%S")
df$ATD= format(df$ATD, "%H:%M:%S")
df$ATD= as.POSIXlt(paste(df$Date,df$ATD), format = "%Y-%m-%d %H:%M:%S")
df$STA= format(df$STA, "%H:%M:%S")
df$STA= as.POSIXlt(paste(df$Date,df$STA), format = "%Y-%m-%d %H:%M:%S")
df$ATA= format(df$ATA, "%H:%M:%S")
df$ATA= as.POSIXlt(paste(df$Date,df$ATA), format = "%Y-%m-%d %H:%M:%S")

head(df)
str(df)

dfb2= df

df= dfb
df$STD= strptime(format(df$STD, "%H:%M:%S"),"%H:%M:%S")
df$ATD= strptime(format(df$ATD, "%H:%M:%S"),"%H:%M:%S")
df$STA= strptime(format(df$STA, "%H:%M:%S"),"%H:%M:%S")
df$ATA= strptime(format(df$ATA, "%H:%M:%S"),"%H:%M:%S")

head(df)
str(df)

#Break departure time into appropriate time intervals
range(df$ATD)
breaks = seq(strptime("00:00:00", "%H:%M:%S"), strptime("24:00:00","%H:%M:%S"),
             by = "6 hours")
labelsv= c("0-6","6-12","12-18","18-24")
DEPT= cut(df$ATD, breaks= breaks, right= F, labels = labelsv)

df= cbind(df, DEPT)

df$Day= as.factor(df$Day)
levels(df$Day)
levels(df$Day)= c("Sunday","Monday")

head(df)
str(df)

dfb3= df
df= df[,-c(1,3,5,8)]
str(df)
head(df)

partidx= sample(1:nrow(df), 0.6*nrow(df), replace = F)
dftrain= df[partidx,]
dftest= df[-partidx,]

library(e1071)
mod= naiveBayes(Flight.Status ~ ., dftrain)
attributes(mod)

mod$apriori
mod$tables
path= ""
write.xlsx(dftrain, paste0(path, "FlightDetails.xlsx"))  # paste0 joins path and file name

mod$tables$Flight.Carrier
mod$tables$Flight.Carrier[1,3]
mod$tables$Flight.Carrier["ontime","Indigo"]

# Find exact matches, as complete (exact) Bayes would require
dftrain[which(dftrain$Flight.Carrier=="Indigo" &
              dftrain$SRC=="MAA" &
              dftrain$DEST=="IXC" &
              dftrain$Day=="Monday" &
              dftrain$DEPT=="0-6"),]

#NAIVE BAYES formula (numerators):
# P(delayed|Example)
p1= (mod$apriori[["delayed"]]/nrow(dftrain))*
  (mod$tables$Flight.Carrier["delayed","Indigo"])*
  (mod$tables$SRC["delayed","MAA"])*
  (mod$tables$DEST["delayed","IXC"])*
  (mod$tables$DEPT["delayed","0-6"])*
  (mod$tables$Day["delayed","Monday"])
print(p1, digits=4)

# P(ontime|Example)
p2= (mod$apriori[["ontime"]]/nrow(dftrain))*
  (mod$tables$Flight.Carrier["ontime","Indigo"])*
  (mod$tables$SRC["ontime","MAA"])*
  (mod$tables$DEST["ontime","IXC"])*
  (mod$tables$DEPT["ontime","0-6"])*
  (mod$tables$Day["ontime","Monday"])
print(p2, digits=4)
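# Cross-check of the hand computation (a sketch): scoring one synthetic row
# with the same attribute values should reproduce the normalized posteriors
# p1/(p1+p2) and p2/(p1+p2) computed below, up to predict()'s default
# threshold for zero cells. newobs is our own name; it is built from a
# training row so the factor levels match the fitted model.
newobs= dftrain[1, names(dftrain) != "Flight.Status"]
newobs$Flight.Carrier= "Indigo"; newobs$SRC= "MAA"; newobs$DEST= "IXC"
newobs$Day= "Monday"; newobs$DEPT= "0-6"
predict(mod, newobs, type= "raw")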
# Actual probabilities
# P(delayed|Example)
p1/(p1+p2)
# P(ontime|Example)
p2/(p1+p2)

#Scoring test partition
modtest= predict(mod, dftest[,-5], type="class")
modtestp= predict(mod, dftest[,-5], type="raw")

table("Actual class"=dftest$Flight.Status, "Predicted class"=modtest)
head(data.frame("Predicted class"=modtest,
                "Actual class"=dftest$Flight.Status,
                "Prob for 1(success)"=modtestp[,"delayed"],
                dftest[-5]))

#Classification accuracy
mean(modtest==dftest$Flight.Status)
#Misclassification rate
mean(modtest!=dftest$Flight.Status)

#Scoring training partition
modtrain= predict(mod, dftrain[,-5])
table("Actual class"=dftrain$Flight.Status, "Predicted class"=modtrain)

#Classification accuracy
mean(modtrain==dftrain$Flight.Status)
#Misclassification rate
mean(modtrain!=dftrain$Flight.Status)

#Cumulative Lift Curve
cases = 1:nrow(dftest)
modtestn= dftest$Flight.Status
levels(modtestn)= c(1,0) # c("delayed","ontime")
modtestn= as.numeric(as.character(modtestn))
dfl= data.frame("prob"=modtestp[,"delayed"], "actual class"=modtestn)
dfl= dfl[order(-dfl$prob),]

cumAC= NULL
cumAC[1]= dfl$actual.class[1]
for(i in 2:nrow(dfl)) {
  cumAC[i]= cumAC[i-1] + dfl$actual.class[i]
}

plot(cases, cumAC, type="l",
     xlab="# cases", ylab= "Cumulative", xlim=c(0,50), ylim = c(0,20))
segments(0,0,nrow(dfl),cumAC[nrow(dfl)], lty= 3)
legend(25,5, inset= 0.005,
       c("Cumulative 1's sorted by predicted values",
         "Cumulative 1's using average"),
       lty= c(1,2), cex= 0.7, x.intersp = 0.3, y.intersp = 0.3)
--------------------------------------------------------------------------------
/Partitioning&Regression.R:
--------------------------------------------------------------------------------
dfh= data.frame("Promotions"=c(2.00,3.50,6.00,6.50,7.50,8.00,9.00),
                "sales"=c(5.00,8.00,5.50,14.00,13.50,14.50,13.50)); dfh

summary(dfh)

plot(dfh$Promotions, dfh$sales, las=1,
     xlab= "Promotions (in crores)", ylab="Sales (in crores)",
     xlim= c(0,10), ylim= c(0,16))

lines(spline(dfh$Promotions, dfh$sales, method= "fmm"))
lines(smooth.spline(dfh$Promotions, dfh$sales))

library(xlsx)

df= read.xlsx(file.choose(),1,header = T)
df= df[,!apply(is.na(df),2,all)]

names(df)

df[1:9,]

# sorting for outlier detection
head(data.frame("KM"=df$KM, "SR_Price"=df$SR_Price,
                "Mfg_year"=df$Mfg_Year)[order(-df$KM),])

Age= 2017- df$Mfg_Year
df= cbind(df,Age)
df1= df[,-c(1,2,3)]
head(df1)

set.seed(12345)

partidx= sample(1:nrow(df1), 0.5*nrow(df1), replace = F)
df1train= df1[partidx,]
df1test= df1[-partidx,]

mod= lm(Price ~ ., df1train)
summary(mod)
Residualtrain= df1train$Price- mod$fitted.values
head(data.frame("Actual value"= df1train$Price,
                "Predicted value"= mod$fitted.values,
                Residualtrain))
modtest = predict(mod, df1test[,-c(4)])
Residualtest= df1test$Price- modtest
head(data.frame("Actual value"=df1test$Price, "Predicted value"=modtest,
                Residualtest))

install.packages("rminer", dependencies = T)
library(rminer)
mmetric(df1train$Price, mod$fitted.values, c("SSE","RMSE","ME"))
mmetric(df1test$Price, modtest, c("SSE","RMSE","ME"))
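# For reference (a sketch): the same three quantities computed directly from
# the test residuals, assuming ME is the plain mean residual.
c(SSE= sum(Residualtest^2),
  RMSE= sqrt(mean(Residualtest^2)),
  ME= mean(Residualtest))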
--------------------------------------------------------------------------------
/Performance_Metrices.R:
--------------------------------------------------------------------------------
library(xlsx)

df= read.xlsx(file.choose(),1, header= T)
df= df[,!apply(is.na(df), 2,all)]

plot(df$Annual_Income, df$Household_Area, las=1, xlab= "Annual Income",
     ylab= "Household Area", xlim= c(2,12), ylim = c(13,25),
     pch= c(21,19)[as.numeric(df$Ownership)])

legend("bottomright", inset= 0.005, c("Owner", "Non-owner"), pch= c(19,21),
       cex= 0.7, x.intersp = 0.5, y.intersp = 0.5)

#Promoffers.xlsx
df1= read.xlsx(file.choose(),1, header= T)
df1= df1[,!apply(is.na(df1), 2,all)]

palette()
palette(c("gray","black"))

plot(df1$Income, df1$Spending, xlim=c(0,225), ylim=c(0,11),
     xlab="Income", ylab="Spending", col= as.factor(df1$Promoffer),
     pch=19, cex=0.8, panel.first = grid())

plot(jitter(df1$Income,1), df1$Spending, xlim=c(0,225), ylim=c(0,11),
     xlab="Income", ylab="Spending", col= as.factor(df1$Promoffer),
     pch=20, cex=0.8, panel.first = grid())
par(mar=c(4,4,1,1), oma=c(1,1,1,1))

plot(jitter(df1$Income,1), df1$Spending, log= "xy",
     xlab="Income", ylab="Spending", col= as.factor(df1$Promoffer),
     pch=20, cex=0.7, panel.first = grid())
palette("default")

#Classification Matrix
cm= matrix(c(400,50,25,2525), 2,2,T, list(c("1","0"),c("1","0")))

err= (cm['0','1']+cm['1','0'])/sum(cm)
accuracy= (cm['1','1']+cm['0','0'])/sum(cm)
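# Two related rates from the same matrix (a sketch), treating "1" as the
# positive class; rows of cm are actual values, columns are predictions.
sensitivity= cm['1','1']/(cm['1','1']+cm['1','0'])  # true positive rate
specificity= cm['0','0']/(cm['0','0']+cm['0','1'])  # true negative rate
c(sensitivity, specificity)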
#ROC Curve
df2= read.xlsx(file.choose(),1, header= T)
df2= df2[,!apply(is.na(df2), 2,all)]
df2= df2[!apply(is.na(df2),1,all),]

data.frame("cutoffvalue"=df2$cutoff, "OneMinusSpecificity"=1-df2$specificity,
           "Sensitivity"=df2$senstivity)[order(df2$cutoff, decreasing = T),]
plot(1-df2$specificity, df2$senstivity, type= "p",
     xlab = "1-Specificity", ylab= "Sensitivity", pch=19)

plot(1-df2$specificity, df2$senstivity, type= "s",
     xlab = "1-Specificity", ylab= "Sensitivity", pch=19)
segments(0,0,1,1, lty= 3)
legend("right", inset= 0.005, c("Random", "ROC"), lty= c(2,1), bty="n",
       cex= 0.7, x.intersp = 0.3, y.intersp = 0.3)

#Cumulative Lift Curve / Gains Chart
df3= read.xlsx(file.choose(),1, header= T)
df3= df3[,!apply(is.na(df3), 2,all)]
df3= df3[!apply(is.na(df3),1,all),]

range(df3$Cumulative.Actual.Class)
range(df3$Serial.no.)
plot(df3$Serial.no., df3$Cumulative.Actual.Class, type = "l",
     xlab= "# Cases", ylab= "Cumulative", xlim= c(0,30), ylim= c(0,14))
segments(0,0,24,12, lty= 3)
segments(1,1,12,12, lty= 4, col= "red")
segments(12,12,24,12, lty= 4, col= "red")
legend(22,10, inset= 0.005, c("Cumulative 1's sorted by predicted values",
                              "Cumulative 1's using random selection"),
       lty= c(1,2), bty="n", cex= 0.7, x.intersp = 0.3, y.intersp = 0.3)

#Decile Chart
decilecases= round(seq(0.1,1,0.1)*length(df3$Serial.no.))
decile= NULL
decilemean= NULL
globalmean= length(which(df3$Actual.Class==1))/length(df3$Actual.Class)
j=0
for(i in decilecases) {
  j= j+1
  decilemean[j]= df3$Cumulative.Actual.Class[i]/i
  decile[j]= decilemean[j]/globalmean
}
barplot(decile, names.arg = as.factor(seq(1,10,1)), xlab="Deciles",
        ylab= "Decile mean/Global mean", ylim = c(0,2.5))

#Cumulative lift curve (gains chart) incorporating costs
#Cutoffdata.xlsx
df4= read.xlsx(file.choose(), 5, colIndex = 1:5, T)
df4= df4[,!apply(is.na(df4), 2,all)]
head(df4)

range(df4$Cumulative.Cost.)
range(df4$Serial.no.)
plot(df4$Serial.no., df4$Cumulative.Cost., type= "l", xlab = "# cases",
     ylab="Cumulative costs", xlim= c(0,25), ylim=c(5,140))
segments(0,0,24,132, lty = 3)
legend(22,10, inset= 0.005, c("Cumulative costs sorted by predicted values",
                              "reference line"),
       lty= c(1,2), bty="n", cex= 0.7, x.intersp = 0.3, y.intersp = 0.3)
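# Area under the ROC curve plotted above (a sketch), via the trapezoidal rule
# over the (1-specificity, sensitivity) points; assumes the cutoffs cover the
# whole range so the points span 0 to 1.
rocx= 1-df2$specificity
rocord= order(rocx)
sum(diff(rocx[rocord])*
      (head(df2$senstivity[rocord],-1)+tail(df2$senstivity[rocord],-1))/2)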
--------------------------------------------------------------------------------
/PredictionMetrics.R:
--------------------------------------------------------------------------------
library(xlsx)
df= read.xlsx(file.choose(),1,T)
df= df[,!apply(is.na(df),2,all)]
head(df)

plot(df$Serial.No, df$Cumulative.value, type = "l",
     xlab= "# cases", ylab= "Cumulative value",
     xlim= c(0,25), ylim= c(40,550))
segments(0,0,20,544, lty=3)
legend(12,200, inset=0.005,
       c("Cumulative value sorted by predicted value",
         "reference line"),
       lty= c(1,2), bty= "n", cex= 0.7, x.intersp= 0.3, y.intersp= 0.3)
--------------------------------------------------------------------------------
/RIntro.R:
--------------------------------------------------------------------------------
library(xlsx)

df= read.xlsx(file.choose(),1,header = T)

df1= read.xlsx("C:/Users/Harshit/Desktop/Business Analytics in R/SedanCar.xlsx",
               1, header = T)

setwd("C:/Users/Harshit/Desktop/Business Analytics in R/")

df2= read.xlsx("SedanCar.xlsx",1, header = T)

library(matrixcalc)
#matrix.inverse(Mat2)
--------------------------------------------------------------------------------
/Simple Line Plotting.R:
--------------------------------------------------------------------------------
library(xlsx)

df= read.xlsx(file.choose(),1,header = T)
df= df[,!apply(is.na(df),2,all)]
head(df)
summary(df)

plot(df$Annual_Income, df$Household_Area, las= 1,
     xlab= "Annual Income (in lakhs)", ylab= "Household Area (00s sq ft)",
     xlim= c(2,12), ylim= c(13,25), pch=c(21,19)[as.numeric(df$Ownership)])
legend("bottomright", inset= 0.005, c("Owner","Non-owner"),
       pch=c(19,21), cex= 0.7, x.intersp = 0.5, y.intersp = 0.5)
df[df$Annual_Income>5 & df$Annual_Income<8.5 & df$Household_Area>18
   & df$Household_Area<20, c(1,2)]
abline(h=18.8, col=3)
segments(7,0,7,18.8, col = 3)
segments(5.8,18.8,5.8,26, col = 3)

df[df$Annual_Income>6 & df$Annual_Income<8.5 & df$Household_Area>18
   & df$Household_Area<21, c(1,2)]
segments(5.8,19.5,13,19.5, col = 3)

df[df$Annual_Income<7 & df$Household_Area>17 & df$Household_Area<19, c(1,2)]
segments(0,18.2,7,18.2, col = 3)
--------------------------------------------------------------------------------
/Specialized Visualization Techniques.R:
--------------------------------------------------------------------------------
library(xlsx)

df1= read.xlsx(file.choose(),1, header= T)
df1= df1[,!apply(is.na(df1), 2,all)]

Age= 2017- df1$Mfg_Year
df1= cbind(df1,Age)
dfb= df1
df1= df1[,-c(1,2,3)]

head(df1)
str(df1)
df1$Transmission= as.factor(df1$Transmission)
df1$C_Price= as.factor(df1$C_Price)
str(df1)
summary(df1)

df1= df1[df1$Price<70,]
dfb= df1
df1= df1[-23,]

head(df1)
range(df1$KM)
boxplot(df1$Price~df1$Transmission, ylim= c(0,15), xlab="Transmission",
        ylab="Price")
means= by(df1$Price, df1$Transmission, mean)
points(1:2, means, pch=3)

boxplot(df1$KM~df1$C_Price, ylim= c(25,180), xlab="C_Price",
        ylab="KM")
means1= by(df1$KM, df1$C_Price, mean)
points(1:2, means1, pch=3)

boxplot(df1$Age~df1$C_Price, ylim= c(0,12), xlab="C_Price",
        ylab="Age")
means2= by(df1$Age, df1$C_Price, mean)
points(1:2, means2, pch=3)

boxplot(df1$SR_Price~df1$C_Price, ylim= c(0,25), xlab="C_Price",
        ylab="SR_Price")
means3= by(df1$SR_Price, df1$C_Price, mean)
points(1:2, means3, pch=3)

#Heatmaps
#correlation matrix
M= cor(df1[,-c(1,5,8)])
symnum(M)
M[upper.tri(M)]= NA

#correlation table heatmap
heatmap(M, Rowv = NA, symm= T, col= grey.colors(100, start= 0.8, end=0.2),
        scale= "none", margins= c(8,4))

#Missing value heatmap
heatmap(head(as.matrix(df1[,-c(1,5,8)])), Rowv = NA, Colv = NA,
        col= grey.colors(1000, start= 0.8, end=0.0), scale= "column",
        margins= c(8,4))

heatmap(as.matrix(df1[,-c(1,5,8)]), Rowv = NA, Colv = NA,
        col= grey.colors(1000, start= 0.8, end=0.0), scale= "column",
        margins= c(8,4))

#Multidimensional Visualization
palette()
palette(rainbow(6))
palette("default")

range(df1$Age)
plot(df1$Age, df1$KM, xlim=c(0,12), xlab= "Age", ylab= "KM", col= df1$C_Price)

#separate panel for each group
Age_groups= levels(as.factor(df1$Age))
Age_groups2= as.numeric(Age_groups)
avgPrice1= NULL
avgPrice2= NULL
for(x in Age_groups2){
  avgPrice1= c(avgPrice1, mean(df1[which(df1$Age == x & df1$Transmission == 0),]$Price))
  avgPrice2= c(avgPrice2, mean(df1[which(df1$Age == x & df1$Transmission == 1),]$Price))
}
# empty Age/Transmission groups produce NaN means; zero them out with is.nan()
avgPrice1[is.nan(avgPrice1)]= 0
avgPrice2[is.nan(avgPrice2)]= 0
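# The loop above can be collapsed with tapply (a sketch; avgP is our own
# name). Empty Age/Transmission combinations come back as NA here and are
# zeroed the same way.
avgP= tapply(df1$Price, list(df1$Transmission, df1$Age), mean)
avgP[is.na(avgP)]= 0
avgP  # rows "0"/"1" = Transmission, columns = Age groups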
98 | # MULTIPANEL PLOT VISUALIZATION 99 | pairs(~ SR_Price + KM + Price + Age, data= df1) 100 | 101 | par(mfrow=c(2,2),cex= 0.6, mar= c(3,3,0,0),oma= c(1,1,1,1)) 102 | 103 | plot(df1$KM,df1$Price, xlim=c(0,180), ylim= c(0,15),xlab="",ylab="") 104 | mtext("KM",side=1, line= 2.2, cex= 0.7) 105 | mtext("Price",side=2, line= 2.2, cex= 0.7) 106 | 107 | plot(df1$KM,df1$Price,log="xy" ,xlim=c(10,1000), ylim= c(0.1,100), 108 | xaxt="n",yaxt="n",xlab="",ylab="") 109 | axis(1,at=c(10,100,1000),labels=c("10","100","1000")) 110 | axis(2,at=c(0.1,1,10,100),labels=c("0.1","1","10","100")) 111 | mtext("KM",side=1, line=2, cex=0.6) 112 | mtext("Price",side=2, line=2, cex=0.6) 113 | 114 | range(dfb$Price) 115 | boxplot(dfb$Price~dfb$Transmission, ylim=c(0,75),xlab="",ylab="") 116 | mtext("Trans",side=1, line=2, cex=0.6) 117 | mtext("Price",side=2, line=2, cex=0.6) 118 | 119 | boxplot(dfb$Price~dfb$Transmission, log="y",ylim=c(0.1,100),xlab="",ylab="") 120 | mtext("Trans",side=1, line=2, cex=0.6) 121 | mtext("Price",side=2, line=2, cex=0.6) 122 | 123 | #AGGREGATIONS , ATTACHING A CURVE , ZOOMING IN 124 | par(mfrow=c(2,2),cex= 0.6, mar= c(2.7,2.5,1,0.5),oma= c(0,0,0,0)) 125 | df= read.xlsx(file.choose(), 1 ,header = T) 126 | df= df[,!apply(is.na(df), 2,all)] 127 | tsv= ts(df$Riders, start= c(2004,1), frequency= 12) #tsv is plotted below but was never created in the original script; assuming a monthly Riders series starting Jan 2004 128 | at1= seq(as.Date("2004-01-01"), as.Date("2017-03-01"),by="2 years") 129 | labels1=format(at1,"%b-%Y") 130 | at2= as.numeric(format(at1,"%Y")) 131 | plot(tsv,xaxt="n",yaxt="n") 132 | axis(1,at=at2, labels=labels1,cex.axis=0.8) 133 | axis(2,cex.axis=0.8) 134 | mtext(side=1,text="Month" ,line=2, cex=0.6) 135 | mtext(side=2,text="Riders", line=2, cex=0.6) 136 | title(main="Overlaying a quadratic curve on Raw Series",adj=0, cex.main=0.9) 137 | 138 | lines(lowess(tsv), col="red") 139 | 140 | t=seq(1,length(df$Month), by=1) 141 | tsq= t*t 142 | points(time(tsv), predict(lm(df$Riders~t+tsq)), col="green") 143 | abline(v=at2, h=axTicks(2), col="gray", lty=3) 144 | 145 | rideBym=NULL #sum Riders by calendar month: 13 full years inside the loop, plus Jan-Mar 2017 afterwards 146 | for(it in 1:12){ 147 | rideBym[it]=0 148 | } 149 | 150 | i=1 151 | while(i<=145){ 152 | rideBym[1]=rideBym[1]+df$Riders[i] 153 | rideBym[2]=rideBym[2]+df$Riders[i+1] 154 | rideBym[3]=rideBym[3]+df$Riders[i+2] 155 | rideBym[4]=rideBym[4]+df$Riders[i+3] 156 | rideBym[5]=rideBym[5]+df$Riders[i+4] 157 | rideBym[6]=rideBym[6]+df$Riders[i+5] 158 | rideBym[7]=rideBym[7]+df$Riders[i+6] 159 | rideBym[8]=rideBym[8]+df$Riders[i+7] 160 | rideBym[9]=rideBym[9]+df$Riders[i+8] 161 | rideBym[10]=rideBym[10]+df$Riders[i+9] 162 | rideBym[11]=rideBym[11]+df$Riders[i+10] 163 | rideBym[12]=rideBym[12]+df$Riders[i+11] 164 | i=i+12 165 | } 166 | rideBym[1]=rideBym[1]+df$Riders[i] 167 | rideBym[2]=rideBym[2]+df$Riders[i+1] 168 | rideBym[3]=rideBym[3]+df$Riders[i+2] 169 | 170 | avgBym= c(rideBym[1]/14, rideBym[2]/14,rideBym[3]/14,rideBym[4]/13,rideBym[5]/13, 171 | rideBym[6]/13, rideBym[7]/13,rideBym[8]/13,rideBym[9]/13,rideBym[10]/13, 172 | rideBym[11]/13, rideBym[12]/13) 173 | 174 | tsv1= ts(avgBym, start=1 , end=12, frequency = 1) 175 | plot(tsv1, xaxt="n", yaxt="n") 176 | #tsv1 runs over times 1:12, so plain month positions and month.abb label the axis 177 | at3= 1:12 178 | axis(1,at= at3, labels = month.abb,las=3,cex.axis=0.8) 179 | axis(2,cex.axis=0.8) 180 | mtext(side=1,text="Month" ,line=2, cex=0.6) 181 | mtext(side=2,text="AvgRiders", line=2, cex=0.6) 182 | title(main="Aggregation by Month",adj=0, cex.main=0.9) 183 | 184 | abline(v= at3, h=axTicks(2), col="gray", lty=3) 185 | 
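# A compact alternative to the ride-by-month loop above (a sketch, assuming
# tsv is the monthly series built earlier): cycle() tags each observation
# with its month number 1-12, so tapply() averages per calendar month and
# handles the unequal counts (14 Januaries vs 13 Decembers) automatically.
avgBym2= tapply(as.numeric(tsv), cycle(tsv), mean)
# all.equal(as.numeric(avgBym2), avgBym) should be TRUE.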
186 | tsvz= window(tsv, start=c(2004,1), end=c(2005,12)) 187 | plot(tsvz, xaxt="n", yaxt="n") 188 | at5=NULL 189 | i=1 190 | while(i<=24){ 191 | at5=c(at5,time(tsvz)[i]) 192 | i=i+4 193 | } 194 | at6= seq(as.Date("2004-01-01"),as.Date("2005-12-01"),by="4 months") 195 | axis(1,at= at5, labels = format(at6,"%d/%m/%Y"),cex.axis=0.8) 196 | axis(2,cex.axis=0.8) 197 | mtext(side=1,text="Month" ,line=2, cex=0.6) 198 | mtext(side=2,text="AvgRiders", line=2, cex=0.6) 199 | title(main="Zooming into first 2 years",adj=0, cex.main=0.9) 200 | 201 | abline(v=at5, h=axTicks(2), col="gray", lty=3) 202 | 203 | plot(aggregate(tsv, FUN=mean),xaxt="n",yaxt="n",cex.axis=0.6) 204 | axis(1,cex.axis=0.8) 205 | axis(2,cex.axis=0.8) 206 | mtext(side=1,text="Year" ,line=2, cex=0.6) 207 | mtext(side=2,text="AvgRiders", line=2, cex=0.6) 208 | title(main="Aggregation for year",adj=0, cex.main=0.9) 209 | grid() 210 | -------------------------------------------------------------------------------- /Visualization Techniques.R: -------------------------------------------------------------------------------- 1 | library(xlsx) 2 | 3 | df1= read.xlsx(file.choose(),1, header= T) 4 | df1= df1[,!apply(is.na(df1), 2,all)] 5 | 6 | Age= 2017- df1$Mfg_Year 7 | df1= cbind(df1,Age) 8 | dffb= df1 9 | df1= df1[,-c(1,2,3)] 10 | 11 | head(df1) 12 | str(df1) 13 | df1$Transmission= as.factor(df1$Transmission) 14 | df1$C_Price= as.factor(df1$C_Price) 15 | str(df1) 16 | summary(df1) 17 | dfb=df1 18 | df1=df1[-23,] 19 | 20 | dffb[dffb$Price>70,] 21 | dffb[dffb$Price>12,] 22 | dffb[dffb$KM>150,] 23 | 24 | dffb= dffb[-c(13,23,29,65,73),] 25 | range(dffb$KM) 26 | range(dffb$Price) 27 | plot(dffb$KM,dffb$Price, xlim= c(25,120), ylim=c(1,9),xlab="KM",ylab="Price", panel.first = grid()) 28 | #dffb$Model= as.factor(dffb$Model) 29 | #dffb$Model= as.numeric(dffb$Model) 30 | text(dffb$KM,dffb$Price , dffb$Model, adj= c(-0.4,-0.4), cex= 0.5) 31 | #dffb$Model= as.factor(dffb$Model) 32 | 33 | df3= read.xlsx(file.choose(),1, header= T) 34 | df3= df3[,!apply(is.na(df3), 2,all)] 35 | 36 | palette() 37 | palette(c("gray","black")) 38 | 39 | plot(df3$Income, df3$Spending, xlim=c(0,225), ylim=c(0,11), 40 | xlab="Income", ylab="Spending", col= as.factor(df3$Promoffer), 41 | pch=19, cex=0.8, panel.first = grid()) 42 | 43 | plot(jitter(df3$Income,1), df3$Spending, xlim=c(0,225), ylim=c(0,11), 44 | xlab="Income", ylab="Spending", col= as.factor(df3$Promoffer), 45 | pch=20, cex=0.8, panel.first = grid()) 46 | par(mar=c(4,4,1,1), oma=c(1,1,1,1)) 47 | 48 | plot(jitter(df3$Income,1), df3$Spending, log= "xy", 49 | xlab="Income", ylab="Spending", col= as.factor(df3$Promoffer), 50 | pch=20, cex=0.7, panel.first = grid()) 51 | palette("default") 52 | 53 | #MULTIVARIATE PLOT 54 | #PARALLEL COORDINATES PLOT 55 | library(MASS) 56 | par(mfrow=c(2,1), cex=0.6, mar= c(3,3,0,0), oma=c(1,1,1,1)) 57 | df4= df1 58 | levels(df4$Fuel_type)=1:length(levels(df4$Fuel_type)) 59 | df4=as.data.frame(lapply(df4,FUN=as.numeric)) 60 | 61 | parcoord(df4[which(df4$C_Price=='1'),-c(4,8)]) 62 | axis(2,at=axTicks(2), labels=c("0%","20%","40%","60%","80%","100%")) 63 | grid() 64 | parcoord(df4[which(df4$C_Price=='2'),-c(4,8)],col="gray") 65 | axis(2,at=axTicks(2), labels=c("0%","20%","40%","60%","80%","100%")) 66 | grid() 67 | 
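# A single-panel variant (sketch): parcoord() also takes a col vector, so
# both C_Price classes can be overlaid in one panel instead of the two above.
parcoord(df4[,-c(4,8)], col= c("black","gray")[df4$C_Price])
grid()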
68 | #Specialized Visualization 69 | #Network Data 70 | #Network Graph 71 | #Two mode or bipartite graphs 72 | #Example for association rules 73 | 74 | item1= sample(LETTERS[1:10], size=50 ,T) 75 | pool= letters[1:10] 76 | item2=NULL 77 | for(i in 1:50) { 78 | item2=c(item2,sample(pool[-which(pool==tolower(item1[i]))],size=1,replace=T)) 79 | } 80 | df5= data.frame(item1,item2) 81 | 82 | library(igraph) 83 | g= graph_from_data_frame(df5,directed = F) 84 | 85 | V(g)$label= V(g)$name 86 | V(g)[1:10]$type=1 87 | V(g)[11:20]$type=2 88 | 89 | V(g)$color= "gray" 90 | E(g)$color= "black" 91 | V(g)$shape= "circle" 92 | 93 | V(g)$x= c(runif(10,0,5),runif(10,10,15)) 94 | V(g)$y= c(seq(10,1,by=-1),seq(10,1,by=-1)) 95 | 96 | 97 | 98 | E(g)$weight= count.multiple(g) 99 | g1= simplify(g, remove.multiple = T, edge.attr.comb = list(weight="first")) #keep one copy of the multiplicity; the default combiner would sum it across parallel edges 100 | E(g1)$width= 0.5*E(g1)$weight 101 | 102 | 103 | size= NULL 104 | for(i in V(g1)$name){ 105 | size=c(size,length(E(g1)[from(V(g1)[i])])) 106 | } 107 | V(g1)$size= 4*size 108 | par(mar= rep(.1,4)) 109 | 110 | V(g1)$color= "gray" 111 | E(g1)$color= "black" 112 | 113 | plot(g1) 114 | 115 | #Hierarchical Data 116 | #Treemaps 117 | df6= read.xlsx(file.choose(),1, header= T) 118 | df6= df6[,!apply(is.na(df6), 2,all)] 119 | 120 | library(treemap) 121 | rec.size= ifelse(df6$price>=5000, 5000+df6$price/10, df6$price) #ifelse() needs yes and no values; assuming the intent was to damp prices above 5000 so large items do not swamp the treemap 122 | df6= cbind(df6,rec.size) 123 | 124 | par(mar= rep(.1,4)) 125 | 126 | treemap(df6,index= c("item.category","subcategory","brand"), 127 | vSize= "rec.size", vColor="rating", 128 | type= "value", fun.aggregate = "mean", 129 | palette = gray(0:4/4), fontsize.labels = c(11,9,6), 130 | title= "", position.legend = "none") 131 | 132 | #Geographical data 133 | #Map chart 134 | df7= read.xlsx(file.choose(),1, header= T) 135 | df7= df7[,!apply(is.na(df7), 2,all)] 136 | library(rworldmap) 137 | 138 | mapDevice(rows= 2, columns= 1) 139 | datamap= joinCountryData2Map(df7, nameJoinColumn = "Country", joinCode = "NAME") 140 | mapCountryData(datamap, nameColumnToPlot = "Inclusive.Internet.Index", 141 | catMethod = "pretty", colourPalette = gray(7:0/7), 142 | addLegend = F) 143 | mapCountryData(datamap, nameColumnToPlot = "Corruptions.Perceptions.Index", 144 | catMethod = "pretty", colourPalette = gray(7:0/7), 145 | addLegend = F) 146 | -------------------------------------------------------------------------------- /Welch's T-Test.R: -------------------------------------------------------------------------------- 1 | library(xlsx) 2 | 3 | df= read.xlsx(file.choose(),1,header = T) 4 | df= df[,!apply(is.na(df),2,all)] 5 | head(df) 6 | summary(df) 7 | 8 | cov(df$Annual_Income, df$Household_Area) 9 | 10 | cor(df$Annual_Income, df$Household_Area) 11 | 12 | mean(df$Annual_Income) 13 | 14 | median(df$Annual_Income) 15 | 16 | IQR(df$Annual_Income) 17 | 18 | sd(df$Annual_Income) 19 | 20 | var(df$Annual_Income) 21 | 22 | apply(df[,c(1,2)],MARGIN = 2,FUN = sd) 23 | 24 | mmdiff= function(df){ 25 | apply(df,MARGIN = 2,function(x){max(x)-min(x)}) 26 | } 27 | mmdiff(df[,c(1,2)]) 28 | 29 | x= rnorm(100) 30 | y=x+ rnorm(100, mean=0, sd=0.6) 31 | 32 | df1= as.data.frame(cbind(x,y)) 33 | head(df1) 34 | summary(df1) 35 | 36 | plot(df1$x, df1$y, las=1, main= "Scatterplot of x and y", 37 | xlab= "x", ylab="y", 38 | xlim=c(-3,3),ylim=c(-4,4)) 39 | 40 | x1= rnorm(20,mean=50,5) 41 | y1= rnorm(30,mean=60,5) 42 | 43 | t.test(x1,y1,var.equal = T) 44 | 45 | qt(p=0.05/2,df= 48, lower.tail = F) #two-sided critical value for the pooled test: df = 20+30-2 = 48 46 | 47 | #welch's t-test 48 | t.test(x1,y1,var.equal = F) 49 | 50 | Ads= sample(c("AD1","AD2","NoAD"),size=100,replace=T) 51 | purchase= ifelse(Ads=='AD1', rnorm(100,mean=500,sd=80), 52 | ifelse(Ads=='AD2', rnorm(100,mean=600,sd=80), 53 | rnorm(100,mean=200,sd=80))) 54 | df2= data.frame(Ads= as.factor(Ads),purchase) 55 | head(df2) 56 | summary(df2$Ads) 57 | summary(df2[df2$Ads=='AD1',2]) 58 | 
summary(df2[df2$Ads=='AD2',2]) 59 | summary(df2[df2$Ads=='NoAD',2]) 60 | 61 | mod= aov(purchase~Ads, data= df2) 62 | summary(mod) 63 | -------------------------------------------------------------------------------- /cereal.Rdata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/cereal.Rdata -------------------------------------------------------------------------------- /cereal_data_set.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/cereal_data_set.zip -------------------------------------------------------------------------------- /cereal_data_set/cereal.csv: -------------------------------------------------------------------------------- 1 | 100%_Bran N C 70 4 1 130 10 5 6 3 280 25 1 0.33 2 | 100%_Natural_Bran Q C 120 3 5 15 2 8 8 3 135 0 1 -1 3 | All-Bran K C 70 4 1 260 9 7 5 3 320 25 1 0.33 4 | All-Bran_with_Extra_Fiber K C 50 4 0 140 14 8 0 3 330 25 1 0.5 5 | Almond_Delight R C 110 2 2 200 1 14 8 3 -1 25 1 0.75 6 | Apple_Cinnamon_Cheerios G C 110 2 2 180 1.5 10.5 10 1 70 25 1 0.75 7 | Apple_Jacks K C 110 2 0 125 1 11 14 2 30 25 1 1 8 | Basic_4 G C 130 3 2 210 2 18 8 3 100 25 1.33 0.75 9 | Bran_Chex R C 90 2 1 200 4 15 6 1 125 25 1 0.67 10 | Bran_Flakes P C 90 3 0 210 5 13 5 3 190 25 1 0.67 11 | Cap'n'Crunch Q C 120 1 2 220 0 12 12 2 35 25 1 0.75 12 | Cheerios G C 110 6 2 290 2 17 1 1 105 25 1 1.25 13 | Cinnamon_Toast_Crunch G C 120 1 3 210 0 13 9 2 45 25 1 0.75 14 | Clusters G C 110 3 2 140 2 13 7 3 105 25 1 0.5 15 | Cocoa_Puffs G C 110 1 1 180 0 12 13 2 55 25 1 1 16 | Corn_Chex R C 110 2 0 280 0 22 3 1 25 25 1 1 17 | Corn_Flakes K C 100 2 0 290 1 21 2 1 35 25 1 1 18 | Corn_Pops K C 110 1 0 90 1 13 12 2 20 25 1 1 19 | Count_Chocula G C 110 1 1 180 0 12 13 2 65 25 1 1 20 | Cracklin'_Oat_Bran K C 110 3 3 140 4 10 7 3 160 25 1 0.5 21 | Cream_of_Wheat_(Quick) N H 100 3 0 80 1 21 0 2 -1 0 1 1 22 | Crispix K C 110 2 0 220 1 21 3 3 30 25 1 1 23 | Crispy_Wheat_&_Raisins G C 100 2 1 140 2 11 10 3 120 25 1 0.75 24 | Double_Chex R C 100 2 0 190 1 18 5 3 80 25 1 0.75 25 | Froot_Loops K C 110 2 1 125 1 11 13 2 30 25 1 1 26 | Frosted_Flakes K C 110 1 0 200 1 14 11 1 25 25 1 0.75 27 | Frosted_Mini-Wheats K C 100 3 0 0 3 14 7 2 100 25 1 0.8 28 | Fruit_&_Fibre_Dates,_Walnuts,_and_Oats P C 120 3 2 160 5 12 10 3 200 25 1.25 0.67 29 | Fruitful_Bran K C 120 3 0 240 5 14 12 3 190 25 1.33 0.67 30 | Fruity_Pebbles P C 110 1 1 135 0 13 12 2 25 25 1 0.75 31 | Golden_Crisp P C 100 2 0 45 0 11 15 1 40 25 1 0.88 32 | Golden_Grahams G C 110 1 1 280 0 15 9 2 45 25 1 0.75 33 | Grape_Nuts_Flakes P C 100 3 1 140 3 15 5 3 85 25 1 0.88 34 | Grape-Nuts P C 110 3 0 170 3 17 3 3 90 25 1 0.25 35 | Great_Grains_Pecan P C 120 3 3 75 3 13 4 3 100 25 1 0.33 36 | Honey_Graham_Ohs Q C 120 1 2 220 1 12 11 2 45 25 1 1 37 | Honey_Nut_Cheerios G C 110 3 1 250 1.5 11.5 10 1 90 25 1 0.75 38 | Honey-comb P C 110 1 0 180 0 14 11 1 35 25 1 1.33 39 | Just_Right_Crunchy__Nuggets K C 110 2 1 170 1 17 6 3 60 100 1 -1 40 | Just_Right_Fruit_&_Nut K C 140 3 1 170 2 20 9 3 95 100 1.3 0.75 41 | Kix G C 110 2 1 260 0 21 3 2 40 25 1 1.5 42 | Life Q C 100 4 2 150 2 12 6 2 95 25 1 0.67 43 | Lucky_Charms G C 110 2 1 180 0 12 12 2 55 25 1 1 44 | Maypo A H 100 4 1 0 0 16 3 2 95 25 1 -1 45 | Muesli_Raisins,_Dates,_&_Almonds R C 150 4 3 95 3 16 11 3 
170 25 -1 -1 46 | Muesli_Raisins,_Peaches,_&_Pecans R C 150 4 3 150 3 16 11 3 170 25 -1 -1 47 | Mueslix_Crispy_Blend K C 160 3 2 150 3 17 13 3 160 25 1.5 0.67 48 | Multi-Grain_Cheerios G C 100 2 1 220 2 15 6 1 90 25 1 1 49 | Nut&Honey_Crunch K C 120 2 1 190 0 15 9 2 40 25 1 0.67 50 | Nutri-Grain_Almond-Raisin K C 140 3 2 220 3 21 7 3 130 25 1.33 0.67 51 | Nutri-grain_Wheat K C 90 3 0 170 3 18 2 3 90 25 1 -1 52 | Oatmeal_Raisin_Crisp G C 130 3 2 170 1.5 13.5 10 3 120 25 1.25 0.5 53 | Post_Nat._Raisin_Bran P C 120 3 1 200 6 11 14 3 260 25 1.33 0.67 54 | Product_19 K C 100 3 0 320 1 20 3 3 45 100 1 1 55 | Puffed_Rice Q C 50 1 0 0 0 13 0 3 15 0 0.5 1 56 | Puffed_Wheat Q C 50 2 0 0 1 10 0 3 50 0 0.5 -1 57 | Quaker_Oat_Squares Q C 100 4 1 135 2 14 6 3 110 25 1 0.5 58 | Quaker_Oatmeal Q H 100 5 2 0 2.7 -1 -1 1 110 0 1 0.67 59 | Raisin_Bran K C 120 3 1 210 5 14 12 2 240 25 1.33 0.75 60 | Raisin_Nut_Bran G C 100 3 2 140 2.5 10.5 8 3 140 25 1 0.5 61 | Raisin_Squares K C 90 2 0 0 2 15 6 3 110 25 1 0.5 62 | Rice_Chex R C 110 1 0 240 0 23 2 1 30 25 1 1.13 63 | Rice_Krispies K C 110 2 0 290 0 22 3 1 35 25 1 1 64 | Shredded_Wheat N C 80 2 0 0 3 16 0 1 95 0 0.83 -1 65 | Shredded_Wheat_'n'Bran N C 90 3 0 0 4 19 0 1 140 0 1 0.67 66 | Shredded_Wheat_spoon_size N C 90 3 0 0 3 20 0 1 120 0 1 0.67 67 | Smacks K C 110 2 1 70 1 9 15 2 40 25 1 0.75 68 | Special_K K C 110 6 0 230 1 16 3 1 55 25 1 1 69 | Strawberry_Fruit_Wheats N C 90 2 0 15 3 15 5 2 90 25 1 -1 70 | Total_Corn_Flakes G C 110 2 1 200 0 21 3 3 35 100 1 1 71 | Total_Raisin_Bran G C 140 3 1 190 4 15 14 3 230 100 1.5 1 72 | Total_Whole_Grain G C 100 3 1 200 3 16 3 3 110 100 1 1 73 | Triples G C 110 2 1 250 0 21 3 3 60 25 1 0.75 74 | Trix G C 110 1 1 140 0 13 12 2 25 25 1 1 75 | Wheat_Chex R C 100 3 1 230 3 17 3 1 115 25 1 0.67 76 | Wheaties G C 100 3 1 200 3 17 3 1 110 25 1 1 77 | Wheaties_Honey_Gold G C 110 2 1 200 1 16 8 1 60 25 1 0.75 78 | -------------------------------------------------------------------------------- /cereal_data_set/cereal.txt: -------------------------------------------------------------------------------- 1 | 70 4 1 130 10 5 6 280 25 1 0.33 2 | 120 3 5 15 2 8 8 135 0 1 -1 3 | 70 4 1 260 9 7 5 320 25 1 0.33 4 | 50 4 0 140 14 8 0 330 25 1 0.5 5 | 110 2 2 200 1 14 8 -1 25 1 0.75 6 | 110 2 2 180 1.5 10.5 10 70 25 1 0.75 7 | 110 2 0 125 1 11 14 30 25 1 1 8 | 130 3 2 210 2 18 8 100 25 1.33 0.75 9 | 90 2 1 200 4 15 6 125 25 1 0.67 10 | 90 3 0 210 5 13 5 190 25 1 0.67 11 | 120 1 2 220 0 12 12 35 25 1 0.75 12 | 110 6 2 290 2 17 1 105 25 1 1.25 13 | 120 1 3 210 0 13 9 45 25 1 0.75 14 | 110 3 2 140 2 13 7 105 25 1 0.5 15 | 110 1 1 180 0 12 13 55 25 1 1 16 | 110 2 0 280 0 22 3 25 25 1 1 17 | 100 2 0 290 1 21 2 35 25 1 1 18 | 110 1 0 90 1 13 12 20 25 1 1 19 | 110 1 1 180 0 12 13 65 25 1 1 20 | 110 3 3 140 4 10 7 160 25 1 0.5 21 | 100 3 0 80 1 21 0 -1 0 1 1 22 | 110 2 0 220 1 21 3 30 25 1 1 23 | 100 2 1 140 2 11 10 120 25 1 0.75 24 | 100 2 0 190 1 18 5 80 25 1 0.75 25 | 110 2 1 125 1 11 13 30 25 1 1 26 | 110 1 0 200 1 14 11 25 25 1 0.75 27 | 100 3 0 0 3 14 7 100 25 1 0.8 28 | 120 3 2 160 5 12 10 200 25 1.25 0.67 29 | 120 3 0 240 5 14 12 190 25 1.33 0.67 30 | 110 1 1 135 0 13 12 25 25 1 0.75 31 | 100 2 0 45 0 11 15 40 25 1 0.88 32 | 110 1 1 280 0 15 9 45 25 1 0.75 33 | 100 3 1 140 3 15 5 85 25 1 0.88 34 | 110 3 0 170 3 17 3 90 25 1 0.25 35 | 120 3 3 75 3 13 4 100 25 1 0.33 36 | 120 1 2 220 1 12 11 45 25 1 1 37 | 110 3 1 250 1.5 11.5 10 90 25 1 0.75 38 | 110 1 0 180 0 14 11 35 25 1 1.33 39 | 110 2 1 170 1 17 6 60 100 1 -1 40 | 140 3 1 170 2 20 9 
95 100 1.3 0.75 41 | 110 2 1 260 0 21 3 40 25 1 1.5 42 | 100 4 2 150 2 12 6 95 25 1 0.67 43 | 110 2 1 180 0 12 12 55 25 1 1 44 | 100 4 1 0 0 16 3 95 25 1 -1 45 | 150 4 3 95 3 16 11 170 25 -1 -1 46 | 150 4 3 150 3 16 11 170 25 -1 -1 47 | 160 3 2 150 3 17 13 160 25 1.5 0.67 48 | 100 2 1 220 2 15 6 90 25 1 1 49 | 120 2 1 190 0 15 9 40 25 1 0.67 50 | 140 3 2 220 3 21 7 130 25 1.33 0.67 51 | 90 3 0 170 3 18 2 90 25 1 -1 52 | 130 3 2 170 1.5 13.5 10 120 25 1.25 0.5 53 | 120 3 1 200 6 11 14 260 25 1.33 0.67 54 | 100 3 0 320 1 20 3 45 100 1 1 55 | 50 1 0 0 0 13 0 15 0 0.5 1 56 | 50 2 0 0 1 10 0 50 0 0.5 -1 57 | 100 4 1 135 2 14 6 110 25 1 0.5 58 | 100 5 2 0 2.7 -1 -1 110 0 1 0.67 59 | 120 3 1 210 5 14 12 240 25 1.33 0.75 60 | 100 3 2 140 2.5 10.5 8 140 25 1 0.5 61 | 90 2 0 0 2 15 6 110 25 1 0.5 62 | 110 1 0 240 0 23 2 30 25 1 1.13 63 | 110 2 0 290 0 22 3 35 25 1 1 64 | 80 2 0 0 3 16 0 95 0 0.83 -1 65 | 90 3 0 0 4 19 0 140 0 1 0.67 66 | 90 3 0 0 3 20 0 120 0 1 0.67 67 | 110 2 1 70 1 9 15 40 25 1 0.75 68 | 110 6 0 230 1 16 3 55 25 1 1 69 | 90 2 0 15 3 15 5 90 25 1 -1 70 | 110 2 1 200 0 21 3 35 100 1 1 71 | 140 3 1 190 4 15 14 230 100 1.5 1 72 | 100 3 1 200 3 16 3 110 100 1 1 73 | 110 2 1 250 0 21 3 60 25 1 0.75 74 | 110 1 1 140 0 13 12 25 25 1 1 75 | 100 3 1 230 3 17 3 115 25 1 0.67 76 | 100 3 1 200 3 17 3 110 25 1 1 77 | 110 2 1 200 1 16 8 60 25 1 0.75 78 | -------------------------------------------------------------------------------- /cereal_data_set/cereal.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/cereal_data_set/cereal.xls -------------------------------------------------------------------------------- /cereal_data_set/cereal2.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harshitsaini/Business-Analytics-Data-Mining/9a0613906c2a7f945cbd2a2855d07dc9b2e98778/cereal_data_set/cereal2.xls -------------------------------------------------------------------------------- /cereal_data_set/cereal_source.txt: -------------------------------------------------------------------------------- 1 | 2 | StatLib---1993 Graphics Exposition 3 | 4 | 5 | "Serial Correlation or Cereal Correlation ??" 6 | 7 | Call for Poster Presentations for the 1993 Statistical Graphics Exposition 8 | 9 | 10 | REVISED README FILE 11 | 12 | (new breakfast cereal data and new information about the data) 13 | 14 | Every two years the Section on Statistical Graphics sponsors a special 15 | exposition where one or more data sets are made available, analyzed by 16 | anyone interested and presented in a special poster session at the 17 | Annual Meeting. 18 | 19 | For the 1993 Statistical Graphics Exposition, there are two datasets to 20 | analyze, one synthesized, one real: 21 | 22 | OSCILLATOR TIME SERIES - a synthesized univariate time series with 1024 23 | observations. These data are similar to those which might be found in a 24 | university or industrial laboratory setting, or possibly from a process 25 | monitor on a plant floor. They show obvious structure, but there is more 26 | than one feature present, and good graphics are key to uncovering the 27 | features. The objective is to find ALL the features. At the Exposition 28 | next year, the algorithm and coefficients by which the dataset was 29 | constructed will be presented, along with the stages of analysis which 30 | would uncover the features. 
Some questions to consider: 31 | 32 | * What graphics are helpful in selecting the right analytical tools? 33 | * What combinations of graphics are essential to finding all the 34 | features? 35 | * For what features are the traditional graphics and analytical 36 | tools weak? 37 | * Are there graphics that you can retrospectively develop which more 38 | clearly reveal the features which were hard to uncover? 39 | 40 | The oscillator data are available in an ASCII file, one 41 | observation per record. To obtain the data, send an email message to 42 | statlib@lib.stat.cmu.edu containing the single line: 43 | send oscillator from 1993.expo 44 | 45 | BREAKFAST CEREAL DATA (REVISED) - a multivariate dataset describing 46 | seventy-seven commonly available breakfast cereals, based on the 47 | information now available on the newly-mandated FDA food label. What 48 | are you getting when you eat a bowl of cereal? Can you get a lot of 49 | fiber without a lot of calories? Can you describe what cereals are 50 | displayed on high, low, and middle shelves? The good news is that none 51 | of the cereals for which we collected data had any cholesterol, and 52 | manufacturers rarely use artificial sweeteners and colors, nowadays. 53 | However, there is still a lot of data for the consumer to understand 54 | while choosing a good breakfast cereal. 55 | 56 | Two new variables have been added to the data (end of each record): 57 | 58 | weight (in ounces) of one serving (serving size) [weight] cups per 59 | serving [cups] 60 | 61 | Otherwise, the data are the same, except for minor typo corrections. The 62 | addition of these variables was suggested by Abbe Herzig of Consumers 63 | Union. Cereals vary considerably in their densities and listed serving 64 | sizes. Thus, the serving sizes listed on cereal labels (in weight units) 65 | translate into different amounts of nutrients in your bowl. Most people 66 | simply fill a cereal bowl (resulting in constant volume, but not 67 | weight). The new variables help standardize serving sizes, which 68 | provides other ways to differentiate and group cereals. 69 | 70 | Here are some facts about nutrition that might help you in your 71 | analysis. Nutritional recommendations are drawn from the references at 72 | the end of this document: 73 | 74 | * Adults should consume between 20 and 35 grams of dietary fiber per 75 | day. 76 | * The recommended daily intake (RDI) for calories is 2200 for women 77 | and 2900 for men. 78 | * Calories come in three food components. There are 9 calories per 79 | gram of fat, and 4 calories per gram of carbohydrate and protein. 80 | * Overall, in your diet, no more than 10% of your calories should be 81 | consumed from simple carbohydrates (sugars), and no more than 30% 82 | should come from fat. The RDI of protein is 50 grams for women and 83 | 63 grams for men. The balance of calories should be consumed in 84 | the form of complex carbohydrates (starches). 85 | * The average adult with no defined risk factors or other dietary 86 | restrictions should consume between 1800 and 2400 mg of sodium per 87 | day. 88 | * The type and amount of milk added to cereal can make a significant 89 | difference in the fat and protein content of your breakfast. 90 | 
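For example, a cereal with 110 calories, 1 gram of fat, and 12 grams of
sugars gets about 8% of its calories from fat (9/110) but about 44% from
sugars (48/110), well over the 10% guideline for simple carbohydrates.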
91 | One possible task is to develop a graphic that would allow the consumer 92 | to quickly compare a particular cereal to other possible choices. Some 93 | additional questions to consider, and try to answer with effective 94 | graphics: 95 | 96 | * Can you find the correlations you might expect? Are there any 97 | surprising correlations? 98 | * What is the true "dimensionality" of the data? 99 | * Are there any cereals which are virtually identical? 100 | * Is there any way to discriminate among the major manufacturers by 101 | cereal characteristics, or do they each have a "balanced 102 | portfolio" of cereals? 103 | * Do the nutritional claims made in cereal advertisements stand the 104 | scrutiny of data analysis? 105 | * Are there cereals which are clearly nutritionally superior, or 106 | inferior? Are there clusters of cereals? 107 | * Is a ranking or scoring scheme possible or reasonable, and if so, 108 | are there cereals which are nutritionally superior or inferior 109 | under all reasonable weighting schemes? 110 | 111 | The variables of the dataset are listed below, in order. For 112 | convenience, we suggest that you use the variable name supplied in 113 | square brackets. 114 | 115 | Breakfast cereal variables: cereal name [name] manufacturer (e.g., 116 | Kellogg's) [mfr] type (cold/hot) [type] calories (number) [calories] 117 | protein(g) [protein] fat(g) [fat] sodium(mg) [sodium] dietary fiber(g) 118 | [fiber] complex carbohydrates(g) [carbo] sugars(g) [sugars] display 119 | shelf (1, 2, or 3, counting from the floor) [shelf] potassium(mg) 120 | [potass] vitamins & minerals (0, 25, or 100, respectively indicating 121 | 'none added'; 'enriched, often to 25% FDA recommended'; '100% of FDA 122 | recommended') [vitamins] weight (in ounces) of one serving (serving 123 | size) [weight] cups per serving [cups] 124 | 125 | Manufacturers are represented by their first initial: A=American Home 126 | Food Products, G=General Mills, K=Kelloggs, N=Nabisco, P=Post, Q=Quaker 127 | Oats, R=Ralston Purina. 128 | 129 | The breakfast cereal data are available in an ASCII file, one 130 | cereal per record, with underscores in place of the spaces in the cereal 131 | name, and spaces separating the different variables. The value -1 132 | indicates missing data. To obtain the data, send an email message to: 133 | statlib@lib.stat.cmu.edu containing the single line: 134 | 135 | send cereal from 1993.expo 136 | 137 | Work alone or put together a team of data analysts to look at one or 138 | both of these two data sets! Try to answer the questions posed here or 139 | conduct an exploratory analysis to find and answer your own questions. 140 | 141 | To participate in the Exposition, you must submit a contributed paper 142 | abstract for inclusion in the formal ASA Contributed Paper Program. This 143 | reserves a poster session slot for you. Your abstract, on the official 144 | ASA abstract form, is due by the contributed paper deadline, February 1, 145 | 1993. 146 | 147 | If you do not have electronic mail access, try to get the data files 148 | from someone who already has them. If you cannot obtain the data via 149 | electronic mail, contact David Coleman, AMCT-D, Alcoa Technology Center, 150 | Alcoa Center, PA 15069, or e-mail COLEMAN1@ncf.al.alcoa.com 151 | 152 | 153 | References: 154 | 155 | National Research Council, 1989a. "Diet and Health: Implications for 156 | Reducing Chronic Disease Risk". National Academy Press, Washington, D.C. 157 | 158 | National Research Council, 1989b. "Recommended Dietary Allowances, 10th 159 | Ed." National Academy Press, Washington, D.C. 160 | 161 | National Cancer Institute, 1987. 
"Diet, Nutrition, and Cancer 162 | Prevention: A Guide to Food Choices," NIH Publ. No. 87-2878. National 163 | Institutes of Health, Public Health Service, U.S. Department of Health 164 | and Human Service, U.S. Government Printing Office, Washington, D.C. 165 | 166 | --------------------------------------------------------------------------------