├── .gitignore ├── Boosting.R ├── BoostingDoc.Rmd ├── Bootstrapping.R ├── DecisionTrees.R ├── DiscriminantAnalysis.R ├── GAM.Rmd ├── GeneralizedAdditiveModels.R ├── K-MEANS.Rmd ├── K-NN.R ├── LogisticRegression.R ├── Model Selection Techinques.R ├── NonLinearModel.Rmd ├── NonLinearModel.pdf ├── NonLinearModels.R ├── Notes ├── LINEAR REGRESSION.docx └── Models.docx ├── PCA-notebook.rmd ├── PCA.R ├── Plots ├── 10 fold CV for Model Selection in Forward Stepwise.png ├── 10 foldcverror.png ├── 5 fold vs 10 fold CV error.png ├── Backward stepwise Selection.png ├── Forward Stepwise Selection.png ├── LOOCV.errors.png ├── Lasso-Cross Validaation and Model Selection.png ├── Lasso-Validation Set Implementation.png ├── Model Selection Best Subset Cp statistic.png ├── Number of variables vs Cp and Adjs R2.png ├── PRINCIPAL_COMPONENTSPLOT.png ├── Quality of Model statistics plot.png ├── RMS error on TRAIN vs TEST.png ├── Ridge Regression.png ├── bootstrapOutput.png └── lasso-Fraction of Deviance explained.png ├── README.md ├── RadialKernelSVM.R ├── RadialKernelSVMnotebook.Rmd ├── RandomForestEnsemble.Rmd ├── RandomForests.R ├── Regression.R ├── Ridge Regression and Lasso-Regularization Techniques.R ├── SVMNotebook.Rmd ├── Splines.R ├── Splines.Rmd ├── TreeBasedTechniques.Rmd ├── crossValidation.R └── svm.R /.gitignore: -------------------------------------------------------------------------------- 1 | .RData 2 | .Rhistory 3 | *.html 4 | rsconnect/* 5 | -------------------------------------------------------------------------------- /Boosting.R: -------------------------------------------------------------------------------- 1 | #Boosting in R 2 | 3 | 4 | require(gbm) 5 | require(MASS) 6 | 7 | 8 | Boston.boost<-gbm(medv ~ . ,data = Boston[-train,],distribution = "gaussian",n.trees = 10000, 9 | shrinkage = 0.01, interaction.depth = 4) 10 | Boston.boost 11 | 12 | summary(Boston.boost) #Summary gives a table of Variable Importance and a plot of Variable Importance 13 | 14 | 15 | plot(Boston.boost,i="lstat") #Plot of Response variable with lstat variable 16 | #Inverse relation with lstat variable ie 17 | 18 | plot(Boston.boost,i="rm") 19 | #as the average number of rooms increases the the price increases 20 | 21 | 22 | n.trees = seq(from=100 ,to=10000, by=100) #no of trees-a vector of 100 values 23 | 24 | #Generating a Prediction matrix for each Tree 25 | predmatrix<-predict(Boston.boost,Boston[-train,],n.trees = n.trees) 26 | dim(predmatrix) #dimentions of the Prediction Matrix 27 | 28 | #Calculating The Mean squared Test Error 29 | test.error<-with(Boston[-train,],apply( (predmatrix-medv)^2,2,mean)) 30 | head(test.error) 31 | 32 | #Plotting 33 | 34 | plot(n.trees , test.error , pch=19,col="blue",xlab="Number of Trees",ylab="Test Error", main = "Perfomance of Boosting on Test Set") 35 | 36 | #adding the RandomForests Minimum Error line trained on same data and similar parameters 37 | abline(h = min(test.err),col="red") 38 | legend("topright",c("Minimum Test error Line for Random Forests"),col="red",lty=1,lwd=1) 39 | 40 | -------------------------------------------------------------------------------- /BoostingDoc.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Boosting in R" 3 | output: 4 | html_document: default 5 | html_notebook: default 6 | --- 7 | 8 | 9 | ## Boosting 10 | 11 | Random Forests are actually used to reduce the variance of the Trees by averaging them. So it generates big Bushy trees and then averages them to get rid of variance. 
12 | 13 | __Boosting__ on other hand generates smaller simpler trees and goes at the *__Bias__*.So the Idea in Boosting is to convert a *__Weak learner__* to a *__Strong Learner__* by doing *weighted averaging* of lots of Models generated on Harder Examples and using the Information from a previous Model. 14 | 15 | Harder Examples in the sense means the training Examples which were not classified correctly or more generally which were not predicted correctly by the previous Model. 16 | 17 | Boosting is a Sequential Method. Each tree that's added into the mix is added to improve the perfomance of previous collection of Trees. 18 | 19 | 20 | 21 | ----- 22 | 23 | 24 | ###Implementing Gradient Boosting in R using gbm package 25 | 26 | 27 | 'gbm' package is the Gradient Boosting Package. 28 | 29 | ```{r,warning=FALSE,message=FALSE} 30 | require(gbm) 31 | require(MASS) 32 | 33 | ``` 34 | 35 | 36 | Building the Boosted Trees on Boston Housing Dataset. 37 | 38 | 39 | ```{r} 40 | 41 | Boston.boost<-gbm(medv ~ .,data = Boston[-train,],distribution = "gaussian",n.trees = 10000, 42 | shrinkage = 0.01, interaction.depth = 4) 43 | Boston.boost 44 | 45 | summary(Boston.boost) #Summary gives a table of Variable Importance and a plot of Variable Importance 46 | 47 | 48 | ``` 49 | 50 | The above Boosted Model is a Gradient Boosted Model which generates 10000 trees and the shrinkage parameter $\lambda= 0.01$ which is also a sort of __Learning Rate__. Next parameter is the interaction depth which is the total *splits* we want to do.So here each tree is a small tree with only 4 splits. 51 | 52 | The summary of the Model gives a *__Feature importance Plot__* . And the 2 most important features which explaines the maximum variance in the Data set is 'lstat' and 'rm'. 53 | 54 | 55 | ----- 56 | 57 | 58 | ###Let's plot the Partial Dependence Plots 59 | 60 | 61 | The partial Dependence Plots will tell us the relationship and dependence of the variables with the Response variable. 62 | 63 | ```{r} 64 | 65 | plot(Boston.boost,i="lstat") #Plot of Response variable with lstat variable 66 | #Inverse relation with lstat variable ie 67 | 68 | plot(Boston.boost,i="rm") 69 | #as the average number of rooms increases the the price increases 70 | 71 | 72 | ``` 73 | 74 | In the above plots, the y-axis contains the Response values and the x-axis contains the variable values.So 'medv' is inversely related to the 'lstat' variable , and the 'rm' variable is related directly to 'medv'. 75 | 76 | 77 | 78 | ------ 79 | 80 | 81 | ### Prediction on Test Set 82 | 83 | We will compute the Test Error as a function of number of Trees. 84 | 85 | ```{r} 86 | n.trees = seq(from=100 ,to=10000, by=100) #no of trees-a vector of 100 values 87 | 88 | #Generating a Prediction matrix for each Tree 89 | predmatrix<-predict(Boston.boost,Boston[-train,],n.trees = n.trees) 90 | dim(predmatrix) #dimentions of the Prediction Matrix 91 | 92 | #Calculating The Mean squared Test Error 93 | test.error<-with(Boston[-train,],apply( (predmatrix-medv)^2,2,mean)) 94 | head(test.error) 95 | 96 | #Plotting 97 | 98 | plot(n.trees , test.error , pch=19,col="blue",xlab="Number of Trees",ylab="Test Error", main = "Perfomance of Boosting on Test Set") 99 | 100 | #adding the RandomForests Minimum Error line 101 | abline(h = min(test.err),col="red") 102 | legend("topright",c("Minimum Test error Line for Random Forests"),col="red",lty=1,lwd=1) 103 | 104 | 105 | ``` 106 | Boosting outperforms Random Forests on same Test dataset with lesser Mean squared Test Errors. 
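The `train` index and the random-forest test error `test.err` used in the chunk above are created in the companion Random Forests script and are not defined in this document. A minimal sketch of one way to set them up is shown below; the 300-row split, the `mtry` grid and the `ntree` value are illustrative assumptions, not the exact settings of that script.

```{r,eval=FALSE}
require(randomForest)
set.seed(101)
#hypothetical training index of 300 rows of the Boston data
train<-sample(1:nrow(Boston),300)

#test MSE of a Random Forest for each candidate mtry, so that min(test.err)
#gives the reference line added to the Boosting plot
test.err<-sapply(1:13,function(m){
  rf<-randomForest(medv ~ .,data = Boston,subset = train,mtry = m,ntree = 350)
  mean((Boston$medv[-train]-predict(rf,Boston[-train,]))^2)
})
```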
107 | 108 | 109 | ----- 110 | 111 | 112 | ###Conclusion 113 | 114 | In the above plot we can notice that if Boosting is done properly by selecting appropiate Tuning parameters such as Shrinkage parameter $\lambda$ and secondly the Number of Splits we want , then it can outperform Random Forests most of the times. 115 | 116 | Both methods are amazingly good Ensembling Techniques and reduce Overfitting and improve the perfomance of Statistical Models. 117 | -------------------------------------------------------------------------------- /Bootstrapping.R: -------------------------------------------------------------------------------- 1 | #Implementation of Resampling technique -Bootstrapping 2 | 3 | require(ISLR)# package which has the datasets used in the demonstration 4 | require(boot) 5 | 6 | #alpha= VAR(Y) - COV(X,Y) / VAR(X) + VAR(Y) - 2COV(X,Y) 7 | 8 | #writing a function to compute the alpha 9 | 10 | 11 | alpha=function(x,y) 12 | { 13 | vx = var(x) 14 | vy = var(y) 15 | cxy = cov(x,y) 16 | (vy-cxy)/ (vx + vy - 2*cxy) 17 | #last line will be printed to console 18 | 19 | } 20 | #let's test the function out 21 | ?Portfolio 22 | plot(Portfolio$X, Portfolio$Y) 23 | 24 | alpha(Portfolio$X,Portfolio$Y) 25 | 26 | #Now we will use Bootstrap to calculate the standard error of alpha 27 | alpha.se<-function(data,index) 28 | { 29 | with(data[index,],alpha(X,Y)) 30 | 31 | } 32 | #Now Bootstrap will create re-samples form Original dataset with replacement 33 | #and compute the Standard error for alpha from all those bootstrap data sets 34 | 35 | 36 | alpha.se(Portfolio,1:100) 37 | 38 | set.seed(1) # for reproducable results 39 | alpha.se(Portfolio,sample(1:100,100,replace = TRUE)) 40 | 41 | 42 | boot.out = boot(data = Portfolio,alpha.se,R=1000)#1000 bootstrap samples 43 | boot.out 44 | plot(boot.out,main="Overview for the Bootstrapping applied") 45 | #We are only interested in finding the Standard error of the alpha 46 | # BOTH THE CURVES LOOK PRETTY NORMALLY DISTRINUTED OR MAYBE GAUSSIAN 47 | 48 | 49 | 50 | #now using the output of a bootstrap to find out confidence intervals for alpha 51 | boot.conf<-boot.ci(boot.out,conf = 0.95, type='all') 52 | boot.conf 53 | 54 | 55 | 56 | 57 | #hence this is how we can use Bootstrapping to compute the Standard errors, 58 | #confidence intervals of various statistics and learning techniques -------------------------------------------------------------------------------- /DecisionTrees.R: -------------------------------------------------------------------------------- 1 | #Desicion Trees in R ==================== 2 | #Requiring Packages 3 | 4 | require(ISLR) #package containing data 5 | require(ggplot2) 6 | require(tree) 7 | 8 | #Using the Carseats data set 9 | 10 | attach(Carseats) 11 | ?Carseats 12 | 13 | 14 | #Checking the distribution of Sales 15 | 16 | ggplot(aes(x = Sales),data = Carseats) + 17 | geom_histogram(color="black",fill = 'purple',alpha = 0.6, bins=30) + 18 | labs(x = "Unit Sales in Thousands", y = "Frequency") 19 | 20 | 21 | #Making a Factor variable from Sales 22 | 23 | HighSales<-ifelse(Sales <= 8,"No","Yes") 24 | head(HighSales) 25 | 26 | #Making a Data frame 27 | Carseats<-data.frame(Carseats,HighSales) 28 | 29 | #We will use the tree() function to fit a Desicion Tree 30 | ?tree 31 | 32 | #Excluding the Sales atrribute 33 | CarTree<-tree(HighSales ~ . -Sales , data = Carseats,split = c("deviance","gini")) 34 | #split argument split to specify the splitting criterion to use. 
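#Note: supplying the vector c("deviance","gini") is equivalent to the default - tree() uses
#only the first value, so the tree above is split on deviance. To split on the Gini index
#instead, pass it explicitly, e.g. tree(HighSales ~ . -Sales, data = Carseats, split = "gini")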
35 | 36 | CarTree #Outputs a Tree with various Splits at different Variables and Response at Terminals Nodes 37 | #The numeric values within the braces are the Proportions of Yes and No for each split. 38 | 39 | #Summary of the Decision Tree 40 | summary(CarTree) 41 | 42 | plot(CarTree) 43 | #Adding Predictors as text to plot 44 | text(CarTree ,pretty = 1 ) 45 | 46 | 47 | 48 | 49 | set.seed(1001) 50 | #A training sample of 250 examples sampled without replacement 51 | train<-sample(1:nrow(Carseats), 250) 52 | #Fitting another Model 53 | tree1<-tree(HighSales ~ .-Sales , data = Carseats, subset = train) 54 | summary(tree1) 55 | #Plotting 56 | plot(tree1);text(tree1) 57 | 58 | 59 | 60 | #Predicting the Class labels for Test set 61 | pred<-predict(tree1, newdata = Carseats[-train,],type = "class") 62 | head(pred) 63 | 64 | #Confusion Matrix to check number of Misclassifications 65 | with(Carseats[-train,],table(pred,HighSales)) 66 | 67 | #Misclassification Error Rate on Test Set 68 | mean(pred!=Carseats[-train,]$HighSales) 69 | 70 | 71 | 72 | 73 | #Pruning----------------- 74 | 75 | 76 | #10 fold CV 77 | #Performing Cost Complexity Pruning 78 | cv.tree1<-cv.tree(tree1, FUN=prune.misclass) 79 | cv.tree1 80 | plot(cv.tree1) 81 | #Deviance minimum for tree size 15 i.e 15 Splits 82 | 83 | prune.tree1<-prune.misclass(tree1,best = 15) 84 | plot(prune.tree1);text(prune.tree1) 85 | 86 | 87 | 88 | 89 | pred1<-predict(prune.tree1 , Carseats[-train,],type="class") 90 | 91 | #Confusion Matrix 92 | with(Carseats[-train,],table(pred1,HighSales)) 93 | 94 | #Misclassification Rate 95 | ErrorPrune<-mean(pred1!=Carseats[-train,]$HighSales) 96 | ErrorPrune 97 | #Error reduced to 25 % 98 | 99 | -------------------------------------------------------------------------------- /DiscriminantAnalysis.R: -------------------------------------------------------------------------------- 1 | #CLASSIFICATION ON LINEAR DISCRIMINAT ANALYSIS-A classifier suitable for small data sets 2 | #having low dimenstions and less predictors in input space and also stable for multiclass 3 | #classification K>2(class labels > 2) 4 | 5 | #Uses Bayes theoram as a Base Model 6 | require(MASS) 7 | 8 | #using LDA on Smarket dataset 9 | 10 | 11 | #Test Data Frame 12 | Test.2005<-subset(Smarket,Year==2005 , select = c(Lag1,Lag2,Direction)) 13 | 14 | 15 | #Using Previous 2 Days Returns to predict the Direction of Market on The Particular day 16 | #Model trained on Training Data = Inductive Learning 17 | fit1<-lda(Direction ~ Lag1 + Lag2 , data = Smarket, subset = Year < 2005) 18 | summary(fit1) 19 | #Perfomance on Training Data 20 | mean(predict(fit1)$class==Smarket$Direction[Year<2005]) 21 | 22 | 23 | 24 | 25 | #Predictions on TEST DATA SET 26 | fit1.pred<-predict(fit1,newdata = Test.2005) 27 | #returns a list with Classified Label for that data point , 28 | # Probabilities of each class -Here 'Up' and 'Down' & Discriminant Score 29 | 30 | #creating a Data frame of predictions 31 | df<-data.frame(fit1.pred) 32 | 33 | #If predictors are Quantitative variables then we classify test points to the class label 34 | #having higher Densities(Prior Probability) | Higher Conditional Probability(Pr(Y|X=xi)) 35 | 36 | head(df) 37 | 38 | #Confusion Matrix for Model's Perfomance 39 | 40 | table(predicted=df$class,True=Test.2005$Direction) 41 | #Accuracy rate of 56% , same as that of Logistic Regression Model3 42 | 43 | #56% correct classifications , and 44% misclassifications 44 | 45 | 46 | 47 | 48 | #MODEL2 - QUADRATIC DISCRIMINANT ANALYSIS(More Complex Due 
to different Covariance 49 | #matrix for each class label K) 50 | 51 | fit2<-qda(Direction ~ Lag1 + Lag2 , data = Smarket, subset = Year < 2005) 52 | fit2 53 | 54 | #Perfomance on Training Data 55 | t1<-table(Predicted=predict(fit2)$class, True = subset(Smarket,Year < 2005)$Direction ) 56 | mean(predict(fit2)$class==Smarket$Direction[Year<2005]) 57 | 58 | #Predictions on TEST DATA 59 | fit2.pred<-predict(fit2,newdata = Test.2005) 60 | df2<-data.frame(fit2.pred) 61 | 62 | table(Predicted = df2$class,True=Test.2005$Direction) 63 | 64 | mean(df2$class==Test.2005$Direction) 65 | #Hence The overall accuracy on TEST Data has increased to 60%-better Than LDA Model(fit1) 66 | 67 | 68 | #Hence Overall accuracy of QDA is higher than LDA 69 | #As QDA is much more complex and complicated than LDA due to the quadratic terms 70 | #in the formula , although the Training Error for both are Same , but 71 | #Generalization accuracy for QDA is higher = 60% , for LDA = 56% 72 | 73 | 74 | -------------------------------------------------------------------------------- /GAM.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Generalized Additive Models" 3 | author: "Anish Singh Walia" 4 | date: "June 27, 2017" 5 | output: html_document 6 | --- 7 | 8 | 9 | 10 | $\text{This article is going to talk about Generalized Additive Models and their implementation in R.}$ 11 | 12 | This is also a famous and very flexible technique of fitting and Modelling Non Linear Functions which are more flexible and fits data well. 13 | In this technique we simply add __Non linear Functions__ on different variables to the Regression equation. 14 | $\text{That Non linear function can be anything - Cubic Spline , natural Spline ,Smoothing Splines and even polynomial function}$ 15 | 16 | 17 | $$f(x) = y_i = \alpha \ + \ f_1(x_1) \ + f_2(x_2) + \ .... + \ f_p(x_p) \ + \epsilon_i $$ 18 | 19 | $$\text{where} \ f_p(x_p) \ \text {is a Non Linear function on} \ x_p \ variables.$$ 20 | 21 | 22 | Requiring the __'gam'__ package which helps in fitting *__Generalized Additive Models__*. 23 | 24 | 25 | ```{r,message=FALSE,warning=FALSE , message=FALSE, warning=FALSE} 26 | #requiring the Package 27 | require(gam) 28 | 29 | #ISLR package contains the 'Wage' Dataset 30 | require(ISLR) 31 | attach(Wage) #Mid-Atlantic Wage Data 32 | 33 | ?Wage # To search more on the dataset 34 | 35 | gam1<-gam(wage~s(age,df=6)+s(year,df=6)+education ,data = Wage) 36 | #in the above function s() is the shorthand for fitting smoothing splines in gam() function 37 | summary(gam1) 38 | #Plotting the Model 39 | par(mfrow=c(1,3)) 40 | plot(gam1,se = TRUE) 41 | 42 | 43 | 44 | ``` 45 | In the above Plots the Y-axis contains the Non Linear functions and x-axis contains the Predictors used in the Model and the dashed lines Represent the __Standard Error bands__.The Whole Model is *__Additive__* in nature. 46 | 47 | $$\textbf {The Curvy plots shows that the functions are Non linear in nature}$$ 48 | 49 | 50 | --- 51 | 52 | 53 | ### We can also fit a Logistic Regression Model using gam() 54 | 55 | ```{r} 56 | #logistic Regression Model 57 | gam2<-gam(I(wage >250) ~ s(age,df=4) + s(year,df=4) +education , data=Wage,family=binomial) 58 | 59 | plot(gam2,se=T) 60 | 61 | 62 | 63 | 64 | ``` 65 | 66 | ####So we are plotting the logit of Probabilities of each variable as a saperate function but on the whole additive in nature. 67 | 68 | 69 | --- 70 | 71 | ###Now we can also check if we need Non linear Terms for Year variable or not? 
72 | 73 | ```{r} 74 | #fitting the Additive Regression Model which is linear in Year 75 | gam3<-gam(I(wage >250) ~ s(age,df=4)+ year + education , data =Wage, family = binomial) 76 | plot(gam3) 77 | 78 | #anova() function to test the goodness of fit and choose the best Model 79 | #Using Chi-squared Non parametric Test due to Classification Problem and categorial Target 80 | anova(gam2,gam3,test = "Chisq") 81 | 82 | 83 | 84 | ``` 85 | $$\text {The plot for the Year is a straight Line i.e it is Linear function in Year.}$$ 86 | 87 | 88 | As the above Test indicates that Model with __Non linear terms for Year__ is not Significant.So we can neglect that Model. 89 | 90 | 91 | 92 | 93 | 94 | ###Now we can also fit a Additive Model using lm() function 95 | 96 | 97 | ```{r} 98 | lm1<-lm(wage ~ ns(age,df=4) + ns(year,df=4)+ education , data = Wage) 99 | #ns() is function used to fit a Natural Spline 100 | lm1 101 | 102 | 103 | #Now plotting the Model 104 | 105 | plot.gam(lm1,se=T) 106 | #Hence the Results are same 107 | ``` 108 | 109 | 110 | ####So by using the lm() function too we can fit a Genaralized Additive Model. 111 | 112 | 113 | 114 | --- 115 | 116 | 117 | ## Conclusion 118 | 119 | 120 | ####Hence GAMs are a very nice technique and method to Model Non linearities and Learn complex function other than just Linear functions.They are easily interpretable too. 121 | 122 | ####And the most basic idea behind learning Non Linearities is to transform the Data and the variables which can capture and Learn and make sense of something more complicated than just a linear relationship. 123 | 124 | $$\text {Because the truth is not always "Linear"}$$ 125 | -------------------------------------------------------------------------------- /GeneralizedAdditiveModels.R: -------------------------------------------------------------------------------- 1 | #generalized Additive Models in R 2 | 3 | #requiring the Package 4 | require(gam) 5 | require(ISLR) 6 | attach(Wage) 7 | 8 | gam1<-gam(wage~s(age,df=6)+s(year,df=6)+education ,data = Wage) 9 | #in the above function s() is the shorthand for fitting smoothing splines in gam() function 10 | summary(gam1) 11 | #Plotting the Model 12 | par(mfrow=c(1,3)) 13 | plot(gam1,se = TRUE) 14 | 15 | 16 | 17 | 18 | #logistic Regression Model 19 | gam2<-gam(I(wage >250) ~ s(age,df=4) + s(year,df=4) +education , data=Wage,family=binomial) 20 | 21 | plot(gam2,se=T) 22 | 23 | 24 | #fitting the Additive Regression Model which is linear in Year 25 | gam3<-gam(I(wage >250) ~ s(age,df=4)+ year + education , data =Wage, family = binomial) 26 | plot(gam3) 27 | 28 | #anova() function to test the goodness of fit and choose the best Model 29 | #Using Chi-squared Non parametric Test due to Classification Problem and categorial Target 30 | anova(gam2,gam3,test = "Chisq") 31 | 32 | 33 | 34 | lm1<-lm(wage ~ ns(age,df=4) + ns(year,df=4)+ education , data = Wage) 35 | #ns() is function used to fit a Natural Spline 36 | lm1 37 | 38 | 39 | #Now plotting the Model 40 | 41 | plot.gam(lm1,se=T) 42 | #Hence the Results are same 43 | -------------------------------------------------------------------------------- /K-MEANS.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Clustering in R" 3 | output: 4 | html_document: default 5 | html_notebook: default 6 | --- 7 | 8 | This article consists the tutorial on how to cluster data in R. Clustering is a unsupervised learning technique in which the dataset has no target variable $Y$. 
Clustering mainly aims at finding similarities between the features $X_i$ using a similarity metric and grouping them together into clusters/groups. 9 | 10 | K-Means clustering is a clustering algorithm which aims at clustering __continious(numeric)__ data into $K$ clusters which are needed to be specified before feeding data to the model.__Scaling__ of features matter in k-means algorithm as it computes the euclidean distance between the cluster centroid and the data points in each iteration, hence we need to standardize the variables if they are skewed or unscaled. 11 | 12 | We solve for a objective in k-means i.e we want to minimize the within cluster variance $WCV$ , which simply implies that the points within a cluster should be as close as possible to the cluster centroid(mean for that cluster). 13 | $$minimize {\sum_{k=1}^{K} WCV(C_k) } $$ over the clusters $c_1,c_2,c_3.....c_k$. 14 | 15 | 16 | The function can be further written as- 17 | $$minimize {\sum_{k=1}^{K} \frac{1}{|C_k|} \sum_{i \in C_k}\sum_{j=1}^{p}(x_{ij}-\bar x_{kj} )^2}$$ 18 | 19 | , where $K$ are the number of clusters and $|C_k|$ are the number of observations in $K^{th}$ cluster, $p$ are the number of variables,and most importantly $\bar x_{kj}$ is the mean of the $K^{th}$ cluster i.e the __centroid__ value for that cluster. 20 | 21 | The centroid value for a cluster is equal to the mean of the observations in that cluster i.e $$ \bar x_{kj} = \frac{1}{|C_k|} \sum_{i \in C_k} x_{ij} $$. 22 | 23 | This simply means that we want to partition the observations into $K$-clusters such that the total within-cluster variation ,summed over all $K$-clusters is as small as possible i.e they are as close to each other as possible. 24 | 25 | 26 | ### The K-means algorithm 27 | 28 | 1) Randomly assign a number 1 to K,to each of the observations. These serves as initial cluster assignments. 29 | 30 | 2) Iterate until the cluster assignments stop changing : 31 | 32 | 2.a For each of the k-clusters compute the cluster __centroid__. The $K^{th}$ cluster centroid is the __mean__ for the observations in the $K^{th}$ cluster. 33 | 34 | 2.b Assign each observation to the cluster whose __centroid__ is closest to it by calculating the distance between them.(where closest is defined by the distance metric-Euclidean distance). 35 | 36 | 37 | 3) Stop:if cluster assignments stop changing , else go to : step 2). 38 | 39 | 40 | The algorithm is guaranteed to decrease the value of the objective $WCV(C_k)$. The __local minima__ will be founded by K-means ,however it is not guaranteed that it will give us the __global minima__. 41 | 42 | Local minima is the smallest value of a function within a range. 43 | 44 | Global minima is the smallest value of a function over the enrtire domain. 45 | 46 | So this means that K-means will land you in a valley, but not necessarily in the lowest/deepest valley because the function is not __CONVEX__. 47 | 48 | 49 | -------------- 50 | 51 | ### Implementing K-means in R 52 | 53 | K-means can work in any dimension but for purposes of demonstration I will use it in 2-D. I am going to generate some fake data and try to cluster it. 
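The simulated data below is already on a common scale, so no standardization is needed here; for real data measured in mixed units, a quick `scale()` call before `kmeans()` is the usual first step, for example on the built-in `USArrests` data (used here purely as an illustration):

```{r}
#standardize the features to mean 0 and sd 1 before clustering
km.scaled<-kmeans(scale(USArrests),centers=4,nstart=15)
```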
54 | 55 | ```{r} 56 | #setting seed 57 | set.seed(101) 58 | x=matrix(rnorm(100^2),100,2) # a 100 x 2 dim matrix 59 | xmean = matrix(rnorm(8,sd=4),4,2) # a 4 x 2 dim matrix, as we want 4 clusters 60 | 61 | which=sample(1:4,100,replace=T) #random sample 62 | x=x+xmean[which,] 63 | 64 | #plotting the data 65 | plot(x,col=which,pch=19) 66 | 67 | ``` 68 | 69 | Now the plot above shows 4 clusters. We know the clusters but now let's feed the data to k-means and check out its performance and how it clusters the data. 70 | 71 | 72 | ```{r} 73 | km.out<-kmeans(x,4,nstart=15) 74 | km.out 75 | 76 | ``` 77 | Let's plot the clusters made by K-means algorithm. 78 | ```{r} 79 | 80 | plot(x,col=km.out$cluster,cex=2,pch=1,lwd=2) 81 | points(x,col=which,pch=19) 82 | points(x,col=c(4,3,2,1)[which],pch=19) 83 | 84 | ``` 85 | 86 | Now the inner circles represent the actual cluster assignments , whereas the outer circles represent the cluster assignments by K-means algorithm. 87 | So we can easily notice the mismatches. 88 | 89 | --------------------------------- 90 | 91 | ##Conclusion 92 | 93 | So this was a small article and tutorial on how to implement K-means clustering in R. K-means clustering is a nice method to cluster numeric data. The only drawback is we need some domain knowledge to tell the algorithm about the number fo clusters we want a-priori. Secondly, K-means is suited only for data which is __normally distributed__ or either standardized. So scaling of variables actually matter a lot in K-means clustering. 94 | 95 | 96 | -------------------------------------------------------------------------------- /K-NN.R: -------------------------------------------------------------------------------- 1 | #IMPLEMENTING K-NN(LAZY LEARNER) IN R 2 | #K-NN is a lazy learner and a Simple algorithm , but works good most of the times due to 3 | #the simple inductive bias it has. 4 | 5 | #We find out the closest neighbouring points(Xi) to the Query(test) point 'q' using a 6 | #Distance metric and classify to that point -Nearest Neighbour in training data,for k=1 7 | 8 | 9 | #package for k-NN 10 | require(ISLR) 11 | library(class) 12 | library(dplyr) 13 | ?knn 14 | #Classifiaction of Smarket Data set 15 | attach(Smarket) 16 | 17 | #Training Data Set-consisting of Predictors Lag1 and Lag2 18 | Smarket %>% filter(Year < 2005) %>% 19 | select(Lag1,Lag2) ->train_set 20 | 21 | #Test Data Set 22 | Smarket %>% filter(Year == 2005) %>% 23 | select(Lag1,Lag2) ->test_set 24 | 25 | 26 | model1<-knn(train_set,test_set ,Direction[Year<2005],k=1) 27 | #model1 returns some class labels for the Test points=test_set using Euclidean distance 28 | head(model1) 29 | 30 | #accuracy of the model-CONFUSION MATRIX 31 | table(Predicted=model1,True=Direction[Year==2005]) 32 | #True positives and negetives and perfomance of the model 33 | mean(model1==Direction[Year==2005]) 34 | #for 1-NN , accuracy is 50% and error=50% , Poor perfomance 35 | 36 | 37 | 38 | 39 | #Model2-3-NN 40 | model2<-knn(train_set,test_set ,Direction[Year<2005],k=3) 41 | 42 | table(Predicted=model2,True=Direction[Year==2005]) 43 | #True positives and negetives and perfomance of the model 44 | mean(model2==Direction[Year==2005]) 45 | #accuracy improved to 53%, i.e does slightly better than chance. 
as Error<50% 46 | 47 | 48 | 49 | 50 | #Model3- 51 | 52 | 53 | model3<-knn(train_set,test_set ,Direction[Year<2005],k=100) 54 | 55 | table(Predicted=model3,True=Direction[Year==2005]) 56 | #True positives and negetives and perfomance of the model 57 | mean(model3==Direction[Year==2005]) 58 | 59 | #NOTE-100 NN gives accuracy of 54% 60 | 61 | 62 | 63 | #Model4- 200-NN 64 | model4<-knn(train_set,test_set ,Direction[Year<2005],k=200) 65 | table(Predicted=model4,True=Direction[Year==2005]) 66 | #True positives and negetives and perfomance of the model 67 | mean(model4==Direction[Year==2005]) 68 | #200-NN set gives accuracy of 57% 69 | 70 | 71 | 72 | #MODEL-5, 73 | 74 | model5<-knn(train_set,test_set ,Direction[Year<2005],k=300) 75 | table(Predicted=model5,True=Direction[Year==2005]) 76 | #True positives and negetives and perfomance of the model 77 | mean(model5==Direction[Year==2005]) 78 | 79 | #300-NN gives accuracy of 61%(highest) -------------------------------------------------------------------------------- /LogisticRegression.R: -------------------------------------------------------------------------------- 1 | #CLASSIFICATION TECHNIQUES USING R 2 | 3 | 4 | #1) LOGISTIC REGFRESSION using glm() function-Generalized linear Models , followed by 5 | # family = binomial 6 | 7 | require(ISLR) #packages with all datasets 8 | 9 | #using Smarket dataset 10 | 11 | names(Smarket) #Stock Market Data 12 | ?Smarket 13 | view(Smarket) 14 | summary(Smarket) 15 | 16 | #Direction var -will be used as a Binary Response variable -to predict whether market will 17 | #move Up or Down on a given day 18 | 19 | 20 | #Constructing a Scatterplot Matrix 21 | pairs(Smarket,col = Smarket$Direction) 22 | #By looking at the scatterplot matrix we can easily see that there are no Correlations 23 | #between Variables - As it is a Stock market Data- No BIG Surprise!! 24 | 25 | #Fitting a Logistic REgression Model 26 | mod1<-glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume ,data =Smarket , family =binomial) 27 | 28 | #The model will compute the Prob values of Direction given these inputs/predictors 29 | 30 | summary(mod1) 31 | #Non of the variables have significant p-values , this only means that they are not 32 | #correlated or very correlated, and also suggest that none of the variables are related 33 | #to the Response(Direction) variable, i.e H0 is true-no relations b/w predictor and Res 34 | #and again for this kind of datasets it is not a big surprise. 35 | 36 | #Still we can make Predictions and calculate Probabilities values 37 | probs<-fitted(mod1,'response') 38 | #probs a vector of fitted Pr(Y=Direction|Inputs) values by the Model 39 | head(probs) 40 | #Prob values nearby 0.5 41 | probs<-ifelse(probs>=0.5,'Up','Down') 42 | #if prob >=0.5 then 'Up' , otherwise Market 'Down' 43 | attach(Smarket) 44 | #Forming a CONFUSION MATRIX to check the number of mismatches(misclassifications) 45 | table(Predicted=probs,True=Direction) 46 | #lots of mismatches 47 | #Accuracy of the model 48 | mean(probs==Direction) 49 | #i.e The Model performs Slightly Better than chance , ie when Error Rate < 1/2 or 50% 50 | # % of matches = 52 % , Error(mismatches)=47% 51 | #We might be OVERFITTING with such high accuracy on TRAINING DATA 52 | 53 | 54 | #NOW WE SEGREGATE INTO Training and Test Data and see if we do any better? 
55 | 56 | train = Year < 2005 57 | mod2<-glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume , 58 | data =Smarket ,subset = train, family =binomial) 59 | 60 | summary(mod2) 61 | 62 | #Lets check the predictions of this model with Traiing data 63 | #But we will generalize & predict Direction on TEST DATA 64 | prob1<-predict(mod2,newdata = subset(Smarket,!train),type='response') 65 | mod2.pred<-ifelse(prob1>=0.5,'Up','Down') 66 | head(mod2.pred) 67 | #Dataframe of True Directions and Predicted Direction on TEST Data 68 | pred.df<-data.frame(True_Direction = subset(Smarket,!train)$Direction , Predicted = mod2.pred) 69 | 70 | #CONFUSION MATRIX: 71 | table(Predicted = mod2.pred , Ture = Smarket$Direction[!train]) 72 | #Accuracy of the Model2-no of matches-correct classifications 73 | mean(mod2.pred==subset(Smarket,!train)$Direction) 74 | #accuracy decreases to 48% 75 | 76 | #Hence the model performs poorer than 1st Model and also Overfits 77 | 78 | 79 | 80 | 81 | #Smaller Model with Lesser Input variables 82 | mod3<-glm(Direction ~ Lag1 + Lag2,data = Smarket,subset=train , family = binomial) 83 | summary(mod3) 84 | #AIC(mod2,mod3) 85 | #df AIC 86 | #mod2 7 1395.105 87 | #mod3 3 1387.402 88 | #By looking at the AIC(Infor criterion) we can say that model3 is better and good as 89 | #less information is lost 90 | 91 | #PREDICTIONS ON TEST 92 | prob1<-predict(mod3,newdata = subset(Smarket,!train),type='response') 93 | mod3.pred<-ifelse(prob1>=0.5,'Up','Down') 94 | head(mod3.pred) 95 | #Dataframe of True Directions and Predicted Direction on TEST Data 96 | pred.df2<-data.frame(lag1 = Smarket$Lag1[!train] , Lag2 = Smarket$Lag2[!train],True_direction = 97 | Smarket$Direction[!train],Predicted = mod3.pred) 98 | 99 | #CONFUSION MATRIX: 100 | table(Predicted = mod3.pred , Ture = Smarket$Direction[!train]) 101 | #Accuracy of the Model2-no of matches-correct classifications 102 | mean(mod3.pred==subset(Smarket,!train)$Direction) 103 | 104 | #Hence by taking Less predictors the Accuracy of the Model has improved to 56% 105 | predict(mod3,newdata = data.frame(Lag1=0.967,Lag2=0.600),type='response') 106 | 107 | -------------------------------------------------------------------------------- /Model Selection Techinques.R: -------------------------------------------------------------------------------- 1 | #MODEL SELECTION in R 2 | 3 | 4 | require(ISLR) 5 | require(dplyr) 6 | #Major League Baseball Data from the 1986 and 1987 seasons 7 | summary(Hitters) 8 | str(Hitters) 9 | ?Hitters 10 | 11 | #Removing the Missing values 12 | Hitters<-na.omit(Hitters) 13 | attach(Hitters) 14 | 15 | 16 | 17 | #Best Subset Selection-Model selection procedure which searches for all possible Models 18 | #and the best Model amongst those Models with 0 predictors to Model with all predictors 19 | #with the search space equal to 2^p , p = no of predictors 20 | #Amongst those Models the Best one is selected with least AIC or BIC values 21 | #or highest adjusted R-squared value or least CV error. 22 | install.packages("leaps") 23 | #Package to do Subset Selection 24 | require(leaps) 25 | ?leaps 26 | 27 | #Model selection by exhaustive search, forward or backward stepwise, or sequential replacement 28 | reggfit.full<-regsubsets(Salary ~ . 
, data = Hitters ) 29 | reggfit.full 30 | summary(reggfit.full)->Modsumm 31 | #by default it goes upto only 8 subsets of Predictors and 8 Models only 32 | #Models are not Nested like in Forawrd Stepwise selection 33 | 34 | #Residual Sum of squares for each Model 35 | Modsumm$adjr2 36 | plot(reggfit.full,scale = 'Cp') 37 | 38 | 39 | 40 | #Best Subset selection with all 19 variables 41 | Mod2<-regsubsets(Salary ~ . , data = Hitters , nvmax = 19) 42 | summod2<-summary(Mod2) 43 | summod2 44 | plot(Mod2,scale="Cp",xlab="Variables" , ylab =" Cp Statistic, lesser the better Model") 45 | par(mfrow=c(1,2)) 46 | 47 | #plot of CP vs number of predictors in Model 48 | plot(summod2$cp,xlab="Number of Variables", ylab = "Cp statistic-Lesser The better") 49 | which.min(summod2$cp)#Model with 10 predictors has least Cp 50 | points(10,summod2$cp[10],pch=20,col='blue') 51 | #coloring the Best Model with 10 predictors 52 | 53 | #Plot of Adjusted R squared vs number of predictors 54 | plot(summod2$adjr2,xlab="Number of Variables", ylab = "Adjusted R-squared,Larger the better") 55 | which.max(summod2$adjr2) 56 | 57 | #To find the Coefficients of the Model selected with index 10 58 | coef(Mod2,id = 10) 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | #FORWARD STEPWISE SELECTION 68 | forwmod<-regsubsets(Salary ~ . , data = Hitters , method = 'forward',nvmax = 19 ) 69 | 70 | sumfor<-summary(forwmod) 71 | #now the models are prefectly Nested-i.e the new Model is evaluated by adding 1 more variable 72 | #to the previous Model with K predictors upto model with all p predictors and the search space 73 | # is p^2 models. 74 | sumfor$cp #cp statistic- the lesser the better 75 | sumfor$bic #BIC statistic-the lesser the better-BIC penalizes larger Models with more predictors 76 | 77 | sumfor$adjr2 78 | which.max(sumfor$adjr2) #model with 11 predictors has highest Adjst R-squared 79 | 80 | 81 | plot(sumfor$adjr2,xlab="Number of Variables" , ylab = " Adjusted R-squared") 82 | points(11,sumfor$adjr2[11],pch=20,col='blue') 83 | #Adjst R-squred is highest for Model with 11 predictors 84 | title("Forward Stepwise Selection") 85 | plot(sumfor$bic,xlab='Number of Predictors ' , ylab = "BIC statistic") 86 | points(6,sumfor$bic[6],pch=20,col='red') 87 | #BIC is least for a model with 6 predictors 88 | 89 | 90 | 91 | 92 | 93 | 94 | #BACKWARD STEPWISE SELECTION 95 | backmod<-regsubsets(Salary ~ . 
, data = Hitters , method = 'back', nvmax = 19 ) 96 | backsum<-summary(backmod) 97 | backsum 98 | 99 | 100 | which.min(backsum$rss) 101 | #RSS least for model with all predictors in it i.e 19 , as expected 102 | #as we add more and more variables in the Model RSS value decreases but we cannot select 103 | #that model because it is certainly Overfitting on training data and has very high variance 104 | 105 | which.max(backsum$adjr2) 106 | which.min(backsum$cp) 107 | which.min(backsum$bic)#BIC penelizes larger Models 108 | 109 | plot(sumfor$adjr2,xlab="Number of Variables" , ylab = " Adjusted R-squared") 110 | points(11,sumfor$adjr2[11],pch=20,col='green') 111 | #Adjst R-squred is highest for Model with 11 predictors 112 | title("Backward Stepwise Selection") 113 | plot(sumfor$bic,xlab='Number of Predictors ' , ylab = "BIC statistic") 114 | points(8,sumfor$bic[8],pch=20,col='yellow') 115 | #BIC is least for a model with 8 predictors 116 | 117 | #Mean squared error on Training data reduces as model's complexity increases 118 | # as expected 119 | #sometimes we have to clear the memory to make things work 120 | #becasue R first loads everything in RAM then executes it 121 | 122 | par(mfrow=c(2,2)) 123 | 124 | #PLOT OF COMPARISON OF VARIOUS STATISTICS TO SELECT THE BEST MODEL AND ALSO SHOWING 125 | # HOW MSE on TRAINING DATA DECREASES due to INCREASE IN MODEL COMPLEXITY AND VARIANCE 126 | 127 | plot(backsum$rss/nrow(Hitters),type='b',pch=19,xlab="Number of Variables", 128 | ylab = "Mean Squared Error on Training Data") 129 | title("Overfitting Cases and increase in Model Variance as no of Predictors Increases") 130 | plot(backsum$bic,type='b',pch=19,col="blue",xlab="Number of Variables", 131 | ylab="BIC value") 132 | plot(backsum$rsq,type="b",pch=19,col='red',xlab="Number of Variables", 133 | ylab="R-squared on Training data") 134 | 135 | plot(backsum$adjr2,type='b',col='green',pch=19,xlab="Number of Variables", 136 | ylab=('Adjusted R-squared')) 137 | 138 | coef(backmod,id=8) 139 | 140 | 141 | 142 | 143 | 144 | #Model Selection using a Validation Set 145 | 146 | #Making training and test Data set 147 | 148 | dim(Hitters) 149 | 150 | set.seed(1000)#for reproducable results 151 | 152 | #Randomly making Training rows for Training data set 153 | #180 random numbers for 1 to 263 154 | trainrow<-sample(seq(263),180,replace = FALSE) 155 | head(trainrow) 156 | #now training using Forward Stepwise selection 157 | forw<-regsubsets(Salary ~ . 
,data=Hitters[trainrow,] , 158 | method = "forward",nvmax =19) 159 | 160 | 161 | #Validation Error vector 162 | #AS there are 19 Models with 1 to 19 predictors in them 163 | val.error<-rep(NA,19) 164 | ?model.matrix() #model.matrix creates a design (or model) matrix 165 | 166 | #Test data Set- exculding the observations used by Train data 167 | x.test<-model.matrix(Salary ~ ., data = Hitters[-trainrow,]) 168 | 169 | for(i in 1:19) { 170 | coefi = coef(forw , id = i) 171 | pred = x.test[,names(coefi)]%*%coefi 172 | #Coef value for for all 19 variables 173 | val.error[i] = mean((Hitters$Salary[-trainrow]-pred )^2) 174 | #Mean sqrd Error on Test set for all 19 Models 175 | 176 | } 177 | 178 | val.error 179 | #Plotting the RMStest error for Each Model 180 | plot(sqrt(val.error),pch=19,ylim=c(280,400),type='b',ylab="RMS error",xlab="No of Predictors") 181 | #also adding the Mean RSS omn training data 182 | title("MSE on TEST SET vs MSE on TRAIN SET") 183 | points(sqrt(forw$rss[-1]/180),type='b',pch=19,col='blue') 184 | legend('topright',pch=19,c("MSE on Test Set ","MSE on Training Data"),col=c("black","blue")) 185 | #AS expected the MSE in Training data decrease as the no of predictors 186 | #in the Model Increases due to forward stepwise which adds 1 more Best 187 | #predictor to the Next model, but due to this the variance of 188 | #the Models increases with more predictors and it starts 189 | #to Overfit the training data and performs poorly on Test Set. 190 | #Also in the Plot we can see the Train error is least for Models with 191 | # highest TEST ERROR , this is OVERFITTING due to high Model variances 192 | 193 | 194 | 195 | #function to compute Error and predictions on test set 196 | predict.regsubset<-function(object,newdata,id,...) 197 | { 198 | form=as.formula(object$call[[2]]) 199 | mat<-model.matrix(form,newdata)#Test Set 200 | coefi<-coef(object,id=id)# Model coefficients 201 | #last line are the predicted values 202 | mat[,names(coefi)]%*%coefi 203 | } 204 | 205 | 206 | #CROSS VALIDATION FOR MODEL SELECTION 207 | #10 fold CV 208 | set.seed(10) 209 | #dividing the data set in 10 equal parts 210 | #random rows for each fold , each fold has same size 211 | folds=sample(rep(1:10,length = nrow(Hitters))) 212 | #repeat 1 to 10 ,263 times(size of data set) 213 | folds 214 | table(folds) 215 | 216 | #10 rows of each Model with 19 variables(columns) 217 | cv.errors<-matrix(NA,10,19) 218 | 219 | for(k in 1:10) 220 | { 221 | #fitting the Models on each k-1 folds 222 | best.fit<-regsubsets(Salary ~ . 
, data = Hitters[folds!=k,], 223 | nvmax=19,method='forward') 224 | #predictions on left out Kth fold -Validation Set 225 | for(i in 1:19) 226 | { 227 | pred = predict.regsubset(best.fit,Hitters[folds==k,], 228 | id=i) #predictions on k-th fold 229 | cv.errors[k,i]=mean((Hitters$Salary[folds==k]-pred)^2) 230 | #cross validation error for each K fold Model and its ith submodel 231 | } 232 | 233 | } 234 | 235 | head(cv.errors) 236 | pred#a list of Predicted Salaries for each player for kth fold 237 | #10 Models in total with 19 SubModels for each Model 238 | 239 | #Cross validation Error for each 20 Sub Models 240 | rmse.cv=sqrt(apply(cv.errors,2,mean)) 241 | rmse.cv 242 | plot(rmse.cv,pch=19,type='b',ylim=c(320,400),ylab="Root Mean squared Cross validation Error", 243 | xlab="Number of Predictors") 244 | title("CV error for each Submodel") 245 | 246 | #It tends to prefer Model with 10 and 11 predictors as CV error is least for them 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | -------------------------------------------------------------------------------- /NonLinearModel.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Non Linear Models" 3 | output: 4 | word_document: default 5 | html_notebook: default 6 | pdf_document: default 7 | html_document: default 8 | --- 9 | 10 | 11 | 12 | #### MODELLING NON LINEARITIES in DATA using various Non linear functions- 13 | #### The most basic idea behind adding Non linear properties in the Model is by transforming the data or the variables,alomst every trick transforms the variables to Model Non linearitites. 14 | #### Say Kernel Smoothing techniques , Splines or Step functions etc. 15 | 16 | 17 | 18 | ```{r, message=FALSE,warning=FALSE} 19 | #Package containing the Dataset 20 | require(ISLR) 21 | attach(Wage)#Dataset 22 | 23 | ``` 24 | 25 | --- 26 | 27 | 28 | ## Polynomial Regression 29 | 30 | First we will use polynomials , and focus on only one predictor age. 31 | 32 | 33 | ```{r,message=FALSE,warning=FALSE} 34 | 35 | mod<-lm(wage~poly(age,4),data =Wage) 36 | #Summary of the Model 37 | summary(mod) 38 | 39 | 40 | ``` 41 | 42 | It looks like the *__Quadatric__* coefficient is not Sifgificant.So we can stop tell 3. 43 | 44 | 45 | ### Plotting the Model and Making Predictions- 46 | ```{r fig.width=7, fig.height=6} 47 | #Range of age variable 48 | agelims<-range(age) 49 | #Generating Test Data 50 | age.grid<-seq(from=agelims[1], to = agelims[2]) 51 | #Making Predctions on Test data 52 | pred<-predict(mod,newdata = list(age=age.grid),se=TRUE) 53 | #Standard Error Bands- within 2 Standard Deviations 54 | se.tab<-cbind(pred$fit+2*pred$se.fit,pred$fit - 2*pred$se.fit) 55 | plot(age,wage,col="darkgrey") 56 | #Plotting the age values vs Predicted Wage values for those Ages 57 | lines(age.grid,pred$fit,col="blue",lwd=2) 58 | #To plot the Error bands around the Regression Line 59 | matlines(x=age.grid,y=se.tab,lty =2,col="blue") 60 | 61 | 62 | ``` 63 | 64 | 65 | #### Other Methods to fit polynomials 66 | 67 | This time we are going to wrap the polynimials inside the I() Identity function and 68 | now we are representing the polynomials on a different basis. 
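By default `poly()` builds an *orthogonal* polynomial basis, which is why its coefficients differ from the raw-power fit below even though the fitted curve is identical. As a quick aside, the raw basis can also be requested directly from `poly()`; `fit2b` is just an illustrative name:

```{r}
#raw (non-orthogonal) polynomial basis - same fitted values as the I() model below
fit2b<-lm(wage ~ poly(age,4,raw = TRUE),data = Wage)
```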
69 | 70 | ```{r} 71 | #This time we will use different basis of polynomials 72 | fit2<-lm(wage ~ age + I(age^2) + I(age^3) + I(age^4),data = Wage) 73 | summary(fit2) 74 | 75 | plot(fitted(mod),fitted(fit2),xlab="First Polynomial Model",ylab="Polynomial Model wrapped inside Identity function", main="Fitted values of Both models are exactly same") 76 | 77 | 78 | 79 | 80 | 81 | ``` 82 | 83 | *__We can notice that the coefficients and the summary is different though we have used the same degree of polynomials and this is merely due to the different representations of the polynomils using Identity I() function.__* 84 | 85 | *Things we are interested in is the Fitted polynomial and we can notice that the 86 | fitted values of both The model above and this Model has not changed.* 87 | 88 | 89 | 90 | ---- 91 | 92 | ### Now we will use anova() to test different Models with different Predictors 93 | 94 | ```{r} 95 | #Making Nested Models-i.e Each Next Model includes previous Model and is a special case for previous one 96 | mod1<-lm(wage ~ education , data = Wage) 97 | mod2<-lm(wage ~ education + age,data = Wage) 98 | mod3<-lm(wage ~ education + age + poly(age,2),data = Wage) 99 | mod4<-lm(wage ~ education + age + poly(age,3),data = Wage) 100 | #using anova() function 101 | anova(mod1,mod2,mod3,mod4) 102 | BIC(mod1,mod2,mod3,mod4) 103 | 104 | 105 | ``` 106 | 107 | Seeing the Above values,Model 4 which is the most Complex one is the most Insignificant Model as the p-values indicate.Though the RSS value of Model 4 is least,and this is a expected as it fitting data too *__hard(Overfitting)__*. 108 | 109 | Model2 and Model3 are the best ones and seem to balance the Bias-Variace Tradeoffs. 110 | 111 | --- 112 | 113 | 114 | ### Polynomial Logistic Regression 115 | 116 | ```{r} 117 | #Logistic Regression Model the Binary Response variable; 118 | logmod<-glm(I(wage > 250 ) ~ poly(age,3),data = Wage , family = "binomial") 119 | summary(logmod) 120 | #doing Predictions 121 | pred2<-predict(logmod,newdata = list(age=age.grid),se=TRUE) 122 | #Standard Error Bands 123 | #a Matrix with 3 columns 124 | #Confidence intervals 125 | se.band<-pred2$fit + cbind(fit=0,lower=-2*pred2$se.fit , upper = 2*pred2$se.fit ) 126 | se.band[1:5,] 127 | 128 | 129 | ``` 130 | We have done computations on the Logit scale , to convert it to probabilities we will use LateX language which is used in tysetting Mathematical formulas- 131 | 132 | This is the formula to compute the probabilities 133 | $$p=\frac {e^\eta}{1 + e^\eta}.$$ 134 | 135 | 136 | ```{r} 137 | #comuting the 95% confidence interval for the Fitted Probabilities value 138 | prob.bands = exp(se.band)/ (1 + exp(se.band)) 139 | matplot(age.grid,prob.bands,col="blue",lwd = c(2,2,2),lty=c(1,2,2), 140 | type="l",ylim=c(0,.1),xlab="Age",ylab="Probability Values") 141 | 142 | #jitter() function to uniformly add random noise to properly see the densities 143 | points(jitter(age),I(wage > 250)/10 , pch="I",cex=0.5) 144 | 145 | 146 | ``` 147 | 148 | The *__blue dotted lines__* represent the 95% Confidence Interval of the fitted Probabilities. 149 | 150 | The black dots are the actual Probability values for Binary Response Wage, i.e 151 | if wage > 250 is true then 1(TRUE) ,otherwise 0(FALSE). 
152 | -------------------------------------------------------------------------------- /NonLinearModel.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/NonLinearModel.pdf -------------------------------------------------------------------------------- /NonLinearModels.R: -------------------------------------------------------------------------------- 1 | #MODELLING NON LINEARITIES in DATA using various Non linear functions 2 | #The most basic idead behind adding Non linear properties in the Model is by transforming the 3 | #data or the variables , alomst every trick transforms the variables to Model Non linearitites 4 | #Say Kernel Smoothing techniques , Splines or Step functions etc 5 | 6 | 7 | require(ISLR) 8 | attach(Wage) 9 | 10 | mod<-lm(wage~poly(age,4),data =Wage) 11 | #Summary of the Model 12 | summary(mod) 13 | #Range of age variable 14 | agelims<-range(age) 15 | #Generating Test Data 16 | age.grid<-seq(from=agelims[1], to = agelims[2]) 17 | #Making Predctions on Test data 18 | pred<-predict(mod,newdata = list(age=age.grid),se=TRUE) 19 | #Standard Error Bands- within 2 Standard Deviations 20 | se.tab<-cbind(pred$fit+2*pred$se.fit,pred$fit - 2*pred$se.fit) 21 | plot(age,wage,col="darkgrey") 22 | #Plotting the age values vs Predicted Wage values for those Ages 23 | lines(age.grid,pred$fit,col="blue",lwd=2) 24 | #To plot the Error bands around the Regression Line 25 | matlines(x=age.grid,y=se.tab,lty =2,col="blue") 26 | 27 | 28 | #This time we will use different basis of polynomials 29 | fit2<-lm(wage ~ age + I(age^2) + I(age^3) + I(age^4),data = Wage) 30 | summary(fit2) 31 | 32 | plot(fitted(mod),fitted(fit2),xlab="First Polynomial Model",ylab="Polynomial Model wrapped inside Identity function", main="Fitted values of Both models are exactly same") 33 | 34 | 35 | #Making Nested Models-i.e Each Next Model includes previous Model and is a special case for previous one 36 | mod1<-lm(wage ~ education , data = Wage) 37 | mod2<-lm(wage ~ education + age,data = Wage) 38 | mod3<-lm(wage ~ education + age + poly(age,2),data = Wage) 39 | mod4<-lm(wage ~ education + age + poly(age,3),data = Wage) 40 | #using anova() function 41 | anova(mod1,mod2,mod3,mod4) 42 | BIC(mod1,mod2,mod3,mod4) 43 | 44 | 45 | #Logistic Regression Model the Binary Response variable; 46 | logmod<-glm(I(wage > 250 ) ~ poly(age,3),data = Wage , family = "binomial") 47 | summary(logmod) 48 | #doing Predictions 49 | pred2<-predict(logmod,newdata = list(age=age.grid),se=TRUE) 50 | #Standard Error Bands 51 | #a Matrix with 3 columns 52 | #Confidence intervals 53 | se.band<-pred2$fit + cbind(fit=0,lower=-2*pred2$se.fit , upper = 2*pred2$se.fit ) 54 | se.band[1:5,] 55 | 56 | #comuting the 95% confidence interval for the Fitted Probabilities value 57 | prob.bands = exp(se.band)/ (1 + exp(se.band)) 58 | matplot(age.grid,prob.bands,col="blue",lwd = c(2,2,2),lty=c(1,2,2), 59 | type="l",ylim=c(0,.1),xlab="Age",ylab="Probability Values") 60 | 61 | #jitter() function to uniformly add random noise to properly see the densities 62 | points(jitter(age),I(wage > 250)/10 , pch="I",cex=0.5) 63 | 64 | -------------------------------------------------------------------------------- /Notes/LINEAR REGRESSION.docx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Notes/LINEAR REGRESSION.docx -------------------------------------------------------------------------------- /Notes/Models.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Notes/Models.docx -------------------------------------------------------------------------------- /PCA-notebook.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Principal Components Analysis" 3 | author: "Anish Singh Walia" 4 | output: 5 | html_document: default 6 | html_notebook: default 7 | --- 8 | 9 | ##Unsupervised Learning 10 | 11 | __Unsupervised__ learning is a machine learning technique in which the dataset has no target variable or no response value-$Y$.The data is unlabelled. 12 | Simply saying,there is no target value to supervise the learning process of a learner unlike in __Supervised__ learning where we had training examples which had both input variables $X_i$ and target variable-$Y$ -{$(x_i,y_i)$} and by looking and learning from the training examples the learner used to generate a *mapping* function(also called a __hypothesis__) $f : x_i-> y$ which mapped $x_i$ values to $y$ and learned the relationship between input variables and target variable so that we could generalize it to some random unseen test examples and predict the target value. 13 | 14 | The best example of unsupervised learning is when a small child given some unlabelled pictures of cats and dogs , so by only looking at the structural similarities and disimilarities between the images , he classifies one as a dog and other as cat. 15 | 16 | There are lots of examples of unsupervised learning around us. 17 | 18 | __Unsupervised learning is mostly used as a preprocessing tool for supervised learning__.e.g-like PCA could be used to select a linear combination of predictors-$X_i$ which explains the most variability in the data , and reduce a high-dimentional dataset to a lower dimentional view with only most relevant and important features which can be used as inputs in a supervised learning model. 19 | 20 | e.g If we have a dataset with 100 predictors and we wanted to generate a model,it would be highly inefficient to use all those 100 predictors because that would increase the variance and complexity of the model and which in turn would lead to __overfitting__.Instead what PCA does is find 10 most correlated variables and linearly combine them to generate a principal component -$Z_1$. 21 | 22 | ---------------------- 23 | 24 | 25 | ##Principal Components Analysis 26 | 27 | PCA introduces a lower-dimentional representation of the dataset.It finds a sequence of linear combination of the variables called the principal components-$Z_1,Z_2...Z_m$ that explain the maximum variance in the data and are mutually uncorrelated. 28 | 29 | What we try to do is find most relevant set of variables and simply linearly combine the set of variables into a single variable-$Z_m$. 30 | 31 | 1)The first principal component $PC_1$ has the highest variance across data. 32 | 33 | 2)The second principal component $PC_2$ is uncorrelated with $PCA_1$ which also has high variance. 
34 | 35 | We have tons of correlated variables in a high dimentional dataset and what PCA tries to do is pair and combine them to a set of some important variables that summarize all information in the data. 36 | 37 | PCA will give us new set of variables called principal components which could be further be used as inputs in a supervised learning model. 38 | So now we have lesser and most important set of variables paired together to form a new single variable which explains most variance in data. 39 | This technique is often termed as __Dimentionality Reduction__ which is famous technique to do feature selection and use only relevant features in the Model. 40 | 41 | 42 | ###Details 43 | 44 | 1) we have a set of input column vectors $x_1,x_2,x_3.....x_p$ with $n$ observations in dataset. 45 | 46 | 2) The $1^{st}$ principal component $Z_1$ of a set of features is the __normalized linear combination__ of the features $x_1,x_2....x_p$. 47 | $$Z_1=z_{i1} = \sum_{i=1}^n \phi_{11}x_1 + \phi_{{21}}x_2 + \phi_{31}x_3 + .........\phi_{pi}x_p $$, 48 | where n=no of observations, p = number of variables.It is a linear combination to find out the highest variance across data. 49 | By normalized I mean $\sum_{j=1}^{p} \phi_{j1}^2 = 1$. 50 | 51 | 3) We refer to the weights $\phi_{pi}$ as __Loadings__.The loadings make up the principal components loading vector. 52 | $$\phi_1 = (\phi_{11},\phi_{21},\phi_{31}......,\phi_{p1})^T$$ is the loadings vector for $PC_1$. 53 | 54 | 4) We constrain the loadings so that their sum of squares could be 1 , as otherwise setting these elements to be arbitarily large in absolute value could result in an arbitarily large variance. 55 | 56 | 57 | 58 | The first Principal component solves the below optimization problem of maximizing variance across the components-- 59 | 60 | $$maximize: \frac{1}{n} \sum_{i=1}^n \sum_{j=1}^p (\phi{ji}.X_{ij})^2 subject \ to \sum_{j=1}^p \phi_{ji}^2=1 $$ 61 | Here each principal component has mean 0. 62 | 63 | The above problem can be solved via Single value decomposition of matrix $X$ ,which is a standard technique in linear algebra. 64 | 65 | 66 | Enough maths now let's start implementing PCA in R. 67 | 68 | -------------------------------------- 69 | 70 | We will use USAarrests data 71 | 72 | ##Implementing PCA in R 73 | 74 | ```{r} 75 | ?USArrests 76 | #dataset which contains Violent Crime Rates by US State 77 | dim(USArrests) 78 | dimnames(USArrests) 79 | 80 | 81 | ``` 82 | 83 | ```{r} 84 | #finding mean of all 85 | apply(USArrests,2,mean) 86 | apply(USArrests,2,var) # 87 | 88 | 89 | ``` 90 | 91 | There is a lot of difference in variances of each variables. In PCA mean does not playes a role , but variance plays a major role in defining PC so very large differences in variance value of a variable will definately dominate the PC. 92 | We need to standardize the variable so as to get mean $\mu=0$ and variance $\sigma^2=1$. 93 | To standardize we use formula$x' = \frac{x - mean(x)}{sd(x)}$. 94 | 95 | The function prcomp() will do the needful of standardizing the variables. 96 | 97 | 98 | ```{r} 99 | pca.out<-prcomp(USArrests,scale=TRUE) 100 | pca.out 101 | #summary of the PCA 102 | summary(pca.out) 103 | names(pca.out) 104 | 105 | 106 | ``` 107 | 108 | Now as we can see maximum % of variance is explained by PC1 , and all PCs are mutually uncorrelated.Around 62 % of variance is explained by $PC_1$. 109 | 110 | 111 | Let's build a biplot to understand better. 
112 | 113 | ```{r} 114 | biplot(pca.out,scale = 0, cex=0.65) 115 | 116 | ``` 117 | 118 | 119 | Now in the above plot red colored arrows represent the variables and each direction represent the direction which explains the most variation. 120 | eg for all the countries in the direction of 'UrbanPop' are countries with most urban-population and opposite to tht direction are the countries with least . 121 | So this is how we interpret our Biplot. 122 | 123 | 124 | 125 | -------------- 126 | 127 | 128 | ##Conclusion 129 | 130 | PCA is a great preprocessing tool for picking out the most relevant linear combination of variables and use them in our predictive model.It helps us find out the variables which explain the most variation in the data and only use them.PCA plays a major role in the data analysis process before going for advanced analytics.PCA only looks the input variables and them pair them. 131 | 132 | The only __drawback__ PCA has is that it generates the principal components in a __unsupervised__ manner i.e without looking the __target__ values ,hence the principal components which explain the most variation in dataset without target-$Y$ variable,may or may not explain good percentage of variance in the response variable$Y$ which could affect the perfomance of the predictive model. 133 | 134 | Hope you guys liked the article , make sure to like and share it. 135 | Happy coding!! -------------------------------------------------------------------------------- /PCA.R: -------------------------------------------------------------------------------- 1 | #unsupervised learning -PCA on UDArrests data set 2 | 3 | #we will use USAarrests data 4 | 5 | ?USArrests 6 | #dataset which contains Violent Crime Rates by US State 7 | dim(USArrests) 8 | dimnames(USArrests) 9 | 10 | #finding mean of all 11 | apply(USArrests,2,mean) 12 | apply(USArrests,2,var) #there is a lot of difference in variances of each variable 13 | #in PCA mean does not playes a role , but variance plays a major role in defining PC 14 | #so very large differences in VAR value of a variable will definately dominate the PC. 15 | 16 | #So will have to standardize the variables to Unit variance and SD 17 | #It is done via (X -mean(X) / sd(X) ) 18 | 19 | #using prcomp() function to do so and get PC. 20 | 21 | 22 | pca.out<-prcomp(USArrests,scale=TRUE) 23 | pca.out 24 | #summary of the PCA 25 | summary(pca.out) 26 | #maximum % of variance is explained by PC1 , and all PCs are mutually uncorrelated. 
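#Added sketch (not in the original script): the proportion of variance explained
#can also be computed by hand from the standard deviations returned by prcomp()
pve<-pca.out$sdev^2/sum(pca.out$sdev^2)
pve         #proportion of variance explained by each PC
cumsum(pve) #cumulative proportion, matching the summary(pca.out) output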
27 | names(pca.out) 28 | 29 | #Biplot of the Principal Components which explains the variance in data in each direction 30 | #of the variable 31 | biplot(pca.out,scale = 0, cex=0.65) 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /Plots/10 fold CV for Model Selection in Forward Stepwise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Plots/10 fold CV for Model Selection in Forward Stepwise.png -------------------------------------------------------------------------------- /Plots/10 foldcverror.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Plots/10 foldcverror.png -------------------------------------------------------------------------------- /Plots/5 fold vs 10 fold CV error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Plots/5 fold vs 10 fold CV error.png -------------------------------------------------------------------------------- /Plots/Backward stepwise Selection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Plots/Backward stepwise Selection.png -------------------------------------------------------------------------------- /Plots/Forward Stepwise Selection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Plots/Forward Stepwise Selection.png -------------------------------------------------------------------------------- /Plots/LOOCV.errors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Plots/LOOCV.errors.png -------------------------------------------------------------------------------- /Plots/Lasso-Cross Validaation and Model Selection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Plots/Lasso-Cross Validaation and Model Selection.png -------------------------------------------------------------------------------- /Plots/Lasso-Validation Set Implementation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Plots/Lasso-Validation Set Implementation.png -------------------------------------------------------------------------------- /Plots/Model Selection Best Subset Cp statistic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Plots/Model Selection Best Subset Cp statistic.png 
-------------------------------------------------------------------------------- /Plots/Number of variables vs Cp and Adjs R2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Plots/Number of variables vs Cp and Adjs R2.png -------------------------------------------------------------------------------- /Plots/PRINCIPAL_COMPONENTSPLOT.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Plots/PRINCIPAL_COMPONENTSPLOT.png -------------------------------------------------------------------------------- /Plots/Quality of Model statistics plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Plots/Quality of Model statistics plot.png -------------------------------------------------------------------------------- /Plots/RMS error on TRAIN vs TEST.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Plots/RMS error on TRAIN vs TEST.png -------------------------------------------------------------------------------- /Plots/Ridge Regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Plots/Ridge Regression.png -------------------------------------------------------------------------------- /Plots/bootstrapOutput.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Plots/bootstrapOutput.png -------------------------------------------------------------------------------- /Plots/lasso-Fraction of Deviance explained.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/Plots/lasso-Fraction of Deviance explained.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Statistical-Learning-Techniques 2 | 3 | This is a Statistical Learning repository which will consist of various Learning algorithms and their implementation in R 4 | and their in depth interpretation. Below are the links to the implementation and their in-depth explanation of the learning algorithms in R. All the documents below contain the under-lying mathematical concepts explained with respect to a simple case study in R. 5 | 6 | 7 | ### Topics Covered : 8 | 9 | 1) Supervised Learning 10 | 11 | 2) Model Selection techniques - AIC, BIC, Mallow's Cp , Adjusted R-squared , Cross validation error. 12 | 13 | 3) Shrinkage Methods and Regularization techniques - Ridge Regression , LASSO, L1 norm, L2 norm. 
14 | 15 | 4) Non-linear Regression and parametric models 16 | 17 | 5) Non-parametric model - K-nearest neighbor algorithm 18 | 19 | 6) Tree based Modelling - Decision Trees 20 | 21 | 7) Bayesian Modelling technique : Naive Bayes algorithm. 22 | 23 | 8) Ensemble learning - Random Forests, Gradient Boosting , Bagging. 24 | 25 | 9) Re-sampling methods and Cross Validation 26 | 27 | 10) Unsupervised learning 28 | 29 | 30 | 31 | 32 | ### First Document published on Polynomial Regression Tecnhiques which adds Non linearities to the Model and makes the model learn Non linear Functions. 33 | 34 | http://rpubs.com/anish20/polynomialRegression 35 | 36 | 37 | --- 38 | 39 | ### Second Document on step by step Implementation of Cubic Splines and Smoothing Splines in R and how they transform the variables by adding Truncated Basis functions b(X) and how it is better from Polynomial Regression, to learn Non linear Functional Mappings from X(Predictors) to Y(Target) 40 | 41 | 42 | http://rpubs.com/anish20/Splines 43 | 44 | ---- 45 | 46 | 47 | ### Third Document on implementing Generalized Additive Models in R and their overview 48 | 49 | http://rpubs.com/anish20/GeneralizedAdditiveModelsinR 50 | 51 | ---- 52 | 53 | ### Fourth document on Implementing Decision Trees in R using the 'tree' package 54 | 55 | http://rpubs.com/anish20/decisionTreesinR 56 | 57 | 58 | ---- 59 | 60 | ### Fifth Article which Explains the concept of Random Forests and how to Implement it in R 61 | 62 | http://rpubs.com/anish20/RandomForests 63 | 64 | ---- 65 | 66 | ### Article on Support Vector Machine in R 67 | 68 | http://rpubs.com/anish20/svmR 69 | 70 | 71 | -------- 72 | 73 | ### Article on Radial Kernel Support vector classifier 74 | 75 | http://rpubs.com/anish20/radialSVM 76 | 77 | 78 | -------- 79 | 80 | ### Article on implementing PCA in R and the maths behind it. 81 | 82 | http://rpubs.com/anish20/PCA 83 | 84 | -------- 85 | 86 | ### Article which explaines K-means clustering algorithm and its implementation in R 87 | 88 | http://rpubs.com/anish20/k-means 89 | -------------------------------------------------------------------------------- /RadialKernelSVM.R: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anishsingh20/Statistical-Learning-using-R/7153e8980c4837fb0b1cfed4135afe7032a79e88/RadialKernelSVM.R -------------------------------------------------------------------------------- /RadialKernelSVMnotebook.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Radial Kernel SVM" 3 | output: 4 | html_document: default 5 | html_notebook: default 6 | --- 7 | 8 | 9 | ###Radial Kernel Support Vector Machine 10 | 11 | This article will be all about how to separate non linear data using a *__non-linear decision boundary__ *, which cannot be simply separated by a linear separator. 12 | 13 | It is often encountered that Linear Separators and boundaries fail because of the non linear interactions in the data and the non linear dependence between the features in feature space. 14 | 15 | The trick here is that ,we will do __feature expansion__. 16 | 17 | So how we solve this problem is via doing a non linear transformation on the features($X_i$) and converting them to a higher dimentional space called a feature space.Now by this transformation we are able to saperate non linear data using a non linear decision boundary. 
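As a tiny illustration of what enlarging the feature space means (a toy sketch, unrelated to the mixture data used later in this article), two predictors can be expanded with squared and interaction terms:

```{r}
#toy example: expanding a 2-D feature space into 5 dimensions
set.seed(1)
toy <- data.frame(x1 = rnorm(5), x2 = rnorm(5))
enlarged <- transform(toy, x1.sq = x1^2, x2.sq = x2^2, x1.x2 = x1 * x2)
enlarged #a linear separator in this enlarged space is non-linear in the original x1,x2
```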
18 | 19 | Non linearities can simply be added by using higher dimention terms such as square and cubic polynomial terms . 20 | 21 | $$y_i = \beta_0 + \beta_1X_{1} \ + \beta_2X_{1}^2 + \beta_3X_2 + \beta_4X_2^2 + \beta_5X_2^3 .... = 0 $$ is the equation of the non linear hyperplane which is generated if we use *higher degree polynomials* terms to fit to data to get a non linear decision boundary. 22 | 23 | What we are actually doing is that we are fitting a SVM is an enlarged space.We enlarge the space of features by doing non linear transformations. 24 | 25 | But the problem with __Polynomials__ are that in higher dimentions i.e when having lots of predictors it gets wild and generally overfits at higher degrees of polynomials. 26 | 27 | Hence there is another elegant way of adding non linearities in SVM is by the use of *__Kernel trick__*. 28 | 29 | ----------------- 30 | 31 | 32 | ####Kernel Function 33 | 34 | Kernel function is a function of form-- 35 | $$K(x,y) = (1 + \sum_{j=1}^{p} x_{ij}. y_{ij})^d$$ where d = degree of polynomial. 36 | 37 | Now the type of Kernel function we are going to use here is a __Radial kernel__. 38 | 39 | The radial kernel is of form: 40 | $$k(x,y) = \exp(- \ \gamma \ \sum_{j=1}^{p}(x_{ij} - y_{ij})^2) $$ 41 | Here $\gamma$ is a hyper parameter or a __tuning parameter__ which accounts for the smoothness of the decision boundary and controls the variance of the model. 42 | 43 | If $\gamma$ is very large then we get quiet fluctuating and wiggly decision boundaries which accounts for high variance and overfitting. 44 | 45 | If $\gamma$ is small , the decison line or boundary is smoother and has low variance. 46 | 47 | 48 | -------------- 49 | 50 | 51 | ### Implementation in R 52 | 53 | 54 | ```{r,message=FALSE,warning=FALSE} 55 | require(e1071) 56 | require(ElemStatLearn)#package containing the dataset 57 | 58 | #Loading the data 59 | attach(mixture.example) #is just a simulated mixture data with 200 rows and 2 classes 60 | names(mixture.example) 61 | 62 | ``` 63 | 64 | The following data is also 2-D , so lets plot it. 65 | ```{r} 66 | plot(x,col=y+3) 67 | #converting data to a data frame 68 | data<-data.frame(y=factor(y),x) 69 | head(data) 70 | ``` 71 | Now let's fit a Radial kernel using *svm()* function. 72 | 73 | ```{r} 74 | Radialsvm<-svm(factor(y) ~ .,data=data,kernel="radial",cost=5,scale=F) 75 | Radialsvm 76 | #number of support vectors are 110 77 | 78 | #Confusion matrix to ckeck the accuracy 79 | table(predicted=Radialsvm$fitted,actual=data$y) 80 | #misclassification Rate 81 | mean(Radialsvm$fitted!=data$y)*100 #17% wrong predictions 82 | 83 | 84 | ``` 85 | 86 | Now let's create a grid and make prediction on that grid values. 87 | 88 | ```{r} 89 | xgrid=expand.grid(X1=px1,X2=px2) #generating grid points 90 | 91 | ygrid=predict(Radialsvm,newdata = xgrid) #ygird consisting of predicted Response values 92 | 93 | #lets plot the non linear decision boundary 94 | plot(xgrid,col=as.numeric(ygrid),pch=20,cex=0.3) 95 | points(x,col=y+1,pch=19) #we can see that the decision boundary is non linear 96 | 97 | 98 | ``` 99 | 100 | 101 | Now we can also improve the fit , by actually including the decision boundary using the contour() function. 
102 | 103 | ```{r} 104 | func = predict(Radialsvm,xgrid,decision.values = TRUE) 105 | func=attributes(func)$decision #to pull out all the attributes and use decision attr 106 | plot(xgrid,col=as.numeric(ygrid),pch=20,cex=0.3) 107 | points(x,col=y+1,pch=19) 108 | contour(px1,px2,matrix(func,69,99),level=0,add=TRUE,lwd=3) #adds the non linear decision boundary 109 | contour(px1,px2,matrix(prob,69,99),level=0.5,add=T,col="blue",lwd=3)#this is the true decision boundary i.e Bayes decision boundary 110 | legend("topright",c("True Decision Boundary","Fitted Decision Boundary"),lwd=3,col=c("blue","black")) 111 | 112 | 113 | 114 | ``` 115 | The above plot shows us the tradeoff between the __True Bayes decision boundary__ and the __Fitted decision boundary__ generated by the Radial kernel by learning from data.Both look quiet similar and seems that SVM has done a good functional approximation of the actual true function. 116 | 117 | 118 | 119 | 120 | -------------- 121 | 122 | 123 | ### Conclusion 124 | 125 | Radial kernel support vector machine is a good approch when the data is not linearly separable.The idea behind generating non linear decision boundaries is that we need to do some non linear transformations on the features $X_i$ which transforms them to a higher dimention space.We do this non linear transformation using the *__Kernel trick__*.Now there are 2 hyperparameters in the SVM i.e the regularization parameter __'c'__ and $\gamma$.We can implement cross validation to find the best values of both these tuning parameters which affect our classifier's $C(X)$ perfomance.Another way of finding the best value for these hyperparameters are by using certain optimization techniques such as *__Bayesian Optimization__*. 126 | 127 | -------------------------------------------------------------------------------- /RandomForestEnsemble.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Random Forests in R" 3 | output: 4 | html_document: default 5 | html_notebook: default 6 | --- 7 | 8 | 9 | ## Random Forests 10 | 11 | Random Forest is a __Ensembling__ technique which is similar to a famous Ensemble technique called *__Bagging__* but a different tweak in it. In Random Forests the idea is to __decorrelate__ the several trees which are generated on the different bootstrapped samples from training Data.And then we simply reduce the Variance of the Trees by averaging them. 12 | 13 | Averaging the Trees helps us to reduce the variance and also improve the Perfomance of Decision Trees on Test Set and eventually avoid Overfitting. 14 | 15 | The idea is to build a lots of Trees in such a way so as to make the *Correlation* between the Trees smaller. 16 | 17 | Another major difference is that we only consider a Random subset of predictors $m$ each time we do a split on training examples.Whereas usually in Trees we consider all the predictors while doing a split and choose best amongst them. Typically \(m=\sqrt{p}\) where $p$ are the number of predictors. 18 | 19 | Now it seems crazy to throw away lots of predictors but it actually makes sense because the effect of doing so is that each tree uses different predictors to split data at different times. 20 | 21 | *So by doing this trick of throwing away Predictors, we have decorrelated the Trees and the resulting average seems a little better. 
* 22 | 23 | 24 | 25 | ----- 26 | 27 | 28 | ## Implementing Random Forests in R 29 | 30 | Loading the Packages 31 | 32 | ```{r,warning=FALSE,message=FALSE} 33 | require(randomForest) 34 | require(MASS)#Package which contains the Boston housing dataset 35 | 36 | dim(Boston) 37 | attach(Boston) 38 | set.seed(101) 39 | 40 | ``` 41 | 42 | 43 | #####Saperating Training and Test Sets 44 | 45 | We will use 300 samples in Training Set 46 | 47 | ```{r} 48 | #training Sample with 300 observations 49 | train<-sample(1:nrow(Boston),300) 50 | ?Boston 51 | 52 | ``` 53 | 54 | We are going to use variable $'medv'$ as the Response variable , which is the Median Housing Value. 55 | We will fit 500 Trees. 56 | ```{r} 57 | Boston.rf<-randomForest(medv ~ . , data = Boston , subset = train) 58 | Boston.rf 59 | ``` 60 | The above MSE and Variance explained are actually calculated using *Out of Bag Error Estimation*.In this $\frac23$rd of Training data is used in tranining and the reamining $\frac13$ are used to Validate the Trees.Also the number of variable randomly selected at each split are 4. 61 | 62 | 63 | Plotting the Random Forests 64 | ```{r} 65 | plot(Boston.rf) 66 | ``` 67 | 68 | 69 | 70 | ----- 71 | 72 | 73 | ### Now we can compare the Out of Bag Sample Errors and Error on Test set 74 | 75 | The above Random Forest model chose Randomly 4 variables to be considered at each split. 76 | We could now try all possible 13 predictors which can be considered at each split. 77 | 78 | ```{r} 79 | oob.err<-double(13) 80 | test.err<-double(13) 81 | 82 | #mtry is no of Variables randomly chosen at each split 83 | for(mtry in 1:13) 84 | { 85 | rf=randomForest(medv ~ . , data = Boston , subset = train,mtry=mtry,ntree=400) 86 | oob.err[mtry] = rf$mse[400] #Error of all Trees fitted 87 | 88 | pred<-predict(rf,Boston[-train,]) #Predictions on Test Set for each Tree 89 | test.err[mtry]= with(Boston[-train,], mean( (medv - pred)^2)) #Mean Squared Test Error 90 | 91 | cat(mtry," ") 92 | 93 | } 94 | 95 | test.err 96 | oob.err 97 | 98 | ``` 99 | 100 | 101 | What happens is that 13 times 400 Trees have been grown. 102 | 103 | 104 | 105 | --------- 106 | 107 | 108 | 109 | ###Comparing both Test Error and Out of Sample Estimation for Random Forests 110 | 111 | ```{r} 112 | 113 | matplot(1:mtry , cbind(oob.err,test.err), pch=19 , col=c("red","blue"),type="b",ylab="Mean Squared Error",xlab="Number of Predictors Considered at each Split") 114 | legend("topright",legend=c("Out of Bag Error","Test Error"),pch=19, col=c("red","blue")) 115 | 116 | ``` 117 | 118 | 119 | Now what we observe is that the Red line is the Out of Bag Error Estimates and the Blue Line is the Error calculated on Test Set.Both curves are quiet smooth and the error estimates are somewhat correlated too. 120 | The Error Tends to be minimized at around $mtry = 4$. 121 | 122 | On the Extreme Right Hand Side of the above Plot we considered all possible 13 predictors at each Split which is __Bagging__. 123 | 124 | 125 | ---- 126 | 127 | 128 | 129 | ### Conclusion 130 | 131 | 132 | Random Forests are a very Nice technique to fit a Stronger Model by averaging Lots of Trees and reduicing the Variance and avoiding Overfitting in Trees build on Training Data.Decision Trees themselves are bad in Prediction on test set,but when used with Ensembling Techniques like Bagging , Random Forests etc their Predictive perfomance are improved a lot. 
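As a final footnote to the comparison above, plain __Bagging__ is just the special case where all 13 predictors are considered at every split. A minimal sketch (reusing the `train` sample defined earlier in this document) would be:

```{r}
#bagging = a random forest with mtry equal to the total number of predictors
Boston.bag <- randomForest(medv ~ . , data = Boston , subset = train , mtry = 13 , ntree = 400)
Boston.bag
```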
133 | 134 | 135 | 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /RandomForests.R: -------------------------------------------------------------------------------- 1 | #ENSEMBLE LEARNING in R 2 | require(randomForest) 3 | require(MASS)#Package which contains the Boston housing dataset 4 | 5 | dim(Boston) 6 | attach(Boston) 7 | set.seed(101) 8 | 9 | 10 | #training Sample with 300 observations 11 | train<-sample(1:nrow(Boston),300) 12 | ?Boston 13 | 14 | Boston.rf<-randomForest(medv ~ . , data = Boston , subset = train) 15 | Boston.rf 16 | 17 | plot(Boston.rf) 18 | 19 | 20 | oob.err<-double(13) 21 | test.err<-double(13) 22 | 23 | #mtry is no of Variables randomly chosen at each split 24 | for(mtry in 1:13) 25 | { 26 | rf=randomForest(medv ~ . , data = Boston , subset = train,mtry=mtry,ntree=400) 27 | oob.err[mtry] = rf$mse[400] #Error of all Trees fitted 28 | 29 | pred<-predict(rf,Boston[-train,]) #Predictions on Test Set for each Tree 30 | test.err[mtry]= with(Boston[-train,], mean( (medv - pred)^2)) #Mean Squared Test Error 31 | 32 | cat(mtry," ") 33 | 34 | } 35 | 36 | test.err 37 | oob.err 38 | 39 | 40 | matplot(1:mtry , cbind(oob.err,test.err), pch=19 , col=c("red","blue"),type="b",ylab="Mean Squared Error",xlab="Number of Predictors Considered at each Split") 41 | legend("topright",legend=c("Out of Bag Error","Test Error"),pch=19, col=c("red","blue")) 42 | 43 | -------------------------------------------------------------------------------- /Regression.R: -------------------------------------------------------------------------------- 1 | # LINEAR REGRESSION STATSLEARN 2 | 3 | 4 | 5 | library(MASS) 6 | 7 | install.packages('ISLR') 8 | library(ISLR) 9 | #package for Datasets 10 | 11 | library(ggplot2) 12 | 13 | names(Boston) 14 | 15 | 16 | #PART-1 DESCRIPTIVE ANALYSIS OF DATA- using Mean,medians,summaries, plotting and Visualizations 17 | 18 | #Plotting Data first -finding relations b/w the variables in the Dataset and analysing those variables 19 | # and knowing the dataset inside out 20 | ?Boston 21 | plot1<-ggplot(aes(x = lstat, y = medv),data = Boston) + 22 | geom_point() + 23 | geom_smooth(method = 'lm') 24 | #inverse relation b/w the variables-as the lower status population % increases the Median sallaries decreases 25 | 26 | 27 | 28 | #PREDICTIVE MODELLING -PART2 29 | 30 | # LINEAR MODEL1 31 | mod1<-lm(medv ~ lstat , data = Boston) 32 | 33 | summary(mod1) 34 | #Significant p-values and t-values showing a negetive relation b/w X and Y 35 | plot(lstat ~ medv ,data = Boston) 36 | abline(mod1,col='red' ) #fitting the model to the Plot 37 | 38 | 39 | #Model components such as residuals , fitted Y values etc 40 | names(mod1) 41 | summary(resid(mod1)) 42 | #RESIDUALS SHOULD ALWAYS BE NORMALLY DISTRIBUTED I.E BELL SHAPED 43 | hist(resid(mod1)) 44 | #USING GGPLOT2 syntax 45 | ggplot(aes(x = residuals(mod1)),data = mod1)+ 46 | geom_histogram(binwidth=5) 47 | 48 | 49 | #confidence intervals for each regression coefficients 50 | confint(mod1) 51 | 52 | #Predictions and Genrelizations 53 | predict(mod1 , data.frame(lstat = c(5,30,10))) 54 | BIC(mod1) 55 | 56 | par(mfrow=c(2,2)) 57 | #plotting the Linear model 58 | plot(mod1) 59 | 60 | 61 | #Multiple Regression 62 | mod2<-lm(medv ~ ., data =Boston) 63 | #including all the sttr as predictors 64 | #Backward Model Selection technique 65 | 66 | 67 | mod2 68 | summary(mod2) 69 | #Age ,indus variable not significant when all variables included which says tha 70 | # there is correlations of 
these variables with other variables 71 | 72 | AIC(mod2,mod1) 73 | 74 | #Updatiing the Model- and removing irrelevetn features(inputs) 75 | mod3<-update(mod2, ~. - age - indus) 76 | summary(mod3) 77 | 78 | #interaction between variables 79 | mod4<-lm(medv ~ lstat*age,data = Boston) 80 | summary(mod4) 81 | 82 | #non-linear Models 83 | mod5<-lm(medv ~ lstat + I(lstat^2),data = Boston); summary(mod5) 84 | 85 | #Plotting The non-linear Models 86 | attach(Boston) 87 | par(mfrow=c(1,1)) 88 | plot(medv~lstat) 89 | #plotting the regression line on the scatterplot-cannot use abline now 90 | points(lstat , fitted(mod5), col='blue', pch=20) 91 | 92 | #Another method of polynomial regression using poly() function 93 | mod6<-lm(medv~poly(lstat,5)) 94 | summary(mod6) 95 | #This model is more complicated and flexible due to higher degree and has lesser training Error 96 | 97 | points(lstat , fitted(mod6),col= 'red', pch=20) 98 | 99 | 100 | #writing a R function 101 | 102 | regplot<-function(x,y,...) { 103 | 104 | plot(x,y,...) 105 | #linear regression Model 106 | mod<-lm(y~x) 107 | #to fit regression line to the scatterplot 108 | abline(mod,col='red') 109 | 110 | } 111 | 112 | 113 | 114 | -------------------------------------------------------------------------------- /Ridge Regression and Lasso-Regularization Techniques.R: -------------------------------------------------------------------------------- 1 | #RIDGE REGRESSION AND LASSO-Model Selection Techniques 2 | 3 | require(glmnet) 4 | ?glmnet 5 | 6 | attach(Hitters) 7 | #Package 'glmnet' does not uses Model Formula language, so we will set up the 8 | #Predictors and Response variable for the Model 9 | 10 | x = model.matrix(Salary~.-1 , data = Hitters)#Predictors 11 | y = Hitters$Salary #Response Variable to be used in Linear Model 12 | 13 | #First we will do Ridge Regression by setting alpha=0 14 | #In Ridge Regression difference is that it includes all the variables p 15 | #in the Models and does not includes a subset of variables. 16 | #So in shrinkage methods we will simply shrink the coefficient value towards 17 | #0 18 | 19 | 20 | ridge<-glmnet(x, y ,alpha = 0)#Ridge Regression Model 21 | summary(ridge) 22 | ridge 23 | #Plotting lambda(Tuning Parameter) vs Cofficient values of variables. 24 | plot(ridge,xvar = "lambda",label = TRUE) 25 | plot(ridge,xvar = "dev",label = TRUE) 26 | #Plots the fraction of deviance Explained-similr to R-squared value 27 | 28 | #In Shrinkage techniques we will Shrink the Cofficient values towards 0 as 29 | #value of lambda increases to reduce the Error value 30 | 31 | #Corss validation with 10 folds 32 | cv.ridge = cv.glmnet(x,y,alpha=0) 33 | plot(cv.ridge)#all 20 variables in the Model-19 predictors + 1 intercept for each 34 | #Plot of MSE on Validation Set vs Lambda(Tuning Parameter) 35 | #Test Error first Decreases as RIDGE REGERSSION decreases the Model's variance 36 | #and complexity but after a point as lambda increases the Bias increases a 37 | #lot which causes Underfitting and again Error goes up. 
38 | #With increase in Lambda the variance of Model decreases at cost of High Bias 39 | 40 | 41 | 42 | #LASSO-Another shrinkage Technique which Does Variable Selection as well as 43 | #shrinks the cofficient value towards 0 and sometimes excatly 0 which 44 | #reduces the Complexity of the Models , The best part is that it selects 45 | #a subset of variables for the Model , unlike RIDGE REGRESSION 46 | #It prefers Sparse Models 47 | 48 | #by default alpha= 0 49 | lasso<-glmnet(x,y) 50 | lasso 51 | plot(lasso,xvar ="lambda",label = T) 52 | plot(lasso,xvar = "dev",label = T) 53 | #At the end there is quiet a big jump in coef values, so it might be 54 | #overfitting 55 | 56 | #again using Cross validation 57 | lasso.cv<-cv.glmnet(x,y) 58 | plot(lasso.cv) 59 | #We get least errors at about 14 predictors and withing 1 SE is Model with 60 | #around 5 predictors 61 | #Finding the coeffienct values for best Model 62 | coef(lasso.cv)#best Model selected is with 5 predictors- within 1 Standard error 63 | # of the minimum 64 | 65 | 66 | 67 | 68 | #Using a Validation Set 69 | 70 | #Train on training Set 71 | lasso.val<-glmnet(x[trainrow,],y[trainrow]) 72 | plot(lasso.val,xvar = 'dev',label = T) 73 | pred=predict(lasso.val,x[-trainrow,])#Predictions on Test set 74 | pred 75 | dim(pred) 76 | rmse = sqrt(apply((y[-trainrow]- pred)^2 ,2 ,mean))#Root mean squared error 77 | #on Validation Set 78 | 79 | plot(log(lasso.val$lambda),rmse,type='b',pch=19,col='red', 80 | ylab="Root Mean Square Error on Validation Set",xlab=("Log of Lambda(Tuning Parameter)")) 81 | title("Lasso Implementation") 82 | #On the Left side of the plot, when lambda is small - it represents Overfitting,high variance 83 | #on the Right Hand side- when lambda is very large it repssents Underfitting,high Bias 84 | #Somewhere in the middle of the plot the Bias Variance is balanced 85 | 86 | #Extracting the Best value of Lambda which gives least Error on Validation Set 87 | lam.best<-lasso.val$lambda[order(rmse)[1]] 88 | lam.best 89 | #order in ascending order of rmse and we want the 1st of that list 90 | lam.best 91 | 92 | #outputs a sparse matrix format 93 | coef(lasso.val,s=lam.best) 94 | #It outputs a Model with 15 predictors which has least RMSE error on Validation Set -------------------------------------------------------------------------------- /SVMNotebook.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Support Vector Machines" 3 | output: 4 | html_notebook: default 5 | html_document: default 6 | --- 7 | 8 | This article will explain how to implement Support Vector Machines in R and their in depth interpretation. 9 | 10 | SVM does not uses any Probability Model as such like other Classifiers use , because it directly looks for a Hyperplane which divides and sagments the data and classes. 11 | 12 | General form of a Hyperplane is : 13 | 14 | $$\beta_0 + \beta_1X_1 + \beta_2X_2 + . . .. . \beta_pX_p = 0 $$ 15 | where $p$ is the number of Dimentions. 16 | 17 | 18 | 1) For $p=2$ i.e for a 2-D space it is a Line. 19 | 20 | 2)The vector $(\beta_1,\beta_2,\beta_3...\beta_p) is \ just \ a \ Normal\ vector.$ A vector in simple terms is just a 1-Dimentional Tensor or a 1-D array. 
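For example, in the simplest case of $p=2$ the hyperplane equation can be rearranged into the familiar slope-intercept form of a straight line (the same algebra is what lets us draw the fitted boundary with `abline()` towards the end of this article):

$$\beta_0 + \beta_1X_1 + \beta_2X_2 = 0 \ \implies \ X_2 = -\frac{\beta_0}{\beta_2} - \frac{\beta_1}{\beta_2}X_1 $$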
21 | 22 | 23 | 24 | __Support Vector Classifiers__ are majorly used for solving a binary clssification problem where we only have 2 class labels say $Y = [-1,1]$ and a bunch of predictors $X_i$ .And what SVM does is that it generates Hyperplanes which in simple terms are just __straight lines or planes__ or are Non-linear curves , and these lines are used to saperate the data or sagment the data into 2 categories or more depending on the type of Classification problem. 25 | 26 | We try to find a __plane__ which saperates the classes in some feature space $X_i$. 27 | 28 | Another concept in SVM is of *Maximal Margin Classifiers*.What it means is that amongst a set of separating hyperplanes SVM aims at finding the one which maximizes the margin $M$.This simply means that we want to maximize the gap or the distance between the 2 classes from the Decision Boundary(separating plane). 29 | 30 | This concept of separating data linearly into 2 different classes using a Linear Separator or a straight linear line is called *__Linear Separability__*. 31 | 32 | The term *__Support Vectors__* in SVM are the data points or training examples which are used to define or maximizing the margin.The support vectors are the points which are close to the decision boundary or on the wrong side of the boundary. 33 | 34 | 35 | 36 | ------------- 37 | 38 | ###Linear SVM Classifier in R 39 | 40 | 41 | ```{r} 42 | set.seed(10023) 43 | #generating data 44 | #a matrix with 20 rows and 2 columns 45 | x=matrix(rnorm(40),20,2) #predictors 46 | x 47 | y=rep(c(-1,1),c(10,10))#Binary response value 48 | x[y==1,]=x[y==1,]+1 #2 classes are [-1,1] 49 | 50 | #plotting the points 51 | plot(x,col=y+2,pch=19) 52 | 53 | ``` 54 | 55 | 56 | 57 | 58 | ------------ 59 | 60 | #### Using the 'e1071' package to fit a SVM classifier 61 | 62 | ```{r,message=FALSE,warning=FALSE} 63 | require(e1071) 64 | #converting to a data frame 65 | data<-data.frame(x,y=as.factor(y)) 66 | head(data) 67 | 68 | svm<-svm(y ~ .,data=data,kernel="linear",cost=10,scale = F) 69 | #here cost 'c' is a tuning parameter .The larger it is more stable the margin becomes, it is like a Regularization parameter 70 | svm 71 | svm$index #gives us the index of the Support Vectors 72 | #so we have 10 support vectors 73 | svm$fitted #to find the fitted values 74 | 75 | #Confusion Matrix of Fitted values and Actual Response values 76 | table(Predicted=svm$fitted,Actual=y) 77 | 78 | #accuracy on Training Set 79 | mean(svm$fitted==y)*100 #has 80 % accuracy on Training Set 80 | 81 | #plotting 82 | plot(svm,data) 83 | ``` 84 | 85 | 86 | We can also create our own plot. 87 | ```{r, message=FALSE, warning=FALSE} 88 | #First Making Grids using a function 89 | make.grid<-function(x,n=75) { 90 | grange=apply(x,2,range) 91 | x1=seq(from=grange[1,1],to=grange[2,1],length=n) 92 | x2=seq(from=grange[1,2],to=grange[2,2],length=n) 93 | expand.grid(X1=x1,X2=x2) #it makes a Lattice for us 94 | } 95 | xgrid=make.grid(x) #is a 75x75 matrix 96 | 97 | #now predicting on this new Test Set 98 | ygrid=predict(svm,xgrid) 99 | 100 | #plotting the Linear Separator 101 | plot(xgrid,col=c("red","blue")[as.numeric(ygrid)],pch=19,cex=.2) 102 | #creates 2 regions 103 | points(x,col=y+3,pch=19) #adding the points on Plot 104 | points(x[svm$index,],pch=5,cex=2) #Highlighting the Support Vectors 105 | ``` 106 | In the above Plot the __Highlighted points__ are the Support Vectors which were used in determining the Decision Boundary. 
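The cost value of 10 above was fixed by hand. One possible way to choose it more systematically (an added sketch, not part of the original fit) is `e1071::tune.svm()`, which cross-validates over a grid of candidate values using 10-fold CV by default:

```{r}
set.seed(1)
tuned <- tune.svm(y ~ ., data = data, kernel = "linear",
                  cost = c(0.01, 0.1, 1, 10, 100))
summary(tuned)        #cross-validation error for each candidate cost
tuned$best.parameters #the cost value with the lowest CV error
```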
107 | 108 | 109 | -------------- 110 | 111 | 112 | ####Extracting the Coefficient values of the Linear SVM equation 113 | 114 | The $\beta$ here are the coefficient values of the SVM model.As it is a Linear SVM classifier, the linear equation is linearly dependent on the predictors $X_1$ and $X_2$. 115 | 116 | 117 | $y_i=f(x,\beta) = \beta_0 + \beta_1.X_1 + \beta_2X_2$ , is the mathematical equation for the Linear SVM classifier. 118 | 119 | 120 | 121 | 122 | ```{r} 123 | beta = drop(t(svm$coefs)%*%x[svm$index,]) 124 | beta 125 | 126 | 127 | ``` 128 | 129 | 130 | ```{r} 131 | beta0=svm$rho 132 | beta0 #the intercept value 133 | #again Plotting 134 | plot(xgrid,col=c("red","blue")[as.numeric(ygrid)],pch=19,cex=.2) 135 | #creates 2 regions 136 | points(x,col=y+3,pch=19) #adding the points on Plot 137 | points(x[svm$index,],pch=5,cex=2) 138 | abline(beta0/beta[2],-beta[1]/beta[2],lty=1)#is the Decision boundary or Plane 139 | #below are for adding the soft margins 140 | abline((beta0-1)/beta[2],-beta[1]/beta[2],lty=2) 141 | abline((beta0+1)/beta[2],-beta[1]/beta[2],lty=2) 142 | ``` 143 | In the above plot the dashed lines are actually the __Soft margins__ which are again margins which include the support vectors within or on them. And how small or wide these soft margins becomes depend on the value of our tuning parameter $c$ which we assigned the value as 10 in the above SVM. 144 | 145 | 146 | --------------- 147 | 148 | 149 | 150 | ###Conclusion 151 | 152 | Support Vector Machines are actually very strong and accurate technique to do Classification.SVM are preferable when the classes are saperated well like in the example we did above we had 10 labels for 1 and 10 for -1.One unique thing about SVMs are that they don't actually follow or use a Conditional Probability Model $Pr(Y | X_i)$ like other classifiers do. 153 | 154 | Linear SVM can not always be useful.Linear SVM can only be used when the data is linearly saperable. 155 | 156 | When the the data is __Non linearly saperable__ i.e has Non linearities in it we need to do Feature Expansion i.e do a Non linear transform to the features to convert to higher dimentions and use a Non linear function $f(x,\beta)$ which is Non linear in predictors $X_i$ to get a Non linear Decision Boundary which saperates the data in an enlarged feature space.An example is __Radial SVMs__ which uses a radial __kernel__. 
157 | -------------------------------------------------------------------------------- /Splines.R: -------------------------------------------------------------------------------- 1 | #loading the Splines Packages 2 | require(splines) 3 | #ISLR contains the Dataset 4 | require(ISLR) 5 | attach(Wage) 6 | 7 | agelims<-range(age) 8 | #Generating Test Data 9 | age.grid<-seq(from=agelims[1], to = agelims[2]) 10 | 11 | #3 cutpoints at ages 25 ,50 ,60 12 | fit<-lm(wage ~ bs(age,knots = c(25,40,60)),data = Wage ) 13 | summary(fit) 14 | #Plotting the Regression Line to the scatterplot 15 | plot(age,wage,col="grey",xlab="Age",ylab="Wages") 16 | points(age.grid,predict(fit,newdata = list(age=age.grid)),col="darkgreen",lwd=2,type="l") 17 | #adding cutpoints 18 | abline(v=c(25,40,60),lty=2,col="darkgreen") 19 | 20 | 21 | 22 | fit1<-smooth.spline(age,wage,df=16) 23 | plot(age,wage,col="grey",xlab="Age",ylab="Wages") 24 | points(age.grid,predict(fit,newdata = list(age=age.grid)),col="darkgreen",lwd=2,type="l") 25 | #adding cutpoints 26 | abline(v=c(25,40,60),lty=2,col="darkgreen") 27 | lines(fit1,col="red",lwd=2) 28 | legend("topright",c("Smoothing Spline with 16 df","Cubic Spline"),col=c("red","darkgreen"),lwd=2) 29 | 30 | 31 | 32 | #Cross Validation to select lambda(Tuning Parameter) for a Model which Minimizes CV Error. 33 | 34 | fit2<-smooth.spline(age,wage,cv = TRUE) 35 | fit2 36 | #It selects $\lambda=6.794596$ it is a Heuristic and can take various values for how rough the function is 37 | 38 | plot(age,wage,col="grey") 39 | #Plotting Regression Line 40 | lines(fit2,lwd=2,col="purple") 41 | legend("topright",("Smoothing Splines with 6.78 df selected by CV"),col="purple",lwd=2) 42 | 43 | -------------------------------------------------------------------------------- /Splines.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Splines" 3 | output: 4 | html_document: default 5 | html_notebook: default 6 | pdf_document: default 7 | --- 8 | 9 | #What are Splines ? 10 | 11 | Splines are used to add __Non linearities__ to a Linear Model and it is a Flexible Technique than Polynomial Regression.Splines Fit the data very *__smoothly__* in most of the cases where polynomials would become wiggly and overfit the training data.Polynomials 12 | tends to get wiggly and fluctuating at the tails which sometimes __overfits__ the training data and doesn't *__generalizes well due to high variance at high degrees of polynomial__.* 13 | 14 | 15 | 16 | ```{r,message=FALSE,warning=FALSE} 17 | #loading the Splines Packages 18 | require(splines) 19 | #ISLR contains the Dataset 20 | require(ISLR) 21 | attach(Wage) 22 | 23 | agelims<-range(age) 24 | #Generating Test Data 25 | age.grid<-seq(from=agelims[1], to = agelims[2]) 26 | 27 | ``` 28 | 29 | --- 30 | 31 | 32 | ### Fitting a Cubic Spline with 3 Knots(Cutpoints) 33 | What It does is that it transforms the Regression Equation by transforming the Variables with a truncated *__Basis__* Function- $$b(x)$$ ,with continious derivatives upto order 2. 34 | 35 | $$\textbf{The order of the continuity}= (d - 1) , \ where \ d \ is \ the \ number \ of \ degrees \ of \ polynomial$$ 36 | 37 | $$\textbf {The Regression Equation Becomes}$$ - 38 | 39 | $$f(x) = y_i = \alpha + \beta_1.b_1(x_i)\ + \beta_2.b_2(x_i)\ + \ .... \beta_{k+3}.b_{k+3}(x_i) \ + \epsilon_i $$ 40 | 41 | $$\bf where \ \bf b_n(x_i)\ is \ The \ Basis \ Function$$. 
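To see what this basis expansion actually produces, here is a small added check (using the `age` variable attached above): `bs()` returns one column per basis function, and a cubic spline with 3 knots needs $K + 3 = 6$ of them.

```{r}
basis <- bs(age, knots = c(25, 40, 60)) #cubic B-spline basis for age
dim(basis)                              #one row per observation, 6 basis columns
head(basis, 3)
```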
42 | 43 | $$\text{The idea here is to transform the variables and add a linear combination of the variables using the Basis power function to the regression function f(x).} $$ 44 | ```{r} 45 | #3 cutpoints at ages 25 ,50 ,60 46 | fit<-lm(wage ~ bs(age,knots = c(25,40,60)),data = Wage ) 47 | summary(fit) 48 | #Plotting the Regression Line to the scatterplot 49 | plot(age,wage,col="grey",xlab="Age",ylab="Wages") 50 | points(age.grid,predict(fit,newdata = list(age=age.grid)),col="darkgreen",lwd=2,type="l") 51 | #adding cutpoints 52 | abline(v=c(25,40,60),lty=2,col="darkgreen") 53 | 54 | ``` 55 | The above Plot shows the smoothing and local effect of Cubic Splines , whereas Polynomias might become wiggly and the tail.*__The cubic splines have continious 1st Derivative and continious 2nd derivative.__* 56 | 57 | -------- 58 | 59 | 60 | ###Smoothing Splines 61 | 62 | These are mathematically more challenging but they are more smoother and flexible as well.It does not require the selection of the number of Knots , but require selection of only a __Roughness Penalty__ which accounts for the wiggliness(fluctuations) and controls the roughness of the function and variance of the Model. 63 | 64 | $$ \text{Let the RSS(Residual Sum of Squares) be} \ { g(x_i) }$$ 65 | $$ \ minimize \ { g \in RSS} :\ \sum\limits_{i=1}^n ( \ y_i \ - \ g(x_i) \ )^2 + \lambda \ \int g''(t)^2 dt , \quad \lambda > 0$$ 66 | $$ \text{ where },\ \lambda \int g''(t)^2 dt \ {is \ called \ the \ Roughness \ Penalty. } $$ 67 | 68 | $$ \textbf {The Roughness Penalty controls how wiggly g(x) is. The smaller the } \lambda , \textbf{ the more wiggly and fluctuating the function is. }$$ 69 | 70 | $$\textbf {As} \ \lambda ,\textbf {approcahes}\ \infty , \textbf {the function} \ g(x) \textbf { becomes linear} . $$ 71 | 72 | In smoothing Splines we have a __Knot__ at every unique value of $x_i$ . 73 | 74 | 75 | ```{r} 76 | fit1<-smooth.spline(age,wage,df=16) 77 | plot(age,wage,col="grey",xlab="Age",ylab="Wages") 78 | points(age.grid,predict(fit,newdata = list(age=age.grid)),col="darkgreen",lwd=2,type="l") 79 | #adding cutpoints 80 | abline(v=c(25,40,60),lty=2,col="darkgreen") 81 | lines(fit1,col="red",lwd=2) 82 | legend("topright",c("Smoothing Spline with 16 df","Cubic Spline"),col=c("red","darkgreen"),lwd=2) 83 | 84 | ``` 85 | 86 | --- 87 | 88 | 89 | ### Implementing Cross Validation to select value of $\lambda$ and Implement Smoothing Splines 90 | 91 | 92 | ```{r} 93 | fit2<-smooth.spline(age,wage,cv = TRUE) 94 | fit2 95 | #It selects $\lambda=6.794596$ it is a Heuristic and can take various values for how rough the function is 96 | 97 | plot(age,wage,col="grey") 98 | #Plotting Regression Line 99 | lines(fit2,lwd=2,col="purple") 100 | legend("topright",("Smoothing Splines with 6.78 df selected by CV"),col="purple",lwd=2) 101 | 102 | 103 | ``` 104 | 105 | ####This Model is also very Smooth and Fits the data well. 106 | 107 | --- 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /TreeBasedTechniques.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Decision Trees in R" 3 | output: 4 | html_document: default 5 | html_notebook: default 6 | --- 7 | 8 | 9 | 10 | 11 | ### This is a article on how to implement Tree based Learning Technique in R to do Predictive Modelling. 
12 | 13 | Trees involve stratifying or sagmenting the Predictor($X_i$) space into a number of simple Regions.The tree based Methods generate a set of $Splitting \ Rules$ which are used to sagment the Predictor Space.These techniques of sagmenting and stratifying data into different Regions $R_j$ are called __Decision Trees__.Decision Trees are used in both Regression and Classification Problems. 14 | These are Statistical Learning Techniques which are easier to understand and Simpler in terms of interpretablity. 15 | 16 | The Rules generated are of form -- 17 | 18 | $$R_j = If (X = X_1 \cap X_2 \cap ....X_p) --> Y_i $$. 19 | 20 | 21 | 22 | ----------- 23 | 24 | 25 | 26 | #### Loading the required Packages 27 | 28 | ```{r,warning=FALSE,message=FALSE} 29 | 30 | require(ISLR) #package containing data 31 | require(ggplot2) 32 | require(tree) 33 | 34 | #Using the Carseats data set 35 | 36 | attach(Carseats) 37 | ?Carseats 38 | 39 | 40 | ``` 41 | 42 | 43 | Carseats is a simulated data set containing sales of child car seats at 400 different stores. 44 | 45 | 46 | ```{r} 47 | #Checking the distribution of Sales 48 | 49 | ggplot(aes(x = Sales),data = Carseats) + 50 | geom_histogram(color="black",fill = 'purple',alpha = 0.6, bins=30) + 51 | labs(x = "Unit Sales in Thousands", y = "Frequency") 52 | 53 | 54 | ``` 55 | As the histogram suggests - It is Normally distributed 56 | Highest frequency of around 8000 Unit Sales 57 | 58 | 59 | ```{r} 60 | 61 | #Making a Factor variable from Sales 62 | 63 | HighSales<-ifelse(Sales <= 8,"No","Yes") 64 | head(HighSales) 65 | 66 | #Making a Data frame 67 | Carseats<-data.frame(Carseats,HighSales) 68 | 69 | 70 | ``` 71 | 72 | 73 | ---------- 74 | 75 | 76 | 77 | 78 | ## Fitting a Binary Classification Tree 79 | 80 | 81 | Now we are going to fit a __Tree__ to the Carseats Data to predict if we are going to have High Sales or not.The __tree()__ function uses a *__Top-down Greedy__* approch to fit a Tree which is also known as *__Recursive Binary Splitting__*.It is Greedy because it dosen't finds the best split amongst all possible splits,but only the best splits at the immediate place its looking i.e the best Split at that particular step. 82 | 83 | ```{r} 84 | #We will use the tree() function to fit a Desicion Tree 85 | ?tree 86 | 87 | #Excluding the Sales atrribute 88 | CarTree<-tree(HighSales ~ . -Sales , data = Carseats,split = c("deviance","gini")) 89 | #split argument split to specify the splitting criterion to use. 90 | 91 | CarTree #Outputs a Tree with various Splits at different Variables and Response at Terminals Nodes 92 | #The numeric values within the braces are the Proportions of Yes and No for each split. 93 | 94 | #Summary of the Decision Tree 95 | summary(CarTree) 96 | 97 | 98 | ``` 99 | 100 | The summary of the Model consists of the imporatant variables used for splitting the data which minimizes the deviance(Error) Rate and another Splitting criterion used is __Gini Indes__, which is also called *Purity Index*. 101 | 102 | 103 | --------------- 104 | 105 | 106 | 107 | #Plotting the Decision Tree 108 | 109 | 110 | ```{r} 111 | plot(CarTree) 112 | #Adding Predictors as text to plot 113 | text(CarTree ,pretty = 1 ) 114 | 115 | ``` 116 | 117 | This tree is quiet Complicated and hard to understand due to lots of Splits and lots of variables included in the predictor space.The leaf nodes consists of the Response value i.e __Yes / No __. 
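For reference, the two splitting criteria named in the summary above have the standard forms below, where $\hat{p}_{mk}$ is the proportion of training observations in region $R_m$ that belong to class $k$ and $n_{mk}$ is their count:

$$Gini: \ G_m = \sum_{k} \hat{p}_{mk}(1 - \hat{p}_{mk}) \qquad \ \ Deviance: \ D_m = -2\sum_{k} n_{mk}\log \hat{p}_{mk} $$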
118 | 119 | 120 | --------------- 121 | 122 | 123 | ### Splitting data to Training and Test Set 124 | 125 | 126 | ```{r} 127 | set.seed(1001) 128 | #A training sample of 250 examples sampled without replacement 129 | train<-sample(1:nrow(Carseats), 250) 130 | #Fitting another Model 131 | tree1<-tree(HighSales ~ .-Sales , data = Carseats, subset = train) 132 | summary(tree1) 133 | #Plotting 134 | plot(tree1);text(tree1) 135 | 136 | ``` 137 | Now the tree is somewhat different and detailed but is quiet hard to interpret too due to lots of splits. 138 | 139 | 140 | __Predicting on Test Set__ 141 | 142 | 143 | ```{r} 144 | #Predicting the Class labels for Test set 145 | pred<-predict(tree1, newdata = Carseats[-train,],type = "class") 146 | head(pred) 147 | 148 | #Confusion Matrix to check number of Misclassifications 149 | with(Carseats[-train,],table(pred,HighSales)) 150 | 151 | #Misclassification Error Rate on Test Set 152 | mean(pred!=Carseats[-train,]$HighSales) 153 | 154 | 155 | 156 | ``` 157 | The __Diagonals__ are the correctly classified Test Examples , whereas the __off-diagonals__ represent the misclassified examples.The Mean Error Rate is $\text{26%}$. 158 | 159 | 160 | The above tree was grown to Full length and might have lots of variables in it which might be degrading the Perfomance.We will now use 10 fold __Cross Validation__ to *Prune* the Tree. 161 | 162 | 163 | ------ 164 | 165 | 166 | ## Pruning The tree using Cross Validation 167 | 168 | 169 | 170 | ```{r} 171 | #10 fold CV 172 | #Performing Cost Complexity Pruning 173 | cv.tree1<-cv.tree(tree1, FUN=prune.misclass) 174 | cv.tree1 175 | plot(cv.tree1) 176 | #Deviance minimum for tree size 15 i.e 15 Splits 177 | 178 | prune.tree1<-prune.misclass(tree1,best = 15) 179 | plot(prune.tree1);text(prune.tree1) 180 | 181 | ``` 182 | 183 | 184 | __Testing the pruned Tree on Test Set__ - 185 | 186 | ```{r} 187 | pred1<-predict(prune.tree1 , Carseats[-train,],type="class") 188 | 189 | #Confusion Matrix 190 | with(Carseats[-train,],table(pred1,HighSales)) 191 | 192 | #Misclassification Rate 193 | ErrorPrune<-mean(pred1!=Carseats[-train,]$HighSales) 194 | ErrorPrune 195 | #Error reduced to 25 % 196 | 197 | 198 | ``` 199 | 200 | 201 | 202 | ------------ 203 | 204 | 205 | ## Conclusion 206 | 207 | 208 | 209 | As we can notice by the perfomance on Test Set the Pruned Tree dosen't performs better as the Error rate reduced only by a factor of 0.1 % i.e from 26% to 25%. 210 | It's just that Pruning lead us to a more __simpler__ Tree with *lesser Splits and a subset of predictors* which is somewhat easier to interpret and understand. 211 | 212 | 213 | Usually *Trees* don't actually give good perfomance on Test Sets , and is called a __Weak Learner__. 214 | 215 | Applying Ensembling Techniques such as __Random Forests , Bagging and Boosting__ improves the Perfomance of Trees a lot by combining a lot of Trees trained on samples from training examples and finally *__combining(averaging)__* the Trees to form a single Strong Tree which performs nicely. 216 | 217 | 218 | Hope you guys liked the article , make sure to share and like it. 
219 | 220 | 221 | 222 | 223 | -------------------------------------------------------------------------------- /crossValidation.R: -------------------------------------------------------------------------------- 1 | #Cross validation and Resampling Techniques 2 | 3 | require(ISLR) 4 | 5 | #boot package for the Cross valiadation 6 | require(boot) 7 | ?cv.glm 8 | #this is the K-fold cross validation for the generalized linear models 9 | data("Auto") 10 | attach(Auto) 11 | plot(mpg ,horsepower) 12 | #both have a negetive correlation as mpg increase the HP decreases 13 | 14 | 15 | #Leave one Out CV-LOOCV 16 | mod1<-glm(mpg ~ horsepower ) 17 | summary(mod1) 18 | #delta is the CV error 19 | cv.glm(Auto,mod1)$delta #pretty slow , leavs one data point and fits a model 20 | # on the remaining data points , then tests on the 1 point 21 | 22 | #Let's write a formula to compite the Prediction error for LOOCV 23 | 24 | loocv<-function(fit) 25 | { 26 | h = lm.influence(fit)$h 27 | mean((residuals(fit)/(1-h))^2) 28 | #hence it will compute the error , h=diagonal element of the hat matrix 29 | #hat matrix is the operator matrix which produces the least square fit 30 | } 31 | 32 | 33 | #let's now try out the function 34 | loocv(mod1) 35 | #it will return the CV error directly 36 | 37 | 38 | #Now lets try out LOOCV for different polynomials(higher degree regression) 39 | cv.error = rep(0,5) 40 | #cv.error is a empty vector initiaized with 5 as size to collect error of each Model 41 | degree=1:5 42 | for(d in degree) 43 | { 44 | model<-glm(mpg ~ poly(horsepower,d),data = Auto) 45 | cv.error[d] = loocv(model) 46 | 47 | } 48 | #Plot of degree(K) vs Cross Validation errors for each Different Model with 49 | #different degrees 50 | plot(x = degree , y = cv.error,type='b' ,title="Cross validation error for different degrees",xlab = "Degree",ylab = "Cross validation error") 51 | #Hence Quadratic Model did a very good job with less CV error 52 | # along with degree 5 53 | 54 | 55 | 56 | #10 fold cross validation 57 | cv.error10 = rep(0,5) 58 | for(d in degree) 59 | { 60 | mod<-glm(mpg~poly(horsepower,d),data = Auto) 61 | cv.error10[d] = cv.glm(Auto, mod, K=10)$delta[1] 62 | } 63 | 64 | plot(degree, cv.error10,type='b',col='red',xlab="Degree of polynomial",ylab ="10 fold CV error vs 5 fold") 65 | #Hence we can see that Model with quadratic degree is the best one with least 66 | # Cv error , hence we will choose the Model 2 with degree 2 67 | #10 fold is usually more computationally simpler than LOOCV and also 10 considers 68 | # Bias-Variance tradeoffs 69 | 70 | 71 | 72 | mod2<-glm(mpg ~ poly(horsepower,2), data = Auto) 73 | #5-fold CV 74 | cv.glm(Auto,mod2,K=5)$delta 75 | 76 | cv.error5 = rep(0,5) 77 | for(d in degree) 78 | { 79 | mod<-glm(mpg ~ poly(horsepower,d),data =Auto) 80 | cv.error5[d] = cv.glm(Auto , mod , K=5)$delta[1] 81 | } 82 | 83 | lines(degree, cv.error5 , col = 'blue',type= 'b') 84 | #adding legends 85 | legend("topright", c("5-fold CV","10-fold CV"),col=c("blue","red"),pch=19) -------------------------------------------------------------------------------- /svm.R: -------------------------------------------------------------------------------- 1 | #Support Vector Machines in R 2 | set.seed(10023) 3 | #generating data 4 | #a matrix with 20 rows and 2 columns 5 | x=matrix(rnorm(40),20,2) 6 | x 7 | y=rep(c(-1,1),c(10,10)) 8 | x[y==1,]=x[y==1,]+1 #2 classes are [-1,1] 9 | 10 | #plotting the points 11 | plot(x,col=y+2,pch=19) 12 | 13 | 14 | #First Making Grids using a function 15 | 
make.grid<-function(x,n=75) {
16 | grange=apply(x,2,range)
17 | x1=seq(from=grange[1,1],to=grange[2,1],length=n)
18 | x2=seq(from=grange[1,2],to=grange[2,2],length=n)
19 | expand.grid(X1=x1,X2=x2) #it makes a Lattice for us
20 | }
21 | xgrid=make.grid(x) #is a 75x75 matrix
22 | 
23 | 
24 | #fitting the linear SVM classifier with the e1071 package
25 | require(e1071)
26 | #converting to a data frame
27 | data<-data.frame(x,y=as.factor(y))
28 | head(data)
29 | 
30 | svm<-svm(y ~ .,data=data,kernel="linear",cost=10,scale = F)
31 | #here cost 'c' is a tuning parameter .The larger it is more stable the margin becomes, it is like a Regularization parameter
32 | svm
33 | svm$index #gives us the index of the Support Vectors
34 | #so we have 10 support vectors
35 | svm$fitted #to find the fitted values
36 | 
37 | #Confusion Matrix of Fitted values and Actual Response values
38 | table(Predicted=svm$fitted,Actual=y)
39 | 
40 | #accuracy on Training Set
41 | mean(svm$fitted==y)*100 #has 80 % accuracy on Training Set
42 | 
43 | #plotting
44 | plot(svm,data)
45 | 
46 | 
47 | #now predicting on this new Test Set
48 | ygrid=predict(svm,xgrid)
49 | 
50 | #plotting the Linear Separator
51 | plot(xgrid,col=c("red","blue")[as.numeric(ygrid)],pch=19,cex=.2)
52 | #creates 2 regions
53 | points(x,col=y+3,pch=19) #adding the points on Plot
54 | points(x[svm$index,],pch=5,cex=2) #Highlighting the Support Vectors
55 | 
56 | 
57 | beta = drop(t(svm$coefs)%*%x[svm$index,])
58 | beta
59 | 
60 | 
61 | beta0=svm$rho
62 | beta0 #the intercept value
63 | #again Plotting
64 | plot(xgrid,col=c("red","blue")[as.numeric(ygrid)],pch=19,cex=.2)
65 | #creates 2 regions
66 | points(x,col=y+3,pch=19) #adding the points on Plot
67 | points(x[svm$index,],pch=5,cex=2)
68 | abline(beta0/beta[2],-beta[1]/beta[2],lty=1)#is the Decision boundary or Plane
69 | #below are for adding the soft margins
70 | abline((beta0-1)/beta[2],-beta[1]/beta[2],lty=2)
71 | abline((beta0+1)/beta[2],-beta[1]/beta[2],lty=2) --------------------------------------------------------------------------------