## ── Repo contents (from the original dump) ───────────────────────────
## ├── README.md
## └── multiple_regression_demo.R
##
## README.md:
##   This is the code from the StatQuest...
##   * Multiple Regression in R: https://youtu.be/hokALdIst8k
## ─────────────────────────────────────────────────────────────────────

## multiple_regression_demo.R
##
## Demonstrates simple and multiple linear regression, and verifies by
## hand the R^2, F-value and p-value that summary(lm(...)) reports.

## Here's the data: one row per mouse (n = 9), with body size, body
## weight and tail length.
mouse.data <- data.frame(
  size   = c(1.4, 2.6, 1.0, 3.7, 5.5, 3.2, 3.0, 4.9, 6.3),
  weight = c(0.9, 1.8, 2.4, 3.5, 3.9, 4.4, 5.1, 5.6, 6.3),
  tail   = c(0.7, 1.3, 0.7, 2.0, 3.6, 3.0, 2.9, 3.9, 4.0))

mouse.data

#######################################################
##
## Let's start by reviewing simple regression by
## modeling mouse size with mouse weight.
##
#######################################################

## STEP 1: Draw a graph of the data to make sure the relationship makes sense
plot(mouse.data$weight, mouse.data$size, pch = 16, cex = 2)

## STEP 2: Do the regression
simple.regression <- lm(size ~ weight, data = mouse.data)

## STEP 3: Look at the R^2, F-value and p-value
summary(simple.regression)

abline(simple.regression, lwd = 5, col = "red")

## Now let's verify that our formula for R^2 is correct:
##   R^2 = (SS(mean) - SS(fit)) / SS(mean)
ss.mean   <- sum((mouse.data$size - mean(mouse.data$size))^2)
ss.simple <- sum(simple.regression$residuals^2)

(ss.mean - ss.simple) / ss.mean # this is the R^2 value

## Now let's verify that our formula for F is correct...
## The fitted line has 2 parameters (intercept + slope) and the mean has
## 1 parameter, so df1 = 2 - 1 and df2 = n - 2.
f.simple <- ((ss.mean - ss.simple) / (2 - 1)) /
  (ss.simple / (nrow(mouse.data) - 2))

f.simple # this is the F-value

## Now let's draw a figure that shows how to calculate the p-value from the
## F-value
##
## First, draw the correct f-distribution curve with df1=1 and df2=7.
## NOTE: plot the curve out to x = 20 so it covers the red polygon drawn
## below (the original plotted only to 15, so the right-tail shading ran
## past the end of the curve).
x <- seq(from = 0, to = 20, by = 0.1)
y <- df(x, df1 = 1, df2 = 7)
plot(x, y, type = "l")

## now draw a vertical line where our F-value, f.simple, is.
abline(v = f.simple, col = "red")

## color the graph on the left side of the line blue
x.zero.to.line <- seq(from = 0, to = f.simple, by = 0.1)
y.zero.to.line <- df(x.zero.to.line, df1 = 1, df2 = 7)
polygon(x = c(x.zero.to.line, 0), y = c(y.zero.to.line, 0), col = "blue")

## color the graph on the right side of the line red
x.line.to.20 <- seq(from = f.simple, to = 20, by = 0.1)
y.line.to.20 <- df(x.line.to.20, df1 = 1, df2 = 7)
polygon(x = c(x.line.to.20, f.simple), y = c(y.line.to.20, 0), col = "red")

pf(f.simple, df1 = 1, df2 = 7) ## the area under the curve that is blue

1 - pf(f.simple, df1 = 1, df2 = 7) ## the area under the curve that is red
## ^ this red area is the p-value for the regression

## lastly, let's compare this p-value to the one from the
## original regression
summary(simple.regression)


#######################################################
##
## Now let's do multiple regression by adding an extra term, tail length
##
#######################################################

## STEP 1: Draw a graph of the data to make sure the relationships make sense.
## This graph is more complex because it shows the relationships between all
## of the columns in "mouse.data".
plot(mouse.data)

## STEP 2: Do the regression
multiple.regression <- lm(size ~ weight + tail, data = mouse.data)

## STEP 3: Look at the R^2, F-value and p-value
summary(multiple.regression)


## again, we can verify that our R^2 value is what we think it is
ss.multiple <- sum(multiple.regression$residuals^2)

(ss.mean - ss.multiple) / ss.mean

## we can also verify that the F-value is what we think it is
## (the multiple fit has 3 parameters, so df1 = 3 - 1 and df2 = n - 3)
f.multiple <- ((ss.mean - ss.multiple) / (3 - 1)) /
  (ss.multiple / (nrow(mouse.data) - 3))

f.multiple

## Again let's draw a figure that shows how to calculate the p-value from the
## F-value
##
## First, draw the correct f-distribution curve with df1=2 and df2=6
x <- seq(from = 0, to = 20, by = 0.1)
y <- df(x, df1 = 2, df2 = 6)
plot(x, y, type = "l")

## now draw a vertical line where our f.value is for this test
abline(v = f.multiple, col = "red")

## color the graph on the left side of the line blue
x.zero.to.line <- seq(from = 0, to = f.multiple, by = 0.1)
y.zero.to.line <- df(x.zero.to.line, df1 = 2, df2 = 6)
polygon(x = c(x.zero.to.line, 0), y = c(y.zero.to.line, 0), col = "blue")

## color the graph on the right side of the line red
x.line.to.20 <- seq(from = f.multiple, to = 20, by = 0.1)
y.line.to.20 <- df(x.line.to.20, df1 = 2, df2 = 6)
polygon(x = c(x.line.to.20, f.multiple), y = c(y.line.to.20, 0), col = "red")


pf(f.multiple, df1 = 2, df2 = 6) ## the area under the curve that is blue

1 - pf(f.multiple, df1 = 2, df2 = 6) ## the area under the curve that is red

## lastly, let's compare this p-value to the one from the
## original regression
summary(multiple.regression)



#######################################################
##
## Now, let's see if "tail" makes a significant contribution by
## comparing the "simple" fit (which does not include the tail data)
## to the "multiple" fit (which has the extra term for the tail data)
##
#######################################################

## Nested-model F-test: df1 = (3 params - 2 params), df2 = n - 3.
f.simple.v.multiple <- ((ss.simple - ss.multiple) / (3 - 2)) /
  (ss.multiple / (nrow(mouse.data) - 3))

1 - pf(f.simple.v.multiple, df1 = 1, df2 = 6)

## Notice that this value is the same as the p-value next to the term
## for "tail" in the summary of the multiple regression:
summary(multiple.regression)

## Thus, the summary already calculated this F-value and p-value for us.
## This line tells us that including the "tail" term makes a statistically
## significant difference. The magnitude can be determined by looking
## at the change in R^2 between the simple and multiple regressions.