├── 01-Slides.pdf ├── 02-Demos.R ├── 03-Solutions.R ├── README.md └── RQuickstart.Rproj /01-Slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/RQuickstart/6d61ff4d5f0d4e4e3abdd7ca2a8df44f52c131a6/01-Slides.pdf -------------------------------------------------------------------------------- /02-Demos.R: -------------------------------------------------------------------------------- 1 | # R 2 | 3 | log10(100) 4 | 5 | library(ggplot2) 6 | 7 | View(mpg) 8 | View(iris) 9 | View(mtcars) 10 | 11 | ?mpg 12 | ?ggplot 13 | ?geom_point 14 | 15 | # Visualizations with ggplot2 16 | 17 | ## plot 18 | 19 | plot(iris$Sepal.Width, iris$Sepal.Length) 20 | 21 | ## ggplot2 22 | 23 | ggplot(iris, aes(Sepal.Width, Sepal.Length)) + 24 | geom_point() 25 | 26 | ggplot(iris, aes(Sepal.Width, Sepal.Length)) + 27 | geom_point(aes(shape = Species, color = Species)) 28 | 29 | ggplot(iris, aes(Sepal.Width, Sepal.Length)) + 30 | geom_point(aes(shape = Species, color = Species)) + 31 | theme_bw() 32 | 33 | ggplot(iris, aes(Sepal.Width, Sepal.Length)) + 34 | geom_point(aes(shape = Species, color = Species)) + 35 | theme_bw() + 36 | geom_smooth(aes(color = Species), method = lm, se = FALSE) 37 | 38 | ggplot(iris, aes(Sepal.Width, Sepal.Length)) + 39 | geom_rug(aes(color = Species), position = "jitter") + 40 | stat_density2d(aes(alpha = ..level.., fill = Species), geom = "polygon") + 41 | theme_bw() + 42 | scale_alpha(range = c(0.05, 0.5)) 43 | 44 | ggplot(iris, aes(Sepal.Width, Sepal.Length)) + 45 | geom_rug(aes(color = Species), position = "jitter") + 46 | stat_density2d(aes(alpha = ..level.., fill = Species), geom = "polygon") + 47 | theme_bw() + 48 | scale_alpha(range = c(0.05, 0.5)) + 49 | facet_wrap( ~ Species) 50 | 51 | ## ggplot 52 | 53 | ggplot(mpg, aes(displ, hwy)) 54 | ggplot(mpg, aes(displ, hwy)) + geom_point() 55 | 56 | # Aesthetics 57 | 58 | ggplot(mpg) + geom_point(aes(x = displ, y = 
hwy, color = class)) 59 | ggplot(mpg) + geom_point(aes(x = displ, y = hwy, size = class)) 60 | ggplot(mpg) + geom_point(aes(x = displ, y = hwy, shape = class)) 61 | ggplot(mpg) + geom_point(aes(x = displ, y = hwy, alpha = class)) 62 | 63 | ## Mapping vs. setting: inside aes() an aesthetic is mapped to a variable; outside aes() it is set to a constant 64 | 65 | ggplot(mpg, aes(displ, hwy)) + geom_point(mapping = aes(color = class)) 66 | ggplot(mpg, aes(displ, hwy)) + geom_point(color = "green") 67 | ggplot(mpg, aes(displ, hwy)) + geom_point(size = 5) 68 | ggplot(mpg, aes(displ, hwy)) + geom_point(shape = 3) 69 | ggplot(mpg, aes(displ, hwy)) + geom_point(alpha = 0.5) 70 | 71 | ggplot(mpg, aes(displ, hwy)) + geom_point(aes(color = class)) 72 | ggplot(mpg, aes(displ, hwy)) + geom_point(color = "green") 73 | 74 | # Geoms: the same data and mappings drawn with different geom_*() layers 75 | 76 | ggplot(data = mpg) + geom_point(aes(x = displ, y = hwy)) 77 | ggplot(data = mpg) + geom_smooth(aes(x = displ, y = hwy)) 78 | ggplot(data = mpg) + 79 | geom_point(aes(x = displ, y = hwy)) + 80 | geom_smooth(aes(x = displ, y = hwy)) 81 | 82 | ggplot(mpg) + geom_point(aes(class, hwy)) 83 | 84 | ## Global vs. 
Local 85 | 86 | ggplot(mpg, aes(x = displ, y = hwy)) + 87 | geom_smooth(method = lm) + 88 | geom_point(aes(color = cyl), data = mpg[1:10, ]) 89 | 90 | ggplot(mpg, aes(displ, hwy, color = class)) + 91 | geom_smooth(method = lm) + 92 | geom_point() 93 | 94 | ggplot(mpg, aes(displ, hwy)) + 95 | geom_smooth(method = lm) + 96 | geom_point(aes(color = class)) 97 | 98 | ggplot(mpg, aes(displ, hwy)) + 99 | geom_point() 100 | 101 | ggplot(mpg, aes(displ, hwy)) + 102 | geom_point() + 103 | geom_point(data = mpg[1:50, ], color = "green") 104 | 105 | # Grammar of Graphics 106 | 107 | # Data Wrangling with dplyr 108 | 109 | library(reportsWS) 110 | View(bnames) 111 | 112 | my_name <- filter(bnames, name == "Garrett", sex == "M") 113 | my_name <- select(my_name, name, year, prop) 114 | ggplot(my_name) + 115 | geom_line(aes(x = year, y = prop)) 116 | 117 | ## dplyr 118 | 119 | library(dplyr) 120 | ?tbl 121 | ?select 122 | ?filter 123 | ?left_join 124 | ?mutate 125 | ?summarise 126 | ?group_by 127 | ?`%>%` 128 | 129 | ## tbl's 130 | 131 | babynames 132 | tbl_df(bnames) 133 | 134 | bnames <- tbl_df(bnames) 135 | 136 | ## Verbs 137 | 138 | arrange(storms, wind) 139 | arrange(storms, desc(wind)) 140 | 141 | select(storms, storm, pressure) 142 | 143 | filter(storms, wind == 50) 144 | filter(storms, wind >= 50) 145 | filter(storms, wind > 40, wind <= 60) # comma-separated conditions are combined with AND 146 | 147 | View(births) 148 | 149 | left_join(songs, artists, by = "name") 150 | 151 | mutate(storms, ratio = pressure / wind) 152 | 153 | summarise(pollution, median = median(amount)) 154 | summarise(pollution, mean = mean(amount), sum = sum(amount), n = n()) 155 | p <- group_by(pollution, city) 156 | summarise(p, mean = mean(amount), sum = sum(amount), n = n()) 157 | 158 | # %>% 159 | 160 | p <- group_by(pollution, city) 161 | summarise(p, mean = mean(amount), sum = sum(amount), n = n()) 162 | my_name <- filter(bnames, name == "Garrett", sex == "M") 163 | my_name <- select(my_name, name, year, prop) 164 | my_name <- left_join(my_name, 
boys, by = "year") 165 | my_name <- mutate(my_name, n = round(prop * births)) 166 | 167 | summarize(pollution, median = median(amount)) 168 | pollution %>% summarize(median = median(amount)) 169 | 170 | bnames %>% 171 | left_join(births, by = c("year", "sex")) %>% 172 | mutate(n = round(prop * births)) %>% 173 | select(name, sex, year, n) %>% 174 | filter(!is.na(n)) %>% 175 | group_by(name, sex) %>% 176 | summarise(total = sum(n)) %>% 177 | ungroup() %>% 178 | arrange(desc(total)) 179 | 180 | tmp1 <- left_join(bnames, births, by = c("year", "sex")) 181 | tmp2 <- mutate(tmp1, n = round(prop * births)) 182 | tmp3 <- select(tmp2, name, sex, year, n) 183 | tmp4 <- filter(tmp3, !is.na(n)) 184 | tmp5 <- group_by(tmp4, name, sex) 185 | tmp6 <- summarise(tmp5, total = sum(n)) 186 | tmp7 <- ungroup(tmp6) 187 | tmp8 <- arrange(tmp7, desc(total)) 188 | 189 | arrange( 190 | ungroup( 191 | summarise( 192 | group_by( 193 | filter( 194 | select( 195 | mutate( 196 | left_join(bnames, births, by = c("year", "sex")), 197 | n = round(prop * births) 198 | ), name, sex, year, n 199 | ), !is.na(n) 200 | ), name, sex 201 | ), total = sum(n) 202 | ) 203 | ), desc(total) 204 | ) 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | -------------------------------------------------------------------------------- /03-Solutions.R: -------------------------------------------------------------------------------- 1 | # How would you replace this scatterplot with one that draws 2 | # boxplots? Try out your best guess. 3 | # ggplot(mpg) + geom_point(aes(class, hwy)) 4 | ggplot(data = mpg) + geom_boxplot(aes(x = class, y = hwy)) 5 | 6 | 7 | # How would you create this plot? 8 | # Hint: histograms do not require a y aesthetic. 
9 | ggplot(data = mpg) + geom_histogram(aes(x = hwy)) 10 | 11 | 12 | # Make these plots: 13 | # 14 | # Plot 1 15 | # Data = diamonds 16 | # geom = count 17 | # x = cut 18 | # y = color 19 | ggplot(diamonds, aes(x = cut, y = color)) + 20 | geom_count() 21 | 22 | # Plot 2 23 | # Data = diamonds 24 | # geom = point 25 | # x = carat 26 | # y = price 27 | ggplot(diamonds, aes(x = carat, y = price)) + 28 | geom_point() 29 | 30 | 31 | # Create a data set that contains only rows with your name and sex, 32 | # and only the columns name, year, and prop. 33 | # Then plot the data with 34 | # ggplot() + 35 | # geom_line(aes(x = year, y = prop)) 36 | library(dplyr) 37 | my_name <- filter(bnames, name == "Garrett", sex == "M") 38 | my_name <- select(my_name, name, year, prop) 39 | ggplot(my_name) + 40 | geom_line(aes(x = year, y = prop)) 41 | 42 | 43 | # 1. filter() births to just rows with your sex (this solution uses sex == "M"). 44 | boys <- filter(births, sex == "M") 45 | 46 | # 2. Join the result to my_name by year (the left join keeps every row of my_name). 47 | my_name <- left_join(my_name, boys, by = "year") 48 | 49 | # 3. Add a new variable to the data: n = round(prop * births) 50 | my_name <- mutate(my_name, n = round(prop * births)) 51 | 52 | # 4. Save the new data (steps 2 and 3 overwrite my_name). Then plot n over time. 53 | ggplot(my_name) + geom_line(aes(x = year, y = n)) 54 | 55 | 56 | # Work with a neighbor to determine what each line of the 57 | # code below does. 58 | # Take bnames and 59 | bnames %>% 60 | # join to it births by year and sex. 
61 | left_join(births, by = c("year", "sex")) %>% 62 | # Then use the result to calculate a new variable, n 63 | mutate(n = round(prop * births)) %>% 64 | # Keep only four columns: name, sex, year, and n 65 | select(name, sex, year, n) %>% 66 | # Drop rows where n is NA 67 | filter(!is.na(n)) %>% 68 | # Then group by the combination of name and sex 69 | group_by(name, sex) %>% 70 | # Calculate the total number of children for each group 71 | summarise(total = sum(n)) %>% 72 | # Ungroup the data (so we do not arrange within groups) 73 | ungroup() %>% 74 | # Then order the groups from the largest total to the smallest 75 | arrange(desc(total)) 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RQuickstart 2 | Slides, Code, and Exercises to support [R Quickstart tutorial](http://conferences.oreilly.com/strata/hadoop-big-data-ca/public/schedule/detail/48053) at 2016 Strata + Hadoop World San Jose 3 | -------------------------------------------------------------------------------- /RQuickstart.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX --------------------------------------------------------------------------------