├── docs ├── images │ ├── Icon │ ├── south_hall.jpg │ └── ischool_logo.png ├── _config.yml ├── schedule.md ├── syllabus.md └── index.md ├── reading_calls ├── Icon ├── load_data_solution.md ├── project_work_solution.md ├── images │ └── r_bridge_datahub.png ├── restructure_codebase_solution.md ├── choose_a_plot_type.md ├── grouped_data_solution_files │ └── figure-gfm │ │ ├── ungrouped plot-1.png │ │ ├── unnamed-chunk-1-1.png │ │ ├── unnamed-chunk-2-1.png │ │ ├── unnamed-chunk-3-1.png │ │ ├── unnamed-chunk-4-1.png │ │ └── unnamed-chunk-5-1.png ├── make_it_sparkle_solution_files │ └── figure-gfm │ │ └── coding task-1.png ├── make_scatter_plots_solution_files │ └── figure-gfm │ │ ├── age plot-1.png │ │ ├── basic plot-1.png │ │ ├── feeling blue-1.png │ │ ├── fur color plot-1.png │ │ └── non euclidian space-1.png ├── make_bar_plots_solution_files │ └── figure-gfm │ │ └── unnamed-chunk-1-1.png ├── make_line_plots_solution_files │ └── figure-gfm │ │ ├── unnamed-chunk-1-1.png │ │ └── unnamed-chunk-2-1.png ├── additional_features_solution_files │ └── figure-gfm │ │ ├── the old plot-1.png │ │ └── unnamed-chunk-2-1.png ├── how_to_summarise_solution_files │ └── figure-gfm │ │ ├── unnamed-chunk-1-1.png │ │ └── unnamed-chunk-2-1.png ├── working_with_rstudio_solution_files │ └── figure-gfm │ │ └── unnamed-chunk-1-1.png ├── setting_the_extents_solution.md ├── choose_a_plot_type_solution.md ├── join_and_merge_solution.md ├── load_data.md ├── call_to_reading.md ├── share_with_coursemates.md ├── arrange.md ├── filter.md ├── mutate.md ├── project_work.md ├── select.md ├── make_it_sparkle.md ├── pick_a_theme_solution.md ├── make_a_data_set.md ├── base_operations.md ├── additional_ideas_for_projects.md ├── group_by_summarize.md ├── setting_the_extents.md ├── keyboard_shortcuts.md ├── demo_of_project_outcomes.md ├── restructure_codebase.md ├── make_scatter_plots.md ├── alternatives_to_rstudio.md ├── make_line_plots.md ├── grouped_data.md ├── be_your_own_linter.md ├── join_and_merge.md ├── 
read_about_plots.md ├── r_markdown.md ├── rstudio_cheatsheet.md ├── goals_of_the_project.md ├── control_flow.md ├── tidying_arranging_and_summarizing.md ├── working_with_rstudio.md ├── how_to_summarise.md ├── issuing_code_and_reading_output.md ├── review_code.md ├── installing_r.md ├── be_your_own_linter_solution.md ├── review_code_solution.md ├── pick_a_theme.md ├── installing_rstudio.md ├── ucb_datahub.md ├── issuing_code_and_reading_output_solution.md ├── make_bar_plots_solution.md ├── make_bar_plots.md ├── make_line_plots_solution.md ├── make_it_sparkle_solution.md ├── additional_features.md ├── select_solution.md ├── summarize.md ├── make_a_data_set_solution.md ├── mini_project.md ├── base_operations_solution.md ├── introduction_to_space_data.md ├── how_to_summarise_solution.md ├── summarize_solution.md ├── make_scatter_plots_solution.md ├── working_with_rstudio_solution.md └── mutate_solution.md ├── code ├── nytimes_facet_plot.png ├── squirrel_fur_color.pdf ├── squirrel_fur_color.png ├── 4.1.2_creating_vectors.R ├── 4.3.1_c.R ├── grouped_data_solution_files │ └── figure-gfm │ │ ├── ungrouped plot-1.png │ │ ├── unnamed-chunk-1-1.png │ │ ├── unnamed-chunk-2-1.png │ │ ├── unnamed-chunk-3-1.png │ │ ├── unnamed-chunk-4-1.png │ │ └── unnamed-chunk-5-1.png ├── make_it_sparkle_solution_files │ └── figure-gfm │ │ └── coding task-1.png ├── make_scatter_plots_solution_files │ └── figure-gfm │ │ ├── age plot-1.png │ │ ├── basic plot-1.png │ │ ├── feeling blue-1.png │ │ ├── fur color plot-1.png │ │ └── non euclidian space-1.png ├── make_bar_plots_solution_files │ └── figure-gfm │ │ └── unnamed-chunk-1-1.png ├── additional_features_solution_files │ └── figure-gfm │ │ ├── the old plot-1.png │ │ └── unnamed-chunk-2-1.png ├── how_to_summarise_solution_files │ └── figure-gfm │ │ ├── unnamed-chunk-1-1.png │ │ └── unnamed-chunk-2-1.png ├── make_line_plots_solution_files │ └── figure-gfm │ │ ├── unnamed-chunk-1-1.png │ │ └── unnamed-chunk-2-1.png ├── 
working_with_rstudio_solution_files │ └── figure-gfm │ │ └── unnamed-chunk-1-1.png ├── 4.1.4_sequences_and_repeats.R ├── 3.17.1_arrange.R ├── make_squirrels_subset.R ├── 4.7.2_joins_and_merges.R ├── 3.20.1_mutate.R ├── 3.1.3_geom_bar.R ├── 3.3.1_geom__.R ├── 3.16.6_filter.R ├── 4.1.5_r_.R ├── 3.19_select.R ├── 3.22_summarise.R ├── 3.5.2_facet_.R ├── 4.13.1_show_hide_comment.Rmd ├── arrange.Rmd ├── select.Rmd ├── 3.25_group_by.R ├── 3.11.1_controlling_plot_extents.R ├── 3.13.1_setting_the_theme.R ├── mutate.Rmd ├── select_solution.Rmd ├── make_a_data_set.Rmd ├── group_by_summarize.Rmd ├── make_line_plots.R ├── code_for_videos.R ├── filter.Rmd ├── make_line_plots_solution.Rmd ├── arrange_solution.Rmd ├── make_bar_plots.Rmd ├── make_bar_plots_solution.Rmd ├── make_bar_plots_solution.md ├── make_it_sparkle.Rmd ├── pick_a_theme.Rmd ├── mutate_solution.Rmd ├── filter_solution.Rmd ├── make_line_plots_solution.md ├── make_scatter_plots.R ├── make_it_sparkle_solution.Rmd ├── make_it_sparkle_solution.md ├── make_a_data_set_solution.Rmd ├── base_operations.Rmd ├── select_solution.md ├── how_to_summarise.Rmd ├── working_with_rstudio.R ├── how_to_summarise_solution.Rmd ├── working_with_rstudio_solution.Rmd ├── make_scatter_plots_solution.Rmd ├── summarize.Rmd ├── summarize_solution.Rmd ├── group_by_summarize_solution.Rmd ├── make_a_data_set_solution.md ├── base_operations_solution.Rmd ├── base_operations_solution.md ├── how_to_summarise_solution.md ├── summarize_solution.md ├── grouped_data.Rmd ├── make_scatter_plots_solution.md ├── working_with_rstudio.md ├── working_with_rstudio_solution.md ├── mutate_solution.md ├── additional_plot_features.Rmd ├── grouped_data_solution.Rmd └── code_in_videos.R ├── resources └── cheatsheet-rstudio_ide.pdf ├── README.md └── .gitignore /docs/images/Icon : -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /reading_calls/Icon : 
-------------------------------------------------------------------------------- 1 | Blank Code (to be replaced when class is executed) for Icon 2 | -------------------------------------------------------------------------------- /code/nytimes_facet_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/nytimes_facet_plot.png -------------------------------------------------------------------------------- /code/squirrel_fur_color.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/squirrel_fur_color.pdf -------------------------------------------------------------------------------- /code/squirrel_fur_color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/squirrel_fur_color.png -------------------------------------------------------------------------------- /docs/images/south_hall.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/docs/images/south_hall.jpg -------------------------------------------------------------------------------- /docs/images/ischool_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/docs/images/ischool_logo.png -------------------------------------------------------------------------------- /reading_calls/load_data_solution.md: -------------------------------------------------------------------------------- 1 | # Load Data Solution 2 | 3 | Well, there wasn't anything to do. 
:100: :tada: 4 | -------------------------------------------------------------------------------- /reading_calls/project_work_solution.md: -------------------------------------------------------------------------------- 1 | Blank Code (to be replaced when class is executed) for project_work_solution.md 2 | -------------------------------------------------------------------------------- /resources/cheatsheet-rstudio_ide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/resources/cheatsheet-rstudio_ide.pdf -------------------------------------------------------------------------------- /reading_calls/images/r_bridge_datahub.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/images/r_bridge_datahub.png -------------------------------------------------------------------------------- /reading_calls/restructure_codebase_solution.md: -------------------------------------------------------------------------------- 1 | # Solutions 2 | 3 | Because we just read this semester, there isn't a solution here. 
4 | -------------------------------------------------------------------------------- /code/4.1.2_creating_vectors.R: -------------------------------------------------------------------------------- 1 | x <- seq(from = 10, to = 40, by = 10) 2 | y <- seq(from = 1, to = 4, by = 1)^2 3 | 4 | x - y 5 | 6 | mean(x) 7 | -------------------------------------------------------------------------------- /code/4.3.1_c.R: -------------------------------------------------------------------------------- 1 | animals <- c('aarvark', 'baboon', 'cheetah', 'duck') 2 | size <- c(2, 2, 3, 1) 3 | size 4 | 5 | data.frame( 6 | animals, 7 | size 8 | ) -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-minimal 2 | title: r_bridge 3 | description: Course website for the R Bridge course 4 | logo: ./images/south_hall.jpg 5 | -------------------------------------------------------------------------------- /reading_calls/choose_a_plot_type.md: -------------------------------------------------------------------------------- 1 | # Choosing A Plot Type 2 | 3 | For this semester, we're not asking you to code anything at this point! Keep on keeping on! 
4 | -------------------------------------------------------------------------------- /code/grouped_data_solution_files/figure-gfm/ungrouped plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/grouped_data_solution_files/figure-gfm/ungrouped plot-1.png -------------------------------------------------------------------------------- /code/grouped_data_solution_files/figure-gfm/unnamed-chunk-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/grouped_data_solution_files/figure-gfm/unnamed-chunk-1-1.png -------------------------------------------------------------------------------- /code/grouped_data_solution_files/figure-gfm/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/grouped_data_solution_files/figure-gfm/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /code/grouped_data_solution_files/figure-gfm/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/grouped_data_solution_files/figure-gfm/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /code/grouped_data_solution_files/figure-gfm/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/grouped_data_solution_files/figure-gfm/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /code/grouped_data_solution_files/figure-gfm/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/grouped_data_solution_files/figure-gfm/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /code/make_it_sparkle_solution_files/figure-gfm/coding task-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_it_sparkle_solution_files/figure-gfm/coding task-1.png -------------------------------------------------------------------------------- /code/make_scatter_plots_solution_files/figure-gfm/age plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_scatter_plots_solution_files/figure-gfm/age plot-1.png -------------------------------------------------------------------------------- /code/make_bar_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_bar_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png -------------------------------------------------------------------------------- /code/make_scatter_plots_solution_files/figure-gfm/basic plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_scatter_plots_solution_files/figure-gfm/basic plot-1.png -------------------------------------------------------------------------------- /code/additional_features_solution_files/figure-gfm/the old plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/additional_features_solution_files/figure-gfm/the old plot-1.png -------------------------------------------------------------------------------- 
/code/how_to_summarise_solution_files/figure-gfm/unnamed-chunk-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/how_to_summarise_solution_files/figure-gfm/unnamed-chunk-1-1.png -------------------------------------------------------------------------------- /code/how_to_summarise_solution_files/figure-gfm/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/how_to_summarise_solution_files/figure-gfm/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /code/make_line_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_line_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png -------------------------------------------------------------------------------- /code/make_line_plots_solution_files/figure-gfm/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_line_plots_solution_files/figure-gfm/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /code/make_scatter_plots_solution_files/figure-gfm/feeling blue-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_scatter_plots_solution_files/figure-gfm/feeling blue-1.png -------------------------------------------------------------------------------- /code/make_scatter_plots_solution_files/figure-gfm/fur color plot-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_scatter_plots_solution_files/figure-gfm/fur color plot-1.png -------------------------------------------------------------------------------- /code/additional_features_solution_files/figure-gfm/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/additional_features_solution_files/figure-gfm/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /code/make_scatter_plots_solution_files/figure-gfm/non euclidian space-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_scatter_plots_solution_files/figure-gfm/non euclidian space-1.png -------------------------------------------------------------------------------- /code/working_with_rstudio_solution_files/figure-gfm/unnamed-chunk-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/working_with_rstudio_solution_files/figure-gfm/unnamed-chunk-1-1.png -------------------------------------------------------------------------------- /reading_calls/grouped_data_solution_files/figure-gfm/ungrouped plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/grouped_data_solution_files/figure-gfm/ungrouped plot-1.png -------------------------------------------------------------------------------- /reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-1-1.png 
-------------------------------------------------------------------------------- /reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /reading_calls/make_it_sparkle_solution_files/figure-gfm/coding task-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_it_sparkle_solution_files/figure-gfm/coding task-1.png -------------------------------------------------------------------------------- /reading_calls/make_scatter_plots_solution_files/figure-gfm/age 
plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_scatter_plots_solution_files/figure-gfm/age plot-1.png -------------------------------------------------------------------------------- /reading_calls/make_scatter_plots_solution_files/figure-gfm/basic plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_scatter_plots_solution_files/figure-gfm/basic plot-1.png -------------------------------------------------------------------------------- /reading_calls/make_bar_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_bar_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png -------------------------------------------------------------------------------- /reading_calls/make_line_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_line_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png -------------------------------------------------------------------------------- /reading_calls/make_line_plots_solution_files/figure-gfm/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_line_plots_solution_files/figure-gfm/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /reading_calls/make_scatter_plots_solution_files/figure-gfm/feeling blue-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_scatter_plots_solution_files/figure-gfm/feeling blue-1.png -------------------------------------------------------------------------------- /reading_calls/additional_features_solution_files/figure-gfm/the old plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/additional_features_solution_files/figure-gfm/the old plot-1.png -------------------------------------------------------------------------------- /reading_calls/how_to_summarise_solution_files/figure-gfm/unnamed-chunk-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/how_to_summarise_solution_files/figure-gfm/unnamed-chunk-1-1.png -------------------------------------------------------------------------------- /reading_calls/how_to_summarise_solution_files/figure-gfm/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/how_to_summarise_solution_files/figure-gfm/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /reading_calls/make_scatter_plots_solution_files/figure-gfm/fur color plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_scatter_plots_solution_files/figure-gfm/fur color plot-1.png -------------------------------------------------------------------------------- /reading_calls/additional_features_solution_files/figure-gfm/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/additional_features_solution_files/figure-gfm/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /reading_calls/working_with_rstudio_solution_files/figure-gfm/unnamed-chunk-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/working_with_rstudio_solution_files/figure-gfm/unnamed-chunk-1-1.png -------------------------------------------------------------------------------- /reading_calls/make_scatter_plots_solution_files/figure-gfm/non euclidian space-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_scatter_plots_solution_files/figure-gfm/non euclidian space-1.png -------------------------------------------------------------------------------- /reading_calls/setting_the_extents_solution.md: -------------------------------------------------------------------------------- 1 | # Setting the Extents Solution 2 | 3 | Because you haven't coded anything on this part, there isn't a solution to dig into. 4 | 5 | > A rolling stone gathers no moss. 6 | -------------------------------------------------------------------------------- /reading_calls/choose_a_plot_type_solution.md: -------------------------------------------------------------------------------- 1 | # Choose A Plot Type 2 | 3 | Because you haven't coded anything at this point -- I suppose that means that your code is correct? Or, is it a divide by zero error? 
:thinking: 4 | -------------------------------------------------------------------------------- /reading_calls/join_and_merge_solution.md: -------------------------------------------------------------------------------- 1 | # Joins and Merges Solutions 2 | 3 | Because this was mostly a reading exercise, there's nothing to provide a solution for right now. _Join us_ in moving forward. :expressionless: 4 | -------------------------------------------------------------------------------- /reading_calls/load_data.md: -------------------------------------------------------------------------------- 1 | # Load Data 2 | 3 | - For now, we didn't think that there was enough to do to actually practice loading data. 4 | - Keep these concepts in mind for when you come into w203 or your other enterprises. 5 | 6 | -------------------------------------------------------------------------------- /code/4.1.4_sequences_and_repeats.R: -------------------------------------------------------------------------------- 1 | one_to_ten <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) 2 | 3 | on_to_ten_two <- 10:1 4 | 5 | seq(from = 11, to = 20, by = 2) 6 | seq(from = 1, to = 2, by = 0.11) 7 | 8 | letter_vector <- '' 9 | 10 | rep(c('a', 'b', 'c', 'd'), each = 2) 11 | 12 | rep(1:10, times = 1:10) 13 | -------------------------------------------------------------------------------- /reading_calls/call_to_reading.md: -------------------------------------------------------------------------------- 1 | # Reading about Base Methods 2 | 3 | Please read the following chapter about **Vectors** in _R for Data Science_: 4 | 5 | - If you're reading the print copy, read Chapter 16. 
6 | - If you're reading the digital copy, read [Chapter 20](https://r4ds.had.co.nz/vectors.html). 7 | -------------------------------------------------------------------------------- /reading_calls/share_with_coursemates.md: -------------------------------------------------------------------------------- 1 | # Share with Coursemates 2 | 3 | If you want to share your work with coursemates, but in a way that is less permanent than posting it to the class page here in ISVC, there's a Slack channel that you can use! 4 | 5 | In the I School Slack, it is `#r_bridge_showcase`. 6 | -------------------------------------------------------------------------------- /reading_calls/arrange.md: -------------------------------------------------------------------------------- 1 | # Arrange 2 | 3 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `arrange.Rmd`. 4 | - This code will ask you to arrange the rows of the space launches data. 5 | -------------------------------------------------------------------------------- /reading_calls/filter.md: -------------------------------------------------------------------------------- 1 | # Filter 2 | 3 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `filter.Rmd`. 4 | - This code will ask you to filter the rows of the agencies data based on some criteria. 
5 | -------------------------------------------------------------------------------- /reading_calls/mutate.md: -------------------------------------------------------------------------------- 1 | # Mutate 2 | 3 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `mutate.Rmd`. 4 | - This code will ask you to mutate new columns of data from the data that is provided in the space data. 5 | -------------------------------------------------------------------------------- /reading_calls/project_work.md: -------------------------------------------------------------------------------- 1 | # Project Work 2 | 3 | If you're going to work on a project, try to keep it limited to the work that we've covered to this point in 1C, or perhaps stretching just beyond what we've covered. 4 | 5 | You'll want to be careful that you don't burn out on this project in a way that will take away from w203 that's coming up next! 6 | -------------------------------------------------------------------------------- /reading_calls/select.md: -------------------------------------------------------------------------------- 1 | # Select 2 | 3 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `select.Rmd`. 4 | - This code will ask you to select a subset of columns from the space launches data whose rows you've already arranged. 
5 | -------------------------------------------------------------------------------- /reading_calls/make_it_sparkle.md: -------------------------------------------------------------------------------- 1 | # Make it Sparkle 2 | 3 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `make_it_sparkle.Rmd`. 4 | - This code will ask you to produce descriptive labels on the plot that we've been working with. 5 | 6 | 7 | -------------------------------------------------------------------------------- /code/3.17.1_arrange.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | agencies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 4 | launches <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 5 | 6 | agencies %>% 7 | arrange(desc(state_code)) %>% 8 | View() 9 | -------------------------------------------------------------------------------- /reading_calls/pick_a_theme_solution.md: -------------------------------------------------------------------------------- 1 | # Pick a Theme Solution 2 | 3 | There isn't really a **solution** to this, and up until this point in the course we've been using my favorite theme -- `theme_minimal()`. 4 | 5 | I like that it uses a sans-serif font in the plot, that it doesn't use more ink than it needs to, and that the grid lines that are internal to the plot are muted. 6 | 7 | Plot on! 
:metal: 8 | -------------------------------------------------------------------------------- /code/make_squirrels_subset.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | nyc_squirrels <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-29/nyc_squirrels.csv") 4 | 5 | ss <- nyc_squirrels %>% 6 | select(c('long', 'lat', 'hectare', 'date', 'age', 'primary_fur_color')) %>% 7 | drop_na() %>% 8 | sample_n(1000) 9 | 10 | write_csv(ss, path = './squirrels_subset.csv') -------------------------------------------------------------------------------- /reading_calls/make_a_data_set.md: -------------------------------------------------------------------------------- 1 | # Heading Back to the Farm 2 | 3 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `make_a_data_set.Rmd`. 4 | - When you're there, bring your answers from the last coding exercise (where you were making all the animals and their weight and feeds) into the file. 5 | -------------------------------------------------------------------------------- /reading_calls/base_operations.md: -------------------------------------------------------------------------------- 1 | # Heading to the Little Farm 2 | 3 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `base_operations.Rmd`. 4 | - This code will ask you to create data of many forms! 5 | - There isn't really a point -- there's no analytic task that I'm asking for, just making data.
6 | -------------------------------------------------------------------------------- /reading_calls/additional_ideas_for_projects.md: -------------------------------------------------------------------------------- 1 | # Additional Ideas for Projects 2 | 3 | One other place that you might look for early ideas for projects is the MIDS Capstone Showcase pages. 4 | 5 | These projects are universally **great** and some of them might have data that is useful. As well, folks in the community are almost always happy to talk about their work and share what they've done. 6 | 7 | https://www.ischool.berkeley.edu/programs/mids/capstone 8 | -------------------------------------------------------------------------------- /code/4.7.2_joins_and_merges.R: -------------------------------------------------------------------------------- 1 | data_one <- data.frame( 2 | key_id = c('a', 'b', 'c', 'd'), 3 | variable_one = 1:4, 4 | variable_two = (1:4)^2 5 | ) 6 | 7 | data_two <- data.frame( 8 | id_key = c('a', 'b', 'c', 'e'), 9 | variable_a = c('apple', 'bananna', 'cantalope', 'durian'), 10 | variable_b = c('zebra', 'yak', 'gnu', 'wombat') 11 | ) 12 | 13 | d <- merge( 14 | x = data_one, y = data_two, 15 | by.x = 'key_id', by.y = 'id_key', 16 | all.x = TRUE, all.y = TRUE 17 | ) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # R Bridge Course 2 | 3 | Welcome! This is the course repository for the R Bridge course. This course is a non-graded, introductory course that we have built for students who are beginning the Master of Information and Data Science program in the UC Berkeley School of Information. 4 | 5 | The code for the course is in this repository. The materials for the course website, including a syllabus and schedule, are in the `./docs` folder, and on the course website, linked in the description.
6 | -------------------------------------------------------------------------------- /reading_calls/group_by_summarize.md: -------------------------------------------------------------------------------- 1 | # Group-By Summarize 2 | 3 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `group_by_summarize.Rmd`. 4 | - This code will ask you to group, and summarize data. 5 | - The last question in the set might be a little tricky. The answer that you're shooting for is that the Soviet Union has the most variance in their per-year launches. 6 | 7 | -------------------------------------------------------------------------------- /code/3.20.1_mutate.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | agencies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 4 | launches <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 5 | 6 | agencies %>% 7 | mutate( 8 | count_log_10 = log10(count), 9 | count_log_e = log(count), 10 | count_log_e_10 = count_log_e + 10) %>% 11 | select(agency, contains('count')) %>% 12 | view() 13 | -------------------------------------------------------------------------------- /reading_calls/setting_the_extents.md: -------------------------------------------------------------------------------- 1 | # Setting the Extents 2 | 3 | Setting the plot extents is one of the strangest things within the `ggplot` plotting idiom. 4 | 5 | As we've noted in lecture, there are two options: 6 | 7 | 1. `coord_cartesian()` 8 | 2. `lims()` 9 | 10 | Rather than taking the time to code these at this point, just keep in mind that there are two options. 
If, in the future, you're setting the extents and get an error message that some data has been dropped, `ggplot` will also let you know what the alternative is. 11 | -------------------------------------------------------------------------------- /reading_calls/keyboard_shortcuts.md: -------------------------------------------------------------------------------- 1 | # Keyboard Shortcuts 2 | 3 | The Rstudio team maintains a really great series of one-page resources for many of the major projects in R and Rstudio. They are located on this [cheatsheets](https://rstudio.com/resources/cheatsheets/) website. 4 | 5 | - Particularly relevant at this point is the cheatsheet that describes how to interact with the [Rstudio IDE](https://github.com/rstudio/cheatsheets/blob/main/rstudio-ide.pdf). 6 | - We have also saved a frozen copy of this cheatsheet in the course repo (./resources/cheatsheet-rstudio_ide.pdf). 7 | -------------------------------------------------------------------------------- /code/3.1.3_geom_bar.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(ggplot2) 3 | library(patchwork) 4 | 5 | squirrel_subset <- read.csv('squirrels_subset.csv') 6 | 7 | squirrel_subset_by_color <- squirrel_subset %>% 8 | group_by(primary_fur_color) %>% 9 | summarise(count_by_color = n()) 10 | 11 | plot_col <- squirrel_subset_by_color %>% 12 | ggplot() + 13 | aes(x = primary_fur_color, y = count_by_color) + 14 | geom_col() 15 | 16 | plot_bar <- squirrel_subset %>% 17 | ggplot() + 18 | aes(x = primary_fur_color) + 19 | geom_bar() 20 | 21 | plot_col | plot_bar 22 | -------------------------------------------------------------------------------- /code/3.3.1_geom__.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(ggplot2) 3 | library(patchwork) 4 | 5 | squirrel_subset <- read.csv('squirrels_subset.csv') 6 | 7 | squirrel_scatter <- squirrel_subset %>%
8 | ggplot() + 9 | aes(x = long, y = lat) + 10 | geom_point() 11 | 12 | squirrel_long_histogram <- squirrel_subset %>% 13 | ggplot() + 14 | aes(x = long) + 15 | geom_histogram() 16 | 17 | squirrel_long_density <- squirrel_subset %>% 18 | ggplot() + 19 | aes(x = long) + 20 | geom_density() 21 | 22 | squirrel_long_histogram / squirrel_long_density 23 | 24 | geom_ 25 | -------------------------------------------------------------------------------- /reading_calls/demo_of_project_outcomes.md: -------------------------------------------------------------------------------- 1 | # Demo of Project Outcomes 2 | 3 | This is the first semester that MIDS 1C has been in existence, and so there aren't any outcomes to demo just yet. 4 | 5 | However, if you work on a project, and you want to show it off for people in the future, let me know (@alex.h). What I'll do is bring in your knitted .md file and link it on this page. 6 | 7 | I really, really, really encourage you to share your code, because showing people what all kinds of good, bad, and ugly look like is useful. Plus, it will be a fun record to look back on when you're graduating. 8 | -------------------------------------------------------------------------------- /reading_calls/restructure_codebase.md: -------------------------------------------------------------------------------- 1 | # Reading about codebase structure 2 | 3 | Rather than restructuring a codebase, this semester I'd like you to read about how Cookiecutter data science thinks that a project might be structured. 4 | 5 | https://drivendata.github.io/cookiecutter-data-science/ 6 | 7 | While I think that this is a starting place, it is overkill for small projects. As you come into w203, and in particular the _Hypothesis Testing_ and _Final Lab_ this structure will help you to be able to build a working pipeline that builds toward clean data that you can write clean tests on. 
8 | -------------------------------------------------------------------------------- /code/3.16.6_filter.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 4 | launches <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 5 | 6 | # filter 7 | 8 | filter(launches, launch_year < 1968, launch_year > 1966, agency == 'US') 9 | 10 | # magrittr: %>% 11 | 12 | launches %>% 13 | filter(launch_year < 1968) %>% 14 | filter(launch_year > 1966) %>% 15 | filter(agency == 'US' | agency == 'SU') 16 | 17 | 18 | -------------------------------------------------------------------------------- /code/4.1.5_r_.R: -------------------------------------------------------------------------------- 1 | # population average is 100 2 | # population sd is 20 3 | 4 | rnorm(n = 42, mean = 100, sd = 20) 5 | 6 | # 100: 7 | # pop: average 42, 8 | # var: 100 9 | 10 | first_draw <- rnorm(n = 100, mean = 42, sd = 10) 11 | second_draw <- rnorm(n = 100, mean = 42, sd = 10) 12 | 13 | mean(first_draw == second_draw) 14 | 15 | draws <- runif(n = 1000, min = -1, max = 9) 16 | hist(draws, col = 'black') 17 | 18 | 19 | 20 | urn <- c('red_ball', 'blue_ball', 'green_ball') 21 | sample(x = urn, size = 3, replace = FALSE) 22 | 23 | lett <- c('a', 'b', 'c', 'd', 'e', 'f') 24 | sample(lett) 25 | 26 | -------------------------------------------------------------------------------- /code/3.19_select.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | agencies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 4 | launches <- 
readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 5 | 6 | launches %>% 7 | select(launch_year, launch_date, agency, agency_type) %>% 8 | filter(launch_year > 1968 & launch_year < 1972) %>% 9 | arrange(desc(launch_year)) %>% 10 | view() 11 | 12 | launches %>% 13 | select(contains('agency'), contains('launch')) %>% 14 | view() 15 | -------------------------------------------------------------------------------- /code/3.22_summarise.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | agencies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 4 | launches <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 5 | 6 | agencies %>% 7 | mutate(count_log = log(count)) %>% 8 | view 9 | 10 | agencies %>% 11 | summarize( 12 | average_launches = mean(count), 13 | var_launches = var(count), 14 | number_of_agencies = length(unique(agencies)), 15 | count_log = log(count) 16 | ) 17 | -------------------------------------------------------------------------------- /code/3.5.2_facet_.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(ggplot2) 3 | library(patchwork) 4 | 5 | squirrel_subset <- read.csv('squirrels_subset.csv') 6 | 7 | squirrel_subset <- squirrel_subset %>% 8 | mutate( 9 | date_f = as.Date.character(date, format = '%m%d%Y') 10 | ) 11 | 12 | squirrel_subset_by_color <- squirrel_subset %>% 13 | group_by(primary_fur_color) %>% 14 | summarise(count_by_color = n()) 15 | 16 | squirrel_subset %>% 17 | ggplot() + 18 | aes(x = date_f) + 19 | geom_histogram() + 20 | facet_wrap(vars(primary_fur_color), nrow = 3) 21 | 22 | squirrel_subset %>% 23 | group_by() 24 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | Icon 6 | 7 | # General 8 | .DS_Store 9 | .AppleDouble 10 | .LSOverride 11 | 12 | # Icon must end with two \r 13 | Icon 14 | Icon\r\r 15 | Icon\? 16 | "Icon? 17 | 18 | # Thumbnails 19 | ._* 20 | 21 | # Files that might appear in the root of a volume 22 | .DocumentRevisions-V100 23 | .fseventsd 24 | .Spotlight-V100 25 | .TemporaryItems 26 | .Trashes 27 | .VolumeIcon.icns 28 | .com.apple.timemachine.donotpresent 29 | 30 | # Directories potentially created on remote AFP share 31 | .AppleDB 32 | .AppleDesktop 33 | Network Trash Folder 34 | Temporary Items 35 | .apdisk 36 | Icon\r\r -------------------------------------------------------------------------------- /code/4.13.1_show_hide_comment.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Show, Hide, Comment" 3 | author: "Alex Hughes" 4 | date: "6/7/2020" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | # Show 13 | 14 | - Code that loads data (always) 15 | - Code that estimates models (always) 16 | - A Dump of Data (never) 17 | 18 | # Hide 19 | 20 | - Cleaning code (unless that is the review that you're looking for) 21 | - Issues and notes to yourself 22 | 23 | # Comment 24 | 25 | - Documentation for functions, in the function 26 | - Any knowledge that someone *running* the code might need. -------------------------------------------------------------------------------- /reading_calls/make_scatter_plots.md: -------------------------------------------------------------------------------- 1 | # Make Scatter Plots 2 | 3 | Ok! Now it is your turn. I've just talked you through how I might make scatter plots that represent features about these squirrels. 
Now, you go and do the same. 4 | 5 | - Navigate to the [UCB Datahub]( https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) 6 | - Open the file `make_scatter_plots.R` and create the same plots that I've just created in the lectures. 7 | - While you should feel welcome to look back at the lecture content, I'll try to provide enough scaffolding that you can write the code without needing to return to the lecture. 8 | -------------------------------------------------------------------------------- /reading_calls/alternatives_to_rstudio.md: -------------------------------------------------------------------------------- 1 | # Alternatives to Rstudio 2 | 3 | Just as jupyter has been the standard method of working with python for data science, Rstudio has emerged as the standard editor of working with R for data science. However, Rstudio is far from the only option. 4 | 5 | Of particular note for alternatives: 6 | 7 | - Emacs users will be **smugly** satisfied that ESS was purpose built for interactive programming in R. It is very, very good if you want to use a general purpose editor for writing your R code. 8 | - Microsoft’s VSCode has several really impressive extensions for working with the language as well. That isn't too surprising, because the primary development of the R language is now underwritten in part by Microsoft. 
9 | 10 | -------------------------------------------------------------------------------- /code/arrange.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Arrange' 3 | author: 'w203: Statistics for Data Science' 4 | date: "8/13/2020" 5 | output: github_document 6 | --- 7 | 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 9 | library(tidyverse) 10 | ``` 11 | 12 | # Task 13 | 14 | Using the `launches` data, the `arrange()` verb, and the `head()` verb: 15 | 16 | - Print the earliest launches 17 | 18 | ```{r} 19 | launches %>% 20 | arrange(launch_date) %>% 21 | head() 22 | ``` 23 | 24 | - Still using the head verb, use the `desc()` adverb to print the *most recent* launches 25 | 26 | ```{r} 27 | 28 | ``` 29 | 30 | Adding in the `state_code` variable, 31 | 32 | - Arrange the data so that it is sorted first by `launch_year` and then by `state_code` 33 | 34 | ```{r} 35 | 36 | ``` 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /reading_calls/make_line_plots.md: -------------------------------------------------------------------------------- 1 | # Make Line Plots 2 | 3 | Ok! Now it is your turn again. I've just talked you through how I might make line plots that represent connections through time. Now, you go and do the same. Like last time, I'll provide you with most of the boilerplate code, and you can do the work to actually write the line plots out. 4 | 5 | - Navigate to the [UCB Datahub]( https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) 6 | - Open the file `make_line_plots.R` and create the same plots that I've just created in the lectures. 7 | - While you should feel welcome to look back at the lecture content, I'll try to provide enough scaffolding that you can write the code without needing to return to the lecture.
8 | -------------------------------------------------------------------------------- /code/select.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Select' 3 | author: 'w203: Statistics for Data Science' 4 | date: "8/13/2020" 5 | output: github_document 6 | --- 7 | 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 9 | library(tidyverse) 10 | ``` 11 | 12 | ```{r load data, message=FALSE} 13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 15 | ``` 16 | 17 | # Task 18 | 19 | - From the launches data, `select()` only the columns that are related to characteristics about the country 20 | 21 | ```{r} 22 | 23 | ``` 24 | 25 | - From the launches data, `select()` only the columns that are related to time 26 | 27 | ```{r} 28 | 29 | ``` 30 | 31 | -------------------------------------------------------------------------------- /reading_calls/grouped_data.md: -------------------------------------------------------------------------------- 1 | # Grouped data 2 | 3 | We've just discussed approaches to make visual comparisons between groups. The approach we want to use depends on what particular comparisons we want to make between the groups, how many groups we have to compare, and even differences in the scales of these two groups. 4 | 5 | Now, we'll practice both approaches we covered with the `nyc_squirrels` dataset. 6 | 7 | # Coding task 8 | 9 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `grouped_data.Rmd`. 
10 | - This code will ask you to plot grouped data two different ways: 11 | - Once using multiple series on the same plot 12 | - Once using `facet_wrap()` to plot each series on its own plot 13 | 14 | 15 | -------------------------------------------------------------------------------- /code/3.25_group_by.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | agencies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 4 | launches <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 5 | 6 | d <- data.frame( 7 | id = 1:15, 8 | individual = rep(LETTERS[1:5], each = 3), 9 | group = rep(c('Group One', 'Group Two'), times = c(9, 6)), 10 | time = rep(1:3, times = 5), 11 | value = (1:15)^2 12 | ) 13 | 14 | d <- d %>% 15 | arrange(sample(id)) 16 | 17 | d %>% 18 | group_by(group, time) %>% 19 | summarise( 20 | value_average = mean(value) 21 | ) 22 | 23 | 24 | launches %>% 25 | group_by(state_code, launch_year) %>% 26 | summarise(total_launches = n()) %>% 27 | arrange(desc(total_launches)) 28 | -------------------------------------------------------------------------------- /code/3.11.1_controlling_plot_extents.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(ggplot2) 3 | library(patchwork) 4 | 5 | squirrel_subset <- readr::read_csv('squirrels_subset.csv') 6 | 7 | squirrel_subset_by_color <- squirrel_subset %>% 8 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) %>% 9 | group_by(date_f, primary_fur_color) %>% 10 | summarise(count_of_squirrels = n()) 11 | 12 | squirrel_subset_by_color %>% 13 | ggplot() + 14 | aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 15 | stat_smooth(se = FALSE) + 16 | labs( 17 | title = 'Decreasing Count of Squirrels Through Time', 18 | subtitle = 
'Moving average smoother estimate', 19 | x = 'Date of Observation', 20 | y = 'Count of Squirrels', 21 | color = 'Primary Fur Color' 22 | ) + 23 | coord_cartesian( 24 | xlim = c(as.Date.character('2018-10-08'), 25 | as.Date.character('2018-10-15'))) -------------------------------------------------------------------------------- /reading_calls/be_your_own_linter.md: -------------------------------------------------------------------------------- 1 | The good folks over at RStudio make a [coding style guide](https://style.tidyverse.org "style_guide") available. Just like all style guides, it is an opinionated resource, but their opinions are pretty reasonable! 2 | 3 | Take a gander at the following sections: 1-5 in the Style Guide. 4 | 5 | I like to think of code as craft: 6 | 7 | - Maybe you're careful about how you make your coffee; or, 8 | - How you cut your vegetables; 9 | - How you park your car; 10 | - How you do your hair 11 | 12 | There is something in your life that you take satisfaction in doing well. That's why you're here! Writing elegant code _can_ be one of those. But, as you're going to read in the **next** section, please don't let the perfect be the enemy of the "it works". 13 | 14 | I write python code with an R accent. I write R code with an "old person" accent. But, I'm learning new things every day.
15 | 16 | -------------------------------------------------------------------------------- /code/3.13.1_setting_the_theme.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(ggplot2) 3 | library(patchwork) 4 | 5 | theme_set(theme_minimal()) 6 | 7 | squirrel_subset <- readr::read_csv('squirrels_subset.csv') 8 | 9 | squirrel_subset_by_color <- squirrel_subset %>% 10 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) %>% 11 | group_by(date_f, primary_fur_color) %>% 12 | summarise(count_of_squirrels = n()) 13 | 14 | squirrel_subset_by_color %>% 15 | ggplot() + 16 | aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 17 | stat_smooth(se = FALSE) + 18 | labs( 19 | title = 'Decreasing Count of Squirrels Through Time', 20 | subtitle = 'Moving average smoother estimate', 21 | x = 'Date of Observation', 22 | y = 'Count of Squirrels', 23 | color = 'Primary Fur Color') + 24 | coord_cartesian( 25 | xlim = c(as.Date.character('2018-10-08'), 26 | as.Date.character('2018-10-15'))) 27 | -------------------------------------------------------------------------------- /code/mutate.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: '' 3 | author: 'w203: Statistics for Data Science' 4 | date: "8/13/2020" 5 | output: github_document 6 | --- 7 | 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 9 | library(tidyverse) 10 | ``` 11 | 12 | ```{r load data, message=FALSE} 13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 15 | ``` 16 | 17 | # Mutate Task 18 | 19 | - Using the `agencies` data, create a series of variables that contain the log of the `count` of launches. 
20 | 21 | ```{r} 22 | 23 | ``` 24 | 25 | - Then, show only the columns that are called either `agency` or `contains()` the string "count". 26 | 27 | ```{r} 28 | 29 | ``` 30 | 31 | - Finally, `arrange()` these descending by `count`. 32 | 33 | ```{r} 34 | 35 | ``` -------------------------------------------------------------------------------- /reading_calls/join_and_merge.md: -------------------------------------------------------------------------------- 1 | # Joins and Merges 2 | 3 | There's a maxim: 4 | 5 | > If the data were easy to have, someone would have already done it! 6 | 7 | Bringing data together from multiple sources, finding the common keys between the data, and arranging things so that they are ready to use is, at different points: fun, tedious, rewarding, and frustrating. 8 | 9 | Please, read the [R For Data Science](https://r4ds.had.co.nz/relational-data.html) chapter on relational data for merging. There is a lot covered in this chapter so, let me provide you an edited list (in case you don't want to read it all). 10 | 11 | - Read: 13.1 Introduction 12 | - Read: 13.1.1 Prerequisites -- and, in fact, log on to the datahub and load the data so that you can read and type along.
13 | - Read: 13.2 and 13.3, but skip the exercises 14 | - Read: 13.4; and, do the exercises for 13.4.6 15 | - Read: 13.5, but skip the exercises in 13.5.1 16 | - Read: 13.6 17 | - **Skip** 13.7 18 | 19 | -------------------------------------------------------------------------------- /code/select_solution.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Select' 3 | author: 'w203: Statistics for Data Science' 4 | date: "8/13/2020" 5 | output: github_document 6 | --- 7 | 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 9 | library(tidyverse) 10 | ``` 11 | 12 | ```{r load data, message=FALSE} 13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 15 | ``` 16 | 17 | # Task 18 | 19 | - From the launches data, `select()` only the columns that are related to characteristics about the country 20 | 21 | ```{r} 22 | launches %>% 23 | select(agency, state_code, agency_type) 24 | ``` 25 | 26 | - From the launches data, `select()` only the columns that are related to time 27 | 28 | ```{r} 29 | launches %>% 30 | select(tag, JD, launch_date, launch_year) 31 | ``` 32 | 33 | -------------------------------------------------------------------------------- /reading_calls/read_about_plots.md: -------------------------------------------------------------------------------- 1 | # Read About Plots 2 | 3 | Please read in the book _R For Data Science_ 4 | 5 | - If you are reading from the physical copy of the book, please read pages 1-13. 6 | - If you are reading from the [digital copy](https://r4ds.had.co.nz/explore-intro.html) of the book, please read 7 | - 2: Introduction 8 | - 3: Data visualization sections 3.1, 3.2, 3.3, and 3.4 stopping before the section 3.5 "facets".
9 | 10 | This reading is the same (at least as of the time that we've made the course available). For this content, because the language is changing, I actually think that reading the online (digital) version of the resource is a better idea (although I read from a physical copy of the book). 11 | 12 | As you're reading, you can follow along and execute the code that the authors are talking about in your DataHub, if you like. Consider this, perhaps by making a new file, and writing the code, executing, and reading the output. 13 | -------------------------------------------------------------------------------- /code/make_a_data_set.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Make A Data Set" 3 | author: 'w203: Statistics for Data Science' 4 | date: "8/14/2020" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | library(tidyverse) 10 | ``` 11 | 12 | # Paste Farm Code 13 | 14 | Start by pasting the code that you wrote to create the farm animals below. 15 | 16 | ```{r} 17 | 'farm animal creating code here' 18 | ``` 19 | 20 | ## Combine into a dataset 21 | 22 | Now, combine all of these variables into a single dataset, called `tilden`. 23 | 24 | ## Typecast 25 | 26 | Now do a little bit of mutating and type converting. 27 | 28 | - Since we know that each of the types of animals has a specific amount that they eat, let's label them "hungry boi" if they eat more than average; and "slender boi" if they eat less than average. (Sorry... I know this coding might be getting tedious!). 29 | - Make each of these relative to the mean within that animal type. 
30 | 31 | ```{r} 32 | 33 | ``` 34 | 35 | -------------------------------------------------------------------------------- /reading_calls/r_markdown.md: -------------------------------------------------------------------------------- 1 | # Reading Call: R Markdown 2 | 3 | Without being **super** clear about it, for the last several units we've been using R Markdown documents. These are documents that let you mix code and description into a single plain text file. 4 | 5 | There are several benefits to this: 6 | 7 | - You can document your code in place 8 | - You can produce reports from your code that are **always** in sync with the data and codebase 9 | - You can share these documents with others; and keep them under version control 10 | 11 | Now, I'd like you to read a little more detail about the system in R For Data Science. Please read the following: 12 | 13 | - [Chapter 26](https://r4ds.had.co.nz/communicate-intro.html) in the digital version of _R For Data Science_ and [Chapter 27](https://r4ds.had.co.nz/r-markdown.html) in the same. 14 | - In Chapter 27, read sections 1-6, skipping section 7. 
15 | - If you're reading in the physical copy, these are the chapters about "Communicating" and "R Markdown" 16 | -------------------------------------------------------------------------------- /code/group_by_summarize.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Split, Apply, Combine' 3 | author: 'w203: Statistics for Data Science' 4 | date: "8/13/2020" 5 | output: github_document 6 | --- 7 | 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 9 | library(tidyverse) 10 | ``` 11 | 12 | ```{r load data, message=FALSE} 13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 15 | ``` 16 | 17 | # Task 18 | 19 | - Using the `launches` data, count the total number of launches per `launch_year`, grouped by `state_code`. 20 | - Then, using `arrange` answer the question: which year was the busiest for any state? 21 | - Then, using `filter` answer the question: what was the busiest year for the US? 22 | - Then, using another variable summary, answer the question: which country has the most variance in the per-year launches? 
-------------------------------------------------------------------------------- /code/make_line_plots.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(ggplot2) 3 | 4 | squirrel_subset <- read_csv('./squirrels_subset.csv') 5 | 6 | squirrel_subset <- squirrel_subset %>% 7 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) 8 | 9 | ## Task 1: Do the core work 10 | ## - Make a line plot that shows the squirrels observed by date 11 | 12 | squirrel_subset %>% 13 | group_by(date_f) %>% 14 | summarise(count_of_squirrels = n()) %>% 15 | ggplot() + 16 | aes(x = 'fill this in', y = 'fill this in') + 17 | 'fill this in' 18 | 19 | ## Challenge Task 2: Add on to the core work 20 | ## - We haven't covered this yet, so it is ABSOLUTELY optional. 21 | ## - But, what if you wanted to make a separate line plot for each of the 22 | ## colors of squirrels? 23 | 24 | ## - To do so, you'd have to change how you're grouping (to count colors 25 | ## separately), and then you'd have to bring that color variable (which 26 | ## is a data feature) into your final line plot. 27 | 28 | ## If you're interested, try it! 29 | -------------------------------------------------------------------------------- /reading_calls/rstudio_cheatsheet.md: -------------------------------------------------------------------------------- 1 | # R Markdown Cheatsheet 2 | 3 | Return to the cheatsheet resource that Rstudio maintains. 4 | 5 | - The top-level repository with the cheatsheets is [here](https://rstudio.com/resources/cheatsheets/); and, 6 | - The cheatsheet that is specifically useful right now is the [Rmarkdown cheatsheet](https://github.com/rstudio/cheatsheets/raw/master/rmarkdown-2.0.pdf). (Note, this link might download a PDF when you click it.) 7 | 8 | This cheatsheet is something that you might want to have around when you're producing document and work for w203. 
9 | 10 | For now, if you're interested in seeing how the expanded set of options available to you work, navigate to the UCB Datahub, head to the code folder, and open `make_scatter_plots_solution.Rmd`. 11 | 12 | There is nothing _particularly_ special about this file, but try to see if you can use the Datahub to output the solutions into different formats by changing the `output:` call in the YAML and re-knitting. See what happens if you insert new chunks, or change options that are passed into chunks. 13 | -------------------------------------------------------------------------------- /code/code_for_videos.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(patchwork) 3 | 4 | theme_set(theme_minimal()) 5 | 6 | ggplot(data = ss) + 7 | aes(x = long) + 8 | aes(y = lat) + 9 | geom_point() 10 | 11 | p1 <- ggplot(data = ss, aes(x = long, y = lat)) + geom_point() 12 | 13 | p2 <- ggplot(data = ss, aes(long)) + 14 | geom_histogram() 15 | 16 | p1 / p2 17 | 18 | ggplot(data = ss, aes(x = long, y = lat, shape = primary_fur_color, color = primary_fur_color)) + 19 | geom_point() + 20 | coord_quickmap() 21 | 22 | ggplot(economics, aes(date, unemploy)) + 23 | geom_line() 24 | 25 | ggplot(economics_long, aes(date, value01, colour = variable)) + 26 | geom_line() 27 | 28 | ggplot() + 29 | geom_line(data = economics, mapping = aes(x = date, y = pce)) + 30 | geom_line(data = economics, mapping = aes(x = date, y = pop)) 31 | 32 | economics_long %>% 33 | filter(variable %in% c('pce', 'pop')) %>% 34 | ggplot(aes(date, value, color = variable)) + 35 | geom_line() 36 | 37 | economics_long %>% 38 | ggplot(aes(date, value01, color = variable)) + 39 | geom_line() 40 | -------------------------------------------------------------------------------- /code/filter.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: '' 3 | author: 'w203: Statistics for Data Science' 4 | date: 
"8/13/2020" 5 | output: github_document 6 | --- 7 | 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 9 | library(tidyverse) 10 | ``` 11 | 12 | ```{r load data} 13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 15 | ``` 16 | 17 | # Task 18 | 19 | Use the magrittr pipe `%>%` and the `filter()` verb to do the following work: 20 | 21 | ## Launches in the 1980s 22 | 23 | Find only the launches that occurred in the 1980s 24 | 25 | ## Launches by France 26 | 27 | Find only the launches that were conducted by France (`state_code == "F"` ) 28 | 29 | ## Launches by France in the 1980s 30 | 31 | Find only the launches by France that were conducted in the 1980s 32 | 33 | ## Launches by France or Russia in the 1980s 34 | 35 | Find launches in the 1980s by either France or the Soviet Union -------------------------------------------------------------------------------- /reading_calls/goals_of_the_project.md: -------------------------------------------------------------------------------- 1 | # Goals of the Project 2 | 3 | If you're going to take on a small project here are the goals that I think might help you to scope it: 4 | 5 | 1. The data should already be collected 6 | 2. The data should be in one, or maybe two files 7 | 3. If there is a time component to the data, then your key insight should be a plot for how simple values of the data change over time; 8 | 4. If there is not a time component to the data, then your key insight should be how different categories that are represented in the data have different values on the outcomes that you care about. 9 | 10 | # Where to Get Data 11 | 12 | - I've pointed to Tidy Tuesday (https://github.com/rfordatascience/tidytuesday) before as a nice place to get data. 
In fact, that is where both the squirrels and space launches data came from. But, a lot of students will probably end up using data from there. 13 | - Another good option is the #datasets channel in the School's Slack channel. 14 | - Also a good bet are the New York Times, LA Times, Wall Street Journal, and Economist github pages. 15 | -------------------------------------------------------------------------------- /reading_calls/control_flow.md: -------------------------------------------------------------------------------- 1 | # Control Flow 2 | 3 | For individuals who are coming from an engineering background, control flow is second nature. For folks coming from other backgrounds (like myself) the concept can be a little baffling. 4 | 5 | The general idea is that you're going to set a condition, and take some actions given that condition holds. If that condition doesn't hold, you'll do something else -- maybe take some other action, maybe stop, maybe ... 6 | 7 | The thing is: when dealing with data it **MIGHT** seem like a good idea to write loops, but generally it is not. R is built for vector operations -- which you've been using for a while, and used just a moment ago when you type cast the 'thick bois' and 'slender bois'. This vector operation has two benefits: 8 | 9 | 1. It makes the code much more legible, reducing the boilerplate that you've got to write; 10 | 2. It makes the code much faster; most vectorized operations are written in compiled, optimized C -- a lower-level, faster language. 11 | 12 | For now, read [this short explanation](https://adv-r.hadley.nz/control-flow.html) about control flow in R. 13 | 14 | 15 | -------------------------------------------------------------------------------- /reading_calls/tidying_arranging_and_summarizing.md: -------------------------------------------------------------------------------- 1 | # Tidying, Arranging, and Summarizing 2 | 3 | We're now to the point that we're going to really start working with data. 
Until this point, I've basically provided you with the data in the form that you need it. 4 | 5 | But tidying, arranging, and summarizing data is **so** core to many of the tasks that we work on. 6 | 7 | In this section we're going to start a series of really quick loops between a lecture on a concept and a short coding exercise that uses that concept. 8 | 9 | # Reading Task 10 | 11 | To begin please read the following chapter in *R For Data Science*. 12 | 13 | - If you're reading the hard copy, the chapter is **Chapter 5: Exploratory Data Analysis** 14 | - If you're reading the digital copy it is **Chapter 7: Exploratory Data Analysis** and you can get to it [at this link](https://r4ds.had.co.nz/exploratory-data-analysis.html). 15 | 16 | As you're reading along, try to think of each new concept that comes in as a **verb** that _does_ something to data. The idea within the `dplyr` framework is that data **is** and we as data scientists do things to modify the canonical source data. 17 | -------------------------------------------------------------------------------- /code/make_line_plots_solution.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Line Plots Solutions" 3 | author: 'w203: Statistics for Data Science' 4 | output: github_document 5 | --- 6 | 7 | ```{r setup, results = 'hide', warning=FALSE, message=FALSE} 8 | library(tidyverse) 9 | library(ggplot2) 10 | theme_set(theme_minimal()) 11 | knitr::opts_chunk$set(dpi = 200) 12 | ``` 13 | 14 | ```{r load data} 15 | squirrel_subset <- read_csv('./squirrels_subset.csv') 16 | squirrel_subset <- squirrel_subset %>% 17 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) 18 | ``` 19 | 20 | # Squirrels by date 21 | 22 | - Make a line plot that shows the squirrels observed by date 23 | 24 | ```{r} 25 | squirrel_subset %>% 26 | group_by(date_f) %>% 27 | summarise(count_of_squirrels = n()) %>% 28 | ggplot() + 29 | aes(x = date_f, y = count_of_squirrels) + 
30 | geom_line() 31 | ``` 32 | 33 | # Challenge: Squirrels by color by date 34 | 35 | ```{r} 36 | squirrel_subset %>% 37 | group_by(date_f, primary_fur_color) %>% 38 | summarise(count_of_squirrels = n()) %>% 39 | ggplot() + 40 | aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 41 | geom_line() 42 | ``` 43 | 44 | -------------------------------------------------------------------------------- /reading_calls/working_with_rstudio.md: -------------------------------------------------------------------------------- 1 | # Working with Rstudio 2 | 3 | At this point, please go and use the UCB Datahub to do a little bit of practice working with the Rstudio IDE. 4 | 5 | The work is going to be simple -- I am asking you to execute code that has already been mostly written for you. 6 | 7 | - The goal is that you can start to work with the tool; articulate it and see how it responds. 8 | - This way, when we move forward you can use the tool to answer questions that you're interested in. 9 | 10 | If you click [this link to the UCB Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) the datahub will pull a current version of the code that I would like you to work on. 11 | 12 | When the datahub starts up, navigate to the `r_bridge` folder, and then `code` and then to the file called, `working_with_rstudio.R`. 13 | 14 | Although you can't see the image here in the IVSC, if you're lost and would like a picture to show you where I'm navigating, if you "Open in GitHub" there will be a image that shows. 
15 | 16 | ![](./images/r_bridge_datahub.png) 17 | 18 | -------------------------------------------------------------------------------- /reading_calls/how_to_summarise.md: -------------------------------------------------------------------------------- 1 | # Summarising the Information In A Plot 2 | 3 | In this lecture I've just made the argument that you might want to reduce the information that is present in a plot as a method of highlighting the core insight that you're trying to communicate. 4 | 5 | Plots are pieces of your rhetorical toolkit when you are communicating about data. As such, you have the ability to highlight (or lowlight) the features of the story that you are telling within your plot. 6 | 7 | This doesn't mean that you can lie with your plots; that's not the goal. But, the goal is to make the clearest argument for what you've learned from the plot, while also communicating how you've learned that bit. 8 | 9 | In the lecture that you just saw, we accomplish this by smoothing the data using the `stat_smooth()` moving average smoother. 10 | 11 | # Coding task 12 | 13 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `how_to_summarize.Rmd`. 14 | - This code will ask you to use the `stat_smooth()` function. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /reading_calls/issuing_code_and_reading_output.md: -------------------------------------------------------------------------------- 1 | # Issuing Code and Reading Output 2 | 3 | At this point, please go and use the UCB Datahub to do a little bit of practice working with the Rstudio IDE. 4 | 5 | The work is going to be simple -- I am asking you to execute code that has already been mostly written for you. 
6 | 7 | - The goal is that you can start to work with the tool; articulate it and see how it responds. 8 | - This way, when we move forward you can use the tool to answer questions that you're interested in. 9 | 10 | If you click [this link to the UCB Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) the datahub will pull a current version of the code that I would like you to work on. 11 | 12 | When the datahub starts up, navigate to the `r_bridge` folder, and then `code` and then to the file called, `issuing_code_and_reading_outputs.R`. 13 | 14 | Although you can't see the image here in the IVSC, if you're lost and would like a picture to show you where I'm navigating, if you "Open in GitHub" there will be an image that shows. 15 | 16 | ![](./images/r_bridge_datahub.png) 17 | 18 | -------------------------------------------------------------------------------- /reading_calls/review_code.md: -------------------------------------------------------------------------------- 1 | # Review Code 2 | 3 | This [article](https://www.nature.com/articles/nenergy2016170) applies simple models on top of survey data from respondents who live in rural India. It was published at Nature Energy, which is in the _Nature_ constellation of publication venues. 4 | 5 | I would hate for someone to remark this about my research, but there's nothing really extraordinary about this work. But, we learned something from it. A small thing, but something that _Nature_ thought was important enough to publish. 6 | 7 | Now, the authors publish their data and code alongside this work. Follow [this link](https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/QY5R7R/PVKVR8&version=2.0) which will take you to the core code that authors used to generate these findings. 8 | 9 | # Questions 10 | 11 | 1. 
Does this code conform to the coding style guides that you've just read about? 
12 | 2. What is one, specific thing that you would do to reformat their code to make it more legible? 13 | 14 | If you're so inclined -- but don't spend more than 5 minutes doing so -- you could paste this code into your Datahub and restructure some parts of it to make it more legible. 15 | 16 | 17 | -------------------------------------------------------------------------------- /code/arrange_solution.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Arrange' 3 | author: 'w203: Statistics for Data Science' 4 | date: "8/13/2020" 5 | output: github_document 6 | --- 7 | 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 9 | library(tidyverse) 10 | ``` 11 | 12 | ```{r load data} 13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 15 | ``` 16 | 17 | # Task 18 | 19 | Using the `launches` data, the `arrange()` verb, and the `head()` verb: 20 | 21 | - Print the earliest launches 22 | 23 | ```{r} 24 | launches %>% 25 | arrange(launch_date) %>% 26 | head() 27 | ``` 28 | 29 | - Still using the head verb, use the `desc()` adverb to print the *most recent* launches 30 | 31 | ```{r} 32 | launches %>% 33 | arrange(desc(launch_date)) %>% 34 | head() 35 | ``` 36 | 37 | Adding in the `state_code` variable, 38 | 39 | - Arrange the data so that it is sorted first by `launch_year` and then by `state_code` 40 | 41 | ```{r} 42 | launches %>% 43 | arrange(launch_year, state_code) 44 | ``` 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /code/make_bar_plots.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Making Bar Plots" 3 | author: 'w203: Statistics for Data Science' 4 | output: github_document 
5 | --- 6 | 7 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 8 | library(tidyverse) 9 | library(ggplot2) 10 | # install.packages('patchwork') 11 | library(patchwork) 12 | 13 | theme_set(theme_minimal()) 14 | knitr::opts_chunk$set(dpi = 200) 15 | ``` 16 | 17 | ```{r load data} 18 | squirrel_subset <- read.csv('squirrels_subset.csv') 19 | ``` 20 | 21 | ```{r aggregate data} 22 | squirrel_subset_by_color <- squirrel_subset %>% 23 | group_by(primary_fur_color) %>% 24 | summarise(count_by_color = n()) 25 | ``` 26 | 27 | # Task 28 | 29 | Produce two identical plots that have the following characteristics: 30 | 31 | - On the x-axis the plots have the color of the squirrel fur 32 | - On the y-axis the plots have a count of the number of squirrels that have that color fur 33 | 34 | However, make these plots in two ways: 35 | 36 | 1. In one plot, use `geom_bar()` (and the appropriate dataset) 37 | 2. In the other plot, use `geom_col()` (and the appropriate dataset) 38 | 39 | Note that the datasets will be different for each of the different geometries. 40 | 41 | ```{r} 42 | 43 | ``` -------------------------------------------------------------------------------- /reading_calls/installing_r.md: -------------------------------------------------------------------------------- 1 | # Installing R 2 | 3 | R is available to install at https://cran.r-project.org. 4 | 5 | For right now, we want to discourage you from installing locally. Instead, we want you to focus on the work internal to the language; not the dev-ops to be able to use the language. 
6 | 7 | ## What a local install would mean 8 | 9 | Installing locally will mean 10 | 11 | - That you get access to full resources on your local machine :tada: 12 | - That you have to manage the access to all of those resources :-1: 13 | - That your setup will be different than everyone else's setup :fire: 14 | 15 | For now, when we have a limited view of what we're actually going to need, we think that we shouldn't over-invest in building technology. This probably means that we are "borrowing" from our future and so will have technical debt to pay. But, for right now, being cash rich and using the language is important. 16 | 17 | As an example, many of the high-performance parts of the language use specific C compilers. In the future, if you want to use these tools, you'll have to back-step to install the specific compilers, and then ensure that the rest of what you've built will still work. 18 | 19 | If you choose to go this way, best of luck! 20 | 21 | -------------------------------------------------------------------------------- /reading_calls/be_your_own_linter_solution.md: -------------------------------------------------------------------------------- 1 | # But Here's the thing 2 | 3 | Look, here's the thing. For some reason people are almost **universally** bashful about sharing their code with others. 4 | 5 | - No really, even though there is some sort of silly taboo against talking about salary (which I think is put in place to keep workers from demanding higher wages and fuller benefits from management... ok, politics aside...) I bet that if we randomly paired a group of people in the class together, they'd rather talk about salary than have to defend why they coded the last exercise the way they did. 6 | 7 | - Just don't worry about it. 8 | 9 | Code, like salaries works better when we can all see how others have approached something, asking why, and coming to a mutual understanding. 
10 | 11 | Here's some guidance from David Robinson (formerly with Stack Exchange) and Hadley Wickham (who literally re-wrote the language in the idiom that we're using): 12 | 13 | http://varianceexplained.org/programming/bad-code/ 14 | 15 | > This is exactly how you end up with crises of reproducible research in science. There are many reasons scientists publish papers without sharing their code (none of them defensible), but high on the list is embarrassment: “my code is too ugly to share.” Code shamers aren’t helping! 16 | -------------------------------------------------------------------------------- /reading_calls/review_code_solution.md: -------------------------------------------------------------------------------- 1 | # Review Code Solution 2 | 3 | One part of the code that I thought was particularly problematic was this set: 4 | 5 | ``` 6 | access$fuel_stack[(access$LPG==1) & 7 | (access$m4_q109_firewood=="No"& 8 | ((access$m4_q113_dungcake=="No") & 9 | ((access$m4_q114_agro=="No") & 10 | (access$m4_q115_other_fuel=="No"))))]<-4 #exclusive use 11 | 12 | ``` 13 | 14 | It is really, really hard to grok just what is and what isn't being selected. 15 | 16 | I'd re-write this, using the `dplyr` framework as: 17 | 18 | ``` 19 | access %>% 20 | filter( 21 | LPG == 1, m4_q109_firewood == 'No', m4_q113_dungcake == 'No', 22 | m4_q114_agro == 'No', m4_q115_other_fuel == 'No') %>% 23 | mutate(fuel_stack = 4) 24 | 25 | ``` 26 | 27 | I would also really, really, really like to know what the heck the `m4` object is, and why it is at the start of each of these variables. I would probably want to rename the entire set of variables. 28 | 29 | The thing is, when code is as poorly structured and named as this code is, it makes it really hard to collaborate! And, the only thing scarier to me than sharing my code, is being the only person who has looked at code that I'm going to deploy. 
30 | -------------------------------------------------------------------------------- /reading_calls/pick_a_theme.md: -------------------------------------------------------------------------------- 1 | # Pick a Theme 2 | 3 | I mean, c'mon. One of the most fun things about plotting is making it look "like a plot that I made". Amiright? 4 | 5 | There are a whole host of themes that are available within ggplot. As we've demonstrated in the lecture these can be set in one of two ways: 6 | 7 | 1. By adding a `+ theme_*()` layer onto your plot; or, 8 | 2. By setting the theme for the whole session using `theme_set(theme_*())`. 9 | 10 | (In both of those, you would replace the * with the theme name you want to use.) 11 | 12 | There are several themes that are built and maintained internal to the `ggplot` project. You can see a listing of them at [this link](https://ggplot2.tidyverse.org/reference/ggtheme.html). 13 | 14 | But, there are also a **ton** of other extensions that folks have built. Many of them are listed [at this github for ggthemes](https://github.com/jrnold/ggthemes). If you want to use these themes you can do the following: 15 | 16 | ``` 17 | install.packages('ggthemes', dependencies = TRUE) 18 | library(ggthemes) 19 | ``` 20 | 21 | And then add a theme from this layer as you might like. 22 | 23 | # Coding Task 24 | 25 | If you're interested: 26 | - Head to the Datahub. 27 | - Load the file `pick_a_theme.Rmd` and then work to set themes, picking one that you prefer. 28 | 29 | -------------------------------------------------------------------------------- /docs/schedule.md: -------------------------------------------------------------------------------- 1 | # Schedule 2 | 3 | This course is designed so that students can complete the *incoming*, structured work in four, two-hours long study sessions. (Or thereabouts.) The materials are structured as follows: 4 | 5 | 1. 
**Install Dependencies** - Get a working system going, and start to navigate the language. Because we advocate that you use a deployed environment hosted by UC Berkeley, this module can be completed in as few as 30 minutes. The student interested in building a version of the compute environment on their own machine will be walked through a Docker install. 6 | 2. **Thinking About Data** - "What information is encoded in my data, and how can I start to explore this information?" This module introduces plotting highly structured data by introducing the grammar of graphics. 7 | 3. **Plotting and Manipulating Data** - This module expands students' expressiveness when moving data into plots and introduces core concepts in data manipulation. The student is still firmly grounded in a data.frame. 8 | 4. **Working at the Project Level** - This module takes the student outside of the dataframe, both looking more broadly at structuring a project workflow and more narrowly at lower-level pieces of the `R` language. 9 | 5. **Sharing a Project** - This module sets the student out to work on a tightly-scoped project. 10 | -------------------------------------------------------------------------------- /reading_calls/installing_rstudio.md: -------------------------------------------------------------------------------- 1 | # Installing Rstudio Locally 2 | 3 | (This is a step that you only need to take if you're going to install locally!) 4 | 5 | Rstudio is available to install at https://rstudio.com/products/rstudio/. 6 | 7 | Rstudio is the IDE that we're advocating folks use while writing code in the R language. 8 | 9 | - It is probably the defacto editor for the language; 10 | - It provides a lot of really, really useful functionality that is specific to statistical computing, and is actively maintained and developed on. 11 | 12 | Like a local install of R, if you install on your local machine you'll have total control over what is available to you. 
But you'll also have to build everything that you want to be available. 13 | 14 | Whether you use the UCB Datahub, build this with a Docker image, or install locally, Rstudio does a _very_ good job of trying to make things "just work". This means that if you have code that requires a new package, it will suggest the install for that package; if you need to change your install of R it will suggest the install; and it has **enormously** helpful help documentation. 15 | 16 | (Remember, you only need to install R and Rstudio if you're installing them locally. If you're going to use a resource that we've provided, you don't have to install.) 17 | 18 | Best of luck! 19 | -------------------------------------------------------------------------------- /reading_calls/ucb_datahub.md: -------------------------------------------------------------------------------- 1 | # The UCB Datahub 2 | 3 | The most straightforward way to start working with R is to use the UC Berkeley datahub environment that is provided to you as a student. This hub is located at [http://ischool.datahub.berkeley.edu](http://ischool.datahub.berkeley.edu) and requires only that you use your UCB Single Sign on to authenticate. Upon following this link: 4 | 5 | - Authenticate with your Single Sign On 6 | - Click to authorize this app 7 | - Get coding! 8 | 9 | This has built and is running an instance of Rstudio on UC Berkeley resources that you can fully articulate. There are several features to this _out of the box_: 10 | 11 | - Although your datahub is working on a virtual machine, it is not ephemeral. What does this mean, practically? You can return to code that you have written in your datahub, merely by navigating back to the datahub. In fact, even code that you have written but not saved will probably still be accessible the next time that you log-on to the datahub. 
12 | - We have been able to specify libraries that we would like you to use in this instance, and have already compiled them 13 | - We should not undersell how nice this solution is. Thanks to the folks at Berkeley Research IT, BIDS, and CDSS. 14 | - If you're interested in more detail about the operation of the datahub, you can read about it [here](https://docs.datahub.berkeley.edu/en/latest/) 15 | -------------------------------------------------------------------------------- /code/make_bar_plots_solution.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Making Bar Plots" 3 | author: 'w203: Statistics for Data Science' 4 | output: github_document 5 | --- 6 | 7 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 8 | library(tidyverse) 9 | library(ggplot2) 10 | # install.packages('patchwork') 11 | library(patchwork) 12 | 13 | theme_set(theme_minimal()) 14 | knitr::opts_chunk$set(dpi = 200) 15 | ``` 16 | 17 | ```{r load data} 18 | squirrel_subset <- read.csv('squirrels_subset.csv') 19 | ``` 20 | 21 | ```{r aggregate data} 22 | squirrel_subset_by_color <- squirrel_subset %>% 23 | group_by(primary_fur_color) %>% 24 | summarise(count_by_color = n()) 25 | ``` 26 | 27 | # Task 28 | 29 | Produce two identical plots that have the following characteristics: 30 | 31 | - On the x-axis the plots have the color of the squirrel fur 32 | - On the y-axis the plots have a count of the number of squirrels that have that color fur 33 | 34 | However, make these plots in two ways: 35 | 36 | 1. In one plot, use `geom_bar()` (and the appropriate dataset) 37 | 2. In the other plot, use `geom_col()` (and the appropriate dataset) 38 | 39 | Note that the datasets will be different for each of the different geometries. 
40 | 41 | ```{r} 42 | plot_col <- squirrel_subset_by_color %>% 43 | ggplot() + 44 | aes(x = primary_fur_color, y = count_by_color) + 45 | geom_col() 46 | 47 | plot_bar <- squirrel_subset %>% 48 | ggplot() + 49 | aes(x = primary_fur_color) + 50 | geom_bar() 51 | 52 | plot_col | plot_bar 53 | ``` -------------------------------------------------------------------------------- /docs/syllabus.md: -------------------------------------------------------------------------------- 1 | # MIDS 1C: R Programming Bridge Course 2 | 3 | This bridge course is distinct from credit bearing courses in the degree -- it has no graded components. Instead, it has been designed so that a broad base of learners can come up to speed with some of the core pieces of the R language. 4 | 5 | In contrast to many methods of learning new languages, this course focuses on working with high-level parts of the language -- namely plotting (from `ggplot2`) and data manipulation (from `dplyr`) before addressing lower-level parts of the language. Indeed, this course is at an even higher-level part of the language api than the *R for Data Science* textbook. 6 | 7 | I (Alex) made this choice deliberately, based on a model of spoken language-learning: When learning a new language, we try to build concepts and vocabulary, even if this means that we conjugate verbs incorrectly or use inefficient methods of expressing ourselves. Often times, coding-language-learning tries to approach their language as though there is some axiomatic truth from which the language is derived. These "truths" are enshrined in style guides, deployed code, and bravado; but they hide a more important truth -- just start communicating and see what you can get done. 8 | 9 | You can probably navigate a busy market and appreciate the culture in a foreign land where you don't speak the language. Similarly, you can probably write clumsy code that expresses your intent and learn from your data. 
Just start writing and learning; we'll speak fluently soon enough. 10 | 11 | 12 | -------------------------------------------------------------------------------- /reading_calls/issuing_code_and_reading_output_solution.md: -------------------------------------------------------------------------------- 1 | # Solutions: Issuing Code and Reading Output 2 | 3 | If you're interested in seeing the solutions to the _small_ amount of code that I wrote, you can view the file back in the datahub. It is in the file `issuing_code_and_reading_outputs_solutions.R` 4 | 5 | But, I really only asked you to combine two lines, so you can probably view it here instead. 6 | 7 | ## Cats 8 | If you want to make a `cats` object that contains a vector of cat names, you can do so in the following way (you pick the names). 9 | 10 | ``` 11 | cats <- c('Fluffy', 'Tiger') 12 | ``` 13 | 14 | ## Top Dogs 15 | 16 | To create an object called `top_dog_names` that contains the most frequently used dog names you can "assign" the filtering pipeline that I wrote into the object using the `<-` operator. 17 | 18 | This _isn't_ going to assign the pipeline functions into that object; rather, it is going to assign the results of the pipeline into that object. This is an important difference that some will be interested in. 19 | 20 | - What this means: if you change the upstream data that is in `nyc_license` it will not change the values that are listed in the `top_dog_names` object. The elements of that object are written once at the time that you executed the code. 21 | 22 | ``` 23 | top_dog_names <- nyc_license %>% 24 | group_by(animal_name) %>% 25 | summarize(total = n()) %>% 26 | arrange(desc(total)) # this arranges in decending order 27 | ``` 28 | 29 | (Fun fact, [Top Dog](http://www.topdoghotdogs.com) is delicious. 
) 30 | -------------------------------------------------------------------------------- /code/make_bar_plots_solution.md: -------------------------------------------------------------------------------- 1 | Making Bar Plots 2 | ================ 3 | w203: Statistics for Data Science 4 | 5 | ``` r 6 | library(tidyverse) 7 | library(ggplot2) 8 | # install.packages('patchwork') 9 | library(patchwork) 10 | 11 | theme_set(theme_minimal()) 12 | knitr::opts_chunk$set(dpi = 200) 13 | ``` 14 | 15 | ``` r 16 | squirrel_subset <- read.csv('squirrels_subset.csv') 17 | ``` 18 | 19 | ``` r 20 | squirrel_subset_by_color <- squirrel_subset %>% 21 | group_by(primary_fur_color) %>% 22 | summarise(count_by_color = n()) 23 | ``` 24 | 25 | ## `summarise()` ungrouping output (override with `.groups` argument) 26 | 27 | # Task 28 | 29 | Produce two identical plots that have the following characteristics: 30 | 31 | - On the x-axis the plots have the color of the squirrel fur 32 | - On the y-axis the plots have a count of the nubmer of squirrels that 33 | have that color fur 34 | 35 | However, make these plots in two ways: 36 | 37 | 1. In one plot, use `geom_bar()` (and the appropriate dataset) 38 | 2. In the other plot, use `geom_col()` (and the appropriate dataset) 39 | 40 | Note that the datsets will be different for each of the different 41 | geometries. 
42 | 43 | ``` r 44 | plot_col <- squirrel_subset_by_color %>% 45 | ggplot() + 46 | aes(x = primary_fur_color, y = count_by_color) + 47 | geom_col() 48 | 49 | plot_bar <- squirrel_subset %>% 50 | ggplot() + 51 | aes(x = primary_fur_color) + 52 | geom_bar() 53 | 54 | plot_col | plot_bar 55 | ``` 56 | 57 | ![](make_bar_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png) 58 | -------------------------------------------------------------------------------- /code/make_it_sparkle.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "make_it_sparkle.Rmd" 3 | author: 'w203: Statistics for Data Science' 4 | output: github_document 5 | --- 6 | 7 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 8 | library(tidyverse) 9 | library(ggplot2) 10 | # install.packages('patchwork') 11 | library(patchwork) 12 | 13 | theme_set(theme_minimal()) 14 | knitr::opts_chunk$set(dpi = 200) 15 | ``` 16 | 17 | ```{r load and mutate data} 18 | squirrel_subset <- read.csv('squirrels_subset.csv') 19 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color)) 20 | 21 | squirrel_subset <- squirrel_subset %>% 22 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) 23 | ``` 24 | 25 | # Coding Task 26 | 27 | Suppose that you want to communicate to someone who has less context with the data. Set the `labs()` argument to include 28 | 29 | - A title 30 | - A subtitle 31 | - Descriptive Labels for the axes 32 | - A label for what the colors mean. 33 | 34 | Each of the labels should be in **plain spoken language** and should be in sentence case: 35 | 36 | - The first letter of the first word should be capitalized 37 | - The rest of the letters should *not* be capitalized 38 | - Everything should be a word that you can speak aloud -- i.e. **not** a variable name. 
39 | 40 | ```{r coding task} 41 | squirrel_subset %>% 42 | group_by(date_f, primary_fur_color) %>% 43 | summarise(count_of_squirrels = n()) %>% 44 | ggplot() + 45 | aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 46 | stat_smooth(se = FALSE) # fill out the labs() arg! 47 | ``` 48 | -------------------------------------------------------------------------------- /reading_calls/make_bar_plots_solution.md: -------------------------------------------------------------------------------- 1 | Making Bar Plots 2 | ================ 3 | w203: Statistics for Data Science 4 | 5 | ``` r 6 | library(tidyverse) 7 | library(ggplot2) 8 | # install.packages('patchwork') 9 | library(patchwork) 10 | 11 | theme_set(theme_minimal()) 12 | knitr::opts_chunk$set(dpi = 200) 13 | ``` 14 | 15 | ``` r 16 | squirrel_subset <- read.csv('squirrels_subset.csv') 17 | ``` 18 | 19 | ``` r 20 | squirrel_subset_by_color <- squirrel_subset %>% 21 | group_by(primary_fur_color) %>% 22 | summarise(count_by_color = n()) 23 | ``` 24 | 25 | ## `summarise()` ungrouping output (override with `.groups` argument) 26 | 27 | # Task 28 | 29 | Produce two identical plots that have the following characteristics: 30 | 31 | - On the x-axis the plots have the color of the squirrel fur 32 | - On the y-axis the plots have a count of the nubmer of squirrels that 33 | have that color fur 34 | 35 | However, make these plots in two ways: 36 | 37 | 1. In one plot, use `geom_bar()` (and the appropriate dataset) 38 | 2. In the other plot, use `geom_col()` (and the appropriate dataset) 39 | 40 | Note that the datsets will be different for each of the different 41 | geometries. 
42 | 43 | ``` r 44 | plot_col <- squirrel_subset_by_color %>% 45 | ggplot() + 46 | aes(x = primary_fur_color, y = count_by_color) + 47 | geom_col() 48 | 49 | plot_bar <- squirrel_subset %>% 50 | ggplot() + 51 | aes(x = primary_fur_color) + 52 | geom_bar() 53 | 54 | plot_col | plot_bar 55 | ``` 56 | 57 | ![](make_bar_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png) 58 | -------------------------------------------------------------------------------- /code/pick_a_theme.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Pick A Theme" 3 | author: 'w203: Statistics for Data Science' 4 | output: github_document 5 | --- 6 | 7 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 8 | library(tidyverse) 9 | library(ggplot2) 10 | # install.packages('patchwork') 11 | library(patchwork) 12 | 13 | knitr::opts_chunk$set(dpi = 200) 14 | ``` 15 | 16 | ```{r load and mutate data} 17 | squirrel_subset <- read.csv('squirrels_subset.csv') 18 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color)) 19 | 20 | squirrel_subset <- squirrel_subset %>% 21 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) 22 | ``` 23 | 24 | # Coding Task 25 | 26 | What's your style? A color maximalist? A Tufte minimalist? 27 | 28 | The only requirement here is that you pick with communication in mind. Try out a few themes to see which you think is to your liking -- or, just use no theme at all and rely on the base colors! 29 | 30 | ```{r} 31 | ?theme_bw 32 | ``` 33 | 34 | Remove the comment and try a few themes out. 
35 | 36 | ```{r coding task} 37 | squirrel_subset %>% 38 | group_by(date_f, primary_fur_color) %>% 39 | summarise(count_of_squirrels = n()) %>% 40 | ggplot() + 41 | aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 42 | stat_smooth(se = FALSE) + 43 | labs( 44 | title = 'There are a lot of grey squirrels', 45 | subtitle = 'But, people are collecting data in later days', 46 | x = 'Date of observation', 47 | y = 'Count of squirrels', 48 | color = 'Primary Fur Color') # + theme_minimal() 49 | ``` 50 | 51 | 52 | -------------------------------------------------------------------------------- /reading_calls/make_bar_plots.md: -------------------------------------------------------------------------------- 1 | # Making Bar Plots 2 | 3 | There are two ways to produce bar plots in `ggplot`. 4 | 5 | - With `geom_bar()` you allow `ggplot` to do the functional counting and mapping for you at the time that you draw this plot. 6 | - Sometimes this is the easier way to go about producing a bar plot; especially when the data that you're working with is relatively simple. 7 | - As well, when you're exploring data, it is quite nice to be able to view counts of factors, without an intermediate step to "roll-up" the observations. 8 | - With `geom_col()` **you** do the aggregating ahead of time and then tell the plot what height you want to map onto the y-axis. 9 | - When the data is complex, or if you have a particular way that you want to do the counting, this can be easier to produce what you'd like to see (because you can write the code to do the counting, rather than relying on the buried, built-in counting method that `ggplot` will use) 10 | 11 | For me, the determination about which to use really comes down to: how easy is it to count these observations? If the answer is **anything** but "very easy" then I use `geom_col()`; otherwise, I use `geom_bar()`. 
12 | 13 | # Coding task 14 | 15 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `make_bar_plots.Rmd`. 16 | - This code will ask you to make bar plots with different features. 17 | - Like the last code, we'll now be working with R Markdown files. 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /code/mutate_solution.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Mutating a New Variable' 3 | author: 'w203: Statistics for Data Science' 4 | date: "8/13/2020" 5 | output: github_document 6 | --- 7 | 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 9 | library(tidyverse) 10 | ``` 11 | 12 | ```{r load data, message=FALSE} 13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 15 | ``` 16 | 17 | # Mutate Task 18 | 19 | - Using the `agencies` data, create a series of variables that contain the log of the `count` of launches. 20 | 21 | ```{r} 22 | agencies_log <- agencies %>% 23 | mutate(count_log = log(count)) 24 | 25 | agencies_log 26 | ``` 27 | 28 | > But note that you don't *have* to assign this to a new object. 29 | 30 | - Then, show only the columns that are called either `agency` or `contains()` the string "count". 31 | 32 | ```{r} 33 | agencies_log %>% 34 | select(agency, contains('count')) 35 | ``` 36 | 37 | - Finally, `arrange()` these descending by `count`. 38 | 39 | ```{r} 40 | agencies_log %>% 41 | select(agency, contains('count')) %>% 42 | arrange(desc(count)) 43 | ``` 44 | 45 | > HA! It looks as though the data came in the door arranged by count. 
However, I would **never** suggest relying on this. If you want the data arranged by count, write the code to do so. The upstream data that comes into your analysis could change; potentially without you knowing. 46 | > 47 | > If you want your data to have some particular characteristic, you should write the code that makes it be so. -------------------------------------------------------------------------------- /code/filter_solution.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Filter Solution' 3 | author: 'w203: Statistics for Data Science' 4 | date: "8/13/2020" 5 | output: github_document 6 | --- 7 | 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 9 | library(tidyverse) 10 | ``` 11 | 12 | ```{r load data, results = 'hide', message=FALSE} 13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 15 | ``` 16 | 17 | # Task 18 | 19 | Use the magrittr pipe `%>%` and the `filter()` verb to do the following work: 20 | 21 | ## Launches in the 1980s 22 | 23 | Find only the launches that occurred in the 1980s 24 | 25 | ```{r} 26 | launches %>% 27 | filter(launch_year >= 1980) %>% 28 | filter(launch_year < 1990) 29 | ``` 30 | 31 | Or, equivalently 32 | 33 | ```{r} 34 | launches %>% 35 | filter(launch_year >= 1980, launch_year < 1990) 36 | ``` 37 | 38 | ## Launches by France 39 | 40 | Find only the launches that were conducted by France (`state_code == "F"`) 41 | 42 | ```{r} 43 | launches %>% 44 | filter(state_code == "F") 45 | ``` 46 | 47 | ## Launches by France in the 1980s 48 | 49 | Find only the launches by France that were conducted in the 1980s 50 | 51 | ```{r} 52 | launches %>% 53 | filter(launch_year >= 1980, launch_year < 1990, state_code == "F") 54 | ``` 55 | 56 | ## Launches by France 
or Russia in the 1980s 57 | 58 | Find launches in the 1980s by either France or the Soviet Union 59 | 60 | ```{r} 61 | launches %>% 62 | filter(launch_year >= 1980, launch_year < 1990, state_code %in% c("F", "SU")) 63 | ``` 64 | 65 | -------------------------------------------------------------------------------- /code/make_line_plots_solution.md: -------------------------------------------------------------------------------- 1 | Line Plots Solutions 2 | ================ 3 | w203: Statistics for Data Science 4 | 5 | ``` r 6 | library(tidyverse) 7 | library(ggplot2) 8 | theme_set(theme_minimal()) 9 | knitr::opts_chunk$set(dpi = 200) 10 | ``` 11 | 12 | ``` r 13 | squirrel_subset <- read_csv('./squirrels_subset.csv') 14 | ``` 15 | 16 | ## Parsed with column specification: 17 | ## cols( 18 | ## long = col_double(), 19 | ## lat = col_double(), 20 | ## hectare = col_character(), 21 | ## date = col_double(), 22 | ## age = col_character(), 23 | ## primary_fur_color = col_character() 24 | ## ) 25 | 26 | ``` r 27 | squirrel_subset <- squirrel_subset %>% 28 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) 29 | ``` 30 | 31 | # Squirrels by date 32 | 33 | - Make a line plot that shows the squirrels observed by date 34 | 35 | 36 | 37 | ``` r 38 | squirrel_subset %>% 39 | group_by(date_f) %>% 40 | summarise(count_of_squirrels = n()) %>% 41 | ggplot() + 42 | aes(x = date_f, y = count_of_squirrels) + 43 | geom_line() 44 | ``` 45 | 46 | ## `summarise()` ungrouping output (override with `.groups` argument) 47 | 48 | ![](make_line_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png) 49 | 50 | # Challenge: Squirrels by color by date 51 | 52 | ``` r 53 | squirrel_subset %>% 54 | group_by(date_f, primary_fur_color) %>% 55 | summarise(count_of_squirrels = n()) %>% 56 | ggplot() + 57 | aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 58 | geom_line() 59 | ``` 60 | 61 | ## `summarise()` regrouping output by 'date_f' (override with `.groups` argument) 62 | 
63 | ![](make_line_plots_solution_files/figure-gfm/unnamed-chunk-2-1.png) 64 | -------------------------------------------------------------------------------- /reading_calls/make_line_plots_solution.md: -------------------------------------------------------------------------------- 1 | Line Plots Solutions 2 | ================ 3 | w203: Statistics for Data Science 4 | 5 | ``` r 6 | library(tidyverse) 7 | library(ggplot2) 8 | theme_set(theme_minimal()) 9 | knitr::opts_chunk$set(dpi = 200) 10 | ``` 11 | 12 | ``` r 13 | squirrel_subset <- read_csv('./squirrels_subset.csv') 14 | ``` 15 | 16 | ## Parsed with column specification: 17 | ## cols( 18 | ## long = col_double(), 19 | ## lat = col_double(), 20 | ## hectare = col_character(), 21 | ## date = col_double(), 22 | ## age = col_character(), 23 | ## primary_fur_color = col_character() 24 | ## ) 25 | 26 | ``` r 27 | squirrel_subset <- squirrel_subset %>% 28 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) 29 | ``` 30 | 31 | # Squirrels by date 32 | 33 | - Make a line plot that shows the squirrels observed by date 34 | 35 | 36 | 37 | ``` r 38 | squirrel_subset %>% 39 | group_by(date_f) %>% 40 | summarise(count_of_squirrels = n()) %>% 41 | ggplot() + 42 | aes(x = date_f, y = count_of_squirrels) + 43 | geom_line() 44 | ``` 45 | 46 | ## `summarise()` ungrouping output (override with `.groups` argument) 47 | 48 | ![](make_line_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png) 49 | 50 | # Challenge: Squirrels by color by date 51 | 52 | ``` r 53 | squirrel_subset %>% 54 | group_by(date_f, primary_fur_color) %>% 55 | summarise(count_of_squirrels = n()) %>% 56 | ggplot() + 57 | aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 58 | geom_line() 59 | ``` 60 | 61 | ## `summarise()` regrouping output by 'date_f' (override with `.groups` argument) 62 | 63 | ![](make_line_plots_solution_files/figure-gfm/unnamed-chunk-2-1.png) 64 | 
-------------------------------------------------------------------------------- /code/make_scatter_plots.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(ggplot2) 3 | 4 | 5 | ## Load data: this should just work 6 | squirrel_subset <- read_csv('./r_bridge/code/squirrels_subset.csv') 7 | 8 | ## Create a basic plot of the observations on the lat and long axes 9 | 10 | ggplot(data = squirrel_subset) + 11 | aes(x = lat, y = long) + 12 | geom_point() 13 | 14 | ## Does this look like Central Park? 15 | 16 | 17 | ## Task 1: Color by age 18 | 19 | ## - Now, write code that will modify the plot so that it is colored by age 20 | 21 | ## Task 2: Make Every Point Blue 22 | 23 | ## - Now, write code that will make every point blue, not colored by age. 24 | ## - Notice that now this choice isn't an attribute of the data. Where 25 | ## does this mean that the `color` argument should go? 26 | 27 | ## Task 3: Color by the Fur Color 28 | 29 | ## - Now, write code that will color the points by the variable `primary_fur_color`. 30 | ## - Notice that this now *is* an attribute of the data. So, where should the 31 | ## `color` argument go? 32 | 33 | ## Task 4: Put onto non-Euclidean space 34 | 35 | ## - If you think carefully about this, we're mapping the geographic coordinate 36 | ## system onto the Euclidean coordinate system. This isn't a _huge_ deal in this 37 | ## case because we're only covering Central Park. But, what's right is right... 38 | 39 | ## - ggplot has the ability to map onto the geographic coordinate system using the 40 | ## additional function `coord_quickmap()` (which is an approximation) or `coord_map()` 41 | ## which is not an approximation 42 | 43 | ## - Given what you understand about the layering system that ggplot uses, can 44 | ## you add on this new layer that is the `coord_quickmap()` coordinate system? 45 | ## - If so, how much does it change the plot? 
46 | -------------------------------------------------------------------------------- /code/make_it_sparkle_solution.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "make_it_sparkle.Rmd" 3 | author: 'w203: Statistics for Data Science' 4 | output: github_document 5 | --- 6 | 7 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 8 | library(tidyverse) 9 | library(ggplot2) 10 | # install.packages('patchwork') 11 | library(patchwork) 12 | 13 | theme_set(theme_minimal()) 14 | knitr::opts_chunk$set(dpi = 200) 15 | ``` 16 | 17 | ```{r load and mutate data} 18 | squirrel_subset <- read.csv('squirrels_subset.csv') 19 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color)) 20 | 21 | squirrel_subset <- squirrel_subset %>% 22 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) 23 | ``` 24 | 25 | # Coding Task 26 | 27 | Suppose that you want to communicate to someone who has less context with the data. Set the `labs()` argument to include 28 | 29 | - A title 30 | - A subtitle 31 | - Descriptive Labels for the axes 32 | - A label for what the colors mean. 33 | 34 | Each of the labels should be in **plain spoken language** and should be in sentence case: 35 | 36 | - The first letter of the first work should be capitalized 37 | - The rest of the letters should *not* be capitalized 38 | - Everything should be a work that you can speak aloud -- i.e. **not** a variable name. 
39 | 40 | ```{r coding task} 41 | squirrel_subset %>% 42 | group_by(date_f, primary_fur_color) %>% 43 | summarise(count_of_squirrels = n()) %>% 44 | ggplot() + 45 | aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 46 | stat_smooth(se = FALSE) + 47 | labs( 48 | title = 'There are a lot of grey squirrels', 49 | subtitle = 'But, people are collecting data in later days', 50 | x = 'Date of observation', 51 | y = 'Count of squirrels', 52 | color = 'Primary Fur Color' 53 | ) 54 | ``` 55 | -------------------------------------------------------------------------------- /code/make_it_sparkle_solution.md: -------------------------------------------------------------------------------- 1 | make\_it\_sparkle.Rmd 2 | ================ 3 | w203: Statistics for Data Science 4 | 5 | ``` r 6 | library(tidyverse) 7 | library(ggplot2) 8 | # install.packages('patchwork') 9 | library(patchwork) 10 | 11 | theme_set(theme_minimal()) 12 | knitr::opts_chunk$set(dpi = 200) 13 | ``` 14 | 15 | ``` r 16 | squirrel_subset <- read.csv('squirrels_subset.csv') 17 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color)) 18 | 19 | squirrel_subset <- squirrel_subset %>% 20 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) 21 | ``` 22 | 23 | # Coding Task 24 | 25 | Suppose that you want to communicate to someone who has less context 26 | with the data. Set the `labs()` argument to include 27 | 28 | - A title 29 | - A subtitle 30 | - Descriptive Labels for the axes 31 | - A label for what the colors mean. 32 | 33 | Each of the labels should be in **plain spoken language** and should be 34 | in sentence case: 35 | 36 | - The first letter of the first work should be capitalized 37 | - The rest of the letters should *not* be capitalized 38 | - Everything should be a work that you can speak aloud – i.e. **not** 39 | a variable name. 
40 | 41 | 42 | 43 | ``` r 44 | squirrel_subset %>% 45 | group_by(date_f, primary_fur_color) %>% 46 | summarise(count_of_squirrels = n()) %>% 47 | ggplot() + 48 | aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 49 | stat_smooth(se = FALSE) + 50 | labs( 51 | title = 'There are a lot of grey squirrels', 52 | subtitle = 'But, people are collecting data in later days', 53 | x = 'Date of observation', 54 | y = 'Count of squirrels', 55 | color = 'Primary Fur Color' 56 | ) 57 | ``` 58 | 59 | ## `summarise()` regrouping output by 'date_f' (override with `.groups` argument) 60 | 61 | ## `geom_smooth()` using method = 'loess' and formula 'y ~ x' 62 | 63 | ![](make_it_sparkle_solution_files/figure-gfm/coding%20task-1.png) 64 | -------------------------------------------------------------------------------- /reading_calls/make_it_sparkle_solution.md: -------------------------------------------------------------------------------- 1 | make\_it\_sparkle.Rmd 2 | ================ 3 | w203: Statistics for Data Science 4 | 5 | ``` r 6 | library(tidyverse) 7 | library(ggplot2) 8 | # install.packages('patchwork') 9 | library(patchwork) 10 | 11 | theme_set(theme_minimal()) 12 | knitr::opts_chunk$set(dpi = 200) 13 | ``` 14 | 15 | ``` r 16 | squirrel_subset <- read.csv('squirrels_subset.csv') 17 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color)) 18 | 19 | squirrel_subset <- squirrel_subset %>% 20 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) 21 | ``` 22 | 23 | # Coding Task 24 | 25 | Suppose that you want to communicate to someone who has less context 26 | with the data. Set the `labs()` argument to include 27 | 28 | - A title 29 | - A subtitle 30 | - Descriptive Labels for the axes 31 | - A label for what the colors mean. 
32 | 33 | Each of the labels should be in **plain spoken language** and should be 34 | in sentence case: 35 | 36 | - The first letter of the first work should be capitalized 37 | - The rest of the letters should *not* be capitalized 38 | - Everything should be a work that you can speak aloud – i.e. **not** 39 | a variable name. 40 | 41 | 42 | 43 | ``` r 44 | squirrel_subset %>% 45 | group_by(date_f, primary_fur_color) %>% 46 | summarise(count_of_squirrels = n()) %>% 47 | ggplot() + 48 | aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 49 | stat_smooth(se = FALSE) + 50 | labs( 51 | title = 'There are a lot of grey squirrels', 52 | subtitle = 'But, people are collecting data in later days', 53 | x = 'Date of observation', 54 | y = 'Count of squirrels', 55 | color = 'Primary Fur Color' 56 | ) 57 | ``` 58 | 59 | ## `summarise()` regrouping output by 'date_f' (override with `.groups` argument) 60 | 61 | ## `geom_smooth()` using method = 'loess' and formula 'y ~ x' 62 | 63 | ![](make_it_sparkle_solution_files/figure-gfm/coding%20task-1.png) 64 | -------------------------------------------------------------------------------- /code/make_a_data_set_solution.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Make A Data Set" 3 | author: 'w203: Statistics for Data Science' 4 | date: "8/14/2020" 5 | output: github_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | library(tidyverse) 10 | ``` 11 | 12 | # Paste Farm Code 13 | 14 | Start by pasting the code that you wrote to create the farm animals below. 
15 | 16 | ```{r} 17 | ID <- 1:300 18 | animal <- c(rep('cow', 10), rep('sheep', 50), rep('goat', 40), rep('chicken', 70), rep('rabbit', 130)) 19 | petting_zoo <- rep('no', 300) 20 | petting_zoo[animal == 'rabbit'] <- 'yes' 21 | 22 | weight <- rep(NA, 300) 23 | weight[animal == 'cow'] <- rnorm(n = 10, mean = 1000, sd = 100) 24 | weight[animal == 'sheep'] <- rnorm(n = 50, mean = 100, sd = 2) 25 | weight[animal == 'goat'] <- rnorm(n = 40, mean = 40, sd = 2) 26 | weight[animal == 'chicken'] <- .2 27 | weight[animal == 'rabbit'] <- NA 28 | 29 | feed <- rep(NA, 300) 30 | 31 | feed[animal == 'cow'] <- weight[animal == 'cow'] * .03 32 | feed[animal == 'sheep'] <- weight[animal == 'sheep'] *.02 33 | feed[animal == 'goat'] <- weight[animal == 'goat'] * .07 34 | feed[animal == 'chicken'] <- weight[animal == 'chicken'] * .02 35 | feed[animal == 'rabbit'] <- NA 36 | ``` 37 | 38 | ## Combine into a dataset 39 | 40 | Now, combine all of these variables into a single dataset, called `tilden`. 41 | 42 | ```{r} 43 | tilden <- data.frame( 44 | ID, animal, petting_zoo, weight, feed 45 | ) 46 | ``` 47 | 48 | 49 | ## Typecast 50 | 51 | Now do a little bit of mutating and type converting. 52 | 53 | - Since we know that each of the types of animals has a specific amount that they eat, let's label them "hungry boi" if they eat more than average; and "slender boi" if they eat less than average. (Sorry... I know this coding might be getting tedious!). 54 | - Make each of these relative to the mean within that animal type. 
55 | 56 | ```{r} 57 | tilden %>% 58 | group_by(animal) %>% 59 | mutate(hungry_slender = ifelse(weight > mean(weight), 'hungry boi', 'slender boi')) 60 | ``` -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Course Syllabus 2 | 3 | As a bridge course, this course is distinct from the main course offerings of the School of Information -- there are no synchronous discussion sections, and there are no graded assessments. Instead, this is a self-paced course of study designed to be completed in roughly a week before beginning a credit-bearing course. 4 | 5 | - [Syllabus](./syllabus.md) 6 | - [Schedule](./schedule.md) 7 | 8 | The class works closely from [R for Data Science](https://r4ds.had.co.nz "r4ds") to prepare students for the theoretical and applied work. We recommend the course for students preparing to take *Statistics for Data Scientists* (w203); but the course might also be a useful refresher for students who are taking *Experiments and Causal Inference* (w241) and *Statistical Methods for Discrete Response, Time Series, and Panel Data* (w271). 9 | 10 | In contrast to many methods of learning new languages, this course focuses on working with high-level parts of the language -- namely plotting (from `ggplot2`) and data manipulation (from `dplyr`) before addressing lower-level parts of the language. Indeed, this course is at an even higher-level part of the language API than the *R for Data Science* textbook. 11 | 12 | I (Alex) made this choice deliberately, based on a model of spoken language-learning: When learning a new language, we try to build concepts and vocabulary, even if this means that we conjugate verbs incorrectly or use inefficient methods of expressing ourselves. Often times, coding-language-learning tries to approach their language as though there is some axiomatic truth from which the language is derived. 
These "truths" are enshrined in style guides, deployed code, and bravado; but they hide a more important truth -- just start communicating and see what you can get done. 13 | 14 | You can probably navigate a busy market and appreciate the culture in a foreign land where you don't speak the language. Similarly, you can probably write clumsy code that expresses your intent and learn from your data. Just start writing and learning; we'll speak fluently soon enough. 15 | -------------------------------------------------------------------------------- /code/base_operations.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Heading to the Little Farm" 3 | author: 'w203: Statistics for Data Science' 4 | output: github_document 5 | --- 6 | 7 | # The Little Farm 8 | 9 | Just around the corner from campus in Berkeley is the Tilden "Little Farm" -- it's a sort of hobby farm that has cows, goats, chickens, sheep, and a small menagerie of other cute animals. I think the idea is that kids can see animals -- but I've also got to admit that it is kind of weird to have a tiny farm in the middle of the neighborhood. 10 | 11 | ## Create Farm Data 12 | 13 | Let's use all the methods that you've just been working on to produce a dataset that represents the farm. Using methods that you're familiar with, create the following dataset. 14 | 15 | - A column that ranges from 1 - 300 called "ID" that is an ID for the animal. 
16 | - A column that has the type of animal that is being recorded: 17 | - There are 10 cows 18 | - There are 50 sheep 19 | - There are 40 goats 20 | - There are 70 chickens 21 | - There are 130 rabbits 22 | - A column that describes whether that animal belongs in the petting zoo 23 | - A column that describes the weight of the animals: 24 | - Cow weights are normally distributed, with a mean of 1000kg and a sd of 100kg 25 | - Sheep weights are normally distributed, with a mean of 100kg and a sd of 2 kg 26 | - Goat weights are normally distributed, with a mean of 40kg and a sd of 2kg 27 | - Chickens all weigh .2 kg 28 | - Rabbits don't stand still long enough to be weighed, so there is no data 29 | - A column that describes how much feed that animal needs -- this is animal specific, and depends on how much the animal weighs. 30 | - A cow needs 3% of its body weight each day to stay alive 31 | - A sheep needs 2% of its body weight each day to stay alive 32 | - A goat needs 7% of its body weight each day to stay alive 33 | - A chicken needs 2% of its body weight each day to stay alive 34 | 35 | You are free to use `dplyr` actions to modify variables, if you like. 36 | 37 | ```{r} 38 | library(tidyverse) 39 | ``` 40 | 41 | -------------------------------------------------------------------------------- /reading_calls/additional_features.md: -------------------------------------------------------------------------------- 1 | # aes(x= , y= , fill =) 2 | 3 | The x and y coordinate mappings are the most straightforward set of mappings that we typically interact with in the `aes()` function call. But, we can map more data into our plot by passing additional aesthetic arguments. 4 | 5 | The `color` and `fill` aesthetic mappings change the colors of the geometries that are plotted, depending on values that they are mapped to. 
When the data passed to these arguments are continuous, the colors will be placed onto a gradient scale; when the data passed to these arguments are categorical, a different color will be used for each level within the variable. Apart from `x` and `y`, `color` and `fill` are the two aesthetics that I use most frequently; probably because they're both used to communicate grouping structure within your plot. Two pieces of trivia, that actually end up mattering: 6 | 7 | - `color` defines the external “outline” of geometries. In some other plotting languages this might be called the "stroke" 8 | - `fill` defines the internal colors of the geometries. 9 | 10 | Something like a line (generated using `geom_line()`) doesn't have any internal space, and so line color is controlled using `color`. However, bars (`geom_bar()`) **do** have internal space so their colors are controlled by `fill`. 11 | 12 | The `shape` of a geometry can also be set by a variable in the data. Shape most often affects geom_point, but also a few other geometries that you might find to use. 13 | 14 | The `size` of a geometry determines the size (in mm) of the geometry. For points, this is the size of the point, for lines, this is the width of the line. 15 | 16 | # Coding task 17 | 18 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `additional_plot_features.Rmd`. 19 | - This code will ask you to make scatter plots with different features. 20 | - Note that we're switching to working with R Markdown file now, which are enhancements over the .R files that we've been using to this point. We'll explain that in the file itself. 
21 | 22 | -------------------------------------------------------------------------------- /code/select_solution.md: -------------------------------------------------------------------------------- 1 | Select 2 | ================ 3 | w203: Statistics for Data Science 4 | 8/13/2020 5 | 6 | ``` r 7 | library(tidyverse) 8 | ``` 9 | 10 | ``` r 11 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 12 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 13 | ``` 14 | 15 | # Task 16 | 17 | - From the launches data, `select()` only the columns that are related 18 | to characteristics about the country 19 | 20 | 21 | 22 | ``` r 23 | launches %>% 24 | select(agency, state_code, agency_type) 25 | ``` 26 | 27 | ## # A tibble: 5,726 x 3 28 | ## agency state_code agency_type 29 | ## 30 | ## 1 US US state 31 | ## 2 US US state 32 | ## 3 US US state 33 | ## 4 US US state 34 | ## 5 US US state 35 | ## 6 US US state 36 | ## 7 US US state 37 | ## 8 US US state 38 | ## 9 US US state 39 | ## 10 US US state 40 | ## # … with 5,716 more rows 41 | 42 | - From the launches data, `select()` only the columns that are related 43 | to time 44 | 45 | 46 | 47 | ``` r 48 | launches %>% 49 | select(tag, JD, launch_date, launch_year) 50 | ``` 51 | 52 | ## # A tibble: 5,726 x 4 53 | ## tag JD launch_date launch_year 54 | ## 55 | ## 1 1967-065 2439671. 1967-06-29 1967 56 | ## 2 1967-080 2439726. 1967-08-23 1967 57 | ## 3 1967-096 2439775. 1967-10-11 1967 58 | ## 4 1968-042 2440000. 1968-05-23 1968 59 | ## 5 1968-092 2440153. 1968-10-23 1968 60 | ## 6 1969-062 2440426. 1969-07-23 1969 61 | ## 7 1970-012 2440629. 1970-02-11 1970 62 | ## 8 1970-070 2440833. 1970-09-03 1970 63 | ## 9 1971-012 2441000. 1971-02-17 1971 64 | ## 10 1971-054 2441111. 
1971-06-08 1971 65 | ## # … with 5,716 more rows 66 | -------------------------------------------------------------------------------- /reading_calls/select_solution.md: -------------------------------------------------------------------------------- 1 | Select 2 | ================ 3 | w203: Statistics for Data Science 4 | 8/13/2020 5 | 6 | ``` r 7 | library(tidyverse) 8 | ``` 9 | 10 | ``` r 11 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 12 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 13 | ``` 14 | 15 | # Task 16 | 17 | - From the launches data, `select()` only the columns that are related 18 | to characteristics about the country 19 | 20 | 21 | 22 | ``` r 23 | launches %>% 24 | select(agency, state_code, agency_type) 25 | ``` 26 | 27 | ## # A tibble: 5,726 x 3 28 | ## agency state_code agency_type 29 | ## 30 | ## 1 US US state 31 | ## 2 US US state 32 | ## 3 US US state 33 | ## 4 US US state 34 | ## 5 US US state 35 | ## 6 US US state 36 | ## 7 US US state 37 | ## 8 US US state 38 | ## 9 US US state 39 | ## 10 US US state 40 | ## # … with 5,716 more rows 41 | 42 | - From the launches data, `select()` only the columns that are related 43 | to time 44 | 45 | 46 | 47 | ``` r 48 | launches %>% 49 | select(tag, JD, launch_date, launch_year) 50 | ``` 51 | 52 | ## # A tibble: 5,726 x 4 53 | ## tag JD launch_date launch_year 54 | ## 55 | ## 1 1967-065 2439671. 1967-06-29 1967 56 | ## 2 1967-080 2439726. 1967-08-23 1967 57 | ## 3 1967-096 2439775. 1967-10-11 1967 58 | ## 4 1968-042 2440000. 1968-05-23 1968 59 | ## 5 1968-092 2440153. 1968-10-23 1968 60 | ## 6 1969-062 2440426. 1969-07-23 1969 61 | ## 7 1970-012 2440629. 1970-02-11 1970 62 | ## 8 1970-070 2440833. 1970-09-03 1970 63 | ## 9 1971-012 2441000. 1971-02-17 1971 64 | ## 10 1971-054 2441111. 
1971-06-08 1971 65 | ## # … with 5,716 more rows 66 | -------------------------------------------------------------------------------- /code/how_to_summarise.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "How to Smooth" 3 | author: 'w203: Statistics for Data Science' 4 | output: github_document 5 | --- 6 | 7 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 8 | library(tidyverse) 9 | library(ggplot2) 10 | # install.packages('patchwork') 11 | library(patchwork) 12 | 13 | theme_set(theme_minimal()) 14 | knitr::opts_chunk$set(dpi = 200) 15 | ``` 16 | 17 | ```{r load and mutate data} 18 | squirrel_subset <- read.csv('squirrels_subset.csv') 19 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color)) 20 | 21 | squirrel_subset <- squirrel_subset %>% 22 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) 23 | ``` 24 | 25 | # Coding Task 26 | 27 | If you plot the grouped plot that we've shown a few times now, there's a lot of movement in the observations on a daily basis. This might invite questions that focus on a particularly productive day on the squirrel census -- "I don't know why there were more this day!" -- which isn't really the point of the plot. Instead, the point of the plot is that there are many more Gray squirrels but that the number seems to be decreasing through the census time. 28 | 29 | ```{r} 30 | squirrel_subset %>% 31 | group_by(date_f, primary_fur_color) %>% 32 | summarise(count_of_colors = n()) %>% 33 | ggplot() + 34 | aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 35 | geom_line() 36 | ``` 37 | 38 | ## Your Task 39 | 40 | Change the plot so that instead the plot uses the `stat_smooth()` function -- to do so, you'll have to change the `geom_line()` call to something else.
41 | 42 | - First run the smoother as is 43 | - Then, suppress the reporting of the standard errors (you will likely have to look into the help documentation to figure out what the particular argument is that controls those error bars) 44 | - Then, re-plot again but change the variable that controls the "wiggliness" of the lines. Is there a level of this variable that you think best communicates the point you want to make with this data? 45 | 46 | ```{r} 47 | squirrel_subset %>% 48 | group_by(date_f, primary_fur_color) %>% 49 | summarise(count_of_colors = n()) %>% 50 | ggplot() + 51 | aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 52 | geom_line() 53 | ``` 54 | -------------------------------------------------------------------------------- /code/working_with_rstudio.R: -------------------------------------------------------------------------------- 1 | ## Welcome! You've made it over to the IDE. 2 | 3 | ## Any line that starts with one or more `#` will be commented out. 4 | ## This means that if you run that line, nothing will actually occur in the 5 | ## interpretor. 6 | 7 | ## To run this code below you can do the following: 8 | ## - If you are on a Mac, on the line that you want to run you can press `command+return` 9 | ## which means to hold command and then press return. 10 | ## - If you are on a Windows or Linux machine, then you can run that line by pressing `alt+return` 11 | ## which means to hold the alt key and press return. 12 | 13 | ## When you run the first line one of two things might happen: 14 | ## 1. You might get an error because you haven't installed that package. If this happens, 15 | ## look near the top of your Rstudio screen -- there should be a helper that asks if 16 | ## you want to install this library. You do and can click "install". 17 | ## 2. If you've already installed that library, then it should load the package, which 18 | ## you will see in the console below. 
19 | 20 | 21 | library(ggplot2) 22 | 23 | ## Now, if you want to create some data, you can either 24 | ## - Run the first line where you are creating the object `d` that is a data.frame; or, 25 | ## - Highlight the region that you want to run and then run that region (using command+return 26 | ## or alt+return). 27 | 28 | 29 | d <- data.frame( 30 | id = 1:1000, 31 | x = rnorm(1000, mean = 0, sd = 1), 32 | y = rnorm(1000, mean = 10, sd = 2), 33 | color = sample(c('red', 'blue'), size = 1000, replace = TRUE) 34 | ) 35 | 36 | ## To produce the plot below, run these lines. Do you need to run all the lines? 37 | ## Or, can you run just the first? Why do you think the IDE chooses to behave this way? 38 | 39 | ggplot(data = d, aes(x=x, y=y)) + 40 | geom_point() 41 | 42 | ## Finally, you can run code that doesn't have any visible side effects. 43 | ## If you run the line below, what do you see in your console? Just that the line has run? 44 | ## But, now look into the `Environment` tab that is visible to you -- is there a record 45 | ## of this `mod` that you just created? 46 | 47 | mod <- lm(y ~ x, data = d) 48 | 49 | ## The model that you created is stored in the working memory and can be called by 50 | ## naming the object. 51 | 52 | mod 53 | 54 | ## If you want to use the summary function on the model, you can and you will see 55 | ## a different return printed to the console. 
56 | 57 | summary(mod) 58 | -------------------------------------------------------------------------------- /code/how_to_summarise_solution.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "How to Smooth" 3 | author: 'w203: Statistics for Data Science' 4 | output: github_document 5 | --- 6 | 7 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 8 | library(tidyverse) 9 | library(ggplot2) 10 | # install.packages('patchwork') 11 | library(patchwork) 12 | 13 | theme_set(theme_minimal()) 14 | knitr::opts_chunk$set(dpi = 200) 15 | ``` 16 | 17 | ```{r load and mutate data} 18 | squirrel_subset <- read.csv('squirrels_subset.csv') 19 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color)) 20 | 21 | squirrel_subset <- squirrel_subset %>% 22 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) 23 | ``` 24 | 25 | # Coding Task 26 | 27 | If you plot the grouped plot that we've shown a few times now, there's a lot of movement in the observations on a daily basis. This might invite questions that focus on a particularly productive day on the squirrel census -- "I don't know why there were more this day!" -- which isn't really the point of the plot. Instead, the point of the plot is that there are many more Gray squirrels but that the number seems to be decreasing through the census time. 28 | 29 | ```{r} 30 | squirrel_subset %>% 31 | group_by(date_f, primary_fur_color) %>% 32 | summarise(count_of_colors = n()) %>% 33 | ggplot() + 34 | aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 35 | geom_line() 36 | ``` 37 | 38 | ## Your Task 39 | 40 | Change the plot so that instead the plot uses the `stat_smooth()` function -- to do so, you'll have to change the `geom_line()` call to something else.
41 | 42 | - First run the smoother as is 43 | - Then, suppress the reporting of the standard errors (you will likely have to look into the help documentation to figure out what the particular argument is that controls those error bars) 44 | - Then, re-plot again but change the variable that controls the "wiggliness" of the lines. Is there a level of this variable that you think best communicates the point you want to make with this data? 45 | 46 | ```{r} 47 | squirrel_subset %>% 48 | group_by(date_f, primary_fur_color) %>% 49 | summarise(count_of_colors = n()) %>% 50 | ggplot() + 51 | aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 52 | stat_smooth(span = .8, se = FALSE) 53 | ``` 54 | > I think that this `span = 0.8` is my preferred span. You can see that it is *just* barely staying smooth -- there are a few points where this plot seems to try to pull away from the general line. Setting the span higher oversimplifies the trend, to my eye; setting it smaller recovers most of the "noise" that we wanted to smooth out of the data. -------------------------------------------------------------------------------- /code/working_with_rstudio_solution.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Interacting with the IDE Solutions" 3 | author: 'w203: Statistics for Data Science' 4 | date: "8/13/2020" 5 | output: github_document 6 | --- 7 | ```{r} 8 | ## Welcome! You've made it over to the IDE. 9 | 10 | ## Any line that starts with one or more `#` will be commented out. 11 | ## This means that if you run that line, nothing will actually occur in the 12 | ## interpreter. 13 | 14 | ## To run this code below you can do the following: 15 | ## - If you are on a Mac, on the line that you want to run you can press `command+return` 16 | ## which means to hold command and then press return.
17 | ## - If you are on a Windows or Linux machine, then you can run that line by pressing `alt+return` 18 | ## which means to hold the alt key and press return. 19 | 20 | ## When you run the first line one of two things might happen: 21 | ## 1. You might get an error because you haven't installed that package. If this happens, 22 | ## look near the top of your Rstudio screen -- there should be a helper that asks if 23 | ## you want to install this library. You do and can click "install". 24 | ## 2. If you've already installed that library, then it should load the package, which 25 | ## you will see in the console below. 26 | 27 | 28 | library(ggplot2) 29 | 30 | ## Now, if you want to create some data, you can either 31 | ## - Run the first line where you are creating the object `d` that is a data.frame; or, 32 | ## - Highlight the region that you want to run and then run that region (using command+return 33 | ## or alt+return). 34 | 35 | 36 | d <- data.frame( 37 | id = 1:1000, 38 | x = rnorm(1000, mean = 0, sd = 1), 39 | y = rnorm(1000, mean = 10, sd = 2), 40 | color = sample(c('red', 'blue'), size = 1000, replace = TRUE) 41 | ) 42 | 43 | ## To produce the plot below, run these lines. Do you need to run all the lines? 44 | ## Or, can you run just the first? Why do you think the IDE chooses to behave this way? 45 | 46 | ggplot(data = d, aes(x=x, y=y)) + 47 | geom_point() 48 | 49 | ## Finally, you can run code that doesn't have any visible side effects. 50 | ## If you run the line below, what do you see in your console? Just that the line has run? 51 | ## But, now look into the `Environment` tab that is visible to you -- is there a record 52 | ## of this `mod` that you just created? 53 | 54 | mod <- lm(y ~ x, data = d) 55 | 56 | ## The model that you created is stored in the working memory and can be called by 57 | ## naming the object. 
58 | 59 | mod 60 | 61 | ## If you want to use the summary function on the model, you can and you will see 62 | ## a different return printed to the console. 63 | 64 | summary(mod) 65 | ``` -------------------------------------------------------------------------------- /code/make_scatter_plots_solution.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Make Scatter Plots Solution' 3 | output: github_document 4 | --- 5 | 6 | ```{r setup, results = 'hide', warning=FALSE, message=FALSE} 7 | library(tidyverse) 8 | library(ggplot2) 9 | theme_set(theme_minimal()) 10 | knitr::opts_chunk$set(dpi = 200) 11 | ``` 12 | 13 | This output is best viewed over in github because we cannot render images into 14 | the ISVC. :tada: 15 | 16 | # Load data: this should just work 17 | ```{r load data} 18 | squirrel_subset <- read_csv('./squirrels_subset.csv') 19 | ``` 20 | 21 | ## Create a basic plot of the observations on the lat and long axes 22 | 23 | ```{r basic plot} 24 | ggplot(data = squirrel_subset) + 25 | aes(x = long, y = lat) + 26 | geom_point() 27 | ``` 28 | 29 | > This looks like Central Park to me! 30 | 31 | # Task 1: Color by age 32 | 33 | Write code that will modify the plot so that it is colored by age. 34 | 35 | ```{r age plot} 36 | ggplot(data = squirrel_subset) + 37 | aes(x = long, y = lat, color = age) + 38 | geom_point() 39 | ``` 40 | 41 | # Task 2: Make Every Point Blue 42 | 43 | - Now, write code that will make every point blue, not colored by age. 44 | - Notice that now this choice isn't an attribute of the data. 45 | - Where does this mean that the `color` argument should go? 46 | 47 | ```{r feeling blue} 48 | ggplot(data = squirrel_subset) + 49 | aes(x = long, y = lat) + 50 | geom_point(color = 'blue') 51 | ``` 52 | 53 | # Task 3: Color by the Fur Color 54 | 55 | - Now, write code that will color the points by the variable `primary_fur_color`. 56 | - Notice that this now *is* an attribute of the data.
So, where should the `color` argument go? 57 | 58 | ```{r fur color plot} 59 | ggplot(data = squirrel_subset) + 60 | aes(x = long, y = lat, color = primary_fur_color) + 61 | geom_point() 62 | ``` 63 | 64 | # Task 4: Put onto non-Euclidean space 65 | 66 | - If you think carefully about this, we're mapping the geographic coordinate 67 | system onto the Euclidean coordinate system. This isn't a _huge_ deal in this 68 | case because we're only covering Central Park. But, what's right is right... 69 | 70 | - ggplot has the ability to map onto the geographic coordinate system using the 71 | additional function `coord_quickmap()` (which is an approximation) or `coord_map()` 72 | which is not an approximation. 73 | 74 | - Given what you understand about the layering system that ggplot uses, can 75 | you add on this new layer that is the `coord_quickmap()` coordinate system? 76 | - If so, how much does it change the plot? 77 | 78 | ```{r non euclidian space} 79 | ggplot(data = squirrel_subset) + 80 | aes(x = long, y = lat, color = primary_fur_color) + 81 | coord_quickmap() + 82 | geom_point() 83 | ``` -------------------------------------------------------------------------------- /reading_calls/summarize.md: -------------------------------------------------------------------------------- 1 | # How to Summarise 2 | 3 | Up to this point, any time that we've made a mapping of data, I've done the mapping for you. Now it is time to take the gloves off and get to it yourself. 4 | 5 | When you summarise data you are making the active decision: 6 | 7 | > I would like to show less data -- I would like to produce some summary of the data that is a useful simplification 8 | 9 | As we move into w203, you'll see that these summaries have a formal definition as a _statistic_ where we're making a mapping of a random variable into a lower-dimensional position representation on the real number space; but that formalism isn't necessary yet.
10 | 11 | A summary of data could come in one of many forms -- in fact, maybe without knowing it, we've already been summarising data when we've been asking 12 | 13 | > "How many squirrels of each color did we observe on these days?" 14 | 15 | The answer to that question takes the **whole** data series that we have, and produces a smaller, shorter representation that we're then reasoning about and plotting. 16 | 17 | Other forms of summary could be averages, medians, variances, or **any** other statistic. The usefulness of a summary can only be assessed in the context where it is being used, so without some other criteria it isn't really possible to _prefer_ one summary over another. 18 | 19 | # Summarising within dplyr 20 | 21 | To produce a summary of a variable within dplyr you use the `summarise` function (alternatively spelled `summarize` -- the package author is from New Zealand and so chooses to spell in his native spelling. There is actually a **ton** of interesting sociology and ethnography about why the de facto language for coding is some variant of the English language. Personally, I'm a total mixing pot of spelling for this -- first, I don't really care about spelling; second, my mum is Canadian (and so spells one way) but I grew up in the USA (so spell the other way)). 22 | 23 | At its simplest, this might look like the following: 24 | 25 | ``` 26 | data_frame %>% 27 | summarise(average_of_variable = mean(variable)) 28 | ``` 29 | 30 | This process would start from the object called `data_frame`, and would then produce a new variable called `average_of_variable` that applies the `mean()` function against the variable called `variable`. 31 | 32 | As a result of this call, we would get back a single number, with the name `average_of_variable`.
33 | 34 | # Coding task 35 | 36 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `summarize.Rmd`. 37 | - This code will ask you to produce a few short summaries of data. 38 | -------------------------------------------------------------------------------- /code/summarize.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Summarizing a Series of Variables' 3 | author: 'w203: Statistics for Data Science' 4 | date: "8/13/2020" 5 | output: github_document 6 | --- 7 | 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 9 | library(tidyverse) 10 | ``` 11 | 12 | ```{r load data, message=FALSE} 13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 15 | ``` 16 | 17 | # Task 18 | 19 | With the agencies data, produce a *meaningful* summary of the following variables: 20 | 21 | - The average number of launches 22 | - The median of the number of launches 23 | - The variance of the number of launches 24 | - A count of the unique number of agencies. (In the lecture, I wrote a method of accomplishing this using base tools; you can also use the `dplyr` function `n_distinct`) 25 | 26 | ```{r} 27 | agencies %>% 28 | summarize( 29 | launches_mean = mean(count), 30 | launches_median = median(count), 31 | launches_var = var(count), 32 | agencies_count = n_distinct(agency, na.rm = TRUE), 33 | agencies_count_2 = length(unique(agency)) 34 | ) 35 | ``` 36 | 37 | > Notice a few things that I've done in this code: 38 | > 1. Each of the summary variables that I've written starts with the same variable "slug" -- in this case `launches_`.
I thought that this was a more useful variable name than `count`. But, more to the point, this way there is a consistent look-up (both visually and programmatically) for all the variables that are associated with this concept. If, instead, you wrote this as `mean_launches` which *does* have a more natural reading aloud, then the ordering of these variables might move apart when you consider, say `var_launches`. 39 | > 2. I've added extra white-space after the new variables that I've created so that I can align the `=` signs. This is always allowed within the code style, and helps to set apart the variables that you're making from those that exist. Just compare the two blocks below to see. 40 | 41 | agencies %>% 42 | summarize( 43 | launches_mean = mean(count), 44 | launches_median = median(count), 45 | launches_var = var(count), 46 | agencies_count = n_distinct(agency, na.rm = TRUE), 47 | agencies_count_2 = length(unique(agency)) 48 | ) 49 | 50 | agencies %>% 51 | summarize( 52 | launches_mean = mean(count), 53 | launches_median = median(count), 54 | launches_var = var(count), 55 | agencies_count = n_distinct(agency, na.rm = TRUE), 56 | agencies_count_2 = length(unique(agency)) 57 | ) 58 | -------------------------------------------------------------------------------- /code/summarize_solution.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Summarizing a Series of Variables' 3 | author: 'w203: Statistics for Data Science' 4 | date: "8/13/2020" 5 | output: github_document 6 | --- 7 | 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 9 | library(tidyverse) 10 | ``` 11 | 12 | ```{r load data, message=FALSE} 13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 15 | ``` 16 | 17 | # Task 18 | 19
| With the agencies data, produce a *meaningful* summary of the following variables: 20 | 21 | - The average number of launches 22 | - The median of the number of launches 23 | - The variance of the number of launches 24 | - A count of the unique number of agencies. (In the lecture, I wrote a method of accomplishing this using base tools; you can also use the `dplyr` function `n_distinct`) 25 | 26 | ```{r} 27 | agencies %>% 28 | summarize( 29 | launches_mean = mean(count), 30 | launches_median = median(count), 31 | launches_var = var(count), 32 | agencies_count = n_distinct(agency, na.rm = TRUE), 33 | agencies_count_2 = length(unique(agency)) 34 | ) 35 | ``` 36 | 37 | > Notice a few things that I've done in this code: 38 | > 39 | > 1. Each of the summary variables that I've written starts with the same variable "slug" -- in this case `launches_` or `agencies_`. I thought that this was a more useful variable name than `count`. But, more to the point, this way there is a consistent look-up (both visually and programmatically) for all the variables that are associated with this concept. If, instead, you wrote this as `mean_launches` which *does* have a more natural reading aloud, then the ordering of these variables might move apart when you consider, say `var_launches`. 40 | > 2. I've added extra white-space after the new variables that I've created so that I can align the `=` signs. This is always allowed within the code style, and helps to set apart the variables that you're making from those that exist. Just compare the two blocks below to see.
41 | 42 | agencies %>% 43 | summarize( 44 | launches_mean = mean(count), 45 | launches_median = median(count), 46 | launches_var = var(count), 47 | agencies_count = n_distinct(agency, na.rm = TRUE), 48 | agencies_count_2 = length(unique(agency)) 49 | ) 50 | 51 | agencies %>% 52 | summarize( 53 | launches_mean = mean(count), 54 | launches_median = median(count), 55 | launches_var = var(count), 56 | agencies_count = n_distinct(agency, na.rm = TRUE), 57 | agencies_count_2 = length(unique(agency)) 58 | ) 59 | -------------------------------------------------------------------------------- /code/group_by_summarize_solution.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Split, Apply, Combine' 3 | author: 'w203: Statistics for Data Science' 4 | date: "8/13/2020" 5 | output: github_document 6 | --- 7 | 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 9 | library(tidyverse) 10 | ``` 11 | 12 | ```{r load data, message=FALSE} 13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 15 | ``` 16 | 17 | # Task 18 | 19 | - Using the `launches` data, count the total number of launches per `launch_year`, grouped by `state_code`. 20 | 21 | ```{r} 22 | launches %>% 23 | group_by(state_code, launch_year) %>% 24 | summarise(launches_total = n()) 25 | ``` 26 | 27 | > Roger that. 28 | 29 | - Then, using `arrange` answer the question: which year was the busiest for any state? 30 | 31 | ```{r} 32 | launches %>% 33 | group_by(state_code, launch_year) %>% 34 | summarise(launches_total = n()) %>% 35 | arrange(desc(launches_total)) 36 | ``` 37 | 38 | > It looks like the busiest year for any state was 1982 in the Soviet Union, followed very closely by other years right around the same time for the soviet union. 
39 | 40 | - Then, using `filter` answer the question: what was the busiest year for the US? 41 | 42 | ```{r} 43 | launches %>% 44 | group_by(state_code, launch_year) %>% 45 | summarise(launches_total = n()) %>% 46 | filter(state_code == 'US') %>% 47 | arrange(desc(launches_total)) 48 | ``` 49 | 50 | > The busiest year for the US was 1966. And, although it is a different decade than the busiest year for the Soviet Union, you can see that when a country makes an investment in their space exploration program, they seem to have pretty bursty output for launches. 51 | 52 | - Then, using another variable summary, answer the question: which country has the most variance in the per-year launches? 53 | 54 | This is a little tricky, because I'm not asking you to include the launch year in the *final* grouping; instead, you're going to summarize *across* launch years. To do this, start as we have, but then part way through, drop the grouping by year. To do this, declare a new `group_by()` call, and then proceed with your summary. 55 | 56 | ```{r} 57 | launches %>% 58 | group_by(state_code, launch_year) %>% 59 | summarise(launches_total = n()) %>% 60 | group_by(state_code) %>% 61 | summarise(launches_variance = var(launches_total, na.rm = T)) %>% 62 | arrange(desc(launches_variance)) 63 | ``` 64 | 65 | Think about how hard this would have been if you were writing this in some *other* idiom. It isn't that this was necessarily easy, but that it is possible. And, it is legible while you're doing it! Neat. -------------------------------------------------------------------------------- /code/make_a_data_set_solution.md: -------------------------------------------------------------------------------- 1 | Make A Data Set 2 | ================ 3 | w203: Statistics for Data Science 4 | 8/14/2020 5 | 6 | # Paste Farm Code 7 | 8 | Start by pasting the code that you wrote to create the farm animals 9 | below.
10 | 11 | ``` r 12 | ID <- 1:300 13 | animal <- c(rep('cow', 10), rep('sheep', 50), rep('goat', 40), rep('chicken', 70), rep('rabbit', 130)) 14 | petting_zoo <- rep('no', 300) 15 | petting_zoo[animal == 'rabbit'] <- 'yes' 16 | 17 | weight <- rep(NA, 300) 18 | weight[animal == 'cow'] <- rnorm(n = 10, mean = 1000, sd = 100) 19 | weight[animal == 'sheep'] <- rnorm(n = 50, mean = 100, sd = 2) 20 | weight[animal == 'goat'] <- rnorm(n = 40, mean = 40, sd = 2) 21 | weight[animal == 'chicken'] <- .2 22 | weight[animal == 'rabbit'] <- NA 23 | 24 | feed <- rep(NA, 300) 25 | 26 | feed[animal == 'cow'] <- weight[animal == 'cow'] * .03 27 | feed[animal == 'sheep'] <- weight[animal == 'sheep'] *.02 28 | feed[animal == 'goat'] <- weight[animal == 'goat'] * .07 29 | feed[animal == 'chicken'] <- weight[animal == 'chicken'] * .02 30 | feed[animal == 'rabbit'] <- NA 31 | ``` 32 | 33 | ## Combine into a dataset 34 | 35 | Now, combine all of these variables into a single dataset, called 36 | `tilden`. 37 | 38 | ``` r 39 | tilden <- data.frame( 40 | ID, animal, petting_zoo, weight, feed 41 | ) 42 | ``` 43 | 44 | ## Typecast 45 | 46 | Now do a little bit of mutating and type converting. 47 | 48 | - Since we know that each of the types of animals has a specific 49 | amount that they eat, let’s label them “hungry boi” if they eat more 50 | than average; and “slender boi” if they eat less than average. 51 | (Sorry… I know this coding might be getting tedious\!). 52 | - Make each of these relative to the mean within that animal type. 53 | 54 | 55 | 56 | ``` r 57 | tilden %>% 58 | group_by(animal) %>% 59 | mutate(hungry_slender = ifelse(weight > mean(weight), 'hungry boi', 'slender boi')) 60 | ``` 61 | 62 | ## # A tibble: 300 x 6 63 | ## # Groups: animal [5] 64 | ## ID animal petting_zoo weight feed hungry_slender 65 | ## 66 | ## 1 1 cow no 1269. 38.1 hungry boi 67 | ## 2 2 cow no 1013. 30.4 slender boi 68 | ## 3 3 cow no 1098. 32.9 hungry boi 69 | ## 4 4 cow no 926. 
27.8 slender boi 70 | ## 5 5 cow no 1196. 35.9 hungry boi 71 | ## 6 6 cow no 1036. 31.1 slender boi 72 | ## 7 7 cow no 950. 28.5 slender boi 73 | ## 8 8 cow no 981. 29.4 slender boi 74 | ## 9 9 cow no 981. 29.4 slender boi 75 | ## 10 10 cow no 1118. 33.5 hungry boi 76 | ## # … with 290 more rows 77 | -------------------------------------------------------------------------------- /reading_calls/make_a_data_set_solution.md: -------------------------------------------------------------------------------- 1 | Make A Data Set 2 | ================ 3 | w203: Statistics for Data Science 4 | 8/14/2020 5 | 6 | # Paste Farm Code 7 | 8 | Start by pasting the code that you wrote to create the farm animals 9 | below. 10 | 11 | ``` r 12 | ID <- 1:300 13 | animal <- c(rep('cow', 10), rep('sheep', 50), rep('goat', 40), rep('chicken', 70), rep('rabbit', 130)) 14 | petting_zoo <- rep('no', 300) 15 | petting_zoo[animal == 'rabbit'] <- 'yes' 16 | 17 | weight <- rep(NA, 300) 18 | weight[animal == 'cow'] <- rnorm(n = 10, mean = 1000, sd = 100) 19 | weight[animal == 'sheep'] <- rnorm(n = 50, mean = 100, sd = 2) 20 | weight[animal == 'goat'] <- rnorm(n = 40, mean = 40, sd = 2) 21 | weight[animal == 'chicken'] <- .2 22 | weight[animal == 'rabbit'] <- NA 23 | 24 | feed <- rep(NA, 300) 25 | 26 | feed[animal == 'cow'] <- weight[animal == 'cow'] * .03 27 | feed[animal == 'sheep'] <- weight[animal == 'sheep'] *.02 28 | feed[animal == 'goat'] <- weight[animal == 'goat'] * .07 29 | feed[animal == 'chicken'] <- weight[animal == 'chicken'] * .02 30 | feed[animal == 'rabbit'] <- NA 31 | ``` 32 | 33 | ## Combine into a dataset 34 | 35 | Now, combine all of these variables into a single dataset, called 36 | `tilden`. 37 | 38 | ``` r 39 | tilden <- data.frame( 40 | ID, animal, petting_zoo, weight, feed 41 | ) 42 | ``` 43 | 44 | ## Typecast 45 | 46 | Now do a little bit of mutating and type converting. 
47 | 48 | - Since we know that each of the types of animals has a specific 49 | amount that they eat, let’s label them “hungry boi” if they eat more 50 | than average; and “slender boi” if they eat less than average. 51 | (Sorry… I know this coding might be getting tedious\!). 52 | - Make each of these relative to the mean within that animal type. 53 | 54 | 55 | 56 | ``` r 57 | tilden %>% 58 | group_by(animal) %>% 59 | mutate(hungry_slender = ifelse(weight > mean(weight), 'hungry boi', 'slender boi')) 60 | ``` 61 | 62 | ## # A tibble: 300 x 6 63 | ## # Groups: animal [5] 64 | ## ID animal petting_zoo weight feed hungry_slender 65 | ## 66 | ## 1 1 cow no 1269. 38.1 hungry boi 67 | ## 2 2 cow no 1013. 30.4 slender boi 68 | ## 3 3 cow no 1098. 32.9 hungry boi 69 | ## 4 4 cow no 926. 27.8 slender boi 70 | ## 5 5 cow no 1196. 35.9 hungry boi 71 | ## 6 6 cow no 1036. 31.1 slender boi 72 | ## 7 7 cow no 950. 28.5 slender boi 73 | ## 8 8 cow no 981. 29.4 slender boi 74 | ## 9 9 cow no 981. 29.4 slender boi 75 | ## 10 10 cow no 1118. 33.5 hungry boi 76 | ## # … with 290 more rows 77 | -------------------------------------------------------------------------------- /reading_calls/mini_project.md: -------------------------------------------------------------------------------- 1 | # Mini Project 2 | 3 | If you're having fun with this split-apply-combine framework, here's an idea for about a 60 minute project that you could work on. 4 | 5 | - I want to emphasize **this is not necessary** but if you want to solidify your work on everything that we've done to this point, it might help! 6 | 7 | # Seattle Bike Data 8 | 9 | As I'm writing this, I'm on my way to Seattle -- probably my favorite American city. The city has a fantastic culture around biking, especially for commuting to and from work.
Because the city was interested in knowing just how many people ride along their bike trails, they installed counters that record the following information: 10 | 11 | - The date (and hour) of the observation; 12 | - The number of cyclists observed (total) in that hour; which is a combination of two other fields that are recorded: 13 | - How many cyclists are recorded going east-bound; and, 14 | - How many cyclists are recorded going west-bound. 15 | 16 | ## Writeup using this data 17 | 18 | The Seattle Times did a [write-up](https://www.seattletimes.com/seattle-news/transportation/what-we-can-learn-from-seattles-bike-counter-data/#interactive) using the data that you can read. 19 | 20 | ## Mini-project using this data 21 | 22 | The Seattle bike data is available [[here]](https://www.seattle.gov/transportation/projects-and-programs/programs/bike-program/bike-counters). But, somewhat annoyingly, for the busiest routes, they're only making a dashboard available. 23 | 24 | However, for the bike counter that is along the I-90 Bridge that separates Seattle from Richmond, they make the full dataset available. A link is [[here]](https://www.seattle.gov/transportation/projects-and-programs/programs/bike-program/bike-counters). 25 | 26 | With this data: 27 | 28 | 1. Download a .csv file of the data. 29 | 2. Load this data 30 | - If it is in your downloads folder: then you can read the data with the following call: `bike_data <- read_csv('~/Downloads/NAME_OF_THE_DATA_THAT_YOU_DOWNLOADED.csv')` where you will replace the `NAME_OF_THE_DATA_THAT_YOU_DOWNLOADED` callout with its actual name. 31 | 3. See if you can work with the data to identify whether there are patterns: 32 | - Are there more east-bound or west-bound rides at certain parts of the day? Why might this be? 33 | - Are there months of the year that there are more (or fewer) total rides? 34 | - Are the weekends different from the weekdays?
35 | 36 | One small complication is that the `date` field hasn't actually been turned into something that you can use. To use it (without using more advanced POSIX time series types), you will have to `mutate` pieces off of this data. 37 | 38 | Here is a little example that will pull the year off this column: 39 | 40 | ``` 41 | bike_data %>% 42 | mutate(year = substr(Date, start = 1, stop = 4)) 43 | ``` 44 | 45 | Have fun! 46 | -------------------------------------------------------------------------------- /code/base_operations_solution.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Heading to the Little Farm" 3 | author: 'w203: Statistics for Data Science' 4 | output: github_document 5 | --- 6 | 7 | # The Little Farm 8 | 9 | Just around the corner from campus in Berkeley is the Tilden "Little Farm" -- it's a sort of hobby farm that has cows, goats, chickens, sheep, and a small menagerie of other cute animals. I think the idea is that kids can see animals -- but I've also got to admit that it is kind of weird to have a tiny farm in the middle of the neighborhood. 10 | 11 | ## Create Farm Data 12 | 13 | Let's use all the methods that you've just been working on to produce a dataset that represents the farm. Using methods that you're familiar with, create the following dataset. 14 | 15 | - A column that ranges from 1 - 300 called "ID" that is an ID for the animal.
16 | - A column that has the type of animal that is being recorded: 17 | - There are 10 cows 18 | - There are 50 sheep 19 | - There are 40 goats 20 | - There are 70 chickens 21 | - There are 130 rabbits 22 | - A column that describes whether that animal belongs in the petting zoo 23 | - A column that describes the weight of the animals: 24 | - Cows weight are normally distributed, with a mean of 1000kg and a sd of 100kg 25 | - Sheep are normally distributed, with a mean of 100kg and a sd of 2 kg 26 | - Goats are normally distributed, with a mean of 40kg and a sd of 2kg 27 | - Chickens all weigh .2 kg 28 | - Rabbits don't stand still long enough to be weighted, so there is no data 29 | - A column that described how much feed that animal needs -- this is animal specific, and depends on how much the animal weighs. 30 | - A cow needs 3% of its body weight each day to stay alive 31 | - A sheep needs 2% of its body weight each day to stay alive 32 | - A goat needs 7% of its body weight each day to stay alive 33 | - A chicken needs 2% of its body weight each day to stay alive 34 | 35 | For this activity, you cannot use the `tidyverse`. 
36 | 37 | ```{r} 38 | ID <- 1:300 39 | animal <- c(rep('cow', 10), rep('sheep', 50), rep('goat', 40), rep('chicken', 70), rep('rabbit', 130)) 40 | petting_zoo <- rep('no', 300) 41 | petting_zoo[animal == 'rabbit'] <- 'yes' 42 | 43 | weight <- rep(NA, 300) 44 | weight[animal == 'cow'] <- rnorm(n = 10, mean = 1000, sd = 100) 45 | weight[animal == 'sheep'] <- rnorm(n = 50, mean = 100, sd = 2) 46 | weight[animal == 'goat'] <- rnorm(n = 40, mean = 40, sd = 2) 47 | weight[animal == 'chicken'] <- .2 48 | weight[animal == 'rabbit'] <- NA 49 | 50 | feed <- rep(NA, 300) 51 | 52 | feed[animal == 'cow'] <- weight[animal == 'cow'] * .03 53 | feed[animal == 'sheep'] <- weight[animal == 'sheep'] *.02 54 | feed[animal == 'goat'] <- weight[animal == 'goat'] * .07 55 | feed[animal == 'chicken'] <- weight[animal == 'chicken'] * .02 56 | feed[animal == 'rabbit'] <- NA 57 | ``` 58 | -------------------------------------------------------------------------------- /code/base_operations_solution.md: -------------------------------------------------------------------------------- 1 | Heading to the Little Farm 2 | ================ 3 | w203: Statistics for Data Science 4 | 5 | # The Little Farm 6 | 7 | Just around the corner from campus in Berkeley is the Tilden “Little 8 | Farm” its a sort of hobby farm that has cows, goats, chickens, sheep, 9 | and a small menagerie of other cute animals. I think that idea is that 10 | kids can see animals – but I’ve also got to admit that it is kind of 11 | weird to have a tiny farm in the middle of the neighborhood. 12 | 13 | ## Create Farm Data 14 | 15 | Let’s use all the methods that you’ve just been working on to produce a 16 | dataset that represents the farm. Using methods that you’re familiar 17 | with, create the following dataset. 18 | 19 | - A column that ranges from 1 - 300 called “ID” that is an ID for the 20 | animal. 
21 | - A column that has the type of animal that is being recorded: 22 | - There are 10 cows 23 | - There are 50 sheep 24 | - There are 40 goats 25 | - There are 70 chickens 26 | - There are 130 rabbits 27 | - A column that describes whether that animal belongs in the petting 28 | zoo 29 | - A column that describes the weight of the animals: 30 | - Cows weight are normally distributed, with a mean of 1000kg and 31 | a sd of 100kg 32 | - Sheep are normally distributed, with a mean of 100kg and a sd of 33 | 2 kg 34 | - Goats are normally distributed, with a mean of 40kg and a sd of 35 | 2kg 36 | - Chickens all weigh .2 kg 37 | - Rabbits don’t stand still long enough to be weighted, so there 38 | is no data 39 | - A column that described how much feed that animal needs – this is 40 | animal specific, and depends on how much the animal weighs. 41 | - A cow needs 3% of its body weight each day to stay alive 42 | - A sheep needs 2% of its body weight each day to stay alive 43 | - A goat needs 7% of its body weight each day to stay alive 44 | - A chicken needs 2% of its body weight each day to stay alive 45 | 46 | For this activity, you cannot use the `tidyverse`. 
47 | 48 | ``` r 49 | ID <- 1:300 50 | animal <- c(rep('cow', 10), rep('sheep', 50), rep('goat', 40), rep('chicken', 70), rep('rabbit', 130)) 51 | petting_zoo <- rep('no', 300) 52 | petting_zoo[animal == 'rabbit'] <- 'yes' 53 | 54 | weight <- rep(NA, 300) 55 | weight[animal == 'cow'] <- rnorm(n = 10, mean = 1000, sd = 100) 56 | weight[animal == 'sheep'] <- rnorm(n = 50, mean = 100, sd = 2) 57 | weight[animal == 'goat'] <- rnorm(n = 40, mean = 40, sd = 2) 58 | weight[animal == 'chicken'] <- .2 59 | weight[animal == 'rabbit'] <- NA 60 | 61 | feed <- rep(NA, 300) 62 | 63 | feed[animal == 'cow'] <- weight[animal == 'cow'] * .03 64 | feed[animal == 'sheep'] <- weight[animal == 'sheep'] *.02 65 | feed[animal == 'goat'] <- weight[animal == 'goat'] * .07 66 | feed[animal == 'chicken'] <- weight[animal == 'chicken'] * .02 67 | feed[animal == 'rabbit'] <- NA 68 | ``` 69 | -------------------------------------------------------------------------------- /reading_calls/base_operations_solution.md: -------------------------------------------------------------------------------- 1 | Heading to the Little Farm 2 | ================ 3 | w203: Statistics for Data Science 4 | 5 | # The Little Farm 6 | 7 | Just around the corner from campus in Berkeley is the Tilden “Little 8 | Farm” its a sort of hobby farm that has cows, goats, chickens, sheep, 9 | and a small menagerie of other cute animals. I think that idea is that 10 | kids can see animals – but I’ve also got to admit that it is kind of 11 | weird to have a tiny farm in the middle of the neighborhood. 12 | 13 | ## Create Farm Data 14 | 15 | Let’s use all the methods that you’ve just been working on to produce a 16 | dataset that represents the farm. Using methods that you’re familiar 17 | with, create the following dataset. 18 | 19 | - A column that ranges from 1 - 300 called “ID” that is an ID for the 20 | animal. 
21 | - A column that has the type of animal that is being recorded: 22 | - There are 10 cows 23 | - There are 50 sheep 24 | - There are 40 goats 25 | - There are 70 chickens 26 | - There are 130 rabbits 27 | - A column that describes whether that animal belongs in the petting 28 | zoo 29 | - A column that describes the weight of the animals: 30 | - Cows weight are normally distributed, with a mean of 1000kg and 31 | a sd of 100kg 32 | - Sheep are normally distributed, with a mean of 100kg and a sd of 33 | 2 kg 34 | - Goats are normally distributed, with a mean of 40kg and a sd of 35 | 2kg 36 | - Chickens all weigh .2 kg 37 | - Rabbits don’t stand still long enough to be weighted, so there 38 | is no data 39 | - A column that described how much feed that animal needs – this is 40 | animal specific, and depends on how much the animal weighs. 41 | - A cow needs 3% of its body weight each day to stay alive 42 | - A sheep needs 2% of its body weight each day to stay alive 43 | - A goat needs 7% of its body weight each day to stay alive 44 | - A chicken needs 2% of its body weight each day to stay alive 45 | 46 | For this activity, you cannot use the `tidyverse`. 
47 | 48 | ``` r 49 | ID <- 1:300 50 | animal <- c(rep('cow', 10), rep('sheep', 50), rep('goat', 40), rep('chicken', 70), rep('rabbit', 130)) 51 | petting_zoo <- rep('no', 300) 52 | petting_zoo[animal == 'rabbit'] <- 'yes' 53 | 54 | weight <- rep(NA, 300) 55 | weight[animal == 'cow'] <- rnorm(n = 10, mean = 1000, sd = 100) 56 | weight[animal == 'sheep'] <- rnorm(n = 50, mean = 100, sd = 2) 57 | weight[animal == 'goat'] <- rnorm(n = 40, mean = 40, sd = 2) 58 | weight[animal == 'chicken'] <- .2 59 | weight[animal == 'rabbit'] <- NA 60 | 61 | feed <- rep(NA, 300) 62 | 63 | feed[animal == 'cow'] <- weight[animal == 'cow'] * .03 64 | feed[animal == 'sheep'] <- weight[animal == 'sheep'] *.02 65 | feed[animal == 'goat'] <- weight[animal == 'goat'] * .07 66 | feed[animal == 'chicken'] <- weight[animal == 'chicken'] * .02 67 | feed[animal == 'rabbit'] <- NA 68 | ``` 69 | -------------------------------------------------------------------------------- /reading_calls/introduction_to_space_data.md: -------------------------------------------------------------------------------- 1 | # Introduction to Space Data 2 | 3 | From here forward we're going to switch the data that we're using from squirrels to space. 4 | 5 | I came to this new data at[ Tidy Tuesday](https://github.com/rfordatascience/tidytuesday/tree/master/data/2019/2019-01-15), a collection of data exploration challenges that come online every Tuesday. They're pretty neat sometimes! 6 | 7 | The data came to them from an article that the Economist published, which in turn came from two different sources. You can read about these sources on the website if you're interested. 8 | 9 | Below I have reproduced the data file names and variable definitions. 
10 | 11 | ## Data files 12 | 13 | | File | Description | Source | 14 | | -------- | ---------------------- | ---------------------------------- | 15 | | [agencies](agencies.csv) | Space launch providers | Jonathan McDowell; _The Economist_ | 16 | | [launches](launches.csv) | Individual space launches | Jonathan McDowell; _The Economist_ | 17 | 18 | ## Codebook 19 | 20 | ### launches 21 | 22 | | variable | definition | 23 | | ----------- | ---------------------------------------- | 24 | | tag | Harvard or [COSPAR][cospar] id of launch | 25 | | JD | [Julian Date][jd] of launch | 26 | | launch_date | date of launch | 27 | | launch_year | year of launch | 28 | | type | type of launch vehicle | 29 | | variant | variant of launch vehicle | 30 | | mission | | 31 | | agency | launching agency | 32 | | state_code | launching agency's state | 33 | | category | success (O) or failure (F) | 34 | | agency_type | type of agency | 35 | 36 | ### agencies 37 | 38 | | variable | definition | 39 | | ------------------ | ----------------------- | 40 | | agency | org phase code | 41 | | count | number of launches | 42 | | ucode | org Ucode | 43 | | state_code | responsible state | 44 | | type | type of org | 45 | | class | class of org | 46 | | tstart | org/phase founding date | 47 | | tstop | org/phase ending date | 48 | | short_name | short name | 49 | | name | full name | 50 | | location | plain english location | 51 | | longitude | | 52 | | latitude | | 53 | | error | uncertainty in long/lat | 54 | | parent | parent org | 55 | | short_english_name | english short name | 56 | | english_name | english full name | 57 | | unicode_name | unicode full name | 58 | | agency_type | type of agency | 59 | -------------------------------------------------------------------------------- /code/how_to_summarise_solution.md: -------------------------------------------------------------------------------- 1 | How to Smooth 2 | ================ 3 | w203: Statistics for Data Science 4 | 5 | ``` r 6 | 
library(tidyverse) 7 | library(ggplot2) 8 | # install.packages('patchwork') 9 | library(patchwork) 10 | 11 | theme_set(theme_minimal()) 12 | knitr::opts_chunk$set(dpi = 200) 13 | ``` 14 | 15 | ``` r 16 | squirrel_subset <- read.csv('squirrels_subset.csv') 17 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color)) 18 | 19 | squirrel_subset <- squirrel_subset %>% 20 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) 21 | ``` 22 | 23 | # Coding Task 24 | 25 | There if you plot the grouped plot that we’ve shown a few times now, 26 | there’s a lot of movement in the observations on a daily basis. This 27 | might questions to focus on a particularly productive day on the 28 | squirrel census – “I don’t know why there were more this day\!” – which 29 | isn’t really the point of the plot. Instead, the point of the plot is 30 | that there are many more Gray squirrels but that the number seems to be 31 | decreasing through the census time. 32 | 33 | ``` r 34 | squirrel_subset %>% 35 | group_by(date_f, primary_fur_color) %>% 36 | summarise(count_of_colors = n()) %>% 37 | ggplot() + 38 | aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 39 | geom_line() 40 | ``` 41 | 42 | ## `summarise()` regrouping output by 'date_f' (override with `.groups` argument) 43 | 44 | ![](how_to_summarise_solution_files/figure-gfm/unnamed-chunk-1-1.png) 45 | 46 | ## Your Task 47 | 48 | Change the plot so that instead the plot uses the `stat_smooth()` 49 | function – to do so, you’ll have to change the `geom_line()` call to 50 | something else. 51 | 52 | - First run the smoother as is 53 | - Then, suppress the reporting of the standard errors (you will likely 54 | have to look into the help documentation to figure out what the 55 | particular argument is that controls those error bars) 56 | - Then, re-plot again but change the variable that controls the 57 | “wiggliness” of the lines. 
Is there a level of this variable that 58 | you think best communicates the point you want to make with this 59 | data? 60 | 61 | 62 | 63 | ``` r 64 | squirrel_subset %>% 65 | group_by(date_f, primary_fur_color) %>% 66 | summarise(count_of_colors = n()) %>% 67 | ggplot() + 68 | aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 69 | stat_smooth(span = .8, se = FALSE) 70 | ``` 71 | 72 | ## `summarise()` regrouping output by 'date_f' (override with `.groups` argument) 73 | 74 | ## `geom_smooth()` using method = 'loess' and formula 'y ~ x' 75 | 76 | ![](how_to_summarise_solution_files/figure-gfm/unnamed-chunk-2-1.png) 77 | \> I think that this `span = 0.8` is my preferred span. You can see that 78 | it is *just* barely staying smooth – there are a few points where this 79 | plot seems to try to pull away from the general line. Setting the span 80 | higher oversimplifies the trend; to my eye, setting it smaller 81 | recovers most of the “noise” that we wanted to smooth out of the data. 82 | -------------------------------------------------------------------------------- /reading_calls/how_to_summarise_solution.md: -------------------------------------------------------------------------------- 1 | How to Smooth 2 | ================ 3 | w203: Statistics for Data Science 4 | 5 | ``` r 6 | library(tidyverse) 7 | library(ggplot2) 8 | # install.packages('patchwork') 9 | library(patchwork) 10 | 11 | theme_set(theme_minimal()) 12 | knitr::opts_chunk$set(dpi = 200) 13 | ``` 14 | 15 | ``` r 16 | squirrel_subset <- read.csv('squirrels_subset.csv') 17 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color)) 18 | 19 | squirrel_subset <- squirrel_subset %>% 20 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) 21 | ``` 22 | 23 | # Coding Task 24 | 25 | If you plot the grouped plot that we’ve shown a few times now, 26 | there’s a lot of movement in the observations on a daily basis.
This 27 | might questions to focus on a particularly productive day on the 28 | squirrel census – “I don’t know why there were more this day\!” – which 29 | isn’t really the point of the plot. Instead, the point of the plot is 30 | that there are many more Gray squirrels but that the number seems to be 31 | decreasing through the census time. 32 | 33 | ``` r 34 | squirrel_subset %>% 35 | group_by(date_f, primary_fur_color) %>% 36 | summarise(count_of_colors = n()) %>% 37 | ggplot() + 38 | aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 39 | geom_line() 40 | ``` 41 | 42 | ## `summarise()` regrouping output by 'date_f' (override with `.groups` argument) 43 | 44 | ![](how_to_summarise_solution_files/figure-gfm/unnamed-chunk-1-1.png) 45 | 46 | ## Your Task 47 | 48 | Change the plot so that instead the plot uses the `stat_smooth()` 49 | function – to do so, you’ll have to change the `geom_line()` call to 50 | something else. 51 | 52 | - First run the smoother as is 53 | - Then, suppress the reporting of the standard errors (you will likely 54 | have to look into the help documentation to figure out what the 55 | particular argument is that controls those error bars) 56 | - Then, re-plot again but change the variable that controls the 57 | “wiggliness” of the lines. Is there a level of this variable that 58 | you think best communicates the point you want to make with this 59 | data? 60 | 61 | 62 | 63 | ``` r 64 | squirrel_subset %>% 65 | group_by(date_f, primary_fur_color) %>% 66 | summarise(count_of_colors = n()) %>% 67 | ggplot() + 68 | aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 69 | stat_smooth(span = .8, se = FALSE) 70 | ``` 71 | 72 | ## `summarise()` regrouping output by 'date_f' (override with `.groups` argument) 73 | 74 | ## `geom_smooth()` using method = 'loess' and formula 'y ~ x' 75 | 76 | ![](how_to_summarise_solution_files/figure-gfm/unnamed-chunk-2-1.png) 77 | \> I think that this `span = 0.8` is my preferred span. 
You can see that 78 | it is *just* barely staying smooth – there are a few points where this 79 | plot seems to try to pull away from the general line. Setting the span 80 | higher oversimplifies the trend; to my eye, setting it smaller 81 | recovers most of the “noise” that we wanted to smooth out of the data. 82 | -------------------------------------------------------------------------------- /code/summarize_solution.md: -------------------------------------------------------------------------------- 1 | Summarizing a Series of Variables 2 | ================ 3 | w203: Statistics for Data Science 4 | 8/13/2020 5 | 6 | ``` r 7 | library(tidyverse) 8 | ``` 9 | 10 | ``` r 11 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 12 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 13 | ``` 14 | 15 | # Task 16 | 17 | With the agencies data, produce a *meaningful* summary of the following 18 | variables: 19 | 20 | - The average number of launches 21 | - The median of the number of launches 22 | - The variance of the number of launches 23 | - A count of the unique number of agencies. (In the lecture, I wrote a 24 | method of accomplishing this using base tools; you can also use the 25 | `dplyr` function `n_distinct`) 26 | 27 | 28 | 29 | ``` r 30 | agencies %>% 31 | summarize( 32 | launches_mean = mean(count), 33 | launches_median = median(count), 34 | launches_var = var(count), 35 | agencies_count = n_distinct(agency, na.rm = TRUE), 36 | agencies_count_2 = length(unique(agency)) 37 | ) 38 | ``` 39 | 40 | ## # A tibble: 1 x 5 41 | ## launches_mean launches_median launches_var agencies_count agencies_count_2 42 | ## 43 | ## 1 77.1 12 46203. 74 74 44 | 45 | > Notice a few things that I’ve done in this code: 46 | > 47 | > 1.
Each of the summary variables that I’ve written starts with the 48 | > same variable “slug” – in this case `launches_` or `agencies_`. 49 | > I thought that this was a more useful variable name than `count`. 50 | > But, more to the point, this way there is a consistent look-up 51 | > (both visually and programmatically) for all the variables that are 52 | > associated with this concept. If, instead, you wrote this as 53 | > `mean_launches` which *does* have a more natural reading aloud, 54 | > then the ordering of these variables might move apart when you 55 | > consider, say `var_launches`. 56 | > 2. I’ve added extra white-space after the new variables that I’ve 57 | > created so that I can align the `=` signs. This is always allowed 58 | > within the code style, and helps to set apart the variables that 59 | > you’re making from those that exist. Just compare the two blocks 60 | > below to see. 61 | 62 | agencies %>% 63 | summarize( 64 | launches_mean = mean(count), 65 | launches_median = median(count), 66 | launches_var = var(count), 67 | agencies_count = n_distinct(agency, na.rm = TRUE), 68 | agencies_count_2 = length(unique(agency)) 69 | ) 70 | 71 | agencies %>% 72 | summarize( 73 | launches_mean = mean(count), 74 | launches_median = median(count), 75 | launches_var = var(count), 76 | agencies_count = n_distinct(agency, na.rm = TRUE), 77 | agencies_count_2 = length(unique(agency)) 78 | ) 79 | -------------------------------------------------------------------------------- /reading_calls/summarize_solution.md: -------------------------------------------------------------------------------- 1 | Summarizing a Series of Variables 2 | ================ 3 | w203: Statistics for Data Science 4 | 8/13/2020 5 | 6 | ``` r 7 | library(tidyverse) 8 | ``` 9 | 10 | ``` r 11 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 12 | launches <-
read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 13 | ``` 14 | 15 | # Task 16 | 17 | With the agencies data, produce a *meaningful* summary of the following 18 | variables: 19 | 20 | - The average number of launches 21 | - The median of the number of launches 22 | - The variance of the number of launches 23 | - A count of the unique number of agencies. (In the lecture, I wrote a 24 | method of accomplishing this using base tools; you can also use the 25 | `dplyr` function `n_distinct`) 26 | 27 | 28 | 29 | ``` r 30 | agencies %>% 31 | summarize( 32 | launches_mean = mean(count), 33 | launches_median = median(count), 34 | launches_var = var(count), 35 | agencies_count = n_distinct(agency, na.rm = TRUE), 36 | agencies_count_2 = length(unique(agency)) 37 | ) 38 | ``` 39 | 40 | ## # A tibble: 1 x 5 41 | ## launches_mean launches_median launches_var agencies_count agencies_count_2 42 | ## 43 | ## 1 77.1 12 46203. 74 74 44 | 45 | > Notice a few things that I’ve done in this code: 46 | > 47 | > 1. Each of the summary variables that I’ve written starts with the 48 | > same variable “slug” – in this case `launches_` or `agencies_`. 49 | > I thought that this was a more useful variable name than `count`. 50 | > But, more to the point, this way there is a consistent look-up 51 | > (both visually and programmatically) for all the variables that are 52 | > associated with this concept. If, instead, you wrote this as 53 | > `mean_launches` which *does* have a more natural reading aloud, 54 | > then the ordering of these variables might move apart when you 55 | > consider, say `var_launches`. 56 | > 2. I’ve added extra white-space after the new variables that I’ve 57 | > created so that I can align the `=` signs. This is always allowed 58 | > within the code style, and helps to set apart the variables that 59 | > you’re making from those that exist. Just compare the two blocks 60 | > below to see.
61 | 62 | agencies %>% 63 | summarize( 64 | launches_mean = mean(count), 65 | launches_median = median(count), 66 | launches_var = var(count), 67 | agencies_count = n_distinct(agency, na.rm = TRUE), 68 | agencies_count_2 = length(unique(agency)) 69 | ) 70 | 71 | agencies %>% 72 | summarize( 73 | launches_mean = mean(count), 74 | launches_median = median(count), 75 | launches_var = var(count), 76 | agencies_count = n_distinct(agency, na.rm = TRUE), 77 | agencies_count_2 = length(unique(agency)) 78 | ) 79 | -------------------------------------------------------------------------------- /code/grouped_data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Grouped Data" 3 | author: 'w203: Statistics for Data Science' 4 | output: github_document 5 | --- 6 | 7 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 8 | library(tidyverse) 9 | library(ggplot2) 10 | # install.packages('patchwork') 11 | library(patchwork) 12 | 13 | theme_set(theme_minimal()) 14 | knitr::opts_chunk$set(dpi = 200) 15 | ``` 16 | 17 | ```{r load and mutate data} 18 | squirrel_subset <- read.csv('squirrels_subset.csv') 19 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color)) 20 | 21 | squirrel_subset <- squirrel_subset %>% 22 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) 23 | ``` 24 | 25 | # Plots Groups with Colors 26 | 27 | A few days ago, we plotted the count of squirrels that were observed over time. 
28 | 29 | - Because it was by time, we reasoned that a line plot did a good job of illustrating the connections between the observations 30 | - The plot looked something like what is below 31 | 32 | ```{r ungrouped plot} 33 | squirrel_subset %>% 34 | group_by(date_f) %>% 35 | summarise(count_of_squirrels = n()) %>% 36 | ggplot() + 37 | aes(x = date_f, y = count_of_squirrels) + 38 | geom_line() 39 | ``` 40 | At that point, we asked a challenge question of you, that asked, 41 | 42 | > Could you also make this plot and represent the color of the squirrels in the plot? 43 | 44 | Here, I've written the first set of lines that would do this for you -- this takes the squirrel subset data, groups by date and fur color, and then counts the number of squirrels that are observed in each of these combinations. 45 | 46 | Complete the plot, by: 47 | 48 | - Adding a `ggplot()` call; 49 | - Adding an `aes()` call; and, 50 | - Adding a `geom_line()` call to produce the line 51 | 52 | Think, as you're drawing this plot -- what parts of this are mapping from data that I want to bring into the plot? What do I want to map that information onto? This might help to keep clear the code that you want to write. 53 | 54 | ```{r} 55 | squirrel_subset %>% 56 | group_by(date_f, primary_fur_color) %>% 57 | summarise(count_of_colors = n()) 58 | ``` 59 | 60 | # Plot Groups with Different Graphs 61 | 62 | Although I think that it is probably uniformly **less** effective of a representation in this case, you might instead want to plot each group on a different axis. 63 | 64 | - To do so, use the `facet_wrap()` function to place each of the `primary_fur_colors` onto their own set of axes. 65 | - To help you along, I'll note that within `facet_wrap()` you will probably have to use the argument `facets = vars(primary_fur_color)`. This is a bit of a weird part of the `ggplot` api, and something that I hope they'll fix in the future. 
66 | - However, at least they're fair about telling you that you'll have to use the `vars()` function -- look into the help documentation for this function. 67 | 68 | ```{r} 69 | ?facet_wrap() 70 | ``` 71 | 72 | - Like before, I'll start you down the road for this plot by doing the data mapping. 73 | - Which way communicates more clearly for you? Aligning the plots by rows? Or aligning them by columns? Why do you think this is? 74 | 75 | ```{r} 76 | squirrel_subset %>% 77 | group_by(date_f, primary_fur_color) %>% 78 | summarise(count_of_squirrels = n()) 79 | ``` 80 | 81 | -------------------------------------------------------------------------------- /code/make_scatter_plots_solution.md: -------------------------------------------------------------------------------- 1 | 2 | ``` r 3 | library(tidyverse) 4 | library(ggplot2) 5 | theme_set(theme_minimal()) 6 | knitr::opts_chunk$set(dpi = 200) 7 | ``` 8 | 9 | This output is best viewed over in github because we cannot render 10 | images into the ISVC. 11 | 12 | # Load data: this should just work 13 | 14 | ``` r 15 | squirrel_subset <- read_csv('./squirrels_subset.csv') 16 | ``` 17 | 18 | ## Parsed with column specification: 19 | ## cols( 20 | ## long = col_double(), 21 | ## lat = col_double(), 22 | ## hectare = col_character(), 23 | ## date = col_double(), 24 | ## age = col_character(), 25 | ## primary_fur_color = col_character() 26 | ## ) 27 | 28 | ## Create a basic plot of the observations on the lat and long axis 29 | 30 | ``` r 31 | ggplot(data = squirrel_subset) + 32 | aes(x = long, y = lat) + 33 | geom_point() 34 | ``` 35 | 36 | ![](make_scatter_plots_solution_files/figure-gfm/basic%20plot-1.png) 37 | 38 | > This looks like central park to me\! 39 | 40 | # Task 1: Color by age 41 | 42 | Write code that will modify the plot so that it is colored by age.
43 | 44 | ``` r 45 | ggplot(data = squirrel_subset) + 46 | aes(x = long, y = lat, color = age) + 47 | geom_point() 48 | ``` 49 | 50 | ![](make_scatter_plots_solution_files/figure-gfm/age%20plot-1.png) 51 | 52 | # Task 2: Make Every Point Blue 53 | 54 | - Now, write code that will make every point blue, not colored by age. 55 | - Notice that now this choice isn’t an attribute of the data. 56 | - Where does this mean that the `color` argument should go? 57 | 58 | 59 | 60 | ``` r 61 | ggplot(data = squirrel_subset) + 62 | aes(x = long, y = lat) + 63 | geom_point(color = 'blue') 64 | ``` 65 | 66 | ![](make_scatter_plots_solution_files/figure-gfm/feeling%20blue-1.png) 67 | 68 | # Task 3: Color by the Fur Color 69 | 70 | - Now, write code that will color the points by the variable 71 | `primary_fur_color`. 72 | - Notice that this now *is* an attribute of the data. So, where should 73 | the `color` argument go? 74 | 75 | 76 | 77 | ``` r 78 | ggplot(data = squirrel_subset) + 79 | aes(x = long, y = lat, color = primary_fur_color) + 80 | geom_point() 81 | ``` 82 | 83 | ![](make_scatter_plots_solution_files/figure-gfm/fur%20color%20plot-1.png) 84 | 85 | # Task 4: Put onto non-Euclidean space 86 | 87 | - If you think carefully about this, we’re mapping the geographic 88 | coordinate system onto the Euclidean coordinate system. This isn’t a 89 | *huge* deal in this case because we’re only covering Central Park. 90 | But, what’s right is right… 91 | 92 | - ggplot has the ability to map onto the geographic coordinate system 93 | using the additional function `coord_quickmap()` (which is an 94 | approximation) or `coord_map()` which is not an approximation 95 | 96 | - Given what you understand about the layering system that ggplot 97 | uses, can you add on this new layer that is the `coord_quickmap()` 98 | coordinate system? 99 | 100 | - If so, how much does it change the plot?
101 | 102 | 103 | 104 | ``` r 105 | ggplot(data = squirrel_subset) + 106 | aes(x = long, y = lat, color = primary_fur_color) + 107 | coord_quickmap() + 108 | geom_point() 109 | ``` 110 | 111 | ![](make_scatter_plots_solution_files/figure-gfm/non%20euclidian%20space-1.png) 112 | -------------------------------------------------------------------------------- /reading_calls/make_scatter_plots_solution.md: -------------------------------------------------------------------------------- 1 | 2 | ``` r 3 | library(tidyverse) 4 | library(ggplot2) 5 | theme_set(theme_minimal()) 6 | knitr::opts_chunk$set(dpi = 200) 7 | ``` 8 | 9 | This output is best viewed over in github because we cannot render 10 | images into the ISVC. 11 | 12 | # Load data: this should just work 13 | 14 | ``` r 15 | squirrel_subset <- read_csv('./squirrels_subset.csv') 16 | ``` 17 | 18 | ## Parsed with column specification: 19 | ## cols( 20 | ## long = col_double(), 21 | ## lat = col_double(), 22 | ## hectare = col_character(), 23 | ## date = col_double(), 24 | ## age = col_character(), 25 | ## primary_fur_color = col_character() 26 | ## ) 27 | 28 | ## Create a basic plot of the observations on the lat and long axis 29 | 30 | ``` r 31 | ggplot(data = squirrel_subset) + 32 | aes(x = long, y = lat) + 33 | geom_point() 34 | ``` 35 | 36 | ![](make_scatter_plots_solution_files/figure-gfm/basic%20plot-1.png) 37 | 38 | > This looks like central park to me\! 39 | 40 | # Task 1: Color by age 41 | 42 | Write code that will modify the plot so that it is colored by age. 43 | 44 | ``` r 45 | ggplot(data = squirrel_subset) + 46 | aes(x = long, y = lat, color = age) + 47 | geom_point() 48 | ``` 49 | 50 | ![](make_scatter_plots_solution_files/figure-gfm/age%20plot-1.png) 51 | 52 | # Task 2: Make Every Point Blue 53 | 54 | - Now, write code that will make every point blue, not colored by age. 55 | - Notice that now this choice isn’t an attribute of the data. 
56 | - Where does this mean that the `color` argument should go? 57 | 58 | 59 | 60 | ``` r 61 | ggplot(data = squirrel_subset) + 62 | aes(x = long, y = lat) + 63 | geom_point(color = 'blue') 64 | ``` 65 | 66 | ![](make_scatter_plots_solution_files/figure-gfm/feeling%20blue-1.png) 67 | 68 | # Task 3: Color by the Fur Color 69 | 70 | - Now, write code that will color the points by the variable 71 | `primary_fur_color`. 72 | - Notice that this now *is* an attribute of the data. So, where should 73 | the `color` argument go? 74 | 75 | 76 | 77 | ``` r 78 | ggplot(data = squirrel_subset) + 79 | aes(x = long, y = lat, color = primary_fur_color) + 80 | geom_point() 81 | ``` 82 | 83 | ![](make_scatter_plots_solution_files/figure-gfm/fur%20color%20plot-1.png) 84 | 85 | # Task 4: Put onto non-Euclidean space 86 | 87 | - If you think carefully about this, we’re mapping the geographic 88 | coordinate system onto the Euclidean coordinate system. This isn’t a 89 | *huge* deal in this case because we’re only covering Central Park. 90 | But, what’s right is right… 91 | 92 | - ggplot has the ability to map onto the geographic coordinate system 93 | using the additional function `coord_quickmap()` (which is an 94 | approximation) or `coord_map()` which is not an approximation 95 | 96 | - Given what you understand about the layering system that ggplot 97 | uses, can you add on this new layer that is the `coord_quickmap()` 98 | coordinate system? 99 | 100 | - If so, how much does it change the plot?
101 | 102 | 103 | 104 | ``` r 105 | ggplot(data = squirrel_subset) + 106 | aes(x = long, y = lat, color = primary_fur_color) + 107 | coord_quickmap() + 108 | geom_point() 109 | ``` 110 | 111 | ![](make_scatter_plots_solution_files/figure-gfm/non%20euclidian%20space-1.png) 112 | -------------------------------------------------------------------------------- /code/working_with_rstudio.md: -------------------------------------------------------------------------------- 1 | Interacting with the IDE Solutions 2 | ================ 3 | w203: Statistics for Data Science 4 | 8/13/2020 5 | 6 | ``` r 7 | ## Welcome! You've made it over to the IDE. 8 | 9 | ## Any line that starts with one or more `#` will be commented out. 10 | ## This means that if you run that line, nothing will actually occur in the 11 | ## interpretor. 12 | 13 | ## To run this code below you can do the following: 14 | ## - If you are on a Mac, on the line that you want to run you can press `command+return` 15 | ## which means to hold command and then press return. 16 | ## - If you are on a Windows or Linux machine, then you can run that line by pressing `alt+return` 17 | ## which means to hold the alt key and press return. 18 | 19 | ## When you run the first line one of two things might happen: 20 | ## 1. You might get an error because you haven't installed that package. If this happens, 21 | ## look near the top of your Rstudio screen -- there should be a helper that asks if 22 | ## you want to install this library. You do and can click "install". 23 | ## 2. If you've already installed that library, then it should load the package, which 24 | ## you will see in the console below. 25 | 26 | 27 | library(ggplot2) 28 | 29 | ## Now, if you want to create some data, you can either 30 | ## - Run the first line where you are creating the object `d` that is a data.frame; or, 31 | ## - Highlight the region that you want to run and then run that region (using command+return 32 | ## or alt+return). 
33 | 34 | 35 | d <- data.frame( 36 | id = 1:1000, 37 | x = rnorm(1000, mean = 0, sd = 1), 38 | y = rnorm(1000, mean = 10, sd = 2), 39 | color = sample(c('red', 'blue'), size = 1000, replace = TRUE) 40 | ) 41 | 42 | ## To produce the plot below, run these lines. Do you need to run all the lines? 43 | ## Or, can you run just the first? Why do you think the IDE chooses to behave this way? 44 | 45 | ggplot(data = d, aes(x=x, y=y)) + 46 | geom_point() 47 | ``` 48 | 49 | ![](working_with_rstudio_files/figure-gfm/unnamed-chunk-1-1.png) 50 | 51 | ``` r 52 | ## Finally, you can run code that doesn't have any visible side effects. 53 | ## If you run the line below, what do you see in your console? Just that the line has run? 54 | ## But, now look into the `Environment` tab that is visible to you -- is there a record 55 | ## of this `mod` that you just created? 56 | 57 | mod <- lm(y ~ x, data = d) 58 | 59 | ## The model that you created is stored in the working memory and can be called by 60 | ## naming the object. 61 | 62 | mod 63 | ``` 64 | 65 | ## 66 | ## Call: 67 | ## lm(formula = y ~ x, data = d) 68 | ## 69 | ## Coefficients: 70 | ## (Intercept) x 71 | ## 9.93161 -0.06939 72 | 73 | ``` r 74 | ## If you want to use the summary function on the model, you can and you will see 75 | ## a different return printed to the console. 76 | 77 | summary(mod) 78 | ``` 79 | 80 | ## 81 | ## Call: 82 | ## lm(formula = y ~ x, data = d) 83 | ## 84 | ## Residuals: 85 | ## Min 1Q Median 3Q Max 86 | ## -5.7238 -1.3353 -0.0679 1.3411 6.1006 87 | ## 88 | ## Coefficients: 89 | ## Estimate Std. Error t value Pr(>|t|) 90 | ## (Intercept) 9.93161 0.06249 158.925 <2e-16 *** 91 | ## x -0.06939 0.06334 -1.096 0.273 92 | ## --- 93 | ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 94 | ## 95 | ## Residual standard error: 1.976 on 998 degrees of freedom 96 | ## Multiple R-squared: 0.001201, Adjusted R-squared: 0.0002006 97 | ## F-statistic: 1.2 on 1 and 998 DF, p-value: 0.2735 98 | -------------------------------------------------------------------------------- /code/working_with_rstudio_solution.md: -------------------------------------------------------------------------------- 1 | Interacting with the IDE Solutions 2 | ================ 3 | w203: Statistics for Data Science 4 | 8/13/2020 5 | 6 | ``` r 7 | ## Welcome! You've made it over to the IDE. 8 | 9 | ## Any line that starts with one or more `#` will be commented out. 10 | ## This means that if you run that line, nothing will actually occur in the 11 | ## interpretor. 12 | 13 | ## To run this code below you can do the following: 14 | ## - If you are on a Mac, on the line that you want to run you can press `command+return` 15 | ## which means to hold command and then press return. 16 | ## - If you are on a Windows or Linux machine, then you can run that line by pressing `alt+return` 17 | ## which means to hold the alt key and press return. 18 | 19 | ## When you run the first line one of two things might happen: 20 | ## 1. You might get an error because you haven't installed that package. If this happens, 21 | ## look near the top of your Rstudio screen -- there should be a helper that asks if 22 | ## you want to install this library. You do and can click "install". 23 | ## 2. If you've already installed that library, then it should load the package, which 24 | ## you will see in the console below. 25 | 26 | 27 | library(ggplot2) 28 | 29 | ## Now, if you want to create some data, you can either 30 | ## - Run the first line where you are creating the object `d` that is a data.frame; or, 31 | ## - Highlight the region that you want to run and then run that region (using command+return 32 | ## or alt+return). 
33 | 34 | 35 | d <- data.frame( 36 | id = 1:1000, 37 | x = rnorm(1000, mean = 0, sd = 1), 38 | y = rnorm(1000, mean = 10, sd = 2), 39 | color = sample(c('red', 'blue'), size = 1000, replace = TRUE) 40 | ) 41 | 42 | ## To produce the plot below, run these lines. Do you need to run all the lines? 43 | ## Or, can you run just the first? Why do you think the IDE chooses to behave this way? 44 | 45 | ggplot(data = d, aes(x=x, y=y)) + 46 | geom_point() 47 | ``` 48 | 49 | ![](working_with_rstudio_solution_files/figure-gfm/unnamed-chunk-1-1.png) 50 | 51 | ``` r 52 | ## Finally, you can run code that doesn't have any visible side effects. 53 | ## If you run the line below, what do you see in your console? Just that the line has run? 54 | ## But, now look into the `Environment` tab that is visible to you -- is there a record 55 | ## of this `mod` that you just created? 56 | 57 | mod <- lm(y ~ x, data = d) 58 | 59 | ## The model that you created is stored in the working memory and can be called by 60 | ## naming the object. 61 | 62 | mod 63 | ``` 64 | 65 | ## 66 | ## Call: 67 | ## lm(formula = y ~ x, data = d) 68 | ## 69 | ## Coefficients: 70 | ## (Intercept) x 71 | ## 9.9418 0.0556 72 | 73 | ``` r 74 | ## If you want to use the summary function on the model, you can and you will see 75 | ## a different return printed to the console. 76 | 77 | summary(mod) 78 | ``` 79 | 80 | ## 81 | ## Call: 82 | ## lm(formula = y ~ x, data = d) 83 | ## 84 | ## Residuals: 85 | ## Min 1Q Median 3Q Max 86 | ## -6.8812 -1.2280 -0.0065 1.3295 5.8167 87 | ## 88 | ## Coefficients: 89 | ## Estimate Std. Error t value Pr(>|t|) 90 | ## (Intercept) 9.94179 0.06298 157.9 <2e-16 *** 91 | ## x 0.05560 0.06176 0.9 0.368 92 | ## --- 93 | ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 94 | ## 95 | ## Residual standard error: 1.991 on 998 degrees of freedom 96 | ## Multiple R-squared: 0.0008113, Adjusted R-squared: -0.0001899 97 | ## F-statistic: 0.8103 on 1 and 998 DF, p-value: 0.3683 98 | -------------------------------------------------------------------------------- /reading_calls/working_with_rstudio_solution.md: -------------------------------------------------------------------------------- 1 | Interacting with the IDE Solutions 2 | ================ 3 | w203: Statistics for Data Science 4 | 8/13/2020 5 | 6 | ``` r 7 | ## Welcome! You've made it over to the IDE. 8 | 9 | ## Any line that starts with one or more `#` will be commented out. 10 | ## This means that if you run that line, nothing will actually occur in the 11 | ## interpretor. 12 | 13 | ## To run this code below you can do the following: 14 | ## - If you are on a Mac, on the line that you want to run you can press `command+return` 15 | ## which means to hold command and then press return. 16 | ## - If you are on a Windows or Linux machine, then you can run that line by pressing `alt+return` 17 | ## which means to hold the alt key and press return. 18 | 19 | ## When you run the first line one of two things might happen: 20 | ## 1. You might get an error because you haven't installed that package. If this happens, 21 | ## look near the top of your Rstudio screen -- there should be a helper that asks if 22 | ## you want to install this library. You do and can click "install". 23 | ## 2. If you've already installed that library, then it should load the package, which 24 | ## you will see in the console below. 25 | 26 | 27 | library(ggplot2) 28 | 29 | ## Now, if you want to create some data, you can either 30 | ## - Run the first line where you are creating the object `d` that is a data.frame; or, 31 | ## - Highlight the region that you want to run and then run that region (using command+return 32 | ## or alt+return). 
33 | 34 | 35 | d <- data.frame( 36 | id = 1:1000, 37 | x = rnorm(1000, mean = 0, sd = 1), 38 | y = rnorm(1000, mean = 10, sd = 2), 39 | color = sample(c('red', 'blue'), size = 1000, replace = TRUE) 40 | ) 41 | 42 | ## To produce the plot below, run these lines. Do you need to run all the lines? 43 | ## Or, can you run just the first? Why do you think the IDE chooses to behave this way? 44 | 45 | ggplot(data = d, aes(x=x, y=y)) + 46 | geom_point() 47 | ``` 48 | 49 | ![](working_with_rstudio_solution_files/figure-gfm/unnamed-chunk-1-1.png) 50 | 51 | ``` r 52 | ## Finally, you can run code that doesn't have any visible side effects. 53 | ## If you run the line below, what do you see in your console? Just that the line has run? 54 | ## But, now look into the `Environment` tab that is visible to you -- is there a record 55 | ## of this `mod` that you just created? 56 | 57 | mod <- lm(y ~ x, data = d) 58 | 59 | ## The model that you created is stored in the working memory and can be called by 60 | ## naming the object. 61 | 62 | mod 63 | ``` 64 | 65 | ## 66 | ## Call: 67 | ## lm(formula = y ~ x, data = d) 68 | ## 69 | ## Coefficients: 70 | ## (Intercept) x 71 | ## 9.9418 0.0556 72 | 73 | ``` r 74 | ## If you want to use the summary function on the model, you can and you will see 75 | ## a different return printed to the console. 76 | 77 | summary(mod) 78 | ``` 79 | 80 | ## 81 | ## Call: 82 | ## lm(formula = y ~ x, data = d) 83 | ## 84 | ## Residuals: 85 | ## Min 1Q Median 3Q Max 86 | ## -6.8812 -1.2280 -0.0065 1.3295 5.8167 87 | ## 88 | ## Coefficients: 89 | ## Estimate Std. Error t value Pr(>|t|) 90 | ## (Intercept) 9.94179 0.06298 157.9 <2e-16 *** 91 | ## x 0.05560 0.06176 0.9 0.368 92 | ## --- 93 | ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 94 | ## 95 | ## Residual standard error: 1.991 on 998 degrees of freedom 96 | ## Multiple R-squared: 0.0008113, Adjusted R-squared: -0.0001899 97 | ## F-statistic: 0.8103 on 1 and 998 DF, p-value: 0.3683 98 | -------------------------------------------------------------------------------- /code/mutate_solution.md: -------------------------------------------------------------------------------- 1 | Mutating a New Varible 2 | ================ 3 | w203: Statistics for Data Science 4 | 8/13/2020 5 | 6 | ``` r 7 | library(tidyverse) 8 | ``` 9 | 10 | ``` r 11 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 12 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 13 | ``` 14 | 15 | # Mutate Task 16 | 17 | - Using the `agencies` data, create a series of variables that contain 18 | the log of the `count` of launches. 19 | 20 | 21 | 22 | ``` r 23 | agencies_log <- agencies %>% 24 | mutate(count_log = log(count)) 25 | 26 | agencies_log 27 | ``` 28 | 29 | ## # A tibble: 74 x 20 30 | ## agency count ucode state_code type class tstart tstop short_name name 31 | ## 32 | ## 1 RVSN 1528 RVSN SU O/LA D 1960 1991… RVSN Rake… 33 | ## 2 UNKS 904 GUKOS SU O/LA D 1986 … 1991 UNKS Upra… 34 | ## 3 NASA 469 NASA US O/LA… C 1958 … - NASA Nati… 35 | ## 4 USAF 388 USAF US O/LA… D 1947 … - USAF Unit… 36 | ## 5 AE 258 AE F O/LA B 1980 … * Arianespa… Aria… 37 | ## 6 AFSC 247 AFSC US LA D 1961 … 1992… AFSC US A… 38 | ## 7 VKSR 200 GUKOS RU O/LA D 1997 … 2001… VKS RVSN Voen… 39 | ## 8 CALT 181 CALT CN LA/L… C 1957 … - CALT Zhon… 40 | ## 9 FKA 128 MOM RU O/LA C 2004 2016… Roskosmos Fede… 41 | ## 10 SAST 105 SBA CN O/LA… B 1993 - SAST Shan… 42 | ## # … with 64 more rows, and 10 more variables: location , longitude , 43 | ## # latitude , error , parent , short_english_name , 44 | ## # english_name , unicode_name , agency_type , count_log 
45 | 46 | > But note that you don’t *have* to assign this to a new object. 47 | 48 | - Then, show only the columns that are called either `agency` or 49 | `contains()` the string “count”. 50 | 51 | 52 | 53 | ``` r 54 | agencies_log %>% 55 | select(agency, contains('count')) 56 | ``` 57 | 58 | ## # A tibble: 74 x 3 59 | ## agency count count_log 60 | ## 61 | ## 1 RVSN 1528 7.33 62 | ## 2 UNKS 904 6.81 63 | ## 3 NASA 469 6.15 64 | ## 4 USAF 388 5.96 65 | ## 5 AE 258 5.55 66 | ## 6 AFSC 247 5.51 67 | ## 7 VKSR 200 5.30 68 | ## 8 CALT 181 5.20 69 | ## 9 FKA 128 4.85 70 | ## 10 SAST 105 4.65 71 | ## # … with 64 more rows 72 | 73 | - Finally, `arrange()` these descending by `count`. 74 | 75 | 76 | 77 | ``` r 78 | agencies_log %>% 79 | select(agency, contains('count')) %>% 80 | arrange(desc(count)) 81 | ``` 82 | 83 | ## # A tibble: 74 x 3 84 | ## agency count count_log 85 | ## 86 | ## 1 RVSN 1528 7.33 87 | ## 2 UNKS 904 6.81 88 | ## 3 NASA 469 6.15 89 | ## 4 USAF 388 5.96 90 | ## 5 AE 258 5.55 91 | ## 6 AFSC 247 5.51 92 | ## 7 VKSR 200 5.30 93 | ## 8 CALT 181 5.20 94 | ## 9 FKA 128 4.85 95 | ## 10 SAST 105 4.65 96 | ## # … with 64 more rows 97 | 98 | > HA\! It looks as though the data came in the door arranged by count. 99 | > However, I would **never** suggest relying on this. If you want the 100 | > data arranged by count, write the code to do so. The upstream data 101 | > that comes into your analysis could change; potentially without you 102 | > knowing. 103 | > 104 | > If you want your data to have some particular characteristic, you 105 | > should write the code that makes it be so.
106 | -------------------------------------------------------------------------------- /reading_calls/mutate_solution.md: -------------------------------------------------------------------------------- 1 | Mutating a New Varible 2 | ================ 3 | w203: Statistics for Data Science 4 | 8/13/2020 5 | 6 | ``` r 7 | library(tidyverse) 8 | ``` 9 | 10 | ``` r 11 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv') 12 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv') 13 | ``` 14 | 15 | # Mutate Task 16 | 17 | - Using the `agencies` data, create a series of variables that contain 18 | the log of the `count` of launches. 19 | 20 | 21 | 22 | ``` r 23 | agencies_log <- agencies %>% 24 | mutate(count_log = log(count)) 25 | 26 | agencies_log 27 | ``` 28 | 29 | ## # A tibble: 74 x 20 30 | ## agency count ucode state_code type class tstart tstop short_name name 31 | ## 32 | ## 1 RVSN 1528 RVSN SU O/LA D 1960 1991… RVSN Rake… 33 | ## 2 UNKS 904 GUKOS SU O/LA D 1986 … 1991 UNKS Upra… 34 | ## 3 NASA 469 NASA US O/LA… C 1958 … - NASA Nati… 35 | ## 4 USAF 388 USAF US O/LA… D 1947 … - USAF Unit… 36 | ## 5 AE 258 AE F O/LA B 1980 … * Arianespa… Aria… 37 | ## 6 AFSC 247 AFSC US LA D 1961 … 1992… AFSC US A… 38 | ## 7 VKSR 200 GUKOS RU O/LA D 1997 … 2001… VKS RVSN Voen… 39 | ## 8 CALT 181 CALT CN LA/L… C 1957 … - CALT Zhon… 40 | ## 9 FKA 128 MOM RU O/LA C 2004 2016… Roskosmos Fede… 41 | ## 10 SAST 105 SBA CN O/LA… B 1993 - SAST Shan… 42 | ## # … with 64 more rows, and 10 more variables: location , longitude , 43 | ## # latitude , error , parent , short_english_name , 44 | ## # english_name , unicode_name , agency_type , count_log 45 | 46 | > But not that you don’t *have* to assign this to a new object. 47 | 48 | - Then, show only the columns that are called either `agency` or 49 | `contains()` the string “count”. 
50 | 51 | 52 | 53 | ``` r 54 | agencies_log %>% 55 | select(agency, contains('count')) 56 | ``` 57 | 58 | ## # A tibble: 74 x 3 59 | ## agency count count_log 60 | ## 61 | ## 1 RVSN 1528 7.33 62 | ## 2 UNKS 904 6.81 63 | ## 3 NASA 469 6.15 64 | ## 4 USAF 388 5.96 65 | ## 5 AE 258 5.55 66 | ## 6 AFSC 247 5.51 67 | ## 7 VKSR 200 5.30 68 | ## 8 CALT 181 5.20 69 | ## 9 FKA 128 4.85 70 | ## 10 SAST 105 4.65 71 | ## # … with 64 more rows 72 | 73 | - Finally, `arrange()` these descending by `count`. 74 | 75 | 76 | 77 | ``` r 78 | agencies_log %>% 79 | select(agency, contains('count')) %>% 80 | arrange(desc(count)) 81 | ``` 82 | 83 | ## # A tibble: 74 x 3 84 | ## agency count count_log 85 | ## 86 | ## 1 RVSN 1528 7.33 87 | ## 2 UNKS 904 6.81 88 | ## 3 NASA 469 6.15 89 | ## 4 USAF 388 5.96 90 | ## 5 AE 258 5.55 91 | ## 6 AFSC 247 5.51 92 | ## 7 VKSR 200 5.30 93 | ## 8 CALT 181 5.20 94 | ## 9 FKA 128 4.85 95 | ## 10 SAST 105 4.65 96 | ## # … with 64 more rows 97 | 98 | > HA\! It looks as though the data came in the door arranged by count. 99 | > However, I would **never** suggest relying on this. If you want the 100 | > data arranged by count, write the code to do so. The upstream data 101 | > that comes into your analysis could change; potentially without you 102 | > knowing. 103 | > 104 | > If you want your data to have some particular characteristic, you 105 | > should write the code that makes it be so. 106 | -------------------------------------------------------------------------------- /code/additional_plot_features.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Additional Plot Features" 3 | author: 'w203: Statistics for Data Science' 4 | output: github_document 5 | --- 6 | 7 | From here on out, we're going to start working in **R Markdown Files**. These files let us weave code and explanation together. What you're reading right here is explanation -- this isn't actually run by the interpreter in the console.
8 | 9 | But, what you're seeing just below -- in a "chunk" set off by three back-ticks -- is a code block. 10 | 11 | If you look at line 14, and run this line just as you would have if this were an .R file (it is a .Rmd file) what happens? 12 | 13 | ```{r} 14 | print('Hello world.') 15 | ``` 16 | 17 | Now the result, rather than being printed only to the console, is printed both in the console and in line with this code. One piece that you might have noticed is the curly braces after the first set of back-ticks. This is telling the interpreter that this is code that is written in the R language. It is possible (though we won't do this for now) to write Python, Julia, C++ or other code within these chunks and have the interpreter evaluate them. 18 | 19 | Below, I'm going to write a chunk that I've called "setup" that is going to load libraries and set themes and set parameters for plots. I've also added extra arguments to this code chunk declaration -- `results = 'hide', warning=FALSE, message=FALSE`. These control how the chunk works, but let's not dig **too** far into that just yet. 20 | 21 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 22 | library(tidyverse) 23 | library(ggplot2) 24 | theme_set(theme_minimal()) 25 | knitr::opts_chunk$set(dpi = 200) 26 | ``` 27 | 28 | In this chunk, which I've called "load data" I'm going to load the data, and then create the date field that we've used all along. 29 | 30 | ```{r load data} 31 | squirrel_subset <- read_csv('./squirrels_subset.csv') 32 | squirrel_subset <- squirrel_subset %>% 33 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) 34 | ``` 35 | 36 | Now, you can see below that I'm creating a section in the markdown space by starting the line with a "hash". If this were in a code chunk, it would comment out the line, but here in the markdown space, this will create a level-1 heading. Two hashes would create a level-2 heading (one further indent).
37 | 38 | # Data Reminder 39 | 40 | As a reminder, we're working with data that is from the "census of squirrels" in NYC's Central Park. 41 | Each row is an observation of a squirrel, and each observation has as much data as possible about the observation. 42 | 43 | ## Previous Plots 44 | 45 | The last set of plots that we made with `geom_point()` were simple scatter plots where we passed: 46 | 47 | - The `long`; 48 | - The `lat`; and, 49 | - One other feature. 50 | 51 | ```{r the old plot} 52 | ggplot(data = squirrel_subset) + 53 | aes(x = long, y = lat) + 54 | geom_point() + 55 | coord_quickmap() 56 | ``` 57 | 58 | 59 | # This plot 60 | 61 | For this plot, I'd like you to represent four data series in a single plot. Because each of these is information in the dataset that we want to use to control the way the plot is built, each of the series should go into the `aes()` function. 62 | 63 | 1. The `long`; 64 | 2. The `lat`; 65 | 3. The `age`; 66 | 4. The `primary_fur_color`. 67 | 68 | I'm sure you're wondering: what aesthetic options are available to me? To look into this, let's look into the documentation for `geom_point()`, because the `geom_point()` inherits its aesthetics from the `aes()` function. 69 | 70 | When you run the line below, RStudio should then open a help browser. 71 | 72 | ```{r geom_point_help} 73 | ?geom_point() 74 | ``` 75 | 76 | I see several aesthetics listed: 77 | 78 | - x 79 | - y 80 | - alpha, ... and many more. 81 | 82 | Before you start to build the plot -- think about what you might want to show -- you might go as far as to write down the plot that you want to create (as I've just suggested in the lecture) but that isn't strictly necessary because this is reasonably straightforward data. 83 | Once you know what you'd like to show, pass data series into these aesthetics until you have created a plot that you think does a good job of representing your idea.
84 | 85 | ```{r} 86 | 87 | ``` -------------------------------------------------------------------------------- /code/grouped_data_solution.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Grouped Data" 3 | author: 'w203: Statistics for Data Science' 4 | output: github_document 5 | --- 6 | 7 | ```{r setup, results='hide', warning=FALSE, message=FALSE} 8 | library(tidyverse) 9 | library(ggplot2) 10 | # install.packages('patchwork') 11 | library(patchwork) 12 | 13 | theme_set(theme_minimal()) 14 | knitr::opts_chunk$set(dpi = 200) 15 | ``` 16 | 17 | ```{r load and mutate data} 18 | squirrel_subset <- read.csv('squirrels_subset.csv') 19 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color)) 20 | 21 | squirrel_subset <- squirrel_subset %>% 22 | mutate(date_f = as.Date.character(date, format = '%m%d%Y')) 23 | 24 | ``` 25 | 26 | # Plots Groups with Colors 27 | 28 | A few days ago, we plotted the count of squirrels that were observed over time. 29 | 30 | - Because it was by time, we reasoned that a line plot did a good job of illustrating the connections between the observations 31 | - The plot looked something like what is below 32 | 33 | ```{r ungrouped plot} 34 | squirrel_subset %>% 35 | group_by(date_f) %>% 36 | summarise(count_of_squirrels = n()) %>% 37 | ggplot() + 38 | aes(x = date_f, y = count_of_squirrels) + 39 | geom_line() 40 | ``` 41 | At that point, we asked a challenge question of you, that asked, 42 | 43 | > Could you also make this plot and represent the color of the squirrels in the plot? 44 | 45 | Here, I've written the first set of lines that would do this for you -- this takes the squirrel subset data, groups by date and fur color, and then counts the number of squirrels that are observed in each of these combinations. 
46 | 47 | Complete the plot, by: 48 | 49 | - Adding a `ggplot()` call; 50 | - Adding an `aes()` call; and, 51 | - Adding a `geom_line()` call to produce the line 52 | 53 | Think, as you're drawing this plot -- what parts of this are mapping from data that I want to bring into the plot? What do I want to map that information onto? This might help to keep clear the code that you want to write. 54 | 55 | ```{r} 56 | squirrel_subset %>% 57 | group_by(date_f, primary_fur_color) %>% 58 | summarise(count_of_colors = n()) %>% 59 | ggplot() + 60 | aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 61 | geom_line() 62 | ``` 63 | 64 | # Plot Groups with Different Graphs 65 | 66 | Although I think that it is probably a uniformly **less** effective representation in this case, you might instead want to plot each group on a different axis. 67 | 68 | - To do so, use the `facet_wrap()` function to place each value of `primary_fur_color` onto its own set of axes. 69 | - To help you along, I'll note that within `facet_wrap()` you will probably have to use the argument `facets = vars(primary_fur_color)`. This is a bit of a weird part of the `ggplot` API, and something that I hope they'll fix in the future. 70 | - However, at least they're fair about telling you that you'll have to use the `vars()` function -- look into the help documentation for this function. 71 | 72 | ```{r} 73 | ?facet_wrap() 74 | ``` 75 | 76 | - Like before, I'll start you down the road for this plot by doing the data mapping. 77 | - Which way communicates more clearly for you? Aligning the plots by rows? Or aligning them by columns? Why do you think this is?
78 | 79 | ```{r} 80 | squirrel_subset %>% 81 | group_by(date_f, primary_fur_color) %>% 82 | summarise(count_of_squirrels = n()) %>% 83 | ggplot() + 84 | aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 85 | facet_wrap(facets = vars(primary_fur_color), nrow = 3) + 86 | geom_line() 87 | ``` 88 | 89 | > For me, I think that these plots work a **little** bit better when they are stacked vertically, because then I can see that all the dates align. Of course, immediately upon realizing this, then it becomes very clear that this plot would be more successful if it were placed on a single set of axes. 90 | > 91 | > This kind of iterative plot making is (or at least can be) quite fun; and, once you realize that this mapping doesn't work better, you can return to the single set of axes. 92 | 93 | ```{r} 94 | squirrel_subset %>% 95 | group_by(date_f, primary_fur_color) %>% 96 | summarise(count_of_colors = n()) %>% 97 | ggplot() + 98 | aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 99 | geom_line() 100 | ``` 101 | 102 | -------------------------------------------------------------------------------- /code/code_in_videos.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(ggplot2) 3 | library(patchwork) 4 | # Load the squirrel data and keep only rows with a recorded fur color 5 | squirrel_subset <- read.csv('squirrels_subset.csv') 6 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color)) 7 | # Add date_f: the date column parsed as a Date (month-day-year, no separators) 8 | squirrel_subset <- squirrel_subset %>% 9 | mutate(date_f = as.Date(as.character(date), format = '%m%d%Y')) 10 | # Line plot of the number of rows (sightings) per date 11 | squirrel_subset %>% 12 | group_by(date_f) %>% 13 | summarise(count_of_squirrels = n()) %>% 14 | ggplot() + 15 | aes(x = date_f, y = count_of_squirrels) + 16 | geom_line() 17 | # Bar chart of fur color, one facet column per date 18 | squirrel_subset %>% 19 | ggplot() + 20 | aes(primary_fur_color) + 21 | geom_bar() + 22 | facet_grid(cols = vars(date_f)) 23 | # Bar chart of date, one facet column per fur color 24 | squirrel_subset %>% 25 | ggplot() + 26 | aes(date_f) + 27 | geom_bar() + 28 | facet_grid(cols = 
vars(primary_fur_color)) 29 | # Count rows per date and fur color, drawn as one colored line per fur color 30 | squirrel_subset %>% 31 | group_by(date_f, primary_fur_color) %>% 32 | summarise(count_of_colors = n()) %>% 33 | ggplot() + 34 | aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 35 | geom_line() 36 | # p1: scatter of long/lat colored by fur color 37 | p1 <- squirrel_subset %>% 38 | ggplot() + 39 | aes(x = long, y = lat, color = primary_fur_color) + 40 | geom_point() 41 | # p2: density of long, filled by fur color 42 | p2 <- squirrel_subset %>% 43 | ggplot() + 44 | aes(long, fill = primary_fur_color) + 45 | geom_histogram(stat = 'density') 46 | 47 | p1 / p2 # patchwork: place p1 above p2 48 | 49 | 50 | 51 | # Rows per date as a line; note group_by() uses an underscore (`group-by` parses as a subtraction and errors) 52 | squirrel_subset %>% 53 | group_by(date_f) %>% 54 | summarise(count = n()) %>% 55 | ggplot() + 56 | aes(x = date_f, y = count) + 57 | geom_line() 58 | 59 | ggplot(data = squirrel_subset) + 60 | aes(x = long, y = lat, color = primary_fur_color) + 61 | geom_point() + 62 | coord_map() 63 | 64 | 65 | squirrel_subset %>% 66 | group_by(date_f, primary_fur_color) %>% 67 | summarise(count_of_squirrels = n()) %>% 68 | ggplot() + 69 | aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 70 | geom_line() + 71 | theme_minimal() 72 | 73 | squirrel_subset %>% 74 | ggplot() + 75 | aes(x = date_f) + 76 | geom_histogram() + 77 | facet_wrap(vars(primary_fur_color), ncol = 1) 78 | 79 | 80 | squirrel_subset %>% 81 | group_by(date_f, primary_fur_color) %>% 82 | summarise(count_of_squirrels = n()) %>% 83 | ggplot() + 84 | aes(x = date_f, y = count_of_squirrels) + 85 | facet_wrap(facets = vars(primary_fur_color), nrow = 1) + 86 | geom_line() 87 | 88 | squirrel_subset %>% 89 | ggplot() + 90 | aes(date_f, fill = primary_fur_color) + 91 | geom_histogram(position = 'dodge') 92 | # group_by() must precede summarise(): an ungrouped summarise() returns a single row without date_f or primary_fur_color, so aes() could not find them 93 | squirrel_subset %>% group_by(date_f, primary_fur_color) %>% 94 | summarise(count_of_squirrels = n()) %>% 95 | ggplot() + 96 | aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 97 | stat_smooth(se = FALSE) 98 | 99 | 100 | squirrel_subset %>% 101 | group_by(date_f, primary_fur_color) %>% 102 | summarise(count_of_squirrels = n()) %>% 103 | ggplot() + 104 | aes(x = date_f, y = 
count_of_squirrels, color = primary_fur_color) + 105 | stat_smooth(se = FALSE) + 106 | labs( 107 | title = 'There are a lot of grey squirrels', 108 | subtitle = 'But, people are collecting data in later days', 109 | x = 'Date of observation', 110 | y = 'Count of squirrels', 111 | color = 'Primary Fur Color' 112 | ) 113 | # Same plot with the x range restricted via lims(); scale limits discard out-of-range rows before stat_smooth() fits 114 | squirrel_subset %>% 115 | group_by(date_f, primary_fur_color) %>% 116 | summarise(count_of_squirrels = n()) %>% 117 | ggplot() + 118 | aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 119 | stat_smooth(se = FALSE) + 120 | lims( 121 | x = c(as.Date('2018-10-07'), 122 | as.Date('2018-10-16'))) + 123 | labs( 124 | title = 'There are a lot of grey squirrels', 125 | subtitle = 'But, people are collecting data in later days', 126 | x = 'Date of observation', 127 | y = 'Count of squirrels', 128 | color = 'Primary Fur Color' 129 | ) 130 | # Same x range via coord_cartesian(), which zooms the view without removing data from the smoother 131 | squirrel_subset %>% 132 | group_by(date_f, primary_fur_color) %>% 133 | summarise(count_of_squirrels = n()) %>% 134 | ggplot() + 135 | aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 136 | stat_smooth(se = FALSE) + 137 | coord_cartesian( 138 | xlim = c(as.Date('2018-10-07'), 139 | as.Date('2018-10-16')) 140 | ) + 141 | labs( 142 | title = 'There are a lot of grey squirrels', 143 | subtitle = 'But, people are collecting data in later days', 144 | x = 'Date of observation', 145 | y = 'Count of squirrels', 146 | color = 'Primary Fur Color' 147 | ) 148 | --------------------------------------------------------------------------------