├── docs
    ├── images
    │   ├── Icon
    │   ├── south_hall.jpg
    │   └── ischool_logo.png
    ├── _config.yml
    ├── schedule.md
    ├── syllabus.md
    └── index.md
├── reading_calls
    ├── Icon
    ├── load_data_solution.md
    ├── project_work_solution.md
    ├── images
    │   └── r_bridge_datahub.png
    ├── restructure_codebase_solution.md
    ├── choose_a_plot_type.md
    ├── grouped_data_solution_files
    │   └── figure-gfm
    │   │   ├── ungrouped plot-1.png
    │   │   ├── unnamed-chunk-1-1.png
    │   │   ├── unnamed-chunk-2-1.png
    │   │   ├── unnamed-chunk-3-1.png
    │   │   ├── unnamed-chunk-4-1.png
    │   │   └── unnamed-chunk-5-1.png
    ├── make_it_sparkle_solution_files
    │   └── figure-gfm
    │   │   └── coding task-1.png
    ├── make_scatter_plots_solution_files
    │   └── figure-gfm
    │   │   ├── age plot-1.png
    │   │   ├── basic plot-1.png
    │   │   ├── feeling blue-1.png
    │   │   ├── fur color plot-1.png
    │   │   └── non euclidian space-1.png
    ├── make_bar_plots_solution_files
    │   └── figure-gfm
    │   │   └── unnamed-chunk-1-1.png
    ├── make_line_plots_solution_files
    │   └── figure-gfm
    │   │   ├── unnamed-chunk-1-1.png
    │   │   └── unnamed-chunk-2-1.png
    ├── additional_features_solution_files
    │   └── figure-gfm
    │   │   ├── the old plot-1.png
    │   │   └── unnamed-chunk-2-1.png
    ├── how_to_summarise_solution_files
    │   └── figure-gfm
    │   │   ├── unnamed-chunk-1-1.png
    │   │   └── unnamed-chunk-2-1.png
    ├── working_with_rstudio_solution_files
    │   └── figure-gfm
    │   │   └── unnamed-chunk-1-1.png
    ├── setting_the_extents_solution.md
    ├── choose_a_plot_type_solution.md
    ├── join_and_merge_solution.md
    ├── load_data.md
    ├── call_to_reading.md
    ├── share_with_coursemates.md
    ├── arrange.md
    ├── filter.md
    ├── mutate.md
    ├── project_work.md
    ├── select.md
    ├── make_it_sparkle.md
    ├── pick_a_theme_solution.md
    ├── make_a_data_set.md
    ├── base_operations.md
    ├── additional_ideas_for_projects.md
    ├── group_by_summarize.md
    ├── setting_the_extents.md
    ├── keyboard_shortcuts.md
    ├── demo_of_project_outcomes.md
    ├── restructure_codebase.md
    ├── make_scatter_plots.md
    ├── alternatives_to_rstudio.md
    ├── make_line_plots.md
    ├── grouped_data.md
    ├── be_your_own_linter.md
    ├── join_and_merge.md
    ├── read_about_plots.md
    ├── r_markdown.md
    ├── rstudio_cheatsheet.md
    ├── goals_of_the_project.md
    ├── control_flow.md
    ├── tidying_arranging_and_summarizing.md
    ├── working_with_rstudio.md
    ├── how_to_summarise.md
    ├── issuing_code_and_reading_output.md
    ├── review_code.md
    ├── installing_r.md
    ├── be_your_own_linter_solution.md
    ├── review_code_solution.md
    ├── pick_a_theme.md
    ├── installing_rstudio.md
    ├── ucb_datahub.md
    ├── issuing_code_and_reading_output_solution.md
    ├── make_bar_plots_solution.md
    ├── make_bar_plots.md
    ├── make_line_plots_solution.md
    ├── make_it_sparkle_solution.md
    ├── additional_features.md
    ├── select_solution.md
    ├── summarize.md
    ├── make_a_data_set_solution.md
    ├── mini_project.md
    ├── base_operations_solution.md
    ├── introduction_to_space_data.md
    ├── how_to_summarise_solution.md
    ├── summarize_solution.md
    ├── make_scatter_plots_solution.md
    ├── working_with_rstudio_solution.md
    └── mutate_solution.md
├── code
    ├── nytimes_facet_plot.png
    ├── squirrel_fur_color.pdf
    ├── squirrel_fur_color.png
    ├── 4.1.2_creating_vectors.R
    ├── 4.3.1_c.R
    ├── grouped_data_solution_files
    │   └── figure-gfm
    │   │   ├── ungrouped plot-1.png
    │   │   ├── unnamed-chunk-1-1.png
    │   │   ├── unnamed-chunk-2-1.png
    │   │   ├── unnamed-chunk-3-1.png
    │   │   ├── unnamed-chunk-4-1.png
    │   │   └── unnamed-chunk-5-1.png
    ├── make_it_sparkle_solution_files
    │   └── figure-gfm
    │   │   └── coding task-1.png
    ├── make_scatter_plots_solution_files
    │   └── figure-gfm
    │   │   ├── age plot-1.png
    │   │   ├── basic plot-1.png
    │   │   ├── feeling blue-1.png
    │   │   ├── fur color plot-1.png
    │   │   └── non euclidian space-1.png
    ├── make_bar_plots_solution_files
    │   └── figure-gfm
    │   │   └── unnamed-chunk-1-1.png
    ├── additional_features_solution_files
    │   └── figure-gfm
    │   │   ├── the old plot-1.png
    │   │   └── unnamed-chunk-2-1.png
    ├── how_to_summarise_solution_files
    │   └── figure-gfm
    │   │   ├── unnamed-chunk-1-1.png
    │   │   └── unnamed-chunk-2-1.png
    ├── make_line_plots_solution_files
    │   └── figure-gfm
    │   │   ├── unnamed-chunk-1-1.png
    │   │   └── unnamed-chunk-2-1.png
    ├── working_with_rstudio_solution_files
    │   └── figure-gfm
    │   │   └── unnamed-chunk-1-1.png
    ├── 4.1.4_sequences_and_repeats.R
    ├── 3.17.1_arrange.R
    ├── make_squirrels_subset.R
    ├── 4.7.2_joins_and_merges.R
    ├── 3.20.1_mutate.R
    ├── 3.1.3_geom_bar.R
    ├── 3.3.1_geom__.R
    ├── 3.16.6_filter.R
    ├── 4.1.5_r_.R
    ├── 3.19_select.R
    ├── 3.22_summarise.R
    ├── 3.5.2_facet_.R
    ├── 4.13.1_show_hide_comment.Rmd
    ├── arrange.Rmd
    ├── select.Rmd
    ├── 3.25_group_by.R
    ├── 3.11.1_controlling_plot_extents.R
    ├── 3.13.1_setting_the_theme.R
    ├── mutate.Rmd
    ├── select_solution.Rmd
    ├── make_a_data_set.Rmd
    ├── group_by_summarize.Rmd
    ├── make_line_plots.R
    ├── code_for_videos.R
    ├── filter.Rmd
    ├── make_line_plots_solution.Rmd
    ├── arrange_solution.Rmd
    ├── make_bar_plots.Rmd
    ├── make_bar_plots_solution.Rmd
    ├── make_bar_plots_solution.md
    ├── make_it_sparkle.Rmd
    ├── pick_a_theme.Rmd
    ├── mutate_solution.Rmd
    ├── filter_solution.Rmd
    ├── make_line_plots_solution.md
    ├── make_scatter_plots.R
    ├── make_it_sparkle_solution.Rmd
    ├── make_it_sparkle_solution.md
    ├── make_a_data_set_solution.Rmd
    ├── base_operations.Rmd
    ├── select_solution.md
    ├── how_to_summarise.Rmd
    ├── working_with_rstudio.R
    ├── how_to_summarise_solution.Rmd
    ├── working_with_rstudio_solution.Rmd
    ├── make_scatter_plots_solution.Rmd
    ├── summarize.Rmd
    ├── summarize_solution.Rmd
    ├── group_by_summarize_solution.Rmd
    ├── make_a_data_set_solution.md
    ├── base_operations_solution.Rmd
    ├── base_operations_solution.md
    ├── how_to_summarise_solution.md
    ├── summarize_solution.md
    ├── grouped_data.Rmd
    ├── make_scatter_plots_solution.md
    ├── working_with_rstudio.md
    ├── working_with_rstudio_solution.md
    ├── mutate_solution.md
    ├── additional_plot_features.Rmd
    ├── grouped_data_solution.Rmd
    └── code_in_videos.R
├── resources
    └── cheatsheet-rstudio_ide.pdf
├── README.md
└── .gitignore


/docs/images/Icon:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/reading_calls/Icon:
--------------------------------------------------------------------------------
1 | Blank Code (to be replaced when class is executed) for Icon
2 | 


--------------------------------------------------------------------------------
/code/nytimes_facet_plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/nytimes_facet_plot.png


--------------------------------------------------------------------------------
/code/squirrel_fur_color.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/squirrel_fur_color.pdf


--------------------------------------------------------------------------------
/code/squirrel_fur_color.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/squirrel_fur_color.png


--------------------------------------------------------------------------------
/docs/images/south_hall.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/docs/images/south_hall.jpg


--------------------------------------------------------------------------------
/docs/images/ischool_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/docs/images/ischool_logo.png


--------------------------------------------------------------------------------
/reading_calls/load_data_solution.md:
--------------------------------------------------------------------------------
1 | # Load Data Solution 
2 | 
3 | Well, there wasn't anything to do. :100: :tada:
4 | 


--------------------------------------------------------------------------------
/reading_calls/project_work_solution.md:
--------------------------------------------------------------------------------
1 | Blank Code (to be replaced when class is executed) for project_work_solution.md
2 | 


--------------------------------------------------------------------------------
/resources/cheatsheet-rstudio_ide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/resources/cheatsheet-rstudio_ide.pdf


--------------------------------------------------------------------------------
/reading_calls/images/r_bridge_datahub.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/images/r_bridge_datahub.png


--------------------------------------------------------------------------------
/reading_calls/restructure_codebase_solution.md:
--------------------------------------------------------------------------------
1 | # Solutions 
2 | 
3 | Because we just read this semester, there isn't a solution here. 
4 | 


--------------------------------------------------------------------------------
/code/4.1.2_creating_vectors.R:
--------------------------------------------------------------------------------
1 | x <- seq(from = 10, to = 40, by = 10)  # x is the vector 10, 20, 30, 40
2 | y <- seq(from = 1, to = 4, by = 1)^2  # the sequence 1..4 squared element-wise: 1, 4, 9, 16
3 | 
4 | x - y  # element-wise difference of the two length-4 vectors
5 | 
6 | mean(x)  # arithmetic mean of x
7 | 


--------------------------------------------------------------------------------
/code/4.3.1_c.R:
--------------------------------------------------------------------------------
1 | animals <- c('aarvark', 'baboon', 'cheetah', 'duck')  # character vector built with c(); NOTE(review): 'aarvark' looks like a typo for 'aardvark' -- confirm
2 | size    <- c(2, 2, 3, 1)  # numeric vector with one value per entry of animals
3 | size  # evaluating a bare name at top level shows its value
4 | 
5 | data.frame(  # combine the two equal-length vectors into a data frame with columns animals and size
6 |   animals, 
7 |   size
8 | )


--------------------------------------------------------------------------------
/docs/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-minimal
2 | title: r_bridge
3 | description: Course website for the R Bridge course
4 | logo: ./images/south_hall.jpg
5 | 


--------------------------------------------------------------------------------
/reading_calls/choose_a_plot_type.md:
--------------------------------------------------------------------------------
1 | # Choosing A Plot Type 
2 | 
3 | For this semester, we're not asking you to code anything at this point! Keep on keeping on! 
4 | 


--------------------------------------------------------------------------------
/code/grouped_data_solution_files/figure-gfm/ungrouped plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/grouped_data_solution_files/figure-gfm/ungrouped plot-1.png


--------------------------------------------------------------------------------
/code/grouped_data_solution_files/figure-gfm/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/grouped_data_solution_files/figure-gfm/unnamed-chunk-1-1.png


--------------------------------------------------------------------------------
/code/grouped_data_solution_files/figure-gfm/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/grouped_data_solution_files/figure-gfm/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/code/grouped_data_solution_files/figure-gfm/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/grouped_data_solution_files/figure-gfm/unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/code/grouped_data_solution_files/figure-gfm/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/grouped_data_solution_files/figure-gfm/unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/code/grouped_data_solution_files/figure-gfm/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/grouped_data_solution_files/figure-gfm/unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/code/make_it_sparkle_solution_files/figure-gfm/coding task-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_it_sparkle_solution_files/figure-gfm/coding task-1.png


--------------------------------------------------------------------------------
/code/make_scatter_plots_solution_files/figure-gfm/age plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_scatter_plots_solution_files/figure-gfm/age plot-1.png


--------------------------------------------------------------------------------
/code/make_bar_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_bar_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png


--------------------------------------------------------------------------------
/code/make_scatter_plots_solution_files/figure-gfm/basic plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_scatter_plots_solution_files/figure-gfm/basic plot-1.png


--------------------------------------------------------------------------------
/code/additional_features_solution_files/figure-gfm/the old plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/additional_features_solution_files/figure-gfm/the old plot-1.png


--------------------------------------------------------------------------------
/code/how_to_summarise_solution_files/figure-gfm/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/how_to_summarise_solution_files/figure-gfm/unnamed-chunk-1-1.png


--------------------------------------------------------------------------------
/code/how_to_summarise_solution_files/figure-gfm/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/how_to_summarise_solution_files/figure-gfm/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/code/make_line_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_line_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png


--------------------------------------------------------------------------------
/code/make_line_plots_solution_files/figure-gfm/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_line_plots_solution_files/figure-gfm/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/code/make_scatter_plots_solution_files/figure-gfm/feeling blue-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_scatter_plots_solution_files/figure-gfm/feeling blue-1.png


--------------------------------------------------------------------------------
/code/make_scatter_plots_solution_files/figure-gfm/fur color plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_scatter_plots_solution_files/figure-gfm/fur color plot-1.png


--------------------------------------------------------------------------------
/code/additional_features_solution_files/figure-gfm/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/additional_features_solution_files/figure-gfm/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/code/make_scatter_plots_solution_files/figure-gfm/non euclidian space-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/make_scatter_plots_solution_files/figure-gfm/non euclidian space-1.png


--------------------------------------------------------------------------------
/code/working_with_rstudio_solution_files/figure-gfm/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/code/working_with_rstudio_solution_files/figure-gfm/unnamed-chunk-1-1.png


--------------------------------------------------------------------------------
/reading_calls/grouped_data_solution_files/figure-gfm/ungrouped plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/grouped_data_solution_files/figure-gfm/ungrouped plot-1.png


--------------------------------------------------------------------------------
/reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-1-1.png


--------------------------------------------------------------------------------
/reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/grouped_data_solution_files/figure-gfm/unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/reading_calls/make_it_sparkle_solution_files/figure-gfm/coding task-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_it_sparkle_solution_files/figure-gfm/coding task-1.png


--------------------------------------------------------------------------------
/reading_calls/make_scatter_plots_solution_files/figure-gfm/age plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_scatter_plots_solution_files/figure-gfm/age plot-1.png


--------------------------------------------------------------------------------
/reading_calls/make_scatter_plots_solution_files/figure-gfm/basic plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_scatter_plots_solution_files/figure-gfm/basic plot-1.png


--------------------------------------------------------------------------------
/reading_calls/make_bar_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_bar_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png


--------------------------------------------------------------------------------
/reading_calls/make_line_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_line_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png


--------------------------------------------------------------------------------
/reading_calls/make_line_plots_solution_files/figure-gfm/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_line_plots_solution_files/figure-gfm/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/reading_calls/make_scatter_plots_solution_files/figure-gfm/feeling blue-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_scatter_plots_solution_files/figure-gfm/feeling blue-1.png


--------------------------------------------------------------------------------
/reading_calls/additional_features_solution_files/figure-gfm/the old plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/additional_features_solution_files/figure-gfm/the old plot-1.png


--------------------------------------------------------------------------------
/reading_calls/how_to_summarise_solution_files/figure-gfm/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/how_to_summarise_solution_files/figure-gfm/unnamed-chunk-1-1.png


--------------------------------------------------------------------------------
/reading_calls/how_to_summarise_solution_files/figure-gfm/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/how_to_summarise_solution_files/figure-gfm/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/reading_calls/make_scatter_plots_solution_files/figure-gfm/fur color plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_scatter_plots_solution_files/figure-gfm/fur color plot-1.png


--------------------------------------------------------------------------------
/reading_calls/additional_features_solution_files/figure-gfm/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/additional_features_solution_files/figure-gfm/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/reading_calls/working_with_rstudio_solution_files/figure-gfm/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/working_with_rstudio_solution_files/figure-gfm/unnamed-chunk-1-1.png


--------------------------------------------------------------------------------
/reading_calls/make_scatter_plots_solution_files/figure-gfm/non euclidian space-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UCB-MIDS/r_bridge/HEAD/reading_calls/make_scatter_plots_solution_files/figure-gfm/non euclidian space-1.png


--------------------------------------------------------------------------------
/reading_calls/setting_the_extents_solution.md:
--------------------------------------------------------------------------------
1 | # Setting the Extents Solution 
2 | 
3 | Because you haven't coded anything in this part, there isn't a solution to dig into. 
4 | 
5 | > A rolling stone gathers no moss. 
6 | 


--------------------------------------------------------------------------------
/reading_calls/choose_a_plot_type_solution.md:
--------------------------------------------------------------------------------
1 | # Choose A Plot Type 
2 | 
3 | Because you haven't coded anything at this point -- I suppose that means that your code is correct? Or, is it a divide by zero error? :thinking:
4 | 


--------------------------------------------------------------------------------
/reading_calls/join_and_merge_solution.md:
--------------------------------------------------------------------------------
1 | # Joins and Merges Solutions 
2 | 
3 | Because this was mostly a reading exercise, there's nothing to provide a solution for right now. _Join us_ in moving forward. :expressionless:
4 | 


--------------------------------------------------------------------------------
/reading_calls/load_data.md:
--------------------------------------------------------------------------------
1 | # Load Data 
2 | 
3 | - For now, we didn't think that there was enough to do to actually practice loading data. 
4 | - Keep these concepts in mind for when you come into w203 or your other enterprises. 
5 | 
6 | 


--------------------------------------------------------------------------------
/code/4.1.4_sequences_and_repeats.R:
--------------------------------------------------------------------------------
 1 | one_to_ten <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)  # the integers 1..10 written out by hand
 2 | 
 3 | on_to_ten_two <- 10:1  # colon operator; as written this counts DOWN from 10 to 1. NOTE(review): name looks like a typo for one_to_ten_two -- confirm
 4 | 
 5 | seq(from = 11, to = 20, by = 2)  # 11, 13, 15, 17, 19 -- stops at the last value not exceeding `to`
 6 | seq(from = 1, to = 2, by = 0.11)  # 1.00, 1.11, ..., 1.99 -- the step does not land exactly on 2
 7 | 
 8 | letter_vector <- ''  # a length-one character vector holding the empty string
 9 | 
10 | rep(c('a', 'b', 'c', 'd'), each = 2)  # each element repeated in place: a a b b c c d d
11 | 
12 | rep(1:10, times = 1:10)  # vectorised `times`: value i is repeated i times (1 2 2 3 3 3 ...)
13 | 


--------------------------------------------------------------------------------
/reading_calls/call_to_reading.md:
--------------------------------------------------------------------------------
1 | # Reading about Base Methods 
2 | 
3 | Please read the following chapter about **Vectors** in _R For Data Science_: 
4 | 
5 | - If you're reading the print copy, read Chapter 16. 
6 | - If you're reading the digital copy, read [Chapter 20](https://r4ds.had.co.nz/vectors.html). 
7 | 


--------------------------------------------------------------------------------
/reading_calls/share_with_coursemates.md:
--------------------------------------------------------------------------------
1 | # Share with Coursemates 
2 | 
3 | If you want to share your work with coursemates, but in a way that is less permanent than posting it to the class page here in ISVC, there's a Slack channel that you can use!
4 | 
5 | In the I School Slack, it is `#r_bridge_showcase`. 
6 | 


--------------------------------------------------------------------------------
/reading_calls/arrange.md:
--------------------------------------------------------------------------------
1 | # Arrange 
2 | 
3 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `arrange.Rmd`. 
4 | - This code will ask you to arrange the rows of the space launches data. 
5 | 


--------------------------------------------------------------------------------
/reading_calls/filter.md:
--------------------------------------------------------------------------------
1 | # Filter
2 | 
3 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `filter.Rmd`. 
4 | - This code will ask you to filter the rows of the agencies data based on some criteria.  
5 | 


--------------------------------------------------------------------------------
/reading_calls/mutate.md:
--------------------------------------------------------------------------------
1 | # Mutate
2 | 
3 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `mutate.Rmd`. 
4 | - This code will ask you to mutate new columns of data from the data that is provided in the space data. 
5 | 


--------------------------------------------------------------------------------
/reading_calls/project_work.md:
--------------------------------------------------------------------------------
1 | # Project Work 
2 | 
3 | If you're going to work on a project, try to keep it limited to the work that we've covered to this point in 1C, or perhaps stretching just beyond what we've covered. 
4 | 
5 | You'll want to be careful that you don't burn out on this project in a way that will take away from w203 that's coming up next! 
6 | 


--------------------------------------------------------------------------------
/reading_calls/select.md:
--------------------------------------------------------------------------------
1 | # Select
2 | 
3 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `select.Rmd`. 
4 | - This code will ask you to select a subset of columns from the space launches data whose rows you've arranged. 
5 | 


--------------------------------------------------------------------------------
/reading_calls/make_it_sparkle.md:
--------------------------------------------------------------------------------
1 | # Make it Sparkle 
2 | 
3 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `make_it_sparkle.Rmd`. 
4 | - This code will ask you to produce descriptive labels on the plot that we've been working with. 
5 | 
6 | 
7 | 


--------------------------------------------------------------------------------
/code/3.17.1_arrange.R:
--------------------------------------------------------------------------------
1 | library(tidyverse)
2 | # Demo of arrange(): sort the agencies table and inspect the result.
3 | agencies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
4 | launches <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
5 | # Sort rows by state_code in descending order, then open the sorted table in the data viewer.
6 | agencies %>% 
7 |   arrange(desc(state_code)) %>%  
8 |   View()
9 | 


--------------------------------------------------------------------------------
/reading_calls/pick_a_theme_solution.md:
--------------------------------------------------------------------------------
1 | # Pick a Theme Solution 
2 | 
3 | There isn't really a **solution** to this, and up until this point in the course we've been using my favorite theme -- `theme_minimal()`. 
4 | 
5 | I like that it uses a sans-serif font in the plot, that it doesn't use more ink than it needs to, and that the grid lines that are internal to the plot are muted. 
6 | 
7 | Plot on! :metal:
8 | 


--------------------------------------------------------------------------------
/code/make_squirrels_subset.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | # Build the small squirrels data set (squirrels_subset.csv) from the full Tidy Tuesday file.
 3 | nyc_squirrels <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-29/nyc_squirrels.csv")
 4 | # Keep six columns, drop rows with any missing value, then draw 1000 rows at random (no seed is set, so each run differs).
 5 | ss <- nyc_squirrels %>%
 6 |   select(c('long', 'lat', 'hectare', 'date', 'age', 'primary_fur_color')) %>%
 7 |   drop_na() %>%
 8 |   sample_n(1000)
 9 | # The output file is passed positionally: readr 1.4.0 renamed this argument from `path` to `file`, and the position works with both.
10 | write_csv(ss, './squirrels_subset.csv')


--------------------------------------------------------------------------------
/reading_calls/make_a_data_set.md:
--------------------------------------------------------------------------------
1 | # Heading Back to the Farm 
2 | 
3 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `make_a_data_set.Rmd`.
4 | - When you're there, bring your answers from the last coding exercise (where you were making all the animals and their weight and feeds) into the file.  
5 | 


--------------------------------------------------------------------------------
/reading_calls/base_operations.md:
--------------------------------------------------------------------------------
1 | # Heading to the Little Farm 
2 | 
3 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `base_operations.Rmd`. 
4 | - This code will ask you to create data of many forms!
5 | - There isn't really a point -- there's no analytic task that I'm asking for, just making data. 
6 | 


--------------------------------------------------------------------------------
/reading_calls/additional_ideas_for_projects.md:
--------------------------------------------------------------------------------
1 | # Additional Ideas for Projects
2 | 
3 | One other place that you might look for early ideas for projects is the MIDS Capstone Showcase pages. 
4 | 
5 | These projects are universally **great** and some of them might have data that is useful. As well, folks in the community are almost always happy to talk about their work and share what they've done. 
6 | 
7 | https://www.ischool.berkeley.edu/programs/mids/capstone
8 | 


--------------------------------------------------------------------------------
/code/4.7.2_joins_and_merges.R:
--------------------------------------------------------------------------------
 1 | data_one <- data.frame( # left table, keyed by key_id
 2 |   key_id = c('a', 'b', 'c', 'd'), 
 3 |   variable_one = 1:4, 
 4 |   variable_two = (1:4)^2
 5 | )
 6 | # Right table: its key column has a different name (id_key) and shares only 'a', 'b', 'c' with data_one.
 7 | data_two <- data.frame(
 8 |   id_key = c('a', 'b', 'c', 'e'), 
 9 |   variable_a = c('apple', 'bananna', 'cantalope', 'durian'),
10 |   variable_b = c('zebra', 'yak', 'gnu', 'wombat')
11 | )
12 | # Merge on the differently named keys; all.x and all.y = TRUE keep unmatched rows from both tables, filled with NA.
13 | d <- merge(
14 |   x = data_one, y = data_two, 
15 |   by.x = 'key_id', by.y = 'id_key', 
16 |   all.x = TRUE, all.y = TRUE
17 |   )


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # R Bridge Course
2 | 
3 | Welcome! This is the course repository for the R Bridge course. This course is a non-graded, introductory course that we have built for students who are beginning the Masters of Information and Data Science program in the UC Berkeley School of Information. 
4 | 
5 | The code for the course is in this repository. The materials for the course website, including a syllabus and schedule, are in the `./docs` folder, and on the course website, linked in the description. 
6 | 


--------------------------------------------------------------------------------
/reading_calls/group_by_summarize.md:
--------------------------------------------------------------------------------
1 | # Group-By Summarize
2 | 
3 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `group_by_summarize.Rmd`. 
4 | - This code will ask you to group, and summarize data. 
5 | - The last question in the set might be a little tricky. The answer that you're shooting for is that the Soviet Union has the most variance in their per-year launches. 
6 | 
7 | 


--------------------------------------------------------------------------------
/code/3.20.1_mutate.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | # Demo of mutate(): derive new columns from the agencies launch counts.
 3 | agencies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
 4 | launches <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
 5 | # Add base-10 and natural logs of count; the third column reuses count_log_e created just above it in the same mutate().
 6 | agencies %>%  
 7 |   mutate(
 8 |     count_log_10 = log10(count), 
 9 |     count_log_e  = log(count), 
10 |     count_log_e_10 = count_log_e + 10) %>%  
11 |   select(agency, contains('count')) %>%  # keep agency plus every column whose name contains 'count'
12 |   view()
13 | 


--------------------------------------------------------------------------------
/reading_calls/setting_the_extents.md:
--------------------------------------------------------------------------------
 1 | # Setting the Extents 
 2 | 
 3 | Setting the plot extents is one of the strangest things within the `ggplot` plotting idiom. 
 4 | 
 5 | As we've noted in lecture, there are two options: 
 6 | 
 7 | 1. `coord_cartesian()`
 8 | 2. `lims()` 
 9 | 
10 | Rather than taking the time to code these at this point, just keep in mind that there are two options. If, in the future you're setting the extents and get an error message that some data has been dropped, `ggplot` will also let you know what the alternative is. 
11 | 


--------------------------------------------------------------------------------
/reading_calls/keyboard_shortcuts.md:
--------------------------------------------------------------------------------
1 | # Keyboard Shortcuts 
2 | 
3 | The Rstudio team maintains a really great series of one-page resources for many of the major projects in R and Rstudio. They are located on this [cheatsheets](https://rstudio.com/resources/cheatsheets/) website.
4 | 
5 | - Particularly relevant at this point is the cheatsheet that describes how to interact with the [Rstudio IDE](https://github.com/rstudio/cheatsheets/blob/main/rstudio-ide.pdf). 
6 | - We have also saved a frozen copy of this cheatsheet in the course repo (./resources/cheatsheet-rstudio_ide.pdf).
7 | 


--------------------------------------------------------------------------------
/code/3.1.3_geom_bar.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | library(ggplot2)
 3 | library(patchwork)
 4 | # Demo: two ways to draw the same bar chart of squirrel counts by fur color.
 5 | squirrel_subset <- read.csv('squirrels_subset.csv')
 6 | # Pre-compute one row per fur color with its row count.
 7 | squirrel_subset_by_color <- squirrel_subset %>%  
 8 |   group_by(primary_fur_color) %>%  
 9 |   summarise(count_by_color = n())
10 | # geom_col(): bar heights come from the pre-computed count_by_color column.
11 | plot_col <- squirrel_subset_by_color %>%  
12 |   ggplot() + 
13 |   aes(x = primary_fur_color, y = count_by_color) + 
14 |   geom_col()
15 | # geom_bar(): ggplot counts the rows per fur color itself, so no y aesthetic is given.
16 | plot_bar <- squirrel_subset %>%  
17 |   ggplot() + 
18 |   aes(x = primary_fur_color) + 
19 |   geom_bar()
20 | # patchwork's `|` places the two plots side by side.
21 | plot_col | plot_bar
22 | 


--------------------------------------------------------------------------------
/code/3.3.1_geom__.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | library(ggplot2)
 3 | library(patchwork)
 4 | # Demo: several geoms applied to the same squirrel location data.
 5 | squirrel_subset <- read.csv('squirrels_subset.csv')
 6 | # Scatter plot of sighting locations (stored in a variable; not printed by this script).
 7 | squirrel_scatter <- squirrel_subset %>%
 8 |   ggplot() +
 9 |   aes(x = long, y = lat) +
10 |   geom_point()
11 | # Histogram of longitude.
12 | squirrel_long_histogram <- squirrel_subset %>%
13 |   ggplot() +
14 |   aes(x = long) +
15 |   geom_histogram()
16 | # Density estimate of longitude.
17 | squirrel_long_density <- squirrel_subset %>%
18 |   ggplot() +
19 |   aes(x = long) +
20 |   geom_density()
21 | # patchwork's `/` stacks the histogram above the density plot.
22 | squirrel_long_histogram / squirrel_long_density
23 | 
24 | # geom_   <- kept only as a comment: a bare `geom_` is not an object, so evaluating it stopped the script with "object 'geom_' not found".
25 | 


--------------------------------------------------------------------------------
/reading_calls/demo_of_project_outcomes.md:
--------------------------------------------------------------------------------
1 | # Demo of Project Outcomes 
2 | 
3 | This is the first semester that MIDS 1C has been in existence, and so there aren't any outcomes to demo just yet. 
4 | 
5 | However, if you work on a project, and you want to show it off for people in the future, let me know (@alex.h). What I'll do is bring in your knitted .md file and link it on this page. 
6 | 
7 | I really, really, really encourage you to share your code, because showing people what all kinds of good, bad, and ugly look like is useful. Plus, it will be a fun record to look back on when you're graduating. 
8 | 


--------------------------------------------------------------------------------
/reading_calls/restructure_codebase.md:
--------------------------------------------------------------------------------
1 | # Reading about codebase structure 
2 | 
3 | Rather than restructuring a codebase, this semester I'd like you to read about how Cookiecutter data science thinks that a project might be structured. 
4 | 
5 | https://drivendata.github.io/cookiecutter-data-science/
6 | 
7 | While I think that this is a starting place, it is overkill for small projects. As you come into w203, and in particular the _Hypothesis Testing_ and _Final Lab_, this structure will help you to be able to build a working pipeline that builds toward clean data that you can write clean tests on. 
8 | 


--------------------------------------------------------------------------------
/code/3.16.6_filter.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | # Demo of filter(): the same kind of row selection written as one call and as a pipeline.
 3 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
 4 | launches <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
 5 | 
 6 | # filter 
 7 | # Comma-separated conditions must all hold: launch_year strictly between 1966 and 1968, and agency 'US'.
 8 | filter(launches, launch_year < 1968, launch_year > 1966, agency == 'US')
 9 | 
10 | # magrittr: %>%
11 | # Same year window as above, applied one filter() at a time; the last step keeps agency 'US' or 'SU'.
12 | launches %>%  
13 |   filter(launch_year < 1968) %>%  
14 |   filter(launch_year > 1966) %>%  
15 |   filter(agency == 'US' | agency == 'SU')
16 |   
17 | 
18 | 


--------------------------------------------------------------------------------
/code/4.1.5_r_.R:
--------------------------------------------------------------------------------
 1 | # population average is 100
 2 | # population sd is 20
 3 | # Draw 42 values from that normal population.
 4 | rnorm(n = 42, mean = 100, sd = 20)
 5 | 
 6 | # Number of draws: 100
 7 | # Population average: 42
 8 | # Population variance: 100 (so sd = 10)
 9 | 
10 | first_draw <- rnorm(n = 100, mean = 42, sd = 10)
11 | second_draw <- rnorm(n = 100, mean = 42, sd = 10)
12 | # Share of positions where the two independent draws are exactly equal.
13 | mean(first_draw == second_draw)
14 | # 1000 uniform draws between -1 and 9, shown as a histogram.
15 | draws <- runif(n = 1000, min = -1, max = 9)
16 | hist(draws, col = 'black')
17 | 
18 | 
19 | # Drawing all 3 balls without replacement returns the urn in a random order.
20 | urn <- c('red_ball', 'blue_ball', 'green_ball')
21 | sample(x = urn, size = 3, replace = FALSE)
22 | # With only x supplied, sample() returns a random permutation of x.
23 | lett <- c('a', 'b', 'c', 'd', 'e', 'f')
24 | sample(lett)
25 | 
26 | 


--------------------------------------------------------------------------------
/code/3.19_select.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | # Demo of select(): choose columns by name and by name pattern.
 3 | agencies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
 4 | launches <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
 5 | # Keep four named columns, restrict to launch years strictly between 1968 and 1972, and sort newest year first.
 6 | launches %>%  
 7 |   select(launch_year, launch_date, agency, agency_type) %>% 
 8 |   filter(launch_year > 1968 & launch_year < 1972) %>%  
 9 |   arrange(desc(launch_year)) %>% 
10 |   view()
11 | # Keep every column whose name contains 'agency', followed by every column whose name contains 'launch'.
12 | launches %>%  
13 |   select(contains('agency'), contains('launch')) %>%  
14 |   view()
15 | 


--------------------------------------------------------------------------------
/code/3.22_summarise.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | # Demo of summarize(): collapse the agencies table into summary values, contrasted with mutate().
 3 | agencies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
 4 | launches <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
 5 | 
 6 | # mutate() adds one log value per row.
 7 | agencies %>%
 8 |   mutate(count_log = log(count)) %>%
 9 |   view()
10 | 
11 | # number_of_agencies counts distinct values of the `agency` column. The previous
12 | # `unique(agencies)` referred to the whole data frame, so length() returned its number of columns.
13 | # NOTE(review): count_log = log(count) yields one value per row, not a single summary value;
14 | # presumably kept to contrast with mutate() above -- confirm, since dplyr treats non-scalar summaries as an error or a deprecation depending on version.
15 | agencies %>%
16 |   summarize(
17 |     average_launches = mean(count),
18 |     var_launches = var(count),
19 |     number_of_agencies = length(unique(agency)),
20 |     count_log = log(count)
21 |   )
22 | 


--------------------------------------------------------------------------------
/code/3.5.2_facet_.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | library(ggplot2)
 3 | library(patchwork)
 4 | # Demo of facet_wrap(): one histogram of sighting dates per primary fur color.
 5 | squirrel_subset <- read.csv('squirrels_subset.csv')
 6 | # Parse `date` (month-day-year digits, format '%m%d%Y') through the as.Date() generic; as.character() makes the input type explicit however read.csv() parsed the column.
 7 | squirrel_subset <- squirrel_subset %>%
 8 |   mutate(
 9 |     date_f = as.Date(as.character(date), format = '%m%d%Y')
10 |   )
11 | # Row counts per fur color (computed here but not used by the plot below).
12 | squirrel_subset_by_color <- squirrel_subset %>%
13 |   group_by(primary_fur_color) %>%
14 |   summarise(count_by_color = n())
15 | # Histogram of sighting dates, split into one panel per fur color, arranged in 3 rows.
16 | squirrel_subset %>%
17 |   ggplot() +
18 |   aes(x = date_f) +
19 |   geom_histogram() +
20 |   facet_wrap(vars(primary_fur_color), nrow = 3)
21 | # No grouping variables are supplied here, so nothing is grouped.
22 | squirrel_subset %>%
23 |   group_by()
24 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .Rproj.user
 2 | .Rhistory
 3 | .RData
 4 | .Ruserdata
 5 | Icon
 6 | 
 7 | # General
 8 | .DS_Store
 9 | .AppleDouble
10 | .LSOverride
11 | 
12 | # Icon must end with two \r
13 | Icon
14 | Icon\r\r
15 | Icon\?
16 | Icon?
17 | 
18 | # Thumbnails
19 | ._*
20 | 
21 | # Files that might appear in the root of a volume
22 | .DocumentRevisions-V100
23 | .fseventsd
24 | .Spotlight-V100
25 | .TemporaryItems
26 | .Trashes
27 | .VolumeIcon.icns
28 | .com.apple.timemachine.donotpresent
29 | 
30 | # Directories potentially created on remote AFP share
31 | .AppleDB
32 | .AppleDesktop
33 | Network Trash Folder
34 | Temporary Items
35 | .apdisk
36 | Icon\r\r


--------------------------------------------------------------------------------
/code/4.13.1_show_hide_comment.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Show, Hide, Comment"
 3 | author: "Alex Hughes"
 4 | date: "6/7/2020"
 5 | output: html_document
 6 | ---
 7 | 
 8 | ```{r setup, include=FALSE}
 9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | 
12 | # Show 
13 | 
14 | - Code that loads data (always) 
15 | - Code that estimates models (always)
16 | - A Dump of Data (never)
17 | 
18 | # Hide 
19 | 
20 | - Cleaning code (unless that is the review that you're looking for)
21 | - Issues and notes to yourself 
22 | 
23 | # Comment 
24 | 
25 | - Documentation for functions, in the function
26 | - Any knowledge that someone *running* the code might need. 


--------------------------------------------------------------------------------
/reading_calls/make_scatter_plots.md:
--------------------------------------------------------------------------------
1 | # Make Scatter Plots 
2 | 
3 | Ok! Now it is your turn. I've just talked you through how I might make scatter plots that represent features about these squirrels. Now, you go and do the same. 
4 | 
5 | - Navigate to the  [UCB Datahub]( https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master)
6 | - Open the file `make_scatter_plots.R` and create the same plots that I've just created in the lectures.  
7 | - While you should feel welcome to look back at the lecture content, I'll try to provide enough scaffolding that you can write the code without needing to return to the lecture. 
8 | 


--------------------------------------------------------------------------------
/reading_calls/alternatives_to_rstudio.md:
--------------------------------------------------------------------------------
 1 | # Alternatives to Rstudio 
 2 | 
 3 | Just as jupyter has been the standard method of working with python for data science, Rstudio has emerged as the standard editor of working with R for data science. However, Rstudio is far from the only option. 
 4 | 
 5 | Of particular note for alternatives: 
 6 | 
 7 | - Emacs users will be **smugly** satisfied that ESS was purpose built for interactive programming in R. It is very, very good if you want to use a general purpose editor for writing your R code. 
 8 | - Microsoft’s VSCode has several really impressive extensions for working with the language as well. That isn't too surprising, because the primary development of the R language is now underwritten in part by Microsoft.  
 9 | 
10 | 


--------------------------------------------------------------------------------
/code/arrange.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Arrange'
 3 | author: 'w203: Statistics for Data Science'
 4 | date: "8/13/2020"
 5 | output: github_document
 6 | ---
 7 | 
 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 9 | library(tidyverse)
10 | ```
11 | 
12 | ```{r load data, message=FALSE}
13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
15 | ```
16 | 
17 | # Task 
18 | 
19 | Using the `launches` data, the `arrange()` verb, and the `head()` verb: 
20 | 
21 | - Print the earliest launches
22 | 
23 | ```{r}
24 | launches %>% 
25 |   arrange(launch_date) %>% 
26 |   head()
27 | ```
28 | 
29 | - Still using the head verb, use the `desc()` adverb to print the *most recent* launches
30 | 
31 | ```{r}
32 | 
33 | ```
34 | 
35 | Adding in the `state_code` variable, 
36 | 
37 | - Arrange the data so that it is sorted first by `launch_year` and then by `state_code`
38 | 
39 | ```{r}
40 | 
41 | ```


--------------------------------------------------------------------------------
/reading_calls/make_line_plots.md:
--------------------------------------------------------------------------------
1 | # Make Line Plots 
2 | 
3 | Ok! Now it is your turn again. I've just talked you through how I might make line plots that represent connections through time. Now, you go and do the same. Like last time, I'll provide you with most of the boilerplate code, and you can do the work to actually write the line plots out.  
4 | 
5 | - Navigate to the  [UCB Datahub]( https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master)
6 | - Open the file `make_line_plots.R` and create the same plots that I've just created in the lectures.  
7 | - While you should feel welcome to look back at the lecture content, I'll try to provide enough scaffolding that you can write the code without needing to return to the lecture. 
8 | 


--------------------------------------------------------------------------------
/code/select.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Select'
 3 | author: 'w203: Statistics for Data Science'
 4 | date: "8/13/2020"
 5 | output: github_document
 6 | ---
 7 | 
 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 9 | library(tidyverse)
10 | ```
11 | 
12 | ```{r load data, message=FALSE}
13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
15 | ```
16 | 
17 | # Task 
18 | 
19 | - From the launches data, `select()` only the columns that are related to characteristics about the country
20 | 
21 | ```{r}
22 | 
23 | ```
24 | 
25 | - From the launches data, `select()` only the columns that are related to time
26 | 
27 | ```{r}
28 | 
29 | ```
30 | 
31 | 


--------------------------------------------------------------------------------
/reading_calls/grouped_data.md:
--------------------------------------------------------------------------------
 1 | # Grouped data
 2 | 
 3 | We've just discussed approaches to make visual comparisons between groups. The approach we want to use depends on what particular comparisons we want to make between the groups, how many groups we have to compare, and even differences in the scales of these two groups.
 4 | 
 5 | Now, we'll practice both approaches we covered with the `nyc_squirrels` dataset.
 6 | 
 7 | # Coding task 
 8 | 
 9 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `grouped_data.Rmd`. 
10 | - This code will ask you to plot grouped data two different ways:
11 |   - Once using multiple series on the same plot
12 |   - Once using `facet_wrap()` to plot each series on its own plot
13 |  
14 | 
15 | 


--------------------------------------------------------------------------------
/code/3.25_group_by.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | # Demo of group_by() + summarise(): first on a small toy table, then on the launches data.
 3 | agencies <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
 4 | launches <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
 5 | # Toy data: 15 rows, five individuals (A-E) observed at times 1-3; the first 9 rows are 'Group One', the last 6 'Group Two'.
 6 | d <- data.frame(
 7 |   id    = 1:15, 
 8 |   individual  = rep(LETTERS[1:5], each = 3), 
 9 |   group = rep(c('Group One', 'Group Two'), times = c(9, 6)),
10 |   time  = rep(1:3, times = 5), 
11 |   value = (1:15)^2
12 | )
13 | # Shuffle the row order by sorting on a random permutation of id.
14 | d <- d %>%  
15 |   arrange(sample(id))
16 | # Average value for every combination of group and time.
17 | d %>%  
18 |   group_by(group, time) %>%  
19 |   summarise(
20 |     value_average = mean(value)
21 |   )
22 | 
23 | # Number of launches per state_code and launch_year, largest totals first.
24 | launches %>%  
25 |   group_by(state_code, launch_year) %>%  
26 |   summarise(total_launches = n()) %>%  
27 |   arrange(desc(total_launches))
28 | 


--------------------------------------------------------------------------------
/code/3.11.1_controlling_plot_extents.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | library(ggplot2)
 3 | library(patchwork)
 4 | # Demo: restrict the visible x-range of a plot with coord_cartesian().
 5 | squirrel_subset <- readr::read_csv('squirrels_subset.csv')
 6 | # Parse `date` (month-day-year digits) through the as.Date() generic with an explicit character conversion, then count squirrels per date and fur color.
 7 | squirrel_subset_by_color <- squirrel_subset %>%
 8 |   mutate(date_f = as.Date(as.character(date), format = '%m%d%Y')) %>%
 9 |   group_by(date_f, primary_fur_color) %>%
10 |   summarise(count_of_squirrels = n())
11 | # Smoothed count per fur color; xlim is set on the coordinate system rather than on the scale.
12 | squirrel_subset_by_color %>%
13 |   ggplot() +
14 |   aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) +
15 |   stat_smooth(se = FALSE) +
16 |   labs(
17 |     title = 'Decreasing Count of Squirrels Through Time',
18 |     subtitle = 'Moving average smoother estimate',
19 |     x = 'Date of Observation',
20 |     y = 'Count of Squirrels',
21 |     color = 'Primary Fur Color'
22 |   ) +
23 |   coord_cartesian(
24 |     xlim = c(as.Date('2018-10-08'),
25 |              as.Date('2018-10-15')))


--------------------------------------------------------------------------------
/reading_calls/be_your_own_linter.md:
--------------------------------------------------------------------------------
 1 | The good folks over at RStudio make a [coding style guide](https://style.tidyverse.org "style_guide") available. Just like all style guides, it is an opinionated resource, but their opinions are pretty reasonable! 
 2 | 
 3 | Take a gander at the following sections: 1-5 in the Style Guide. 
 4 | 
 5 | I like to think of code as craft: 
 6 | 
 7 | - Maybe you're careful about how you make your coffee; or, 
 8 | - How you cut your vegetables; 
 9 | - How you park your car; 
10 | - How you do your hair
11 | 
12 | There is something in your life that you take satisfaction in doing well. That's why you're here! Writing elegant code _can_ be one of those. But, as you're going to read in the **next** section, please don't let the perfect be the enemy of the "it works". 
13 | 
14 | I write python code with an R accent. I write R code with an "old person" accent. But, I'm learning new things every day. 
15 | 
16 | 


--------------------------------------------------------------------------------
/code/3.13.1_setting_the_theme.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | library(ggplot2)
 3 | library(patchwork)
 4 | # Demo: set a default ggplot theme once, then draw the squirrel count plot.
 5 | theme_set(theme_minimal())
 6 | 
 7 | squirrel_subset <- readr::read_csv('squirrels_subset.csv')
 8 | # Parse `date` (month-day-year digits) through the as.Date() generic with an explicit character conversion, then count squirrels per date and fur color.
 9 | squirrel_subset_by_color <- squirrel_subset %>%
10 |   mutate(date_f = as.Date(as.character(date), format = '%m%d%Y')) %>%
11 |   group_by(date_f, primary_fur_color) %>%
12 |   summarise(count_of_squirrels = n())
13 | # Smoothed count per fur color, drawn with the theme set above; xlim is set on the coordinate system.
14 | squirrel_subset_by_color %>%
15 |   ggplot() +
16 |   aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) +
17 |   stat_smooth(se = FALSE) +
18 |   labs(
19 |     title = 'Decreasing Count of Squirrels Through Time',
20 |     subtitle = 'Moving average smoother estimate',
21 |     x = 'Date of Observation',
22 |     y = 'Count of Squirrels',
23 |     color = 'Primary Fur Color') +
24 |   coord_cartesian(
25 |     xlim = c(as.Date('2018-10-08'),
26 |              as.Date('2018-10-15')))
27 | 


--------------------------------------------------------------------------------
/code/mutate.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Mutate'
 3 | author: 'w203: Statistics for Data Science'
 4 | date: "8/13/2020"
 5 | output: github_document
 6 | ---
 7 | 
 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 9 | library(tidyverse)
10 | ```
11 | 
12 | ```{r load data, message=FALSE}
13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
15 | ```
16 | 
17 | # Mutate Task 
18 | 
19 | - Using the `agencies` data, create a series of variables that contain the log of the `count` of launches. 
20 | 
21 | ```{r}
22 | 
23 | ```
24 | 
25 | - Then, show only the columns that are called either `agency` or `contains()` the string "count". 
26 | 
27 | ```{r}
28 | 
29 | ```
30 | 
31 | - Finally, `arrange()` these descending by `count`. 
32 | 
33 | ```{r}
34 | 
35 | ```


--------------------------------------------------------------------------------
/reading_calls/join_and_merge.md:
--------------------------------------------------------------------------------
 1 | # Joins and Merges 
 2 | 
 3 | There's a maxim: 
 4 | 
 5 | > If the data were easy to have, someone would have already done it!
 6 | 
 7 | Bringing data together from multiple sources, finding the common keys between the data, and arranging things so that they are ready to use is, at different points: fun, tedious, rewarding, and frustrating.
 8 | 
 9 | Please, read the [R For Data Science](https://r4ds.had.co.nz/relational-data.html) chapter on relational data for merging. There is a lot covered in this chapter so, let me provide you an edited list (in case you don't want to read it all). 
10 | 
11 | - Read: 13.1 Introduction
12 | - Read: 13.1.1 Prerequisites -- and, in fact, log on to the datahub and load the data so that you can read and type along. 
13 | - Read: 13.2 and 13.3, but skip the exercises
14 | - Read: 13.4; and, do the exercises for 13.4.6
15 | - Read: 13.5, but skip the exercises in 13.5.1
16 | - Read: 13.6
17 | - **Skip** 13.7 
18 | 
19 | 


--------------------------------------------------------------------------------
/code/select_solution.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Select'
 3 | author: 'w203: Statistics for Data Science'
 4 | date: "8/13/2020"
 5 | output: github_document
 6 | ---
 7 | 
 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 9 | library(tidyverse)
10 | ```
11 | 
12 | ```{r load data, message=FALSE}
13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
15 | ```
16 | 
17 | # Task 
18 | 
19 | - From the launches data, `select()` only the columns that are related to characteristics about the country
20 | 
21 | ```{r}
22 | launches %>% 
23 |   select(agency, state_code, agency_type)
24 | ```
25 | 
26 | - From the launches data, `select()` only the columns that are related to time
27 | 
28 | ```{r}
29 | launches %>% 
30 |   select(tag, JD, launch_date, launch_year)
31 | ```
32 | 
33 | 


--------------------------------------------------------------------------------
/reading_calls/read_about_plots.md:
--------------------------------------------------------------------------------
 1 | # Read About Plots 
 2 | 
 3 | Please read in the Book _R For Data Science_ 
 4 | 
 5 | - If you are reading from the physical copy of the book, please read pages 1-13.  
 6 | - If you are reading from the [digital copy](https://r4ds.had.co.nz/explore-intro.html) of the book, please read 
 7 |     - 2: Introduction
 8 |     - 3: Data visualization sections 3.1, 3.2, 3.3, and 3.4 stopping before the section 3.5 "facets". 
 9 |     
10 | This reading is the same (at least as of the time that we've made the course available). For this content, because the language is changing, I actually think that reading the online (digital) version of the resource is a better idea (although I read from a physical copy of the book). 
11 | 
12 | As you're reading, you can follow along and execute the code that the authors are talking about in your DataHub, if you like. Consider this, perhaps by making a new file, and writing the code, executing, and reading the output. 
13 | 


--------------------------------------------------------------------------------
/code/make_a_data_set.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Make A Data Set"
 3 | author: 'w203: Statistics for Data Science'
 4 | date: "8/14/2020"
 5 | output: html_document
 6 | ---
 7 | 
 8 | ```{r setup, include=FALSE}
 9 | library(tidyverse)
10 | ```
11 | 
12 | # Paste Farm Code 
13 | 
14 | Start by pasting the code that you wrote to create the farm animals below. 
15 | 
16 | ```{r}
17 | 'farm animal creating code here' 
18 | ```
19 | 
20 | ## Combine into a dataset 
21 | 
22 | Now, combine all of these variables into a single dataset, called `tilden`. 
23 | 
24 | ## Typecast 
25 | 
26 | Now do a little bit of mutating and type converting. 
27 | 
28 | - Since we know that each of the types of animals has a specific amount that they eat, let's label them "hungry boi" if they eat more than average; and "slender boi" if they eat less than average. (Sorry... I know this coding might be getting tedious!). 
29 | - Make each of these relative to the mean within that animal type. 
30 | 
31 | ```{r}
32 | 
33 | ```
34 | 
35 | 


--------------------------------------------------------------------------------
/reading_calls/r_markdown.md:
--------------------------------------------------------------------------------
 1 | # Reading Call: R Markdown 
 2 | 
 3 | Without being **super** clear about it, for the last several units we've been using R Markdown documents. These are documents that let you mix code and description into a single plain text file. 
 4 | 
 5 | There are several benefits to this: 
 6 | 
 7 | - You can document your code in place
 8 | - You can produce reports from your code that are **always** in sync with the data and codebase
 9 | - You can share these documents with others; and keep them under version control 
10 | 
11 | Now, I'd like you to read a little more detail about the system in R For Data Science. Please read the following: 
12 | 
13 | - [Chapter 26](https://r4ds.had.co.nz/communicate-intro.html) in the digital version of _R For Data Science_ and [Chapter 27](https://r4ds.had.co.nz/r-markdown.html) in the same. 
14 | - In Chapter 27, read sections 1-6, skipping section 7. 
15 | - If you're reading in the physical copy, these are the chapters about "Communicating" and "R Markdown"
16 | 


--------------------------------------------------------------------------------
/code/group_by_summarize.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Split, Apply, Combine'
 3 | author: 'w203: Statistics for Data Science'
 4 | date: "8/13/2020"
 5 | output: github_document
 6 | ---
 7 | 
 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 9 | library(tidyverse)
10 | ```
11 | 
12 | ```{r load data, message=FALSE}
13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
15 | ```
16 | 
17 | # Task 
18 | 
19 | - Using the `launches` data, count the total number of launches per `launch_year`, grouped by `state_code`. 
20 | - Then, using `arrange` answer the question: which year was the busiest for any state? 
21 | - Then, using `filter` answer the question: what was the busiest year for the US? 
22 | - Then, using another variable summary, answer the question: which country has the most variance in the per-year launches?


--------------------------------------------------------------------------------
/code/make_line_plots.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | library(ggplot2)
 3 | 
 4 | squirrel_subset <- read_csv('./squirrels_subset.csv')
 5 | ## Parse `date` with the '%m%d%Y' format into a Date column, `date_f`.
 6 | squirrel_subset <- squirrel_subset %>%
 7 |   mutate(date_f = as.Date(as.character(date), format = '%m%d%Y'))
 8 | 
 9 | ## Task 1: Do the core work
10 | ## - Make a line plot that shows the squirrels observed by date
11 | 
12 | squirrel_subset %>%
13 |   group_by(date_f) %>%
14 |   summarise(count_of_squirrels = n()) %>%
15 |   ggplot() +
16 |   aes(x = 'fill this in', y = 'fill this in') +
17 |   'fill this in'
18 | 
19 | ## Challenge Task 2: Add on to the core work
20 | ## - We haven't covered this yet, so it is ABSOLUTELY optional.
21 | ## - But, what if you wanted to make a separate line plot for each of the
22 | ##   colors of squirrels?
23 | 
24 | ## - To do so, you'd have to change how you're grouping (to count colors
25 | ##   separately), and then you'd have to bring that color variable (which
26 | ##   is a data feature) into your final line plot.
27 | 
28 | ## If you're interested, try it!
29 | 


--------------------------------------------------------------------------------
/reading_calls/rstudio_cheatsheet.md:
--------------------------------------------------------------------------------
 1 | # R Markdown Cheatsheet 
 2 | 
 3 | Return to the cheatsheet resource that Rstudio maintains. 
 4 | 
 5 | - The top-level repository with the cheatsheets is [here](https://rstudio.com/resources/cheatsheets/); and, 
 6 | - The cheatsheet that is specifically useful right now is the [Rmarkdown cheatsheet](https://github.com/rstudio/cheatsheets/raw/master/rmarkdown-2.0.pdf). (Note, this link might download a PDF when you click it.)  
 7 | 
 8 | This cheatsheet is something that you might want to have around when you're producing documents and work for w203. 
 9 | 
10 | For now, if you're interested in seeing how the expanded set of options available to you work, navigate to the UCB Datahub, head to the code folder, and open `make_scatter_plots_solution.Rmd`. 
11 | 
12 | There is nothing _particularly_ special about this file, but try to see if you can use the Datahub to output the solutions into different formats by changing the `output:` call in the YAML and re-knitting. See what happens if you insert new chunks, or change options that are passed into chunks.  
13 | 


--------------------------------------------------------------------------------
/code/code_for_videos.R:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | library(patchwork)
 3 | library(dplyr) # supplies %>% and filter(), which the pipelines below use
 4 | 
 5 | theme_set(theme_minimal())
 6 | 
 7 | # NOTE(review): `ss` is never defined in this script -- presumably a data
 8 | # frame with `long`, `lat`, and `primary_fur_color` columns that is already
 9 | # loaded in the session; confirm before running this file top to bottom.
10 | 
11 | # Scatter plot, adding the x and y aesthetics one layer at a time.
12 | ggplot(data = ss) +
13 |   aes(x = long) +
14 |   aes(y = lat) +
15 |   geom_point()
16 | 
17 | # The same scatter plot and a histogram of `long`, combined with patchwork.
18 | p1 <- ggplot(data = ss, aes(x = long, y = lat)) + geom_point()
19 | 
20 | p2 <- ggplot(data = ss, aes(long)) +
21 |   geom_histogram()
22 | 
23 | p1 / p2
24 | 
25 | # Map fur color to both point shape and point color.
26 | ggplot(data = ss, aes(x = long, y = lat, shape = primary_fur_color, color = primary_fur_color)) +
27 |   geom_point() +
28 |   coord_quickmap()
29 | 
30 | # Line plots built from the `economics` and `economics_long` data.
31 | ggplot(economics, aes(date, unemploy)) +
32 |   geom_line()
33 | 
34 | ggplot(economics_long, aes(date, value01, colour = variable)) +
35 |   geom_line()
36 | 
37 | # Two series drawn as two separate geom_line() layers ...
38 | ggplot() +
39 |   geom_line(data = economics, mapping = aes(x = date, y = pce)) +
40 |   geom_line(data = economics, mapping = aes(x = date, y = pop))
41 | 
42 | # ... versus the same two series taken from the long data, one color mapping.
43 | economics_long %>%
44 |   filter(variable %in% c('pce', 'pop')) %>%
45 |   ggplot(aes(date, value, color = variable)) +
46 |   geom_line()
47 | 
48 | economics_long %>%
49 |   ggplot(aes(date, value01, color = variable)) +
50 |   geom_line()
51 | 


--------------------------------------------------------------------------------
/code/filter.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: ''
 3 | author: 'w203: Statistics for Data Science'
 4 | date: "8/13/2020"
 5 | output: github_document
 6 | ---
 7 | 
 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 9 | library(tidyverse)
10 | ```
11 | 
12 | ```{r load data, message=FALSE}
13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
15 | ```
16 | 
17 | # Task 
18 | 
19 | Use the magrittr pipe `%>%` and the `filter()` verb to do the following work: 
20 | 
21 | ## Launches in the 1980s
22 | 
23 | Find only the launches that occurred in the 1980s 
24 | 
25 | ## Launches by France 
26 | 
27 | Find only the launches that were conducted by France (`state_code == "F"` )
28 | 
29 | ## Launches by France in the 1980s
30 | 
31 | Find only the launches by France that were conducted in the 1980s
32 | 
33 | ## Launches by France or the Soviet Union in the 1980s 
34 | 
35 | Find launches in the 1980s by either France or the Soviet Union


--------------------------------------------------------------------------------
/reading_calls/goals_of_the_project.md:
--------------------------------------------------------------------------------
 1 | # Goals of the Project 
 2 | 
 3 | If you're going to take on a small project here are the goals that I think might help you to scope it: 
 4 | 
 5 | 1. The data should already be collected 
 6 | 2. The data should be in one, or maybe two files
 7 | 3. If there is a time component to the data, then your key insight should be a plot for how simple values of the data change over time; 
 8 | 4. If there is not a time component to the data, then your key insight should be how different categories that are represented in the data have different values on the outcomes that you care about. 
 9 | 
10 | # Where to Get Data 
11 | 
12 | - I've pointed to Tidy Tuesday (https://github.com/rfordatascience/tidytuesday) before as a nice place to get data. In fact, that is where both the squirrels and space launches data came from. But, a lot of students will probably end up using data from there. 
13 | - Another good option is the #datasets channel in the School's Slack channel. 
14 | - Also a good bet are the New York Times, LA Times, Wall Street Journal, and Economist github pages. 
15 | 


--------------------------------------------------------------------------------
/reading_calls/control_flow.md:
--------------------------------------------------------------------------------
 1 | # Control Flow 
 2 | 
 3 | For individuals who are coming from an engineering background, control flow is second nature. For folks coming from other backgrounds (like myself) the concept can be a little baffling. 
 4 | 
 5 | The general idea is that you're going to set a condition, and take some actions given that condition holds. If that condition doesn't hold, you'll do something else -- maybe take some other action, maybe stop, maybe ... 
 6 | 
 7 | The thing is: when dealing with data it **MIGHT** seem like a good idea to write loops, but generally it is not. R is built for vector operations -- which you've been using for a while, and used just a moment ago when you type cast the 'thick bois' and 'slender bois'. This vector operation has two benefits: 
 8 | 
 9 | 1. It makes the code much more legible, reducing the boilerplate that you've got to write; 
10 | 2. It makes the code much faster; most vectorized operations are written in compiled, optimized C. A lower-level, faster language. 
11 | 
12 | For now, read [this short explanation](https://adv-r.hadley.nz/control-flow.html) about control flow in R.  
13 | 
14 | 
15 | 


--------------------------------------------------------------------------------
/reading_calls/tidying_arranging_and_summarizing.md:
--------------------------------------------------------------------------------
 1 | # Tidying, Arranging, and Summarizing 
 2 | 
 3 | We're now to the point that we're going to really start working with data. Until this point, I've basically provided you with the data in the form that you need it. 
 4 | 
 5 | But tidying, arranging, and summarizing data is **so** core to many of the tasks that we work on. 
 6 | 
 7 | In this section we're going to start a series of really quick loops between a lecture on a concept and a short coding exercise that uses that concept. 
 8 | 
 9 | # Reading Task 
10 | 
11 | To begin please read the following chapter in *R For Data Science*. 
12 | 
13 | - If you're reading the hard copy, the chapter is **Chapter 5: Exploratory Data Analysis**
14 | - If you're reading the digital copy it is **Chapter 7: Exploratory Data Analysis** and you can get to it [at this link](https://r4ds.had.co.nz/exploratory-data-analysis.html). 
15 | 
16 | As you're reading along, try to think of each new concept that comes in as a **verb** that _does_ something to data. The idea within the `dplyr` framework is that data **is** and we as data scientists do things to modify the canonical source data. 
17 | 


--------------------------------------------------------------------------------
/code/make_line_plots_solution.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Line Plots Solutions"
 3 | author: 'w203: Statistics for Data Science'
 4 | output: github_document
 5 | ---
 6 | 
 7 | ```{r setup, results = 'hide', warning=FALSE, message=FALSE}
 8 | library(tidyverse)
 9 | library(ggplot2)
10 | theme_set(theme_minimal())
11 | knitr::opts_chunk$set(dpi = 200)
12 | ```
13 | 
14 | ```{r load data}
15 | squirrel_subset <- read_csv('./squirrels_subset.csv')
16 | squirrel_subset <- squirrel_subset %>% # parse `date` into a Date column, `date_f`
17 |   mutate(date_f = as.Date(as.character(date), format = '%m%d%Y'))
18 | ```
19 | 
20 | # Squirrels by date 
21 | 
22 | - Make a line plot that shows the squirrels observed by date
23 | 
24 | ```{r}
25 | # Count the rows for each observation date and draw the counts as a line.
26 | ggplot(
27 |   summarise(group_by(squirrel_subset, date_f), count_of_squirrels = n())
28 | ) +
29 |   aes(x = date_f, y = count_of_squirrels) +
30 |   geom_line()
31 | ```
32 | 
33 | # Challenge: Squirrels by color by date
34 | 
35 | ```{r}
36 | # Count the rows per date and fur color; draw one colored line per fur color.
37 | ggplot(
38 |   summarise(group_by(squirrel_subset, date_f, primary_fur_color), count_of_squirrels = n())
39 | ) +
40 |   aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) +
41 |   geom_line()
42 | ```
43 | 
44 | 


--------------------------------------------------------------------------------
/reading_calls/working_with_rstudio.md:
--------------------------------------------------------------------------------
 1 | # Working with Rstudio 
 2 | 
 3 | At this point, please go and use the UCB Datahub to do a little bit of practice working with the Rstudio IDE. 
 4 | 
 5 | The work is going to be simple -- I am asking you to execute code that has already been mostly written for you. 
 6 | 
 7 | - The goal is that you can start to work with the tool; articulate it and see how it responds. 
 8 | - This way, when we move forward you can use the tool to answer questions that you're interested in. 
 9 | 
10 | If you click [this link to the UCB Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) the datahub will pull a current version of the code that I would like you to work on. 
11 | 
12 | When the datahub starts up, navigate to the `r_bridge` folder, and then `code` and then to the file called, `working_with_rstudio.R`. 
13 | 
14 | Although you can't see the image here in the IVSC, if you're lost and would like a picture to show you where I'm navigating, if you "Open in GitHub" there will be an image that shows. 
15 | 
16 | ![](./images/r_bridge_datahub.png)
17 |  
18 | 


--------------------------------------------------------------------------------
/reading_calls/how_to_summarise.md:
--------------------------------------------------------------------------------
 1 | # Summarising the Information In A Plot 
 2 | 
 3 | In this lecture I've just made the argument that you might want to reduce the information that is present in a plot as a method of highlighting the core insight that you're trying to communicate. 
 4 | 
 5 | Plots are pieces of your rhetorical toolkit when you are communicating about data. As such, you have the ability to highlight (or lowlight) the features of the story that you are telling within your plot. 
 6 | 
 7 | This doesn't mean that you can lie with your plots; that's not the goal. But, the goal is to make the clearest argument for what you've learned from the plot, while also communicating how you've learned that bit. 
 8 | 
 9 | In the lecture that you just saw, we accomplish this by smoothing the data using the `stat_smooth()` moving average smoother. 
10 | 
11 | # Coding task 
12 | 
13 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `how_to_summarize.Rmd`. 
14 | - This code will ask you to use the `stat_smooth()` function.   
15 | 
16 | 
17 |  
18 | 


--------------------------------------------------------------------------------
/reading_calls/issuing_code_and_reading_output.md:
--------------------------------------------------------------------------------
 1 | # Issuing Code and Reading Output 
 2 | 
 3 | At this point, please go and use the UCB Datahub to do a little bit of practice working with the Rstudio IDE. 
 4 | 
 5 | The work is going to be simple -- I am asking you to execute code that has already been mostly written for you. 
 6 | 
 7 | - The goal is that you can start to work with the tool; articulate it and see how it responds. 
 8 | - This way, when we move forward you can use the tool to answer questions that you're interested in. 
 9 | 
10 | If you click [this link to the UCB Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) the datahub will pull a current version of the code that I would like you to work on. 
11 | 
12 | When the datahub starts up, navigate to the `r_bridge` folder, and then `code` and then to the file called, `issuing_code_and_reading_outputs.R`. 
13 | 
14 | Although you can't see the image here in the IVSC, if you're lost and would like a picture to show you where I'm navigating, if you "Open in GitHub" there will be an image that shows. 
15 | 
16 | ![](./images/r_bridge_datahub.png)
17 |  
18 | 


--------------------------------------------------------------------------------
/reading_calls/review_code.md:
--------------------------------------------------------------------------------
 1 | # Review Code 
 2 | 
 3 | This [article](https://www.nature.com/articles/nenergy2016170) applies simple models on top of survey data from respondents who live in rural India. It was published at Nature Energy, which is in the _Nature_ constellation of publication venues. 
 4 | 
 5 | I would hate for someone to remark this about my research, but there's nothing really extraordinary about this work. But, we learned something from it. A small thing, but something that _Nature_ thought was important enough to publish. 
 6 | 
 7 | Now, the authors publish their data and code alongside this work. Follow [this link](https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/QY5R7R/PVKVR8&version=2.0) which will take you to the core code that the authors used to generate these findings. 
 8 | 
 9 | # Questions
10 | 
11 | 1. Does this code conform to the coding style guides that you've just read about? 
12 | 2. What is one specific thing that you would do to reformat their code to make it more legible? 
13 | 
14 | If you're so inclined -- but don't spend more than 5 minutes doing so -- you could paste this code into your Datahub and restructure some parts of it to make it more legible. 
15 | 
16 | 
17 | 


--------------------------------------------------------------------------------
/code/arrange_solution.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Arrange'
 3 | author: 'w203: Statistics for Data Science'
 4 | date: "8/13/2020"
 5 | output: github_document
 6 | ---
 7 | 
 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 9 | library(tidyverse)
10 | ```
11 | 
12 | ```{r load data, message=FALSE}
13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
15 | ```
16 | 
17 | # Task 
18 | 
19 | Using the `launches` data, the `arrange()` verb, and the `head()` verb: 
20 | 
21 | - Print the earliest launches
22 | 
23 | ```{r}
24 | # Sort ascending by launch_date so the earliest launches come first,
25 | # then print the first rows with head().
26 | head(arrange(launches, launch_date))
27 | ```
28 | 
29 | - Still using the head verb, use the `desc()` adverb to print the *most recent* launches
30 | 
31 | ```{r}
32 | # Wrap the sort key in desc() so the most recent launches come first,
33 | # then print the first rows with head().
34 | head(arrange(launches, desc(launch_date)))
35 | ```
36 | 
37 | Adding in the `state_code` variable, 
38 | 
39 | - Arrange the data so that it is sorted first by `launch_year` and then by `state_code`
40 | 
41 | ```{r}
42 | # Sort by launch_year first; rows within a year are ordered by state_code.
43 | arrange(launches, launch_year, state_code)
44 | ```
45 | 
46 | 
47 | 
48 | 
49 | 


--------------------------------------------------------------------------------
/code/make_bar_plots.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Making Bar Plots"
 3 | author: 'w203: Statistics for Data Science'
 4 | output: github_document
 5 | ---
 6 | 
 7 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 8 | library(tidyverse)
 9 | library(ggplot2)
10 | # install.packages('patchwork')
11 | library(patchwork)
12 | 
13 | theme_set(theme_minimal())
14 | knitr::opts_chunk$set(dpi = 200)
15 | ```
16 | 
17 | ```{r load data}
18 | squirrel_subset <- read.csv('squirrels_subset.csv')
19 | ```
20 | 
21 | ```{r aggregate data}
22 | # One row per primary_fur_color, with the number of rows seen for that color.
23 | squirrel_subset_by_color <- summarise(
24 |   group_by(squirrel_subset, primary_fur_color), count_by_color = n())
25 | ```
26 | 
27 | # Task 
28 | 
29 | Produce two identical plots that have the following characteristics: 
30 | 
31 | - On the x-axis the plots have the color of the squirrel fur
32 | - On the y-axis the plots have a count of the number of squirrels that have that color fur
33 | 
34 | However, make these plots in two ways: 
35 | 
36 | 1. In one plot, use `geom_bar()` (and the appropriate dataset)
37 | 2. In the other plot, use `geom_col()` (and the appropriate dataset)
38 | 
39 | Note that the datasets will be different for each of the different geometries. 
40 | 
41 | ```{r}
42 | 
43 | ```


--------------------------------------------------------------------------------
/reading_calls/installing_r.md:
--------------------------------------------------------------------------------
 1 | # Installing R 
 2 | 
 3 | R is available to install at https://cran.r-project.org. 
 4 | 
 5 | For right now, we want to discourage you from installing locally. Instead, we want you to focus on the work internal to the language; not the dev-ops to be able to use the language. 
 6 | 
 7 | ## What a local install would mean
 8 | 
 9 | Installing locally will mean 
10 | 
11 | - That you get access to full resources on your local machine :tada: 
12 | - That you have to manage the access to all of those resources :-1: 
13 | - That your setup will be different than everyone else's setup :fire:
14 | 
15 | For now, when we have a limited view of what we're actually going to need, we think that we shouldn't over-invest in building technology. This probably means that we are "borrowing" from our future and so will have technical debt to pay. But, for right now, being cash rich and using the language is important. 
16 | 
17 | As an example, many of the high-performance parts of the language use specific C compilers. In the future, if you want to use these tools, you'll have to back-step to install the specific compilers, and then ensure that the rest of what you've built will still work. 
18 | 
19 | If you choose to go this way, best of luck!
20 | 
21 | 


--------------------------------------------------------------------------------
/reading_calls/be_your_own_linter_solution.md:
--------------------------------------------------------------------------------
 1 | # But Here's the thing 
 2 | 
 3 | Look, here's the thing. For some reason people are almost **universally** bashful about sharing their code with others. 
 4 | 
 5 | - No really, even though there is  some sort of silly taboo against talking about salary (which I think is put in place to keep workers from demanding higher wages and fuller benefits from management... ok, politics aside...) I bet that if we randomly paired a group of people in the class together, they'd rather talk about salary than have to defend why they coded the last exercise the way they did. 
 6 | 
 7 | - Just don't worry about it. 
 8 | 
 9 | Code, like salaries, works better when we can all see how others have approached something, asking why, and coming to a mutual understanding. 
10 | 
11 | Here's some guidance from David Robinson (formerly with Stack Exchange) and Hadley Wickham (who literally re-wrote the language in the idiom that we're using): 
12 | 
13 | http://varianceexplained.org/programming/bad-code/
14 | 
15 | > This is exactly how you end up with crises of reproducible research in science. There are many reasons scientists publish papers without sharing their code (none of them defensible), but high on the list is embarrassment: “my code is too ugly to share.” Code shamers aren’t helping!
16 | 


--------------------------------------------------------------------------------
/reading_calls/review_code_solution.md:
--------------------------------------------------------------------------------
 1 | # Review Code Solution 
 2 | 
 3 | One part of the code that I thought was particularly problematic was this set: 
 4 | 
 5 | ```
 6 | access$fuel_stack[(access$LPG==1) &
 7 |                     (access$m4_q109_firewood=="No"& 
 8 |                        ((access$m4_q113_dungcake=="No") & 
 9 |                           ((access$m4_q114_agro=="No") & 
10 |                              (access$m4_q115_other_fuel=="No"))))]<-4 #exclusive use
11 | 
12 | ```
13 | 
14 | It is really, really hard to grok just what is and what isn't being selected. 
15 | 
16 | I'd re-write this, using the `dplyr` framework as: 
17 | 
18 | ```
19 | access %>% 
20 |   filter(
21 |     LPG == 1, m4_q109_firewood == 'No', m4_q113_dungcake == 'No', 
22 |     m4_q114_agro == 'No', m4_q115_other_fuel == 'No') %>%
23 |   mutate(fuel_stack = 4) # exclusive use; note filter() keeps only the matching rows
24 |   
25 | ```
26 | 
27 | I would also really, really, really like to know what the heck the `m4` object is, and why it is at the start of each of these variables. I would probably want to rename the entire set of variables. 
28 | 
29 | The thing is, when code is as poorly structured and named as this code is, it makes it really hard to collaborate! And, the only thing scarier to me than sharing my code, is being the only person who has looked at code that I'm going to deploy.
30 | 


--------------------------------------------------------------------------------
/reading_calls/pick_a_theme.md:
--------------------------------------------------------------------------------
 1 | # Pick a Theme 
 2 | 
 3 | I mean, c'mon. One of the most fun things about plotting is making it look "like a plot that I made". Amiright? 
 4 | 
 5 | There are a whole host of themes that are available within ggplot. As we've demonstrated in the lecture these can be set in one of two ways: 
 6 | 
 7 | 1. By adding a `+ theme_*()` layer onto your plot; or, 
 8 | 2. By setting the theme for the whole session using `theme_set(theme_*())`. 
 9 | 
10 | (In both of those, you would replace the * with the theme name you want to use.)
11 | 
12 | There are several themes that are built and maintained internal to the `ggplot` project. You can see a listing of them at [this link](https://ggplot2.tidyverse.org/reference/ggtheme.html). 
13 | 
14 | But, there are also a **ton** of other extensions that folks have built. Many of them are listed [at this github for ggthemes](https://github.com/jrnold/ggthemes). If you want to use these themes you can do the following: 
15 | 
16 | ```
17 | install.packages('ggthemes', dependencies = TRUE)
18 | library(ggthemes)
19 | ```
20 | 
21 | And then add a theme from this layer as you might like. 
22 | 
23 | # Coding Task 
24 | 
25 | If you're interested: 
26 | - Head to the Datahub.
27 | - Load the file `pick_a_theme.Rmd` and then work to set themes, picking one that you prefer. 
28 | 
29 | 


--------------------------------------------------------------------------------
/docs/schedule.md:
--------------------------------------------------------------------------------
 1 | # Schedule 
 2 | 
 3 | This course is designed so that students can complete the *incoming*, structured work in four, two-hours long study sessions. (Or thereabouts.) The materials are structured as follows: 
 4 | 
 5 | 1. **Install Dependencies** - Get a working system going, and start to navigate the language. Because we advocate that you use a deployed environment hosted by UC Berkeley, this module can be completed in as few as 30 minutes. The student interested in building a version of the compute environment on their own machine will be walked through a Docker install.
 6 | 2. **Thinking About Data** - "What information is encoded in my data, and how can I start to explore this information?" This module introduces plotting highly structured data by introducing the grammar of graphics. 
 7 | 3. **Plotting and Manipulating Data** - This module expands students' expressiveness when moving data into plots and introduces core concepts in data manipulation. The student is still firmly grounded in a data.frame. 
 8 | 4. **Working at the Project Level** - This module takes the student outside of the dataframe, both looking more broadly at structuring a project workflow and more narrowly at lower-level pieces of the `R` language. 
 9 | 5. **Sharing a Project** - This module sets the student out to work on a tightly-scoped project. 
10 | 


--------------------------------------------------------------------------------
/reading_calls/installing_rstudio.md:
--------------------------------------------------------------------------------
 1 | # Installing Rstudio Locally 
 2 | 
 3 | (This is a step that you only need to take if you're going to install locally!)
 4 | 
 5 | Rstudio is available to install at https://rstudio.com/products/rstudio/.
 6 | 
 7 | Rstudio is the IDE that we're advocating folks use while writing code in the R language. 
 8 | 
 9 | - It is probably the defacto editor for the language; 
10 | - It provides a lot of really, really useful functionality that is specific to statistical computing, and is actively maintained and developed on. 
11 | 
12 | Like a local install of R, if you install on your local machine you'll have total control over what is available to you. But you'll also have to build everything that you want to be available. 
13 | 
14 | Whether you use the UCB Datahub, build this with a Docker image, or install locally, Rstudio does a _very_ good job of trying to make things "just work". This means that if you have code that requires a new package, it will suggest the install for that package; if you need to change your install of R it will suggest the install; and it has **enormously** helpful help documentation. 
15 | 
16 | (Remember, you only need to install R and Rstudio if you're installing them locally. If you're going to use a resource that we've provided, you don't have to install.)
17 | 
18 | Best of luck! 
19 | 


--------------------------------------------------------------------------------
/reading_calls/ucb_datahub.md:
--------------------------------------------------------------------------------
 1 | # The UCB Datahub 
 2 | 
 3 | The most straightforward way to start working with R is to use the UC Berkeley datahub environment that is provided to you as a student. This hub is located at [http://ischool.datahub.berkeley.edu](http://ischool.datahub.berkeley.edu) and requires only that you use your UCB Single Sign on to authenticate. Upon following this link: 
 4 | 
 5 | - Authenticate with your Single Sign On 
 6 | - Click to authorize this app
 7 | - Get coding! 
 8 | 
 9 | This has built and is running an instance of Rstudio on UC Berkeley resources that you can fully articulate. There are several features to this _out of the box_: 
10 | 
11 | - Although your datahub is working on a virtual machine, it is not ephemeral. What does this mean, practically? You can return to code that you have written in your datahub, merely by navigating back to the datahub. In fact, even code that you have written but not saved will probably still be accessible the next time that you log on to the datahub. 
12 | - We have been able to specify libraries that we would like you to use in this instance, and have already compiled them
13 | - We should not undersell how nice this solution is. Thanks to the folks at Berkeley Research IT, BIDS, and CDSS. 
14 | - If you're interested in more detail about the operation of the datahub, you can read about it [here](https://docs.datahub.berkeley.edu/en/latest/)
15 | 


--------------------------------------------------------------------------------
/code/make_bar_plots_solution.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Making Bar Plots"
 3 | author: 'w203: Statistics for Data Science'
 4 | output: github_document
 5 | ---
 6 | 
 7 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 8 | library(tidyverse)
 9 | library(ggplot2)
10 | # install.packages('patchwork')
11 | library(patchwork)
12 | 
13 | theme_set(theme_minimal())
14 | knitr::opts_chunk$set(dpi = 200)
15 | ```
16 | 
17 | ```{r load data}
18 | squirrel_subset <- read.csv('squirrels_subset.csv')
19 | ```
20 | 
21 | ```{r aggregate data}
22 | squirrel_subset_by_color <- squirrel_subset %>%  
23 |   group_by(primary_fur_color) %>%  
24 |   summarise(count_by_color = n())
25 | ```
26 | 
27 | # Task 
28 | 
29 | Produce two identical plots that have the following characteristics: 
30 | 
31 | - On the x-axis the plots have the color of the squirrel fur
32 | - On the y-axis the plots have a count of the number of squirrels that have that color fur
33 | 
34 | However, make these plots in two ways: 
35 | 
36 | 1. In one plot, use `geom_bar()` (and the appropriate dataset)
37 | 2. In the other plot, use `geom_col()` (and the appropriate dataset)
38 | 
39 | Note that the datasets will be different for each of the different geometries. 
40 | 
41 | ```{r}
42 | plot_col <- squirrel_subset_by_color %>%  
43 |   ggplot() + 
44 |   aes(x = primary_fur_color, y = count_by_color) + 
45 |   geom_col()
46 | 
47 | plot_bar <- squirrel_subset %>%  
48 |   ggplot() + 
49 |   aes(x = primary_fur_color) + 
50 |   geom_bar()
51 | 
52 | plot_col | plot_bar
53 | ```


--------------------------------------------------------------------------------
/docs/syllabus.md:
--------------------------------------------------------------------------------
 1 | # MIDS 1C: R Programming Bridge Course
 2 | 
 3 | This bridge course is distinct from credit bearing courses in the degree -- it has no graded components. Instead, it has been designed so that a broad base of learners can come up to speed with some of the core pieces of the R language. 
 4 | 
 5 | In contrast to many methods of learning new languages, this course focuses on working with high-level parts of the language -- namely plotting (from `ggplot2`) and data manipulation (from `dplyr`) before addressing lower-level parts of the language. Indeed, this course is at an even higher-level part of the language API than the *R for Data Science* textbook. 
 6 | 
 7 | I (Alex) made this choice deliberately, based on a model of spoken language-learning: When learning a new language, we try to build concepts and vocabulary, even if this means that we conjugate verbs incorrectly or use inefficient methods of expressing ourselves. Oftentimes, coding-language learning approaches a language as though there is some axiomatic truth from which the language is derived. These "truths" are enshrined in style guides, deployed code, and bravado; but they hide a more important truth -- just start communicating and see what you can get done. 
 8 | 
 9 | You can probably navigate a busy market and appreciate the culture in a foreign land where you don't speak the language. Similarly, you can probably write clumsy code that expresses your intent and learn from your data. Just start writing and learning; we'll speak fluently soon enough. 
10 | 
11 | 
12 | 


--------------------------------------------------------------------------------
/reading_calls/issuing_code_and_reading_output_solution.md:
--------------------------------------------------------------------------------
 1 | # Solutions: Issuing Code and Reading Output
 2 | 
 3 | If you're interested in seeing the solutions to the _small_ amount of code that I wrote, you can view the file back in the datahub. It is in the file `issuing_code_and_reading_outputs_solutions.R` 
 4 | 
 5 | But, I really only asked you to combine two lines, so you can probably view it here instead.  
 6 | 
 7 | ## Cats 
 8 | If you want to make a `cats` object that contains a vector of cat names, you can do so in the following way (you pick the names). 
 9 | 
10 | ```
11 | cats <- c('Fluffy', 'Tiger')
12 | ```
13 | 
14 | ## Top Dogs 
15 | 
16 | To create an object called `top_dog_names` that contains the most frequently used dog names you can "assign" the filtering pipeline that I wrote into the object using the `<-` operator.
17 | 
18 | This _isn't_ going to assign the pipeline functions into that object; rather, it is going to assign the results of the pipeline into that object. This is an important difference that some will be interested in. 
19 | 
20 | - What this means: if you change the upstream data that is in `nyc_license` it will not change the values that are listed in the `top_dog_names` object. The elements of that object are written once at the time that you executed the code. 
21 | 
22 | ```
23 | top_dog_names <- nyc_license %>% 
24 |   group_by(animal_name) %>% # one group per distinct animal name
25 |   summarize(total = n()) %>% # count the rows in each group
26 |   arrange(desc(total)) # sort from the largest count to the smallest (descending order)
27 | ```
28 | 
29 | (Fun fact, [Top Dog](http://www.topdoghotdogs.com) is delicious. )
30 | 


--------------------------------------------------------------------------------
/code/make_bar_plots_solution.md:
--------------------------------------------------------------------------------
 1 | Making Bar Plots
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 
 5 | ``` r
 6 | library(tidyverse)
 7 | library(ggplot2)
 8 | # install.packages('patchwork')
 9 | library(patchwork)
10 | 
11 | theme_set(theme_minimal())
12 | knitr::opts_chunk$set(dpi = 200)
13 | ```
14 | 
15 | ``` r
16 | squirrel_subset <- read.csv('squirrels_subset.csv')
17 | ```
18 | 
19 | ``` r
20 | squirrel_subset_by_color <- squirrel_subset %>%  
21 |   group_by(primary_fur_color) %>%  
22 |   summarise(count_by_color = n())
23 | ```
24 | 
25 |     ## `summarise()` ungrouping output (override with `.groups` argument)
26 | 
27 | # Task
28 | 
29 | Produce two identical plots that have the following characteristics:
30 | 
31 |   - On the x-axis the plots have the color of the squirrel fur
32 |   - On the y-axis the plots have a count of the number of squirrels that
33 |     have that color fur
34 | 
35 | However, make these plots in two ways:
36 | 
37 | 1.  In one plot, use `geom_bar()` (and the appropriate dataset)
38 | 2.  In the other plot, use `geom_col()` (and the appropriate dataset)
39 | 
40 | Note that the datasets will be different for each of the different
41 | geometries.
42 | 
43 | ``` r
44 | plot_col <- squirrel_subset_by_color %>%  
45 |   ggplot() + 
46 |   aes(x = primary_fur_color, y = count_by_color) + 
47 |   geom_col()
48 | 
49 | plot_bar <- squirrel_subset %>%  
50 |   ggplot() + 
51 |   aes(x = primary_fur_color) + 
52 |   geom_bar()
53 | 
54 | plot_col | plot_bar
55 | ```
56 | 
57 | ![](make_bar_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png)<!-- -->
58 | 


--------------------------------------------------------------------------------
/code/make_it_sparkle.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "make_it_sparkle.Rmd"
 3 | author: 'w203: Statistics for Data Science'
 4 | output: github_document
 5 | ---
 6 | 
 7 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 8 | library(tidyverse)
 9 | library(ggplot2)
10 | # install.packages('patchwork')
11 | library(patchwork)
12 | 
13 | theme_set(theme_minimal())
14 | knitr::opts_chunk$set(dpi = 200)
15 | ```
16 | 
17 | ```{r load and mutate data}
18 | squirrel_subset <- read.csv('squirrels_subset.csv')
19 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color)) # drop rows whose fur color is NA
20 | # Parse `date` with the '%m%d%Y' layout: coerce to text first, then call the as.Date() generic (not its S3 method directly).
21 | squirrel_subset <- squirrel_subset %>%
22 |   mutate(date_f = as.Date(as.character(date), format = '%m%d%Y'))
23 | ```
24 | 
25 | # Coding Task 
26 | 
27 | Suppose that you want to communicate to someone who has less context with the data. Set the `labs()` argument to include 
28 | 
29 | - A title 
30 | - A subtitle 
31 | - Descriptive Labels for the axes 
32 | - A label for what the colors mean. 
33 | 
34 | Each of the labels should be in **plain spoken language** and should be in sentence case: 
35 | 
36 | - The first letter of the first word should be capitalized
37 | - The rest of the letters should *not* be capitalized
38 | - Everything should be a word that you can speak aloud -- i.e. **not** a variable name. 
39 | 
40 | ```{r coding task}
41 | squirrel_subset %>%  
42 |   group_by(date_f, primary_fur_color) %>%  
43 |   summarise(count_of_squirrels = n()) %>%  
44 |   ggplot() + 
45 |   aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 
46 |   stat_smooth(se = FALSE) # fill out the labs() arg!
47 | ```
48 | 


--------------------------------------------------------------------------------
/reading_calls/make_bar_plots_solution.md:
--------------------------------------------------------------------------------
 1 | Making Bar Plots
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 
 5 | ``` r
 6 | library(tidyverse)
 7 | library(ggplot2)
 8 | # install.packages('patchwork')
 9 | library(patchwork)
10 | 
11 | theme_set(theme_minimal())
12 | knitr::opts_chunk$set(dpi = 200)
13 | ```
14 | 
15 | ``` r
16 | squirrel_subset <- read.csv('squirrels_subset.csv')
17 | ```
18 | 
19 | ``` r
20 | squirrel_subset_by_color <- squirrel_subset %>%  
21 |   group_by(primary_fur_color) %>%  
22 |   summarise(count_by_color = n())
23 | ```
24 | 
25 |     ## `summarise()` ungrouping output (override with `.groups` argument)
26 | 
27 | # Task
28 | 
29 | Produce two identical plots that have the following characteristics:
30 | 
31 |   - On the x-axis the plots have the color of the squirrel fur
32 |   - On the y-axis the plots have a count of the number of squirrels that
33 |     have that color fur
34 | 
35 | However, make these plots in two ways:
36 | 
37 | 1.  In one plot, use `geom_bar()` (and the appropriate dataset)
38 | 2.  In the other plot, use `geom_col()` (and the appropriate dataset)
39 | 
40 | Note that the datasets will be different for each of the different
41 | geometries.
42 | 
43 | ``` r
44 | plot_col <- squirrel_subset_by_color %>%  
45 |   ggplot() + 
46 |   aes(x = primary_fur_color, y = count_by_color) + 
47 |   geom_col()
48 | 
49 | plot_bar <- squirrel_subset %>%  
50 |   ggplot() + 
51 |   aes(x = primary_fur_color) + 
52 |   geom_bar()
53 | 
54 | plot_col | plot_bar
55 | ```
56 | 
57 | ![](make_bar_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png)<!-- -->
58 | 


--------------------------------------------------------------------------------
/code/pick_a_theme.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Pick A Theme"
 3 | author: 'w203: Statistics for Data Science'
 4 | output: github_document
 5 | ---
 6 | 
 7 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 8 | library(tidyverse)
 9 | library(ggplot2)
10 | # install.packages('patchwork')
11 | library(patchwork)
12 | 
13 | knitr::opts_chunk$set(dpi = 200)
14 | ```
15 | 
16 | ```{r load and mutate data}
17 | squirrel_subset <- read.csv('squirrels_subset.csv')
18 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color)) # drop rows whose fur color is NA
19 | # Parse `date` with the '%m%d%Y' layout: coerce to text first, then call the as.Date() generic (not its S3 method directly).
20 | squirrel_subset <- squirrel_subset %>%
21 |   mutate(date_f = as.Date(as.character(date), format = '%m%d%Y'))
22 | ```
23 | 
24 | # Coding Task 
25 | 
26 | What's your style? A color maximalist? A Tufte minimalist? 
27 | 
28 | The only requirement here is that you pick with communication in mind. Try out a few themes to see which you think is to your liking -- or, just use no theme at all and rely on the base colors! 
29 | 
30 | ```{r}
31 | ?theme_bw
32 | ```
33 | 
34 | Remove the comment and try a few themes out.
35 | 
36 | ```{r coding task}
37 | squirrel_subset %>%  
38 |   group_by(date_f, primary_fur_color) %>%  
39 |   summarise(count_of_squirrels = n()) %>%  
40 |   ggplot() + 
41 |   aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 
42 |   stat_smooth(se = FALSE) + 
43 |   labs(
44 |     title = 'There are a lot of grey squirrels',
45 |     subtitle = 'But, people are collecting data in later days',
46 |     x = 'Date of observation', 
47 |     y = 'Count of squirrels', 
48 |     color = 'Primary Fur Color') # + theme_minimal() 
49 | ```
50 | 
51 | 
52 | 


--------------------------------------------------------------------------------
/reading_calls/make_bar_plots.md:
--------------------------------------------------------------------------------
 1 | # Making Bar Plots 
 2 | 
 3 | There are two ways to produce bar plots in `ggplot`. 
 4 | 
 5 | - With `geom_bar()` you allow `ggplot` to do the functional counting and mapping for you at the time that you draw this plot. 
 6 |     - Sometimes this is the easier way to go about producing a bar plot, especially when the data that you're working with is relatively simple.
 7 |     - As well, when you're exploring data, it is quite nice to be able to view counts of factors, without an intermediate step to "roll-up" the observations. 
 8 | - With `geom_col()` **you** do the aggregating ahead of time and then tell the plot what height you want to map onto the y-axis.
 9 |     - When the data is complex, or if you have a particular way that you want to do the counting, this can be easier to produce what you'd like to see (because you can write the code to do the counting, rather than relying on the buried, built-in counting method that `ggplot` will use)
10 |     
11 | For me, the determination about which to use really comes down to: how easy is it to count these observations? If the answer is **anything** but "very easy" then I use `geom_col()`; otherwise, I use `geom_bar()`.  
12 | 
13 | # Coding task 
14 | 
15 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `make_bar_plots.Rmd`. 
16 | - This code will ask you to make bar plots with different features.
17 | - Like the last code, we'll now be working with R Markdown files.  
18 | 
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/code/mutate_solution.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Mutating a New Variable'
 3 | author: 'w203: Statistics for Data Science'
 4 | date: "8/13/2020"
 5 | output: github_document
 6 | ---
 7 | 
 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 9 | library(tidyverse)
10 | ```
11 | 
12 | ```{r load data, message=FALSE}
13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
15 | ```
16 | 
17 | # Mutate Task 
18 | 
19 | - Using the `agencies` data, create a series of variables that contain the log of the `count` of launches. 
20 | 
21 | ```{r}
22 | agencies_log <- agencies %>% 
23 |   mutate(count_log = log(count))
24 | 
25 | agencies_log
26 | ```
27 | 
28 | > But note that you don't *have* to assign this to a new object.
29 | 
30 | - Then, show only the columns that are called either `agency` or `contains()` the string "count". 
31 | 
32 | ```{r}
33 | agencies_log %>%  
34 |   select(agency, contains('count'))
35 | ```
36 | 
37 | - Finally, `arrange()` these descending by `count`. 
38 | 
39 | ```{r}
40 | agencies_log %>%  
41 |   select(agency, contains('count')) %>%  
42 |   arrange(desc(count))
43 | ```
44 | 
45 | > HA! It looks as though the data came in the door arranged by count. However, I would **never** suggest relying on this. If you want the data arranged by count, write the code to do so. The upstream data that comes into your analysis could change; potentially without you knowing. 
46 | > 
47 | > If you want your data to have some particular characteristic, you should write the code that makes it be so. 


--------------------------------------------------------------------------------
/code/filter_solution.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Filter Solution'
 3 | author: 'w203: Statistics for Data Science'
 4 | date: "8/13/2020"
 5 | output: github_document
 6 | ---
 7 | 
 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 9 | library(tidyverse)
10 | ```
11 | 
12 | ```{r load data, results = 'hide', message=FALSE}
13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
15 | ```
16 | 
17 | # Task 
18 | 
19 | Use the magrittr pipe `%>%` and the `filter()` verb to do the following work: 
20 | 
21 | ## Launches in the 1980s
22 | 
23 | Find only the launches that occurred in the 1980s 
24 | 
25 | ```{r}
26 | launches %>% 
27 |   filter(launch_year >= 1980) %>% 
28 |   filter(launch_year < 1990)
29 | ```
30 | 
31 | Or, equivalently
32 | 
33 | ```{r}
34 | launches %>% 
35 |   filter(launch_year >= 1980, launch_year < 1990)
36 | ```
37 | 
38 | ## Launches by France 
39 | 
40 | Find only the launches that were conducted by France (`state_code == "F"` )
41 | 
42 | ```{r}
43 | launches %>% 
44 |   filter(state_code == "F")
45 | ```
46 | 
47 | ## Launches by France in the 1980s
48 | 
49 | Find only the launches by France that were conducted in the 1980s
50 | 
51 | ```{r}
52 | launches %>% 
53 |   filter(launch_year >= 1980, launch_year < 1990, state_code == "F")
54 | ```
55 | 
56 | ## Launches by France or the Soviet Union in the 1980s 
57 | 
58 | Find launches in the 1980s by either France or the Soviet Union
59 | 
60 | ```{r}
61 | launches %>% 
62 |   filter(launch_year >= 1980, launch_year < 1990, state_code %in% c("F", "SU"))
63 | ```
64 | 
65 | 


--------------------------------------------------------------------------------
/code/make_line_plots_solution.md:
--------------------------------------------------------------------------------
 1 | Line Plots Solutions
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 
 5 | ``` r
 6 | library(tidyverse)
 7 | library(ggplot2)
 8 | theme_set(theme_minimal())
 9 | knitr::opts_chunk$set(dpi = 200)
10 | ```
11 | 
12 | ``` r
13 | squirrel_subset <- read_csv('./squirrels_subset.csv')
14 | ```
15 | 
16 |     ## Parsed with column specification:
17 |     ## cols(
18 |     ##   long = col_double(),
19 |     ##   lat = col_double(),
20 |     ##   hectare = col_character(),
21 |     ##   date = col_double(),
22 |     ##   age = col_character(),
23 |     ##   primary_fur_color = col_character()
24 |     ## )
25 | 
26 | ``` r
27 | squirrel_subset <- squirrel_subset %>%
28 |   mutate(date_f = as.Date.character(date, format = '%m%d%Y'))
29 | ```
30 | 
31 | # Squirrels by date
32 | 
33 |   - Make a line plot that shows the squirrels observed by date
34 | 
35 | <!-- end list -->
36 | 
37 | ``` r
38 | squirrel_subset %>%
39 |   group_by(date_f) %>%
40 |   summarise(count_of_squirrels = n()) %>%
41 |   ggplot() +
42 |   aes(x = date_f, y = count_of_squirrels) +
43 |   geom_line()
44 | ```
45 | 
46 |     ## `summarise()` ungrouping output (override with `.groups` argument)
47 | 
48 | ![](make_line_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png)<!-- -->
49 | 
50 | # Challenge: Squirrels by color by date
51 | 
52 | ``` r
53 | squirrel_subset %>%
54 |   group_by(date_f, primary_fur_color) %>%
55 |   summarise(count_of_squirrels = n()) %>%
56 |   ggplot() +
57 |   aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) +
58 |   geom_line()
59 | ```
60 | 
61 |     ## `summarise()` regrouping output by 'date_f' (override with `.groups` argument)
62 | 
63 | ![](make_line_plots_solution_files/figure-gfm/unnamed-chunk-2-1.png)<!-- -->
64 | 


--------------------------------------------------------------------------------
/reading_calls/make_line_plots_solution.md:
--------------------------------------------------------------------------------
 1 | Line Plots Solutions
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 
 5 | ``` r
 6 | library(tidyverse)
 7 | library(ggplot2)
 8 | theme_set(theme_minimal())
 9 | knitr::opts_chunk$set(dpi = 200)
10 | ```
11 | 
12 | ``` r
13 | squirrel_subset <- read_csv('./squirrels_subset.csv')
14 | ```
15 | 
16 |     ## Parsed with column specification:
17 |     ## cols(
18 |     ##   long = col_double(),
19 |     ##   lat = col_double(),
20 |     ##   hectare = col_character(),
21 |     ##   date = col_double(),
22 |     ##   age = col_character(),
23 |     ##   primary_fur_color = col_character()
24 |     ## )
25 | 
26 | ``` r
27 | squirrel_subset <- squirrel_subset %>%
28 |   mutate(date_f = as.Date.character(date, format = '%m%d%Y'))
29 | ```
30 | 
31 | # Squirrels by date
32 | 
33 |   - Make a line plot that shows the squirrels observed by date
34 | 
35 | <!-- end list -->
36 | 
37 | ``` r
38 | squirrel_subset %>%
39 |   group_by(date_f) %>%
40 |   summarise(count_of_squirrels = n()) %>%
41 |   ggplot() +
42 |   aes(x = date_f, y = count_of_squirrels) +
43 |   geom_line()
44 | ```
45 | 
46 |     ## `summarise()` ungrouping output (override with `.groups` argument)
47 | 
48 | ![](make_line_plots_solution_files/figure-gfm/unnamed-chunk-1-1.png)<!-- -->
49 | 
50 | # Challenge: Squirrels by color by date
51 | 
52 | ``` r
53 | squirrel_subset %>%
54 |   group_by(date_f, primary_fur_color) %>%
55 |   summarise(count_of_squirrels = n()) %>%
56 |   ggplot() +
57 |   aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) +
58 |   geom_line()
59 | ```
60 | 
61 |     ## `summarise()` regrouping output by 'date_f' (override with `.groups` argument)
62 | 
63 | ![](make_line_plots_solution_files/figure-gfm/unnamed-chunk-2-1.png)<!-- -->
64 | 


--------------------------------------------------------------------------------
/code/make_scatter_plots.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | library(ggplot2)
 3 | 
 4 | 
 5 | ## Load data: this should just work
 6 | squirrel_subset <- read_csv('./r_bridge/code/squirrels_subset.csv')
 7 | 
 8 | ## Create a basic plot of the observations: longitude on the x-axis, latitude on the y-axis
 9 | ## (map orientation; `coord_quickmap()` in Task 4 treats x as longitude and y as latitude)
10 | ggplot(data = squirrel_subset) +
11 |   aes(x = long, y = lat) +
12 |   geom_point()
13 | 
14 | ## Does this look like Central Park?
15 | 
16 | 
17 | ## Task 1: Color by age
18 | 
19 | ## - Now, write code that will modify the plot so that it is colored by age
20 | 
21 | ## Task 2: Make Every Point Blue
22 | 
23 | ## - Now, write code that will make every point blue, not colored by age.
24 | ## - Notice that now this choice isn't an attribute of the data. Where
25 | ##   does this mean that the `color` argument should go?
26 | 
27 | ## Task 3: Color by the Fur Color
28 | 
29 | ## - Now, write code that will color the points by the variable `primary_fur_color`.
30 | ## - Notice that this now *is* an attribute of the data. So, where should the
31 | ##  `color` argument go?
32 | 
33 | ## Task 4: Put onto non-euclidian space
34 | 
35 | ## - If you think carefully about this, we're mapping the geographic coordinate
36 | ##   system onto the euclidian coordinate system. This isn't a _huge_ deal in this
37 | ##   case because we're only covering central park. But, what's right is right...
38 | 
39 | ## - ggplot has the ability to map onto the geographic coordinate system using the
40 | ##   additional function `coord_quickmap()` (which is an approximation) or `coord_map()`
41 | ##   which is not an approximation
42 | 
43 | ## - Given what you understand about the layering system that ggplot uses, can
44 | ##   you add on this new layer that is the `coord_quickmap()` coordinate system?
45 | ## - If so, how much does it change the plot?
46 | 


--------------------------------------------------------------------------------
/code/make_it_sparkle_solution.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "make_it_sparkle.Rmd"
 3 | author: 'w203: Statistics for Data Science'
 4 | output: github_document
 5 | ---
 6 | 
 7 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 8 | library(tidyverse)
 9 | library(ggplot2)
10 | # install.packages('patchwork')
11 | library(patchwork)
12 | 
13 | theme_set(theme_minimal())
14 | knitr::opts_chunk$set(dpi = 200)
15 | ```
16 | 
17 | ```{r load and mutate data}
18 | squirrel_subset <- read.csv('squirrels_subset.csv')
19 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color)) # drop rows whose fur color is NA
20 | # Parse `date` with the '%m%d%Y' layout: coerce to text first, then call the as.Date() generic (not its S3 method directly).
21 | squirrel_subset <- squirrel_subset %>%
22 |   mutate(date_f = as.Date(as.character(date), format = '%m%d%Y'))
23 | ```
24 | 
25 | # Coding Task 
26 | 
27 | Suppose that you want to communicate to someone who has less context with the data. Set the `labs()` argument to include 
28 | 
29 | - A title 
30 | - A subtitle 
31 | - Descriptive Labels for the axes 
32 | - A label for what the colors mean. 
33 | 
34 | Each of the labels should be in **plain spoken language** and should be in sentence case: 
35 | 
36 | - The first letter of the first word should be capitalized
37 | - The rest of the letters should *not* be capitalized
38 | - Everything should be a word that you can speak aloud -- i.e. **not** a variable name. 
39 | 
40 | ```{r coding task}
41 | squirrel_subset %>%  
42 |   group_by(date_f, primary_fur_color) %>%  
43 |   summarise(count_of_squirrels = n()) %>%  
44 |   ggplot() + 
45 |   aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 
46 |   stat_smooth(se = FALSE) + 
47 |   labs(
48 |     title = 'There are a lot of grey squirrels',
49 |     subtitle = 'But, people are collecting data in later days',
50 |     x = 'Date of observation', 
51 |     y = 'Count of squirrels', 
52 |     color = 'Primary Fur Color'
53 |   )
54 | ```
55 | 


--------------------------------------------------------------------------------
/code/make_it_sparkle_solution.md:
--------------------------------------------------------------------------------
 1 | make\_it\_sparkle.Rmd
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 
 5 | ``` r
 6 | library(tidyverse)
 7 | library(ggplot2)
 8 | # install.packages('patchwork')
 9 | library(patchwork)
10 | 
11 | theme_set(theme_minimal())
12 | knitr::opts_chunk$set(dpi = 200)
13 | ```
14 | 
15 | ``` r
16 | squirrel_subset <- read.csv('squirrels_subset.csv')
17 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color))
18 | 
19 | squirrel_subset <- squirrel_subset %>%
20 |   mutate(date_f = as.Date.character(date, format = '%m%d%Y'))
21 | ```
22 | 
23 | # Coding Task
24 | 
25 | Suppose that you want to communicate to someone who has less context
26 | with the data. Set the `labs()` argument to include
27 | 
28 |   - A title
29 |   - A subtitle
30 |   - Descriptive Labels for the axes
31 |   - A label for what the colors mean.
32 | 
33 | Each of the labels should be in **plain spoken language** and should be
34 | in sentence case:
35 | 
36 |   - The first letter of the first word should be capitalized
37 |   - The rest of the letters should *not* be capitalized
38 |   - Everything should be a word that you can speak aloud – i.e. **not**
39 |     a variable name.
40 | 
41 | <!-- end list -->
42 | 
43 | ``` r
44 | squirrel_subset %>%  
45 |   group_by(date_f, primary_fur_color) %>%  
46 |   summarise(count_of_squirrels = n()) %>%  
47 |   ggplot() + 
48 |   aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 
49 |   stat_smooth(se = FALSE) + 
50 |   labs(
51 |     title = 'There are a lot of grey squirrels',
52 |     subtitle = 'But, people are collecting data in later days',
53 |     x = 'Date of observation', 
54 |     y = 'Count of squirrels', 
55 |     color = 'Primary Fur Color'
56 |   )
57 | ```
58 | 
59 |     ## `summarise()` regrouping output by 'date_f' (override with `.groups` argument)
60 | 
61 |     ## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
62 | 
63 | ![](make_it_sparkle_solution_files/figure-gfm/coding%20task-1.png)<!-- -->
64 | 


--------------------------------------------------------------------------------
/reading_calls/make_it_sparkle_solution.md:
--------------------------------------------------------------------------------
 1 | make\_it\_sparkle.Rmd
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 
 5 | ``` r
 6 | library(tidyverse)
 7 | library(ggplot2)
 8 | # install.packages('patchwork')
 9 | library(patchwork)
10 | 
11 | theme_set(theme_minimal())
12 | knitr::opts_chunk$set(dpi = 200)
13 | ```
14 | 
15 | ``` r
16 | squirrel_subset <- read.csv('squirrels_subset.csv')
17 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color))
18 | 
19 | squirrel_subset <- squirrel_subset %>%
20 |   mutate(date_f = as.Date.character(date, format = '%m%d%Y'))
21 | ```
22 | 
23 | # Coding Task
24 | 
25 | Suppose that you want to communicate to someone who has less context
26 | with the data. Set the `labs()` argument to include
27 | 
28 |   - A title
29 |   - A subtitle
30 |   - Descriptive Labels for the axes
31 |   - A label for what the colors mean.
32 | 
33 | Each of the labels should be in **plain spoken language** and should be
34 | in sentence case:
35 | 
36 |   - The first letter of the first word should be capitalized
37 |   - The rest of the letters should *not* be capitalized
38 |   - Everything should be a word that you can speak aloud – i.e. **not**
39 |     a variable name.
40 | 
41 | <!-- end list -->
42 | 
43 | ``` r
44 | squirrel_subset %>%  
45 |   group_by(date_f, primary_fur_color) %>%  
46 |   summarise(count_of_squirrels = n()) %>%  
47 |   ggplot() + 
48 |   aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 
49 |   stat_smooth(se = FALSE) + 
50 |   labs(
51 |     title = 'There are a lot of grey squirrels',
52 |     subtitle = 'But, people are collecting data in later days',
53 |     x = 'Date of observation', 
54 |     y = 'Count of squirrels', 
55 |     color = 'Primary Fur Color'
56 |   )
57 | ```
58 | 
59 |     ## `summarise()` regrouping output by 'date_f' (override with `.groups` argument)
60 | 
61 |     ## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
62 | 
63 | ![](make_it_sparkle_solution_files/figure-gfm/coding%20task-1.png)<!-- -->
64 | 


--------------------------------------------------------------------------------
/code/make_a_data_set_solution.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Make A Data Set"
 3 | author: 'w203: Statistics for Data Science'
 4 | date: "8/14/2020"
 5 | output: github_document
 6 | ---
 7 | 
 8 | ```{r setup, include=FALSE}
 9 | library(tidyverse)
10 | ```
11 | 
12 | # Paste Farm Code 
13 | 
14 | Start by pasting the code that you wrote to create the farm animals below. 
15 | 
16 | ```{r}
17 | ID <- 1:300  # one ID per animal
18 | animal <- c(rep('cow', 10), rep('sheep', 50), rep('goat', 40), rep('chicken', 70), rep('rabbit', 130))  # 10 + 50 + 40 + 70 + 130 = 300, in blocks by type
19 | petting_zoo <- rep('no', 300)
20 | petting_zoo[animal == 'rabbit'] <- 'yes'  # only rabbits are marked as petting-zoo animals
21 | 
22 | weight <- rep(NA, 300)  # start all-missing, then fill in by animal type
23 | weight[animal == 'cow']     <- rnorm(n = 10, mean = 1000, sd = 100)
24 | weight[animal == 'sheep']   <- rnorm(n = 50, mean = 100, sd = 2)
25 | weight[animal == 'goat']    <- rnorm(n = 40, mean = 40, sd = 2)
26 | weight[animal == 'chicken'] <- .2  # every chicken gets the same weight
27 | weight[animal == 'rabbit']  <- NA  # already NA; restated to make the missing rabbit data explicit
28 | 
29 | feed <- rep(NA, 300)  # feed = weight times an animal-specific proportion
30 | 
31 | feed[animal == 'cow']     <- weight[animal == 'cow'] * .03   
32 | feed[animal == 'sheep']   <- weight[animal == 'sheep'] *.02  
33 | feed[animal == 'goat']    <- weight[animal == 'goat'] * .07
34 | feed[animal == 'chicken'] <- weight[animal == 'chicken'] * .02
35 | feed[animal == 'rabbit']  <- NA  # no weight recorded, so no feed amount either
36 | ```
37 | 
38 | ## Combine into a dataset 
39 | 
40 | Now, combine all of these variables into a single dataset, called `tilden`. 
41 | 
42 | ```{r}
43 | tilden <- data.frame(
44 |   ID, animal, petting_zoo, weight, feed
45 | )
46 | ```
47 | 
48 | 
49 | ## Typecast 
50 | 
51 | Now do a little bit of mutating and type converting. 
52 | 
53 | - Since we know that each of the types of animals has a specific amount that they eat, let's label them "hungry boi" if they eat more than average; and "slender boi" if they eat less than average. (Sorry... I know this coding might be getting tedious!). 
54 | - Make each of these relative to the mean within that animal type. 
55 | 
56 | ```{r}
57 | tilden %>%
58 |   group_by(animal) %>%  # so mean(feed) below is the mean within each animal type
59 |   mutate(hungry_slender = ifelse(feed > mean(feed), 'hungry boi', 'slender boi'))  # label by how much they eat; rabbits have no feed data, so they stay NA
60 | ```


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
 1 | # Course Syllabus 
 2 | 
 3 | As a bridge course, this course is distinct from the main course offerings of the School of Information -- there are no synchronous discussion sections, and there are no graded assessments. Instead, this is a self-paced course of study designed to be completed in roughly a week before beginning a credit-bearing course. 
 4 | 
 5 | - [Syllabus](./syllabus.md)
 6 | - [Schedule](./schedule.md)
 7 | 
 8 | The class works closely from [R for Data Science](https://r4ds.had.co.nz "r4ds") to prepare students for the theoretical and applied work. We recommend the course for students preparing to take *Statistics for Data Scientists* (w203); but the course might also be a useful refresher for students who are taking *Experiments and Causal Inference* (w241) and *Statistical Methods for Discrete Response, Time Series, and Panel Data* (w271). 
 9 | 
10 | In contrast to many methods of learning new languages, this course focuses on working with high-level parts of the language -- namely plotting (from `ggplot2`) and data manipulation (from `dplyr`) before addressing lower-level parts of the language. Indeed, this course is at an even higher-level part of the language API than the *R for Data Science* textbook. 
11 | 
12 | I (Alex) made this choice deliberately, based on a model of spoken language-learning: When learning a new language, we try to build concepts and vocabulary, even if this means that we conjugate verbs incorrectly or use inefficient methods of expressing ourselves. Oftentimes, coding-language-learning tries to approach the language as though there is some axiomatic truth from which the language is derived. These "truths" are enshrined in style guides, deployed code, and bravado; but they hide a more important truth -- just start communicating and see what you can get done. 
13 | 
14 | You can probably navigate a busy market and appreciate the culture in a foreign land where you don't speak the language. Similarly, you can probably write clumsy code that expresses your intent and learn from your data. Just start writing and learning; we'll speak fluently soon enough. 
15 | 


--------------------------------------------------------------------------------
/code/base_operations.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Heading to the Little Farm"
 3 | author: 'w203: Statistics for Data Science'
 4 | output: github_document
 5 | ---
 6 | 
 7 | # The Little Farm 
 8 | 
 9 | Just around the corner from campus in Berkeley is the Tilden "Little Farm" -- it's a sort of hobby farm that has cows, goats, chickens, sheep, and a small menagerie of other cute animals. I think the idea is that kids can see animals -- but I've also got to admit that it is kind of weird to have a tiny farm in the middle of the neighborhood. 
10 | 
11 | ## Create Farm Data 
12 | 
13 | Let's use all the methods that you've just been working on to produce a dataset that represents the farm. Using methods that you're familiar with, create the following dataset. 
14 | 
15 | - A column that ranges from 1 - 300 called "ID" that is an ID for the animal. 
16 | - A column that has the type of animal that is being recorded: 
17 |     - There are 10 cows
18 |     - There are 50 sheep
19 |     - There are 40 goats 
20 |     - There are 70 chickens
21 |     - There are 130 rabbits 
22 | - A column that describes whether that animal belongs in the petting zoo 
23 | - A column that describes the weight of the animals: 
24 |     - Cow weights are normally distributed, with a mean of 1000kg and a sd of 100kg
25 |     - Sheep weights are normally distributed, with a mean of 100kg and a sd of 2kg
26 |     - Goat weights are normally distributed, with a mean of 40kg and a sd of 2kg 
27 |     - Chickens all weigh .2 kg
28 |     - Rabbits don't stand still long enough to be weighed, so there is no data
29 | - A column that describes how much feed that animal needs -- this is animal specific, and depends on how much the animal weighs. 
30 |     - A cow needs 3% of its body weight each day to stay alive 
31 |     - A sheep needs 2% of its body weight each day to stay alive
32 |     - A goat needs 7% of its body weight each day to stay alive 
33 |     - A chicken needs 2% of its body weight each day to stay alive
34 |     
35 | You are free to use `dplyr` actions to modify variables, if you like.     
36 |     
37 | ```{r}
38 | library(tidyverse)
39 | ```
40 |     
41 |   


--------------------------------------------------------------------------------
/reading_calls/additional_features.md:
--------------------------------------------------------------------------------
 1 | # aes(x= , y= , fill =) 
 2 | 
 3 | The x and y coordinate mappings are the most straightforward set of mappings that we typically interact with in the `aes()` function call. But, we can map more data into our plot by passing additional aesthetic arguments. 
 4 | 
 5 | The `color` and `fill` aesthetic mappings change the colors of the geometries that are plotted, depending on values that they are mapped to. When the data passed to these arguments are continuous, the colors will be placed onto a gradient scale; when the data passed to these arguments are categorical, a different color will be used for each level within the variable. Apart from `x` and `y`, `color` and `fill` are the two aesthetics that I use most frequently; probably because they're both used to communicate grouping structure within your plot. Two pieces of trivia, that actually end up mattering: 
 6 | 
 7 | - `color` defines the external “outline” of geometries. In some other plotting languages this might be called the "stroke"
 8 | - `fill` defines the internal colors of the geometries. 
 9 | 
10 | Something like a line (generated using `geom_line()`) doesn't have any internal space, and so line color is controlled using `color`. However, bars (`geom_bar()`) **do** have internal space so their colors are controlled by `fill`. 
11 | 
12 | The `shape` of a geometry can also be set by a variable in the data. Shape most often affects `geom_point()`, but it also applies to a few other geometries that you might find a use for. 
13 | 
14 | The `size` of a geometry determines the size (in mm) of the geometry. For points, this is the size of the point, for lines, this is the width of the line. 
15 | 
16 | # Coding task 
17 | 
18 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `additional_plot_features.Rmd`. 
19 | - This code will ask you to make scatter plots with different features.
20 | - Note that we're switching to working with R Markdown files now, which are enhancements over the .R files that we've been using to this point. We'll explain that in the file itself. 
21 | 
22 | 


--------------------------------------------------------------------------------
/code/select_solution.md:
--------------------------------------------------------------------------------
 1 | Select
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 8/13/2020
 5 | 
 6 | ``` r
 7 | library(tidyverse)
 8 | ```
 9 | 
10 | ``` r
11 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
12 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
13 | ```
14 | 
15 | # Task
16 | 
17 |   - From the launches data, `select()` only the columns that are related
18 |     to characteristics about the country
19 | 
20 | <!-- end list -->
21 | 
22 | ``` r
23 | launches %>% 
24 |   select(agency, state_code, agency_type)
25 | ```
26 | 
27 |     ## # A tibble: 5,726 x 3
28 |     ##    agency state_code agency_type
29 |     ##    <chr>  <chr>      <chr>      
30 |     ##  1 US     US         state      
31 |     ##  2 US     US         state      
32 |     ##  3 US     US         state      
33 |     ##  4 US     US         state      
34 |     ##  5 US     US         state      
35 |     ##  6 US     US         state      
36 |     ##  7 US     US         state      
37 |     ##  8 US     US         state      
38 |     ##  9 US     US         state      
39 |     ## 10 US     US         state      
40 |     ## # … with 5,716 more rows
41 | 
42 |   - From the launches data, `select()` only the columns that are related
43 |     to time
44 | 
45 | <!-- end list -->
46 | 
47 | ``` r
48 | launches %>% 
49 |   select(tag, JD, launch_date, launch_year)
50 | ```
51 | 
52 |     ## # A tibble: 5,726 x 4
53 |     ##    tag            JD launch_date launch_year
54 |     ##    <chr>       <dbl> <date>            <dbl>
55 |     ##  1 1967-065 2439671. 1967-06-29         1967
56 |     ##  2 1967-080 2439726. 1967-08-23         1967
57 |     ##  3 1967-096 2439775. 1967-10-11         1967
58 |     ##  4 1968-042 2440000. 1968-05-23         1968
59 |     ##  5 1968-092 2440153. 1968-10-23         1968
60 |     ##  6 1969-062 2440426. 1969-07-23         1969
61 |     ##  7 1970-012 2440629. 1970-02-11         1970
62 |     ##  8 1970-070 2440833. 1970-09-03         1970
63 |     ##  9 1971-012 2441000. 1971-02-17         1971
64 |     ## 10 1971-054 2441111. 1971-06-08         1971
65 |     ## # … with 5,716 more rows
66 | 


--------------------------------------------------------------------------------
/reading_calls/select_solution.md:
--------------------------------------------------------------------------------
 1 | Select
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 8/13/2020
 5 | 
 6 | ``` r
 7 | library(tidyverse)
 8 | ```
 9 | 
10 | ``` r
11 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
12 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
13 | ```
14 | 
15 | # Task
16 | 
17 |   - From the launches data, `select()` only the columns that are related
18 |     to characteristics about the country
19 | 
20 | <!-- end list -->
21 | 
22 | ``` r
23 | launches %>% 
24 |   select(agency, state_code, agency_type)
25 | ```
26 | 
27 |     ## # A tibble: 5,726 x 3
28 |     ##    agency state_code agency_type
29 |     ##    <chr>  <chr>      <chr>      
30 |     ##  1 US     US         state      
31 |     ##  2 US     US         state      
32 |     ##  3 US     US         state      
33 |     ##  4 US     US         state      
34 |     ##  5 US     US         state      
35 |     ##  6 US     US         state      
36 |     ##  7 US     US         state      
37 |     ##  8 US     US         state      
38 |     ##  9 US     US         state      
39 |     ## 10 US     US         state      
40 |     ## # … with 5,716 more rows
41 | 
42 |   - From the launches data, `select()` only the columns that are related
43 |     to time
44 | 
45 | <!-- end list -->
46 | 
47 | ``` r
48 | launches %>% 
49 |   select(tag, JD, launch_date, launch_year)
50 | ```
51 | 
52 |     ## # A tibble: 5,726 x 4
53 |     ##    tag            JD launch_date launch_year
54 |     ##    <chr>       <dbl> <date>            <dbl>
55 |     ##  1 1967-065 2439671. 1967-06-29         1967
56 |     ##  2 1967-080 2439726. 1967-08-23         1967
57 |     ##  3 1967-096 2439775. 1967-10-11         1967
58 |     ##  4 1968-042 2440000. 1968-05-23         1968
59 |     ##  5 1968-092 2440153. 1968-10-23         1968
60 |     ##  6 1969-062 2440426. 1969-07-23         1969
61 |     ##  7 1970-012 2440629. 1970-02-11         1970
62 |     ##  8 1970-070 2440833. 1970-09-03         1970
63 |     ##  9 1971-012 2441000. 1971-02-17         1971
64 |     ## 10 1971-054 2441111. 1971-06-08         1971
65 |     ## # … with 5,716 more rows
66 | 


--------------------------------------------------------------------------------
/code/how_to_summarise.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "How to Smooth"
 3 | author: 'w203: Statistics for Data Science'
 4 | output: github_document
 5 | ---
 6 | 
 7 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 8 | library(tidyverse)
 9 | library(ggplot2)
10 | # install.packages('patchwork')
11 | library(patchwork)
12 | 
13 | theme_set(theme_minimal())
14 | knitr::opts_chunk$set(dpi = 200)
15 | ```
16 | 
17 | ```{r load and mutate data}
18 | squirrel_subset <- read.csv('squirrels_subset.csv')
19 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color))  # keep only rows with a recorded fur color
20 | 
21 | squirrel_subset <- squirrel_subset %>%
22 |   mutate(date_f = as.Date.character(date, format = '%m%d%Y'))  # parse `date` as month-day-year; NOTE(review): the character method is called directly, presumably because `date` is read in as a number -- confirm
23 | ```
24 | 
25 | # Coding Task 
26 | 
27 | If you plot the grouped plot that we've shown a few times now, there's a lot of movement in the observations on a daily basis. This might lead a reader to focus on a particularly productive day on the squirrel census -- "I don't know why there were more this day!" -- which isn't really the point of the plot. Instead, the point of the plot is that there are many more Gray squirrels but that the number seems to be decreasing through the census time. 
28 | 
29 | ```{r}
30 | squirrel_subset %>%
31 |   group_by(date_f, primary_fur_color) %>%
32 |   summarise(count_of_colors = n()) %>% 
33 |   ggplot() + 
34 |   aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 
35 |   geom_line()
36 | ```
37 | 
38 | ## Your Task 
39 | 
40 | Change the plot so that instead the plot uses the `stat_smooth()` function -- to do so, you'll have to change the `geom_line()` call to something else. 
41 | 
42 | - First run the smoother as is
43 | - Then, suppress the reporting of the standard errors (you will likely have to look into the help documentation to figure out what the particular argument is that controls those error bars)
44 | - Then, re-plot again but change the variable that controls the "wiggliness" of the lines. Is there a level of this variable that you think best communicates the point you want to make with this data? 
45 | 
46 | ```{r}
47 | squirrel_subset %>%
48 |   group_by(date_f, primary_fur_color) %>%
49 |   summarise(count_of_colors = n()) %>% 
50 |   ggplot() + 
51 |   aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 
52 |   geom_line() 
53 | ```
54 | 


--------------------------------------------------------------------------------
/code/working_with_rstudio.R:
--------------------------------------------------------------------------------
 1 | ## Welcome! You've made it over to the IDE.
 2 | 
 3 | ## Any line that starts with one or more `#` will be commented out.
 4 | ## This means that if you run that line, nothing will actually occur in the
 5 | ## interpreter. 
 6 | 
 7 | ## To run this code below you can do the following:
 8 | ## - If you are on a Mac, on the line that you want to run you can press `command+return`
 9 | ##   which means to hold command and then press return.
10 | ## - If you are on a Windows or Linux machine, then you can run that line by pressing `alt+return`
11 | ##   which means to hold the alt key and press return.
12 | 
13 | ## When you run the first line one of two things might happen:
14 | ## 1. You might get an error because you haven't installed that package. If this happens,
15 | ##    look near the top of your Rstudio screen -- there should be a helper that asks if
16 | ##    you want to install this library. You do and can click "install".
17 | ## 2. If you've already installed that library, then it should load the package, which
18 | ##    you will see in the console below. 
19 | 
20 | 
21 | library(ggplot2)
22 | 
23 | ## Now, if you want to create some data, you can either
24 | ##  - Run the first line where you are creating the object `d` that is a data.frame; or,
25 | ##  - Highlight the region that you want to run and then run that region (using command+return
26 | ##    or alt+return). 
27 | 
28 | 
29 | d <- data.frame(
30 |   id = 1:1000, 
31 |   x  = rnorm(1000, mean = 0, sd = 1), 
32 |   y  = rnorm(1000, mean = 10, sd = 2),
33 |   color = sample(c('red', 'blue'), size = 1000, replace = TRUE)
34 | )
35 | 
36 | ## To produce the plot below, run these lines. Do you need to run all the lines?
37 | ## Or, can you run just the first? Why do you think the IDE chooses to behave this way?
38 | 
39 | ggplot(data = d, aes(x=x, y=y)) + 
40 |   geom_point()
41 | 
42 | ## Finally, you can run code that doesn't have any visible side effects.
43 | ## If you run the line below, what do you see in your console? Just that the line has run?
44 | ## But, now look into the `Environment` tab that is visible to you -- is there a record
45 | ## of this `mod` that you just created?
46 | 
47 | mod <- lm(y ~ x, data = d)
48 | 
49 | ## The model that you created is stored in the working memory and can be called by
50 | ## naming the object.
51 | 
52 | mod
53 | 
54 | ## If you want to use the summary function on the model, you can and you will see
55 | ## a different return printed to the console.
56 | 
57 | summary(mod)
58 | 


--------------------------------------------------------------------------------
/code/how_to_summarise_solution.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "How to Smooth"
 3 | author: 'w203: Statistics for Data Science'
 4 | output: github_document
 5 | ---
 6 | 
 7 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 8 | library(tidyverse)
 9 | library(ggplot2)
10 | # install.packages('patchwork')
11 | library(patchwork)
12 | 
13 | theme_set(theme_minimal())
14 | knitr::opts_chunk$set(dpi = 200)
15 | ```
16 | 
17 | ```{r load and mutate data}
18 | squirrel_subset <- read.csv('squirrels_subset.csv')
19 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color))  # keep only rows with a recorded fur color
20 | 
21 | squirrel_subset <- squirrel_subset %>%
22 |   mutate(date_f = as.Date.character(date, format = '%m%d%Y'))  # parse `date` as month-day-year; NOTE(review): the character method is called directly, presumably because `date` is read in as a number -- confirm
23 | ```
24 | 
25 | # Coding Task 
26 | 
27 | If you plot the grouped plot that we've shown a few times now, there's a lot of movement in the observations on a daily basis. This might lead a reader to focus on a particularly productive day on the squirrel census -- "I don't know why there were more this day!" -- which isn't really the point of the plot. Instead, the point of the plot is that there are many more Gray squirrels but that the number seems to be decreasing through the census time. 
28 | 
29 | ```{r}
30 | squirrel_subset %>%
31 |   group_by(date_f, primary_fur_color) %>%
32 |   summarise(count_of_colors = n()) %>% 
33 |   ggplot() + 
34 |   aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 
35 |   geom_line()
36 | ```
37 | 
38 | ## Your Task 
39 | 
40 | Change the plot so that instead the plot uses the `stat_smooth()` function -- to do so, you'll have to change the `geom_line()` call to something else. 
41 | 
42 | - First run the smoother as is
43 | - Then, suppress the reporting of the standard errors (you will likely have to look into the help documentation to figure out what the particular argument is that controls those error bars)
44 | - Then, re-plot again but change the variable that controls the "wiggliness" of the lines. Is there a level of this variable that you think best communicates the point you want to make with this data? 
45 | 
46 | ```{r}
47 | squirrel_subset %>%
48 |   group_by(date_f, primary_fur_color) %>%
49 |   summarise(count_of_colors = n()) %>% 
50 |   ggplot() + 
51 |   aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 
52 |   stat_smooth(span = .8, se = FALSE)
53 | ```
54 | > I think that this `span = 0.8` is my preferred span. You can see that it is *just* barely staying smooth -- there are a few points where this plot seems to try to pull away from the general line. Setting the span higher oversimplifies the trend, to my eye; setting it smaller recovers most of the "noise" that we wanted to smooth out of the data. 


--------------------------------------------------------------------------------
/code/working_with_rstudio_solution.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Interacting with the IDE Solutions"
 3 | author: 'w203: Statistics for Data Science'
 4 | date: "8/13/2020"
 5 | output: github_document
 6 | ---
 7 | ```{r}
 8 | ## Welcome! You've made it over to the IDE.
 9 | 
10 | ## Any line that starts with one or more `#` will be commented out.
11 | ## This means that if you run that line, nothing will actually occur in the
12 | ## interpreter. 
13 | 
14 | ## To run this code below you can do the following:
15 | ## - If you are on a Mac, on the line that you want to run you can press `command+return`
16 | ##   which means to hold command and then press return.
17 | ## - If you are on a Windows or Linux machine, then you can run that line by pressing `alt+return`
18 | ##   which means to hold the alt key and press return.
19 | 
20 | ## When you run the first line one of two things might happen:
21 | ## 1. You might get an error because you haven't installed that package. If this happens,
22 | ##    look near the top of your Rstudio screen -- there should be a helper that asks if
23 | ##    you want to install this library. You do and can click "install".
24 | ## 2. If you've already installed that library, then it should load the package, which
25 | ##    you will see in the console below. 
26 | 
27 | 
28 | library(ggplot2)
29 | 
30 | ## Now, if you want to create some data, you can either
31 | ##  - Run the first line where you are creating the object `d` that is a data.frame; or,
32 | ##  - Highlight the region that you want to run and then run that region (using command+return
33 | ##    or alt+return). 
34 | 
35 | 
36 | d <- data.frame(
37 |   id = 1:1000, 
38 |   x  = rnorm(1000, mean = 0, sd = 1), 
39 |   y  = rnorm(1000, mean = 10, sd = 2),
40 |   color = sample(c('red', 'blue'), size = 1000, replace = TRUE)
41 | )
42 | 
43 | ## To produce the plot below, run these lines. Do you need to run all the lines?
44 | ## Or, can you run just the first? Why do you think the IDE chooses to behave this way?
45 | 
46 | ggplot(data = d, aes(x=x, y=y)) + 
47 |   geom_point()
48 | 
49 | ## Finally, you can run code that doesn't have any visible side effects.
50 | ## If you run the line below, what do you see in your console? Just that the line has run?
51 | ## But, now look into the `Environment` tab that is visible to you -- is there a record
52 | ## of this `mod` that you just created?
53 | 
54 | mod <- lm(y ~ x, data = d)
55 | 
56 | ## The model that you created is stored in the working memory and can be called by
57 | ## naming the object.
58 | 
59 | mod
60 | 
61 | ## If you want to use the summary function on the model, you can and you will see
62 | ## a different return printed to the console. 
63 | 
64 | summary(mod)
65 | ```


--------------------------------------------------------------------------------
/code/make_scatter_plots_solution.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Make Scatter Plots Solution'
 3 | output: github_document
 4 | ---
 5 | 
 6 | ```{r setup, results = 'hide', warning=FALSE, message=FALSE}
 7 | library(tidyverse)
 8 | library(ggplot2)
 9 | theme_set(theme_minimal())
10 | knitr::opts_chunk$set(dpi = 200)
11 | ```
12 | 
13 | This output is best viewed over in github because we cannot render images into
14 | the ISVC. :tada:
15 | 
16 | # Load data: this should just work
17 | ```{r load data}
18 | squirrel_subset <- read_csv('./squirrels_subset.csv')
19 | ```
20 | 
21 | ## Create a basic plot of the observations on the lat and long axis
22 | 
23 | ```{r basic plot}
24 | ggplot(data = squirrel_subset) +
25 |   aes(x = long, y = lat) +
26 |   geom_point()
27 | ```
28 | 
29 | > This looks like central park to me! 
30 | 
31 | # Task 1: Color by age
32 | 
33 | Write code that will modify the plot so that it is colored by age. 
34 | 
35 | ```{r age plot}
36 | ggplot(data = squirrel_subset) +
37 |   aes(x = long, y = lat, color = age) +
38 |   geom_point()
39 | ```
40 | 
41 | # Task 2: Make Every Point Blue
42 | 
43 | - Now, write code that will make every point blue, not colored by age.
44 | - Notice that now this choice isn't an attribute of the data. 
45 | - Where does this mean that the `color` argument should go?
46 | 
47 | ```{r feeling blue}
48 | ggplot(data = squirrel_subset) +
49 |   aes(x = long, y = lat) +
50 |   geom_point(color = 'blue')
51 | ```
52 | 
53 | # Task 3: Color by the Fur Color
54 | 
55 | - Now, write code that will color the points by the variable `primary_fur_color`.
56 | - Notice that this now *is* an attribute of the data. So, where should the `color` argument go?
57 | 
58 | ```{r fur color plot}
59 | ggplot(data = squirrel_subset) +
60 |   aes(x = long, y = lat, color = primary_fur_color) +
61 |   geom_point()
62 | ```
63 | 
64 | # Task 4: Put onto non-euclidian space
65 | 
66 | - If you think carefully about this, we're mapping the geographic coordinate
67 |  system onto the euclidian coordinate system. This isn't a _huge_ deal in this
68 |  case because we're only covering central park. But, what's right is right...
69 | 
70 | - ggplot has the ability to map onto the geographic coordinate system using the
71 |  additional function `coord_quickmap()` (which is an approximation) or `coord_map()`
72 |  which is not an approximation
73 | 
74 | - Given what you understand about the layering system that ggplot uses, can
75 |  you add on this new layer that is the `coord_quickmap()` coordinate system?
76 | - If so, how much does it change the plot? 
77 | 
78 | ```{r non euclidian space}
79 | ggplot(data = squirrel_subset) +
80 |   aes(x = long, y = lat, color = primary_fur_color) +
81 |   coord_quickmap() +
82 |   geom_point()
83 | ```


--------------------------------------------------------------------------------
/reading_calls/summarize.md:
--------------------------------------------------------------------------------
 1 | # How to Summarise 
 2 | 
 3 | Up to this point, any time that we've made a mapping of data, I've done the mapping for you. Now it is time to take the gloves off and get to it yourself. 
 4 | 
 5 | When you summarise data you are making the active decision: 
 6 | 
 7 | > I would like to show less data -- I would like to produce some summary of the data that is a useful simplification 
 8 | 
 9 | As we move into w203, you'll see that these summaries have a formal definition as a _statistic_ where we're making a mapping of a random variable into a lower-dimensional position representation on the real number space; but that formalism isn't necessary yet. 
10 | 
11 | A summary of data could come in one of many forms -- in fact, maybe without knowing it, we've already been summarising data when we've been asking 
12 | 
13 | > "How many squirrels of each color did we observe on these days?"
14 | 
15 | The answer to that question takes the **whole** data series that we have, and produces a smaller, shorter representation that we're then reasoning about and plotting. 
16 | 
17 | Other forms of summary could be averages, medians, variances, or **any** other statistic. The usefulness of a summary can only be assessed in the context where it is being used, so without some other criteria it isn't really possible to _prefer_ one summary over another. 
18 | 
19 | # Summarising within dplyr 
20 | 
21 | To produce a summary of a variable within dplyr you use the `summarise` (alternatively spelled `summarize` -- the package author is from New Zealand and so chooses to spell in his native spelling. There is actually a **ton** of interesting sociology and ethnography about why the de facto language for coding is some variant of the English language. Personally, I'm a total mixing pot of spelling for this -- first, I don't really care about spelling; second, my mum is Canadian (and so spells one way) but I grew up in the USA (so spell the other way)). 
22 | 
23 | At its simplest, this might look as the following: 
24 | 
25 | ```
26 | data_frame %>% 
27 |   summarise(average_of_variable = mean(variable))
28 | ```
29 | 
30 | This process would start from the object called `data_frame`, and would then produce a new variable called `average_of_variable` that applies the `mean()` function against the variable called `variable`. 
31 | 
32 | As a result of this call, we would return back a single number, with the name `average_of_variable`.
33 | 
34 | # Coding task 
35 | 
36 | - Navigate to the [Datahub](https://ischool.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FUCB-MIDS%2Fr_bridge&urlpath=rstudio%2F&branch=master) and open the code `summarize.Rmd`. 
37 | - This code will ask you to produce a few short summaries of data. 
38 | 


--------------------------------------------------------------------------------
/code/summarize.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Summarizing a Series of Variables'
 3 | author: 'w203: Statistics for Data Science'
 4 | date: "8/13/2020"
 5 | output: github_document
 6 | ---
 7 | 
 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 9 | library(tidyverse)
10 | ```
11 | 
12 | ```{r load data, message=FALSE}
13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
15 | ```
16 | 
17 | # Task 
18 | 
19 | With the agencies data, produce a *meaningful* summary of the following variables: 
20 | 
21 | - The average number of launches
22 | - The median of the number of launches
23 | - The variance of the number of launches
24 | - A count of the unique number of agencies. (In the lecture, I wrote a method of accomplishing this using base tools; you can also use the `dplyr` function `n_distinct`)
25 | 
26 | ```{r}
27 | agencies %>%  
28 |   summarize(
29 |     launches_mean    = mean(count), 
30 |     launches_median  = median(count),
31 |     launches_var     = var(count), 
32 |     agencies_count   = n_distinct(agency, na.rm = TRUE),
33 |     agencies_count_2 = length(unique(agency))
34 |   )
35 | ```
36 | 
37 | > Notice a few things that I've done in this code: 
38 | > 1. Each of the summary variables that I've written starts with the same variable "slug" -- in this case `launches_`. I thought that this was a more useful variable name than `count`. But, more to the point, this way there is a consistent look-up (both visually and programmatically) for all the variables that are associated with this concept. If, instead, you wrote this as `mean_launches` which *does* have a more natural reading aloud, then the ordering of these variables might move apart when you consider, say `var_launches`. 
39 | > 2. I've added extra white-space after the new variables that I've created so that I can align the `=` signs. This is always allowed within the code style, and helps to set apart the variables that you're making from those that exist. Just compare the two blocks below to see. 
40 | 
41 |     agencies %>%  
42 |       summarize(
43 |         launches_mean    = mean(count), 
44 |         launches_median  = median(count),
45 |         launches_var     = var(count), 
46 |         agencies_count   = n_distinct(agency, na.rm = TRUE),
47 |         agencies_count_2 = length(unique(agency))
48 |       )
49 |   
50 |     agencies %>%  
51 |     summarize(
52 |       launches_mean    = mean(count), 
53 |       launches_median  = median(count),
54 |       launches_var     = var(count), 
55 |       agencies_count   = n_distinct(agency, na.rm = TRUE),
56 |       agencies_count_2 = length(unique(agency))
57 |     )
58 | 


--------------------------------------------------------------------------------
/code/summarize_solution.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Summarizing a Series of Variables'
 3 | author: 'w203: Statistics for Data Science'
 4 | date: "8/13/2020"
 5 | output: github_document
 6 | ---
 7 | 
 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 9 | library(tidyverse)
10 | ```
11 | 
12 | ```{r load data, message=FALSE}
13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
15 | ```
16 | 
17 | # Task 
18 | 
19 | With the agencies data, produce a *meaningful* summary of the following variables: 
20 | 
21 | - The average number of launches
22 | - The median of the number of launches
23 | - The variance of the number of launches
24 | - A count of the unique number of agencies. (In the lecture, I wrote a method of accomplishing this using base tools; you can also use the `dplyr` function `n_distinct`)
25 | 
26 | ```{r}
27 | agencies %>%  
28 |   summarize(
29 |     launches_mean    = mean(count), 
30 |     launches_median  = median(count),
31 |     launches_var     = var(count), 
32 |     agencies_count   = n_distinct(agency, na.rm = TRUE),
33 |     agencies_count_2 = length(unique(agency))
34 |   )
35 | ```
36 | 
37 | > Notice a few things that I've done in this code: 
38 | > 
39 | > 1. Each of the summary variables that I've written starts with the same variable "slug" -- in this case `launches_` or `agencies_`. I thought that this was a more useful variable name than `count`. But, more to the point, this way there is a consistent look-up (both visually and programmatically) for all the variables that are associated with this concept. If, instead, you wrote this as `mean_launches` which *does* have a more natural reading aloud, then the ordering of these variables might move apart when you consider, say `var_launches`. 
40 | > 2. I've added extra white-space after the new variables that I've created so that I can align the `=` signs. This is always allowed within the code style, and helps to set apart the variables that you're making from those that exist. Just compare the two blocks below to see. 
41 | 
42 |     agencies %>%  
43 |       summarize(
44 |         launches_mean = mean(count), 
45 |         launches_median = median(count),
46 |         launches_var = var(count), 
47 |         agencies_count = n_distinct(agency, na.rm = TRUE),
48 |         agencies_count_2 = length(unique(agency))
49 |       )
50 |   
51 |     agencies %>%  
52 |     summarize(
53 |       launches_mean    = mean(count), 
54 |       launches_median  = median(count),
55 |       launches_var     = var(count), 
56 |       agencies_count   = n_distinct(agency, na.rm = TRUE),
57 |       agencies_count_2 = length(unique(agency))
58 |     )
59 | 


--------------------------------------------------------------------------------
/code/group_by_summarize_solution.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Split, Apply, Combine'
 3 | author: 'w203: Statistics for Data Science'
 4 | date: "8/13/2020"
 5 | output: github_document
 6 | ---
 7 | 
 8 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 9 | library(tidyverse)
10 | ```
11 | 
12 | ```{r load data, message=FALSE}
13 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
14 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
15 | ```
16 | 
17 | # Task 
18 | 
19 | - Using the `launches` data, count the total number of launches per `launch_year`, grouped by `state_code`. 
20 | 
21 | ```{r}
22 | launches %>%  
23 |   group_by(state_code, launch_year) %>%  
24 |   summarise(launches_total = n()) 
25 | ```
26 | 
27 | > Roger that. 
28 | 
29 | - Then, using `arrange` answer the question: which year was the busiest for any state? 
30 | 
31 | ```{r}
32 | launches %>%  
33 |   group_by(state_code, launch_year) %>%  
34 |   summarise(launches_total = n()) %>%  
35 |   arrange(desc(launches_total))
36 | ```
37 | 
38 | > It looks like the busiest year for any state was 1982 in the Soviet Union, followed very closely by other years right around the same time for the Soviet Union.
39 | 
40 | - Then, using `filter` answer the question: what was the busiest year for the US? 
41 | 
42 | ```{r}
43 | launches %>%  
44 |   group_by(state_code, launch_year) %>%  
45 |   summarise(launches_total = n()) %>%  
46 |   filter(state_code == 'US') %>%  
47 |   arrange(desc(launches_total))
48 | ```
49 | 
50 | > The busiest year for the US was 1966. And, although it is a different decade than the busiest year for the Soviet Union, you can see that when a country makes an investment in their space exploration program, they seem to have pretty bursty output for launches. 
51 | 
52 | - Then, using another variable summary, answer the question: which country has the most variance in the per-year launches?
53 | 
54 | This is a little tricky, because I'm not asking you to include the launch year in the *final* grouping, instead, you're going to summarize *across* launch years. To do this, start as we have, but then part way through, drop the grouping by year. To do this, declare a new `group_by()` call, and then proceed with your summary.  
55 | 
56 | ```{r}
57 | launches %>%  
58 |   group_by(state_code, launch_year) %>%  # group by state and year
59 |   summarise(launches_total = n()) %>%  # count the launches in each state-year
60 |   group_by(state_code) %>%  # regroup by state only, so the next summary runs across years
61 |   summarise(launches_variance = var(launches_total, na.rm = TRUE)) %>%  # spell out TRUE: `T` is a plain variable that can be reassigned
62 |   arrange(desc(launches_variance))  # largest variance first
63 | ```
64 | 
65 | Think about how hard this would have been if you were writing this in some *other* idiom. It isn't that this was necessarily easy, but that it is possible. And, it is legible while you're doing it! Neat. 


--------------------------------------------------------------------------------
/code/make_a_data_set_solution.md:
--------------------------------------------------------------------------------
 1 | Make A Data Set
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 8/14/2020
 5 | 
 6 | # Paste Farm Code
 7 | 
 8 | Start by pasting the code that you wrote to create the farm animals
 9 | below.
10 | 
11 | ``` r
12 | ID <- 1:300
13 | animal <- c(rep('cow', 10), rep('sheep', 50), rep('goat', 40), rep('chicken', 70), rep('rabbit', 130))
14 | petting_zoo <- rep('no', 300)
15 | petting_zoo[animal == 'rabbit'] <- 'yes'
16 | 
17 | weight <- rep(NA, 300)
18 | weight[animal == 'cow']     <- rnorm(n = 10, mean = 1000, sd = 100)
19 | weight[animal == 'sheep']   <- rnorm(n = 50, mean = 100, sd = 2)
20 | weight[animal == 'goat']    <- rnorm(n = 40, mean = 40, sd = 2)
21 | weight[animal == 'chicken'] <- .2
22 | weight[animal == 'rabbit']  <- NA
23 | 
24 | feed <- rep(NA, 300)
25 | 
26 | feed[animal == 'cow']     <- weight[animal == 'cow'] * .03   
27 | feed[animal == 'sheep']   <- weight[animal == 'sheep'] *.02  
28 | feed[animal == 'goat']    <- weight[animal == 'goat'] * .07
29 | feed[animal == 'chicken'] <- weight[animal == 'chicken'] * .02
30 | feed[animal == 'rabbit']  <- NA
31 | ```
32 | 
33 | ## Combine into a dataset
34 | 
35 | Now, combine all of these variables into a single dataset, called
36 | `tilden`.
37 | 
38 | ``` r
39 | tilden <- data.frame(
40 |   ID, animal, petting_zoo, weight, feed
41 | )
42 | ```
43 | 
44 | ## Typecast
45 | 
46 | Now do a little bit of mutating and type converting.
47 | 
48 |   - Since we know that each of the types of animals has a specific
49 |     amount that they eat, let’s label them “hungry boi” if they eat more
50 |     than average; and “slender boi” if they eat less than average.
51 |     (Sorry… I know this coding might be getting tedious\!).
52 |   - Make each of these relative to the mean within that animal type.
53 | 
54 | <!-- end list -->
55 | 
56 | ``` r
57 | tilden %>%
58 |   group_by(animal) %>%  # make the mean below relative to each animal type
59 |   mutate(hungry_slender = ifelse(feed > mean(feed), 'hungry boi', 'slender boi'))  # label by how much they eat (feed); rabbits have NA feed, so their label is NA
60 | ```
61 | 
62 |     ## # A tibble: 300 x 6
63 |     ## # Groups:   animal [5]
64 |     ##       ID animal petting_zoo weight  feed hungry_slender
65 |     ##    <int> <chr>  <chr>        <dbl> <dbl> <chr>         
66 |     ##  1     1 cow    no           1269.  38.1 hungry boi    
67 |     ##  2     2 cow    no           1013.  30.4 slender boi   
68 |     ##  3     3 cow    no           1098.  32.9 hungry boi    
69 |     ##  4     4 cow    no            926.  27.8 slender boi   
70 |     ##  5     5 cow    no           1196.  35.9 hungry boi    
71 |     ##  6     6 cow    no           1036.  31.1 slender boi   
72 |     ##  7     7 cow    no            950.  28.5 slender boi   
73 |     ##  8     8 cow    no            981.  29.4 slender boi   
74 |     ##  9     9 cow    no            981.  29.4 slender boi   
75 |     ## 10    10 cow    no           1118.  33.5 hungry boi    
76 |     ## # … with 290 more rows
77 | 


--------------------------------------------------------------------------------
/reading_calls/make_a_data_set_solution.md:
--------------------------------------------------------------------------------
 1 | Make A Data Set
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 8/14/2020
 5 | 
 6 | # Paste Farm Code
 7 | 
 8 | Start by pasting the code that you wrote to create the farm animals
 9 | below.
10 | 
11 | ``` r
12 | ID <- 1:300
13 | animal <- c(rep('cow', 10), rep('sheep', 50), rep('goat', 40), rep('chicken', 70), rep('rabbit', 130))
14 | petting_zoo <- rep('no', 300)
15 | petting_zoo[animal == 'rabbit'] <- 'yes'
16 | 
17 | weight <- rep(NA, 300)
18 | weight[animal == 'cow']     <- rnorm(n = 10, mean = 1000, sd = 100)
19 | weight[animal == 'sheep']   <- rnorm(n = 50, mean = 100, sd = 2)
20 | weight[animal == 'goat']    <- rnorm(n = 40, mean = 40, sd = 2)
21 | weight[animal == 'chicken'] <- .2
22 | weight[animal == 'rabbit']  <- NA
23 | 
24 | feed <- rep(NA, 300)
25 | 
26 | feed[animal == 'cow']     <- weight[animal == 'cow'] * .03   
27 | feed[animal == 'sheep']   <- weight[animal == 'sheep'] *.02  
28 | feed[animal == 'goat']    <- weight[animal == 'goat'] * .07
29 | feed[animal == 'chicken'] <- weight[animal == 'chicken'] * .02
30 | feed[animal == 'rabbit']  <- NA
31 | ```
32 | 
33 | ## Combine into a dataset
34 | 
35 | Now, combine all of these variables into a single dataset, called
36 | `tilden`.
37 | 
38 | ``` r
39 | tilden <- data.frame(
40 |   ID, animal, petting_zoo, weight, feed
41 | )
42 | ```
43 | 
44 | ## Typecast
45 | 
46 | Now do a little bit of mutating and type converting.
47 | 
48 |   - Since we know that each of the types of animals has a specific
49 |     amount that they eat, let’s label them “hungry boi” if they eat more
50 |     than average; and “slender boi” if they eat less than average.
51 |     (Sorry… I know this coding might be getting tedious\!).
52 |   - Make each of these relative to the mean within that animal type.
53 | 
54 | <!-- end list -->
55 | 
56 | ``` r
57 | tilden %>%
58 |   group_by(animal) %>%  # make the mean below relative to each animal type
59 |   mutate(hungry_slender = ifelse(feed > mean(feed), 'hungry boi', 'slender boi'))  # label by how much they eat (feed); rabbits have NA feed, so their label is NA
60 | ```
61 | 
62 |     ## # A tibble: 300 x 6
63 |     ## # Groups:   animal [5]
64 |     ##       ID animal petting_zoo weight  feed hungry_slender
65 |     ##    <int> <chr>  <chr>        <dbl> <dbl> <chr>         
66 |     ##  1     1 cow    no           1269.  38.1 hungry boi    
67 |     ##  2     2 cow    no           1013.  30.4 slender boi   
68 |     ##  3     3 cow    no           1098.  32.9 hungry boi    
69 |     ##  4     4 cow    no            926.  27.8 slender boi   
70 |     ##  5     5 cow    no           1196.  35.9 hungry boi    
71 |     ##  6     6 cow    no           1036.  31.1 slender boi   
72 |     ##  7     7 cow    no            950.  28.5 slender boi   
73 |     ##  8     8 cow    no            981.  29.4 slender boi   
74 |     ##  9     9 cow    no            981.  29.4 slender boi   
75 |     ## 10    10 cow    no           1118.  33.5 hungry boi    
76 |     ## # … with 290 more rows
77 | 


--------------------------------------------------------------------------------
/reading_calls/mini_project.md:
--------------------------------------------------------------------------------
 1 | # Mini Project 
 2 | 
 3 | If you're having fun with this split-apply-combine framework here's an idea for about a 60 minute project that you could work on. 
 4 | 
 5 | - I want to emphasize **this is not necessary** but if you want to solidify everything that we've done to this point, it might help! 
 6 | 
 7 | # Seattle Bike Data 
 8 | 
 9 | As I'm writing this, I'm on my way to Seattle -- probably my favorite American city. The city has a fantastic culture around biking, especially for commuting to and from work. Because the city was interested in knowing just how many people ride along their bike trails, they installed counters that record the following information: 
10 | 
11 | - The date (and hour) of the observation;
12 | - The number of cyclists observed (total) in that hour; which is a combination of two other fields that are recorded: 
13 |     - How many cyclists are recorded going east-bound; and, 
14 |     - How many cyclists are recorded going west-bound. 
15 |     
16 | ## Writeup using this data 
17 | 
18 | The Seattle Times did a [write-up](https://www.seattletimes.com/seattle-news/transportation/what-we-can-learn-from-seattles-bike-counter-data/#interactive) using the data that you can read. 
19 | 
20 | ## Mini-project using this data 
21 | 
22 | The Seattle bike data is available [[here]](https://www.seattle.gov/transportation/projects-and-programs/programs/bike-program/bike-counters). But, somewhat annoyingly, for the busiest routes, they're only making a dashboard available. 
23 | 
24 | However, for the bike counter that is along the I-90 Bridge that connects Seattle to Mercer Island, they make the full dataset available. A link is [[here]](https://www.seattle.gov/transportation/projects-and-programs/programs/bike-program/bike-counters). 
25 | 
26 | With this data: 
27 | 
28 | 1. Download a .csv file of the data. 
29 | 2. Load this data
30 |     - If it is in your downloads folder: then you can read the data with the following call: `bike_data <- read_csv('~/Downloads/NAME_OF_THE_DATA_THAT_YOU_DOWNLOADED.csv')` where you will replace the `NAME_OF_THE_DATA_THAT_YOU_DOWNLOADED` callout with its actual name. 
31 | 3. See if you can work with the data to identify whether there are patterns: 
32 |     - Are there more east-bound or west-bound rides at certain parts of the day? Why might this be? 
33 |     - Are there months of the year that there are more (or fewer) total rides? 
34 |     - Are the weekends different from the weekdays? 
35 |     
36 | One small complication is that the `date` field hasn't actually been turned into something that you can use. To use it (without using more advanced POSIX time series types), you will have to `mutate` pieces off of this data. 
37 | 
38 | Here is a little example that will pull the year off this column (adjust `start` and `stop` to wherever the year sits in your file's date strings): 
39 | 
40 | ```
41 | bike_data %>% 
42 |     mutate(year = substr(Date, start = 1, stop = 4))
43 | ```
44 | 
45 | Have fun! 
46 | 


--------------------------------------------------------------------------------
/code/base_operations_solution.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Heading to the Little Farm"
 3 | author: 'w203: Statistics for Data Science'
 4 | output: github_document
 5 | ---
 6 | 
 7 | # The Little Farm 
 8 | 
 9 | Just around the corner from campus in Berkeley is the Tilden "Little Farm"; it's a sort of hobby farm that has cows, goats, chickens, sheep, and a small menagerie of other cute animals. I think the idea is that kids can see animals -- but I've also got to admit that it is kind of weird to have a tiny farm in the middle of the neighborhood. 
10 | 
11 | ## Create Farm Data 
12 | 
13 | Let's use all the methods that you've just been working on to produce a dataset that represents the farm. Using methods that you're familiar with, create the following dataset. 
14 | 
15 | - A column that ranges from 1 - 300 called "ID" that is an ID for the animal. 
16 | - A column that has the type of animal that is being recorded: 
17 |     - There are 10 cows
18 |     - There are 50 sheep
19 |     - There are 40 goats 
20 |     - There are 70 chickens
21 |     - There are 130 rabbits 
22 | - A column that describes whether that animal belongs in the petting zoo 
23 | - A column that describes the weight of the animals: 
24 |     - Cow weights are normally distributed, with a mean of 1000kg and a sd of 100kg
25 |     - Sheep are normally distributed, with a mean of 100kg and a sd of 2 kg
26 |     - Goats are normally distributed, with a mean of 40kg and a sd of 2kg 
27 |     - Chickens all weigh .2 kg
28 |     - Rabbits don't stand still long enough to be weighed, so there is no data
29 | - A column that describes how much feed that animal needs -- this is animal specific, and depends on how much the animal weighs. 
30 |     - A cow needs 3% of its body weight each day to stay alive 
31 |     - A sheep needs 2% of its body weight each day to stay alive
32 |     - A goat needs 7% of its body weight each day to stay alive 
33 |     - A chicken needs 2% of its body weight each day to stay alive
34 |     
35 | For this activity, you cannot use the `tidyverse`.    
36 |     
37 | ```{r}
38 | ID <- 1:300
39 | animal <- c(rep('cow', 10), rep('sheep', 50), rep('goat', 40), rep('chicken', 70), rep('rabbit', 130))
40 | petting_zoo <- rep('no', 300)
41 | petting_zoo[animal == 'rabbit'] <- 'yes'
42 | 
43 | weight <- rep(NA, 300)
44 | weight[animal == 'cow']     <- rnorm(n = 10, mean = 1000, sd = 100)
45 | weight[animal == 'sheep']   <- rnorm(n = 50, mean = 100, sd = 2)
46 | weight[animal == 'goat']    <- rnorm(n = 40, mean = 40, sd = 2)
47 | weight[animal == 'chicken'] <- .2
48 | weight[animal == 'rabbit']  <- NA
49 | 
50 | feed <- rep(NA, 300)
51 | 
52 | feed[animal == 'cow']     <- weight[animal == 'cow'] * .03   
53 | feed[animal == 'sheep']   <- weight[animal == 'sheep'] *.02  
54 | feed[animal == 'goat']    <- weight[animal == 'goat'] * .07
55 | feed[animal == 'chicken'] <- weight[animal == 'chicken'] * .02
56 | feed[animal == 'rabbit']  <- NA
57 | ```
58 | 


--------------------------------------------------------------------------------
/code/base_operations_solution.md:
--------------------------------------------------------------------------------
 1 | Heading to the Little Farm
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 
 5 | # The Little Farm
 6 | 
 7 | Just around the corner from campus in Berkeley is the Tilden “Little
 8 | Farm” its a sort of hobby farm that has cows, goats, chickens, sheep,
 9 | and a small menagerie of other cute animals. I think that idea is that
10 | kids can see animals – but I’ve also got to admit that it is kind of
11 | weird to have a tiny farm in the middle of the neighborhood.
12 | 
13 | ## Create Farm Data
14 | 
15 | Let’s use all the methods that you’ve just been working on to produce a
16 | dataset that represents the farm. Using methods that you’re familiar
17 | with, create the following dataset.
18 | 
19 |   - A column that ranges from 1 - 300 called “ID” that is an ID for the
20 |     animal.
21 |   - A column that has the type of animal that is being recorded:
22 |       - There are 10 cows
23 |       - There are 50 sheep
24 |       - There are 40 goats
25 |       - There are 70 chickens
26 |       - There are 130 rabbits
27 |   - A column that describes whether that animal belongs in the petting
28 |     zoo
29 |   - A column that describes the weight of the animals:
30 |       - Cows weight are normally distributed, with a mean of 1000kg and
31 |         a sd of 100kg
32 |       - Sheep are normally distributed, with a mean of 100kg and a sd of
33 |         2 kg
34 |       - Goats are normally distributed, with a mean of 40kg and a sd of
35 |         2kg
36 |       - Chickens all weigh .2 kg
37 |       - Rabbits don’t stand still long enough to be weighted, so there
38 |         is no data
39 |   - A column that described how much feed that animal needs – this is
40 |     animal specific, and depends on how much the animal weighs.
41 |       - A cow needs 3% of its body weight each day to stay alive
42 |       - A sheep needs 2% of its body weight each day to stay alive
43 |       - A goat needs 7% of its body weight each day to stay alive
44 |       - A chicken needs 2% of its body weight each day to stay alive
45 | 
46 | For this activity, you cannot use the `tidyverse`.
47 | 
48 | ``` r
49 | ID <- 1:300
50 | animal <- c(rep('cow', 10), rep('sheep', 50), rep('goat', 40), rep('chicken', 70), rep('rabbit', 130))
51 | petting_zoo <- rep('no', 300)
52 | petting_zoo[animal == 'rabbit'] <- 'yes'
53 | 
54 | weight <- rep(NA, 300)
55 | weight[animal == 'cow']     <- rnorm(n = 10, mean = 1000, sd = 100)
56 | weight[animal == 'sheep']   <- rnorm(n = 50, mean = 100, sd = 2)
57 | weight[animal == 'goat']    <- rnorm(n = 40, mean = 40, sd = 2)
58 | weight[animal == 'chicken'] <- .2
59 | weight[animal == 'rabbit']  <- NA
60 | 
61 | feed <- rep(NA, 300)
62 | 
63 | feed[animal == 'cow']     <- weight[animal == 'cow'] * .03   
64 | feed[animal == 'sheep']   <- weight[animal == 'sheep'] *.02  
65 | feed[animal == 'goat']    <- weight[animal == 'goat'] * .07
66 | feed[animal == 'chicken'] <- weight[animal == 'chicken'] * .02
67 | feed[animal == 'rabbit']  <- NA
68 | ```
69 | 


--------------------------------------------------------------------------------
/reading_calls/base_operations_solution.md:
--------------------------------------------------------------------------------
 1 | Heading to the Little Farm
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 
 5 | # The Little Farm
 6 | 
 7 | Just around the corner from campus in Berkeley is the Tilden “Little
 8 | Farm” its a sort of hobby farm that has cows, goats, chickens, sheep,
 9 | and a small menagerie of other cute animals. I think that idea is that
10 | kids can see animals – but I’ve also got to admit that it is kind of
11 | weird to have a tiny farm in the middle of the neighborhood.
12 | 
13 | ## Create Farm Data
14 | 
15 | Let’s use all the methods that you’ve just been working on to produce a
16 | dataset that represents the farm. Using methods that you’re familiar
17 | with, create the following dataset.
18 | 
19 |   - A column that ranges from 1 - 300 called “ID” that is an ID for the
20 |     animal.
21 |   - A column that has the type of animal that is being recorded:
22 |       - There are 10 cows
23 |       - There are 50 sheep
24 |       - There are 40 goats
25 |       - There are 70 chickens
26 |       - There are 130 rabbits
27 |   - A column that describes whether that animal belongs in the petting
28 |     zoo
29 |   - A column that describes the weight of the animals:
30 |       - Cows weight are normally distributed, with a mean of 1000kg and
31 |         a sd of 100kg
32 |       - Sheep are normally distributed, with a mean of 100kg and a sd of
33 |         2 kg
34 |       - Goats are normally distributed, with a mean of 40kg and a sd of
35 |         2kg
36 |       - Chickens all weigh .2 kg
37 |       - Rabbits don’t stand still long enough to be weighted, so there
38 |         is no data
39 |   - A column that described how much feed that animal needs – this is
40 |     animal specific, and depends on how much the animal weighs.
41 |       - A cow needs 3% of its body weight each day to stay alive
42 |       - A sheep needs 2% of its body weight each day to stay alive
43 |       - A goat needs 7% of its body weight each day to stay alive
44 |       - A chicken needs 2% of its body weight each day to stay alive
45 | 
46 | For this activity, you cannot use the `tidyverse`.
47 | 
48 | ``` r
49 | ID <- 1:300
50 | animal <- c(rep('cow', 10), rep('sheep', 50), rep('goat', 40), rep('chicken', 70), rep('rabbit', 130))
51 | petting_zoo <- rep('no', 300)
52 | petting_zoo[animal == 'rabbit'] <- 'yes'
53 | 
54 | weight <- rep(NA, 300)
55 | weight[animal == 'cow']     <- rnorm(n = 10, mean = 1000, sd = 100)
56 | weight[animal == 'sheep']   <- rnorm(n = 50, mean = 100, sd = 2)
57 | weight[animal == 'goat']    <- rnorm(n = 40, mean = 40, sd = 2)
58 | weight[animal == 'chicken'] <- .2
59 | weight[animal == 'rabbit']  <- NA
60 | 
61 | feed <- rep(NA, 300)
62 | 
63 | feed[animal == 'cow']     <- weight[animal == 'cow'] * .03   
64 | feed[animal == 'sheep']   <- weight[animal == 'sheep'] *.02  
65 | feed[animal == 'goat']    <- weight[animal == 'goat'] * .07
66 | feed[animal == 'chicken'] <- weight[animal == 'chicken'] * .02
67 | feed[animal == 'rabbit']  <- NA
68 | ```
69 | 


--------------------------------------------------------------------------------
/reading_calls/introduction_to_space_data.md:
--------------------------------------------------------------------------------
 1 | # Introduction to Space Data 
 2 | 
 3 | From here forward we're going to switch the data that we're using from squirrels to space. 
 4 | 
 5 | I came to this new data at [Tidy Tuesday](https://github.com/rfordatascience/tidytuesday/tree/master/data/2019/2019-01-15), a collection of data exploration challenges that come online every Tuesday. They're pretty neat sometimes! 
 6 | 
 7 | The data came to them from an article that the Economist published, which in turn came from two different sources. You can read about these sources on the website if you're interested.
 8 | 
 9 | Below I have reproduced the data file names and variable definitions. 
10 | 
11 | ## Data files
12 | 
13 | | File                     | Description               | Source                             |
14 | | --------                 | ----------------------    | ---------------------------------- |
15 | | [agencies](agencies.csv) | Space launch providers    | Jonathan McDowell; _The Economist_ |
16 | | [launches](launches.csv) | Individual space launches | Jonathan McDowell; _The Economist_ |
17 | 
18 | ## Codebook
19 | 
20 | ### launches
21 | 
22 | | variable    | definition                               |
23 | | ----------- | ---------------------------------------- |
24 | | tag         | Harvard or [COSPAR](https://en.wikipedia.org/wiki/International_Designator) id of launch |
25 | | JD          | [Julian Date](https://en.wikipedia.org/wiki/Julian_day) of launch |
26 | | launch_date | date of launch                           |
27 | | launch_year | year of launch                           |
28 | | type        | type of launch vehicle                   |
29 | | variant     | variant of launch vehicle                |
30 | | mission     |                                          |
31 | | agency      | launching agency                         |
32 | | state_code  | launching agency's state                 |
33 | | category    | success (O) or failure (F)               |
34 | | agency_type | type of agency                           |
35 | 
36 | ### agencies
37 | 
38 | | variable           | definition              |
39 | | ------------------ | ----------------------- |
40 | | agency             | org phase code          |
41 | | count              | number of launches      |
42 | | ucode              | org Ucode               |
43 | | state_code         | responsible state       |
44 | | type               | type of org             |
45 | | class              | class of org            |
46 | | tstart             | org/phase founding date |
47 | | tstop              | org/phase ending date   |
48 | | short_name         | short name              |
49 | | name               | full name               |
50 | | location           | plain english location  |
51 | | longitude          |                         |
52 | | latitude           |                         |
53 | | error              | uncertainty in long/lat |
54 | | parent             | parent org              |
55 | | short_english_name | english short name      |
56 | | english_name       | english full name       |
57 | | unicode_name       | unicode full name       |
58 | | agency_type        | type of agency          |
59 | 


--------------------------------------------------------------------------------
/code/how_to_summarise_solution.md:
--------------------------------------------------------------------------------
 1 | How to Smooth
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 
 5 | ``` r
 6 | library(tidyverse)
 7 | library(ggplot2)
 8 | # install.packages('patchwork')
 9 | library(patchwork)
10 | 
11 | theme_set(theme_minimal())
12 | knitr::opts_chunk$set(dpi = 200)
13 | ```
14 | 
15 | ``` r
16 | squirrel_subset <- read.csv('squirrels_subset.csv')
17 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color))
18 | 
19 | squirrel_subset <- squirrel_subset %>%
20 |   mutate(date_f = as.Date(as.character(date), format = '%m%d%Y'))  # coerce to text, then parse as month-day-year through the as.Date() generic rather than calling the S3 method directly
21 | ```
22 | 
23 | # Coding Task
24 | 
25 | If you plot the grouped plot that we’ve shown a few times now,
26 | there’s a lot of movement in the observations on a daily basis. This
27 | might prompt questions that focus on a particularly productive day on the
28 | squirrel census – “I don’t know why there were more this day\!” – which
29 | isn’t really the point of the plot. Instead, the point of the plot is
30 | that there are many more Gray squirrels but that the number seems to be
31 | decreasing through the census time.
32 | 
33 | ``` r
34 | squirrel_subset %>%
35 |   group_by(date_f, primary_fur_color) %>%
36 |   summarise(count_of_colors = n()) %>% 
37 |   ggplot() + 
38 |   aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 
39 |   geom_line()
40 | ```
41 | 
42 |     ## `summarise()` regrouping output by 'date_f' (override with `.groups` argument)
43 | 
44 | ![](how_to_summarise_solution_files/figure-gfm/unnamed-chunk-1-1.png)<!-- -->
45 | 
46 | ## Your Task
47 | 
48 | Change the plot so that instead the plot uses the `stat_smooth()`
49 | function – to do so, you’ll have to change the `geom_line()` call to
50 | something else.
51 | 
52 |   - First run the smoother as is
53 |   - Then, suppress the reporting of the standard errors (you will likely
54 |     have to look into the help documentation to figure out what the
55 |     particular argument is that controls those error bars)
56 |   - Then, re-plot again but change the variable that controls the
57 |     “wiggliness” of the lines. Is there a level of this variable that
58 |     you think best communicates the point you want to make with this
59 |     data?
60 | 
61 | <!-- end list -->
62 | 
63 | ``` r
64 | squirrel_subset %>%
65 |   group_by(date_f, primary_fur_color) %>%
66 |   summarise(count_of_colors = n()) %>% 
67 |   ggplot() + 
68 |   aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 
69 |   stat_smooth(span = .8, se = FALSE)
70 | ```
71 | 
72 |     ## `summarise()` regrouping output by 'date_f' (override with `.groups` argument)
73 | 
74 |     ## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
75 | 
76 | ![](how_to_summarise_solution_files/figure-gfm/unnamed-chunk-2-1.png)<!-- -->
77 | \> I think that this `span = 0.8` is my preferred span. You can see that
78 | it is *just* barely staying smooth – there are a few points where this
79 | plot seems to try to pull away from the general line. Setting the span
80 | higher is oversimplifying the trend, to my eye; setting it smaller
81 | recovers most of the “noise” that we wanted to smooth out of the data.
82 | 


--------------------------------------------------------------------------------
/reading_calls/how_to_summarise_solution.md:
--------------------------------------------------------------------------------
 1 | How to Smooth
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 
 5 | ``` r
 6 | library(tidyverse)
 7 | library(ggplot2)
 8 | # install.packages('patchwork')
 9 | library(patchwork)
10 | 
11 | theme_set(theme_minimal())
12 | knitr::opts_chunk$set(dpi = 200)
13 | ```
14 | 
15 | ``` r
16 | squirrel_subset <- read.csv('squirrels_subset.csv')
17 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color))
18 | 
19 | squirrel_subset <- squirrel_subset %>%
20 |   mutate(date_f = as.Date.character(date, format = '%m%d%Y'))
21 | ```
22 | 
23 | # Coding Task
24 | 
25 | If you plot the grouped plot that we’ve shown a few times now,
26 | there’s a lot of movement in the observations on a daily basis. This
27 | might lead questions to focus on a particularly productive day on the
28 | squirrel census – “I don’t know why there were more this day\!” – which
29 | isn’t really the point of the plot. Instead, the point of the plot is
30 | that there are many more Gray squirrels but that the number seems to be
31 | decreasing through the census time.
32 | 
33 | ``` r
34 | squirrel_subset %>%
35 |   group_by(date_f, primary_fur_color) %>%
36 |   summarise(count_of_colors = n()) %>% 
37 |   ggplot() + 
38 |   aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 
39 |   geom_line()
40 | ```
41 | 
42 |     ## `summarise()` regrouping output by 'date_f' (override with `.groups` argument)
43 | 
44 | ![](how_to_summarise_solution_files/figure-gfm/unnamed-chunk-1-1.png)<!-- -->
45 | 
46 | ## Your Task
47 | 
48 | Change the plot so that instead the plot uses the `stat_smooth()`
49 | function – to do so, you’ll have to change the `geom_line()` call to
50 | something else.
51 | 
52 |   - First run the smoother as is
53 |   - Then, suppress the reporting of the standard errors (you will likely
54 |     have to look into the help documentation to figure out what the
55 |     particular argument is that controls those error bars)
56 |   - Then, re-plot again but change the variable that controls the
57 |     “wiggliness” of the lines. Is there a level of this variable that
58 |     you think best communicates the point you want to make with this
59 |     data?
60 | 
61 | <!-- end list -->
62 | 
63 | ``` r
64 | squirrel_subset %>%
65 |   group_by(date_f, primary_fur_color) %>%
66 |   summarise(count_of_colors = n()) %>% 
67 |   ggplot() + 
68 |   aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 
69 |   stat_smooth(span = .8, se = FALSE)
70 | ```
71 | 
72 |     ## `summarise()` regrouping output by 'date_f' (override with `.groups` argument)
73 | 
74 |     ## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
75 | 
76 | ![](how_to_summarise_solution_files/figure-gfm/unnamed-chunk-2-1.png)<!-- -->
77 | \> I think that this `span = 0.8` is my preferred span. You can see that
78 | it is *just* barely staying smooth – there are a few points where this
79 | plot seems to try to pull away from the general line. Setting the span
80 | higher is oversimplifying the trend, to my eye; setting it smaller
81 | recovers most of the “noise” that we wanted to smooth out of the data.
82 | 


--------------------------------------------------------------------------------
/code/summarize_solution.md:
--------------------------------------------------------------------------------
 1 | Summarizing a Series of Variables
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 8/13/2020
 5 | 
 6 | ``` r
 7 | library(tidyverse)
 8 | ```
 9 | 
10 | ``` r
11 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
12 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
13 | ```
14 | 
15 | # Task
16 | 
17 | With the agencies data, produce a *meaningful* summary of the following
18 | variables:
19 | 
20 |   - The average number of launches
21 |   - The median of the number of launches
22 |   - The variance of the number of launches
23 |   - A count of the unique number of agencies. (In the lecture, I wrote a
24 |     method of accomplishing this using base tools; you can also use the
25 |     `dplyr` function `n_distinct`)
26 | 
27 | <!-- end list -->
28 | 
29 | ``` r
30 | agencies %>%  
31 |   summarize(
32 |     launches_mean    = mean(count), 
33 |     launches_median  = median(count),
34 |     launches_var     = var(count), 
35 |     agencies_count   = n_distinct(agency, na.rm = TRUE),
36 |     agencies_count_2 = length(unique(agency))
37 |   )
38 | ```
39 | 
40 |     ## # A tibble: 1 x 5
41 |     ##   launches_mean launches_median launches_var agencies_count agencies_count_2
42 |     ##           <dbl>           <dbl>        <dbl>          <int>            <int>
43 |     ## 1          77.1              12       46203.             74               74
44 | 
45 | > Notice a few things that I’ve done in this code:
46 | > 
47 | > 1.  Each of the summary variables that I’ve written starts with the
48 | >     same variable “slug” – in this case `launches_` or `agencies_`.
49 | >     I thought that this was a more useful variable name than `count`.
50 | >     But, more to the point, this way there is a consistent look-up
51 | >     (both visually and programmatically) for all the variables that are
52 | >     associated with this concept. If, instead, you wrote this as
53 | >     `mean_launches` which *does* have a more natural reading aloud,
54 | >     then the ordering of these variables might move apart when you
55 | >     consider, say `var_launches`.
56 | > 2.  I’ve added extra white-space after the new variables that I’ve
57 | >     created so that I can align the `=` signs. This is always allowed
58 | >     within the code style, and helps to set apart the variables that
59 | >     you’re making from those that exist. Just compare the two blocks
60 | >     below to see.
61 | 
62 |     agencies %>%  
63 |       summarize(
64 |         launches_mean = mean(count), 
65 |         launches_median = median(count),
66 |         launches_var = var(count), 
67 |         agencies_count = n_distinct(agency, na.rm = TRUE),
68 |         agencies_count_2 = length(unique(agency))
69 |       )
70 |     
71 |     agencies %>%  
72 |     summarize(
73 |       launches_mean    = mean(count), 
74 |       launches_median  = median(count),
75 |       launches_var     = var(count), 
76 |       agencies_count   = n_distinct(agency, na.rm = TRUE),
77 |       agencies_count_2 = length(unique(agency))
78 |     )
79 | 


--------------------------------------------------------------------------------
/reading_calls/summarize_solution.md:
--------------------------------------------------------------------------------
 1 | Summarizing a Series of Variables
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 8/13/2020
 5 | 
 6 | ``` r
 7 | library(tidyverse)
 8 | ```
 9 | 
10 | ``` r
11 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
12 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
13 | ```
14 | 
15 | # Task
16 | 
17 | With the agencies data, produce a *meaningful* summary of the following
18 | variables:
19 | 
20 |   - The average number of launches
21 |   - The median of the number of launches
22 |   - The variance of the number of launches
23 |   - A count of the unique number of agencies. (In the lecture, I wrote a
24 |     method of accomplishing this using base tools; you can also use the
25 |     `dplyr` function `n_distinct`)
26 | 
27 | <!-- end list -->
28 | 
29 | ``` r
30 | agencies %>%  
31 |   summarize(
32 |     launches_mean    = mean(count), 
33 |     launches_median  = median(count),
34 |     launches_var     = var(count), 
35 |     agencies_count   = n_distinct(agency, na.rm = TRUE),
36 |     agencies_count_2 = length(unique(agency))
37 |   )
38 | ```
39 | 
40 |     ## # A tibble: 1 x 5
41 |     ##   launches_mean launches_median launches_var agencies_count agencies_count_2
42 |     ##           <dbl>           <dbl>        <dbl>          <int>            <int>
43 |     ## 1          77.1              12       46203.             74               74
44 | 
45 | > Notice a few things that I’ve done in this code:
46 | > 
47 | > 1.  Each of the summary variables that I’ve written starts with the
48 | >     same variable “slug” – in this case `launches_` or `agencies_`.
49 | >     I thought that this was a more useful variable name than `count`.
50 | >     But, more to the point, this way there is a consistent look-up
51 | >     (both visually and programmatically) for all the variables that are
52 | >     associated with this concept. If, instead, you wrote this as
53 | >     `mean_launches` which *does* have a more natural reading aloud,
54 | >     then the ordering of these variables might move apart when you
55 | >     consider, say `var_launches`.
56 | > 2.  I’ve added extra white-space after the new variables that I’ve
57 | >     created so that I can align the `=` signs. This is always allowed
58 | >     within the code style, and helps to set apart the variables that
59 | >     you’re making from those that exist. Just compare the two blocks
60 | >     below to see.
61 | 
62 |     agencies %>%  
63 |       summarize(
64 |         launches_mean = mean(count), 
65 |         launches_median = median(count),
66 |         launches_var = var(count), 
67 |         agencies_count = n_distinct(agency, na.rm = TRUE),
68 |         agencies_count_2 = length(unique(agency))
69 |       )
70 |     
71 |     agencies %>%  
72 |     summarize(
73 |       launches_mean    = mean(count), 
74 |       launches_median  = median(count),
75 |       launches_var     = var(count), 
76 |       agencies_count   = n_distinct(agency, na.rm = TRUE),
77 |       agencies_count_2 = length(unique(agency))
78 |     )
79 | 


--------------------------------------------------------------------------------
/code/grouped_data.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Grouped Data"
 3 | author: 'w203: Statistics for Data Science'
 4 | output: github_document
 5 | ---
 6 | 
 7 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
 8 | library(tidyverse)
 9 | library(ggplot2)
10 | # install.packages('patchwork')
11 | library(patchwork)
12 | 
13 | theme_set(theme_minimal())
14 | knitr::opts_chunk$set(dpi = 200)
15 | ```
16 | 
17 | ```{r load and mutate data}
18 | squirrel_subset <- read.csv('squirrels_subset.csv')
19 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color))
20 | 
21 | squirrel_subset <- squirrel_subset %>%  
22 |   mutate(date_f = as.Date.character(date, format = '%m%d%Y'))
23 | ```
24 | 
25 | # Plots Groups with Colors
26 | 
27 | A few days ago, we plotted the count of squirrels that were observed over time. 
28 | 
29 | - Because it was by time, we reasoned that a line plot did a good job of illustrating the connections between the observations
30 | - The plot looked something like what is below
31 | 
32 | ```{r ungrouped plot}
33 | squirrel_subset %>%  
34 |   group_by(date_f) %>%  
35 |   summarise(count_of_squirrels = n()) %>%  
36 |   ggplot() + 
37 |   aes(x = date_f, y = count_of_squirrels) + 
38 |   geom_line()
39 | ```
40 | At that point, we asked a challenge question of you, that asked, 
41 | 
42 | > Could you also make this plot and represent the color of the squirrels in the plot? 
43 | 
44 | Here, I've written the first set of lines that would do this for you -- this takes the squirrel subset data, groups by date and fur color, and then counts the number of squirrels that are observed in each of these combinations. 
45 | 
46 | Complete the plot, by: 
47 | 
48 | - Adding a `ggplot()` call;
49 | - Adding an `aes()` call; and, 
50 | - Adding a `geom_line()` call to produce the line
51 | 
52 | Think, as you're drawing this plot -- what parts of this are mapping from data that I want to bring into the plot? What do I want to map that information onto? This might help to keep clear the code that you want to write. 
53 | 
54 | ```{r}
55 | squirrel_subset %>%
56 |   group_by(date_f, primary_fur_color) %>%
57 |   summarise(count_of_colors = n()) 
58 | ```
59 | 
60 | # Plot Groups with Different Graphs
61 | 
62 | Although I think that it is probably uniformly **less** effective of a representation in this case, you might instead want to plot each group on a different axis. 
63 | 
64 | - To do so, use the `facet_wrap()` function to place each of the `primary_fur_colors` onto their own set of axes. 
65 | - To help you along, I'll note that within `facet_wrap()` you will probably have to use the argument `facets = vars(primary_fur_color)`. This is a bit of a weird part of the `ggplot` api, and something that I hope they'll fix in the future.
66 | - However, at least they're fair about telling you that you'll have to use the `vars()` function -- look into the help documentation for this function.
67 | 
68 | ```{r}
69 | ?facet_wrap()
70 | ```
71 | 
72 | - Like before, I'll start you down the road for this plot by doing the data mapping. 
73 | - Which way communicates more clearly for you? Aligning the plots by rows? Or aligning them by columns? Why do you think this is? 
74 | 
75 | ```{r}
76 | squirrel_subset %>%  
77 |   group_by(date_f, primary_fur_color) %>%  
78 |   summarise(count_of_squirrels = n())
79 | ```
80 | 
81 | 


--------------------------------------------------------------------------------
/code/make_scatter_plots_solution.md:
--------------------------------------------------------------------------------
  1 | 
  2 | ``` r
  3 | library(tidyverse)
  4 | library(ggplot2)
  5 | theme_set(theme_minimal())
  6 | knitr::opts_chunk$set(dpi = 200)
  7 | ```
  8 | 
  9 | This output is best viewed over in github because we cannot render
 10 | images into the ISVC.
 11 | 
 12 | # Load data: this should just work
 13 | 
 14 | ``` r
 15 | squirrel_subset <- read_csv('./squirrels_subset.csv')
 16 | ```
 17 | 
 18 |     ## Parsed with column specification:
 19 |     ## cols(
 20 |     ##   long = col_double(),
 21 |     ##   lat = col_double(),
 22 |     ##   hectare = col_character(),
 23 |     ##   date = col_double(),
 24 |     ##   age = col_character(),
 25 |     ##   primary_fur_color = col_character()
 26 |     ## )
 27 | 
 28 | ## Create a basic plot of the observations on the lat and long axis
 29 | 
 30 | ``` r
 31 | ggplot(data = squirrel_subset) +
 32 |   aes(x = long, y = lat) +
 33 |   geom_point()
 34 | ```
 35 | 
 36 | ![](make_scatter_plots_solution_files/figure-gfm/basic%20plot-1.png)<!-- -->
 37 | 
 38 | > This looks like central park to me\!
 39 | 
 40 | # Task 1: Color by age
 41 | 
 42 | Write code that will modify the plot so that it is colored by age.
 43 | 
 44 | ``` r
 45 | ggplot(data = squirrel_subset) +
 46 |   aes(x = long, y = lat, color = age) +
 47 |   geom_point()
 48 | ```
 49 | 
 50 | ![](make_scatter_plots_solution_files/figure-gfm/age%20plot-1.png)<!-- -->
 51 | 
 52 | # Task 2: Make Every Point Blue
 53 | 
 54 |   - Now, write code that will make every point blue, not colored by age.
 55 |   - Notice that now this choice isn’t an attribute of the data.
 56 |   - Where does this mean that the `color` argument should go?
 57 | 
 58 | <!-- end list -->
 59 | 
 60 | ``` r
 61 | ggplot(data = squirrel_subset) +
 62 |   aes(x = long, y = lat) +
 63 |   geom_point(color = 'blue')
 64 | ```
 65 | 
 66 | ![](make_scatter_plots_solution_files/figure-gfm/feeling%20blue-1.png)<!-- -->
 67 | 
 68 | # Task 3: Color by the Fur Color
 69 | 
 70 |   - Now, write code that will color the points by the variable
 71 |     `primary_fur_color`.
 72 |   - Notice that this now *is* an attribute of the data. So, where should
 73 |     the `color` argument go?
 74 | 
 75 | <!-- end list -->
 76 | 
 77 | ``` r
 78 | ggplot(data = squirrel_subset) +
 79 |   aes(x = long, y = lat, color = primary_fur_color) +
 80 |   geom_point()
 81 | ```
 82 | 
 83 | ![](make_scatter_plots_solution_files/figure-gfm/fur%20color%20plot-1.png)<!-- -->
 84 | 
 85 | # Task 4: Put onto non-euclidian space
 86 | 
 87 |   - If you think carefully about this, we’re mapping the geographic
 88 |     coordinate system onto the euclidian coordinate system. This isn’t a
 89 |     *huge* deal in this case because we’re only covering central park.
 90 |     But, what’s right is right…
 91 | 
 92 |   - ggplot has the ability to map onto the geographic coordinate system
 93 |     using the additional function `coord_quickmap()` (which is an
 94 |     approximation) or `coord_map()` which is not an approximation
 95 | 
 96 |   - Given what you understand about the layering system that ggplot
 97 |     uses, can you add on this new layer that is the `coord_quickmap()`
 98 |     coordinate system?
 99 | 
100 |   - If so, how much does it change the plot?
101 | 
102 | <!-- end list -->
103 | 
104 | ``` r
105 | ggplot(data = squirrel_subset) +
106 |   aes(x = long, y = lat, color = primary_fur_color) +
107 |   coord_quickmap() +
108 |   geom_point()
109 | ```
110 | 
111 | ![](make_scatter_plots_solution_files/figure-gfm/non%20euclidian%20space-1.png)<!-- -->
112 | 


--------------------------------------------------------------------------------
/reading_calls/make_scatter_plots_solution.md:
--------------------------------------------------------------------------------
  1 | 
  2 | ``` r
  3 | library(tidyverse)
  4 | library(ggplot2)
  5 | theme_set(theme_minimal())
  6 | knitr::opts_chunk$set(dpi = 200)
  7 | ```
  8 | 
  9 | This output is best viewed over in github because we cannot render
 10 | images into the ISVC.
 11 | 
 12 | # Load data: this should just work
 13 | 
 14 | ``` r
 15 | squirrel_subset <- read_csv('./squirrels_subset.csv')
 16 | ```
 17 | 
 18 |     ## Parsed with column specification:
 19 |     ## cols(
 20 |     ##   long = col_double(),
 21 |     ##   lat = col_double(),
 22 |     ##   hectare = col_character(),
 23 |     ##   date = col_double(),
 24 |     ##   age = col_character(),
 25 |     ##   primary_fur_color = col_character()
 26 |     ## )
 27 | 
 28 | ## Create a basic plot of the observations on the lat and long axis
 29 | 
 30 | ``` r
 31 | ggplot(data = squirrel_subset) +
 32 |   aes(x = long, y = lat) +
 33 |   geom_point()
 34 | ```
 35 | 
 36 | ![](make_scatter_plots_solution_files/figure-gfm/basic%20plot-1.png)<!-- -->
 37 | 
 38 | > This looks like central park to me\!
 39 | 
 40 | # Task 1: Color by age
 41 | 
 42 | Write code that will modify the plot so that it is colored by age.
 43 | 
 44 | ``` r
 45 | ggplot(data = squirrel_subset) +
 46 |   aes(x = long, y = lat, color = age) +
 47 |   geom_point()
 48 | ```
 49 | 
 50 | ![](make_scatter_plots_solution_files/figure-gfm/age%20plot-1.png)<!-- -->
 51 | 
 52 | # Task 2: Make Every Point Blue
 53 | 
 54 |   - Now, write code that will make every point blue, not colored by age.
 55 |   - Notice that now this choice isn’t an attribute of the data.
 56 |   - Where does this mean that the `color` argument should go?
 57 | 
 58 | <!-- end list -->
 59 | 
 60 | ``` r
 61 | ggplot(data = squirrel_subset) +
 62 |   aes(x = long, y = lat) +
 63 |   geom_point(color = 'blue')
 64 | ```
 65 | 
 66 | ![](make_scatter_plots_solution_files/figure-gfm/feeling%20blue-1.png)<!-- -->
 67 | 
 68 | # Task 3: Color by the Fur Color
 69 | 
 70 |   - Now, write code that will color the points by the variable
 71 |     `primary_fur_color`.
 72 |   - Notice that this now *is* an attribute of the data. So, where should
 73 |     the `color` argument go?
 74 | 
 75 | <!-- end list -->
 76 | 
 77 | ``` r
 78 | ggplot(data = squirrel_subset) +
 79 |   aes(x = long, y = lat, color = primary_fur_color) +
 80 |   geom_point()
 81 | ```
 82 | 
 83 | ![](make_scatter_plots_solution_files/figure-gfm/fur%20color%20plot-1.png)<!-- -->
 84 | 
 85 | # Task 4: Put onto non-euclidian space
 86 | 
 87 |   - If you think carefully about this, we’re mapping the geographic
 88 |     coordinate system onto the euclidian coordinate system. This isn’t a
 89 |     *huge* deal in this case because we’re only covering central park.
 90 |     But, what’s right is right…
 91 | 
 92 |   - ggplot has the ability to map onto the geographic coordinate system
 93 |     using the additional function `coord_quickmap()` (which is an
 94 |     approximation) or `coord_map()` which is not an approximation
 95 | 
 96 |   - Given what you understand about the layering system that ggplot
 97 |     uses, can you add on this new layer that is the `coord_quickmap()`
 98 |     coordinate system?
 99 | 
100 |   - If so, how much does it change the plot?
101 | 
102 | <!-- end list -->
103 | 
104 | ``` r
105 | ggplot(data = squirrel_subset) +
106 |   aes(x = long, y = lat, color = primary_fur_color) +
107 |   coord_quickmap() +
108 |   geom_point()
109 | ```
110 | 
111 | ![](make_scatter_plots_solution_files/figure-gfm/non%20euclidian%20space-1.png)<!-- -->
112 | 


--------------------------------------------------------------------------------
/code/working_with_rstudio.md:
--------------------------------------------------------------------------------
 1 | Interacting with the IDE Solutions
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 8/13/2020
 5 | 
 6 | ``` r
 7 | ## Welcome! You've made it over to the IDE.
 8 | 
 9 | ## Any line that starts with one or more `#` will be commented out.
10 | ## This means that if you run that line, nothing will actually occur in the
11 | ## interpreter. 
12 | 
13 | ## To run this code below you can do the following:
14 | ## - If you are on a Mac, on the line that you want to run you can press `command+return`
15 | ##   which means to hold command and then press return.
16 | ## - If you are on a Windows or Linux machine, then you can run that line by pressing `alt+return`
17 | ##   which means to hold the alt key and press return.
18 | 
19 | ## When you run the first line one of two things might happen:
20 | ## 1. You might get an error because you haven't installed that package. If this happens,
21 | ##    look near the top of your Rstudio screen -- there should be a helper that asks if
22 | ##    you want to install this library. You do and can click "install".
23 | ## 2. If you've already installed that library, then it should load the package, which
24 | ##    you will see in the console below. 
25 | 
26 | 
27 | library(ggplot2)
28 | 
29 | ## Now, if you want to create some data, you can either
30 | ##  - Run the first line where you are creating the object `d` that is a data.frame; or,
31 | ##  - Highlight the region that you want to run and then run that region (using command+return
32 | ##    or alt+return). 
33 | 
34 | 
35 | d <- data.frame(
36 |   id = 1:1000, 
37 |   x  = rnorm(1000, mean = 0, sd = 1), 
38 |   y  = rnorm(1000, mean = 10, sd = 2),
39 |   color = sample(c('red', 'blue'), size = 1000, replace = TRUE)
40 | )
41 | 
42 | ## To produce the plot below, run these lines. Do you need to run all the lines?
43 | ## Or, can you run just the first? Why do you think the IDE chooses to behave this way?
44 | 
45 | ggplot(data = d, aes(x=x, y=y)) + 
46 |   geom_point()
47 | ```
48 | 
49 | ![](working_with_rstudio_files/figure-gfm/unnamed-chunk-1-1.png)<!-- -->
50 | 
51 | ``` r
52 | ## Finally, you can run code that doesn't have any visible side effects.
53 | ## If you run the line below, what do you see in your console? Just that the line has run?
54 | ## But, now look into the `Environment` tab that is visible to you -- is there a record
55 | ## of this `mod` that you just created?
56 | 
57 | mod <- lm(y ~ x, data = d)
58 | 
59 | ## The model that you created is stored in the working memory and can be called by
60 | ## naming the object.
61 | 
62 | mod
63 | ```
64 | 
65 |     ## 
66 |     ## Call:
67 |     ## lm(formula = y ~ x, data = d)
68 |     ## 
69 |     ## Coefficients:
70 |     ## (Intercept)            x  
71 |     ##     9.93161     -0.06939
72 | 
73 | ``` r
74 | ## If you want to use the summary function on the model, you can and you will see
75 | ## a different return printed to the console. 
76 | 
77 | summary(mod)
78 | ```
79 | 
80 |     ## 
81 |     ## Call:
82 |     ## lm(formula = y ~ x, data = d)
83 |     ## 
84 |     ## Residuals:
85 |     ##     Min      1Q  Median      3Q     Max 
86 |     ## -5.7238 -1.3353 -0.0679  1.3411  6.1006 
87 |     ## 
88 |     ## Coefficients:
89 |     ##             Estimate Std. Error t value Pr(>|t|)    
90 |     ## (Intercept)  9.93161    0.06249 158.925   <2e-16 ***
91 |     ## x           -0.06939    0.06334  -1.096    0.273    
92 |     ## ---
93 |     ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
94 |     ## 
95 |     ## Residual standard error: 1.976 on 998 degrees of freedom
96 |     ## Multiple R-squared:  0.001201,   Adjusted R-squared:  0.0002006 
97 |     ## F-statistic:   1.2 on 1 and 998 DF,  p-value: 0.2735
98 | 


--------------------------------------------------------------------------------
/code/working_with_rstudio_solution.md:
--------------------------------------------------------------------------------
 1 | Interacting with the IDE Solutions
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 8/13/2020
 5 | 
 6 | ``` r
 7 | ## Welcome! You've made it over to the IDE.
 8 | 
 9 | ## Any line that starts with one or more `#` will be commented out.
10 | ## This means that if you run that line, nothing will actually occur in the
11 | ## interpreter. 
12 | 
13 | ## To run this code below you can do the following:
14 | ## - If you are on a Mac, on the line that you want to run you can press `command+return`
15 | ##   which means to hold command and then press return.
16 | ## - If you are on a Windows or Linux machine, then you can run that line by pressing `alt+return`
17 | ##   which means to hold the alt key and press return.
18 | 
19 | ## When you run the first line one of two things might happen:
20 | ## 1. You might get an error because you haven't installed that package. If this happens,
21 | ##    look near the top of your Rstudio screen -- there should be a helper that asks if
22 | ##    you want to install this library. You do and can click "install".
23 | ## 2. If you've already installed that library, then it should load the package, which
24 | ##    you will see in the console below. 
25 | 
26 | 
27 | library(ggplot2)
28 | 
29 | ## Now, if you want to create some data, you can either
30 | ##  - Run the first line where you are creating the object `d` that is a data.frame; or,
31 | ##  - Highlight the region that you want to run and then run that region (using command+return
32 | ##    or alt+return). 
33 | 
34 | 
35 | d <- data.frame(
36 |   id = 1:1000, 
37 |   x  = rnorm(1000, mean = 0, sd = 1), 
38 |   y  = rnorm(1000, mean = 10, sd = 2),
39 |   color = sample(c('red', 'blue'), size = 1000, replace = TRUE)
40 | )
41 | 
42 | ## To produce the plot below, run these lines. Do you need to run all the lines?
43 | ## Or, can you run just the first? Why do you think the IDE chooses to behave this way?
44 | 
45 | ggplot(data = d, aes(x=x, y=y)) + 
46 |   geom_point()
47 | ```
48 | 
49 | ![](working_with_rstudio_solution_files/figure-gfm/unnamed-chunk-1-1.png)<!-- -->
50 | 
51 | ``` r
52 | ## Finally, you can run code that doesn't have any visible side effects.
53 | ## If you run the line below, what do you see in your console? Just that the line has run?
54 | ## But, now look into the `Environment` tab that is visible to you -- is there a record
55 | ## of this `mod` that you just created?
56 | 
57 | mod <- lm(y ~ x, data = d)
58 | 
59 | ## The model that you created is stored in the working memory and can be called by
60 | ## naming the object.
61 | 
62 | mod
63 | ```
64 | 
65 |     ## 
66 |     ## Call:
67 |     ## lm(formula = y ~ x, data = d)
68 |     ## 
69 |     ## Coefficients:
70 |     ## (Intercept)            x  
71 |     ##      9.9418       0.0556
72 | 
73 | ``` r
74 | ## If you want to use the summary function on the model, you can and you will see
75 | ## a different return printed to the console. 
76 | 
77 | summary(mod)
78 | ```
79 | 
80 |     ## 
81 |     ## Call:
82 |     ## lm(formula = y ~ x, data = d)
83 |     ## 
84 |     ## Residuals:
85 |     ##     Min      1Q  Median      3Q     Max 
86 |     ## -6.8812 -1.2280 -0.0065  1.3295  5.8167 
87 |     ## 
88 |     ## Coefficients:
89 |     ##             Estimate Std. Error t value Pr(>|t|)    
90 |     ## (Intercept)  9.94179    0.06298   157.9   <2e-16 ***
91 |     ## x            0.05560    0.06176     0.9    0.368    
92 |     ## ---
93 |     ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
94 |     ## 
95 |     ## Residual standard error: 1.991 on 998 degrees of freedom
96 |     ## Multiple R-squared:  0.0008113,  Adjusted R-squared:  -0.0001899 
97 |     ## F-statistic: 0.8103 on 1 and 998 DF,  p-value: 0.3683
98 | 


--------------------------------------------------------------------------------
/reading_calls/working_with_rstudio_solution.md:
--------------------------------------------------------------------------------
 1 | Interacting with the IDE Solutions
 2 | ================
 3 | w203: Statistics for Data Science
 4 | 8/13/2020
 5 | 
 6 | ``` r
 7 | ## Welcome! You've made it over to the IDE.
 8 | 
 9 | ## Any line that starts with one or more `#` will be commented out.
10 | ## This means that if you run that line, nothing will actually occur in the
11 | ## interpreter. 
12 | 
13 | ## To run this code below you can do the following:
14 | ## - If you are on a Mac, on the line that you want to run you can press `command+return`
15 | ##   which means to hold command and then press return.
16 | ## - If you are on a Windows or Linux machine, then you can run that line by pressing `alt+return`
17 | ##   which means to hold the alt key and press return.
18 | 
19 | ## When you run the first line one of two things might happen:
20 | ## 1. You might get an error because you haven't installed that package. If this happens,
21 | ##    look near the top of your Rstudio screen -- there should be a helper that asks if
22 | ##    you want to install this library. You do and can click "install".
23 | ## 2. If you've already installed that library, then it should load the package, which
24 | ##    you will see in the console below. 
25 | 
26 | 
27 | library(ggplot2)
28 | 
29 | ## Now, if you want to create some data, you can either
30 | ##  - Run the first line where you are creating the object `d` that is a data.frame; or,
31 | ##  - Highlight the region that you want to run and then run that region (using command+return
32 | ##    or alt+return). 
33 | 
34 | 
35 | d <- data.frame(
36 |   id = 1:1000, 
37 |   x  = rnorm(1000, mean = 0, sd = 1), 
38 |   y  = rnorm(1000, mean = 10, sd = 2),
39 |   color = sample(c('red', 'blue'), size = 1000, replace = TRUE)
40 | )
41 | 
42 | ## To produce the plot below, run these lines. Do you need to run all the lines?
43 | ## Or, can you run just the first? Why do you think the IDE chooses to behave this way?
44 | 
45 | ggplot(data = d, aes(x=x, y=y)) + 
46 |   geom_point()
47 | ```
48 | 
49 | ![](working_with_rstudio_solution_files/figure-gfm/unnamed-chunk-1-1.png)<!-- -->
50 | 
51 | ``` r
52 | ## Finally, you can run code that doesn't have any visible side effects.
53 | ## If you run the line below, what do you see in your console? Just that the line has run?
54 | ## But, now look into the `Environment` tab that is visible to you -- is there a record
55 | ## of this `mod` that you just created?
56 | 
57 | mod <- lm(y ~ x, data = d)
58 | 
59 | ## The model that you created is stored in the working memory and can be called by
60 | ## naming the object.
61 | 
62 | mod
63 | ```
64 | 
65 |     ## 
66 |     ## Call:
67 |     ## lm(formula = y ~ x, data = d)
68 |     ## 
69 |     ## Coefficients:
70 |     ## (Intercept)            x  
71 |     ##      9.9418       0.0556
72 | 
73 | ``` r
74 | ## If you want to use the summary function on the model, you can and you will see
75 | ## a different return printed to the console. 
76 | 
77 | summary(mod)
78 | ```
79 | 
80 |     ## 
81 |     ## Call:
82 |     ## lm(formula = y ~ x, data = d)
83 |     ## 
84 |     ## Residuals:
85 |     ##     Min      1Q  Median      3Q     Max 
86 |     ## -6.8812 -1.2280 -0.0065  1.3295  5.8167 
87 |     ## 
88 |     ## Coefficients:
89 |     ##             Estimate Std. Error t value Pr(>|t|)    
90 |     ## (Intercept)  9.94179    0.06298   157.9   <2e-16 ***
91 |     ## x            0.05560    0.06176     0.9    0.368    
92 |     ## ---
93 |     ## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
94 |     ## 
95 |     ## Residual standard error: 1.991 on 998 degrees of freedom
96 |     ## Multiple R-squared:  0.0008113,  Adjusted R-squared:  -0.0001899 
97 |     ## F-statistic: 0.8103 on 1 and 998 DF,  p-value: 0.3683
98 | 


--------------------------------------------------------------------------------
/code/mutate_solution.md:
--------------------------------------------------------------------------------
  1 | Mutating a New Variable
  2 | ================
  3 | w203: Statistics for Data Science
  4 | 8/13/2020
  5 | 
  6 | ``` r
  7 | library(tidyverse)
  8 | ```
  9 | 
 10 | ``` r
 11 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
 12 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
 13 | ```
 14 | 
 15 | # Mutate Task
 16 | 
 17 |   - Using the `agencies` data, create a series of variables that contain
 18 |     the log of the `count` of launches.
 19 | 
 20 | <!-- end list -->
 21 | 
 22 | ``` r
 23 | agencies_log <- agencies %>% 
 24 |   mutate(count_log = log(count))
 25 | 
 26 | agencies_log
 27 | ```
 28 | 
 29 |     ## # A tibble: 74 x 20
 30 |     ##    agency count ucode state_code type  class tstart tstop short_name name 
 31 |     ##    <chr>  <dbl> <chr> <chr>      <chr> <chr> <chr>  <chr> <chr>      <chr>
 32 |     ##  1 RVSN    1528 RVSN  SU         O/LA  D     1960   1991… RVSN       Rake…
 33 |     ##  2 UNKS     904 GUKOS SU         O/LA  D     1986 … 1991  UNKS       Upra…
 34 |     ##  3 NASA     469 NASA  US         O/LA… C     1958 … -     NASA       Nati…
 35 |     ##  4 USAF     388 USAF  US         O/LA… D     1947 … -     USAF       Unit…
 36 |     ##  5 AE       258 AE    F          O/LA  B     1980 … *     Arianespa… Aria…
 37 |     ##  6 AFSC     247 AFSC  US         LA    D     1961 … 1992… AFSC       US A…
 38 |     ##  7 VKSR     200 GUKOS RU         O/LA  D     1997 … 2001… VKS RVSN   Voen…
 39 |     ##  8 CALT     181 CALT  CN         LA/L… C     1957 … -     CALT       Zhon…
 40 |     ##  9 FKA      128 MOM   RU         O/LA  C     2004   2016… Roskosmos  Fede…
 41 |     ## 10 SAST     105 SBA   CN         O/LA… B     1993   -     SAST       Shan…
 42 |     ## # … with 64 more rows, and 10 more variables: location <chr>, longitude <chr>,
 43 |     ## #   latitude <chr>, error <chr>, parent <chr>, short_english_name <chr>,
 44 |     ## #   english_name <chr>, unicode_name <chr>, agency_type <chr>, count_log <dbl>
 45 | 
 46 | > But note that you don’t *have* to assign this to a new object.
 47 | 
 48 |   - Then, show only the columns that are called either `agency` or
 49 |     `contains()` the string “count”.
 50 | 
 51 | <!-- end list -->
 52 | 
 53 | ``` r
 54 | agencies_log %>%  
 55 |   select(agency, contains('count'))
 56 | ```
 57 | 
 58 |     ## # A tibble: 74 x 3
 59 |     ##    agency count count_log
 60 |     ##    <chr>  <dbl>     <dbl>
 61 |     ##  1 RVSN    1528      7.33
 62 |     ##  2 UNKS     904      6.81
 63 |     ##  3 NASA     469      6.15
 64 |     ##  4 USAF     388      5.96
 65 |     ##  5 AE       258      5.55
 66 |     ##  6 AFSC     247      5.51
 67 |     ##  7 VKSR     200      5.30
 68 |     ##  8 CALT     181      5.20
 69 |     ##  9 FKA      128      4.85
 70 |     ## 10 SAST     105      4.65
 71 |     ## # … with 64 more rows
 72 | 
 73 |   - Finally, `arrange()` these descending by `count`.
 74 | 
 75 | <!-- end list -->
 76 | 
 77 | ``` r
 78 | agencies_log %>%  
 79 |   select(agency, contains('count')) %>%  
 80 |   arrange(desc(count))
 81 | ```
 82 | 
 83 |     ## # A tibble: 74 x 3
 84 |     ##    agency count count_log
 85 |     ##    <chr>  <dbl>     <dbl>
 86 |     ##  1 RVSN    1528      7.33
 87 |     ##  2 UNKS     904      6.81
 88 |     ##  3 NASA     469      6.15
 89 |     ##  4 USAF     388      5.96
 90 |     ##  5 AE       258      5.55
 91 |     ##  6 AFSC     247      5.51
 92 |     ##  7 VKSR     200      5.30
 93 |     ##  8 CALT     181      5.20
 94 |     ##  9 FKA      128      4.85
 95 |     ## 10 SAST     105      4.65
 96 |     ## # … with 64 more rows
 97 | 
 98 | > HA\! It looks as though the data came in the door arranged by count.
 99 | > However, I would **never** suggest relying on this. If you want the
100 | > data arranged by count, write the code to do so. The upstream data
101 | > that comes into your analysis could change; potentially without you
102 | > knowing.
103 | > 
104 | > If you want your data to have some particular characteristic, you
105 | > should write the code that makes it be so.
106 | 


--------------------------------------------------------------------------------
/reading_calls/mutate_solution.md:
--------------------------------------------------------------------------------
  1 | Mutating a New Variable
  2 | ================
  3 | w203: Statistics for Data Science
  4 | 8/13/2020
  5 | 
  6 | ``` r
  7 | library(tidyverse)
  8 | ```
  9 | 
 10 | ``` r
 11 | agencies <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv')
 12 | launches <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv')
 13 | ```
 14 | 
 15 | # Mutate Task
 16 | 
 17 |   - Using the `agencies` data, create a series of variables that contain
 18 |     the log of the `count` of launches.
 19 | 
 20 | <!-- end list -->
 21 | 
 22 | ``` r
 23 | agencies_log <- agencies %>% 
 24 |   mutate(count_log = log(count))
 25 | 
 26 | agencies_log
 27 | ```
 28 | 
 29 |     ## # A tibble: 74 x 20
 30 |     ##    agency count ucode state_code type  class tstart tstop short_name name 
 31 |     ##    <chr>  <dbl> <chr> <chr>      <chr> <chr> <chr>  <chr> <chr>      <chr>
 32 |     ##  1 RVSN    1528 RVSN  SU         O/LA  D     1960   1991… RVSN       Rake…
 33 |     ##  2 UNKS     904 GUKOS SU         O/LA  D     1986 … 1991  UNKS       Upra…
 34 |     ##  3 NASA     469 NASA  US         O/LA… C     1958 … -     NASA       Nati…
 35 |     ##  4 USAF     388 USAF  US         O/LA… D     1947 … -     USAF       Unit…
 36 |     ##  5 AE       258 AE    F          O/LA  B     1980 … *     Arianespa… Aria…
 37 |     ##  6 AFSC     247 AFSC  US         LA    D     1961 … 1992… AFSC       US A…
 38 |     ##  7 VKSR     200 GUKOS RU         O/LA  D     1997 … 2001… VKS RVSN   Voen…
 39 |     ##  8 CALT     181 CALT  CN         LA/L… C     1957 … -     CALT       Zhon…
 40 |     ##  9 FKA      128 MOM   RU         O/LA  C     2004   2016… Roskosmos  Fede…
 41 |     ## 10 SAST     105 SBA   CN         O/LA… B     1993   -     SAST       Shan…
 42 |     ## # … with 64 more rows, and 10 more variables: location <chr>, longitude <chr>,
 43 |     ## #   latitude <chr>, error <chr>, parent <chr>, short_english_name <chr>,
 44 |     ## #   english_name <chr>, unicode_name <chr>, agency_type <chr>, count_log <dbl>
 45 | 
 46 | > But note that you don’t *have* to assign this to a new object.
 47 | 
 48 |   - Then, show only the columns that are called either `agency` or
 49 |     `contains()` the string “count”.
 50 | 
 51 | <!-- end list -->
 52 | 
 53 | ``` r
 54 | agencies_log %>%  
 55 |   select(agency, contains('count'))
 56 | ```
 57 | 
 58 |     ## # A tibble: 74 x 3
 59 |     ##    agency count count_log
 60 |     ##    <chr>  <dbl>     <dbl>
 61 |     ##  1 RVSN    1528      7.33
 62 |     ##  2 UNKS     904      6.81
 63 |     ##  3 NASA     469      6.15
 64 |     ##  4 USAF     388      5.96
 65 |     ##  5 AE       258      5.55
 66 |     ##  6 AFSC     247      5.51
 67 |     ##  7 VKSR     200      5.30
 68 |     ##  8 CALT     181      5.20
 69 |     ##  9 FKA      128      4.85
 70 |     ## 10 SAST     105      4.65
 71 |     ## # … with 64 more rows
 72 | 
 73 |   - Finally, `arrange()` these descending by `count`.
 74 | 
 75 | <!-- end list -->
 76 | 
 77 | ``` r
 78 | agencies_log %>%  
 79 |   select(agency, contains('count')) %>%  
 80 |   arrange(desc(count))
 81 | ```
 82 | 
 83 |     ## # A tibble: 74 x 3
 84 |     ##    agency count count_log
 85 |     ##    <chr>  <dbl>     <dbl>
 86 |     ##  1 RVSN    1528      7.33
 87 |     ##  2 UNKS     904      6.81
 88 |     ##  3 NASA     469      6.15
 89 |     ##  4 USAF     388      5.96
 90 |     ##  5 AE       258      5.55
 91 |     ##  6 AFSC     247      5.51
 92 |     ##  7 VKSR     200      5.30
 93 |     ##  8 CALT     181      5.20
 94 |     ##  9 FKA      128      4.85
 95 |     ## 10 SAST     105      4.65
 96 |     ## # … with 64 more rows
 97 | 
 98 | > HA\! It looks as though the data came in the door arranged by count.
 99 | > However, I would **never** suggest relying on this. If you want the
100 | > data arranged by count, write the code to do so. The upstream data
101 | > that comes into your analysis could change; potentially without you
102 | > knowing.
103 | > 
104 | > If you want your data to have some particular characteristic, you
105 | > should write the code that makes it be so.
106 | 


--------------------------------------------------------------------------------
/code/additional_plot_features.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Additional Plot Features"
 3 | author: 'w203: Statistics for Data Science'
 4 | output: github_document
 5 | ---
 6 | 
 7 | From here on out, we're going to start working in **R Markdown Files**. These files let us weave code and explanation together. What you're reading right here is explanation -- this isn't actually run by the interpreter in the console. 
 8 | 
 9 | But, what you're seeing just below -- in a "chunk" set off by three back-ticks -- is a code block. 
10 | 
11 | If you look at line 14, and run this line just as you would have if this were an .R file (it is a .Rmd file) what happens? 
12 | 
13 | ```{r}
14 | print('Hello world.')
15 | ```
16 | 
17 | Now the result, rather than being printed only to the console, is printed both in the console and in line with this code. One piece that you might have noticed is the curly braces after the first set of back-ticks. This is telling the interpreter that this is code that is written in the R language. It is possible (though we won't do this for now) to write python, julia, c++ or other code within these chunks and have the interpreter evaluate them. 
18 | 
19 | Below, I'm going to write a chunk that I've called "setup" that is going to load libraries and set themes and set parameters for plots. I've also added extra arguments to this code chunk declaration -- `results = 'hide', warning=FALSE, message=FALSE`. These control how the chunk works, but let's not dig **too** far into that just yet. 
20 | 
21 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
22 | library(tidyverse)
23 | library(ggplot2)
24 | theme_set(theme_minimal())
25 | knitr::opts_chunk$set(dpi = 200)
26 | ```
27 | 
28 | In this chunk, which I've called "load data" I'm going to load the data, and then create the date field that we've used all along. 
29 | 
30 | ```{r load data}
31 | squirrel_subset <- read_csv('./squirrels_subset.csv')
32 | squirrel_subset <- squirrel_subset %>%
33 |   mutate(date_f = as.Date.character(date, format = '%m%d%Y'))
34 | ```
35 | 
36 | Now, you can see below that I'm creating a section in the markdown space by starting the line with a "hash". If this were in a code chunk, it would comment out the line, but here in the markdown space, this will create a level-1 heading. Two hashes would create a level-2 heading (one further indent). 
37 | 
38 | # Data Reminder 
39 | 
40 | As a reminder, we're working with data that is from the "census of squirrels" in NYC's central park. 
41 | Each row is an observation of a squirrel, and each observation has as much data as possible about the observation. 
42 | 
43 | ## Previous Plots 
44 | 
45 | The last set of plots that we made with `geom_point()` were simple scatter plots where we passed: 
46 | 
47 | - The `long`; 
48 | - The `lat`; and, 
49 | - One other feature. 
50 | 
51 | ```{r the old plot}
52 | ggplot(data = squirrel_subset) + 
53 |   aes(x = long, y = lat) + 
54 |   geom_point() + 
55 |   coord_quickmap()
56 | ```
57 | 
58 | 
59 | # This plot 
60 | 
61 | For this plot, I'd like you to represent four data series in a single plot. Because each of these is information in the dataset that we want to control the way the plot is built, each of the series should go into the `aes()` function. 
62 | 
63 | 1. The `long`; 
64 | 2. The `lat`; 
65 | 3. The `age`; 
66 | 4. The `primary_fur_color`. 
67 | 
68 | I'm sure you're wondering: what aesthetic options are available to me? To look into this, let's look into the documentation for `geom_point()`, because `geom_point()` inherits its aesthetics from the `aes()` function. 
69 | 
70 | When you run the line below, your RStudio should then open a help browser.
71 | 
72 | ```{r geom_point_help}
73 | ?geom_point()
74 | ```
75 | 
76 | I see several aesthetics listed: 
77 | 
78 | - x
79 | - y 
80 | - alpha, ... and many more. 
81 | 
82 | Before you start to build the plot -- think about what you might want to show -- you might go as far as to write down the plot that you want to create (as I've just suggested in the lecture) but that isn't strictly necessary because this is reasonably straightforward data. 
83 | Once you know what you'd like to show, pass data series into these aesthetics until you have created a plot that you think does a good job of representing your idea. 
84 | 
85 | ```{r}
86 | 
87 | ```


--------------------------------------------------------------------------------
/code/grouped_data_solution.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Grouped Data"
  3 | author: 'w203: Statistics for Data Science'
  4 | output: github_document
  5 | ---
  6 | 
  7 | ```{r setup, results='hide', warning=FALSE, message=FALSE}
  8 | library(tidyverse)
  9 | library(ggplot2)
 10 | # install.packages('patchwork')
 11 | library(patchwork)
 12 | 
 13 | theme_set(theme_minimal())
 14 | knitr::opts_chunk$set(dpi = 200)
 15 | ```
 16 | 
 17 | ```{r load and mutate data}
 18 | squirrel_subset <- read.csv('squirrels_subset.csv')
 19 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color))
 20 | 
 21 | squirrel_subset <- squirrel_subset %>%  
 22 |   mutate(date_f = as.Date.character(date, format = '%m%d%Y'))
 23 | 
 24 | ```
 25 | 
 26 | # Plot Groups with Colors
 27 | 
 28 | A few days ago, we plotted the count of squirrels that were observed over time. 
 29 | 
 30 | - Because it was by time, we reasoned that a line plot did a good job of illustrating the connections between the observations
 31 | - The plot looked something like what is below
 32 | 
 33 | ```{r ungrouped plot}
 34 | squirrel_subset %>%  
 35 |   group_by(date_f) %>%  
 36 |   summarise(count_of_squirrels = n()) %>%  
 37 |   ggplot() + 
 38 |   aes(x = date_f, y = count_of_squirrels) + 
 39 |   geom_line()
 40 | ```
 41 | At that point, we asked a challenge question of you, that asked, 
 42 | 
 43 | > Could you also make this plot and represent the color of the squirrels in the plot? 
 44 | 
 45 | Here, I've written the first set of lines that would do this for you -- this takes the squirrel subset data, groups by date and fur color, and then counts the number of squirrels that are observed in each of these combinations. 
 46 | 
 47 | Complete the plot, by: 
 48 | 
 49 | - Adding a `ggplot()` call;
 50 | - Adding an `aes()` call; and, 
 51 | - Adding a `geom_line()` call to produce the line
 52 | 
 53 | Think, as you're drawing this plot -- what parts of this are mapping from data that I want to bring into the plot? What do I want to map that information onto? This might help to keep clear the code that you want to write. 
 54 | 
 55 | ```{r}
 56 | squirrel_subset %>%
 57 |   group_by(date_f, primary_fur_color) %>%
 58 |   summarise(count_of_colors = n()) %>% 
 59 |   ggplot() + 
 60 |   aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 
 61 |   geom_line()
 62 | ```
 63 | 
 64 | # Plot Groups with Different Graphs
 65 | 
 66 | Although I think that it is probably uniformly **less** effective of a representation in this case, you might instead want to plot each group on a different axis. 
 67 | 
 68 | - To do so, use the `facet_wrap()` function to place each of the `primary_fur_colors` onto their own set of axes. 
 69 | - To help you along, I'll note that within `facet_wrap()` you will probably have to use the argument `facets = vars(primary_fur_color)`. This is a bit of a weird part of the `ggplot` api, and something that I hope they'll fix in the future.
 70 | - However, at least they're fair about telling you that you'll have to use the `vars()` function -- look into the help documentation for this function.
 71 | 
 72 | ```{r}
 73 | ?facet_wrap()
 74 | ```
 75 | 
 76 | - Like before, I'll start you down the road for this plot by doing the data mapping. 
 77 | - Which way communicates more clearly for you? Aligning the plots by rows? Or aligning them by columns? Why do you think this is? 
 78 | 
 79 | ```{r}
 80 | squirrel_subset %>%  
 81 |   group_by(date_f, primary_fur_color) %>%  
 82 |   summarise(count_of_squirrels = n()) %>%  
 83 |   ggplot() + 
 84 |   aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 
 85 |   facet_wrap(facets = vars(primary_fur_color), nrow = 3) + 
 86 |   geom_line()
 87 | ```
 88 | 
 89 | > For me, I think that these plots work a **little** bit better when they are stacked vertically, because then I can see that all the dates align. Of course, immediately upon realizing this, then it becomes very clear that this plot would be more successful if it were to be placed on a single set of axes. 
 90 | > 
 91 | > This kind of iterative plot making is (or at least can be) quite fun; and, once you realize that this mapping doesn't work better, you can return to the single set of axes. 
 92 | 
 93 | ```{r}
 94 | squirrel_subset %>%
 95 |   group_by(date_f, primary_fur_color) %>%
 96 |   summarise(count_of_colors = n()) %>% 
 97 |   ggplot() + 
 98 |   aes(x = date_f, y = count_of_colors, color = primary_fur_color) + 
 99 |   geom_line()
100 | ```
101 | 
102 | 


--------------------------------------------------------------------------------
/code/code_in_videos.R:
--------------------------------------------------------------------------------
  1 | library(tidyverse)
  2 | library(ggplot2)
  3 | library(patchwork)
  4 | 
  5 | squirrel_subset <- read.csv('squirrels_subset.csv')
  6 | squirrel_subset <- filter(squirrel_subset, !is.na(primary_fur_color))
  7 | 
  8 | squirrel_subset <- squirrel_subset %>%  
  9 |   mutate(date_f = as.Date.character(date, format = '%m%d%Y'))
 10 | 
 11 | squirrel_subset %>%  
 12 |   group_by(date_f) %>%  
 13 |   summarise(count_of_squirrels = n()) %>%  
 14 |   ggplot() + 
 15 |   aes(x = date_f, y = count_of_squirrels) + 
 16 |   geom_line()
 17 | 
 18 | squirrel_subset %>%  
 19 |   ggplot() + 
 20 |   aes(primary_fur_color) + 
 21 |   geom_bar() + 
 22 |   facet_grid(cols = vars(date_f))
 23 | 
 24 | squirrel_subset %>%  
 25 |   ggplot() + 
 26 |   aes(date_f) + 
 27 |   geom_bar() + 
 28 |   facet_grid(cols = vars(primary_fur_color))
 29 | 
 30 | squirrel_subset %>%
 31 |   group_by(date_f, primary_fur_color) %>%
 32 |   summarise(count_of_colors = n()) %>%
 33 |   ggplot() +
 34 |   aes(x = date_f, y = count_of_colors, color = primary_fur_color) +
 35 |   geom_line()
 36 | 
 37 | p1 <- squirrel_subset %>%  
 38 |   ggplot() + 
 39 |   aes(x = long, y = lat, color = primary_fur_color) + 
 40 |   geom_point()  
 41 | 
 42 | p2 <- squirrel_subset %>%  
 43 |   ggplot() + 
 44 |   aes(long, fill = primary_fur_color) + 
 45 |   geom_histogram(stat = 'density')
 46 | 
 47 | p1 / p2
 48 | 
 49 | 
 50 | 
 51 | 
 52 | squirrel_subset %>%
 53 |   group_by(date_f) %>%  # one group per value of date_f (was `group-by`, which R parses as `group - by(...)`)
 54 |   summarise(count = n()) %>%  # number of rows (observations) in each date group
 55 |   ggplot() +
 56 |   aes(x = date_f, y = count) + 
 57 |   geom_line()
 58 | 
 59 | ggplot(data = squirrel_subset) + 
 60 |   aes(x = long, y = lat, color = primary_fur_color) + 
 61 |   geom_point() + 
 62 |   coord_map()
 63 | 
 64 | 
 65 | squirrel_subset %>%  
 66 |   group_by(date_f, primary_fur_color) %>%  
 67 |   summarise(count_of_squirrels = n()) %>%  
 68 |   ggplot() + 
 69 |   aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 
 70 |   geom_line() + 
 71 |   theme_minimal()
 72 | 
 73 | squirrel_subset %>%  
 74 |   ggplot() + 
 75 |   aes(x = date_f) + 
 76 |   geom_histogram() + 
 77 |   facet_wrap(vars(primary_fur_color), ncol = 1)
 78 | 
 79 | 
 80 | squirrel_subset %>%  
 81 |   group_by(date_f, primary_fur_color) %>%  
 82 |   summarise(count_of_squirrels = n()) %>%  
 83 |   ggplot() + 
 84 |   aes(x = date_f, y = count_of_squirrels) + 
 85 |   facet_wrap(facets = vars(primary_fur_color), nrow = 1) + 
 86 |   geom_line()
 87 | 
 88 | squirrel_subset %>%  
 89 |   ggplot() + 
 90 |   aes(date_f, fill = primary_fur_color) + 
 91 |   geom_histogram(position = 'dodge')
 92 |   
 93 | squirrel_subset %>% group_by(date_f, primary_fur_color) %>%  # group first so both columns survive summarise()
 94 |   summarise(count_of_squirrels = n()) %>%  # rows per date_f x primary_fur_color combination
 95 |   ggplot() + 
 96 |   aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 
 97 |   stat_smooth(se = FALSE)
 98 | 
 99 | 
100 | squirrel_subset %>%  
101 |   group_by(date_f, primary_fur_color) %>%  
102 |   summarise(count_of_squirrels = n()) %>%  
103 |   ggplot() + 
104 |   aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 
105 |   stat_smooth(se = FALSE) + 
106 |   labs(
107 |     title = 'There are a lot of grey squirrels',
108 |     subtitle = 'But, people are collecting data in later days',
109 |     x = 'Date of observation', 
110 |     y = 'Count of squirrels', 
111 |     color = 'Primary Fur Color'
112 |   )
113 | 
114 | squirrel_subset %>%  
115 |   group_by(date_f, primary_fur_color) %>%  
116 |   summarise(count_of_squirrels = n()) %>%  
117 |   ggplot() + 
118 |   aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 
119 |   stat_smooth(se = FALSE) + 
120 |   lims(
121 |     x = c(as.Date.character('2018-10-07'), 
122 |           as.Date.character('2018-10-16'))) +
123 |   labs(
124 |     title = 'There are a lot of grey squirrels',
125 |     subtitle = 'But, people are collecting data in later days',
126 |     x = 'Date of observation', 
127 |     y = 'Count of squirrels', 
128 |     color = 'Primary Fur Color'
129 |   )
130 | 
131 | squirrel_subset %>%  
132 |   group_by(date_f, primary_fur_color) %>%  
133 |   summarise(count_of_squirrels = n()) %>%  
134 |   ggplot() + 
135 |   aes(x = date_f, y = count_of_squirrels, color = primary_fur_color) + 
136 |   stat_smooth(se = FALSE) + 
137 |   coord_cartesian(
138 |     xlim = c(as.Date.character('2018-10-07'), 
139 |           as.Date.character('2018-10-16'))
140 |   ) +
141 |   labs(
142 |     title = 'There are a lot of grey squirrels',
143 |     subtitle = 'But, people are collecting data in later days',
144 |     x = 'Date of observation', 
145 |     y = 'Count of squirrels', 
146 |     color = 'Primary Fur Color'
147 |   )
148 | 


--------------------------------------------------------------------------------