├── .gitignore
├── README.md
├── apps
├── README.md
└── coin-toss
│ ├── README.md
│ ├── app.R
│ └── functions.R
├── cheatsheets
├── README.md
├── base-r-cheatsheet.pdf
├── command-line-cheatsheet.pdf
├── data-import-cheatsheet.pdf
├── data-transformation-cheatsheet.pdf
├── ggplot2-cheatsheet-2.1.pdf
├── git-cheatsheet.pdf
├── packages-cheatsheet.pdf
├── plotly-cheatsheet.pdf
├── regular-expressions-cheatsheet.pdf
├── rmarkdown-cheatsheet-2.0.pdf
├── rstudio-IDE-cheatsheet.pdf
├── shiny-cheatsheet.pdf
└── stringr-cheatsheet.pdf
├── data
├── andre-iguodala.csv
├── draymond-green.csv
├── gsw-roster-2017.html
├── kevin-durant.csv
├── klay-thompson.csv
├── may-logs.txt
├── mobile-food-sf.csv
├── nba-teams-2017.csv
├── nba2017-conference-standings.html
├── nba2017-players.csv
├── nba2017-roster.csv
├── nba2017-salary-points-dictionary.md
├── nba2017-salary-points.RData
├── stephen-curry.csv
└── text-emotion.csv
├── hws
├── README.md
├── hw01-data-frame-basics.pdf
├── hw02-shot-charts.pdf
├── hw03-programming-basics.pdf
├── hw04-strings-regex.pdf
├── hw05-package-dieroller.pdf
├── up01-markdown.pdf
└── up02-vector-basics.pdf
├── images
├── assignment-comments-bcourses.png
├── data-by-the-numbers.png
├── it-is-in-the-syllabus.png
├── mrs-mutner-rules.jpg
├── nba-court.jpg
├── sample-variance.png
└── tyrion-table.png
├── labs
├── lab01-R-basics.Rmd
├── lab01-R-basics.md
├── lab02-vector-basics.Rmd
├── lab02-vector-basics.md
├── lab03-command-line-basics.Rmd
├── lab03-command-line-basics.md
├── lab03-github-classroom.pdf
├── lab04-data-frame-basics.Rmd
├── lab04-data-frame-basics.md
├── lab05-dplyr-ggplot-basics.Rmd
├── lab05-dplyr-ggplot-basics.md
├── lab05-images
│ ├── knitr-fig-path.png
│ ├── named-chunk.png
│ ├── unnamed-chunk-19-1.png
│ ├── unnamed-chunk-21-1.png
│ ├── unnamed-chunk-22-1.png
│ ├── unnamed-chunk-23-1.png
│ ├── unnamed-chunk-24-1.png
│ ├── unnamed-chunk-25-1.png
│ └── unnamed-chunk-25-2.png
├── lab06-more-data-wrangling.Rmd
├── lab06-more-data-wrangling.md
├── lab07-images
│ ├── error-true.png
│ ├── gaussian_plot-1.png
│ └── polynomial_plot-1.png
├── lab07-simple-functions.Rmd
├── lab07-simple-functions.md
├── lab08-images
│ ├── arith_mean.png
│ ├── arith_series.png
│ ├── geo_seq.png
│ ├── geom_mean.png
│ ├── sine_series.png
│ ├── std_dev.png
│ ├── sum_series1.png
│ └── sum_series2.png
├── lab08-simple-loops.Rmd
├── lab08-simple-loops.md
├── lab09-tests-strings-basics.Rmd
├── lab09-tests-strings-basics.md
├── lab10-images
│ ├── day_barchart-1.png
│ ├── plotly-barchart1.png
│ ├── plotly-barchart2.png
│ ├── plotly0.png
│ ├── plotly1.png
│ ├── san-francisco-map-1.png
│ ├── san-francisco.png
│ ├── san-francisco.png.rda
│ └── ugly_map-1.png
├── lab10-regex-basics.Rmd
├── lab10-regex-basics.md
├── lab11-images
│ ├── app1.png
│ ├── app2.png
│ ├── app3.png
│ └── freqs-plot.png
├── lab11-random-simulations.Rmd
├── lab11-random-simulations.md
├── lab11-shiny-apps
│ ├── app1.R
│ ├── app2.R
│ ├── app3.R
│ └── app4.R
├── lab12-images
│ ├── conference-standings.png
│ ├── gsw-2017-roster.png
│ ├── inspect1.png
│ └── inspect2.png
├── lab12-web-scraping.Rmd
└── lab12-web-scraping.md
├── papers
├── correlograms-xia-liu.pdf
├── testthat-wickham.pdf
├── tidy-data-wickham.pdf
└── what-is-data-science.pdf
├── slides
├── 00-about-course.pdf
├── 01-big-picture.pdf
├── 02-about-R.pdf
├── 03-R-vector-types.pdf
├── 04-arrays-factors.pdf
├── 05-lists.pdf
├── 06-base-graphics1.pdf
├── 07-base-graphics2.pdf
├── 08-filesystem-basics.pdf
├── 09-shell-basics.pdf
├── 10-working-with-files.pdf
├── 11-git-basics.pdf
├── 12-data-tables.pdf
├── 13-importing-tables.pdf
├── 14-data-frame-basics.pdf
├── 15-principal-components1.pdf
├── 16-principal-components2.pdf
├── 17-dplyr-tutorial.pdf
├── 18-grammar-graphics.pdf
├── 19-ggplot-lecture.pdf
├── 20-strings-basics.pdf
└── shiny-tutorial.pdf
├── syllabus
├── README.md
├── faqs.md
├── piazza.md
└── policies.md
└── tutorials
├── 01-images
├── ggplot-scatter-1.png
├── gsw-2017-roster.png
├── gsw-2017-salaries.png
├── gsw-2017-totals.png
├── plot-scatter-1.png
├── screen-rgui.png
└── screen-rstudio.png
├── 01-intro-to-R.Rmd
├── 01-intro-to-R.md
├── 02-intro-to-Rmd-files.Rmd
├── 02-intro-to-Rmd-files.md
├── 03-intro-to-vectors.Rmd
├── 03-intro-to-vectors.md
├── 04-intro-to-data-frames.Rmd
├── 04-intro-to-data-frames.md
├── 05-dplyr-pipes.Rmd
├── 05-dplyr-pipes.md
├── 05-images
├── boxplots-1.png
├── densities-1.png
├── gender_barchart-1.png
├── height_histogram-1.png
├── scatterplot-1.png
├── unnamed-chunk-3-1.png
└── unnamed-chunk-5-1.png
├── 06-images
├── biplot-1.png
├── circle_correlations-1.png
├── eig_barchart-1.png
└── pc_plot-1.png
├── 06-principal-components.Rmd
├── 06-principal-components.md
├── 07-shell-redirections.Rmd
├── 07-shell-redirections.md
├── 08-shell-filters.Rmd
├── 08-shell-filters.md
├── 09-creating-functions.Rmd
├── 09-creating-functions.md
├── 10-intro-to-functions.Rmd
├── 10-intro-to-functions.md
├── 11-intro-to-expressions-conditionals.Rmd
├── 11-intro-to-expressions-conditionals.md
├── 12-intro-to-loops.Rmd
├── 12-intro-to-loops.md
├── 13-more-functions.Rmd
├── 13-more-functions.md
├── 14-images
└── test-report.png
├── 14-testing-functions.Rmd
├── 14-testing-functions.md
├── 15-intro-to-regex.Rmd
├── 15-intro-to-regex.md
├── 16-images
└── head_freqs_plot-1.png
├── 16-intro-to-random-numbers.Rmd
├── 16-intro-to-random-numbers.md
├── 17-programming-s3classes.Rmd
└── 17-programming-s3classes.pdf
/.gitignore:
--------------------------------------------------------------------------------
1 | # Mac specific
2 | *.DS_Store
3 |
4 | # latex specific
5 | *.aux
6 | *.log
7 |
8 | # files in labs/
9 | labs/.DS_Store
10 | labs/*.html
11 |
12 | # files in data/
13 | data/.DS_Store
14 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## stat133-spring-2018
2 |
3 | This repository holds the course materials for the Spring 2018 edition of
4 | __Statistics 133: Concepts in Computing with Data__ at UC Berkeley.
5 |
6 |
7 | - __Instructor:__ [Gaston Sanchez](http://gastonsanchez.com), gasigiri [at] berkeley [dot] edu
8 | - __Class Time:__ MWF 8-9am in [Dwinelle 155](http://www.berkeley.edu/map?dwinelle)
9 | - __Session Dates:__ 01/17/18 - 05/04/18
10 | - __Code #:__ 30844
11 | - __Units:__ 3 (more info [here](http://classes.berkeley.edu/content/2018-spring-stat-133-001-lec-001))
12 | - __Office Hours:__ W 1-2pm 309 Evans, Fr 2-3pm 344 Evans, 3-4pm 340 Evans
13 | - __Final:__ Mon May 7, 8-11am (Dwinelle 145 and 155)
14 | - __Piazza:__ [piazza.com/berkeley/spring2018/stat133](https://piazza.com/berkeley/spring2018/stat133)
15 | - __GSIs:__ Office hours of the GSIs are displayed below.
16 | You can go to the office hours of __any__ GSI, not just your own.
17 |
18 | | GSI | Room | Day/Time |
19 | |:-------------|:----------|:--------------------|
20 | | Jin Kweon | 342 Evans | M Tu 9 - 11am |
21 | | Da Xu | 342 Evans | W 4-6pm, Th 8-9am & 5-6pm |
22 | | Minchul Shin | 342 Evans | M 10:30am-12:30pm, Th 5-7pm |
23 | | Ninh Do | 342 Evans | M 9-11am, Th 8-9am |
24 | | Qi Chen | 342 Evans | M 3-5pm, Tu 8-10am |
25 | | Rui Chen | 426 Evans | W 12-2pm, Th 9-11am |
26 |
27 |
28 | - __Lab Sections:__
29 |
30 | | Lab | Date | Room | GSI |
31 | |-----|------------|--------------|-----------------|
32 | | 101 | W 10-12pm | 342 Evans | Jin Kweon |
33 | | 102 | W 12-2pm | 340 Evans | Qi Chen |
34 | | 103 | W 12-2pm | 342 Evans | Ninh Do |
35 | | 104 | W 2-4pm | 340 Evans | Qi Chen |
36 | | 105 | W 2-4pm | 342 Evans | Rui Chen |
37 | | 106 | Th 9-11am | 342 Evans | Ninh Do |
38 | | 107 | Th 9-11am | 330 Evans | Da Xu |
39 | | 108 | Th 11-1pm | 342 Evans | Jin Kweon |
40 | | 109 | Th 11-1pm | 330 Evans | Rui Chen |
41 | | 110 | Th 1-3pm | 342 Evans | Minchul Shin |
42 | | 111 | Th 2-4pm | 340 Evans | Da Xu |
43 | | 112 | Th 3-5pm | 342 Evans | Minchul Shin |
44 |
45 |
46 | -----
47 |
48 |
49 | ### Philosophy
50 |
51 | __Concepts in Computing with Data__ presents computing tools and basic concepts
52 | for the main stages of the _Data Analysis Cycle_ (DAC): 1) data preparation,
53 | 2) actual analysis, and 3) reporting of results.
54 |
55 |
56 |
57 | Traditionally, teaching has been focused on the actual analysis part of a data
58 | set (e.g. description, hypothesis testing, modeling, validation) while leaving
59 | out the data preparation steps, as well as the presentation and reporting of results.
60 | This is nuts. Typically, the most time consuming parts in the _DAC_ are the
61 | preparation of data (e.g. cleaning, reformatting, tidying), and the report and
62 | communication of results (e.g. images, tables, papers, presentations, reports, docs).
63 | However, we don't teach students how to approach these tasks. They are left alone
64 | to acquire them in the wild. Stat 133 aims to add its two cents to provide solid
65 | foundations that will help you crunch data in a less
66 | improvised/naive/inefficient way.
67 |
68 |
69 | ### Description
70 |
71 | __Stat 133: Concepts in Computing with Data__ is an introductory course to computational
72 | data analysis using the statistical programming language __R__ (and other computational tools)
73 | with an emphasis on five major cornerstones:
74 |
75 | - Data Manipulation (wrangling, reshaping, tidying)
76 | - Data Visualization (focus on statistical charts)
77 | - Programming Concepts (with emphasis on data analysis)
78 | - Data Technologies (various sources/formats of data)
79 | - Reporting Tools (via dynamic documents)
80 |
81 | Because Stat 133 is one of the core courses of the Statistics Department,
82 | the underlying goal is to provide foundations for "computing with data" so students
83 | have the basic computational skills for subsequent
84 | upper division courses (e.g. Stat 150, 151A, 152, 153, 154, 155, 157, 158, 159).
85 | This involves teaching students how to:
86 |
87 | - understand common data formats
88 | - use the computer extensively to conduct statistical analysis of data.
89 | - use existing software rather than build routines from the ground up.
90 | - understand how to visualize data and display statistical information
91 | - learn the basic principles for writing code
92 | - organize your workflow
93 | - focus on aspects of computing to conduct data analysis, NOT the
94 | computational aspects of statistical methods
95 | - use computational tools to carry out the data analysis cycle
96 |
97 | For more details, check the [syllabus](syllabus/README.md)
98 |
99 |
100 | ### Prerequisites / Review
101 |
102 | This course does not have any prerequisites, although it would be nice if you
103 | have taken an introductory course in statistics (e.g. Stat 2, 20, 21, 131A).
104 |
105 | The curriculum and format are designed specifically for students (ideally
106 | majoring in Statistics) __who have NOT taken computer science courses__.
107 |
108 | You don't need previous programming experience, and you also don't need previous
109 | data analysis experience. However, students with some exposure to programming
110 | concepts and data analysis tend to understand certain concepts better.
111 |
112 | Students with some prior experience in either computational statistics
113 | or computing are welcome to enroll, though some parts of the course will feel
114 | extremely slow.
115 | Students who have taken computer science courses (e.g. CS C8, CS C100, CS 9H,
116 | CS 61a, CS 61b) should instead take a more advanced course.
117 |
118 |
119 |
120 | ### Expectations
121 |
122 | We expect that, at the end of the course, you have a basic understanding of R.
123 | But more important, we expect that you understand the general principles of
124 | data analysis projects, independently of the programming
125 | language that you use.
126 |
127 | After completing the course, we expect that you feel comfortable in any of
128 | the three stages of the _Data Analysis Cycle (DAC)_. This means that
129 | you can take almost any data set(s), clean it, tidy it, get visualizations,
130 | write code, and report the results in a variety of formats.
131 |
132 | We don't expect you to become a jedi data scientist, an R ninja, or a super coder.
133 | That takes YEARS of practice, training, learning, and collaborating. Instead,
134 | we want you to become a skilled [padawan](http://starwars.wikia.com/wiki/Padawan)
135 | analyst (who, if interested, will be prepared to take the next steps of a data
136 | science marathon race).
137 |
138 |
139 |
140 | ### Methods of Instruction
141 |
142 | We will be using a combination of materials such as slides, tutorials,
143 | reading assignments, and chalk-and-talk.
144 |
145 | The main computational tool will be the [computing and programming environment R](https://www.r-project.org/).
146 | The main workbench will be the IDE [RStudio](https://www.rstudio.com/).
147 | You will also use a terminal emulator to work with the command line.
148 |
149 |
150 |
151 | ### Other
152 |
153 | - Please read the course [logistics and policies](syllabus/policies.md), and the [piazza etiquette rules](syllabus/piazza.md) for more details
154 | about the structure of the course, DOs and DON'Ts, etc.
155 |
156 |
157 |
158 | - I've prepared a list of [Frequently Asked Questions](syllabus/faqs.md) that I get asked
159 | every semester.
160 |
161 |
162 |
163 | -----
164 |
165 | ### License
166 |
167 | 
Unless otherwise noted, this work, by Gaston Sanchez, is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.
168 |
169 | Author: [Gaston Sanchez](http://gastonsanchez.com)
170 |
--------------------------------------------------------------------------------
/apps/README.md:
--------------------------------------------------------------------------------
1 | # Shiny Apps
2 |
3 | This is a collection of Shiny apps used during lecture.
4 |
5 | - [Tossing a coin](coin-toss)
6 |
7 |
8 | ## Running the apps
9 |
10 | Assuming that you have both R and RStudio, the other thing you need is the R package `"shiny"`. In case of doubt, run:
11 |
12 | ```R
13 | install.packages("shiny")
14 | ```
15 |
16 | The easiest way to run an app is with the `runGitHub()` function from the `"shiny"` package. For instance, to run the app contained in the [coin-toss](coin-toss) folder, run the following code in R:
17 |
18 | ```R
19 | library(shiny)
20 |
21 | # Run an app from a subdirectory in the repo
22 | runGitHub("stat133-spring-2018", "ucb-stat133", subdir = "apps/coin-toss")
23 | ```
24 |
--------------------------------------------------------------------------------
/apps/coin-toss/README.md:
--------------------------------------------------------------------------------
1 | ## Coin Tossing
2 |
3 | This Shiny app simulates tossing a coin and visualizes the frequency of `heads`.
4 |
5 |
6 | ## Running the app
7 |
8 | Assuming that you have both R and RStudio, the other thing you need is the R package `"shiny"`. In case of doubt, run:
9 |
10 | ```R
11 | install.packages("shiny")
12 | ```
13 |
14 | The easiest way to run an app is with the `runGitHub()` function from the `"shiny"` package.
15 |
16 | ```R
17 | library(shiny)
18 |
19 | # Run an app from a subdirectory in the repo
20 | runGitHub("stat133-spring-2018", "ucb-stat133", subdir = "apps/coin-toss")
21 | ```
22 |
--------------------------------------------------------------------------------
/apps/coin-toss/app.R:
--------------------------------------------------------------------------------
1 | # Shiny app for tossing coin
2 | # Author: Gaston Sanchez
3 |
4 | library(shiny)
5 | library(ggplot2)
6 |
7 | # source toss() function
8 | source('functions.R')
9 |
10 |
11 | # UI definition: a sidebar with the simulation inputs and a main panel with one plot
12 | ui <- fluidPage(
13 |
14 |   # Page heading
15 |   titlePanel(title = "Frequency of Heads"),
16 |
17 |   # Sidebar holding the three inputs read by the server: prob, times and seed
18 |   sidebarLayout(
19 |     sidebarPanel(
20 |       sliderInput(inputId = "prob",
21 |                   label = "Prob of heads",
22 |                   min = 0,
23 |                   max = 1,
24 |                   value = 0.5),
25 |       sliderInput(inputId = "times",
26 |                   label = "Number of tosses",
27 |                   min = 1,
28 |                   max = 5000,
29 |                   value = 100),
30 |       numericInput(inputId = "seed", label = "random seed", value = 123)
31 |     ),
32 |
33 |     # Plot area that the server fills through output$distPlot
34 |     mainPanel(
35 |       plotOutput(outputId = "distPlot")
36 |     )
37 |   )
38 | )
39 |
40 |
41 | # Server logic: simulate the coin tosses and plot the running proportion of heads
42 | server <- function(input, output) {
43 |   # the plot reads input$seed, input$times and input$prob
44 |   output$distPlot <- renderPlot({
45 |     coin <- c('heads', 'tails')
46 |     set.seed(input$seed)  # seed the RNG from the seed input before sampling
47 |     tosses <- toss(coin, times = input$times,
48 |                    prob = c(input$prob, 1 - input$prob))
49 |     toss_index <- seq_along(tosses)
50 |     # cumulative count of heads divided by the number of tosses so far
51 |     head_props <- cumsum(tosses == 'heads') / toss_index
52 |
53 |     heads_df <- data.frame(
54 |       num_tosses = toss_index,
55 |       head_props = head_props
56 |     )
57 |     # reference line at the selected probability of heads (was fixed at 0.5)
58 |     ggplot(data = heads_df, aes(x = num_tosses, y = head_props)) +
59 |       geom_hline(yintercept = input$prob, col = 'gray50') +
60 |       geom_path(size = 1.5, color = '#4078d1') +
61 |       ylim(0, 1) +
62 |       xlab("Number of tosses") +
63 |       ylab("Proportion of heads") +
64 |       theme_minimal()
65 |   })
66 | }
67 |
68 | # Build and launch the Shiny app from the ui and server objects
69 | shinyApp(server = server, ui = ui)
70 |
71 |
--------------------------------------------------------------------------------
/apps/coin-toss/functions.R:
--------------------------------------------------------------------------------
1 | #' @title Toss
2 | #' @description Draws a random sample, with replacement, from a coin vector
3 | #' @param x vector with the sides of the coin
4 | #' @param times number of tosses to simulate (default 1)
5 | #' @param prob optional vector of probabilities passed to sample(); NULL by default
6 | #' @return vector with the sampled sides, one element per toss
7 | toss <- function(x, times = 1, prob = NULL) {
8 |   return(sample(x = x, size = times, prob = prob, replace = TRUE))
9 | }
10 |
--------------------------------------------------------------------------------
/cheatsheets/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## Cheat Sheets
3 |
4 | - [RStudio](rstudio-IDE-cheatsheet.pdf)
5 | - [R markdown](rmarkdown-cheatsheet-2.0.pdf)
6 | - [Base R](base-r-cheatsheet.pdf)
7 | - [Git](git-cheatsheet.pdf)
8 | - [Command Line](command-line-cheatsheet.pdf)
9 | - [Data Import](data-import-cheatsheet.pdf)
10 | - [Graphics with ggplot2](ggplot2-cheatsheet-2.1.pdf)
11 | - [Graphics with plotly](plotly-cheatsheet.pdf)
12 | - [Data transformation cheat sheet](data-transformation-cheatsheet.pdf)
13 | - [Stringr](stringr-cheatsheet.pdf)
14 | - [Regular Expressions](regular-expressions-cheatsheet.pdf)
15 | - [Shiny](shiny-cheatsheet.pdf)
16 |
--------------------------------------------------------------------------------
/cheatsheets/base-r-cheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/cheatsheets/base-r-cheatsheet.pdf
--------------------------------------------------------------------------------
/cheatsheets/command-line-cheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/cheatsheets/command-line-cheatsheet.pdf
--------------------------------------------------------------------------------
/cheatsheets/data-import-cheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/cheatsheets/data-import-cheatsheet.pdf
--------------------------------------------------------------------------------
/cheatsheets/data-transformation-cheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/cheatsheets/data-transformation-cheatsheet.pdf
--------------------------------------------------------------------------------
/cheatsheets/ggplot2-cheatsheet-2.1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/cheatsheets/ggplot2-cheatsheet-2.1.pdf
--------------------------------------------------------------------------------
/cheatsheets/git-cheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/cheatsheets/git-cheatsheet.pdf
--------------------------------------------------------------------------------
/cheatsheets/packages-cheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/cheatsheets/packages-cheatsheet.pdf
--------------------------------------------------------------------------------
/cheatsheets/plotly-cheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/cheatsheets/plotly-cheatsheet.pdf
--------------------------------------------------------------------------------
/cheatsheets/regular-expressions-cheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/cheatsheets/regular-expressions-cheatsheet.pdf
--------------------------------------------------------------------------------
/cheatsheets/rmarkdown-cheatsheet-2.0.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/cheatsheets/rmarkdown-cheatsheet-2.0.pdf
--------------------------------------------------------------------------------
/cheatsheets/rstudio-IDE-cheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/cheatsheets/rstudio-IDE-cheatsheet.pdf
--------------------------------------------------------------------------------
/cheatsheets/shiny-cheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/cheatsheets/shiny-cheatsheet.pdf
--------------------------------------------------------------------------------
/cheatsheets/stringr-cheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/cheatsheets/stringr-cheatsheet.pdf
--------------------------------------------------------------------------------
/data/nba-teams-2017.csv:
--------------------------------------------------------------------------------
1 | team,games_played,wins,losses,win_prop,minutes,points,field_goals,field_goals_attempted,field_goals_prop,points3,points3_attempted,points3_prop,free_throws,free_throws_att,free_throws_prop,off_rebounds,def_rebounds,rebounds,assists,turnovers,steals,blocks,block_fga,personal_fouls,personal_fouls_drawn,plus_minus
2 | Golden State Warriors,82,67,15,0.817,48.2,115.9,43.1,87.1,49.5,12,31.2,38.3,17.8,22.6,78.8,9.4,35,44.4,30.4,14.8,9.6,6.8,3.8,19.3,19.4,11.6
3 | San Antonio Spurs,82,61,21,0.744,48.3,105.3,39.3,83.7,46.9,9.2,23.5,39.1,17.6,22,79.7,10,33.9,43.9,23.8,13.4,8,5.9,4.1,18.3,19.8,7.2
4 | Houston Rockets,82,55,27,0.671,48.2,115.3,40.3,87.2,46.2,14.4,40.3,35.7,20.3,26.5,76.6,10.9,33.5,44.4,25.2,15.1,8.2,4.3,5,19.9,20.4,5.8
5 | Boston Celtics,82,53,29,0.646,48.2,108,38.6,85.1,45.4,12,33.4,35.9,18.7,23.2,80.7,9.1,32.9,42,25.2,13.3,7.5,4.1,5.2,20.6,20.3,2.6
6 | Utah Jazz,82,51,31,0.622,48.2,100.7,37,79.5,46.6,9.6,26,37.2,17.1,22.9,74.7,9.4,33.8,43.2,20.1,13.6,6.7,5,3.8,18.8,20.2,3.9
7 | Toronto Raptors,82,51,31,0.622,48.2,106.9,39.2,84.4,46.4,8.8,24.3,36.3,19.7,24.7,79.6,10.6,32.6,43.3,18.5,12.7,8.3,4.9,4.8,20.8,20.3,4.2
8 | Cleveland Cavaliers,82,51,31,0.622,48.5,110.3,39.9,84.9,47,13,33.9,38.4,17.5,23.3,74.8,9.3,34.4,43.7,22.7,13.7,6.6,4,4.3,18.1,20.6,3.2
9 | LA Clippers,82,51,31,0.622,48.2,108.7,39.5,83.2,47.5,10.3,27.4,37.5,19.3,26,74.5,9,34,43,22.5,13,7.5,4.2,3.1,19.8,22.4,4.3
10 | Washington Wizards,82,49,33,0.598,48.4,109.2,41.3,87,47.5,9.2,24.8,37.2,17.3,22.1,78.4,10.3,32.6,42.9,23.9,14.2,8.5,4.1,4.6,21.3,19.8,1.8
11 | Oklahoma City Thunder,82,47,35,0.573,48.3,106.6,39.5,87.4,45.2,8.4,25.8,32.7,19.2,25.8,74.5,12.2,34.4,46.6,21,15,7.9,5,5.5,20.9,20.7,0.8
12 | Memphis Grizzlies,82,43,39,0.524,48.5,100.5,36.4,83.6,43.5,9.4,26.5,35.4,18.3,23.4,78.4,10.8,32,42.8,21.3,12.9,8,4.2,5,22.4,20.8,0.5
13 | Atlanta Hawks,82,43,39,0.524,48.5,103.2,38.1,84.4,45.1,8.9,26.1,34.1,18.1,24.9,72.8,10.3,34.1,44.3,23.6,15.8,8.2,4.8,5.2,18.2,21.6,-0.9
14 | Indiana Pacers,82,42,40,0.512,48.4,105.1,39.3,84.5,46.5,8.6,23,37.6,17.9,22.1,81,9,33,42,22.5,13.8,8.2,5,5,19.5,19.5,-0.2
15 | Milwaukee Bucks,82,42,40,0.512,48.2,103.6,38.8,81.9,47.4,8.8,23.7,37,17.2,22.4,76.8,8.8,31.6,40.4,24.2,14,8.1,5.3,4.6,20.2,19.3,-0.2
16 | Chicago Bulls,82,41,41,0.5,48.2,102.9,38.6,87.1,44.4,7.6,22.3,34,18,22.5,79.8,12.2,34.1,46.3,22.6,13.6,7.8,4.8,4.6,17.7,18.8,0.4
17 | Portland Trail Blazers,82,41,41,0.5,48.6,107.9,39.5,86.1,45.9,10.4,27.7,37.5,18.5,23.7,78,10.1,33.5,43.7,21.1,13.7,7,5,5.3,21.2,19.8,-0.5
18 | Miami Heat,82,41,41,0.5,48.2,103.2,39,85.8,45.5,9.9,27,36.5,15.2,21.6,70.6,10.6,33,43.6,21.2,13.4,7.2,5.7,4.9,20.5,18.7,1.1
19 | Denver Nuggets,82,40,42,0.488,48.2,111.7,41.2,87.7,46.9,10.6,28.8,36.8,18.7,24.2,77.4,11.8,34.6,46.4,25.3,15,6.9,3.9,4.9,19.1,20.2,0.5
20 | Detroit Pistons,82,37,45,0.451,48.3,101.3,39.9,88.8,44.9,7.7,23.4,33,13.9,19.3,71.9,11.1,34.6,45.7,21.1,11.9,7,3.8,4.1,17.9,17.5,-1.1
21 | Charlotte Hornets,82,36,46,0.439,48.4,104.9,37.7,85.4,44.2,10,28.6,35.1,19.4,23.8,81.5,8.8,34.8,43.6,23.1,11.5,7,4.8,5.5,16.6,19.9,0.2
22 | New Orleans Pelicans,82,34,48,0.415,48.5,104.3,39.1,87,45,9.4,26.8,35,16.7,22.3,75,8.6,35.1,43.7,22.8,12.9,7.8,5.5,4.2,18.2,19.3,-2.1
23 | Dallas Mavericks,82,33,49,0.402,48.2,97.9,36.2,82.3,44,10.7,30.2,35.5,14.8,18.5,80.1,7.9,30.7,38.6,20.8,11.9,7.5,3.7,3.4,19.1,19.4,-2.9
24 | Sacramento Kings,82,32,50,0.39,48.5,102.8,37.9,82.1,46.1,9,23.9,37.6,18.1,23.3,77.5,8.7,32.3,41.1,22.5,14.6,7.6,4,5.1,20.3,20.1,-3.9
25 | Minnesota Timberwolves,82,31,51,0.378,48.3,105.6,39.5,84.4,46.7,7.3,21,34.9,19.3,24.2,79.9,11.4,31,42.4,23.7,14,8,4.5,5,20.1,20.5,-1.1
26 | New York Knicks,82,31,51,0.378,48.4,104.3,39.6,88.5,44.7,8.6,24.7,34.8,16.6,21.1,78.8,12,33.2,45.2,21.8,13.9,7.1,5.5,4.6,20.3,18.5,-3.7
27 | Orlando Magic,82,29,53,0.354,48.3,101.1,38.3,87,44,8.5,26.1,32.8,16,21.4,74.7,9.8,33.3,43.2,22.2,13.3,7.1,4.8,5.1,19.3,18.7,-6.6
28 | Philadelphia 76ers,82,28,54,0.341,48.4,102.4,37.7,85.3,44.2,10.1,29.8,34,17,22,77.1,9.8,33,42.8,23.8,16.7,8.4,5.1,5.4,21.9,19.6,-5.7
29 | Los Angeles Lakers,82,26,56,0.317,48.1,104.6,39.3,87.4,45,8.9,25.7,34.6,17,22.6,75.4,11.4,32.1,43.5,20.9,15.2,8.2,3.9,5.4,20.7,18.5,-6.9
30 | Phoenix Suns,82,24,58,0.293,48.4,107.7,39.9,88.5,45,7.5,22.6,33.2,20.4,26.3,77.6,11.9,33.1,45,19.6,15.4,8.2,4.9,5.3,24.8,22.2,-5.6
31 | Brooklyn Nets,82,20,62,0.244,48.2,105.8,37.8,85.2,44.4,10.7,31.6,33.8,19.4,24.6,78.8,8.8,35.1,43.9,21.4,16.5,7.2,4.7,5.6,21,20.4,-6.7
--------------------------------------------------------------------------------
/data/nba2017-salary-points-dictionary.md:
--------------------------------------------------------------------------------
1 |
2 | ## Data `nba2017-salary-points.RData`
3 |
4 | Here's the description of the R objects in `nba2017-salary-points.RData`:
5 |
6 | - `player`: name of the player.
7 | - `team`: team name abbreviation.
8 | - `position`: player position.
9 | - `age`: age (in years).
10 | - `experience`: years of experience.
11 | - `salary`: salary (in dollars).
12 | - `scored`: total scored points.
13 | - `points1`: number of free throws, worth 1 point each.
14 | - `points2`: number of 2-point field goals, worth 2 points each.
15 | - `points3`: number of 3-point field goals, worth 3 points each.
16 |
17 | There are five types of player positions (see [wikipedia](https://en.wikipedia.org/wiki/Basketball_positions) for more details):
18 |
19 | + `PG`: point guard
20 | + `SG`: shooting guard
21 | + `SF`: small forward
22 | + `PF`: power forward
23 | + `C`: center
24 |
25 | The values in `scored` result from adding all scored points:
26 |
27 | ```r
28 | points1 + (2 * points2) + (3 * points3)
29 | ```
30 |
31 | Although each object has its own data type, you can think of each of them as a variable from a statistics standpoint like so:
32 |
33 | | Object | Variable |
34 | |:-------------|:-------------|
35 | | `player` | categorical |
36 | | `team` | categorical |
37 | | `position` | categorical |
38 | | `age` | quantitative |
39 | | `experience` | quantitative |
40 | | `salary` | quantitative |
41 | | `scored` | quantitative |
42 | | `points1` | quantitative |
43 | | `points2` | quantitative |
44 | | `points3` | quantitative |
45 |
--------------------------------------------------------------------------------
/data/nba2017-salary-points.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/data/nba2017-salary-points.RData
--------------------------------------------------------------------------------
/hws/README.md:
--------------------------------------------------------------------------------
1 | ## HW Assignments
2 |
3 | There are two warm-up assignments (won't count toward your final grade), and
4 | five-to-six bi-weekly assignments.
5 |
6 |
--------------------------------------------------------------------------------
/hws/hw01-data-frame-basics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/hws/hw01-data-frame-basics.pdf
--------------------------------------------------------------------------------
/hws/hw02-shot-charts.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/hws/hw02-shot-charts.pdf
--------------------------------------------------------------------------------
/hws/hw03-programming-basics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/hws/hw03-programming-basics.pdf
--------------------------------------------------------------------------------
/hws/hw04-strings-regex.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/hws/hw04-strings-regex.pdf
--------------------------------------------------------------------------------
/hws/hw05-package-dieroller.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/hws/hw05-package-dieroller.pdf
--------------------------------------------------------------------------------
/hws/up01-markdown.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/hws/up01-markdown.pdf
--------------------------------------------------------------------------------
/hws/up02-vector-basics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/hws/up02-vector-basics.pdf
--------------------------------------------------------------------------------
/images/assignment-comments-bcourses.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/images/assignment-comments-bcourses.png
--------------------------------------------------------------------------------
/images/data-by-the-numbers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/images/data-by-the-numbers.png
--------------------------------------------------------------------------------
/images/it-is-in-the-syllabus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/images/it-is-in-the-syllabus.png
--------------------------------------------------------------------------------
/images/mrs-mutner-rules.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/images/mrs-mutner-rules.jpg
--------------------------------------------------------------------------------
/images/nba-court.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/images/nba-court.jpg
--------------------------------------------------------------------------------
/images/sample-variance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/images/sample-variance.png
--------------------------------------------------------------------------------
/images/tyrion-table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/images/tyrion-table.png
--------------------------------------------------------------------------------
/labs/lab03-command-line-basics.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Lab 3: Command Line Basics and Git"
3 | subtitle: "Stat 133, Spring 2018"
4 | author: "Gaston Sanchez"
5 | output: github_document
6 | urlcolor: blue
7 | ---
8 |
9 | > ### Learning Objectives
10 | >
11 | > - Practicing with the command line
12 | > - Navigating the filesystem and managing files
13 | > - Practice basic manipulation of data files
14 |
15 | ------
16 |
17 | ## Basic Bash shell commands
18 |
19 | The first part of the lab involves navigating the file system and manipulating
20 | files (and directories) with the following basic bash commands:
21 |
22 | - `pwd`: print working directory
23 | - `ls`: list files and directories
24 | - `cd`: change directory (move to another directory)
25 | - `mkdir`: create a new directory
26 | - `touch`: create a new (empty) file
27 | - `cp`: copy file(s)
28 | - `mv`: move or rename file(s)
29 | - `rm`: delete file(s)
30 |
31 | If you are using git-bash (i.e. your OS is Windows) you don't have the `man`
32 | command to see the manual documentation of other commands. In this case you can
33 | check the _man_ pages online:
34 |
35 | [http://man7.org/linux/man-pages/index.html](http://man7.org/linux/man-pages/index.html)
36 |
37 |
38 | ## Your turn
39 |
40 | Write your bash commands in a text editor (NOT a word processor) and save them
41 | in a text file: e.g. `lab03-gaston-sanchez.txt`.
42 |
43 | - Open (or launch) the command line
44 | - Use `mkdir` to create a new directory `stat133-lab03`
45 | - Change to the directory `stat133-lab03`
46 | - Use the command `curl` to download the following text file:
47 | ```bash
48 | # the option is the letter O (Not the number 0)
49 | curl -O http://textfiles.com/food/bread.txt
50 | ```
51 |
52 | - Use the command `ls` to list the contents in your current directory
53 | - Use the command `curl` to download these other text files:
54 | - http://textfiles.com/food/btaco.txt
55 | - http://textfiles.com/food/1st_aid.txt
56 | - http://textfiles.com/food/beesherb.txt
57 | - Use the command `curl` to download the following csv files:
58 | - http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv
59 | - http://www.math.uah.edu/stat/data/Fisher.csv
60 | - http://web.pdx.edu/~gerbing/data/cars.csv
61 | - Now try `ls -l` to list the contents in your current directory in long format
62 | - Look at the `man` documentation of `ls` to find out how to list the contents in reverse order
63 | - How would you list the contents in long format and by time?
64 | - Inside `stat133-lab03` create a directory `data`
65 | - Change to the directory `data`
66 | - Create a directory `txt-files`
67 | - Create a directory `csv-files`
68 | - Use the command `mv` to move the `bread.txt` file to the folder `txt-files`
69 | - Use the wildcard `*` to move all the text files to the directory `txt-files`
70 | - Use the wildcard `*` to move all the `.csv` files to the directory `csv-files`
71 | - Go back to the parent directory `stat133-lab03`
72 | - Create a directory `copies`
73 | - Use the command `cp` to copy the `bread.txt` file (the one inside the folder
74 | `txt-files`) to the `copies` directory
75 | - Use the wildcard `*` to copy all the `.txt` files into the directory `copies`
76 | - Use the wildcard `*` to copy all the `.csv` files into the directory `copies`
77 | - Change to the directory `copies`
78 | - Use the command `mv` to rename the file `bread.txt` as `bread-recipe.txt`
79 | - Rename the file `Fisher.csv` as `iris.csv`
80 | - Rename the file `btaco.txt` as `breakfast-taco.txt`
81 | - Change to the parent directory (i.e. `stat133-lab03`)
82 | - Rename the directory `copies` as `copy-files`
83 | - Find out how to use the `rm` command to delete the directory `copy-files`
84 | - List the contents of the directory `txt-files` displaying the results in reverse (alphabetical) order
85 |
86 |
87 | ### Optional challenge
88 |
89 | If you are already familiar with the basic bash commands to
90 | navigate the filesystem (or if you want to expand your R skills), use the R
91 | functions to manipulate files and directories to perform the exact same tasks
92 | from within R. See `?files` for more information.
93 |
94 | - `getwd()`
95 | - `setwd()`
96 | - `download.file()`
97 | - `dir.create()`
98 | - `list.files()`
99 | - `list.dirs()`
100 | - `file.create()`
101 | - `file.copy()`
102 | - `file.rename()`
103 | - `file.remove()`
104 |
105 | -----
106 |
107 | # Git and GitHub Practice
108 |
109 | > ### Learning Objectives
110 | >
111 | > * Create a GitHub repository
112 | > * Create a local Git repository
113 | > * Practice adding, and committing changes to your (local) Git repo
114 | > * Practice pushing committed changes to a remote repo
115 |
116 |
117 | ### 1) Create a New GitHub Repository
118 |
119 | - Open your browser and Sign in to your github account.
120 | - Locate the `+` button (next to your avatar).
121 | - Select the `New repository` option.
122 | - Choose a name for your repository: e.g. `demo-repo`.
123 | - In the __Description__ field add a brief description: e.g. "this is a demo repo"
124 | - Use the default settings, and click the green button __Create repository__.
125 | - You should see some content similar (but not identical) to the one below:
126 |
127 | ```bash
128 | echo "# Demo Repo" >> README.md
129 | git init
130 | git add README.md
131 | git commit -m "first commit"
132 | git remote add origin https://github.com/gastonstat/demo-repo.git
133 | git push -u origin master
134 | ```
135 |
136 | ### 2) Create a local Git Repository
137 |
138 | - Open the terminal (Mac Terminal, or Git-Bash for Windows users).
139 | - Optional: change directory to your preferred location
140 | e.g. your `Desktop`
141 | ```bash
142 | cd Desktop
143 | ```
144 | - Create a directory with the name of your github repo
145 | ```bash
146 | mkdir demo-repo
147 | ```
148 | - Change to the directory you just created
149 | ```bash
150 | cd demo-repo
151 | ```
152 | - Initialize the directory as a git repository
153 | ```bash
154 | git init
155 | ```
156 |
157 | It's possible that you encounter some error message, e.g. Mac users may get a
158 | message related to a missing component for `CommandLineTools`. If this is your
159 | case, then type in the terminal console:
160 |
161 | ```bash
162 | # Mac users may need to run this command
163 | xcode-select --install
164 | ```
165 |
166 | The command `git init` will set up your directory `demo-repo` as a Git
167 | repository (NOT to be confused with your GitHub repository). This is basically
168 | your __local__ repository.
169 |
170 |
171 | ### 3) Adding a README file
172 |
173 | - It is customary to add a `README.md` file at the top level. This file must
174 | contain (at least) a description of what the repository is about. The following
175 | command will create a `README.md` file with some minimalist content:
176 | ```bash
177 | echo "# Demo Repo" >> README.md
178 | ```
179 | - So far you have a "new" file in your local repo, but this change has
180 | not been recorded by Git. You can confirm this by checking the status of the repo:
181 | ```bash
182 | git status
183 | ```
184 | - Notice that Git knows that `README.md` is untracked. So let's add the
185 | changes to Git's database:
186 | ```bash
187 | git add README.md
188 | ```
189 | - Check the status of the repo again:
190 | ```bash
191 | git status
192 | ```
193 | - Now Git is tracking the file `README.md`.
194 | - Next thing consists of __committing__ the changes
195 | ```bash
196 | git commit -m "first commit"
197 | ```
198 |
199 | ### 4) Adding a remote
200 |
201 | Right now you have a (local) Git repository in your computer. And you also have
202 | a GitHub repository in your GitHub account. Both repositories should have the
203 | same name, and the goal is to link them. To do this, you need to tell Git that
204 | a _remote_ repository (i.e. the one in GitHub) will be added:
205 |
206 | - To add a remote repository use the command below __with your own username__:
207 | ```bash
208 | git remote add origin https://github.com/username/demo-repo.git
209 | ```
210 | - Verify your new remote
211 | ```bash
212 | git remote -v
213 | ```
214 | - If everything is okay, you should be able to see a message
215 | (with your own username) like this:
216 | ```
217 | # Verify new remote
218 | origin https://github.com/username/demo-repo.git (fetch)
219 | origin https://github.com/username/demo-repo.git (push)
220 | ```
221 |
222 | ### 5) Pushing changes to a remote repo
223 |
224 | - Now that you have linked your local repo with your remote repo, you can
225 | start pushing (i.e. uploading) commits to GitHub.
226 | - As part of the basic workflow with git and github, you want to constantly
227 | check the status of your repo
228 | ```
229 | git status
230 | ```
231 | - Now let's push your recent commit from the local branch (`master`) to
232 | the remote repository (`origin`):
233 | ```bash
234 | git push origin master
235 | ```
236 | - Go to your GitHub repository and refresh the browser. If everything went fine,
237 | you should be able to see the contents of your customized `README.md` file.
238 |
239 |
240 | -----
241 |
242 | ## GitHub Classroom Repository
243 |
244 | Read and follow the instructions to get your own __GitHub Classroom__ repository, available in the following link:
245 |
246 | [lab03-github-classroom.pdf](lab03-github-classroom.pdf)
247 |
248 |
--------------------------------------------------------------------------------
/labs/lab03-command-line-basics.md:
--------------------------------------------------------------------------------
1 | Lab 3: Command Line Basics and Git
2 | ================
3 | Gaston Sanchez
4 |
5 | > ### Learning Objectives
6 | >
7 | > - Practicing with the command line
8 | > - Navigating the filesystem and managing files
9 | > - Practice basic manipulation of data files
10 |
11 | ------------------------------------------------------------------------
12 |
13 | Basic Bash shell commands
14 | -------------------------
15 |
16 | The first part of the lab involves navigating the file system and manipulating files (and directories) with the following basic bash commands:
17 |
18 | - `pwd`: print working directory
19 | - `ls`: list files and directories
20 | - `cd`: change directory (move to another directory)
21 | - `mkdir`: create a new directory
22 | - `touch`: create a new (empty) file
23 | - `cp`: copy file(s)
24 | - `mv`: move or rename file(s)
25 | - `rm`: delete file(s)
26 |
27 | If you are using git-bash (i.e. your OS is Windows) you don't have the `man` command to see the manual documentation of other commands. In this case you can check the *man* pages online:
28 |
29 | <http://man7.org/linux/man-pages/index.html>
30 |
31 | Your turn
32 | ---------
33 |
34 | Write your bash commands in a text editor (NOT a word processor) and save them in a text file: e.g. `lab03-gaston-sanchez.txt`.
35 |
36 | - Open (or launch) the command line
37 | - Use `mkdir` to create a new directory `stat133-lab03`
38 | - Change to the directory `stat133-lab03`
39 | - Use the command `curl` to download the following text file:
40 |
41 | ``` bash
42 | # the option is the letter O (Not the number 0)
43 | curl -O http://textfiles.com/food/bread.txt
44 | ```
45 |
46 | - Use the command `ls` to list the contents in your current directory
47 | - Use the command `curl` to download these other text files:
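48 | - <http://textfiles.com/food/btaco.txt>
49 | - <http://textfiles.com/food/1st_aid.txt>
50 | - <http://textfiles.com/food/beesherb.txt>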
51 | - Use the command `curl` to download the following csv files:
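52 | - <http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv>
53 | - <http://www.math.uah.edu/stat/data/Fisher.csv>
54 | - <http://web.pdx.edu/~gerbing/data/cars.csv>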
55 | - Now try `ls -l` to list the contents in your current directory in long format
56 | - Look at the `man` documentation of `ls` to find out how to list the contents in reverse order
57 | - How would you list the contents in long format and by time?
58 | - Inside `stat133-lab03` create a directory `data`
59 | - Change to the directory `data`
60 | - Create a directory `txt-files`
61 | - Create a directory `csv-files`
62 | - Use the command `mv` to move the `bread.txt` file to the folder `txt-files`
63 | - Use the wildcard `*` to move all the text files to the directory `txt-files`
64 | - Use the wildcard `*` to move all the `.csv` files to the directory `csv-files`
65 | - Go back to the parent directory `stat133-lab03`
66 | - Create a directory `copies`
67 | - Use the command `cp` to copy the `bread.txt` file (the one inside the folder `txt-files`) to the `copies` directory
68 | - Use the wildcard `*` to copy all the `.txt` files into the directory `copies`
69 | - Use the wildcard `*` to copy all the `.csv` files into the directory `copies`
70 | - Change to the directory `copies`
71 | - Use the command `mv` to rename the file `bread.txt` as `bread-recipe.txt`
72 | - Rename the file `Fisher.csv` as `iris.csv`
73 | - Rename the file `btaco.txt` as `breakfast-taco.txt`
74 | - Change to the parent directory (i.e. `stat133-lab03`)
75 | - Rename the directory `copies` as `copy-files`
76 | - Find out how to use the `rm` command to delete the directory `copy-files`
77 | - List the contents of the directory `txt-files` displaying the results in reverse (alphabetical) order
78 |
79 | ### Optional challenge
80 |
81 | If you are already familiar with the basic bash commands to navigate the filesystem (or if you want to expand your R skills), use the R functions to manipulate files and directories to perform the exact same tasks from within R. See `?files` for more information.
82 |
83 | - `getwd()`
84 | - `setwd()`
85 | - `download.file()`
86 | - `dir.create()`
87 | - `list.files()`
88 | - `list.dirs()`
89 | - `file.create()`
90 | - `file.copy()`
91 | - `file.rename()`
92 | - `file.remove()`
93 |
94 | ------------------------------------------------------------------------
95 |
96 | Git and GitHub Practice
97 | =======================
98 |
99 | > ### Learning Objectives
100 | >
101 | > - Create a GitHub repository
102 | > - Create a local Git repository
103 | > - Practice adding, and committing changes to your (local) Git repo
104 | > - Practice pushing committed changes to a remote repo
105 |
106 | ### 1) Create a New GitHub Repository
107 |
108 | - Open your browser and Sign in to your github account.
109 | - Locate the `+` button (next to your avatar).
110 | - Select the `New repository` option.
111 | - Choose a name for your repository: e.g. `demo-repo`.
112 | - In the **Description** field add a brief description: e.g. "this is a demo repo"
113 | - Use the default settings, and click the green button **Create repository**.
114 | - You should see some content similar (but not identical) to the one below:
115 |
116 | ``` bash
117 | echo "# Demo Repo" >> README.md
118 | git init
119 | git add README.md
120 | git commit -m "first commit"
121 | git remote add origin https://github.com/gastonstat/demo-repo.git
122 | git push -u origin master
123 | ```
124 |
125 | ### 2) Create a local Git Repository
126 |
127 | - Open the terminal (Mac Terminal, or Git-Bash for Windows users).
128 | - Optional: change directory to your preferred location e.g. your `Desktop`
129 |
130 | ``` bash
131 | cd Desktop
132 | ```
133 |
134 | - Create a directory with the name of your github repo
135 |
136 | ``` bash
137 | mkdir demo-repo
138 | ```
139 |
140 | - Change to the directory you just created
141 |
142 | ``` bash
143 | cd demo-repo
144 | ```
145 |
146 | - Initialize the directory as a git repository
147 |
148 | ``` bash
149 | git init
150 | ```
151 |
152 | It's possible that you encounter some error message, e.g. Mac users may get a message related to a missing component for `CommandLineTools`. If this is your case, then type in the terminal console:
153 |
154 | ``` bash
155 | # Mac users may need to run this command
156 | xcode-select --install
157 | ```
158 |
159 | The command `git init` will set up your directory `demo-repo` as a Git repository (NOT to be confused with your GitHub repository). This is basically your **local** repository.
160 |
161 | ### 3) Adding a README file
162 |
163 | - It is customary to add a `README.md` file at the top level. This file must contain (at least) a description of what the repository is about. The following command will create a `README.md` file with some minimalist content:
164 |
165 | ``` bash
166 | echo "# Demo Repo" >> README.md
167 | ```
168 |
169 | - So far you have a "new" file in your local repo, but this change has not been recorded by Git. You can confirm this by checking the status of the repo:
170 |
171 | ``` bash
172 | git status
173 | ```
174 |
175 | - Notice that Git knows that `README.md` is untracked. So let's add the changes to Git's database:
176 |
177 | ``` bash
178 | git add README.md
179 | ```
180 |
181 | - Check the status of the repo again:
182 |
183 | ``` bash
184 | git status
185 | ```
186 |
187 | - Now Git is tracking the file `README.md`.
188 | - Next thing consists of **committing** the changes
189 |
190 | ``` bash
191 | git commit -m "first commit"
192 | ```
193 |
194 | ### 4) Adding a remote
195 |
196 | Right now you have a (local) Git repository in your computer. And you also have a GitHub repository in your GitHub account. Both repositories should have the same name, and the goal is to link them. To do this, you need to tell Git that a *remote* repository (i.e. the one in GitHub) will be added:
197 |
198 | - To add a remote repository use the command below **with your own username**:
199 |
200 | ``` bash
201 | git remote add origin https://github.com/username/demo-repo.git
202 | ```
203 |
204 | - Verify your new remote
205 |
206 | ``` bash
207 | git remote -v
208 | ```
209 |
210 | - If everything is okay, you should be able to see a message (with your own username) like this:
211 |
212 | # Verify new remote
213 | origin https://github.com/username/demo-repo.git (fetch)
214 | origin https://github.com/username/demo-repo.git (push)
215 |
216 | ### 5) Pushing changes to a remote repo
217 |
218 | - Now that you have linked your local repo with your remote repo, you can start pushing (i.e. uploading) commits to GitHub.
219 | - As part of the basic workflow with git and github, you want to constantly check the status of your repo
220 |
221 | git status
222 |
223 | - Now let's push your recent commit from the local branch (`master`) to the remote repository (`origin`):
224 |
225 | ``` bash
226 | git push origin master
227 | ```
228 |
229 | - Go to your GitHub repository and refresh the browser. If everything went fine, you should be able to see the contents of your customized `README.md` file.
230 |
231 | ------------------------------------------------------------------------
232 |
233 | GitHub Classroom Repository
234 | ---------------------------
235 |
236 | Read and follow the instructions to get your own **GitHub Classroom** repository, available in the following link:
237 |
238 | [lab03-github-classroom.pdf](lab03-github-classroom.pdf)
239 |
--------------------------------------------------------------------------------
/labs/lab03-github-classroom.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab03-github-classroom.pdf
--------------------------------------------------------------------------------
/labs/lab05-images/knitr-fig-path.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab05-images/knitr-fig-path.png
--------------------------------------------------------------------------------
/labs/lab05-images/named-chunk.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab05-images/named-chunk.png
--------------------------------------------------------------------------------
/labs/lab05-images/unnamed-chunk-19-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab05-images/unnamed-chunk-19-1.png
--------------------------------------------------------------------------------
/labs/lab05-images/unnamed-chunk-21-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab05-images/unnamed-chunk-21-1.png
--------------------------------------------------------------------------------
/labs/lab05-images/unnamed-chunk-22-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab05-images/unnamed-chunk-22-1.png
--------------------------------------------------------------------------------
/labs/lab05-images/unnamed-chunk-23-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab05-images/unnamed-chunk-23-1.png
--------------------------------------------------------------------------------
/labs/lab05-images/unnamed-chunk-24-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab05-images/unnamed-chunk-24-1.png
--------------------------------------------------------------------------------
/labs/lab05-images/unnamed-chunk-25-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab05-images/unnamed-chunk-25-1.png
--------------------------------------------------------------------------------
/labs/lab05-images/unnamed-chunk-25-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab05-images/unnamed-chunk-25-2.png
--------------------------------------------------------------------------------
/labs/lab07-images/error-true.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab07-images/error-true.png
--------------------------------------------------------------------------------
/labs/lab07-images/gaussian_plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab07-images/gaussian_plot-1.png
--------------------------------------------------------------------------------
/labs/lab07-images/polynomial_plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab07-images/polynomial_plot-1.png
--------------------------------------------------------------------------------
/labs/lab08-images/arith_mean.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab08-images/arith_mean.png
--------------------------------------------------------------------------------
/labs/lab08-images/arith_series.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab08-images/arith_series.png
--------------------------------------------------------------------------------
/labs/lab08-images/geo_seq.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab08-images/geo_seq.png
--------------------------------------------------------------------------------
/labs/lab08-images/geom_mean.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab08-images/geom_mean.png
--------------------------------------------------------------------------------
/labs/lab08-images/sine_series.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab08-images/sine_series.png
--------------------------------------------------------------------------------
/labs/lab08-images/std_dev.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab08-images/std_dev.png
--------------------------------------------------------------------------------
/labs/lab08-images/sum_series1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab08-images/sum_series1.png
--------------------------------------------------------------------------------
/labs/lab08-images/sum_series2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab08-images/sum_series2.png
--------------------------------------------------------------------------------
/labs/lab10-images/day_barchart-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab10-images/day_barchart-1.png
--------------------------------------------------------------------------------
/labs/lab10-images/plotly-barchart1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab10-images/plotly-barchart1.png
--------------------------------------------------------------------------------
/labs/lab10-images/plotly-barchart2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab10-images/plotly-barchart2.png
--------------------------------------------------------------------------------
/labs/lab10-images/plotly0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab10-images/plotly0.png
--------------------------------------------------------------------------------
/labs/lab10-images/plotly1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab10-images/plotly1.png
--------------------------------------------------------------------------------
/labs/lab10-images/san-francisco-map-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab10-images/san-francisco-map-1.png
--------------------------------------------------------------------------------
/labs/lab10-images/san-francisco.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab10-images/san-francisco.png
--------------------------------------------------------------------------------
/labs/lab10-images/san-francisco.png.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab10-images/san-francisco.png.rda
--------------------------------------------------------------------------------
/labs/lab10-images/ugly_map-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab10-images/ugly_map-1.png
--------------------------------------------------------------------------------
/labs/lab11-images/app1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab11-images/app1.png
--------------------------------------------------------------------------------
/labs/lab11-images/app2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab11-images/app2.png
--------------------------------------------------------------------------------
/labs/lab11-images/app3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab11-images/app3.png
--------------------------------------------------------------------------------
/labs/lab11-images/freqs-plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab11-images/freqs-plot.png
--------------------------------------------------------------------------------
/labs/lab11-random-simulations.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Lab 11: Random Number and Simulations"
3 | subtitle: "Stat 133, Spring 2018"
4 | author: "Gaston Sanchez"
5 | output: github_document
6 | fontsize: 11pt
7 | urlcolor: blue
8 | ---
9 |
10 | > ### Learning Objectives
11 | >
12 | > - getting started with simulations in R
13 | > - learn how to create a basic shiny app
14 | > - put in practice concepts from your introductory statistics course(s)
15 |
16 | ```{r setup, include=FALSE}
17 | knitr::opts_chunk$set(echo = TRUE, error = TRUE, fig.path = 'lab11-images/')
18 | ```
19 |
20 | ------
21 |
22 |
23 | # Introduction
24 |
25 | Random numbers have many applications in science and computer programming,
26 | especially when there are significant uncertainties in a phenomenon of interest.
27 |
28 |
29 | # Computing Probabilities
30 |
31 | With the mathematical rules from probability theory we can compute the
32 | probability that a certain event happens. Consider for example two bags
33 | containing balls of different colors. Bag 1 contains 2 white balls and 1 red
34 | ball; bag 2 contains 3 white balls and 1 red ball.
35 |
36 | Suppose that a bag is chosen at random, and then a ball is picked at random
37 | from the selected bag. What is the probability that:
38 |
39 | a. the ball chosen is red
40 | b. the ball chosen is white
41 |
42 | This problem can be solved analytically using the formulas:
43 |
44 | ```
45 | P(red) = P(red | bag1) P(bag1) + P(red | bag2) P(bag2)
46 |
47 | P(white) = P(white | bag1) P(bag1) + P(white | bag2) P(bag2)
48 | ```
49 |
50 | Instead of solving this problem analytically, you can write R code to simulate
51 | the experiment of picking a bag and drawing a ball. The first step consists of
52 | creating two bags as character vectors with the name of the colors for the balls:
53 |
54 | ```{r}
55 | # bags
56 | bag1 <- c('white', 'white', 'red')
57 | bag2 <- c(rep('white', 3), 'red')
58 | ```
59 |
60 | To compute the probability using simulations, we need to replicate the
61 | random experiments a large number of times (e.g. 500 or 1000 times).
62 |
63 | ```{r}
64 | bags <- c('bag1', 'bag2')
65 | repetitions <- 1000
66 | drawn_balls <- character(repetitions)
67 |
68 | set.seed(345)
69 | for (i in 1:repetitions) {
70 | # select one bag
71 | chosen_bag <- sample(bags, 1)
72 |
73 | # draw a ball from chosen bag
74 | if (chosen_bag == 'bag1') {
75 | drawn_balls[i] <- sample(bag1, 1)
76 | } else {
77 | drawn_balls[i] <- sample(bag2, 1)
78 | }
79 | }
80 |
81 | table(drawn_balls) / repetitions
82 | ```
83 |
84 |
85 |
86 | # A less basic probability problem
87 |
88 | You can manually find the probabilities of the previous example. However, not
89 | all real problems have an analytic solution. Consider the following situation.
90 | There are two boxes with balls of different colors. Box 1 contains two `blue`
91 | balls, and one `red` ball. Box 2 contains two `blue` balls, three `red` balls,
92 | and one `white` ball.
93 |
94 | The random experiment consists of generating a random number that follows a
95 | uniform distribution (min = 0, max = 1). If the number is greater than 0.5,
96 | then a sample __with replacement__ of size 4 is drawn from box 1. If the
97 | random number is less than or equal to 0.5, then a sample __without replacement__
98 | of size 4 is drawn from box 2. The goal is to find the probability distribution
99 | for the number of blue balls. In other words:
100 |
101 | - Probability of 0 blue balls
102 | - Probability of 1 blue ball
103 | - Probability of 2 blue balls
104 | - Probability of 3 blue balls
105 | - Probability of 4 blue balls
106 |
107 |
108 | ### Your Turn
109 |
110 | 1. Create two character vectors `box1` and `box2` with colors of balls:
111 |
112 | 2. The random experiment involves generating a uniform random number using
113 | `runif(1)`. If this number is greater than 0.5, get a `sample()` with
114 | replacement of `size = 4` from `box1`. Otherwise, get a `sample()` without
115 | replacement of `size = 4` from `box2`.
116 |
117 | 3. Repeat the experiment 1000 times using a `for` loop. To store the drawn
118 | samples, use a matrix `drawn_balls`. This matrix will have 1000 rows and 4 columns.
119 | In each row you assign the output of a random sample of balls.
120 |
121 |
122 | Your matrix `drawn_balls` could look like this (first five rows):
123 |
124 | ```
125 | [,1] [,2] [,3] [,4]
126 | [1,] "blue" "red" "red" "blue"
127 | [2,] "red" "blue" "white" "red"
128 | [3,] "red" "blue" "red" "red"
129 | [4,] "red" "red" "red" "blue"
130 | [5,] "red" "red" "blue" "white"
131 | ```
132 |
133 |
134 | 4. Once you filled the matrix `drawn_balls`, compute the proportion of samples
135 | containing: 0, 1, 2, 3, or 4 blue balls.
136 |
137 | 5. Try to obtain the following plot showing the relative frequencies of number
138 | of blue balls over the series of repetitions.
139 |
140 | ```{r freq_plot, fig.width=2, out.width='80%', echo = FALSE, fig.align='center'}
141 | knitr::include_graphics('lab11-images/freqs-plot.png')
142 | ```
143 |
144 |
145 | -----
146 |
147 | ## Shiny App
148 |
149 | - Open RStudio.
150 | - Go to the __File__ option from the menu bar.
151 | - Select __New File__ and choose __Shiny Web App__.
152 | - Give a name to your App, choose a location for it, and click the __Create__
153 | button.
154 |
155 | These steps should create a new folder in the specified directory containing
156 | an R script file called `app.R`. This file contains a basic template with the
157 | following main ingredients:
158 |
159 | - a call to `library(shiny)` at the top of the file
160 | - the User Interface "function" `ui <- fluidPage(...)`
161 | - the Server "function" `server <- function(input, output) {...}`
162 | - a call to `shinyApp(ui = ui, server = server)` to run your app
163 |
164 | By default, shiny creates a basic template with a histogram of the variable
165 | `waiting` from the data set `faithful`. You can try running the app by clicking
166 | on the __Run App__ button (see buttons at the top of the source pane).
167 |
168 |
169 | ## App scripts
170 |
171 | Instead of using the default `app.R` script, you will be playing with your
172 | own scripts to simulate the random experiment of drawing 4 balls from the
173 | boxes.
174 |
175 | While working on this part of the lab, you may want to look at the
176 | [Shiny Widgets Gallery](https://shiny.rstudio.com/gallery/widget-gallery.html),
177 | as well as the [shiny cheatsheet](../cheatsheets/shiny-cheatsheet.pdf).
178 |
179 |
180 | ### App version 1
181 |
182 | Try to create a shiny app that replicates the image below:
183 |
184 | - there is only one widget input: slider that controls the number of repetitions
185 |
186 | ```{r app1, fig.width=4, out.width='80%', echo = FALSE, fig.align='center'}
187 | knitr::include_graphics('lab11-images/app1.png')
188 | ```
189 |
190 |
191 | ### App version 2
192 |
193 | Modify the first app to create a second shiny app that replicates the image below:
194 |
195 | - widget input: slider that controls the number of repetitions
196 | - widget input: slider that controls the probability threshold for choosing the
197 | boxes.
198 |
199 | ```{r app2, fig.width=4, out.width='80%', echo = FALSE, fig.align='center'}
200 | knitr::include_graphics('lab11-images/app2.png')
201 | ```
202 |
203 |
204 | ### App version 3
205 |
206 | Modify the second app to create a third shiny app that replicates the image below:
207 |
208 | - widget input: slider that controls the number of repetitions
209 | - widget input: slider that controls the probability threshold for choosing the
210 | boxes.
211 | - widget input: numeric input that controls the random seed.
212 |
213 |
214 | ```{r app3, fig.width=4, out.width='80%', echo = FALSE, fig.align='center'}
215 | knitr::include_graphics('lab11-images/app3.png')
216 | ```
217 |
218 |
219 | -----
220 |
221 | ### App R scripts
222 |
223 | In the folder `lab11-shiny-apps`, you will find several app R scripts: `app1.R`,
224 | `app2.R`, `app3.R`, and `app4.R`. Each of them adds a new element to the
225 | sidebar, so that your app becomes more flexible.
226 |
227 | - `app1.R`: basic skeleton that includes input for number of repetitions
228 | - `app2.R`: includes input for threshold criteria to select a box
229 | - `app3.R`: includes input for random seed
230 |
231 |
232 | ### App version 4
233 |
234 | The file `app4.R` is a bit more complex. First, the simulation code is moved
235 | out of `renderPlot()`. Notice also the use of `reactive()` to create the
236 | reactive object `blue_counts()`, shared by both outputs. Likewise, in the main
237 | panel of outputs, we display a data table showing summary results.
238 |
239 |
--------------------------------------------------------------------------------
/labs/lab11-random-simulations.md:
--------------------------------------------------------------------------------
1 | Lab 11: Random Number and Simulations
2 | ================
3 | Gaston Sanchez
4 |
5 | > ### Learning Objectives
6 | >
7 | > - getting started with simulations in R
8 | > - learn how to create a basic shiny app
9 | > - put in practice concepts from your introductory statistics
10 | > course(s)
11 |
12 | -----
13 |
14 | # Introduction
15 |
16 | Random numbers have many applications in science and computer
17 | programming, especially when there are significant uncertainties in a
18 | phenomenon of interest.
19 |
20 | # Computing Probabilities
21 |
22 | With the mathematical rules from probability theory we can compute the
23 | probability that a certain event happens. Consider for example two bags
24 | containing balls of different colors. Bag 1 contains 2 white balls and 1
25 | red ball; bag 2 contains 3 white balls and 1 red ball.
26 |
27 | Suppose that a bag is chosen at random, and then a ball is picked at
28 | random from the selected bag. What is the probability that:
29 |
30 | 1. the ball chosen is red
31 | 2. the ball chosen is white
32 |
33 | This problem can be solved analytically using the formulas:
34 |
35 | P(red) = P(red | bag1) P(bag1) + P(red | bag2) P(bag2)
36 |
37 | P(white) = P(white | bag1) P(bag1) + P(white | bag2) P(bag2)
38 |
39 | Instead of solving this problem analytically, you can write R code to
40 | simulate the experiment of picking a bag and drawing a ball. The first
41 | step consists of creating two bags as character vectors with the name of
42 | the colors for the balls:
43 |
44 | ``` r
45 | # bags
46 | bag1 <- c('white', 'white', 'red')
47 | bag2 <- c(rep('white', 3), 'red')
48 | ```
49 |
50 | To compute the probability using simulations, we need to replicate the
51 | random experiments a large number of times (e.g. 500 or 1000 times).
52 |
53 | ``` r
54 | bags <- c('bag1', 'bag2')
55 | repetitions <- 1000
56 | drawn_balls <- character(repetitions)
57 |
58 | set.seed(345)
59 | for (i in 1:repetitions) {
60 | # select one bag
61 | chosen_bag <- sample(bags, 1)
62 |
63 | # draw a ball from chosen bag
64 | if (chosen_bag == 'bag1') {
65 | drawn_balls[i] <- sample(bag1, 1)
66 | } else {
67 | drawn_balls[i] <- sample(bag2, 1)
68 | }
69 | }
70 |
71 | table(drawn_balls) / repetitions
72 | ```
73 |
74 | ## drawn_balls
75 | ## red white
76 | ## 0.263 0.737
77 |
78 | # A less basic probability problem
79 |
80 | You can manually find the probabilities of the previous example.
81 | However, not all real problems have an analytic solution. Consider the
82 | following situation. There are two boxes with balls of different colors.
83 | Box 1 contains two `blue` balls, and one `red` ball. Box 2 contains two
84 | `blue` balls, three `red` balls, and one `white` ball.
85 |
86 | The random experiment consists of generating a random number that
87 | follows a uniform distribution (min = 0, max = 1). If the number is
88 | greater than 0.5, then a sample **with replacement** of size 4 is drawn
89 | from box 1. If the random number is less than or equal to 0.5, then a
90 | sample **without replacement** of size 4 is drawn from box 2. The goal is
91 | to find the probability distribution for the number of blue balls. In
92 | other words:
93 |
94 | - Probability of 0 blue balls
95 | - Probability of 1 blue ball
96 | - Probability of 2 blue balls
97 | - Probability of 3 blue balls
98 | - Probability of 4 blue balls
99 |
100 | ### Your Turn
101 |
102 | 1. Create two character vectors `box1` and `box2` with colors of balls:
103 |
104 | 2. The random experiment involves generating a uniform random number
105 | using `runif(1)`. If this number is greater than 0.5, get a
106 | `sample()` with replacement of `size = 4` from `box1`. Otherwise,
107 | get a `sample()` without replacement of `size = 4` from `box2`.
108 |
109 | 3. Repeat the experiment 1000 times using a `for` loop. To store the
110 | drawn samples, use a matrix `drawn_balls`. This matrix will have
111 | 1000 rows and 4 columns. In each row you assign the output of a
112 | random sample of balls.
113 |
114 | Your matrix `drawn_balls` could look like this (first five rows):
115 |
116 | ```
117 | [,1] [,2] [,3] [,4]
118 | [1,] "blue" "red" "red" "blue"
119 | [2,] "red" "blue" "white" "red"
120 | [3,] "red" "blue" "red" "red"
121 | [4,] "red" "red" "red" "blue"
122 | [5,] "red" "red" "blue" "white"
123 | ```
124 |
125 | 4. Once you filled the matrix `drawn_balls`, compute the proportion of
126 | samples containing: 0, 1, 2, 3, or 4 blue balls.
127 |
128 | 5. Try to obtain the following plot showing the relative frequencies of
129 | number of blue balls over the series of
130 | repetitions.
131 |
132 |
133 |
134 | -----
135 |
136 | ## Shiny App
137 |
138 | - Open RStudio.
139 | - Go to the **File** option from the menu bar.
140 | - Select **New File** and choose **Shiny Web App**.
141 | - Give a name to your App, choose a location for it, and click the
142 | **Create** button.
143 |
144 | These steps should create a new folder in the specified directory
145 | containing an R script file called `app.R`. This file contains a basic
146 | template with the following main ingredients:
147 |
148 | - a call to `library(shiny)` at the top of the file
149 | - the User Interface “function” `ui <- fluidPage(...)`
150 | - the Server “function” `server <- function(input, output) {...}`
151 | - a call to `shinyApp(ui = ui, server = server)` to run your app
152 |
153 | By default, shiny creates a basic template with a histogram of the
154 | variable `waiting` from the data set `faithful`. You can try running the
155 | app by clicking on the **Run App** button (see buttons at the top of the
156 | source pane).
157 |
158 | ## App scripts
159 |
160 | Instead of using the default `app.R` script, you will be playing with
161 | your own scripts to simulate the random experiment of drawing 4 balls
162 | from the boxes.
163 |
164 | While working on this part of the lab, you may want to look at the
165 | [Shiny Widgets
166 | Gallery](https://shiny.rstudio.com/gallery/widget-gallery.html), as well
167 | as the [shiny cheatsheet](../cheatsheets/shiny-cheatsheet.pdf).
168 |
169 | ### App version 1
170 |
171 | Try to create a shiny app that replicates the image below:
172 |
173 | - there is only one widget input: slider that controls the number of
174 | repetitions
175 |
176 |
177 |
178 | ### App version 2
179 |
180 | Modify the first app to create a second shiny app that replicates the
181 | image below:
182 |
183 | - widget input: slider that controls the number of repetitions
184 | - widget input: slider that controls the probability threshold for
185 | choosing the
186 | boxes.
187 |
188 |
189 |
190 | ### App version 3
191 |
192 | Modify the second app to create a third shiny app that replicates the
193 | image below:
194 |
195 | - widget input: slider that controls the number of repetitions
196 | - widget input: slider that controls the probability threshold for
197 | choosing the boxes.
198 | - widget input: numeric input that controls the random
199 | seed.
200 |
201 |
202 |
203 | -----
204 |
205 | ### App R scripts
206 |
207 | In the folder `lab11-shiny-apps`, you will find several app R scripts:
208 | `app1.R`, `app2.R`, `app3.R`, and `app4.R`. Each of them adds a new
209 | element to the sidebar, so that your app becomes more flexible.
210 |
211 | - `app1.R`: basic skeleton that includes input for number of
212 | repetitions
213 | - `app2.R`: includes input for threshold criteria to select a box
214 | - `app3.R`: includes input for random seed
215 |
216 | ### App version 4
217 |
218 | The file `app4.R` is a bit more complex. First, the simulation code is
219 | moved out of `renderPlot()`. Notice also the use of `reactive()` to
220 | create the reactive object `blue_counts()`, shared by both outputs.
221 | Likewise, in the main panel of outputs, we display a data table showing
222 | summary results.
223 |
--------------------------------------------------------------------------------
/labs/lab11-shiny-apps/app1.R:
--------------------------------------------------------------------------------
1 | # Lab 11, Stat 133 Spring 2018, Prof. Sanchez
2 | # Shiny App version 1: Drawing balls from boxes
3 | # Inputs:
4 | # repetitions: number of repetitions
5 | #
6 | # Outputs:
7 | # frequency plot of number of blue balls
8 |
9 | library(shiny)
10 | library(ggplot2)
11 |
12 |
13 | # User interface: a fluid page with a title, a sidebar of inputs and a main panel
14 | ui <- fluidPage(
15 |
16 | # Application title
17 | titlePanel("Drawing Balls Experiment"),
18 |
19 | # Sidebar with one slider; the server reads its value as input$repetitions
20 | sidebarLayout(
21 | sidebarPanel(
22 | sliderInput("repetitions",
23 | label = "Number of repetitions:",
24 | min = 1,
25 | max = 5000,
26 | value = 100)
27 | ),
28 |
29 | # Main panel displays the plot that the server assigns to output$freqs_plot
30 | mainPanel(
31 | plotOutput("freqs_plot")
32 | )
33 | )
34 | )
35 |
36 |
37 | # Server: simulates the ball-drawing experiment and renders the frequency plot
38 | server <- function(input, output) {
39 | # Reads input$repetitions; writes output$freqs_plot
40 | # Fill in the spot we created for a plot
41 | output$freqs_plot <- renderPlot({
42 | # boxes as character vectors
43 | box1 <- c('blue', 'blue', 'red')
44 | box2 <- c('blue', 'blue', 'red', 'red', 'red', 'white')
45 | # drawn_balls: one row per repetition, one column per drawn ball
46 | size <- 4
47 | drawn_balls <- matrix("", input$repetitions, size)
48 | # a uniform draw above 0.5 selects box1 (sampled with replacement); otherwise box2 (without)
49 | for (r in 1:input$repetitions) {
50 | aux <- runif(1)
51 | if (aux > 0.5) {
52 | drawn_balls[r, ] <- sample(box1, size, replace = TRUE)
53 | } else {
54 | drawn_balls[r,] <- sample(box2, size)
55 | }
56 | }
57 |
58 | # number of blue balls in each repetition
59 | blue_counts <- apply(drawn_balls, 1, function(x) sum(x == 'blue'))
60 |
61 | # progression of relative frequencies: element k + 1 is the running proportion of repetitions with k blue balls
62 | blue_freqs <- vector(mode = "list", length = 5)
63 | for (num_blue in 0:4) {
64 | temp_freqs <- cumsum(blue_counts == num_blue) / (1:input$repetitions)
65 | blue_freqs[[num_blue + 1]] <- temp_freqs
66 | }
67 | # long format: the five frequency series stacked and labeled by number of blue balls
68 | dat <- data.frame(
69 | reps = rep(1:input$repetitions, 5),
70 | freqs = unlist(blue_freqs),
71 | number = factor(rep(0:4, each = input$repetitions))
72 | )
73 | # one colored path per number of blue balls (0 to 4)
74 | ggplot(data = dat, aes(x = reps, y = freqs, group = number)) +
75 | geom_path(aes(color = number)) +
76 | ggtitle("Relative frequencies of number of blue balls")
77 | })
78 | }
79 |
80 |
81 | # Run the application
82 | shinyApp(ui = ui, server = server)
83 |
84 |
85 |
86 |
--------------------------------------------------------------------------------
/labs/lab11-shiny-apps/app2.R:
--------------------------------------------------------------------------------
1 | # Lab 11, Stat 133 Spring 2018, Prof. Sanchez
2 | # Shiny App version 2: Drawing balls from boxes
3 | # Inputs:
4 | # repetitions: number of repetitions
5 | # threshold: threshold to select box
6 | #
7 | # Outputs:
8 | # frequency plot of number of blue balls
9 |
10 | library(shiny)
11 | library(ggplot2)
12 |
13 | # User interface: a fluid page with a title, a sidebar of inputs and a main panel
14 | ui <- fluidPage(
15 |
16 | # Application title
17 | titlePanel("Drawing Balls Experiment"),
18 |
19 | # Sidebar with two sliders; the server reads them as input$repetitions and input$threshold
20 | sidebarLayout(
21 | sidebarPanel(
22 | sliderInput("repetitions",
23 | label = "Number of repetitions:",
24 | min = 1,
25 | max = 5000,
26 | value = 100),
27 | sliderInput("threshold",
28 | label = "Threshold for choosing boxes:",
29 | min = 0,
30 | max = 1,
31 | value = 0.5)
32 | ),
33 |
34 | # Main panel displays the plot that the server assigns to output$freqs_plot
35 | mainPanel(
36 | plotOutput("freqs_plot")
37 | )
38 | )
39 | )
40 |
41 |
42 | # Server: simulates the ball-drawing experiment and renders the frequency plot
43 | server <- function(input, output) {
44 | # Reads input$repetitions and input$threshold; writes output$freqs_plot
45 | # Fill in the spot we created for a plot
46 | output$freqs_plot <- renderPlot({
47 | # boxes as character vectors
48 | box1 <- c('blue', 'blue', 'red')
49 | box2 <- c('blue', 'blue', 'red', 'red', 'red', 'white')
50 | # drawn_balls: one row per repetition, one column per drawn ball
51 | size <- 4
52 | drawn_balls <- matrix("", input$repetitions, size)
53 | # a uniform draw above input$threshold selects box1 (sampled with replacement); otherwise box2 (without)
54 | for (r in 1:input$repetitions) {
55 | aux <- runif(1)
56 | if (aux > input$threshold) {
57 | drawn_balls[r, ] <- sample(box1, size, replace = TRUE)
58 | } else {
59 | drawn_balls[r,] <- sample(box2, size)
60 | }
61 | }
62 |
63 | # number of blue balls in each repetition
64 | blue_counts <- apply(drawn_balls, 1, function(x) sum(x == 'blue'))
65 |
66 | # progression of relative frequencies: element k + 1 is the running proportion of repetitions with k blue balls
67 | blue_freqs <- vector(mode = "list", length = 5)
68 | for (num_blue in 0:4) {
69 | temp_freqs <- cumsum(blue_counts == num_blue) / (1:input$repetitions)
70 | blue_freqs[[num_blue + 1]] <- temp_freqs
71 | }
72 | # long format: the five frequency series stacked and labeled by number of blue balls
73 | dat <- data.frame(
74 | reps = rep(1:input$repetitions, 5),
75 | freqs = unlist(blue_freqs),
76 | number = factor(rep(0:4, each = input$repetitions))
77 | )
78 | # one colored path per number of blue balls (0 to 4)
79 | ggplot(data = dat, aes(x = reps, y = freqs, group = number)) +
80 | geom_path(aes(color = number)) +
81 | ggtitle("Relative frequencies of number of blue balls")
82 | })
83 | }
84 |
85 |
86 | # Run the application
87 | shinyApp(ui = ui, server = server)
88 |
89 |
90 |
91 |
--------------------------------------------------------------------------------
/labs/lab11-shiny-apps/app3.R:
--------------------------------------------------------------------------------
1 | # Lab 11, Stat 133 Spring 2018, Prof. Sanchez
2 | # Shiny App version 3: Drawing balls from boxes
3 | # Inputs:
4 | # repetitions: number of repetitions
5 | # threshold: threshold to select box
6 | # seed: random seed
7 | #
8 | # Outputs:
9 | # frequency plot of number of blue balls
10 |
11 | library(shiny)
12 | library(ggplot2)
13 |
14 | # User interface: a fluid page with a title, a sidebar of inputs and a main panel
15 | ui <- fluidPage(
16 |
17 | # Application title
18 | titlePanel("Drawing Balls Experiment"),
19 |
20 | # Sidebar with two sliders and a numeric field; the server reads them as input$repetitions, input$threshold and input$seed
21 | sidebarLayout(
22 | sidebarPanel(
23 | sliderInput("repetitions",
24 | label = "Number of repetitions:",
25 | min = 1,
26 | max = 5000,
27 | value = 100),
28 | sliderInput("threshold",
29 | label = "Threshold for choosing boxes:",
30 | min = 0,
31 | max = 1,
32 | value = 0.5),
33 | numericInput("seed",
34 | "Choose a random seed",
35 | value = 12345)
36 | ),
37 |
38 | # Main panel displays the plot that the server assigns to output$freqs_plot
39 | mainPanel(
40 | plotOutput("freqs_plot")
41 | )
42 | )
43 | )
44 |
45 |
46 | # Server: simulates the ball-drawing experiment and renders the frequency plot
47 | server <- function(input, output) {
48 | # Reads input$repetitions, input$threshold and input$seed; writes output$freqs_plot
49 | # Fill in the spot we created for a plot
50 | output$freqs_plot <- renderPlot({
51 | # boxes as character vectors
52 | box1 <- c('blue', 'blue', 'red')
53 | box2 <- c('blue', 'blue', 'red', 'red', 'red', 'white')
54 | # drawn_balls: one row per repetition, one column per drawn ball; the RNG is seeded from input$seed before drawing
55 | size <- 4
56 | drawn_balls <- matrix("", input$repetitions, size)
57 | set.seed(input$seed)
58 | for (r in 1:input$repetitions) {
59 | aux <- runif(1)
60 | if (aux > input$threshold) {
61 | drawn_balls[r, ] <- sample(box1, size, replace = TRUE)
62 | } else {
63 | drawn_balls[r,] <- sample(box2, size)
64 | }
65 | }
66 |
67 | # number of blue balls in each repetition
68 | blue_counts <- apply(drawn_balls, 1, function(x) sum(x == 'blue'))
69 |
70 | # progression of relative frequencies: element k + 1 is the running proportion of repetitions with k blue balls
71 | blue_freqs <- vector(mode = "list", length = 5)
72 | for (num_blue in 0:4) {
73 | temp_freqs <- cumsum(blue_counts == num_blue) / (1:input$repetitions)
74 | blue_freqs[[num_blue + 1]] <- temp_freqs
75 | }
76 | # long format: the five frequency series stacked and labeled by number of blue balls
77 | dat <- data.frame(
78 | reps = rep(1:input$repetitions, 5),
79 | freqs = unlist(blue_freqs),
80 | number = factor(rep(0:4, each = input$repetitions))
81 | )
82 | # one colored path per number of blue balls (0 to 4)
83 | ggplot(data = dat, aes(x = reps, y = freqs, group = number)) +
84 | geom_path(aes(color = number)) +
85 | ggtitle("Relative frequencies of number of blue balls")
86 | })
87 | }
88 |
89 |
90 | # Run the application
91 | shinyApp(ui = ui, server = server)
92 |
93 |
94 |
95 |
--------------------------------------------------------------------------------
/labs/lab11-shiny-apps/app4.R:
--------------------------------------------------------------------------------
1 | # Lab 11, Stat 133 Spring 2018, Prof. Sanchez
2 | # Shiny App version 4: Drawing balls from boxes
3 | # Inputs:
4 | # repetitions: number of repetitions
5 | # threshold: threshold to select box
6 | # seed: random seed
7 | #
8 | # Outputs:
9 | # frequency plot of number of blue balls
10 | # table with frequencies
11 |
12 | library(shiny)
13 | library(ggplot2)
14 |
15 | # User interface: a fluid page with a title, a sidebar of inputs and a main panel
16 | ui <- fluidPage(
17 |
18 | # Application title
19 | titlePanel("Drawing Balls Experiment"),
20 |
21 | # Sidebar with two sliders and a numeric field; the server reads them as input$repetitions, input$threshold and input$seed
22 | sidebarLayout(
23 | sidebarPanel(
24 | sliderInput("repetitions",
25 | label = "Number of repetitions:",
26 | min = 1,
27 | max = 5000,
28 | value = 100),
29 | sliderInput("threshold",
30 | label = "Threshold for choosing boxes:",
31 | min = 0,
32 | max = 1,
33 | value = 0.5),
34 | numericInput("seed",
35 | "Choose a random seed",
36 | value = 12345)
37 | ),
38 |
39 | # Main panel displays the plot and the table that the server assigns to output$freqs_plot and output$summary_table
40 | mainPanel(
41 | plotOutput("freqs_plot"),
42 | tableOutput("summary_table")
43 | )
44 | )
45 | )
46 |
47 |
48 | # Server: simulates the ball-drawing experiment once and feeds both the plot and the summary table
49 | server <- function(input, output) {
50 | # Reactive expression returning the number of blue balls in each repetition; used by both outputs below
51 | blue_counts <- reactive({
52 | # boxes as character vectors
53 | box1 <- c('blue', 'blue', 'red')
54 | box2 <- c('blue', 'blue', 'red', 'red', 'red', 'white')
55 | # drawn_balls: one row per repetition, one column per drawn ball; the RNG is seeded from input$seed before drawing
56 | size <- 4
57 | drawn_balls <- matrix("", input$repetitions, size)
58 | set.seed(input$seed)
59 | for (r in 1:input$repetitions) {
60 | aux <- runif(1)
61 | if (aux > input$threshold) {
62 | drawn_balls[r, ] <- sample(box1, size, replace = TRUE)
63 | } else {
64 | drawn_balls[r,] <- sample(box2, size)
65 | }
66 | }
67 |
68 | # number of blue balls in each repetition (local vector; it is the value of the reactive)
69 | blue_counts <- apply(drawn_balls, 1, function(x) sum(x == 'blue'))
70 | blue_counts
71 | })
72 |
73 | # Fill in the spot we created for a plot
74 | output$freqs_plot <- renderPlot({
75 | # progression of relative frequencies: element k + 1 is the running proportion of repetitions with k blue balls
76 | blue_freqs <- vector(mode = "list", length = 5)
77 | for (num_blue in 0:4) {
78 | temp_freqs <- cumsum(blue_counts() == num_blue) / (1:input$repetitions)
79 | blue_freqs[[num_blue + 1]] <- temp_freqs
80 | }
81 | # long format: the five frequency series stacked and labeled by number of blue balls
82 | dat <- data.frame(
83 | reps = rep(1:input$repetitions, 5),
84 | freqs = unlist(blue_freqs),
85 | number = factor(rep(0:4, each = input$repetitions))
86 | )
87 | # one colored path per number of blue balls (0 to 4)
88 | ggplot(data = dat, aes(x = reps, y = freqs, group = number)) +
89 | geom_path(aes(color = number)) +
90 | ggtitle("Relative frequencies of number of blue balls")
91 | })
92 | # Summary table: proportion of repetitions for each observed number of blue balls
93 | output$summary_table <- renderTable({
94 | table(blue_counts()) / input$repetitions
95 | })
96 | }
97 |
98 |
99 | # Run the application
100 | shinyApp(ui = ui, server = server)
101 |
--------------------------------------------------------------------------------
/labs/lab12-images/conference-standings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab12-images/conference-standings.png
--------------------------------------------------------------------------------
/labs/lab12-images/gsw-2017-roster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab12-images/gsw-2017-roster.png
--------------------------------------------------------------------------------
/labs/lab12-images/inspect1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab12-images/inspect1.png
--------------------------------------------------------------------------------
/labs/lab12-images/inspect2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/labs/lab12-images/inspect2.png
--------------------------------------------------------------------------------
/papers/correlograms-xia-liu.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/papers/correlograms-xia-liu.pdf
--------------------------------------------------------------------------------
/papers/testthat-wickham.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/papers/testthat-wickham.pdf
--------------------------------------------------------------------------------
/papers/tidy-data-wickham.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/papers/tidy-data-wickham.pdf
--------------------------------------------------------------------------------
/papers/what-is-data-science.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/papers/what-is-data-science.pdf
--------------------------------------------------------------------------------
/slides/00-about-course.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/00-about-course.pdf
--------------------------------------------------------------------------------
/slides/01-big-picture.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/01-big-picture.pdf
--------------------------------------------------------------------------------
/slides/02-about-R.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/02-about-R.pdf
--------------------------------------------------------------------------------
/slides/03-R-vector-types.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/03-R-vector-types.pdf
--------------------------------------------------------------------------------
/slides/04-arrays-factors.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/04-arrays-factors.pdf
--------------------------------------------------------------------------------
/slides/05-lists.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/05-lists.pdf
--------------------------------------------------------------------------------
/slides/06-base-graphics1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/06-base-graphics1.pdf
--------------------------------------------------------------------------------
/slides/07-base-graphics2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/07-base-graphics2.pdf
--------------------------------------------------------------------------------
/slides/08-filesystem-basics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/08-filesystem-basics.pdf
--------------------------------------------------------------------------------
/slides/09-shell-basics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/09-shell-basics.pdf
--------------------------------------------------------------------------------
/slides/10-working-with-files.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/10-working-with-files.pdf
--------------------------------------------------------------------------------
/slides/11-git-basics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/11-git-basics.pdf
--------------------------------------------------------------------------------
/slides/12-data-tables.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/12-data-tables.pdf
--------------------------------------------------------------------------------
/slides/13-importing-tables.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/13-importing-tables.pdf
--------------------------------------------------------------------------------
/slides/14-data-frame-basics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/14-data-frame-basics.pdf
--------------------------------------------------------------------------------
/slides/15-principal-components1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/15-principal-components1.pdf
--------------------------------------------------------------------------------
/slides/16-principal-components2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/16-principal-components2.pdf
--------------------------------------------------------------------------------
/slides/17-dplyr-tutorial.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/17-dplyr-tutorial.pdf
--------------------------------------------------------------------------------
/slides/18-grammar-graphics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/18-grammar-graphics.pdf
--------------------------------------------------------------------------------
/slides/19-ggplot-lecture.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/19-ggplot-lecture.pdf
--------------------------------------------------------------------------------
/slides/20-strings-basics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/20-strings-basics.pdf
--------------------------------------------------------------------------------
/slides/shiny-tutorial.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/slides/shiny-tutorial.pdf
--------------------------------------------------------------------------------
/syllabus/faqs.md:
--------------------------------------------------------------------------------
1 | # Frequently Asked Questions
2 |
3 | Below are some of the questions I get asked every time I teach Stat 133.
4 | Do you have to read them? Not really, but if you ever ask me a
5 | question that is answered here, I'm going to kindly ask you to refer to this document.
6 |
7 | -----
8 |
9 | ### I am an undergrad student on the waitlist. What are my chances of enrolling in the class?
10 |
11 | Stat 133 is a highly demanded course, with a waiting list continuously increasing every year.
12 | From my past experience, between 5% and 7% of enrolled students tend to drop the course in the
13 | first two weeks, thus allowing between 18 and 25 students on the waitlist to join the class.
14 | This semester (spring 2018), however, there are about 110 waitlisted students. So if you
15 | are outside the first 25 on the waitlist, you have a very low chance to join the class.
16 |
17 |
18 | ### I am a grad student on the waitlist. What are my chances of enrolling in the class?
19 |
20 | If you are a grad student seriously interested in Stat 133, then you should try to schedule a meeting with me within the first two weeks of classes. While I cannot guarantee you a spot in the class, I would like to first determine if this course is a good fit for you.
21 |
22 |
23 | ### I am a concurrent student. What are my chances of enrolling in the class?
24 |
25 | Concurrent students have the lowest priority. And looking at past trends, you basically have no chance of getting into the class.
26 |
27 |
28 | ### I would like to switch lab sections with another student. Is this possible?
29 |
30 | I am afraid this is NOT possible. You must attend the lab discussion you are officially registered in. If you cannot attend the lab discussion you are officially registered in, then don't take the class.
31 |
32 |
33 | ### What if I don't agree with all the course policies?
34 |
35 | If there is one or more [policies](policies.md) you don't agree with, then please reconsider your enrollment in the course. I am assuming that all students completely agree with the course policies.
36 |
37 |
38 | ### Is this course a good fit if I don't have any programming experience?
39 |
40 | Yes. We actually expect that most of you come without any coding experience.
41 | It is nice to have some programming experience under your belt, which makes
42 | the learning curve less steep.
43 |
44 |
45 | ### Is this course a good fit if I've already taken at least one programming course?
46 |
47 | You may find the majority of this course somewhat slow (and boring?) in terms
48 | of basic concepts such as data types, data structures, conditionals, loops,
49 | and functions. Please consider taking more advanced courses if what you are
50 | interested in is algorithms, computational statistics, or databases.
51 |
52 |
53 | ### Is this course a good fit if I don't have any data analysis experience?
54 |
55 | Yes. We actually expect that most of you come without any data analysis experience.
56 | In this course you will be working with real data sets, as well as with
57 | simulated data.
58 |
59 |
60 | ### Is this course a good fit if I don't intend to major in Statistics?
61 |
62 | Stat 133 is one of the core courses of the [Statistics Major](http://statistics.berkeley.edu/programs/undergrad/major). The way I teach the course is having Statistics majors as my target audience. However, much of the content should be helpful for any student who has to analyze data.
63 |
64 |
65 | ### After finishing this course, can I call myself a data scientist?
66 |
67 | Not yet. Becoming a data scientist is not a sprint. It is a marathon. Like any other
68 | profession, it takes years of practice and learning. This course is just the beginning.
69 |
70 |
71 | ### What if I don't want to be a data scientist?
72 |
73 | That's perfect too. You don't need to be a data scientist aspirant to take this course.
74 | Whether your plans are to be a consultant, life scientist, social scientist,
75 | journalist, or get some analytic skills, this course should be a good choice.
76 |
77 |
78 | ### Are we going to learn about machine learning methods?
79 |
80 | No. This course is not about machine learning (or statistical learning) methods.
81 | The Statistics department offers a dedicated course on this topic:
82 | __Stat 154: Modern Statistical Prediction and Machine Learning__.
83 | There is also __CS 189: Introduction to Machine Learning__ offered through
84 | Electrical Engineering and Computer Sciences (EECS).
85 |
86 |
87 | ### Are we going to learn about databases?
88 |
89 | No. If you are interested in databases you should consider
90 | __CS 186: Introduction to Database Systems__ offered through
91 | Electrical Engineering and Computer Sciences (EECS).
92 |
93 |
94 | ### Are we going to learn about linear models?
95 |
96 | No. The course for linear models is __Stat 151A: Linear Modeling, Theory and Applications__.
97 |
98 |
99 | ### Are we going to learn about Reproducible Research (RR)?
100 |
101 | Not really. We will touch on dynamic documents and practices that are useful
102 | in RR, but the dedicated course for this topic is
103 | __Stat 159: Reproducible and Collaborative Statistical Data Science__.
104 |
105 |
106 | ### Do we need to memorize all commands?
107 |
108 | No. We don't expect that you memorize all commands. However, we do expect that
109 | you learn the most common types of functions: e.g. `library()`, `function()`, `help()`, etc.
110 |
111 |
112 | ### Can we work in groups?
113 |
114 | Yes, absolutely. We strongly encourage you to not work alone. Well, let me rephrase that.
115 | You should try to work on your own. Experimenting, trial and error. Take notes
116 | of the things you don't understand. Then get with other people and discuss ideas,
117 | share tips (but not the entire solution). Try to explain how some piece of code
118 | works to your friend(s).
119 |
120 |
121 | ### Aren't you supposed to teach us?
122 |
123 | Yes. But you don't learn programming by watching someone else program.
124 | The same way that you don't learn to swim by watching someone else swimming.
125 | You have to get into the pool, and do all the drills your instructor says.
126 | This is a very hands-on course, and you will be required to do a great amount
127 | of work on your own.
128 |
129 |
130 | ### Can I ask you to write me a Letter of Recommendation (LoR)?
131 |
132 | Quick answer: No. However, I am happy to write you a letter of recommendation
133 | if I have known you for at least one year, and as long as we have developed
134 | a good collegial relationship (e.g. I know your name, I know your personal story,
135 | you've shown interest in my work).
136 | Getting a "good grade" does not guarantee that I will write you a LoR. Conversely,
137 | getting a "not so good grade" does not have to be an impediment to write you a LoR.
138 |
139 |
140 | ### I invited you to join my network in LinkedIn. Why haven't you accepted it?
141 |
142 | First: Don't take it personally. It's not you, it's me. Second: if you really
143 | want me to be part of your network, why don't you come see me in person?
144 | We can meet in OH, we can talk right before or after class. Or you can also
145 | schedule a meeting at a different time. Let me know you better than just as
146 | a distant contact in a social media networking site.
147 |
148 |
149 | ### Do you have research projects open to undergrad students?
150 |
151 | Lecturing takes most of my time and I don't have a lab. However, I'm always coming up
152 | with new ideas and experiments, and it's nice to have additional human resources
153 | to create something useful, interesting, open (and cool). If you are
154 | interested in volunteering and willing to learn a lot, come talk to me and
155 | let's see if we can join forces, and add our two cents to the world.
156 |
157 |
158 |
--------------------------------------------------------------------------------
/syllabus/piazza.md:
--------------------------------------------------------------------------------
1 | ## Piazza Etiquette
2 |
3 |
4 | > Important Note:
5 | >
6 | > We will encourage more student participation on Piazza rather than
7 | > answering right away, that is, we will wait until other students step
8 | > up and answer questions.
9 | >
10 | > Of course, we will still provide clarifications on logistics, typos,
11 | > subtle points, etc.
12 | >
13 | > We want to make sure that you are helping each other out, and having
14 | > instructors give away the answers isn't the most beneficial for your
15 | > education either.
16 |
17 |
18 | In order to make Piazza a better resource for everyone, we've outlined
19 | some guidelines for you to follow when posting your questions. Questions
20 | which follow these guidelines will have a higher chance of being answered!
21 |
22 |
23 | ### 1. Ask HW questions only in the designated HW posts.
24 |
25 | We've created individual posts for each problem from homework.
26 | Please ask questions, discuss problems, or help out in those posts only.
27 | Before asking a question, read through (or search) the whole thread to see
28 | if your question has been answered.
29 |
30 |
31 | ### 2. Do NOT post answers in Piazza.
32 |
33 | Please don't give away the answer on Piazza.
34 | You can explain things in a way that still lets other students figure out
35 | the essence of the problem on their own, but don't spoil the problem.
36 | For example, don't point to a useful stackoverflow or YouTube link that
37 | works out essentially what the problem is asking about.
38 |
39 | That is not cool.
40 |
41 | Post such spoilers after the HW is due. That is totally fine.
42 | If you are not sure, post privately to instructors and then we'll let you know.
43 |
44 |
45 | ### 3. Try to make posts public.
46 |
47 | While not violating Rule 2, try to make your questions public, because
48 | others might have the same question and we don't need to answer them
49 | multiple times.
50 |
51 |
52 | ### 4. Piazza is not Office Hours. 3-minute test.
53 |
54 | If you think your questions may take more than 3 minutes to answer,
55 | please come to office hours or homework parties instead.
56 |
57 |
58 | ### 5. Neither Piazza nor GSIs are for pre-grading.
59 |
60 | Please do not post questions of the form:
61 |
62 | - "Is this the correct solution to HW X problem Y?"
63 | - "Would this receive full credit on HW X problem Y?"
64 | - "Is this the right level of detail for hw X problem Y?"
65 |
66 | Please do not use Piazza as a medium to ask instructors to check your
67 | homework in advance. We simply cannot check every student's homework
68 | through Piazza.
69 |
70 | Feel free to ask questions of clarification, or ask questions about the
71 | course content to achieve a deeper understanding, but at a certain point,
72 | you must apply your knowledge, give it your best shot, and submit your
73 | answers with confidence.
74 |
75 |
76 | ### 6. Post a screen shot of any resource referenced.
77 |
78 | Your question should be self-contained. The TAs (and other responders)
79 | should not have to scan through PDFs to even figure out what the question is.
80 | Ask yourself: am I referring to some lecture /HW solution/discussion solution/past exam?
81 |
82 | If the answer is yes, post a screen shot of the relevant part.
83 | This can include your handwritten notes from the relevant lecture.
84 |
85 |
86 | ### 7. Post all your work.
87 |
88 | Don't post one line saying:
89 |
90 | > "At step _n_, I get XYZ, and I'm now confused."
91 |
92 | This forces the GSIs to guess:
93 |
94 | > What happened in steps 1, 2, ..., n - 1?
95 |
96 | Most likely, the GSIs will guess wrong, and we run into a mess of followup
97 | questions trying to figure out what steps 1, 2, ..., n - 1 were.
98 |
99 | Instead, post:
100 |
101 | > Starting out, we have: ....
102 | >
103 | > Then, I do ..., and I get ...
104 | >
105 | > Next, I do ..., and I get ...
106 | >
107 | > Next, I do ..., and I get ...
108 | >
109 | > Now, I get $&%(, and this makes no sense.
110 |
111 | Then, the GSI can respond:
112 |
113 | > The mistake is at step 3, you're not allowed to apply ABC to XYZ because ...
114 |
115 |
116 |
--------------------------------------------------------------------------------
/tutorials/01-images/ggplot-scatter-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/01-images/ggplot-scatter-1.png
--------------------------------------------------------------------------------
/tutorials/01-images/gsw-2017-roster.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/01-images/gsw-2017-roster.png
--------------------------------------------------------------------------------
/tutorials/01-images/gsw-2017-salaries.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/01-images/gsw-2017-salaries.png
--------------------------------------------------------------------------------
/tutorials/01-images/gsw-2017-totals.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/01-images/gsw-2017-totals.png
--------------------------------------------------------------------------------
/tutorials/01-images/plot-scatter-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/01-images/plot-scatter-1.png
--------------------------------------------------------------------------------
/tutorials/01-images/screen-rgui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/01-images/screen-rgui.png
--------------------------------------------------------------------------------
/tutorials/01-images/screen-rstudio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/01-images/screen-rstudio.png
--------------------------------------------------------------------------------
/tutorials/02-intro-to-Rmd-files.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Getting started with R Markdown files"
3 | subtitle: "Stat 133, Spring 2018"
4 | author: "Gaston Sanchez"
5 | output: github_document
6 | fontsize: 11pt
7 | urlcolor: blue
8 | ---
9 |
10 | > ### Learning Objectives:
11 | >
12 | > - Differentiate between `.R` and `.Rmd` files
13 | > - To understand dynamic documents
14 | > - To gain familiarity with R Markdown `.Rmd` files
15 | > - To gain familiarity with code chunks
16 |
17 | ------
18 |
19 | ## Introduction to R Markdown files
20 |
21 | Besides using R script files to write source code, you will be using another
22 | type of source file, known as an _R markdown_ file.
23 | Because you will be turning in most homework assignments as `Rmd` files, it is
24 | important that you quickly become familiar with this resource.
25 |
26 |
27 | ### Opening and knitting an `Rmd` file
28 |
29 | In the menu bar of RStudio, click on __File__, then __New File__,
30 | and choose __R Markdown__. Select the default option (Document),
31 | and click __Ok__. RStudio will open a new `.Rmd` file in the source pane.
32 | And you should be able to see a file with some default content.
33 |
34 | Locate the button __Knit HTML__, the one with an icon of a ball of yarn and two
35 | needles. Click the button (knit to HTML) so you can see how `Rmd` files are
36 | rendered and displayed as HTML documents. Alternatively, you can use a keyboard
37 | shortcut: in Mac `Command+Shift+K`, in Windows `Ctrl+Shift+K`
38 |
39 |
40 | ### What is an `Rmd` file?
41 |
42 | __Rmd__ files are a special type of file, referred to as a _dynamic document_.
43 | This is the fancy term we use to describe a document that allows us to combine
44 | narrative (text) with R code in one single file.
45 |
46 | Rmd files are plain text files. This means that you can open an Rmd file
47 | with any text editor (not just RStudio) and be able to see and edit its
48 | contents.
49 |
50 | The main idea behind dynamic documents is simple yet very powerful: instead of
51 | working with two separate files, one that contains the R code, and
52 | another one that contains the narrative, you use an `.Rmd` file to include
53 | both the commands and the narrative.
54 |
55 | One of the main advantages of this paradigm,
56 | is that you avoid having to copy results from your computations and paste them
57 | into a report file. In fact, there are more complex ways to work with dynamic
58 | documents and source files. But the core idea is the same: combine narrative
59 | and code in a way that you let the computer do the manual, repetitive,
60 | and time consuming job.
61 |
62 | Rmd files are just one type of dynamic document that you will find in RStudio.
63 | In fact, RStudio provides other file formats that can be used
64 | as dynamic documents: e.g. `.Rnw`, `.Rpres`, `.Rhtml`, etc.
65 |
66 |
67 | ### Anatomy of an `Rmd` file
68 |
69 | The structure of an `.Rmd` file can be divided in two parts: 1) a __YAML header__,
70 | and 2) the __body__ of the document. In addition to this structure, you should
71 | know that `.Rmd` files use three types of syntaxes: YAML, Markdown, and R.
72 |
73 | The _YAML header_ consists of the first few lines at the top of the file.
74 | This header is established by a set of three dashes `---` as delimiters
75 | (one starting set, and one ending set). This part of the file requires you
76 | to use YAML syntax (Yet Another Markup Language.)
77 | Within the delimiter sets of dashes, you specify settings (or metadata) that
78 | will apply to the entire document. Some of the common
79 | options are things like:
80 |
81 | - `title`
82 | - `author`
83 | - `date`
84 | - `output`
85 |
86 | The _body_ of the document is everything below the YAML header. It consists
87 | of a mix of narrative and R code. All the text that is narrative is written
88 | in a markup syntax called __Markdown__ (although you can also use LaTeX math
89 | notation). In turn, all the text that is code
90 | is written in R syntax inside _blocks of code_.
91 |
92 | There are two types of blocks of code: 1) __code chunks__, and
93 | 2) __inline code__. Code chunks are lines of text separated from any lines of
94 | narrative text. Inline code is code inserted within a line of narrative text.
95 |
96 |
97 | ### How does an Rmd file work?
98 |
99 | Rmd files are plain text files. All that matters is the syntax of its content.
100 | The content is basically divided in the header, and the body.
101 |
102 | - The header uses YAML syntax.
103 | - The narrative in the body uses Markdown syntax.
104 | - The code and commands use R syntax.
105 |
106 | The process to generate a nice rendered document from an Rmd file is
107 | known as __knitting__. When you _knit_ an Rmd file, various R packages
108 | and programs run behind the scenes. But the process can be broken down
109 | in three main phases: 1) Parsing, 2) Execution, and 3) Rendering.
110 |
111 | 1) Parsing: the content of the file is parsed (examined line by line)
112 | and each component is identified as yaml header, or as markdown text, or as R code.
113 |
114 | Each component receives a special treatment and formatting.
115 |
116 | The most interesting part is in the pieces of text that are R code.
117 | Those are separated and executed if necessary. The commands may be included
118 | in the final document. Also, the output may be included in the final document.
119 | Sometimes, nothing is executed nor included.
120 |
121 | Depending on the specified output format (e.g. HTML, pdf, word), all the
122 | components are assembled, and one single document is generated.
123 |
124 |
125 | ### Yet Another Syntax to Learn
126 |
127 | R markdown (`Rmd`) files use [markdown](https://daringfireball.net/projects/markdown/)
128 | as the main syntax to write content. Markdown is a very lightweight type of
129 | markup language, and it is relatively easy to learn.
130 |
131 | One of the most common sources of confusion when learning about R and Rmd
132 | files has to do with the hash symbol `#`. As you know, `#` is the character
133 | used by R to indicate comments. The issue is that the `#` character has a
134 | different meaning in markdown syntax. Hashes in markdown are used to define
135 | levels of headings.
136 |
137 | In an Rmd file, a hash `#` that is inside a code chunk will be treated as
138 | an R comment. A hash outside a code chunk, will be treated as markdown syntax,
139 | making its associated text a given type of heading.
140 |
141 |
142 |
143 | ## Code chunks
144 |
145 | There are dozens of options available to control the execution of the code,
146 | the formatting and display of both the commands and the output, the display
147 | of images, graphs, and tables, and other fancy things. Here's a list of the
148 | basic options you should become familiar with:
149 |
150 | - `eval`: whether the code should be evaluated
151 | + `TRUE`
152 | + `FALSE`
153 | - `echo`: whether the code should be displayed
154 | + `TRUE`
155 | + `FALSE`
156 | + numbers indicating lines in a chunk
157 | - `error`: whether to continue (and display the error message) when the code throws an error, instead of stopping
158 | + `TRUE`
159 | + `FALSE`
160 | - `results`: how to display the output
161 | + `markup`
162 | + `asis`
163 | + `hold`
164 | + `hide`
165 | - `comment`: character used to indicate output lines
166 | + the default is a double hash `##`
167 | + `""` empty character (to have a cleaner display)
168 |
169 |
170 | -----
171 |
--------------------------------------------------------------------------------
/tutorials/02-intro-to-Rmd-files.md:
--------------------------------------------------------------------------------
1 | Getting started with R Markdown files
2 | ================
3 | Gaston Sanchez
4 |
5 | > ### Learning Objectives:
6 | >
7 | > - Differentiate between `.R` and `.Rmd` files
8 | > - To understand dynamic documents
9 | > - To gain familiarity with R Markdown `.Rmd` files
10 | > - To gain familiarity with code chunks
11 |
12 | ------------------------------------------------------------------------
13 |
14 | Introduction to R Markdown files
15 | --------------------------------
16 |
17 | Besides using R script files to write source code, you will be using another type of source file, known as an *R markdown* file. Because you will be turning in most homework assignments as `Rmd` files, it is important that you quickly become familiar with this resource.
18 |
19 | ### Opening and knitting an `Rmd` file
20 |
21 | In the menu bar of RStudio, click on **File**, then **New File**, and choose **R Markdown**. Select the default option (Document), and click **Ok**. RStudio will open a new `.Rmd` file in the source pane. And you should be able to see a file with some default content.
22 |
23 | Locate the button **Knit HTML**, the one with an icon of a ball of yarn and two needles. Click the button (knit to HTML) so you can see how `Rmd` files are rendered and displayed as HTML documents. Alternatively, you can use a keyboard shortcut: in Mac `Command+Shift+K`, in Windows `Ctrl+Shift+K`
24 |
25 | ### What is an `Rmd` file?
26 |
27 | **Rmd** files are a special type of file, referred to as a *dynamic document*. This is the fancy term we use to describe a document that allows us to combine narrative (text) with R code in one single file.
28 |
29 | Rmd files are plain text files. This means that you can open an Rmd file with any text editor (not just RStudio) and be able to see and edit its contents.
30 |
31 | The main idea behind dynamic documents is simple yet very powerful: instead of working with two separate files, one that contains the R code, and another one that contains the narrative, you use an `.Rmd` file to include both the commands and the narrative.
32 |
33 | One of the main advantages of this paradigm, is that you avoid having to copy results from your computations and paste them into a report file. In fact, there are more complex ways to work with dynamic documents and source files. But the core idea is the same: combine narrative and code in a way that you let the computer do the manual, repetitive, and time consuming job.
34 |
35 | Rmd files are just one type of dynamic document that you will find in RStudio. In fact, RStudio provides other file formats that can be used as dynamic documents: e.g. `.Rnw`, `.Rpres`, `.Rhtml`, etc.
36 |
37 | ### Anatomy of an `Rmd` file
38 |
39 | The structure of an `.Rmd` file can be divided in two parts: 1) a **YAML header**, and 2) the **body** of the document. In addition to this structure, you should know that `.Rmd` files use three types of syntaxes: YAML, Markdown, and R.
40 |
41 | The *YAML header* consists of the first few lines at the top of the file. This header is established by a set of three dashes `---` as delimiters (one starting set, and one ending set). This part of the file requires you to use YAML syntax (Yet Another Markup Language.) Within the delimiter sets of dashes, you specify settings (or metadata) that will apply to the entire document. Some of the common options are things like:
42 |
43 | - `title`
44 | - `author`
45 | - `date`
46 | - `output`
47 |
48 | The *body* of the document is everything below the YAML header. It consists of a mix of narrative and R code. All the text that is narrative is written in a markup syntax called **Markdown** (although you can also use LaTeX math notation). In turn, all the text that is code is written in R syntax inside *blocks of code*.
49 |
50 | There are two types of blocks of code: 1) **code chunks**, and 2) **inline code**. Code chunks are lines of text separated from any lines of narrative text. Inline code is code inserted within a line of narrative text.
51 |
52 | ### How does an Rmd file work?
53 |
54 | Rmd files are plain text files. All that matters is the syntax of its content. The content is basically divided in the header, and the body.
55 |
56 | - The header uses YAML syntax.
57 | - The narrative in the body uses Markdown syntax.
58 | - The code and commands use R syntax.
59 |
60 | The process to generate a nice rendered document from an Rmd file is known as **knitting**. When you *knit* an Rmd file, various R packages and programs run behind the scenes. But the process can be broken down in three main phases: 1) Parsing, 2) Execution, and 3) Rendering.
61 |
62 | 1. Parsing: the content of the file is parsed (examined line by line) and each component is identified as YAML header, or as markdown text, or as R code.
63 |
64 | Each component receives a special treatment and formatting.
65 |
66 | 2. Execution: the most interesting part is in the pieces of text that are R code. Those are separated and executed if necessary. The commands may be included in the final document. Also, the output may be included in the final document. Sometimes, nothing is executed or included.
67 |
68 | 3. Rendering: depending on the specified output format (e.g. HTML, pdf, word), all the components are assembled, and one single document is generated.
69 |
70 | ### Yet Another Syntax to Learn
71 |
72 | R markdown (`Rmd`) files use [markdown](https://daringfireball.net/projects/markdown/) as the main syntax to write content. Markdown is a very lightweight type of markup language, and it is relatively easy to learn.
73 |
74 | One of the most common sources of confusion when learning about R and Rmd files has to do with the hash symbol `#`. As you know, `#` is the character used by R to indicate comments. The issue is that the `#` character has a different meaning in markdown syntax. Hashes in markdown are used to define levels of headings.
75 |
76 | In an Rmd file, a hash `#` that is inside a code chunk will be treated as an R comment. A hash outside a code chunk, will be treated as markdown syntax, making its associated text a given type of heading.
77 |
78 | Code chunks
79 | -----------
80 |
81 | There are dozens of options available to control the execution of the code, the formatting and display of both the commands and the output, the display of images, graphs, and tables, and other fancy things. Here's a list of the basic options you should become familiar with:
82 |
83 | - `eval`: whether the code should be evaluated
84 | - `TRUE`
85 | - `FALSE`
86 | - `echo`: whether the code should be displayed
87 | - `TRUE`
88 | - `FALSE`
89 | - numbers indicating lines in a chunk
90 | - `error`: whether to continue knitting and display the error message when the code throws an error (if `FALSE`, knitting stops at the first error)
91 | - `TRUE`
92 | - `FALSE`
93 | - `results`: how to display the output
94 | - `markup`
95 | - `asis`
96 | - `hold`
97 | - `hide`
98 | - `comment`: character used to indicate output lines
99 | - the default is a double hash `##`
100 | - `""` empty character (to have a cleaner display)
101 |
102 | ------------------------------------------------------------------------
103 |
--------------------------------------------------------------------------------
/tutorials/04-intro-to-data-frames.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Introduction to Data Frames"
3 | subtitle: "Stat 133, Spring 2018"
4 | author: "Gaston Sanchez"
5 | output: github_document
6 | fontsize: 11pt
7 | urlcolor: blue
8 | ---
9 |
10 | > ### Learning Objectives
11 | >
12 | > - Understand Data Frames
13 | > - Importing
14 | > - Exporting
15 | > - Basic Manipulation with `[ , ]`
16 |
17 | ------
18 |
19 | ## Manipulating Data Frames
20 |
21 | The most common format/structure for a data set is a tabular format:
22 | with rows and columns (like a spreadsheet). When your data is in this shape,
23 | most of the time you will work with R __data frames__ (or similar rectangular
24 | structures like a `"matrix"`, `"table"`, etc).
25 |
26 | Learning how to manipulate data tables is among the most important
27 | _data computing_ basic skills. The traditional way of manipulating data frames
28 | in R is based on bracket notation, e.g. `dat[ , ]`, to select specific
29 | rows, columns, or cells. Also, the use of the dollar `$` operator to handle
30 | columns is fundamental. In this part of the lab, you will practice a wide
31 | array of data wrangling tasks with the so-called bracket notation, and the
32 | dollar operator.
33 |
34 | I should say that there are alternative ways for manipulating tables in R.
35 | Among the most recent paradigms, there is the __plying__ framework devised
36 | by Hadley Wickham. From his doctoral research, the first _plyr_ tools were
37 | available in the packages `"plyr"` and `"reshape"`. Nowadays we have the
38 | `"reshape2"` package, and the extremely popular package `"dplyr"`
39 | (among other packages). You will have time to learn more about `"dplyr"` in the
40 | next weeks. In the meantime, take some time to understand more about the
41 | bracket notation.
42 |
43 |
44 |
45 | ## R Data Frames
46 |
47 | A data frame is a special type of R list, in which each column is an R vector
48 | (or a factor).
49 |
50 | When working with data frames, you should always spend some time inspecting
51 | the contents, and checking how R is handling the data types. It is in these
52 | early stages of data exploration that you can catch potential issues in order
53 | to avoid disastrous consequences or bugs in subsequent stages.
54 |
55 | A handy function for this inspection is `str()`, e.g. `str(dat, vec.len = 1)`. What `str()` returns is a display
56 | of the dimensions of the data frame, and then a list with the name of all the
57 | variables, and their data types (e.g. `chr` character, `num` real, etc).
58 | The argument `vec.len = 1` indicates that just the first element in each
59 | column should be displayed.
60 |
61 |
62 |
63 | ## Creating data frames
64 |
65 | Most of the (raw) data tables you will be working with will already be in
66 | some data file. However, from time to time you will face the need of creating
67 | some sort of data table in R. In these situations, you will likely have to
68 | create such table with a data frame. So let's look at various ways to
69 | "manually" create a data frame.
70 |
71 | __Option 1__: The primary option to build a data frame is with `data.frame()`.
72 | You pass a series of vectors (or factors), of the same length, separated by commas.
73 | Each vector (or factor) will become a column in the generated data frame.
74 | Preferably, give names to each column, like `col1`, `col2`, and `col3`, in the
75 | example below:
76 |
77 | ```{r create_data_frame1}
78 | # creating a basic data frame
79 | my_table1 <- data.frame(
80 | col1 = LETTERS[1:5],
81 | col2 = seq(from = 10, to = 50, by = 10),
82 | col3 = c(TRUE, TRUE, FALSE, TRUE, FALSE)
83 | )
84 |
85 | my_table1
86 | ```
87 |
88 |
89 | __Option 2__: Another way to create data frames is with a `list` containing
90 | vectors or factors (of the same length), which then you convert to a data.frame
91 | with `data.frame()`:
92 |
93 | ```{r create_data_frame2}
94 | # another way to create a basic data frame
95 | my_list <- list(
96 | col1 = LETTERS[1:5],
97 | col2 = seq(from = 10, to = 50, by = 10),
98 | col3 = c(TRUE, TRUE, FALSE, TRUE, FALSE)
99 | )
100 |
101 | my_table2 <- data.frame(my_list)
102 |
103 | my_table2
104 | ```
105 |
106 | Remember that a `data.frame` is nothing more than a `list`. So as long as the
107 | elements in the list (vectors or factors) are of the same length, we can simply
108 | convert the list into a data frame.
109 |
110 | By default (in R versions prior to 4.0.0), `data.frame()` converts character vectors into factors. You can
111 | check that by examining the structure of the data frame with `str()`:
112 |
113 | ```{r}
114 | str(my_table2)
115 | ```
116 |
117 | To prevent `data.frame()` from converting strings into factors, you must use
118 | the argument `stringsAsFactors = FALSE`:
119 |
120 | ```{r}
121 | # strings as strings (not as factors)
122 | my_table3 <- data.frame(
123 | col1 = LETTERS[1:5],
124 | col2 = seq(from = 10, to = 50, by = 10),
125 | col3 = c(TRUE, TRUE, FALSE, TRUE, FALSE),
126 | stringsAsFactors = FALSE
127 | )
128 |
129 | str(my_table3)
130 | ```
131 |
132 |
133 |
134 | ## Basic Operations with Data Frames
135 |
136 | Now that you have seen some ways to create data frames, let's discuss a number
137 | of basic manipulations of data frames. I will show you some examples and then
138 | you'll have the chance to put in practice the following operations:
139 |
140 | - Selecting table elements:
141 | + select a given cell
142 | + select a set of cells
143 | + select a given row
144 | + select a set of rows
145 | + select a given column
146 | + select a set of columns
147 | - Adding a new column
148 | - Deleting a column
149 | - Renaming a column
150 | - Moving a column
151 | - Transforming a column
152 |
153 |
154 | ```{r echo = FALSE}
155 | tbl <- data.frame(
156 | player = c('Thompson', 'Curry', 'Green', 'Durant', 'Pachulia'),
157 | position = c('SG', 'PG', 'PF', 'SF', 'C'),
158 | salary = c(16663575, 12112359, 15330435, 26540100, 2898000),
159 | points = c(1742, 1999, 776, 1555, 426),
160 | ppg = c(22.3, 25.3, 10.2, 25.1, 6.1),
161 | rookie = rep(FALSE, 5),
162 | stringsAsFactors = FALSE
163 | )
164 | ```
165 |
166 | Let's say you have a data frame `tbl` with the lineup of the Golden State Warriors:
167 |
168 | ```{r, echo = FALSE, comment = ""}
169 | tbl
170 | ```
171 |
172 |
173 | ### Selecting elements
174 |
175 | The data frame `tbl` is a 2-dimensional object: the 1st dimension corresponds
176 | to the rows, while the 2nd dimension corresponds to the columns.
177 | Because `tbl` has two dimensions, the bracket notation involves
178 | working with the data frame in this form: `tbl[ , ]`.
179 | In other words, you have to specify values inside the
180 | brackets for the 1st index, and the 2nd index: `tbl[index1, index2]`.
181 |
182 | ```{r}
183 | # select value in row 1 and column 1
184 | tbl[1,1]
185 |
186 | # select value in row 2 and column 5
187 | tbl[2,5]
188 |
189 | # select values in these cells
190 | tbl[1:3,3:5]
191 | ```
192 |
193 |
194 | If no value is specified for `index1` then all rows are included. Likewise,
195 | if no value is specified for `index2` then all columns are included.
196 |
197 | ```{r}
198 | # selecting first row
199 | tbl[1, ]
200 |
201 | # selecting third row
202 | tbl[3, ]
203 |
204 | # selecting second column
205 | tbl[ ,2]
206 |
207 | # selecting columns 3 to 5
208 | tbl[ ,3:5]
209 | ```
210 |
211 |
212 | ### Adding a column
213 |
214 | Perhaps the simplest way to add a column is with the dollar operator `$`.
215 | You just need to give a name for the new column, and assign a vector (or factor):
216 |
217 | ```{r}
218 | # adding a column
219 | tbl$new_column <- c('a', 'e', 'i', 'o', 'u')
220 | tbl
221 | ```
222 |
223 | Another way to add a column is with the _column binding_ function `cbind()`:
224 |
225 | ```{r}
226 | # vector of weights
227 | weight <- c(215, 190, 230, 240, 270)
228 |
229 | # adding weights to tbl
230 | tbl <- cbind(tbl, weight)
231 | tbl
232 | ```
233 |
234 |
235 | ### Deleting a column
236 |
237 | The inverse operation of adding a column consists of __deleting__ a column.
238 | This is possible with the `$` dollar operator. For instance, say you want to
239 | remove the column `new_column`. Use the `$` operator to select this column,
240 | and assign it the value `NULL` (think of this as _NULLifying_ a column):
241 |
242 | ```{r}
243 | # deleting a column
244 | tbl$new_column <- NULL
245 | tbl
246 | ```
247 |
248 |
249 | ### Renaming a column
250 |
251 | What if you want to rename a column? There are various options to do this.
252 | One way is by changing the column `names` attribute:
253 |
254 | ```{r}
255 | # attributes
256 | attributes(tbl)
257 | ```
258 |
259 | which is more commonly accessed with the `names()` function:
260 |
261 | ```{r}
262 | # column names
263 | names(tbl)
264 | ```
265 |
266 | Notice that `tbl` has a list of attributes. The element `names` is the vector
267 | of column names.
268 |
269 | You can directly modify the vector of `names`; for example let's change
270 | `rookie` to `rooky`:
271 |
272 | ```{r}
273 | # changing rookie to rooky
274 | attributes(tbl)$names[6] <- "rooky"
275 |
276 | # display column names
277 | names(tbl)
278 | ```
279 |
280 | By the way: this way of changing the name of a variable is very low level, and probably
281 | unfamiliar to most useRs.
282 |
283 |
284 | ### Moving a column
285 |
286 | A more challenging operation is when you want to move a column to a different
287 | position. What if you want to move `salary` to the last position (last column)?
288 | One option is to create a vector of column names in the desired order, and then
289 | use this vector (for the index of columns) to reassign the data frame like this:
290 |
291 | ```{r}
292 | reordered_names <- c("player", "position", "points", "ppg", "rooky", "weight", "salary")
293 |
294 | # moving salary at the end
295 | tbl <- tbl[ ,reordered_names]
296 | tbl
297 | ```
298 |
299 |
300 | ### Transforming a column
301 |
302 | A more common operation than deleting or moving a column, is to transform the
303 | values in a column. This can be easily accomplished with the `$` operator.
304 | For instance, let's say that we want to transform `salary` from dollars to
305 | millions of dollars:
306 |
307 | ```{r}
308 | # converting salary in millions of dollars
309 | tbl$salary <- tbl$salary / 1000000
310 | tbl
311 | ```
312 |
313 | Likewise, instead of using the `$` operator, you can refer to the column using
314 | bracket notation. Here's how to transform weight from pounds to kilograms
315 | (1 pound = 0.453592 kilograms):
316 |
317 | ```{r}
318 | # weight in kilograms
319 | tbl[ ,"weight"] <- tbl[ ,"weight"] * 0.453592
320 | tbl
321 | ```
322 |
323 | There is also the `transform()` function which transforms values _interactively_,
324 | that is, temporarily:
325 |
326 | ```{r}
327 | # transform weight back to pounds
328 | transform(tbl, weight = weight / 0.453592)
329 | ```
330 |
331 | `transform()` does its job of modifying the values of `weight` but only
332 | temporarily; if you inspect `tbl` you'll see what this means:
333 |
334 | ```{r}
335 | # did weight really change?
336 | tbl
337 | ```
338 |
339 | To make the changes permanent with `transform()`, you need to reassign them
340 | to the data frame:
341 |
342 | ```{r}
343 | # transform weight back to pounds (permanently)
344 | tbl <- transform(tbl, weight = weight / 0.453592)
345 | tbl
346 | ```
347 |
348 |
--------------------------------------------------------------------------------
/tutorials/05-dplyr-pipes.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Pipes with dplyr"
3 | subtitle: "Stat 133, Spring 2018"
4 | author: "Gaston Sanchez"
5 | output: github_document
6 | fontsize: 11pt
7 | urlcolor: blue
8 | ---
9 |
10 | ```{r setup, include=FALSE}
11 | knitr::opts_chunk$set(echo = TRUE, error = TRUE, fig.path = '05-images/')
12 | library(knitr)
13 | library(dplyr)
14 | library(ggplot2)
15 | library(magrittr)
16 | ```
17 |
18 | > ### Learning Objectives:
19 | >
20 | > - Compare base R and `"dplyr"`
21 | > - Get to know the pipe operator `%>%`
22 |
23 | ------
24 |
25 | ## Introduction
26 |
27 | Last week you started to manipulate data tables (e.g. `data.frame`, `tibble`)
28 | with functions provided by the R package `"dplyr"`.
29 |
30 | Having been exposed to the _dplyr_ paradigm, let's compare R base manipulation against the various dplyr syntax flavors.
31 |
32 |
33 | ### Starwars Data Set
34 |
35 | In this tutorial we are going to use the data set `starwars` that comes in `"dplyr"`:
36 |
37 | ```{r warning = FALSE, message = FALSE}
38 | # load dplyr
39 | library(dplyr)
40 |
41 | # data set
42 | starwars
43 | ```
44 |
45 |
46 | ### Average Height of Male and Female Individuals
47 |
48 | For illustration purposes, let's consider a relatively simple example.
49 | Say we are interested in calculating the average (mean) height for both female
50 | and male individuals. Let's discuss how to find the solution under the base R
51 | approach, as well as the dplyr approach.
52 |
53 | -----
54 |
55 |
56 | ## Quick inspection of `height`
57 |
58 | ```{r}
59 | # summary stats of height
60 | summary(starwars$height)
61 | ```
62 |
63 | ```{r height_histogram}
64 | # histogram
65 | hist(starwars$height, col = 'gray80', las = 1)
66 | ```
67 |
68 |
69 | ### Quick inspection of `gender`
70 |
71 | ```{r}
72 | # frequencies of gender
73 | summary(starwars$gender)
74 | gender_freqs <- table(starwars$gender)
75 | gender_freqs
76 | ```
77 |
78 | ```{r gender_barchart}
79 | # barchart of gender freqs
80 | barplot(gender_freqs, border = NA, las = 1)
81 | ```
82 |
83 | Now let's use `"dplyr"` to get the frequencies:
84 |
85 | ```{r}
86 | # distinct values
87 | distinct(starwars, gender)
88 | ```
89 |
90 | Oh! Notice that we have some missing values, which were not reported by `table()`.
91 |
92 | ```{r}
93 | # frequencies of gender (via dplyr)
94 | count(starwars, gender)
95 | ```
96 |
97 |
98 | -----
99 |
100 |
101 | ## Base R approach
102 |
103 | Let's see how to use base R operations to find the average `height` of individuals with `gender` female and male.
104 |
105 | ```{r}
106 | # identify female and male individuals
107 | # (comparison operations)
108 | which_females <- starwars$gender == 'female'
109 | which_males <- starwars$gender == 'male'
110 | ```
111 |
112 | ```{r}
113 | # select the height values of females and males
114 | # (via logical subsetting)
115 | height_females <- starwars$height[which_females]
116 | height_males <- starwars$height[which_males]
117 | ```
118 |
119 | ```{r}
120 | # calculate averages (removing missing values)
121 | avg_ht_female <- mean(height_females, na.rm = TRUE)
122 | avg_ht_male <- mean(height_males, na.rm = TRUE)
123 |
124 | # optional: display averages in a vector
125 | c('female' = avg_ht_female, 'male' = avg_ht_male)
126 | ```
127 |
128 |
129 | All the previous code can be written with more compact expressions:
130 |
131 | ```{r}
132 | # all calculations in a couple of lines of code
133 | c("female" = mean(starwars$height[starwars$gender == 'female'], na.rm = TRUE),
134 | "male" = mean(starwars$height[starwars$gender == 'male'], na.rm = TRUE)
135 | )
136 | ```
137 |
138 |
139 | -----
140 |
141 |
142 | ## With `"dplyr"`
143 |
144 | The behavior of `"dplyr"` is functional in the sense that function calls don't
145 | have side-effects. You must always save their results in order to keep them
146 | in an object (in memory). This doesn't lead to particularly elegant code,
147 | especially if you want to do many operations at once.
148 |
149 |
150 | ### Option 1) Step-by-step
151 |
152 | You either have to do it step-by-step:
153 |
154 | ```{r}
155 | # manipulation step-by-step
156 | gender_height <- select(starwars, gender, height)
157 |
158 | fem_male_height <- filter(gender_height,
159 | gender == 'female' | gender == 'male')
160 |
161 | height_by_gender <- group_by(fem_male_height, gender)
162 |
163 | summarise(height_by_gender, mean(height, na.rm = TRUE))
164 | ```
165 |
166 |
167 | ### Option 2) Nested (embedded) code
168 |
169 | Or if you don't want to name the intermediate results, you need to wrap the
170 | function calls inside each other:
171 |
172 | ```{r}
173 | summarise(
174 | group_by(
175 | filter(select(starwars, gender, height),
176 | gender == 'female' | gender == 'male'),
177 | gender),
178 | mean(height, na.rm = TRUE)
179 | )
180 | ```
181 |
182 | This is difficult to read because the order of the operations is from inside
183 | to out. Thus, the arguments are a long way away from the function.
184 |
185 |
186 | ### Option 3) Piping
187 |
188 | To get around the problem of nesting functions, `"dplyr"` also provides the
189 | `%>%` operator from the R package `"magrittr"`.
190 |
191 | What does the _piper_ `%>%` do? Here's a conceptual example:
192 |
193 | ```{r eval = FALSE}
194 | x %>% f(y)
195 | ```
196 |
197 | `x %>% f(y)` turns into `f(x, y)` so you can use it to rewrite multiple
198 | operations that you can read left-to-right, top-to-bottom.
199 |
200 | Here's how to use the piper to calculate the average height for female and
201 | male individuals:
202 |
203 | ```{r}
204 | avg_height_by_gender <- starwars %>%
205 | select(gender, height) %>%
206 | filter(gender == 'female' | gender == 'male') %>%
207 | group_by(gender) %>%
208 | summarise(avg = mean(height, na.rm = TRUE))
209 |
210 | avg_height_by_gender
211 |
212 | avg_height_by_gender$avg
213 | ```
214 |
215 | -----
216 |
217 | ## Another Example
218 |
219 | Here's another example in which we calculate the mean `height` and mean `mass` of `species` Droid, Ewok, and Human; arranging the rows of the tibble by mean height, in descending order:
220 |
221 | ```{r}
222 | starwars %>%
223 | select(species, height, mass) %>%
224 | filter(species %in% c('Droid', 'Ewok', 'Human')) %>%
225 | group_by(species) %>%
226 | summarise(
227 | mean_height = mean(height, na.rm = TRUE),
228 | mean_mass = mean(mass, na.rm = TRUE)
229 | ) %>%
230 | arrange(desc(mean_height))
231 | ```
232 |
233 | -----
234 |
235 | ## Pipes and Plots
236 |
237 | You can also use the `%>%` operator to chain dplyr commands with ggplot commands (and other R commands). The following examples combine some data manipulation to `filter()` female and male individuals, in order to graph a density plot of `height`:
238 |
239 | ```{r densities}
240 | starwars %>%
241 | filter(gender %in% c('female', 'male')) %>%
242 | ggplot(aes(x = height, fill = gender)) +
243 | geom_density(alpha = 0.7)
244 | ```
245 |
246 | Here's another example in which instead of graphing density plots, we graph boxplots of `height` for female and male individuals:
247 |
248 | ```{r boxplots}
249 | starwars %>%
250 | filter(gender %in% c('female', 'male')) %>%
251 | ggplot(aes(x = gender, y = height, fill = gender)) +
252 | geom_boxplot()
253 | ```
254 |
255 | -----
256 |
257 | ## More Pipes
258 |
259 | Often, you will work with functions that don't take data frames (or tibbles) as
260 | inputs. A typical example is the base `plot()` function used to produce a
261 | scatterplot; you need to pass vectors to `plot()`, not data frames. In these
262 | situations you might find the `%$%` operator extremely useful.
263 |
264 | ```{r eval = FALSE}
265 | library(magrittr)
266 | ```
267 |
268 | The `%$%` operator, also from the package `"magrittr"`, is a cousin of the
269 | `%>%` operator. What `%$%` does is to _extract_ variables in a data frame
270 | so that you can refer to them explicitly. Let's see a quick example:
271 |
272 | ```{r scatterplot}
273 | starwars %>%
274 | filter(gender %in% c('female', 'male')) %$%
275 | plot(x = height, y = mass, col = factor(gender), las = 1)
276 | ```
277 |
--------------------------------------------------------------------------------
/tutorials/05-images/boxplots-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/05-images/boxplots-1.png
--------------------------------------------------------------------------------
/tutorials/05-images/densities-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/05-images/densities-1.png
--------------------------------------------------------------------------------
/tutorials/05-images/gender_barchart-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/05-images/gender_barchart-1.png
--------------------------------------------------------------------------------
/tutorials/05-images/height_histogram-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/05-images/height_histogram-1.png
--------------------------------------------------------------------------------
/tutorials/05-images/scatterplot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/05-images/scatterplot-1.png
--------------------------------------------------------------------------------
/tutorials/05-images/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/05-images/unnamed-chunk-3-1.png
--------------------------------------------------------------------------------
/tutorials/05-images/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/05-images/unnamed-chunk-5-1.png
--------------------------------------------------------------------------------
/tutorials/06-images/biplot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/06-images/biplot-1.png
--------------------------------------------------------------------------------
/tutorials/06-images/circle_correlations-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/06-images/circle_correlations-1.png
--------------------------------------------------------------------------------
/tutorials/06-images/eig_barchart-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/06-images/eig_barchart-1.png
--------------------------------------------------------------------------------
/tutorials/06-images/pc_plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/06-images/pc_plot-1.png
--------------------------------------------------------------------------------
/tutorials/07-shell-redirections.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Shell I/O Redirection"
3 | subtitle: "Stat 133, Spring 2018"
4 | author: "Gaston Sanchez"
5 | output: github_document
6 | ---
7 |
8 | > ### Learning Objectives
9 | >
10 | > - Understand standard Input/Output
11 | > - Learn about STDIN and STDOUT
12 | > - Understand the concept of input/output redirection
13 |
14 | ------
15 |
16 |
17 | ```{r setup, include=FALSE}
18 | knitr::opts_chunk$set(echo = TRUE)
19 | ```
20 |
21 | ## Introduction
22 |
23 | So far you have been working with the command line interface using basic commands to move around your file system (e.g. `cd`, `ls`, `pwd`), to inspect contents of files (e.g. `wc`, `head`, `tail`, `cat`, `less`, `file`), and to perform typical file operations:
24 |
25 | - create a directory, e.g. `mkdir lab05`
26 | - create an empty file, e.g. `touch README.md`
27 | - rename a file, e.g. `mv report.Rmd document.Rmd`
28 | - move a file to another directory, e.g. `mv myscript.R lab05/`
29 | - copy a file, e.g. `cp data1.csv data2.csv`
30 | - remove a file, e.g. `rm datafile.csv`
31 |
32 | Likewise, all the commands you've learned so far have required you to enter information at the command line, and all have produced output on the screen.
33 | The next step involves learning how to combine existing commands in new ways. To have a working example, we'll start with some toy directory `toydir` containing a handful of files:
34 |
35 | ```bash
36 | mkdir toydir
37 | cd toydir
38 | touch file1.txt
39 | touch README.md
40 | curl -O http://web.pdx.edu/~gerbing/data/cars.csv
41 | curl -O http://web.pdx.edu/~gerbing/data/employee.csv
42 | curl -O http://web.pdx.edu/~gerbing/data/ppseps.csv
43 | ```
44 |
45 | You should have a file structure like this:
46 |
47 | ```
48 | toydir/
49 | README.md
50 | file1.txt
51 | cars.csv
52 | employee.csv
53 | ppseps.csv
54 | ```
55 |
56 | As you can tell, we have three CSV files. Say you want to find out which CSV file is the shortest. We can run the command `wc *.csv` to answer this question:
57 |
58 | ```bash
59 | wc *.csv
60 | 393 1027 20964 cars.csv
61 | 37 75 1832 employee.csv
62 | 11 28 252 ppseps.csv
63 | 441 1130 23048 total
64 | ```
65 |
66 | In this example we only have three CSV files, but what if there were 1000? Our first step toward a solution is to run the command `wc` to get the number of lines in each CSV file:
67 |
68 | ```bash
69 | wc -l *.csv
70 | 393 cars.csv
71 | 37 employee.csv
72 | 11 ppseps.csv
73 | 441 total
74 | ```
75 |
76 | What if you want to display the output above in increasing order? The answer is given with the following command:
77 |
78 | ```bash
79 | wc -l *.csv | sort
80 | ```
81 |
82 | and the displayed output should be:
83 |
84 | ```
85 | 11 ppseps.csv
86 | 37 employee.csv
87 | 393 cars.csv
88 | 441 total
89 | ```
90 |
91 | To better understand the previous command, we first need to talk about some technical aspects about the terminal, the shell, files, and unix things.
92 |
93 | -----
94 |
95 | ## Some Technical Background
96 |
97 | When you use the terminal, you are interacting with a program called the __shell__. There are different shell flavors, but the most common one is the _bash_ shell. What does the shell (e.g. bash) do? Basically, the shell interprets the commands that you type in and either executes them directly or passes them on to other programs.
98 |
99 | For example, consider the command `cat ppseps.csv` which displays the contents of `ppseps.csv` on the screen:
100 |
101 | ```bash
102 | cat ppseps.csv
103 | ```
104 |
105 | ```
106 | Company,EPS,PPS
107 | Imo Indust Inc,-3.26,6.500
108 | Toro Co ,-1.98,13.000
109 | Calmat Co,-0.45,22.500
110 | Tultex Corp,0.56,8.625
111 | Fam Dol St,1.00,17.250
112 | Phil Sub Corp,1.23,16.000
113 | Rtz Plc,1.50,41.375
114 | Tandy Corp,2.24,24.500
115 | Ok Gas & Elct,2.42,34.125
116 | Nicor Inc ,3.83,49.750
117 | ```
118 |
119 | It is the shell that locates the `cat` program and runs it, passing `ppseps.csv` as an argument. In turn, the `cat` command asks the kernel to open `ppseps.csv` on the disk and prints its contents as a stream of characters on the terminal (i.e. monitor).
120 |
121 | Some commands that you type are _built into_ the shell. For example, the `cd` command is built-in. That is, the shell interprets that command and changes your current directory. The `ls` command, on the other hand, is an _external_ program typically stored in the file `/bin/ls`.
122 |
123 | When you type the name of a command, the shell first checks to see if it is a built-in command and, if so, executes it.
124 |
125 |
126 | ## Standard Input and Output
127 |
128 | Most unix commands take input from your terminal and send the resulting output back to your terminal. A command normally reads its input from a place called _standard input_, which happens to be your keyboard by default. Similarly, a command normally writes its output to _standard output_, which is also your terminal by default.
129 |
130 | If a command is executed without a filename argument, the command takes its input from standard input. One example of this type of command is the `cat` command. If you don't provide the name of a file to `cat`, then it expects to take input from your keyboard. Here's a toy example: type in `cat` and then press the _Enter_ key, then type three sentences, and finally press the keys _Ctrl_+_d_ to _stop_ the execution of `cat`:
131 |
132 | - `hi there!`
133 | - `never mind`
134 | - `see you later!`
135 |
136 | You should be able to see some lines of text like the following ones:
137 |
138 | ```bash
139 | cat
140 | hi there!
141 | hi there!
142 | never mind
143 | never mind
144 | see you later!
145 | see you later!
146 | ```
147 |
148 | The command `sort` is another example that can take input from the keyboard. In the terminal, type in four words, and then press _Ctrl_+_d_, for instance:
149 |
150 | ```bash
151 | sort
152 | Voldemort
153 | Dumbledore
154 | Potter
155 | Granger
156 | ```
157 |
158 | You should see the following sorted output in your monitor:
159 |
160 | ```
161 | Dumbledore
162 | Granger
163 | Potter
164 | Voldemort
165 | ```
166 |
167 | Because no filename was specified to the `sort` command, the input was taken from standard input, i.e. the keyboard. After the fourth name was typed in, the _Ctrl_ and _d_ keys were pressed to signal the end of the data stream. At that point, the `sort` command sorted the four names and displayed the results on the standard output, i.e. your monitor.
168 |
169 |
170 |
171 | ## Using Standard Input and Output
172 |
173 | If a program's input consists entirely of alphanumeric and punctuation characters, there is no difference between reading data from a file and reading data from a terminal. Likewise, if a program's output consists entirely of alphanumeric characters and punctuation, there is no difference between writing to a file, writing to a terminal, and writing to the input of another program.
174 |
175 | The _standard Input/Output_ facility, typically referred to as _I/O_, provides some simple defaults for managing input/output. There are three default I/O streams: 1) standard input, 2) standard output, and 3) standard error. By convention, standard output aka _stdout_ consists of all normal output from a command, while standard error, abbreviated _stderr_, consists of error messages.
176 |
177 | Standard input (_stdin_) normally comes from your keyboard. Many programs ignore _stdin_; you name files directly on the command line. For instance, the command `cat file1.csv file2.csv` never reads its standard input; it reads the files directly. But without filenames on the command line, commands that need input will usually read _stdin_. Standard input usually comes from your keyboard, but the shell can redirect _stdin_ from a file.
178 |
179 | The real advantage of standard I/O is that it allows you to _redirect_ input or output away from your terminal to a file. For example, if you want to run the command `cat file1.csv file2.csv`, but you want to place the output in `file3.csv` rather than sending it to your terminal, you have to use the following command:
180 |
181 | ```bash
182 | # redirecting output to file3.csv
183 | cat file1.csv file2.csv > file3.csv
184 | ```
185 |
186 | This is called __redirecting__ standard output to `file3.csv`. If you execute this command and look at the contents of `file3.csv`, you will find the contents of `file1.csv`, followed by the contents of `file2.csv`.
187 |
188 | One of the best-known forms of redirection in unix is the __pipe__. The shell's vertical bar `|` operator makes a pipe.
189 |
190 | | Description | bash example |
191 | |--------------------------------|---------------|
192 | | Send _stdout_ to _file_ | `cmd > file` |
193 | | Send _stderr_ to _file_ | `cmd 2> file` |
194 | | Take _stdin_ from _file_ | `cmd < file` |
195 | | Send _stdout_ to end of _file_ | `cmd >> file` |
196 |
197 | _Note:_ Keep in mind that the syntax used to redirect standard I/O depends on the shell you are using.
198 |
199 | In the next tutorial, you'll learn how to use I/O redirection operators, and apply them to do basic wrangling operations on data tables.
200 |
--------------------------------------------------------------------------------
/tutorials/07-shell-redirections.md:
--------------------------------------------------------------------------------
1 | Shell I/O Redirection
2 | ================
3 | Gaston Sanchez
4 |
5 | > ### Learning Objectives
6 | >
7 | > - Understand Input/Output standard
8 | > - Learn about STDIN and STDOUT
9 | > - Understand the concept of input/output redirection
10 |
11 | ------------------------------------------------------------------------
12 |
13 | Introduction
14 | ------------
15 |
16 | So far you have been working with the command line interface using basic commands to move around your file system (e.g. `cd`, `ls`, `pwd`), to inspect contents of files (e.g. `wc`, `head`, `tail`, `cat`, `less`, `file`), and to perform typical file operations:
17 |
18 | - create a directory, e.g. `mkdir lab05`
19 | - create an empty file, e.g. `touch README.md`
20 | - rename a file, e.g. `mv report.Rmd document.Rmd`
21 | - move a file to another directory, e.g. `mv myscript.R lab05/`
22 | - copy a file, e.g. `cp data1.csv data2.csv`
23 | - remove a file, e.g. `rm datafile.csv`
24 |
25 | Likewise, all the commands you've learned so far have required you to enter information at the command line, and all have produced output on the screen. The next step involves learning how to combine existing commands in new ways. To have a working example, we'll start with some toy directory `toydir` containing a handful of files:
26 |
27 | ``` bash
28 | mkdir toydir
29 | cd toydir
30 | touch file1.txt
31 | touch README.md
32 | curl -O http://web.pdx.edu/~gerbing/data/cars.csv
33 | curl -O http://web.pdx.edu/~gerbing/data/employee.csv
34 | curl -O http://web.pdx.edu/~gerbing/data/ppseps.csv
35 | ```
36 |
37 | You should have a filestructure like this:
38 |
39 | toydir/
40 | README.md
41 | file1.txt
42 | cars.csv
43 | employee.csv
44 | ppseps.csv
45 |
46 | As you can tell, we have three CSV files. Say you want to find out which CSV file is the shortest? We can run the command `wc *.csv` to answer this question:
47 |
48 | ``` bash
49 | wc *.csv
50 | 393 1027 20964 cars.csv
51 | 37 75 1832 employee.csv
52 | 11 28 252 ppseps.csv
53 | 441 1130 23048 total
54 | ```
55 |
56 | In this example we only have three CSV files, but what if there were 1000? Our first step toward a solution is to run the command `wc` to get the number of lines in each CSV file:
57 |
58 | ``` bash
59 | wc -l *.csv
60 | 393 cars.csv
61 | 37 employee.csv
62 | 11 ppseps.csv
63 | 441 total
64 | ```
65 |
66 | What if you want to display the output above in increasing order? The answer is given with the following command:
67 |
68 | ``` bash
69 | wc -l *.csv | sort
70 | ```
71 |
72 | and the displayed output should be:
73 |
74 | 11 ppseps.csv
75 | 37 employee.csv
76 | 393 cars.csv
77 | 441 total
78 |
79 | To better understand the previous command, we first need to talk about some technical aspects about the terminal, the shell, files, and unix things.
80 |
81 | ------------------------------------------------------------------------
82 |
83 | Some Technical Background
84 | -------------------------
85 |
86 | When you use the terminal, you are interacting with a program called the **shell**. There are different shell flavors, but the most common one is the *bash* shell. What does the shell (e.g. bash) do? Basically, the shell interprets the commands that you type in and either executes them directly or passes them on to other programs.
87 |
88 | For example, consider the command `cat ppseps.csv` which displays the contents of `ppseps.csv` on the screen:
89 |
90 | ``` bash
91 | cat ppseps.csv
92 | ```
93 |
94 | Company,EPS,PPS
95 | Imo Indust Inc,-3.26,6.500
96 | Toro Co ,-1.98,13.000
97 | Calmat Co,-0.45,22.500
98 | Tultex Corp,0.56,8.625
99 | Fam Dol St,1.00,17.250
100 | Phil Sub Corp,1.23,16.000
101 | Rtz Plc,1.50,41.375
102 | Tandy Corp,2.24,24.500
103 | Ok Gas & Elct,2.42,34.125
104 | Nicor Inc ,3.83,49.750
105 |
106 | It is the shell that finds the file `ppseps.csv`, and calls the `cat` command to ask it to print the file's contents. In turn, the `cat` command calls the kernel to find `ppseps.csv` on the disk and print its contents as a stream of characters on the terminal (i.e. monitor).
107 |
108 | Some commands that you type are *built into* the shell. For example, the `cd` command is built-in. That is, the shell interprets that command and changes your current directory. The `ls` command, on the other hand, is an *external* program typically stored in the file `/bin/ls`.
109 |
110 | When you type the name of a command, the shell first checks to see if it is a built-in command and, if so, executes it.
111 |
112 | Standard Input and Output
113 | -------------------------
114 |
115 | Most unix commands take input from your terminal and send the resulting output back to your terminal. A command normally reads its input from a place called *standard input*, which happens to be your keyboard by default. Similarly, a command normally writes its output to *standard output*, which is also your terminal by default.
116 |
117 | If a command is executed without a filename argument, the command takes its input from standard input. One example of this type of command is the `cat` command. If you don't provide the name of a file to `cat`, then it expects to take input from your keyboard. Here's a toy example: type in `cat` and then press the *Enter* key, then type three sentences, and finally press the keys *Ctrl*+*d* to *stop* the execution of `cat`:
118 |
119 | - `hi there!`
120 | - `never mind`
121 | - `see you later!`
122 |
123 | You should be able to see some lines of text like the following ones:
124 |
125 | ``` bash
126 | cat
127 | hi there!
128 | hi there!
129 | never mind
130 | never mind
131 | see you later!
132 | see you later!
133 | ```
134 |
135 | The command `sort` is another example that can take input from the keyboard. In the terminal, type in four words, and then press *Ctrl*+*d*, for instance:
136 |
137 | ``` bash
138 | sort
139 | Voldemort
140 | Dumbledore
141 | Potter
142 | Granger
143 | ```
144 |
145 | You should see the following sorted output in your monitor:
146 |
147 | Dumbledore
148 | Granger
149 | Potter
150 | Voldemort
151 |
152 | Because no filename was specified to the `sort` command, the input was taken from standard input, i.e. the keyboard. After the fourth name was typed in, the *Ctrl* and *d* keys were pressed to signal the end of the data stream. At that point, the `sort` command sorted the four names and displayed the results on the standard output, i.e. your monitor.
153 |
154 | Using Standard Input and Output
155 | -------------------------------
156 |
157 | If a program's input consists entirely of alphanumeric and punctuation characters, there is no difference between reading data from a file and reading data from a terminal. Likewise, if a program's output consists entirely of alphanumeric characters and punctuation, there is no difference between writing to a file, writing to a terminal, and writing to the input of another program.
158 |
159 | The *standard Input/Output* facility, typically referred to as *I/O*, provides some simple defaults for managing input/output. There are three default I/O streams: 1) standard input, 2) standard output, and 3) standard error. By convention, standard output aka *stdout* consists of all normal output from a command, while standard error, abbreviated *stderr*, consists of error messages.
160 |
161 | Standard input (*stdin*) normally comes from your keyboard. Many programs ignore *stdin*; you name files directly on the command line. For instance, the command `cat file1.csv file2.csv` never reads its standard input; it reads the files directly. But without filenames on the command line, commands that need input will usually read *stdin*. Standard input usually comes from your keyboard, but the shell can redirect *stdin* from a file.
162 |
163 | The real advantage of standard I/O is that it allows you to *redirect* input or output away from your terminal to a file. For example, if you want to run the command `cat file1.csv file2.csv`, but you want to place the output in `file3.csv` rather than sending it to your terminal, you have to use the following command:
164 |
165 | ``` bash
166 | # redirecting output to file3.csv
167 | cat file1.csv file2.csv > file3.csv
168 | ```
169 |
170 | This is called **redirecting** standard output to `file3.csv`. If you execute this command and look at the contents of `file3.csv`, you will find the contents of `file1.csv`, followed by the contents of `file2.csv`.
171 |
172 | One of the best-known forms of redirection in unix is the **pipe**. The shell's vertical bar `|` operator makes a pipe.
173 |
174 | | Description | bash example |
175 | |--------------------------------|---------------|
176 | | Send *stdout* to *file* | `cmd > file` |
177 | | Send *stderr* to *file* | `cmd 2> file` |
178 | | Take *stdin* from *file* | `cmd < file` |
179 | | Send *stdout* to end of *file* | `cmd >> file` |
180 |
181 | *Note:* Keep in mind that the syntax used to redirect standard I/O depends on the shell you are using.
182 |
183 | In the next tutorial, you'll learn how to use I/O redirection operators, and apply them to do basic wrangling operations on data tables.
184 |
--------------------------------------------------------------------------------
/tutorials/08-shell-filters.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Introduction to Unix Filters"
3 | subtitle: "Stat 133, Spring 2018"
4 | author: "Gaston Sanchez"
5 | output: github_document
6 | ---
7 |
8 | > ### Learning Objectives
9 | >
10 | > - Basic manipulation of data tables with command line
11 | > - Get to know basic unix filters
12 | > - Redirect a command's output to a file
13 | > - Construct command pipelines with two or more stages
14 |
15 | ------
16 |
17 | ```{r setup, include=FALSE}
18 | knitr::opts_chunk$set(echo = TRUE)
19 | ```
20 |
21 | ## Introduction
22 |
23 | Sooner or later you will need to manipulate files from the command line: it could be `Rmd` files, `R` script files, image files, data files, etc. In this tutorial, we'll see new ways to manipulate data table files with shell commands and pipelines.
24 |
25 |
26 | ## Crash Example
27 |
28 | Let's get our feet wet with a working example. The first thing you'll need to do is create a directory for this tutorial:
29 |
30 | ```bash
31 | mkdir pipes
32 | cd pipes
33 | ```
34 |
35 | Download the file `nba2017-players.csv` from the course github repository, and then invoke `ls` to check that the data file was successfully downloaded.
36 |
37 |
38 | ```bash
39 | curl -O https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/master/data/nba2017-players.csv
40 |
41 | ls
42 | ```
43 |
44 | Now that you have a data file, you can apply the basic commands that we've used in lab for inspecting the file's contents. Here's a table with some of those commands, as well as functions in R that have a similar use:
45 |
46 |
47 | | Command | Description | R alternative |
48 | |-----------------------------|-------------------------------|---------------------------|
49 | | `wc nba2017-players.csv` | count lines, words, and bytes | `object.size()`, `nrow()` |
50 | | `wc -l nba2017-players.csv` | count number of lines | `nrow()` |
51 | | `head nba2017-players.csv` | inspect first 10 rows | `head()` |
52 | | `tail nba2017-players.csv` | inspect last 10 rows | `tail()` |
53 | | `less nba2017-players.csv` | see contents with a paginator | `View()` |
54 |
55 |
56 | In addition to these inspection-oriented commands, you can also use other unix tools to carry out common manipulation of data tables: select rows and columns, sort information, determine frequencies, find unique values, etc.
57 |
58 |
59 | ### Redirecting output to a new file with `>`
60 |
61 | For demo purposes, let's subset the data file `nba2017-players.csv` by taking the first 11 rows:
62 |
63 | ```{bash}
64 | head -n 11 nba2017-players.csv
65 | ```
66 |
67 | Instead of displaying the output on the screen, we can use the output redirection operator `>` to send it to a new file:
68 |
69 | ```{bash}
70 | # redirection to new file
71 | head -n 11 nba2017-players.csv > data10.csv
72 | ```
73 |
74 | You can use `cat` to check that `data10.csv` contains the column names, and the first 10 lines of data:
75 |
76 | ```{bash}
77 | # display contents on screen
78 | cat data10.csv
79 | ```
80 |
81 |
82 | ### Selecting columns with `cut`
83 |
84 | How do we select a specific column, say `position`, from `data10.csv`? We can use the `cut` command for this purpose, using the flag `-d ","` to specify the field-delimiter, and the flag `-f 3` to indicate that we want to extract the third column:
85 |
86 | ```{bash}
87 | # select third column
88 | cut -d "," -f 3 data10.csv
89 | ```
90 |
91 | In the same way we created `data10.csv`, we can redirect the output of `cut` to a new file `positions10.txt`:
92 |
93 | ```{bash}
94 | # positions (first attempt)
95 | cut -d "," -f 3 data10.csv > positions10.txt
96 |
97 | cat positions10.txt
98 | ```
99 |
100 |
101 | ### Sorting lines with `sort`
102 |
103 | Another useful command is `sort`, which as you may guess, allows us to sort the lines of a stream of data:
104 |
105 | ```{bash}
106 | sort positions10.txt
107 | ```
108 |
109 | Notice that the name of the column `position` is also part of the output. But what if we just want to play with the positions values, excluding the column name?
110 |
111 | We can use `tail +2` to exclude the first value (i.e. the column name). (If your version of `tail` does not accept this older syntax, use the equivalent `tail -n +2`.) To do this with the column of positions, we must use the pipe operator `|` that enables us to take the output of a command and send it as the input of another command:
112 |
113 | ```{bash}
114 | cut -d "," -f 3 data10.csv | tail +2
115 | ```
116 |
117 | Now let's mix `|` and `>` to rebuild `positions10.txt` without the column name:
118 |
119 | ```{bash}
120 | # positions (second attempt)
121 | cut -d "," -f 3 data10.csv | tail +2 > positions10.txt
122 |
123 | cat positions10.txt
124 | ```
125 |
126 | Let's go back to the sorting operation:
127 |
128 | ```{bash}
129 | sort positions10.txt
130 | ```
131 |
132 |
133 | ### Listing unique occurrences with `sort -u`
134 |
135 | What if we want to list only the unique values (i.e. the unique categories)?
136 | `sort` has the flag `-u` to display only the unique occurrences:
137 |
138 | ```{bash}
139 | sort -u positions10.txt
140 | ```
141 |
142 |
143 | ### Counting unique occurrences with `sort` and `uniq`
144 |
145 | And what if we want the counts of those unique values (i.e. the frequencies)? To find the answer we pipe the output of `sort` as the input of the command `uniq` with the flag `-c`. Here's the entire pipe:
146 |
147 | ```{bash}
148 | sort positions10.txt | uniq -c
149 | ```
150 |
151 | Now, let's apply it on the entire data file, step by step:
152 |
153 | ```{bash}
154 | # select column of positions (excluding column name)
155 | cut -d "," -f 3 nba2017-players.csv | tail +2 > positions.txt
156 |
157 | # get position frequencies
158 | sort positions.txt | uniq -c
159 | ```
160 |
161 |
162 | ### All in one pipeline
163 |
164 | Finally, let's pipe all the commands in a single line, without creating the intermediate file `positions.txt`:
165 |
166 | ```{bash}
167 | # count unique position values, in a single pipe
168 | cut -d "," -f 3 nba2017-players.csv | tail +2 | sort | uniq -c
169 | ```
170 |
171 |
172 | ### More examples
173 |
174 | What if you want to do the same but now for the teams? In other words, count the number of players in each team?
175 |
176 | ```{bash}
177 | # count unique team values, i.e. number of players
178 | cut -d "," -f 2 nba2017-players.csv | tail +2 | sort | uniq -c
179 | ```
180 |
181 | Find the minimum age (6th column)
182 |
183 | ```{bash}
184 | # minimum age
185 | cut -d "," -f 6 nba2017-players.csv | tail +2 | sort | head -n 1
186 | ```
187 |
188 | Find the maximum age (6th column)
189 |
190 | ```{bash}
191 | # maximum age
192 | cut -d "," -f 6 nba2017-players.csv | tail +2 | sort -r | head -n 1
193 | ```
194 |
195 | Frequencies of ages:
196 |
197 | ```{bash}
198 | # age frequencies
199 | cut -d "," -f 6 nba2017-players.csv | tail +2 | sort | uniq -c
200 | ```
201 |
202 |
203 | -----
204 |
205 |
206 | # Filters
207 |
208 | In the above examples we use a set of commands that are formally known as __filters__:
209 |
210 | - `sort`
211 | - `cut`
212 | - `uniq`
213 | - _etc_ (there are more filters)
214 |
215 | Filters are a particular type of unix program that expects to work either with file redirection or as a part of a pipeline. These programs read input from standard input, write output to standard output, and often don't have any starting arguments.
216 |
217 |
218 | ## Extracting columns with `cut`
219 |
220 | When working with files that have a tabular structure (e.g. csv, tsv, field delimited) it is very common to focus on one or more "columns". To pull vertical columns from a file, you can use the `cut` command.
221 |
222 | `cut` operates based either on character position within each line when using the `-c` flag, or on delimited fields when using the `-f` flag. By default, `cut` expects tabs as the delimiter. If a file separates fields with spaces or commas or any other delimiter, you need to use the option `-d` indicating the character used as field delimiter between quote marks.
223 |
224 | | Option | Description |
225 | |-----------|----------------------------------------------------|
226 | | `-f` 1,5 | return columns 1 and 5, delimited by tabs. |
227 | | `-f` 1-5 | return columns 1 through 5, delimited by tabs. |
228 | | `-d ","` | use commas as the delimiters. |
229 | | `-c 2-7` | return characters 2 through 7 from the file. |
230 |
231 |
232 | ```bash
233 | # return columns 1 and 3 (tsv file)
234 | cut -f 1,3 data.tsv
235 | ```
236 |
237 | ```bash
238 | # return columns 2 through 5 (tsv file)
239 | cut -f 2-5 data.tsv
240 | ```
241 |
242 | ```bash
243 | # return columns 1 and 3 (csv file)
244 | cut -f 1,3 -d "," data.csv
245 | ```
246 |
247 | ```bash
248 | # return characters 1 through 6 (fixed-width format file)
249 | cut -c 1-6 data.dat
250 | ```
251 |
252 | -----
253 |
254 |
255 | ## Sorting lines with `sort`
256 |
257 | The output stream produced by many commands, as well as the lines of a file or of a series of files, can be sorted into alphabetical order with the `sort` command. In other words, `sort` reads information and sorts it alphabetically. You can customize the behavior of `sort` to ignore the case of words, and to reverse the order of a sort. This command also enables you to sort lists of numbers. The table below shows some of the common options for the `sort` command:
258 |
259 | | Option | Description |
260 | |--------|------------------------------------------------------|
261 | | `-n` | sort in numerical order rather than alphabetically. |
262 | | `-r` | sort in reverse order, z to a or decreasing numbers. |
263 | | `-f` | fold uppercase into lowercase (i.e. ignore case). |
264 | | `-u` | return a unique representative of repeated items. |
265 | | `-k 3`| sort lines based on column 3 (tab or space delimiters) |
266 | | `-t ","` | use commas for delimiters. |
267 | | `-b` | ignore leading blanks. |
268 | | `-d` | sort in dictionary order. |
269 |
270 |
271 | -----
272 |
273 | ## Isolating unique lines with `uniq`
274 |
275 | Another useful command for extracting a subset of values from a file, or summarizing a stream of text, is `uniq`. This command removes consecutive identical lines from a file, leaving one unique representative. More precisely, what `uniq` does is compare each line it reads with the previous line. If the lines are the same, `uniq` does not list the second line. You can use options with `uniq` to get more specific results:
276 |
277 | | Option | Description |
278 | |---------|----------------------------------------------------|
279 | | `-c` | adds a count of how many times each line occurred. |
280 | | `-u` | lists only lines that are not repeated. |
281 | | `-d` | lists only lines that are duplicated. |
282 | | `-i` | ignore case when determining uniqueness |
283 | | `-f 4` | ignore the first 4 fields (space delimiter) |
284 |
285 | To get a single representative of each unique line from the entire file, in most cases you would need to first sort the lines with the `sort` command to group matching lines together. Interestingly, `uniq` can be used with the flag `-c` to count the number of occurrences of a line. This gives a quick way, for example, to assess the frequencies of values in a given column.
286 |
287 |
--------------------------------------------------------------------------------
/tutorials/09-creating-functions.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Getting started with functions"
3 | subtitle: "Stat 133, Spring 2018"
4 | author: "Gaston Sanchez"
5 | output: github_document
6 | fontsize: 11pt
7 | urlcolor: blue
8 | ---
9 |
10 | ```{r setup, include=FALSE}
11 | knitr::opts_chunk$set(echo = TRUE)
12 | library(dplyr)
13 | ```
14 |
15 |
16 | > ### Learning Objectives
17 | >
18 | > - Define a function that takes arguments
19 | > - Return a value from a function
20 | > - Test a function
21 | > - Set default values for function arguments
22 | > - Documenting a function
23 |
24 | ------
25 |
26 | ## Motivation
27 |
28 | - R comes with many functions (and packages) that let us perform a wide variety of tasks.
29 | - Most of the things we do in R are done by calling some function.
30 | - Sometimes, however, there's no function to do what we want to achieve.
31 | - When that's the case, you will want to write your own functions.
32 |
33 | So far you've been using a number of functions in R. Now it's time to see
34 | how you can create and use your own functions.
35 |
36 | Consider the data set `starwars` that comes in the package `"dplyr"`
37 |
38 | ```{r}
39 | starwars
40 | ```
41 |
42 | Let's focus on the variable `height`, more specifically on the first 10 values:
43 |
44 | ```{r}
45 | ht10 <- starwars$height[1:10]
46 | ht10
47 | ```
48 |
49 | The values of `height` (and `ht10`) are expressed in centimeters, but what if we wanted to obtain values in inches? The conversion formula is 1 cm = 0.3937 in.
50 |
51 | ```{r}
52 | # height in inches
53 | ht10 * 0.3937
54 | ```
55 |
56 | This works. But what if you had more data sets, all of them containing `height` values in cms, and you needed to convert those cms into inches? Wouldn't it be nice to have a dedicated function `cm2in()`?
57 |
58 | ```r
59 | cm2in(ht10)
60 | ```
61 |
62 | R does not have a built-in function `cm2in()` but we can create one. Let's see how to do it "logically" step by step:
63 |
64 | ```{r results='hide'}
65 | # 1) concrete example
66 | ht10 * 0.3937
67 |
68 | # 2) make it more general
69 | x <- ht10
70 | y <- x * 0.3937
71 |
72 | # 3) encapsulate code with an R expression
73 | {
74 | y <- x * 0.3937
75 | }
76 |
77 | # 4) create function: convert lengths from centimeters to inches
78 | cm2in <- function(x) {
79 | y <- x * 0.3937 # conversion factor: 1 cm = 0.3937 in
80 | return(y)
81 | }
82 |
83 | # 5) test it
84 | cm2in(ht10)
85 |
86 | # 6) keep testing
87 | cm2in(starwars$height)
88 | ```
89 |
90 | - To define a new function in R you use the function `function()`.
91 | - You need to specify a name for the function, and then assign `function()`
92 | to the chosen name.
93 | - You also need to define the arguments (i.e. inputs), if any.
94 | - And of course, you must write the code (i.e. the body) so the function does
95 | something when you use it:
96 |
97 |
98 | -----
99 |
100 | ## Anatomy of a function
101 |
102 | ```{r}
103 | # anatomy of a function
104 | some_name <- function(arguments) {
105 | # body of the function
106 | }
107 | ```
108 |
109 | - Generally, you give a name to a function.
110 | - A function takes one or more inputs (or none), known as _arguments_.
111 | - The expressions forming the operations comprise the __body__ of the function.
112 | - Usually, you wrap the body of the functions with curly braces.
113 | - A function returns a single value.
114 |
115 | A less abstract function could have the following structure:
116 |
117 | ```r
118 | some_name <- function(arg1, arg2, etc)
119 | {
120 | expression_1
121 | expression_2
122 | ...
123 | expression_n
124 | }
125 | ```
126 |
127 | -----
128 |
129 | ### Scale Transformations
130 |
131 | Let's see another example. Often, we need to transform the scale of one or more variables. Perhaps the most common type of transformation is when we _standardize_ a variable, that is: mean-center and divide by its standard deviation:
132 |
133 | 
134 |
135 | R has the function `scale()` that can be used to perform this operation, but let's pretend for a minute that there's no function in R to calculate standard scores. Here are the primary steps to compute such a score:
136 |
137 | - compute the mean
138 | - compute the standard deviation
139 | - calculate deviations from mean
140 | - divide by standard deviation
141 |
142 | ```{r}
143 | x <- ht10
144 | x_mean <- mean(x)
145 | x_sd <- sd(x)
146 | x_centered <- x - x_mean
147 | z <- x_centered / x_sd
148 | z
149 | ```
150 |
151 | Having the code of the body, we can encapsulate it with a function:
152 |
153 | ```{r}
154 | # first round: standardize x (mean-center, then divide by the standard deviation)
155 | standardize <- function(x) {
156 | x_mean <- mean(x)
157 | x_sd <- sd(x)
158 | x_centered <- x - x_mean # deviations from the mean
159 | z <- x_centered / x_sd # standard scores
160 | return(z)
161 | }
162 | ```
163 |
164 | And now we can test it:
165 |
166 | ```{r}
167 | standardize(ht10)
168 | ```
169 |
170 | What about applying `standardize()` on the entire column `height`:
171 |
172 | ```{r}
173 | standardize(starwars$height)
174 | ```
175 |
176 | Ooops! Because `starwars$height` contains missing values, our `standardize()` function does not know how to deal with them.
177 |
178 |
179 | ### Dealing with missing values
180 |
181 | How to deal with `NA`'s? Many functions in R like `sum()`, `mean()`, and `median()` have the so-called `na.rm` argument to specify if missing values should be removed before any computation. We can take advantage of this feature by using `na.rm = TRUE`:
182 |
183 | ```{r}
184 | # second round: always remove missing values when computing the mean and sd
185 | standardize <- function(x) {
186 | x_mean <- mean(x, na.rm = TRUE)
187 | x_sd <- sd(x, na.rm = TRUE)
188 | x_centered <- x - x_mean # deviations from the mean
189 | z <- x_centered / x_sd # standard scores
190 | return(z)
191 | }
192 |
193 | standardize(ht10)
194 |
195 | standardize(starwars$height)
196 | ```
197 |
198 | Now `standardize()` is able to return a more useful output by removing missing values. However, we should let the user decide if `NA`'s must be removed. We can include an argument in `standardize()` to indicate if missing values are to be removed:
199 |
200 | ```{r}
201 | # third round: the caller decides, via na_rm, whether missing values are removed
202 | standardize <- function(x, na_rm = FALSE) {
203 | x_mean <- mean(x, na.rm = na_rm) # na_rm is passed on as mean()'s na.rm
204 | x_sd <- sd(x, na.rm = na_rm) # na_rm is passed on as sd()'s na.rm
205 | x_centered <- x - x_mean # deviations from the mean
206 | z <- x_centered / x_sd # standard scores
207 | return(z)
208 | }
209 |
210 | # default call
211 | standardize(starwars$height)
212 |
213 | # removing NAs
214 | standardize(starwars$height, na_rm = TRUE)
215 | ```
216 |
217 |
218 | ### Simplifying the body
219 |
220 | So far we have a working function `standardize()` that does the job and takes care of potential missing values. We can take a further step and review the code of the body. Let's go back to the initial code:
221 |
222 | ```{r}
223 | x <- ht10
224 | x_mean <- mean(x)
225 | x_sd <- sd(x)
226 | x_centered <- x - x_mean
227 | z <- x_centered / x_sd
228 | ```
229 |
230 | The code above works, but it is very "verbose". We can take advantage of R's functional behavior to shorten the computation of the standard scores in one line:
231 |
232 | ```{r}
233 | x <- ht10
234 | z <- (x - mean(x)) / sd(x)
235 | z
236 | ```
237 |
238 | Having simplified the code, we can simplify our function:
239 |
240 | ```{r}
241 | # fourth round (body simplified into a single expression)
242 | standardize <- function(x, na_rm = FALSE) {
243 | z <- (x - mean(x, na.rm = na_rm)) / sd(x, na.rm = na_rm)
244 | return(z)
245 | }
246 |
247 | standardize(tail(starwars$height, n = 10), na_rm = TRUE)
248 | ```
249 |
250 | -----
251 |
252 | # Documenting Functions
253 |
254 | The examples of functions in this tutorial are simple, and fairly understandable (I hope so). However, you should strive to always include _documentation_ for your functions. What does this mean? Documenting a function involves adding descriptions for the purpose of the function, the inputs it accepts, and the output it produces.
255 |
256 | - Description: what the function does
257 | - Input(s): what are the inputs or arguments
258 | - Output: what is the output (returned value)
259 |
260 | You can find some inspiration in the `help()` documentation when you search
261 | for a given function's description.
262 |
263 | There are several approaches for writing documentation of a function. I will show you how to use what are called __roxygen comments__ to achieve this task. While not used by most useRs, they are great when you want to take your code and make a package out of it.
264 |
265 | Here's an example of documentation for `standardize()`
266 |
267 | ```{r}
268 | #' @title Standardize
269 | #' @description Transforms values in standard units (i.e. standard scores)
270 | #' @param x numeric vector
271 | #' @param na_rm whether to remove missing values
272 | #' @return standardized values
273 | #' @examples
274 | #' standardize(rnorm(10))
275 | standardize <- function(x, na_rm = FALSE) {
276 | z <- (x - mean(x, na.rm = na_rm)) / sd(x, na.rm = na_rm)
277 | return(z)
278 | }
279 | ```
280 |
281 | - Roxygen comments are R comments formed by the hash symbol immediately followed by an apostrophe: `#'`
282 |
283 | - You specify the label of a field with `@` and a keyword: e.g. `@title`
284 |
285 | - The syntax highlighting of RStudio recognizes this type of comments and labels
286 |
287 | - Typical roxygen fields:
288 |
289 | | label | meaning | description |
290 | |----------------|-------------|----------------------------|
291 | | `@title` | title | name of your function |
292 | | `@description` | description | what the function does |
293 | | `@param input` | parameter | describe `input` parameter |
294 | | `@return` | output | what is the returned value |
295 |
296 | -----
297 |
298 | ### General Strategy for Writing Functions
299 |
300 | - Always start simple with test toy-values.
301 | - Get what will be the body of the function working first.
302 | - Check out each step of the way.
303 | - Don't try and do too much at once.
304 | - Create (encapsulate body) the function once everything works.
305 | - Optional: after you have a function that works, you may worry about "elegance", "efficiency", "cleverness", etc
306 | - Include documentation; we suggest using Roxygen comments.
307 |
--------------------------------------------------------------------------------
/tutorials/10-intro-to-functions.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Basics of functions"
3 | subtitle: "Stat 133, Spring 2018"
4 | author: "Gaston Sanchez"
5 | output: github_document
6 | fontsize: 11pt
7 | urlcolor: blue
8 | ---
9 |
10 | ```{r setup, include=FALSE}
11 | knitr::opts_chunk$set(echo = TRUE)
12 | library(dplyr)
13 | ```
14 |
15 |
16 | > ### Learning Objectives
17 | >
18 | > - Define a function that takes arguments
19 | > - Return a value from a function
20 | > - Test a function
21 | > - Set default values for function arguments
22 |
23 | ------
24 |
25 |
26 | ## Anatomy of a function
27 |
28 | To define a new function in R you use the function `function()`.
29 | You need to specify a name for the function, and then assign `function()`
30 | to the chosen name. You also need to define optional arguments (i.e. inputs).
31 | And of course, you must write the code (i.e. the body) so the function does
32 | something when you use it:
33 |
34 | ```{r}
35 | # anatomy of a function
36 | some_name <- function(arguments) {
37 | # body of the function
38 | }
39 | ```
40 |
41 | - Generally, you give a name to a function.
42 | - A function takes one or more inputs (or none), known as _arguments_.
43 | - The expressions forming the operations comprise the __body__ of the function.
44 | - Usually, you wrap the body of the functions with curly braces.
45 | - A function returns a single value.
46 |
47 | A less abstract function could have the following structure:
48 |
49 | ```r
50 | function_name <- function(arg1, arg2, etc)
51 | {
52 | expression_1
53 | expression_2
54 | ...
55 | expression_n
56 | }
57 | ```
58 |
59 | ### Example 2: From Fahrenheit to Celsius
60 |
61 | Let's consider a typical programming example that involves converting
62 | fahrenheit degrees into celsius degrees. The conversion formula is
63 | $(F - 32) \times 5/9 = C$. Here's some R code to convert 100 fahrenheit
64 | degrees into Celsius degrees:
65 |
66 | ```{r}
67 | # fahrenheit degrees
68 | far_deg <- 100
69 |
70 | # convert to celsius
71 | (far_deg - 32) * (5/9)
72 | ```
73 |
74 | What if you want to convert 90 fahrenheit degrees in Celsius degrees?
75 | One option would be to rewrite the previous lines as:
76 |
77 | ```{r}
78 | # fahrenheit degrees
79 | far_deg <- 90
80 |
81 | # convert to celsius
82 | (far_deg - 32) * (5/9)
83 | ```
84 |
85 | However, retyping many lines of code can be very boring, tedious, and
86 | inefficient. To make your code reusable in a more efficient manner, you will
87 | have to write functions.
88 |
89 |
90 | #### Writing a simple function
91 |
92 | So, how do you create a function? The first step is to write code and make
93 | sure that it works. In this case we already have the code that converts a
94 | number in Fahrenheit units into Celsius.
95 |
96 | The next step is to __encapsulate__ the code in the form of a function. You
97 | have to choose a name, some argument(s), and determine the output. Here's one
98 | example with a function `fahrenheit_to_celsius()`
99 |
100 | ```{r}
101 | fahrenheit_to_celsius <- function(x) {
102 | y <- (x - 32) * (5/9)
103 | return(y)
104 | }
105 |
106 | fahrenheit_to_celsius(100)
107 | ```
108 |
109 | If you want to get the conversion of 90 fahrenheit degrees, you just simply
110 | execute it again by changing its argument:
111 |
112 | ```{r}
113 | fahrenheit_to_celsius(90)
114 | ```
115 |
116 | And because we are using arithmetic operators (i.e. multiplication, subtraction,
117 | division), the function is also vectorized:
118 |
119 | ```{r}
120 | fahrenheit_to_celsius(c(90, 100, 110))
121 | ```
122 |
123 | Sometimes it is recommended to add a default value to one (or more) of the
124 | arguments. In this case, we can give a default value of `x = 1`. When the
125 | user executes the function without any input, `fahrenheit_to_celsius` returns
126 | the value of 1 fahrenheit degree to Celsius degrees:
127 |
128 | ```{r}
129 | fahrenheit_to_celsius <- function(x = 1) {
130 | (x - 32) * (5/9)
131 | }
132 |
133 | # default execution
134 | fahrenheit_to_celsius()
135 | ```
136 |
137 |
138 | -----
139 |
140 | ## Another example
141 |
142 | Let's consider another toy example with a function that squares its argument:
143 |
144 | ```{r}
145 | square <- function(x) {
146 | x * x
147 | }
148 | ```
149 |
150 | - the function name is `"square"`
151 | - it has one argument: `x`
152 | - the function body consists of one simple expression
153 | - it returns the value `x * x`
154 |
155 | `square()` works like any other function in R:
156 |
157 | ```{r}
158 | square(10)
159 | ```
160 |
161 | In this case, `square()` is also vectorized:
162 |
163 | ```{r}
164 | square(1:5)
165 | ```
166 |
167 | Why is `square()` vectorized?
168 |
169 |
170 | Once defined, functions can be used in other function definitions:
171 |
172 | ```{r}
173 | sum_of_squares <- function(x) {
174 | sum(square(x))
175 | }
176 |
177 | sum_of_squares(1:5)
178 | ```
179 |
180 |
181 | ### Simple Expressions
182 |
183 | Functions with a body consisting of a __simple expression__ can be written with
184 | no braces (in one single line!):
185 |
186 | ```{r}
187 | square <- function(x) x * x
188 |
189 | square(10)
190 | ```
191 |
192 | However, as a general coding rule, you should get into the habit of writing functions using braces.
193 |
194 |
195 | ### Nested Functions
196 |
197 | We can also define a function inside another function:
198 |
199 | ```{r}
200 | getmax <- function(a) {
201 | # nested function
202 | maxpos <- function(u) which.max(u)
203 | # output
204 | list(position = maxpos(a),
205 | value = max(a))
206 | }
207 |
208 | getmax(c(2, -4, 6, 10, pi))
209 | ```
210 |
211 |
212 | ## Naming Functions
213 |
214 | There are different ways to name functions. The following list provides some
215 | examples with different naming styles:
216 |
217 | - `squareroot()`
218 | - `SquareRoot()`
219 | - `squareRoot()`
220 | - `square.root()`
221 | - `square_root()`
222 |
223 | I personally use the _underscore_ style. But you may find other programmers
224 | employing a different naming format. We strongly suggest using a consistent
225 | naming style. Many programming teams define their own style guides. If you
226 | are new to programming, it usually takes time to develop a consistent style.
227 | However, the sooner you have a defined personal style, the better.
228 |
229 | It is also important that you know which names are invalid in R:
230 |
231 | - `5quareroot()`: cannot begin with a number
232 | - `_square()`: cannot begin with an underscore
233 | - `square-root()`: cannot use hyphenated names
234 |
235 | In addition, avoid using an already existing name, e.g. `sqrt()`.
236 |
237 | Sometimes you will find functions with names starting with a dot: `.hidden()`;
238 | this type of functions are hidden functions, meaning that the function won't
239 | be visible by default in the list of objects in your working environment.
240 |
241 | ```{r}
242 | ls()
243 |
244 | visible <- function(x) {
245 | x * 2
246 | }
247 |
248 | .hidden <- function(y) {
249 | y * 2
250 | }
251 |
252 | ls()
253 | ```
254 |
255 |
256 | ## Function Output
257 |
258 | The value of a function can be established in two ways:
259 |
260 | - As the last evaluated simple expression (in the body of the function)
261 | - An explicitly __returned__ value via `return()`
262 |
263 | Here's a basic example of a function in which the output is the last evaluated
264 | expression:
265 |
266 | ```{r}
267 | add <- function(x, y) {
268 | x + y
269 | }
270 |
271 | add(2, 3)
272 | ```
273 |
274 | Here's another version of `add()` in which the output is the last evaluated
275 | expression:
276 |
277 | ```{r}
278 | add <- function(x, y) {
279 | z <- x + y
280 | z
281 | }
282 |
283 | add(2, 3)
284 | ```
285 |
286 | Be careful with the form in which the last expression is evaluated:
287 |
288 | ```{r}
289 | add <- function(x, y) {
290 | z <- x + y
291 | }
292 |
293 | add(2, 3)
294 | ```
295 |
296 | In this case, it looks like `add()` does not work. If you run the previous
297 | code, nothing appears in the console. Can you guess why? To help you answer
298 | this question, assign the invocation to an object and then print the object:
299 |
300 | ```r
301 | why <- add(2, 3)
302 | why
303 | ```
304 |
305 | `add()` does work. The issue has to do with the form of the last expression.
306 | Nothing gets displayed in the console because the last statement `z <- x + y`
307 | is an assignment (that does not print anything).
308 |
309 |
310 | ### The `return()` command
311 |
312 | More often than not, the `return()` command is included to explicitly indicate
313 | the output of a function:
314 |
315 | ```{r}
316 | add <- function(x, y) {
317 | z <- x + y
318 | return(z)
319 | }
320 |
321 | add(2, 3)
322 | ```
323 |
324 | I've seen that many users with previous programming experience in other languages
325 | prefer to use `return()`. The main reason is that most programming languages
326 | tend to use some sort of _return_ statement to indicate the output of a function.
327 |
328 | So, following good language-agnostic coding practices, we also recommend that
329 | you use the function `return()`. In this way, any reader can quickly scan the
330 | body of your functions and visually locate the places in which a _return_
331 | statement is being made.
332 |
333 | -----
334 |
335 | ### Variance Function Example
336 |
337 | The sample variance is given by the following formula:
338 |
339 | $$
340 | var(x) = \frac{1}{n-1} \sum_{i = 1}^{n} (x_i - \bar{x})^2
341 | $$
342 |
343 | 
344 |
345 | Let's create a `variance()` function that computes the sample variance.
346 | The first step should always be writing the code that will become the body of
347 | the function:
348 |
349 | ```{r}
350 | # start simple
351 | x <- 1:10
352 |
353 | # get working code
354 | sum((x - mean(x)) ^ 2) / (length(x) - 1)
355 |
356 | # test it: compare it to var()
357 | var(1:10)
358 | ```
359 |
360 |
361 | Once you know your code works, you can encapsulate it with `function()`:
362 |
363 | ```{r}
364 | # encapsulate your code
365 | variance <- function(x) {
366 | sum((x - mean(x)) ^ 2) / (length(x) - 1)
367 | }
368 |
369 | # check that it works
370 | variance(x)
371 | ```
372 |
373 | Before doing any further changes to `variance()`, you should test it with
374 | a handful of other (possibly extreme) cases:
375 |
376 | ```{r}
377 | # consider less simple cases
378 | variance(runif(10))
379 |
380 | # what about atypical cases?
381 | variance(rep(0, 10))
382 |
383 | # what if there are missing values?
384 | variance(c(1:9, NA))
385 | ```
386 |
387 | You can then start gradually adapting your function to make it more robust,
388 | more flexible, more user friendly, etc. For instance, `variance()` returns
389 | `NA` when the provided vector contains missing values. But you can include
390 | an argument that removes any missing values. Many functions in R have this
391 | feature, like `sum()`, `mean()`, `median()`. They all use the so-called
392 | `na.rm` argument to specify if missing values should be removed before any
393 | computation is done:
394 |
395 | ```{r}
396 | # adapt it gradually
397 | variance <- function(x, na.rm = FALSE) {
398 | if (na.rm) {
399 | # removing missing values
400 | x <- x[!is.na(x)]
401 | }
402 | # compute sample variance
403 | sum((x - mean(x)) ^ 2) / (length(x) - 1)
404 | }
405 |
406 | # check that it works
407 | variance(c(1:9, NA), na.rm = TRUE)
408 | ```
409 |
--------------------------------------------------------------------------------
/tutorials/12-intro-to-loops.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Introduction to loops"
3 | subtitle: "Stat 133, Spring 2018"
4 | author: "Gaston Sanchez"
5 | output: github_document
6 | fontsize: 11pt
7 | urlcolor: blue
8 | ---
9 |
10 | > ### Learning Objectives
11 | >
12 | > - Why do you need loops?
13 | > - Get to know the For loop
14 | > - Get to know the While loop
15 | > - Get to know the Repeat loop
16 |
17 | ```{r setup, include=FALSE}
18 | knitr::opts_chunk$set(error = TRUE)
19 | ```
20 |
21 | ------
22 |
23 | ## About Loops
24 |
25 | - Many times we need to perform a procedure several times
26 | - We perform the same operation several times as long as some condition
27 | is fulfilled
28 | - For this purpose we use loops
29 | - The main idea is that of __iteration__
30 | - R provides three basic paradigms: `for`, `repeat`, `while`
31 |
32 |
33 | ## Motivation example
34 |
35 | Consider a numeric vector with prices of four items:
36 |
37 | ```{r}
38 | prices <- c(2.50, 2.95, 3.45, 3.25)
39 |
40 | prices
41 | ```
42 |
43 |
44 | ### Printing prices "manually"
45 |
46 | Say you are interested in printing each price individually. You can manually
47 | display them one by one, by typing the same command several times:
48 |
49 | ```{r print_prices, eval = FALSE}
50 | cat("Price 1 is", prices[1], "\n")
51 | cat("Price 2 is", prices[2], "\n")
52 | cat("Price 3 is", prices[3], "\n")
53 | cat("Price 4 is", prices[4], "\n")
54 | ```
55 |
56 | ```{r print_prices, echo=FALSE}
57 | ```
58 |
59 |
60 | ### Printing prices with a `for` loop
61 |
62 | Or you can use a loop structure in which you tell the computer to display the
63 | prices a given number of times, but using one command instead of typing it
64 | various times:
65 |
66 | ```{r}
67 | for (i in 1:4) {
68 | cat("Price", i, "is", prices[i], "\n")
69 | }
70 | ```
71 |
72 |
73 | Let's make it less simple by creating a vector of prices with the names of
74 | the associated coffees:
75 |
76 | ```{r}
77 | coffee_prices <- c(
78 | expresso = 2.50,
79 | latte = 2.95,
80 | mocha = 3.45,
81 | cappuccino = 3.25)
82 |
83 | coffee_prices
84 | ```
85 |
86 | Without using a loop, you can display, via `cat()`, the prices one-by-one;
87 | (this, of course, involves a lot of repetition)
88 |
89 | ```{r print_coffee, eval = FALSE}
90 | cat("Expresso has a price of", coffee_prices[1], "\n")
91 | cat("Latte has a price of", coffee_prices[2], "\n")
92 | cat("Mocha has a price of", coffee_prices[3], "\n")
93 | cat("Cappuccino has a price of", coffee_prices[4], "\n")
94 | ```
95 |
96 | ```{r print_coffee, echo = FALSE}
97 | ```
98 |
99 |
100 | ### Printing coffee prices with a `for` loop
101 |
102 | ```{r}
103 | for (i in 1:4) {
104 | cat(names(coffee_prices)[i], "has a price of",
105 | coffee_prices[i], "\n")
106 | }
107 | ```
108 |
109 | -----
110 |
111 | ## For Loops
112 |
113 | - Often we want to repeatedly carry out some computation a __fixed__ number of times.
114 | - For instance, repeat an operation for each element of a vector.
115 | - In R this can be done with a __`for`__ loop.
116 | - `for` loops are used when __we know exactly how many times__ we want the code to repeat
117 |
118 | The anatomy of a `for` loop is as follows:
119 |
120 | ```{r eval = FALSE}
121 | for (iterator in times) {
122 | do_something
123 | }
124 | ```
125 |
126 | `for()` takes an __iterator__ variable and a vector of __times__ to iterate
127 | through.
128 |
129 | ```{r}
130 | value <- 2
131 |
132 | for (i in 1:5) {
133 | value <- value * 2
134 | print(value)
135 | }
136 | ```
137 |
138 |
139 | The vector of _times_ does NOT have to be a numeric vector; it can be any vector
140 |
141 | ```{r}
142 | value <- 2
143 | times <- c('one', 'two', 'three', 'four')
144 |
145 | for (i in times) {
146 | value <- value * 2
147 | print(value)
148 | }
149 | ```
150 |
151 |
152 | ### For Loops and Next statement
153 |
154 | Sometimes we need to skip a loop iteration if a given condition is met; this can be done with a `next` statement:
155 |
156 | ```{r eval=FALSE}
157 | for (iterator in times) {
158 | expr1
159 | expr2
160 | if (condition) {
161 | next
162 | }
163 | expr3
164 | expr4
165 | }
166 | ```
167 |
168 |
169 | Example:
170 |
171 | ```{r}
172 | x <- 2
173 |
174 | for (i in 1:5) {
175 | y <- x * i
176 | if (y == 8) {
177 | next
178 | }
179 | print(y)
180 | }
181 | ```
182 |
183 |
184 | ### Nested Loops
185 |
186 | It is common to have nested loops
187 | ```{r eval = FALSE}
188 | for (iterator1 in times1) {
189 | for (iterator2 in times2) {
190 | expr1
191 | expr2
192 | ...
193 | }
194 | }
195 | ```
196 |
197 |
198 | Example: Nested loops
199 |
200 | ```{r}
201 | # some matrix
202 | A <- matrix(1:12, nrow = 3, ncol = 4)
203 |
204 | A
205 | ```
206 |
207 |
208 | Example: Nested Loops
209 |
210 | ```{r}
211 | # reciprocal of values less than 6
212 | for (i in 1:nrow(A)) {
213 | for (j in 1:ncol(A)) {
214 | if (A[i,j] < 6) A[i,j] <- 1 / A[i,j]
215 | }
216 | }
217 |
218 | A
219 | ```
220 |
221 | -----
222 |
223 | ## About `for` Loops and Vectorized Computations
224 |
225 | - R loops have a bad reputation for being slow.
226 |
227 | - Experienced users will tell you: "tend to avoid for loops in R" (me included).
228 |
229 | - It is not really that the loops are slow; the slowness has more to do with the way R handles the _boxing and unboxing_ of data objects, which may be a bit inefficient.
230 |
231 | - R provides a family of functions that are usually more efficient than loops
232 | (i.e. `apply()` functions).
233 |
234 | - For this course, especially if you have NO programming experience, you should ignore any advice about avoiding loops in R.
235 |
236 | - You should learn how to write loops, and how they work; every programming language provides some type of loop structures.
237 |
238 | - In practice, many (programming) problems can be tackled using some loop.
239 |
240 | - When using R, you may need to start solving a problem using a loop. Once you have solved it, try to see if you can find a vectorized alternative.
241 |
242 | - It takes practice and experience to find alternative solutions to `for` loops.
243 |
244 | - There are cases when using `for` loops is not that bad.
245 |
246 |
247 | -----
248 |
249 |
250 | ## Repeat Loop
251 |
252 | `repeat` executes the same code over and over until a stop condition is met:
253 |
254 | ```{r eval=FALSE}
255 | repeat {
256 | # keep
257 | # doing
258 | # something
259 | if (stop_condition) {
260 | break
261 | }
262 | }
263 | ```
264 |
265 | The `break` statement stops the loop. If you enter an infinite loop, you can
266 | manually break it by pressing the `ESC` key.
267 |
268 | ```{r}
269 | value <- 2
270 |
271 | repeat {
272 | value <- value * 2
273 | print(value)
274 | if (value >= 40) {
275 | break
276 | }
277 | }
278 | ```
279 |
280 |
281 | To skip a current iteration, use `next`
282 |
283 | ```{r}
284 | value <- 2
285 |
286 | repeat {
287 | value <- value * 2
288 | print(value)
289 | if (value == 16) {
290 | value <- value * 2
291 | next
292 | }
293 | if (value > 80) break
294 | }
295 | ```
296 |
297 |
298 |
299 | ## While Loops
300 |
301 | It can also be useful to repeat a computation until a condition is false.
302 | A `while` loop provides this form of control flow.
303 |
304 | ```{r eval=FALSE}
305 | while (condition) {
306 | # keep
307 | # doing
308 | # something
309 | # until
310 | # condition is FALSE
311 | }
312 | ```
313 |
314 |
315 | ### About while loops
316 |
317 | - `while` loops are backward `repeat` loops
318 | - `while` checks first and then attempts to execute
319 | - computations are carried out for as long as the condition is true
320 | - the loop stops when the condition is FALSE
321 | - If you enter an infinite loop, break it by pressing `ESC` key
322 |
323 |
324 | ```{r}
325 | value <- 2
326 |
327 | while (value < 40) {
328 | value <- value * 2
329 | print(value)
330 | }
331 | ```
332 |
333 | -----
334 |
335 | ## Loops: `for`, `while`, `repeat`
336 |
337 | Let's see one last example of a `for` loop, and how to achieve the same task
338 | with `while` and `repeat` loops.
339 |
340 | Say you have a vector `x <- c(2, 4, 6, 8, 10)`, and the goal is to obtain the
341 | sum of the elements in `x`; in other words get `sum(x)` but using loops.
342 |
343 | ```{r}
344 | # using a for loop
345 | x <- c(2, 4, 6, 8, 10)
346 |
347 | # initialize output
348 | sumx <- 0
349 |
350 | for (i in seq_along(x)) {
351 | print(paste('iteration:', i))
352 | sumx <- sumx + x[i]
353 | print(paste('sum =', sumx))
354 | }
355 |
356 | sumx
357 | ```
358 |
359 | Now let's do it with a while loop
360 |
361 | ```{r}
362 | # initialize output
363 | sumx <- 0
364 |
365 | # initialize counter
366 | i <- 1
367 |
368 | # while loop
369 | while (i <= length(x)) {
370 | print(paste('iteration:', i))
371 | sumx <- sumx + x[i]
372 | print(paste('sum =', sumx))
373 | i <- i + 1
374 | }
375 |
376 | sumx
377 | ```
378 |
379 | And finally with a `repeat` loop:
380 |
381 | ```{r}
382 | # initialize output
383 | sumx <- 0
384 |
385 | # initialize counter
386 | i <- 1
387 |
388 | # repeat loop (visualizing iterations)
389 | repeat {
390 | print(paste('iteration:', i))
391 | sumx <- sumx + x[i]
392 | print(paste('sum =', sumx))
393 | i <- i + 1
394 | if (i > length(x)) {
395 | break
396 | }
397 | }
398 |
399 | sumx
400 | ```
401 |
402 | -----
403 |
404 | ## Repeat, While, For
405 |
406 | - If you don't know the number of times something will be done, you can use
407 | either `repeat` or `while`
408 | - `while` evaluates the condition at the beginning
409 | - `repeat` executes operations until a stop condition is met
410 | - If you know the number of times that something will be done, use `for`
411 | - `for` needs an _iterator_ and a vector of _times_
412 |
413 |
414 | ### Questions
415 |
416 | - What happens if you pass `NA` as a condition to `if()`?
417 | - What happens if you pass `NA` as a condition to `ifelse()`?
418 | - What types of values can be passed as the first argument to `switch()`?
419 | - How do you stop a `repeat` loop executing?
420 | - How do you jump to next iteration of a loop?
421 |
422 |
--------------------------------------------------------------------------------
/tutorials/14-images/test-report.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/14-images/test-report.png
--------------------------------------------------------------------------------
/tutorials/15-intro-to-regex.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Introduction to Regular Expressions"
3 | subtitle: "Stat 133, Spring 2018"
4 | author: "Gaston Sanchez"
5 | output: github_document
6 | fontsize: 11pt
7 | urlcolor: blue
8 | ---
9 |
10 | > ### Learning Objectives:
11 | >
12 | > - First contact with Regex
13 | > - Regex functions in package `"stringr"`
14 | > - Text file with common log format
15 |
16 | ------
17 |
18 |
19 | ```{r setup, include=FALSE}
20 | knitr::opts_chunk$set(echo = TRUE)
21 | library(stringr)
22 | ```
23 |
24 | ## Getting started with Regex in R
25 |
26 | In this tutorial we'll be mainly using functions from the R package `"stringr"`:
27 |
28 | ```{r string, eval = FALSE}
29 | # install.packages("stringr")
30 | library(stringr)
31 | ```
32 |
33 | Although R has built-in functions to perform regex operations, I've found that
34 | functions from `"stringr"` are more user friendly (i.e. they have a more
35 | consistent naming style).
36 |
37 |
38 | ## Data Log File
39 |
40 | In this tutorial, we'll be using the text file `may-logs.txt` located in the
41 | `data/` folder of the course github repo:
42 |
43 | [https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/master/data/may-logs.txt](https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/master/data/may-logs.txt)
44 |
45 | This file is a __server log file__ that contains the recorded events taking place
46 | in a web server. The content of the file is in a special format known as
47 | _common log format_. According to [wikipedia](https://en.wikipedia.org/wiki/Common_Log_Format):
48 |
49 | "The Common Log Format is a standardized text file format used by web servers when generating server log files."
50 |
51 | Here's an example of a log record (the text should be in one line, but
52 | I've split it into 2 lines for readability purposes):
53 |
54 | ```
55 | pd9049dac.dip.t-dialin.net - - [01/May/2001:01:51:25 -0700]
56 | "GET /accesswatch/accesswatch-1.33/ HTTP/1.0" 200 1004
57 | ```
58 |
59 | - A `"-"` in a field indicates missing data.
60 | - `pd9049dac.dip.t-dialin.net` is the IP address of the client (remote host) which made the request to the server.
61 | - `[01/May/2001:01:51:25 -0700]` is the date, time, and time zone that the request was received, by default in strftime format `%d/%b/%Y:%H:%M:%S %z`.
62 | - `"GET /accesswatch/accesswatch-1.33/ HTTP/1.0"` is the request line from the client.
63 | - `GET` is the method, `/accesswatch/accesswatch-1.33/` is the resource requested, and `HTTP/1.0` is the HTTP protocol.
64 | - `200` is the HTTP status code returned to the client.
65 | + `2xx` is a successful response
66 | + `3xx` a redirection
67 | + `4xx` a client error, and
68 | + `5xx` a server error
69 | - `1004` is the size of the object returned to the client, measured in bytes.
70 |
71 | If you want to download a copy of the text file to your working directory:
72 |
73 | ```{r eval = FALSE}
74 | # download file
75 | github <- "https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018"
76 | textfile <- "/master/data/may-logs.txt"
77 | download.file(url = paste0(github, textfile), destfile = "may-logs.txt")
78 | ```
79 |
80 |
81 | ```{r echo = FALSE}
82 | # one option is to read in the content with 'readLines()'
83 | logs <- readLines('../data/may-logs.txt')
84 | ```
85 |
86 | -----
87 |
88 |
89 | ## Reading the text file
90 |
91 | The first step involves reading the data in R. How can you do this? One option
92 | is with the `readLines()` function which reads any text file into a character
93 | vector:
94 |
95 | ```{r eval = FALSE}
96 | # one option is to read in the content with 'readLines()'
97 | logs <- readLines('may-logs.txt')
98 | ```
99 |
100 | Let's check what the content looks like:
101 |
102 | ```{r}
103 | # take a peek at the contents in logs
104 | head(logs)
105 | ```
106 |
107 | Because the file contains `r length(logs)` lines (or elements), let's get a
108 | subset by taking a random sample of size 50:
109 |
110 | ```{r}
111 | # subset a sample of lines
112 | set.seed(98765)
113 | s <- sample(1:length(logs), size = 50)
114 | sublogs <- logs[s]
115 | ```
116 |
117 |
118 | ### JPG File requests
119 |
120 | To begin our regex experiments, let's try to find out how many requests
121 | involved a JPG file.
122 |
123 | One way to answer the previous question is by counting how many log lines
124 | contain the pattern `"jpg"`. We can use `grep()` to match or detect this pattern:
125 |
126 | ```{r}
127 | # matching "jpg" (positions of the matching elements)
128 | grep("jpg", sublogs)
129 |
130 | # showing value of matches
131 | grep("jpg", sublogs, value = TRUE)
132 | ```
133 |
134 | We can try to be more specific by defining a pattern `".jpg"` in which the
135 | `.` corresponds to the _literal_ dot character. To match the dot, we need to
136 | escape it with `"\\."`:
137 |
138 | ```{r}
139 | # we could try to be more precise and match ".jpg"
140 | grep("\\.jpg ", sublogs, value = TRUE)
141 | ```
142 |
143 |
144 | ### Function `str_detect()`
145 |
146 | A similar output of `grep()` can be obtained with `str_detect()`, which allows
147 | you to _detect_ what elements contain a match to the specified pattern:
148 |
149 | ```{r}
150 | # detecting ".jpg" (TRUE or FALSE for each element)
151 | str_detect(string = sublogs, pattern = "\\.jpg")
152 | ```
153 |
154 | We can do the same for PNG extensions:
155 |
156 | ```{r}
157 | # detecting ".png" (TRUE or FALSE for each element)
158 | str_detect(string = sublogs, pattern = "\\.png")
159 | ```
160 |
161 |
162 | ### Function `str_extract()`
163 |
164 | Another common task when working with regular expressions has to do with pattern
165 | extraction. For this purpose, we can use `str_extract()`:
166 |
167 | ```{r}
168 | # extracting ".jpg" (which elements)
169 | str_extract(string = sublogs, pattern = "\\.jpg")
170 | ```
171 |
172 | `str_extract()` actually lets us confirm that we are matching the desired
173 | patterns. Notice that when there is no match, `str_extract()` returns a
174 | missing value `NA`.
175 |
176 |
177 | ### Image files
178 |
179 | Now let's try to detect all types of image files: JPG, PNG, GIF, ICO
180 |
181 | ```{r}
182 | # looking for image file extensions
183 | jpgs <- str_detect(logs, pattern = "\\.jpg ")
184 | sum(jpgs)
185 |
186 | pngs <- str_detect(logs, pattern = "\\.png ")
187 | sum(pngs)
188 |
189 | gifs <- str_detect(logs, pattern = "\\.gif")
190 | sum(gifs)
191 |
192 | icos <- str_detect(logs, pattern = "\\.ico ")
193 | sum(icos)
194 | ```
195 |
196 |
197 | ### How to match image files with one regex pattern?
198 |
199 | We can use character sets to define a more generic pattern. For instance, to
200 | match `"jpg"` or `"png"`, we could join three character sets: `"[jp][pn][g]"`.
201 | The first set `[jp]` looks for `j` or `p`, the second set `[pn]` looks for
202 | `p` or `n`, and the third set simply looks for `g`.
203 |
204 | ```{r}
205 | # matching "jpg" or "png"
206 | jpg_png_lines <- str_detect(sublogs, "[jp][pn][g]")
207 | sum(jpg_png_lines)
208 | ```
209 |
210 | Including the dot, we can use: `"\\.[jp][pn][g]"`
211 |
212 | ```{r}
213 | # matching "jpg" or "png"
214 | jpg_png_lines <- str_detect(sublogs, "\\.[jp][pn][g]")
215 | sum(jpg_png_lines)
216 | ```
217 |
218 | We could generalize the pattern to include the GIF and ICO extensions:
219 |
220 | ```{r}
221 | # matching "jpg" or "png" or "gif" or "ico"
222 | image_lines1 <- str_detect(sublogs, "[jpgi][pnic][gfo]")
223 | sum(image_lines1)
224 | ```
225 |
226 | To confirm that we are actually matching `jpg`, `png`, `gif` and `ico`, let's
227 | use `str_extract()`
228 |
229 | ```{r}
230 | # are we correctly extracting image file extensions?
231 | str_extract(sublogs, "[jpgi][pnic][gfo]")
232 | ```
233 |
234 | The previous pattern does not really work as expected: note that we are matching
235 | the patterns formed by `"ing"` and `"inf"` which do not correspond to image file
236 | extensions.
237 |
238 | An alternative way to detect JPG and PNG is by writing both patterns in a
239 | single regex, separating them with the metacharacter `"|"` which means _OR_:
240 |
241 | ```{r}
242 | # detecting .jpg OR .png
243 | jpg_png <- str_detect(sublogs, "\\.jpg|\\.png")
244 | sum(jpg_png)
245 | ```
246 |
247 | Here's how to detect all the extensions in one single pattern:
248 |
249 | ```{r}
250 | # matching "jpg" or "png" or "gif" or "ico"
251 | image_lines <- str_detect(sublogs, "\\.jpg|\\.png|\\.gif|\\.ico")
252 | sum(image_lines)
253 | ```
254 |
255 | To make sure our regex operation is successful, let's see the output of
256 | `str_extract()`:
257 |
258 | ```{r}
259 | images_output <- str_extract(sublogs, "\\.jpg|\\.png|\\.gif|\\.ico")
260 | images_output
261 | ```
262 |
263 | There's some repetition with the dot character; we can try to modify our previous
264 | pattern by placing the dot `"\\."` only once, at the beginning:
265 |
266 | ```{r}
267 | images_output <- str_extract(sublogs, "\\.jpg|png|gif|ico")
268 | images_output
269 | ```
270 |
271 | Notice that the dot only appears next to `".jpg"` but not with the other
272 | type of extensions. What we need to do is group the file extensions by surrounding
273 | them with parentheses:
274 |
275 | ```{r}
276 | images_output <- str_extract(sublogs, "\\.(jpg|png|gif|ico)")
277 | images_output
278 | ```
279 |
280 | Now let's apply the pattern on the entire log file, to count the number of files
281 | of each type:
282 |
283 | ```{r}
284 | # frequencies
285 | img_extensions <- str_extract(logs, "\\.(jpg|png|gif|ico)")
286 | table(img_extensions)
287 | ```
288 |
289 |
290 | ### More Questions
291 |
292 | - How to get the entire name of the image file (`image.ext`)?
293 |
294 | ```{r echo = FALSE, results = 'hide'}
295 | str_extract(sublogs, "\\w+\\.(jpg|png|gif|ico)")
296 | ```
297 |
298 | - How to get just the name of the image file, without the extension (`image`)?
299 |
300 | ```{r echo = FALSE, results = 'hide'}
301 | str_sub(str_extract(sublogs, "\\w+\\.(jpg|png|gif|ico)"), end = -5)
302 | ```
303 |
304 | - How to get the request type: e.g. `"GET`?
305 |
306 | ```{r echo = FALSE, results = 'hide'}
307 | str_extract(sublogs, '\\"[A-Z]+')
308 | ```
309 |
310 | - How to get the status codes: e.g. `200`?
311 |
312 | ```{r echo = FALSE, results = 'hide'}
313 | str_extract(sublogs, '\\" [0-9][0-9][0-9]')
314 | str_extract(sublogs, ' [0-9][0-9][0-9]')
315 | ```
316 |
317 | - How to get the size of the resource (number at the end): e.g. `34301`?
318 |
319 | ```{r echo = FALSE, results = 'hide'}
320 | str_extract(sublogs, ' [0-9]+$')
321 | str_extract(sublogs, '[0-9]+$')
322 | ```
323 |
324 | - How to get the IP address of the client?
325 |
326 | ```{r echo = FALSE, results = 'hide'}
327 | # fail
328 | str_extract(sublogs, "\\w+")
329 | str_extract(sublogs, "\\w+\\.+ ")
330 | str_extract(sublogs, "(\\w+\\.\\w+)+")
331 |
332 | # almost (fails at matching 'pd9049f35.dip.t-dialin.net')
333 | str_extract(sublogs, "(\\w+\\.)+\\w+")
334 |
335 | # okay
336 | str_extract(sublogs, "(\\w+\\.)+\\w+-*(\\w+\\.)+\\w+")
337 | ```
338 |
--------------------------------------------------------------------------------
/tutorials/16-images/head_freqs_plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/16-images/head_freqs_plot-1.png
--------------------------------------------------------------------------------
/tutorials/16-intro-to-random-numbers.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Random Numbers and Simulations"
3 | subtitle: "Stat 133, Spring 2018"
4 | author: "Gaston Sanchez"
5 | output: github_document
6 | fontsize: 11pt
7 | urlcolor: blue
8 | ---
9 |
10 | ```{r setup, include=FALSE}
11 | knitr::opts_chunk$set(echo = TRUE, error = TRUE, fig.path = '16-images/')
12 | ```
13 |
14 | > ### Learning Objectives
15 | >
16 | > - How to use R to simulate chance processes
17 | > - Getting to know the function `sample()`
18 | > - Simulate flipping a coin
19 | > - Visualize relative frequencies
20 |
21 | ------
22 |
23 | ## Introduction
24 |
25 | Random numbers have many applications in science and computer programming,
26 | especially when there are significant uncertainties in a phenomenon of interest.
27 | In this tutorial we'll look at a basic problem that involves working
28 | with random numbers and creating simulations.
29 |
30 | More specifically, let's see how to use R to simulate basic chance processes
31 | like tossing a coin.
32 |
33 |
34 | ## Let's flip a coin
35 |
36 | Chance processes, also referred to as chance experiments, have to do with
37 | actions in which the resulting outcome turns out to be different in each
38 | occurrence.
39 |
40 | Typical examples of basic chance processes are tossing one or more coins,
41 | rolling one or more dice, selecting one or more cards from a deck of cards,
42 | and in general, things that can be framed in terms of drawing tickets out of
43 | a box (or any other type of container: bag, urn, etc.).
44 |
45 | You can use your computer, and R in particular, to simulate chance processes.
46 | In order to do that, the first step consists of learning how to create a
47 | virtual coin, or die, or box-with-tickets.
48 |
49 |
50 | ### Creating a coin
51 |
52 | The simplest way to create a coin with two sides, `"heads"` and `"tails"`, is
53 | with an R character vector via the _combine_ function `c()`
54 |
55 | ```{r}
56 | # a (virtual) coin object
57 | coin <- c("heads", "tails")
58 | ```
59 |
60 | You can also create a _numeric_ coin that shows `1` and `0` instead of
61 | `"heads"` and `"tails"`:
62 |
63 | ```{r}
64 | num_coin <- c(0, 1)
65 | ```
66 |
67 | Likewise, you can also create a _logical_ coin that shows `TRUE` and `FALSE`
68 | instead of `"heads"` and `"tails"`:
69 |
70 | ```{r}
71 | log_coin <- c(TRUE, FALSE)
72 | ```
73 |
74 |
75 | ## Tossing a coin
76 |
77 | Once you have an object that represents the _coin_, the next step involves
78 | learning how to simulate tossing the coin. One way to simulate the action of
79 | tossing a coin in R is with the function `sample()` which lets you draw
80 | random samples, with or without replacement, from an input vector.
81 |
82 | To toss the coin use `sample()` like this:
83 |
84 | ```{r}
85 | coin <- c('heads', 'tails')
86 |
87 | # toss the coin
88 | sample(coin, size = 1)
89 | ```
90 |
91 | with the argument `size = 1`, specifying that we want to take a sample of size 1
92 | from the input vector `coin`.
93 |
94 |
95 | ### Function `sample.int()`
96 |
97 | Another function related to `sample()` is `sample.int()` which simulates
98 | drawing random integers. The main argument is `n`, which represents the maximum
99 | integer to sample from: `1, 2, 3, ..., n`
100 |
101 | ```{r}
102 | sample.int(10)
103 | ```
104 |
105 |
106 | ### Random Samples
107 |
108 | By default, `sample()` draws each element in `coin` with the same probability.
109 | In other words, each element is assigned the same probability of being chosen.
110 | Another default behavior of `sample()` is to take a sample of the specified
111 | `size` __without replacement__. If `size = 1`, it does not really matter whether
112 | the sample is done with or without replacement.
113 |
114 | To draw two elements WITHOUT replacement, use `sample()` like this:
115 |
116 | ```{r}
117 | # draw 2 elements without replacement
118 | sample(coin, size = 2)
119 | ```
120 |
121 | What if we try to toss the coin three or four times?
122 |
123 | ```{r}
124 | # trying to toss coin 3 times
125 | sample(coin, size = 3)
126 | ```
127 |
128 | Notice that R produced an error message. This is because the default behavior
129 | of `sample()` cannot draw more elements than the length of the input vector.
130 |
131 | To be able to draw more elements, we need to sample WITH replacement, which is
132 | done by specifying the argument `replace = TRUE`, like this:
133 |
134 | ```{r}
135 | # draw 4 elements with replacement
136 | sample(coin, size = 4, replace = TRUE)
137 | ```
138 |
139 |
140 |
141 | ## The Random Seed
142 |
143 | The way `sample()` works is by taking a random sample from the input vector.
144 | This means that every time you invoke `sample()` you will likely get a different
145 | output.
146 |
147 | In order to make the examples replicable (so you can get the same output as me),
148 | you need to specify what is called a __random seed__. This is done with the
149 | function `set.seed()`. By setting a _seed_, every time you use one of the random
150 | generator functions, like `sample()`, you will get the same values.
151 |
152 | ```{r}
153 | # set random seed
154 | set.seed(1257)
155 |
156 | # toss a coin with replacement
157 | sample(coin, size = 4, replace = TRUE)
158 | ```
159 |
160 | All computations of random numbers are based on deterministic algorithms, so
161 | the sequence of numbers is not truly random. However, the sequence of numbers
162 | appears to lack any systematic pattern, and we can therefore regard the
163 | numbers as random.
164 |
165 | Every time you use one of the random generator functions in R, the call
166 | produces different numbers. For replication and debugging purposes, it is
167 | useful to get the same sequence of random numbers every time we run the script.
168 | This functionality is obtained by setting a __seed__ before we start generating
169 | the numbers. The seed is an integer, and it is set with the function `set.seed()`:
170 |
171 | ```{r}
172 | set.seed(123)
173 | runif(4)
174 | ```
175 |
176 | If we set the seed to `123` again, the sequence of uniform random numbers is
177 | regenerated:
178 |
179 | ```{r}
180 | set.seed(123)
181 | runif(4)
182 | ```
183 |
184 | If we don't specify a seed, the random generator functions set a seed based
185 | on the current time. That is, the seed will be different each time we run the
186 | script and consequently the sequence of random numbers will also be different.
187 |
188 |
189 | ## Sampling with different probabilities
190 |
191 | Last but not least, `sample()` comes with the argument `prob` which allows you
192 | to provide specific probabilities for each element in the input vector.
193 |
194 | By default, `prob = NULL`, which means that every element has the same
195 | probability of being drawn. In the example of tossing a coin, the command
196 | `sample(coin)` is equivalent to `sample(coin, prob = c(0.5, 0.5))`. In the
197 | latter case we explicitly specify a probability of 50% chance of heads, and
198 | 50% chance of tails:
199 |
200 | ```{r echo = FALSE}
201 | # tossing a fair coin
202 | coin <- c("heads", "tails")
203 |
204 | sample(coin)
205 | sample(coin, prob = c(0.5, 0.5))
206 | ```
207 |
208 | However, you can provide different probabilities for each of the elements in
209 | the input vector. For instance, to simulate a __loaded__ coin with chance of
210 | heads 20%, and chance of tails 80%, set `prob = c(0.2, 0.8)` like so:
211 |
212 | ```{r}
213 | # tossing a loaded coin (20% heads, 80% tails)
214 | sample(coin, size = 5, replace = TRUE, prob = c(0.2, 0.8))
215 | ```
216 |
217 |
218 | -----
219 |
220 |
221 | ## Simulating tossing a coin
222 |
223 | Now that we have all the elements to toss a coin with R, let's simulate flipping
224 | a coin 100 times, and use the function `table()` to count the resulting number
225 | of `"heads"` and `"tails"`:
226 |
227 | ```{r}
228 | # number of flips
229 | num_flips <- 100
230 |
231 | # flips simulation
232 | coin <- c('heads', 'tails')
233 | flips <- sample(coin, size = num_flips, replace = TRUE)
234 |
235 | # number of heads and tails
236 | freqs <- table(flips)
237 | freqs
238 | ```
239 |
240 | In my case, I got `r freqs[1]` heads and `r freqs[2]` tails. Your results will
241 | probably be different than mine. Some of you will get more `"heads"`, some of
242 | you will get more `"tails"`, and some will get exactly 50 `"heads"` and 50
243 | `"tails"`.
244 |
245 | Run another series of 100 flips, and find the frequency of `"heads"` and `"tails"`:
246 |
247 | ```{r}
248 | # another series of 100 flips
249 | flips <- sample(coin, size = num_flips, replace = TRUE)
250 | freqs <- table(flips)
251 | freqs
252 | ```
253 |
254 |
255 | ## Tossing function
256 |
257 | Let's make things a little bit more complex but also more interesting.
258 | Instead of calling `sample()` every time we want to toss a coin, we can
259 | write a `toss()` function:
260 |
261 | ```{r}
262 | #' @title coin toss function
263 | #' @description simulates tossing a coin a given number of times by sampling from `x` with replacement
264 | #' @param x coin object (a vector with the sides of the coin)
265 | #' @param times number of tosses (default 1)
266 | #' @return vector of tosses (one element of `x` per toss)
267 | toss <- function(x, times = 1) {
268 | sample(x, size = times, replace = TRUE)
269 | }
270 |
271 | # basic call
272 | toss(coin)
273 |
274 | # toss 5 times
275 | toss(coin, 5)
276 | ```
277 |
278 | We can make the function more versatile by adding a `prob` argument that lets
279 | us specify different probabilities for `heads` and `tails`
280 |
281 | ```{r}
282 | #' @title coin toss function (with optional probabilities)
283 | #' @description simulates tossing a coin a given number of times by sampling from `x` with replacement
284 | #' @param x coin object (a vector with the sides of the coin)
285 | #' @param times number of tosses (default 1)
286 | #' @param prob vector of probabilities for each side of the coin (default `NULL`, passed on to `sample()`)
287 | #' @return vector of tosses (one element of `x` per toss)
288 | toss <- function(x, times = 1, prob = NULL) {
289 | sample(x, size = times, replace = TRUE, prob = prob)
290 | }
291 |
292 | # toss a loaded coin 10 times
293 | toss(coin, times = 10, prob = c(0.8, 0.2))
294 | ```
295 |
296 |
297 | ## Counting Frequencies
298 |
299 | The next step is to toss a coin several times, and count the frequency of
300 | `heads` and `tails`
301 |
302 | ```{r}
303 | # count frequencies
304 | tosses <- toss(coin, times = 100)
305 | table(tosses)
306 | ```
307 |
308 | We can also count the relative frequencies:
309 |
310 | ```{r}
311 | # relative freqs (proportions)
312 | table(tosses) / length(tosses)
313 | ```
314 |
315 | To make things more interesting, let's consider how the frequency of `heads`
316 | evolves over a series of `n` tosses.
317 |
318 | ```{r}
319 | n <- 500
320 | tosses <- toss(coin, times = n)
321 | heads_freq <- cumsum(tosses == 'heads') / 1:n
322 | ```
323 |
324 | In this case, we can make a plot of the relative frequencies:
325 |
326 | ```{r head_freqs_plot}
327 | plot(heads_freq, type = 'l', lwd = 2, col = 'tomato', las = 1,
328 | ylim = c(0, 1))
329 | abline(h = 0.5, col = 'gray50')
330 | ```
331 |
--------------------------------------------------------------------------------
/tutorials/17-programming-s3classes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ucb-stat133/stat133-spring-2018/a7f5409eef21115a468c73214dc0543fe4629170/tutorials/17-programming-s3classes.pdf
--------------------------------------------------------------------------------