├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── NEWS.md ├── R ├── BF_app.R ├── ames.R ├── ames_sampling_dist.R ├── arbuthnot.R ├── atheism.R ├── bandit_posterior.R ├── bandit_sim.R ├── bayes_inference.R ├── bayes_single_mean_JZS.R ├── bayes_single_mean_sim.R ├── bayes_single_mean_theo.R ├── bayes_single_prop.R ├── bayes_two_mean.R ├── bayes_two_prop.R ├── bayes_util.R ├── behren-fisher.R ├── brfss.R ├── calc_streak.R ├── ci_single_mean_sim.R ├── ci_single_mean_theo.R ├── ci_single_median_sim.R ├── ci_single_prop_sim.R ├── ci_single_prop_theo.R ├── ci_two_mean_sim.R ├── ci_two_mean_theo.R ├── ci_two_median_sim.R ├── ci_two_prop_sim.R ├── ci_two_prop_theo.R ├── credible_interval.R ├── evals.R ├── globals.R ├── ht_many_mean_theo.R ├── ht_many_prop_sim.R ├── ht_many_prop_theo.R ├── ht_single_mean_sim.R ├── ht_single_mean_theo.R ├── ht_single_median_sim.R ├── ht_single_prop_sim.R ├── ht_single_prop_theo.R ├── ht_two_mean_sim.R ├── ht_two_mean_theo.R ├── ht_two_median_sim.R ├── ht_two_prop_sim.R ├── ht_two_prop_theo.R ├── inference.R ├── kobe_basket.R ├── mlb11.R ├── nc.R ├── nycflights.R ├── plot_ss.R ├── present.R ├── rep_sample_n.R ├── rstudio.R ├── statsr.R ├── statswithr_lab.R ├── tapwater.R ├── wage.R └── zinc.R ├── README.Rmd ├── README.md ├── _pkgdown.yml ├── cran-comments.md ├── data ├── ames.rda ├── arbuthnot.rda ├── atheism.rda ├── brfss.rda ├── evals.rda ├── kobe_basket.rda ├── mlb11.rda ├── nc.rda ├── nycflights.rda ├── present.rda ├── tapwater.rda ├── wage.rda └── zinc.rda ├── docs ├── 404.html ├── LICENSE-text.html ├── authors.html ├── bootstrap-toc.css ├── bootstrap-toc.js ├── docsearch.css ├── docsearch.js ├── index.html ├── jquery.sticky-kit.min.js ├── link.svg ├── news │ └── index.html ├── pkgdown.css ├── pkgdown.js ├── pkgdown.yml ├── reference │ ├── BF_app.html │ ├── Rplot001.png │ ├── Rplot002.png │ ├── Rplot003.png │ ├── Rplot004.png │ ├── Rplot005.png │ ├── allow_shiny.html │ ├── ames.html │ ├── 
ames_sampling_dist.html │ ├── arbuthnot.html │ ├── atheism.html │ ├── bandit_posterior-1.png │ ├── bandit_posterior.html │ ├── bandit_sim-1.png │ ├── bandit_sim.html │ ├── bayes_inference-1.png │ ├── bayes_inference-2.png │ ├── bayes_inference-3.png │ ├── bayes_inference-4.png │ ├── bayes_inference-5.png │ ├── bayes_inference.html │ ├── brfss.html │ ├── calc_streak.html │ ├── credible_interval_app.html │ ├── evals.html │ ├── figures │ │ └── unnamed-chunk-3-1.png │ ├── index.html │ ├── inference-1.png │ ├── inference-2.png │ ├── inference-3.png │ ├── inference.html │ ├── kobe_basket.html │ ├── mlb11.html │ ├── nc.html │ ├── nycflights.html │ ├── plot_bandit_posterior-1.png │ ├── plot_bandit_posterior.html │ ├── plot_ss.html │ ├── present.html │ ├── rep_sample_n.html │ ├── statsr.html │ ├── tapwater.html │ ├── wage.html │ ├── zinc-1.png │ └── zinc.html └── sitemap.xml ├── inst ├── WORDLIST └── lab.css ├── man ├── BF_app.Rd ├── allow_shiny.Rd ├── ames.Rd ├── ames_sampling_dist.Rd ├── arbuthnot.Rd ├── atheism.Rd ├── bandit_posterior.Rd ├── bandit_sim.Rd ├── bayes_inference.Rd ├── brfss.Rd ├── calc_streak.Rd ├── credible_interval_app.Rd ├── evals.Rd ├── figures │ └── unnamed-chunk-3-1.png ├── inference.Rd ├── kobe_basket.Rd ├── mlb11.Rd ├── nc.Rd ├── nycflights.Rd ├── plot_bandit_posterior.Rd ├── plot_ss.Rd ├── present.Rd ├── rep_sample_n.Rd ├── statsr.Rd ├── tapwater.Rd ├── wage.Rd └── zinc.Rd ├── statsr.Rproj └── tests ├── spelling.R ├── testthat.R └── testthat └── test-bayes_inference.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | .travis.yml 4 | orig_data 5 | .Rhistory 6 | ^README\.Rmd$ 7 | ^README-.*\.png$ 8 | cran-comments.md 9 | ^CRAN-RELEASE$ 10 | ^_pkgdown\.yml$ 11 | ^docs$ 12 | ^pkgdown$ 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # 
History files 2 | .Rhistory 3 | .Rapp.history 4 | # Session Data files 5 | .RData 6 | # Example code in package build process 7 | *-Ex.R 8 | # RStudio files 9 | .Rproj.user/ 10 | # produced vignettes 11 | vignettes/*.html 12 | vignettes/*.pdf 13 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 14 | .httr-oauth 15 | .Rproj.user 16 | orig_data/ 17 | .DS_Store 18 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | cache: packages 3 | r_check_args: '--as-cran' 4 | 5 | warnings_are_errors: false 6 | 7 | r: 8 | - release 9 | - devel 10 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: statsr 2 | Type: Package 3 | Title: Companion Software for the Coursera Statistics with R Specialization 4 | Version: 0.3.0 5 | Date: 2021-01-21 6 | Authors@R: c(person("Colin", "Rundel", role=c("aut"), email="rundel@gmail.com"), 7 | person("Mine", "Cetinkaya-Rundel", role=c("aut"), email="mine@stat.duke.edu"), 8 | person("Merlise", "Clyde", role=c("aut", "cre"), email="clyde@duke.edu"), 9 | person("David", "Banks", role=c("aut"), email="banks@stat.duke.edu")) 10 | Maintainer: Merlise Clyde 11 | Description: Data and functions to support Bayesian and frequentist inference and decision making 12 | for the Coursera Specialization "Statistics with R". 13 | See for more information. 
14 | LazyData: true 15 | License: MIT + file LICENSE 16 | RoxygenNote: 7.1.1 17 | Encoding: UTF-8 18 | Depends: 19 | R (>= 3.3.0), 20 | BayesFactor 21 | Imports: 22 | dplyr, 23 | rmarkdown, 24 | knitr, 25 | ggplot2, 26 | broom, 27 | gridExtra, 28 | shiny, 29 | cubature, 30 | tidyr, 31 | tibble, 32 | utils 33 | Suggests: 34 | spelling, 35 | HistData, 36 | testthat (>= 3.0.0) 37 | URL: https://github.com/StatsWithR/statsr 38 | BugReports: https://github.com/StatsWithR/statsr/issues 39 | Language: en-US 40 | Config/testthat/edition: 3 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2016 2 | COPYRIGHT HOLDER: StatsWithR 3 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(BF_app) 4 | export(ames_sampling_dist) 5 | export(bandit_posterior) 6 | export(bandit_sim) 7 | export(bayes_inference) 8 | export(calc_streak) 9 | export(credible_interval_app) 10 | export(inference) 11 | export(plot_bandit_posterior) 12 | export(plot_ss) 13 | export(rep_sample_n) 14 | import(ggplot2) 15 | import(graphics) 16 | import(shiny) 17 | import(stats) 18 | importFrom(BayesFactor,ttestBF) 19 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # News for statsr 2 | 3 | # statsr 0.3.0 4 | 5 | * merged the 'BayesFactor' branch with main branch so that the `bayes_inference` function on CRAN is consistent with book and other supplemental materials online. Provides a more unified function and additional options. 
Addresses [issue #15](https://github.com/StatsWithR/statsr/issues/15) 6 | 7 | * Restore the tapwater and zinc data 8 | 9 | # statsr 0.2.0 10 | 11 | * updates so that functions are compatible with tibble package version 3.0.0 12 | 13 | # statsr 0.1.0 14 | 15 | * First release of package on CRAN to accompany version 2 of the Statistics With R course on Coursera and release of the online book [Introduction to Bayesian Thinking](https://statswithr.github.io/book/) -------------------------------------------------------------------------------- /R/BF_app.R: -------------------------------------------------------------------------------- 1 | #' Run the interactive Bayes Factor shiny app 2 | #' 3 | #' This app illustrates how changing the Z score and prior precision 4 | #' affects the Bayes Factor for testing H1 that the mean is zero 5 | #' versus H2 that the mean is not zero for data arising from a normal 6 | #' population. Lindley's paradox occurs for large sample sizes 7 | #' when the Bayes factor favors H1 even though the Z score is large or the 8 | #' p-value is small enough to reach statistical significance and the values of 9 | #' the sample mean do not reflect practical significance based on the prior 10 | #' distribution. 11 | #' Bartlett's paradox may occur when the prior precision goes to zero, leading 12 | #' to Bayes factors that favor H1 regardless of the data. 13 | #' A prior precision of one corresponds to the unit information prior. 
#' @examples
#' if (interactive()) {
#'   BF_app()
#' }
#' @export
#'
BF_app <- function() {
  # UI: a Z-score slider and a prior-precision slider drive a single plot.
  ui <- pageWithSidebar(
    headerPanel(""),
    sidebarPanel(
      selectInput(inputId = "dist",
                  label = "Prior Distribution Family:",
                  choices = c("Normal" = "norm"),
                  selected = "norm"),
      br(),
      # sliderInput("n0", "Prior Sample Size n0", min=0, max=4, step=0.01, value=1.0),
      sliderInput("Z", "Z score",
                  min = -3, max = 3, step = .05, value = 0.0),
      # sliderInput("mu_2", "mu_2 (in units of standard deviations)",
      #             min=-3, max=3, step=.01, value=0.0),
      sliderInput("phi0", "Prior Precision", min = .000001, max = 2, step = .01,
                  value = 1.0)
    ),
    mainPanel(
      plotOutput("BF_plot_mu")
    )
  )

  server <- function(input, output, session) {

    # Bayes factor (H2 against H1) as a function of the sample size n for the
    # selected Z score and prior precision.
    output$BF_plot_mu <- renderPlot({
      d <- data.frame(n = 1:1000)
      d$y <- BF10.normal(input$Z, n = d$n, n0 = input$phi0,
                         logBF = FALSE, recip = FALSE)

      # Split the curve at BF = 1 and close each part along the BF = 1 line so
      # that it can be drawn as a shaded polygon.
      BF.fav <- subset(d, d$y >= 1)
      BF.against <- subset(d, d$y < 1)
      if (nrow(BF.against) == nrow(d)) { # none in favor
        BF.against <- rbind(c(1, 1), BF.against, c(max(d$n), 1))
      } else if (nrow(BF.fav) == nrow(d)) { # all in favor
        BF.fav <- rbind(c(1, 1), BF.fav, c(max(d$n), 1))
      } else { # mix
        BF.fav <- rbind(c(1, 1), BF.fav, c(max(BF.fav$n), 1))
        BF.against <- rbind(c(min(BF.against$n), 1), BF.against,
                            c(max(BF.against$n), 1))
      }

      ggplot(d, aes_string(x = 'n', y = 'y')) +
        ylab("BF[H2:H1]") +
        xlab("Sample Size n") +
        geom_line() +
        # intercept 0 on the log10 scale is the BF = 1 reference line
        geom_abline(slope = 0, intercept = 0) +
        geom_polygon(data = BF.fav, aes_string(x = 'n', y = 'y'), alpha = 0.5) +
        geom_polygon(data = BF.against, aes_string(x = 'n', y = 'y'), alpha = 0.5) +
        scale_y_log10() +
        ggtitle("Bayes Factor H2:H1 H1: mu = 0.0 versus H2: mu = mu_2 ")
    })

    # NOTE(review): this output has no matching plotOutput() in the UI above
    # and reads input$ybar / input$n, which the UI does not define, so it is
    # presumably never rendered -- confirm before removing. The original file
    # contained this definition twice, verbatim; only one copy is kept.
    output$BF_plot_sd <- renderPlot({
      x <- 10^seq(-2, 6, length = 1000)
      n0 <- 1 / (x^2) # precision
      Z <- input$ybar / sqrt(1 / input$n)
      y <- BF10.normal(Z, n = input$n, n0 = n0, logBF = FALSE, recip = TRUE)
      d <- data.frame(x, y)

      ggplot(d, aes_string(x = 'x', y = 'y')) +
        ylab("BF[H1:H2]") +
        xlab("Prior Standard Deviation (in units of sigma)") +
        geom_line() +
        scale_y_log10() +
        scale_x_log10() +
        geom_abline(slope = 0, intercept = 0) +
        ggtitle(paste0("Bayes Factor H1:H2 H1: mu = 0.0, Z = ", round(Z, 2)))
    })
  }

  # `options` used to be assigned as a local variable at the end of the server
  # function, where it had no effect; it is now actually passed to shinyApp().
  shinyApp(ui = ui, server = server, options = list(height = 500))
}

# Bayes factor computed from a Z score `z`, sample size `n` and prior
# precision `n0` (all may be vectors; the usual recycling rules apply).
#
# The value .5 * z^2 * n / (n + n0) - .5 * log(n + n0) + .5 * log(n0) is
# treated as a log Bayes factor: `recip = TRUE` negates it (i.e. takes the
# reciprocal Bayes factor) and `logBF = FALSE` exponentiates the result.
BF10.normal <- function(z, n, n0, logBF = TRUE, recip = FALSE) {
  BF10 <- .5 * (z^2) * n / (n + n0) - .5 * log(n + n0) + .5 * log(n0)
  if (recip) BF10 <- -BF10
  if (!logBF) BF10 <- exp(BF10)
  BF10
}

# --------------------------------------------------------------------------------
# /R/ames.R:
# --------------------------------------------------------------------------------
#' Housing prices in Ames, Iowa
#'
#' Data set contains information from the Ames Assessor's Office used in computing
#' assessed values for individual residential properties sold in
Ames, IA from 2006 5 | #' to 2010. See http://www.amstat.org/publications/jse/v19n3/decock/datadocumentation.txt 6 | #' for detailed variable descriptions. 7 | #' 8 | #' @format A tbl_df with 2930 rows and 82 variables: 9 | #' \describe{ 10 | #' \item{Order}{Observation number.} 11 | #' \item{PID}{Parcel identification number - can be used with city web site for parcel review.} 12 | #' \item{area}{Above grade (ground) living area square feet.} 13 | #' \item{price}{Sale price in USD.} 14 | #' \item{MS.SubClass}{Identifies the type of dwelling involved in the sale.} 15 | #' \item{MS.Zoning}{Identifies the general zoning classification of the sale.} 16 | #' \item{Lot.Frontage}{Linear feet of street connected to property.} 17 | #' \item{Lot.Area}{Lot size in square feet.} 18 | #' \item{Street}{Type of road access to property.} 19 | #' \item{Alley}{Type of alley access to property.} 20 | #' \item{Lot.Shape}{General shape of property.} 21 | #' \item{Land.Contour}{Flatness of the property.} 22 | #' \item{Utilities}{Type of utilities available.} 23 | #' \item{Lot.Config}{Lot configuration.} 24 | #' \item{Land.Slope}{Slope of property.} 25 | #' \item{Neighborhood}{Physical locations within Ames city limits (map available).} 26 | #' \item{Condition.1}{Proximity to various conditions.} 27 | #' \item{Condition.2}{Proximity to various conditions (if more than one is present).} 28 | #' \item{Bldg.Type}{Type of dwelling.} 29 | #' \item{House.Style}{Style of dwelling.} 30 | #' \item{Overall.Qual}{Rates the overall material and finish of the house.} 31 | #' \item{Overall.Cond}{Rates the overall condition of the house.} 32 | #' \item{Year.Built}{Original construction date.} 33 | #' \item{Year.Remod.Add}{Remodel date (same as construction date if no remodeling or additions).} 34 | #' \item{Roof.Style}{Type of roof.} 35 | #' \item{Roof.Matl}{Roof material.} 36 | #' \item{Exterior.1st}{Exterior covering on house.} 37 | #' \item{Exterior.2nd}{Exterior covering on house (if more 
than one material).} 38 | #' \item{Mas.Vnr.Type}{Masonry veneer type.} 39 | #' \item{Mas.Vnr.Area}{Masonry veneer area in square feet.} 40 | #' \item{Exter.Qual}{Evaluates the quality of the material on the exterior.} 41 | #' \item{Exter.Cond}{Evaluates the present condition of the material on the exterior.} 42 | #' \item{Foundation}{Type of foundation.} 43 | #' \item{Bsmt.Qual}{Evaluates the height of the basement.} 44 | #' \item{Bsmt.Cond}{Evaluates the general condition of the basement.} 45 | #' \item{Bsmt.Exposure}{Refers to walkout or garden level walls.} 46 | #' \item{BsmtFin.Type.1}{Rating of basement finished area.} 47 | #' \item{BsmtFin.SF.1}{Type 1 finished square feet.} 48 | #' \item{BsmtFin.Type.2}{Rating of basement finished area (if multiple types).} 49 | #' \item{BsmtFin.SF.2}{Type 2 finished square feet.} 50 | #' \item{Bsmt.Unf.SF}{Unfinished square feet of basement area.} 51 | #' \item{Total.Bsmt.SF}{Total square feet of basement area.} 52 | #' \item{Heating}{Type of heating.} 53 | #' \item{Heating.QC}{Heating quality and condition.} 54 | #' \item{Central.Air}{Central air conditioning.} 55 | #' \item{Electrical}{Electrical system.} 56 | #' \item{X1st.Flr.SF}{First Floor square feet.} 57 | #' \item{X2nd.Flr.SF}{Second floor square feet.} 58 | #' \item{Low.Qual.Fin.SF}{Low quality finished square feet (all floors).} 59 | #' \item{Bsmt.Full.Bath}{Basement full bathrooms.} 60 | #' \item{Bsmt.Half.Bath}{Basement half bathrooms.} 61 | #' \item{Full.Bath}{Full bathrooms above grade.} 62 | #' \item{Half.Bath}{Half baths above grade.} 63 | #' \item{Bedroom.AbvGr}{Bedrooms above grade (does NOT include basement bedrooms).} 64 | #' \item{Kitchen.AbvGr}{Kitchens above grade.} 65 | #' \item{Kitchen.Qual}{Kitchen quality.} 66 | #' \item{TotRms.AbvGrd}{Total rooms above grade (does not include bathrooms).} 67 | #' \item{Functional}{Home functionality (Assume typical unless deductions are warranted).} 68 | #' \item{Fireplaces}{Number of fireplaces.} 69 | #' 
\item{Fireplace.Qu}{Fireplace quality.} 70 | #' \item{Garage.Type}{Garage location.} 71 | #' \item{Garage.Yr.Blt}{Year garage was built.} 72 | #' \item{Garage.Finish}{Interior finish of the garage.} 73 | #' \item{Garage.Cars}{Size of garage in car capacity.} 74 | #' \item{Garage.Area}{Size of garage in square feet.} 75 | #' \item{Garage.Qual}{Garage quality.} 76 | #' \item{Garage.Cond}{Garage condition.} 77 | #' \item{Paved.Drive}{Paved driveway.} 78 | #' \item{Wood.Deck.SF}{Wood deck area in square feet.} 79 | #' \item{Open.Porch.SF}{Open porch area in square feet.} 80 | #' \item{Enclosed.Porch}{Enclosed porch area in square feet.} 81 | #' \item{X3Ssn.Porch}{Three season porch area in square feet.} 82 | #' \item{Screen.Porch}{Screen porch area in square feet.} 83 | #' \item{Pool.Area}{Pool area in square feet.} 84 | #' \item{Pool.QC}{Pool quality.} 85 | #' \item{Fence}{Fence quality.} 86 | #' \item{Misc.Feature}{Miscellaneous feature not covered in other categories.} 87 | #' \item{Misc.Val}{Dollar value of miscellaneous feature.} 88 | #' \item{Mo.Sold}{Month Sold (MM).} 89 | #' \item{Yr.Sold}{Year Sold (YYYY).} 90 | #' \item{Sale.Type}{Type of sale.} 91 | #' \item{Sale.Condition}{Condition of sale.} 92 | #' } 93 | #' @source De Cock, Dean. "Ames, Iowa: Alternative to the Boston housing data as 94 | #' an end of semester regression project." Journal of Statistics Education 19.3 (2011). 95 | "ames" -------------------------------------------------------------------------------- /R/ames_sampling_dist.R: -------------------------------------------------------------------------------- 1 | #' Simulate Sampling Distribution 2 | #' 3 | #' Run the interactive ames sampling distribution shiny app to 4 | #' illustrate sampling distributions using variables from the `ames` 5 | #' dataset. 
#'
#'
#' @examples
#' if (interactive()) {
#'   ames_sampling_dist()
#' }
#' @export
ames_sampling_dist <- function() {
  if (!allow_shiny())
    stop("Shiny app will only run when built within RStudio.")

  ames <- statsr::ames

  ui <- fluidPage(
    sidebarLayout(
      # Sidebar: variable to sample from, size of each sample, number of samples
      sidebarPanel(
        selectInput("selected_var", "Variable:", choices = list("area", "price"), selected = "area"),
        numericInput("n_samp", "Sample size:", min = 1, max = nrow(ames), value = 30),
        numericInput("n_sim", "Number of samples:", min = 1, max = 30000, value = 15000)
      ),
      # Main panel: histogram of the sample means plus its mean and SE
      mainPanel(
        plotOutput("sampling_plot"),
        verbatimTextOutput("sampling_mean"),
        verbatimTextOutput("sampling_se")
      )
    )
  )

  server <- function(input, output) {

    # create sampling distribution: draw n_samp * n_sim values with
    # replacement, arrange them as one sample per row, and average each row
    sampling_dist <- reactive({
      # A cleared numeric box yields NA; wait for valid input instead of
      # letting sample() fail with an error shown to the user.
      req(input$n_samp, input$n_sim)
      s <- sample(ames[[input$selected_var]], size = input$n_samp * input$n_sim, replace = TRUE)
      m <- matrix(s, ncol = input$n_samp)
      data.frame(x_bar = rowMeans(m))
    })

    # plot sampling distribution; the x axis is fixed to the 10%-90% quantile
    # range of the raw variable, so it does not move when the sample size changes
    output$sampling_plot <- renderPlot({
      x_min <- quantile(ames[[input$selected_var]], 0.1)
      x_max <- quantile(ames[[input$selected_var]], 0.9)

      ggplot(sampling_dist(), aes_string(x = "x_bar")) +
        geom_histogram(na.rm = TRUE, bins = 50) +
        xlim(x_min, x_max) +
        ylim(0, input$n_sim * 0.35) +
        ggtitle(paste0("Sampling distribution of mean ",
                       input$selected_var, " (n = ", input$n_samp, ")")) +
        xlab(paste("mean", input$selected_var)) +
        theme(plot.title = element_text(face = "bold", size = 16))
    })

    # mean of sampling distribution
    output$sampling_mean <- renderText({
      paste0("mean of sampling distribution = ", round(mean(sampling_dist()$x_bar), 2))
    })

    # standard error (SD) of sampling distribution
    output$sampling_se <- renderText({
      paste0("SE of sampling distribution = ", round(sd(sampling_dist()$x_bar), 2))
    })
  }

  shinyApp(ui = ui, server = server, options = list(height = 500))
}

# --------------------------------------------------------------------------------
# /R/arbuthnot.R:
# --------------------------------------------------------------------------------
#' Male and female births in London
#'
#' Arbuthnot's data describes male and female christenings (births) for
#' London from 1629-1710.
#'
#' John Arbuthnot (1710) used these time series data to carry out the first
#' known significance test. During every one of the 82 years, there were more
#' male christenings than female christenings. As Arbuthnot wondered,
#' we might also wonder if this could be due to chance, or whether it meant
#' the birth ratio was not actually 1:1.
#'
#' @format A tbl_df with 82 rows and 3 variables:
#' \describe{
#' \item{year}{year, ranging from 1629 to 1710}
#' \item{boys}{number of male christenings (births)}
#' \item{girls}{number of female christenings (births)}
#' }
#' @source These data are excerpted from the \code{\link[HistData]{Arbuthnot}}
#' data set in the HistData package.
"arbuthnot"

# --------------------------------------------------------------------------------
# /R/atheism.R:
# --------------------------------------------------------------------------------
#' Atheism in the world data
#'
#' Survey results on atheism across several countries and years. Each row
#' represents a single respondent.
5 | #' 6 | #' @format A tbl_df with 88032 rows and 3 variables: 7 | #' \describe{ 8 | #' \item{nationality}{Country of the individual surveyed.} 9 | #' \item{response}{A categorical variable with two levels: atheist and non-atheist.} 10 | #' \item{year}{Year in which the person was surveyed.} 11 | #' } 12 | #' @source \href{https://github.com/OpenIntroStat/oilabs/blob/master/data-raw/atheism/Global_INDEX_of_Religiosity_and_Atheism_PR__6.pdf}{WIN-Gallup International Press Release} 13 | "atheism" -------------------------------------------------------------------------------- /R/bandit_posterior.R: -------------------------------------------------------------------------------- 1 | #' bandit posterior 2 | #' 3 | #' Utility function for calculating the posterior probability of each machine being "good" in 4 | #' two armed bandit problem. Calculated result is based on observed win loss data, prior belief about 5 | #' which machine is good and the probability of the good and bad machine paying out. 6 | #' 7 | #' @param data data frame containing win loss data 8 | #' @param prior prior vector containing the probabilities of Machine 1 and Machine 2 being good, defaults to 0.5 and 0.5 respectively. 9 | #' @param win_probs vector containing the probabilities of winning on the good and bad machine respectively. 10 | #' @return A vector containing the posterior probability of Machine 1 and Machine 2 being the good machine. 11 | #' @seealso \code{\link{bandit_sim}} to generate data and 12 | #' \code{\link{plot_bandit_posterior}} to visualize. 
#' @examples
#' data = data.frame(machine = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L),
#'                   outcome = c("W", "L", "W", "L", "L", "W", "L", "L", "L", "W"))
#' bandit_posterior(data)
#' plot_bandit_posterior(data)
#'
#' @export


bandit_posterior <- function(data, prior = c(m1_good = 0.5, m2_good = 0.5), win_probs = c(good = 1 / 2, bad = 1 / 3)) {
  # Unnamed inputs are interpreted positionally.
  if (length(names(prior)) == 0) {
    names(prior) <- c("m1_good", "m2_good")
  }
  if (length(names(win_probs)) == 0) {
    # This used to overwrite names(prior) with c("good", "bad"), which left
    # win_probs unnamed and made every result NA.
    names(win_probs) <- c("good", "bad")
  }

  p_good <- unname(win_probs["good"])
  p_bad <- unname(win_probs["bad"])

  # Win / loss counts per machine.
  m1_wins <- sum(data$machine == 1L & data$outcome == "W")
  m1_losses <- sum(data$machine == 1L & data$outcome == "L")
  m2_wins <- sum(data$machine == 2L & data$outcome == "W")
  m2_losses <- sum(data$machine == 2L & data$outcome == "L")

  # Joint probability of the hypothesis and the data: prior times the
  # binomial kernel of each machine's record under its win probability.
  joint <- function(p_hyp, p_m1, p_m2) {
    p_hyp * p_m1^m1_wins * (1 - p_m1)^m1_losses *
      p_m2^m2_wins * (1 - p_m2)^m2_losses
  }

  m1_good_and_data <- joint(unname(prior["m1_good"]), p_good, p_bad)
  m2_good_and_data <- joint(unname(prior["m2_good"]), p_bad, p_good)
  evidence <- m1_good_and_data + m2_good_and_data

  c(
    m1_good = m1_good_and_data / evidence,
    m2_good = m2_good_and_data / evidence
  )
}

#' plot_bandit_posterior
#'
#' Generates a plot that shows the bandit posterior values as they are sequentially updated
#' by the provided win / loss data.
#'
#' @param data data frame containing win loss data
#' @param prior prior vector containing the probabilities of Machine 1 and Machine 2 being good, defaults to 50-50.
#' @param win_probs vector containing the probabilities of winning on the good and bad machine respectively.
#' @seealso \code{\link{bandit_sim}} to generate data to use below
#'
#' @examples
#' # capture data from the `shiny` app `bandit_sim`.
#' data = data.frame(machine = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L),
#'                   outcome = c("W", "L", "W", "L", "L", "W", "L", "L", "L", "W"))
#' plot_bandit_posterior(data)
#'
#' @export


plot_bandit_posterior <- function(data,
                                  prior = c(m1_good = 0.5, m2_good = 0.5),
                                  win_probs = c(good = 1 / 2, bad = 1 / 3)) {
  # Accept an unnamed prior, interpreted positionally.
  if (length(names(prior)) == 0) {
    names(prior) <- c("m1_good", "m2_good")
  }

  n_plays <- nrow(data)

  # Element 1 holds the prior; element i + 1 the posterior after i plays.
  post_m1 <- c(prior[["m1_good"]], rep(NA_real_, n_plays))
  post_m2 <- c(prior[["m2_good"]], rep(NA_real_, n_plays))

  # seq_len() so that an empty data frame simply plots the prior.
  for (i in seq_len(n_plays)) {
    bp <- bandit_posterior(data[seq_len(i), ], prior, win_probs)
    post_m1[i + 1] <- bp[[1]]
    # The M2 column used to be filled with the Machine 1 posterior as well.
    post_m2[i + 1] <- bp[[2]]
  }

  r <- tibble::tibble(
    "P(M1 is good | Data)" = post_m1,
    "P(M2 is good | Data)" = post_m2
  )

  r <- dplyr::mutate(r, play = dplyr::row_number())
  r <- tidyr::gather(r, outcome, prob, -play)

  ggplot(r, aes_string(x = "play", y = "prob", color = "outcome")) +
    geom_line(size = 1.5) +
    labs(x = "Play #", y = "Posterior Prob.") +
    scale_color_manual(values = c("#428bca", "#5cb85c"))
}

# --------------------------------------------------------------------------------
# /R/bayes_util.R:
# --------------------------------------------------------------------------------

# Draws (and prints) the posterior summary plot for a two-hypothesis test.
#
# Reads den_H2$x / den_H2$y (a density curve under H2) and, from `res`, the
# elements post_H1, post_H2, null, ci_H2 and ci_Marg. `parameter` is used as
# the x-axis label.
# NOTE(review): the field meanings are inferred from their names (posterior
# probabilities, null value, interval endpoints) -- confirm against callers.
BF_plot <- function(den_H2, res, parameter)
{
  # H2 density rescaled so that its peak equals res$post_H2
  d_H2 <- data.frame(x = den_H2$x,
                     y = den_H2$y * res$post_H2 / max(den_H2$y),
                     Hypothesis = "H2")

  # grid indices that fall inside the H2 interval
  li <- min(which(d_H2$x >= res$ci_H2[1]))
  ui <- max(which(d_H2$x < res$ci_H2[2]))

  # shaded region under the H2 curve, closed along y = 0
  d_H2_poly <- data.frame(x = c(d_H2$x[c(li, li:ui, ui)]),
                          y = c(0, d_H2$y[li:ui], 0),
                          Hypothesis = "H2")

  # vertical spike at the null value with height res$post_H1
  d_H1 <- data.frame(x = c(res$null, res$null),
                     y = c(0, res$post_H1),
                     Hypothesis = "H1")

  # NA rows make all three Hypothesis levels appear in the legend
  d <- rbind(data.frame(x = NA, y = NA, Hypothesis = "H1"),
             d_H2,
             data.frame(x = NA, y = NA, Hypothesis = "Overall"))

  # H2 Features
  p <- ggplot(d, aes_string(x = "x", y = "y", color = "Hypothesis", fill = "Hypothesis")) +
    geom_line(alpha = 0.8) +
    geom_polygon(data = d_H2_poly, linetype = "blank", alpha = 0.8) +
    ylab("Density") +
    xlab(parameter)

  # H1 Features
  p <- p + geom_line(data = d_H1, size = 1.5, alpha = 0.8)


  # Marginal plot features
  # The location of the panel's y range inside the built plot depends on the
  # ggplot2 version; `$panel$ranges` only exists in old releases and is NULL
  # in current ones, which made the data.frame() call below fail.
  built <- ggplot_build(p)
  y_range <- built$layout$panel_params[[1]]$y.range
  if (is.null(y_range)) y_range <- built$layout$panel_ranges[[1]]$y.range
  if (is.null(y_range)) y_range <- built$panel$ranges[[1]]$y.range
  if (is.null(y_range))
    stop("Unable to determine the y-axis range of the plot.", call. = FALSE)
  y_min <- y_range[1]

  # bracket marking res$ci_Marg, drawn at the lower edge of the panel
  d_Marg <- data.frame(x = rep(res$ci_Marg, c(2, 2)),
                       y = c(y_min * 1 / 2, y_min, y_min, y_min * 1 / 2),
                       Hypothesis = "Overall")

  p <- p + geom_line(data = d_Marg, size = 0.75, alpha = 0.8)

  print(p)
}

# Kernel density estimate of `x` on [from, to] with bandwidth
# 1.06 * min(sd(x), IQR(x) / 1.34) * length(x)^-0.2.
coda_density <- function(x, from, to)
{
  bwf <- 1.06 * min(sd(x), IQR(x) / 1.34) * length(x)^-0.2

  density(x, from = from, to = to, bw = bwf)
}


# Validates a Beta prior given as c(a, b).
#
# NULL falls back to the uniform Beta(1, 1) with a warning. Unnamed input is
# named c("a", "b") positionally. The result is always ordered (a, b).
# `group` is only used to label the parameter in the warning text.
check_beta_prior <- function(beta_prior, group = "")
{
  # Name of the caller's argument, for the warning text. paste(substitute())
  # returned a vector (or character(0)) when the argument was an expression
  # rather than a plain symbol, which broke the `if` below.
  arg_expr <- substitute(beta_prior)
  arg_name <- if (is.name(arg_expr)) as.character(arg_expr) else ""
  if (arg_name == "") arg_name <- "beta_prior"

  param <- if (group == "") "p" else paste0("p_", group)

  if (is.null(beta_prior))
  {
    warning("No beta prior for ",param," was specified, assuming a uniform prior (p ~ Beta(a=1,b=1)).\n",
            " This beta prior is specified using the argument ",arg_name,"=c(a,b),\n",
            " where a and b are your desired hyperparameters.")
    beta_prior <- c(a = 1, b = 1)
  }

  stopifnot(length(beta_prior) == 2)

  if (is.null(names(beta_prior)))
    names(beta_prior) <- c("a", "b")
  stopifnot(all(sort(names(beta_prior)) == c("a", "b")))

  beta_prior[c("a", "b")]
}

# Validates the prior probabilities of the two hypotheses.
#
# NULL falls back to c(H1 = 0.5, H2 = 0.5) with a warning. A single value
# named "H1" or "H2" is completed with its complement. The result is always
# ordered (H1, H2).
check_hypothesis_prior <- function(prior)
{
  if (is.null(prior))
  {
    warning("No prior set for H1 and H2, assuming a uniform prior of P(H1) = 0.5 and P(H2) = 0.5. The hypothesis prior is assigned using the argument prior=c(H1=a,H2=b). ")
    prior <- c(H1 = 0.5, H2 = 0.5)
  }

  if (length(prior) == 1)
  {
    # An unnamed single value has NULL names; guard so that it is rejected by
    # the length check below instead of failing on a zero-length condition.
    if (!is.null(names(prior)) && names(prior) %in% c("H1", "H2"))
      prior[ setdiff(c("H1", "H2"), names(prior)) ] <- 1 - prior
  }

  stopifnot(length(prior) == 2)
  stopifnot(all(prior >= 0))
  # tolerance-based: an exact == 1 test can reject valid floating point input
  stopifnot(isTRUE(all.equal(sum(prior), 1)))

  if (is.null(names(prior)))
    names(prior) <- c("H1", "H2")

  stopifnot(all(sort(names(prior)) == c("H1", "H2")))

  prior[c("H1", "H2")]
}

# --------------------------------------------------------------------------------
# /R/brfss.R:
# --------------------------------------------------------------------------------
#' Behavioral Risk Factor Surveillance System 2013 (Subset)
#'
#' This data set is a small subset of BRFSS results from the 2013 survey, each row represents an individual respondent.
#'
#' @format A tbl_df with 5000 rows and 6 variables:
#' \describe{
#' \item{weight}{Weight in pounds.}
#' \item{height}{Height in inches.}
#' \item{sex}{Sex}
#' \item{exercise}{Any exercise in the last 30 days}
#' \item{fruit_per_day}{Number of servings of fruit consumed per day.}
#' \item{vege_per_day}{Number of servings of dark green vegetables consumed per day.}
#' }
#' @source Centers for Disease Control and Prevention (CDC). Behavioral Risk Factor Surveillance System
#' Survey Data. Atlanta, Georgia: U.S. Department of Health and Human Services, Centers for
#' Disease Control and Prevention, 2013.
"brfss"

# --------------------------------------------------------------------------------
# /R/calc_streak.R:
# --------------------------------------------------------------------------------
#' Calculate hitting streaks
#'
#' @param x A data frame or character vector of hits (\code{"H"}) and misses (\code{"M"}).
#' @return A data frame with one column, \code{length}, containing the length of each hit streak.
#' @examples
#' data(kobe_basket)
#' calc_streak(kobe_basket$shot)
#'
#' @export

calc_streak <- function(x)
{
  # For data frame input use the first column. `[[` returns the column vector
  # for both data.frame and tibble input, whereas `x[, 1]` returns a
  # one-column tibble for tibbles and the check below then fails.
  if (!is.atomic(x))
    x <- x[[1]]

  if (any(!x %in% c("H","M")))
    stop('Input should only contain hits ("H") and misses ("M")')

  # 1 = hit, 0 = miss, with a sentinel miss on both ends so that every streak
  # (including streaks of length zero) is delimited by two misses.
  y <- rep(0, length(x))
  y[x == "H"] <- 1
  y <- c(0, y, 0)
  wz <- which(y == 0)
  streak <- diff(wz) - 1

  data.frame(length = streak)
}

# --------------------------------------------------------------------------------
# /R/ci_single_mean_sim.R:
# --------------------------------------------------------------------------------

# Bootstrap confidence interval for a single mean.
#
# Resamples `y` with replacement `nsim` times and builds the interval either
# from the quantiles of the bootstrap means (boot_method = "perc") or as
# mean(y) +/- t* x sd(bootstrap means) with n - 1 degrees of freedom
# (boot_method = "se"). The show_* flags control what is printed and plotted;
# `y_name` labels the x axis of the sample plot.
#
# Returns list(sim_dist, CI) for "perc" and list(sim_dist, SE, ME, CI) for "se".
ci_single_mean_sim <- function(y, conf_level, y_name,
                               boot_method, nsim, seed,
                               show_var_types, show_summ_stats, show_res,
                               show_eda_plot, show_inf_plot){

  # fail early with a clear message; an unknown method used to surface much
  # later as "object 'ci' not found"
  if (!all(boot_method %in% c("perc", "se"))) {
    stop("`boot_method` must be \"perc\" or \"se\".", call. = FALSE)
  }

  # set seed
  if(!is.null(seed)){ set.seed(seed) }

  # calculate sample size
  n <- length(y)

  # calculate x-bar
  y_bar <- mean(y)

  # create bootstrap distribution (seq_len() so that nsim = 0 runs no
  # iterations; the draws happen in the same order as in a for loop)
  sim_dist <- vapply(
    seq_len(nsim),
    function(i) mean(sample(y, size = n, replace = TRUE)),
    numeric(1)
  )

  # for percentile method
  if(boot_method == "perc"){
    # calculate quantile cutoffs based on confidence level
    lower_quantile <- (1-conf_level) / 2
    upper_quantile <- conf_level + lower_quantile

    # calculate quantiles of the bootstrap distribution
    ci_lower <- as.numeric(quantile(sim_dist, lower_quantile))
    ci_upper <- as.numeric(quantile(sim_dist, upper_quantile))

    # put CI together
    ci <- c(ci_lower, ci_upper)
  }

  # for standard error method
  if(boot_method == "se"){
    # define degrees of freedom
    df <- n - 1

    # find percentile associated with critical value
    perc_crit_value <- conf_level + ((1 - conf_level) / 2)

    # find critical value
    t_star <- qt(perc_crit_value, df)

    # calculate SE
    se <- sd(sim_dist)

    # calculate ME
    me <- t_star * se

    # calculate CI
    ci <- y_bar + c(-1, 1)* me
  }

  # print variable types
  if(show_var_types == TRUE){
    cat("Single numerical variable\n")
  }

  # print summary statistics
  if(show_summ_stats == TRUE){
    s <- sd(y)
    cat(paste0("n = ", n, ", y-bar = ", round(y_bar, 4), ", s = ", round(s, 4), "\n"))
  }

  # print results
  if(show_res == TRUE){
    conf_level_perc <- conf_level * 100
    cat(paste0(conf_level_perc, "% CI: (", round(ci[1], 4), " , ", round(ci[2], 4), ")\n"))
  }

  # eda_plot
  d_eda <- data.frame(y = y)

  eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = y), environment = environment()) +
    ggplot2::geom_histogram(fill = "#8FDEE1", binwidth = diff(range(y)) / 20) +
    ggplot2::xlab(y_name) +
    ggplot2::ylab("") +
    ggplot2::ggtitle("Sample Distribution") +
    ggplot2::geom_vline(xintercept = y_bar, col = "#1FBEC3", lwd = 1.5)

  # inf_plot
  d_inf <- data.frame(sim_dist = sim_dist)

  inf_plot <- ggplot2::ggplot(data = d_inf, ggplot2::aes(x = sim_dist), environment = environment()) +
    ggplot2::geom_histogram(fill = "#CCCCCC", binwidth = diff(range(sim_dist)) / 20) +
    ggplot2::annotate("rect", xmin = ci[1], xmax = ci[2], ymin = 0, ymax = Inf,alpha = 0.3, fill = "#FABAB8") +
    ggplot2::xlab("bootstrap means") +
    ggplot2::ylab("") +
    ggplot2::ggtitle("Bootstrap Distribution") +
    ggplot2::geom_vline(xintercept = ci, color = "#F57670", lwd = 1.5)

  # print plots (scalar conditions, hence && rather than &)
  if(show_eda_plot && !show_inf_plot){
    print(eda_plot)
  }
  if(!show_eda_plot && show_inf_plot){
    print(inf_plot)
  }
  if(show_eda_plot && show_inf_plot){
    gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2)
  }

  # return
  if(boot_method == "perc"){
    return(list(sim_dist = sim_dist, CI = ci))
  } else {
    return(list(sim_dist = sim_dist, SE = se, ME = me, CI = ci))
  }

}

# --------------------------------------------------------------------------------
# /R/ci_single_mean_theo.R:
# --------------------------------------------------------------------------------
ci_single_mean_theo <- function(y, conf_level, y_name,
                                show_var_types, show_summ_stats, show_res,
                                show_eda_plot, show_inf_plot){

  # calculate sample size
  n <- length(y)

  # calculate x-bar
  y_bar <- mean(y)

  # define degrees of freedom
  df <- n - 1

  # find percentile associated with critical value
  perc_crit_value <- conf_level + ((1 - conf_level) / 2)

  # find critical value
  t_star <- qt(perc_crit_value, df)

  # calculate s
  s <- sd(y)

  # calculate SE
  se <- s / sqrt(n)

  # calculate ME
  me <- t_star * se

  # calculate CI
  ci <- y_bar + c(-1, 1)* me

  # print variable types
  if(show_var_types == TRUE){
    cat("Single numerical variable\n")
  }

  # print summary statistics
  if(show_summ_stats == TRUE){
    cat(paste0("n = ", n, ", y-bar = ", round(y_bar, 4), ", s = ", round(s, 4), "\n"))
  }

  # print results
  if(show_res == TRUE){
    conf_level_perc = conf_level * 100
    cat(paste0(conf_level_perc, "% CI: (", round(ci[1], 4), " , ", round(ci[2], 4), ")\n"))
  }

  # eda_plot
  d_eda <- data.frame(y = y)
  eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = y), environment = environment()) +
    ggplot2::geom_histogram(fill = "#8FDEE1", binwidth = diff(range(y)) / 20) +
    ggplot2::xlab(y_name) +
    ggplot2::ylab("") +
    ggplot2::ggtitle("Sample Distribution") +
    ggplot2::geom_vline(xintercept = y_bar, col = "#1FBEC3", lwd = 1.5)

  # print plots
  if(show_eda_plot){ print(eda_plot) }
  if(show_inf_plot){ warning("No inference plot available.", call.
= FALSE) } 60 | 61 | # return 62 | return(list(df = df, SE = se, ME = me, CI = ci)) 63 | 64 | } -------------------------------------------------------------------------------- /R/ci_single_median_sim.R: -------------------------------------------------------------------------------- 1 | ci_single_median_sim <- function(y, conf_level, y_name, 2 | boot_method, nsim, seed, 3 | show_var_types, show_summ_stats, show_res, 4 | show_eda_plot, show_inf_plot){ 5 | 6 | # set seed 7 | if(!is.null(seed)){ set.seed(seed) } 8 | 9 | # calculate sample size 10 | n <- length(y) 11 | 12 | # calculate x-bar 13 | med <- median(y) 14 | 15 | # create bootstrap distribution 16 | sim_dist <- rep(NA, nsim) 17 | for(i in 1:nsim){ 18 | boot_samp <- sample(y, size = n, replace = TRUE) 19 | sim_dist[i] <- median(boot_samp) 20 | } 21 | 22 | # for percentile method 23 | if(boot_method == "perc"){ 24 | # calculate quantile cutoffs based on confidence level 25 | lower_quantile <- (1-conf_level) / 2 26 | upper_quantile <- conf_level + lower_quantile 27 | 28 | # calculate quantiles of the bootstrap distribution 29 | ci_lower <- as.numeric(quantile(sim_dist, lower_quantile)) 30 | ci_upper <- as.numeric(quantile(sim_dist, upper_quantile)) 31 | 32 | # put CI together 33 | ci <- c(ci_lower, ci_upper) 34 | } 35 | 36 | # for standard error method 37 | if(boot_method == "se"){ 38 | # define degrees of freedom 39 | df <- n - 1 40 | 41 | # find percentile associated with critical value 42 | perc_crit_value <- conf_level + ((1 - conf_level) / 2) 43 | 44 | # find critical value 45 | t_star <- qt(perc_crit_value, df) 46 | 47 | # calculate SE 48 | se <- sd(sim_dist) 49 | 50 | # calculate ME 51 | me <- t_star * se 52 | 53 | # calculate CI 54 | ci <- med + c(-1, 1)* me 55 | } 56 | 57 | # print variable types 58 | if(show_var_types == TRUE){ 59 | cat("Single numerical variable\n") 60 | } 61 | 62 | # print summary statistics 63 | if(show_summ_stats == TRUE){ 64 | q_25 <- quantile(y, 0.25) 65 | q_75 <- quantile(y, 
0.75) 66 | cat(paste0("n = ", n, ", y_med = ", round(med, 4), 67 | ", Q1 = ", round(q_25, 4), ", Q3 = ", round(q_75, 4), "\n")) 68 | } 69 | 70 | # print results 71 | if(show_res == TRUE){ 72 | conf_level_perc = conf_level * 100 73 | cat(paste0(conf_level_perc, "% CI: (", round(ci[1], 4), " , ", round(ci[2], 4), ")\n")) 74 | } 75 | 76 | # eda_plot 77 | d_eda <- data.frame(y = y) 78 | eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = y), environment = environment()) + 79 | ggplot2::geom_histogram(fill = "#8FDEE1", binwidth = diff(range(y)) / 20) + 80 | ggplot2::xlab(y_name) + 81 | ggplot2::ylab("") + 82 | ggplot2::ggtitle("Sample Distribution") + 83 | ggplot2::geom_vline(xintercept = med, col = "#1FBEC3", lwd = 1.5) 84 | 85 | # inf_plot 86 | d_inf <- data.frame(sim_dist = sim_dist) 87 | inf_plot <- ggplot2::ggplot(data = d_inf, ggplot2::aes(x = sim_dist), environment = environment()) + 88 | ggplot2::geom_histogram(fill = "#CCCCCC", binwidth = max(diff(range(sim_dist)) / 20, 1)) + 89 | ggplot2::annotate("rect", xmin = ci[1], xmax = ci[2], ymin = 0, ymax = Inf, 90 | alpha = 0.3, fill = "#FABAB8") + 91 | ggplot2::xlab("bootstrap medians") + 92 | ggplot2::ylab("") + 93 | ggplot2::ggtitle("Bootstrap Distribution") + 94 | ggplot2::geom_vline(xintercept = ci, color = "#F57670", lwd = 1.5) 95 | 96 | # print plots 97 | if(show_eda_plot & !show_inf_plot){ 98 | print(eda_plot) 99 | } 100 | if(!show_eda_plot & show_inf_plot){ 101 | print(inf_plot) 102 | } 103 | if(show_eda_plot & show_inf_plot){ 104 | gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2) 105 | } 106 | 107 | # return 108 | if(boot_method == "perc"){ 109 | return(list(sim_dist = sim_dist, CI = ci)) 110 | } else { 111 | return(list(sim_dist = sim_dist, SE = se, ME = me, CI = ci)) 112 | } 113 | 114 | } -------------------------------------------------------------------------------- /R/ci_single_prop_sim.R: -------------------------------------------------------------------------------- 1 | 
ci_single_prop_sim <- function(y, success, conf_level, y_name, 2 | boot_method, nsim, seed, 3 | show_var_types, show_summ_stats, show_res, 4 | show_eda_plot, show_inf_plot){ 5 | 6 | # set seed 7 | if(!is.null(seed)){set.seed(seed)} 8 | 9 | # calculate sample size 10 | n <- length(y) 11 | 12 | # calculate p_hat 13 | p_hat <- sum(y == success) / n 14 | 15 | # create bootstrap distribution 16 | sim_dist <- rep(NA, nsim) 17 | for(i in 1:nsim){ 18 | boot_samp <- sample(y, size = n, replace = TRUE) 19 | sim_dist[i] <- sum(boot_samp == success) / n 20 | } 21 | 22 | # for percentile method 23 | if(boot_method == "perc"){ 24 | # calculate quantile cutoffs based on confidence level 25 | lower_quantile <- (1-conf_level) / 2 26 | upper_quantile <- conf_level + lower_quantile 27 | 28 | # calculate quantiles of the bootstrap distribution 29 | ci_lower <- as.numeric(quantile(sim_dist, lower_quantile)) 30 | ci_upper <- as.numeric(quantile(sim_dist, upper_quantile)) 31 | 32 | # put CI together 33 | ci <- c(ci_lower, ci_upper) 34 | } 35 | 36 | # for standard error method 37 | if(boot_method == "se"){ 38 | 39 | # find percentile associated with critical value 40 | perc_crit_value <- conf_level + ((1 - conf_level) / 2) 41 | 42 | # find critical value 43 | z_star <- qnorm(perc_crit_value) 44 | 45 | # calculate SE 46 | se <- sd(sim_dist) 47 | 48 | # calculate ME 49 | me <- z_star * se 50 | 51 | # calculate CI 52 | ci <- p_hat + c(-1, 1) * me 53 | } 54 | 55 | # print variable types 56 | if(show_var_types == TRUE){ 57 | cat(paste0("Single categorical variable, success: ", success,"\n")) 58 | } 59 | 60 | # print summary statistics 61 | if(show_summ_stats == TRUE){ 62 | cat(paste0("n = ", n, ", p-hat = ", round(p_hat, 4), "\n")) 63 | } 64 | 65 | # print results 66 | if(show_res == TRUE){ 67 | conf_level_perc = conf_level * 100 68 | cat(paste0(conf_level_perc, "% CI: (", round(ci[1], 4), " , ", round(ci[2], 4), ")\n")) 69 | } 70 | 71 | # eda_plot 72 | d_eda <- data.frame(y = y) 73 | eda_plot 
<- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = y), environment = environment()) + 74 | ggplot2::geom_bar(fill = "#8FDEE1") + 75 | ggplot2::xlab(y_name) + 76 | ggplot2::ylab("") + 77 | ggplot2::ggtitle("Sample Distribution") 78 | 79 | # inf_plot 80 | d_inf <- data.frame(sim_dist = sim_dist) 81 | inf_plot <- ggplot2::ggplot(data = d_inf, ggplot2::aes(x = sim_dist), environment = environment()) + 82 | ggplot2::geom_histogram(fill = "#CCCCCC", binwidth = diff(range(sim_dist)) / 20) + 83 | ggplot2::annotate("rect", xmin = ci[1], xmax = ci[2], ymin = 0, ymax = Inf, 84 | alpha = 0.3, fill = "#FABAB8") + 85 | ggplot2::xlab("bootstrap means") + 86 | ggplot2::ylab("") + 87 | ggplot2::ggtitle("Bootstrap Distribution") + 88 | ggplot2::geom_vline(xintercept = ci, color = "#F57670", lwd = 1.5) 89 | 90 | # print plots 91 | if(show_eda_plot & !show_inf_plot){ 92 | print(eda_plot) 93 | } 94 | if(!show_eda_plot & show_inf_plot){ 95 | print(inf_plot) 96 | } 97 | if(show_eda_plot & show_inf_plot){ 98 | gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2) 99 | } 100 | 101 | # return 102 | if(boot_method == "perc"){ 103 | return(list(sim_dist = sim_dist, CI = round(ci, 4))) 104 | } else { 105 | return(list(sim_dist = sim_dist, SE = round(se, 4), ME = round(me, 4), CI = round(ci, 4))) 106 | } 107 | 108 | } -------------------------------------------------------------------------------- /R/ci_single_prop_theo.R: -------------------------------------------------------------------------------- 1 | ci_single_prop_theo <- function(y, success, conf_level, y_name, 2 | show_var_types, show_summ_stats, show_res, 3 | show_eda_plot, show_inf_plot){ 4 | 5 | # calculate sample size 6 | n <- length(y) 7 | 8 | # calculate p-hat 9 | p_hat <- sum(y == success) / n 10 | 11 | # find percentile associated with critical value 12 | perc_crit_value <- conf_level + ((1 - conf_level) / 2) 13 | 14 | # find critical value 15 | z_star <- qnorm(perc_crit_value) 16 | 17 | # calculate SE 18 | se <- sqrt(p_hat * (1 
- p_hat) / n) 19 | 20 | # calculate ME 21 | me <- z_star * se 22 | 23 | # calculate CI 24 | ci <- p_hat + c(-1, 1) * me 25 | 26 | # print variable types 27 | if(show_var_types == TRUE){ 28 | cat(paste0("Single categorical variable, success: ", success,"\n")) 29 | } 30 | 31 | # print summary statistics 32 | if(show_summ_stats == TRUE){ 33 | cat(paste0("n = ", n, ", p-hat = ", round(p_hat, 4), "\n")) 34 | } 35 | 36 | # print results 37 | if(show_res == TRUE){ 38 | conf_level_perc = conf_level * 100 39 | cat(paste0(conf_level_perc, "% CI: (", round(ci[1], 4), " , ", round(ci[2], 4), ")\n")) 40 | } 41 | 42 | # eda_plot 43 | d_eda <- data.frame(y = y) 44 | eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = y), environment = environment()) + 45 | ggplot2::geom_bar(fill = "#8FDEE1") + 46 | ggplot2::xlab(y_name) + 47 | ggplot2::ylab("") + 48 | ggplot2::ggtitle("Sample Distribution") 49 | 50 | # print plots 51 | if(show_eda_plot){ print(eda_plot) } 52 | if(show_inf_plot){ warning("No inference plot available.") } 53 | 54 | # return 55 | return(list(SE = round(se, 4), ME = round(me, 4), CI = round(ci, 4))) 56 | } -------------------------------------------------------------------------------- /R/ci_two_mean_sim.R: -------------------------------------------------------------------------------- 1 | ci_two_mean_sim <- function(y, x, conf_level, y_name, x_name, 2 | boot_method, nsim, seed, 3 | show_var_types, show_summ_stats, show_res, 4 | show_eda_plot, show_inf_plot){ 5 | 6 | # set seed 7 | if(!is.null(seed)){ set.seed(seed) } 8 | 9 | # calculate n1 and n2 10 | ns <- by(y, x, length) 11 | n1 <- as.numeric(ns[1]) 12 | n2 <- as.numeric(ns[2]) 13 | n <- n1 + n2 14 | 15 | # calculate y-bar1 and y-bar2 16 | y_bars <- by(y, x, mean) 17 | y_bar1 <- as.numeric(y_bars[1]) 18 | y_bar2 <- as.numeric(y_bars[2]) 19 | 20 | # calculate difference in y-bars 21 | y_bar_diff <- y_bar1 - y_bar2 22 | 23 | # create bootstrap distribution 24 | y1 <- y[x == levels(x)[1]] 25 | y2 <- y[x == 
levels(x)[2]] 26 | 27 | sim_dist <- rep(NA, nsim) 28 | for(i in 1:nsim){ 29 | boot_samp1 <- sample(y1, size = n1, replace = TRUE) 30 | boot_samp2 <- sample(y2, size = n2, replace = TRUE) 31 | sim_dist[i] <- mean(boot_samp1) - mean(boot_samp2) 32 | } 33 | 34 | # for percentile method 35 | if(boot_method == "perc"){ 36 | # calculate quantile cutoffs based on confidence level 37 | lower_quantile <- (1-conf_level) / 2 38 | upper_quantile <- conf_level + lower_quantile 39 | 40 | # calculate quantiles of the bootstrap distribution 41 | ci_lower <- as.numeric(quantile(sim_dist, lower_quantile)) 42 | ci_upper <- as.numeric(quantile(sim_dist, upper_quantile)) 43 | 44 | # put CI together 45 | ci <- c(ci_lower, ci_upper) 46 | } 47 | 48 | # for standard error method 49 | if(boot_method == "se"){ 50 | # define degrees of freedom 51 | df <- min(n1 - 1, n2 - 1) 52 | 53 | # find percentile associated with critical value 54 | perc_crit_value <- conf_level + ((1 - conf_level) / 2) 55 | 56 | # find critical value 57 | t_star <- qt(perc_crit_value, df) 58 | 59 | # calculate SE 60 | se <- sd(sim_dist) 61 | 62 | # calculate ME 63 | me <- t_star * se 64 | 65 | # calculate CI 66 | ci <- y_bar_diff + c(-1, 1) * me 67 | } 68 | 69 | # print variable types 70 | if(show_var_types == TRUE){ 71 | n_x_levels <- length(levels(x)) 72 | cat(paste0("Response variable: numerical, Explanatory variable: categorical (", n_x_levels," levels)\n")) 73 | } 74 | 75 | # print summary statistics 76 | gr1 <- levels(x)[1] 77 | gr2 <- levels(x)[2] 78 | 79 | if(show_summ_stats == TRUE){ 80 | sds <- by(y, x, sd) 81 | s1 <- as.numeric(sds[1]) 82 | s2 <- as.numeric(sds[2]) 83 | cat(paste0("n_", gr1, " = ", n1, ", y_bar_", gr1, " = ", round(y_bar1, 4), ", s_", gr1, " = ", round(s1, 4), "\n")) 84 | cat(paste0("n_", gr2, " = ", n2, ", y_bar_", gr2, " = ", round(y_bar2, 4), ", s_", gr2, " = ", round(s2, 4), "\n")) 85 | } 86 | 87 | # print results 88 | if(show_res == TRUE){ 89 | conf_level_perc = conf_level * 100 90 | 
cat(paste0(conf_level_perc, "% CI (", gr1 ," - ", gr2,"): (", round(ci[1], 4), " , ", round(ci[2], 4), ")\n")) 91 | } 92 | 93 | # eda_plot 94 | d_eda <- data.frame(y = y, x = x) 95 | d_means <- data.frame(y_bars = as.numeric(y_bars), x = levels(x)) 96 | 97 | eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = y), environment = environment()) + 98 | ggplot2::geom_histogram(fill = "#8FDEE1", binwidth = diff(range(y)) / 20) + 99 | ggplot2::xlab(y_name) + 100 | ggplot2::ylab(x_name) + 101 | ggplot2::ggtitle("Sample Distribution") + 102 | ggplot2::geom_vline(data = d_means, ggplot2::aes(xintercept = y_bars), col = "#1FBEC3", lwd = 1.5) + 103 | ggplot2::facet_grid(x ~ .) 104 | 105 | # inf_plot 106 | d_inf <- data.frame(sim_dist = sim_dist) 107 | inf_plot <- ggplot2::ggplot(data = d_inf, ggplot2::aes(x = sim_dist), environment = environment()) + 108 | ggplot2::geom_histogram(fill = "#CCCCCC", binwidth = diff(range(sim_dist)) / 20) + 109 | ggplot2::annotate("rect", xmin = ci[1], xmax = ci[2], ymin = 0, ymax = Inf, 110 | alpha = 0.3, fill = "#FABAB8") + 111 | ggplot2::xlab("bootstrap differences in means") + 112 | ggplot2::ylab("") + 113 | ggplot2::ggtitle("Bootstrap Distribution") + 114 | ggplot2::geom_vline(xintercept = ci, color = "#F57670", lwd = 1.5) 115 | 116 | # print plots 117 | if(show_eda_plot & !show_inf_plot){ 118 | print(eda_plot) 119 | } 120 | if(!show_eda_plot & show_inf_plot){ 121 | print(inf_plot) 122 | } 123 | if(show_eda_plot & show_inf_plot){ 124 | gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2) 125 | } 126 | 127 | # return 128 | if(boot_method == "perc"){ 129 | return(list(sim_dist = sim_dist, CI = ci)) 130 | } else { 131 | return(list(sim_dist = sim_dist, SE = se, ME = me, CI = ci)) 132 | } 133 | } -------------------------------------------------------------------------------- /R/ci_two_mean_theo.R: -------------------------------------------------------------------------------- 1 | ci_two_mean_theo <- function(y, x, conf_level, y_name, 
x_name, 2 | show_var_types, show_summ_stats, show_res, 3 | show_eda_plot, show_inf_plot){ 4 | 5 | # calculate n1 and n2 6 | ns <- by(y, x, length) 7 | n1 <- as.numeric(ns[1]) 8 | n2 <- as.numeric(ns[2]) 9 | 10 | # calculate y-bar1 and y-bar2 11 | y_bars <- by(y, x, mean) 12 | y_bar1 <- as.numeric(y_bars[1]) 13 | y_bar2 <- as.numeric(y_bars[2]) 14 | 15 | # calculate difference in y-bars 16 | y_bar_diff <- y_bar1 - y_bar2 17 | 18 | # calculate s1 and s2 19 | sds <- by(y, x, sd) 20 | s1 <- as.numeric(sds[1]) 21 | s2 <- as.numeric(sds[2]) 22 | 23 | # define degrees of freedom 24 | df <- min(n1 - 1, n2 - 1) 25 | 26 | # find percentile associated with critical value 27 | perc_crit_value <- conf_level + ((1 - conf_level) / 2) 28 | 29 | # find critical value 30 | t_star <- qt(perc_crit_value, df) 31 | 32 | # calculate SE 33 | se <- sqrt((s1^2 / n1) + (s2^2 / n2)) 34 | 35 | # calculate ME 36 | me <- t_star * se 37 | 38 | # calculate CI 39 | ci <- y_bar_diff + c(-1, 1) * me 40 | 41 | # print variable types 42 | if(show_var_types == TRUE){ 43 | n_x_levels <- length(levels(x)) 44 | cat(paste0("Response variable: numerical, Explanatory variable: categorical (", n_x_levels," levels)\n")) 45 | } 46 | 47 | # print summary statistics 48 | gr1 <- levels(x)[1] 49 | gr2 <- levels(x)[2] 50 | 51 | if(show_summ_stats == TRUE){ 52 | cat(paste0("n_", gr1, " = ", n1, ", y_bar_", gr1, " = ", round(y_bar1, 4), ", s_", gr1, " = ", round(s1, 4), "\n")) 53 | cat(paste0("n_", gr2, " = ", n2, ", y_bar_", gr2, " = ", round(y_bar2, 4), ", s_", gr2, " = ", round(s2, 4), "\n")) 54 | } 55 | 56 | # print results 57 | if(show_res == TRUE){ 58 | conf_level_perc = conf_level * 100 59 | cat(paste0(conf_level_perc, "% CI (", gr1 ," - ", gr2,"): (", round(ci[1], 4), " , ", round(ci[2], 4), ")\n")) 60 | } 61 | 62 | # eda_plot 63 | d_eda <- data.frame(y = y, x = x) 64 | d_means <- data.frame(y_bars = as.numeric(y_bars), x = levels(x)) 65 | 66 | eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = y), 
environment = environment()) + 67 | ggplot2::geom_histogram(fill = "#8FDEE1", binwidth = diff(range(y)) / 20) + 68 | ggplot2::xlab(y_name) + 69 | ggplot2::ylab(x_name) + 70 | ggplot2::ggtitle("Sample Distribution") + 71 | ggplot2::geom_vline(data = d_means, ggplot2::aes(xintercept = y_bars), col = "#1FBEC3", lwd = 1.5) + 72 | ggplot2::facet_grid(x ~ .) 73 | 74 | 75 | # print plots 76 | if(show_eda_plot){ print(eda_plot) } 77 | if(show_inf_plot){ warning("No inference plot available.") } 78 | 79 | # return 80 | return(list(df = df, SE = se, ME = me, CI = ci)) 81 | } -------------------------------------------------------------------------------- /R/ci_two_median_sim.R: -------------------------------------------------------------------------------- 1 | ci_two_median_sim <- function(y, x, conf_level, y_name, x_name, 2 | boot_method, nsim, seed, 3 | show_var_types, show_summ_stats, show_res, 4 | show_eda_plot, show_inf_plot){ 5 | 6 | # set seed 7 | if(!is.null(seed)){ set.seed(seed) } 8 | 9 | # calculate n1 and n2 10 | ns <- by(y, x, length) 11 | n1 <- as.numeric(ns[1]) 12 | n2 <- as.numeric(ns[2]) 13 | 14 | # calculate y-bar1 and y-bar2 15 | y_meds <- by(y, x, median) 16 | y_med1 <- as.numeric(y_meds[1]) 17 | y_med2 <- as.numeric(y_meds[2]) 18 | 19 | # calculate difference in y-bars 20 | y_med_diff <- y_med1 - y_med2 21 | 22 | # create bootstrap distribution 23 | y1 <- y[x == levels(x)[1]] 24 | y2 <- y[x == levels(x)[2]] 25 | 26 | sim_dist <- rep(NA, nsim) 27 | for(i in 1:nsim){ 28 | boot_samp1 <- sample(y1, size = n1, replace = TRUE) 29 | boot_samp2 <- sample(y2, size = n2, replace = TRUE) 30 | sim_dist[i] <- median(boot_samp1) - median(boot_samp2) 31 | } 32 | 33 | # for percentile method 34 | if(boot_method == "perc"){ 35 | # calculate quantile cutoffs based on confidence level 36 | lower_quantile <- (1-conf_level) / 2 37 | upper_quantile <- conf_level + lower_quantile 38 | 39 | # calculate quantiles of the bootstrap distribution 40 | ci_lower <- 
as.numeric(quantile(sim_dist, lower_quantile)) 41 | ci_upper <- as.numeric(quantile(sim_dist, upper_quantile)) 42 | 43 | # put CI together 44 | ci <- c(ci_lower, ci_upper) 45 | } 46 | 47 | # for standard error method 48 | if(boot_method == "se"){ 49 | # define degrees of freedom 50 | df <- min(n1 - 1, n2 - 1) 51 | 52 | # find percentile associated with critical value 53 | perc_crit_value <- conf_level + ((1 - conf_level) / 2) 54 | 55 | # find critical value 56 | t_star <- qt(perc_crit_value, df) 57 | 58 | # calculate SE 59 | se <- sd(sim_dist) 60 | 61 | # calculate ME 62 | me <- t_star * se 63 | 64 | # calculate CI 65 | ci <- y_med_diff + c(-1, 1) * me 66 | } 67 | 68 | # print variable types 69 | if(show_var_types == TRUE){ 70 | n_x_levels <- length(levels(x)) 71 | cat(paste0("Response variable: numerical, Explanatory variable: categorical (", n_x_levels," levels)\n")) 72 | } 73 | 74 | # print summary statistics 75 | gr1 <- levels(x)[1] 76 | gr2 <- levels(x)[2] 77 | 78 | if(show_summ_stats == TRUE){ 79 | iqrs <- by(y, x, IQR) 80 | iqr1 <- as.numeric(iqrs[1]) 81 | iqr2 <- as.numeric(iqrs[2]) 82 | cat(paste0("n_", gr1, " = ", n1, ", y_med_", gr1, " = ", round(y_med1, 4), ", IQR_", gr1, " = ", round(iqr1, 4), "\n")) 83 | cat(paste0("n_", gr2, " = ", n2, ", y_med_", gr2, " = ", round(y_med2, 4), ", IQR_", gr2, " = ", round(iqr2, 4), "\n")) 84 | } 85 | 86 | # print results 87 | if(show_res == TRUE){ 88 | conf_level_perc = conf_level * 100 89 | cat(paste0(conf_level_perc, "% CI (", gr1 ," - ", gr2,"): (", round(ci[1], 4), " , ", round(ci[2], 4), ")\n")) 90 | } 91 | 92 | # eda_plot 93 | d_eda <- data.frame(y = y, x = x) 94 | 95 | eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = x, y = y), environment = environment()) + 96 | ggplot2::geom_boxplot(color = "#1FBEC3", fill = "#8FDEE1", outlier.colour = "#1FBEC3") + 97 | ggplot2::xlab(x_name) + 98 | ggplot2::ylab(y_name) + 99 | ggplot2::ggtitle("Sample Distribution") 100 | 101 | # inf_plot 102 | d_inf <- 
data.frame(sim_dist = sim_dist) 103 | inf_plot <- ggplot2::ggplot(data = d_inf, ggplot2::aes(x = sim_dist), environment = environment()) + 104 | ggplot2::geom_histogram(fill = "#CCCCCC", binwidth = diff(range(sim_dist)) / 20) + 105 | ggplot2::annotate("rect", xmin = ci[1], xmax = ci[2], ymin = 0, ymax = Inf, 106 | alpha = 0.3, fill = "#FABAB8") + 107 | ggplot2::xlab("bootstrap differences in medians") + 108 | ggplot2::ylab("") + 109 | ggplot2::ggtitle("Bootstrap Distribution") + 110 | ggplot2::geom_vline(xintercept = ci, color = "#F57670", lwd = 1.5) 111 | 112 | # print plots 113 | if(show_eda_plot & !show_inf_plot){ 114 | print(eda_plot) 115 | } 116 | if(!show_eda_plot & show_inf_plot){ 117 | print(inf_plot) 118 | } 119 | if(show_eda_plot & show_inf_plot){ 120 | gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2) 121 | } 122 | 123 | # return 124 | if(boot_method == "perc"){ 125 | return(list(sim_dist = sim_dist, CI = ci)) 126 | } else { 127 | return(list(sim_dist = sim_dist, SE = se, ME = me, CI = ci)) 128 | } 129 | } -------------------------------------------------------------------------------- /R/ci_two_prop_sim.R: -------------------------------------------------------------------------------- 1 | ci_two_prop_sim <- function(y, x, success, conf_level, 2 | x_name, y_name, 3 | boot_method, nsim, seed, 4 | show_var_types, show_summ_stats, show_res, 5 | show_eda_plot, show_inf_plot){ 6 | 7 | # set seed 8 | if(!is.null(seed)){ set.seed(seed) } 9 | 10 | # calculate n1 and n2 11 | ns <- by(y, x, length) 12 | n1 <- as.numeric(ns[1]) 13 | n2 <- as.numeric(ns[2]) 14 | 15 | # calculate p-hat1 and p-hat2 16 | p_hat1 <- sum(y[x == levels(x)[1]] == success) / n1 17 | p_hat2 <- sum(y[x == levels(x)[2]] == success) / n2 18 | 19 | # calculate difference in p-hats 20 | p_hat_diff <- p_hat1 - p_hat2 21 | 22 | # create bootstrap distribution 23 | y1 <- y[x == levels(x)[1]] 24 | y2 <- y[x == levels(x)[2]] 25 | 26 | sim_dist <- rep(NA, nsim) 27 | for(i in 1:nsim){ 28 | 
boot_samp1 <- sample(y1, size = n1, replace = TRUE) 29 | boot_samp2 <- sample(y2, size = n2, replace = TRUE) 30 | boot_phat1 <- sum(boot_samp1 == success) / n1 31 | boot_phat2 <- sum(boot_samp2 == success) / n2 32 | sim_dist[i] <- boot_phat1 - boot_phat2 33 | } 34 | 35 | # for percentile method 36 | if(boot_method == "perc"){ 37 | # calculate quantile cutoffs based on confidence level 38 | lower_quantile <- (1-conf_level) / 2 39 | upper_quantile <- conf_level + lower_quantile 40 | 41 | # calculate quantiles of the bootstrap distribution 42 | ci_lower <- as.numeric(quantile(sim_dist, lower_quantile)) 43 | ci_upper <- as.numeric(quantile(sim_dist, upper_quantile)) 44 | 45 | # put CI together 46 | ci <- c(ci_lower, ci_upper) 47 | } 48 | 49 | # for standard error method 50 | if(boot_method == "se"){ 51 | 52 | # find percentile associated with critical value 53 | perc_crit_value <- conf_level + ((1 - conf_level) / 2) 54 | 55 | # find critical value 56 | z_star <- qnorm(perc_crit_value) 57 | 58 | # calculate SE 59 | se <- sd(sim_dist) 60 | 61 | # calculate ME 62 | me <- z_star * se 63 | 64 | # calculate CI 65 | ci <- p_hat_diff + c(-1, 1) * me 66 | } 67 | 68 | # print variable types (y is the response, x is the explanatory variable) 69 | if(show_var_types == TRUE){ 70 | n_x_levels <- length(levels(x)) 71 | n_y_levels <- length(levels(y)) 72 | cat(paste0("Response variable: categorical (", n_y_levels, " levels, success: ", success, ")\n")) 73 | cat(paste0("Explanatory variable: categorical (", n_x_levels, " levels) \n")) 74 | } 75 | 76 | # print summary statistics (group labels are set unconditionally because the results output below uses them too) 77 | gr1 <- levels(x)[1] 78 | gr2 <- levels(x)[2] 79 | if(show_summ_stats == TRUE){ 80 | cat(paste0("n_", gr1, " = ", n1, ", p_hat_", gr1, " = ", round(p_hat1, 4), "\n")) 81 | cat(paste0("n_", gr2, " = ", n2, ", p_hat_", gr2, " = ", round(p_hat2, 4), "\n")) 82 | } 83 | 84 | # print results 85 | if(show_res == TRUE){ 86 | conf_level_perc = conf_level * 100 87 | cat(paste0(conf_level_perc, "% CI (", gr1 ," - ", gr2,"): (", round(ci[1], 4), " , ", round(ci[2], 4),
")\n")) 88 | } 89 | 90 | # eda_plot 91 | d_eda <- data.frame(y = y, x = x) 92 | 93 | if(which(levels(y) == success) == 1){ 94 | fill_values = c("#1FBEC3", "#8FDEE1") 95 | } else { 96 | fill_values = c("#8FDEE1", "#1FBEC3") 97 | } 98 | 99 | eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = x, fill = y), environment = environment()) + 100 | ggplot2::geom_bar() + 101 | ggplot2::scale_fill_manual(values = fill_values) + 102 | ggplot2::xlab(x_name) + 103 | ggplot2::ylab("") + 104 | ggplot2::ggtitle("Sample Distribution") + 105 | ggplot2::guides(fill = ggplot2::guide_legend(title = y_name)) 106 | 107 | # inf_plot 108 | d_inf <- data.frame(sim_dist = sim_dist) 109 | inf_plot <- ggplot2::ggplot(data = d_inf, ggplot2::aes(x = sim_dist), environment = environment()) + 110 | ggplot2::geom_histogram(fill = "#CCCCCC", binwidth = diff(range(sim_dist)) / 20) + 111 | ggplot2::annotate("rect", xmin = ci[1], xmax = ci[2], ymin = 0, ymax = Inf, 112 | alpha = 0.3, fill = "#FABAB8") + 113 | ggplot2::xlab("bootstrap differences in proportions") + 114 | ggplot2::ylab("") + 115 | ggplot2::ggtitle("Bootstrap Distribution") + 116 | ggplot2::geom_vline(xintercept = ci, color = "#F57670", lwd = 1.5) 117 | 118 | # print plots 119 | if(show_eda_plot & !show_inf_plot){ 120 | print(eda_plot) 121 | } 122 | if(!show_eda_plot & show_inf_plot){ 123 | print(inf_plot) 124 | } 125 | if(show_eda_plot & show_inf_plot){ 126 | gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2) 127 | } 128 | 129 | # return 130 | if(boot_method == "perc"){ 131 | return(list(sim_dist = sim_dist, CI = ci)) 132 | } else { 133 | return(list(sim_dist = sim_dist, SE = se, ME = me, CI = ci)) 134 | } 135 | 136 | } -------------------------------------------------------------------------------- /R/ci_two_prop_theo.R: -------------------------------------------------------------------------------- 1 | ci_two_prop_theo <- function(y, x, success, conf_level, 2 | x_name, y_name, 3 | show_var_types, show_summ_stats, show_res, 4 
| show_eda_plot, show_inf_plot){ 5 | 6 | # calculate n1 and n2 7 | ns <- by(y, x, length) 8 | n1 <- as.numeric(ns[1]) 9 | n2 <- as.numeric(ns[2]) 10 | 11 | # calculate p-hat1 and p-hat2 12 | p_hat1 <- sum(y[x == levels(x)[1]] == success) / n1 13 | p_hat2 <- sum(y[x == levels(x)[2]] == success) / n2 14 | 15 | # calculate difference in p-hats 16 | p_hat_diff <- p_hat1 - p_hat2 17 | 18 | # find percentile associated with critical value 19 | perc_crit_value <- conf_level + ((1 - conf_level) / 2) 20 | 21 | # find critical value 22 | z_star <- qnorm(perc_crit_value) 23 | 24 | # calculate SE 25 | se <- sqrt((p_hat1 * (1 - p_hat1) / n1) + (p_hat2 * (1 - p_hat2) / n2)) 26 | 27 | # calculate ME 28 | me <- z_star * se 29 | 30 | # calculate CI 31 | ci <- p_hat_diff + c(-1, 1) * me 32 | 33 | # print variable types (y is the response, x is the explanatory variable) 34 | if(show_var_types == TRUE){ 35 | n_x_levels <- length(levels(x)) 36 | n_y_levels <- length(levels(y)) 37 | cat(paste0("Response variable: categorical (", n_y_levels, " levels, success: ", success, ")\n")) 38 | cat(paste0("Explanatory variable: categorical (", n_x_levels, " levels) \n")) 39 | } 40 | 41 | # print summary statistics (group labels are set unconditionally because the results output below uses them too) 42 | gr1 <- levels(x)[1] 43 | gr2 <- levels(x)[2] 44 | if(show_summ_stats == TRUE){ 45 | cat(paste0("n_", gr1, " = ", n1, ", p_hat_", gr1, " = ", round(p_hat1, 4), "\n")) 46 | cat(paste0("n_", gr2, " = ", n2, ", p_hat_", gr2, " = ", round(p_hat2, 4), "\n")) 47 | } 48 | 49 | # print results 50 | if(show_res == TRUE){ 51 | conf_level_perc = conf_level * 100 52 | cat(paste0(conf_level_perc, "% CI (", gr1 ," - ", gr2,"): (", round(ci[1], 4), " , ", round(ci[2], 4), ")\n")) 53 | } 54 | 55 | # eda_plot 56 | d_eda <- data.frame(y = y, x = x) 57 | 58 | if(which(levels(y) == success) == 1){ 59 | fill_values = c("#1FBEC3", "#8FDEE1") 60 | } else { 61 | fill_values = c("#8FDEE1", "#1FBEC3") 62 | } 63 | 64 | eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = x, fill = y), environment = environment()) + 65 | ggplot2::geom_bar(position =
"fill") + 66 | ggplot2::scale_fill_manual(values = fill_values) + 67 | ggplot2::xlab(x_name) + 68 | ggplot2::ylab("") + 69 | ggplot2::ggtitle("Sample Distribution") + 70 | ggplot2::guides(fill = ggplot2::guide_legend(title = y_name)) 71 | 72 | # print plots 73 | if(show_eda_plot){ print(eda_plot) } 74 | if(show_inf_plot){ warning("No inference plot available.") } 75 | 76 | # return 77 | return(list(SE = se, ME = me, CI = ci)) 78 | } -------------------------------------------------------------------------------- /R/evals.R: -------------------------------------------------------------------------------- 1 | #' Teachers evaluations at the University of Texas at Austin 2 | #' 3 | #' The data were gathered from end of semester student evaluations for a large 4 | #' sample of professors from the University of Texas at Austin (variables beginning 5 | #' with \code{cls}). In addition, six students rated the professors' physical 6 | #' appearance (variables beginning with \code{bty}). (This is a slightly modified 7 | #' version of the original data set that was released as part of the replication 8 | #' data for Data Analysis Using Regression and Multilevel/Hierarchical Models 9 | #' (Gelman and Hill, 2007). 
10 | #' 11 | #' @format A data frame with 463 rows and 21 variables: 12 | #' \describe{ 13 | #' \item{score}{Average professor evaluation score: (1) very unsatisfactory - (5) excellent} 14 | #' \item{rank}{Rank of professor: teaching, tenure track, tenure} 15 | #' \item{ethnicity}{Ethnicity of professor: not minority, minority} 16 | #' \item{gender}{Gender of professor: female, male} 17 | #' \item{language}{Language of school where professor received education: english or non-english} 18 | #' \item{age}{Age of professor} 19 | #' \item{cls_perc_eval}{Percent of students in class who completed evaluation} 20 | #' \item{cls_did_eval}{Number of students in class who completed evaluation} 21 | #' \item{cls_students}{Total number of students in class} 22 | #' \item{cls_level}{Class level: lower, upper} 23 | #' \item{cls_profs}{Number of professors teaching sections in course in sample: single, multiple} 24 | #' \item{cls_credits}{Number of credits of class: one credit (lab, PE, etc.), multi credit} 25 | #' \item{bty_f1lower}{Beauty rating of professor from lower level female: (1) lowest - (10) highest} 26 | #' \item{bty_f1upper}{Beauty rating of professor from upper level female: (1) lowest - (10) highest} 27 | #' \item{bty_f2upper}{Beauty rating of professor from second upper level female: (1) lowest - (10) highest} 28 | #' \item{bty_m1lower}{Beauty rating of professor from lower level male: (1) lowest - (10) highest} 29 | #' \item{bty_m1upper}{Beauty rating of professor from upper level male: (1) lowest - (10) highest} 30 | #' \item{bty_m2upper}{Beauty rating of professor from second upper level male: (1) lowest - (10) highest} 31 | #' \item{bty_avg}{Average beauty rating of professor} 32 | #' \item{pic_outfit}{Outfit of professor in picture: not formal, formal} 33 | #' \item{pic_color}{Color of professor's picture: color, black & white} 34 | #' } 35 | #' @source These data appear in Hamermesh DS, and Parker A. 2005. 
Beauty in the 36 | #' classroom: instructors' pulchritude and putative pedagogical productivity. Economics of Education Review 37 | #' 24(4):369-376. 38 | "evals"
--------------------------------------------------------------------------------
/R/globals.R:
--------------------------------------------------------------------------------
# Register these names with utils::globalVariables() so that code-analysis
# checks do not report them as undefined global variables.
utils::globalVariables(c("outcome", "play", "prob", "x_bar"))
--------------------------------------------------------------------------------
/R/ht_many_mean_theo.R:
--------------------------------------------------------------------------------
# One-way ANOVA (theoretical F test) of a numerical response across the levels
# of a categorical explanatory variable.
#
# Arguments:
#   y, x               response vector and grouping factor.
#   null, alternative  accepted but not used by this function.
#   sig_level          pairwise t tests are printed only when the ANOVA
#                      p-value is below this level.
#   y_name, x_name     labels used in the printed table and on plot axes.
#   show_var_types, show_summ_stats, show_res
#                      logical flags selecting which text sections to print.
#   show_eda_plot, show_inf_plot
#                      logical flags selecting which plots to draw.
#
# Returns a list with elements F, df1, df2 and p_value.
ht_many_mean_theo <- function(y, x, null, alternative, sig_level,
                              y_name, x_name,
                              show_var_types, show_summ_stats, show_res,
                              show_eda_plot, show_inf_plot) {
  # group labels and per-group summaries; computed once, up front, so that
  # every output section works regardless of which other flags are set
  grs <- levels(x)
  n_x_levels <- length(grs)
  ns <- by(y, x, length)
  y_bars <- by(y, x, mean)
  sds <- by(y, x, sd)

  # anova
  res <- anova(lm(y ~ x))

  # anova pieces
  deg_frs <- res$Df
  ss <- res$`Sum Sq`
  ms <- res$`Mean Sq`
  stat <- res$`F value`[1]
  p_value <- res$`Pr(>F)`[1]

  # append the "Total" row
  ss <- c(ss, sum(ss))
  deg_frs <- c(deg_frs, sum(deg_frs))

  # printable ANOVA table; NA cells are blanked out when printed
  p_value_print <- ifelse(round(p_value, 4) == 0, "< 0.0001", round(p_value, 4))
  anova_output <- data.frame(
    df = deg_frs,
    Sum_Sq = as.character(round(ss, 4)),
    Mean_Sq = as.character(c(round(ms, 4), NA)),
    F = as.character(c(round(stat, 4), NA, NA)),
    p_value = as.character(c(p_value_print, NA, NA)),
    row.names = c(x_name, "Residuals", "Total")
  )

  # print variable types
  if (show_var_types) {
    cat(paste0("Response variable: numerical\n"))
    cat(paste0("Explanatory variable: categorical (", n_x_levels, " levels) \n"))
  }

  # print summary statistics
  if (show_summ_stats) {
    y_bars_print <- round(as.numeric(y_bars), 4)
    sds_print <- round(as.numeric(sds), 4)
    for (i in seq_len(n_x_levels)) {
      cat(paste0("n_", grs[i], " = ", ns[i], ", y_bar_", grs[i], " = ", y_bars_print[i],
                 ", s_", grs[i], " = ", sds_print[i], "\n"))
    }
    cat("\n")
  }

  # print results
  if (show_res) {
    cat("ANOVA:\n")
    print(anova_output, na.print = "", digits = 4)

    # post-hoc tests (if ANOVA is significant)
    if (p_value < sig_level) {
      cat("\nPairwise tests - ")
      pairwise <- pairwise.t.test(y, x, p.adjust.method = "none", pool.sd = TRUE)
      cat(paste0(pairwise$method, ":\n"))
      print(broom::tidy(pairwise), digits = 4)
    }
  }

  # plots are only built when at least one of them is requested
  if (show_eda_plot || show_inf_plot) {
    # eda_plot: one histogram facet per group, with the group mean marked
    d_eda <- data.frame(y = y, x = x)
    d_means <- data.frame(y_bars = as.numeric(y_bars), x = levels(x))

    eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = y), environment = environment()) +
      ggplot2::geom_histogram(fill = "#8FDEE1", binwidth = diff(range(y)) / 20) +
      ggplot2::xlab(y_name) +
      ggplot2::ylab(x_name) +
      ggplot2::ggtitle("Sample Distribution") +
      ggplot2::geom_vline(data = d_means, ggplot2::aes(xintercept = y_bars), col = "#1FBEC3", lwd = 1.5) +
      ggplot2::facet_grid(x ~ .)

    # inf_plot: F density with the area to the right of the statistic shaded
    x_max <- max(qf(0.99, df1 = deg_frs[1], df2 = deg_frs[2]), stat*1.1)
    inf_plot <- ggplot2::ggplot(data.frame(x = c(0, x_max)), ggplot2::aes(x)) +
      ggplot2::stat_function(fun = df, args = list(df1 = deg_frs[1], df2 = deg_frs[2]), color = "#999999") +
      ggplot2::annotate("rect", xmin = stat, xmax = stat+Inf, ymin = 0, ymax = Inf,
                        alpha = 0.3, fill = "#FABAB8") +
      ggplot2::ggtitle(paste0("F Distribution\n(df_G = ", deg_frs[1], ", df_E = ", deg_frs[2], ")")) +
      ggplot2::xlab("") +
      ggplot2::ylab("") +
      ggplot2::geom_vline(xintercept = stat, color = "#F57670", lwd = 1.5)

    # print plots
    if (show_eda_plot && show_inf_plot) {
      gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2)
    } else if (show_eda_plot) {
      print(eda_plot)
    } else {
      print(inf_plot)
    }
  }

  # return
  list(F = stat, df1 = deg_frs[1], df2 = deg_frs[2], p_value = p_value)
}
--------------------------------------------------------------------------------
/R/ht_many_prop_sim.R:
--------------------------------------------------------------------------------
# Simulation-based chi-square test of independence between two categorical
# variables.
#
# Arguments:
#   y, x            response and explanatory variables.
#   x_name, y_name  labels used in the printed hypotheses and the plot.
#   seed            if not NULL, passed to set.seed() before simulating.
#   nsim            requested number of simulations; the number passed to
#                   chisq.test() is min(2000, nsim).
#   show_var_types, show_summ_stats, show_res
#                   logical flags selecting which text sections to print.
#   show_eda_plot   draw the stacked-proportion bar chart.
#   show_inf_plot   no inference plot exists; TRUE only raises a warning.
#
# Returns a list with elements chi_sq and p_value.
ht_many_prop_sim <- function(y, x, x_name, y_name, seed, nsim,
                             show_var_types, show_summ_stats, show_res,
                             show_eda_plot, show_inf_plot) {

  # set seed
  if (!is.null(seed)) {
    set.seed(seed)
  }

  # chi-sq test of independence
  res <- chisq.test(x, y, correct = FALSE, simulate.p.value = TRUE, B = min(2000, nsim))
  stat <- res$statistic

  # print variable types
  if (show_var_types) {
    n_x_levels <- length(levels(x))
    n_y_levels <- length(levels(y))
    cat(paste0("Response variable: categorical (", n_y_levels, " levels) \n"))
    cat(paste0("Explanatory variable: categorical (", n_x_levels, " levels) \n"))
  }

  # print summary statistics
  if (show_summ_stats) {
    cat("Observed:\n")
    print(res$observed)
    cat("\n")
    cat("Expected:\n")
    print(res$expected)
    cat("\n")
  }

  # print results
  if (show_res) {
    cat(paste0("H0: ", x_name, " and ", y_name, " are independent\n"))
    cat(paste0("HA: ", x_name, " and ", y_name, " are dependent\n"))
    cat(paste0("chi_sq = ", round(as.numeric(stat), 4),
               ", p_value = ", round(res$p.value, 4), "\n"))
  }

  # eda_plot, built only when requested
  if (show_eda_plot) {
    d_eda <- data.frame(y = y, x = x)

    # one fill colour per level of y, interpolated between two shades
    n_fill_values <- length(levels(y))
    fill_values <- grDevices::colorRampPalette(c("#1FBEC3", "#C7EEF0"))( n_fill_values )

    eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = x, fill = y), environment = environment()) +
      ggplot2::geom_bar(position = "fill") +
      ggplot2::scale_fill_manual(values = fill_values) +
      ggplot2::xlab(x_name) +
      ggplot2::ylab("") +
      ggplot2::ggtitle("Sample Distribution") +
      ggplot2::guides(fill = ggplot2::guide_legend(title = y_name))

    print(eda_plot)
  }
  if (show_inf_plot) {
    warning("No inference plot available.")
  }

  # return
  list(chi_sq = as.numeric(stat), p_value = res$p.value)
}
--------------------------------------------------------------------------------
/R/ht_many_prop_theo.R:
--------------------------------------------------------------------------------
# Theoretical chi-square test of independence between two categorical
# variables.
#
# Arguments:
#   y, x            response and explanatory variables.
#   x_name, y_name  labels used in the printed hypotheses and the plot.
#   show_var_types, show_summ_stats, show_res
#                   logical flags selecting which text sections to print.
#   show_eda_plot, show_inf_plot
#                   logical flags selecting which plots to draw.
#
# Returns a list with elements chi_sq, df and p_value.
ht_many_prop_theo <- function(y, x, x_name, y_name,
                              show_var_types, show_summ_stats, show_res,
                              show_eda_plot, show_inf_plot) {

  # chi-sq test of independence
  res <- chisq.test(x, y, correct = FALSE)
  stat <- res$statistic
  deg_fr <- res$parameter

  # print variable types
  if (show_var_types) {
    n_x_levels <- length(levels(x))
    n_y_levels <- length(levels(y))
    cat(paste0("Response variable: categorical (", n_y_levels, " levels) \n"))
    cat(paste0("Explanatory variable: categorical (", n_x_levels, " levels) \n"))
  }

  # print summary statistics
  if (show_summ_stats) {
    cat("Observed:\n")
    print(res$observed)
    cat("\n")
    cat("Expected:\n")
    print(res$expected)
    cat("\n")
  }

  # print results
  if (show_res) {
    cat(paste0("H0: ", x_name, " and ", y_name, " are independent\n"))
    cat(paste0("HA: ", x_name, " and ", y_name, " are dependent\n"))
    cat(paste0("chi_sq = ", round(as.numeric(stat), 4), ", df = ", as.numeric(deg_fr),
               ", p_value = ", round(res$p.value, 4), "\n"))
  }

  # plots are only built when at least one of them is requested
  if (show_eda_plot || show_inf_plot) {
    # eda_plot
    d_eda <- data.frame(y = y, x = x)

    n_fill_values <- length(levels(y))
    fill_values <- grDevices::colorRampPalette(c("#1FBEC3", "#C7EEF0"))( n_fill_values )

    eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = x, fill = y), environment = environment()) +
      ggplot2::geom_bar(position = "fill") +
      ggplot2::scale_fill_manual(values = fill_values) +
      ggplot2::xlab(x_name) +
      ggplot2::ylab("") +
      ggplot2::ggtitle("Sample Distribution") +
      ggplot2::guides(fill = ggplot2::guide_legend(title = y_name))

    # inf_plot: chi-square density with the area right of the statistic shaded
    x_max <- max(qchisq(0.99, df = deg_fr), stat*1.1)
    inf_plot <- ggplot2::ggplot(data.frame(x = c(0, x_max)), ggplot2::aes(x)) +
      ggplot2::stat_function(fun = dchisq, args = list(df = deg_fr), color = "#999999") +
      ggplot2::annotate("rect", xmin = stat, xmax = stat+Inf, ymin = 0, ymax = Inf,
                        alpha = 0.3, fill = "#FABAB8") +
      ggplot2::ggtitle(paste0("Chi-sq Distribution\n(df = ", deg_fr, ")")) +
      ggplot2::xlab("") +
      ggplot2::ylab("") +
      ggplot2::geom_vline(xintercept = stat, color = "#F57670", lwd = 1.5)

    # print plots
    if (show_eda_plot && show_inf_plot) {
      gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2)
    } else if (show_eda_plot) {
      print(eda_plot)
    } else {
      print(inf_plot)
    }
  }

  # return
  list(chi_sq = as.numeric(stat), df = as.numeric(deg_fr), p_value = res$p.value)
}
--------------------------------------------------------------------------------
/R/ht_single_mean_sim.R:
--------------------------------------------------------------------------------
# Bootstrap hypothesis test for a single mean.
#
# The bootstrap distribution of the sample mean is shifted so that its mean
# equals `null`, and the p-value is the proportion of shifted bootstrap means
# at least as extreme as the observed mean (doubled and capped at 1 for a
# two-sided alternative).
#
# Arguments:
#   y            numerical sample.
#   null         hypothesised mean.
#   alternative  one of "greater", "less" or "twosided".
#   y_name       x-axis label of the sample plot.
#   nsim, seed   number of bootstrap samples; seed passed to set.seed() if
#                not NULL.
#   show_var_types, show_summ_stats, show_res
#                logical flags selecting which text sections to print.
#   show_eda_plot, show_inf_plot
#                logical flags selecting which plots to draw.
#
# Returns a list with elements sim_dist (the shifted bootstrap means) and
# p_value.
ht_single_mean_sim <- function(y, null, alternative, y_name,
                               nsim, seed,
                               show_var_types, show_summ_stats, show_res,
                               show_eda_plot, show_inf_plot) {

  # set seed
  if (!is.null(seed)) {
    set.seed(seed)
  }

  # calculate sample size
  n <- length(y)

  # calculate y-bar
  y_bar <- mean(y)

  # create bootstrap distribution
  boot_means <- vapply(
    seq_len(nsim),
    function(i) mean(sample(y, size = n, replace = TRUE)),
    numeric(1)
  )

  # center bootstrap distribution at null
  sim_dist <- boot_means - (mean(boot_means) - null)

  # calculate p-value
  if (alternative == "greater") {
    p_value <- sum(sim_dist >= y_bar) / nsim
  } else if (alternative == "less") {
    p_value <- sum(sim_dist <= y_bar) / nsim
  } else if (alternative == "twosided") {
    if (y_bar > null) {
      p_value <- min(2 * (sum(sim_dist >= y_bar) / nsim), 1)
    } else if (y_bar < null) {
      p_value <- min(2 * (sum(sim_dist <= y_bar) / nsim), 1)
    } else {
      p_value <- 1
    }
  }

  # print variable types
  if (show_var_types) {
    cat("Single numerical variable\n")
  }

  # print summary statistics
  if (show_summ_stats) {
    s <- sd(y)
    cat(paste0("n = ", n, ", y-bar = ", round(y_bar, 4), ", s = ", round(s, 4), "\n"))
  }

  # print results
  if (show_res) {
    if (alternative == "greater") {
      alt_sign <- ">"
    } else if (alternative == "less") {
      alt_sign <- "<"
    } else {
      alt_sign <- "!="
    }
    cat(paste0("H0: mu = ", null, "\n"))
    cat(paste0("HA: mu ", alt_sign, " ", null, "\n"))
    p_val_to_print <- ifelse(round(p_value, 4) == 0, "< 0.0001", round(p_value, 4))
    cat(paste0("p_value = ", p_val_to_print))
  }

  # plots are only built when at least one of them is requested
  if (show_eda_plot || show_inf_plot) {
    # shading cutoffs: each (x_min[i], x_max[i]) pair is one shaded rectangle
    if (alternative == "greater") {
      x_min <- y_bar
      x_max <- Inf
    } else if (alternative == "less") {
      x_min <- -Inf
      x_max <- y_bar
    } else if (alternative == "twosided") {
      # shade both tails, mirrored around the null value
      if (y_bar >= null) {
        x_min <- c(null - (y_bar - null), y_bar)
      } else {
        x_min <- c(y_bar, null + (null - y_bar))
      }
      x_max <- c(-Inf, Inf)
    }

    # eda_plot
    d_eda <- data.frame(y = y)

    eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = y), environment = environment()) +
      ggplot2::geom_histogram(fill = "#8FDEE1", binwidth = diff(range(y)) / 20) +
      ggplot2::xlab(y_name) +
      ggplot2::ylab("") +
      ggplot2::ggtitle("Sample Distribution") +
      ggplot2::geom_vline(xintercept = y_bar, col = "#1FBEC3", lwd = 1.5)

    # inf_plot
    d_inf <- data.frame(sim_dist = sim_dist)

    inf_plot <- ggplot2::ggplot(data = d_inf, ggplot2::aes(x = sim_dist), environment = environment()) +
      ggplot2::geom_histogram(fill = "#CCCCCC", binwidth = diff(range(sim_dist)) / 20) +
      ggplot2::annotate("rect", xmin = x_min, xmax = x_max, ymin = 0, ymax = Inf,
                        alpha = 0.3, fill = "#FABAB8") +
      ggplot2::xlab("simulated means") +
      ggplot2::ylab("") +
      ggplot2::ggtitle("Null Distribution") +
      ggplot2::geom_vline(xintercept = y_bar, color = "#F57670", lwd = 1.5)

    # print plots
    if (show_eda_plot && show_inf_plot) {
      suppressWarnings(gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2))
    } else if (show_eda_plot) {
      suppressWarnings(print(eda_plot))
    } else {
      suppressWarnings(print(inf_plot))
    }
  }

  # return
  list(sim_dist = sim_dist, p_value = p_value)
}
--------------------------------------------------------------------------------
/R/ht_single_mean_theo.R:
--------------------------------------------------------------------------------
# Theoretical (t-based) hypothesis test for a single mean.
#
# Arguments:
#   y            numerical sample.
#   null         hypothesised mean.
#   alternative  one of "greater", "less" or "twosided".
#   y_name       x-axis label of the sample plot.
#   show_var_types, show_summ_stats, show_res
#                logical flags selecting which text sections to print.
#   show_eda_plot, show_inf_plot
#                logical flags selecting which plots to draw.
#
# Returns a list with elements SE, t, df and p_value.
ht_single_mean_theo <- function(y, null, alternative, y_name,
                                show_var_types, show_summ_stats, show_res,
                                show_eda_plot, show_inf_plot) {

  # sample size, mean and standard deviation
  n <- length(y)
  y_bar <- mean(y)
  s <- sd(y)

  # calculate SE
  se <- s / sqrt(n)

  # calculate test statistic
  t <- (y_bar - null) / se

  # define degrees of freedom
  deg_fr <- n - 1

  # calculate p-value
  if (alternative == "greater") {
    p_value <- pt(t, deg_fr, lower.tail = FALSE)
  } else if (alternative == "less") {
    p_value <- pt(t, deg_fr, lower.tail = TRUE)
  } else if (alternative == "twosided") {
    p_value <- pt(abs(t), deg_fr, lower.tail = FALSE) * 2
  }

  # print variable types
  if (show_var_types) {
    cat("Single numerical variable\n")
  }

  # print summary statistics
  if (show_summ_stats) {
    cat(paste0("n = ", n, ", y-bar = ", round(y_bar, 4), ", s = ", round(s, 4), "\n"))
  }

  # print results
  if (show_res) {
    if (alternative == "greater") {
      alt_sign <- ">"
    } else if (alternative == "less") {
      alt_sign <- "<"
    } else {
      alt_sign <- "!="
    }
    cat(paste0("H0: mu = ", null, "\n"))
    cat(paste0("HA: mu ", alt_sign, " ", null, "\n"))
    cat(paste0("t = ", round(t, 4), ", df = ", deg_fr, "\n"))
    p_val_to_print <- ifelse(round(p_value, 4) == 0, "< 0.0001", round(p_value, 4))
    cat(paste0("p_value = ", p_val_to_print))
  }

  # plots are only built when at least one of them is requested
  if (show_eda_plot || show_inf_plot) {
    # shading cutoffs: each (x_min[i], x_max[i]) pair is one shaded rectangle
    if (alternative == "greater") {
      x_min <- y_bar
      x_max <- Inf
    } else if (alternative == "less") {
      x_min <- -Inf
      x_max <- y_bar
    } else if (alternative == "twosided") {
      # shade both tails, mirrored around the null value
      if (y_bar >= null) {
        x_min <- c(null - (y_bar - null), y_bar)
      } else {
        x_min <- c(y_bar, null + (null - y_bar))
      }
      x_max <- c(-Inf, Inf)
    }

    # eda_plot
    d_eda <- data.frame(y = y)

    eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes_string(x = 'y'), environment = environment()) +
      ggplot2::geom_histogram(fill = "#8FDEE1", binwidth = diff(range(y)) / 20) +
      ggplot2::xlab(y_name) +
      ggplot2::ylab("") +
      ggplot2::ggtitle("Sample Distribution") +
      ggplot2::geom_vline(xintercept = y_bar, col = "#1FBEC3", lwd = 1.5)

    # inf_plot ### TO DO: remove y axis ticks
    # the null distribution is drawn as a normal curve with mean `null` and
    # sd `se`, over null +/- 4 SE
    d_inf <- data.frame(x = c(null - 4*se, null + 4*se))
    inf_plot <- ggplot2::ggplot(d_inf, ggplot2::aes_string(x = 'x')) +
      ggplot2::stat_function(fun = dnorm, args = list(mean = null, sd = se), color = "#999999") +
      ggplot2::annotate("rect", xmin = x_min, xmax = x_max, ymin = 0, ymax = Inf,
                        alpha = 0.3, fill = "#FABAB8") +
      ggplot2::ggtitle("Null Distribution") +
      ggplot2::xlab("") +
      ggplot2::ylab("") +
      ggplot2::geom_vline(xintercept = y_bar, color = "#F57670", lwd = 1.5)

    # print plots
    if (show_eda_plot && show_inf_plot) {
      gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2)
    } else if (show_eda_plot) {
      print(eda_plot)
    } else {
      print(inf_plot)
    }
  }

  # return
  list(SE = se, t = t, df = deg_fr, p_value = p_value)
}
--------------------------------------------------------------------------------
/R/ht_single_median_sim.R:
--------------------------------------------------------------------------------
# Bootstrap hypothesis test for a single median.
#
# The bootstrap distribution of the sample median is shifted so that its mean
# equals `null`; the p-value is the proportion of shifted bootstrap medians at
# least as extreme as the observed median (doubled and capped at 1 for a
# two-sided alternative).
#
# Arguments:
#   y            numerical sample.
#   null         hypothesised median.
#   alternative  one of "greater", "less" or "twosided".
#   y_name       x-axis label of the sample plot.
#   nsim, seed   number of bootstrap samples; seed passed to set.seed() if
#                not NULL.
#   show_var_types, show_summ_stats, show_res
#                logical flags selecting which text sections to print.
#   show_eda_plot, show_inf_plot
#                logical flags selecting which plots to draw.
#
# Returns a list with elements sim_dist (the shifted bootstrap medians) and
# p_value.
ht_single_median_sim <- function(y, null, alternative, y_name,
                                 nsim, seed,
                                 show_var_types, show_summ_stats, show_res,
                                 show_eda_plot, show_inf_plot) {

  # set seed
  if (!is.null(seed)) {
    set.seed(seed)
  }

  # calculate sample size
  n <- length(y)

  # calculate sample median
  y_med <- median(y)

  # create bootstrap distribution
  boot_medians <- vapply(
    seq_len(nsim),
    function(i) as.numeric(median(sample(y, size = n, replace = TRUE))),
    numeric(1)
  )

  # center bootstrap distribution at null
  sim_dist <- boot_medians - (mean(boot_medians) - null)

  # calculate p-value
  if (alternative == "greater") {
    p_value <- sum(sim_dist >= y_med) / nsim
  } else if (alternative == "less") {
    p_value <- sum(sim_dist <= y_med) / nsim
  } else if (alternative == "twosided") {
    if (y_med > null) {
      p_value <- min(2 * (sum(sim_dist >= y_med) / nsim), 1)
    } else if (y_med < null) {
      p_value <- min(2 * (sum(sim_dist <= y_med) / nsim), 1)
    } else {
      p_value <- 1
    }
  }

  # print variable types
  if (show_var_types) {
    cat("Single numerical variable\n")
  }

  # print summary statistics
  if (show_summ_stats) {
    q_25 <- quantile(y, 0.25)
    q_75 <- quantile(y, 0.75)
    cat(paste0("n = ", n, ", y_med = ", round(y_med, 4),
               ", Q1 = ", round(q_25, 4), ", Q3 = ", round(q_75, 4), "\n"))
  }

  # print results
  if (show_res) {
    if (alternative == "greater") {
      alt_sign <- ">"
    } else if (alternative == "less") {
      alt_sign <- "<"
    } else {
      alt_sign <- "!="
    }
    cat(paste0("H0: pop_med = ", null, "\n"))
    cat(paste0("HA: pop_med ", alt_sign, " ", null, "\n"))
    p_val_to_print <- ifelse(round(p_value, 4) == 0, "< 0.0001", round(p_value, 4))
    cat(paste0("p_value = ", p_val_to_print))
  }

  # plots are only built when at least one of them is requested
  if (show_eda_plot || show_inf_plot) {
    # shading cutoffs: each (x_min[i], x_max[i]) pair is one shaded rectangle
    if (alternative == "greater") {
      x_min <- y_med
      x_max <- Inf
    } else if (alternative == "less") {
      x_min <- -Inf
      x_max <- y_med
    } else if (alternative == "twosided") {
      # shade both tails, mirrored around the null value
      if (y_med >= null) {
        x_min <- c(null - (y_med - null), y_med)
      } else {
        x_min <- c(y_med, null + (null - y_med))
      }
      x_max <- c(-Inf, Inf)
    }

    # eda_plot
    d_eda <- data.frame(y = y)

    eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = y), environment = environment()) +
      ggplot2::geom_histogram(fill = "#8FDEE1", binwidth = diff(range(y)) / 20) +
      ggplot2::xlab(y_name) +
      ggplot2::ylab("") +
      ggplot2::ggtitle("Sample Distribution") +
      ggplot2::geom_vline(xintercept = y_med, col = "#1FBEC3", lwd = 1.5)

    # inf_plot; the bin width is never allowed below 1
    d_inf <- data.frame(sim_dist = sim_dist)

    inf_plot <- ggplot2::ggplot(data = d_inf, ggplot2::aes(x = sim_dist), environment = environment()) +
      ggplot2::geom_histogram(fill = "#CCCCCC", binwidth = max(diff(range(sim_dist)) / 20, 1)) +
      ggplot2::annotate("rect", xmin = x_min, xmax = x_max, ymin = 0, ymax = Inf,
                        alpha = 0.3, fill = "#FABAB8") +
      ggplot2::xlab("simulated medians") +
      ggplot2::ylab("") +
      ggplot2::ggtitle("Null Distribution") +
      ggplot2::geom_vline(xintercept = y_med, color = "#F57670", lwd = 1.5)

    # print plots
    if (show_eda_plot && show_inf_plot) {
      gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2)
    } else if (show_eda_plot) {
      suppressWarnings(print(eda_plot))
    } else {
      print(inf_plot)
    }
  }

  # return
  list(sim_dist = sim_dist, p_value = p_value)
}
--------------------------------------------------------------------------------
/R/ht_single_prop_sim.R:
--------------------------------------------------------------------------------
# Simulation-based hypothesis test for a single proportion.
#
# The null distribution is built by drawing `nsim` samples of size n in which
# each observation is a success with probability `null`, and recording the
# proportion of successes in each.
#
# Arguments:
#   y            categorical sample.
#   success      the value of y counted as a success.
#   null         hypothesised proportion of successes.
#   alternative  one of "greater", "less" or "twosided".
#   nsim, seed   number of simulated samples; seed passed to set.seed() if
#                not NULL.
#   y_name       x-axis label of the sample plot.
#   show_var_types, show_summ_stats, show_res
#                logical flags selecting which text sections to print.
#   show_eda_plot, show_inf_plot
#                logical flags selecting which plots to draw.
#
# Returns a list with elements sim_dist (simulated proportions) and p_value.
ht_single_prop_sim <- function(y, success, null, alternative,
                               nsim, seed, y_name,
                               show_var_types, show_summ_stats,
                               show_eda_plot, show_inf_plot, show_res) {

  # set seed
  if (!is.null(seed)) {
    set.seed(seed)
  }

  # calculate sample size
  n <- length(y)

  # calculate p-hat
  p_hat <- sum(y == success) / n

  # create null distribution
  sim_dist <- vapply(
    seq_len(nsim),
    function(i) {
      sim_samp <- sample(c(TRUE, FALSE), size = n, replace = TRUE, prob = c(null, 1 - null))
      sum(sim_samp) / n
    },
    numeric(1)
  )

  # calculate p-value
  if (alternative == "greater") {
    p_value <- sum(sim_dist >= p_hat) / nsim
  } else if (alternative == "less") {
    p_value <- sum(sim_dist <= p_hat) / nsim
  } else if (alternative == "twosided") {
    if (p_hat > null) {
      p_value <- min(2 * (sum(sim_dist >= p_hat) / nsim), 1)
    } else if (p_hat < null) {
      p_value <- min(2 * (sum(sim_dist <= p_hat) / nsim), 1)
    } else {
      p_value <- 1
    }
  }

  # print variable types
  if (show_var_types) {
    cat(paste0("Single categorical variable, success: ", success,"\n"))
  }

  # print summary statistics
  if (show_summ_stats) {
    cat(paste0("n = ", n, ", p-hat = ", round(p_hat, 4), "\n"))
  }

  # print results
  if (show_res) {
    if (alternative == "greater") {
      alt_sign <- ">"
    } else if (alternative == "less") {
      alt_sign <- "<"
    } else {
      alt_sign <- "!="
    }
    cat(paste0("H0: p = ", null, "\n"))
    cat(paste0("HA: p ", alt_sign, " ", null, "\n"))
    p_val_to_print <- ifelse(round(p_value, 4) == 0, "< 0.0001", round(p_value, 4))
    cat(paste0("p_value = ", p_val_to_print))
  }

  # plots are only built when at least one of them is requested
  if (show_eda_plot || show_inf_plot) {
    # shading cutoffs: each (x_min[i], x_max[i]) pair is one shaded rectangle
    if (alternative == "greater") {
      x_min <- p_hat
      x_max <- Inf
    } else if (alternative == "less") {
      x_min <- -Inf
      x_max <- p_hat
    } else if (alternative == "twosided") {
      # shade both tails, mirrored around the null value
      if (p_hat >= null) {
        x_min <- c(null - (p_hat - null), p_hat)
      } else {
        x_min <- c(p_hat, null + (null - p_hat))
      }
      x_max <- c(-Inf, Inf)
    }

    # eda_plot
    d_eda <- data.frame(y = y)

    eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = y), environment = environment()) +
      ggplot2::geom_bar(fill = "#8FDEE1") +
      ggplot2::xlab(y_name) +
      ggplot2::ylab("") +
      ggplot2::ggtitle("Sample Distribution")

    # inf_plot
    d_inf <- data.frame(sim_dist = sim_dist)

    inf_plot <- ggplot2::ggplot(data = d_inf, ggplot2::aes(x = sim_dist), environment = environment()) +
      ggplot2::geom_histogram(fill = "#CCCCCC", binwidth = diff(range(sim_dist)) / 20) +
      ggplot2::annotate("rect", xmin = x_min, xmax = x_max, ymin = 0, ymax = Inf,
                        alpha = 0.3, fill = "#FABAB8") +
      ggplot2::xlab("simulated proportions") +
      ggplot2::ylab("") +
      ggplot2::ggtitle("Null Distribution") +
      ggplot2::geom_vline(xintercept = p_hat, color = "#F57670", lwd = 1.5)

    # print plots
    if (show_eda_plot && show_inf_plot) {
      gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2)
    } else if (show_eda_plot) {
      print(eda_plot)
    } else {
      print(inf_plot)
    }
  }

  # return
  list(sim_dist = sim_dist, p_value = p_value)
}
--------------------------------------------------------------------------------
/R/ht_single_prop_theo.R:
--------------------------------------------------------------------------------
# Theoretical (normal-approximation) hypothesis test for a single proportion.
#
# Arguments:
#   y            categorical sample.
#   success      the value of y counted as a success.
#   null         hypothesised proportion of successes.
#   alternative  one of "greater", "less" or "twosided".
#   y_name       x-axis label of the sample plot.
#   show_var_types, show_summ_stats, show_res
#                logical flags selecting which text sections to print.
#   show_eda_plot, show_inf_plot
#                logical flags selecting which plots to draw.
#
# Returns a list with elements SE, z and p_value.
ht_single_prop_theo <- function(y, success, null, alternative, y_name,
                                show_var_types, show_summ_stats, show_res,
                                show_eda_plot, show_inf_plot) {

  # calculate sample size
  n <- length(y)

  # calculate p-hat
  p_hat <- sum(y == success) / n

  # calculate SE
  # NOTE(review): the SE is computed from p_hat; the usual one-proportion
  # z test uses the null value, sqrt(null * (1 - null) / n) - confirm that
  # using p_hat here is intended.
  se <- sqrt(p_hat * (1 - p_hat) / n)

  # calculate test statistic
  z <- (p_hat - null) / se

  # calculate p-value
  if (alternative == "greater") {
    p_value <- pnorm(z, lower.tail = FALSE)
  } else if (alternative == "less") {
    p_value <- pnorm(z, lower.tail = TRUE)
  } else if (alternative == "twosided") {
    p_value <- 2 * pnorm(abs(z), lower.tail = FALSE)
  }

  # print variable types
  if (show_var_types) {
    cat(paste0("Single categorical variable, success: ", success,"\n"))
  }

  # print summary statistics
  if (show_summ_stats) {
    cat(paste0("n = ", n, ", p-hat = ", round(p_hat, 4), "\n"))
  }

  # print results
  if (show_res) {
    if (alternative == "greater") {
      alt_sign <- ">"
    } else if (alternative == "less") {
      alt_sign <- "<"
    } else {
      alt_sign <- "!="
    }
    cat(paste0("H0: p = ", null, "\n"))
    cat(paste0("HA: p ", alt_sign, " ", null, "\n"))
    p_val_to_print <- ifelse(round(p_value, 4) == 0, "< 0.0001", round(p_value, 4))
    cat(paste0("z = ", round(z, 4), "\n"))
    cat(paste0("p_value = ", p_val_to_print))
  }

  # plots are only built when at least one of them is requested
  if (show_eda_plot || show_inf_plot) {
    # shading cutoffs: each (x_min[i], x_max[i]) pair is one shaded rectangle
    if (alternative == "greater") {
      x_min <- p_hat
      x_max <- Inf
    } else if (alternative == "less") {
      x_min <- -Inf
      x_max <- p_hat
    } else if (alternative == "twosided") {
      # shade both tails, mirrored around the null value
      if (p_hat >= null) {
        x_min <- c(null - (p_hat - null), p_hat)
      } else {
        x_min <- c(p_hat, null + (null - p_hat))
      }
      x_max <- c(-Inf, Inf)
    }

    # eda_plot
    d_eda <- data.frame(y = y)

    eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = y), environment = environment()) +
      ggplot2::geom_bar(fill = "#8FDEE1") +
      ggplot2::xlab(y_name) +
      ggplot2::ylab("") +
      ggplot2::ggtitle("Sample Distribution")

    # inf_plot: normal curve with mean `null` and sd `se`, over null +/- 4 SE
    d_for_plot <- data.frame(x = c(null - 4*se, null + 4*se))
    inf_plot <- ggplot2::ggplot(d_for_plot, ggplot2::aes_string(x = 'x')) +
      ggplot2::stat_function(fun = dnorm, args = list(mean = null, sd = se), color = "#999999") +
      ggplot2::annotate("rect", xmin = x_min, xmax = x_max, ymin = 0, ymax = Inf,
                        alpha = 0.3, fill = "#FABAB8") +
      ggplot2::ggtitle("Null Distribution") +
      ggplot2::xlab("") +
      ggplot2::ylab("") +
      ggplot2::geom_vline(xintercept = p_hat, color = "#F57670", lwd = 1.5)

    # print plots
    if (show_eda_plot && show_inf_plot) {
      gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2)
    } else if (show_eda_plot) {
      print(eda_plot)
    } else {
      print(inf_plot)
    }
  }

  # return
  list(SE = se, z = z, p_value = p_value)
}
--------------------------------------------------------------------------------
/R/ht_two_mean_sim.R:
--------------------------------------------------------------------------------
# Permutation hypothesis test for a difference in two means.
#
# The null distribution is built by shuffling y across the observations
# `nsim` times and recomputing the difference between the mean of the first
# and the second level of x.
#
# Arguments:
#   y, x            numerical response and two-level grouping factor.
#   null            hypothesised difference in means.
#   alternative     one of "greater", "less" or "twosided".
#   nsim, seed      number of permutations; seed passed to set.seed() if not
#                   NULL.
#   y_name, x_name  axis labels of the sample plot.
#   show_var_types, show_summ_stats, show_res
#                   logical flags selecting which text sections to print.
#   show_eda_plot, show_inf_plot
#                   logical flags selecting which plots to draw.
#
# Returns a list with elements sim_dist (permuted differences) and p_value.
ht_two_mean_sim <- function(y, x, null, alternative, nsim, seed,
                            y_name, x_name,
                            show_var_types, show_summ_stats, show_res,
                            show_eda_plot, show_inf_plot) {

  # set seed
  if (!is.null(seed)) {
    set.seed(seed)
  }

  # group labels; defined up front because both the summary and the results
  # sections use them and either can be shown without the other
  gr1 <- levels(x)[1]
  gr2 <- levels(x)[2]

  # calculate n1 and n2
  ns <- by(y, x, length)
  n1 <- as.numeric(ns[1])
  n2 <- as.numeric(ns[2])
  n <- n1 + n2

  # calculate y-bar1 and y-bar2
  y_bars <- by(y, x, mean)
  y_bar1 <- as.numeric(y_bars[1])
  y_bar2 <- as.numeric(y_bars[2])

  # calculate difference in y-bars
  y_bar_diff <- y_bar1 - y_bar2

  # create null distribution
  # NOTE(review): the permuted differences are not shifted by `null`, while
  # the p-value branches below compare against `null` - this looks valid only
  # for null = 0; confirm against the caller.
  sim_dist <- vapply(
    seq_len(nsim),
    function(i) {
      y_sim_bars <- by(sample(y, size = n, replace = FALSE), x, mean)
      as.numeric(y_sim_bars[1]) - as.numeric(y_sim_bars[2])
    },
    numeric(1)
  )

  # calculate p-value
  if (alternative == "greater") {
    p_value <- sum(sim_dist >= y_bar_diff) / nsim
  } else if (alternative == "less") {
    p_value <- sum(sim_dist <= y_bar_diff) / nsim
  } else if (alternative == "twosided") {
    if (y_bar_diff > null) {
      p_value <- min(2 * (sum(sim_dist >= y_bar_diff) / nsim), 1)
    } else if (y_bar_diff < null) {
      p_value <- min(2 * (sum(sim_dist <= y_bar_diff) / nsim), 1)
    } else {
      p_value <- 1
    }
  }

  # print variable types
  if (show_var_types) {
    n_x_levels <- length(levels(x))
    cat(paste0("Response variable: numerical\n"))
    cat(paste0("Explanatory variable: categorical (", n_x_levels, " levels) \n"))
  }

  # print summary statistics
  if (show_summ_stats) {
    # the values labelled "s_" are standard deviations
    sds <- by(y, x, sd)
    s1 <- round(as.numeric(sds[1]), 4)
    s2 <- round(as.numeric(sds[2]), 4)
    cat(paste0("n_", gr1, " = ", n1, ", y_bar_", gr1, " = ", round(y_bar1, 4),
               ", s_", gr1, " = ", s1, "\n"))
    cat(paste0("n_", gr2, " = ", n2, ", y_bar_", gr2, " = ", round(y_bar2, 4),
               ", s_", gr2, " = ", s2, "\n"))
  }

  # print results
  if (show_res) {
    if (alternative == "greater") {
      alt_sign <- ">"
    } else if (alternative == "less") {
      alt_sign <- "<"
    } else {
      alt_sign <- "!="
    }
    cat(paste0("H0: mu_", gr1, " = mu_", gr2, "\n"))
    cat(paste0("HA: mu_", gr1, " ", alt_sign, " mu_", gr2, "\n"))
    p_val_to_print <- ifelse(round(p_value, 4) == 0, "< 0.0001", round(p_value, 4))
    cat(paste0("p_value = ", p_val_to_print))
  }

  # plots are only built when at least one of them is requested
  if (show_eda_plot || show_inf_plot) {
    # shading cutoffs: each (x_min[i], x_max[i]) pair is one shaded rectangle
    if (alternative == "greater") {
      x_min <- y_bar_diff
      x_max <- Inf
    } else if (alternative == "less") {
      x_min <- -Inf
      x_max <- y_bar_diff
    } else if (alternative == "twosided") {
      # shade both tails, mirrored around the null value
      if (y_bar_diff >= null) {
        x_min <- c(null - (y_bar_diff - null), y_bar_diff)
      } else {
        x_min <- c(y_bar_diff, null + (null - y_bar_diff))
      }
      x_max <- c(-Inf, Inf)
    }

    # eda_plot: one histogram facet per group, with the group mean marked
    d_eda <- data.frame(y = y, x = x)
    d_means <- data.frame(y_bars = as.numeric(y_bars), x = levels(x))

    eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = y), environment = environment()) +
      ggplot2::geom_histogram(fill = "#8FDEE1", binwidth = diff(range(y)) / 20) +
      ggplot2::xlab(y_name) +
      ggplot2::ylab(x_name) +
      ggplot2::ggtitle("Sample Distribution") +
      ggplot2::geom_vline(data = d_means, ggplot2::aes(xintercept = y_bars), col = "#1FBEC3", lwd = 1.5) +
      ggplot2::facet_grid(x ~ .)

    # inf_plot
    d_inf <- data.frame(sim_dist = sim_dist)

    inf_plot <- ggplot2::ggplot(data = d_inf, ggplot2::aes(x = sim_dist), environment = environment()) +
      ggplot2::geom_histogram(fill = "#CCCCCC", binwidth = diff(range(sim_dist)) / 20) +
      ggplot2::annotate("rect", xmin = x_min, xmax = x_max, ymin = 0, ymax = Inf,
                        alpha = 0.3, fill = "#FABAB8") +
      ggplot2::xlab("simulated difference in means") +
      ggplot2::ylab("") +
      ggplot2::ggtitle("Null Distribution") +
      ggplot2::geom_vline(xintercept = y_bar_diff, color = "#F57670", lwd = 1.5)

    # print plots
    if (show_eda_plot && show_inf_plot) {
      gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2)
    } else if (show_eda_plot) {
      print(eda_plot)
    } else {
      print(inf_plot)
    }
  }

  # return
  list(sim_dist = sim_dist, p_value = p_value)
}
-------------------------------------------------------------------------------- /R/ht_two_mean_theo.R: -------------------------------------------------------------------------------- 1 | 2 | ht_two_mean_theo <- function(y, x, null, alternative, 3 | y_name, x_name, 4 | show_var_types, show_summ_stats, show_res, 5 | show_eda_plot, show_inf_plot){ 6 | 7 | # calculate n1 and n2 8 | ns <- by(y, x, length) 9 | n1 <- as.numeric(ns[1]) 10 | n2 <- as.numeric(ns[2]) 11 | 12 | # calculate y-bar1 and y-bar2 13 | y_bars <- by(y, x, mean) 14 | y_bar1 <- as.numeric(y_bars[1]) 15 | y_bar2 <- as.numeric(y_bars[2]) 16 | 17 | # calculate difference in y-bars 18 | y_bar_diff <- y_bar1 - y_bar2 19 | 20 | # calculate s1 and s2 21 | sds <- by(y, x, sd) 22 | s1 <- as.numeric(sds[1]) 23 | s2 <- as.numeric(sds[2]) 24 | 25 | # calculate SE 26 | se <- sqrt((s1^2 / n1) + (s2^2 / n2)) 27 | 28 | # define degrees of freedom 29 | deg_fr <- min(n1 - 1, n2 - 1) 30 | 31 | # calculate t 32 | t <- (y_bar_diff - null) / se 33
| 34 | # shading cutoffs 35 | if(alternative == "greater"){ 36 | x_min <- y_bar_diff 37 | x_max <- Inf 38 | } 39 | if(alternative == "less"){ 40 | x_min <- -Inf 41 | x_max <- y_bar_diff 42 | } 43 | if(alternative == "twosided"){ 44 | if(y_bar_diff >= null){ 45 | x_min <- c(null - (y_bar_diff - null), y_bar_diff) 46 | x_max <- c(-Inf, Inf) 47 | } 48 | if(y_bar_diff <= null){ 49 | x_min <- c(y_bar_diff, null + (null - y_bar_diff)) 50 | x_max <- c(-Inf, Inf) 51 | } 52 | } 53 | 54 | # calculate p-value 55 | if(alternative == "greater"){ p_value <- pt(t, deg_fr, lower.tail = FALSE) } 56 | if(alternative == "less"){ p_value <- pt(t, deg_fr, lower.tail = TRUE) } 57 | if(alternative == "twosided"){ 58 | p_value <- pt(abs(t), deg_fr, lower.tail = FALSE) * 2 59 | } 60 | 61 | # print variable types 62 | if(show_var_types == TRUE){ 63 | n_x_levels <- length(levels(x)) 64 | cat(paste0("Response variable: numerical\n")) 65 | cat(paste0("Explanatory variable: categorical (", n_x_levels, " levels) \n")) 66 | } 67 | 68 | # print summary statistics 69 | if(show_summ_stats == TRUE){ 70 | gr1 <- levels(x)[1] 71 | gr2 <- levels(x)[2] 72 | cat(paste0("n_", gr1, " = ", n1, ", y_bar_", gr1, " = ", round(y_bar1, 4), ", s_", gr1, " = ", round(s1, 4), "\n")) 73 | cat(paste0("n_", gr2, " = ", n2, ", y_bar_", gr2, " = ", round(y_bar2, 4), ", s_", gr2, " = ", round(s2, 4), "\n")) 74 | } 75 | 76 | # print results 77 | if(show_res == TRUE){ 78 | if(alternative == "greater"){ 79 | alt_sign <- ">" 80 | } else if(alternative == "less"){ 81 | alt_sign <- "<" 82 | } else { 83 | alt_sign <- "!=" 84 | } 85 | cat(paste0("H0: mu_", gr1, " = mu_", gr2, "\n")) 86 | cat(paste0("HA: mu_", gr1, " ", alt_sign, " mu_", gr2, "\n")) 87 | cat(paste0("t = ", round(t, 4), ", df = ", deg_fr, "\n")) 88 | p_val_to_print <- ifelse(round(p_value, 4) == 0, "< 0.0001", round(p_value, 4)) 89 | cat(paste0("p_value = ", p_val_to_print)) 90 | } 91 | 92 | # eda_plot 93 | d_eda <- data.frame(y = y, x = x) 94 | d_means <- 
# Randomization test for a difference between two group medians.
#
# The null distribution is built by shuffling `y` across the fixed group
# labels in `x` `nsim` times and recomputing the difference in group medians
# (first level of `x` minus second level) after each shuffle.
#
# Arguments:
#   y               response values; group medians are taken with median().
#   x               grouping variable; levels(x)[1] and levels(x)[2] label the
#                   two groups, so a factor is expected.
#   null            null value used in the two-sided comparison and shading.
#   alternative     "greater", "less" or "twosided".
#   nsim            number of shuffles.
#   seed            passed to set.seed() unless NULL.
#   y_name, x_name  axis labels for the sample plot.
#   show_var_types, show_summ_stats, show_res
#                   print variable types / group summaries / hypotheses and
#                   p-value.
#   show_eda_plot, show_inf_plot
#                   draw the sample boxplot / the null-distribution plot.
#
# Returns list(sim_dist = simulated differences, p_value = p-value).
ht_two_median_sim <- function(y, x, null, alternative, nsim, seed,
                              y_name, x_name,
                              show_var_types, show_summ_stats, show_res,
                              show_eda_plot, show_inf_plot){

  # set seed
  if(!is.null(seed)){ set.seed(seed) }

  # group labels; defined up front because both the summary statistics and
  # the results printout use them
  gr1 <- levels(x)[1]
  gr2 <- levels(x)[2]

  # calculate n1 and n2
  ns <- by(y, x, length)
  n1 <- as.numeric(ns[1])
  n2 <- as.numeric(ns[2])
  n <- n1 + n2

  # calculate y-med1 and y-med2
  y_meds <- by(y, x, median)
  y_med1 <- as.numeric(y_meds[1])
  y_med2 <- as.numeric(y_meds[2])

  # calculate difference in y-meds
  y_med_diff <- y_med1 - y_med2

  # create null distribution
  sim_dist <- rep(NA_real_, nsim)
  for(i in seq_len(nsim)){
    y_sim <- sample(y, size = n, replace = FALSE)
    y_sim_meds <- by(y_sim, x, median)
    sim_dist[i] <- as.numeric(y_sim_meds[1]) - as.numeric(y_sim_meds[2])
  }

  # calculate p-value
  if(alternative == "greater"){ p_value <- sum(sim_dist >= y_med_diff) / nsim }
  if(alternative == "less"){ p_value <- sum(sim_dist <= y_med_diff) / nsim }
  if(alternative == "twosided"){
    if(y_med_diff > null){
      p_value <- min(2 * (sum(sim_dist >= y_med_diff) / nsim), 1)
    }
    if(y_med_diff < null){
      p_value <- min(2 * (sum(sim_dist <= y_med_diff) / nsim), 1)
    }
    if(y_med_diff == null){ p_value <- 1 }
  }

  # print variable types
  if(show_var_types == TRUE){
    n_x_levels <- length(levels(x))
    cat(paste0("Response variable: numerical\n"))
    cat(paste0("Explanatory variable: categorical (", n_x_levels, " levels) \n"))
  }

  # print summary statistics
  if(show_summ_stats == TRUE){
    iqrs <- by(y, x, IQR)
    iqr1 <- as.numeric(iqrs[1])
    iqr2 <- as.numeric(iqrs[2])
    cat(paste0("n_", gr1, " = ", n1, ", y_med_", gr1, " = ", round(y_med1, 4),
               ", IQR_", gr1, " = ", round(iqr1, 4), "\n"))
    cat(paste0("n_", gr2, " = ", n2, ", y_med_", gr2, " = ", round(y_med2, 4),
               ", IQR_", gr2, " = ", round(iqr2, 4), "\n"))
  }

  # print results
  # NOTE(review): the hypotheses are labelled "mu_" although the statistic is
  # a median; left unchanged so existing output stays the same.
  if(show_res == TRUE){
    if(alternative == "greater"){
      alt_sign <- ">"
    } else if(alternative == "less"){
      alt_sign <- "<"
    } else {
      alt_sign <- "!="
    }
    cat(paste0("H0: mu_", gr1, " =  mu_", gr2, "\n"))
    cat(paste0("HA: mu_", gr1, " ", alt_sign, " mu_", gr2, "\n"))
    p_val_to_print <- ifelse(round(p_value, 4) == 0, "< 0.0001", round(p_value, 4))
    cat(paste0("p_value = ", p_val_to_print))
  }

  # eda_plot (only built when it will be drawn)
  if(show_eda_plot){
    d_eda <- data.frame(y = y, x = x)

    eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = x, y = y), environment = environment()) +
      ggplot2::geom_boxplot(color = "#1FBEC3", fill = "#8FDEE1", outlier.colour = "#1FBEC3") +
      ggplot2::xlab(x_name) +
      ggplot2::ylab(y_name) +
      ggplot2::ggtitle("Sample Distribution")
  }

  # inf_plot (only built when it will be drawn)
  if(show_inf_plot){
    # shading cutoffs
    if(alternative == "greater"){
      x_min <- y_med_diff
      x_max <- Inf
    }
    if(alternative == "less"){
      x_min <- -Inf
      x_max <- y_med_diff
    }
    if(alternative == "twosided"){
      # two rectangles: one per tail, mirrored around the null value
      if(y_med_diff >= null){
        x_min <- c(null - (y_med_diff - null), y_med_diff)
      } else {
        x_min <- c(y_med_diff, null + (null - y_med_diff))
      }
      x_max <- c(-Inf, Inf)
    }

    d_inf <- data.frame(sim_dist = sim_dist)

    inf_plot <- ggplot2::ggplot(data = d_inf, ggplot2::aes(x = sim_dist), environment = environment()) +
      ggplot2::geom_histogram(fill = "#CCCCCC", binwidth = diff(range(sim_dist)) / 20) +
      ggplot2::annotate("rect", xmin = x_min, xmax = x_max, ymin = 0, ymax = Inf,
                        alpha = 0.3, fill = "#FABAB8") +
      ggplot2::xlab("simulated difference in medians") +
      ggplot2::ylab("") +
      ggplot2::ggtitle("Null Distribution") +
      ggplot2::geom_vline(xintercept = y_med_diff, color = "#F57670", lwd = 1.5)
  }

  # print plots
  if(show_eda_plot && !show_inf_plot){
    print(eda_plot)
  }
  if(!show_eda_plot && show_inf_plot){
    print(inf_plot)
  }
  if(show_eda_plot && show_inf_plot){
    gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2)
  }

  # return
  return(list(sim_dist = sim_dist, p_value = p_value))
}
# Randomization test for a difference between two group proportions.
#
# The null distribution is built by shuffling `y` across the fixed group
# labels in `x` `nsim` times and recomputing the difference in the proportion
# of `success` (first level of `x` minus second level) after each shuffle.
#
# Arguments:
#   y               categorical response; compared against `success` with ==.
#   x               grouping variable; levels(x)[1] and levels(x)[2] label the
#                   two groups, so a factor is expected.
#   success         value of `y` counted as a success.
#   null            null value used in the two-sided comparison and shading.
#   alternative     "greater", "less" or "twosided".
#   nsim            number of shuffles.
#   seed            passed to set.seed() unless NULL.
#   x_name, y_name  axis / legend labels for the sample plot.
#   show_var_types, show_summ_stats, show_res
#                   print variable types / group summaries / hypotheses and
#                   p-value.
#   show_eda_plot, show_inf_plot
#                   draw the sample bar plot / the null-distribution plot.
#
# Returns list(sim_dist = simulated differences, p_value = p-value).
ht_two_prop_sim <- function(y, x, success, null, alternative, nsim, seed,
                            x_name, y_name,
                            show_var_types, show_summ_stats, show_res,
                            show_eda_plot, show_inf_plot){

  # set seed
  if(!is.null(seed)){ set.seed(seed) }

  # group labels; defined up front because both the summary statistics and
  # the results printout use them
  gr1 <- levels(x)[1]
  gr2 <- levels(x)[2]

  # calculate n1 and n2
  ns <- by(y, x, length)
  n1 <- as.numeric(ns[1])
  n2 <- as.numeric(ns[2])

  # calculate p-hat1 and p-hat2
  suc1 <- sum(y[x == gr1] == success)
  suc2 <- sum(y[x == gr2] == success)
  p_hat1 <- suc1 / n1
  p_hat2 <- suc2 / n2

  # calculate difference in p-hats
  p_hat_diff <- p_hat1 - p_hat2

  # create null distribution
  sim_dist <- rep(NA_real_, nsim)
  for(i in seq_len(nsim)){
    y_sim <- sample(y, size = (n1 + n2), replace = FALSE)
    suc1_sim <- sum(y_sim[x == gr1] == success)
    suc2_sim <- sum(y_sim[x == gr2] == success)
    sim_dist[i] <- suc1_sim / n1 - suc2_sim / n2
  }

  # calculate p-value
  if(alternative == "greater"){ p_value <- sum(sim_dist >= p_hat_diff) / nsim }
  if(alternative == "less"){ p_value <- sum(sim_dist <= p_hat_diff) / nsim }
  if(alternative == "twosided"){
    if(p_hat_diff > null){
      p_value <- min(2 * (sum(sim_dist >= p_hat_diff) / nsim), 1)
    }
    if(p_hat_diff < null){
      p_value <- min(2 * (sum(sim_dist <= p_hat_diff) / nsim), 1)
    }
    # observed difference sits exactly on the null value: same convention as
    # the mean and median randomization tests
    if(p_hat_diff == null){ p_value <- 1 }
  }

  # print variable types
  if(show_var_types == TRUE){
    n_x_levels <- length(levels(x))
    n_y_levels <- length(levels(y))
    # y is the response, x the explanatory variable
    cat(paste0("Response variable: categorical (", n_y_levels, " levels, success: ", success, ")\n"))
    cat(paste0("Explanatory variable: categorical (", n_x_levels, " levels) \n"))
  }

  # print summary statistics
  if(show_summ_stats == TRUE){
    cat(paste0("n_", gr1, " = ", n1, ", p_hat_", gr1, " = ", round(p_hat1, 4), "\n"))
    cat(paste0("n_", gr2, " = ", n2, ", p_hat_", gr2, " = ", round(p_hat2, 4), "\n"))
  }

  # print results
  if(show_res == TRUE){
    if(alternative == "greater"){
      alt_sign <- ">"
    } else if(alternative == "less"){
      alt_sign <- "<"
    } else {
      alt_sign <- "!="
    }
    cat(paste0("H0: p_", gr1, " =  p_", gr2, "\n"))
    cat(paste0("HA: p_", gr1, " ", alt_sign, " p_", gr2, "\n"))
    p_val_to_print <- ifelse(round(p_value, 4) == 0, "< 0.0001", round(p_value, 4))
    cat(paste0("p_value = ", p_val_to_print))
  }

  # eda_plot (only built when it will be drawn)
  if(show_eda_plot){
    d_eda <- data.frame(y = y, x = x)

    # darker fill goes to whichever level of y is the success
    if(which(levels(y) == success) == 1){
      fill_values <- c("#1FBEC3", "#8FDEE1")
    } else {
      fill_values <- c("#8FDEE1", "#1FBEC3")
    }

    eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = x, fill = y), environment = environment()) +
      ggplot2::geom_bar(position = "fill") +
      ggplot2::scale_fill_manual(values = fill_values) +
      ggplot2::xlab(x_name) +
      ggplot2::ylab("") +
      ggplot2::ggtitle("Sample Distribution") +
      ggplot2::guides(fill = ggplot2::guide_legend(title = y_name))
  }

  # inf_plot (only built when it will be drawn)
  if(show_inf_plot){
    # shading cutoffs
    if(alternative == "greater"){
      x_min <- p_hat_diff
      x_max <- Inf
    }
    if(alternative == "less"){
      x_min <- -Inf
      x_max <- p_hat_diff
    }
    if(alternative == "twosided"){
      # two rectangles: one per tail, mirrored around the null value
      if(p_hat_diff >= null){
        x_min <- c(null - (p_hat_diff - null), p_hat_diff)
      } else {
        x_min <- c(p_hat_diff, null + (null - p_hat_diff))
      }
      x_max <- c(-Inf, Inf)
    }

    d_inf <- data.frame(sim_dist = sim_dist)

    inf_plot <- ggplot2::ggplot(data = d_inf, ggplot2::aes(x = sim_dist), environment = environment()) +
      ggplot2::geom_histogram(fill = "#CCCCCC", binwidth = diff(range(sim_dist)) / 20) +
      ggplot2::annotate("rect", xmin = x_min, xmax = x_max, ymin = 0, ymax = Inf,
                        alpha = 0.3, fill = "#FABAB8") +
      ggplot2::xlab("simulated difference in proportions") +
      ggplot2::ylab("") +
      ggplot2::ggtitle("Null Distribution") +
      ggplot2::geom_vline(xintercept = p_hat_diff, color = "#F57670", lwd = 1.5)
  }

  # print plots
  if(show_eda_plot && !show_inf_plot){
    print(eda_plot)
  }
  if(!show_eda_plot && show_inf_plot){
    print(inf_plot)
  }
  if(show_eda_plot && show_inf_plot){
    gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2)
  }

  # return
  return(list(sim_dist = sim_dist, p_value = p_value))
}
40 | if(alternative == "twosided"){ 41 | if(p_hat_diff >= null){ 42 | x_min <- c(null - (p_hat_diff - null), p_hat_diff) 43 | x_max <- c(-Inf, Inf) 44 | } 45 | if(p_hat_diff <= null){ 46 | x_min <- c(p_hat_diff, null + (null - p_hat_diff)) 47 | x_max <- c(-Inf, Inf) 48 | } 49 | } 50 | 51 | # calculate p-value 52 | if(alternative == "greater"){ p_value <- pnorm(z, lower.tail = FALSE) } 53 | if(alternative == "less"){ p_value <- pnorm(z, lower.tail = TRUE) } 54 | if(alternative == "twosided"){ 55 | p_value <- 2 * pnorm(abs(z), lower.tail = FALSE) 56 | } 57 | 58 | # print variable types 59 | if(show_var_types == TRUE){ 60 | n_x_levels <- length(levels(x)) 61 | n_y_levels <- length(levels(y)) 62 | cat(paste0("Response variable: categorical (", n_x_levels, " levels, success: ", success, ")\n")) 63 | cat(paste0("Explanatory variable: categorical (", n_y_levels, " levels) \n")) 64 | } 65 | 66 | # print summary statistics 67 | if(show_summ_stats == TRUE){ 68 | gr1 <- levels(x)[1] 69 | gr2 <- levels(x)[2] 70 | cat(paste0("n_", gr1, " = ", n1, ", p_hat_", gr1, " = ", round(p_hat1, 4), "\n")) 71 | cat(paste0("n_", gr2, " = ", n2, ", p_hat_", gr2, " = ", round(p_hat2, 4), "\n")) 72 | } 73 | 74 | # print results 75 | if(show_res == TRUE){ 76 | if(alternative == "greater"){ 77 | alt_sign <- ">" 78 | } else if(alternative == "less"){ 79 | alt_sign <- "<" 80 | } else { 81 | alt_sign <- "!=" 82 | } 83 | cat(paste0("H0: p_", gr1, " = p_", gr2, "\n")) 84 | cat(paste0("HA: p_", gr1, " ", alt_sign, " p_", gr2, "\n")) 85 | cat(paste0("z = ", round(z, 4), "\n")) 86 | p_val_to_print <- ifelse(round(p_value, 4) == 0, "< 0.0001", round(p_value, 4)) 87 | cat(paste0("p_value = ", p_val_to_print)) 88 | } 89 | 90 | # eda_plot 91 | d_eda <- data.frame(y = y, x = x) 92 | 93 | if(which(levels(y) == success) == 1){ 94 | fill_values = c("#1FBEC3", "#8FDEE1") 95 | } else { 96 | fill_values = c("#8FDEE1", "#1FBEC3") 97 | } 98 | 99 | eda_plot <- ggplot2::ggplot(data = d_eda, ggplot2::aes(x = x, fill = 
y), environment = environment()) + 100 | ggplot2::geom_bar(position = "fill") + 101 | ggplot2::scale_fill_manual(values = fill_values) + 102 | ggplot2::xlab(x_name) + 103 | ggplot2::ylab("") + 104 | ggplot2::ggtitle("Sample Distribution") + 105 | ggplot2::guides(fill = ggplot2::guide_legend(title = y_name)) 106 | 107 | # inf_plot 108 | inf_plot <- ggplot2::ggplot(data.frame(x = c(null - 4*se, null + 4*se)), ggplot2::aes(x)) + 109 | ggplot2::stat_function(fun = dnorm, args = list(mean = null, sd = se), color = "#999999") + 110 | ggplot2::annotate("rect", xmin = x_min, xmax = x_max, ymin = 0, ymax = Inf, 111 | alpha = 0.3, fill = "#FABAB8") + 112 | ggplot2::ggtitle("Null Distribution") + 113 | ggplot2::xlab("") + 114 | ggplot2::ylab("") + 115 | ggplot2::geom_vline(xintercept = p_hat_diff, color = "#F57670", lwd = 1.5) 116 | 117 | # print plots 118 | if(show_eda_plot & !show_inf_plot){ 119 | print(eda_plot) 120 | } 121 | if(!show_eda_plot & show_inf_plot){ 122 | print(inf_plot) 123 | } 124 | if(show_eda_plot & show_inf_plot){ 125 | gridExtra::grid.arrange(eda_plot, inf_plot, ncol = 2) 126 | } 127 | 128 | # return 129 | return(list(SE = se, z = z, p_value = p_value)) 130 | } -------------------------------------------------------------------------------- /R/kobe_basket.R: -------------------------------------------------------------------------------- 1 | #' Kobe Bryant basketball performance 2 | #' 3 | #' Data from the five games the Los Angeles Lakers played against the Orlando 4 | #' Magic in the 2009 NBA finals. 5 | #' 6 | #' Each row represents a shot Kobe Bryant took during the five games of the 7 | #' 2009 NBA finals. Kobe Bryant's performance earned him the title of Most 8 | #' Valuable Player and many spectators commented on how he appeared to show 9 | #' a hot hand. 
10 | #' 11 | #' @format A data frame with 133 rows and 6 variables: 12 | #' \describe{ 13 | #' \item{vs}{A categorical vector, ORL if the Los Angeles Lakers played 14 | #' against Orlando} 15 | #' \item{game}{A numerical vector, game in the 2009 NBA finals} 16 | #' \item{quarter}{A categorical vector, quarter in the game, OT stands for 17 | #' overtime} 18 | #' \item{time}{A character vector, time at which Kobe took a shot} 19 | #' \item{description}{A character vector, description of the shot} 20 | #' \item{shot}{A categorical vector, H if the shot was a hit, M if the shot 21 | #' was a miss} 22 | #' } 23 | "kobe_basket" -------------------------------------------------------------------------------- /R/mlb11.R: -------------------------------------------------------------------------------- 1 | #' Major League Baseball team data 2 | #' 3 | #' Data from all 30 Major League Baseball teams from the 2011 season. 4 | #' 5 | #' @format A data frame with 30 rows and 12 variables: 6 | #' \describe{ 7 | #' \item{team}{Team name.} 8 | #' \item{runs}{Number of runs.} 9 | #' \item{at_bats}{Number of at bats.} 10 | #' \item{hits}{Number of hits.} 11 | #' \item{homeruns}{Number of home runs.} 12 | #' \item{bat_avg}{Batting average.} 13 | #' \item{strikeouts}{Number of strikeouts.} 14 | #' \item{stolen_bases}{Number of stolen bases.} 15 | #' \item{wins}{Number of wins.} 16 | #' \item{new_onbase}{Newer variable: on-base percentage, a measure of 17 | #' how often a batter reaches base for any reason other than a fielding error, 18 | #' fielder's choice, dropped/uncaught third strike, fielder's obstruction, or 19 | #' catcher's interference.} 20 | #' \item{new_slug}{Newer variable: slugging percentage, popular measure of the 21 | #' power of a hitter calculated as the total bases divided by at bats.} 22 | #' \item{new_obs}{Newer variable: on-base plus slugging, calculated as the sum of the on-base and slugging percentages.} 23 | #' } 24 | #' @source 
\href{https://www.mlb.com/}{mlb.com} 25 | "mlb11" -------------------------------------------------------------------------------- /R/nc.R: -------------------------------------------------------------------------------- 1 | #' North Carolina births 2 | #' 3 | #' In 2004, the state of North Carolina released a large data set containing 4 | #' information on births recorded in this state. This data set is useful to 5 | #' researchers studying the relation between habits and practices of expectant 6 | #' mothers and the birth of their children. We will work with a random sample of 7 | #' observations from this data set. 8 | #' 9 | #' @format A tbl_df with 1000 rows and 13 variables: 10 | #' \describe{ 11 | #' \item{fage}{father's age in years} 12 | #' \item{mage}{mother's age in years} 13 | #' \item{mature}{maturity status of mother} 14 | #' \item{weeks}{length of pregnancy in weeks} 15 | #' \item{premie}{whether the birth was classified as premature (premie) or full-term} 16 | #' \item{visits}{number of hospital visits during pregnancy} 17 | #' \item{marital}{whether mother is `married` or `not married` at birth} 18 | #' \item{gained}{weight gained by mother during pregnancy in pounds} 19 | #' \item{weight}{weight of the baby at birth in pounds} 20 | #' \item{lowbirthweight}{whether baby was classified as low birthweight (`low`) or not (`not low`)} 21 | #' \item{gender}{gender of the baby, `female` or `male`} 22 | #' \item{habit}{status of the mother as a `nonsmoker` or a `smoker`} 23 | #' \item{whitemom}{whether mom is `white` or `not white`} 24 | #' } 25 | #' @source State of North Carolina. 26 | "nc" -------------------------------------------------------------------------------- /R/nycflights.R: -------------------------------------------------------------------------------- 1 | #' Flights data 2 | #' 3 | #' On-time data for a random sample of flights that departed NYC (i.e. JFK, LGA or EWR) 4 | #' in 2013. 5 | #' 6 | #' @source Hadley Wickham (2014). 
\code{nycflights13}: Data about flights departing 7 | #' NYC in 2013. R package version 0.1. 8 | #' \url{https://CRAN.R-project.org/package=nycflights13} 9 | #' @format A tbl_df with 32,735 rows and 16 variables: 10 | #' \describe{ 11 | #' \item{year,month,day}{Date of departure} 12 | #' \item{dep_time,arr_time}{Departure and arrival times, local tz.} 13 | #' \item{dep_delay,arr_delay}{Departure and arrival delays, in minutes. 14 | #' Negative times represent early departures/arrivals.} 15 | #' \item{hour,minute}{Time of departure broken into hour and minutes} 16 | #' \item{carrier}{Two letter carrier abbreviation. See \code{airlines} in the 17 | #' \code{nycflights13} package for more information} 18 | #' \item{tailnum}{Plane tail number} 19 | #' \item{flight}{Flight number} 20 | #' \item{origin,dest}{Origin and destination. See \code{airports} in the 21 | #' \code{nycflights13} package for more information, or google the airport code.} 22 | #' \item{air_time}{Amount of time spent in the air} 23 | #' \item{distance}{Distance flown} 24 | #' } 25 | "nycflights" -------------------------------------------------------------------------------- /R/plot_ss.R: -------------------------------------------------------------------------------- 1 | #' plot_ss 2 | #' 3 | #' An interactive function that will generate a scatterplot of two variables, then 4 | #' allow the user to click the plot in two locations to draw a best fitting line. 5 | #' Residuals are drawn by default; boxes representing the squared residuals are 6 | #' optional.
#'
#' @param x the name of numerical vector 1 on x-axis
#' @param y the name of numerical vector 2 on y-axis
#' @param data the dataframe in which x and y can be found
#' @param showSquares logical option to show boxes representing the squared residuals
#' @param leastSquares logical option to bypass point entry and automatically draw the least squares line
#' @return Called for its side effects: draws the plot and prints the fitted
#'   line and the sum of squared residuals.
#' @examples
#' \dontrun{plot_ss}
#' @export

plot_ss <- function(x, y, data, showSquares = FALSE, leastSquares = FALSE){
  if (missing(x) || missing(y) || missing(data)) {
    stop(simpleError("missing arguments x, y or data"))
  }

  # axis labels come from the unevaluated argument expressions
  xlab <- paste(substitute(x))
  ylab <- paste(substitute(y))

  # evaluate the column expressions inside `data`
  x <- eval(substitute(x), data)
  y <- eval(substitute(y), data)

  # keep complete pairs only
  data <- na.omit(data.frame(x = x, y = y))
  x <- data[["x"]]
  y <- data[["y"]]

  plot(y ~ x, data = data,
       asp = 1, pch = 16, xlab = xlab, ylab = ylab)

  if(leastSquares){
    m1 <- lm(y ~ x, data = data)
    y.hat <- fitted(m1)
  } else{
    # the line through the two clicked points is obtained by fitting lm()
    # to just those two points
    cat("Click two points to make a line.")
    pt1 <- locator(1)
    points(pt1$x, pt1$y, pch = 4)
    pt2 <- locator(1)
    points(pt2$x, pt2$y, pch = 4)
    pts <- data.frame("x" = c(pt1$x, pt2$x), "y" = c(pt1$y, pt2$y))
    m1 <- lm(y ~ x, data = pts)
    y.hat <- predict(m1, newdata = data)
  }
  r <- y - y.hat
  abline(m1)

  # x-coordinate of the far side of each residual square
  oSide <- x - r

  usr <- par("usr")
  LLim <- usr[1]
  RLim <- usr[2]
  outside <- oSide < LLim | oSide > RLim
  oSide[outside] <- (x + r)[outside] # move boxes to avoid margins

  for(i in seq_along(y.hat)){
    lines(rep(x[i], 2), c(y[i], y.hat[i]), lty = 2, col = "#56B4E9")
    if(showSquares){
      lines(rep(oSide[i], 2), c(y[i], y.hat[i]), lty = 3, col = "#E69F00")
      lines(c(oSide[i], x[i]), rep(y.hat[i], 2), lty = 3, col = "#E69F00")
      lines(c(oSide[i], x[i]), rep(y[i], 2), lty = 3, col = "#E69F00")
    }
  }

  SS <- round(sum(r^2), 3)
  cat("\r                                ")
  print(m1)
  cat("Sum of Squares: ", SS)
}
#' @return A tbl_df that aggregates all created samples, with the addition of a \code{replicate} column that the tbl_df is also grouped by
#' @examples
#' data(nc)
#' rep_sample_n(nc, size=10, replace=FALSE, reps=1)
#' @export

rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1)
{
  n <- nrow(tbl)

  # row indices of every sample, concatenated in replicate order
  i <- unlist(replicate(reps, sample.int(n, size, replace = replace), simplify = FALSE))

  # replicate id for each sampled row; seq_len() keeps reps = 0 from
  # expanding to c(1, 0)
  rep_id <- rep(seq_len(reps), each = size)

  rep_tbl <- cbind(replicate = rep_id, tbl[i, , drop = FALSE])

  dplyr::group_by(rep_tbl, replicate)
}
# R Markdown output format for the labs: rmarkdown::html_document() with the
# stylesheet shipped in the package and fixed highlight, theme and figure
# size settings. Any further arguments are forwarded unchanged.
statswithr_lab <- function(...) {

  # path to the stylesheet installed with the package
  lab_css <- system.file("lab.css", package = "statsr")

  rmarkdown::html_document(
    css = lab_css,
    highlight = "pygments",
    theme = "cerulean",
    fig_width = 7,
    fig_height = 4,
    ...
  )
}
11 | #' 12 | #' 13 | #' @format A data frame with 28 rows and 6 variables: 14 | #' \describe{ 15 | #' \item{date}{Date of collection} 16 | #' \item{tthm}{average total trihalomethanes in ppb } 17 | #' \item{samples}{number of samples} 18 | #' \item{nondetects}{number of samples where tthm not detected (0)} 19 | #' \item{min}{min tthm in ppb in samples} 20 | #' \item{max}{max tthm in ppb in samples} 21 | #' } 22 | #' @source National Drinking Water Database for Durham, NC. \url{https://www.ewg.org} 23 | "tapwater" -------------------------------------------------------------------------------- /R/wage.R: -------------------------------------------------------------------------------- 1 | #' Wage data 2 | #' 3 | #' The data were gathered as part of a random sample of 935 respondents throughout the United States. 4 | #' 5 | #' @format A tbl_df with 935 rows and 17 variables: 6 | #' \describe{ 7 | #' \item{wage}{weekly earnings (dollars)} 8 | #' \item{hours}{average hours worked per week} 9 | #' \item{iq}{IQ score} 10 | #' \item{kww}{Knowledge of world work score} 11 | #' \item{educ}{years of education} 12 | #' \item{exper}{years of work experience} 13 | #' \item{tenure}{years with current employer} 14 | #' \item{age}{age in years} 15 | #' \item{married}{=1 if married} 16 | #' \item{black}{=1 if black} 17 | #' \item{south}{=1 if live in south} 18 | #' \item{urban}{=1 if live in a Standard Metropolitan Statistical Area } 19 | #' \item{sibs}{number of siblings} 20 | #' \item{brthord}{birth order} 21 | #' \item{meduc}{mother's education (years)} 22 | #' \item{feduc}{father's education (years)} 23 | #' \item{lwage}{natural log of wage} 24 | #' } 25 | #' @source Jeffrey M. Wooldridge (2000). Introductory Econometrics: A Modern Approach. South-Western College Publishing.
26 | "wage" -------------------------------------------------------------------------------- /R/zinc.R: -------------------------------------------------------------------------------- 1 | #' Zinc Concentration in Water 2 | #' 3 | #' Trace metals in drinking water affect the flavor and 4 | #' an unusually high concentration can pose a health 5 | #' hazard. Ten pairs of data were taken measuring zinc 6 | #' concentration in bottom water and surface water. 7 | #' 8 | #' @format 9 | #' A data frame with 10 observations on the following 4 variables. 10 | #' \describe{ 11 | #' \item{\code{location}}{sample number} 12 | #' \item{\code{bottom}}{zinc concentration in bottom water} 13 | #' \item{\code{surface}}{zinc concentration in surface water} 14 | #' \item{\code{difference}}{difference between zinc concentration at the bottom and surface} 15 | #' } 16 | #' 17 | #' @source 18 | #' \href{https://online.stat.psu.edu/stat500/sites/stat500/files/data/zinc_conc.txt}{PennState Eberly College of Science Online Courses} 19 | #' 20 | #' @examples 21 | #' data(zinc) 22 | #' str(zinc) 23 | #' plot(bottom ~ surface, data=zinc) 24 | #' # use paired t-test to test if difference in means is zero 25 | #' 26 | "zinc" 27 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, echo = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "README-" 12 | ) 13 | ``` 14 | 15 | [![Build Status](https://travis-ci.org/StatsWithR/statsr.svg?branch=BayesFactor)](https://travis-ci.org/StatsWithR/statsr) 16 | [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version/statsr)](https://cran.r-project.org/package=statsr) 17 | [![](https://cranlogs.r-pkg.org/badges/statsr)](https://CRAN.R-project.org/package=statsr) 
[![](https://cranlogs.r-pkg.org/badges/grand-total/statsr)](https://CRAN.R-project.org/package=statsr) 18 | 19 | # statsr 20 | 21 | The `R` package `statsr` provides functions and datasets to support the Coursera [*Statistics with `R` Specialization*](https://www.coursera.org/specializations/statistics) videos and the open access book 22 | [*An Introduction to Bayesian Thinking*](https://statswithr.github.io/book/) for learning Bayesian and frequentist statistics using `R`. 23 | 24 | 25 | To install the latest version from GitHub, first verify that there is a passing badge above on the README page. Then, in `R`, enter 26 | 27 | ```{r github, echo=TRUE,eval=FALSE} 28 | library(devtools) 29 | devtools::install_github("statswithr/statsr", 30 | dependencies=TRUE, 31 | upgrade_dependencies = TRUE) 32 | ``` 33 | 34 | This will install the package and any packages that it requires, and will update any installed packages to their latest versions. 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | [![Build 5 | Status](https://travis-ci.org/StatsWithR/statsr.svg?branch=BayesFactor)](https://travis-ci.org/StatsWithR/statsr) 6 | [![CRAN\_Status\_Badge](https://www.r-pkg.org/badges/version/statsr)](https://cran.r-project.org/package=statsr) 7 | [![](https://cranlogs.r-pkg.org/badges/statsr)](https://CRAN.R-project.org/package=statsr) 8 | [![](https://cranlogs.r-pkg.org/badges/grand-total/statsr)](https://CRAN.R-project.org/package=statsr) 9 | 10 | # statsr 11 | 12 | The `R` package `statsr` provides functions and datasets to support the 13 | Coursera [*Statistics with `R` 14 | Specialization*](https://www.coursera.org/specializations/statistics) 15 | videos and open access book [*An Introduction to Bayesian 16 | Thinking*](https://statswithr.github.io/book/) for learning Bayesian and 17 | frequentist statistics using `R`.
18 | 19 | To install the latest version from github, verify that there is a 20 | passing badge above on the README page. In `R` enter 21 | 22 | ``` r 23 | library(devtools) 24 | devtools::install_github("statswithr/statsr", 25 | dependencies=TRUE, 26 | upgrade_dependencies = TRUE) 27 | ``` 28 | 29 | This will install the packages and any packages that are required, as 30 | well as updating any installed packages to their latest versions. 31 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: http://statswithr.github.io/statsr/ 2 | 3 | authors: 4 | Merlise Clyde: 5 | href: http://stat.duke.edu/~clyde 6 | navbar: 7 | title: "statsr" 8 | left: 9 | - text: "Functions" 10 | href: reference/index.html 11 | - text: "News" 12 | href: news/index.html 13 | right: 14 | - icon: fa-github 15 | href: https://github.com/statswithr/statsr 16 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | # statsr 0.3.0 Comments to CRAN 2 | 3 | Submission to update bayes_inference function so that package is consistent with book. Addresses issue #15 in GitHub 4 | 5 | 6 | ## Comments on Note from checking: 7 | 8 | None 9 | 10 | ## Test environments 11 | 12 | - local MAC OSX 11.1 R 4.0.3 13 | - Ubuntu (on travis-ci), R-release and R-devel 14 | - win-builder R-release, R-devel 15 | - R-hub ubuntu-gcc-release (R-release) 16 | - R-hub fedora-clang-devel (R-devel) 17 | 18 | ## R CMD check results 19 | 20 | On windows_x86_64-w64-mingw32 (r-devel), ubuntu-gcc-release (r-release), fedora-clang-devel (r-devel) 21 | checking CRAN incoming feasibility ... 
NOTE 22 | Maintainer: 'Merlise Clyde ' 23 | 24 | 0 errors ✓ | 0 warnings ✓ | 0 notes 25 | -------------------------------------------------------------------------------- /data/ames.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/data/ames.rda -------------------------------------------------------------------------------- /data/arbuthnot.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/data/arbuthnot.rda -------------------------------------------------------------------------------- /data/atheism.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/data/atheism.rda -------------------------------------------------------------------------------- /data/brfss.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/data/brfss.rda -------------------------------------------------------------------------------- /data/evals.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/data/evals.rda -------------------------------------------------------------------------------- /data/kobe_basket.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/data/kobe_basket.rda -------------------------------------------------------------------------------- /data/mlb11.rda: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/data/mlb11.rda -------------------------------------------------------------------------------- /data/nc.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/data/nc.rda -------------------------------------------------------------------------------- /data/nycflights.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/data/nycflights.rda -------------------------------------------------------------------------------- /data/present.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/data/present.rda -------------------------------------------------------------------------------- /data/tapwater.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/data/tapwater.rda -------------------------------------------------------------------------------- /data/wage.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/data/wage.rda -------------------------------------------------------------------------------- /data/zinc.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/data/zinc.rda 
-------------------------------------------------------------------------------- /docs/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Page not found (404) • statsr 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 55 | 56 | 57 | 58 | 59 | 60 | 61 |
62 |
63 | 99 | 100 | 101 | 102 |
103 | 104 |
105 |
106 | 109 | 110 | Content not found. Please use links in the navbar. 111 | 112 |
113 | 114 | 119 | 120 |
121 | 122 | 123 | 124 |
125 | 128 | 129 |
130 |

Site built with pkgdown 1.6.1.

131 |
132 | 133 |
134 |
135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /docs/LICENSE-text.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | License • statsr 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 55 | 56 | 57 | 58 | 59 | 60 | 61 |
62 |
63 | 99 | 100 | 101 | 102 |
103 | 104 |
105 |
106 | 109 | 110 |
YEAR: 2016
111 | COPYRIGHT HOLDER:  StatsWithR
112 | 
113 | 114 |
115 | 116 | 121 | 122 |
123 | 124 | 125 | 126 |
127 | 130 | 131 |
132 |

Site built with pkgdown 1.6.1.

133 |
134 | 135 |
136 |
137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /docs/bootstrap-toc.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) 3 | * Copyright 2015 Aidan Feldman 4 | * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ 5 | 6 | /* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ 7 | 8 | /* All levels of nav */ 9 | nav[data-toggle='toc'] .nav > li > a { 10 | display: block; 11 | padding: 4px 20px; 12 | font-size: 13px; 13 | font-weight: 500; 14 | color: #767676; 15 | } 16 | nav[data-toggle='toc'] .nav > li > a:hover, 17 | nav[data-toggle='toc'] .nav > li > a:focus { 18 | padding-left: 19px; 19 | color: #563d7c; 20 | text-decoration: none; 21 | background-color: transparent; 22 | border-left: 1px solid #563d7c; 23 | } 24 | nav[data-toggle='toc'] .nav > .active > a, 25 | nav[data-toggle='toc'] .nav > .active:hover > a, 26 | nav[data-toggle='toc'] .nav > .active:focus > a { 27 | padding-left: 18px; 28 | font-weight: bold; 29 | color: #563d7c; 30 | background-color: transparent; 31 | border-left: 2px solid #563d7c; 32 | } 33 | 34 | /* Nav: second level (shown on .active) */ 35 | nav[data-toggle='toc'] .nav .nav { 36 | display: none; /* Hide by default, but at >768px, show it */ 37 | padding-bottom: 10px; 38 | } 39 | nav[data-toggle='toc'] .nav .nav > li > a { 40 | padding-top: 1px; 41 | padding-bottom: 1px; 42 | padding-left: 30px; 43 | font-size: 12px; 44 | font-weight: normal; 45 | } 46 | nav[data-toggle='toc'] .nav .nav > li > a:hover, 47 | nav[data-toggle='toc'] .nav .nav > li > a:focus { 48 | padding-left: 29px; 49 | } 50 | nav[data-toggle='toc'] .nav .nav > .active > a, 51 | nav[data-toggle='toc'] .nav .nav > .active:hover > a, 52 | 
nav[data-toggle='toc'] .nav .nav > .active:focus > a { 53 | padding-left: 28px; 54 | font-weight: 500; 55 | } 56 | 57 | /* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ 58 | nav[data-toggle='toc'] .nav > .active > ul { 59 | display: block; 60 | } 61 | -------------------------------------------------------------------------------- /docs/bootstrap-toc.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) 3 | * Copyright 2015 Aidan Feldman 4 | * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ 5 | (function() { 6 | 'use strict'; 7 | 8 | window.Toc = { 9 | helpers: { 10 | // return all matching elements in the set, or their descendants 11 | findOrFilter: function($el, selector) { 12 | // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/ 13 | // http://stackoverflow.com/a/12731439/358804 14 | var $descendants = $el.find(selector); 15 | return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])'); 16 | }, 17 | 18 | generateUniqueIdBase: function(el) { 19 | var text = $(el).text(); 20 | var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-'); 21 | return anchor || el.tagName.toLowerCase(); 22 | }, 23 | 24 | generateUniqueId: function(el) { 25 | var anchorBase = this.generateUniqueIdBase(el); 26 | for (var i = 0; ; i++) { 27 | var anchor = anchorBase; 28 | if (i > 0) { 29 | // add suffix 30 | anchor += '-' + i; 31 | } 32 | // check if ID already exists 33 | if (!document.getElementById(anchor)) { 34 | return anchor; 35 | } 36 | } 37 | }, 38 | 39 | generateAnchor: function(el) { 40 | if (el.id) { 41 | return el.id; 42 | } else { 43 | var anchor = this.generateUniqueId(el); 44 | el.id = anchor; 45 | return anchor; 46 | } 47 | }, 48 | 49 | createNavList: function() { 50 | 
return $(''); 51 | }, 52 | 53 | createChildNavList: function($parent) { 54 | var $childList = this.createNavList(); 55 | $parent.append($childList); 56 | return $childList; 57 | }, 58 | 59 | generateNavEl: function(anchor, text) { 60 | var $a = $(''); 61 | $a.attr('href', '#' + anchor); 62 | $a.text(text); 63 | var $li = $('
  • '); 64 | $li.append($a); 65 | return $li; 66 | }, 67 | 68 | generateNavItem: function(headingEl) { 69 | var anchor = this.generateAnchor(headingEl); 70 | var $heading = $(headingEl); 71 | var text = $heading.data('toc-text') || $heading.text(); 72 | return this.generateNavEl(anchor, text); 73 | }, 74 | 75 | // Find the first heading level (`

    `, then `

    `, etc.) that has more than one element. Defaults to 1 (for `

    `). 76 | getTopLevel: function($scope) { 77 | for (var i = 1; i <= 6; i++) { 78 | var $headings = this.findOrFilter($scope, 'h' + i); 79 | if ($headings.length > 1) { 80 | return i; 81 | } 82 | } 83 | 84 | return 1; 85 | }, 86 | 87 | // returns the elements for the top level, and the next below it 88 | getHeadings: function($scope, topLevel) { 89 | var topSelector = 'h' + topLevel; 90 | 91 | var secondaryLevel = topLevel + 1; 92 | var secondarySelector = 'h' + secondaryLevel; 93 | 94 | return this.findOrFilter($scope, topSelector + ',' + secondarySelector); 95 | }, 96 | 97 | getNavLevel: function(el) { 98 | return parseInt(el.tagName.charAt(1), 10); 99 | }, 100 | 101 | populateNav: function($topContext, topLevel, $headings) { 102 | var $context = $topContext; 103 | var $prevNav; 104 | 105 | var helpers = this; 106 | $headings.each(function(i, el) { 107 | var $newNav = helpers.generateNavItem(el); 108 | var navLevel = helpers.getNavLevel(el); 109 | 110 | // determine the proper $context 111 | if (navLevel === topLevel) { 112 | // use top level 113 | $context = $topContext; 114 | } else if ($prevNav && $context === $topContext) { 115 | // create a new level of the tree and switch to it 116 | $context = helpers.createChildNavList($prevNav); 117 | } // else use the current $context 118 | 119 | $context.append($newNav); 120 | 121 | $prevNav = $newNav; 122 | }); 123 | }, 124 | 125 | parseOps: function(arg) { 126 | var opts; 127 | if (arg.jquery) { 128 | opts = { 129 | $nav: arg 130 | }; 131 | } else { 132 | opts = arg; 133 | } 134 | opts.$scope = opts.$scope || $(document.body); 135 | return opts; 136 | } 137 | }, 138 | 139 | // accepts a jQuery object, or an options object 140 | init: function(opts) { 141 | opts = this.helpers.parseOps(opts); 142 | 143 | // ensure that the data attribute is in place for styling 144 | opts.$nav.attr('data-toggle', 'toc'); 145 | 146 | var $topContext = this.helpers.createChildNavList(opts.$nav); 147 | var topLevel = 
this.helpers.getTopLevel(opts.$scope); 148 | var $headings = this.helpers.getHeadings(opts.$scope, topLevel); 149 | this.helpers.populateNav($topContext, topLevel, $headings); 150 | } 151 | }; 152 | 153 | $(function() { 154 | $('nav[data-toggle="toc"]').each(function(i, el) { 155 | var $nav = $(el); 156 | Toc.init($nav); 157 | }); 158 | }); 159 | })(); 160 | -------------------------------------------------------------------------------- /docs/docsearch.js: -------------------------------------------------------------------------------- 1 | $(function() { 2 | 3 | // register a handler to move the focus to the search bar 4 | // upon pressing shift + "/" (i.e. "?") 5 | $(document).on('keydown', function(e) { 6 | if (e.shiftKey && e.keyCode == 191) { 7 | e.preventDefault(); 8 | $("#search-input").focus(); 9 | } 10 | }); 11 | 12 | $(document).ready(function() { 13 | // do keyword highlighting 14 | /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ 15 | var mark = function() { 16 | 17 | var referrer = document.URL ; 18 | var paramKey = "q" ; 19 | 20 | if (referrer.indexOf("?") !== -1) { 21 | var qs = referrer.substr(referrer.indexOf('?') + 1); 22 | var qs_noanchor = qs.split('#')[0]; 23 | var qsa = qs_noanchor.split('&'); 24 | var keyword = ""; 25 | 26 | for (var i = 0; i < qsa.length; i++) { 27 | var currentParam = qsa[i].split('='); 28 | 29 | if (currentParam.length !== 2) { 30 | continue; 31 | } 32 | 33 | if (currentParam[0] == paramKey) { 34 | keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); 35 | } 36 | } 37 | 38 | if (keyword !== "") { 39 | $(".contents").unmark({ 40 | done: function() { 41 | $(".contents").mark(keyword); 42 | } 43 | }); 44 | } 45 | } 46 | }; 47 | 48 | mark(); 49 | }); 50 | }); 51 | 52 | /* Search term highlighting ------------------------------*/ 53 | 54 | function matchedWords(hit) { 55 | var words = []; 56 | 57 | var hierarchy = hit._highlightResult.hierarchy; 58 | // loop to fetch from lvl0, lvl1, etc. 
59 | for (var idx in hierarchy) { 60 | words = words.concat(hierarchy[idx].matchedWords); 61 | } 62 | 63 | var content = hit._highlightResult.content; 64 | if (content) { 65 | words = words.concat(content.matchedWords); 66 | } 67 | 68 | // return unique words 69 | var words_uniq = [...new Set(words)]; 70 | return words_uniq; 71 | } 72 | 73 | function updateHitURL(hit) { 74 | 75 | var words = matchedWords(hit); 76 | var url = ""; 77 | 78 | if (hit.anchor) { 79 | url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; 80 | } else { 81 | url = hit.url + '?q=' + escape(words.join(" ")); 82 | } 83 | 84 | return url; 85 | } 86 | -------------------------------------------------------------------------------- /docs/jquery.sticky-kit.min.js: -------------------------------------------------------------------------------- 1 | /* Sticky-kit v1.1.2 | WTFPL | Leaf Corcoran 2015 | */ 2 | /* 3 | Source: https://github.com/leafo/sticky-kit 4 | License: MIT 5 | */ 6 | (function(){var b,f;b=this.jQuery||window.jQuery;f=b(window);b.fn.stick_in_parent=function(d){var A,w,J,n,B,K,p,q,k,E,t;null==d&&(d={});t=d.sticky_class;B=d.inner_scrolling;E=d.recalc_every;k=d.parent;q=d.offset_top;p=d.spacer;w=d.bottoming;null==q&&(q=0);null==k&&(k=void 0);null==B&&(B=!0);null==t&&(t="is_stuck");A=b(document);null==w&&(w=!0);J=function(a,d,n,C,F,u,r,G){var v,H,m,D,I,c,g,x,y,z,h,l;if(!a.data("sticky_kit")){a.data("sticky_kit",!0);I=A.height();g=a.parent();null!=k&&(g=g.closest(k)); 7 | if(!g.length)throw"failed to find stick parent";v=m=!1;(h=null!=p?p&&a.closest(p):b("
    "))&&h.css("position",a.css("position"));x=function(){var c,f,e;if(!G&&(I=A.height(),c=parseInt(g.css("border-top-width"),10),f=parseInt(g.css("padding-top"),10),d=parseInt(g.css("padding-bottom"),10),n=g.offset().top+c+f,C=g.height(),m&&(v=m=!1,null==p&&(a.insertAfter(h),h.detach()),a.css({position:"",top:"",width:"",bottom:""}).removeClass(t),e=!0),F=a.offset().top-(parseInt(a.css("margin-top"),10)||0)-q, 8 | u=a.outerHeight(!0),r=a.css("float"),h&&h.css({width:a.outerWidth(!0),height:u,display:a.css("display"),"vertical-align":a.css("vertical-align"),"float":r}),e))return l()};x();if(u!==C)return D=void 0,c=q,z=E,l=function(){var b,l,e,k;if(!G&&(e=!1,null!=z&&(--z,0>=z&&(z=E,x(),e=!0)),e||A.height()===I||x(),e=f.scrollTop(),null!=D&&(l=e-D),D=e,m?(w&&(k=e+u+c>C+n,v&&!k&&(v=!1,a.css({position:"fixed",bottom:"",top:c}).trigger("sticky_kit:unbottom"))),eb&&!v&&(c-=l,c=Math.max(b-u,c),c=Math.min(q,c),m&&a.css({top:c+"px"})))):e>F&&(m=!0,b={position:"fixed",top:c},b.width="border-box"===a.css("box-sizing")?a.outerWidth()+"px":a.width()+"px",a.css(b).addClass(t),null==p&&(a.after(h),"left"!==r&&"right"!==r||h.append(a)),a.trigger("sticky_kit:stick")),m&&w&&(null==k&&(k=e+u+c>C+n),!v&&k)))return v=!0,"static"===g.css("position")&&g.css({position:"relative"}), 10 | a.css({position:"absolute",bottom:d,top:"auto"}).trigger("sticky_kit:bottom")},y=function(){x();return l()},H=function(){G=!0;f.off("touchmove",l);f.off("scroll",l);f.off("resize",y);b(document.body).off("sticky_kit:recalc",y);a.off("sticky_kit:detach",H);a.removeData("sticky_kit");a.css({position:"",bottom:"",top:"",width:""});g.position("position","");if(m)return null==p&&("left"!==r&&"right"!==r||a.insertAfter(h),h.remove()),a.removeClass(t)},f.on("touchmove",l),f.on("scroll",l),f.on("resize", 11 | y),b(document.body).on("sticky_kit:recalc",y),a.on("sticky_kit:detach",H),setTimeout(l,0)}};n=0;for(K=this.length;n 2 | 3 | 5 | 8 | 12 | 13 | 
-------------------------------------------------------------------------------- /docs/pkgdown.js: -------------------------------------------------------------------------------- 1 | /* http://gregfranko.com/blog/jquery-best-practices/ */ 2 | (function($) { 3 | $(function() { 4 | 5 | $('.navbar-fixed-top').headroom(); 6 | 7 | $('body').css('padding-top', $('.navbar').height() + 10); 8 | $(window).resize(function(){ 9 | $('body').css('padding-top', $('.navbar').height() + 10); 10 | }); 11 | 12 | $('[data-toggle="tooltip"]').tooltip(); 13 | 14 | var cur_path = paths(location.pathname); 15 | var links = $("#navbar ul li a"); 16 | var max_length = -1; 17 | var pos = -1; 18 | for (var i = 0; i < links.length; i++) { 19 | if (links[i].getAttribute("href") === "#") 20 | continue; 21 | // Ignore external links 22 | if (links[i].host !== location.host) 23 | continue; 24 | 25 | var nav_path = paths(links[i].pathname); 26 | 27 | var length = prefix_length(nav_path, cur_path); 28 | if (length > max_length) { 29 | max_length = length; 30 | pos = i; 31 | } 32 | } 33 | 34 | // Add class to parent
  • , and enclosing
  • if in dropdown 35 | if (pos >= 0) { 36 | var menu_anchor = $(links[pos]); 37 | menu_anchor.parent().addClass("active"); 38 | menu_anchor.closest("li.dropdown").addClass("active"); 39 | } 40 | }); 41 | 42 | function paths(pathname) { 43 | var pieces = pathname.split("/"); 44 | pieces.shift(); // always starts with / 45 | 46 | var end = pieces[pieces.length - 1]; 47 | if (end === "index.html" || end === "") 48 | pieces.pop(); 49 | return(pieces); 50 | } 51 | 52 | // Returns -1 if not found 53 | function prefix_length(needle, haystack) { 54 | if (needle.length > haystack.length) 55 | return(-1); 56 | 57 | // Special case for length-0 haystack, since for loop won't run 58 | if (haystack.length === 0) { 59 | return(needle.length === 0 ? 0 : -1); 60 | } 61 | 62 | for (var i = 0; i < haystack.length; i++) { 63 | if (needle[i] != haystack[i]) 64 | return(i); 65 | } 66 | 67 | return(haystack.length); 68 | } 69 | 70 | /* Clipboard --------------------------*/ 71 | 72 | function changeTooltipMessage(element, msg) { 73 | var tooltipOriginalTitle=element.getAttribute('data-original-title'); 74 | element.setAttribute('data-original-title', msg); 75 | $(element).tooltip('show'); 76 | element.setAttribute('data-original-title', tooltipOriginalTitle); 77 | } 78 | 79 | if(ClipboardJS.isSupported()) { 80 | $(document).ready(function() { 81 | var copyButton = ""; 82 | 83 | $(".examples, div.sourceCode").addClass("hasCopyButton"); 84 | 85 | // Insert copy buttons: 86 | $(copyButton).prependTo(".hasCopyButton"); 87 | 88 | // Initialize tooltips: 89 | $('.btn-copy-ex').tooltip({container: 'body'}); 90 | 91 | // Initialize clipboard: 92 | var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { 93 | text: function(trigger) { 94 | return trigger.parentNode.textContent; 95 | } 96 | }); 97 | 98 | clipboardBtnCopies.on('success', function(e) { 99 | changeTooltipMessage(e.trigger, 'Copied!'); 100 | e.clearSelection(); 101 | }); 102 | 103 | clipboardBtnCopies.on('error', 
function() { 104 | changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); 105 | }); 106 | }); 107 | } 108 | })(window.jQuery || window.$) 109 | -------------------------------------------------------------------------------- /docs/pkgdown.yml: -------------------------------------------------------------------------------- 1 | pandoc: 2.11.2 2 | pkgdown: 1.6.1 3 | pkgdown_sha: ~ 4 | articles: {} 5 | last_built: 2021-01-22T19:37Z 6 | urls: 7 | reference: http://statswithr.github.io/statsr//reference 8 | article: http://statswithr.github.io/statsr//articles 9 | 10 | -------------------------------------------------------------------------------- /docs/reference/Rplot001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/Rplot001.png -------------------------------------------------------------------------------- /docs/reference/Rplot002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/Rplot002.png -------------------------------------------------------------------------------- /docs/reference/Rplot003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/Rplot003.png -------------------------------------------------------------------------------- /docs/reference/Rplot004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/Rplot004.png -------------------------------------------------------------------------------- /docs/reference/Rplot005.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/Rplot005.png -------------------------------------------------------------------------------- /docs/reference/bandit_posterior-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/bandit_posterior-1.png -------------------------------------------------------------------------------- /docs/reference/bandit_sim-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/bandit_sim-1.png -------------------------------------------------------------------------------- /docs/reference/bayes_inference-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/bayes_inference-1.png -------------------------------------------------------------------------------- /docs/reference/bayes_inference-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/bayes_inference-2.png -------------------------------------------------------------------------------- /docs/reference/bayes_inference-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/bayes_inference-3.png -------------------------------------------------------------------------------- /docs/reference/bayes_inference-4.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/bayes_inference-4.png -------------------------------------------------------------------------------- /docs/reference/bayes_inference-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/bayes_inference-5.png -------------------------------------------------------------------------------- /docs/reference/figures/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/figures/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /docs/reference/inference-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/inference-1.png -------------------------------------------------------------------------------- /docs/reference/inference-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/inference-2.png -------------------------------------------------------------------------------- /docs/reference/inference-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/inference-3.png -------------------------------------------------------------------------------- /docs/reference/plot_bandit_posterior-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/plot_bandit_posterior-1.png -------------------------------------------------------------------------------- /docs/reference/zinc-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/docs/reference/zinc-1.png -------------------------------------------------------------------------------- /docs/sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | http://statswithr.github.io/statsr//index.html 5 | 6 | 7 | http://statswithr.github.io/statsr//reference/BF_app.html 8 | 9 | 10 | http://statswithr.github.io/statsr//reference/allow_shiny.html 11 | 12 | 13 | http://statswithr.github.io/statsr//reference/ames.html 14 | 15 | 16 | http://statswithr.github.io/statsr//reference/ames_sampling_dist.html 17 | 18 | 19 | http://statswithr.github.io/statsr//reference/arbuthnot.html 20 | 21 | 22 | http://statswithr.github.io/statsr//reference/atheism.html 23 | 24 | 25 | http://statswithr.github.io/statsr//reference/bandit_posterior.html 26 | 27 | 28 | http://statswithr.github.io/statsr//reference/bandit_sim.html 29 | 30 | 31 | http://statswithr.github.io/statsr//reference/bayes_inference.html 32 | 33 | 34 | http://statswithr.github.io/statsr//reference/brfss.html 35 | 36 | 37 | http://statswithr.github.io/statsr//reference/calc_streak.html 38 | 39 | 40 | http://statswithr.github.io/statsr//reference/credible_interval_app.html 41 | 42 | 43 | http://statswithr.github.io/statsr//reference/evals.html 44 | 45 | 46 | http://statswithr.github.io/statsr//reference/inference.html 47 | 48 | 49 | http://statswithr.github.io/statsr//reference/kobe_basket.html 50 | 51 | 52 | http://statswithr.github.io/statsr//reference/mlb11.html 
53 | 54 | 55 | http://statswithr.github.io/statsr//reference/nc.html 56 | 57 | 58 | http://statswithr.github.io/statsr//reference/nycflights.html 59 | 60 | 61 | http://statswithr.github.io/statsr//reference/plot_bandit_posterior.html 62 | 63 | 64 | http://statswithr.github.io/statsr//reference/plot_ss.html 65 | 66 | 67 | http://statswithr.github.io/statsr//reference/present.html 68 | 69 | 70 | http://statswithr.github.io/statsr//reference/rep_sample_n.html 71 | 72 | 73 | http://statswithr.github.io/statsr//reference/statsr.html 74 | 75 | 76 | http://statswithr.github.io/statsr//reference/tapwater.html 77 | 78 | 79 | http://statswithr.github.io/statsr//reference/wage.html 80 | 81 | 82 | http://statswithr.github.io/statsr//reference/zinc.html 83 | 84 | 85 | -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | Ames 2 | Arbuthnot 3 | Arbuthnot's 4 | BayesFactor 5 | BRFSS 6 | CLT 7 | Coursera 8 | DS 9 | De 10 | EDA 11 | EWR 12 | Eberly 13 | Gelman 14 | Hadley 15 | Hamermesh 16 | HistData 17 | JSZ 18 | JUI 19 | JZS 20 | Jeffreys 21 | LGA 22 | Lakers 23 | Lindley's 24 | NG 25 | ORL 26 | PennState 27 | README 28 | RStudio 29 | Siow 30 | THMs 31 | TJ 32 | TTHM 33 | Tapwater 34 | Tibble 35 | Trihalomethanes 36 | Wickham 37 | Wooldridge 38 | YYYY 39 | Zellner 40 | ames 41 | amstat 42 | bayes 43 | birthweight 44 | bromodichloromethane 45 | bromoform 46 | ci 47 | datadocumentation 48 | decock 49 | df 50 | dibromochloromethane 51 | english 52 | freqentist 53 | github 54 | http 55 | jse 56 | mlb 57 | perc 58 | posttests 59 | ppb 60 | premie 61 | rscale 62 | se 63 | ss 64 | tapwater 65 | tbl 66 | tibble 67 | trihalomethanes 68 | tthm 69 | twosided 70 | tz 71 | www 72 | -------------------------------------------------------------------------------- /inst/lab.css: -------------------------------------------------------------------------------- 1 | 
.fax-slot-machine::before { 2 | content: "\1f3b0" 3 | } 4 | 5 | body { 6 | counter-reset: question 0 exercise 0; 7 | } 8 | 9 | h1 { 10 | font-family: Arial, Helvetica, sans-serif; 11 | font-weight: bold; 12 | } 13 | 14 | h2 { 15 | font-family: Arial, Helvetica, sans-serif; 16 | font-weight: bold; 17 | margin-top: 24px; 18 | } 19 | 20 | hr { 21 | border: 1px solid #357FAA; 22 | } 23 | 24 | .question, .exercise { 25 | position: relative; 26 | margin: 2em; 27 | padding: 2em 20px 1em 20px; 28 | } 29 | 30 | .question::before, .exercise::before { 31 | position: absolute; 32 | top: -1em; 33 | left: -2em; 34 | width: 7em; 35 | padding: 5px 0; 36 | color: #ffffff; 37 | font-weight: bold; 38 | font-family: "Helvetica Neue", Arial, sans-serif; 39 | text-align: center; 40 | } 41 | 42 | .question { 43 | counter-increment: question; 44 | background: rgb(49, 126, 172); 45 | background: rgba(49, 126, 172, 0.1); 46 | } 47 | 48 | .question::before { 49 | content: "Question " counter(question); 50 | background: #317EAC; 51 | background: rgb(49, 126, 172); 52 | } 53 | 54 | .exercise { 55 | counter-increment: exercise; 56 | background: rgb(92, 184, 92); 57 | background: rgba(92, 184, 92, 0.1); 58 | } 59 | 60 | .exercise::before { 61 | content: "Exercise " counter(exercise); 62 | background: rgb(92, 184, 92); 63 | } 64 | 65 | 66 | .question ul { 67 | counter-reset: choice; 68 | margin-left: 1.5em; 69 | list-style-type: none; 70 | } 71 | 72 | .question li { 73 | margin-top: 20px; 74 | counter-increment: choice; 75 | } 76 | 77 | .question li::before { 78 | content: counter(choice, upper-alpha) '. 
'; 79 | color: #317EAC; 80 | font-weight: bold; 81 | font-family: "Helvetica Neue", Arial, sans-serif; 82 | text-align: left; 83 | width: 2em; 84 | margin-left: -2em; 85 | display: inline-block; 86 | } 87 | 88 | 89 | .instructions { 90 | margin-top: 30px; 91 | /*margin-bottom: 30px;*/ 92 | padding: 10px 10px 0; 93 | border: 1px solid rgb(0, 102, 102); 94 | border: 1px solid rgba(0, 102, 102, 0.2); 95 | border-radius: 5px; 96 | color: rgb(0, 102, 102); 97 | color: rgba(0, 102, 102, 0.8); 98 | background: rgb(204, 255, 255); 99 | background: rgba(204, 255, 255, 0.1); 100 | } 101 | 102 | .license { 103 | margin-top: 30px; 104 | margin-bottom: 30px; 105 | padding: 10px 10px 0; 106 | border: 1px solid rgb(76, 114, 29); 107 | border: 1px solid rgba(76, 114, 29, 0.2); 108 | border-radius: 5px; 109 | color: rgb(76, 114, 29); 110 | color: rgba(76, 114, 29, 0.8); 111 | background: rgb(76, 114, 29); 112 | background: rgba(76, 114, 29, 0.1); 113 | } 114 | 115 | .boxedtext { 116 | background-color: rgb(86, 155, 189); 117 | background-color: rgba(86, 155, 189, 0.2); 118 | padding: 20px; 119 | margin-bottom: 20px; 120 | font-size: 10pt; 121 | } 122 | 123 | 124 | -------------------------------------------------------------------------------- /man/BF_app.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/BF_app.R 3 | \name{BF_app} 4 | \alias{BF_app} 5 | \title{Run the interactive Bayes Factor shiny app} 6 | \usage{ 7 | BF_app() 8 | } 9 | \description{ 10 | This app illustrates how changing the Z score and prior precision 11 | affects the Bayes Factor for testing H1 that the mean is zero 12 | versus H2 that the mean is not zero for data arising from a normal 13 | population. 
Lindley's paradox occurs for large sample sizes 14 | when the Bayes factor favors H1 even though the Z score is large or the 15 | p-value is small enough to reach statistical significance and the values of 16 | the sample mean do not reflect practical significance based on the prior 17 | distribution. 18 | Bartlett's paradox may occur when the prior precision goes to zero, leading 19 | to Bayes factors that favor H1 regardless of the data. 20 | A prior precision of one corresponds to the unit information prior. 21 | } 22 | \examples{ 23 | if (interactive()) { 24 | BF_app() 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /man/allow_shiny.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rstudio.R 3 | \name{allow_shiny} 4 | \alias{allow_shiny} 5 | \title{Simple check to determine if code is being run in RStudio with the shiny runtime 6 | internal function} 7 | \usage{ 8 | allow_shiny() 9 | } 10 | \description{ 11 | Simple check to determine if code is being run in RStudio with the shiny runtime 12 | internal function 13 | } 14 | \keyword{internal} 15 | -------------------------------------------------------------------------------- /man/ames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ames.R 3 | \docType{data} 4 | \name{ames} 5 | \alias{ames} 6 | \title{Housing prices in Ames, Iowa} 7 | \format{ 8 | A tbl_df with 2930 rows and 82 variables: 9 | \describe{ 10 | \item{Order}{Observation number.} 11 | \item{PID}{Parcel identification number - can be used with city web site for parcel review.} 12 | \item{area}{Above grade (ground) living area square feet.} 13 | \item{price}{Sale price in USD.} 14 | \item{MS.SubClass}{Identifies the type of dwelling involved in the sale.}
15 | \item{MS.Zoning}{Identifies the general zoning classification of the sale.} 16 | \item{Lot.Frontage}{Linear feet of street connected to property.} 17 | \item{Lot.Area}{Lot size in square feet.} 18 | \item{Street}{Type of road access to property.} 19 | \item{Alley}{Type of alley access to property.} 20 | \item{Lot.Shape}{General shape of property.} 21 | \item{Land.Contour}{Flatness of the property.} 22 | \item{Utilities}{Type of utilities available.} 23 | \item{Lot.Config}{Lot configuration.} 24 | \item{Land.Slope}{Slope of property.} 25 | \item{Neighborhood}{Physical locations within Ames city limits (map available).} 26 | \item{Condition.1}{Proximity to various conditions.} 27 | \item{Condition.2}{Proximity to various conditions (if more than one is present).} 28 | \item{Bldg.Type}{Type of dwelling.} 29 | \item{House.Style}{Style of dwelling.} 30 | \item{Overall.Qual}{Rates the overall material and finish of the house.} 31 | \item{Overall.Cond}{Rates the overall condition of the house.} 32 | \item{Year.Built}{Original construction date.} 33 | \item{Year.Remod.Add}{Remodel date (same as construction date if no remodeling or additions).} 34 | \item{Roof.Style}{Type of roof.} 35 | \item{Roof.Matl}{Roof material.} 36 | \item{Exterior.1st}{Exterior covering on house.} 37 | \item{Exterior.2nd}{Exterior covering on house (if more than one material).} 38 | \item{Mas.Vnr.Type}{Masonry veneer type.} 39 | \item{Mas.Vnr.Area}{Masonry veneer area in square feet.} 40 | \item{Exter.Qual}{Evaluates the quality of the material on the exterior.} 41 | \item{Exter.Cond}{Evaluates the present condition of the material on the exterior.} 42 | \item{Foundation}{Type of foundation.} 43 | \item{Bsmt.Qual}{Evaluates the height of the basement.} 44 | \item{Bsmt.Cond}{Evaluates the general condition of the basement.} 45 | \item{Bsmt.Exposure}{Refers to walkout or garden level walls.} 46 | \item{BsmtFin.Type.1}{Rating of basement finished area.} 47 | \item{BsmtFin.SF.1}{Type 1 finished 
square feet.} 48 | \item{BsmtFin.Type.2}{Rating of basement finished area (if multiple types).} 49 | \item{BsmtFin.SF.2}{Type 2 finished square feet.} 50 | \item{Bsmt.Unf.SF}{Unfinished square feet of basement area.} 51 | \item{Total.Bsmt.SF}{Total square feet of basement area.} 52 | \item{Heating}{Type of heating.} 53 | \item{Heating.QC}{Heating quality and condition.} 54 | \item{Central.Air}{Central air conditioning.} 55 | \item{Electrical}{Electrical system.} 56 | \item{X1st.Flr.SF}{First Floor square feet.} 57 | \item{X2nd.Flr.SF}{Second floor square feet.} 58 | \item{Low.Qual.Fin.SF}{Low quality finished square feet (all floors).} 59 | \item{Bsmt.Full.Bath}{Basement full bathrooms.} 60 | \item{Bsmt.Half.Bath}{Basement half bathrooms.} 61 | \item{Full.Bath}{Full bathrooms above grade.} 62 | \item{Half.Bath}{Half baths above grade.} 63 | \item{Bedroom.AbvGr}{Bedrooms above grade (does NOT include basement bedrooms).} 64 | \item{Kitchen.AbvGr}{Kitchens above grade.} 65 | \item{Kitchen.Qual}{Kitchen quality.} 66 | \item{TotRms.AbvGrd}{Total rooms above grade (does not include bathrooms).} 67 | \item{Functional}{Home functionality (Assume typical unless deductions are warranted).} 68 | \item{Fireplaces}{Number of fireplaces.} 69 | \item{Fireplace.Qu}{Fireplace quality.} 70 | \item{Garage.Type}{Garage location.} 71 | \item{Garage.Yr.Blt}{Year garage was built.} 72 | \item{Garage.Finish}{Interior finish of the garage.} 73 | \item{Garage.Cars}{Size of garage in car capacity.} 74 | \item{Garage.Area}{Size of garage in square feet.} 75 | \item{Garage.Qual}{Garage quality.} 76 | \item{Garage.Cond}{Garage condition.} 77 | \item{Paved.Drive}{Paved driveway.} 78 | \item{Wood.Deck.SF}{Wood deck area in square feet.} 79 | \item{Open.Porch.SF}{Open porch area in square feet.} 80 | \item{Enclosed.Porch}{Enclosed porch area in square feet.} 81 | \item{X3Ssn.Porch}{Three season porch area in square feet.} 82 | \item{Screen.Porch}{Screen porch area in square feet.} 83 | 
\item{Pool.Area}{Pool area in square feet.} 84 | \item{Pool.QC}{Pool quality.} 85 | \item{Fence}{Fence quality.} 86 | \item{Misc.Feature}{Miscellaneous feature not covered in other categories.} 87 | \item{Misc.Val}{Dollar value of miscellaneous feature.} 88 | \item{Mo.Sold}{Month Sold (MM).} 89 | \item{Yr.Sold}{Year Sold (YYYY).} 90 | \item{Sale.Type}{Type of sale.} 91 | \item{Sale.Condition}{Condition of sale.} 92 | } 93 | } 94 | \source{ 95 | De Cock, Dean. "Ames, Iowa: Alternative to the Boston housing data as 96 | an end of semester regression project." Journal of Statistics Education 19.3 (2011). 97 | } 98 | \usage{ 99 | ames 100 | } 101 | \description{ 102 | Data set contains information from the Ames Assessor's Office used in computing 103 | assessed values for individual residential properties sold in Ames, IA from 2006 104 | to 2010. See http://www.amstat.org/publications/jse/v19n3/decock/datadocumentation.txt 105 | for detailed variable descriptions. 106 | } 107 | \keyword{datasets} 108 | -------------------------------------------------------------------------------- /man/ames_sampling_dist.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ames_sampling_dist.R 3 | \name{ames_sampling_dist} 4 | \alias{ames_sampling_dist} 5 | \title{Simulate Sampling Distribution} 6 | \usage{ 7 | ames_sampling_dist() 8 | } 9 | \description{ 10 | Run the interactive ames sampling distribution shiny app to 11 | illustrate sampling distributions using variables from the `ames` 12 | dataset. 
13 | } 14 | \examples{ 15 | if (interactive()) { 16 | ames_sampling_dist() 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /man/arbuthnot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/arbuthnot.R 3 | \docType{data} 4 | \name{arbuthnot} 5 | \alias{arbuthnot} 6 | \title{Male and female births in London} 7 | \format{ 8 | A tbl_df with 82 rows and 3 variables: 9 | \describe{ 10 | \item{year}{year, ranging from 1629 to 1710} 11 | \item{boys}{number of male christenings (births)} 12 | \item{girls}{number of female christenings (births)} 13 | } 14 | } 15 | \source{ 16 | These data are excerpted from the \code{\link[HistData]{Arbuthnot}} 17 | data set in the HistData package. 18 | } 19 | \usage{ 20 | arbuthnot 21 | } 22 | \description{ 23 | Arbuthnot's data describes male and female christenings (births) for 24 | London from 1629 to 1710. 25 | } 26 | \details{ 27 | John Arbuthnot (1710) used these time series data to carry out the first 28 | known significance test. During every one of the 82 years, there were more 29 | male christenings than female christenings. As Arbuthnot wondered, 30 | we might also wonder if this could be due to chance, or whether it meant 31 | the birth ratio was not actually 1:1.
32 | } 33 | \keyword{datasets} 34 | -------------------------------------------------------------------------------- /man/atheism.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/atheism.R 3 | \docType{data} 4 | \name{atheism} 5 | \alias{atheism} 6 | \title{Atheism in the world data} 7 | \format{ 8 | A tbl_df with 88032 rows and 3 variables: 9 | \describe{ 10 | \item{nationality}{Country of the individual surveyed.} 11 | \item{response}{A categorical variable with two levels: atheist and non-atheist.} 12 | \item{year}{Year in which the person was surveyed.} 13 | } 14 | } 15 | \source{ 16 | \href{https://github.com/OpenIntroStat/oilabs/blob/master/data-raw/atheism/Global_INDEX_of_Religiosity_and_Atheism_PR__6.pdf}{WIN-Gallup International Press Release} 17 | } 18 | \usage{ 19 | atheism 20 | } 21 | \description{ 22 | Survey results on atheism across several countries and years. Each row 23 | represents a single respondent. 
24 | } 25 | \keyword{datasets} 26 | -------------------------------------------------------------------------------- /man/bandit_posterior.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bandit_posterior.R 3 | \name{bandit_posterior} 4 | \alias{bandit_posterior} 5 | \title{bandit posterior} 6 | \usage{ 7 | bandit_posterior( 8 | data, 9 | prior = c(m1_good = 0.5, m2_good = 0.5), 10 | win_probs = c(good = 1/2, bad = 1/3) 11 | ) 12 | } 13 | \arguments{ 14 | \item{data}{data frame containing win loss data} 15 | 16 | \item{prior}{prior vector containing the probabilities of Machine 1 and Machine 2 being good, defaults to 0.5 and 0.5 respectively.} 17 | 18 | \item{win_probs}{vector containing the probabilities of winning on the good and bad machine respectively.} 19 | } 20 | \value{ 21 | A vector containing the posterior probability of Machine 1 and Machine 2 being the good machine. 22 | } 23 | \description{ 24 | Utility function for calculating the posterior probability of each machine being "good" in 25 | a two-armed bandit problem. The calculated result is based on observed win loss data, prior belief about 26 | which machine is good, and the probability of the good and bad machine paying out. 27 | } 28 | \examples{ 29 | data = data.frame(machine = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), 30 | outcome = c("W", "L", "W", "L", "L", "W", "L", "L", "L", "W")) 31 | bandit_posterior(data) 32 | plot_bandit_posterior(data) 33 | 34 | } 35 | \seealso{ 36 | \code{\link{bandit_sim}} to generate data and 37 | \code{\link{plot_bandit_posterior}} to visualize.
38 | } 39 | -------------------------------------------------------------------------------- /man/bandit_sim.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bandit_sim.R 3 | \name{bandit_sim} 4 | \alias{bandit_sim} 5 | \title{Run the Bandit Simulation shiny app} 6 | \usage{ 7 | bandit_sim() 8 | } 9 | \description{ 10 | Simulate data from a two-armed bandit (two slot machines) by clicking 11 | on the images for Machine 1 or Machine 2 and guess/learn which machine 12 | has the higher probability of winning as the number of 13 | outcomes of wins and losses accumulate. 14 | } 15 | \examples{ 16 | if (interactive()) { 17 | # run interactive shiny app to generate wins and losses 18 | bandit_sim() 19 | } 20 | # paste data from the shiny app into a variable 21 | data = data.frame( 22 | machine = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 23 | 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 24 | 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 25 | 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 26 | 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 27 | 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 28 | 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 29 | 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L), 30 | outcome = c("W", "W", "W", "L", "W", "W", "W", "L", "W", "L", "W", "L", 31 | "L", "L", "W", "L", "W", "L", "L", "L", "W", "W", "W", "L", "L", "L", 32 | "L", "L", "W", "W", "L", "L", "W", "L", "L", "W", "L", "L", "W", "L", 33 | "L", "L", "L", "L", "W", "L", "L", "W", "W", "W", "W", "L", "L", "L", 34 | "L", "L", "L", "W", "L", "W", "L", "W", "L", "L", "L", "L", "L", "L", "L", 35 | "L", "L", "L", "W", "W", "W", "L", "W", "L", "L", "L", "L", "L", "L", "L", 36 | "L", "L", "L", "W", "W", "W", "W", "W", "L", "W", "W", "L", "W", "L", "L", 37 | "L", "L", "L", 
"W", "L", "W", "L", "L", "L", "W", "W", "W", "W", "L", "L", 38 | "W", "L", "W", "L", "L", "W")) 39 | bandit_posterior(data) 40 | plot_bandit_posterior(data) 41 | 42 | } 43 | \seealso{ 44 | \code{\link{bandit_posterior}} and \code{\link{plot_bandit_posterior}} 45 | } 46 | -------------------------------------------------------------------------------- /man/brfss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/brfss.R 3 | \docType{data} 4 | \name{brfss} 5 | \alias{brfss} 6 | \title{Behavioral Risk Factor Surveillance System 2013 (Subset)} 7 | \format{ 8 | A tbl_df with with 5000 rows and 6 variables: 9 | \describe{ 10 | \item{weight}{Weight in pounds.} 11 | \item{height}{Height in inches.} 12 | \item{sex}{Sex} 13 | \item{exercise}{Any exercise in the last 30 days} 14 | \item{fruit_per_day}{Number of servings of fruit consumed per day.} 15 | \item{vege_per_day}{Number of servings of dark green vegetables consumed per day.} 16 | } 17 | } 18 | \source{ 19 | Centers for Disease Control and Prevention (CDC). Behavioral Risk Factor Surveillance System 20 | Survey Data. Atlanta, Georgia: U.S. Department of Health and Human Services, Centers for 21 | Disease Control and Prevention, 2013. 22 | } 23 | \usage{ 24 | brfss 25 | } 26 | \description{ 27 | This data set is a small subset of BRFSS results from the 2013 survey, each row represents an individual respondent. 
28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /man/calc_streak.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/calc_streak.R 3 | \name{calc_streak} 4 | \alias{calc_streak} 5 | \title{Calculate hitting streaks} 6 | \usage{ 7 | calc_streak(x) 8 | } 9 | \arguments{ 10 | \item{x}{A data frame or character vector of hits (\code{"H"}) and misses (\code{"M"}).} 11 | } 12 | \value{ 13 | A data frame with one column, \code{length}, containing the length of each hit streak. 14 | } 15 | \description{ 16 | Calculate hitting streaks 17 | } 18 | \examples{ 19 | data(kobe_basket) 20 | calc_streak(kobe_basket$shot) 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/credible_interval_app.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/credible_interval.R 3 | \name{credible_interval_app} 4 | \alias{credible_interval_app} 5 | \title{Credible Interval shiny app} 6 | \usage{ 7 | credible_interval_app() 8 | } 9 | \description{ 10 | Run the `shiny` credible interval app to generate credible 11 | intervals under the prior or posterior distribution for 12 | Beta, Gamma and Gaussian families. Sliders are used to 13 | adjust the hyperparameters in the distribution so that one 14 | may see how the resulting credible intervals and plotted 15 | distributions change. 
16 | } 17 | \examples{ 18 | if (interactive()) { 19 | credible_interval_app() 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /man/evals.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/evals.R 3 | \docType{data} 4 | \name{evals} 5 | \alias{evals} 6 | \title{Teachers evaluations at the University of Texas at Austin} 7 | \format{ 8 | A data frame with 463 rows and 21 variables: 9 | \describe{ 10 | \item{score}{Average professor evaluation score: (1) very unsatisfactory - (5) excellent} 11 | \item{rank}{Rank of professor: teaching, tenure track, tenure} 12 | \item{ethnicity}{Ethnicity of professor: not minority, minority} 13 | \item{gender}{Gender of professor: female, male} 14 | \item{language}{Language of school where professor received education: english or non-english} 15 | \item{age}{Age of professor} 16 | \item{cls_perc_eval}{Percent of students in class who completed evaluation} 17 | \item{cls_did_eval}{Number of students in class who completed evaluation} 18 | \item{cls_students}{Total number of students in class} 19 | \item{cls_level}{Class level: lower, upper} 20 | \item{cls_profs}{Number of professors teaching sections in course in sample: single, multiple} 21 | \item{cls_credits}{Number of credits of class: one credit (lab, PE, etc.), multi credit} 22 | \item{bty_f1lower}{Beauty rating of professor from lower level female: (1) lowest - (10) highest} 23 | \item{bty_f1upper}{Beauty rating of professor from upper level female: (1) lowest - (10) highest} 24 | \item{bty_f2upper}{Beauty rating of professor from second upper level female: (1) lowest - (10) highest} 25 | \item{bty_m1lower}{Beauty rating of professor from lower level male: (1) lowest - (10) highest} 26 | \item{bty_m1upper}{Beauty rating of professor from upper level male: (1) lowest - (10) highest} 27 | 
\item{bty_m2upper}{Beauty rating of professor from second upper level male: (1) lowest - (10) highest} 28 | \item{bty_avg}{Average beauty rating of professor} 29 | \item{pic_outfit}{Outfit of professor in picture: not formal, formal} 30 | \item{pic_color}{Color of professor's picture: color, black & white} 31 | } 32 | } 33 | \source{ 34 | These data appear in Hamermesh DS, and Parker A. 2005. Beauty in the 35 | classroom: instructors' pulchritude and putative pedagogical productivity. Economics of Education Review 36 | 24(4):369-376. 37 | } 38 | \usage{ 39 | evals 40 | } 41 | \description{ 42 | The data were gathered from end of semester student evaluations for a large 43 | sample of professors from the University of Texas at Austin (variables beginning 44 | with \code{cls}). In addition, six students rated the professors' physical 45 | appearance (variables beginning with \code{bty}). (This is a slightly modified 46 | version of the original data set that was released as part of the replication 47 | data for Data Analysis Using Regression and Multilevel/Hierarchical Models 48 | (Gelman and Hill, 2007).)
49 | } 50 | \keyword{datasets} 51 | -------------------------------------------------------------------------------- /man/figures/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsWithR/statsr/9cb9edad2f60a21308e13f9c52a70d1dfcbe423a/man/figures/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /man/inference.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/inference.R 3 | \name{inference} 4 | \alias{inference} 5 | \title{Hypothesis tests and confidence intervals} 6 | \usage{ 7 | inference( 8 | y, 9 | x = NULL, 10 | data, 11 | type = c("ci", "ht"), 12 | statistic = c("mean", "median", "proportion"), 13 | success = NULL, 14 | order = NULL, 15 | method = c("theoretical", "simulation"), 16 | null = NULL, 17 | alternative = c("less", "greater", "twosided"), 18 | sig_level = 0.05, 19 | conf_level = 0.95, 20 | boot_method = c("perc", "se"), 21 | nsim = 15000, 22 | seed = NULL, 23 | verbose = TRUE, 24 | show_var_types = verbose, 25 | show_summ_stats = verbose, 26 | show_eda_plot = verbose, 27 | show_inf_plot = verbose, 28 | show_res = verbose 29 | ) 30 | } 31 | \arguments{ 32 | \item{y}{Response variable, can be numerical or categorical} 33 | 34 | \item{x}{Explanatory variable, categorical (optional)} 35 | 36 | \item{data}{Name of data frame that y and x are in} 37 | 38 | \item{type}{of inference; "ci" (confidence interval) or "ht" (hypothesis test)} 39 | 40 | \item{statistic}{parameter to estimate: mean, median, or proportion} 41 | 42 | \item{success}{which level of the categorical variable to call "success", i.e. 
do inference on} 43 | 44 | \item{order}{when x is given, order of levels of x in which to subtract parameters} 45 | 46 | \item{method}{of inference; "theoretical" (CLT based) or "simulation" (randomization/bootstrap)} 47 | 48 | \item{null}{null value for a hypothesis test} 49 | 50 | \item{alternative}{direction of the alternative hypothesis; "less", "greater", or "twosided"} 51 | 52 | \item{sig_level}{significance level, value between 0 and 1 (used only for ANOVA to determine if posttests are necessary)} 53 | 54 | \item{conf_level}{confidence level, value between 0 and 1} 55 | 56 | \item{boot_method}{bootstrap method; "perc" (percentile) or "se" (standard error)} 57 | 58 | \item{nsim}{number of simulations} 59 | 60 | \item{seed}{seed to be set, default is NULL} 61 | 62 | \item{verbose}{whether output should be verbose or not, default is TRUE} 63 | 64 | \item{show_var_types}{print variable types, set to verbose by default} 65 | 66 | \item{show_summ_stats}{print summary stats, set to verbose by default} 67 | 68 | \item{show_eda_plot}{print EDA plot, set to verbose by default} 69 | 70 | \item{show_inf_plot}{print inference plot, set to verbose by default} 71 | 72 | \item{show_res}{print results, set to verbose by default} 73 | } 74 | \value{ 75 | Results of inference task performed 76 | } 77 | \description{ 78 | Hypothesis tests and confidence intervals 79 | } 80 | \examples{ 81 | data(tapwater) 82 | 83 | # Calculate 95\% CI using quantiles from a Student t distribution 84 | inference(tthm, data=tapwater, 85 | statistic="mean", 86 | type="ci", 87 | method="theoretical") 88 | 89 | inference(tthm, data=tapwater, 90 | statistic="mean", 91 | type="ci", 92 | boot_method = "perc", 93 | method="simulation") 94 | 95 | # Inference for a proportion 96 | # Calculate 95\% confidence intervals for the proportion of atheists 97 | 98 | data("atheism") 99 | library("dplyr") 100 | us12 <- atheism \%>\% 101 | filter(nationality == "United States" , atheism$year == "2012") 102 | 
inference(y = response, data = us12, statistic = "proportion", 103 | type = "ci", 104 | method = "theoretical", 105 | success = "atheist") 106 | 107 | } 108 | -------------------------------------------------------------------------------- /man/kobe_basket.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/kobe_basket.R 3 | \docType{data} 4 | \name{kobe_basket} 5 | \alias{kobe_basket} 6 | \title{Kobe Bryant basketball performance} 7 | \format{ 8 | A data frame with 133 rows and 6 variables: 9 | \describe{ 10 | \item{vs}{A categorical vector, ORL if the Los Angeles Lakers played 11 | against Orlando} 12 | \item{game}{A numerical vector, game in the 2009 NBA finals} 13 | \item{quarter}{A categorical vector, quarter in the game, OT stands for 14 | overtime} 15 | \item{time}{A character vector, time at which Kobe took a shot} 16 | \item{description}{A character vector, description of the shot} 17 | \item{shot}{A categorical vector, H if the shot was a hit, M if the shot 18 | was a miss} 19 | } 20 | } 21 | \usage{ 22 | kobe_basket 23 | } 24 | \description{ 25 | Data from the five games the Los Angeles Lakers played against the Orlando 26 | Magic in the 2009 NBA finals. 27 | } 28 | \details{ 29 | Each row represents a shot Kobe Bryant took during the five games of the 30 | 2009 NBA finals. Kobe Bryant's performance earned him the title of Most 31 | Valuable Player and many spectators commented on how he appeared to show 32 | a hot hand. 
33 | } 34 | \keyword{datasets} 35 | -------------------------------------------------------------------------------- /man/mlb11.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/mlb11.R 3 | \docType{data} 4 | \name{mlb11} 5 | \alias{mlb11} 6 | \title{Major League Baseball team data} 7 | \format{ 8 | A data frame with 30 rows and 12 variables: 9 | \describe{ 10 | \item{team}{Team name.} 11 | \item{runs}{Number of runs.} 12 | \item{at_bats}{Number of at bats.} 13 | \item{hits}{Number of hits.} 14 | \item{homeruns}{Number of home runs.} 15 | \item{bat_avg}{Batting average.} 16 | \item{strikeouts}{Number of strikeouts.} 17 | \item{stolen_bases}{Number of stolen bases.} 18 | \item{wins}{Number of wins.} 19 | \item{new_onbase}{Newer variable: on-base percentage, a measure of 20 | how often a batter reaches base for any reason other than a fielding error, 21 | fielder's choice, dropped/uncaught third strike, fielder's obstruction, or 22 | catcher's interference.} 23 | \item{new_slug}{Newer variable: slugging percentage, popular measure of the 24 | power of a hitter calculated as the total bases divided by at bats.} 25 | \item{new_obs}{Newer variable: on-base plus slugging, calculated as the sum of the on-base and slugging percentages.} 26 | } 27 | } 28 | \source{ 29 | \href{https://www.mlb.com/}{mlb.com} 30 | } 31 | \usage{ 32 | mlb11 33 | } 34 | \description{ 35 | Data from all 30 Major League Baseball teams from the 2011 season. 
36 | } 37 | \keyword{datasets} 38 | -------------------------------------------------------------------------------- /man/nc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/nc.R 3 | \docType{data} 4 | \name{nc} 5 | \alias{nc} 6 | \title{North Carolina births} 7 | \format{ 8 | A tbl_df with 1000 rows and 13 variables: 9 | \describe{ 10 | \item{fage}{father's age in years} 11 | \item{mage}{mother's age in years} 12 | \item{mature}{maturity status of mother} 13 | \item{weeks}{length of pregnancy in weeks} 14 | \item{premie}{whether the birth was classified as premature (premie) or full-term} 15 | \item{visits}{number of hospital visits during pregnancy} 16 | \item{marital}{whether mother is `married` or `not married` at birth} 17 | \item{gained}{weight gained by mother during pregnancy in pounds} 18 | \item{weight}{weight of the baby at birth in pounds} 19 | \item{lowbirthweight}{whether baby was classified as low birthweight (`low`) or not (`not low`)} 20 | \item{gender}{gender of the baby, `female` or `male`} 21 | \item{habit}{status of the mother as a `nonsmoker` or a `smoker`} 22 | \item{whitemom}{whether mom is `white` or `not white`} 23 | } 24 | } 25 | \source{ 26 | State of North Carolina. 27 | } 28 | \usage{ 29 | nc 30 | } 31 | \description{ 32 | In 2004, the state of North Carolina released a large data set containing 33 | information on births recorded in this state. This data set is useful to 34 | researchers studying the relation between habits and practices of expectant 35 | mothers and the birth of their children. We will work with a random sample of 36 | observations from this data set. 
37 | } 38 | \keyword{datasets} 39 | -------------------------------------------------------------------------------- /man/nycflights.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/nycflights.R 3 | \docType{data} 4 | \name{nycflights} 5 | \alias{nycflights} 6 | \title{Flights data} 7 | \format{ 8 | A tbl_df with 32,735 rows and 16 variables: 9 | \describe{ 10 | \item{year,month,day}{Date of departure} 11 | \item{dep_time,arr_time}{Departure and arrival times, local tz.} 12 | \item{dep_delay,arr_delay}{Departure and arrival delays, in minutes. 13 | Negative times represent early departures/arrivals.} 14 | \item{hour,minute}{Time of departure broken into hour and minutes} 15 | \item{carrier}{Two letter carrier abbreviation. See \code{airlines} in the 16 | \code{nycflights13} package for more information} 17 | \item{tailnum}{Plane tail number} 18 | \item{flight}{Flight number} 19 | \item{origin,dest}{Origin and destination. See \code{airports} in the 20 | \code{nycflights13} package for more information, or google the airport code.} 21 | \item{air_time}{Amount of time spent in the air} 22 | \item{distance}{Distance flown} 23 | } 24 | } 25 | \source{ 26 | Hadley Wickham (2014). \code{nycflights13}: Data about flights departing 27 | NYC in 2013. R package version 0.1. 28 | \url{https://CRAN.R-project.org/package=nycflights13} 29 | } 30 | \usage{ 31 | nycflights 32 | } 33 | \description{ 34 | On-time data for a random sample of flights that departed NYC (i.e. JFK, LGA or EWR) 35 | in 2013.
36 | } 37 | \keyword{datasets} 38 | -------------------------------------------------------------------------------- /man/plot_bandit_posterior.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bandit_posterior.R 3 | \name{plot_bandit_posterior} 4 | \alias{plot_bandit_posterior} 5 | \title{plot_bandit_posterior} 6 | \usage{ 7 | plot_bandit_posterior( 8 | data, 9 | prior = c(m1_good = 0.5, m2_good = 0.5), 10 | win_probs = c(good = 1/2, bad = 1/3) 11 | ) 12 | } 13 | \arguments{ 14 | \item{data}{data frame containing win loss data} 15 | 16 | \item{prior}{prior vector containing the probabilities of Machine 1 and Machine 2 being good, defaults to 50-50.} 17 | 18 | \item{win_probs}{vector containing the probabilities of winning on the good and bad machine respectively.} 19 | } 20 | \description{ 21 | Generates a plot that shows the bandit posterior values as they are sequentially updated 22 | by the provided win / loss data. 23 | } 24 | \examples{ 25 | # capture data from the `shiny` app `bandit_sim`. 
26 | data = data.frame(machine = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L), 27 | outcome = c("W", "L", "W", "L", "L", "W", "L", "L", "L", "W")) 28 | plot_bandit_posterior(data) 29 | 30 | } 31 | \seealso{ 32 | \code{\link{bandit_sim}} to generate data to use below 33 | } 34 | -------------------------------------------------------------------------------- /man/plot_ss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_ss.R 3 | \name{plot_ss} 4 | \alias{plot_ss} 5 | \title{plot_ss} 6 | \usage{ 7 | plot_ss(x, y, data, showSquares = FALSE, leastSquares = FALSE) 8 | } 9 | \arguments{ 10 | \item{x}{the name of numerical vector 1 on x-axis} 11 | 12 | \item{y}{the name of numerical vector 2 on y-axis} 13 | 14 | \item{data}{the dataframe in which x and y can be found} 15 | 16 | \item{showSquares}{logical option to show boxes representing the squared residuals} 17 | 18 | \item{leastSquares}{logical option to bypass point entry and automatically draw the least squares line} 19 | } 20 | \description{ 21 | An interactive shiny app that will generate a scatterplot of two variables, then 22 | allow the user to click the plot in two locations to draw a best fitting line. 23 | Residuals are drawn by default; boxes representing the squared residuals are 24 | optional. 
25 | } 26 | \examples{ 27 | \dontrun{plot_ss} 28 | } 29 | -------------------------------------------------------------------------------- /man/present.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/present.R 3 | \docType{data} 4 | \name{present} 5 | \alias{present} 6 | \title{Male and female births in the US} 7 | \format{ 8 | A tbl_df with 74 rows and 3 variables: 9 | \describe{ 10 | \item{year}{year, ranging from 1940 to 2013} 11 | \item{boys}{number of male births} 12 | \item{girls}{number of female births} 13 | } 14 | } 15 | \source{ 16 | Data up to 2002 appear in Mathews TJ, and Hamilton BE. 2005. Trend 17 | analysis of the sex ratio at birth in the United States. National Vital 18 | Statistics Reports 53(20):1-17. Data for 2003 - 2013 have been collected 19 | from annual National Vital Statistics Reports published by the US Department of 20 | Health and Human Services, Centers for Disease Control and Prevention, 21 | National Center for Health Statistics. 22 | } 23 | \usage{ 24 | present 25 | } 26 | \description{ 27 | Counts of the total number of male and female births in the United States from 28 | 1940 to 2013. 
29 | } 30 | \keyword{datasets} 31 | -------------------------------------------------------------------------------- /man/rep_sample_n.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rep_sample_n.R 3 | \name{rep_sample_n} 4 | \alias{rep_sample_n} 5 | \title{Repeating Sampling from a Tibble} 6 | \usage{ 7 | rep_sample_n(tbl, size, replace = FALSE, reps = 1) 8 | } 9 | \arguments{ 10 | \item{tbl}{tbl of data.} 11 | 12 | \item{size}{The number of rows to select.} 13 | 14 | \item{replace}{Sample with or without replacement?} 15 | 16 | \item{reps}{The number of samples to collect.} 17 | } 18 | \value{ 19 | A tbl_df that aggregates all created samples, with the addition of a \code{replicate} column that the tbl_df is also grouped by 20 | } 21 | \description{ 22 | Repeating Sampling from a Tibble 23 | } 24 | \examples{ 25 | data(nc) 26 | rep_sample_n(nc, size=10, replace=FALSE, reps=1) 27 | } 28 | -------------------------------------------------------------------------------- /man/statsr.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/statsr.R 3 | \docType{package} 4 | \name{statsr} 5 | \alias{statsr} 6 | \title{statsr: A companion package for Statistics with R} 7 | \description{ 8 | R package to support the online open access book "An Introduction 9 | to Bayesian Thinking" available at 10 | \url{https://statswithr.github.io/book/} and videos for the Coursera "Statistics with 11 | R" Specialization. The package includes data sets, functions 12 | and Shiny Applications for learning frequentist and Bayesian 13 | statistics with R. 
The two main functions for inference and decision making are 14 | `inference` and `bayes_inference` which support 15 | confidence/credible intervals and hypothesis testing with one sample or two samples 16 | from Gaussian and Bernoulli populations. Shiny apps are used to illustrate how prior 17 | hyperparameters or changes in the data may influence posterior distributions. 18 | } 19 | \details{ 20 | See \url{https://github.com/StatsWithR/statsr} for the development version and 21 | additional information, or for additional background and illustrations of functions 22 | see the online book \url{https://statswithr.github.io/book/}. 23 | } 24 | -------------------------------------------------------------------------------- /man/tapwater.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tapwater.R 3 | \docType{data} 4 | \name{tapwater} 5 | \alias{tapwater} 6 | \title{Total Trihalomethanes in Tapwater} 7 | \format{ 8 | A dataframe with 28 rows and 6 variables: 9 | \describe{ 10 | \item{date}{Date of collection} 11 | \item{tthm}{average total trihalomethanes in ppb } 12 | \item{samples}{number of samples} 13 | \item{nondetects}{number of samples where tthm not detected (0)} 14 | \item{min}{min tthm in ppb in samples} 15 | \item{max}{max tthm in ppb in samples} 16 | } 17 | } 18 | \source{ 19 | National Drinking Water Database for Durham, NC. \url{https://www.ewg.org} 20 | } 21 | \usage{ 22 | tapwater 23 | } 24 | \description{ 25 | Trihalomethanes are formed as a by-product predominantly when chlorine is used to disinfect water 26 | for drinking. They result from the reaction of chlorine or bromine with 27 | organic matter present in the water being treated. 28 | THMs have been associated through epidemiological studies 29 | with some adverse health effects and many are considered carcinogenic.
30 | In the United States, the EPA limits 31 | the total concentration of the four chief constituents (chloroform, bromoform, bromodichloromethane, and dibromochloromethane), referred to as 32 | total trihalomethanes (TTHM), to 80 parts per billion in treated water. 33 | } 34 | \keyword{datasets} 35 | -------------------------------------------------------------------------------- /man/wage.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wage.R 3 | \docType{data} 4 | \name{wage} 5 | \alias{wage} 6 | \title{Wage data} 7 | \format{ 8 | A tbl_df with 935 rows and 17 variables: 9 | \describe{ 10 | \item{wage}{weekly earnings (dollars)} 11 | \item{hours}{average hours worked per week} 12 | \item{iq}{IQ score} 13 | \item{kww}{Knowledge of world work score} 14 | \item{educ}{years of education} 15 | \item{exper}{years of work experience} 16 | \item{tenure}{years with current employer} 17 | \item{age}{age in years} 18 | \item{married}{=1 if married} 19 | \item{black}{=1 if black} 20 | \item{south}{=1 if live in south} 21 | \item{urban}{=1 if live in a Standard Metropolitan Statistical Area } 22 | \item{sibs}{number of siblings} 23 | \item{brthord}{birth order} 24 | \item{meduc}{mother's education (years)} 25 | \item{feduc}{father's education (years)} 26 | \item{lwage}{natural log of wage} 27 | } 28 | } 29 | \source{ 30 | Jeffrey M. Wooldridge (2000). Introductory Econometrics: A Modern Approach. South-Western College Publishing. 31 | } 32 | \usage{ 33 | wage 34 | } 35 | \description{ 36 | The data were gathered as part of a random sample of 935 respondents throughout the United States.
37 | } 38 | \keyword{datasets} 39 | -------------------------------------------------------------------------------- /man/zinc.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/zinc.R 3 | \docType{data} 4 | \name{zinc} 5 | \alias{zinc} 6 | \title{Zinc Concentration in Water} 7 | \format{ 8 | A data frame with 10 observations on the following 4 variables. 9 | \describe{ 10 | \item{\code{location}}{sample number} 11 | \item{\code{bottom}}{zinc concentration in bottom water} 12 | \item{\code{surface}}{zinc concentration in surface water} 13 | \item{\code{difference}}{difference between zinc concentration at the bottom and surface} 14 | } 15 | } 16 | \source{ 17 | \href{https://online.stat.psu.edu/stat500/sites/stat500/files/data/zinc_conc.txt}{PennState Eberly College of Science Online Courses} 18 | } 19 | \usage{ 20 | zinc 21 | } 22 | \description{ 23 | Trace metals in drinking water affect the flavor and 24 | an unusually high concentration can pose a health 25 | hazard. Ten pairs of data were taken measuring zinc 26 | concentration in bottom water and surface water. 
27 | } 28 | \examples{ 29 | data(zinc) 30 | str(zinc) 31 | plot(bottom ~ surface, data=zinc) 32 | # use paired t-test to test if difference in means is zero 33 | 34 | } 35 | \keyword{datasets} 36 | -------------------------------------------------------------------------------- /statsr.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | PackageRoxygenize: rd,collate,namespace 19 | -------------------------------------------------------------------------------- /tests/spelling.R: -------------------------------------------------------------------------------- 1 | if(requireNamespace('spelling', quietly = TRUE)) 2 | spelling::spell_check_test(vignettes = TRUE, error = FALSE, 3 | skip_on_cran = TRUE) 4 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(statsr) 3 | 4 | test_check("statsr") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-bayes_inference.R: -------------------------------------------------------------------------------- 1 | test_that("multiplication works", { # issue 15 2 | # 4.1.5 Example: TTHM in Tapwater 3 | data(tapwater) 4 | # prior hyperparameters 5 | m_0 = 35; n_0 = 25; s2_0 = 156.25; v_0 = n_0 - 1 6 | # sample summaries 7 | Y = tapwater$tthm 8 | ybar = mean(Y) 9 | s2 = var(Y) 10 | n = length(Y) 11 | # posterior hyperparameters 12 | n_n = n_0 + n 13 | m_n = (n*ybar + n_0*m_0)/n_n 14 | v_n = v_0 + n 15 | s2_n = ((n-1)*s2
+ v_0*s2_0 + n_0*n*(m_0 - ybar)^2/n_n)/v_n 16 | ci = m_n + qt(c(0.025, 0.975), v_n)*sqrt(s2_n/n_n) 17 | out = bayes_inference(tthm, data=tapwater, prior="NG", 18 | mu_0 = m_0, n_0=n_0, s_0 = sqrt(s2_0), v_0 = v_0, 19 | stat="mean", type="ci", method="theoretical", 20 | show_res=TRUE, show_summ=TRUE, show_plot=FALSE) 21 | expect_equal(m_n, out$post_mean) 22 | expect_equal(ci, out$ci) 23 | }) 24 | --------------------------------------------------------------------------------