├── .Rbuildignore ├── .Renviron ├── .Rprofile ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ └── pkgdown.yaml ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── R └── tidymodels.tutorials-package.R ├── README.Rmd ├── README.md ├── TODO.txt ├── _pkgdown.yml ├── inst └── tutorials │ ├── 02-a-tidyverse-primer │ └── tutorial.Rmd │ ├── 03-a-review-of-r-modeling-fundamentals │ └── tutorial.Rmd │ ├── 04-the-ames-housing-data │ └── tutorial.Rmd │ ├── 05-spending-our-data │ └── tutorial.Rmd │ ├── 06-fitting-models-with-parsnip │ ├── images │ │ ├── DiagramOne.png │ │ ├── TableFour.png │ │ ├── TableOne.png │ │ ├── TableThree.png │ │ └── TableTwo.png │ └── tutorial.Rmd │ ├── 07-a-model-workflow │ ├── images │ │ └── img.png │ └── tutorial.Rmd │ ├── 08-feature-engineering-with-recipes │ └── tutorial.Rmd │ ├── 09-judging-model-effectiveness │ └── tutorial.Rmd │ ├── 10-resampling │ ├── images │ │ ├── fig-ten-point-eight.png │ │ ├── fig-ten-point-five.png │ │ ├── fig-ten-point-one.png │ │ ├── fig-ten-point-seven.png │ │ ├── fig-ten-point-six.png │ │ ├── fig-ten-point-three.png │ │ └── fig-ten-point-two.png │ └── tutorial.Rmd │ ├── 11-comparing-models │ ├── data │ │ └── linear-statistical-model.png │ └── tutorial.Rmd │ ├── 12-model-tuning-and-the-dangers-of-overfitting │ ├── images │ │ ├── pic1.png │ │ ├── pic2.png │ │ ├── pic3.png │ │ ├── pic4.png │ │ └── pic5.png │ └── tutorial.Rmd │ ├── 13-grid-search │ └── tutorial.Rmd │ ├── 14-iterative-search │ ├── images │ │ ├── pic1.png │ │ ├── pic10.png │ │ ├── pic2.png │ │ ├── pic3.png │ │ ├── pic4.png │ │ ├── pic5.png │ │ ├── pic6.png │ │ ├── pic7.png │ │ ├── pic8.png │ │ └── pic9.png │ └── tutorial.Rmd │ ├── 15-screening-many-models │ └── tutorial.Rmd │ ├── 16-dimensionality-reduction │ ├── images │ │ ├── pic1.png │ │ ├── pic2.png │ │ ├── pic3.png │ │ ├── pic4.png │ │ └── pic5.png │ └── tutorial.Rmd │ └── 18-explaining-models-and-predictions │ ├── images │ └── pic1.png │ └── tutorial.Rmd ├── 
man ├── figures │ └── README-pressure-1.png └── tidymodels.tutorials-package.Rd ├── renv.lock ├── renv ├── .gitignore ├── activate.R └── settings.json └── tests ├── testthat.R └── testthat └── test-tutorials.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^renv$ 2 | ^renv\.lock$ 3 | ^tidymodels\.tutorials\.Rproj$ 4 | ^\.Rproj\.user$ 5 | ^LICENSE\.md$ 6 | ^TODO.txt$ 7 | ^\.github$ 8 | ^README\.Rmd$ 9 | tutorials/[^/]*/(?!(data|images|.*Rmd)) 10 | ^_pkgdown\.yml$ 11 | ^docs$ 12 | ^pkgdown$ 13 | -------------------------------------------------------------------------------- /.Renviron: -------------------------------------------------------------------------------- 1 | RENV_CONFIG_SANDBOX_ENABLED = FALSE 2 | -------------------------------------------------------------------------------- /.Rprofile: -------------------------------------------------------------------------------- 1 | source("renv/activate.R") 2 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | R_KEEP_PKG_SOURCE: yes 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::rcmdcheck 27 | needs: check 28 | 29 | - uses: r-lib/actions/check-r-package@v2 30 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | permissions: 23 | contents: write 24 | steps: 25 | - uses: actions/checkout@v3 26 | 27 | - uses: r-lib/actions/setup-pandoc@v2 28 | 29 | - uses: r-lib/actions/setup-r@v2 30 | with: 31 | use-public-rspm: true 32 | 33 | - uses: r-lib/actions/setup-r-dependencies@v2 34 | with: 35 | extra-packages: any::pkgdown, local::. 
36 | needs: website 37 | 38 | - name: Build site 39 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 40 | shell: Rscript {0} 41 | 42 | - name: Deploy to GitHub pages 🚀 43 | if: github.event_name != 'pull_request' 44 | uses: JamesIves/github-pages-deploy-action@v4.4.1 45 | with: 46 | clean: false 47 | branch: gh-pages 48 | folder: docs 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | 6 | *Rproj 7 | 8 | inst/tutorials/[^/]*/(?!(data|images|.*Rmd)) 9 | 10 | inst/tutorials/*/*html 11 | inst/tutorials/*/*cache* 12 | 13 | inst/tutorials/*/*_files/ 14 | docs 15 | 16 | # The simple name of a file ignores that file wherever it might appear. 17 | 18 | .DS_Store 19 | 20 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: tidymodels.tutorials 2 | Title: Tutorials for "Tidy Modeling with R" 3 | Version: 0.0.0.9002 4 | Authors@R: 5 | person(given = "David", 6 | family = "Kane", 7 | role = c("aut", "cre", "cph"), 8 | email = "dave.kane@gmail.com", 9 | comment = c(ORCID = "0000-0002-6660-3934")) 10 | Description: When assigned "Tidy Modeling with R: A Framework for Modeling in 11 | the Tidyverse" (Kuhn and Silge (2023, ISBN: 1492096482)), students should 12 | read the book and type in all the associated R commands themselves. Sadly, 13 | that never happens. These tutorials allow students to demonstrate (and their 14 | instructors to be sure) that all work has been completed. See Kane (2023) 15 | from 16 | the 'tutorial.helpers' package for a background discussion. 
17 | License: MIT + file LICENSE 18 | Encoding: UTF-8 19 | Roxygen: list(markdown = TRUE) 20 | RoxygenNote: 7.2.3 21 | Suggests: 22 | baguette, 23 | beans, 24 | bestNormalize, 25 | censored, 26 | corrplot, 27 | corrr, 28 | DALEXtra, 29 | discrim, 30 | doParallel, 31 | embed, 32 | fastICA, 33 | finetune, 34 | ggrepel, 35 | ggforce, 36 | kernlab, 37 | klaR, 38 | lme4, 39 | learnr, 40 | mda, 41 | mixOmics, 42 | multilevelmod, 43 | nlme, 44 | ranger, 45 | roxygen2, 46 | rsconnect, 47 | rstanarm, 48 | rules, 49 | stringr, 50 | testthat (>= 3.0.0), 51 | tidymodels, 52 | tidyposterior, 53 | tidyverse, 54 | tutorial.helpers, 55 | usemodels, 56 | xgboost 57 | Config/testthat/edition: 3 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2023 2 | COPYRIGHT HOLDER: tidymodels.tutorials authors 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2023 tidymodels.tutorials authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | -------------------------------------------------------------------------------- /R/tidymodels.tutorials-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | ## usethis namespace: start 5 | ## usethis namespace: end 6 | NULL 7 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | ``` 15 | 16 | # Tutorials for *Tidy Modeling with R* 17 | 18 | 19 | [![R-CMD-check](https://github.com/PPBDS/tidymodels.tutorials/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/PPBDS/tidymodels.tutorials/actions/workflows/R-CMD-check.yaml) 20 | 21 | 22 | 23 | ## About this package 24 | 25 | **tidymodel.tutorials** provides tutorials for [*Tidy Modeling with R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. 
These tutorials assume that you have some experience working with the tools provided by the **[tutorial.helpers](https://ppbds.github.io/tutorial.helpers/)** package. As long as you have completed the "Getting Started" tutorial from that package, you should be fine. 26 | 27 | The main audience for these tutorials is instructors teaching data science and their students. Instructors want students to, for example, read Chapter 8 of [*Tidy Modeling with R*](https://www.tmwr.org/) (or something similar), typing in the code at the R Console along the way. Sadly, students almost never do that. Indeed, many (most?) of them won't even read the assigned chapter. 28 | 29 | The promise we make to instructors is that, if they assign our tutorial for Chapter 8, then students will type in at least 90% of the code examples from the chapter, and then run the code to see what happens. We also pull out some of the most important prose from the chapter and do everything we can to cajole/trick students into reading it. These [two](https://ppbds.github.io/tutorial.helpers/articles/instructions.html) [essays](https://ppbds.github.io/tutorial.helpers/articles/books.html) provide background information about our approach. 30 | 31 | Our causal claim is that, if an instructor were to randomly assign half the class to do these tutorials and half to simply complete the reading, the half completing the tutorials would perform much better for the rest of the course. 32 | 33 | ## Installation 34 | 35 | You can install the development version from [GitHub](https://github.com/) with: 36 | 37 | ``` r 38 | remotes::install_github("PPBDS/tidymodels.tutorials") 39 | ``` 40 | 41 | If R offers you the option to update some packages, you should do so. For packages that need compilation, feel free to answer "no". 42 | 43 | Then **restart your R session** or **restart RStudio**. 44 | 45 | ## Accessing tutorials 46 | 47 | In order to access the tutorials, start by loading the package. 
48 | 49 | ``` r 50 | library(tidymodels.tutorials) 51 | ``` 52 | 53 | You can access the tutorials via the Tutorial tab in the top right (Environment) pane in RStudio. 54 | 55 | If either of the following is happening to you 56 | 57 | 61 | 62 | Then **remember to restart your R session** after installing the package. 63 | 64 | Because tutorials within the Tutorial pane are sorted in alphabetical order by the name of the package, the **tidymodels.tutorials** will be toward the bottom. If you don’t see any tutorials, try clicking the “Home” button – the little house symbol with the thin red roof in the upper right. 65 | 66 | In order to expand the window, you can drag and enlarge the tutorial pane inside RStudio. In order to open a pop-up window, click the "Show in New Window" icon next to the home icon. 67 | 68 | You may notice that the Jobs tab in the lower left will create output as the tutorial is starting up. This is because RStudio is running the code to create the tutorial. If you accidentally clicked "Start Tutorial" and would like to stop the job from running, you can click the back arrow in the Jobs tab, and then press the red stop sign icon. Your work will be saved between RStudio sessions, meaning that you can complete a tutorial in multiple sittings. Once you have completed a tutorial, follow the instructions on the tutorial `Submit` page and, if you're a student, submit your answers as instructed. 
69 | 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Tutorials for *Tidy Modeling with R* 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/PPBDS/tidymodels.tutorials/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/PPBDS/tidymodels.tutorials/actions/workflows/R-CMD-check.yaml) 9 | 10 | 11 | ## About this package 12 | 13 | **tidymodel.tutorials** provides tutorials for [*Tidy Modeling with 14 | R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. These tutorials 15 | assume that you have some experience working with the tools provided by 16 | the **[tutorial.helpers](https://ppbds.github.io/tutorial.helpers/)** 17 | package. As long as you have completed the “Getting Started” tutorial 18 | from that package, you should be fine. 19 | 20 | The main audience for these tutorials is instructors teaching data 21 | science and their students. Instructors want students to, for example, 22 | read Chapter 8 of [*Tidy Modeling with R*](https://www.tmwr.org/) (or 23 | something similar), typing in the code at the R Console along the way. 24 | Sadly, students almost never do that. Indeed, many (most?) of them won’t 25 | even read the assigned chapter. 26 | 27 | The promise we make to instructors is that, if they assign our tutorial 28 | for Chapter 8, then students will type in at least 90% of the code 29 | examples from the chapter, and then run the code to see what happens. We 30 | also pull out some of the most important prose from the chapter and do 31 | everything we can to cajole/trick students into reading it. These 32 | [two](https://ppbds.github.io/tutorial.helpers/articles/instructions.html) 33 | [essays](https://ppbds.github.io/tutorial.helpers/articles/books.html) 34 | provide background information about our approach. 
35 | 36 | Our causal claim is that, if an instructor were to randomly assign half 37 | the class to do these tutorials and half to simply complete the reading, 38 | the half completing the tutorials would perform much better for the rest 39 | of the course. 40 | 41 | ## Installation 42 | 43 | You can install the development version from 44 | [GitHub](https://github.com/) with: 45 | 46 | ``` r 47 | remotes::install_github("PPBDS/tidymodels.tutorials") 48 | ``` 49 | 50 | If R offers you the option to update some packages, you should do so. 51 | For packages that need compilation, feel free to answer “no”. 52 | 53 | Then **restart your R session** or **restart RStudio**. 54 | 55 | ## Accessing tutorials 56 | 57 | In order to access the tutorials, start by loading the package. 58 | 59 | ``` r 60 | library(tidymodels.tutorials) 61 | ``` 62 | 63 | You can access the tutorials via the Tutorial tab in the top right 64 | (Environment) pane in RStudio. 65 | 66 | If either of the following is happening to you 67 | 68 | 76 | 77 | Then **remember to restart your R session** after installing the 78 | package. 79 | 80 | Because tutorials within the Tutorial pane are sorted in alphabetical 81 | order by the name of the package, the **tidymodels.tutorials** will be 82 | toward the bottom. If you don’t see any tutorials, try clicking the 83 | “Home” button – the little house symbol with the thin red roof in the 84 | upper right. 85 | 86 | In order to expand the window, you can drag and enlarge the tutorial 87 | pane inside RStudio. In order to open a pop-up window, click the “Show 88 | in New Window” icon next to the home icon. 89 | 90 | You may notice that the Jobs tab in the lower left will create output as 91 | the tutorial is starting up. This is because RStudio is running the code 92 | to create the tutorial. 
If you accidentally clicked “Start Tutorial” and 93 | would like to stop the job from running, you can click the back arrow in 94 | the Jobs tab, and then press the red stop sign icon. Your work will be 95 | saved between RStudio sessions, meaning that you can complete a tutorial 96 | in multiple sittings. Once you have completed a tutorial, follow the 97 | instructions on the tutorial `Submit` page and, if you’re a student, 98 | submit your answers as instructed. 99 | -------------------------------------------------------------------------------- /TODO.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | Read Modelling Basics chapters. 4 | 5 | # Anish 6 | 7 | Read chapter 2. Complete tutorial 2, and send me your answers as html. Determine which material from chapter 1 is most important, and can fit, and then include it as knowledge drops. Add test chunks. 8 | 9 | 10 | 11 | 12 | Deal with censored issue. 13 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: ~ 2 | template: 3 | bootstrap: 5 4 | 5 | -------------------------------------------------------------------------------- /inst/tutorials/04-the-ames-housing-data/tutorial.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: The Ames Housing Data 3 | author: Pratham Kancherla and David Kane 4 | tutorial: 5 | id: the-ames-housing-data 6 | output: 7 | learnr::tutorial: 8 | progressive: yes 9 | allow_skip: yes 10 | runtime: shiny_prerendered 11 | description: 'Tutorial for Chapter 4: The Ames Housing Data' 12 | --- 13 | 14 | ```{r setup, include = FALSE} 15 | library(learnr) 16 | library(tidymodels) 17 | library(tidyverse) 18 | library(tutorial.helpers) 19 | knitr::opts_chunk$set(echo = FALSE) 20 | options(tutorial.exercise.timelimit = 60, 21 | tutorial.storage = "local") 22 | 23 | 
gg_hist <- ggplot(ames, aes(x = Sale_Price)) + 24 | geom_histogram(bins = 50, col= "white") + 25 | scale_x_log10() 26 | ``` 27 | 28 | ```{r copy-code-chunk, child = system.file("child_documents/copy_button.Rmd", package = "tutorial.helpers")} 29 | ``` 30 | 31 | ```{r info-section, child = system.file("child_documents/info_section.Rmd", package = "tutorial.helpers")} 32 | ``` 33 | 34 | ## Introduction 35 | ### 36 | 37 | This tutorial explores [Chapter 4: The Ames Housing Data](https://www.tmwr.org/ames.html) from [*Tidy Modeling with R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. We'll introduce the Ames housing data set [(De Cock 2011)](https://www.tmwr.org/ames.html#ref-ames), a key component in upcoming modeling examples. Conducting exploratory data analysis, as demonstrated in this tutorial, serves as an initial phase in constructing a dependable model. 38 | 39 | ## Exploring Features of Homes in Ames 40 | ### 41 | 42 | The data set contains information on 2,930 properties in Ames, Iowa, including columns related to: 43 | 44 | - house characteristics (bedrooms, garage, fireplace, pool, porch, etc.) 45 | 46 | - location (neighborhood) 47 | 48 | - lot information (zoning, shape, size, etc.) 49 | 50 | - ratings of condition and quality 51 | 52 | - sale price 53 | 54 | ### Exercise 1 55 | 56 | The Ames data has been made available through the **modeldata** package, which is a built-in **tidymodels** package. 57 | 58 | Load the library **tidymodels** using `library()`. 59 | 60 | ```{r exploring-features-of-homes-in-1, exercise = TRUE} 61 | 62 | ``` 63 | 64 | ```{r exploring-features-of-homes-in-1-hint-1, eval = FALSE} 65 | library(...)
66 | ``` 67 | 68 | ```{r exploring-features-of-homes-in-1-test, include = FALSE} 69 | library(tidymodels) 70 | ``` 71 | 72 | ### 73 | 74 | The core **tidymodels** packages: [**rsample**](https://rsample.tidymodels.org/), [**parsnip**](https://parsnip.tidymodels.org/), [**recipes**](https://recipes.tidymodels.org/), [**workflows**](https://workflows.tidymodels.org/), [**tune**](https://tune.tidymodels.org/), [**yardstick**](https://yardstick.tidymodels.org/), [**broom**](https://broom.tidymodels.org/), [**dials**](https://dials.tidymodels.org/) 75 | 76 | ### Exercise 2 77 | 78 | Now load the data `ames` from the package using the `data()` function. 79 | 80 | ```{r exploring-features-of-homes-in-2, exercise = TRUE} 81 | 82 | ``` 83 | 84 | ```{r exploring-features-of-homes-in-2-hint-1, eval = FALSE} 85 | data(...) 86 | ``` 87 | 88 | ```{r exploring-features-of-homes-in-2-test, include = FALSE} 89 | data(ames) 90 | ``` 91 | 92 | ### 93 | 94 | In R, the dim() function is used to retrieve or set the dimensions of an object, such as a matrix, array, or data frame. The dim() function returns a vector with two elements representing the number of rows and columns (dimensions) of the object. 95 | 96 | ### Exercise 3 97 | 98 | We want to look at how many rows and columns are in `ames`. Use `dim()` with `ames` as the parameter to retrieve a vector of (# of rows, # of columns). 99 | 100 | ```{r exploring-features-of-homes-in-3, exercise = TRUE} 101 | 102 | ``` 103 | 104 | ```{r exploring-features-of-homes-in-3-hint-1, eval = FALSE} 105 | dim(...) 
106 | ``` 107 | 108 | ```{r exploring-features-of-homes-in-3-test, include = FALSE} 109 | dim(ames) 110 | ``` 111 | 112 | ### 113 | 114 | Changes made to the data in the `modeldata` package are such that in the raw data, if a house did not have a particular feature, it was implicitly encoded as missing, the categorical predictors were converted to R’s factor data type, and quality descriptors for each house were removed since they are more like outcomes than predictors. 115 | 116 | ### Exercise 4 117 | 118 | Let’s start our exploratory data analysis by focusing on the outcome we want to predict: the last sale price of the house (in USD). We can create a histogram to see the distribution of sale prices. 119 | 120 | Type in ames to see the data that we are looking at. 121 | 122 | ```{r exploring-features-of-homes-in-4, exercise = TRUE} 123 | 124 | ``` 125 | 126 | ```{r exploring-features-of-homes-in-4-hint-1, eval = FALSE} 127 | ames 128 | ``` 129 | 130 | ### 131 | 132 | The root mean squared error (RMSE) is a common performance metric used in regression models. It uses the difference between the observed and predicted values in its calculations. 133 | 134 | ### Exercise 5 135 | 136 | Copy the previous code and pipe it to ggplot() to start creating a histogram. 137 | 138 | ```{r exploring-features-of-homes-in-5, exercise = TRUE} 139 | 140 | ``` 141 | 142 | 143 | 144 | ```{r exploring-features-of-homes-in-5-hint-1, eval = FALSE} 145 | ames |> 146 | ...() 147 | ``` 148 | 149 | ```{r exploring-features-of-homes-in-5-test, include = FALSE} 150 | ames |> 151 | ggplot() 152 | ``` 153 | 154 | ### 155 | 156 | In R, ggplot() is a function from the **ggplot2** package, which is a powerful and widely-used package for creating visualizations. 157 | 158 | ### Exercise 6 159 | 160 | We need to establish the x-axis on the graph. Copy the previous code and, using `aes()` within `ggplot()`, set `x` equal to `Sale_Price`.
161 | 162 | ```{r exploring-features-of-homes-in-6, exercise = TRUE} 163 | 164 | ``` 165 | 166 | 167 | 168 | ```{r exploring-features-of-homes-in-6-hint-1, eval = FALSE} 169 | ... |> 170 | ggplot(aes(x = ...)) 171 | ``` 172 | 173 | ```{r exploring-features-of-homes-in-6-test, include = FALSE} 174 | ames |> 175 | ggplot(aes(x = Sale_Price)) 176 | ``` 177 | 178 | ### 179 | 180 | The disadvantages of transforming the outcome mostly relate to interpretation of model results. 181 | 182 | ### Exercise 7 183 | 184 | Now we will create the histogram. Copy the previous code and add the geom_histogram() function to ggplot() using the `+` symbol. 185 | 186 | ```{r exploring-features-of-homes-in-7, exercise = TRUE} 187 | 188 | ``` 189 | 190 | 191 | 192 | ```{r exploring-features-of-homes-in-7-hint-1, eval = FALSE} 193 | ... + 194 | geom_histogram() 195 | ``` 196 | 197 | ```{r exploring-features-of-homes-in-7-test, include = FALSE} 198 | ames |> 199 | ggplot(aes(x = Sale_Price)) + 200 | geom_histogram() 201 | ``` 202 | 203 | ### 204 | 205 | In **ggplot2**, geom_histogram() is a function used to create a histogram, which is a graphical representation of the distribution of a continuous variable. A histogram divides the data into intervals (bins) and displays the frequency or count of data points falling into each bin. 206 | 207 | ### Exercise 8 208 | 209 | Let's add `bins` to the histogram. Copy the previous code and in geom_histogram(), add the parameter `bins`, setting it equal to `50`. 210 | 211 | ```{r exploring-features-of-homes-in-8, exercise = TRUE} 212 | 213 | ``` 214 | 215 | 216 | 217 | ```{r exploring-features-of-homes-in-8-hint-1, eval = FALSE} 218 | ... + 219 | geom_histogram(bins = ...)
220 | ``` 221 | 222 | ```{r exploring-features-of-homes-in-8-test, include = FALSE} 223 | ames |> 224 | ggplot(aes(x = Sale_Price)) + 225 | geom_histogram(bins = 50) 226 | ``` 227 | 228 | ### 229 | 230 | This plot shows us that the data are right-skewed; there are more inexpensive houses than expensive ones. When modeling this outcome, a strong argument can be made that the price should be log-transformed. 231 | 232 | ### Exercise 9 233 | 234 | Lets add `col` (color) to the histogram. Copy the previous code and in geom_histogram(), add the parameter `col`, setting it equal to `"white"`. 235 | 236 | ```{r exploring-features-of-homes-in-9, exercise = TRUE} 237 | 238 | ``` 239 | 240 | 241 | 242 | ```{r exploring-features-of-homes-in-9-hint-1, eval = FALSE} 243 | ... + 244 | geom_histogram(bins = 50, col = "...") 245 | ``` 246 | 247 | ```{r exploring-features-of-homes-in-9-test, include = FALSE} 248 | ames |> 249 | ggplot(aes(x = Sale_Price)) + 250 | geom_histogram(bins = 50, col = "white") 251 | ``` 252 | 253 | ### 254 | 255 | From a statistical perspective, a logarithmic transform may also stabilize the variance in a way that makes inference more legitimate. 256 | 257 | ### Exercise 10 258 | 259 | Copy the previous code and add `scale_x_log10()`. 260 | 261 | ```{r exploring-features-of-homes-in-10, exercise = TRUE} 262 | 263 | ``` 264 | 265 | 266 | 267 | ```{r exploring-features-of-homes-in-10-hint-1, eval = FALSE} 268 | ... + 269 | scale_x_...() 270 | ``` 271 | 272 | ```{r exploring-features-of-homes-in-10-test, include = FALSE} 273 | ames |> 274 | ggplot(aes(x = Sale_Price)) + 275 | geom_histogram(bins = 50, col = "white") + 276 | scale_x_log10() 277 | ``` 278 | 279 | ### 280 | 281 | The advantages of this type of transformation are that no houses would be predicted with negative sale prices and that errors in predicting expensive houses will not have an undue influence on the model. 282 | 283 | ### 284 | 285 | Great Job! 
You now understand the basic concept of exploratory data analysis by analyzing the sale price of homes in Ames, Iowa. 286 | 287 | 288 | 289 | ## Summary 290 | ### 291 | 292 | This tutorial covered [Chapter 4: The Ames Housing Data](https://www.tmwr.org/ames.html) from [*Tidy Modeling with R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. We introduced the Ames housing data set [(De Cock 2011)](https://www.tmwr.org/ames.html#ref-ames), which we will use in modeling examples in later tutorials. Exploratory data analysis, like what we walk through in this tutorial, is an important first step in building a reliable model and you now have a basic understanding of this concept. 293 | 294 | ```{r download-answers, child = system.file("child_documents/download_answers.Rmd", package = "tutorial.helpers")} 295 | ``` 296 | -------------------------------------------------------------------------------- /inst/tutorials/05-spending-our-data/tutorial.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Spending our Data 3 | author: Aryan Kancherla 4 | tutorial: 5 | id: spending-our-data 6 | output: 7 | learnr::tutorial: 8 | progressive: yes 9 | allow_skip: yes 10 | runtime: shiny_prerendered 11 | description: 'Tutorial for Chapter 5: Spending our Data' 12 | --- 13 | 14 | ```{r setup, include = FALSE} 15 | library(learnr) 16 | library(tutorial.helpers) 17 | library(tidymodels) 18 | tidymodels_prefer() 19 | knitr::opts_chunk$set(echo = FALSE) 20 | options(tutorial.exercise.timelimit = 60, 21 | tutorial.storage = "local") 22 | 23 | set.seed(501) 24 | ames_split <- initial_split(ames, prop = 0.80) 25 | 26 | ames_update <- ames |> 27 | mutate(Sale_Price = log10(Sale_Price)) 28 | 29 | ames_plot <- ames_update |> 30 | ggplot(aes(x = Sale_Price)) + 31 | geom_density() + theme_classic() + 32 | labs(x = "Sale Price (log-10 USD)") 33 | 34 | set.seed(502) 35 | 36 | ames_strata_split <- initial_split(ames, prop = 0.80, strata = Sale_Price)
37 | 38 | 39 | ``` 40 | 41 | ```{r copy-code-chunk, child = system.file("child_documents/copy_button.Rmd", package = "tutorial.helpers")} 42 | ``` 43 | 44 | ```{r info-section, child = system.file("child_documents/info_section.Rmd", package = "tutorial.helpers")} 45 | ``` 46 | 47 | ## Introduction 48 | ### 49 | 50 | This tutorial covers [Chapter 5: Spending our Data](https://www.tmwr.org/splitting.html) from [*Tidy Modeling with R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. In this tutorial, you will learn how to partition data into distinct groups for modeling and evaluation. The functions that will be used to do this are `initial_split()`, `training()`, and `testing()` from the [**tidymodels**](https://www.tidymodels.org/packages/) and [**rsample**](https://rsample.tidymodels.org/) packages. 51 | 52 | 53 | ## Common Method for Splitting Data 54 | ### 55 | 56 | There are several steps to creating a useful model, including parameter estimation, model selection and tuning, and performance assessment. At the start of a new project, there is usually an initial finite pool of data available for all these tasks, which we can think of as an available data budget. How should the data be applied to different steps or tasks? The idea of data spending is an important first consideration when modeling, especially as it relates to empirical validation. 57 | 58 | ### Exercise 1 59 | 60 | Load the **tidymodels** package below, using `library()`. 61 | 62 | ```{r common-method-for-sp-1, exercise = TRUE} 63 | 64 | ``` 65 | 66 | ```{r common-method-for-sp-1-hint-1, eval = FALSE} 67 | library(...) 68 | ``` 69 | 70 | ```{r include = FALSE} 71 | library(tidymodels) 72 | ``` 73 | 74 | ### 75 | 76 | When there are copious amounts of data available, a smart strategy is to allocate specific subsets of data for different tasks, as opposed to allocating the largest possible amount (or even all) to the model parameter estimation only. 
For example, one possible strategy (when both data and predictors are abundant) is to spend a specific subset of data to determine which predictors are informative, before considering parameter estimation at all. 77 | 78 | 79 | 80 | ### Exercise 2 81 | 82 | To combat the function naming conflicts, type in `tidymodels_prefer()`. 83 | 84 | ```{r common-method-for-sp-2, exercise = TRUE} 85 | 86 | ``` 87 | 88 | ```{r common-method-for-sp-2-hint-1, eval = FALSE} 89 | ...() 90 | ``` 91 | 92 | ```{r include = FALSE} 93 | tidymodels_prefer() 94 | ``` 95 | 96 | ### 97 | 98 | The primary approach for empirical model validation is to split the existing pool of data into two distinct sets: the training set and the test set. One portion of the data is used to develop and optimize the model. This *training set* is usually the majority of the data. These data are a sandbox for model building where different models can be fit, feature engineering strategies are investigated, and so on. As modeling practitioners, we spend the vast majority of the modeling process using the training set as the substrate to develop the model. 99 | 100 | 101 | ### Exercise 3 102 | 103 | In order to split data, we will need to use the `initial_split()` function from the *rsample* package. Type in `?initial_split()` in the Console and look at the Description section. CP/CR. 104 | 105 | ```{r common-method-for-sp-3} 106 | question_text(NULL, 107 | answer(NULL, correct = TRUE), 108 | allow_retry = TRUE, 109 | try_again_button = "Edit Answer", 110 | incorrect = NULL, 111 | rows = 3) 112 | ``` 113 | 114 | ### 115 | 116 | Since one portion of the data is placed in the training set, the other portion of the data is placed into the *test set*. This is held in reserve until one or two models are chosen as the methods most likely to succeed. The test set is then used as the final arbiter to determine the efficacy of the model.
It is critical to look at the test set only once; otherwise, it becomes part of the modeling process. 117 | 118 | 119 | ### Exercise 4 120 | 121 | The data we will be splitting is the `ames` data set. Type in `ames` and press "Run Code". 122 | 123 | ```{r common-method-for-sp-4, exercise = TRUE} 124 | 125 | ``` 126 | 127 | 128 | 129 | ```{r common-method-for-sp-4-hint-1, eval = FALSE} 130 | ... 131 | ``` 132 | 133 | ```{r include = FALSE} 134 | ames 135 | ``` 136 | 137 | ### 138 | 139 | The `ames` data set contains information on 2,930 properties in Ames, Iowa, including columns related to: 140 | 141 | - house characteristics (bedrooms, garage, fireplace, pool, porch, etc.) 142 | - location (neighborhood) 143 | - lot information (zoning, shape, size, etc.) 144 | - ratings of condition and quality 145 | - sale price 146 | 147 | ### Exercise 5 148 | 149 | In order to make sure the results can be produced later, we are going to use the `set.seed()` function. In the code chunk below, type in `set.seed()` and pass in `501`. 150 | 151 | ```{r common-method-for-sp-5, exercise = TRUE} 152 | 153 | ``` 154 | 155 | 156 | 157 | ```{r common-method-for-sp-5-hint-1, eval = FALSE} 158 | set.seed(...) 159 | ``` 160 | 161 | ```{r include = FALSE} 162 | set.seed(501) 163 | ``` 164 | 165 | ### 166 | 167 | Note that the method for conducting the splitting of data depends on the context. 168 | 169 | ### Exercise 6 170 | 171 | Lets allocate 80% of the data to the training set and the remaining 20% for the testing set. In the code chunk below, type in `initial_split()` passing in the `ames` data set. 172 | 173 | ```{r common-method-for-sp-6, exercise = TRUE} 174 | 175 | ``` 176 | 177 | ```{r common-method-for-sp-6-hint-1, eval = FALSE} 178 | initial_split(...) 179 | ``` 180 | 181 | ```{r include = FALSE} 182 | initial_split(ames) 183 | ``` 184 | 185 | ### 186 | 187 | As you can see, the data spits out a training number, testing number, and total number. 
The *Total* stands for the total amount of data in the data set. The *Training* number stands for the amount of data placed in the training set and the *Testing* number stands for the amount of data placed in the testing set. 188 | 189 | ### Exercise 7 190 | 191 | By doing the math, you can see that the data allocated to the training and testing sets are not what we wanted. The training set contains 75% of the data and the testing set contains 25% of the data. However, we want the training set to have 80% of the data and the testing set to have 20% of the data. 192 | 193 | To fix this, copy the previous code and inside `initial_split()`, set the `prop` argument to `0.80`. 194 | 195 | ```{r common-method-for-sp-7, exercise = TRUE} 196 | 197 | ``` 198 | 199 | 200 | 201 | ```{r common-method-for-sp-7-hint-1, eval = FALSE} 202 | initial_split(ames, prop = ...) 203 | ``` 204 | 205 | ```{r include = FALSE} 206 | initial_split(ames, prop = 0.80) 207 | ``` 208 | 209 | ### 210 | 211 | Doing the math, we can now see that 80% of the data (n = 2,344) is in the training set and 20% (n = 586) is in the testing set. 212 | 213 | ### Exercise 8 214 | 215 | Copy the previous code and set it to the variable `ames_split`. 216 | 217 | ```{r common-method-for-sp-8, exercise = TRUE} 218 | 219 | ``` 220 | 221 | 222 | 223 | ```{r common-method-for-sp-8-hint-1, eval = FALSE} 224 | ... <- initial_split(ames, prop = 0.80) 225 | ``` 226 | 227 | ```{r include = FALSE} 228 | ames_split <- initial_split(ames, prop = 0.80) 229 | 230 | ``` 231 | 232 | ### 233 | 234 | The **rsample** package also provides a `group_initial_split()` function for splitting data. Click [here](https://rsample.tidymodels.org/reference/initial_split.html) to learn more. 235 | 236 | ### Exercise 9 237 | 238 | The object `ames_split` is an `rsplit` object and contains only the partitioning information; to get the resulting data sets, we need to apply two more functions: `training()` and `testing()`.
In the code below, type `training()` and passing in `ames_split`. 239 | 240 | ```{r common-method-for-sp-9, exercise = TRUE} 241 | 242 | ``` 243 | 244 | ```{r common-method-for-sp-9-hint-1, eval = FALSE} 245 | training(...) 246 | ``` 247 | 248 | ```{r include = FALSE} 249 | training(ames_split) 250 | ``` 251 | 252 | ### 253 | 254 | As you can see, the `training()` function gets the tibble that contains all of the training data. 255 | 256 | ### Exercise 10 257 | 258 | Copy the previous code and pass it into the `dim()` function. 259 | 260 | ```{r common-method-for-sp-10, exercise = TRUE} 261 | 262 | ``` 263 | 264 | 265 | 266 | ```{r common-method-for-sp-10-hint-1, eval = FALSE} 267 | ...(training(ames_split)) 268 | ``` 269 | 270 | ```{r include = FALSE} 271 | dim(training(ames_split)) 272 | 273 | ``` 274 | 275 | ### 276 | 277 | The `dim()` function is used to determine the dimensions of an object. It returns a numerical vector that contains the number of rows and columns in the object. As you can see, the training data contains 2344 rows and 74 columns. 278 | 279 | ### Exercise 11 280 | 281 | Now, lets extract the testing data. Copy the code above and change `training` to `testing`. 282 | 283 | ```{r common-method-for-sp-11, exercise = TRUE} 284 | 285 | ``` 286 | 287 | 288 | 289 | ```{r common-method-for-sp-11-hint-1, eval = FALSE} 290 | dim(...(ames_split)) 291 | ``` 292 | 293 | ```{r include = FALSE} 294 | dim(testing(ames_split)) 295 | ``` 296 | 297 | ### 298 | 299 | As you can see, the `dim()` and `testing()` functions returned all of the testing data, which contains 586 rows. 300 | 301 | 302 | ## Stratified Sampling 303 | ### 304 | 305 | Simple random sampling is appropriate in many cases but there are exceptions. When there is a dramatic class imbalance in classification problems, one class occurs much less frequently than another. Using a simple random sample may haphazardly allocate these infrequent samples disproportionately into the training or test set. 
306 | 307 | To avoid this, *stratified sampling* can be used. The training/test split is conducted separately within each class and then these subsamples are combined into the overall training and test set. For regression problems, the outcome data can be artificially binned into quartiles and then stratified sampling can be conducted four separate times. This is an effective method for keeping the distributions of the outcome similar between the training and test set. 308 | 309 | 310 | ### Exercise 1 311 | 312 | Let's create the following graph, which shows the distribution of sales prices from the `ames` data set. 313 | 314 | ```{r} 315 | ames_plot 316 | ``` 317 | 318 | Before we start however, we need to modify the `ames` data set so that it is on a logarithmic scale. Start by piping `ames` to `mutate()`. 319 | 320 | ```{r stratified-sampling-1, exercise = TRUE} 321 | 322 | ``` 323 | 324 | ```{r stratified-sampling-1-hint-1, eval = FALSE} 325 | ames |> 326 | ...() 327 | ``` 328 | 329 | ```{r include = FALSE} 330 | ames |> 331 | mutate() 332 | ``` 333 | 334 | ### 335 | 336 | The `log10()` function to modify the data so that it is on a logarithmic scale. 337 | 338 | ### Exercise 2 339 | 340 | Copy the previous code. Inside `mutate()`, set `Sale_Price` to `log10(Sale_Price)`. 341 | 342 | ```{r stratified-sampling-2, exercise = TRUE} 343 | 344 | ``` 345 | 346 | 347 | 348 | ```{r stratified-sampling-2-hint-1, eval = FALSE} 349 | ames |> 350 | mutate(... = log10(...)) 351 | ``` 352 | 353 | ```{r include = FALSE} 354 | ames |> 355 | mutate(Sale_Price = log10(Sale_Price)) 356 | ``` 357 | 358 | ### 359 | 360 | The root mean squared error (RMSE) is a common performance metric used in regression models. It uses the difference between the observed and predicted values in its calculations. If the sale price is on the log scale, these differences (i.e., the residuals) are also on the log scale. 
361 | 362 | ### Exercise 3 363 | 364 | Copy the previous code and save it to the variable `ames_update`. 365 | 366 | ```{r stratified-sampling-3, exercise = TRUE} 367 | 368 | ``` 369 | 370 | 371 | 372 | ```{r stratified-sampling-3-hint-1, eval = FALSE} 373 | ... <- ames |> 374 | mutate(Sale_Price = log10(Sale_Price)) 375 | ``` 376 | 377 | ```{r include = FALSE} 378 | ames_update <- ames |> 379 | mutate(Sale_Price = log10(Sale_Price)) 380 | ``` 381 | 382 | ### 383 | 384 | When data are reused for multiple tasks, instead of carefully “spent” from the finite data budget, certain risks increase, such as the risk of accentuating bias or compounding effects from methodological errors. 385 | 386 | 387 | 388 | ### Exercise 4 389 | 390 | Now, lets start creating the graph. Start by piping `ames_update` to `ggplot()`. 391 | 392 | ```{r stratified-sampling-4, exercise = TRUE} 393 | 394 | ``` 395 | 396 | ```{r stratified-sampling-4-hint-1, eval = FALSE} 397 | ... |> 398 | ggplot() 399 | ``` 400 | 401 | ### 402 | 403 | As a reminder, the `ggplot()` function, which comes from the **ggplot2** library, is used to create data visualizations. 404 | 405 | ### Exercise 5 406 | 407 | Copy the previous code. Inside `ggplot()` type in `aes()`. Inside `aes()` set `x` to `Sale_Price`. 408 | 409 | ```{r stratified-sampling-5, exercise = TRUE} 410 | 411 | ``` 412 | 413 | 414 | 415 | ```{r stratified-sampling-5-hint-1, eval = FALSE} 416 | ames_update |> 417 | ggplot(...(x = ...)) 418 | ``` 419 | 420 | ```{r include = FALSE} 421 | ames_update |> 422 | ggplot(aes(x = Sale_Price)) 423 | ``` 424 | 425 | ### 426 | 427 | If a model has limited fidelity to the data, the inferences generated by the model should be highly suspect. In other words, statistical significance may not be sufficient proof that a model is appropriate. 428 | 429 | ### Exercise 6 430 | 431 | Copy the previous code and add `geom_density()` to the plot. 
432 | 433 | ```{r stratified-sampling-6, exercise = TRUE} 434 | 435 | ``` 436 | 437 | 438 | 439 | ```{r stratified-sampling-6-hint-1, eval = FALSE} 440 | ames_update |> 441 | ggplot(aes(x = Sale_Price)) + 442 | ...() 443 | ``` 444 | 445 | ```{r include = FALSE} 446 | ames_update |> 447 | ggplot(aes(x = Sale_Price)) + 448 | geom_density() 449 | ``` 450 | 451 | ### 452 | 453 | `geom_density()` creates a density plot. A density plot is a graphical representation of the distribution of a numeric value (which in this case is `Sale_Price`). 454 | 455 | ### Exercise 7 456 | 457 | Copy the previous code and add `theme_classic()` to make the graph look nicer. 458 | 459 | ```{r stratified-sampling-7, exercise = TRUE} 460 | 461 | ``` 462 | 463 | 464 | 465 | ```{r stratified-sampling-7-hint-1, eval = FALSE} 466 | ames_update |> 467 | ggplot(aes(x = Sale_Price)) + 468 | geom_density() + 469 | ...() 470 | ``` 471 | 472 | ```{r include = FALSE} 473 | ames_update |> 474 | ggplot(aes(x = Sale_Price)) + 475 | geom_density() + 476 | theme_classic() 477 | ``` 478 | 479 | ### 480 | 481 | `theme_classic()` is one of the various themes you can use for your graphs. This [link](https://ggplot2.tidyverse.org/reference/ggtheme.html) provides more themes. 482 | 483 | ### Exercise 8 484 | 485 | Finally, copy the previous code and add your `labs()`. The final graph should look like this: 486 | 487 | ```{r} 488 | ames_plot 489 | ``` 490 | 491 | ```{r stratified-sampling-8, exercise = TRUE} 492 | 493 | ``` 494 | 495 | 496 | 497 | ```{r stratified-sampling-8-hint-1, eval = FALSE} 498 | ames_update |> 499 | ggplot(aes(x = Sale_Price)) + 500 | geom_density() + 501 | theme_classic() + 502 | labs( 503 | x = ... 504 | ) 505 | ``` 506 | 507 | ### 508 | 509 | As you can see, the sale price distribution is right-skewed, with proportionally more inexpensive houses than expensive houses on either side of the center of the distribution. 
The worry here with simple splitting is that the more expensive houses would not be well represented in the training set; this would increase the risk that our model would be ineffective at predicting the price for such properties. 510 | 511 | ### Exercise 9 512 | 513 | In order to fix this, We can use a stratified random sample. In the **rsample** package, we can use the `strata` argument in the `initial_split()` function. 514 | 515 | Before we do that, type in `set.seed()` and pass in `502`. 516 | 517 | ```{r stratified-sampling-9, exercise = TRUE} 518 | 519 | ``` 520 | 521 | ```{r stratified-sampling-9-hint-1, eval = FALSE} 522 | set.seed(...) 523 | ``` 524 | 525 | ```{r include = FALSE} 526 | set.seed(502) 527 | ``` 528 | 529 | ### 530 | 531 | As a reminder, the `set.seed()` function is used in order to make sure the results can be produced later. 532 | 533 | ### Exercise 10 534 | 535 | Take a look at the `initial_split()` code from the previous section. 536 | 537 | Now, lets add the `strata` argument. Inside `initial_split()`, set `strata` to `Sale_Price`. 538 | 539 | ```{r stratified-sampling-10, exercise = TRUE} 540 | initial_split(ames, prop = 0.80) 541 | ``` 542 | 543 | ```{r stratified-sampling-10-hint-1, eval = FALSE} 544 | initial_split(ames, prop = 0.80, ... = Sale_Price) 545 | ``` 546 | 547 | ```{r include = FALSE} 548 | initial_split(ames, prop = 0.80, strata = Sale_Price) 549 | ``` 550 | 551 | ### 552 | 553 | Stratified sampling is a sampling technique where the data is divided into subgroups (strata) based on the levels of a categorical variable. The sampling is then performed independently within each stratum, ensuring that each stratum is represented proportionally in both the training and testing sets. This is particularly useful when you have imbalanced data or when you want to ensure that certain groups are well-represented in the training and testing sets. 
554 | 555 | ### Exercise 11 556 | 557 | Copy the previous code and save it to the variable `ames_strata_split`. 558 | 559 | ```{r stratified-sampling-11, exercise = TRUE} 560 | 561 | ``` 562 | 563 | 564 | 565 | ```{r stratified-sampling-11-hint-1, eval = FALSE} 566 | ... <- initial_split(ames, prop = 0.80, strata = Sale_Price) 567 | ``` 568 | 569 | ```{r include = FALSE} 570 | ames_strata_split <- initial_split(ames, prop = 0.80, strata = Sale_Price) 571 | 572 | ``` 573 | 574 | ### 575 | 576 | The proportion of data that should be allocated for splitting is highly dependent on the context of the problem at hand. Too little data in the training set hampers the model’s ability to find appropriate parameter estimates. Conversely, too little data in the test set lowers the quality of the performance estimates 577 | 578 | ### Exercise 12 579 | 580 | Now that we added the `strata` argument, we can reuse the `training()` and `testing()` functions. In the code chunk below, type in `dim()`. Inside `dim()`, type in `training()` and pass in `ames_strata_split`. 581 | 582 | ```{r stratified-sampling-12, exercise = TRUE} 583 | 584 | ``` 585 | 586 | ```{r stratified-sampling-12-hint-1, eval = FALSE} 587 | dim(...(ames_strata_split)) 588 | ``` 589 | 590 | ```{r include = FALSE} 591 | training(ames_strata_split) 592 | ``` 593 | 594 | ### 595 | 596 | As you can see, the training data now contains 2,342 rows. 597 | 598 | ### Exercise 13 599 | 600 | Copy the previous code and change `training()` to `testing()`. 601 | 602 | ```{r stratified-sampling-13, exercise = TRUE} 603 | 604 | ``` 605 | 606 | 607 | 608 | ```{r stratified-sampling-13-hint-1, eval = FALSE} 609 | dim(...(ames_strata_split)) 610 | ``` 611 | 612 | ```{r include = FALSE} 613 | dim(testing(ames_strata_split)) 614 | ``` 615 | 616 | ### 617 | 618 | Are there situations when random sampling is not the best choice? One case is when the data have a significant time component, such as time series data. 
Here, it is more common to use the most recent data as the test set. The **rsample** package contains a function called `initial_time_split()` that is very similar to `initial_split()`. Instead of using random sampling, the prop argument denotes what proportion of the first part of the data should be used as the training set; the function assumes that the data have been pre-sorted in an appropriate order. 619 | 620 | ### Exercise 14 621 | 622 | Chapter 5 of the *Tidy Modeling With R* textbook contains more information regarding how to spend data. Click on this [link](https://www.tmwr.org/splitting.html) and type in the names of sections 5.2 - 5.4. 623 | 624 | ```{r stratified-sampling-14} 625 | question_text(NULL, 626 | answer(NULL, correct = TRUE), 627 | allow_retry = TRUE, 628 | try_again_button = "Edit Answer", 629 | incorrect = NULL, 630 | rows = 3) 631 | ``` 632 | 633 | ### 634 | 635 | As you can see, the chapter covers more information, such as validation sets, multilevel data, and data budgets. 636 | 637 | ## Summary 638 | ### 639 | 640 | In this tutorial you have learned: 641 | 642 | - How to split data using `initial_split()` 643 | 644 | - How to allocate data towards the training and testing sets by using the `prop` argument inside `initial_split()` 645 | 646 | - How to train and test data, using `training()` and `testing()` respectively. 
647 | 648 | - How to conduct a stratified random sample by using the `strata` argument inside `initial_split()` 649 | 650 | ```{r download-answers, child = system.file("child_documents/download_answers.Rmd", package = "tutorial.helpers")} 651 | ``` 652 | -------------------------------------------------------------------------------- /inst/tutorials/06-fitting-models-with-parsnip/images/DiagramOne.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/06-fitting-models-with-parsnip/images/DiagramOne.png -------------------------------------------------------------------------------- /inst/tutorials/06-fitting-models-with-parsnip/images/TableFour.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/06-fitting-models-with-parsnip/images/TableFour.png -------------------------------------------------------------------------------- /inst/tutorials/06-fitting-models-with-parsnip/images/TableOne.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/06-fitting-models-with-parsnip/images/TableOne.png -------------------------------------------------------------------------------- /inst/tutorials/06-fitting-models-with-parsnip/images/TableThree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/06-fitting-models-with-parsnip/images/TableThree.png -------------------------------------------------------------------------------- 
/inst/tutorials/06-fitting-models-with-parsnip/images/TableTwo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/06-fitting-models-with-parsnip/images/TableTwo.png -------------------------------------------------------------------------------- /inst/tutorials/07-a-model-workflow/images/img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/07-a-model-workflow/images/img.png -------------------------------------------------------------------------------- /inst/tutorials/07-a-model-workflow/tutorial.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: A Model Workflow 3 | author: Pratham Kancherla and David Kane 4 | tutorial: 5 | id: a-model-workflow 6 | output: 7 | learnr::tutorial: 8 | progressive: true 9 | allow_skip: true 10 | runtime: shiny_prerendered 11 | description: 'Tutorial for Chapter 7: A Model Workflow' 12 | --- 13 | 14 | ```{r setup, include = FALSE} 15 | library(learnr) 16 | library(tutorial.helpers) 17 | library(knitr) 18 | library(tidyverse) 19 | library(tidymodels) 20 | library(lme4) 21 | library(multilevelmod) 22 | library(nlme) 23 | library(workflowsets) 24 | 25 | tidymodels_prefer() 26 | 27 | knitr::opts_chunk$set(echo = FALSE) 28 | options(tutorial.exercise.timelimit = 60, 29 | tutorial.storage = "local") 30 | 31 | lm_model <- 32 | linear_reg() |> 33 | set_engine("lm") 34 | 35 | lm_wflow <- 36 | workflow() |> 37 | add_model(lm_model) 38 | 39 | lm_wflow <- 40 | lm_wflow |> 41 | add_formula(Sale_Price ~ Longitude + Latitude) 42 | 43 | data(ames) 44 | 45 | ames <- mutate(ames, Sale_Price = log10(Sale_Price)) 46 | 47 | ames_split <- initial_split(ames, prop = 0.80, strata = Sale_Price) 48 | 49 | 
ames_train <- training(ames_split) 50 | 51 | ames_test <- testing(ames_split) 52 | 53 | lm_fit <- fit(lm_wflow, ames_train) 54 | 55 | lm_wflow <- 56 | lm_wflow |> 57 | remove_formula() |> 58 | add_variables(outcome = Sale_Price, 59 | predictors = c(Longitude, Latitude) 60 | ) 61 | 62 | multilevel_spec <- 63 | linear_reg() |> 64 | set_engine("lmer") 65 | 66 | multilevel_workflow <- 67 | workflow() |> 68 | add_variables(outcome = distance, 69 | predictors = c(Sex, age, Subject)) |> 70 | add_model(multilevel_spec, 71 | formula = distance ~ Sex + (age | Subject) 72 | ) 73 | 74 | multilevel_fit <- fit(multilevel_workflow, data = Orthodont) 75 | 76 | location <- list( 77 | longitude = Sale_Price ~ Longitude, 78 | latitude = Sale_Price ~ Latitude, 79 | coords = Sale_Price ~ Longitude + Latitude, 80 | neighborhood = Sale_Price ~ Neighborhood 81 | ) 82 | 83 | location_models <- workflow_set(preproc = location, models = list(lm = lm_model)) 84 | 85 | location_models <- 86 | location_models %>% 87 | mutate(fit = map(info, ~ fit(.x$workflow[[1]], ames_train))) 88 | 89 | final_lm_res <- last_fit(lm_wflow, ames_split) 90 | 91 | c_mtrcs <- collect_metrics(final_lm_res) 92 | c_predic <- 93 | collect_predictions(final_lm_res) |> 94 | slice(1:5) 95 | 96 | ``` 97 | 98 | ```{r copy-code-chunk, child = system.file("child_documents/copy_button.Rmd", package = "tutorial.helpers")} 99 | ``` 100 | 101 | ```{r info-section, child = system.file("child_documents/info_section.Rmd", package = "tutorial.helpers")} 102 | ``` 103 | 104 | ## Introduction 105 | ### 106 | 107 | This tutorial covers [Chapter 7: A Model Workflow](https://www.tmwr.org/workflows.html) from [*Tidy Modeling with R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. In the previous chapter, we discussed the [**parsnip**](https://parsnip.tidymodels.org/) package, which can be used to define and fit the model. This chapter introduces a new concept called a model workflow. 
The purpose of this concept (and the corresponding **tidymodels** `workflow()` object) is to encapsulate the major pieces of the modeling process. 108 | 109 | ## Workflow Basics 110 | ### 111 | 112 | PCA is a way to replace correlated predictors with new artificial features that are uncorrelated and capture most of the information in the original set. 113 | 114 | ```{r} 115 | #| echo: false 116 | #| message: false 117 | #| warning: false 118 | 119 | #include_graphics("inst/tutorials/07-a-model-workflow/images/img.png") 120 | 121 | ``` 122 | 123 | The workflows package allows the user to bind modeling and preprocessing objects together. Let’s start again with the Ames data and a simple linear model. 124 | 125 | ### Exercise 1 126 | 127 | Load the library **tidymodels** using `library()`. 128 | 129 | ```{r workflow-basics-1, exercise = TRUE} 130 | 131 | ``` 132 | 133 | ```{r workflow-basics-1-hint-1, eval = FALSE} 134 | library(...) 135 | ``` 136 | 137 | ```{r include = FALSE} 138 | library(tidymodels) 139 | ``` 140 | 141 | ### 142 | 143 | The core **tidymodels** packages: [**rsample**](https://rsample.tidymodels.org/), [**parsnip**](https://parsnip.tidymodels.org/), [**recipes**](https://recipes.tidymodels.org/), [**workflows**](https://workflows.tidymodels.org/), [**tune**](https://tune.tidymodels.org/), [**yardstick**](https://yardstick.tidymodels.org/), [**broom**](https://broom.tidymodels.org/), [**dials**](https://dials.tidymodels.org/) 144 | 145 | ### Exercise 2 146 | 147 | The **workflows** package allows the user to bind modeling and pre-processing objects together. Let’s start again with the Ames data. Enter `linear_reg()` and hit "Run Code". 
148 | 149 | ```{r workflow-basics-2, exercise = TRUE} 150 | 151 | ``` 152 | 153 | ```{r workflow-basics-2-hint-1, eval = FALSE} 154 | linear_reg() 155 | ``` 156 | 157 | ```{r include = FALSE} 158 | linear_reg() 159 | ``` 160 | 161 | ### 162 | 163 | `linear_reg()` is used to specify and fit a linear regression model in the **tidymodels** framework. It is similar to other model functions in **parsnip** and follows the same pattern. 164 | 165 | ### Exercise 3 166 | 167 | Copy the previous code and pipe `set_engine()`, with the parameter being `"lm"`, by using the pipe operator. Set this equal to `lm_model`. 168 | 169 | ```{r workflow-basics-3, exercise = TRUE} 170 | 171 | ``` 172 | 173 | 174 | 175 | ```{r workflow-basics-3-hint-1, eval = FALSE} 176 | lm_model <- 177 | ... |> 178 | set_engine("...") 179 | ``` 180 | 181 | ```{r include = FALSE} 182 | lm_model <- 183 | linear_reg() |> 184 | set_engine("lm") 185 | ``` 186 | 187 | ### 188 | 189 | It is important to focus on the broader modeling process, instead of only fitting the specific model used to estimate parameters. This broader process includes any pre-processing steps, the model fit itself, as well as potential post-processing activities. 190 | 191 | ### Exercise 4 192 | 193 | A workflow always requires a parsnip model object. Type in `workflow()` and hit "Run Code". 194 | 195 | ```{r workflow-basics-4, exercise = TRUE} 196 | 197 | ``` 198 | 199 | ```{r workflow-basics-4-hint-1, eval = FALSE} 200 | workflow() 201 | ``` 202 | 203 | ```{r include = FALSE} 204 | workflow() 205 | ``` 206 | 207 | ### 208 | 209 | A workflow object can include steps such as data pre-processing, feature engineering, model specification, model fitting, and evaluation. Each step is represented by a modeling object or a function. 210 | 211 | ### Exercise 5 212 | 213 | Copy the previous code and pipe `add_model()`, with the parameter being `lm_model`, by using the pipe operator. Set this equal to `lm_wflow`. 
214 | 215 | ```{r workflow-basics-5, exercise = TRUE} 216 | 217 | ``` 218 | 219 | 220 | 221 | ```{r workflow-basics-5-hint-1, eval = FALSE} 222 | ... |> 223 | add_model(...) 224 | ``` 225 | 226 | ```{r include = FALSE} 227 | lm_wflow <- 228 | workflow() |> 229 | add_model(lm_model) 230 | ``` 231 | 232 | ### 233 | 234 | Principal Component Analysis (PCA) signal extraction is a way to replace correlated predictors with new artificial features that are uncorrelated and capture most of the information in the original set. 235 | 236 | ### Exercise 6 237 | 238 | Type `lm_wflow` on the next line and hit "Run Code". 239 | 240 | ```{r workflow-basics-6, exercise = TRUE} 241 | 242 | ``` 243 | 244 | ```{r workflow-basics-6-hint-1, eval = FALSE} 245 | lm_wflow 246 | ``` 247 | 248 | ```{r include = FALSE} 249 | lm_wflow 250 | ``` 251 | 252 | ### 253 | 254 | Notice how the preprocessor has not been defined yet. In statistics, a preprocessor refers to a step or a set of steps taken before modeling or analyzing the data. The main goal of a preprocessor is to transform the raw data into a format that is more suitable for the subsequent statistical analysis or modeling tasks. 255 | 256 | ### Exercise 7 257 | 258 | The `add_formula()` function can be used to add a formula to the preprocessor. Copy the previous code and pipe `add_formula()`, with the formula being `Sale_Price ~ Longitude + Latitude`. Set it equal to `lm_wflow`. 259 | 260 | ```{r workflow-basics-7, exercise = TRUE} 261 | 262 | ``` 263 | 264 | 265 | 266 | ```{r workflow-basics-7-hint-1, eval = FALSE} 267 | lm_wflow <- 268 | ... |> 269 | add_formula(Sale_Price ~ Longitude + Latitude) 270 | ``` 271 | 272 | ```{r include = FALSE} 273 | lm_wflow <- 274 | lm_wflow |> 275 | add_formula(Sale_Price ~ Longitude + Latitude) 276 | ``` 277 | 278 | ### 279 | 280 | The `fit()` function is used to train a specified model on a given dataset, using the formula and data provided in the model specification.
It returns a fitted model object that can be used for prediction and evaluation. 281 | 282 | ### Exercise 8 283 | 284 | We will be using some of the objects created from the previous tutorial to make our fitted model using the `fit()` function. Within the function, add the parameters, `lm_wflow` and `ames_train`. Set this expression equal to `lm_fit`. 285 | 286 | ```{r workflow-basics-8, exercise = TRUE} 287 | 288 | ``` 289 | 290 | ```{r workflow-basics-8-hint-1, eval = FALSE} 291 | ... <- fit(..., ames_train) 292 | ``` 293 | 294 | ```{r include = FALSE} 295 | lm_fit <- fit(lm_wflow, ames_train) 296 | ``` 297 | 298 | ### 299 | 300 | The `predict()` function works with a wide range of models, including linear regression, generalized linear models, decision trees, random forests, support vector machines, and many others. 301 | 302 | ### Exercise 9 303 | 304 | To predicted on the fitted workflow, we will be using `predict()`. Within the function, add the parameter `lm_fit`. Note that this will throw an error which will be fixed soon. 305 | 306 | ```{r workflow-basics-9, exercise = TRUE} 307 | 308 | ``` 309 | 310 | ```{r workflow-basics-9-hint-1, eval = FALSE} 311 | predict(...) 312 | ``` 313 | 314 | ### 315 | 316 | The `predict()` function requires the "newdata" argument to make predictions on new data. This argument specifies the data frame containing the predictor variables for which you want to make predictions. 317 | 318 | ### Exercise 10 319 | 320 | Copy the previous code and add the parameter `ames_test` as the new_data argument to make predictions on the data. 321 | 322 | ```{r workflow-basics-10, exercise = TRUE} 323 | 324 | ``` 325 | 326 | 327 | 328 | ```{r workflow-basics-10-hint-1, eval = FALSE} 329 | predict(lm_fit, ...) 330 | ``` 331 | 332 | ```{r include = FALSE} 333 | predict(lm_fit, ames_train) 334 | ``` 335 | 336 | ### 337 | 338 | There are too many rows that are difficult to look at once. 
The `slice()` function lets us select a certain number of rows to be printed out. 339 | 340 | ### Exercise 11 341 | 342 | Copy the previous code and add `slice()` to the pipe. Add the parameter `1:3` to `slice()` and hit "Run Code". 343 | 344 | ```{r workflow-basics-11, exercise = TRUE} 345 | 346 | ``` 347 | 348 | 349 | 350 | ```{r workflow-basics-11-hint-1, eval = FALSE} 351 | ... |> 352 | slice(...) 353 | ``` 354 | 355 | ```{r include = FALSE} 356 | predict(lm_fit, ames_train) |> 357 | slice(1:3) 358 | ``` 359 | 360 | ### 361 | 362 | `update_formula()` is used to update model formulae. This typically involves adding or dropping terms, but updates can be more general. 363 | 364 | ### Exercise 12 365 | 366 | Pipe `update_formula()` to `lm_fit`. In `update_formula()`, we will change the formula to `Sale_Price ~ Longitude`. 367 | 368 | ```{r workflow-basics-12, exercise = TRUE} 369 | 370 | ``` 371 | 372 | ```{r workflow-basics-12-hint-1, eval = FALSE} 373 | lm_fit |> 374 | update_formula(Sale_Price ~ ...) 375 | ``` 376 | 377 | ```{r include = FALSE} 378 | lm_fit |> 379 | update_formula(Sale_Price ~ Longitude) 380 | ``` 381 | 382 | ### 383 | 384 | Great Job! You now understand the basics of workflow and the different functions that can help model the data. 385 | 386 | ## Adding Raw Variables to the `workflow()` 387 | ### 388 | 389 | There is another interface for passing data to the model, the `add_variables()` function, which uses a **dplyr**-like syntax for choosing variables. The function has two primary arguments: *outcomes* and *predictors*. These use a selection approach similar to the **tidyselect** backend of **tidyverse** packages to capture multiple selectors using `c()`. 390 | 391 | ### Exercise 1 392 | 393 | We will not be needing a formula anymore, as we will only need outcomes and predictors. Pipe `remove_formula()` to `lm_wflow`. 
394 | 395 | ```{r adding-raw-variables-1, exercise = TRUE} 396 | 397 | ``` 398 | 399 | ```{r adding-raw-variables-1-hint-1, eval = FALSE} 400 | lm_wflow |> 401 | ... 402 | ``` 403 | 404 | ```{r include = FALSE} 405 | lm_wflow |> 406 | remove_formula() 407 | ``` 408 | 409 | ### 410 | 411 | You can see under the preprocessor tab, there is no formula anymore. There now needs to be outcomes and predictors. 412 | 413 | ### Exercise 2 414 | 415 | We will use the `add_variables()` function to add the outcome first. Copy the previous code and add `add_variables()` to the pipe, setting `outcome = Sale_Price`. This will throw an error. 416 | 417 | ```{r adding-raw-variables-2, exercise = TRUE} 418 | 419 | ``` 420 | 421 | 422 | 423 | ```{r adding-raw-variables-2-hint-1, eval = FALSE} 424 | ... |> 425 | add_variables(outcome = ...) 426 | ``` 427 | 428 | ```{r include = FALSE} 429 | #lm_wflow |> 430 | #remove_formula() |> 431 | #add_variables(outcome = Sale_Price) 432 | ``` 433 | 434 | Note that there is no predictors parameter established yet. 435 | 436 | ### Exercise 3 437 | 438 | Now we will add the predictor variable to the preprocessor. Copy the previous code and in `add_variables()`, add `predictors` and set it equal to `Longitude` and `Latitude` using `c()`. 439 | 440 | ```{r adding-raw-variables-3, exercise = TRUE} 441 | 442 | ``` 443 | 444 | 445 | 446 | ```{r adding-raw-variables-3-hint-1, eval = FALSE} 447 | ... |> 448 | add_variables(outcome = Sale_Price, predictors = c(..., ...)) 449 | ``` 450 | 451 | ```{r include = FALSE} 452 | lm_wflow |> 453 | remove_formula() |> 454 | add_variables(outcome = Sale_Price, predictors = c(Longitude, Latitude)) 455 | ``` 456 | 457 | ### 458 | 459 | `add_variables()` attaches outcomes and predictors to a workflow using bare column names, while `add_formula()` and `add_recipe()` offer alternative preprocessor interfaces. These are convenient functions for declaring how the data enters the model without writing a formula. 
Furthermore, they allow easy integration into a pipe-workflow. 460 | 461 | 462 | ### Exercise 4 463 | 464 | 465 | Finally, copy the previous code and set the expression equal to `lm_wflow` using the `<-`. On the next line, type in `lm_wflow` to see the workflow. 466 | 467 | ```{r adding-raw-variables-4, exercise = TRUE} 468 | 469 | ``` 470 | 471 | 472 | 473 | ```{r adding-raw-variables-4-hint-1, eval = FALSE} 474 | ... <- lm_wflow |> 475 | remove_formula() |> 476 | add_variables( 477 | outcome = Sale_Price, 478 | predictors = c(Longitude, ...)) 479 | ``` 480 | 481 | ```{r include = FALSE} 482 | lm_wflow <- 483 | lm_wflow |> 484 | remove_formula() |> 485 | add_variables(outcome = Sale_Price, 486 | predictors = c(Longitude, Latitude) 487 | ) 488 | ``` 489 | 490 | ### 491 | 492 | If you would like the underlying modeling method to do what it would normally do with the data, `add_variables()` can be a helpful interface. 493 | 494 | ### Exercise 5 495 | 496 | Now we can create the model using `fit()`. Within `fit()`, add the parameters `lm_wflow` and `ames_train`. 497 | 498 | ```{r adding-raw-variables-5, exercise = TRUE} 499 | 500 | ``` 501 | 502 | ```{r adding-raw-variables-5-hint-1, eval = FALSE} 503 | fit(lm_wflow, ...) 504 | ``` 505 | 506 | ```{r include = FALSE} 507 | fit(lm_wflow, ames_train) 508 | ``` 509 | 510 | ### 511 | 512 | Models such as **glmnet** and **xgboost** expect the user to make indicator variables from factor predictors. In these cases, a recipe or formula interface will typically be a better choice. 513 | 514 | Great Job! You now know how to add raw variables such as outcome predictors to the workflow. In the next chapter, we will look at a more powerful preprocessor (called a recipe) that can also be added to a workflow. 515 | 516 | ## How Does a `workflow()` Use the Formula? 517 | ### 518 | 519 | When we fit a tree to the data, the **parsnip** package understands what the modeling function would do. 
For example, if a random forest model is fit using the ranger or **randomForest** packages, the workflow knows predictors columns that are factors should be left as is. 520 | 521 | ### Exercise 1 522 | 523 | A number of multilevel models have standardized on a formula specification devised in the lme4 package. For example, to fit a regression model that has random effects for subjects, we would use the following formula: 524 | 525 | ```{r how-does-a-workflow--1, exercise = TRUE} 526 | library(lme4) 527 | lmer(formula = distance ~ Sex + (age | Subject), data = Orthodont) 528 | ``` 529 | 530 | ### 531 | 532 | The effect of this is that each subject will have an estimated intercept and slope parameter for age. The problem, however, is that standard R methods can’t properly process this formula. 533 | 534 | ### Exercise 2 535 | 536 | We can try to process this formula with `model.matrix()`. Copy the previous code and replace `lmer()` with `model.matrix()`. Hit "Run Code". 537 | 538 | ```{r how-does-a-workflow--2, exercise = TRUE} 539 | 540 | ``` 541 | 542 | 543 | 544 | ```{r how-does-a-workflow--2-hint-1, eval = FALSE} 545 | ...(distance ~ Sex + (age | Subject), data = Orthodont) 546 | ``` 547 | 548 | ```{r include = FALSE} 549 | model.matrix(distance ~ Sex + (age | Subject), data = Orthodont) 550 | ``` 551 | 552 | Even if this formula could be used with `model.matrix()`, this would still present a problem since the formula also specifies the statistical attributes of the model. 553 | 554 | ### Exercise 3 555 | 556 | The solution in workflows is an optional supplementary model formula that can be passed to `add_model()`. The `add_variables()` specification provides the bare column names, and then the actual formula given to the model is set within `add_model()`. 557 | 558 | First, load the library **multilevelmod** using `library()`. 
559 | 560 | ```{r how-does-a-workflow--3, exercise = TRUE} 561 | 562 | ``` 563 | 564 | ```{r how-does-a-workflow--3-hint-1, eval = FALSE} 565 | library(multilevelmod) 566 | ``` 567 | 568 | ### 569 | 570 | The **multilevelmod** package enables the use of multilevel models (a.k.a. mixed-effects models, Bayesian hierarchical models, etc.) with the **parsnip** package. 571 | 572 | ### Exercise 4 573 | 574 | We need to specify that we will be using a linear regression model. Pipe `set_engine("lmer")` to `linear_reg()`. Set this expression equal to `multilevel_spec` using `<-`. 575 | 576 | ```{r how-does-a-workflow--4, exercise = TRUE} 577 | 578 | ``` 579 | 580 | ```{r how-does-a-workflow--4-hint-1, eval = FALSE} 581 | multilevel_spec <- 582 | linear_reg() |> 583 | set_engine("...") 584 | ``` 585 | 586 | ```{r include = FALSE} 587 | multilevel_spec <- 588 | linear_reg() |> 589 | set_engine("lmer") 590 | ``` 591 | 592 | ### 593 | 594 | The `set_engine()` function is used to specify the computational "engine" or backend for fitting a model. It allows you to choose a specific modeling library or package to be used for model training and prediction. 595 | 596 | ### Exercise 5 597 | 598 | We now need a workflow to model the data. Type `workflow()` to create a workflow. Hit "Run Code". 599 | 600 | ```{r how-does-a-workflow--5, exercise = TRUE} 601 | 602 | ``` 603 | 604 | ```{r how-does-a-workflow--5-hint-1, eval = FALSE} 605 | workflow() 606 | ``` 607 | 608 | ### 609 | 610 | The `workflow()` function allows you to build a complete modeling pipeline by combining various modeling and preprocessing steps, making it easier to manage and reproduce complex analyses. 611 | 612 | ### Exercise 6 613 | 614 | Now we need to add the raw variables to the model, which are the outcomes and predictors. Copy the previous code and pipe `add_variables()`. This will throw an error because we should always supply two parameters: `outcomes` and `predictors`. 
615 | 616 | ```{r how-does-a-workflow--6, exercise = TRUE} 617 | 618 | ``` 619 | 620 | 621 | 622 | ```{r how-does-a-workflow--6-hint-1, eval = FALSE} 623 | ... |> 624 | add_variables() 625 | ``` 626 | 627 | ```{r include = FALSE} 628 | #workflow() |> 629 | #add_variables() 630 | ``` 631 | 632 | ### 633 | 634 | For predictive models, it is advisable to evaluate a variety of different model types. This requires the user to create multiple model specifications. 635 | 636 | ### Exercise 7 637 | 638 | Copy the previous code and add the parameter `outcome`, setting it equal to `distance`. This also will not work because the `predictors` parameter needs to be added 639 | 640 | ```{r how-does-a-workflow--7, exercise = TRUE} 641 | 642 | ``` 643 | 644 | 645 | 646 | ```{r how-does-a-workflow--7-hint-1, eval = FALSE} 647 | ... |> 648 | add_variables(outcome = ...) 649 | ``` 650 | 651 | ```{r include = FALSE} 652 | #workflow() |> 653 | #add_variables(outcome = distance) 654 | ``` 655 | 656 | ### 657 | 658 | Sequential testing of models typically starts with an expanded set of predictors. This “full model” is compared to a sequence of the same model that removes each predictor in turn. Using basic hypothesis testing methods or empirical validation, the effect of each predictor can be isolated and assessed. 659 | 660 | ### Exercise 8 661 | 662 | Copy the previous code and the `predictors` parameters. Set the parameter equal to `Sex, age, Subject` using the vector function `c()`. 663 | 664 | ```{r how-does-a-workflow--8, exercise = TRUE} 665 | 666 | ``` 667 | 668 | 669 | 670 | ```{r how-does-a-workflow--8-hint-1, eval = FALSE} 671 | ... 
|> 672 | add_variables( 673 | outcome = distance, 674 | predictors = ...(Sex, age, Subject)) 675 | ``` 676 | 677 | ```{r include = FALSE} 678 | workflow() |> 679 | add_variables( 680 | outcome = distance, 681 | predictors = c(Sex, age, Subject)) 682 | ``` 683 | 684 | ### 685 | 686 | In regression analysis, the outcome variable is the variable we aim to model as a function of one or more predictor variables. It represents the target or dependent variable that we want to predict or explain. 687 | 688 | ### Exercise 9 689 | 690 | Finally we need to add the model. Copy the previous code and pipe `add_model()`. This will throw and error becasue a `spec` or model specification was not specified. 691 | 692 | ```{r how-does-a-workflow--9, exercise = TRUE} 693 | 694 | ``` 695 | 696 | 697 | 698 | ```{r how-does-a-workflow--9-hint-1, eval = FALSE} 699 | ... |> 700 | add_model() 701 | ``` 702 | 703 | ```{r include = FALSE} 704 | # workflow() |> 705 | # add_variables( 706 | # outcome = distance, 707 | # predictors = c(Sex, age, Subject)) |> 708 | # add_model() 709 | ``` 710 | 711 | ### 712 | 713 | The `add_model()` function allows you to add a modeling specification to your workflow. It specifies the type of model you want to use for the analysis, such as linear regression, random forest, support vector machine, etc. 714 | 715 | ### Exercise 10 716 | 717 | Copy the previous code and add the specification we made earlier, `multilevel_spec` as the first parameter. 718 | 719 | ```{r how-does-a-workflow--10, exercise = TRUE} 720 | 721 | ``` 722 | 723 | 724 | 725 | ```{r how-does-a-workflow--10-hint-1, eval = FALSE} 726 | ... 
|> 727 | add_model(multilevel_spec) 728 | ``` 729 | 730 | ```{r include = FALSE} 731 | workflow() |> 732 | add_variables( 733 | outcome = distance, 734 | predictors = c(Sex, age, Subject)) |> 735 | add_model(multilevel_spec) 736 | ``` 737 | 738 | ### 739 | 740 | The `formula` parameter allows you to specify the formula that defines the relationship between the outcome variable (response variable) and the predictor variables in the model. 741 | 742 | ### Exercise 11 743 | 744 | Copy the previous code and add the parameter `formula` to `add_model()`. Set `formula` equal to the formula seen in exercise 1, `distance ~ Sex + (age | Subject)`. 745 | 746 | ```{r how-does-a-workflow--11, exercise = TRUE} 747 | 748 | ``` 749 | 750 | 751 | 752 | ```{r how-does-a-workflow--11-hint-1, eval = FALSE} 753 | ... |> 754 | add_model( 755 | multilevel_spec, 756 | formula = ... 757 | ) 758 | ``` 759 | 760 | ```{r include = FALSE} 761 | workflow() |> 762 | add_variables( 763 | outcome = distance, 764 | predictors = c(Sex, age, Subject)) |> 765 | add_model( 766 | multilevel_spec, 767 | formula = distance ~ Sex + (age | Subject) 768 | ) 769 | ``` 770 | 771 | ### Exercise 12 772 | 773 | Copy the previous code and set it equal to `multilevel_workflow` using `<-`. 774 | 775 | ```{r how-does-a-workflow--12, exercise = TRUE} 776 | 777 | ``` 778 | 779 | 780 | 781 | ```{r how-does-a-workflow--12-hint-1, eval = FALSE} 782 | multilevel_workflow <- ... 783 | ``` 784 | 785 | ```{r include = FALSE} 786 | multilevel_workflow <- 787 | workflow() |> 788 | add_variables(outcome = distance, 789 | predictors = c(Sex, age, Subject)) |> 790 | add_model(multilevel_spec, 791 | formula = distance ~ Sex + (age | Subject) 792 | ) 793 | ``` 794 | 795 | ### 796 | 797 | Since the preprocessing is model dependent, workflows attempts to emulate what the underlying model would do whenever possible. If it is not possible, the formula processing should not do anything to the columns used in the formula. 
798 | 799 | ### Exercise 13 800 | 801 | Now we need to fit the model specified using a model specification object using `fit()`. Type in `fit()` and add the parameter `multilevel_workflow`. Error will appear but we will fix it later. 802 | 803 | ```{r how-does-a-workflow--13, exercise = TRUE} 804 | 805 | ``` 806 | 807 | ```{r how-does-a-workflow--13-hint-1, eval = FALSE} 808 | fit(...) 809 | ``` 810 | 811 | ```{r include = FALSE} 812 | #fit(multilevel_workflow) 813 | ``` 814 | 815 | ### 816 | 817 | Preprocessing is a crucial step in the data analysis workflow because it helps address various issues and challenges associated with real-world data. 818 | 819 | ### Exercise 14 820 | 821 | Copy the previous code and add the `data` parameter, setting it equal `Orthodont`. Set this expression equal to `multilevel_fit` and print it out on the next line. 822 | 823 | ```{r how-does-a-workflow--14, exercise = TRUE} 824 | 825 | ``` 826 | 827 | 828 | 829 | ```{r how-does-a-workflow--14-hint-1, eval = FALSE} 830 | multilevel_fit <- fit(multilevel_workflow, data = ...) 831 | ``` 832 | 833 | ```{r include = FALSE} 834 | fit(multilevel_workflow, data = Orthodont) 835 | ``` 836 | 837 | ### 838 | 839 | `strata()` is a special function used in the context of the Cox survival model. It identifies stratification variables when they appear on the right hand side of a formula. 840 | 841 | ### Exercise 15 842 | 843 | 844 | 845 | We can even use the previously mentioned `strata()` function from the survival package for survival analysis. Run the following code. 
846 | 847 | ```{r how-does-a-workflow--15, exercise = TRUE} 848 | library(censored) 849 | 850 | parametric_spec <- survival_reg() 851 | 852 | parametric_workflow <- 853 | workflow() %>% 854 | add_variables(outcome = c(fustat, futime), predictors = c(age, rx)) %>% 855 | add_model(parametric_spec, 856 | formula = Surv(futime, fustat) ~ age + strata(rx)) 857 | 858 | parametric_fit <- fit(parametric_workflow, data = ovarian) 859 | parametric_fit 860 | ``` 861 | 862 | ### 863 | 864 | Great Job! You now know how a workflow uses different sorts of formulas from a data set. 865 | 866 | ## Creating Multiple Workflows at Once 867 | ### 868 | 869 | In some situations, the data require numerous attempts to find an appropriate model. In these situations, as well as others, it can become tedious or onerous to create a lot of workflows from different sets of preprocessors and/or model specifications. To address this problem, the **workflowset** package creates combinations of workflow components. A list of preprocessors (e.g., formulas, **dplyr** selectors, or feature engineering recipe objects discussed in the next chapter) can be combined with a list of model specifications, resulting in a set of workflows. 870 | 871 | ### Exercise 1 872 | 873 | Let’s say that we want to focus on the different ways that house location is represented in the Ames data. We can create a set of formulas that capture these predictors. Hit "Run Code". 874 | 875 | ```{r creating-multiple-wo-1, exercise = TRUE} 876 | location <- list( 877 | longitude = Sale_Price ~ Longitude, 878 | latitude = Sale_Price ~ Latitude, 879 | coords = Sale_Price ~ Longitude + Latitude, 880 | neighborhood = Sale_Price ~ Neighborhood 881 | ) 882 | ``` 883 | 884 | ### 885 | 886 | In R, list() is a built-in function used to create a list, which is a versatile data structure that can hold elements of different types, such as vectors, matrices, data frames, and even other lists. 
Lists allow you to organize and store multiple objects together in a single container. 887 | 888 | ### Exercise 2 889 | 890 | 891 | Load the library **workflowsets** using `library()`. 892 | 893 | ```{r creating-multiple-wo-2, exercise = TRUE} 894 | 895 | ``` 896 | 897 | ```{r creating-multiple-wo-2-hint-1, eval = FALSE} 898 | library(...) 899 | ``` 900 | 901 | ```{r include = FALSE} 902 | library(workflowsets) 903 | ``` 904 | 905 | ### 906 | 907 | The goal of **workflowsets** is to allow users to create and easily fit a large number of models. **workflowsets** can create a *workflow set* that holds multiple workflow objects. These objects can be created by crossing all combinations of preprocessors (e.g., formula, recipe, etc) and model specifications. This set can be tuned or resampled using a set of specific functions. 908 | 909 | ### Exercise 3 910 | 911 | Create a workflow set by using the method `workflow_set()`. Add the parameter `preproc` and set it equal to the `location` list created earlier. 912 | 913 | ```{r creating-multiple-wo-3, exercise = TRUE} 914 | 915 | ``` 916 | 917 | ```{r creating-multiple-wo-3-hint-1, eval = FALSE} 918 | workflow_set(preproc = ...) 919 | ``` 920 | 921 | ```{r include = FALSE} 922 | #workflow_set(preproc = location) 923 | ``` 924 | 925 | This throws an error because no model is specified for the set. 926 | 927 | ### Exercise 4 928 | 929 | We will use a linear model `lm_model` across the list of locations. Copy the previous code and add the parameter `models`, setting it equal to `list(lm = lm_model)`. Set this expression equal to `location_models` using `<-`. 930 | 931 | ```{r creating-multiple-wo-4, exercise = TRUE} 932 | 933 | ``` 934 | 935 | 936 | 937 | ```{r creating-multiple-wo-4-hint-1, eval = FALSE} 938 | ... 
<- workflow_set(preproc = location, models = list(lm = ...)) 939 | ``` 940 | 941 | ```{r include = FALSE} 942 | workflow_set(preproc = location, models = list(lm = lm_model)) 943 | ``` 944 | 945 | ### 946 | 947 | In R, lm() stands for "linear model," and it is a built-in function used to fit linear regression models. Linear regression is a statistical method used to model the relationship between a dependent variable (response variable) and one or more independent variables (predictors) as a linear equation. 948 | 949 | ### Exercise 5 950 | 951 | Lets take a look at some of the info in the `location_models`. Extract the first column from the info section in `location_models` by using the `$` and `info[[1]]`. 952 | 953 | ```{r creating-multiple-wo-5, exercise = TRUE} 954 | 955 | ``` 956 | 957 | ```{r creating-multiple-wo-5-hint-1, eval = FALSE} 958 | location_models$info[...] 959 | ``` 960 | 961 | ```{r include = FALSE} 962 | location_models$info[1] 963 | ``` 964 | 965 | ### 966 | 967 | You can see that this produces a summary of what type of preprocessor (preproc) and model is being used in the `location_models` workflow. `extract_workflow()` returns the workflow object. The workflow will not have been estimated. 968 | 969 | 970 | ### Exercise 6 971 | 972 | To extract the workflow of the model, we will use `extract_workflow()`. Within `extract_workflow()`, add the parameters, `location_models` and `id = "coords_lm"`. 973 | 974 | ```{r creating-multiple-wo-6, exercise = TRUE} 975 | 976 | ``` 977 | 978 | ```{r creating-multiple-wo-6-hint-1, eval = FALSE} 979 | extract_workflow(location_models, id = "...") 980 | 981 | ``` 982 | 983 | ```{r include = FALSE} 984 | extract_workflow(location_models, id = "coords_lm") 985 | ``` 986 | 987 | ### 988 | 989 | Workflow sets are mostly designed to work with resampling. The columns `option` and `result` must be populated with specific types of objects that result from resampling. 
990 | 991 | ### Exercise 7 992 | 993 | Let’s create model fits for each formula and save them in a new column called fit. We’ll use basic **dplyr** and **purrr** operations. Hit "Run Code". 994 | 995 | ```{r creating-multiple-wo-7, exercise = TRUE} 996 | location_models <- 997 | location_models %>% 998 | mutate(fit = map(info, ~ fit(.x$workflow[[1]], ames_train))) 999 | 1000 | location_models 1001 | ``` 1002 | 1003 | ### 1004 | 1005 | As you can see, we have tibbles of information of 4 different workflows that we have created. 1006 | 1007 | ### Exercise 8 1008 | 1009 | Lets extract the fit model from `location_models` by using `$fit[[1]]`. 1010 | 1011 | ```{r creating-multiple-wo-8, exercise = TRUE} 1012 | 1013 | ``` 1014 | 1015 | ```{r creating-multiple-wo-8-hint-1, eval = FALSE} 1016 | location_models$fit[[...]] 1017 | ``` 1018 | 1019 | ```{r include = FALSE} 1020 | location_models$fit[[1]] 1021 | ``` 1022 | 1023 | ### 1024 | 1025 | We use a **purrr** function here to map through our models, but there is an easier, better approach to fit workflow sets that will be introduced in later tutorials. 1026 | 1027 | ### 1028 | 1029 | Great Job! You now know how to create multiple workflows and put them in a workflow set. You also know how to extract these sets and analyze them based on the model of the chosen workflow set. 1030 | 1031 | ## Evaluatin the Test Set 1032 | ### 1033 | 1034 | Let’s say that we’ve concluded our model development and have settled on a final model. There is a convenience function called `last_fit()` that will fit the model to the entire training set and evaluate it with the testing set. 1035 | 1036 | ### Exercise 1 1037 | 1038 | Enter `last_fit()` and add the parameter `lm_wflow`. Hit "Run Code." (Note: This will throw an error.) 1039 | 1040 | ```{r evaluatin-the-test-s-1, exercise = TRUE} 1041 | 1042 | ``` 1043 | 1044 | ```{r evaluatin-the-test-s-1-hint, eval = FALSE} 1045 | last_fit(...) 
1046 | ``` 1047 | 1048 | ```{r, include = FALSE} 1049 | #last_fit(lm_wflow) 1050 | ``` 1051 | 1052 | ### 1053 | 1054 | The `last_fit()` function is used to fit a model on the last split of a resampled data set, typically obtained through cross-validation or bootstrapping. It is useful when you want to use the final model trained on the entire training dataset for making predictions on new, unseen data. 1055 | 1056 | ### Exercise 2 1057 | 1058 | We always need to a have split for `last_fit()`. Add the parameter `ames_split` to the function and set the whole expression to `final_lm_res`. Print `final_lm_res` on the next line to see the output. 1059 | 1060 | ```{r evaluatin-the-test-s-2, exercise = TRUE} 1061 | 1062 | ``` 1063 | 1064 | 1065 | 1066 | ```{r evaluatin-the-test-s-2-hint, eval = FALSE} 1067 | final_lm_res <- last_fit(lm_wflow, ...) 1068 | ``` 1069 | 1070 | ```{r, include = FALSE} 1071 | final_lm_res <- last_fit(lm_wflow, ames_split) 1072 | ``` 1073 | 1074 | ### 1075 | 1076 | The .workflow column contains the fitted workflow and can be pulled out of the results using `extract_workflow()`. 1077 | 1078 | ### Exercise 3 1079 | 1080 | Use `extract_workflow()` and add the parameter `final_lm_res`. Hit "Run Code". 1081 | 1082 | ```{r evaluatin-the-test-s-3, exercise = TRUE} 1083 | 1084 | ``` 1085 | 1086 | ```{r evaluatin-the-test-s-3-hint, eval = FALSE} 1087 | extract_workflow(...) 1088 | ``` 1089 | 1090 | ```{r, include = FALSE} 1091 | extract_workflow(final_lm_res) 1092 | ``` 1093 | 1094 | ### 1095 | 1096 | `collect_metrics()` and `collect_predictions()` provide access to the performance metrics and predictions, respectively. The `collect_metrics()` function is a lovely way to extract model performance metrics with resampling. `collect_predictions()` can summarize the various results over replicate out-of-sample predictions. 
1097 | 1098 | ### Exercise 4 1099 | 1100 | Run `collect_metrics()` and `collect_predictions()`, on separate lines, with the parameter being `final_lm_res`. Set the expressions equal to `c_mtrcs` and `c_predic`, respectively. Print these two functions on the next two consecutive lines. 1101 | 1102 | ```{r evaluatin-the-test-s-4, exercise = TRUE} 1103 | 1104 | ``` 1105 | 1106 | 1107 | 1108 | ```{r evaluatin-the-test-s-4-hint, eval = FALSE} 1109 | c_mtrcs <- collect_metrics(...) 1110 | c_predic <- collect_predictions(...) 1111 | ``` 1112 | 1113 | ```{r, include = FALSE} 1114 | c_mtrcs <- collect_metrics(final_lm_res) 1115 | c_predic <- collect_predictions(final_lm_res) 1116 | ``` 1117 | 1118 | ### 1119 | 1120 | Statistical metrics are used to describe the distribution of data, compare groups, assess relationships between variables, and draw conclusions from data.The model takes the predictor variables from the test data and generates predictions for the outcome variable. For example, in linear regression, the model estimates the response variable based on the values of the predictor variables. 1121 | 1122 | ### Exercise 5 1123 | 1124 | Finally, lets `slice()` the predictions output, as it is too many unnecessary rows that we need to analyze at once. Copy the previous code and slice the first 5 rows by adding the parameter `1:5` to `slice()`. Print them out on the next lines. 1125 | 1126 | ```{r evaluatin-the-test-s-5, exercise = TRUE} 1127 | 1128 | ``` 1129 | 1130 | 1131 | 1132 | ```{r evaluatin-the-test-s-5-hint, eval = FALSE} 1133 | c_predic <- 1134 | collect_predictions(final_lm_res) |> 1135 | slice(...) 1136 | ``` 1137 | 1138 | ```{r, include = FALSE} 1139 | c_predic <- 1140 | collect_predictions(final_lm_res) |> 1141 | slice(1:5) 1142 | ``` 1143 | 1144 | ### 1145 | 1146 | Great Job! You now know how to evaluate a testing set by using `last_fit()` and statistical metrics and predictions using the `collect_metrics()` and `collect_predictions()`. 
1147 | 1148 | 1149 | ## Summary 1150 | ### 1151 | 1152 | This tutorial covers [Chapter 7: A Model Workflow](https://www.tmwr.org/workflows.html) from [*Tidy Modeling with R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. In the previous chapter, we discussed the [**parsnip**](https://parsnip.tidymodels.org/) package, which can be used to define and fit the model. This chapter introduced a new concept called a model workflow. The purpose of this concept (and the corresponding **tidymodels** `workflow()` object) encapsulated the major pieces of the modeling process. 1153 | 1154 | 1155 | ```{r download-answers, child = system.file("child_documents/download_answers.Rmd", package = "tutorial.helpers")} 1156 | ``` 1157 | -------------------------------------------------------------------------------- /inst/tutorials/09-judging-model-effectiveness/tutorial.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Judging Model Effectiveness 3 | author: Pratham Kancherla and David Kane 4 | tutorial: 5 | id: judging-model-effectiveness 6 | output: 7 | learnr::tutorial: 8 | progressive: yes 9 | allow_skip: yes 10 | runtime: shiny_prerendered 11 | description: 'Tutorial for Chapter 9: Judging Model Effectiveness' 12 | --- 13 | 14 | ```{r setup, include = FALSE} 15 | library(learnr) 16 | library(tutorial.helpers) 17 | library(tidyverse) 18 | library(tidymodels) 19 | 20 | tidymodels_prefer() 21 | 22 | knitr::opts_chunk$set(echo = FALSE) 23 | options(tutorial.exercise.timelimit = 60, 24 | tutorial.storage = "local") 25 | 26 | data(ames) 27 | ames <- mutate(ames, Sale_Price = log10(Sale_Price)) 28 | 29 | ames_split <- initial_split(ames, prop = 0.80, strata = Sale_Price) 30 | ames_train <- training(ames_split) 31 | ames_test <- testing(ames_split) 32 | 33 | ames_rec <- 34 | recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 35 | Latitude + Longitude, data = ames_train) %>% 36 | step_log(Gr_Liv_Area, base 
= 10) %>% 37 | step_other(Neighborhood, threshold = 0.01) %>% 38 | step_dummy(all_nominal_predictors()) %>% 39 | step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) %>% 40 | step_ns(Latitude, Longitude, deg_free = 20) 41 | 42 | lm_model <- 43 | linear_reg() |> 44 | set_engine("lm") 45 | 46 | lm_wflow <- 47 | workflow() |> 48 | add_model(lm_model) |> 49 | add_recipe(ames_rec) 50 | 51 | lm_fit <- fit(lm_wflow, ames_train) 52 | 53 | ames_test_res <- predict(lm_fit, new_data = ames_test %>% select(-Sale_Price)) 54 | 55 | ames_test_res <- bind_cols(ames_test_res, ames_test %>% select(Sale_Price)) 56 | 57 | 58 | ames_metrics <- metric_set(rmse, rsq, mae) 59 | 60 | classification_metrics <- metric_set(accuracy, mcc, f_meas) 61 | 62 | two_class_curve <- roc_curve(two_class_example, truth, Class1) 63 | 64 | ``` 65 | 66 | ```{r copy-code-chunk, child = system.file("child_documents/copy_button.Rmd", package = "tutorial.helpers")} 67 | ``` 68 | 69 | ```{r info-section, child = system.file("child_documents/info_section.Rmd", package = "tutorial.helpers")} 70 | ``` 71 | 72 | ## Introduction 73 | ### 74 | 75 | This tutorial covers [Chapter 9: Judging Model Effectiveness](https://www.tmwr.org/compare.html#workflow-set) from [*Tidy Modeling with R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. This tutorial will demonstrate the **yardstick** package, a core **tidymodels** packages with the focus of measuring model performance. Before illustrating syntax, let’s explore whether empirical validation using performance metrics is worthwhile when a model is focused on inference rather than prediction. 76 | 77 | 78 | ## Regression Metrics 79 | ### 80 | 81 | **tidymodels** prediction functions produce tibbles with columns for the predicted values. These columns have consistent names, and the functions in the **yardstick** package that produce performance metrics have consistent interfaces. 82 | 83 | ### Exercise 1 84 | 85 | Load the library **tidyverse** using `library()`. 
86 | 87 | ```{r regression-metrics-1, exercise = TRUE} 88 | 89 | ``` 90 | 91 | ```{r regression-metrics-1-hint-1, eval = FALSE} 92 | library(...) 93 | ``` 94 | 95 | ```{r include = FALSE} 96 | library(tidyverse) 97 | ``` 98 | 99 | ### 100 | 101 | Two common metrics for regression models are the root mean squared error (RMSE) and the coefficient of determination (a.k.a. R2). The former measures accuracy while the latter measures correlation. These are not necessarily the same thing. 102 | 103 | ### Exercise 2 104 | 105 | Now lets create the prediction model using `predict()`. Add the parameter`lm_fit` to `predict()` and the data object, `ames_test` (from the previous tutorials), hit "Run Code". 106 | 107 | ```{r regression-metrics-2, exercise = TRUE} 108 | 109 | ``` 110 | 111 | ```{r regression-metrics-2-hint-1, eval = FALSE} 112 | predict(lm_fit, new_data = ...) 113 | ``` 114 | 115 | ```{r include = FALSE} 116 | predict(lm_fit, new_data = ames_test) 117 | ``` 118 | 119 | ### 120 | 121 | Now we need to add `-Sale_Price` to ignore that column in the data set `ames_test`. 122 | 123 | ### Exercise 3 124 | 125 | Copy the previous code and pipe `select(-Sale_Price)` to the code. Set this expression equal to `ames_test_res`. Hit "Run Code". 126 | 127 | ```{r regression-metrics-3, exercise = TRUE} 128 | 129 | ``` 130 | 131 | 132 | 133 | ```{r regression-metrics-3-hint-1, eval = FALSE} 134 | ames_test_res <- predict(lm_fit, new_data = ames_test |> select(-...)) 135 | ``` 136 | 137 | ```{r include = FALSE} 138 | ames_test_res <- predict(lm_fit, new_data = ames_test |> select(-Sale_Price)) 139 | ``` 140 | 141 | ### 142 | 143 | The `select()` function is part of the **dplyr** package in R, which is widely used for data manipulation tasks. The function allows you to choose or remove specific columns from a data frame or tibble, providing a flexible and straightforward way to work with data. 
144 | 145 | ### Exercise 4 146 | 147 | The predicted numeric outcome from the regression model is named .pred. Let’s match the predicted values with their corresponding observed outcome values using `bind_cols()`. Within the function, add the parameter `ames_test_res` and `ames_test` as the data argument. 148 | 149 | ```{r regression-metrics-4, exercise = TRUE} 150 | 151 | ``` 152 | 153 | ```{r regression-metrics-4-hint-1, eval = FALSE} 154 | bind_cols(ames_test_res, ...) 155 | ``` 156 | 157 | ```{r include = FALSE} 158 | bind_cols(ames_test_res, ames_test) 159 | ``` 160 | 161 | ### 162 | 163 | In R, `bind_cols()` is a function from the **dplyr** package used to combine data frames or tibbles by column-wise binding. It is commonly used to merge multiple data frames horizontally, adding new columns to the resulting data frame. 164 | 165 | ### Exercise 5 166 | 167 | We only want to compare the predicted values to the `Sale_Price` column, which is why we need to only look at that column from the `ames_test` data set. Copy the previous code and within the function, pipe `select(Sale_Price)` after `ames_test`. Set this expression to `ames_test_res`. 168 | 169 | ```{r regression-metrics-5, exercise = TRUE} 170 | 171 | ``` 172 | 173 | 174 | 175 | ```{r regression-metrics-5-hint-1, eval = FALSE} 176 | ames_test_res <- bind_cols(ames_test_res, ames_test |> ...(Sale_Price)) 177 | ``` 178 | 179 | ```{r include = FALSE} 180 | ames_test_res <- bind_cols(ames_test_res, ames_test |> select(Sale_Price)) 181 | ``` 182 | 183 | ### 184 | 185 | Note that both the predicted and observed outcomes are in log-10 units. It is best practice to analyze the predictions on the transformed scale (if one were used) even if the predictions are reported using the original units. 186 | 187 | ### Exercise 6 188 | 189 | Now let's graph the data. Pipe `ggplot()` to `ames_test_res` and hit "Run Code." 
190 | 191 | ```{r regression-metrics-6, exercise = TRUE} 192 | 193 | ``` 194 | 195 | ```{r regression-metrics-6-hint-1, eval = FALSE} 196 | ames_test_res |> 197 | ...() 198 | ``` 199 | 200 | ```{r include = FALSE} 201 | ames_test_res |> 202 | ggplot() 203 | ``` 204 | 205 | ### 206 | 207 | ### Exercise 7 208 | 209 | Copy the previous code and within `aes()`, set `x = Sale_Price` and `y = .pred`. Hit "Run Code". 210 | 211 | ```{r regression-metrics-7, exercise = TRUE} 212 | 213 | ``` 214 | 215 | 216 | 217 | ```{r regression-metrics-7-hint-1, eval = FALSE} 218 | ames_test_res |> 219 | ggplot(aes(x = ..., y = .pred)) 220 | ``` 221 | 222 | ```{r include = FALSE} 223 | ames_test_res |> 224 | ggplot(aes(x = Sale_Price, y = .pred)) 225 | ``` 226 | 227 | ### 228 | 229 | In R, `geom_abline()` is a function from the **ggplot2** package used to add reference lines to a plot created using the `ggplot()` function. These reference lines can be horizontal, vertical, or diagonal, and they are typically used to highlight specific relationships or patterns in the data. 230 | 231 | ### Exercise 8 232 | 233 | We want to add a regression line to the plot by using `geom_abline()`. Copy the previous code and add `geom_abline()`. Set the line type `lty` to `2` and hit "Run Code". 234 | 235 | ```{r regression-metrics-8, exercise = TRUE} 236 | 237 | ``` 238 | 239 | 240 | 241 | ```{r regression-metrics-8-hint-1, eval = FALSE} 242 | ... + 243 | geom_abline(lty = ...) 244 | ``` 245 | 246 | ```{r include = FALSE} 247 | ames_test_res |> 248 | ggplot(aes(x = Sale_Price, y = .pred)) + 249 | geom_abline(lty = 2) 250 | ``` 251 | 252 | ### 253 | 254 | `geom_point()` is a function in the R programming language that is part of the **ggplot2** package. It is used to create scatter plots in data visualization, where individual data points are represented as points on a Cartesian coordinate system. 255 | 256 | ### Exercise 9 257 | 258 | Copy the previous code and add `geom_point()`. Set `alpha = 0.5`. 
Hit "Run Code". 259 | 260 | ```{r regression-metrics-9, exercise = TRUE} 261 | 262 | ``` 263 | 264 | 265 | 266 | ```{r regression-metrics-9-hint-1, eval = FALSE} 267 | ... + 268 | geom_point(... = 0.5) 269 | ``` 270 | 271 | ```{r include = FALSE} 272 | ames_test_res |> 273 | ggplot(aes(x = Sale_Price, y = .pred)) + 274 | geom_abline(lty = 2) + 275 | geom_point(alpha = 0.5) 276 | ``` 277 | 278 | ### 279 | 280 | A model optimized for RMSE has more variability but has relatively uniform accuracy across the range of the outcome. 281 | 282 | ### Exercise 10 283 | 284 | Copy the previous code and add the correct labels to the graph. 285 | 286 | x: "Sale_Price (log10)" 287 | 288 | y: "Predicted Sale Price (log10)" 289 | 290 | ```{r regression-metrics-10, exercise = TRUE} 291 | 292 | ``` 293 | 294 | 295 | 296 | ```{r regression-metrics-10-hint-1, eval = FALSE} 297 | ... + 298 | labs(y = "...", x = "...") 299 | ``` 300 | 301 | ```{r include = FALSE} 302 | ames_test_res |> 303 | ggplot(aes(x = Sale_Price, y = .pred)) + 304 | geom_abline(lty = 2) + 305 | geom_point(alpha = 0.5) + 306 | labs(y = "Predicted Sale Price (log10)", x = "Sale Price (log10)") 307 | ``` 308 | 309 | ### 310 | 311 | For regression models, `coord_obs_pred()` can be used in a **ggplot** to make the x- and y-axes have the same exact scale along with an aspect ratio of one. 312 | 313 | ### Exercise 11 314 | 315 | Copy the previous code and add `coord_obs_pred()`. Also add `theme_classic()` to make the graph look more presentable. Hit "Run Code". 316 | 317 | ```{r regression-metrics-11, exercise = TRUE} 318 | 319 | ``` 320 | 321 | 322 | 323 | ```{r regression-metrics-11-hint-1, eval = FALSE} 324 | ... 
+ 325 | coord_obs_pred() + 326 | theme_classic() 327 | ``` 328 | 329 | ```{r include = FALSE} 330 | ames_test_res |> 331 | ggplot(aes(x = Sale_Price, y = .pred)) + 332 | geom_abline(lty = 2) + 333 | geom_point(alpha = 0.5) + 334 | labs(y = "Predicted Sale Price (log10)", x = "Sale Price (log10)") + 335 | coord_obs_pred() + 336 | theme_classic() 337 | ``` 338 | 339 | ### 340 | 341 | `rmse()` is the square root of the mean of the square of all of the error. RMSE is a good measure of accuracy, but only to compare prediction errors of different models or model configurations for a particular variable and not between variables, as it is scale-dependent. 342 | 343 | ### Exercise 12 344 | 345 | We will be using `rmse()` to compute the Root-Mean Square Error of `ames_test_res`. Pipe `ames_test_res` to `rmse()`. This might throw an error. 346 | 347 | ```{r regression-metrics-12, exercise = TRUE} 348 | 349 | ``` 350 | 351 | ```{r regression-metrics-12-hint-1, eval = FALSE} 352 | ames_test_res |> 353 | rmse() 354 | ``` 355 | 356 | ### Exercise 13 357 | 358 | Copy the previous code. We need to add the parameters `truth` and `estimate`. `truth` is the column with the observed outcome values and `estimate` is the column with the predicted values, therefore, set `truth = Sale_Price` and `estimate = .pred`. 359 | 360 | ```{r regression-metrics-13, exercise = TRUE} 361 | 362 | ``` 363 | 364 | 365 | 366 | ```{r regression-metrics-13-hint-1, eval = FALSE} 367 | ames_test_res |> 368 | rmse(truth = ..., ...= .pred) 369 | ``` 370 | 371 | ```{r include = FALSE} 372 | ames_test_res |> 373 | rmse(truth = Sale_Price, estimate = .pred) 374 | ``` 375 | 376 | ### 377 | 378 | `metric_set()` allows you to combine multiple metric functions together into a new function that calculates all of them at once. 379 | 380 | ### Exercise 14 381 | 382 | Let's create a metric set consisting of these functions: Root-Mean Square Error (rmse), R-Squared (rsq), and Mean Absolute Error (mae). Type `metric_set()` and add `rmse`, `rsq`, and `mae`. 
Set this expression equal to `ames_metrics` using the `<-` operator. 383 | 384 | ```{r regression-metrics-14, exercise = TRUE} 385 | 386 | ``` 387 | 388 | ```{r regression-metrics-14-hint-1, eval = FALSE} 389 | ames_metrics <- metric_set(..., ..., ...) 390 | ``` 391 | 392 | ```{r include = FALSE} 393 | ames_metrics <- metric_set(rmse, rsq, mae) 394 | ``` 395 | 396 | ### 397 | 398 | An inferential model is used primarily to understand relationships, and typically emphasizes the choice (and validity) of probabilistic distributions and other generative qualities that define the model. 399 | 400 | ### Exercise 15 401 | 402 | Now let's use the same parameters as seen in Exercise 14. Use the parameters in the function created in the previous exercise, `ames_metrics`. 403 | 404 | ```{r regression-metrics-15, exercise = TRUE} 405 | 406 | ``` 407 | 408 | 409 | 410 | ```{r regression-metrics-15-hint-1, eval = FALSE} 411 | ames_metrics(ames_test_res, truth = ..., estimate = ...) 412 | ``` 413 | 414 | ```{r include = FALSE} 415 | ames_metrics(ames_test_res, truth = Sale_Price, estimate = .pred) 416 | ``` 417 | 418 | ### 419 | 420 | The root mean squared error and mean absolute error metrics are both on the scale of the outcome (so `log10(Sale_Price)` for our example) and measure the difference between the predicted and observed values. The value for R2 measures the squared correlation between the predicted and observed values, so values closer to one are better. 421 | 422 | ### 423 | 424 | Great job! You now know how to calculate and analyze regression metrics. 425 | 426 | ## Binary Classification Metrics 427 | ### 428 | 429 | In binary classification, we are dealing with problems where the target variable has two classes or categories. Commonly, these classes are denoted as "positive" and "negative." 430 | 431 | ### Exercise 1 432 | 433 | The data set we will be looking at in this section is `two_class_example`. Type `tibble(two_class_example)` to get a sense of what the data looks like. 
434 | 435 | ```{r binay-classification-1, exercise = TRUE} 436 | 437 | ``` 438 | 439 | ```{r binay-classification-1-hint-1, eval = FALSE} 440 | tibble(...) 441 | ``` 442 | 443 | ```{r include = FALSE} 444 | tibble(two_class_example) 445 | ``` 446 | 447 | ### 448 | 449 | The second and third columns are the predicted class probabilities for the test set while predicted are the discrete predictions. 450 | 451 | ### Exercise 2 452 | 453 | A confusion matrix, `conf_mat()`, also known as an error matrix, is a table used to evaluate the performance of a classification model in machine learning. It summarizes the results of a binary classification task by comparing the predicted class labels to the actual class labels in the test data. 454 | 455 | Type `conf_mat()` and add the parameters, `two_class_example`, `truth`, and `predicted`. Hit "Run Code". 456 | 457 | ```{r binay-classification-2, exercise = TRUE} 458 | 459 | ``` 460 | 461 | ```{r binay-classification-2-hint-1, eval = FALSE} 462 | conf_mat(two_class_example, ..., predicted) 463 | ``` 464 | 465 | ```{r include = FALSE} 466 | conf_mat(two_class_example, truth, predicted) 467 | ``` 468 | 469 | ### 470 | 471 | Returns range of summary measures of the forecast accuracy. If *x* is provided, the function measures test set forecast accuracy based on *x-f*. If x is not provided, the function only produces training set accuracy measures of the forecasts based on *f["x"]-fitted(f)*. 472 | 473 | ### Exercise 3 474 | 475 | Copy the previous code and change `conf_mat()` to `accuracy()`. Hit "Run Code". 476 | 477 | ```{r binay-classification-3, exercise = TRUE} 478 | 479 | ``` 480 | 481 | 482 | 483 | ```{r binay-classification-3-hint-1, eval = FALSE} 484 | accuracy(two_class_example, truth, ...) 
485 | ``` 486 | 487 | ```{r include = FALSE} 488 | accuracy(two_class_example, truth, predicted) 489 | ``` 490 | 491 | ### 492 | 493 | The term "MCC" typically refers to the Matthews Correlation Coefficient, which is a metric commonly used to evaluate the performance of binary classification models. The Matthews Correlation Coefficient takes into account true positives, true negatives, false positives, and false negatives and provides a balanced metric even for imbalanced data sets. 494 | 495 | ### Exercise 4 496 | 497 | Copy the previous code and change `accuracy()` to `mcc()`. Hit "Run Code". 498 | 499 | ```{r binay-classification-4, exercise = TRUE} 500 | 501 | ``` 502 | 503 | 504 | 505 | ```{r binay-classification-4-hint-1, eval = FALSE} 506 | mcc(two_class_example, ..., predicted) 507 | ``` 508 | 509 | ```{r include = FALSE} 510 | mcc(two_class_example, truth, predicted) 511 | ``` 512 | 513 | ### 514 | 515 | The term "F-measure" (also known as F1-score) is a commonly used metric in binary classification to evaluate the performance of a model. The F-measure is the harmonic mean of precision and recall, providing a balanced metric that takes both false positives and false negatives into account. 516 | 517 | ### Exercise 5 518 | 519 | Copy the previous code and change the previous function name to `f_meas()`. Hit "Run code". 520 | 521 | ```{r binay-classification-5, exercise = TRUE} 522 | 523 | ``` 524 | 525 | 526 | 527 | ```{r binay-classification-5-hint-1, eval = FALSE} 528 | f_meas(two_class_example, ..., predicted) 529 | ``` 530 | 531 | ```{r include = FALSE} 532 | f_meas(two_class_example, truth, predicted) 533 | ``` 534 | 535 | ### 536 | 537 | The Matthews correlation coefficient and F1 score both summarize the confusion matrix, but compared to `mcc()`, which measures the quality of both positive and negative examples, the `f_meas()` metric emphasizes the positive class, i.e., the event of interest. 
538 | 539 | ### Exercise 6 540 | 541 | Now lets create a metric set of the functions `accuracy`, `mcc`, `f_meas`. Within `metric_set()`, add the parameters `accuracy, mcc, f_meas`. Set this expression equal to `classification_metrics` using the `<-` operator. 542 | 543 | ```{r binay-classification-6, exercise = TRUE} 544 | 545 | ``` 546 | 547 | ```{r binay-classification-6-hint-1, eval = FALSE} 548 | classification_metrics <- ...(accuracy, mcc, f_meas) 549 | ``` 550 | 551 | ```{r include = FALSE} 552 | classification_metrics <- metric_set(accuracy, mcc, f_meas) 553 | ``` 554 | 555 | ### 556 | 557 | There is some heterogeneity in R functions in this regard; some use the first level and others the second to denote the event of interest. We consider it more intuitive that the first level is the most important. 558 | 559 | ### Exercise 7 560 | 561 | Now lets call the method created in the previous exercise using the parameters from Exercise 5. Within `classification_metrics()`, set `truth = truth` and `estimate = predicted` and hit "Run Code". 562 | 563 | ```{r binay-classification-7, exercise = TRUE} 564 | 565 | ``` 566 | 567 | ```{r binay-classification-7-hint-1, eval = FALSE} 568 | classification_metrics(two_class_example, truth, ...) 569 | ``` 570 | 571 | ```{r include = FALSE} 572 | classification_metrics(two_class_example, truth = truth, estimate = predicted) 573 | ``` 574 | 575 | ### 576 | 577 | The second level logic is borne of encoding the outcome as 0/1 (in which case the second value is the event) and unfortunately remains in some packages. However, **tidymodels** (along with many other R packages) require a categorical outcome to be encoded as a factor and, for this reason, the legacy justification for the second level as the event becomes irrelevant. 578 | 579 | ### Exercise 8 580 | 581 | As an example where the second level is the event below. Hit "Run Code". 
582 | 583 | 584 | ```{r binay-classification-8, exercise = TRUE} 585 | f_meas(two_class_example, truth, predicted, event_level = "second") 586 | ``` 587 | 588 | ```{r include = FALSE} 589 | f_meas(two_class_example, truth, predicted, event_level = "second") 590 | ``` 591 | 592 | In this output, the .estimator value of “binary” indicates that the standard formula for binary classes will be used. 593 | 594 | ### Exercise 9 595 | 596 | The term "ROC curve" refers to the Receiver Operating Characteristic curve, which is a graphical representation of the performance of a binary classification model at various classification thresholds. We will use the `roc_curve()` to represent the performance of the binary classification model used in this section. 597 | 598 | Within `roc_curve()`, add the parameters `two_class_example`, `truth`, and `Class1`. Set this expression equal to `two_class_curve` and hit "Run Code". 599 | 600 | ```{r binay-classification-9, exercise = TRUE} 601 | 602 | ``` 603 | 604 | ```{r binay-classification-9-hint-1, eval = FALSE} 605 | two_class_curve <- roc_curve(..., truth, Class1) 606 | ``` 607 | 608 | ```{r include = FALSE} 609 | two_class_curve <- roc_curve(two_class_example, truth, Class1) 610 | ``` 611 | 612 | ### 613 | 614 | The ROC curve plots the True Positive Rate (TPR) against the False Positive Rate (FPR) for different threshold values, and it helps to visualize the trade-off between sensitivity and specificity. 615 | 616 | ### Exercise 10 617 | 618 | The term "ROC AUC" refers to the Area Under the Receiver Operating Characteristic Curve, which is a commonly used metric to evaluate the performance of binary classification models. 619 | 620 | Now lets call the function `roc_auc()` and add the same parameters as the parameters in `roc_curve()` from the previous exercise. 
621 | 622 | ```{r binay-classification-10, exercise = TRUE} 623 | 624 | ``` 625 | 626 | ```{r binay-classification-10-hint-1, eval = FALSE} 627 | roc_auc(two_class_example, truth, ...) 628 | ``` 629 | 630 | ```{r include = FALSE} 631 | roc_auc(two_class_example, truth, Class1) 632 | ``` 633 | 634 | ### 635 | 636 | The ROC AUC provides a single value that represents the overall performance of the model across different classification thresholds. 637 | 638 | ### 639 | 640 | `autoplot()` is a generic function in **ggfortify** that is used to automatically generate visualizations (plots) for various objects or data types. The purpose of `autoplot()` is to provide an easy way to create high-quality, informative plots without having to manually specify all the details. 641 | 642 | ### Exercise 11 643 | 644 | We will be using the `autoplot()` function to graph the roc curve created in the previous exercises. Within `autoplot()`, add the parameter `two_class_curve`. 645 | 646 | ```{r binay-classification-11, exercise = TRUE} 647 | 648 | ``` 649 | 650 | ```{r binay-classification-11-hint-1, eval = FALSE} 651 | autoplot() 652 | ``` 653 | 654 | ```{r include = FALSE} 655 | autoplot(two_class_curve) 656 | ``` 657 | 658 | ### 659 | 660 | Great Job! You now know the basics of binary classification metrics and how to analyze these metrics using functions such as `accuracy()`, `f_meas()`, `roc_curve()`, etc. 661 | 662 | ## Multiclass Classification Metrics 663 | ### 664 | 665 | In multiclass classification, we are dealing with problems where the target variable has more than two classes or categories. Unlike binary classification, where we have true positive, true negative, false positive, and false negative, multiclass classification introduces additional complexity in evaluating the performance of the model. 666 | 667 | ### Exercise 1 668 | 669 | The data set we will be using is `hpc_cv`. Type `tibble(hpc_cv)` to get a sense of how the data looks. 
670 | 671 | ```{r multiclass-classific-1, exercise = TRUE} 672 | 673 | ``` 674 | 675 | ```{r multiclass-classific-1-hint-1, eval = FALSE} 676 | tibble(...) 677 | ``` 678 | 679 | ```{r include = FALSE} 680 | tibble(hpc_cv) 681 | ``` 682 | 683 | ### 684 | 685 | As before, there are factors for the observed and predicted outcomes along with four other columns of predicted probabilities for each class. (These data also include a Resample column. These `hpc_cv` results are for out-of-sample predictions associated with 10-fold cross-validation.) 686 | 687 | ### Exercise 2 688 | 689 | The functions for metrics that use the discrete class predictions are identical to their binary counterparts and the functions we will be using are `accuracy()` and `mcc()`. First, within `accuracy()`, add the parameters `hpc_cv`, `obs`, and `pred`. 690 | 691 | ```{r multiclass-classific-2, exercise = TRUE} 692 | 693 | ``` 694 | 695 | ```{r multiclass-classific-2-hint-1, eval = FALSE} 696 | accuracy(hpc_cv, obs, ...) 697 | ``` 698 | 699 | ```{r include = FALSE} 700 | accuracy(hpc_cv, obs, pred) 701 | ``` 702 | 703 | ### 704 | 705 | The Matthews correlation coefficient (mcc) was originally designed for two classes but has been extended to cases with more class levels. 706 | 707 | ### Exercise 3 708 | 709 | Copy the previous code and switch the function to `mcc()`. 710 | 711 | ```{r multiclass-classific-3, exercise = TRUE} 712 | 713 | ``` 714 | 715 | 716 | 717 | ```{r multiclass-classific-3-hint-1, eval = FALSE} 718 | mcc(hpc_cv, obs, ...) 719 | ``` 720 | 721 | ```{r include = FALSE} 722 | mcc(hpc_cv, obs, pred) 723 | ``` 724 | 725 | ### 726 | 727 | Note that, in these results, a “multiclass” .estimator is listed. Like “binary,” this indicates that the formula for outcomes with three or more class levels was used. 
728 | 729 | ### Exercise 4 730 | 731 | Using sensitivity as an example, the usual two-class calculation is the ratio of the number of correctly predicted events divided by the number of true events. **yardstick** functions can automatically apply these methods via the estimator argument. 732 | 733 | Copy the previous code and switch the function to `sensitivity` and add the parameter `estimator = "macro"` 734 | 735 | ```{r multiclass-classific-4, exercise = TRUE} 736 | 737 | ``` 738 | 739 | 740 | 741 | ```{r multiclass-classific-4-hint-1, eval = FALSE} 742 | sensitivity(hpc_cv, obs, pred, estimator = "...") 743 | ``` 744 | 745 | ```{r include = FALSE} 746 | sensitivity(hpc_cv, obs, pred, estimator = "macro") 747 | ``` 748 | 749 | ### 750 | 751 | "macro" estimator refers to a method of calculating performance metrics that treats all classes equally, regardless of their size or frequency in the dataset. 752 | 753 | ### Exercise 5 754 | 755 | Copy the previous code and switch the `estimator` parameter to `"macro-weighted"`. 756 | 757 | ```{r multiclass-classific-5, exercise = TRUE} 758 | 759 | ``` 760 | 761 | 762 | 763 | ```{r multiclass-classific-5-hint-1, eval = FALSE} 764 | sensitivity(hpc_cv, obs, pred, estimator = "...") 765 | ``` 766 | 767 | ```{r include = FALSE} 768 | sensitivity(hpc_cv, obs, pred, estimator = "macro-weighted") 769 | ``` 770 | 771 | ### 772 | 773 | A "macro-weighted" strategy is a combination of both "macro" and "weighted", where the metrics are first computed separately for each class using the "macro" approach and then weighted by class size to provide a balanced metric that considers both class equality and class size. 774 | 775 | ### Exercise 6 776 | 777 | Copy the previous code and switch the `estimator` parameter to `"micro"`. 
778 | 779 | ```{r multiclass-classific-6, exercise = TRUE} 780 | 781 | ``` 782 | 783 | 784 | 785 | ```{r multiclass-classific-6-hint-1, eval = FALSE} 786 | sensitivity(hpc_cv, obs, pred, estimator = "...") 787 | ``` 788 | 789 | ```{r include = FALSE} 790 | sensitivity(hpc_cv, obs, pred, estimator = "micro") 791 | ``` 792 | 793 | ### 794 | 795 | The term "micro" refers to a method of calculating performance metrics that aggregates the true positives, false positives, and false negatives across all classes and then computes the metrics. 796 | 797 | ### Exercise 7 798 | 799 | Hand and Till (2001) determined a multiclass technique for ROC curves. In this case, all of the class probability columns must be given to the function. Type `roc_auc` and add the parameters `hpc_cv` and `obs`. This will throw an error. 800 | 801 | ```{r multiclass-classific-7, exercise = TRUE} 802 | 803 | ``` 804 | 805 | 806 | 807 | ```{r multiclass-classific-7-hint-1, eval = FALSE} 808 | roc_auc(hpc_cv, ...) 809 | ``` 810 | 811 | ```{r include = FALSE} 812 | # roc_auc(hpc_cv, obs) 813 | ``` 814 | 815 | We need to select at least one item from the data set, which are the columns `VF, F, M, L`. 816 | 817 | ### Exercise 8 818 | 819 | Copy the previous code and add `VF, F, M, L` as parameters to the function `roc_auc`. 820 | 821 | ```{r multiclass-classific-8, exercise = TRUE} 822 | 823 | ``` 824 | 825 | 826 | 827 | ```{r multiclass-classific-8-hint-1, eval = FALSE} 828 | roc_auc(hpc_cv, obs, VF, ..., M, ...) 829 | ``` 830 | 831 | ```{r include = FALSE} 832 | roc_auc(hpc_cv, obs, VF, F, M, L) 833 | ``` 834 | 835 | ### 836 | 837 | Recall that these data have a column for the resampling groups. We haven’t yet discussed resampling in detail, but notice how we can pass a grouped data frame to the metric function to compute the metrics for each group using `group_by()`. 838 | 839 | ### Exercise 9 840 | 841 | Pipe `group_by(Resample)` to `hpc_cv` and hit "Run Code". 
842 | 843 | ```{r multiclass-classific-9, exercise = TRUE} 844 | 845 | ``` 846 | 847 | ```{r multiclass-classific-9-hint-1, eval = FALSE} 848 | hpc_cv |> 849 | group_by(...) 850 | ``` 851 | 852 | ```{r include = FALSE} 853 | hpc_cv |> 854 | group_by(Resample) 855 | ``` 856 | 857 | ### 858 | 859 | In R, the `accuracy()` function is part of the **forecast** package, which is commonly used for time series forecasting and related tasks. The `accuracy()` function is used to compute accuracy measures for a fitted time series forecasting model. 860 | 861 | ### Exercise 10 862 | 863 | Copy the previous code and pipe `accuracy()` with the parameters being `obs, pred`. Hit "Run Code". 864 | 865 | ```{r multiclass-classific-10, exercise = TRUE} 866 | 867 | ``` 868 | 869 | 870 | 871 | ```{r multiclass-classific-10-hint-1, eval = FALSE} 872 | ... |> 873 | accuracy(obs, ...) 874 | ``` 875 | 876 | ```{r include = FALSE} 877 | hpc_cv |> 878 | group_by(Resample) |> 879 | accuracy(obs, pred) 880 | ``` 881 | 882 | ### 883 | 884 | Cohen's Kappa is a statistic that measures the agreement between predicted and actual class labels, considering the agreement that could occur by chance. R offers functions like `kappa2()` from the `vcd` package to calculate Cohen's Kappa. 885 | 886 | ### Exercise 11 887 | 888 | Now we will plot the data to get a better visual understanding of the grouping of the data. Copy the previous code and delete the `accuracy()` function. Instead, pipe `roc_curve()` and add the parameters `obs, VF, F, M, L`. 889 | 890 | ```{r multiclass-classific-11, exercise = TRUE} 891 | 892 | ``` 893 | 894 | 895 | 896 | ```{r multiclass-classific-11-hint-1, eval = FALSE} 897 | ... |> 898 | roc_curve(obs, VF, F, M, ...) 
899 | ``` 900 | 901 | ```{r include = FALSE} 902 | hpc_cv |> 903 | group_by(Resample) |> 904 | roc_curve(obs, VF, F, M, L) 905 | ``` 906 | 907 | ### 908 | 909 | When working with multiclass classification, R provides flexibility to use an One-Versus-All (OvA) approach, where you treat each class as the positive class and the rest as the negative class. The **caret** package's `train()` function allows specifying the `classProbs = TRUE` parameter to enable this approach. 910 | 911 | ### Exercise 12 912 | 913 | Copy the previous code and pipe `autoplot()`. Hit "Run Code." 914 | 915 | ```{r multiclass-classific-12, exercise = TRUE} 916 | 917 | ``` 918 | 919 | 920 | 921 | ```{r multiclass-classific-12-hint-1, eval = FALSE} 922 | hpc_cv |> 923 | group_by(Resample) |> 924 | roc_curve(obs, VF, F, M, L) |> 925 | ...() 926 | ``` 927 | 928 | ```{r include = FALSE} 929 | hpc_cv |> 930 | group_by(Resample) |> 931 | roc_curve(obs, VF, F, M, L) |> 932 | autoplot() 933 | ``` 934 | 935 | ### 936 | 937 | Great job! You now know how to calculate and analyze multiclass classification metrics using methods such as `roc_curve()` and multiclass estimator such as `macro, macro-weighted, and micro". 938 | 939 | 940 | 941 | ## Summary 942 | ### 943 | 944 | This tutorial covered [Chapter 9: Judging Model Effectiveness](https://www.tmwr.org/compare.html#workflow-set) from [*Tidy Modeling with R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. This tutorial demonstrated the **yardstick** package, a core **tidymodels** packages with the focus of measuring model performance. Before illustrating syntax, we explored whether empirical validation using performance metrics is worthwhile when a model is focused on inference rather than prediction. Empirical validation can provide valuable insights into the model's goodness of fit and reliability. However, it's essential to keep in mind that the choice of performance metrics may differ from those commonly used in prediction models. 
Metrics like R-squared, which are popular for predictive models, may not be as informative for inferential models. 945 | 946 | ```{r download-answers, child = system.file("child_documents/download_answers.Rmd", package = "tutorial.helpers")} 947 | ``` 948 | -------------------------------------------------------------------------------- /inst/tutorials/10-resampling/images/fig-ten-point-eight.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/10-resampling/images/fig-ten-point-eight.png -------------------------------------------------------------------------------- /inst/tutorials/10-resampling/images/fig-ten-point-five.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/10-resampling/images/fig-ten-point-five.png -------------------------------------------------------------------------------- /inst/tutorials/10-resampling/images/fig-ten-point-one.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/10-resampling/images/fig-ten-point-one.png -------------------------------------------------------------------------------- /inst/tutorials/10-resampling/images/fig-ten-point-seven.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/10-resampling/images/fig-ten-point-seven.png -------------------------------------------------------------------------------- /inst/tutorials/10-resampling/images/fig-ten-point-six.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/10-resampling/images/fig-ten-point-six.png -------------------------------------------------------------------------------- /inst/tutorials/10-resampling/images/fig-ten-point-three.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/10-resampling/images/fig-ten-point-three.png -------------------------------------------------------------------------------- /inst/tutorials/10-resampling/images/fig-ten-point-two.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/10-resampling/images/fig-ten-point-two.png -------------------------------------------------------------------------------- /inst/tutorials/11-comparing-models/data/linear-statistical-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/11-comparing-models/data/linear-statistical-model.png -------------------------------------------------------------------------------- /inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic1.png -------------------------------------------------------------------------------- /inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic2.png -------------------------------------------------------------------------------- /inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic3.png -------------------------------------------------------------------------------- /inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic4.png -------------------------------------------------------------------------------- /inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic5.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic1.png -------------------------------------------------------------------------------- 
/inst/tutorials/14-iterative-search/images/pic10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic10.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic2.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic3.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic4.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic5.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic6.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic6.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic7.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic8.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic9.png -------------------------------------------------------------------------------- /inst/tutorials/16-dimensionality-reduction/images/pic1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/16-dimensionality-reduction/images/pic1.png -------------------------------------------------------------------------------- /inst/tutorials/16-dimensionality-reduction/images/pic2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/16-dimensionality-reduction/images/pic2.png -------------------------------------------------------------------------------- /inst/tutorials/16-dimensionality-reduction/images/pic3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/16-dimensionality-reduction/images/pic3.png -------------------------------------------------------------------------------- /inst/tutorials/16-dimensionality-reduction/images/pic4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/16-dimensionality-reduction/images/pic4.png -------------------------------------------------------------------------------- /inst/tutorials/16-dimensionality-reduction/images/pic5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/16-dimensionality-reduction/images/pic5.png -------------------------------------------------------------------------------- /inst/tutorials/18-explaining-models-and-predictions/images/pic1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/18-explaining-models-and-predictions/images/pic1.png -------------------------------------------------------------------------------- /inst/tutorials/18-explaining-models-and-predictions/tutorial.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Explaining Models and Predictions 3 
| author: Aryan Kancherla 4 | tutorial: 5 | id: explaining-models-and-predictions 6 | output: 7 | learnr::tutorial: 8 | progressive: yes 9 | allow_skip: yes 10 | runtime: shiny_prerendered 11 | description: 'Tutorial for Chapter 18: Explaining Models and Predictions' 12 | --- 13 | 14 | ```{r setup, include = FALSE} 15 | library(learnr) 16 | library(tutorial.helpers) 17 | library(knitr) 18 | 19 | library(tidymodels) 20 | library(DALEXtra) 21 | library(forcats) 22 | 23 | tidymodels_prefer() 24 | 25 | 26 | knitr::opts_chunk$set(echo = FALSE) 27 | options(tutorial.exercise.timelimit = 60, 28 | tutorial.storage = "local") 29 | 30 | ames_update <- ames |> 31 | mutate(Sale_Price = log10(Sale_Price)) 32 | 33 | set.seed(502) 34 | ames_strata_split <- initial_split(ames_update, prop = 0.80, strata = Sale_Price) 35 | ames_train <- training(ames_strata_split) 36 | 37 | ames_rec <- 38 | recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 39 | Latitude + Longitude, data = ames_train) |> 40 | step_log(Gr_Liv_Area, base = 10) |> 41 | step_other(Neighborhood, threshold = 0.01) |> 42 | step_dummy(all_nominal_predictors()) |> 43 | step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) |> 44 | step_ns(Latitude, Longitude, deg_free = 20) 45 | 46 | lm_model <- linear_reg() |> set_engine("lm") 47 | 48 | lm_wflow <- 49 | workflow() |> 50 | add_model(lm_model) |> 51 | add_recipe(ames_rec) 52 | 53 | lm_fit <- fit(lm_wflow, ames_train) 54 | 55 | vip_features <- c("Neighborhood", "Gr_Liv_Area", "Year_Built", "Bldg_Type", "Latitude", "Longitude") 56 | 57 | vip_train <- 58 | ames_train |> 59 | select(all_of(vip_features)) 60 | 61 | explainer_lm <- 62 | explain_tidymodels( 63 | lm_fit, 64 | data = vip_train, 65 | y = ames_train$Sale_Price, 66 | label = "lm + interactions", 67 | verbose = FALSE 68 | ) 69 | 70 | rf_model <- 71 | rand_forest(trees = 1000) |> 72 | set_engine("ranger") |> 73 | set_mode("regression") 74 | 75 | rf_wflow <- 76 | workflow() |> 77 | add_formula( 78 
| Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 79 | Latitude + Longitude) |> 80 | add_model(rf_model) 81 | 82 | rf_fit <- rf_wflow |> fit(data = ames_train) 83 | 84 | explainer_rf <- 85 | explain_tidymodels( 86 | rf_fit, 87 | data = vip_train, 88 | y = ames_train$Sale_Price, 89 | label = "random forest", 90 | verbose = FALSE 91 | ) 92 | 93 | ``` 94 | 95 | ```{r copy-code-chunk, child = system.file("child_documents/copy_button.Rmd", package = "tutorial.helpers")} 96 | ``` 97 | 98 | ```{r info-section, child = system.file("child_documents/info_section.Rmd", package = "tutorial.helpers")} 99 | ``` 100 | 101 | ## Introduction 102 | ### 103 | 104 | 105 | 106 | ## Software for Model Explanations 107 | ### 108 | 109 | In Section [1.2](https://www.tmwr.org/software-modeling#model-types) of Chapter [1](https://www.tmwr.org/software-modeling), a taxonomy of models was outlined, suggesting that models typically are built as one or more of descriptive, inferential, or predictive. The chapter suggested that model performance, as measured by appropriate metrics (like RMSE for regression or area under the ROC curve for classification), can be important for all modeling applications. Similarly, model explanations, answering *why* a model makes the predictions it does, can be important whether the purpose of your model is largely descriptive, to test a hypothesis, or to make a prediction. 110 | 111 | ### Exercise 1 112 | 113 | Load the **DALEXtra** library using `library()`. 114 | 115 | ```{r software-for-model-e-1, exercise = TRUE} 116 | 117 | ``` 118 | 119 | ```{r software-for-model-e-1-hint-1, eval = FALSE} 120 | library(...) 121 | ``` 122 | 123 | ```{r include = FALSE} 124 | library(DALEXtra) 125 | ``` 126 | 127 | ### 128 | 129 | The tidymodels framework does not itself contain software for model explanations. 
Instead, models trained and evaluated with tidymodels can be explained with other, supplementary software in R packages such as **lime**, **vip**, and **DALEX**. 130 | 131 | **DALEXtra**, which is an add-on package for **DALEX**, provides support for tidymodels. 132 | 133 | ### Exercise 2 134 | 135 | In Chapters [10](https://www.tmwr.org/resampling) and [11](https://www.tmwr.org/compare), several models were trained and compared to predict the price of homes in Ames, IA, including a linear model with interactions and a random forest model, with the results shown below: 136 | 137 | ```{r} 138 | knitr::include_graphics("images/pic1.png") 139 | ``` 140 | 141 | ### 142 | 143 | **vip** functions are chosen for *model-based* methods that take advantage of model structure (and are often faster) 144 | **DALEX** functions are chosen for *model-agnostic* methods that can be applied to any model 145 | 146 | ### Exercise 3 147 | 148 | Let’s build model-agnostic explainers for both of these models (see the graph from the previous exercise) to find out why they make these predictions. 149 | 150 | In the code chunk below, create a vector that contains `"Neighborhood"`, `"Gr_Liv_Area"`, `"Year_Built"`, `"Bldg_Type"`, `"Latitude"`, and `"Longitude"`. 151 | 152 | ```{r software-for-model-e-3, exercise = TRUE} 153 | 154 | ``` 155 | 156 | ```{r software-for-model-e-3-hint-1, eval = FALSE} 157 | c("...", "...", "...", "Bldg_Type", "Latitude", "Longitude") 158 | ``` 159 | 160 | ```{r include = FALSE} 161 | c("Neighborhood", "Gr_Liv_Area", "Year_Built", "Bldg_Type", "Latitude", "Longitude") 162 | ``` 163 | 164 | ### 165 | 166 | Answering the question “why?” allows modeling practitioners to understand which features were important in predictions and even how model predictions would change under different values for the features. 167 | 168 | 169 | ### Exercise 4 170 | 171 | Copy the previous code and assign it to a new variable named `vip_features`. 
172 | 173 | ```{r software-for-model-e-4, exercise = TRUE} 174 | 175 | ``` 176 | 177 | 178 | 179 | ```{r software-for-model-e-4-hint-1, eval = FALSE} 180 | ... <- c("Neighborhood", "Gr_Liv_Area", "Year_Built", "Bldg_Type", "Latitude", "Longitude") 181 | ``` 182 | 183 | ```{r include = FALSE} 184 | vip_features <- c("Neighborhood", "Gr_Liv_Area", "Year_Built", "Bldg_Type", "Latitude", "Longitude") 185 | ``` 186 | 187 | ### 188 | 189 | For some models, like linear regression, it is usually clear how to explain why the model makes its predictions. The structure of a linear model contains coefficients for each predictor that are typically straightforward to interpret. 190 | 191 | ### Exercise 5 192 | 193 | Load the **tidymodels** package using `library()`. Then, on a new line, type in `tidymodels_prefer()` to get rid of naming conflicts. 194 | 195 | ```{r software-for-model-e-5, exercise = TRUE} 196 | 197 | ``` 198 | 199 | ```{r software-for-model-e-5-hint-1, eval = FALSE} 200 | library(...) 201 | tidymodels_prefer() 202 | ``` 203 | 204 | ```{r include = FALSE} 205 | library(tidymodels) 206 | tidymodels_prefer() 207 | ``` 208 | 209 | ### 210 | 211 | As a reminder, the `ames` data set comes from the **modeldata** package, which is loaded when you load the **tidymodels** package. 212 | 213 | ### Exercise 6 214 | 215 | Since the models from the graph in Exercise 2 use the Ames data set, the code for the splits and recipes is needed. Press "Run code". 
216 | 217 | ```{r software-for-model-e-6, exercise = TRUE} 218 | ames_update <- ames |> 219 | mutate(Sale_Price = log10(Sale_Price)) 220 | 221 | set.seed(502) 222 | ames_strata_split <- initial_split(ames_update, prop = 0.80, strata = Sale_Price) 223 | ames_train <- training(ames_strata_split) 224 | 225 | ames_rec <- 226 | recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 227 | Latitude + Longitude, data = ames_train) |> 228 | step_log(Gr_Liv_Area, base = 10) |> 229 | step_other(Neighborhood, threshold = 0.01) |> 230 | step_dummy(all_nominal_predictors()) |> 231 | step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) |> 232 | step_ns(Latitude, Longitude, deg_free = 20) 233 | 234 | lm_model <- linear_reg() |> set_engine("lm") 235 | 236 | lm_wflow <- 237 | workflow() |> 238 | add_model(lm_model) |> 239 | add_recipe(ames_rec) 240 | 241 | lm_fit <- fit(lm_wflow, ames_train) 242 | ``` 243 | 244 | ```{r include = FALSE} 245 | ames_update <- ames |> 246 | mutate(Sale_Price = log10(Sale_Price)) 247 | 248 | set.seed(502) 249 | ames_strata_split <- initial_split(ames_update, prop = 0.80, strata = Sale_Price) 250 | ames_train <- training(ames_strata_split) 251 | 252 | ames_rec <- 253 | recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 254 | Latitude + Longitude, data = ames_train) |> 255 | step_log(Gr_Liv_Area, base = 10) |> 256 | step_other(Neighborhood, threshold = 0.01) |> 257 | step_dummy(all_nominal_predictors()) |> 258 | step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) |> 259 | step_ns(Latitude, Longitude, deg_free = 20) 260 | 261 | lm_model <- linear_reg() |> set_engine("lm") 262 | 263 | lm_wflow <- 264 | workflow() |> 265 | add_model(lm_model) |> 266 | add_recipe(ames_rec) 267 | 268 | lm_fit <- fit(lm_wflow, ames_train) 269 | ``` 270 | 271 | ### 272 | 273 | These are the variables you coded in the past tutorials for the `ames` data set. 
See the "Feature Engineering with recipes" tutorial or Chapter [8](https://www.tmwr.org/recipes) to review this. 274 | 275 | ### Exercise 7 276 | 277 | In the code chunk below, pipe `ames_train` to `select()`. Inside this function, type `all_of()`. Inside `all_of()`, type in `vip_features`. 278 | 279 | ```{r software-for-model-e-7, exercise = TRUE} 280 | 281 | ``` 282 | 283 | ```{r software-for-model-e-7-hint-1, eval = FALSE} 284 | ... |> 285 | select(all_of(...)) 286 | ``` 287 | 288 | ```{r include = FALSE} 289 | ames_train |> 290 | select(all_of(vip_features)) 291 | ``` 292 | 293 | ### 294 | 295 | `all_of()` is a function that selects variables from character vectors. 296 | 297 | ### Exercise 8 298 | 299 | Copy the previous code and assign it to a new variable named `vip_train`. 300 | 301 | ```{r software-for-model-e-8, exercise = TRUE} 302 | 303 | ``` 304 | 305 | 306 | 307 | ```{r software-for-model-e-8-hint-1, eval = FALSE} 308 | ... <- 309 | ames_train |> 310 | select(all_of(vip_features)) 311 | ``` 312 | 313 | ```{r include = FALSE} 314 | vip_train <- 315 | ames_train |> 316 | select(all_of(vip_features)) 317 | ``` 318 | 319 | ### 320 | 321 | Przemyslaw Biecek and Tomasz Burzykowski's [*Explanatory Model Analysis*](https://ema.drwhy.ai/) book provides a thorough exploration of how to use **DALEX** for model explanations. 322 | 323 | ### Exercise 9 324 | 325 | Now, let's generate some information about the model. In the code chunk below, type in `explain_tidymodels()`. Inside this function, type in `lm_fit`, set `data` to `vip_train`, and set `y` to `ames_train$Sale_Price`. 326 | 327 | ```{r software-for-model-e-9, exercise = TRUE} 328 | 329 | ``` 330 | 331 | ```{r software-for-model-e-9-hint-1, eval = FALSE} 332 | explain_tidymodels( 333 | ..., 334 | data = vip_train, 335 | y = ...$... 
336 | ) 337 | ``` 338 | 339 | ```{r include = FALSE} 340 | explain_tidymodels( 341 | lm_fit, 342 | data = vip_train, 343 | y = ames_train$Sale_Price 344 | ) 345 | ``` 346 | 347 | ### 348 | 349 | `explain_tidymodels()` is a function (from the **DALEXtra** package) that creates an explainer from your tidymodels workflow. In this scenario, the function is being used for the linear model `lm_fit`. 350 | 351 | ### Exercise 10 352 | 353 | Copy the previous code. Inside `explain_tidymodels()`, set `label` to `"lm + interactions"` and `verbose` to `FALSE`. 354 | 355 | ```{r software-for-model-e-10, exercise = TRUE} 356 | 357 | ``` 358 | 359 | 360 | 361 | ```{r software-for-model-e-10-hint-1, eval = FALSE} 362 | explain_tidymodels( 363 | lm_fit, 364 | data = vip_train, 365 | y = ames_train$Sale_Price, 366 | ... = "lm + interactions", 367 | verbose = ... 368 | ) 369 | ``` 370 | 371 | ```{r include = FALSE} 372 | explain_tidymodels( 373 | lm_fit, 374 | data = vip_train, 375 | y = ames_train$Sale_Price, 376 | label = "lm + interactions", 377 | verbose = FALSE 378 | ) 379 | ``` 380 | 381 | ### 382 | 383 | For other models, like random forests that can capture nonlinear behavior by design, it is less transparent how to explain the model’s predictions from only the structure of the model itself. Instead, we can apply model explainer algorithms to generate understanding of predictions. 384 | 385 | ### Exercise 11 386 | 387 | Copy the previous code and assign it to a new variable named `explainer_lm`. 388 | 389 | ```{r software-for-model-e-11, exercise = TRUE} 390 | 391 | ``` 392 | 393 | 394 | 395 | ```{r software-for-model-e-11-hint-1, eval = FALSE} 396 | ... 
<- 397 | explain_tidymodels( 398 | lm_fit, 399 | data = vip_train, 400 | y = ames_train$Sale_Price, 401 | label = "lm + interactions", 402 | verbose = FALSE 403 | ) 404 | ``` 405 | 406 | ```{r include = FALSE} 407 | explainer_lm <- 408 | explain_tidymodels( 409 | lm_fit, 410 | data = vip_train, 411 | y = ames_train$Sale_Price, 412 | label = "lm + interactions", 413 | verbose = FALSE 414 | ) 415 | ``` 416 | 417 | ### 418 | 419 | Click [here](https://search.r-project.org/CRAN/refmans/DALEXtra/html/explain_tidymodels.html) to learn more about the `explain_tidymodels()` function. 420 | 421 | ### Exercise 12 422 | 423 | Press "Run code". 424 | 425 | ```{r software-for-model-e-12, exercise = TRUE} 426 | rf_model <- 427 | rand_forest(trees = 1000) |> 428 | set_engine("ranger") |> 429 | set_mode("regression") 430 | 431 | rf_wflow <- 432 | workflow() |> 433 | add_formula( 434 | Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 435 | Latitude + Longitude) |> 436 | add_model(rf_model) 437 | 438 | rf_fit <- rf_wflow |> fit(data = ames_train) 439 | ``` 440 | 441 | ```{r include = FALSE} 442 | rf_model <- 443 | rand_forest(trees = 1000) |> 444 | set_engine("ranger") |> 445 | set_mode("regression") 446 | 447 | rf_wflow <- 448 | workflow() |> 449 | add_formula( 450 | Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 451 | Latitude + Longitude) |> 452 | add_model(rf_model) 453 | 454 | rf_fit <- rf_wflow |> fit(data = ames_train) 455 | ``` 456 | 457 | ### 458 | 459 | These were the variables you created in the "Resampling for Evaluating Performance" tutorial. `rf_model` is a random forest model that has `1000` trees. Then, this model is used to create a random forest workflow, adding `Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + Latitude + Longitude` as the formula. Then, this model is fitted, with `data` being `ames_train`. 460 | 461 | Visit Chapter [10](https://www.tmwr.org/resampling) to review this process. 
462 | 463 | ### Exercise 13 464 | 465 | In the code chunk below, type in `explain_tidymodels()`. Inside this function, type in `rf_fit`, set `data` to `vip_train`, and set `y` to `ames_train$Sale_Price`. 466 | 467 | ```{r software-for-model-e-13, exercise = TRUE} 468 | 469 | ``` 470 | 471 | ```{r software-for-model-e-13-hint-1, eval = FALSE} 472 | explain_tidymodels( 473 | ..., 474 | data = ..., 475 | ... = ames_train$Sale_Price 476 | ) 477 | ``` 478 | 479 | ```{r include = FALSE} 480 | explain_tidymodels( 481 | rf_fit, 482 | data = vip_train, 483 | y = ames_train$Sale_Price 484 | ) 485 | ``` 486 | 487 | ### 488 | 489 | There are two types of model explanations, *global* and *local.* Global model explanations provide an overall understanding aggregated over a whole set of observations; local model explanations provide information about a prediction for a single observation. 490 | 491 | ### Exercise 14 492 | 493 | Copy the previous code. Inside `explain_tidymodels()`, set `label` to `"random forest"` and set `verbose` to `FALSE`. 494 | 495 | ```{r software-for-model-e-14, exercise = TRUE} 496 | 497 | ``` 498 | 499 | 500 | 501 | ```{r software-for-model-e-14-hint-1, eval = FALSE} 502 | explain_tidymodels( 503 | rf_fit, 504 | data = vip_train, 505 | y = ames_train$Sale_Price, 506 | label = "...", 507 | ... = FALSE 508 | ) 509 | ``` 510 | 511 | ```{r include = FALSE} 512 | explain_tidymodels( 513 | rf_fit, 514 | data = vip_train, 515 | y = ames_train$Sale_Price, 516 | label = "random forest", 517 | verbose = FALSE 518 | ) 519 | ``` 520 | 521 | ### 522 | 523 | A linear model is typically straightforward to interpret and explain; you may not often find yourself using separate model explanation algorithms for a linear model. However, it can sometimes be difficult to understand or explain the predictions of even a linear model once it has splines and interaction terms! 
524 | 525 | ### Exercise 15 526 | 527 | Copy the previous code and assign it to a new variable named `explainer_rf`. 528 | 529 | ```{r software-for-model-e-15, exercise = TRUE} 530 | 531 | ``` 532 | 533 | 534 | 535 | ```{r software-for-model-e-15-hint-1, eval = FALSE} 536 | ... <- 537 | explain_tidymodels( 538 | rf_fit, 539 | data = vip_train, 540 | y = ames_train$Sale_Price, 541 | label = "random forest", 542 | verbose = FALSE 543 | ) 544 | ``` 545 | 546 | ```{r include = FALSE} 547 | explainer_rf <- 548 | explain_tidymodels( 549 | rf_fit, 550 | data = vip_train, 551 | y = ames_train$Sale_Price, 552 | label = "random forest", 553 | verbose = FALSE 554 | ) 555 | ``` 556 | 557 | ### 558 | 559 | ## Summary 560 | ### 561 | 562 | 563 | 564 | ```{r download-answers, child = system.file("child_documents/download_answers.Rmd", package = "tutorial.helpers")} 565 | ``` 566 | -------------------------------------------------------------------------------- /man/figures/README-pressure-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/man/figures/README-pressure-1.png -------------------------------------------------------------------------------- /man/tidymodels.tutorials-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tidymodels.tutorials-package.R 3 | \docType{package} 4 | \name{tidymodels.tutorials-package} 5 | \alias{tidymodels.tutorials} 6 | \alias{tidymodels.tutorials-package} 7 | \title{tidymodels.tutorials: Tutorials for Tidy Modeling with R} 8 | \description{ 9 | This package provides tutorials for Tidy Modeling with R by Max Kuhn and Julia Silge. In an ideal world, students would read the book and type in all the associated R commands themselves. Sadly, that often does not happen. 
These tutorials allow students to demonstrate (and their instructors to be sure) that all work has been completed. See the tutorial.helpers package for a background discussion of the tool and approach. 10 | } 11 | \author{ 12 | \strong{Maintainer}: David Kane \email{dave.kane@gmail.com} (\href{https://orcid.org/0000-0002-6660-3934}{ORCID}) [copyright holder] 13 | 14 | } 15 | \keyword{internal} 16 | -------------------------------------------------------------------------------- /renv/.gitignore: -------------------------------------------------------------------------------- 1 | library/ 2 | local/ 3 | cellar/ 4 | lock/ 5 | python/ 6 | sandbox/ 7 | staging/ 8 | -------------------------------------------------------------------------------- /renv/activate.R: -------------------------------------------------------------------------------- 1 | 2 | local({ 3 | 4 | # the requested version of renv 5 | version <- "1.0.0" 6 | attr(version, "sha") <- NULL 7 | 8 | # the project directory 9 | project <- getwd() 10 | 11 | # figure out whether the autoloader is enabled 12 | enabled <- local({ 13 | 14 | # first, check config option 15 | override <- getOption("renv.config.autoloader.enabled") 16 | if (!is.null(override)) 17 | return(override) 18 | 19 | # next, check environment variables 20 | # TODO: prefer using the configuration one in the future 21 | envvars <- c( 22 | "RENV_CONFIG_AUTOLOADER_ENABLED", 23 | "RENV_AUTOLOADER_ENABLED", 24 | "RENV_ACTIVATE_PROJECT" 25 | ) 26 | 27 | for (envvar in envvars) { 28 | envval <- Sys.getenv(envvar, unset = NA) 29 | if (!is.na(envval)) 30 | return(tolower(envval) %in% c("true", "t", "1")) 31 | } 32 | 33 | # enable by default 34 | TRUE 35 | 36 | }) 37 | 38 | if (!enabled) 39 | return(FALSE) 40 | 41 | # avoid recursion 42 | if (identical(getOption("renv.autoloader.running"), TRUE)) { 43 | warning("ignoring recursive attempt to run renv autoloader") 44 | return(invisible(TRUE)) 45 | } 46 | 47 | # signal that we're loading renv during R 
# mark the renv autoloader as running for the duration of startup
options(renv.autoloader.running = TRUE)
on.exit(options(renv.autoloader.running = NULL), add = TRUE)

# signal that we've consented to use renv
options(renv.consent = TRUE)

# load the 'utils' package eagerly -- this ensures that renv shims, which
# mask 'utils' packages, will come first on the search path
library(utils, lib.loc = .Library)

# unload renv if it's already been loaded
if ("renv" %in% loadedNamespaces())
  unloadNamespace("renv")

# load bootstrap tools

# null-coalescing helper: return 'y' when 'x' is NULL, otherwise 'x'
`%||%` <- function(x, y) {
  if (is.null(x)) y else x
}

# printf-style wrapper over cat(); silenced entirely when the
# 'renv.bootstrap.quiet' option is set. Invisibly returns the message.
catf <- function(fmt, ..., appendLF = TRUE) {

  quiet <- getOption("renv.bootstrap.quiet", default = FALSE)
  if (quiet)
    return(invisible())

  msg <- sprintf(fmt, ...)
  cat(msg, file = stdout(), sep = if (appendLF) "\n" else "")

  invisible(msg)

}

# build a section header of the form "<prefix> <label> ----", padded with
# 'suffix' characters up to 'n' columns
header <- function(label,
                   ...,
                   prefix = "#",
                   suffix = "-",
                   n = min(getOption("width"), 78))
{
  label <- sprintf(label, ...)
  n <- max(n - nchar(label) - nchar(prefix) - 2L, 8L)
  if (n <= 0)
    return(paste(prefix, label))

  tail <- paste(rep.int(suffix, n), collapse = "")
  paste0(prefix, " ", label, " ", tail)

}

# does 'string' begin with 'prefix'? (vectorized over 'string')
startswith <- function(string, prefix) {
  substring(string, 1, nchar(prefix)) == prefix
}

# download the requested version of renv and install it into 'library',
# reporting progress as we go; errors are re-signalled with context
bootstrap <- function(version, library) {

  friendly <- renv_bootstrap_version_friendly(version)
  section <- header(sprintf("Bootstrapping renv %s", friendly))
  catf(section)

  # attempt to download renv
  catf("- Downloading renv ... ", appendLF = FALSE)
  withCallingHandlers(
    tarball <- renv_bootstrap_download(version),
    error = function(err) {
      catf("FAILED")
      stop("failed to download:\n", conditionMessage(err))
    }
  )
  catf("OK")
  on.exit(unlink(tarball), add = TRUE)

  # now attempt to install
  catf("- Installing renv ... ", appendLF = FALSE)
  withCallingHandlers(
    status <- renv_bootstrap_install(version, tarball, library),
    error = function(err) {
      catf("FAILED")
      stop("failed to install:\n", conditionMessage(err))
    }
  )
  catf("OK")

  # add empty line to break up bootstrapping from normal output
  catf("")

  return(invisible())
}

# TRUE when renv's own test suite is driving this process
renv_bootstrap_tests_running <- function() {
  getOption("renv.tests.running", default = FALSE)
}

# resolve the set of package repositories to consult when bootstrapping
# renv, honouring (in order) an explicit override, the project lockfile,
# and the session's 'repos' option with a CRAN fallback appended
renv_bootstrap_repos <- function() {

  # get CRAN repository
  cran <- getOption("renv.repos.cran", "https://cloud.r-project.org")

  # check for repos override
  repos <- Sys.getenv("RENV_CONFIG_REPOS_OVERRIDE", unset = NA)
  if (!is.na(repos)) {

    # check for RSPM; if set, use a fallback repository for renv
    rspm <- Sys.getenv("RSPM", unset = NA)
    if (identical(rspm, repos))
      repos <- c(RSPM = rspm, CRAN = cran)

    return(repos)

  }

  # check for lockfile repositories
  repos <- tryCatch(renv_bootstrap_repos_lockfile(), error = identity)
  if (!inherits(repos, "error") && length(repos))
    return(repos)

  # retrieve current repos
  repos <- getOption("repos")

  # ensure @CRAN@ entries are resolved
  repos[repos == "@CRAN@"] <- cran

  # add in renv.bootstrap.repos if set
  default <- c(FALLBACK = "https://cloud.r-project.org")
  extra <- getOption("renv.bootstrap.repos", default = default)
  repos <- c(repos, extra)

  # remove duplicates that might've snuck in
  dupes <- duplicated(repos) | duplicated(names(repos))
  repos[!dupes]

}

# read the repositories recorded in the project lockfile, if any;
# returns a named character vector of URLs, or NULL on failure
renv_bootstrap_repos_lockfile <- function() {

  lockpath <- Sys.getenv("RENV_PATHS_LOCKFILE", unset = "renv.lock")
  if (!file.exists(lockpath))
    return(NULL)

  lockfile <- tryCatch(renv_json_read(lockpath), error = identity)
  if (inherits(lockfile, "error")) {
    warning(lockfile)
    return(NULL)
  }

  repos <- lockfile$R$Repositories
  if (length(repos) == 0)
    return(NULL)

  keys <- vapply(repos, `[[`, "Name", FUN.VALUE = character(1))
  vals <- vapply(repos, `[[`, "URL", FUN.VALUE = character(1))
  names(vals) <- keys

  return(vals)

}

# try each applicable download method in turn (local tarball, CRAN,
# CRAN archive, or GitHub for dev builds); return the path to the
# downloaded tarball, or stop() if every method failed
renv_bootstrap_download <- function(version) {

  sha <- attr(version, "sha", exact = TRUE)

  methods <- if (!is.null(sha)) {

    # attempting to bootstrap a development version of renv
    c(
      function() renv_bootstrap_download_tarball(sha),
      function() renv_bootstrap_download_github(sha)
    )

  } else {

    # attempting to bootstrap a release version of renv
    c(
      function() renv_bootstrap_download_tarball(version),
      function() renv_bootstrap_download_cran_latest(version),
      function() renv_bootstrap_download_cran_archive(version)
    )

  }

  for (method in methods) {
    path <- tryCatch(method(), error = identity)
    if (is.character(path) && file.exists(path))
      return(path)
  }

  stop("All download methods failed")

}

# download 'url' to 'destfile', working around a Windows file:// bug
# and forwarding any user-configured custom headers
renv_bootstrap_download_impl <- function(url, destfile) {

  mode <- "wb"

  # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17715
  fixup <-
    Sys.info()[["sysname"]] == "Windows" &&
    substring(url, 1L, 5L) == "file:"

  if (fixup)
    mode <- "w+b"

  args <- list(
    url = url,
    destfile = destfile,
    mode = mode,
    quiet = TRUE
  )

  # only newer versions of download.file() accept 'headers'
  if ("headers" %in% names(formals(utils::download.file)))
    args$headers <- renv_bootstrap_download_custom_headers(url)

  do.call(utils::download.file, args)

}

# evaluate the user's 'renv.download.headers' option (a function of the
# URL) and validate that it yields a named character vector
renv_bootstrap_download_custom_headers <- function(url) {

  headers <- getOption("renv.download.headers")
  if (is.null(headers))
    return(character())

  # NOTE(review): stopf() is not defined within this chunk; confirm it is
  # defined earlier in this file, else this error path would itself error
  if (!is.function(headers))
    stopf("'renv.download.headers' is not a function")

  headers <- headers(url)
  if (length(headers) == 0L)
    return(character())

  if (is.list(headers))
    headers <- unlist(headers, recursive = FALSE, use.names = TRUE)

  ok <-
    is.character(headers) &&
    is.character(names(headers)) &&
    all(nzchar(names(headers)))

  if (!ok)
    stop("invocation of 'renv.download.headers' did not return a named character vector")

  headers

}

# download the requested renv release from a CRAN-like repository;
# returns the tarball path on success, FALSE on failure
renv_bootstrap_download_cran_latest <- function(version) {

  spec <- renv_bootstrap_download_cran_latest_find(version)
  type <- spec$type
  repos <- spec$repos

  baseurl <- utils::contrib.url(repos = repos, type = type)
  ext <- if (identical(type, "source"))
    ".tar.gz"
  else if (Sys.info()[["sysname"]] == "Windows")
    ".zip"
  else
    ".tgz"
  name <- sprintf("renv_%s%s", version, ext)
  url <- paste(baseurl, name, sep = "/")

  destfile <- file.path(tempdir(), name)
  status <- tryCatch(
    renv_bootstrap_download_impl(url, destfile),
    condition = identity
  )

  if (inherits(status, "condition"))
    return(FALSE)

  # report success and return
  destfile

}

# locate a repository + package type providing the requested renv
# version, preferring binaries where the platform supports them
renv_bootstrap_download_cran_latest_find <- function(version) {

  # check whether binaries are supported on this system
  binary <-
    getOption("renv.bootstrap.binary", default = TRUE) &&
    !identical(.Platform$pkgType, "source") &&
    !identical(getOption("pkgType"), "source") &&
    Sys.info()[["sysname"]] %in% c("Darwin", "Windows")

  types <- c(if (binary) "binary", "source")

  # iterate over types + repositories
  for (type in types) {
    for (repos in renv_bootstrap_repos()) {

      # retrieve package database
      db <- tryCatch(
        as.data.frame(
          utils::available.packages(type = type, repos = repos),
          stringsAsFactors = FALSE
        ),
        error = identity
      )

      if (inherits(db, "error"))
        next

      # check for compatible entry
      entry <- db[db$Package %in% "renv" & db$Version %in% version, ]
      if (nrow(entry) == 0)
        next

      # found it; return spec to caller
      spec <- list(entry = entry, type = type, repos = repos)
      return(spec)

    }
  }

  # if we got here, we failed to find renv
  fmt <- "renv %s is not available from your declared package repositories"
  stop(sprintf(fmt, version))

}

# fall back to the CRAN source archive for older renv releases;
# returns the tarball path on success, FALSE on failure
renv_bootstrap_download_cran_archive <- function(version) {

  name <- sprintf("renv_%s.tar.gz", version)
  repos <- renv_bootstrap_repos()
  urls <- file.path(repos, "src/contrib/Archive/renv", name)
  destfile <- file.path(tempdir(), name)

  for (url in urls) {

    status <- tryCatch(
      renv_bootstrap_download_impl(url, destfile),
      condition = identity
    )

    if (identical(status, 0L))
      return(destfile)

  }

  return(FALSE)

}

# use a user-supplied local tarball (RENV_BOOTSTRAP_TARBALL) if one was
# provided; returns NULL (invisibly) when the variable is unset or stale
renv_bootstrap_download_tarball <- function(version) {

  # if the user has provided the path to a tarball via
  # an environment variable, then use it
  tarball <- Sys.getenv("RENV_BOOTSTRAP_TARBALL", unset = NA)
  if (is.na(tarball))
    return()

  # allow directories
  if (dir.exists(tarball)) {
    name <- sprintf("renv_%s.tar.gz", version)
    tarball <- file.path(tarball, name)
  }

  # bail if it doesn't exist
  if (!file.exists(tarball)) {

    # let the user know we weren't able to honour their request
    fmt <- "- RENV_BOOTSTRAP_TARBALL is set (%s) but does not exist."
    msg <- sprintf(fmt, tarball)
    warning(msg)

    # bail
    return()

  }

  catf("- Using local tarball '%s'.", tarball)
  tarball

}

# download a development tarball for 'version' (a sha) from the GitHub
# API, using curl/wget with a GITHUB_PAT auth header when available;
# returns the tarball path on success, FALSE on failure
renv_bootstrap_download_github <- function(version) {

  enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE")
  if (!identical(enabled, "TRUE"))
    return(FALSE)

  # prepare download options -- saved options are restored via on.exit
  pat <- Sys.getenv("GITHUB_PAT")
  if (nzchar(Sys.which("curl")) && nzchar(pat)) {
    fmt <- "--location --fail --header \"Authorization: token %s\""
    extra <- sprintf(fmt, pat)
    saved <- options("download.file.method", "download.file.extra")
    options(download.file.method = "curl", download.file.extra = extra)
    on.exit(do.call(base::options, saved), add = TRUE)
  } else if (nzchar(Sys.which("wget")) && nzchar(pat)) {
    fmt <- "--header=\"Authorization: token %s\""
    extra <- sprintf(fmt, pat)
    saved <- options("download.file.method", "download.file.extra")
    options(download.file.method = "wget", download.file.extra = extra)
    on.exit(do.call(base::options, saved), add = TRUE)
  }

  url <- file.path("https://api.github.com/repos/rstudio/renv/tarball", version)
  name <- sprintf("renv_%s.tar.gz", version)
  destfile <- file.path(tempdir(), name)

  status <- tryCatch(
    renv_bootstrap_download_impl(url, destfile),
    condition = identity
  )

  if (!identical(status, 0L))
    return(FALSE)

  renv_bootstrap_download_augment(destfile)

  return(destfile)

}

# Add Sha to DESCRIPTION. This is stop gap until #890, after which we
# can use renv::install() to fully capture metadata.
renv_bootstrap_download_augment <- function(destfile) {
  sha <- renv_bootstrap_git_extract_sha1_tar(destfile)
  if (is.null(sha)) {
    return()
  }

  # Untar into a scratch directory, cleaned up on exit
  tempdir <- tempfile("renv-github-")
  on.exit(unlink(tempdir, recursive = TRUE), add = TRUE)
  untar(destfile, exdir = tempdir)
  pkgdir <- dir(tempdir, full.names = TRUE)[[1]]

  # Modify description: append Remote* metadata fields
  desc_path <- file.path(pkgdir, "DESCRIPTION")
  desc_lines <- readLines(desc_path)
  remotes_fields <- c(
    "RemoteType: github",
    "RemoteHost: api.github.com",
    "RemoteRepo: renv",
    "RemoteUsername: rstudio",
    "RemotePkgRef: rstudio/renv",
    paste("RemoteRef: ", sha),
    paste("RemoteSha: ", sha)
  )
  writeLines(c(desc_lines[desc_lines != ""], remotes_fields), con = desc_path)

  # Re-tar (working directory restored via on.exit inside local())
  local({
    old <- setwd(tempdir)
    on.exit(setwd(old), add = TRUE)

    tar(destfile, compression = "gzip")
  })
  invisible()
}

# Extract the commit hash from a git archive. Git archives include the SHA1
# hash as the comment field of the tarball pax extended header
# (see https://www.kernel.org/pub/software/scm/git/docs/git-archive.html)
# For GitHub archives this should be the first header after the default one
# (512 byte) header.
renv_bootstrap_git_extract_sha1_tar <- function(bundle) {

  # open the bundle for reading
  # We use gzcon for everything because (from ?gzcon)
  # > Reading from a connection which does not supply a 'gzip' magic
  # > header is equivalent to reading from the original connection
  conn <- gzcon(file(bundle, open = "rb", raw = TRUE))
  on.exit(close(conn))

  # The default pax header is 512 bytes long and the first pax extended header
  # with the comment should be 51 bytes long
  # `52 comment=` (11 chars) + 40 byte SHA1 hash
  len <- 0x200 + 0x33
  res <- rawToChar(readBin(conn, "raw", n = len)[0x201:len])

  if (grepl("^52 comment=", res)) {
    sub("52 comment=", "", res)
  } else {
    NULL
  }
}

# install the downloaded tarball into 'library'; on failure, surface the
# captured R CMD INSTALL output in the error message
renv_bootstrap_install <- function(version, tarball, library) {

  # attempt to install it into project library
  dir.create(library, showWarnings = FALSE, recursive = TRUE)
  output <- renv_bootstrap_install_impl(library, tarball)

  # check for successful install
  status <- attr(output, "status")
  if (is.null(status) || identical(status, 0L))
    return(status)

  # an error occurred; report it
  header <- "installation of renv failed"
  lines <- paste(rep.int("=", nchar(header)), collapse = "")
  text <- paste(c(header, lines, output), collapse = "\n")
  stop(text)

}

# run `R CMD INSTALL` on 'tarball' targeting 'library'
renv_bootstrap_install_impl <- function(library, tarball) {

  # invoke using system2 so we can capture and report output
  bin <- R.home("bin")
  exe <- if (Sys.info()[["sysname"]] == "Windows") "R.exe" else "R"
  R <- file.path(bin, exe)

  args <- c(
    "--vanilla", "CMD", "INSTALL", "--no-multiarch",
    "-l", shQuote(path.expand(library)),
    shQuote(path.expand(tarball))
  )

  system2(R, args, stdout = TRUE, stderr = TRUE)

}

# Compute the platform-specific library prefix used to segregate renv
# project libraries, e.g. "R-4.3/x86_64-pc-linux-gnu". Any user-supplied
# prefix (see renv_bootstrap_platform_prefix_impl) is prepended.
renv_bootstrap_platform_prefix <- function() {

  # "R-<major>.<minor>" version component
  full <- paste(R.version$major, R.version$minor, sep = ".")
  version_part <- paste("R", numeric_version(full)[1, 1:2], sep = "-")

  # development builds of R get the SVN revision appended, so that
  # platform-specific artefacts aren't shared with released versions of R
  is_devel <-
    identical(R.version[["status"]], "Under development (unstable)") ||
    identical(R.version[["nickname"]], "Unsuffered Consequences")

  if (is_devel)
    version_part <- paste(version_part, R.version[["svn rev"]], sep = "-r")

  parts <- c(version_part, R.version$platform)

  # honour a user-provided prefix, when one is configured
  user <- renv_bootstrap_platform_prefix_impl()
  if (!is.na(user) && nzchar(user))
    parts <- c(user, parts)

  paste(parts, collapse = "/")

}

# Resolve the user-configured path prefix: an explicit RENV_PATHS_PREFIX
# wins; otherwise RENV_PATHS_PREFIX_AUTO requests an OS-derived prefix.
# Returns "" when neither is configured.
renv_bootstrap_platform_prefix_impl <- function() {

  explicit <- Sys.getenv("RENV_PATHS_PREFIX", unset = NA)
  if (!is.na(explicit))
    return(explicit)

  wants_auto <- Sys.getenv("RENV_PATHS_PREFIX_AUTO", unset = NA)
  if (wants_auto %in% c("TRUE", "True", "true", "1"))
    return(renv_bootstrap_platform_prefix_auto())

  ""

}

# Derive an automatic prefix from the operating system; warns (but still
# returns the inferred value) when the OS cannot be determined.
renv_bootstrap_platform_prefix_auto <- function() {

  os <- tryCatch(renv_bootstrap_platform_os(), error = identity)

  if (inherits(os, "error") || os %in% "unknown")
    warning(paste(
      "failed to infer current operating system",
      "please file a bug report at https://github.com/rstudio/renv/issues",
      sep = "; "
    ))

  os

}

# infer a normalized operating-system identifier, e.g. "windows",
# "macos", or "linux-ubuntu-jammy" on Linux distributions
renv_bootstrap_platform_os <- function() {

  sysinfo <- Sys.info()
  sysname <- sysinfo[["sysname"]]

  # handle Windows + macOS up front
  if (sysname == "Windows")
    return("windows")
  else if (sysname == "Darwin")
    return("macos")

  # check for os-release files
  for (file in c("/etc/os-release", "/usr/lib/os-release"))
    if (file.exists(file))
      return(renv_bootstrap_platform_os_via_os_release(file, sysinfo))

  # check for redhat-release files
  if (file.exists("/etc/redhat-release"))
    return(renv_bootstrap_platform_os_via_redhat_release())

  "unknown"

}

# build "<os>-<id>-<version>" from the key/value pairs in an os-release file
renv_bootstrap_platform_os_via_os_release <- function(file, sysinfo) {

  # read /etc/os-release
  release <- utils::read.table(
    file = file,
    sep = "=",
    quote = c("\"", "'"),
    col.names = c("Key", "Value"),
    comment.char = "#",
    stringsAsFactors = FALSE
  )

  vars <- as.list(release$Value)
  names(vars) <- release$Key

  # get os name
  os <- tolower(sysinfo[["sysname"]])

  # read id
  id <- "unknown"
  for (field in c("ID", "ID_LIKE")) {
    if (field %in% names(vars) && nzchar(vars[[field]])) {
      id <- vars[[field]]
      break
    }
  }

  # read version
  version <- "unknown"
  for (field in c("UBUNTU_CODENAME", "VERSION_CODENAME", "VERSION_ID", "BUILD_ID")) {
    if (field %in% names(vars) && nzchar(vars[[field]])) {
      version <- vars[[field]]
      break
    }
  }

  # join together
  paste(c(os, id, version), collapse = "-")

}

# infer "linux-<id>-<version>" from /etc/redhat-release (RHEL/CentOS)
renv_bootstrap_platform_os_via_redhat_release <- function() {

  # read /etc/redhat-release
  contents <- readLines("/etc/redhat-release", warn = FALSE)

  # infer id
  id <- if (grepl("centos", contents, ignore.case = TRUE))
    "centos"
  else if (grepl("redhat", contents, ignore.case = TRUE))
    "redhat"
  else
    "unknown"

  # try to find a version component (very hacky)
  version <- "unknown"

  parts <- strsplit(contents, "[[:space:]]")[[1L]]
  for (part in parts) {

    nv <- tryCatch(numeric_version(part), error = identity)
    if (inherits(nv, "error"))
      next

    version <- nv[1, 1]
    break

  }

  paste(c("linux", id, version), collapse = "-")

}

# compute the directory name used for this project's library root
renv_bootstrap_library_root_name <- function(project) {

  # use project name as-is if requested
  # NOTE(review): 'asis' is a character string here; if() relies on R's
  # coercion of "TRUE"/"FALSE" strings to logical -- works, but fragile
  asis <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT_ASIS", unset = "FALSE")
  if (asis)
    return(basename(project))

  # otherwise, disambiguate based on project's path
  id <- substring(renv_bootstrap_hash_text(project), 1L, 8L)
  paste(basename(project), id, sep = "-")

}

# resolve the on-disk location of this project's renv library
renv_bootstrap_library_root <- function(project) {

  prefix <- renv_bootstrap_profile_prefix()

  path <- Sys.getenv("RENV_PATHS_LIBRARY", unset = NA)
  if (!is.na(path))
    return(paste(c(path, prefix), collapse = "/"))

  path <- renv_bootstrap_library_root_impl(project)
  if (!is.null(path)) {
    name <- renv_bootstrap_library_root_name(project)
    return(paste(c(path, prefix, name), collapse = "/"))
  }

  renv_bootstrap_paths_renv("library", project = project)

}

# resolve the library root directory, if one is configured; returns NULL
# (implicitly) when neither an env var nor a package-type project applies
renv_bootstrap_library_root_impl <- function(project) {

  root <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT", unset = NA)
  if (!is.na(root))
    return(root)

  type <- renv_bootstrap_project_type(project)
  if (identical(type, "package")) {
    userdir <- renv_bootstrap_user_dir()
    return(file.path(userdir, "library"))
  }

}

# check that the loaded renv matches the version this project requests;
# prints upgrade/downgrade instructions and returns FALSE on mismatch
renv_bootstrap_validate_version <- function(version, description = NULL) {

  # resolve description file
  description <- description %||% {
    path <- getNamespaceInfo("renv", "path")
    packageDescription("renv", lib.loc = dirname(path))
  }

  # check whether requested version 'version' matches loaded version of renv
  sha <- attr(version, "sha", exact = TRUE)
  valid <- if (!is.null(sha))
    renv_bootstrap_validate_version_dev(sha, description)
  else
    renv_bootstrap_validate_version_release(version, description)

  if (valid)
    return(TRUE)

  # the loaded version of renv doesn't match the requested version;
  # give the user instructions on how to proceed
  remote <- if (!is.null(description[["RemoteSha"]])) {
    paste("rstudio/renv", description[["RemoteSha"]], sep = "@")
  } else {
    paste("renv", description[["Version"]], sep = "@")
  }

  # display both loaded version + sha if available
  friendly <- renv_bootstrap_version_friendly(
    version = description[["Version"]],
    sha = description[["RemoteSha"]]
  )

  fmt <- paste(
    "renv %1$s was loaded from project library, but this project is configured to use renv %2$s.",
    "- Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile.",
    "- Use `renv::restore(packages = \"renv\")` to install renv %2$s into the project library.",
    sep = "\n"
  )
  catf(fmt, friendly, renv_bootstrap_version_friendly(version), remote)

  FALSE

}

# dev versions match when the installed RemoteSha starts with the requested sha
renv_bootstrap_validate_version_dev <- function(version, description) {
  expected <- description[["RemoteSha"]]
  is.character(expected) && startswith(expected, version)
}

# release versions must match the installed Version exactly
renv_bootstrap_validate_version_release <- function(version, description) {
  expected <- description[["Version"]]
  is.character(expected) && identical(expected, version)
}

# md5-hash arbitrary text by round-tripping it through a temp file
renv_bootstrap_hash_text <- function(text) {

  hashfile <- tempfile("renv-hash-")
  on.exit(unlink(hashfile), add = TRUE)

  writeLines(text, con = hashfile)
  tools::md5sum(hashfile)

}

# load renv from 'libpath' and activate the project; FALSE if unavailable
renv_bootstrap_load <- function(project, libpath, version) {

  # try to load renv from the project library
  if (!requireNamespace("renv", lib.loc = libpath, quietly = TRUE))
    return(FALSE)

  # warn if the version of renv loaded does not match
  renv_bootstrap_validate_version(version)

  # execute renv load hooks, if any
  hooks <- getHook("renv::autoload")
  for (hook in hooks)
    if (is.function(hook))
      tryCatch(hook(), error = warning)

  # load the project
  renv::load(project)

  TRUE

}

# resolve the active renv profile, reading renv/profile when RENV_PROFILE
# is unset; sets RENV_PROFILE as a side effect when a profile is found
renv_bootstrap_profile_load <- function(project) {

  # if RENV_PROFILE is already set, just use that
  profile <- Sys.getenv("RENV_PROFILE", unset = NA)
  if (!is.na(profile) && nzchar(profile))
    return(profile)

  # check for a profile file (nothing to do if it doesn't exist)
  path <- renv_bootstrap_paths_renv("profile", profile = FALSE, project = project)
  if (!file.exists(path))
    return(NULL)

  # read the profile, and set it if it exists
  contents <- readLines(path, warn = FALSE)
  if (length(contents) == 0L)
    return(NULL)

  # set RENV_PROFILE
  profile <- contents[[1L]]
  if (!profile %in% c("", "default"))
    Sys.setenv(RENV_PROFILE = profile)

  profile

}

# path fragment ("profiles/<name>/renv") for the active profile, or NULL
renv_bootstrap_profile_prefix <- function() {
  profile <- renv_bootstrap_profile_get()
  if (!is.null(profile))
    return(file.path("profiles", profile, "renv"))
}

# the active (normalized) profile name, or NULL for the default profile
renv_bootstrap_profile_get <- function() {
  profile <- Sys.getenv("RENV_PROFILE", unset = "")
  renv_bootstrap_profile_normalize(profile)
}

# set (or clear) RENV_PROFILE for the given profile name
renv_bootstrap_profile_set <- function(profile) {
  profile <- renv_bootstrap_profile_normalize(profile)
  if (is.null(profile))
    Sys.unsetenv("RENV_PROFILE")
  else
    Sys.setenv(RENV_PROFILE = profile)
}

# map "", "default", and NULL profile names to NULL; pass others through
renv_bootstrap_profile_normalize <- function(profile) {

  if (is.null(profile) || profile %in% c("", "default"))
    return(NULL)

  profile

}

# is 'path' absolute? (handles '~', POSIX '/', UNC '\\', and drive letters)
renv_bootstrap_path_absolute <- function(path) {

  substr(path, 1L, 1L) %in% c("~", "/", "\\") || (
    substr(path, 1L, 1L) %in% c(letters, LETTERS) &&
    substr(path, 2L, 3L) %in% c(":/", ":\\")
  )

}

# build a path under the project's renv directory, honouring
# RENV_PATHS_RENV and (optionally) the active profile prefix
renv_bootstrap_paths_renv <- function(..., profile = TRUE, project = NULL) {
  renv <- Sys.getenv("RENV_PATHS_RENV", unset = "renv")
  root <- if (renv_bootstrap_path_absolute(renv)) NULL else project
  prefix <- if (profile) renv_bootstrap_profile_prefix()
  components <- c(root, renv, prefix, ...)
  paste(components, collapse = "/")
}

# classify the project by reading its DESCRIPTION: the Type field if
# present, "package" when only Package is set, else "unknown"
renv_bootstrap_project_type <- function(path) {

  descpath <- file.path(path, "DESCRIPTION")
  if (!file.exists(descpath))
    return("unknown")

  desc <- tryCatch(
    read.dcf(descpath, all = TRUE),
    error = identity
  )

  if (inherits(desc, "error"))
    return("unknown")

  type <- desc$Type
  if (!is.null(type))
    return(tolower(type))

  package <- desc$Package
  if (!is.null(package))
    return("package")

  "unknown"

}

# renv's per-user cache directory, with forward slashes and '~' expanded
renv_bootstrap_user_dir <- function() {
  dir <- renv_bootstrap_user_dir_impl()
  path.expand(chartr("\\", "/", dir))
}

# resolve the user cache directory: option override, then R_user_dir(),
# then XDG-style env vars, then platform-specific defaults
renv_bootstrap_user_dir_impl <- function() {

  # use local override if set
  override <- getOption("renv.userdir.override")
  if (!is.null(override))
    return(override)

  # use R_user_dir if available
  tools <- asNamespace("tools")
  if (is.function(tools$R_user_dir))
    return(tools$R_user_dir("renv", "cache"))

  # try using our own backfill for older versions of R
  envvars <- c("R_USER_CACHE_DIR", "XDG_CACHE_HOME")
  for (envvar in envvars) {
    root <- Sys.getenv(envvar, unset = NA)
    if (!is.na(root))
      return(file.path(root, "R/renv"))
  }

  # use platform-specific default fallbacks
  if (Sys.info()[["sysname"]] == "Windows")
    file.path(Sys.getenv("LOCALAPPDATA"), "R/cache/R/renv")
  else if (Sys.info()[["sysname"]] == "Darwin")
    "~/Library/Caches/org.R-project.R/R/renv"
  else
    "~/.cache/R/renv"

}

# human-readable version string, with an abbreviated sha when available
renv_bootstrap_version_friendly <- function(version, sha = NULL) {
  sha <- sha %||% attr(version, "sha", exact = TRUE)
  parts <- c(version, sprintf("[sha: %s]", substring(sha, 1L, 7L)))
  paste(parts, collapse = " ")
}

# bootstrap renv into 'libpath', then attempt to load the project;
# warns (rather than errors) when renv still cannot be loaded
renv_bootstrap_run <- function(version, libpath) {

  # perform bootstrap
  bootstrap(version, libpath)

  # exit early if we're just testing bootstrap
  if (!is.na(Sys.getenv("RENV_BOOTSTRAP_INSTALL_ONLY", unset = NA)))
    return(TRUE)

  # try again to load
  if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) {
    return(renv::load(project = getwd()))
  }

  # failed to download or load renv; warn the user
  msg <- c(
    "Failed to find an renv installation: the project will not be loaded.",
    "Use `renv::activate()` to re-initialize the project."
  )

  warning(paste(msg, collapse = "\n"), call. = FALSE)

}


# TRUE when this R session was launched by the RStudio IDE
renv_bootstrap_in_rstudio <- function() {
  commandArgs()[[1]] == "RStudio"
}

# read a JSON file, preferring jsonlite when it is already loaded and
# falling back to the bundled minimal reader otherwise
# NOTE(review): catch() is not defined within this chunk; presumably it
# is defined earlier in this file -- confirm
renv_json_read <- function(file = NULL, text = NULL) {

  jlerr <- NULL

  # if jsonlite is loaded, use that instead
  if ("jsonlite" %in% loadedNamespaces()) {

    json <- catch(renv_json_read_jsonlite(file, text))
    if (!inherits(json, "error"))
      return(json)

    jlerr <- json

  }

  # otherwise, fall back to the default JSON reader
  json <- catch(renv_json_read_default(file, text))
  if (!inherits(json, "error"))
    return(json)

  # report an error
  if (!is.null(jlerr))
    stop(jlerr)
  else
    stop(json)

}

# parse JSON with jsonlite, keeping list structure (no simplification)
# NOTE(review): read() here is not base::readLines and is not defined in
# this chunk; presumably an earlier helper in this file -- confirm
renv_json_read_jsonlite <- function(file = NULL, text = NULL) {
  text <- paste(text %||% read(file), collapse = "\n")
  jsonlite::fromJSON(txt = text, simplifyVector = FALSE)
}

# minimal dependency-free JSON reader: rewrites the JSON text into an R
# expression (via string placeholders) and evaluates it in baseenv()
renv_json_read_default <- function(file = NULL, text = NULL) {

  # find strings in the JSON
  text <- paste(text %||% read(file), collapse = "\n")
  pattern <- '["](?:(?:\\\\.)|(?:[^"\\\\]))*?["]'
  locs <- gregexpr(pattern, text, perl = TRUE)[[1]]

  # if any are found, replace them with placeholders
  replaced <- text
  strings <- character()
  replacements <- character()

  if (!identical(c(locs), -1L)) {

    # get the string values
    starts <- locs
    ends <- locs + attr(locs, "match.length") - 1L
    strings <- substring(text, starts, ends)

    # only keep those requiring escaping
    strings <- grep("[[\\]{}:]", strings, perl = TRUE, value = TRUE)

    # compute replacements (\032 is an unlikely sentinel byte)
    replacements <- sprintf('"\032%i\032"', seq_along(strings))

    # replace the strings
    mapply(function(string, replacement) {
      replaced <<- sub(string, replacement, replaced, fixed = TRUE)
    }, strings, replacements)

  }

  # transform the JSON into something the R parser understands
  transformed <- replaced
  transformed <- gsub("{}", "`names<-`(list(), character())", transformed, fixed = TRUE)
  transformed <- gsub("[[{]", "list(", transformed, perl = TRUE)
  transformed <- gsub("[]}]", ")", transformed, perl = TRUE)
  transformed <- gsub(":", "=", transformed, fixed = TRUE)
  text <- paste(transformed, collapse = "\n")

  # parse it
  json <- parse(text = text, keep.source = FALSE, srcfile = NULL)[[1L]]

  # construct map between source strings, replaced strings
  map <- as.character(parse(text = strings))
  names(map) <- as.character(parse(text = replacements))

  # convert to list
  map <- as.list(map)

  # remap strings in object
  remapped <- renv_json_remap(json, map)

  # evaluate
  eval(remapped, envir = baseenv())

}

# recursively substitute placeholder strings (and true/false/null
# symbols) back into the parsed JSON expression
renv_json_remap <- function(json, map) {

  # fix names
  if (!is.null(names(json))) {
    lhs <- match(names(json), names(map), nomatch = 0L)
    rhs <- match(names(map), names(json), nomatch = 0L)
    names(json)[rhs] <- map[lhs]
  }

  # fix values
  if (is.character(json))
    return(map[[json]] %||% json)

  # handle true, false, null
  if (is.name(json)) {
    text <- as.character(json)
    if (text == "true")
      return(TRUE)
    else if (text == "false")
      return(FALSE)
    else if (text == "null")
      return(NULL)
  }

  # recurse
  if (is.recursive(json)) {
    for (i in seq_along(json)) {
      json[i] <- list(renv_json_remap(json[[i]], map))
    }
  }

  json

}

# load the renv profile, if any
renv_bootstrap_profile_load(project)

# construct path to library root
root <- renv_bootstrap_library_root(project)

# construct library prefix for platform (e.g. "R-4.3/<platform>")
prefix <- renv_bootstrap_platform_prefix()

# construct full libpath
libpath <- file.path(root, prefix)

# attempt to load renv from the project library; done if that works
if (renv_bootstrap_load(project, libpath, version))
  return(TRUE)

# otherwise, bootstrap renv -- deferred to session init under RStudio
if (renv_bootstrap_in_rstudio()) {
  setHook("rstudio.sessionInit", function(...) {
    renv_bootstrap_run(version, libpath)

    # Work around buglet in RStudio if hook uses readline
    tryCatch(
      {
        tools <- as.environment("tools:rstudio")
        tools$.rs.api.sendToConsole("", echo = FALSE, focus = FALSE)
      },
      error = function(cnd) {}
    )
  })
} else {
  renv_bootstrap_run(version, libpath)
}

invisible()

})
--------------------------------------------------------------------------------
/renv/settings.json:
--------------------------------------------------------------------------------
{
  "bioconductor.version": null,
  "external.libraries": [],
  "ignored.packages": [],
  "package.dependency.fields": [
    "Imports",
    "Depends",
    "LinkingTo"
  ],
  "ppm.enabled": null,
  "ppm.ignored.urls": [],
  "r.version": null,
  "snapshot.type": "implicit",
  "use.cache": true,
  "vcs.ignore.cellar": true,
  "vcs.ignore.library": true,
  "vcs.ignore.local": true,
  "vcs.manage.ignores": true
}
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
# This file is part of the standard setup for testthat.
# It is recommended that you do not modify it.
#
# Where should you do additional test configuration?
# Learn more about the roles of various files in:
# * https://r-pkgs.org/tests.html
# * https://testthat.r-lib.org/reference/test_package.html#special-files

library(testthat)
library(tidymodels.tutorials)

# run all tests under tests/testthat/ against the installed package
test_check("tidymodels.tutorials")
--------------------------------------------------------------------------------
/tests/testthat/test-tutorials.R:
--------------------------------------------------------------------------------
# For now, we will do all our tutorial testing in this one script. We need a
# listing of all the tutorials. I still worry that I don't really know which
# paths this is getting. I *think* it is not doing what we want, which is to get
# all the paths from this version of the package. Instead, it is getting the
# paths from the most recently installed version of the package. So, you really
# need to install before testing. If true, that is a hack!
#
# NOTE(review): return_tutorial_paths() resolves paths via the *installed*
# package, so run R CMD INSTALL (or devtools::install()) before testing.

tut_paths <- tutorial.helpers::return_tutorial_paths("tidymodels.tutorials")

# knit_tutorials() returns NULL invisibly on success, erroring otherwise
test_that("All tutorials can be knit without error", {
  expect_null(
    tutorial.helpers::knit_tutorials(tut_paths)
  )
})


# check_tutorial_defaults() verifies required boilerplate sections exist
test_that("All tutorials have the expected components", {
  expect_null(
    tutorial.helpers::check_tutorial_defaults(tut_paths)
  )
})