├── .Rbuildignore ├── .Renviron ├── .Rprofile ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ └── pkgdown.yaml ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── R └── tidymodels.tutorials-package.R ├── README.Rmd ├── README.md ├── TODO.txt ├── _pkgdown.yml ├── inst └── tutorials │ ├── 02-a-tidyverse-primer │ └── tutorial.Rmd │ ├── 03-a-review-of-r-modeling-fundamentals │ └── tutorial.Rmd │ ├── 04-the-ames-housing-data │ └── tutorial.Rmd │ ├── 05-spending-our-data │ └── tutorial.Rmd │ ├── 06-fitting-models-with-parsnip │ ├── images │ │ ├── DiagramOne.png │ │ ├── TableFour.png │ │ ├── TableOne.png │ │ ├── TableThree.png │ │ └── TableTwo.png │ └── tutorial.Rmd │ ├── 07-a-model-workflow │ ├── images │ │ └── img.png │ └── tutorial.Rmd │ ├── 08-feature-engineering-with-recipes │ └── tutorial.Rmd │ ├── 09-judging-model-effectiveness │ └── tutorial.Rmd │ ├── 10-resampling │ ├── images │ │ ├── fig-ten-point-eight.png │ │ ├── fig-ten-point-five.png │ │ ├── fig-ten-point-one.png │ │ ├── fig-ten-point-seven.png │ │ ├── fig-ten-point-six.png │ │ ├── fig-ten-point-three.png │ │ └── fig-ten-point-two.png │ └── tutorial.Rmd │ ├── 11-comparing-models │ ├── data │ │ └── linear-statistical-model.png │ └── tutorial.Rmd │ ├── 12-model-tuning-and-the-dangers-of-overfitting │ ├── images │ │ ├── pic1.png │ │ ├── pic2.png │ │ ├── pic3.png │ │ ├── pic4.png │ │ └── pic5.png │ └── tutorial.Rmd │ ├── 13-grid-search │ └── tutorial.Rmd │ ├── 14-iterative-search │ ├── images │ │ ├── pic1.png │ │ ├── pic10.png │ │ ├── pic2.png │ │ ├── pic3.png │ │ ├── pic4.png │ │ ├── pic5.png │ │ ├── pic6.png │ │ ├── pic7.png │ │ ├── pic8.png │ │ └── pic9.png │ └── tutorial.Rmd │ ├── 15-screening-many-models │ └── tutorial.Rmd │ ├── 16-dimensionality-reduction │ ├── images │ │ ├── pic1.png │ │ ├── pic2.png │ │ ├── pic3.png │ │ ├── pic4.png │ │ └── pic5.png │ └── tutorial.Rmd │ └── 18-explaining-models-and-predictions │ ├── images │ └── pic1.png │ └── tutorial.Rmd ├── 
man ├── figures │ └── README-pressure-1.png └── tidymodels.tutorials-package.Rd ├── renv.lock ├── renv ├── .gitignore ├── activate.R └── settings.json └── tests ├── testthat.R └── testthat └── test-tutorials.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^renv$ 2 | ^renv\.lock$ 3 | ^tidymodels\.tutorials\.Rproj$ 4 | ^\.Rproj\.user$ 5 | ^LICENSE\.md$ 6 | ^TODO.txt$ 7 | ^\.github$ 8 | ^README\.Rmd$ 9 | tutorials/[^/]*/(?!(data|images|.*Rmd)) 10 | ^_pkgdown\.yml$ 11 | ^docs$ 12 | ^pkgdown$ 13 | -------------------------------------------------------------------------------- /.Renviron: -------------------------------------------------------------------------------- 1 | RENV_CONFIG_SANDBOX_ENABLED = FALSE 2 | -------------------------------------------------------------------------------- /.Rprofile: -------------------------------------------------------------------------------- 1 | source("renv/activate.R") 2 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ubuntu-latest 14 | env: 15 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 16 | R_KEEP_PKG_SOURCE: yes 17 | steps: 18 | - uses: actions/checkout@v3 19 | 20 | - uses: r-lib/actions/setup-r@v2 21 | with: 22 | use-public-rspm: true 23 | 24 | - uses: r-lib/actions/setup-r-dependencies@v2 25 | with: 26 | extra-packages: any::rcmdcheck 27 | needs: check 28 | 29 | - uses: r-lib/actions/check-r-package@v2 30 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | permissions: 23 | contents: write 24 | steps: 25 | - uses: actions/checkout@v3 26 | 27 | - uses: r-lib/actions/setup-pandoc@v2 28 | 29 | - uses: r-lib/actions/setup-r@v2 30 | with: 31 | use-public-rspm: true 32 | 33 | - uses: r-lib/actions/setup-r-dependencies@v2 34 | with: 35 | extra-packages: any::pkgdown, local::. 
36 | needs: website 37 | 38 | - name: Build site 39 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 40 | shell: Rscript {0} 41 | 42 | - name: Deploy to GitHub pages 🚀 43 | if: github.event_name != 'pull_request' 44 | uses: JamesIves/github-pages-deploy-action@v4.4.1 45 | with: 46 | clean: false 47 | branch: gh-pages 48 | folder: docs 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | 6 | *Rproj 7 | 8 | inst/tutorials/[^/]*/(?!(data|images|.*Rmd)) 9 | 10 | inst/tutorials/*/*html 11 | inst/tutorials/*/*cache* 12 | 13 | inst/tutorials/*/*_files/ 14 | docs 15 | 16 | # The simple name of a file ignores that file wherever it might appear. 17 | 18 | .DS_Store 19 | 20 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: tidymodels.tutorials 2 | Title: Tutorials for "Tidy Modeling with R" 3 | Version: 0.0.0.9002 4 | Authors@R: 5 | person(given = "David", 6 | family = "Kane", 7 | role = c("aut", "cre", "cph"), 8 | email = "dave.kane@gmail.com", 9 | comment = c(ORCID = "0000-0002-6660-3934")) 10 | Description: When assigned "Tidy Modeling with R: A Framework for Modeling in 11 | the Tidyverse" (Kuhn and Silge (2023, ISBN: 1492096482)), students should 12 | read the book and type in all the associated R commands themselves. Sadly, 13 | that never happens. These tutorials allow students to demonstrate (and their 14 | instructors to be sure) that all work has been completed. See Kane (2023) 15 | from 16 | the 'tutorial.helpers' package for a background discussion. 
17 | License: MIT + file LICENSE 18 | Encoding: UTF-8 19 | Roxygen: list(markdown = TRUE) 20 | RoxygenNote: 7.2.3 21 | Suggests: 22 | baguette, 23 | beans, 24 | bestNormalize, 25 | censored, 26 | corrplot, 27 | corrr, 28 | DALEXtra, 29 | discrim, 30 | doParallel, 31 | embed, 32 | fastICA, 33 | finetune, 34 | ggrepel, 35 | ggforce, 36 | kernlab, 37 | klaR, 38 | lme4, 39 | learnr, 40 | mda, 41 | mixOmics, 42 | multilevelmod, 43 | nlme, 44 | ranger, 45 | roxygen2, 46 | rsconnect, 47 | rstanarm, 48 | rules, 49 | stringr, 50 | testthat (>= 3.0.0), 51 | tidymodels, 52 | tidyposterior, 53 | tidyverse, 54 | tutorial.helpers, 55 | usemodels, 56 | xgboost 57 | Config/testthat/edition: 3 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2023 2 | COPYRIGHT HOLDER: tidymodels.tutorials authors 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2023 tidymodels.tutorials authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | -------------------------------------------------------------------------------- /R/tidymodels.tutorials-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | ## usethis namespace: start 5 | ## usethis namespace: end 6 | NULL 7 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | ``` 15 | 16 | # Tutorials for *Tidy Modeling with R* 17 | 18 | 19 | [![R-CMD-check](https://github.com/PPBDS/tidymodels.tutorials/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/PPBDS/tidymodels.tutorials/actions/workflows/R-CMD-check.yaml) 20 | 21 | 22 | 23 | ## About this package 24 | 25 | **tidymodel.tutorials** provides tutorials for [*Tidy Modeling with R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. 
These tutorials assume that you have some experience working with the tools provided by the **[tutorial.helpers](https://ppbds.github.io/tutorial.helpers/)** package. As long as you have completed the "Getting Started" tutorial from that package, you should be fine. 26 | 27 | The main audience for these tutorials is instructors teaching data science and their students. Instructors want students to, for example, read Chapter 8 of [*Tidy Modeling with R*](https://www.tmwr.org/) (or something similar), typing in the code at the R Console along the way. Sadly, students almost never do that. Indeed, many (most?) of them won't even read the assigned chapter. 28 | 29 | The promise we make to instructors is that, if they assign our tutorial for Chapter 8, then students will type in at least 90% of the code examples from the chapter, and then run the code to see what happens. We also pull out some of the most important prose from the chapter and do everything we can to cajole/trick students into reading it. These [two](https://ppbds.github.io/tutorial.helpers/articles/instructions.html) [essays](https://ppbds.github.io/tutorial.helpers/articles/books.html) provide background information about our approach. 30 | 31 | Our causal claim is that, if an instructor were to randomly assign half the class to do these tutorials and half to simply complete the reading, the half completing the tutorials would perform much better for the rest of the course. 32 | 33 | ## Installation 34 | 35 | You can install the development version from [GitHub](https://github.com/) with: 36 | 37 | ``` r 38 | remotes::install_github("PPBDS/tidymodels.tutorials") 39 | ``` 40 | 41 | If R offers you the option to update some packages, you should do so. For packages that need compilation, feel free to answer "no". 42 | 43 | Then **restart your R session** or **restart RStudio**. 44 | 45 | ## Accessing tutorials 46 | 47 | In order to access the tutorials, start by loading the package. 
48 | 49 | ``` r 50 | library(tidymodels.tutorials) 51 | ``` 52 | 53 | You can access the tutorials via the Tutorial tab in the top right (Environment) pane in RStudio. 54 | 55 | If either of the following is happening to you 56 | 57 | 61 | 62 | Then **remember to restart your R session** after installing the package. 63 | 64 | Because tutorials within the Tutorial pane are sorted in alphabetical order by the name of the package, the **tidymodels.tutorials** will be toward the bottom. If you don’t see any tutorials, try clicking the “Home” button – the little house symbol with the thin red roof in the upper right. 65 | 66 | In order to expand the window, you can drag and enlarge the tutorial pane inside RStudio. In order to open a pop-up window, click the "Show in New Window" icon next to the home icon. 67 | 68 | You may notice that the Jobs tab in the lower left will create output as the tutorial is starting up. This is because RStudio is running the code to create the tutorial. If you accidentally clicked "Start Tutorial" and would like to stop the job from running, you can click the back arrow in the Jobs tab, and then press the red stop sign icon. Your work will be saved between RStudio sessions, meaning that you can complete a tutorial in multiple sittings. Once you have completed a tutorial, follow the instructions on the tutorial `Submit` page and, if you're a student, submit your answers as instructed. 
69 | 70 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Tutorials for *Tidy Modeling with R* 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/PPBDS/tidymodels.tutorials/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/PPBDS/tidymodels.tutorials/actions/workflows/R-CMD-check.yaml) 9 | 10 | 11 | ## About this package 12 | 13 | **tidymodel.tutorials** provides tutorials for [*Tidy Modeling with 14 | R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. These tutorials 15 | assume that you have some experience working with the tools provided by 16 | the **[tutorial.helpers](https://ppbds.github.io/tutorial.helpers/)** 17 | package. As long as you have completed the “Getting Started” tutorial 18 | from that package, you should be fine. 19 | 20 | The main audience for these tutorials is instructors teaching data 21 | science and their students. Instructors want students to, for example, 22 | read Chapter 8 of [*Tidy Modeling with R*](https://www.tmwr.org/) (or 23 | something similar), typing in the code at the R Console along the way. 24 | Sadly, students almost never do that. Indeed, many (most?) of them won’t 25 | even read the assigned chapter. 26 | 27 | The promise we make to instructors is that, if they assign our tutorial 28 | for Chapter 8, then students will type in at least 90% of the code 29 | examples from the chapter, and then run the code to see what happens. We 30 | also pull out some of the most important prose from the chapter and do 31 | everything we can to cajole/trick students into reading it. These 32 | [two](https://ppbds.github.io/tutorial.helpers/articles/instructions.html) 33 | [essays](https://ppbds.github.io/tutorial.helpers/articles/books.html) 34 | provide background information about our approach. 
35 | 36 | Our causal claim is that, if an instructor were to randomly assign half 37 | the class to do these tutorials and half to simply complete the reading, 38 | the half completing the tutorials would perform much better for the rest 39 | of the course. 40 | 41 | ## Installation 42 | 43 | You can install the development version from 44 | [GitHub](https://github.com/) with: 45 | 46 | ``` r 47 | remotes::install_github("PPBDS/tidymodels.tutorials") 48 | ``` 49 | 50 | If R offers you the option to update some packages, you should do so. 51 | For packages that need compilation, feel free to answer “no”. 52 | 53 | Then **restart your R session** or **restart RStudio**. 54 | 55 | ## Accessing tutorials 56 | 57 | In order to access the tutorials, start by loading the package. 58 | 59 | ``` r 60 | library(tidymodels.tutorials) 61 | ``` 62 | 63 | You can access the tutorials via the Tutorial tab in the top right 64 | (Environment) pane in RStudio. 65 | 66 | If either of the following is happening to you 67 | 68 | 76 | 77 | Then **remember to restart your R session** after installing the 78 | package. 79 | 80 | Because tutorials within the Tutorial pane are sorted in alphabetical 81 | order by the name of the package, the **tidymodels.tutorials** will be 82 | toward the bottom. If you don’t see any tutorials, try clicking the 83 | “Home” button – the little house symbol with the thin red roof in the 84 | upper right. 85 | 86 | In order to expand the window, you can drag and enlarge the tutorial 87 | pane inside RStudio. In order to open a pop-up window, click the “Show 88 | in New Window” icon next to the home icon. 89 | 90 | You may notice that the Jobs tab in the lower left will create output as 91 | the tutorial is starting up. This is because RStudio is running the code 92 | to create the tutorial. 
If you accidentally clicked “Start Tutorial” and 93 | would like to stop the job from running, you can click the back arrow in 94 | the Jobs tab, and then press the red stop sign icon. Your work will be 95 | saved between RStudio sessions, meaning that you can complete a tutorial 96 | in multiple sittings. Once you have completed a tutorial, follow the 97 | instructions on the tutorial `Submit` page and, if you’re a student, 98 | submit your answers as instructed. 99 | -------------------------------------------------------------------------------- /TODO.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | Read Modelling Basics chapters. 4 | 5 | # Anish 6 | 7 | Read chapter 2. Complete tutorial 2, and send me your answers as html. Determine which material from chapter 1 is most important, and can fit, and then include it as knowledge drops. Add test chunks. 8 | 9 | 10 | 11 | 12 | Deal with censored issue. 13 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: ~ 2 | template: 3 | bootstrap: 5 4 | 5 | -------------------------------------------------------------------------------- /inst/tutorials/04-the-ames-housing-data/tutorial.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: The Ames Housing Data 3 | author: Pratham Kancherla and David Kane 4 | tutorial: 5 | id: the-ames-housing-data 6 | output: 7 | learnr::tutorial: 8 | progressive: yes 9 | allow_skip: yes 10 | runtime: shiny_prerendered 11 | description: 'Tutorial for Chapter 4: The Ames Housing Data' 12 | --- 13 | 14 | ```{r setup, include = FALSE} 15 | library(learnr) 16 | library(tidymodels) 17 | library(tidyverse) 18 | library(tutorial.helpers) 19 | knitr::opts_chunk$set(echo = FALSE) 20 | options(tutorial.exercise.timelimit = 60, 21 | tutorial.storage = "local") 22 | 23 | 
gg_hist <- ggplot(ames, aes(x = Sale_Price)) + 24 | geom_histogram(bins = 50, col= "white") + 25 | scale_x_log10() 26 | ``` 27 | 28 | ```{r copy-code-chunk, child = system.file("child_documents/copy_button.Rmd", package = "tutorial.helpers")} 29 | ``` 30 | 31 | ```{r info-section, child = system.file("child_documents/info_section.Rmd", package = "tutorial.helpers")} 32 | ``` 33 | 34 | ## Introduction 35 | ### 36 | 37 | This tutorial explores [Chapter 4: The Ames Housing Data](https://www.tmwr.org/ames.html) from [*Tidy Modeling with R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. We'll introduce the Ames housing data set [(De Cock 2011)](https://www.tmwr.org/ames.html#ref-ames), a key component in upcoming modeling examples. Conducting exploratory data analysis, as demonstrated in this tutorial, serves as an initial phase in constructing a dependable model. 38 | 39 | ## Exploring Features of Homes in Ames 40 | ### 41 | 42 | The data set contains information on 2,930 properties in Ames, Iowa, including columns related to: 43 | 44 | - house characteristics (bedrooms, garage, fireplace, pool, porch, etc.) 45 | 46 | - location (neighborhood) 47 | 48 | - lot information (zoning, shape, size, etc.) 49 | 50 | - ratings of condition and quality 51 | 52 | - sale price 53 | 54 | ### Exercise 1 55 | 56 | The Ames data has been made available through the **modeldata** package, which is a built-in **tidymodels** package. 57 | 58 | Load the library **tidymodels** using `library()`. 59 | 60 | ```{r exploring-features-of-homes-in-1, exercise = TRUE} 61 | 62 | ``` 63 | 64 | ```{r exploring-features-of-homes-in-1-hint-1, eval = FALSE} 65 | library(...)
66 | ``` 67 | 68 | ```{r exploring-features-of-homes-in-1-test, include = FALSE} 69 | library(tidymodels) 70 | ``` 71 | 72 | ### 73 | 74 | The core **tidymodels** packages: [**rsample**](https://rsample.tidymodels.org/), [**parsnip**](https://parsnip.tidymodels.org/), [**recipes**](https://recipes.tidymodels.org/), [**workflows**](https://workflows.tidymodels.org/), [**tune**](https://tune.tidymodels.org/), [**yardstick**](https://yardstick.tidymodels.org/), [**broom**](https://broom.tidymodels.org/), [**dials**](https://dials.tidymodels.org/) 75 | 76 | ### Exercise 2 77 | 78 | Now load the data `ames` from the package using the `data()` function. 79 | 80 | ```{r exploring-features-of-homes-in-2, exercise = TRUE} 81 | 82 | ``` 83 | 84 | ```{r exploring-features-of-homes-in-2-hint-1, eval = FALSE} 85 | data(...) 86 | ``` 87 | 88 | ```{r exploring-features-of-homes-in-2-test, include = FALSE} 89 | data(ames) 90 | ``` 91 | 92 | ### 93 | 94 | In R, the dim() function is used to retrieve or set the dimensions of an object, such as a matrix, array, or data frame. The dim() function returns a vector with two elements representing the number of rows and columns (dimensions) of the object. 95 | 96 | ### Exercise 3 97 | 98 | We want to look at how many rows and columns are in `ames`. Use `dim()` with `ames` as the parameter to retrieve a vector of (# of rows, # of columns). 99 | 100 | ```{r exploring-features-of-homes-in-3, exercise = TRUE} 101 | 102 | ``` 103 | 104 | ```{r exploring-features-of-homes-in-3-hint-1, eval = FALSE} 105 | dim(...) 
106 | ``` 107 | 108 | ```{r exploring-features-of-homes-in-3-test, include = FALSE} 109 | dim(ames) 110 | ``` 111 | 112 | ### 113 | 114 | Changes made to the data in the `modeldata` package are such that in the raw data, if a house did not have a particular feature, it was implicitly encoded as missing, the categorical predictors were converted to R’s factor data type, and quality descriptors for each house were removed since they are more like outcomes than predictors. 115 | 116 | ### Exercise 4 117 | 118 | Let’s start our exploratory data analysis by focusing on the outcome we want to predict: the last sale price of the house (in USD). We can create a histogram to see the distribution of sale prices. 119 | 120 | Type in ames to see the data that we are looking at. 121 | 122 | ```{r exploring-features-of-homes-in-4, exercise = TRUE} 123 | 124 | ``` 125 | 126 | ```{r exploring-features-of-homes-in-4-hint-1, eval = FALSE} 127 | ames 128 | ``` 129 | 130 | ### 131 | 132 | The root mean squared error (RMSE) is a common performance metric used in regression models. It uses the difference between the observed and predicted values in its calculations. 133 | 134 | ### Exercise 5 135 | 136 | Copy the previous code and pipe it to ggplot() to start creating a histogram. 137 | 138 | ```{r exploring-features-of-homes-in-5, exercise = TRUE} 139 | 140 | ``` 141 | 142 | 143 | 144 | ```{r exploring-features-of-homes-in-5-hint-1, eval = FALSE} 145 | ames |> 146 | ...() 147 | ``` 148 | 149 | ```{r exploring-features-of-homes-in-5-test, include = FALSE} 150 | ames |> 151 | ggplot() 152 | ``` 153 | 154 | ### 155 | 156 | In R, ggplot() is a function from the **ggplot2** package, which is a powerful and widely-used package for creating visualizations. 157 | 158 | ### Exercise 6 159 | 160 | We need to establish the x-axis on the graph. Copy the previous code and, using `aes()` within `ggplot()`, set `x` equal to `Sale_Price`.
161 | 162 | ```{r exploring-features-of-homes-in-6, exercise = TRUE} 163 | 164 | ``` 165 | 166 | 167 | 168 | ```{r exploring-features-of-homes-in-6-hint-1, eval = FALSE} 169 | ... |> 170 | ggplot(aes(x = ...)) 171 | ``` 172 | 173 | ```{r exploring-features-of-homes-in-6-test, include = FALSE} 174 | ames |> 175 | ggplot(aes(x = Sale_Price)) 176 | ``` 177 | 178 | ### 179 | 180 | The disadvantages of transforming the outcome mostly relate to interpretation of model results. 181 | 182 | ### Exercise 7 183 | 184 | Now we will create the histogram. Copy the previous code and add the geom_histogram() function to ggplot() using the `+` symbol. 185 | 186 | ```{r exploring-features-of-homes-in-7, exercise = TRUE} 187 | 188 | ``` 189 | 190 | 191 | 192 | ```{r exploring-features-of-homes-in-7-hint-1, eval = FALSE} 193 | ... + 194 | geom_histogram() 195 | ``` 196 | 197 | ```{r exploring-features-of-homes-in-7-test, include = FALSE} 198 | ames |> 199 | ggplot(aes(x = Sale_Price)) + 200 | geom_histogram() 201 | ``` 202 | 203 | ### 204 | 205 | In **ggplot2**, geom_histogram() is a function used to create a histogram, which is a graphical representation of the distribution of a continuous variable. A histogram divides the data into intervals (bins) and displays the frequency or count of data points falling into each bin. 206 | 207 | ### Exercise 8 208 | 209 | Let's add `bins` to the histogram. Copy the previous code and in geom_histogram(), add the parameter `bins`, setting it equal to `50`. 210 | 211 | ```{r exploring-features-of-homes-in-8, exercise = TRUE} 212 | 213 | ``` 214 | 215 | 216 | 217 | ```{r exploring-features-of-homes-in-8-hint-1, eval = FALSE} 218 | ... + 219 | geom_histogram(bins = ...)
220 | ``` 221 | 222 | ```{r exploring-features-of-homes-in-8-test, include = FALSE} 223 | ames |> 224 | ggplot(aes(x = Sale_Price)) + 225 | geom_histogram(bins = 50) 226 | ``` 227 | 228 | ### 229 | 230 | This plot shows us that the data are right-skewed; there are more inexpensive houses than expensive ones. When modeling this outcome, a strong argument can be made that the price should be log-transformed. 231 | 232 | ### Exercise 9 233 | 234 | Lets add `col` (color) to the histogram. Copy the previous code and in geom_histogram(), add the parameter `col`, setting it equal to `"white"`. 235 | 236 | ```{r exploring-features-of-homes-in-9, exercise = TRUE} 237 | 238 | ``` 239 | 240 | 241 | 242 | ```{r exploring-features-of-homes-in-9-hint-1, eval = FALSE} 243 | ... + 244 | geom_histogram(bins = 50, col = "...") 245 | ``` 246 | 247 | ```{r exploring-features-of-homes-in-9-test, include = FALSE} 248 | ames |> 249 | ggplot(aes(x = Sale_Price)) + 250 | geom_histogram(bins = 50, col = "white") 251 | ``` 252 | 253 | ### 254 | 255 | From a statistical perspective, a logarithmic transform may also stabilize the variance in a way that makes inference more legitimate. 256 | 257 | ### Exercise 10 258 | 259 | Copy the previous code and add `scale_x_log10()`. 260 | 261 | ```{r exploring-features-of-homes-in-10, exercise = TRUE} 262 | 263 | ``` 264 | 265 | 266 | 267 | ```{r exploring-features-of-homes-in-10-hint-1, eval = FALSE} 268 | ... + 269 | scale_x_...() 270 | ``` 271 | 272 | ```{r exploring-features-of-homes-in-10-test, include = FALSE} 273 | ames |> 274 | ggplot(aes(x = Sale_Price)) + 275 | geom_histogram(bins = 50, col = "white") + 276 | scale_x_log10() 277 | ``` 278 | 279 | ### 280 | 281 | The advantages of this type of transformation are that no houses would be predicted with negative sale prices and that errors in predicting expensive houses will not have an undue influence on the model. 282 | 283 | ### 284 | 285 | Great Job! 
You now understand the basic concept of exploratory data analysis by analyzing the sale price of homes in Ames, Iowa. 286 | 287 | 288 | 289 | ## Summary 290 | ### 291 | 292 | This tutorial covered [Chapter 4: The Ames Housing Data](https://www.tmwr.org/ames.html) from [*Tidy Modeling with R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. We introduced the Ames housing data set [(De Cock 2011)](https://www.tmwr.org/ames.html#ref-ames), which we will use in modeling examples in later tutorials. Exploratory data analysis, like what we walk through in this tutorial, is an important first step in building a reliable model and you now have a basic understanding of this concept. 293 | 294 | ```{r download-answers, child = system.file("child_documents/download_answers.Rmd", package = "tutorial.helpers")} 295 | ``` 296 | -------------------------------------------------------------------------------- /inst/tutorials/05-spending-our-data/tutorial.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Spending our Data 3 | author: Aryan Kancherla 4 | tutorial: 5 | id: spending-our-data 6 | output: 7 | learnr::tutorial: 8 | progressive: yes 9 | allow_skip: yes 10 | runtime: shiny_prerendered 11 | description: 'Tutorial for Chapter 5: Spending our Data' 12 | --- 13 | 14 | ```{r setup, include = FALSE} 15 | library(learnr) 16 | library(tutorial.helpers) 17 | library(tidymodels) 18 | tidymodels_prefer() 19 | knitr::opts_chunk$set(echo = FALSE) 20 | options(tutorial.exercise.timelimit = 60, 21 | tutorial.storage = "local") 22 | 23 | set.seed(501) 24 | ames_split <- initial_split(ames, prop = 0.80) 25 | 26 | ames_update <- ames |> 27 | mutate(Sale_Price = log10(Sale_Price)) 28 | 29 | ames_plot <- ames_update |> 30 | ggplot(aes(x = Sale_Price)) + 31 | geom_density() + theme_classic() + 32 | labs(x = "Sale Price (log-10 USD)") 33 | 34 | set.seed(502) 35 | 36 | ames_strata_split <- initial_split(ames, prop = 0.80, strata = Sale_Price)
37 | 38 | 39 | ``` 40 | 41 | ```{r copy-code-chunk, child = system.file("child_documents/copy_button.Rmd", package = "tutorial.helpers")} 42 | ``` 43 | 44 | ```{r info-section, child = system.file("child_documents/info_section.Rmd", package = "tutorial.helpers")} 45 | ``` 46 | 47 | ## Introduction 48 | ### 49 | 50 | This tutorial covers [Chapter 5: Spending our Data](https://www.tmwr.org/splitting.html) from [*Tidy Modeling with R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. In this tutorial, you will learn how to partition data into distinct groups for modeling and evaluation. The functions that will be used to do this are `initial_split()`, `training()`, and `testing()` from the [**tidymodels**](https://www.tidymodels.org/packages/) and [**rsample**](https://rsample.tidymodels.org/) packages. 51 | 52 | 53 | ## Common Method for Splitting Data 54 | ### 55 | 56 | There are several steps to creating a useful model, including parameter estimation, model selection and tuning, and performance assessment. At the start of a new project, there is usually an initial finite pool of data available for all these tasks, which we can think of as an available data budget. How should the data be applied to different steps or tasks? The idea of data spending is an important first consideration when modeling, especially as it relates to empirical validation. 57 | 58 | ### Exercise 1 59 | 60 | Load the **tidymodels** package below, using `library()`. 61 | 62 | ```{r common-method-for-sp-1, exercise = TRUE} 63 | 64 | ``` 65 | 66 | ```{r common-method-for-sp-1-hint-1, eval = FALSE} 67 | library(...) 68 | ``` 69 | 70 | ```{r include = FALSE} 71 | library(tidymodels) 72 | ``` 73 | 74 | ### 75 | 76 | When there are copious amounts of data available, a smart strategy is to allocate specific subsets of data for different tasks, as opposed to allocating the largest possible amount (or even all) to the model parameter estimation only. 
For example, one possible strategy (when both data and predictors are abundant) is to spend a specific subset of data to determine which predictors are informative, before considering parameter estimation at all. 77 | 78 | 79 | 80 | ### Exercise 2 81 | 82 | To combat the function naming conflicts, type in `tidymodels_prefer()`. 83 | 84 | ```{r common-method-for-sp-2, exercise = TRUE} 85 | 86 | ``` 87 | 88 | ```{r common-method-for-sp-2-hint-1, eval = FALSE} 89 | ...() 90 | ``` 91 | 92 | ```{r include = FALSE} 93 | tidymodels_prefer() 94 | ``` 95 | 96 | ### 97 | 98 | The primary approach for empirical model validation is to split the existing pool of data into two distinct sets: the training set and the test set. One portion of the data is used to develop and optimize the model. This *training set* is usually the majority of the data. These data are a sandbox for model building where different models can be fit, feature engineering strategies are investigated, and so on. As modeling practitioners, we spend the vast majority of the modeling process using the training set as the substrate to develop the model. 99 | 100 | 101 | ### Exercise 3 102 | 103 | In order to split data, we will need to use the `initial_split()` function from the *rsample* package. Type in `?initial_split()` in the Console and look at the Description section. CP/CR. 104 | 105 | ```{r common-method-for-sp-3} 106 | question_text(NULL, 107 | answer(NULL, correct = TRUE), 108 | allow_retry = TRUE, 109 | try_again_button = "Edit Answer", 110 | incorrect = NULL, 111 | rows = 3) 112 | ``` 113 | 114 | ### 115 | 116 | Since one portion of the data is placed in the training set, the other portion of the data is placed into the *test set*. This is held in reserve until one or two models are chosen as the methods most likely to succeed. The test set is then used as the final arbiter to determine the efficacy of the model.
It is critical to look at the test set only once; otherwise, it becomes part of the modeling process. 117 | 118 | 119 | ### Exercise 4 120 | 121 | The data we will be splitting is the `ames` data set. Type in `ames` and press "Run Code". 122 | 123 | ```{r common-method-for-sp-4, exercise = TRUE} 124 | 125 | ``` 126 | 127 | 128 | 129 | ```{r common-method-for-sp-4-hint-1, eval = FALSE} 130 | ... 131 | ``` 132 | 133 | ```{r include = FALSE} 134 | ames 135 | ``` 136 | 137 | ### 138 | 139 | The `ames` data set contains information on 2,930 properties in Ames, Iowa, including columns related to: 140 | 141 | - house characteristics (bedrooms, garage, fireplace, pool, porch, etc.) 142 | - location (neighborhood) 143 | - lot information (zoning, shape, size, etc.) 144 | - ratings of condition and quality 145 | - sale price 146 | 147 | ### Exercise 5 148 | 149 | In order to make sure the results can be produced later, we are going to use the `set.seed()` function. In the code chunk below, type in `set.seed()` and pass in `501`. 150 | 151 | ```{r common-method-for-sp-5, exercise = TRUE} 152 | 153 | ``` 154 | 155 | 156 | 157 | ```{r common-method-for-sp-5-hint-1, eval = FALSE} 158 | set.seed(...) 159 | ``` 160 | 161 | ```{r include = FALSE} 162 | set.seed(501) 163 | ``` 164 | 165 | ### 166 | 167 | Note that the method for conducting the splitting of data depends on the context. 168 | 169 | ### Exercise 6 170 | 171 | Lets allocate 80% of the data to the training set and the remaining 20% for the testing set. In the code chunk below, type in `initial_split()` passing in the `ames` data set. 172 | 173 | ```{r common-method-for-sp-6, exercise = TRUE} 174 | 175 | ``` 176 | 177 | ```{r common-method-for-sp-6-hint-1, eval = FALSE} 178 | initial_split(...) 179 | ``` 180 | 181 | ```{r include = FALSE} 182 | initial_split(ames) 183 | ``` 184 | 185 | ### 186 | 187 | As you can see, the data spits out a training number, testing number, and total number. 
The *Total* stands for the total amount of data in the data set. The *Training* number stands for the amount of data placed in the training set and the *Testing* number stands for the amount of data placed in the testing set. 188 | 189 | ### Exercise 7 190 | 191 | By doing the math, you can see that the data allocated to the training and testing sets are not what we wanted. The training set contains 75% of the data and the testing set contains 25% of the data. However, we want the training set to have 80% of the data and the testing set to have 20% of the data. 192 | 193 | To fix this, copy the previous code and inside `initial_split()`, set the `prop` argument to `0.80`. 194 | 195 | ```{r common-method-for-sp-7, exercise = TRUE} 196 | 197 | ``` 198 | 199 | 200 | 201 | ```{r common-method-for-sp-7-hint-1, eval = FALSE} 202 | initial_split(ames, prop = ...) 203 | ``` 204 | 205 | ```{r include = FALSE} 206 | initial_split(ames, prop = 0.80) 207 | ``` 208 | 209 | ### 210 | 211 | Doing the math, we can now see that 80% of the data (n = 2,344) is in the training set and 20% (n = 586) is in the testing set. 212 | 213 | ### Exercise 8 214 | 215 | Copy the previous code and set it to the variable `ames_split`. 216 | 217 | ```{r common-method-for-sp-8, exercise = TRUE} 218 | 219 | ``` 220 | 221 | 222 | 223 | ```{r common-method-for-sp-8-hint-1, eval = FALSE} 224 | ... <- initial_split(ames, prop = 0.80) 225 | ``` 226 | 227 | ```{r include = FALSE} 228 | ames_split <- initial_split(ames, prop = 0.80) 229 | 230 | ``` 231 | 232 | ### 233 | 234 | The **rsample** package also provides a `group_initial_split()` function for splitting data. Click [here](https://rsample.tidymodels.org/reference/initial_split.html) to learn more. 235 | 236 | ### Exercise 9 237 | 238 | The object `ames_split` is an `rsplit` object and contains only the partitioning information; to get the resulting data sets, we need to apply two more functions: `training()` and `testing()`.
In the code below, type `training()` and passing in `ames_split`. 239 | 240 | ```{r common-method-for-sp-9, exercise = TRUE} 241 | 242 | ``` 243 | 244 | ```{r common-method-for-sp-9-hint-1, eval = FALSE} 245 | training(...) 246 | ``` 247 | 248 | ```{r include = FALSE} 249 | training(ames_split) 250 | ``` 251 | 252 | ### 253 | 254 | As you can see, the `training()` function gets the tibble that contains all of the training data. 255 | 256 | ### Exercise 10 257 | 258 | Copy the previous code and pass it into the `dim()` function. 259 | 260 | ```{r common-method-for-sp-10, exercise = TRUE} 261 | 262 | ``` 263 | 264 | 265 | 266 | ```{r common-method-for-sp-10-hint-1, eval = FALSE} 267 | ...(training(ames_split)) 268 | ``` 269 | 270 | ```{r include = FALSE} 271 | dim(training(ames_split)) 272 | 273 | ``` 274 | 275 | ### 276 | 277 | The `dim()` function is used to determine the dimensions of an object. It returns a numerical vector that contains the number of rows and columns in the object. As you can see, the training data contains 2344 rows and 74 columns. 278 | 279 | ### Exercise 11 280 | 281 | Now, lets extract the testing data. Copy the code above and change `training` to `testing`. 282 | 283 | ```{r common-method-for-sp-11, exercise = TRUE} 284 | 285 | ``` 286 | 287 | 288 | 289 | ```{r common-method-for-sp-11-hint-1, eval = FALSE} 290 | dim(...(ames_split)) 291 | ``` 292 | 293 | ```{r include = FALSE} 294 | dim(testing(ames_split)) 295 | ``` 296 | 297 | ### 298 | 299 | As you can see, the `dim()` and `testing()` functions returned all of the testing data, which contains 586 rows. 300 | 301 | 302 | ## Stratified Sampling 303 | ### 304 | 305 | Simple random sampling is appropriate in many cases but there are exceptions. When there is a dramatic class imbalance in classification problems, one class occurs much less frequently than another. Using a simple random sample may haphazardly allocate these infrequent samples disproportionately into the training or test set. 
306 | 307 | To avoid this, *stratified sampling* can be used. The training/test split is conducted separately within each class and then these subsamples are combined into the overall training and test set. For regression problems, the outcome data can be artificially binned into quartiles and then stratified sampling can be conducted four separate times. This is an effective method for keeping the distributions of the outcome similar between the training and test set. 308 | 309 | 310 | ### Exercise 1 311 | 312 | Let's create the following graph, which shows the distribution of sales prices from the `ames` data set. 313 | 314 | ```{r} 315 | ames_plot 316 | ``` 317 | 318 | Before we start however, we need to modify the `ames` data set so that it is on a logarithmic scale. Start by piping `ames` to `mutate()`. 319 | 320 | ```{r stratified-sampling-1, exercise = TRUE} 321 | 322 | ``` 323 | 324 | ```{r stratified-sampling-1-hint-1, eval = FALSE} 325 | ames |> 326 | ...() 327 | ``` 328 | 329 | ```{r include = FALSE} 330 | ames |> 331 | mutate() 332 | ``` 333 | 334 | ### 335 | 336 | The `log10()` function to modify the data so that it is on a logarithmic scale. 337 | 338 | ### Exercise 2 339 | 340 | Copy the previous code. Inside `mutate()`, set `Sale_Price` to `log10(Sale_Price)`. 341 | 342 | ```{r stratified-sampling-2, exercise = TRUE} 343 | 344 | ``` 345 | 346 | 347 | 348 | ```{r stratified-sampling-2-hint-1, eval = FALSE} 349 | ames |> 350 | mutate(... = log10(...)) 351 | ``` 352 | 353 | ```{r include = FALSE} 354 | ames |> 355 | mutate(Sale_Price = log10(Sale_Price)) 356 | ``` 357 | 358 | ### 359 | 360 | The root mean squared error (RMSE) is a common performance metric used in regression models. It uses the difference between the observed and predicted values in its calculations. If the sale price is on the log scale, these differences (i.e., the residuals) are also on the log scale. 
361 | 362 | ### Exercise 3 363 | 364 | Copy the previous code and save it to the variable `ames_update`. 365 | 366 | ```{r stratified-sampling-3, exercise = TRUE} 367 | 368 | ``` 369 | 370 | 371 | 372 | ```{r stratified-sampling-3-hint-1, eval = FALSE} 373 | ... <- ames |> 374 | mutate(Sale_Price = log10(Sale_Price)) 375 | ``` 376 | 377 | ```{r include = FALSE} 378 | ames_update <- ames |> 379 | mutate(Sale_Price = log10(Sale_Price)) 380 | ``` 381 | 382 | ### 383 | 384 | When data are reused for multiple tasks, instead of carefully “spent” from the finite data budget, certain risks increase, such as the risk of accentuating bias or compounding effects from methodological errors. 385 | 386 | 387 | 388 | ### Exercise 4 389 | 390 | Now, lets start creating the graph. Start by piping `ames_update` to `ggplot()`. 391 | 392 | ```{r stratified-sampling-4, exercise = TRUE} 393 | 394 | ``` 395 | 396 | ```{r stratified-sampling-4-hint-1, eval = FALSE} 397 | ... |> 398 | ggplot() 399 | ``` 400 | 401 | ### 402 | 403 | As a reminder, the `ggplot()` function, which comes from the **ggplot2** library, is used to create data visualizations. 404 | 405 | ### Exercise 5 406 | 407 | Copy the previous code. Inside `ggplot()` type in `aes()`. Inside `aes()` set `x` to `Sale_Price`. 408 | 409 | ```{r stratified-sampling-5, exercise = TRUE} 410 | 411 | ``` 412 | 413 | 414 | 415 | ```{r stratified-sampling-5-hint-1, eval = FALSE} 416 | ames_update |> 417 | ggplot(...(x = ...)) 418 | ``` 419 | 420 | ```{r include = FALSE} 421 | ames_update |> 422 | ggplot(aes(x = Sale_Price)) 423 | ``` 424 | 425 | ### 426 | 427 | If a model has limited fidelity to the data, the inferences generated by the model should be highly suspect. In other words, statistical significance may not be sufficient proof that a model is appropriate. 428 | 429 | ### Exercise 6 430 | 431 | Copy the previous code and add `geom_density()` to the plot. 
432 | 433 | ```{r stratified-sampling-6, exercise = TRUE} 434 | 435 | ``` 436 | 437 | 438 | 439 | ```{r stratified-sampling-6-hint-1, eval = FALSE} 440 | ames_update |> 441 | ggplot(aes(x = Sale_Price)) + 442 | ...() 443 | ``` 444 | 445 | ```{r include = FALSE} 446 | ames_update |> 447 | ggplot(aes(x = Sale_Price)) + 448 | geom_density() 449 | ``` 450 | 451 | ### 452 | 453 | `geom_density()` creates a density plot. A density plot is a graphical representation of the distribution of a numeric value (which in this case is `Sale_Price`). 454 | 455 | ### Exercise 7 456 | 457 | Copy the previous code and add `theme_classic()` to make the graph look nicer. 458 | 459 | ```{r stratified-sampling-7, exercise = TRUE} 460 | 461 | ``` 462 | 463 | 464 | 465 | ```{r stratified-sampling-7-hint-1, eval = FALSE} 466 | ames_update |> 467 | ggplot(aes(x = Sale_Price)) + 468 | geom_density() + 469 | ...() 470 | ``` 471 | 472 | ```{r include = FALSE} 473 | ames_update |> 474 | ggplot(aes(x = Sale_Price)) + 475 | geom_density() + 476 | theme_classic() 477 | ``` 478 | 479 | ### 480 | 481 | `theme_classic()` is one of the various themes you can use for your graphs. This [link](https://ggplot2.tidyverse.org/reference/ggtheme.html) provides more themes. 482 | 483 | ### Exercise 8 484 | 485 | Finally, copy the previous code and add your `labs()`. The final graph should look like this: 486 | 487 | ```{r} 488 | ames_plot 489 | ``` 490 | 491 | ```{r stratified-sampling-8, exercise = TRUE} 492 | 493 | ``` 494 | 495 | 496 | 497 | ```{r stratified-sampling-8-hint-1, eval = FALSE} 498 | ames_update |> 499 | ggplot(aes(x = Sale_Price)) + 500 | geom_density() + 501 | theme_classic() + 502 | labs( 503 | x = ... 504 | ) 505 | ``` 506 | 507 | ### 508 | 509 | As you can see, the sale price distribution is right-skewed, with proportionally more inexpensive houses than expensive houses on either side of the center of the distribution. 
The worry here with simple splitting is that the more expensive houses would not be well represented in the training set; this would increase the risk that our model would be ineffective at predicting the price for such properties. 510 | 511 | ### Exercise 9 512 | 513 | In order to fix this, We can use a stratified random sample. In the **rsample** package, we can use the `strata` argument in the `initial_split()` function. 514 | 515 | Before we do that, type in `set.seed()` and pass in `502`. 516 | 517 | ```{r stratified-sampling-9, exercise = TRUE} 518 | 519 | ``` 520 | 521 | ```{r stratified-sampling-9-hint-1, eval = FALSE} 522 | set.seed(...) 523 | ``` 524 | 525 | ```{r include = FALSE} 526 | set.seed(502) 527 | ``` 528 | 529 | ### 530 | 531 | As a reminder, the `set.seed()` function is used in order to make sure the results can be produced later. 532 | 533 | ### Exercise 10 534 | 535 | Take a look at the `initial_split()` code from the previous section. 536 | 537 | Now, lets add the `strata` argument. Inside `initial_split()`, set `strata` to `Sale_Price`. 538 | 539 | ```{r stratified-sampling-10, exercise = TRUE} 540 | initial_split(ames, prop = 0.80) 541 | ``` 542 | 543 | ```{r stratified-sampling-10-hint-1, eval = FALSE} 544 | initial_split(ames, prop = 0.80, ... = Sale_Price) 545 | ``` 546 | 547 | ```{r include = FALSE} 548 | initial_split(ames, prop = 0.80, strata = Sale_Price) 549 | ``` 550 | 551 | ### 552 | 553 | Stratified sampling is a sampling technique where the data is divided into subgroups (strata) based on the levels of a categorical variable. The sampling is then performed independently within each stratum, ensuring that each stratum is represented proportionally in both the training and testing sets. This is particularly useful when you have imbalanced data or when you want to ensure that certain groups are well-represented in the training and testing sets. 
554 | 555 | ### Exercise 11 556 | 557 | Copy the previous code and save it to the variable `ames_strata_split`. 558 | 559 | ```{r stratified-sampling-11, exercise = TRUE} 560 | 561 | ``` 562 | 563 | 564 | 565 | ```{r stratified-sampling-11-hint-1, eval = FALSE} 566 | ... <- initial_split(ames, prop = 0.80, strata = Sale_Price) 567 | ``` 568 | 569 | ```{r include = FALSE} 570 | ames_strata_split <- initial_split(ames, prop = 0.80, strata = Sale_Price) 571 | 572 | ``` 573 | 574 | ### 575 | 576 | The proportion of data that should be allocated for splitting is highly dependent on the context of the problem at hand. Too little data in the training set hampers the model’s ability to find appropriate parameter estimates. Conversely, too little data in the test set lowers the quality of the performance estimates 577 | 578 | ### Exercise 12 579 | 580 | Now that we added the `strata` argument, we can reuse the `training()` and `testing()` functions. In the code chunk below, type in `dim()`. Inside `dim()`, type in `training()` and pass in `ames_strata_split`. 581 | 582 | ```{r stratified-sampling-12, exercise = TRUE} 583 | 584 | ``` 585 | 586 | ```{r stratified-sampling-12-hint-1, eval = FALSE} 587 | dim(...(ames_strata_split)) 588 | ``` 589 | 590 | ```{r include = FALSE} 591 | training(ames_strata_split) 592 | ``` 593 | 594 | ### 595 | 596 | As you can see, the training data now contains 2,342 rows. 597 | 598 | ### Exercise 13 599 | 600 | Copy the previous code and change `training()` to `testing()`. 601 | 602 | ```{r stratified-sampling-13, exercise = TRUE} 603 | 604 | ``` 605 | 606 | 607 | 608 | ```{r stratified-sampling-13-hint-1, eval = FALSE} 609 | dim(...(ames_strata_split)) 610 | ``` 611 | 612 | ```{r include = FALSE} 613 | dim(testing(ames_strata_split)) 614 | ``` 615 | 616 | ### 617 | 618 | Are there situations when random sampling is not the best choice? One case is when the data have a significant time component, such as time series data. 
Here, it is more common to use the most recent data as the test set. The **rsample** package contains a function called `initial_time_split()` that is very similar to `initial_split()`. Instead of using random sampling, the prop argument denotes what proportion of the first part of the data should be used as the training set; the function assumes that the data have been pre-sorted in an appropriate order. 619 | 620 | ### Exercise 14 621 | 622 | Chapter 5 of the *Tidy Modeling With R* textbook contains more information regarding how to spend data. Click on this [link](https://www.tmwr.org/splitting.html) and type in the names of sections 5.2 - 5.4. 623 | 624 | ```{r stratified-sampling-14} 625 | question_text(NULL, 626 | answer(NULL, correct = TRUE), 627 | allow_retry = TRUE, 628 | try_again_button = "Edit Answer", 629 | incorrect = NULL, 630 | rows = 3) 631 | ``` 632 | 633 | ### 634 | 635 | As you can see, the chapter covers more information, such as validation sets, multilevel data, and data budgets. 636 | 637 | ## Summary 638 | ### 639 | 640 | In this tutorial you have learned: 641 | 642 | - How to split data using `initial_split()` 643 | 644 | - How to allocate data towards the training and testing sets by using the `prop` argument inside `initial_split()` 645 | 646 | - How to train and test data, using `training()` and `testing()` respectively. 
647 | 648 | - How to conduct a stratified random sample by using the `strata` argument inside `initial_split()` 649 | 650 | ```{r download-answers, child = system.file("child_documents/download_answers.Rmd", package = "tutorial.helpers")} 651 | ``` 652 | -------------------------------------------------------------------------------- /inst/tutorials/06-fitting-models-with-parsnip/images/DiagramOne.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/06-fitting-models-with-parsnip/images/DiagramOne.png -------------------------------------------------------------------------------- /inst/tutorials/06-fitting-models-with-parsnip/images/TableFour.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/06-fitting-models-with-parsnip/images/TableFour.png -------------------------------------------------------------------------------- /inst/tutorials/06-fitting-models-with-parsnip/images/TableOne.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/06-fitting-models-with-parsnip/images/TableOne.png -------------------------------------------------------------------------------- /inst/tutorials/06-fitting-models-with-parsnip/images/TableThree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/06-fitting-models-with-parsnip/images/TableThree.png -------------------------------------------------------------------------------- 
/inst/tutorials/06-fitting-models-with-parsnip/images/TableTwo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/06-fitting-models-with-parsnip/images/TableTwo.png -------------------------------------------------------------------------------- /inst/tutorials/07-a-model-workflow/images/img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/07-a-model-workflow/images/img.png -------------------------------------------------------------------------------- /inst/tutorials/07-a-model-workflow/tutorial.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: A Model Workflow 3 | author: Pratham Kancherla and David Kane 4 | tutorial: 5 | id: a-model-workflow 6 | output: 7 | learnr::tutorial: 8 | progressive: true 9 | allow_skip: true 10 | runtime: shiny_prerendered 11 | description: 'Tutorial for Chapter 7: A Model Workflow' 12 | --- 13 | 14 | ```{r setup, include = FALSE} 15 | library(learnr) 16 | library(tutorial.helpers) 17 | library(knitr) 18 | library(tidyverse) 19 | library(tidymodels) 20 | library(lme4) 21 | library(multilevelmod) 22 | library(nlme) 23 | library(workflowsets) 24 | 25 | tidymodels_prefer() 26 | 27 | knitr::opts_chunk$set(echo = FALSE) 28 | options(tutorial.exercise.timelimit = 60, 29 | tutorial.storage = "local") 30 | 31 | lm_model <- 32 | linear_reg() |> 33 | set_engine("lm") 34 | 35 | lm_wflow <- 36 | workflow() |> 37 | add_model(lm_model) 38 | 39 | lm_wflow <- 40 | lm_wflow |> 41 | add_formula(Sale_Price ~ Longitude + Latitude) 42 | 43 | data(ames) 44 | 45 | ames <- mutate(ames, Sale_Price = log10(Sale_Price)) 46 | 47 | ames_split <- initial_split(ames, prop = 0.80, strata = Sale_Price) 48 | 49 | 
ames_train <- training(ames_split) 50 | 51 | ames_test <- testing(ames_split) 52 | 53 | lm_fit <- fit(lm_wflow, ames_train) 54 | 55 | lm_wflow <- 56 | lm_wflow |> 57 | remove_formula() |> 58 | add_variables(outcome = Sale_Price, 59 | predictors = c(Longitude, Latitude) 60 | ) 61 | 62 | multilevel_spec <- 63 | linear_reg() |> 64 | set_engine("lmer") 65 | 66 | multilevel_workflow <- 67 | workflow() |> 68 | add_variables(outcome = distance, 69 | predictors = c(Sex, age, Subject)) |> 70 | add_model(multilevel_spec, 71 | formula = distance ~ Sex + (age | Subject) 72 | ) 73 | 74 | multilevel_fit <- fit(multilevel_workflow, data = Orthodont) 75 | 76 | location <- list( 77 | longitude = Sale_Price ~ Longitude, 78 | latitude = Sale_Price ~ Latitude, 79 | coords = Sale_Price ~ Longitude + Latitude, 80 | neighborhood = Sale_Price ~ Neighborhood 81 | ) 82 | 83 | location_models <- workflow_set(preproc = location, models = list(lm = lm_model)) 84 | 85 | location_models <- 86 | location_models %>% 87 | mutate(fit = map(info, ~ fit(.x$workflow[[1]], ames_train))) 88 | 89 | final_lm_res <- last_fit(lm_wflow, ames_split) 90 | 91 | c_mtrcs <- collect_metrics(final_lm_res) 92 | c_predic <- 93 | collect_predictions(final_lm_res) |> 94 | slice(1:5) 95 | 96 | ``` 97 | 98 | ```{r copy-code-chunk, child = system.file("child_documents/copy_button.Rmd", package = "tutorial.helpers")} 99 | ``` 100 | 101 | ```{r info-section, child = system.file("child_documents/info_section.Rmd", package = "tutorial.helpers")} 102 | ``` 103 | 104 | ## Introduction 105 | ### 106 | 107 | This tutorial covers [Chapter 7: A Model Workflow](https://www.tmwr.org/workflows.html) from [*Tidy Modeling with R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. In the previous chapter, we discussed the [**parsnip**](https://parsnip.tidymodels.org/) package, which can be used to define and fit the model. This chapter introduces a new concept called a model workflow. 
The purpose of this concept (and the corresponding **tidymodels** `workflow()` object) is to encapsulate the major pieces of the modeling process. 108 | 109 | ## Workflow Basics 110 | ### 111 | 112 | PCA is a way to replace correlated predictors with new artificial features that are uncorrelated and capture most of the information in the original set. 113 | 114 | ```{r} 115 | #| echo: false 116 | #| message: false 117 | #| warning: false 118 | 119 | #include_graphics("inst/tutorials/07-a-model-workflow/images/img.png") 120 | 121 | ``` 122 | 123 | The workflows package allows the user to bind modeling and preprocessing objects together. Let’s start again with the Ames data and a simple linear model. 124 | 125 | ### Exercise 1 126 | 127 | Load the library **tidymodels** using `library()`. 128 | 129 | ```{r workflow-basics-1, exercise = TRUE} 130 | 131 | ``` 132 | 133 | ```{r workflow-basics-1-hint-1, eval = FALSE} 134 | library(...) 135 | ``` 136 | 137 | ```{r include = FALSE} 138 | library(tidymodels) 139 | ``` 140 | 141 | ### 142 | 143 | The core **tidymodels** packages: [**rsample**](https://rsample.tidymodels.org/), [**parsnip**](https://parsnip.tidymodels.org/), [**recipes**](https://recipes.tidymodels.org/), [**workflows**](https://workflows.tidymodels.org/), [**tune**](https://tune.tidymodels.org/), [**yardstick**](https://yardstick.tidymodels.org/), [**broom**](https://broom.tidymodels.org/), [**dials**](https://dials.tidymodels.org/) 144 | 145 | ### Exercise 2 146 | 147 | The **workflows** package allows the user to bind modeling and pre-processing objects together. Let’s start again with the Ames data. Enter `linear_reg()` and hit "Run Code". 
148 | 149 | ```{r workflow-basics-2, exercise = TRUE} 150 | 151 | ``` 152 | 153 | ```{r workflow-basics-2-hint-1, eval = FALSE} 154 | linear_reg() 155 | ``` 156 | 157 | ```{r include = FALSE} 158 | linear_reg() 159 | ``` 160 | 161 | ### 162 | 163 | `linear_reg()` is used to specify and fit a linear regression model in the **tidymodels** framework. It is similar to other model functions in **parsnip** and follows the same pattern. 164 | 165 | ### Exercise 3 166 | 167 | Copy the previous code and pipe `set_engine()`, with the parameter being `"lm"`, by using the pipe operator. Set this equal to `lm_model`. 168 | 169 | ```{r workflow-basics-3, exercise = TRUE} 170 | 171 | ``` 172 | 173 | 174 | 175 | ```{r workflow-basics-3-hint-1, eval = FALSE} 176 | lm_model <- 177 | ... |> 178 | set_engine("...") 179 | ``` 180 | 181 | ```{r include = FALSE} 182 | lm_model <- 183 | linear_reg() |> 184 | set_engine("lm") 185 | ``` 186 | 187 | ### 188 | 189 | It is important to focus on the broader modeling process, instead of only fitting the specific model used to estimate parameters. This broader process includes any pre-processing steps, the model fit itself, as well as potential post-processing activities. 190 | 191 | ### Exercise 4 192 | 193 | A workflow always requires a parsnip model object. Type in `workflow()` and hit "Run Code". 194 | 195 | ```{r workflow-basics-4, exercise = TRUE} 196 | 197 | ``` 198 | 199 | ```{r workflow-basics-4-hint-1, eval = FALSE} 200 | workflow() 201 | ``` 202 | 203 | ```{r include = FALSE} 204 | workflow() 205 | ``` 206 | 207 | ### 208 | 209 | A workflow object can include steps such as data pre-processing, feature engineering, model specification, model fitting, and evaluation. Each step is represented by a modeling object or a function. 210 | 211 | ### Exercise 5 212 | 213 | Copy the previous code and pipe `add_model()`, with the parameter being `lm_model`, by using the pipe operator. Set this equal to `lm_wflow`. 
214 | 215 | ```{r workflow-basics-5, exercise = TRUE} 216 | 217 | ``` 218 | 219 | 220 | 221 | ```{r workflow-basics-5-hint-1, eval = FALSE} 222 | ... |> 223 | add_model(...) 224 | ``` 225 | 226 | ```{r include = FALSE} 227 | lm_wflow <- 228 | workflow() |> 229 | add_model(lm_model) 230 | ``` 231 | 232 | ### 233 | 234 | Principal Component Analysis (PCA) signal extraction is a way to replace correlated predictors with new artificial features that are uncorrelated and capture most of the information in the original set. 235 | 236 | ### Exercise 6 237 | 238 | Type `lm_wflow` on the next line and hit "Run Code". 239 | 240 | ```{r workflow-basics-6, exercise = TRUE} 241 | 242 | ``` 243 | 244 | ```{r workflow-basics-6-hint-1, eval = FALSE} 245 | lm_wflow 246 | ``` 247 | 248 | ```{r include = FALSE} 249 | lm_wflow 250 | ``` 251 | 252 | ### 253 | 254 | Notice how the preprocessor has not been defined yet. In statistics, a preprocessor refers to a step or a set of steps taken before modeling or analyzing the data. The main goal of a preprocessor is to transform the raw data into a format that is more suitable for the subsequent statistical analysis or modeling tasks. 255 | 256 | ### Exercise 7 257 | 258 | The `add_formula()` function can be used to add a formula to the preprocessor. Copy the previous code and pipe `add_formula()`, with the formula being `Sale_Price ~ Longitude + Latitude`. Set it equal to `lm_wflow`. 259 | 260 | ```{r workflow-basics-7, exercise = TRUE} 261 | 262 | ``` 263 | 264 | 265 | 266 | ```{r workflow-basics-7-hint-1, eval = FALSE} 267 | lm_wflow <- 268 | ... |> 269 | add_formula(Sale_Price ~ Longitude + Latitude) 270 | ``` 271 | 272 | ```{r include = FALSE} 273 | lm_wflow <- 274 | lm_wflow |> 275 | add_formula(Sale_Price ~ Longitude + Latitude) 276 | ``` 277 | 278 | ### 279 | 280 | The `fit()` function is used to train a specified model on a given dataset, using the formula and data provided in the model specification.
It returns a fitted model object that can be used for prediction and evaluation. 281 | 282 | ### Exercise 8 283 | 284 | We will be using some of the objects created from the previous tutorial to make our fitted model using the `fit()` function. Within the function, add the parameters, `lm_wflow` and `ames_train`. Set this expression equal to `lm_fit`. 285 | 286 | ```{r workflow-basics-8, exercise = TRUE} 287 | 288 | ``` 289 | 290 | ```{r workflow-basics-8-hint-1, eval = FALSE} 291 | ... <- fit(..., ames_train) 292 | ``` 293 | 294 | ```{r include = FALSE} 295 | lm_fit <- fit(lm_wflow, ames_train) 296 | ``` 297 | 298 | ### 299 | 300 | The `predict()` function works with a wide range of models, including linear regression, generalized linear models, decision trees, random forests, support vector machines, and many others. 301 | 302 | ### Exercise 9 303 | 304 | To predicted on the fitted workflow, we will be using `predict()`. Within the function, add the parameter `lm_fit`. Note that this will throw an error which will be fixed soon. 305 | 306 | ```{r workflow-basics-9, exercise = TRUE} 307 | 308 | ``` 309 | 310 | ```{r workflow-basics-9-hint-1, eval = FALSE} 311 | predict(...) 312 | ``` 313 | 314 | ### 315 | 316 | The `predict()` function requires the "newdata" argument to make predictions on new data. This argument specifies the data frame containing the predictor variables for which you want to make predictions. 317 | 318 | ### Exercise 10 319 | 320 | Copy the previous code and add the parameter `ames_test` as the new_data argument to make predictions on the data. 321 | 322 | ```{r workflow-basics-10, exercise = TRUE} 323 | 324 | ``` 325 | 326 | 327 | 328 | ```{r workflow-basics-10-hint-1, eval = FALSE} 329 | predict(lm_fit, ...) 330 | ``` 331 | 332 | ```{r include = FALSE} 333 | predict(lm_fit, ames_train) 334 | ``` 335 | 336 | ### 337 | 338 | There are too many rows that are difficult to look at once. 
The `slice()` function lets us select a certain number of rows to be printed out. 339 | 340 | ### Exercise 11 341 | 342 | Copy the previous code and add `slice()` to the pipe. Add the parameter `1:3` to `slice()` and hit "Run Code". 343 | 344 | ```{r workflow-basics-11, exercise = TRUE} 345 | 346 | ``` 347 | 348 | 349 | 350 | ```{r workflow-basics-11-hint-1, eval = FALSE} 351 | ... |> 352 | slice(...) 353 | ``` 354 | 355 | ```{r include = FALSE} 356 | predict(lm_fit, ames_train) |> 357 | slice(1:3) 358 | ``` 359 | 360 | ### 361 | 362 | `update_formula()` is used to update model formulae. This typically involves adding or dropping terms, but updates can be more general. 363 | 364 | ### Exercise 12 365 | 366 | Pipe `update_formula()` to `lm_fit`. In `update_formula()`, we will change the formula to `Sale_Price ~ Longitude`. 367 | 368 | ```{r workflow-basics-12, exercise = TRUE} 369 | 370 | ``` 371 | 372 | ```{r workflow-basics-12-hint-1, eval = FALSE} 373 | lm_fit |> 374 | update_formula(Sale_Price ~ ...) 375 | ``` 376 | 377 | ```{r include = FALSE} 378 | lm_fit |> 379 | update_formula(Sale_Price ~ Longitude) 380 | ``` 381 | 382 | ### 383 | 384 | Great Job! You now understand the basics of workflow and the different functions that can help model the data. 385 | 386 | ## Adding Raw Variables to the `workflow()` 387 | ### 388 | 389 | There is another interface for passing data to the model, the `add_variables()` function, which uses a **dplyr**-like syntax for choosing variables. The function has two primary arguments: *outcomes* and *predictors*. These use a selection approach similar to the **tidyselect** backend of **tidyverse** packages to capture multiple selectors using `c()`. 390 | 391 | ### Exercise 1 392 | 393 | We will not be needing a formula anymore, as we will only need outcomes and predictors. Pipe `remove_formula()` to `lm_wflow`. 
394 | 395 | ```{r adding-raw-variables-1, exercise = TRUE} 396 | 397 | ``` 398 | 399 | ```{r adding-raw-variables-1-hint-1, eval = FALSE} 400 | lm_wflow |> 401 | ... 402 | ``` 403 | 404 | ```{r include = FALSE} 405 | lm_wflow |> 406 | remove_formula() 407 | ``` 408 | 409 | ### 410 | 411 | You can see under the preprocessor tab, there is no formula anymore. There now needs to be outcomes and predictors. 412 | 413 | ### Exercise 2 414 | 415 | We will use the `add_variables()` function to add the outcome first. Copy the previous code and add `add_variables()` to the pipe, setting `outcome = Sale_Price`. This will throw an error. 416 | 417 | ```{r adding-raw-variables-2, exercise = TRUE} 418 | 419 | ``` 420 | 421 | 422 | 423 | ```{r adding-raw-variables-2-hint-1, eval = FALSE} 424 | ... |> 425 | add_variables(outcome = ...) 426 | ``` 427 | 428 | ```{r include = FALSE} 429 | #lm_wflow |> 430 | #remove_formula() |> 431 | #add_variables(outcome = Sale_Price) 432 | ``` 433 | 434 | Note that there is no predictors parameter established yet. 435 | 436 | ### Exercise 3 437 | 438 | Now we will add the predictor variable to the preprocessor. Copy the previous code and in `add_variables()`, add `predictors` and set it equal to `Longitude` and `Latitude` using `c()`. 439 | 440 | ```{r adding-raw-variables-3, exercise = TRUE} 441 | 442 | ``` 443 | 444 | 445 | 446 | ```{r adding-raw-variables-3-hint-1, eval = FALSE} 447 | ... |> 448 | add_variables(outcome = Sale_Price, predictors = c(..., ...)) 449 | ``` 450 | 451 | ```{r include = FALSE} 452 | lm_wflow |> 453 | remove_formula() |> 454 | add_variables(outcome = Sale_Price, predictors = c(Longitude, Latitude)) 455 | ``` 456 | 457 | ### 458 | 459 | `add_variables()` attaches outcomes and predictors to a workflow using bare column names, while `add_formula()` and `add_recipe()` offer alternative preprocessor interfaces. These are convenient functions for declaring how the data enters the model without writing a formula. 
Furthermore, they allow easy integration into a pipe-workflow. 460 | 461 | 462 | ### Exercise 4 463 | 464 | 465 | Finally, copy the previous code and set the expression equal to `lm_wflow` using the `<-`. On the next line, type in `lm_wflow` to see the workflow. 466 | 467 | ```{r adding-raw-variables-4, exercise = TRUE} 468 | 469 | ``` 470 | 471 | 472 | 473 | ```{r adding-raw-variables-4-hint-1, eval = FALSE} 474 | ... <- lm_wflow |> 475 | remove_formula() |> 476 | add_variables( 477 | outcome = Sale_Price, 478 | predictors = c(Longitude, ...)) 479 | ``` 480 | 481 | ```{r include = FALSE} 482 | lm_wflow <- 483 | lm_wflow |> 484 | remove_formula() |> 485 | add_variables(outcome = Sale_Price, 486 | predictors = c(Longitude, Latitude) 487 | ) 488 | ``` 489 | 490 | ### 491 | 492 | If you would like the underlying modeling method to do what it would normally do with the data, `add_variables()` can be a helpful interface. 493 | 494 | ### Exercise 5 495 | 496 | Now we can create the model using `fit()`. Within `fit()`, add the parameters `lm_wflow` and `ames_train`. 497 | 498 | ```{r adding-raw-variables-5, exercise = TRUE} 499 | 500 | ``` 501 | 502 | ```{r adding-raw-variables-5-hint-1, eval = FALSE} 503 | fit(lm_wflow, ...) 504 | ``` 505 | 506 | ```{r include = FALSE} 507 | fit(lm_wflow, ames_train) 508 | ``` 509 | 510 | ### 511 | 512 | Models such as **glmnet** and **xgboost** expect the user to make indicator variables from factor predictors. In these cases, a recipe or formula interface will typically be a better choice. 513 | 514 | Great Job! You now know how to add raw variables such as outcome predictors to the workflow. In the next chapter, we will look at a more powerful preprocessor (called a recipe) that can also be added to a workflow. 515 | 516 | ## How Does a `workflow()` Use the Formula? 517 | ### 518 | 519 | When we fit a tree to the data, the **parsnip** package understands what the modeling function would do. 
For example, if a random forest model is fit using the ranger or **randomForest** packages, the workflow knows predictors columns that are factors should be left as is. 520 | 521 | ### Exercise 1 522 | 523 | A number of multilevel models have standardized on a formula specification devised in the lme4 package. For example, to fit a regression model that has random effects for subjects, we would use the following formula: 524 | 525 | ```{r how-does-a-workflow--1, exercise = TRUE} 526 | library(lme4) 527 | lmer(formula = distance ~ Sex + (age | Subject), data = Orthodont) 528 | ``` 529 | 530 | ### 531 | 532 | The effect of this is that each subject will have an estimated intercept and slope parameter for age. The problem, however, is that standard R methods can’t properly process this formula. 533 | 534 | ### Exercise 2 535 | 536 | We can try to process this formula with `model.matrix()`. Copy the previous code and replace `lmer()` with `model.matrix()`. Hit "Run Code". 537 | 538 | ```{r how-does-a-workflow--2, exercise = TRUE} 539 | 540 | ``` 541 | 542 | 543 | 544 | ```{r how-does-a-workflow--2-hint-1, eval = FALSE} 545 | ...(distance ~ Sex + (age | Subject), data = Orthodont) 546 | ``` 547 | 548 | ```{r include = FALSE} 549 | model.matrix(distance ~ Sex + (age | Subject), data = Orthodont) 550 | ``` 551 | 552 | Even if this formula could be used with `model.matrix()`, this would still present a problem since the formula also specifies the statistical attributes of the model. 553 | 554 | ### Exercise 3 555 | 556 | The solution in workflows is an optional supplementary model formula that can be passed to `add_model()`. The `add_variables()` specification provides the bare column names, and then the actual formula given to the model is set within `add_model()`. 557 | 558 | First, load the library **multilevelmod** using `library()`. 
559 | 560 | ```{r how-does-a-workflow--3, exercise = TRUE} 561 | 562 | ``` 563 | 564 | ```{r how-does-a-workflow--3-hint-1, eval = FALSE} 565 | library(multilevelmod) 566 | ``` 567 | 568 | ### 569 | 570 | The **multilevelmod** package enables the use of multilevel models (a.k.a. mixed-effects models, Bayesian hierarchical models, etc.) with the **parsnip** package. 571 | 572 | ### Exercise 4 573 | 574 | We need to specify that we will be using a linear regression model. Pipe `set_engine("lmer")` to `linear_reg()`. Set this expression equal to `multilevel_spec` using `<-`. 575 | 576 | ```{r how-does-a-workflow--4, exercise = TRUE} 577 | 578 | ``` 579 | 580 | ```{r how-does-a-workflow--4-hint-1, eval = FALSE} 581 | multilevel_spec <- 582 | linear_reg() |> 583 | set_engine("...") 584 | ``` 585 | 586 | ```{r include = FALSE} 587 | multilevel_spec <- 588 | linear_reg() |> 589 | set_engine("lmer") 590 | ``` 591 | 592 | ### 593 | 594 | The `set_engine()` function is used to specify the computational "engine" or backend for fitting a model. It allows you to choose a specific modeling library or package to be used for model training and prediction. 595 | 596 | ### Exercise 5 597 | 598 | We now need a workflow to model the data. Type `workflow()` to create a workflow. Hit "Run Code". 599 | 600 | ```{r how-does-a-workflow--5, exercise = TRUE} 601 | 602 | ``` 603 | 604 | ```{r how-does-a-workflow--5-hint-1, eval = FALSE} 605 | workflow() 606 | ``` 607 | 608 | ### 609 | 610 | The `workflow()` function allows you to build a complete modeling pipeline by combining various modeling and preprocessing steps, making it easier to manage and reproduce complex analyses. 611 | 612 | ### Exercise 6 613 | 614 | Now we need to add the raw variables to the model, which are the outcomes and predictors. Copy the previous code and pipe `add_variables()`. This will throw an error because we should always supply two parameters: `outcomes` and `predictors`. 
615 | 616 | ```{r how-does-a-workflow--6, exercise = TRUE} 617 | 618 | ``` 619 | 620 | 621 | 622 | ```{r how-does-a-workflow--6-hint-1, eval = FALSE} 623 | ... |> 624 | add_variables() 625 | ``` 626 | 627 | ```{r include = FALSE} 628 | #workflow() |> 629 | #add_variables() 630 | ``` 631 | 632 | ### 633 | 634 | For predictive models, it is advisable to evaluate a variety of different model types. This requires the user to create multiple model specifications. 635 | 636 | ### Exercise 7 637 | 638 | Copy the previous code and add the parameter `outcome`, setting it equal to `distance`. This also will not work because the `predictors` parameter needs to be added 639 | 640 | ```{r how-does-a-workflow--7, exercise = TRUE} 641 | 642 | ``` 643 | 644 | 645 | 646 | ```{r how-does-a-workflow--7-hint-1, eval = FALSE} 647 | ... |> 648 | add_variables(outcome = ...) 649 | ``` 650 | 651 | ```{r include = FALSE} 652 | #workflow() |> 653 | #add_variables(outcome = distance) 654 | ``` 655 | 656 | ### 657 | 658 | Sequential testing of models typically starts with an expanded set of predictors. This “full model” is compared to a sequence of the same model that removes each predictor in turn. Using basic hypothesis testing methods or empirical validation, the effect of each predictor can be isolated and assessed. 659 | 660 | ### Exercise 8 661 | 662 | Copy the previous code and the `predictors` parameters. Set the parameter equal to `Sex, age, Subject` using the vector function `c()`. 663 | 664 | ```{r how-does-a-workflow--8, exercise = TRUE} 665 | 666 | ``` 667 | 668 | 669 | 670 | ```{r how-does-a-workflow--8-hint-1, eval = FALSE} 671 | ... 
|> 672 | add_variables( 673 | outcome = distance, 674 | predictors = ...(Sex, age, Subject)) 675 | ``` 676 | 677 | ```{r include = FALSE} 678 | workflow() |> 679 | add_variables( 680 | outcome = distance, 681 | predictors = c(Sex, age, Subject)) 682 | ``` 683 | 684 | ### 685 | 686 | In regression analysis, the outcome variable is the variable we aim to model as a function of one or more predictor variables. It represents the target or dependent variable that we want to predict or explain. 687 | 688 | ### Exercise 9 689 | 690 | Finally we need to add the model. Copy the previous code and pipe `add_model()`. This will throw and error becasue a `spec` or model specification was not specified. 691 | 692 | ```{r how-does-a-workflow--9, exercise = TRUE} 693 | 694 | ``` 695 | 696 | 697 | 698 | ```{r how-does-a-workflow--9-hint-1, eval = FALSE} 699 | ... |> 700 | add_model() 701 | ``` 702 | 703 | ```{r include = FALSE} 704 | # workflow() |> 705 | # add_variables( 706 | # outcome = distance, 707 | # predictors = c(Sex, age, Subject)) |> 708 | # add_model() 709 | ``` 710 | 711 | ### 712 | 713 | The `add_model()` function allows you to add a modeling specification to your workflow. It specifies the type of model you want to use for the analysis, such as linear regression, random forest, support vector machine, etc. 714 | 715 | ### Exercise 10 716 | 717 | Copy the previous code and add the specification we made earlier, `multilevel_spec` as the first parameter. 718 | 719 | ```{r how-does-a-workflow--10, exercise = TRUE} 720 | 721 | ``` 722 | 723 | 724 | 725 | ```{r how-does-a-workflow--10-hint-1, eval = FALSE} 726 | ... 
|> 727 | add_model(multilevel_spec) 728 | ``` 729 | 730 | ```{r include = FALSE} 731 | workflow() |> 732 | add_variables( 733 | outcome = distance, 734 | predictors = c(Sex, age, Subject)) |> 735 | add_model(multilevel_spec) 736 | ``` 737 | 738 | ### 739 | 740 | The `formula` parameter allows you to specify the formula that defines the relationship between the outcome variable (response variable) and the predictor variables in the model. 741 | 742 | ### Exercise 11 743 | 744 | Copy the previous code and add the parameter `formula` to `add_model()`. Set `formula` equal to the formula seen in exercise 1, `distance ~ Sex + (age | Subject)`. 745 | 746 | ```{r how-does-a-workflow--11, exercise = TRUE} 747 | 748 | ``` 749 | 750 | 751 | 752 | ```{r how-does-a-workflow--11-hint-1, eval = FALSE} 753 | ... |> 754 | add_model( 755 | multilevel_spec, 756 | formula = ... 757 | ) 758 | ``` 759 | 760 | ```{r include = FALSE} 761 | workflow() |> 762 | add_variables( 763 | outcome = distance, 764 | predictors = c(Sex, age, Subject)) |> 765 | add_model( 766 | multilevel_spec, 767 | formula = distance ~ Sex + (age | Subject) 768 | ) 769 | ``` 770 | 771 | ### Exercise 12 772 | 773 | Copy the previous code and set it equal to `multilevel_workflow` using `<-`. 774 | 775 | ```{r how-does-a-workflow--12, exercise = TRUE} 776 | 777 | ``` 778 | 779 | 780 | 781 | ```{r how-does-a-workflow--12-hint-1, eval = FALSE} 782 | multilevel_workflow <- ... 783 | ``` 784 | 785 | ```{r include = FALSE} 786 | multilevel_workflow <- 787 | workflow() |> 788 | add_variables(outcome = distance, 789 | predictors = c(Sex, age, Subject)) |> 790 | add_model(multilevel_spec, 791 | formula = distance ~ Sex + (age | Subject) 792 | ) 793 | ``` 794 | 795 | ### 796 | 797 | Since the preprocessing is model dependent, workflows attempts to emulate what the underlying model would do whenever possible. If it is not possible, the formula processing should not do anything to the columns used in the formula. 
798 | 799 | ### Exercise 13 800 | 801 | Now we need to fit the model specified using a model specification object using `fit()`. Type in `fit()` and add the parameter `multilevel_workflow`. Error will appear but we will fix it later. 802 | 803 | ```{r how-does-a-workflow--13, exercise = TRUE} 804 | 805 | ``` 806 | 807 | ```{r how-does-a-workflow--13-hint-1, eval = FALSE} 808 | fit(...) 809 | ``` 810 | 811 | ```{r include = FALSE} 812 | #fit(multilevel_workflow) 813 | ``` 814 | 815 | ### 816 | 817 | Preprocessing is a crucial step in the data analysis workflow because it helps address various issues and challenges associated with real-world data. 818 | 819 | ### Exercise 14 820 | 821 | Copy the previous code and add the `data` parameter, setting it equal `Orthodont`. Set this expression equal to `multilevel_fit` and print it out on the next line. 822 | 823 | ```{r how-does-a-workflow--14, exercise = TRUE} 824 | 825 | ``` 826 | 827 | 828 | 829 | ```{r how-does-a-workflow--14-hint-1, eval = FALSE} 830 | multilevel_fit <- fit(multilevel_workflow, data = ...) 831 | ``` 832 | 833 | ```{r include = FALSE} 834 | fit(multilevel_workflow, data = Orthodont) 835 | ``` 836 | 837 | ### 838 | 839 | `strata()` is a special function used in the context of the Cox survival model. It identifies stratification variables when they appear on the right hand side of a formula. 840 | 841 | ### Exercise 15 842 | 843 | 844 | 845 | We can even use the previously mentioned `strata()` function from the survival package for survival analysis. Run the following code. 
846 | 847 | ```{r how-does-a-workflow--15, exercise = TRUE} 848 | library(censored) 849 | 850 | parametric_spec <- survival_reg() 851 | 852 | parametric_workflow <- 853 | workflow() %>% 854 | add_variables(outcome = c(fustat, futime), predictors = c(age, rx)) %>% 855 | add_model(parametric_spec, 856 | formula = Surv(futime, fustat) ~ age + strata(rx)) 857 | 858 | parametric_fit <- fit(parametric_workflow, data = ovarian) 859 | parametric_fit 860 | ``` 861 | 862 | ### 863 | 864 | Great Job! You now know how a workflow uses different sorts of formulas from a data set. 865 | 866 | ## Creating Multiple Workflows at Once 867 | ### 868 | 869 | In some situations, the data require numerous attempts to find an appropriate model. In these situations, as well as others, it can become tedious or onerous to create a lot of workflows from different sets of preprocessors and/or model specifications. To address this problem, the **workflowset** package creates combinations of workflow components. A list of preprocessors (e.g., formulas, **dplyr** selectors, or feature engineering recipe objects discussed in the next chapter) can be combined with a list of model specifications, resulting in a set of workflows. 870 | 871 | ### Exercise 1 872 | 873 | Let’s say that we want to focus on the different ways that house location is represented in the Ames data. We can create a set of formulas that capture these predictors. Hit "Run Code". 874 | 875 | ```{r creating-multiple-wo-1, exercise = TRUE} 876 | location <- list( 877 | longitude = Sale_Price ~ Longitude, 878 | latitude = Sale_Price ~ Latitude, 879 | coords = Sale_Price ~ Longitude + Latitude, 880 | neighborhood = Sale_Price ~ Neighborhood 881 | ) 882 | ``` 883 | 884 | ### 885 | 886 | In R, list() is a built-in function used to create a list, which is a versatile data structure that can hold elements of different types, such as vectors, matrices, data frames, and even other lists. 
Lists allow you to organize and store multiple objects together in a single container. 887 | 888 | ### Exercise 2 889 | 890 | 891 | Load the library **workflowsets** using `library()`. 892 | 893 | ```{r creating-multiple-wo-2, exercise = TRUE} 894 | 895 | ``` 896 | 897 | ```{r creating-multiple-wo-2-hint-1, eval = FALSE} 898 | library(...) 899 | ``` 900 | 901 | ```{r include = FALSE} 902 | library(workflowsets) 903 | ``` 904 | 905 | ### 906 | 907 | The goal of **workflowsets** is to allow users to create and easily fit a large number of models. **workflowsets** can create a *workflow set* that holds multiple workflow objects. These objects can be created by crossing all combinations of preprocessors (e.g., formula, recipe, etc) and model specifications. This set can be tuned or resampled using a set of specific functions. 908 | 909 | ### Exercise 3 910 | 911 | Create a workflow set by using the method `workflow_set()`. Add the parameter `preproc` and set it equal to the `location` list created earlier. 912 | 913 | ```{r creating-multiple-wo-3, exercise = TRUE} 914 | 915 | ``` 916 | 917 | ```{r creating-multiple-wo-3-hint-1, eval = FALSE} 918 | workflow_set(preproc = ...) 919 | ``` 920 | 921 | ```{r include = FALSE} 922 | #workflow_set(preproc = location) 923 | ``` 924 | 925 | This throws an error because no model is specified for the set. 926 | 927 | ### Exercise 4 928 | 929 | We will use a linear model `lm_model` across the list of locations. Copy the previous code and add the parameter `models`, setting it equal to `list(lm = lm_model)`. Set this expression equal to `location_models` using `<-`. 930 | 931 | ```{r creating-multiple-wo-4, exercise = TRUE} 932 | 933 | ``` 934 | 935 | 936 | 937 | ```{r creating-multiple-wo-4-hint-1, eval = FALSE} 938 | ... 
<- workflow_set(preproc = location, models = list(lm = ...)) 939 | ``` 940 | 941 | ```{r include = FALSE} 942 | workflow_set(preproc = location, models = list(lm = lm_model)) 943 | ``` 944 | 945 | ### 946 | 947 | In R, lm() stands for "linear model," and it is a built-in function used to fit linear regression models. Linear regression is a statistical method used to model the relationship between a dependent variable (response variable) and one or more independent variables (predictors) as a linear equation. 948 | 949 | ### Exercise 5 950 | 951 | Lets take a look at some of the info in the `location_models`. Extract the first column from the info section in `location_models` by using the `$` and `info[[1]]`. 952 | 953 | ```{r creating-multiple-wo-5, exercise = TRUE} 954 | 955 | ``` 956 | 957 | ```{r creating-multiple-wo-5-hint-1, eval = FALSE} 958 | location_models$info[...] 959 | ``` 960 | 961 | ```{r include = FALSE} 962 | location_models$info[1] 963 | ``` 964 | 965 | ### 966 | 967 | You can see that this produces a summary of what type of preprocessor (preproc) and model is being used in the `location_models` workflow. `extract_workflow()` returns the workflow object. The workflow will not have been estimated. 968 | 969 | 970 | ### Exercise 6 971 | 972 | To extract the workflow of the model, we will use `extract_workflow()`. Within `extract_workflow()`, add the parameters, `location_models` and `id = "coords_lm"`. 973 | 974 | ```{r creating-multiple-wo-6, exercise = TRUE} 975 | 976 | ``` 977 | 978 | ```{r creating-multiple-wo-6-hint-1, eval = FALSE} 979 | extract_workflow(location_models, id = "...") 980 | 981 | ``` 982 | 983 | ```{r include = FALSE} 984 | extract_workflow(location_models, id = "coords_lm") 985 | ``` 986 | 987 | ### 988 | 989 | Workflow sets are mostly designed to work with resampling. The columns `option` and `result` must be populated with specific types of objects that result from resampling. 
990 | 991 | ### Exercise 7 992 | 993 | Let’s create model fits for each formula and save them in a new column called fit. We’ll use basic **dplyr** and **purrr** operations. Hit "Run Code". 994 | 995 | ```{r creating-multiple-wo-7, exercise = TRUE} 996 | location_models <- 997 | location_models %>% 998 | mutate(fit = map(info, ~ fit(.x$workflow[[1]], ames_train))) 999 | 1000 | location_models 1001 | ``` 1002 | 1003 | ### 1004 | 1005 | As you can see, we have tibbles of information of 4 different workflows that we have created. 1006 | 1007 | ### Exercise 8 1008 | 1009 | Lets extract the fit model from `location_models` by using `$fit[[1]]`. 1010 | 1011 | ```{r creating-multiple-wo-8, exercise = TRUE} 1012 | 1013 | ``` 1014 | 1015 | ```{r creating-multiple-wo-8-hint-1, eval = FALSE} 1016 | location_models$fit[[...]] 1017 | ``` 1018 | 1019 | ```{r include = FALSE} 1020 | location_models$fit[[1]] 1021 | ``` 1022 | 1023 | ### 1024 | 1025 | We use a **purrr** function here to map through our models, but there is an easier, better approach to fit workflow sets that will be introduced in later tutorials. 1026 | 1027 | ### 1028 | 1029 | Great Job! You now know how to create multiple workflows and put them in a workflow set. You also know how to extract these sets and analyze them based on the model of the chosen workflow set. 1030 | 1031 | ## Evaluatin the Test Set 1032 | ### 1033 | 1034 | Let’s say that we’ve concluded our model development and have settled on a final model. There is a convenience function called `last_fit()` that will fit the model to the entire training set and evaluate it with the testing set. 1035 | 1036 | ### Exercise 1 1037 | 1038 | Enter `last_fit()` and add the parameter `lm_wflow`. Hit "Run Code." (Note: This will throw an error.) 1039 | 1040 | ```{r evaluatin-the-test-s-1, exercise = TRUE} 1041 | 1042 | ``` 1043 | 1044 | ```{r evaluatin-the-test-s-1-hint, eval = FALSE} 1045 | last_fit(...) 
1046 | ``` 1047 | 1048 | ```{r, include = FALSE} 1049 | #last_fit(lm_wflow) 1050 | ``` 1051 | 1052 | ### 1053 | 1054 | The `last_fit()` function is used to fit a model on the last split of a resampled data set, typically obtained through cross-validation or bootstrapping. It is useful when you want to use the final model trained on the entire training dataset for making predictions on new, unseen data. 1055 | 1056 | ### Exercise 2 1057 | 1058 | We always need to a have split for `last_fit()`. Add the parameter `ames_split` to the function and set the whole expression to `final_lm_res`. Print `final_lm_res` on the next line to see the output. 1059 | 1060 | ```{r evaluatin-the-test-s-2, exercise = TRUE} 1061 | 1062 | ``` 1063 | 1064 | 1065 | 1066 | ```{r evaluatin-the-test-s-2-hint, eval = FALSE} 1067 | final_lm_res <- last_fit(lm_wflow, ...) 1068 | ``` 1069 | 1070 | ```{r, include = FALSE} 1071 | final_lm_res <- last_fit(lm_wflow, ames_split) 1072 | ``` 1073 | 1074 | ### 1075 | 1076 | The .workflow column contains the fitted workflow and can be pulled out of the results using `extract_workflow()`. 1077 | 1078 | ### Exercise 3 1079 | 1080 | Use `extract_workflow()` and add the parameter `final_lm_res`. Hit "Run Code". 1081 | 1082 | ```{r evaluatin-the-test-s-3, exercise = TRUE} 1083 | 1084 | ``` 1085 | 1086 | ```{r evaluatin-the-test-s-3-hint, eval = FALSE} 1087 | extract_workflow(...) 1088 | ``` 1089 | 1090 | ```{r, include = FALSE} 1091 | extract_workflow(final_lm_res) 1092 | ``` 1093 | 1094 | ### 1095 | 1096 | `collect_metrics()` and `collect_predictions()` provide access to the performance metrics and predictions, respectively. The `collect_metrics()` function is a lovely way to extract model performance metrics with resampling. `collect_predictions()` can summarize the various results over replicate out-of-sample predictions. 
1097 | 1098 | ### Exercise 4 1099 | 1100 | Run `collect_metrics()` and `collect_predictions()`, on separate lines, with the parameter being `final_lm_res`. Set the expressions equal to `c_mtrcs` and `c_predic`, respectively. Print these two functions on the next two consecutive lines. 1101 | 1102 | ```{r evaluatin-the-test-s-4, exercise = TRUE} 1103 | 1104 | ``` 1105 | 1106 | 1107 | 1108 | ```{r evaluatin-the-test-s-4-hint, eval = FALSE} 1109 | c_mtrcs <- collect_metrics(...) 1110 | c_predic <- collect_predictions(...) 1111 | ``` 1112 | 1113 | ```{r, include = FALSE} 1114 | c_mtrcs <- collect_metrics(final_lm_res) 1115 | c_predic <- collect_predictions(final_lm_res) 1116 | ``` 1117 | 1118 | ### 1119 | 1120 | Statistical metrics are used to describe the distribution of data, compare groups, assess relationships between variables, and draw conclusions from data.The model takes the predictor variables from the test data and generates predictions for the outcome variable. For example, in linear regression, the model estimates the response variable based on the values of the predictor variables. 1121 | 1122 | ### Exercise 5 1123 | 1124 | Finally, lets `slice()` the predictions output, as it is too many unnecessary rows that we need to analyze at once. Copy the previous code and slice the first 5 rows by adding the parameter `1:5` to `slice()`. Print them out on the next lines. 1125 | 1126 | ```{r evaluatin-the-test-s-5, exercise = TRUE} 1127 | 1128 | ``` 1129 | 1130 | 1131 | 1132 | ```{r evaluatin-the-test-s-5-hint, eval = FALSE} 1133 | c_predic <- 1134 | collect_predictions(final_lm_res) |> 1135 | slice(...) 1136 | ``` 1137 | 1138 | ```{r, include = FALSE} 1139 | c_predic <- 1140 | collect_predictions(final_lm_res) |> 1141 | slice(1:5) 1142 | ``` 1143 | 1144 | ### 1145 | 1146 | Great Job! You now know how to evaluate a testing set by using `last_fit()` and statistical metrics and predictions using the `collect_metrics()` and `collect_predictions()`. 
1147 | 1148 | 1149 | ## Summary 1150 | ### 1151 | 1152 | This tutorial covers [Chapter 7: A Model Workflow](https://www.tmwr.org/workflows.html) from [*Tidy Modeling with R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. In the previous chapter, we discussed the [**parsnip**](https://parsnip.tidymodels.org/) package, which can be used to define and fit the model. This chapter introduced a new concept called a model workflow. The purpose of this concept (and the corresponding **tidymodels** `workflow()` object) encapsulated the major pieces of the modeling process. 1153 | 1154 | 1155 | ```{r download-answers, child = system.file("child_documents/download_answers.Rmd", package = "tutorial.helpers")} 1156 | ``` 1157 | -------------------------------------------------------------------------------- /inst/tutorials/09-judging-model-effectiveness/tutorial.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Judging Model Effectiveness 3 | author: Pratham Kancherla and David Kane 4 | tutorial: 5 | id: judging-model-effectiveness 6 | output: 7 | learnr::tutorial: 8 | progressive: yes 9 | allow_skip: yes 10 | runtime: shiny_prerendered 11 | description: 'Tutorial for Chapter 9: Judging Model Effectiveness' 12 | --- 13 | 14 | ```{r setup, include = FALSE} 15 | library(learnr) 16 | library(tutorial.helpers) 17 | library(tidyverse) 18 | library(tidymodels) 19 | 20 | tidymodels_prefer() 21 | 22 | knitr::opts_chunk$set(echo = FALSE) 23 | options(tutorial.exercise.timelimit = 60, 24 | tutorial.storage = "local") 25 | 26 | data(ames) 27 | ames <- mutate(ames, Sale_Price = log10(Sale_Price)) 28 | 29 | ames_split <- initial_split(ames, prop = 0.80, strata = Sale_Price) 30 | ames_train <- training(ames_split) 31 | ames_test <- testing(ames_split) 32 | 33 | ames_rec <- 34 | recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 35 | Latitude + Longitude, data = ames_train) %>% 36 | step_log(Gr_Liv_Area, base 
= 10) %>% 37 | step_other(Neighborhood, threshold = 0.01) %>% 38 | step_dummy(all_nominal_predictors()) %>% 39 | step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) %>% 40 | step_ns(Latitude, Longitude, deg_free = 20) 41 | 42 | lm_model <- 43 | linear_reg() |> 44 | set_engine("lm") 45 | 46 | lm_wflow <- 47 | workflow() |> 48 | add_model(lm_model) |> 49 | add_recipe(ames_rec) 50 | 51 | lm_fit <- fit(lm_wflow, ames_train) 52 | 53 | ames_test_res <- predict(lm_fit, new_data = ames_test %>% select(-Sale_Price)) 54 | 55 | ames_test_res <- bind_cols(ames_test_res, ames_test %>% select(Sale_Price)) 56 | 57 | 58 | ames_metrics <- metric_set(rmse, rsq, mae) 59 | 60 | classification_metrics <- metric_set(accuracy, mcc, f_meas) 61 | 62 | two_class_curve <- roc_curve(two_class_example, truth, Class1) 63 | 64 | ``` 65 | 66 | ```{r copy-code-chunk, child = system.file("child_documents/copy_button.Rmd", package = "tutorial.helpers")} 67 | ``` 68 | 69 | ```{r info-section, child = system.file("child_documents/info_section.Rmd", package = "tutorial.helpers")} 70 | ``` 71 | 72 | ## Introduction 73 | ### 74 | 75 | This tutorial covers [Chapter 9: Judging Model Effectiveness](https://www.tmwr.org/compare.html#workflow-set) from [*Tidy Modeling with R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. This tutorial will demonstrate the **yardstick** package, a core **tidymodels** packages with the focus of measuring model performance. Before illustrating syntax, let’s explore whether empirical validation using performance metrics is worthwhile when a model is focused on inference rather than prediction. 76 | 77 | 78 | ## Regression Metrics 79 | ### 80 | 81 | **tidymodels** prediction functions produce tibbles with columns for the predicted values. These columns have consistent names, and the functions in the **yardstick** package that produce performance metrics have consistent interfaces. 82 | 83 | ### Exercise 1 84 | 85 | Load the library **tidyverse** using `library()`. 
86 | 87 | ```{r regression-metrics-1, exercise = TRUE} 88 | 89 | ``` 90 | 91 | ```{r regression-metrics-1-hint-1, eval = FALSE} 92 | library(...) 93 | ``` 94 | 95 | ```{r include = FALSE} 96 | library(tidyverse) 97 | ``` 98 | 99 | ### 100 | 101 | Two common metrics for regression models are the root mean squared error (RMSE) and the coefficient of determination (a.k.a. R2). The former measures accuracy while the latter measures correlation. These are not necessarily the same thing. 102 | 103 | ### Exercise 2 104 | 105 | Now lets create the prediction model using `predict()`. Add the parameter`lm_fit` to `predict()` and the data object, `ames_test` (from the previous tutorials), hit "Run Code". 106 | 107 | ```{r regression-metrics-2, exercise = TRUE} 108 | 109 | ``` 110 | 111 | ```{r regression-metrics-2-hint-1, eval = FALSE} 112 | predict(lm_fit, new_data = ...) 113 | ``` 114 | 115 | ```{r include = FALSE} 116 | predict(lm_fit, new_data = ames_test) 117 | ``` 118 | 119 | ### 120 | 121 | Now we need to add `-Sale_Price` to ignore that column in the data set `ames_test`. 122 | 123 | ### Exercise 3 124 | 125 | Copy the previous code and pipe `select(-Sale_Price)` to the code. Set this expression equal to `ames_test_res`. Hit "Run Code". 126 | 127 | ```{r regression-metrics-3, exercise = TRUE} 128 | 129 | ``` 130 | 131 | 132 | 133 | ```{r regression-metrics-3-hint-1, eval = FALSE} 134 | ames_test_res <- predict(lm_fit, new_data = ames_test |> select(-...)) 135 | ``` 136 | 137 | ```{r include = FALSE} 138 | ames_test_res <- predict(lm_fit, new_data = ames_test |> select(-Sale_Price)) 139 | ``` 140 | 141 | ### 142 | 143 | The `select()` function is part of the **dplyr** package in R, which is widely used for data manipulation tasks. The function allows you to choose or remove specific columns from a data frame or tibble, providing a flexible and straightforward way to work with data. 
144 | 145 | ### Exercise 4 146 | 147 | The predicted numeric outcome from the regression model is named .pred. Let’s match the predicted values with their corresponding observed outcome values using `bind_cols()`. Within the function, add the parameter `ames_test_res` and `ames_test` as the data argument. 148 | 149 | ```{r regression-metrics-4, exercise = TRUE} 150 | 151 | ``` 152 | 153 | ```{r regression-metrics-4-hint-1, eval = FALSE} 154 | bind_cols(ames_test_res, ...) 155 | ``` 156 | 157 | ```{r include = FALSE} 158 | bind_cols(ames_test_res, ames_test) 159 | ``` 160 | 161 | ### 162 | 163 | In R, `bind_cols()` is a function from the **dplyr** package used to combine data frames or tibbles by column-wise binding. It is commonly used to merge multiple data frames horizontally, adding new columns to the resulting data frame. 164 | 165 | ### Exercise 5 166 | 167 | We only want to compare the predicted values to the `Sale_Price` column, which is why we need to only look at that column from the `ames_test` data set. Copy the previous code and within the function, pipe `select(Sale_Price)` after `ames_test`. Set this expression to `ames_test_res`. 168 | 169 | ```{r regression-metrics-5, exercise = TRUE} 170 | 171 | ``` 172 | 173 | 174 | 175 | ```{r regression-metrics-5-hint-1, eval = FALSE} 176 | ames_test_res <- bind_cols(ames_test_res, ames_test |> ...(Sale_Price)) 177 | ``` 178 | 179 | ```{r include = FALSE} 180 | ames_test_res <- bind_cols(ames_test_res, ames_test |> select(Sale_Price)) 181 | ``` 182 | 183 | ### 184 | 185 | Note that both the predicted and observed outcomes are in log-10 units. It is best practice to analyze the predictions on the transformed scale (if one were used) even if the predictions are reported using the original units. 186 | 187 | ### Exercise 6 188 | 189 | Now let's graph the data. Pipe `ggplot()` to `ames_test_res` and hit "Run Code." 
190 | 191 | ```{r regression-metrics-6, exercise = TRUE} 192 | 193 | ``` 194 | 195 | ```{r regression-metrics-6-hint-1, eval = FALSE} 196 | ames_test_res |> 197 | ...() 198 | ``` 199 | 200 | ```{r include = FALSE} 201 | ames_test_res |> 202 | ggplot() 203 | ``` 204 | 205 | ### 206 | 207 | ### Exercise 7 208 | 209 | Copy the previous code and within `aes()`, set `x = Sale_Price` and `y = .pred`. Hit "Run Code". 210 | 211 | ```{r regression-metrics-7, exercise = TRUE} 212 | 213 | ``` 214 | 215 | 216 | 217 | ```{r regression-metrics-7-hint-1, eval = FALSE} 218 | ames_test_res |> 219 | ggplot(aes(x = ..., y = .pred)) 220 | ``` 221 | 222 | ```{r include = FALSE} 223 | ames_test_res |> 224 | ggplot(aes(x = Sale_Price, y = .pred)) 225 | ``` 226 | 227 | ### 228 | 229 | In R, `geom_abline()` is a function from the **ggplot2** package used to add reference lines to a plot created using the `ggplot()` function. These reference lines can be horizontal, vertical, or diagonal, and they are typically used to highlight specific relationships or patterns in the data. 230 | 231 | ### Exercise 8 232 | 233 | We want to add a regression line to the plot by using `geom_abline()`. Copy the previous code and add `geom_abline()`. Set the line type `lty` to `2` and hit "Run Code". 234 | 235 | ```{r regression-metrics-8, exercise = TRUE} 236 | 237 | ``` 238 | 239 | 240 | 241 | ```{r regression-metrics-8-hint-1, eval = FALSE} 242 | ... + 243 | geom_abline(lty = ...) 244 | ``` 245 | 246 | ```{r include = FALSE} 247 | ames_test_res |> 248 | ggplot(aes(x = Sale_Price, y = .pred)) + 249 | geom_abline(lty = 2) 250 | ``` 251 | 252 | ### 253 | 254 | `geom_point()` is a function in the R programming language that is part of the **ggplot2** package. It is used to create scatter plots in data visualization, where individual data points are represented as points on a Cartesian coordinate system. 255 | 256 | ### Exercise 9 257 | 258 | Copy the previous code and add `geom_point()`. Set `alpha = 0.5`. 
Hit "Run Code". 259 | 260 | ```{r regression-metrics-9, exercise = TRUE} 261 | 262 | ``` 263 | 264 | 265 | 266 | ```{r regression-metrics-9-hint-1, eval = FALSE} 267 | ... + 268 | geom_point(... = 0.5) 269 | ``` 270 | 271 | ```{r include = FALSE} 272 | ames_test_res |> 273 | ggplot(aes(x = Sale_Price, y = .pred)) + 274 | geom_abline(lty = 2) + 275 | geom_point(alpha = 0.5) 276 | ``` 277 | 278 | ### 279 | 280 | A model optimized for RMSE has more variability but has relatively uniform accuracy across the range of the outcome. 281 | 282 | ### Exercise 10 283 | 284 | Copy the previous code and add the correct labels to the graph. 285 | 286 | x: "Sale_Price (log10)" 287 | 288 | y: "Predicted Sale Price (log10)" 289 | 290 | ```{r regression-metrics-10, exercise = TRUE} 291 | 292 | ``` 293 | 294 | 295 | 296 | ```{r regression-metrics-10-hint-1, eval = FALSE} 297 | ... + 298 | labs(y = "...", x = "...") 299 | ``` 300 | 301 | ```{r include = FALSE} 302 | ames_test_res |> 303 | ggplot(aes(x = Sale_Price, y = .pred)) + 304 | geom_abline(lty = 2) + 305 | geom_point(alpha = 0.5) + 306 | labs(y = "Predicted Sale Price (log10)", x = "Sale Price (log10)") 307 | ``` 308 | 309 | ### 310 | 311 | For regression models, `coord_obs_pred()` can be used in a **ggplot** to make the x- and y-axes have the same exact scale along with an aspect ratio of one. 312 | 313 | ### Exercise 11 314 | 315 | Copy the previous code and add `coord_obs_pred()`. Also add `theme_classic()` to make the graph look more presentable. Hit "Run Code". 316 | 317 | ```{r regression-metrics-11, exercise = TRUE} 318 | 319 | ``` 320 | 321 | 322 | 323 | ```{r regression-metrics-11-hint-1, eval = FALSE} 324 | ... 
+ 325 | coord_obs_pred() + 326 | theme_classic() 327 | ``` 328 | 329 | ```{r include = FALSE} 330 | ames_test_res |> 331 | ggplot(aes(x = Sale_Price, y = .pred)) + 332 | geom_abline(lty = 2) + 333 | geom_point(alpha = 0.5) + 334 | labs(y = "Predicted Sale Price (log10)", x = "Sale Price (log10)") + 335 | coord_obs_pred() + 336 | theme_classic() 337 | ``` 338 | 339 | ### 340 | 341 | `rmse()` is the square root of the mean of the square of all of the error. RMSE is a good measure of accuracy, but only to compare prediction errors of different models or model configurations for a particular variable and not between variables, as it is scale-dependent. 342 | 343 | ### Exercise 12 344 | 345 | We will be using `rmse()` to compute the Root-Mean Square Error of `ames_test_res`. Pipe `ames_test_res` to `rmse()`. This might throw an error. 346 | 347 | ```{r regression-metrics-12, exercise = TRUE} 348 | 349 | ``` 350 | 351 | ```{r regression-metrics-12-hint-1, eval = FALSE} 352 | ames_test_res |> 353 | rmse() 354 | ``` 355 | 356 | ### Exercise 13 357 | 358 | Copy the previous code. We need to add the parameters `truth` and `estimate`. `truth` is the column with the observed outcome values and `estimate` is the column with the predicted values, therefore, set `truth = Sale_Price` and `estimate = .pred`. 359 | 360 | ```{r regression-metrics-13, exercise = TRUE} 361 | 362 | ``` 363 | 364 | 365 | 366 | ```{r regression-metrics-13-hint-1, eval = FALSE} 367 | ames_test_res |> 368 | rmse(truth = ..., ...= .pred) 369 | ``` 370 | 371 | ```{r include = FALSE} 372 | ames_test_res |> 373 | rmse(truth = Sale_Price, estimate = .pred) 374 | ``` 375 | 376 | ### 377 | 378 | `metric_set()` allows you to combine multiple metric functions together into a new function that calculates all of them at once. 379 | 380 | ### Exercise 14 381 | 382 | Let's create a metric set consisting of these functions: Root-Mean Square Error (rmse), R-Squared (rsq), and Mean Absolute Error (mae). Type `metric_set()` and add `rmse`, `rsq`, and `mae`. 
Set this expression equal to `ames_metrics` using the `<-` operator. 383 | 384 | ```{r regression-metrics-14, exercise = TRUE} 385 | 386 | ``` 387 | 388 | ```{r regression-metrics-14-hint-1, eval = FALSE} 389 | ames_metrics <- metric_set(..., ..., ...) 390 | ``` 391 | 392 | ```{r include = FALSE} 393 | ames_metrics <- metric_set(rmse, rsq, mae) 394 | ``` 395 | 396 | ### 397 | 398 | An inferential model is used primarily to understand relationships, and typically emphasizes the choice (and validity) of probabilistic distributions and other generative qualities that define the model. 399 | 400 | ### Exercise 15 401 | 402 | Now let's use the same parameters as seen in Exercise 14. Use the parameters in the function created in the previous exercise, `ames_metrics`. 403 | 404 | ```{r regression-metrics-15, exercise = TRUE} 405 | 406 | ``` 407 | 408 | 409 | 410 | ```{r regression-metrics-15-hint-1, eval = FALSE} 411 | ames_metrics(ames_test_res, truth = ..., estimate = ...) 412 | ``` 413 | 414 | ```{r include = FALSE} 415 | ames_metrics(ames_test_res, truth = Sale_Price, estimate = .pred) 416 | ``` 417 | 418 | ### 419 | 420 | The root mean squared error and mean absolute error metrics are both on the scale of the outcome (so `log10(Sale_Price)` for our example) and measure the difference between the predicted and observed values. The value for R2 measures the squared correlation between the predicted and observed values, so values closer to one are better. 421 | 422 | ### 423 | 424 | Great job! You now know how to calculate and analyze regression metrics. 425 | 426 | ## Binary Classification Metrics 427 | ### 428 | 429 | In binary classification, we are dealing with problems where the target variable has two classes or categories. Commonly, these classes are denoted as "positive" and "negative." 430 | 431 | ### Exercise 1 432 | 433 | The data set we will be looking at in this section is `two_class_example`. Type `tibble(two_class_example)` to get a sense of what the data looks like. 
434 | 435 | ```{r binay-classification-1, exercise = TRUE} 436 | 437 | ``` 438 | 439 | ```{r binay-classification-1-hint-1, eval = FALSE} 440 | tibble(...) 441 | ``` 442 | 443 | ```{r include = FALSE} 444 | tibble(two_class_example) 445 | ``` 446 | 447 | ### 448 | 449 | The second and third columns are the predicted class probabilities for the test set while predicted are the discrete predictions. 450 | 451 | ### Exercise 2 452 | 453 | A confusion matrix, `conf_mat()`, also known as an error matrix, is a table used to evaluate the performance of a classification model in machine learning. It summarizes the results of a binary classification task by comparing the predicted class labels to the actual class labels in the test data. 454 | 455 | Type `conf_mat()` and add the parameters, `two_class_example`, `truth`, and `predicted`. Hit "Run Code". 456 | 457 | ```{r binay-classification-2, exercise = TRUE} 458 | 459 | ``` 460 | 461 | ```{r binay-classification-2-hint-1, eval = FALSE} 462 | conf_mat(two_class_example, ..., predicted) 463 | ``` 464 | 465 | ```{r include = FALSE} 466 | conf_mat(two_class_example, truth, predicted) 467 | ``` 468 | 469 | ### 470 | 471 | Returns range of summary measures of the forecast accuracy. If *x* is provided, the function measures test set forecast accuracy based on *x-f*. If x is not provided, the function only produces training set accuracy measures of the forecasts based on *f["x"]-fitted(f)*. 472 | 473 | ### Exercise 3 474 | 475 | Copy the previous code and change `conf_mat()` to `accuracy()`. Hit "Run Code". 476 | 477 | ```{r binay-classification-3, exercise = TRUE} 478 | 479 | ``` 480 | 481 | 482 | 483 | ```{r binay-classification-3-hint-1, eval = FALSE} 484 | accuracy(two_class_example, truth, ...) 
485 | ``` 486 | 487 | ```{r include = FALSE} 488 | accuracy(two_class_example, truth, predicted) 489 | ``` 490 | 491 | ### 492 | 493 | The term "MCC" typically refers to the Matthews Correlation Coefficient, which is a metric commonly used to evaluate the performance of binary classification models. The Matthews Correlation Coefficient takes into account true positives, true negatives, false positives, and false negatives and provides a balanced metric even for imbalanced data sets. 494 | 495 | ### Exercise 4 496 | 497 | Copy the previous code and change `accuracy()` to `mcc()`. Hit "Run Code". 498 | 499 | ```{r binay-classification-4, exercise = TRUE} 500 | 501 | ``` 502 | 503 | 504 | 505 | ```{r binay-classification-4-hint-1, eval = FALSE} 506 | mcc(two_class_example, ..., predicted) 507 | ``` 508 | 509 | ```{r include = FALSE} 510 | mcc(two_class_example, truth, predicted) 511 | ``` 512 | 513 | ### 514 | 515 | The term "F-measure" (also known as F1-score) is a commonly used metric in binary classification to evaluate the performance of a model. The F-measure is the harmonic mean of precision and recall, providing a balanced metric that takes both false positives and false negatives into account. 516 | 517 | ### Exercise 5 518 | 519 | Copy the previous code and change the previous function name to `f_meas()`. Hit "Run code". 520 | 521 | ```{r binay-classification-5, exercise = TRUE} 522 | 523 | ``` 524 | 525 | 526 | 527 | ```{r binay-classification-5-hint-1, eval = FALSE} 528 | f_meas(two_class_example, ..., predicted) 529 | ``` 530 | 531 | ```{r include = FALSE} 532 | f_meas(two_class_example, truth, predicted) 533 | ``` 534 | 535 | ### 536 | 537 | The Matthews correlation coefficient and F1 score both summarize the confusion matrix, but compared to `mcc()`, which measures the quality of both positive and negative examples, the `f_meas()` metric emphasizes the positive class, i.e., the event of interest. 
538 | 539 | ### Exercise 6 540 | 541 | Now lets create a metric set of the functions `accuracy`, `mcc`, `f_meas`. Within `metric_set()`, add the parameters `accuracy, mcc, f_meas`. Set this expression equal to `classification_metrics` using the `<-` operator. 542 | 543 | ```{r binay-classification-6, exercise = TRUE} 544 | 545 | ``` 546 | 547 | ```{r binay-classification-6-hint-1, eval = FALSE} 548 | classification_metrics <- ...(accuracy, mcc, f_meas) 549 | ``` 550 | 551 | ```{r include = FALSE} 552 | classification_metrics <- metric_set(accuracy, mcc, f_meas) 553 | ``` 554 | 555 | ### 556 | 557 | There is some heterogeneity in R functions in this regard; some use the first level and others the second to denote the event of interest. We consider it more intuitive that the first level is the most important. 558 | 559 | ### Exercise 7 560 | 561 | Now lets call the method created in the previous exercise using the parameters from Exercise 5. Within `classification_metrics()`, set `truth = truth` and `estimate = predicted` and hit "Run Code". 562 | 563 | ```{r binay-classification-7, exercise = TRUE} 564 | 565 | ``` 566 | 567 | ```{r binay-classification-7-hint-1, eval = FALSE} 568 | classification_metrics(two_class_example, truth, ...) 569 | ``` 570 | 571 | ```{r include = FALSE} 572 | classification_metrics(two_class_example, truth = truth, estimate = predicted) 573 | ``` 574 | 575 | ### 576 | 577 | The second level logic is borne of encoding the outcome as 0/1 (in which case the second value is the event) and unfortunately remains in some packages. However, **tidymodels** (along with many other R packages) require a categorical outcome to be encoded as a factor and, for this reason, the legacy justification for the second level as the event becomes irrelevant. 578 | 579 | ### Exercise 8 580 | 581 | As an example where the second level is the event below. Hit "Run Code". 
582 | 583 | 584 | ```{r binay-classification-8, exercise = TRUE} 585 | f_meas(two_class_example, truth, predicted, event_level = "second") 586 | ``` 587 | 588 | ```{r include = FALSE} 589 | f_meas(two_class_example, truth, predicted, event_level = "second") 590 | ``` 591 | 592 | In this output, the .estimator value of “binary” indicates that the standard formula for binary classes will be used. 593 | 594 | ### Exercise 9 595 | 596 | The term "ROC curve" refers to the Receiver Operating Characteristic curve, which is a graphical representation of the performance of a binary classification model at various classification thresholds. We will use the `roc_curve()` to represent the performance of the binary classification model used in this section. 597 | 598 | Within `roc_curve()`, add the parameters `two_class_example`, `truth`, and `Class1`. Set this expression equal to `two_class_curve` and hit "Run Code". 599 | 600 | ```{r binay-classification-9, exercise = TRUE} 601 | 602 | ``` 603 | 604 | ```{r binay-classification-9-hint-1, eval = FALSE} 605 | two_class_curve <- roc_curve(..., truth, Class1) 606 | ``` 607 | 608 | ```{r include = FALSE} 609 | two_class_curve <- roc_curve(two_class_example, truth, Class1) 610 | ``` 611 | 612 | ### 613 | 614 | The ROC curve plots the True Positive Rate (TPR) against the False Positive Rate (FPR) for different threshold values, and it helps to visualize the trade-off between sensitivity and specificity. 615 | 616 | ### Exercise 10 617 | 618 | The term "ROC AUC" refers to the Area Under the Receiver Operating Characteristic Curve, which is a commonly used metric to evaluate the performance of binary classification models. 619 | 620 | Now lets call the function `roc_auc()` and add the same parameters as the parameters in `roc_curve()` from the previous exercise. 
621 | 622 | ```{r binay-classification-10, exercise = TRUE} 623 | 624 | ``` 625 | 626 | ```{r binay-classification-10-hint-1, eval = FALSE} 627 | roc_auc(two_class_example, truth, ...) 628 | ``` 629 | 630 | ```{r include = FALSE} 631 | roc_auc(two_class_example, truth, Class1) 632 | ``` 633 | 634 | ### 635 | 636 | The ROC AUC provides a single value that represents the overall performance of the model across different classification thresholds. 637 | 638 | ### 639 | 640 | `autoplot()` is a generic function in **ggfortify** that is used to automatically generate visualizations (plots) for various objects or data types. The purpose of `autoplot()` is to provide an easy way to create high-quality, informative plots without having to manually specify all the details. 641 | 642 | ### Exercise 11 643 | 644 | We will be using the `autoplot()` function to graph the roc curve created in the previous exercises. Within `autoplot()`, add the parameter `two_class_curve`. 645 | 646 | ```{r binay-classification-11, exercise = TRUE} 647 | 648 | ``` 649 | 650 | ```{r binay-classification-11-hint-1, eval = FALSE} 651 | autoplot() 652 | ``` 653 | 654 | ```{r include = FALSE} 655 | autoplot(two_class_curve) 656 | ``` 657 | 658 | ### 659 | 660 | Great Job! You now know the basics of binary classification metrics and how to analyze these metrics using functions such as `accuracy()`, `f_meas()`, `roc_curve()`, etc. 661 | 662 | ## Multiclass Classification Metrics 663 | ### 664 | 665 | In multiclass classification, we are dealing with problems where the target variable has more than two classes or categories. Unlike binary classification, where we have true positive, true negative, false positive, and false negative, multiclass classification introduces additional complexity in evaluating the performance of the model. 666 | 667 | ### Exercise 1 668 | 669 | The data set we will be using is `hpc_cv`. Type `tibble(hpc_cv)` to get a sense of how the data looks. 
670 | 671 | ```{r multiclass-classific-1, exercise = TRUE} 672 | 673 | ``` 674 | 675 | ```{r multiclass-classific-1-hint-1, eval = FALSE} 676 | tibble(...) 677 | ``` 678 | 679 | ```{r include = FALSE} 680 | tibble(hpc_cv) 681 | ``` 682 | 683 | ### 684 | 685 | As before, there are factors for the observed and predicted outcomes along with four other columns of predicted probabilities for each class. (These data also include a Resample column. These `hpc_cv` results are for out-of-sample predictions associated with 10-fold cross-validation.) 686 | 687 | ### Exercise 2 688 | 689 | The functions for metrics that use the discrete class predictions are identical to their binary counterparts and the functions we will be using are `accuracy()` and `mcc()`. First, within `accuracy()`, add the parameters `hpc_cv`, `obs`, and `pred`. 690 | 691 | ```{r multiclass-classific-2, exercise = TRUE} 692 | 693 | ``` 694 | 695 | ```{r multiclass-classific-2-hint-1, eval = FALSE} 696 | accuracy(hpc_cv, obs, ...) 697 | ``` 698 | 699 | ```{r include = FALSE} 700 | accuracy(hpc_cv, obs, pred) 701 | ``` 702 | 703 | ### 704 | 705 | The Matthews correlation coefficient (mcc) was originally designed for two classes but has been extended to cases with more class levels. 706 | 707 | ### Exercise 3 708 | 709 | Copy the previous code and switch the function to `mcc()`. 710 | 711 | ```{r multiclass-classific-3, exercise = TRUE} 712 | 713 | ``` 714 | 715 | 716 | 717 | ```{r multiclass-classific-3-hint-1, eval = FALSE} 718 | mcc(hpc_cv, obs, ...) 719 | ``` 720 | 721 | ```{r include = FALSE} 722 | mcc(hpc_cv, obs, pred) 723 | ``` 724 | 725 | ### 726 | 727 | Note that, in these results, a “multiclass” .estimator is listed. Like “binary,” this indicates that the formula for outcomes with three or more class levels was used. 
728 | 729 | ### Exercise 4 730 | 731 | Using sensitivity as an example, the usual two-class calculation is the ratio of the number of correctly predicted events divided by the number of true events. **yardstick** functions can automatically apply these methods via the estimator argument. 732 | 733 | Copy the previous code and switch the function to `sensitivity` and add the parameter `estimator = "macro"` 734 | 735 | ```{r multiclass-classific-4, exercise = TRUE} 736 | 737 | ``` 738 | 739 | 740 | 741 | ```{r multiclass-classific-4-hint-1, eval = FALSE} 742 | sensitivity(hpc_cv, obs, pred, estimator = "...") 743 | ``` 744 | 745 | ```{r include = FALSE} 746 | sensitivity(hpc_cv, obs, pred, estimator = "macro") 747 | ``` 748 | 749 | ### 750 | 751 | "macro" estimator refers to a method of calculating performance metrics that treats all classes equally, regardless of their size or frequency in the dataset. 752 | 753 | ### Exercise 5 754 | 755 | Copy the previous code and switch the `estimator` parameter to `"macro-weighted"`. 756 | 757 | ```{r multiclass-classific-5, exercise = TRUE} 758 | 759 | ``` 760 | 761 | 762 | 763 | ```{r multiclass-classific-5-hint-1, eval = FALSE} 764 | sensitivity(hpc_cv, obs, pred, estimator = "...") 765 | ``` 766 | 767 | ```{r include = FALSE} 768 | sensitivity(hpc_cv, obs, pred, estimator = "macro-weighted") 769 | ``` 770 | 771 | ### 772 | 773 | A "macro-weighted" strategy is a combination of both "macro" and "weighted", where the metrics are first computed separately for each class using the "macro" approach and then weighted by class size to provide a balanced metric that considers both class equality and class size. 774 | 775 | ### Exercise 6 776 | 777 | Copy the previous code and switch the `estimator` parameter to `"micro"`. 
778 | 779 | ```{r multiclass-classific-6, exercise = TRUE} 780 | 781 | ``` 782 | 783 | 784 | 785 | ```{r multiclass-classific-6-hint-1, eval = FALSE} 786 | sensitivity(hpc_cv, obs, pred, estimator = "...") 787 | ``` 788 | 789 | ```{r include = FALSE} 790 | sensitivity(hpc_cv, obs, pred, estimator = "micro") 791 | ``` 792 | 793 | ### 794 | 795 | The term "micro" refers to a method of calculating performance metrics that aggregates the true positives, false positives, and false negatives across all classes and then computes the metrics. 796 | 797 | ### Exercise 7 798 | 799 | Hand and Till (2001) determined a multiclass technique for ROC curves. In this case, all of the class probability columns must be given to the function. Type `roc_auc` and add the parameters `hpc_cv` and `obs`. This will throw an error. 800 | 801 | ```{r multiclass-classific-7, exercise = TRUE} 802 | 803 | ``` 804 | 805 | 806 | 807 | ```{r multiclass-classific-7-hint-1, eval = FALSE} 808 | roc_auc(hpc_cv, ...) 809 | ``` 810 | 811 | ```{r include = FALSE} 812 | # roc_auc(hpc_cv, obs) 813 | ``` 814 | 815 | We need to select at least one item from the data set, which are the columns `VF, F, M, L`. 816 | 817 | ### Exercise 8 818 | 819 | Copy the previous code and add `VF, F, M, L` as parameters to the function `roc_auc`. 820 | 821 | ```{r multiclass-classific-8, exercise = TRUE} 822 | 823 | ``` 824 | 825 | 826 | 827 | ```{r multiclass-classific-8-hint-1, eval = FALSE} 828 | roc_auc(hpc_cv, obs, VF, ..., M, ...) 829 | ``` 830 | 831 | ```{r include = FALSE} 832 | roc_auc(hpc_cv, obs, VF, F, M, L) 833 | ``` 834 | 835 | ### 836 | 837 | Recall that these data have a column for the resampling groups. We haven’t yet discussed resampling in detail, but notice how we can pass a grouped data frame to the metric function to compute the metrics for each group using `group_by()`. 838 | 839 | ### Exercise 9 840 | 841 | Pipe `group_by(Resample)` to `hpc_cv` and hit "Run Code". 
842 | 843 | ```{r multiclass-classific-9, exercise = TRUE} 844 | 845 | ``` 846 | 847 | ```{r multiclass-classific-9-hint-1, eval = FALSE} 848 | hpc_cv |> 849 | group_by(...) 850 | ``` 851 | 852 | ```{r include = FALSE} 853 | hpc_cv |> 854 | group_by(Resample) 855 | ``` 856 | 857 | ### 858 | 859 | In R, the `accuracy()` function is part of the **forecast** package, which is commonly used for time series forecasting and related tasks. The `accuracy()` function is used to compute accuracy measures for a fitted time series forecasting model. 860 | 861 | ### Exercise 10 862 | 863 | Copy the previous code and pipe `accuracy()` with the parameters being `obs, pred`. Hit "Run Code". 864 | 865 | ```{r multiclass-classific-10, exercise = TRUE} 866 | 867 | ``` 868 | 869 | 870 | 871 | ```{r multiclass-classific-10-hint-1, eval = FALSE} 872 | ... |> 873 | accuracy(obs, ...) 874 | ``` 875 | 876 | ```{r include = FALSE} 877 | hpc_cv |> 878 | group_by(Resample) |> 879 | accuracy(obs, pred) 880 | ``` 881 | 882 | ### 883 | 884 | Cohen's Kappa is a statistic that measures the agreement between predicted and actual class labels, considering the agreement that could occur by chance. R offers functions like `kappa2()` from the `vcd` package to calculate Cohen's Kappa. 885 | 886 | ### Exercise 11 887 | 888 | Now we will plot the data to get a better visual understanding of the grouping of the data. Copy the previous code and delete the `accuracy()` function. Instead, pipe `roc_curve()` and add the parameters `obs, VF, F, M, L`. 889 | 890 | ```{r multiclass-classific-11, exercise = TRUE} 891 | 892 | ``` 893 | 894 | 895 | 896 | ```{r multiclass-classific-11-hint-1, eval = FALSE} 897 | ... |> 898 | roc_curve(obs, VF, F, M, ...) 
899 | ``` 900 | 901 | ```{r include = FALSE} 902 | hpc_cv |> 903 | group_by(Resample) |> 904 | roc_curve(obs, VF, F, M, L) 905 | ``` 906 | 907 | ### 908 | 909 | When working with multiclass classification, R provides flexibility to use an One-Versus-All (OvA) approach, where you treat each class as the positive class and the rest as the negative class. The **caret** package's `train()` function allows specifying the `classProbs = TRUE` parameter to enable this approach. 910 | 911 | ### Exercise 12 912 | 913 | Copy the previous code and pipe `autoplot()`. Hit "Run Code." 914 | 915 | ```{r multiclass-classific-12, exercise = TRUE} 916 | 917 | ``` 918 | 919 | 920 | 921 | ```{r multiclass-classific-12-hint-1, eval = FALSE} 922 | hpc_cv |> 923 | group_by(Resample) |> 924 | roc_curve(obs, VF, F, M, L) |> 925 | ...() 926 | ``` 927 | 928 | ```{r include = FALSE} 929 | hpc_cv |> 930 | group_by(Resample) |> 931 | roc_curve(obs, VF, F, M, L) |> 932 | autoplot() 933 | ``` 934 | 935 | ### 936 | 937 | Great job! You now know how to calculate and analyze multiclass classification metrics using methods such as `roc_curve()` and multiclass estimator such as `macro, macro-weighted, and micro". 938 | 939 | 940 | 941 | ## Summary 942 | ### 943 | 944 | This tutorial covered [Chapter 9: Judging Model Effectiveness](https://www.tmwr.org/compare.html#workflow-set) from [*Tidy Modeling with R*](https://www.tmwr.org/) by Max Kuhn and Julia Silge. This tutorial demonstrated the **yardstick** package, a core **tidymodels** packages with the focus of measuring model performance. Before illustrating syntax, we explored whether empirical validation using performance metrics is worthwhile when a model is focused on inference rather than prediction. Empirical validation can provide valuable insights into the model's goodness of fit and reliability. However, it's essential to keep in mind that the choice of performance metrics may differ from those commonly used in prediction models. 
Metrics like R-squared, which are popular for predictive models, may not be as informative for inferential models. 945 | 946 | ```{r download-answers, child = system.file("child_documents/download_answers.Rmd", package = "tutorial.helpers")} 947 | ``` 948 | -------------------------------------------------------------------------------- /inst/tutorials/10-resampling/images/fig-ten-point-eight.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/10-resampling/images/fig-ten-point-eight.png -------------------------------------------------------------------------------- /inst/tutorials/10-resampling/images/fig-ten-point-five.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/10-resampling/images/fig-ten-point-five.png -------------------------------------------------------------------------------- /inst/tutorials/10-resampling/images/fig-ten-point-one.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/10-resampling/images/fig-ten-point-one.png -------------------------------------------------------------------------------- /inst/tutorials/10-resampling/images/fig-ten-point-seven.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/10-resampling/images/fig-ten-point-seven.png -------------------------------------------------------------------------------- /inst/tutorials/10-resampling/images/fig-ten-point-six.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/10-resampling/images/fig-ten-point-six.png -------------------------------------------------------------------------------- /inst/tutorials/10-resampling/images/fig-ten-point-three.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/10-resampling/images/fig-ten-point-three.png -------------------------------------------------------------------------------- /inst/tutorials/10-resampling/images/fig-ten-point-two.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/10-resampling/images/fig-ten-point-two.png -------------------------------------------------------------------------------- /inst/tutorials/11-comparing-models/data/linear-statistical-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/11-comparing-models/data/linear-statistical-model.png -------------------------------------------------------------------------------- /inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic1.png -------------------------------------------------------------------------------- /inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic2.png -------------------------------------------------------------------------------- /inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic3.png -------------------------------------------------------------------------------- /inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic4.png -------------------------------------------------------------------------------- /inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/12-model-tuning-and-the-dangers-of-overfitting/images/pic5.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic1.png -------------------------------------------------------------------------------- 
/inst/tutorials/14-iterative-search/images/pic10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic10.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic2.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic3.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic4.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic5.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic6.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic6.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic7.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic8.png -------------------------------------------------------------------------------- /inst/tutorials/14-iterative-search/images/pic9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/14-iterative-search/images/pic9.png -------------------------------------------------------------------------------- /inst/tutorials/16-dimensionality-reduction/images/pic1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/16-dimensionality-reduction/images/pic1.png -------------------------------------------------------------------------------- /inst/tutorials/16-dimensionality-reduction/images/pic2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/16-dimensionality-reduction/images/pic2.png -------------------------------------------------------------------------------- /inst/tutorials/16-dimensionality-reduction/images/pic3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/16-dimensionality-reduction/images/pic3.png -------------------------------------------------------------------------------- /inst/tutorials/16-dimensionality-reduction/images/pic4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/16-dimensionality-reduction/images/pic4.png -------------------------------------------------------------------------------- /inst/tutorials/16-dimensionality-reduction/images/pic5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/16-dimensionality-reduction/images/pic5.png -------------------------------------------------------------------------------- /inst/tutorials/18-explaining-models-and-predictions/images/pic1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/inst/tutorials/18-explaining-models-and-predictions/images/pic1.png -------------------------------------------------------------------------------- /inst/tutorials/18-explaining-models-and-predictions/tutorial.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Explaining Models and Predictions 3 
| author: Aryan Kancherla 4 | tutorial: 5 | id: explaining-models-and-predictions 6 | output: 7 | learnr::tutorial: 8 | progressive: yes 9 | allow_skip: yes 10 | runtime: shiny_prerendered 11 | description: 'Tutorial for Chapter 18: Explaining Models and Predictions' 12 | --- 13 | 14 | ```{r setup, include = FALSE} 15 | library(learnr) 16 | library(tutorial.helpers) 17 | library(knitr) 18 | 19 | library(tidymodels) 20 | library(DALEXtra) 21 | library(forcats) 22 | 23 | tidymodels_prefer() 24 | 25 | 26 | knitr::opts_chunk$set(echo = FALSE) 27 | options(tutorial.exercise.timelimit = 60, 28 | tutorial.storage = "local") 29 | 30 | ames_update <- ames |> 31 | mutate(Sale_Price = log10(Sale_Price)) 32 | 33 | set.seed(502) 34 | ames_strata_split <- initial_split(ames_update, prop = 0.80, strata = Sale_Price) 35 | ames_train <- training(ames_strata_split) 36 | 37 | ames_rec <- 38 | recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 39 | Latitude + Longitude, data = ames_train) |> 40 | step_log(Gr_Liv_Area, base = 10) |> 41 | step_other(Neighborhood, threshold = 0.01) |> 42 | step_dummy(all_nominal_predictors()) |> 43 | step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) |> 44 | step_ns(Latitude, Longitude, deg_free = 20) 45 | 46 | lm_model <- linear_reg() |> set_engine("lm") 47 | 48 | lm_wflow <- 49 | workflow() |> 50 | add_model(lm_model) |> 51 | add_recipe(ames_rec) 52 | 53 | lm_fit <- fit(lm_wflow, ames_train) 54 | 55 | vip_features <- c("Neighborhood", "Gr_Liv_Area", "Year_Built", "Bldg_Type", "Latitude", "Longitude") 56 | 57 | vip_train <- 58 | ames_train |> 59 | select(all_of(vip_features)) 60 | 61 | explainer_lm <- 62 | explain_tidymodels( 63 | lm_fit, 64 | data = vip_train, 65 | y = ames_train$Sale_Price, 66 | label = "lm + interactions", 67 | verbose = FALSE 68 | ) 69 | 70 | rf_model <- 71 | rand_forest(trees = 1000) |> 72 | set_engine("ranger") |> 73 | set_mode("regression") 74 | 75 | rf_wflow <- 76 | workflow() |> 77 | add_formula( 78 
| Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 79 | Latitude + Longitude) |> 80 | add_model(rf_model) 81 | 82 | rf_fit <- rf_wflow |> fit(data = ames_train) 83 | 84 | explainer_rf <- 85 | explain_tidymodels( 86 | rf_fit, 87 | data = vip_train, 88 | y = ames_train$Sale_Price, 89 | label = "random forest", 90 | verbose = FALSE 91 | ) 92 | 93 | ``` 94 | 95 | ```{r copy-code-chunk, child = system.file("child_documents/copy_button.Rmd", package = "tutorial.helpers")} 96 | ``` 97 | 98 | ```{r info-section, child = system.file("child_documents/info_section.Rmd", package = "tutorial.helpers")} 99 | ``` 100 | 101 | ## Introduction 102 | ### 103 | 104 | 105 | 106 | ## Software for Model Explanations 107 | ### 108 | 109 | In Section [1.2](https://www.tmwr.org/software-modeling#model-types) of Chapter [1](https://www.tmwr.org/software-modeling), a taxonomy of models was outlined, suggesting that models typically are built as one or more of descriptive, inferential, or predictive. The chapter suggested that model performance, as measured by appropriate metrics (like RMSE for regression or area under the ROC curve for classification), can be important for all modeling applications. Similarly, model explanations, answering *why* a model makes the predictions it does, can be important whether the purpose of your model is largely descriptive, to test a hypothesis, or to make a prediction. 110 | 111 | ### Exercise 1 112 | 113 | Load the **DALEXtra** library using `library()`. 114 | 115 | ```{r software-for-model-e-1, exercise = TRUE} 116 | 117 | ``` 118 | 119 | ```{r software-for-model-e-1-hint-1, eval = FALSE} 120 | library(...) 121 | ``` 122 | 123 | ```{r include = FALSE} 124 | library(DALEXtra) 125 | ``` 126 | 127 | ### 128 | 129 | The tidymodels framework does not itself contain software for model explanations. 
Instead, models trained and evaluated with tidymodels can be explained with other, supplementary software in R packages such as **lime**, **vip**, and **DALEX**. 130 | 131 | **DALEXtra**, which is an add-on package for **DALEX**, provides support for tidymodels. 132 | 133 | ### Exercise 2 134 | 135 | In Chapters [10](https://www.tmwr.org/resampling) and [11](https://www.tmwr.org/compare), several models were trained and compared to predict the price of homes in Ames, IA, including a linear model with interactions and a random forest model, with the results shown below: 136 | 137 | ```{r} 138 | knitr::include_graphics("images/pic1.png") 139 | ``` 140 | 141 | ### 142 | 143 | **vip** functions are chosen for *model-based* methods that take advantage of model structure (and are often faster) 144 | **DALEX** functions are chosen for *model-agnostic* methods that can be applied to any model 145 | 146 | ### Exercise 3 147 | 148 | Let’s build model-agnostic explainers for both of these models (see the graph from the previous exercise) to find out why they make these predictions. 149 | 150 | In the code chunk below, create a vector that contains `"Neighborhood"`, `"Gr_Liv_Area"`, `"Year_Built"`, `"Bldg_Type"`, `"Latitude"`, and `"Longitude"`. 151 | 152 | ```{r software-for-model-e-3, exercise = TRUE} 153 | 154 | ``` 155 | 156 | ```{r software-for-model-e-3-hint-1, eval = FALSE} 157 | c("...", "...", "...", "Bldg_Type", "Latitude", "Longitude") 158 | ``` 159 | 160 | ```{r include = FALSE} 161 | c("Neighborhood", "Gr_Liv_Area", "Year_Built", "Bldg_Type", "Latitude", "Longitude") 162 | ``` 163 | 164 | ### 165 | 166 | Answering the question “why?” allows modeling practitioners to understand which features were important in predictions and even how model predictions would change under different values for the features. 167 | 168 | 169 | ### Exercise 4 170 | 171 | Copy the previous code and assign it to a new variable named `vip_features`. 
172 | 173 | ```{r software-for-model-e-4, exercise = TRUE} 174 | 175 | ``` 176 | 177 | 178 | 179 | ```{r software-for-model-e-4-hint-1, eval = FALSE} 180 | ... <- c("Neighborhood", "Gr_Liv_Area", "Year_Built", "Bldg_Type", "Latitude", "Longitude") 181 | ``` 182 | 183 | ```{r include = FALSE} 184 | vip_features <- c("Neighborhood", "Gr_Liv_Area", "Year_Built", "Bldg_Type", "Latitude", "Longitude") 185 | ``` 186 | 187 | ### 188 | 189 | For some models, like linear regression, it is usually clear how to explain why the model makes its predictions. The structure of a linear model contains coefficients for each predictor that are typically straightforward to interpret. 190 | 191 | ### Exercise 5 192 | 193 | Load the **tidymodels** package using `library()`. Then, on a new line, type in `tidymodels_prefer()` to get rid of naming conflicts. 194 | 195 | ```{r software-for-model-e-5, exercise = TRUE} 196 | 197 | ``` 198 | 199 | ```{r software-for-model-e-5-hint-1, eval = FALSE} 200 | library(...) 201 | tidymodels_prefer() 202 | ``` 203 | 204 | ```{r include = FALSE} 205 | library(tidymodels) 206 | tidymodels_prefer() 207 | ``` 208 | 209 | ### 210 | 211 | As a reminder, the `ames` data set comes from the **modeldata** package, which is loaded when you load the **tidymodels** package. 212 | 213 | ### Exercise 6 214 | 215 | Since the models from the graph in Exercise 2 use the Ames data set, the code for the splits and recipes is needed. Press "Run code". 
216 | 217 | ```{r software-for-model-e-6, exercise = TRUE} 218 | ames_update <- ames |> 219 | mutate(Sale_Price = log10(Sale_Price)) 220 | 221 | set.seed(502) 222 | ames_strata_split <- initial_split(ames_update, prop = 0.80, strata = Sale_Price) 223 | ames_train <- training(ames_strata_split) 224 | 225 | ames_rec <- 226 | recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 227 | Latitude + Longitude, data = ames_train) |> 228 | step_log(Gr_Liv_Area, base = 10) |> 229 | step_other(Neighborhood, threshold = 0.01) |> 230 | step_dummy(all_nominal_predictors()) |> 231 | step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) |> 232 | step_ns(Latitude, Longitude, deg_free = 20) 233 | 234 | lm_model <- linear_reg() |> set_engine("lm") 235 | 236 | lm_wflow <- 237 | workflow() |> 238 | add_model(lm_model) |> 239 | add_recipe(ames_rec) 240 | 241 | lm_fit <- fit(lm_wflow, ames_train) 242 | ``` 243 | 244 | ```{r include = FALSE} 245 | ames_update <- ames |> 246 | mutate(Sale_Price = log10(Sale_Price)) 247 | 248 | set.seed(502) 249 | ames_strata_split <- initial_split(ames_update, prop = 0.80, strata = Sale_Price) 250 | ames_train <- training(ames_strata_split) 251 | 252 | ames_rec <- 253 | recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 254 | Latitude + Longitude, data = ames_train) |> 255 | step_log(Gr_Liv_Area, base = 10) |> 256 | step_other(Neighborhood, threshold = 0.01) |> 257 | step_dummy(all_nominal_predictors()) |> 258 | step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) |> 259 | step_ns(Latitude, Longitude, deg_free = 20) 260 | 261 | lm_model <- linear_reg() |> set_engine("lm") 262 | 263 | lm_wflow <- 264 | workflow() |> 265 | add_model(lm_model) |> 266 | add_recipe(ames_rec) 267 | 268 | lm_fit <- fit(lm_wflow, ames_train) 269 | ``` 270 | 271 | ### 272 | 273 | These are the variables you coded in the past tutorials for the `ames` data set. 
See the "Feature Engineering with recipes" tutorial or Chapter [8](https://www.tmwr.org/recipes) to review this. 274 | 275 | ### Exercise 7 276 | 277 | In the code chunk below, pipe `ames_train` to `select()`. Inside this function, type `all_of()`. Inside `all_of()`, type in `vip_features`. 278 | 279 | ```{r software-for-model-e-7, exercise = TRUE} 280 | 281 | ``` 282 | 283 | ```{r software-for-model-e-7-hint-1, eval = FALSE} 284 | ... |> 285 | select(all_of(...)) 286 | ``` 287 | 288 | ```{r include = FALSE} 289 | ames_train |> 290 | select(all_of(vip_features)) 291 | ``` 292 | 293 | ### 294 | 295 | `all_of()` is a function that selects variables from character vectors. 296 | 297 | ### Exercise 8 298 | 299 | Copy the previous code and assign it to a new variable named `vip_train`. 300 | 301 | ```{r software-for-model-e-8, exercise = TRUE} 302 | 303 | ``` 304 | 305 | 306 | 307 | ```{r software-for-model-e-8-hint-1, eval = FALSE} 308 | ... <- 309 | ames_train |> 310 | select(all_of(vip_features)) 311 | ``` 312 | 313 | ```{r include = FALSE} 314 | vip_train <- 315 | ames_train |> 316 | select(all_of(vip_features)) 317 | ``` 318 | 319 | ### 320 | 321 | Przemyslaw Biecek and Tomasz Burzykowski's [*Explanatory Model Analysis*](https://ema.drwhy.ai/) book provides a thorough exploration of how to use **DALEX** for model explanations. 322 | 323 | ### Exercise 9 324 | 325 | Now, let's generate some information about the model. In the code chunk below, type in `explain_tidymodels()`. Inside this function, type in `lm_fit`, set `data` to `vip_train`, and set `y` to `ames_train$Sale_Price`. 326 | 327 | ```{r software-for-model-e-9, exercise = TRUE} 328 | 329 | ``` 330 | 331 | ```{r software-for-model-e-9-hint-1, eval = FALSE} 332 | explain_tidymodels( 333 | ..., 334 | data = vip_train, 335 | y = ...$... 
336 | ) 337 | ``` 338 | 339 | ```{r include = FALSE} 340 | explain_tidymodels( 341 | lm_fit, 342 | data = vip_train, 343 | y = ames_train$Sale_Price 344 | ) 345 | ``` 346 | 347 | ### 348 | 349 | `explain_tidymodels()` is a function (from the **DALEXtra** package) that creates an explainer from your tidymodels workflow. In this scenario, the function is being used for the linear model `lm_fit`. 350 | 351 | ### Exercise 10 352 | 353 | Copy the previous code. Inside `explain_tidymodels()`, set `label` to `"lm + interactions"` and `verbose` to `FALSE`. 354 | 355 | ```{r software-for-model-e-10, exercise = TRUE} 356 | 357 | ``` 358 | 359 | 360 | 361 | ```{r software-for-model-e-10-hint-1, eval = FALSE} 362 | explain_tidymodels( 363 | lm_fit, 364 | data = vip_train, 365 | y = ames_train$Sale_Price, 366 | ... = "lm + interactions", 367 | verbose = ... 368 | ) 369 | ``` 370 | 371 | ```{r include = FALSE} 372 | explain_tidymodels( 373 | lm_fit, 374 | data = vip_train, 375 | y = ames_train$Sale_Price, 376 | label = "lm + interactions", 377 | verbose = FALSE 378 | ) 379 | ``` 380 | 381 | ### 382 | 383 | For other models, like random forests that can capture nonlinear behavior by design, it is less transparent how to explain the model’s predictions from only the structure of the model itself. Instead, we can apply model explainer algorithms to generate understanding of predictions. 384 | 385 | ### Exercise 11 386 | 387 | Copy the previous code and assign it to a new variable named `explainer_lm`. 388 | 389 | ```{r software-for-model-e-11, exercise = TRUE} 390 | 391 | ``` 392 | 393 | 394 | 395 | ```{r software-for-model-e-11-hint-1, eval = FALSE} 396 | ... 
<- 397 | explain_tidymodels( 398 | lm_fit, 399 | data = vip_train, 400 | y = ames_train$Sale_Price, 401 | label = "lm + interactions", 402 | verbose = FALSE 403 | ) 404 | ``` 405 | 406 | ```{r include = FALSE} 407 | explainer_lm <- 408 | explain_tidymodels( 409 | lm_fit, 410 | data = vip_train, 411 | y = ames_train$Sale_Price, 412 | label = "lm + interactions", 413 | verbose = FALSE 414 | ) 415 | ``` 416 | 417 | ### 418 | 419 | Click [here](https://search.r-project.org/CRAN/refmans/DALEXtra/html/explain_tidymodels.html) to learn more about the `explain_tidymodels()` function. 420 | 421 | ### Exercise 12 422 | 423 | Press "Run code". 424 | 425 | ```{r software-for-model-e-12, exercise = TRUE} 426 | rf_model <- 427 | rand_forest(trees = 1000) |> 428 | set_engine("ranger") |> 429 | set_mode("regression") 430 | 431 | rf_wflow <- 432 | workflow() |> 433 | add_formula( 434 | Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 435 | Latitude + Longitude) |> 436 | add_model(rf_model) 437 | 438 | rf_fit <- rf_wflow |> fit(data = ames_train) 439 | ``` 440 | 441 | ```{r include = FALSE} 442 | rf_model <- 443 | rand_forest(trees = 1000) |> 444 | set_engine("ranger") |> 445 | set_mode("regression") 446 | 447 | rf_wflow <- 448 | workflow() |> 449 | add_formula( 450 | Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 451 | Latitude + Longitude) |> 452 | add_model(rf_model) 453 | 454 | rf_fit <- rf_wflow |> fit(data = ames_train) 455 | ``` 456 | 457 | ### 458 | 459 | These were the variables you created in the "Resampling for Evaluating Performance" tutorial. `rf_model` is a random forest model that has `1000` trees. Then, this model is used to create a random forest workflow, adding `Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + Latitude + Longitude` as the formula. Then, this model is fitted, with `data` being `ames_train`. 460 | 461 | Visit Chapter [10](https://www.tmwr.org/resampling) to review this process. 
462 | 463 | ### Exercise 13 464 | 465 | In the code chunk below, type in `explain_tidymodels()`. Inside this function, type in `rf_fit`, set `data` to `vip_train`, and set `y` to `ames_train$Sale_Price`. 466 | 467 | ```{r software-for-model-e-13, exercise = TRUE} 468 | 469 | ``` 470 | 471 | ```{r software-for-model-e-13-hint-1, eval = FALSE} 472 | explain_tidymodels( 473 | ..., 474 | data = ..., 475 | ... = ames_train$Sale_Price 476 | ) 477 | ``` 478 | 479 | ```{r include = FALSE} 480 | explain_tidymodels( 481 | rf_fit, 482 | data = vip_train, 483 | y = ames_train$Sale_Price 484 | ) 485 | ``` 486 | 487 | ### 488 | 489 | There are two types of model explanations, *global* and *local.* Global model explanations provide an overall understanding aggregated over a whole set of observations; local model explanations provide information about a prediction for a single observation. 490 | 491 | ### Exercise 14 492 | 493 | Copy the previous code. Inside `explain_tidymodels()`, set `label` to `"random forest"` and set `verbose` to `FALSE`. 494 | 495 | ```{r software-for-model-e-14, exercise = TRUE} 496 | 497 | ``` 498 | 499 | 500 | 501 | ```{r software-for-model-e-14-hint-1, eval = FALSE} 502 | explain_tidymodels( 503 | rf_fit, 504 | data = vip_train, 505 | y = ames_train$Sale_Price, 506 | label = "...", 507 | ... = FALSE 508 | ) 509 | ``` 510 | 511 | ```{r include = FALSE} 512 | explain_tidymodels( 513 | rf_fit, 514 | data = vip_train, 515 | y = ames_train$Sale_Price, 516 | label = "random forest", 517 | verbose = FALSE 518 | ) 519 | ``` 520 | 521 | ### 522 | 523 | A linear model is typically straightforward to interpret and explain; you may not often find yourself using separate model explanation algorithms for a linear model. However, it can sometimes be difficult to understand or explain the predictions of even a linear model once it has splines and interaction terms! 
524 | 525 | ### Exercise 15 526 | 527 | Copy the previous code and assign it to a new variable named `explainer_rf`. 528 | 529 | ```{r software-for-model-e-15, exercise = TRUE} 530 | 531 | ``` 532 | 533 | 534 | 535 | ```{r software-for-model-e-15-hint-1, eval = FALSE} 536 | ... <- 537 | explain_tidymodels( 538 | rf_fit, 539 | data = vip_train, 540 | y = ames_train$Sale_Price, 541 | label = "random forest", 542 | verbose = FALSE 543 | ) 544 | ``` 545 | 546 | ```{r include = FALSE} 547 | explainer_rf <- 548 | explain_tidymodels( 549 | rf_fit, 550 | data = vip_train, 551 | y = ames_train$Sale_Price, 552 | label = "random forest", 553 | verbose = FALSE 554 | ) 555 | ``` 556 | 557 | ### 558 | 559 | ## Summary 560 | ### 561 | 562 | 563 | 564 | ```{r download-answers, child = system.file("child_documents/download_answers.Rmd", package = "tutorial.helpers")} 565 | ``` 566 | -------------------------------------------------------------------------------- /man/figures/README-pressure-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PPBDS/tidymodels.tutorials/d5bd79fd17cec2ca29c97703a482b6bfcb909dd3/man/figures/README-pressure-1.png -------------------------------------------------------------------------------- /man/tidymodels.tutorials-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tidymodels.tutorials-package.R 3 | \docType{package} 4 | \name{tidymodels.tutorials-package} 5 | \alias{tidymodels.tutorials} 6 | \alias{tidymodels.tutorials-package} 7 | \title{tidymodels.tutorials: Tutorials for Tidy Modeling with R} 8 | \description{ 9 | This package provides tutorials for Tidy Modeling with R by Max Kuhn and Julia Silge. In an ideal world, students would read the book and type in all the associated R commands themselves. Sadly, that often does not happen. 
These tutorials allow students to demonstrate (and their instructors to be sure) that all work has been completed. See the tutorial.helpers package for a background discussion of the tool and approach. 10 | } 11 | \author{ 12 | \strong{Maintainer}: David Kane \email{dave.kane@gmail.com} (\href{https://orcid.org/0000-0002-6660-3934}{ORCID}) [copyright holder] 13 | 14 | } 15 | \keyword{internal} 16 | -------------------------------------------------------------------------------- /renv/.gitignore: -------------------------------------------------------------------------------- 1 | library/ 2 | local/ 3 | cellar/ 4 | lock/ 5 | python/ 6 | sandbox/ 7 | staging/ 8 | -------------------------------------------------------------------------------- /renv/activate.R: -------------------------------------------------------------------------------- 1 | 2 | local({ 3 | 4 | # the requested version of renv 5 | version <- "1.0.0" 6 | attr(version, "sha") <- NULL 7 | 8 | # the project directory 9 | project <- getwd() 10 | 11 | # figure out whether the autoloader is enabled 12 | enabled <- local({ 13 | 14 | # first, check config option 15 | override <- getOption("renv.config.autoloader.enabled") 16 | if (!is.null(override)) 17 | return(override) 18 | 19 | # next, check environment variables 20 | # TODO: prefer using the configuration one in the future 21 | envvars <- c( 22 | "RENV_CONFIG_AUTOLOADER_ENABLED", 23 | "RENV_AUTOLOADER_ENABLED", 24 | "RENV_ACTIVATE_PROJECT" 25 | ) 26 | 27 | for (envvar in envvars) { 28 | envval <- Sys.getenv(envvar, unset = NA) 29 | if (!is.na(envval)) 30 | return(tolower(envval) %in% c("true", "t", "1")) 31 | } 32 | 33 | # enable by default 34 | TRUE 35 | 36 | }) 37 | 38 | if (!enabled) 39 | return(FALSE) 40 | 41 | # avoid recursion 42 | if (identical(getOption("renv.autoloader.running"), TRUE)) { 43 | warning("ignoring recursive attempt to run renv autoloader") 44 | return(invisible(TRUE)) 45 | } 46 | 47 | # signal that we're loading renv during R 
# mark the renv autoloader as running for the duration of startup
options(renv.autoloader.running = TRUE)
on.exit(options(renv.autoloader.running = NULL), add = TRUE)

# signal that we've consented to use renv
options(renv.consent = TRUE)

# load the 'utils' package eagerly -- this ensures that renv shims, which
# mask 'utils' packages, will come first on the search path
library(utils, lib.loc = .Library)

# unload renv if it's already been loaded
if ("renv" %in% loadedNamespaces())
  unloadNamespace("renv")

# load bootstrap tools

# null-coalescing helper: return 'y' when 'x' is NULL, otherwise 'x'
`%||%` <- function(x, y) {
  if (is.null(x)) y else x
}

# printf-style wrapper over cat(); silenced entirely when the
# 'renv.bootstrap.quiet' option is set. Invisibly returns the message.
catf <- function(fmt, ..., appendLF = TRUE) {

  quiet <- getOption("renv.bootstrap.quiet", default = FALSE)
  if (quiet)
    return(invisible())

  msg <- sprintf(fmt, ...)
  cat(msg, file = stdout(), sep = if (appendLF) "\n" else "")

  invisible(msg)

}

# build a section header of the form "<prefix> <label> ----", padded with
# 'suffix' characters up to 'n' columns
header <- function(label,
                   ...,
                   prefix = "#",
                   suffix = "-",
                   n = min(getOption("width"), 78))
{
  label <- sprintf(label, ...)
  n <- max(n - nchar(label) - nchar(prefix) - 2L, 8L)
  if (n <= 0)
    return(paste(prefix, label))

  tail <- paste(rep.int(suffix, n), collapse = "")
  paste0(prefix, " ", label, " ", tail)

}

# does 'string' begin with 'prefix'? (vectorized over 'string')
startswith <- function(string, prefix) {
  substring(string, 1, nchar(prefix)) == prefix
}

# download the requested version of renv and install it into 'library',
# reporting progress as we go; errors are re-signalled with context
bootstrap <- function(version, library) {

  friendly <- renv_bootstrap_version_friendly(version)
  section <- header(sprintf("Bootstrapping renv %s", friendly))
  catf(section)

  # attempt to download renv
  catf("- Downloading renv ... ", appendLF = FALSE)
  withCallingHandlers(
    tarball <- renv_bootstrap_download(version),
    error = function(err) {
      catf("FAILED")
      stop("failed to download:\n", conditionMessage(err))
    }
  )
  catf("OK")
  on.exit(unlink(tarball), add = TRUE)

  # now attempt to install
  catf("- Installing renv ... ", appendLF = FALSE)
  withCallingHandlers(
    status <- renv_bootstrap_install(version, tarball, library),
    error = function(err) {
      catf("FAILED")
      stop("failed to install:\n", conditionMessage(err))
    }
  )
  catf("OK")

  # add empty line to break up bootstrapping from normal output
  catf("")

  return(invisible())
}

# TRUE when renv's own test suite is driving this process
renv_bootstrap_tests_running <- function() {
  getOption("renv.tests.running", default = FALSE)
}

# resolve the set of package repositories to consult when bootstrapping
# renv, honouring (in order) an explicit override, the project lockfile,
# and the session's 'repos' option with a CRAN fallback appended
renv_bootstrap_repos <- function() {

  # get CRAN repository
  cran <- getOption("renv.repos.cran", "https://cloud.r-project.org")

  # check for repos override
  repos <- Sys.getenv("RENV_CONFIG_REPOS_OVERRIDE", unset = NA)
  if (!is.na(repos)) {

    # check for RSPM; if set, use a fallback repository for renv
    rspm <- Sys.getenv("RSPM", unset = NA)
    if (identical(rspm, repos))
      repos <- c(RSPM = rspm, CRAN = cran)

    return(repos)

  }

  # check for lockfile repositories
  repos <- tryCatch(renv_bootstrap_repos_lockfile(), error = identity)
  if (!inherits(repos, "error") && length(repos))
    return(repos)

  # retrieve current repos
  repos <- getOption("repos")

  # ensure @CRAN@ entries are resolved
  repos[repos == "@CRAN@"] <- cran

  # add in renv.bootstrap.repos if set
  default <- c(FALLBACK = "https://cloud.r-project.org")
  extra <- getOption("renv.bootstrap.repos", default = default)
  repos <- c(repos, extra)

  # remove duplicates that might've snuck in
  dupes <- duplicated(repos) | duplicated(names(repos))
  repos[!dupes]

}

# read the repositories recorded in the project lockfile, if any;
# returns a named character vector of URLs, or NULL on failure
renv_bootstrap_repos_lockfile <- function() {

  lockpath <- Sys.getenv("RENV_PATHS_LOCKFILE", unset = "renv.lock")
  if (!file.exists(lockpath))
    return(NULL)

  lockfile <- tryCatch(renv_json_read(lockpath), error = identity)
  if (inherits(lockfile, "error")) {
    warning(lockfile)
    return(NULL)
  }

  repos <- lockfile$R$Repositories
  if (length(repos) == 0)
    return(NULL)

  keys <- vapply(repos, `[[`, "Name", FUN.VALUE = character(1))
  vals <- vapply(repos, `[[`, "URL", FUN.VALUE = character(1))
  names(vals) <- keys

  return(vals)

}

# try each applicable download method in turn (local tarball, CRAN,
# CRAN archive, or GitHub for dev builds); return the path to the
# downloaded tarball, or stop() if every method failed
renv_bootstrap_download <- function(version) {

  sha <- attr(version, "sha", exact = TRUE)

  methods <- if (!is.null(sha)) {

    # attempting to bootstrap a development version of renv
    c(
      function() renv_bootstrap_download_tarball(sha),
      function() renv_bootstrap_download_github(sha)
    )

  } else {

    # attempting to bootstrap a release version of renv
    c(
      function() renv_bootstrap_download_tarball(version),
      function() renv_bootstrap_download_cran_latest(version),
      function() renv_bootstrap_download_cran_archive(version)
    )

  }

  for (method in methods) {
    path <- tryCatch(method(), error = identity)
    if (is.character(path) && file.exists(path))
      return(path)
  }

  stop("All download methods failed")

}

# download 'url' to 'destfile', working around a Windows file:// bug
# and forwarding any user-configured custom headers
renv_bootstrap_download_impl <- function(url, destfile) {

  mode <- "wb"

  # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17715
  fixup <-
    Sys.info()[["sysname"]] == "Windows" &&
    substring(url, 1L, 5L) == "file:"

  if (fixup)
    mode <- "w+b"

  args <- list(
    url = url,
    destfile = destfile,
    mode = mode,
    quiet = TRUE
  )

  # only newer versions of download.file() accept 'headers'
  if ("headers" %in% names(formals(utils::download.file)))
    args$headers <- renv_bootstrap_download_custom_headers(url)

  do.call(utils::download.file, args)

}

# evaluate the user's 'renv.download.headers' option (a function of the
# URL) and validate that it yields a named character vector
renv_bootstrap_download_custom_headers <- function(url) {

  headers <- getOption("renv.download.headers")
  if (is.null(headers))
    return(character())

  # NOTE(review): stopf() is not defined within this chunk; confirm it is
  # defined earlier in this file, else this error path would itself error
  if (!is.function(headers))
    stopf("'renv.download.headers' is not a function")

  headers <- headers(url)
  if (length(headers) == 0L)
    return(character())

  if (is.list(headers))
    headers <- unlist(headers, recursive = FALSE, use.names = TRUE)

  ok <-
    is.character(headers) &&
    is.character(names(headers)) &&
    all(nzchar(names(headers)))

  if (!ok)
    stop("invocation of 'renv.download.headers' did not return a named character vector")

  headers

}

# download the requested renv release from a CRAN-like repository;
# returns the tarball path on success, FALSE on failure
renv_bootstrap_download_cran_latest <- function(version) {

  spec <- renv_bootstrap_download_cran_latest_find(version)
  type <- spec$type
  repos <- spec$repos

  baseurl <- utils::contrib.url(repos = repos, type = type)
  ext <- if (identical(type, "source"))
    ".tar.gz"
  else if (Sys.info()[["sysname"]] == "Windows")
    ".zip"
  else
    ".tgz"
  name <- sprintf("renv_%s%s", version, ext)
  url <- paste(baseurl, name, sep = "/")

  destfile <- file.path(tempdir(), name)
  status <- tryCatch(
    renv_bootstrap_download_impl(url, destfile),
    condition = identity
  )

  if (inherits(status, "condition"))
    return(FALSE)

  # report success and return
  destfile

}

# locate a repository + package type providing the requested renv
# version, preferring binaries where the platform supports them
renv_bootstrap_download_cran_latest_find <- function(version) {

  # check whether binaries are supported on this system
  binary <-
    getOption("renv.bootstrap.binary", default = TRUE) &&
    !identical(.Platform$pkgType, "source") &&
    !identical(getOption("pkgType"), "source") &&
    Sys.info()[["sysname"]] %in% c("Darwin", "Windows")

  types <- c(if (binary) "binary", "source")

  # iterate over types + repositories
  for (type in types) {
    for (repos in renv_bootstrap_repos()) {

      # retrieve package database
      db <- tryCatch(
        as.data.frame(
          utils::available.packages(type = type, repos = repos),
          stringsAsFactors = FALSE
        ),
        error = identity
      )

      if (inherits(db, "error"))
        next

      # check for compatible entry
      entry <- db[db$Package %in% "renv" & db$Version %in% version, ]
      if (nrow(entry) == 0)
        next

      # found it; return spec to caller
      spec <- list(entry = entry, type = type, repos = repos)
      return(spec)

    }
  }

  # if we got here, we failed to find renv
  fmt <- "renv %s is not available from your declared package repositories"
  stop(sprintf(fmt, version))

}

# fall back to the CRAN source archive for older renv releases;
# returns the tarball path on success, FALSE on failure
renv_bootstrap_download_cran_archive <- function(version) {

  name <- sprintf("renv_%s.tar.gz", version)
  repos <- renv_bootstrap_repos()
  urls <- file.path(repos, "src/contrib/Archive/renv", name)
  destfile <- file.path(tempdir(), name)

  for (url in urls) {

    status <- tryCatch(
      renv_bootstrap_download_impl(url, destfile),
      condition = identity
    )

    if (identical(status, 0L))
      return(destfile)

  }

  return(FALSE)

}

# use a user-supplied local tarball (RENV_BOOTSTRAP_TARBALL) if one was
# provided; returns NULL (invisibly) when the variable is unset or stale
renv_bootstrap_download_tarball <- function(version) {

  # if the user has provided the path to a tarball via
  # an environment variable, then use it
  tarball <- Sys.getenv("RENV_BOOTSTRAP_TARBALL", unset = NA)
  if (is.na(tarball))
    return()

  # allow directories
  if (dir.exists(tarball)) {
    name <- sprintf("renv_%s.tar.gz", version)
    tarball <- file.path(tarball, name)
  }

  # bail if it doesn't exist
  if (!file.exists(tarball)) {

    # let the user know we weren't able to honour their request
    fmt <- "- RENV_BOOTSTRAP_TARBALL is set (%s) but does not exist."
    msg <- sprintf(fmt, tarball)
    warning(msg)

    # bail
    return()

  }

  catf("- Using local tarball '%s'.", tarball)
  tarball

}

# download a development tarball for 'version' (a sha) from the GitHub
# API, using curl/wget with a GITHUB_PAT auth header when available;
# returns the tarball path on success, FALSE on failure
renv_bootstrap_download_github <- function(version) {

  enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE")
  if (!identical(enabled, "TRUE"))
    return(FALSE)

  # prepare download options -- saved options are restored via on.exit
  pat <- Sys.getenv("GITHUB_PAT")
  if (nzchar(Sys.which("curl")) && nzchar(pat)) {
    fmt <- "--location --fail --header \"Authorization: token %s\""
    extra <- sprintf(fmt, pat)
    saved <- options("download.file.method", "download.file.extra")
    options(download.file.method = "curl", download.file.extra = extra)
    on.exit(do.call(base::options, saved), add = TRUE)
  } else if (nzchar(Sys.which("wget")) && nzchar(pat)) {
    fmt <- "--header=\"Authorization: token %s\""
    extra <- sprintf(fmt, pat)
    saved <- options("download.file.method", "download.file.extra")
    options(download.file.method = "wget", download.file.extra = extra)
    on.exit(do.call(base::options, saved), add = TRUE)
  }

  url <- file.path("https://api.github.com/repos/rstudio/renv/tarball", version)
  name <- sprintf("renv_%s.tar.gz", version)
  destfile <- file.path(tempdir(), name)

  status <- tryCatch(
    renv_bootstrap_download_impl(url, destfile),
    condition = identity
  )

  if (!identical(status, 0L))
    return(FALSE)

  renv_bootstrap_download_augment(destfile)

  return(destfile)

}

# Add Sha to DESCRIPTION. This is stop gap until #890, after which we
# can use renv::install() to fully capture metadata.
renv_bootstrap_download_augment <- function(destfile) {
  sha <- renv_bootstrap_git_extract_sha1_tar(destfile)
  if (is.null(sha)) {
    return()
  }

  # Untar into a scratch directory, cleaned up on exit
  tempdir <- tempfile("renv-github-")
  on.exit(unlink(tempdir, recursive = TRUE), add = TRUE)
  untar(destfile, exdir = tempdir)
  pkgdir <- dir(tempdir, full.names = TRUE)[[1]]

  # Modify description: append Remote* metadata fields
  desc_path <- file.path(pkgdir, "DESCRIPTION")
  desc_lines <- readLines(desc_path)
  remotes_fields <- c(
    "RemoteType: github",
    "RemoteHost: api.github.com",
    "RemoteRepo: renv",
    "RemoteUsername: rstudio",
    "RemotePkgRef: rstudio/renv",
    paste("RemoteRef: ", sha),
    paste("RemoteSha: ", sha)
  )
  writeLines(c(desc_lines[desc_lines != ""], remotes_fields), con = desc_path)

  # Re-tar (working directory restored via on.exit inside local())
  local({
    old <- setwd(tempdir)
    on.exit(setwd(old), add = TRUE)

    tar(destfile, compression = "gzip")
  })
  invisible()
}

# Extract the commit hash from a git archive. Git archives include the SHA1
# hash as the comment field of the tarball pax extended header
# (see https://www.kernel.org/pub/software/scm/git/docs/git-archive.html)
# For GitHub archives this should be the first header after the default one
# (512 byte) header.
renv_bootstrap_git_extract_sha1_tar <- function(bundle) {

  # open the bundle for reading
  # We use gzcon for everything because (from ?gzcon)
  # > Reading from a connection which does not supply a 'gzip' magic
  # > header is equivalent to reading from the original connection
  conn <- gzcon(file(bundle, open = "rb", raw = TRUE))
  on.exit(close(conn))

  # The default pax header is 512 bytes long and the first pax extended header
  # with the comment should be 51 bytes long
  # `52 comment=` (11 chars) + 40 byte SHA1 hash
  len <- 0x200 + 0x33
  res <- rawToChar(readBin(conn, "raw", n = len)[0x201:len])

  if (grepl("^52 comment=", res)) {
    sub("52 comment=", "", res)
  } else {
    NULL
  }
}

# install the downloaded tarball into 'library'; on failure, surface the
# captured R CMD INSTALL output in the error message
renv_bootstrap_install <- function(version, tarball, library) {

  # attempt to install it into project library
  dir.create(library, showWarnings = FALSE, recursive = TRUE)
  output <- renv_bootstrap_install_impl(library, tarball)

  # check for successful install
  status <- attr(output, "status")
  if (is.null(status) || identical(status, 0L))
    return(status)

  # an error occurred; report it
  header <- "installation of renv failed"
  lines <- paste(rep.int("=", nchar(header)), collapse = "")
  text <- paste(c(header, lines, output), collapse = "\n")
  stop(text)

}

# run `R CMD INSTALL` on 'tarball' targeting 'library'
renv_bootstrap_install_impl <- function(library, tarball) {

  # invoke using system2 so we can capture and report output
  bin <- R.home("bin")
  exe <- if (Sys.info()[["sysname"]] == "Windows") "R.exe" else "R"
  R <- file.path(bin, exe)

  args <- c(
    "--vanilla", "CMD", "INSTALL", "--no-multiarch",
    "-l", shQuote(path.expand(library)),
    shQuote(path.expand(tarball))
  )

  system2(R, args, stdout = TRUE, stderr = TRUE)

}

# Compute the platform-specific library prefix used to segregate renv
# project libraries, e.g. "R-4.3/x86_64-pc-linux-gnu". Any user-supplied
# prefix (see renv_bootstrap_platform_prefix_impl) is prepended.
renv_bootstrap_platform_prefix <- function() {

  # "R-<major>.<minor>" version component
  full <- paste(R.version$major, R.version$minor, sep = ".")
  version_part <- paste("R", numeric_version(full)[1, 1:2], sep = "-")

  # development builds of R get the SVN revision appended, so that
  # platform-specific artefacts aren't shared with released versions of R
  is_devel <-
    identical(R.version[["status"]], "Under development (unstable)") ||
    identical(R.version[["nickname"]], "Unsuffered Consequences")

  if (is_devel)
    version_part <- paste(version_part, R.version[["svn rev"]], sep = "-r")

  parts <- c(version_part, R.version$platform)

  # honour a user-provided prefix, when one is configured
  user <- renv_bootstrap_platform_prefix_impl()
  if (!is.na(user) && nzchar(user))
    parts <- c(user, parts)

  paste(parts, collapse = "/")

}

# Resolve the user-configured path prefix: an explicit RENV_PATHS_PREFIX
# wins; otherwise RENV_PATHS_PREFIX_AUTO requests an OS-derived prefix.
# Returns "" when neither is configured.
renv_bootstrap_platform_prefix_impl <- function() {

  explicit <- Sys.getenv("RENV_PATHS_PREFIX", unset = NA)
  if (!is.na(explicit))
    return(explicit)

  wants_auto <- Sys.getenv("RENV_PATHS_PREFIX_AUTO", unset = NA)
  if (wants_auto %in% c("TRUE", "True", "true", "1"))
    return(renv_bootstrap_platform_prefix_auto())

  ""

}

# Derive an automatic prefix from the operating system; warns (but still
# returns the inferred value) when the OS cannot be determined.
renv_bootstrap_platform_prefix_auto <- function() {

  os <- tryCatch(renv_bootstrap_platform_os(), error = identity)

  if (inherits(os, "error") || os %in% "unknown")
    warning(paste(
      "failed to infer current operating system",
      "please file a bug report at https://github.com/rstudio/renv/issues",
      sep = "; "
    ))

  os

}

# infer a normalized operating-system identifier, e.g. "windows",
# "macos", or "linux-ubuntu-jammy" on Linux distributions
renv_bootstrap_platform_os <- function() {

  sysinfo <- Sys.info()
  sysname <- sysinfo[["sysname"]]

  # handle Windows + macOS up front
  if (sysname == "Windows")
    return("windows")
  else if (sysname == "Darwin")
    return("macos")

  # check for os-release files
  for (file in c("/etc/os-release", "/usr/lib/os-release"))
    if (file.exists(file))
      return(renv_bootstrap_platform_os_via_os_release(file, sysinfo))

  # check for redhat-release files
  if (file.exists("/etc/redhat-release"))
    return(renv_bootstrap_platform_os_via_redhat_release())

  "unknown"

}

# build "<os>-<id>-<version>" from the key/value pairs in an os-release file
renv_bootstrap_platform_os_via_os_release <- function(file, sysinfo) {

  # read /etc/os-release
  release <- utils::read.table(
    file = file,
    sep = "=",
    quote = c("\"", "'"),
    col.names = c("Key", "Value"),
    comment.char = "#",
    stringsAsFactors = FALSE
  )

  vars <- as.list(release$Value)
  names(vars) <- release$Key

  # get os name
  os <- tolower(sysinfo[["sysname"]])

  # read id
  id <- "unknown"
  for (field in c("ID", "ID_LIKE")) {
    if (field %in% names(vars) && nzchar(vars[[field]])) {
      id <- vars[[field]]
      break
    }
  }

  # read version
  version <- "unknown"
  for (field in c("UBUNTU_CODENAME", "VERSION_CODENAME", "VERSION_ID", "BUILD_ID")) {
    if (field %in% names(vars) && nzchar(vars[[field]])) {
      version <- vars[[field]]
      break
    }
  }

  # join together
  paste(c(os, id, version), collapse = "-")

}

# infer "linux-<id>-<version>" from /etc/redhat-release (RHEL/CentOS)
renv_bootstrap_platform_os_via_redhat_release <- function() {

  # read /etc/redhat-release
  contents <- readLines("/etc/redhat-release", warn = FALSE)

  # infer id
  id <- if (grepl("centos", contents, ignore.case = TRUE))
    "centos"
  else if (grepl("redhat", contents, ignore.case = TRUE))
    "redhat"
  else
    "unknown"

  # try to find a version component (very hacky)
  version <- "unknown"

  parts <- strsplit(contents, "[[:space:]]")[[1L]]
  for (part in parts) {

    nv <- tryCatch(numeric_version(part), error = identity)
    if (inherits(nv, "error"))
      next

    version <- nv[1, 1]
    break

  }

  paste(c("linux", id, version), collapse = "-")

}

# compute the directory name used for this project's library root
renv_bootstrap_library_root_name <- function(project) {

  # use project name as-is if requested
  # NOTE(review): 'asis' is a character string here; if() relies on R's
  # coercion of "TRUE"/"FALSE" strings to logical -- works, but fragile
  asis <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT_ASIS", unset = "FALSE")
  if (asis)
    return(basename(project))

  # otherwise, disambiguate based on project's path
  id <- substring(renv_bootstrap_hash_text(project), 1L, 8L)
  paste(basename(project), id, sep = "-")

}

# resolve the on-disk location of this project's renv library
renv_bootstrap_library_root <- function(project) {

  prefix <- renv_bootstrap_profile_prefix()

  path <- Sys.getenv("RENV_PATHS_LIBRARY", unset = NA)
  if (!is.na(path))
    return(paste(c(path, prefix), collapse = "/"))

  path <- renv_bootstrap_library_root_impl(project)
  if (!is.null(path)) {
    name <- renv_bootstrap_library_root_name(project)
    return(paste(c(path, prefix, name), collapse = "/"))
  }

  renv_bootstrap_paths_renv("library", project = project)

}

# resolve the library root directory, if one is configured; returns NULL
# (implicitly) when neither an env var nor a package-type project applies
renv_bootstrap_library_root_impl <- function(project) {

  root <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT", unset = NA)
  if (!is.na(root))
    return(root)

  type <- renv_bootstrap_project_type(project)
  if (identical(type, "package")) {
    userdir <- renv_bootstrap_user_dir()
    return(file.path(userdir, "library"))
  }

}

# check that the loaded renv matches the version this project requests;
# prints upgrade/downgrade instructions and returns FALSE on mismatch
renv_bootstrap_validate_version <- function(version, description = NULL) {

  # resolve description file
  description <- description %||% {
    path <- getNamespaceInfo("renv", "path")
    packageDescription("renv", lib.loc = dirname(path))
  }

  # check whether requested version 'version' matches loaded version of renv
  sha <- attr(version, "sha", exact = TRUE)
  valid <- if (!is.null(sha))
    renv_bootstrap_validate_version_dev(sha, description)
  else
    renv_bootstrap_validate_version_release(version, description)

  if (valid)
    return(TRUE)

  # the loaded version of renv doesn't match the requested version;
  # give the user instructions on how to proceed
  remote <- if (!is.null(description[["RemoteSha"]])) {
    paste("rstudio/renv", description[["RemoteSha"]], sep = "@")
  } else {
    paste("renv", description[["Version"]], sep = "@")
  }

  # display both loaded version + sha if available
  friendly <- renv_bootstrap_version_friendly(
    version = description[["Version"]],
    sha = description[["RemoteSha"]]
  )

  fmt <- paste(
    "renv %1$s was loaded from project library, but this project is configured to use renv %2$s.",
    "- Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile.",
    "- Use `renv::restore(packages = \"renv\")` to install renv %2$s into the project library.",
    sep = "\n"
  )
  catf(fmt, friendly, renv_bootstrap_version_friendly(version), remote)

  FALSE

}

# dev versions match when the installed RemoteSha starts with the requested sha
renv_bootstrap_validate_version_dev <- function(version, description) {
  expected <- description[["RemoteSha"]]
  is.character(expected) && startswith(expected, version)
}

# release versions must match the installed Version exactly
renv_bootstrap_validate_version_release <- function(version, description) {
  expected <- description[["Version"]]
  is.character(expected) && identical(expected, version)
}

# md5-hash arbitrary text by round-tripping it through a temp file
renv_bootstrap_hash_text <- function(text) {

  hashfile <- tempfile("renv-hash-")
  on.exit(unlink(hashfile), add = TRUE)

  writeLines(text, con = hashfile)
  tools::md5sum(hashfile)

}

# load renv from 'libpath' and activate the project; FALSE if unavailable
renv_bootstrap_load <- function(project, libpath, version) {

  # try to load renv from the project library
  if (!requireNamespace("renv", lib.loc = libpath, quietly = TRUE))
    return(FALSE)

  # warn if the version of renv loaded does not match
  renv_bootstrap_validate_version(version)

  # execute renv load hooks, if any
  hooks <- getHook("renv::autoload")
  for (hook in hooks)
    if (is.function(hook))
      tryCatch(hook(), error = warning)

  # load the project
  renv::load(project)

  TRUE

}

# resolve the active renv profile, reading renv/profile when RENV_PROFILE
# is unset; sets RENV_PROFILE as a side effect when a profile is found
renv_bootstrap_profile_load <- function(project) {

  # if RENV_PROFILE is already set, just use that
  profile <- Sys.getenv("RENV_PROFILE", unset = NA)
  if (!is.na(profile) && nzchar(profile))
    return(profile)

  # check for a profile file (nothing to do if it doesn't exist)
  path <- renv_bootstrap_paths_renv("profile", profile = FALSE, project = project)
  if (!file.exists(path))
    return(NULL)

  # read the profile, and set it if it exists
  contents <- readLines(path, warn = FALSE)
  if (length(contents) == 0L)
    return(NULL)

  # set RENV_PROFILE
  profile <- contents[[1L]]
  if (!profile %in% c("", "default"))
    Sys.setenv(RENV_PROFILE = profile)

  profile

}

# path fragment ("profiles/<name>/renv") for the active profile, or NULL
renv_bootstrap_profile_prefix <- function() {
  profile <- renv_bootstrap_profile_get()
  if (!is.null(profile))
    return(file.path("profiles", profile, "renv"))
}

# the active (normalized) profile name, or NULL for the default profile
renv_bootstrap_profile_get <- function() {
  profile <- Sys.getenv("RENV_PROFILE", unset = "")
  renv_bootstrap_profile_normalize(profile)
}

# set (or clear) RENV_PROFILE for the given profile name
renv_bootstrap_profile_set <- function(profile) {
  profile <- renv_bootstrap_profile_normalize(profile)
  if (is.null(profile))
    Sys.unsetenv("RENV_PROFILE")
  else
    Sys.setenv(RENV_PROFILE = profile)
}

# map "", "default", and NULL profile names to NULL; pass others through
renv_bootstrap_profile_normalize <- function(profile) {

  if (is.null(profile) || profile %in% c("", "default"))
    return(NULL)

  profile

}

# is 'path' absolute? (handles '~', POSIX '/', UNC '\\', and drive letters)
renv_bootstrap_path_absolute <- function(path) {

  substr(path, 1L, 1L) %in% c("~", "/", "\\") || (
    substr(path, 1L, 1L) %in% c(letters, LETTERS) &&
    substr(path, 2L, 3L) %in% c(":/", ":\\")
  )

}

# build a path under the project's renv directory, honouring
# RENV_PATHS_RENV and (optionally) the active profile prefix
renv_bootstrap_paths_renv <- function(..., profile = TRUE, project = NULL) {
  renv <- Sys.getenv("RENV_PATHS_RENV", unset = "renv")
  root <- if (renv_bootstrap_path_absolute(renv)) NULL else project
  prefix <- if (profile) renv_bootstrap_profile_prefix()
  components <- c(root, renv, prefix, ...)
  paste(components, collapse = "/")
}

# classify the project by reading its DESCRIPTION: the Type field if
# present, "package" when only Package is set, else "unknown"
renv_bootstrap_project_type <- function(path) {

  descpath <- file.path(path, "DESCRIPTION")
  if (!file.exists(descpath))
    return("unknown")

  desc <- tryCatch(
    read.dcf(descpath, all = TRUE),
    error = identity
  )

  if (inherits(desc, "error"))
    return("unknown")

  type <- desc$Type
  if (!is.null(type))
    return(tolower(type))

  package <- desc$Package
  if (!is.null(package))
    return("package")

  "unknown"

}

# renv's per-user cache directory, with forward slashes and '~' expanded
renv_bootstrap_user_dir <- function() {
  dir <- renv_bootstrap_user_dir_impl()
  path.expand(chartr("\\", "/", dir))
}

# resolve the user cache directory: option override, then R_user_dir(),
# then XDG-style env vars, then platform-specific defaults
renv_bootstrap_user_dir_impl <- function() {

  # use local override if set
  override <- getOption("renv.userdir.override")
  if (!is.null(override))
    return(override)

  # use R_user_dir if available
  tools <- asNamespace("tools")
  if (is.function(tools$R_user_dir))
    return(tools$R_user_dir("renv", "cache"))

  # try using our own backfill for older versions of R
  envvars <- c("R_USER_CACHE_DIR", "XDG_CACHE_HOME")
  for (envvar in envvars) {
    root <- Sys.getenv(envvar, unset = NA)
    if (!is.na(root))
      return(file.path(root, "R/renv"))
  }

  # use platform-specific default fallbacks
  if (Sys.info()[["sysname"]] == "Windows")
    file.path(Sys.getenv("LOCALAPPDATA"), "R/cache/R/renv")
  else if (Sys.info()[["sysname"]] == "Darwin")
    "~/Library/Caches/org.R-project.R/R/renv"
  else
    "~/.cache/R/renv"

}

# human-readable version string, with an abbreviated sha when available
renv_bootstrap_version_friendly <- function(version, sha = NULL) {
  sha <- sha %||% attr(version, "sha", exact = TRUE)
  parts <- c(version, sprintf("[sha: %s]", substring(sha, 1L, 7L)))
  paste(parts, collapse = " ")
}

# bootstrap renv into 'libpath', then attempt to load the project;
# warns (rather than errors) when renv still cannot be loaded
renv_bootstrap_run <- function(version, libpath) {

  # perform bootstrap
  bootstrap(version, libpath)

  # exit early if we're just testing bootstrap
  if (!is.na(Sys.getenv("RENV_BOOTSTRAP_INSTALL_ONLY", unset = NA)))
    return(TRUE)

  # try again to load
  if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) {
    return(renv::load(project = getwd()))
  }

  # failed to download or load renv; warn the user
  msg <- c(
    "Failed to find an renv installation: the project will not be loaded.",
    "Use `renv::activate()` to re-initialize the project."
  )

  warning(paste(msg, collapse = "\n"), call. = FALSE)

}


# TRUE when this R session was launched by the RStudio IDE
renv_bootstrap_in_rstudio <- function() {
  commandArgs()[[1]] == "RStudio"
}

# read a JSON file, preferring jsonlite when it is already loaded and
# falling back to the bundled minimal reader otherwise
# NOTE(review): catch() is not defined within this chunk; presumably it
# is defined earlier in this file -- confirm
renv_json_read <- function(file = NULL, text = NULL) {

  jlerr <- NULL

  # if jsonlite is loaded, use that instead
  if ("jsonlite" %in% loadedNamespaces()) {

    json <- catch(renv_json_read_jsonlite(file, text))
    if (!inherits(json, "error"))
      return(json)

    jlerr <- json

  }

  # otherwise, fall back to the default JSON reader
  json <- catch(renv_json_read_default(file, text))
  if (!inherits(json, "error"))
    return(json)

  # report an error
  if (!is.null(jlerr))
    stop(jlerr)
  else
    stop(json)

}

# parse JSON with jsonlite, keeping list structure (no simplification)
# NOTE(review): read() here is not base::readLines and is not defined in
# this chunk; presumably an earlier helper in this file -- confirm
renv_json_read_jsonlite <- function(file = NULL, text = NULL) {
  text <- paste(text %||% read(file), collapse = "\n")
  jsonlite::fromJSON(txt = text, simplifyVector = FALSE)
}

# minimal dependency-free JSON reader: rewrites the JSON text into an R
# expression (via string placeholders) and evaluates it in baseenv()
renv_json_read_default <- function(file = NULL, text = NULL) {

  # find strings in the JSON
  text <- paste(text %||% read(file), collapse = "\n")
  pattern <- '["](?:(?:\\\\.)|(?:[^"\\\\]))*?["]'
  locs <- gregexpr(pattern, text, perl = TRUE)[[1]]

  # if any are found, replace them with placeholders
  replaced <- text
  strings <- character()
  replacements <- character()

  if (!identical(c(locs), -1L)) {

    # get the string values
    starts <- locs
    ends <- locs + attr(locs, "match.length") - 1L
    strings <- substring(text, starts, ends)

    # only keep those requiring escaping
    strings <- grep("[[\\]{}:]", strings, perl = TRUE, value = TRUE)

    # compute replacements (\032 is an unlikely sentinel byte)
    replacements <- sprintf('"\032%i\032"', seq_along(strings))

    # replace the strings
    mapply(function(string, replacement) {
      replaced <<- sub(string, replacement, replaced, fixed = TRUE)
    }, strings, replacements)

  }

  # transform the JSON into something the R parser understands
  transformed <- replaced
  transformed <- gsub("{}", "`names<-`(list(), character())", transformed, fixed = TRUE)
  transformed <- gsub("[[{]", "list(", transformed, perl = TRUE)
  transformed <- gsub("[]}]", ")", transformed, perl = TRUE)
  transformed <- gsub(":", "=", transformed, fixed = TRUE)
  text <- paste(transformed, collapse = "\n")

  # parse it
  json <- parse(text = text, keep.source = FALSE, srcfile = NULL)[[1L]]

  # construct map between source strings, replaced strings
  map <- as.character(parse(text = strings))
  names(map) <- as.character(parse(text = replacements))

  # convert to list
  map <- as.list(map)

  # remap strings in object
  remapped <- renv_json_remap(json, map)

  # evaluate
  eval(remapped, envir = baseenv())

}

# recursively substitute placeholder strings (and true/false/null
# symbols) back into the parsed JSON expression
renv_json_remap <- function(json, map) {

  # fix names
  if (!is.null(names(json))) {
    lhs <- match(names(json), names(map), nomatch = 0L)
    rhs <- match(names(map), names(json), nomatch = 0L)
    names(json)[rhs] <- map[lhs]
  }

  # fix values
  if (is.character(json))
    return(map[[json]] %||% json)

  # handle true, false, null
  if (is.name(json)) {
    text <- as.character(json)
    if (text == "true")
      return(TRUE)
    else if (text == "false")
      return(FALSE)
    else if (text == "null")
      return(NULL)
  }

  # recurse
  if (is.recursive(json)) {
    for (i in seq_along(json)) {
      json[i] <- list(renv_json_remap(json[[i]], map))
    }
  }

  json

}

# load the renv profile, if any
renv_bootstrap_profile_load(project)

# construct path to library root
root <- renv_bootstrap_library_root(project)

# construct library prefix for platform (e.g. "R-4.3/<platform>")
prefix <- renv_bootstrap_platform_prefix()

# construct full libpath
libpath <- file.path(root, prefix)

# attempt to load renv from the project library; done if that works
if (renv_bootstrap_load(project, libpath, version))
  return(TRUE)

# otherwise, bootstrap renv -- deferred to session init under RStudio
if (renv_bootstrap_in_rstudio()) {
  setHook("rstudio.sessionInit", function(...) {
    renv_bootstrap_run(version, libpath)

    # Work around buglet in RStudio if hook uses readline
    tryCatch(
      {
        tools <- as.environment("tools:rstudio")
        tools$.rs.api.sendToConsole("", echo = FALSE, focus = FALSE)
      },
      error = function(cnd) {}
    )
  })
} else {
  renv_bootstrap_run(version, libpath)
}

invisible()

})
--------------------------------------------------------------------------------
/renv/settings.json:
--------------------------------------------------------------------------------
{
  "bioconductor.version": null,
  "external.libraries": [],
  "ignored.packages": [],
  "package.dependency.fields": [
    "Imports",
    "Depends",
    "LinkingTo"
  ],
  "ppm.enabled": null,
  "ppm.ignored.urls": [],
  "r.version": null,
  "snapshot.type": "implicit",
  "use.cache": true,
  "vcs.ignore.cellar": true,
  "vcs.ignore.library": true,
  "vcs.ignore.local": true,
  "vcs.manage.ignores": true
}
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
# This file is part of the standard setup for testthat.
# It is recommended that you do not modify it.
#
# Where should you do additional test configuration?
# Learn more about the roles of various files in:
# * https://r-pkgs.org/tests.html
# * https://testthat.r-lib.org/reference/test_package.html#special-files

library(testthat)
library(tidymodels.tutorials)

# run all tests under tests/testthat/ against the installed package
test_check("tidymodels.tutorials")
--------------------------------------------------------------------------------
/tests/testthat/test-tutorials.R:
--------------------------------------------------------------------------------
# For now, we will do all our tutorial testing in this one script. We need a
# listing of all the tutorials. I still worry that I don't really know which
# paths this is getting. I *think* it is not doing what we want, which is to get
# all the paths from this version of the package. Instead, it is getting the
# paths from the most recently installed version of the package. So, you really
# need to install before testing. If true, that is a hack!
#
# NOTE(review): return_tutorial_paths() resolves paths via the *installed*
# package, so run R CMD INSTALL (or devtools::install()) before testing.

tut_paths <- tutorial.helpers::return_tutorial_paths("tidymodels.tutorials")

# knit_tutorials() returns NULL invisibly on success, erroring otherwise
test_that("All tutorials can be knit without error", {
  expect_null(
    tutorial.helpers::knit_tutorials(tut_paths)
  )
})


# check_tutorial_defaults() verifies required boilerplate sections exist
test_that("All tutorials have the expected components", {
  expect_null(
    tutorial.helpers::check_tutorial_defaults(tut_paths)
  )
})