├── .Rbuildignore ├── .gitattributes ├── .github ├── .gitignore └── workflows │ ├── bookdown.yaml │ └── lock.yaml ├── .gitignore ├── 01-software-modeling.Rmd ├── 02-tidyverse.Rmd ├── 03-base-r.Rmd ├── 04-ames.Rmd ├── 05-data-spending.Rmd ├── 06-fitting-models.Rmd ├── 07-the-model-workflow.Rmd ├── 08-feature-engineering.Rmd ├── 09-judging-model-effectiveness.Rmd ├── 10-resampling.Rmd ├── 11-comparing-models.Rmd ├── 12-tuning-parameters.Rmd ├── 13-grid-search.Rmd ├── 14-iterative-search.Rmd ├── 15-workflow-sets.Rmd ├── 16-dimensionality-reduction.Rmd ├── 17-encoding-categorical-data.Rmd ├── 18-explaining-models-and-predictions.Rmd ├── 19-when-should-you-trust-predictions.Rmd ├── 20-ensemble-models.Rmd ├── 21-inferential-analysis.Rmd ├── DESCRIPTION ├── LICENSE.md ├── RData ├── Chicago_2020.RData ├── bears_at_home.RData ├── concrete_mixtures.RData ├── concrete_results.RData ├── dry_beans.RData ├── lm_fit.RData ├── mlp_times.RData ├── plm_resids.RData ├── post_intervals.RData ├── rda_fit.RData ├── resampling.RData ├── sa_history.RData ├── search_examples.RData ├── svm_bo_metrics.RData ├── svm_large.RData └── svm_sa_metrics.RData ├── README.Rmd ├── README.md ├── TMwR.Rproj ├── TMwR.bib ├── TMwR.css ├── _bookdown.yml ├── _common.R ├── _output.yaml ├── ames_snippets.R ├── bo_search.mp4 ├── chi.csv ├── code_of_conduct.md ├── contributing.md ├── contributors.csv ├── displaced.Rmd ├── extras ├── affy_plm_ggplot.R ├── ames_posterior_intervals.R ├── ames_sf.R ├── bo_3panel_plot.R ├── cells_svm_large.R ├── cells_svm_large.Rout ├── dry_beans.R ├── nonlinear_function.R ├── parallel_times │ ├── collect.R │ ├── everything_05_01_expensive.R │ ├── everything_05_01_expensive.Rout │ ├── everything_05_01_with.R │ ├── everything_05_01_with.Rout │ ├── everything_05_01_without.R │ ├── everything_05_01_without.Rout │ ├── everything_05_02_expensive.R │ ├── everything_05_02_expensive.Rout │ ├── everything_05_02_with.R │ ├── everything_05_02_with.Rout │ ├── everything_05_02_without.R │ ├── 
everything_05_02_without.Rout │ ├── everything_05_03_expensive.R │ ├── everything_05_03_expensive.Rout │ ├── everything_05_03_with.R │ ├── everything_05_03_with.Rout │ ├── everything_05_03_without.R │ ├── everything_05_03_without.Rout │ ├── everything_05_04_expensive.R │ ├── everything_05_04_expensive.Rout │ ├── everything_05_04_with.R │ ├── everything_05_04_with.Rout │ ├── everything_05_04_without.R │ ├── everything_05_04_without.Rout │ ├── everything_05_05_expensive.R │ ├── everything_05_05_expensive.Rout │ ├── everything_05_05_with.R │ ├── everything_05_05_with.Rout │ ├── everything_05_05_without.R │ ├── everything_05_05_without.Rout │ ├── everything_05_10_expensive.R │ ├── everything_05_10_expensive.Rout │ ├── everything_05_10_with.R │ ├── everything_05_10_with.Rout │ ├── everything_05_10_without.R │ ├── everything_05_10_without.Rout │ ├── everything_05_15_expensive.R │ ├── everything_05_15_expensive.Rout │ ├── everything_05_15_with.R │ ├── everything_05_15_with.Rout │ ├── everything_05_15_without.R │ ├── everything_05_15_without.Rout │ ├── everything_05_20_expensive.R │ ├── everything_05_20_expensive.Rout │ ├── everything_05_20_with.R │ ├── everything_05_20_with.Rout │ ├── everything_05_20_without.R │ ├── everything_05_20_without.Rout │ ├── everything_times.RData │ ├── resamples_05_01_expensive.R │ ├── resamples_05_01_expensive.Rout │ ├── resamples_05_01_with.R │ ├── resamples_05_01_with.Rout │ ├── resamples_05_01_without.R │ ├── resamples_05_01_without.Rout │ ├── resamples_05_02_expensive.R │ ├── resamples_05_02_expensive.Rout │ ├── resamples_05_02_with.R │ ├── resamples_05_02_with.Rout │ ├── resamples_05_02_without.R │ ├── resamples_05_02_without.Rout │ ├── resamples_05_03_expensive.R │ ├── resamples_05_03_expensive.Rout │ ├── resamples_05_03_with.R │ ├── resamples_05_03_with.Rout │ ├── resamples_05_03_without.R │ ├── resamples_05_03_without.Rout │ ├── resamples_05_04_expensive.R │ ├── resamples_05_04_expensive.Rout │ ├── resamples_05_04_with.R │ ├── 
resamples_05_04_with.Rout │ ├── resamples_05_04_without.R │ ├── resamples_05_04_without.Rout │ ├── resamples_05_05_expensive.R │ ├── resamples_05_05_expensive.Rout │ ├── resamples_05_05_with.R │ ├── resamples_05_05_with.Rout │ ├── resamples_05_05_without.R │ ├── resamples_05_05_without.Rout │ ├── resamples_times.RData │ ├── runs.sh │ ├── tune_iter_times_everything.R │ ├── tune_iter_times_resamples.R │ ├── xgb_10_2020_10_28_17_49_51.RData │ ├── xgb_10_2020_10_28_21_28_17.RData │ ├── xgb_10_2020_10_28_22_43_51.RData │ ├── xgb_15_2020_10_28_20_51_49.RData │ ├── xgb_15_2020_10_28_21_23_27.RData │ ├── xgb_15_2020_10_29_00_52_42.RData │ ├── xgb_1_2020_10_28_20_29_50.RData │ ├── xgb_1_2020_10_28_21_13_01.RData │ ├── xgb_1_2020_10_28_22_26_43.RData │ ├── xgb_1_2020_10_28_23_01_51.RData │ ├── xgb_1_2020_10_29_00_23_51.RData │ ├── xgb_1_2020_10_29_04_04_38.RData │ ├── xgb_20_2020_10_28_20_55_16.RData │ ├── xgb_20_2020_10_28_22_01_02.RData │ ├── xgb_20_2020_10_29_03_01_36.RData │ ├── xgb_2_2020_10_28_20_41_12.RData │ ├── xgb_2_2020_10_28_21_39_20.RData │ ├── xgb_2_2020_10_28_21_57_32.RData │ ├── xgb_2_2020_10_29_00_35_18.RData │ ├── xgb_2_2020_10_29_02_04_38.RData │ ├── xgb_2_2020_10_29_02_15_36.RData │ ├── xgb_3_2020_10_28_17_30_51.RData │ ├── xgb_3_2020_10_28_18_05_27.RData │ ├── xgb_3_2020_10_28_20_48_31.RData │ ├── xgb_3_2020_10_28_22_34_29.RData │ ├── xgb_3_2020_10_28_23_21_46.RData │ ├── xgb_3_2020_10_29_02_28_28.RData │ ├── xgb_4_2020_10_28_17_57_37.RData │ ├── xgb_4_2020_10_28_18_11_50.RData │ ├── xgb_4_2020_10_28_22_08_45.RData │ ├── xgb_4_2020_10_29_02_41_00.RData │ ├── xgb_4_2020_10_29_03_40_17.RData │ ├── xgb_4_2020_10_29_03_46_38.RData │ ├── xgb_5_2020_10_28_21_20_10.RData │ ├── xgb_5_2020_10_28_22_39_01.RData │ ├── xgb_5_2020_10_28_23_08_07.RData │ ├── xgb_5_2020_10_28_23_14_28.RData │ ├── xgb_5_2020_10_28_23_54_07.RData │ ├── xgb_5_2020_10_29_02_45_30.RData │ └── xgb_times.RData ├── sa_2d_plot.R ├── submodels │ ├── with_submodel_trick.R │ ├── 
with_submodel_trick.Rout │ ├── without_submodel_trick.R │ └── without_submodel_trick.Rout └── verify_results.R ├── figures ├── .DS_Store ├── introduction-cricket-plot-1.svg ├── introduction-descr-examples-1.pdf ├── introduction-descr-examples-1.png ├── introduction-interaction-plots-1.svg ├── introduction-modeling-process-1.pdf ├── introduction-modeling-process-1.svg ├── tidyverse-cricket-plot-1.pdf ├── tidyverse-cricket-plot-1.svg ├── tidyverse-interaction-plots-1.pdf └── tidyverse-interaction-plots-1.svg ├── images ├── cover.png ├── error.png ├── note.png ├── robot.png ├── rstudio.png ├── tip.png └── warning.png ├── index.Rmd ├── issue_template.md ├── latex_extras └── preamble.tex ├── pre-proc-table.Rmd ├── premade ├── addin.gif ├── ames.png ├── bad-workflow.pdf ├── bad-workflow.svg ├── bootstraps.pdf ├── bootstraps.svg ├── crawford.png ├── data-science-model.graffle ├── data-science-model.pdf ├── data-science-model.png ├── data-science-model.svg ├── dot_rr.png ├── exp_improve.gif ├── good-proper-workflows.graffle ├── mitchell.png ├── modeling-process.graffle ├── modeling-process.pdf ├── modeling-process.png ├── modeling-process.svg ├── morphology.png ├── morphology.svg ├── northridge.png ├── proper-workflow.pdf ├── proper-workflow.svg ├── recipes-process.graffle ├── recipes-process.pdf ├── recipes-process.svg ├── resampling-details.graffle ├── resampling.pdf ├── resampling.svg ├── roc_surface.png ├── rolling.pdf ├── rolling.svg ├── three-CV-iter.pdf ├── three-CV-iter.svg ├── three-CV.pdf ├── three-CV.svg ├── timberland.png ├── validation-alt.pdf ├── validation-alt.svg ├── validation.pdf └── validation.svg ├── race_results.mp4 ├── references.Rmd ├── sa_search.mp4 └── style.css /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README\.Rmd$ 4 | ^CODE_OF_CONDUCT\.md$ 5 | ^\.github$ 6 | ^LICENSE\.md$ 7 | 
-------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.Rmd linguist-detectable 2 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/bookdown.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | pull_request: 6 | branches: 7 | - main 8 | workflow_dispatch: 9 | 10 | name: bookdown 11 | 12 | env: 13 | isExtPR: ${{ github.event.pull_request.head.repo.fork == true }} 14 | 15 | jobs: 16 | build: 17 | runs-on: ubuntu-latest 18 | env: 19 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 20 | steps: 21 | - uses: actions/checkout@v2 22 | 23 | - uses: r-lib/actions/setup-pandoc@v2 24 | 25 | - name: Install system dependencies 26 | run: | 27 | sudo apt-get update -qq 28 | sudo apt-get install -y ffmpeg libavfilter-dev 29 | 30 | - uses: r-lib/actions/setup-r@v2 31 | with: 32 | use-public-rspm: true 33 | 34 | - uses: r-lib/actions/setup-r-dependencies@v2 35 | 36 | - name: Build site 37 | run: Rscript -e 'bookdown::render_book("index.Rmd", quiet = TRUE)' 38 | 39 | - name: Deploy to Netlify 40 | if: contains(env.isExtPR, 'false') 41 | id: netlify-deploy 42 | uses: nwtgck/actions-netlify@v1.1 43 | with: 44 | publish-dir: './_book' 45 | production-branch: main 46 | github-token: ${{ secrets.GITHUB_TOKEN }} 47 | deploy-message: 48 | 'Deploy from GHA: ${{ github.event.pull_request.title || github.event.head_commit.message }} (${{ github.sha }})' 49 | enable-pull-request-comment: false 50 | enable-commit-comment: false 51 | env: 52 | NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} 53 | NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }} 54 
| timeout-minutes: 1 55 | -------------------------------------------------------------------------------- /.github/workflows/lock.yaml: -------------------------------------------------------------------------------- 1 | name: 'Lock Threads' 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * *' 6 | 7 | jobs: 8 | lock: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: dessant/lock-threads@v2 12 | with: 13 | github-token: ${{ github.token }} 14 | issue-lock-inactive-days: '14' 15 | # issue-exclude-labels: '' 16 | # issue-lock-labels: 'outdated' 17 | issue-lock-comment: > 18 | This issue has been automatically locked. If you believe you have 19 | found a related problem, please file a new issue (with a reprex: 20 | <https://reprex.tidyverse.org>) and link to this issue. 21 | issue-lock-reason: '' 22 | pr-lock-inactive-days: '14' 23 | # pr-exclude-labels: 'wip' 24 | pr-lock-labels: '' 25 | pr-lock-comment: > 26 | This pull request has been automatically locked. If you believe you 27 | have found a related problem, please file a new issue (with a reprex: 28 | <https://reprex.tidyverse.org>) and link to this issue. 
29 | pr-lock-reason: '' 30 | # process-only: 'issues' 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | .DS_Store 6 | _book 7 | _main.* 8 | libs 9 | figures 10 | _bookdown_files 11 | figures/introduction-cricket-plot-1.svg 12 | figures/introduction-descr-examples-1.pdf 13 | figures/introduction-interaction-plots-1.svg 14 | figures/introduction-modeling-process-1.pdf 15 | figures/tidyverse-cricket-plot-1.pdf 16 | figures/tidyverse-cricket-plot-1.svg 17 | figures/tidyverse-interaction-plots-1.pdf 18 | figures/tidyverse-interaction-plots-1.svg 19 | extras/iowa_highway.shx 20 | extras/iowa_highway.shp 21 | files_for_print* 22 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: TMwR 2 | Title: Tidy Modeling with R. 
3 | Version: 1.0.0 4 | Authors@R: c( 5 | person("Max", "Kuhn", , "max@rstudio.com", role = c("aut", "cre"), 6 | comment = c(ORCID = "0000-0003-2402-136X")), 7 | person("Julia", "Silge", , "julia.silge@rstudio.com", role = "aut", 8 | comment = c(ORCID = "0000-0002-3671-836X")) 9 | ) 10 | License: CC BY-NC-SA 4.0 11 | URL: https://github.com/tidymodels/TMwR, https://www.tmwr.org/ 12 | Depends: 13 | R (>= 4.0.0) 14 | Imports: 15 | applicable, 16 | av, 17 | baguette, 18 | beans, 19 | bestNormalize, 20 | bookdown, 21 | broom, 22 | censored, 23 | corrplot, 24 | corrr, 25 | Cubist, 26 | DALEXtra, 27 | dials (>= 0.0.9), 28 | dimRed, 29 | discrim, 30 | doMC, 31 | dplyr, 32 | earth, 33 | embed (>= 0.1.5), 34 | fastICA, 35 | finetune (>= 0.1.1), 36 | forcats, 37 | ggforce, 38 | ggplot2, 39 | glmnet, 40 | gridExtra, 41 | infer, 42 | kableExtra (>= 1.2.1), 43 | kernlab, 44 | kknn, 45 | klaR, 46 | knitr, 47 | learntidymodels, 48 | lime, 49 | lme4, 50 | lubridate, 51 | mda, 52 | mixOmics, 53 | modeldata, 54 | multilevelmod, 55 | nlme (>= 3.1-157), 56 | nnet, 57 | parsnip, 58 | patchwork, 59 | pillar (>= 1.6.6), 60 | poissonreg, 61 | prettyunits, 62 | probably, 63 | pscl, 64 | purrr, 65 | ranger, 66 | recipes (>= 1.0.8), 67 | rlang, 68 | rmarkdown, 69 | rpart, 70 | rsample (>= 1.2.0), 71 | rstanarm, 72 | rules, 73 | sessioninfo, 74 | stacks (>= 0.2.1), 75 | stringr, 76 | svglite, 77 | text2vec, 78 | textrecipes, 79 | themis, 80 | tibble (>= 3.1.0), 81 | tidymodels (>= 1.1.0), 82 | tidyposterior (>= 0.0.3), 83 | tidyverse, 84 | tune (>= 0.1.3), 85 | uwot, 86 | workflows (>= 0.2.2), 87 | workflowsets (>= 0.0.1), 88 | xgboost, 89 | yardstick 90 | Remotes: 91 | tidymodels/learntidymodels 92 | biocViews: mixOmics 93 | Encoding: UTF-8 94 | SystemRequirements: FFmpeg (>= 3.2); with at least libx264 and lame (mp3) 95 | drivers. 
Debian/Ubuntu: libavfilter-dev, Fedora/CentOS: ffmpeg-devel 96 | (via https://rpmfusion.org), MacOS Homebrew: ffmpeg 97 | -------------------------------------------------------------------------------- /RData/Chicago_2020.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/Chicago_2020.RData -------------------------------------------------------------------------------- /RData/bears_at_home.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/bears_at_home.RData -------------------------------------------------------------------------------- /RData/concrete_mixtures.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/concrete_mixtures.RData -------------------------------------------------------------------------------- /RData/concrete_results.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/concrete_results.RData -------------------------------------------------------------------------------- /RData/dry_beans.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/dry_beans.RData -------------------------------------------------------------------------------- /RData/lm_fit.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/lm_fit.RData 
-------------------------------------------------------------------------------- /RData/mlp_times.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/mlp_times.RData -------------------------------------------------------------------------------- /RData/plm_resids.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/plm_resids.RData -------------------------------------------------------------------------------- /RData/post_intervals.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/post_intervals.RData -------------------------------------------------------------------------------- /RData/rda_fit.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/rda_fit.RData -------------------------------------------------------------------------------- /RData/resampling.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/resampling.RData -------------------------------------------------------------------------------- /RData/sa_history.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/sa_history.RData -------------------------------------------------------------------------------- /RData/search_examples.RData: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/search_examples.RData -------------------------------------------------------------------------------- /RData/svm_bo_metrics.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/svm_bo_metrics.RData -------------------------------------------------------------------------------- /RData/svm_large.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/svm_large.RData -------------------------------------------------------------------------------- /RData/svm_sa_metrics.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/svm_sa_metrics.RData -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | # TMwR 2 | 3 | [![Build Status](https://github.com/tidymodels/TMwR/workflows/bookdown/badge.svg)](https://github.com/tidymodels/TMwR/actions) 4 | 5 | ```{r, include = FALSE} 6 | knitr::opts_chunk$set( 7 | collapse = TRUE, 8 | comment = "#>", 9 | out.width = "100%" 10 | ) 11 | ``` 12 | 13 | 14 | This repository contains the source for [_Tidy Modeling with R_](https://tmwr.org). The purpose of this book is to demonstrate how the [tidyverse](https://www.tidyverse.org/) and [tidymodels](https://www.tidymodels.org/) can be used to produce high quality models. 15 | 16 | # Reproducing the book or results 17 | 18 | First, you'll need to install the required packages. 
To do this, first install the `remotes` package: 19 | 20 | ``` r 21 | install.packages("remotes") 22 | ``` 23 | 24 | Then use this to install what you need to create the book: 25 | 26 | ``` r 27 | remotes::install_github("tidymodels/TMwR") 28 | ``` 29 | 30 | Although we rigorously try to use the current CRAN versions of all packages, the code above may install some development versions. 31 | 32 | The content is created using the `bookdown` package. To compile the book, use: 33 | 34 | ```r 35 | bookdown::render_book("index.Rmd", "bookdown::gitbook") 36 | ``` 37 | 38 | This will create the HTML files in a directory called `_book`. Although we are in the process of publishing a print version of this work with O'Reilly, we do _not_ currently support building to a PDF version. 39 | 40 | 41 | # Contributing 42 | 43 | Please note that this work is written under a [Contributor Code of Conduct](code_of_conduct.md) and the online version is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-nc-sa/4.0/). By participating in this project (for example, by submitting an [issue](https://github.com/tidymodels/TMwR/issues) with suggestions or edits) you agree to abide by its terms. Instructions for making contributions can be found in the [`contributing.md`](contributing.md) file. 44 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TMwR 2 | 3 | [![Build Status](https://github.com/tidymodels/TMwR/workflows/bookdown/badge.svg)](https://github.com/tidymodels/TMwR/actions) 4 | 5 | 6 | 7 | 8 | This repository contains the source for [_Tidy Modeling with R_](https://tmwr.org). The purpose of this book is to demonstrate how the [tidyverse](https://www.tidyverse.org/) and [tidymodels](https://www.tidymodels.org/) can be used to produce high quality models. 
9 | 10 | # Reproducing the book or results 11 | 12 | First, you'll need to install the required packages. To do this, first install the `remotes` package: 13 | 14 | ``` r 15 | install.packages("remotes") 16 | ``` 17 | 18 | Then use this to install what you need to create the book: 19 | 20 | ``` r 21 | remotes::install_github("tidymodels/TMwR") 22 | ``` 23 | 24 | Although we rigorously try to use the current CRAN versions of all packages, the code above may install some development versions. 25 | 26 | The content is created using the `bookdown` package. To compile the book, use: 27 | 28 | ```r 29 | bookdown::render_book("index.Rmd", "bookdown::gitbook") 30 | ``` 31 | 32 | This will create the HTML files in a directory called `_book`. Although we are in the process of publishing a print version of this work with O'Reilly, we do _not_ currently support building to a PDF version. 33 | 34 | 35 | # Contributing 36 | 37 | Please note that this work is written under a [Contributor Code of Conduct](code_of_conduct.md) and the online version is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-nc-sa/4.0/). By participating in this project (for example, by submitting an [issue](https://github.com/tidymodels/TMwR/issues) with suggestions or edits) you agree to abide by its terms. Instructions for making contributions can be found in the [`contributing.md`](contributing.md) file. 
38 | -------------------------------------------------------------------------------- /TMwR.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Website 16 | -------------------------------------------------------------------------------- /_bookdown.yml: -------------------------------------------------------------------------------- 1 | new_session: yes 2 | 3 | rmd_files: [ 4 | "index.Rmd", 5 | 6 | "01-software-modeling.Rmd", 7 | "02-tidyverse.Rmd", 8 | "03-base-r.Rmd", 9 | 10 | "04-ames.Rmd", 11 | "05-data-spending.Rmd", 12 | "06-fitting-models.Rmd", 13 | "07-the-model-workflow.Rmd", 14 | "08-feature-engineering.Rmd", 15 | "09-judging-model-effectiveness.Rmd", 16 | 17 | "10-resampling.Rmd", 18 | "11-comparing-models.Rmd", 19 | "12-tuning-parameters.Rmd", 20 | "13-grid-search.Rmd", 21 | "14-iterative-search.Rmd", 22 | "15-workflow-sets.Rmd", 23 | 24 | "16-dimensionality-reduction.Rmd", 25 | "17-encoding-categorical-data.Rmd", 26 | "18-explaining-models-and-predictions.Rmd", 27 | "19-when-should-you-trust-predictions.Rmd", 28 | "20-ensemble-models.Rmd", 29 | "21-inferential-analysis.Rmd", 30 | 31 | "pre-proc-table.Rmd", 32 | "references.Rmd" 33 | ] 34 | 35 | before_chapter_script: "_common.R" 36 | -------------------------------------------------------------------------------- /_common.R: -------------------------------------------------------------------------------- 1 | options(digits = 4, width = 84) 2 | options(dplyr.print_min = 6, dplyr.print_max = 6) 3 | options(cli.width = 85) 4 | options(crayon.enabled = FALSE) 5 | 6 | knitr::opts_chunk$set( 7 | comment = "#>", 8 | collapse = TRUE, 9 | fig.align = 'center', 10 | tidy = FALSE 11 | ) 12 | 13 | 14 | 15 | 
theme_transparent <- function(...) { 16 | 17 | ret <- ggplot2::theme_bw(...) 18 | 19 | trans_rect <- ggplot2::element_rect(fill = "transparent", colour = NA) 20 | ret$panel.background <- trans_rect 21 | ret$plot.background <- trans_rect 22 | ret$legend.background <- trans_rect 23 | ret$legend.key <- trans_rect 24 | 25 | ret$legend.position <- "top" 26 | 27 | ret 28 | } 29 | 30 | library(ggplot2) 31 | theme_set(theme_transparent()) 32 | 33 | tmwr_version <- function() { 34 | dt <- Sys.Date() 35 | ver <- read.dcf("DESCRIPTION")[1, "Version"] 36 | paste0("Version ", ver, " (", dt, ")") 37 | } 38 | 39 | pkg <- function(x) { 40 | cl <- match.call() 41 | x <- as.character(cl$x) 42 | paste0('', x, '') 43 | } 44 | 45 | is_new_version <- function(x, path) { 46 | cl <- match.call() 47 | nm <- as.character(cl$x) 48 | if (!file.exists(path)) { 49 | return(TRUE) 50 | } 51 | load(path) 52 | prev <- get(nm) 53 | 54 | # parsnip model fits have an elapsed time and this will change from run-to-run. 55 | # We'll remove that to check for a new version. Same for workflows. 56 | if (inherits(prev, "model_fit")) { 57 | x$elapsed <- NA 58 | prev$elapsed <- NA 59 | } 60 | if (workflows:::is_workflow(prev)) { 61 | x$fit$fit$elapsed <- NA 62 | prev$fit$fit$elapsed <- NA 63 | } 64 | 65 | res <- all.equal(x, prev) 66 | !isTRUE(res) 67 | } 68 | 69 | -------------------------------------------------------------------------------- /_output.yaml: -------------------------------------------------------------------------------- 1 | bookdown::gitbook: 2 | css: [style.css, TMwR.css] 3 | dev: png 4 | config: 5 | toc: 6 | collapse: section 7 | before: | 8 |
  • Tidy Modeling with R
  • 9 | edit: 10 | link: https://github.com/tidymodels/TMwR/edit/main/%s 11 | text: "Edit" 12 | fontsettings: null 13 | sharing: no 14 | 15 | bookdown::pdf_book: 16 | latex_engine: pdflatex 17 | citation_package: natbib 18 | includes: 19 | in_header: latex_extras/preamble.tex 20 | keep_tex: yes 21 | highlight: tango 22 | 23 | -------------------------------------------------------------------------------- /ames_snippets.R: -------------------------------------------------------------------------------- 1 | # Any changes to this code should trigger changes to the end-of-chapter summary 2 | # sections (that include these in code chunks) 3 | 4 | library(tidymodels) 5 | data(ames) 6 | ames <- mutate(ames, Sale_Price = log10(Sale_Price)) 7 | 8 | set.seed(502) 9 | ames_split <- initial_split(ames, prop = 0.80, strata = Sale_Price) 10 | ames_train <- training(ames_split) 11 | ames_test <- testing(ames_split) 12 | 13 | ames_rec <- 14 | recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 15 | Latitude + Longitude, data = ames_train) %>% 16 | step_log(Gr_Liv_Area, base = 10) %>% 17 | step_other(Neighborhood, threshold = 0.01) %>% 18 | step_dummy(all_nominal_predictors()) %>% 19 | step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) %>% 20 | step_ns(Latitude, Longitude, deg_free = 20) 21 | 22 | lm_model <- linear_reg() %>% set_engine("lm") 23 | 24 | lm_wflow <- 25 | workflow() %>% 26 | add_model(lm_model) %>% 27 | add_recipe(ames_rec) 28 | 29 | # cached in RData/lm_fit.RData 30 | # lm_fit <- fit(lm_wflow, ames_train) 31 | 32 | rf_model <- 33 | rand_forest(trees = 1000) %>% 34 | set_engine("ranger") %>% 35 | set_mode("regression") 36 | 37 | rf_wflow <- 38 | workflow() %>% 39 | add_formula( 40 | Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 41 | Latitude + Longitude) %>% 42 | add_model(rf_model) 43 | 44 | set.seed(1001) 45 | ames_folds <- vfold_cv(ames_train, v = 10) 46 | 47 | # cached in RData/resampling.RData from Ch 10 48 | # 
rf_res <- rf_wflow %>% fit_resamples(resamples = ames_folds, control = keep_pred) 49 | -------------------------------------------------------------------------------- /bo_search.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/bo_search.mp4 -------------------------------------------------------------------------------- /contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Thank you for your interest in contributing. This file contains what you need to know to help. 4 | 5 | - For __questions and discussions__ about tidymodels packages, modeling, and machine learning, please [post on RStudio Community](https://rstd.io/tidymodels-community). 6 | 7 | If you have a __contribution__, please fork the repo and make a pull request (PR). If these terms are unfamiliar to you, take a look at [_Happy Git and GitHub for the useR_](https://happygitwithr.com/). It might be helpful to start a GitHub issue to discuss your proposed changes before putting a lot of effort into them. 8 | 9 | If you make significant changes, include the phrase "I assign the copyright of this contribution to the authors listed in the `DESCRIPTION` file". 10 | 11 | __If you find a bug__, please make an issue or pull request. Since all of the data and code are available, we will require a minimal reprex (reproducible example). The goal of a reprex is to make it as easy as possible for us to recreate your problem so that we can fix it. If you've never heard of a reprex before, start by reading "[What is a reprex](https://github.com/tidyverse/reprex#what-is-a-reprex)", and follow the advice further down that page. 12 | 13 | ## Formatting 14 | 15 | * All code chunks have labels that are concise but descriptive. They should also make good figure names. Look at each chapter's chunk names; we keep a common prefix for each chapter. 
Names should use `-` to space words. 16 | 17 | * Figures should have transparent backgrounds and legends (if any) on top. 18 | 19 | * Please use US spellings (e.g. "color" instead of "colour"). 20 | 21 | * Do not break lines within sentences or paragraphs. 22 | 23 | * Adhere as best as possible to the [`tidyverse` style guide](https://style.tidyverse.org/). 24 | 25 | * Please avoid adding new package dependencies. If that can't be avoided, add them to the DESCRIPTION file. 26 | 27 | -------------------------------------------------------------------------------- /contributors.csv: -------------------------------------------------------------------------------- 1 | login,n,name,blog 2 | arisp99,1,NA,NA 3 | bradisbrad,1,Brad Hill,www.bradisblogging.com 4 | bryceroney,1,Bryce Roney,NA 5 | cedricbatailler,1,Cedric Batailler,cedricbatailler.me 6 | czeildi,1,Ildikó Czeller,https://ildiczeller.com/ 7 | davidkane9,1,David Kane,www.davidkane.info 8 | DavZim,1,NA,https://davzim.github.io/ 9 | DCharIAA,2,NA,NA 10 | dcossyleon,5,Desirée De Leon,https://tinystats.github.io/teacups-giraffes-and-statistics/ 11 | EmilHvitfeldt,3,Emil Hvitfeldt,https://www.emilhvitfeldt.com/ 12 | emilopezcano,2,Emilio,http://emilio.lcano.com 13 | Fgazzelloni,1,Fgazzelloni,https://www.linkedin.com/in/fgazzelloni/ 14 | hfrick,5,Hannah Frick,http://www.frick.ws 15 | hlynurhallgrims,2,Hlynur,NA 16 | howardbaek,3,Howard Baek,http://insidethetv.rbind.io/ 17 | jaeyk,1,Jae Yeon Kim,https://jaeyk.github.io/ 18 | jdtrat,1,Jonathan D. Trattner,https://www.jdtrat.com 19 | jmgirard,1,Jeffrey Girard,https://www.jmgirard.com 20 | JohnPickering,1,John W Pickering,NA 21 | jonthegeek,10,Jon Harmon,http://jonthegeek.com 22 | joseph-rickert,2,Joseph B. 
Rickert,http://www.rstudio.com 23 | juliasilge,238,Julia Silge,https://juliasilge.com 24 | maxdrohde,2,Maximilian Rohde,maximilianrohde.com 25 | michaelgrund,1,Michael Grund, 26 | MikeJohnPage,1,NA,NA 27 | mine-cetinkaya-rundel,1,Mine Cetinkaya-Rundel,http://mine-cr.com 28 | mmhamdy,1,Mohammed Hamdy,NA 29 | nattalides,1,NA,NA 30 | PursuitOfDataScience,1,Y. Yu,https://youzhi.netlify.app/ 31 | riazhedayati,1,Riaz Hedayati,NA 32 | RobWiederstein,1,Rob Wiederstein,www.robwiederstein.org 33 | scottyd22,2,Scott,datascott.com 34 | simonschoe,1,Simon Schölzel,NA 35 | tagasimon,1,Simon Sayz,https://simonsayz.xyz 36 | thrkng,2,NA,NA 37 | tmstauss,4,Tanner Stauss,www.linkedin.com/in/tanner-stauss 38 | tonyelhabr,1,Tony ElHabr,https://tonyelhabr.rbind.io/ 39 | topepo,389,Max Kuhn,NA 40 | x1o,3,Dmitry Zotikov,NA 41 | xiaochi-liu,3,Xiaochi,xiaochi.rbind.io 42 | zachbogart,1,Zach Bogart,zachbogart.com 43 | -------------------------------------------------------------------------------- /extras/affy_plm_ggplot.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(affyPLM) 3 | library(AmpAffyExample) 4 | 5 | # ------------------------------------------------------------------------------ 6 | 7 | data(AmpData) 8 | sampleNames(AmpData) <- c("N1","Good Quality","Poor Quality","A1","A2","A3") 9 | 10 | Pset1 <- fitPLM(AmpData) 11 | 12 | # ------------------------------------------------------------------------------ 13 | 14 | # Taken from the image method for PLMset objects 15 | pm.index <- unlist(affy::indexProbes(Pset1, "pm", row.names(coefs(Pset1)))) 16 | rows <- Pset1@nrow 17 | cols <- Pset1@ncol 18 | pm.x.locs <- pm.index %% rows 19 | pm.x.locs[pm.x.locs == 0] <- rows 20 | pm.y.locs <- pm.index %/% rows + 1 21 | 22 | # ------------------------------------------------------------------------------ 23 | 24 | plm_resids <- 25 | tibble::as_tibble(Pset1@residuals$PM.resid) %>% 26 | mutate( 27 | probe = 
rownames(Pset1@residuals$PM.resid), 28 | x = pm.x.locs, 29 | y = pm.y.locs 30 | ) %>% 31 | pivot_longer(cols = c(1:6), names_to = "Sample", values_to = "Intensity") %>% 32 | dplyr::filter(Sample %in% c("Good Quality", "Poor Quality")) 33 | 34 | # ------------------------------------------------------------------------------ 35 | 36 | save(plm_resids, file = "RData/plm_resids.RData") 37 | 38 | -------------------------------------------------------------------------------- /extras/ames_posterior_intervals.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(doMC) 3 | library(tidyposterior) 4 | library(workflowsets) 5 | library(rstanarm) 6 | theme_set(theme_bw()) 7 | 8 | data(ames, package = "modeldata") 9 | 10 | ames <- mutate(ames, Sale_Price = log10(Sale_Price)) 11 | 12 | set.seed(123) 13 | ames_split <- initial_split(ames, prop = 0.80, strata = Sale_Price) 14 | ames_train <- training(ames_split) 15 | ames_test <- testing(ames_split) 16 | 17 | crs <- parallel::detectCores() 18 | 19 | registerDoMC(cores = crs) 20 | 21 | ## ----------------------------------------------------------------------------- 22 | 23 | set.seed(55) 24 | ames_folds <- vfold_cv(ames_train, v = 10, repeats = 10) 25 | 26 | lm_model <- linear_reg() %>% set_engine("lm") 27 | 28 | rf_model <- 29 | rand_forest(trees = 1000) %>% 30 | set_engine("ranger") %>% 31 | set_mode("regression") 32 | 33 | # ------------------------------------------------------------------------------ 34 | 35 | basic_rec <- 36 | recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 37 | Latitude + Longitude, data = ames_train) %>% 38 | step_log(Gr_Liv_Area, base = 10) %>% 39 | step_other(Neighborhood, threshold = 0.01) %>% 40 | step_dummy(all_nominal_predictors()) 41 | 42 | interaction_rec <- 43 | basic_rec %>% 44 | step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) 45 | 46 | spline_rec <- 47 | interaction_rec %>% 48 | 
step_ns(Latitude, Longitude, deg_free = 50) 49 | 50 | preproc <- 51 | list(basic = basic_rec, 52 | interact = interaction_rec, 53 | splines = spline_rec, 54 | formula = Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + 55 | Bldg_Type + Latitude + Longitude 56 | ) 57 | 58 | models <- list(lm = lm_model, lm = lm_model, lm = lm_model, rf = rf_model) 59 | 60 | four_models <- 61 | workflow_set(preproc, models, cross = FALSE) 62 | four_models 63 | 64 | posteriors <- NULL 65 | 66 | for(i in 11:100) { 67 | if (i %% 10 == 0) cat(i, "... ") 68 | 69 | tmp_rset <- rsample:::df_reconstruct(ames_folds %>% slice(1:i), ames_folds) 70 | 71 | four_resamples <- 72 | four_models %>% 73 | workflow_map("fit_resamples", seed = 1, resamples = tmp_rset) 74 | 75 | ## ----------------------------------------------------------------------------- 76 | 77 | rsq_anova <- 78 | perf_mod( 79 | four_resamples, 80 | prior_intercept = student_t(df = 1), 81 | chains = max(1, crs - 2, na.rm = TRUE), # leave two cores free, but never ask for fewer than one chain 82 | iter = 5000, 83 | seed = 2, 84 | cores = max(1, crs - 2, na.rm = TRUE), # same guard: small machines (or an NA core count) still get one worker 85 | refresh = 0 86 | ) 87 | 88 | rqs_diff <- 89 | contrast_models(rsq_anova, 90 | list_1 = "splines_lm", 91 | list_2 = "basic_lm", 92 | seed = 3) %>% 93 | as_tibble() %>% 94 | mutate(label = paste(format(1:100)[i], "resamples"), resamples = i) 95 | 96 | posteriors <- bind_rows(posteriors, rqs_diff) 97 | 98 | rm(rqs_diff) 99 | 100 | } 101 | 102 | ## ----------------------------------------------------------------------------- 103 | 104 | # ggplot(posteriors, aes(x = difference)) + 105 | # geom_histogram(bins = 30) + 106 | # facet_wrap(~label) 107 | # 108 | # ggplot(posteriors, aes(x = difference)) + 109 | # geom_line(stat = "density", trim = FALSE) + 110 | # facet_wrap(~label) 111 | 112 | intervals <- 113 | posteriors %>% 114 | group_by(resamples) %>% 115 | summarize( 116 | mean = mean(difference), 117 | lower = quantile(difference, probs = 0.05), 118 | upper = quantile(difference, probs = 0.95), 119 | .groups = "drop" 120 | ) %>% 121 | ungroup() %>% 122 | mutate( 123 | 
mean = predict(loess(mean ~ resamples, span = .15)), 124 | lower = predict(loess(lower ~ resamples, span = .15)), 125 | upper = predict(loess(upper ~ resamples, span = .15)) 126 | ) 127 | 128 | save(intervals, file = "RData/post_intervals.RData") 129 | 130 | # ggplot(intervals, 131 | # aes(x = resamples, y = mean)) + 132 | # geom_path() + 133 | # geom_ribbon(aes(ymin = lower, ymax = upper), fill = "red", alpha = .1) + 134 | # labs(y = expression(paste("Mean difference in ", R^2)), 135 | # x = "Number of Resamples (repeated 10-fold cross-validation)") 136 | # 137 | 138 | -------------------------------------------------------------------------------- /extras/cells_svm_large.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(rayshader) 3 | library(doMC) 4 | registerDoMC(cores = parallel::detectCores(logical = TRUE)) 5 | 6 | ## ----------------------------------------------------------------------------- 7 | 8 | data(cells) 9 | cells <- cells %>% select(-case) 10 | set.seed(33) 11 | cell_folds <- vfold_cv(cells) 12 | roc_res <- metric_set(roc_auc) 13 | 14 | ## ----------------------------------------------------------------------------- 15 | 16 | svm_rec <- 17 | recipe(class ~ ., data = cells) %>% 18 | step_YeoJohnson(all_numeric_predictors()) %>% 19 | step_normalize(all_numeric_predictors()) 20 | 21 | svm_spec <- 22 | svm_rbf(cost = tune(), rbf_sigma = tune()) %>% 23 | set_engine("kernlab") %>% 24 | set_mode("classification") 25 | 26 | svm_wflow <- 27 | workflow() %>% 28 | add_model(svm_spec) %>% 29 | add_recipe(svm_rec) 30 | 31 | svm_param <- 32 | svm_wflow %>% 33 | parameters() %>% 34 | update( 35 | cost = cost(c(-10, 5)), 36 | rbf_sigma = rbf_sigma(c(-7, -1)) 37 | ) 38 | 39 | ## ----------------------------------------------------------------------------- 40 | 41 | large_grid <- grid_regular(svm_param, levels = 50) 42 | 43 | set.seed(2) 44 | svm_large <- 45 | svm_wflow %>% 46 | 
tune_grid(resamples = cell_folds, grid = large_grid, metrics = roc_res) 47 | 48 | ## ----------------------------------------------------------------------------- 49 | 50 | if (interactive()) { 51 | 52 | svm_roc <- 53 | svm_large %>% 54 | collect_metrics() 55 | 56 | large_plot <- 57 | svm_roc %>% 58 | ggplot(aes(x = rbf_sigma, y = cost)) + 59 | geom_raster(aes(fill = mean)) + 60 | geom_point(data = top_n(svm_roc, 1, mean)) + 61 | scale_x_log10() + 62 | scale_y_continuous(trans = "log2") + 63 | scale_fill_distiller(palette = "Blues") + 64 | theme_minimal() + 65 | theme( 66 | legend.position = "bottom", 67 | legend.key.width = grid::unit(2, "cm"), 68 | plot.title = element_text(hjust = 0.5) 69 | ) + 70 | guides(title.position = "bottom") + # NOTE(review): guides() expects aesthetic = guide pairs, so this looks like a no-op; confirm whether guide_colorbar(title.position = "bottom") on `fill` was intended 71 | labs(x = "rbf_sigma\n\n\n\n", title = "ROC AUC surface") + 72 | coord_fixed(ratio = 1/2.5) 73 | 74 | ragg::agg_png("roc_surface.png", height = 4 * 480, width = 4 * 480, res = 72 * 3, scaling = 1) # agg_png() comes from ragg, which this script never attaches 75 | print(large_plot) 76 | dev.off() 77 | 78 | 79 | 80 | plot_gg( 81 | large_plot, 82 | multicore = FALSE, 83 | raytrace = TRUE, 84 | width = 7, 85 | height = 7, 86 | scale = 300, 87 | windowsize = c(1400, 1400), 88 | zoom = 1, 89 | phi = 30, 90 | theta = 30 91 | ) 92 | 93 | } 94 | 95 | ## ----------------------------------------------------------------------------- 96 | 97 | sessioninfo::session_info() 98 | 99 | ## ----------------------------------------------------------------------------- 100 | 101 | save(svm_large, file = "../RData/svm_large.RData") 102 | -------------------------------------------------------------------------------- /extras/dry_beans.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(RWeka) 3 | library(janitor) 4 | 5 | dry_beans <- 6 | read.arff(url("https://www.muratkoklu.com/datasets/vtdhnd02.php")) %>% 7 | dplyr::rename(AspectRatio = AspectRation) %>% 8 | clean_names() %>% 9 | as_tibble() %>% 10 | mutate(class = tolower(as.character(class)), 11 
| class = factor(class)) 12 | 13 | names(dry_beans) <- gsub("([1-4]$)", "_\\1", names(dry_beans), perl = TRUE) 14 | 15 | save(dry_beans, file = "RData/dry_beans.RData", compress = "xz", version = 2) 16 | 17 | -------------------------------------------------------------------------------- /extras/nonlinear_function.R: -------------------------------------------------------------------------------- 1 | nonlin_function <- function(x, error = TRUE) { 2 | # use the ames spline curve for Longitude just because I think that it's 3 | # cool 4 | data(ames, package = "modeldata") 5 | rec <- 6 | recipe(Sale_Price ~ Longitude, data = ames) %>% 7 | step_log(Sale_Price, skip = TRUE) %>% 8 | step_range(Longitude) %>% 9 | prep() 10 | 11 | 12 | # use the ames longitude pattern since I like it 13 | f <- lm(log10(Sale_Price) ~ splines::ns(Longitude, df = 12), data = juice(rec)) 14 | p <- predict(f, newdata = data.frame(Longitude = x), se.fit = TRUE) 15 | err <- p$se.fit 16 | if (!error) { 17 | err <- 0 18 | } 19 | res <- rnorm(1, mean = p$fit, sd = err) 20 | # convert to a R^2-like value 21 | res <- (8 * res)/10 22 | res <- max(res, 0) 23 | res <- min(res, 1) 24 | res 25 | } 26 | -------------------------------------------------------------------------------- /extras/parallel_times/collect.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(lubridate) 3 | 4 | ## ----------------------------------------------------------------------------- 5 | 6 | get_date <- function(x) { 7 | x <- basename(x) 8 | x <- strsplit(x, "_") 9 | x <- map(x, ~ .x[3:8]) 10 | x <- map(x, ~ gsub("\\.RData", "", .x)) 11 | x <- map_chr(x, paste0, collapse = "-") 12 | ymd_hms(x) 13 | } 14 | 15 | get_times <- function(x) { 16 | load(x) 17 | res <- 18 | times %>% 19 | mutate(date = get_date(x)) 20 | 21 | res 22 | } 23 | 24 | ## ----------------------------------------------------------------------------- 25 | 26 | rdata <- 27 | list.files(path = 
"extras/parallel_times/", 28 | pattern = "\\.RData", 29 | full.names = TRUE) 30 | rdata <- rdata[!grepl("xgb_times", rdata)] 31 | rdata <- rdata[!grepl("logging_data", rdata)] 32 | 33 | all_times <- map_dfr(rdata, get_times) 34 | 35 | seq <- 36 | all_times %>% 37 | filter(num_cores == 1) %>% 38 | dplyr::rename(seq_time = elapsed) %>% 39 | select(-num_cores, -date) 40 | 41 | times <- 42 | full_join(all_times, seq, 43 | by = c("num_resamples", "num_grid", "preproc", "par_method")) %>% 44 | mutate( 45 | time_per_fit = elapsed/(num_grid * num_resamples), 46 | speed_up = seq_time/elapsed, 47 | preprocessing = gsub(" preprocessing", "", preproc), 48 | preprocessing = ifelse(preprocessing == "no", "none", preprocessing), 49 | preprocessing = factor(preprocessing, levels = c("none", "light", "expensive")), 50 | parallel_over = par_method 51 | ) 52 | 53 | if (interactive()) { 54 | 55 | 56 | ggplot(times, aes(x = num_cores, y = elapsed, col = parallel_over, shape = parallel_over)) + 57 | geom_point() + 58 | geom_line() + 59 | facet_wrap(~ preprocessing) + 60 | labs(x = "Number of Workers", y = "Execution Time (s)") + 61 | scale_y_log10() + 62 | theme_bw() + 63 | theme(legend.position = "top") 64 | 65 | times %>% 66 | filter(preprocessing == "none") %>% 67 | ggplot(aes(x = num_cores, y = speed_up, col = preprocessing, shape = preprocessing)) + 68 | geom_abline(lty = 1) + 69 | geom_point() + 70 | geom_line() + 71 | facet_wrap(~ par_method) + 72 | coord_obs_pred() + 73 | labs(x = "Number of Workers", y = "Speed-up", 74 | title = "5 resamples, 10 grid points") + 75 | theme_bw() + 76 | theme(legend.position = "top") 77 | 78 | times %>% 79 | filter(preprocessing != "expensive") %>% 80 | ggplot(aes(x = num_cores, y = speed_up, col = preprocessing, shape = preprocessing)) + 81 | geom_abline(lty = 1) + 82 | geom_point() + 83 | geom_line() + 84 | facet_wrap(~ par_method) + 85 | coord_obs_pred() + 86 | labs(x = "Number of Workers", y = "Speed-up", 87 | title = "5 resamples, 10 grid 
points") + 88 | theme_bw() + 89 | theme(legend.position = "top") 90 | 91 | 92 | ggplot(times, aes(x = num_cores, y = speed_up, col = parallel_over, shape = parallel_over)) + 93 | geom_abline(lty = 1) + 94 | geom_point() + 95 | geom_line() + 96 | facet_wrap(~ preprocessing) + 97 | coord_obs_pred() + 98 | labs(x = "Number of Workers", y = "Speed-up", 99 | title = "5 resamples, 10 grid points") + 100 | theme_bw() + 101 | theme(legend.position = "top") 102 | 103 | } 104 | 105 | save(times, file = "extras/parallel_times/xgb_times.RData") 106 | 107 | # r_files <- list.files(path = ".", pattern = "R$") 108 | # r_files <- r_files[r_files != "collect.R"] 109 | # r_files <- r_files[r_files != "template.R"] 110 | # r_files <- paste0("R CMD BATCH --vanilla ", r_files, "\nsleep 20\n") 111 | # cat(sample(r_files), sep = "") 112 | 113 | if (!interactive()) q("no") # quit only in batch runs, like the timing scripts, so an interactive session keeps its plots 114 | 115 | -------------------------------------------------------------------------------- /extras/parallel_times/everything_05_01_expensive.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | library(embed) 8 | library(rstanarm) 9 | 10 | ## ----------------------------------------------------------------------------- 11 | 12 | num_resamples <- 5 13 | num_grid <- 10 14 | num_cores <- 1 15 | preproc <- "expensive preprocessing" 16 | par_method <- "everything" 17 | 18 | ## ----------------------------------------------------------------------------- 19 | 20 | set.seed(123) 21 | 22 | flight_data <- 23 | flights %>% 24 | mutate( 25 | # Convert the arrival delay to a factor 26 | arr_delay = ifelse(arr_delay >= 30, "late", "on_time"), 27 | arr_delay = factor(arr_delay), 28 | # We will use the date (not date-time) in the recipe below 29 | date = as.Date(time_hour) 30 | ) %>% 31 | # Include the weather data 32 | inner_join(weather, by = c("origin", "time_hour")) %>% 33 | # Only retain 
the specific columns we will use 34 | select(dep_time, flight, origin, dest, air_time, distance, 35 | carrier, date, arr_delay, time_hour) %>% 36 | # Exclude missing data 37 | na.omit() %>% 38 | # For creating models, it is better to have qualitative columns 39 | # encoded as factors (instead of character strings) 40 | mutate_if(is.character, as.factor) %>% 41 | sample_n(4000) 42 | 43 | ## ----------------------------------------------------------------------------- 44 | 45 | flights_rec <- 46 | recipe(arr_delay ~ ., data = flight_data) %>% 47 | update_role(flight, time_hour, new_role = "ID") %>% 48 | step_date(date, features = c("dow", "month")) %>% 49 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 50 | step_rm(date) %>% 51 | step_mutate(flight = as.factor(flight)) %>% 52 | step_lencode_bayes(flight, outcome = vars(arr_delay)) %>% 53 | step_dummy(all_nominal_predictors()) %>% 54 | step_zv(all_predictors()) 55 | 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 
90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/everything_05_01_with.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | 8 | ## ----------------------------------------------------------------------------- 9 | 10 | num_resamples <- 5 11 | num_grid <- 10 12 | num_cores <- 1 13 | preproc <- "light preprocessing" 14 | par_method <- "everything" 15 | 16 | ## ----------------------------------------------------------------------------- 17 | 18 | set.seed(123) 19 | 20 | flight_data <- 21 | flights %>% 22 | mutate( 23 | # Convert the arrival delay to a factor 24 | arr_delay = ifelse(arr_delay >= 30, "late", 
"on_time"), 25 | arr_delay = factor(arr_delay), 26 | # We will use the date (not date-time) in the recipe below 27 | date = as.Date(time_hour) 28 | ) %>% 29 | # Include the weather data 30 | inner_join(weather, by = c("origin", "time_hour")) %>% 31 | # Only retain the specific columns we will use 32 | select(dep_time, flight, origin, dest, air_time, distance, 33 | carrier, date, arr_delay, time_hour) %>% 34 | # Exclude missing data 35 | na.omit() %>% 36 | # For creating models, it is better to have qualitative columns 37 | # encoded as factors (instead of character strings) 38 | mutate_if(is.character, as.factor) %>% 39 | sample_n(4000) 40 | 41 | ## ----------------------------------------------------------------------------- 42 | 43 | flights_rec <- 44 | recipe(arr_delay ~ ., data = flight_data) %>% 45 | update_role(flight, time_hour, new_role = "ID") %>% 46 | step_date(date, features = c("dow", "month")) %>% 47 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 48 | step_rm(date) %>% 49 | step_dummy(all_nominal_predictors()) %>% 50 | step_zv(all_predictors()) 51 | 52 | preproc_data <- 53 | flights_rec %>% 54 | prep() %>% 55 | juice(all_predictors(), all_outcomes()) 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | 
add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/everything_05_01_without.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | 8 | ## ----------------------------------------------------------------------------- 9 | 10 | num_resamples <- 5 11 | num_grid <- 10 12 | num_cores <- 1 13 | preproc <- "no preprocessing" 14 | par_method <- 
"everything" 15 | 16 | ## ----------------------------------------------------------------------------- 17 | 18 | set.seed(123) 19 | 20 | flight_data <- 21 | flights %>% 22 | mutate( 23 | # Convert the arrival delay to a factor 24 | arr_delay = ifelse(arr_delay >= 30, "late", "on_time"), 25 | arr_delay = factor(arr_delay), 26 | # We will use the date (not date-time) in the recipe below 27 | date = as.Date(time_hour) 28 | ) %>% 29 | # Include the weather data 30 | inner_join(weather, by = c("origin", "time_hour")) %>% 31 | # Only retain the specific columns we will use 32 | select(dep_time, flight, origin, dest, air_time, distance, 33 | carrier, date, arr_delay, time_hour) %>% 34 | # Exclude missing data 35 | na.omit() %>% 36 | # For creating models, it is better to have qualitative columns 37 | # encoded as factors (instead of character strings) 38 | mutate_if(is.character, as.factor) %>% 39 | sample_n(4000) 40 | 41 | ## ----------------------------------------------------------------------------- 42 | 43 | flights_rec <- 44 | recipe(arr_delay ~ ., data = flight_data) %>% 45 | update_role(flight, time_hour, new_role = "ID") %>% 46 | step_date(date, features = c("dow", "month")) %>% 47 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 48 | step_rm(date) %>% 49 | step_dummy(all_nominal_predictors()) %>% 50 | step_zv(all_predictors()) 51 | 52 | preproc_data <- 53 | flights_rec %>% 54 | prep() %>% 55 | juice(all_predictors(), all_outcomes()) 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | 
add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/everything_05_02_expensive.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | 
library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | library(embed) 8 | library(rstanarm) 9 | 10 | ## ----------------------------------------------------------------------------- 11 | 12 | num_resamples <- 5 13 | num_grid <- 10 14 | num_cores <- 2 15 | preproc <- "expensive preprocessing" 16 | par_method <- "everything" 17 | 18 | ## ----------------------------------------------------------------------------- 19 | 20 | set.seed(123) 21 | 22 | flight_data <- 23 | flights %>% 24 | mutate( 25 | # Convert the arrival delay to a factor 26 | arr_delay = ifelse(arr_delay >= 30, "late", "on_time"), 27 | arr_delay = factor(arr_delay), 28 | # We will use the date (not date-time) in the recipe below 29 | date = as.Date(time_hour) 30 | ) %>% 31 | # Include the weather data 32 | inner_join(weather, by = c("origin", "time_hour")) %>% 33 | # Only retain the specific columns we will use 34 | select(dep_time, flight, origin, dest, air_time, distance, 35 | carrier, date, arr_delay, time_hour) %>% 36 | # Exclude missing data 37 | na.omit() %>% 38 | # For creating models, it is better to have qualitative columns 39 | # encoded as factors (instead of character strings) 40 | mutate_if(is.character, as.factor) %>% 41 | sample_n(4000) 42 | 43 | ## ----------------------------------------------------------------------------- 44 | 45 | flights_rec <- 46 | recipe(arr_delay ~ ., data = flight_data) %>% 47 | update_role(flight, time_hour, new_role = "ID") %>% 48 | step_date(date, features = c("dow", "month")) %>% 49 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 50 | step_rm(date) %>% 51 | step_mutate(flight = as.factor(flight)) %>% 52 | step_lencode_bayes(flight, outcome = vars(arr_delay)) %>% 53 | step_dummy(all_nominal_predictors()) %>% 54 | step_zv(all_predictors()) 55 | 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate 
= tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 
| q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/everything_05_02_with.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | 8 | ## ----------------------------------------------------------------------------- 9 | 10 | num_resamples <- 5 11 | num_grid <- 10 12 | num_cores <- 2 13 | preproc <- "light preprocessing" 14 | par_method <- "everything" 15 | 16 | ## ----------------------------------------------------------------------------- 17 | 18 | set.seed(123) 19 | 20 | flight_data <- 21 | flights %>% 22 | mutate( 23 | # Convert the arrival delay to a factor 24 | arr_delay = ifelse(arr_delay >= 30, "late", "on_time"), 25 | arr_delay = factor(arr_delay), 26 | # We will use the date (not date-time) in the recipe below 27 | date = as.Date(time_hour) 28 | ) %>% 29 | # Include the weather data 30 | inner_join(weather, by = c("origin", "time_hour")) %>% 31 | # Only retain the specific columns we will use 32 | select(dep_time, flight, origin, dest, air_time, distance, 33 | carrier, date, arr_delay, time_hour) %>% 34 | # Exclude missing data 35 | na.omit() %>% 36 | # For creating models, it is better to have qualitative columns 37 | # encoded as factors (instead of character strings) 38 | mutate_if(is.character, as.factor) %>% 39 | sample_n(4000) 40 | 41 | ## ----------------------------------------------------------------------------- 42 | 43 | flights_rec <- 44 | recipe(arr_delay ~ ., data = flight_data) %>% 45 | update_role(flight, time_hour, new_role = "ID") %>% 46 | step_date(date, features = c("dow", "month")) %>% 47 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 48 | step_rm(date) %>% 49 | step_dummy(all_nominal_predictors()) %>% 50 | step_zv(all_predictors()) 51 | 52 | preproc_data <- 53 | flights_rec %>% 
54 | prep() %>% 55 | juice(all_predictors(), all_outcomes()) 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = 
num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/everything_05_02_without.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | 8 | ## ----------------------------------------------------------------------------- 9 | 10 | num_resamples <- 5 11 | num_grid <- 10 12 | num_cores <- 2 13 | preproc <- "no preprocessing" 14 | par_method <- "everything" 15 | 16 | ## ----------------------------------------------------------------------------- 17 | 18 | set.seed(123) 19 | 20 | flight_data <- 21 | flights %>% 22 | mutate( 23 | # Convert the arrival delay to a factor 24 | arr_delay = ifelse(arr_delay >= 30, "late", "on_time"), 25 | arr_delay = factor(arr_delay), 26 | # We will use the date (not date-time) in the recipe below 27 | date = as.Date(time_hour) 28 | ) %>% 29 | # Include the weather data 30 | inner_join(weather, by = c("origin", "time_hour")) %>% 31 | # Only retain the specific columns we will use 32 | select(dep_time, flight, origin, dest, air_time, distance, 33 | carrier, date, arr_delay, time_hour) %>% 34 | # Exclude missing data 35 | na.omit() %>% 36 | # For creating models, it is better to have qualitative columns 37 | # encoded as factors (instead of character strings) 38 | mutate_if(is.character, as.factor) %>% 39 | sample_n(4000) 40 | 41 | ## ----------------------------------------------------------------------------- 42 | 43 | flights_rec <- 44 | recipe(arr_delay ~ ., data = flight_data) %>% 45 | update_role(flight, time_hour, new_role = "ID") %>% 46 | 
step_date(date, features = c("dow", "month")) %>% 47 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 48 | step_rm(date) %>% 49 | step_dummy(all_nominal_predictors()) %>% 50 | step_zv(all_predictors()) 51 | 52 | preproc_data <- 53 | flights_rec %>% 54 | prep() %>% 55 | juice(all_predictors(), all_outcomes()) 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control 
= ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/everything_05_03_expensive.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | library(embed) 8 | library(rstanarm) 9 | 10 | ## ----------------------------------------------------------------------------- 11 | 12 | num_resamples <- 5 13 | num_grid <- 10 14 | num_cores <- 3 15 | preproc <- "expensive preprocessing" 16 | par_method <- "everything" 17 | 18 | ## ----------------------------------------------------------------------------- 19 | 20 | set.seed(123) 21 | 22 | flight_data <- 23 | flights %>% 24 | mutate( 25 | # Convert the arrival delay to a factor 26 | arr_delay = ifelse(arr_delay >= 30, "late", "on_time"), 27 | arr_delay = factor(arr_delay), 28 | # We will use the date (not date-time) in the recipe below 29 | date = as.Date(time_hour) 30 | ) %>% 31 | # Include the weather data 32 | inner_join(weather, by = c("origin", "time_hour")) %>% 33 | # Only retain the specific columns we will use 34 | select(dep_time, flight, origin, dest, air_time, distance, 35 | carrier, date, arr_delay, time_hour) %>% 36 | # Exclude missing data 37 | na.omit() %>% 38 | # For creating models, it is better to have qualitative columns 39 | # encoded as factors (instead of 
character strings) 40 | mutate_if(is.character, as.factor) %>% 41 | sample_n(4000) 42 | 43 | ## ----------------------------------------------------------------------------- 44 | 45 | flights_rec <- 46 | recipe(arr_delay ~ ., data = flight_data) %>% 47 | update_role(flight, time_hour, new_role = "ID") %>% 48 | step_date(date, features = c("dow", "month")) %>% 49 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 50 | step_rm(date) %>% 51 | step_mutate(flight = as.factor(flight)) %>% 52 | step_lencode_bayes(flight, outcome = vars(arr_delay)) %>% 53 | step_dummy(all_nominal_predictors()) %>% 54 | step_zv(all_predictors()) 55 | 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## 
----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/everything_05_03_with.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | 8 | ## ----------------------------------------------------------------------------- 9 | 10 | num_resamples <- 5 11 | num_grid <- 10 12 | num_cores <- 3 13 | preproc <- "light preprocessing" 14 | par_method <- "everything" 15 | 16 | ## ----------------------------------------------------------------------------- 17 | 18 | set.seed(123) 19 | 20 | flight_data <- 21 | flights %>% 22 | mutate( 23 | # Convert the arrival delay to a factor 24 | arr_delay = ifelse(arr_delay >= 30, "late", "on_time"), 25 | arr_delay = factor(arr_delay), 26 | # We will use the date (not date-time) in the recipe below 27 | date = as.Date(time_hour) 28 | ) %>% 29 | # Include the weather data 30 | inner_join(weather, by = c("origin", "time_hour")) %>% 31 | # Only retain the specific 
columns we will use 32 | select(dep_time, flight, origin, dest, air_time, distance, 33 | carrier, date, arr_delay, time_hour) %>% 34 | # Exclude missing data 35 | na.omit() %>% 36 | # For creating models, it is better to have qualitative columns 37 | # encoded as factors (instead of character strings) 38 | mutate_if(is.character, as.factor) %>% 39 | sample_n(4000) 40 | 41 | ## ----------------------------------------------------------------------------- 42 | 43 | flights_rec <- 44 | recipe(arr_delay ~ ., data = flight_data) %>% 45 | update_role(flight, time_hour, new_role = "ID") %>% 46 | step_date(date, features = c("dow", "month")) %>% 47 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 48 | step_rm(date) %>% 49 | step_dummy(all_nominal_predictors()) %>% 50 | step_zv(all_predictors()) 51 | 52 | preproc_data <- 53 | flights_rec %>% 54 | prep() %>% 55 | juice(all_predictors(), all_outcomes()) 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 
91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/everything_05_03_without.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | 8 | ## ----------------------------------------------------------------------------- 9 | 10 | num_resamples <- 5 11 | num_grid <- 10 12 | num_cores <- 3 13 | preproc <- "no preprocessing" 14 | par_method <- "everything" 15 | 16 | ## ----------------------------------------------------------------------------- 17 | 18 | set.seed(123) 19 | 20 | flight_data <- 21 | flights %>% 22 | mutate( 23 | # Convert the arrival delay to a factor 24 | arr_delay = ifelse(arr_delay >= 30, "late", "on_time"), 25 | 
arr_delay = factor(arr_delay), 26 | # We will use the date (not date-time) in the recipe below 27 | date = as.Date(time_hour) 28 | ) %>% 29 | # Include the weather data 30 | inner_join(weather, by = c("origin", "time_hour")) %>% 31 | # Only retain the specific columns we will use 32 | select(dep_time, flight, origin, dest, air_time, distance, 33 | carrier, date, arr_delay, time_hour) %>% 34 | # Exclude missing data 35 | na.omit() %>% 36 | # For creating models, it is better to have qualitative columns 37 | # encoded as factors (instead of character strings) 38 | mutate_if(is.character, as.factor) %>% 39 | sample_n(4000) 40 | 41 | ## ----------------------------------------------------------------------------- 42 | 43 | flights_rec <- 44 | recipe(arr_delay ~ ., data = flight_data) %>% 45 | update_role(flight, time_hour, new_role = "ID") %>% 46 | step_date(date, features = c("dow", "month")) %>% 47 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 48 | step_rm(date) %>% 49 | step_dummy(all_nominal_predictors()) %>% 50 | step_zv(all_predictors()) 51 | 52 | preproc_data <- 53 | flights_rec %>% 54 | prep() %>% 55 | juice(all_predictors(), all_outcomes()) 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | 
set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/everything_05_04_expensive.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | library(embed) 8 | library(rstanarm) 9 | 10 | ## ----------------------------------------------------------------------------- 11 | 12 | num_resamples <- 5 13 | num_grid <- 10 14 | num_cores <- 4 15 | preproc <- "expensive preprocessing" 16 
# Parallelisation scheme; handed to control_grid(parallel_over = ) further down.
par_method <- "everything"

## -----------------------------------------------------------------------------
## Data: label flights as late / on_time and draw a 4000-row sample.

set.seed(123)

# local() keeps the intermediate tables out of the global environment; only the
# final sample is bound to `flight_data`.
flight_data <- local({
  labelled <- mutate(
    flights,
    # Arrival delays of 30 or more become "late", everything else "on_time"
    arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
    # The recipe below works on the date, not the date-time.
    # NOTE(review): as.Date() on a date-time converts in UTC by default --
    # confirm that this is the intended calendar date.
    date = as.Date(time_hour)
  )
  # Inner join: only flights with a matching weather row are retained
  with_weather <- inner_join(labelled, weather, by = c("origin", "time_hour"))
  # Keep just the columns used for modelling
  kept <- select(with_weather, dep_time, flight, origin, dest, air_time,
                 distance, carrier, date, arr_delay, time_hour)
  # Drop rows with missing values
  complete_rows <- na.omit(kept)
  # Character columns are turned into factors before modelling
  as_factors <- mutate_if(complete_rows, is.character, as.factor)
  sample_n(as_factors, 4000)
})

## -----------------------------------------------------------------------------
## Recipe: steps are appended one at a time, in the original order.

flights_rec <- recipe(arr_delay ~ ., data = flight_data)
# `flight` and `time_hour` get the "ID" role
flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
flights_rec <- step_date(flights_rec, date, features = c("dow", "month"))
flights_rec <- step_holiday(flights_rec, date,
                            holidays = timeDate::listHolidays("US"))
flights_rec <- step_rm(flights_rec, date)
# `flight` is converted to a factor and encoded against the outcome; presumably
# this is the costly step that gives the "expensive preprocessing" run its name.
flights_rec <- step_mutate(flights_rec, flight = as.factor(flight))
flights_rec <- step_lencode_bayes(flights_rec, flight, outcome = vars(arr_delay))
flights_rec <- step_dummy(flights_rec, all_nominal_predictors())
flights_rec <- step_zv(flights_rec, all_predictors())


## -----------------------------------------------------------------------------
## Model: boosted trees with six parameters marked for tuning.

xgboost_spec <- boost_tree(
  trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
  loss_reduction = tune(), sample_size = tune()
)
xgboost_spec <- set_mode(xgboost_spec, "classification")
xgboost_spec <- set_engine(xgboost_spec, "xgboost")

## -----------------------------------------------------------------------------
## Workflow and resamples.

if (preproc == "no preprocessing") {
  # Variables-only workflow, resampling the pre-processed table.
  # `preproc_data` is not assigned anywhere in this block; this branch is only
  # evaluated when `preproc` is "no preprocessing".
  xgboost_workflow <- add_variables(workflow(), arr_delay,
                                    predictors = c(everything()))
  resample_source <- preproc_data
} else {
  # Recipe-based workflow, resampling the raw sample
  xgboost_workflow <- add_recipe(workflow(), flights_rec)
  resample_source <- flight_data
}
xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)

# Seed is set immediately before the bootstrap draw
set.seed(33)
bt <- bootstraps(resample_source, times = num_resamples)

## -----------------------------------------------------------------------------
## Tuning grid: `num_grid` candidates, with `trees` ranging over 100-2000.

set.seed(22)
xgboost_params <- update(parameters(xgboost_workflow),
                         trees = trees(c(100, 2000)))
xgboost_grid <- grid_max_entropy(xgboost_params, size = num_grid)

## -----------------------------------------------------------------------------
## A doMC backend is registered only when more than one core is requested.

if (num_cores > 1) registerDoMC(cores = num_cores)

## -----------------------------------------------------------------------------
## Timed section: only the seed reset and the grid search sit inside system.time().

roc_res <- metric_set(roc_auc)

ctrl <- control_grid(parallel_over = par_method)

grid_time <- system.time({
  set.seed(99)
  tune_grid(xgboost_workflow, bt, grid = xgboost_grid, metrics = roc_res,
            control = ctrl)
})

## -----------------------------------------------------------------------------
## One-row summary of this run; saved under the object name `times`.

times <- tibble::tibble(
  elapsed       = grid_time[3],   # third element of the system.time() result
  num_resamples = num_resamples,
  num_grid      = num_grid,
  num_cores     = num_cores,
  preproc       = preproc,
  par_method    = par_method
)

# File name: "xgb_", the core count, then a timestamp
out_file <- paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))
save(times, file = out_file)

sessioninfo::session_info()

# Quit without saving the workspace when run non-interactively
if (!interactive()) q("no")

--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_04_with.R:
--------------------------------------------------------------------------------
library(tidymodels)
library(nycflights13)
library(doMC)
library(rlang)
library(xgboost)
library(vctrs)

## -----------------------------------------------------------------------------
## Settings for this run; all five are written into the saved `times` table.

num_resamples <- 5                  # bootstrap resamples
num_grid <- 10                      # tuning-grid candidates
num_cores <- 4                      # cores passed to registerDoMC()
preproc <- "light preprocessing"    # selects the workflow branch below
par_method <- "everything"          # passed to control_grid(parallel_over = )

## -----------------------------------------------------------------------------
## Data: label flights as late / on_time and draw a 4000-row sample.

set.seed(123)

# local() keeps the intermediate tables out of the global environment; only the
# final sample is bound to `flight_data`.
flight_data <- local({
  labelled <- mutate(
    flights,
    # Arrival delays of 30 or more become "late", everything else "on_time"
    arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
    # The recipe below works on the date, not the date-time.
    # NOTE(review): as.Date() on a date-time converts in UTC by default --
    # confirm that this is the intended calendar date.
    date = as.Date(time_hour)
  )
  # Inner join: only flights with a matching weather row are retained
  with_weather <- inner_join(labelled, weather, by = c("origin", "time_hour"))
  # Keep just the columns used for modelling
  kept <- select(with_weather, dep_time, flight, origin, dest, air_time,
                 distance, carrier, date, arr_delay, time_hour)
  # Drop rows with missing values
  complete_rows <- na.omit(kept)
  # Character columns are turned into factors before modelling
  as_factors <- mutate_if(complete_rows, is.character, as.factor)
  sample_n(as_factors, 4000)
})

## -----------------------------------------------------------------------------
## Recipe: steps are appended one at a time, in the original order.

flights_rec <- recipe(arr_delay ~ ., data = flight_data)
# `flight` and `time_hour` get the "ID" role
flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
flights_rec <- step_date(flights_rec, date, features = c("dow", "month"))
flights_rec <- step_holiday(flights_rec, date,
                            holidays = timeDate::listHolidays("US"))
flights_rec <- step_rm(flights_rec, date)
flights_rec <- step_dummy(flights_rec, all_nominal_predictors())
flights_rec <- step_zv(flights_rec, all_predictors())

# Pre-processed copy of the sample (predictors and outcomes); only the
# "no preprocessing" branch below reads it.
preproc_data <- juice(prep(flights_rec), all_predictors(), all_outcomes())

## -----------------------------------------------------------------------------
## Model: boosted trees with six parameters marked for tuning.

xgboost_spec <- boost_tree(
  trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
  loss_reduction = tune(), sample_size = tune()
)
xgboost_spec <- set_mode(xgboost_spec, "classification")
xgboost_spec <- set_engine(xgboost_spec, "xgboost")

## -----------------------------------------------------------------------------
## Workflow and resamples.

if (preproc == "no preprocessing") {
  # Variables-only workflow, resampling the already pre-processed table
  xgboost_workflow <- add_variables(workflow(), arr_delay,
                                    predictors = c(everything()))
  resample_source <- preproc_data
} else {
  # Recipe-based workflow, resampling the raw sample
  xgboost_workflow <- add_recipe(workflow(), flights_rec)
  resample_source <- flight_data
}
xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)

# Seed is set immediately before the bootstrap draw
set.seed(33)
bt <- bootstraps(resample_source, times = num_resamples)

## -----------------------------------------------------------------------------
## Tuning grid: `num_grid` candidates, with `trees` ranging over 100-2000.

set.seed(22)
xgboost_params <- update(parameters(xgboost_workflow),
                         trees = trees(c(100, 2000)))
xgboost_grid <- grid_max_entropy(xgboost_params, size = num_grid)

## -----------------------------------------------------------------------------
## A doMC backend is registered only when more than one core is requested.

if (num_cores > 1) registerDoMC(cores = num_cores)

## -----------------------------------------------------------------------------
## Timed section: only the seed reset and the grid search sit inside system.time().

roc_res <- metric_set(roc_auc)

ctrl <- control_grid(parallel_over = par_method)

grid_time <- system.time({
  set.seed(99)
  tune_grid(xgboost_workflow, bt, grid = xgboost_grid, metrics = roc_res,
            control = ctrl)
})

## -----------------------------------------------------------------------------
## One-row summary of this run; saved under the object name `times`.

times <- tibble::tibble(
  elapsed       = grid_time[3],   # third element of the system.time() result
  num_resamples = num_resamples,
  num_grid      = num_grid,
  num_cores     = num_cores,
  preproc       = preproc,
  par_method    = par_method
)

# File name: "xgb_", the core count, then a timestamp
out_file <- paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))
save(times, file = out_file)

sessioninfo::session_info()

# Quit without saving the workspace when run non-interactively
if (!interactive()) q("no")

-------------------------------------------------------------------------------- /extras/parallel_times/everything_05_04_without.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | 8 | ## ----------------------------------------------------------------------------- 9 | 10 | num_resamples <- 5 11 | num_grid <- 10 12 | num_cores <- 4 13 | preproc <- "no preprocessing" 14 | par_method <- "everything" 15 | 16 | ## ----------------------------------------------------------------------------- 17 | 18 | set.seed(123) 19 | 20 | flight_data <- 21 | flights %>% 22 | mutate( 23 | # Convert the arrival delay to a factor 24 | arr_delay = ifelse(arr_delay >= 30, "late", "on_time"), 25 | arr_delay = factor(arr_delay), 26 | # We will use the date (not date-time) in the recipe below 27 | date = as.Date(time_hour) 28 | ) %>% 29 | # Include the weather data 30 | inner_join(weather, by = c("origin", "time_hour")) %>% 31 | # Only retain the specific columns we will use 32 | select(dep_time, flight, origin, dest, air_time, distance, 33 | carrier, date, arr_delay, time_hour) %>% 34 | # Exclude missing data 35 | na.omit() %>% 36 | # For creating models, it is better to have qualitative columns 37 | # encoded as factors (instead of character strings) 38 | mutate_if(is.character, as.factor) %>% 39 | sample_n(4000) 40 | 41 | ## ----------------------------------------------------------------------------- 42 | 43 | flights_rec <- 44 | recipe(arr_delay ~ ., data = flight_data) %>% 45 | update_role(flight, time_hour, new_role = "ID") %>% 46 | step_date(date, features = c("dow", "month")) %>% 47 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 48 | step_rm(date) %>% 49 | step_dummy(all_nominal_predictors()) %>% 50 | step_zv(all_predictors()) 51 | 52 | preproc_data <- 53 | flights_rec %>% 54 | prep() %>% 55 | 
juice(all_predictors(), all_outcomes()) 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = 
preproc,
  par_method = par_method
)


save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))

sessioninfo::session_info()

if (!interactive()) {
  q("no")
}

--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_05_expensive.R:
--------------------------------------------------------------------------------
# Benchmark script: times `tune_grid()` for an xgboost classification model
# under one parallel-processing configuration (see the settings below) and
# saves the elapsed time, together with those settings, to an .RData file.

library(tidymodels)
library(nycflights13)
library(doMC)
library(rlang)
library(xgboost)
library(vctrs)
library(embed)
library(rstanarm)

## -----------------------------------------------------------------------------
# Settings for this run. `preproc` labels the results and selects the workflow
# built below; `par_method` is passed to control_grid(parallel_over = ).

num_resamples <- 5
num_grid <- 10
num_cores <- 5
preproc <- "expensive preprocessing"
par_method <- "everything"

## -----------------------------------------------------------------------------
# Seed the RNG so that sample_n() below draws the same 4000 rows on every run.

set.seed(123)

flight_data <-
  flights %>%
  mutate(
    # Convert the arrival delay to a factor
    arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
    arr_delay = factor(arr_delay),
    # We will use the date (not date-time) in the recipe below
    date = as.Date(time_hour)
  ) %>%
  # Include the weather data
  inner_join(weather, by = c("origin", "time_hour")) %>%
  # Only retain the specific columns we will use
  select(dep_time, flight, origin, dest, air_time, distance,
         carrier, date, arr_delay, time_hour) %>%
  # Exclude missing data
  na.omit() %>%
  # For creating models, it is better to have qualitative columns
  # encoded as factors (instead of character strings)
  mutate_if(is.character, as.factor) %>%
  sample_n(4000)

## -----------------------------------------------------------------------------
# Recipe: `flight` and `time_hour` get the "ID" role, day-of-week, month and US
# holiday features are derived from `date` (which is then removed), `flight` is
# converted to a factor and encoded against the outcome with
# step_lencode_bayes(), nominal predictors become dummy variables, and
# zero-variance predictors are dropped.

flights_rec <-
  recipe(arr_delay ~ ., data = flight_data) %>%
  update_role(flight, time_hour, new_role = "ID") %>%
  step_date(date, features = c("dow", "month")) %>%
  step_holiday(date, holidays = timeDate::listHolidays("US")) %>%
  step_rm(date) %>%
  step_mutate(flight = as.factor(flight)) %>%
  step_lencode_bayes(flight, outcome = vars(arr_delay)) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors())


## -----------------------------------------------------------------------------
# Boosted-tree classifier on the xgboost engine; six parameters are tuned.

xgboost_spec <-
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
             loss_reduction = tune(), sample_size = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

## -----------------------------------------------------------------------------
# Any label other than "no preprocessing" puts the recipe inside the workflow
# and resamples the raw data.
# NOTE: `preproc_data` is never created in this script, so the `else` branch
# would fail in a fresh session if `preproc` were changed to "no preprocessing".

if (preproc != "no preprocessing") {
  xgboost_workflow <-
    workflow() %>%
    add_recipe(flights_rec) %>%
    add_model(xgboost_spec)

  set.seed(33)
  bt <- bootstraps(flight_data, times = num_resamples)
} else {
  xgboost_workflow <-
    workflow() %>%
    add_variables(arr_delay, predictors = c(everything())) %>%
    add_model(xgboost_spec)

  set.seed(33)
  bt <- bootstraps(preproc_data, times = num_resamples)
}

## -----------------------------------------------------------------------------
# Maximum-entropy grid of `num_grid` candidates with `trees` ranging over
# 100-2000. `extract_parameter_set_dials()` is used because the `parameters()`
# method for workflows is deprecated.

set.seed(22)
xgboost_grid <-
  xgboost_workflow %>%
  extract_parameter_set_dials() %>%
  update(trees = trees(c(100, 2000))) %>%
  grid_max_entropy(size = num_grid)

## -----------------------------------------------------------------------------
# Register the doMC backend only when more than one core is requested.

if (num_cores > 1) {
  registerDoMC(cores = num_cores)
}

## -----------------------------------------------------------------------------
# Time the grid search; the tuning results themselves are discarded.

roc_res <- metric_set(roc_auc)

ctrl <- control_grid(parallel_over = par_method)

grid_time <- system.time({
  set.seed(99)
  xgboost_workflow %>%
    tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
})

## -----------------------------------------------------------------------------
# One-row summary: elapsed time (third element of system.time()) and settings.

times <- tibble::tibble(
  elapsed = grid_time[3],
  num_resamples = num_resamples,
  num_grid = num_grid,
  num_cores = num_cores,
  preproc = preproc,
  par_method = par_method
)


# The file name is "xgb_", the core count, and a time stamp.
save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))

sessioninfo::session_info()

# When run non-interactively, quit without saving the workspace.
if (!interactive()) {
  q("no")
}

--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_05_with.R:
--------------------------------------------------------------------------------
# Benchmark script: times `tune_grid()` for an xgboost classification model
# under one parallel-processing configuration (see the settings below) and
# saves the elapsed time, together with those settings, to an .RData file.

library(tidymodels)
library(nycflights13)
library(doMC)
library(rlang)
library(xgboost)
library(vctrs)

## -----------------------------------------------------------------------------

num_resamples <- 5
num_grid <- 10
num_cores <- 5
preproc <- "light preprocessing"
par_method <- "everything"

## -----------------------------------------------------------------------------

set.seed(123)

flight_data <-
  flights %>%
  mutate(
    # Convert the arrival delay to a factor
    arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
    arr_delay = factor(arr_delay),
    # We will use the date (not date-time) in the recipe below
    date = as.Date(time_hour)
  ) %>%
  # Include the weather data
  inner_join(weather, by = c("origin", "time_hour")) %>%
  # Only retain the specific columns we will use
  select(dep_time, flight, origin, dest, air_time, distance,
         carrier, date, arr_delay, time_hour) %>%
  # Exclude missing data
  na.omit() %>%
  # For creating models, it is better to have qualitative columns
  # encoded as factors (instead of character
strings)
  mutate_if(is.character, as.factor) %>%
  sample_n(4000)

## -----------------------------------------------------------------------------
# Recipe: `flight` and `time_hour` get the "ID" role, day-of-week, month and US
# holiday features are derived from `date` (which is then removed), nominal
# predictors become dummy variables, and zero-variance predictors are dropped.

flights_rec <-
  recipe(arr_delay ~ ., data = flight_data) %>%
  update_role(flight, time_hour, new_role = "ID") %>%
  step_date(date, features = c("dow", "month")) %>%
  step_holiday(date, holidays = timeDate::listHolidays("US")) %>%
  step_rm(date) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors())

# Recipe applied to the data up front, keeping only predictors and outcome.
preproc_data <-
  flights_rec %>%
  prep() %>%
  juice(all_predictors(), all_outcomes())

## -----------------------------------------------------------------------------
# Boosted-tree classifier on the xgboost engine; six parameters are tuned.

xgboost_spec <-
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
             loss_reduction = tune(), sample_size = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

## -----------------------------------------------------------------------------
# Any label other than "no preprocessing" puts the recipe inside the workflow
# and resamples the raw data; otherwise the workflow takes its columns directly
# from `preproc_data`, which is then what gets resampled.

if (preproc != "no preprocessing") {
  xgboost_workflow <-
    workflow() %>%
    add_recipe(flights_rec) %>%
    add_model(xgboost_spec)

  set.seed(33)
  bt <- bootstraps(flight_data, times = num_resamples)
} else {
  xgboost_workflow <-
    workflow() %>%
    add_variables(arr_delay, predictors = c(everything())) %>%
    add_model(xgboost_spec)

  set.seed(33)
  bt <- bootstraps(preproc_data, times = num_resamples)
}

## -----------------------------------------------------------------------------
# Maximum-entropy grid of `num_grid` candidates with `trees` ranging over
# 100-2000. `extract_parameter_set_dials()` is used because the `parameters()`
# method for workflows is deprecated.

set.seed(22)
xgboost_grid <-
  xgboost_workflow %>%
  extract_parameter_set_dials() %>%
  update(trees = trees(c(100, 2000))) %>%
  grid_max_entropy(size = num_grid)

## -----------------------------------------------------------------------------
# Register the doMC backend only when more than one core is requested.

if (num_cores > 1) {
  registerDoMC(cores = num_cores)
}

## 
-----------------------------------------------------------------------------

roc_res <- metric_set(roc_auc)

ctrl <- control_grid(parallel_over = par_method)

grid_time <- system.time({
  set.seed(99)
  xgboost_workflow %>%
    tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
})

## -----------------------------------------------------------------------------

times <- tibble::tibble(
  elapsed = grid_time[3],
  num_resamples = num_resamples,
  num_grid = num_grid,
  num_cores = num_cores,
  preproc = preproc,
  par_method = par_method
)


save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))

sessioninfo::session_info()

if (!interactive()) {
  q("no")
}

--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_05_without.R:
--------------------------------------------------------------------------------
# Benchmark script: times `tune_grid()` for an xgboost classification model
# under one parallel-processing configuration (see the settings below) and
# saves the elapsed time, together with those settings, to an .RData file.

library(tidymodels)
library(nycflights13)
library(doMC)
library(rlang)
library(xgboost)
library(vctrs)

## -----------------------------------------------------------------------------
# Settings for this run. `preproc` labels the results and selects the workflow
# built below; `par_method` is passed to control_grid(parallel_over = ).

num_resamples <- 5
num_grid <- 10
num_cores <- 5
preproc <- "no preprocessing"
par_method <- "everything"

## -----------------------------------------------------------------------------
# Seed the RNG so that sample_n() below draws the same 4000 rows on every run.

set.seed(123)

flight_data <-
  flights %>%
  mutate(
    # Convert the arrival delay to a factor
    arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
    arr_delay = factor(arr_delay),
    # We will use the date (not date-time) in the recipe below
    date = as.Date(time_hour)
  ) %>%
  # Include the weather data
  inner_join(weather, by = c("origin", "time_hour")) %>%
  # Only retain the specific columns we will use
  select(dep_time, flight, origin, dest, air_time, distance,
         carrier, date, arr_delay, time_hour) %>%
  # Exclude missing data
  na.omit() %>%
  # For creating models, it is better to have qualitative columns
  # encoded as factors (instead of character strings)
  mutate_if(is.character, as.factor) %>%
  sample_n(4000)

## -----------------------------------------------------------------------------
# Recipe: `flight` and `time_hour` get the "ID" role, day-of-week, month and US
# holiday features are derived from `date` (which is then removed), nominal
# predictors become dummy variables, and zero-variance predictors are dropped.

flights_rec <-
  recipe(arr_delay ~ ., data = flight_data) %>%
  update_role(flight, time_hour, new_role = "ID") %>%
  step_date(date, features = c("dow", "month")) %>%
  step_holiday(date, holidays = timeDate::listHolidays("US")) %>%
  step_rm(date) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors())

# Recipe applied to the data up front, keeping only predictors and outcome.
preproc_data <-
  flights_rec %>%
  prep() %>%
  juice(all_predictors(), all_outcomes())

## -----------------------------------------------------------------------------
# Boosted-tree classifier on the xgboost engine; six parameters are tuned.

xgboost_spec <-
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
             loss_reduction = tune(), sample_size = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

## -----------------------------------------------------------------------------
# With the "no preprocessing" label the `else` branch runs: the workflow takes
# its columns directly from `preproc_data`, which is also what is resampled, so
# the workflow being tuned contains no recipe.

if (preproc != "no preprocessing") {
  xgboost_workflow <-
    workflow() %>%
    add_recipe(flights_rec) %>%
    add_model(xgboost_spec)

  set.seed(33)
  bt <- bootstraps(flight_data, times = num_resamples)
} else {
  xgboost_workflow <-
    workflow() %>%
    add_variables(arr_delay, predictors = c(everything())) %>%
    add_model(xgboost_spec)

  set.seed(33)
  bt <- bootstraps(preproc_data, times = num_resamples)
}

## -----------------------------------------------------------------------------
# Maximum-entropy grid of `num_grid` candidates with `trees` ranging over
# 100-2000. `extract_parameter_set_dials()` is used because the `parameters()`
# method for workflows is deprecated.

set.seed(22)
xgboost_grid <-
  xgboost_workflow %>%
  extract_parameter_set_dials() %>%
  update(trees = trees(c(100, 2000))) %>%
  grid_max_entropy(size = num_grid)

## -----------------------------------------------------------------------------
# Register the doMC backend only when more than one core is requested.

if (num_cores > 1) {
  registerDoMC(cores = num_cores)
}

## -----------------------------------------------------------------------------
# Time the grid search; the tuning results themselves are discarded.

roc_res <- metric_set(roc_auc)

ctrl <- control_grid(parallel_over = par_method)

grid_time <- system.time({
  set.seed(99)
  xgboost_workflow %>%
    tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
})

## -----------------------------------------------------------------------------
# One-row summary: elapsed time (third element of system.time()) and settings.

times <- tibble::tibble(
  elapsed = grid_time[3],
  num_resamples = num_resamples,
  num_grid = num_grid,
  num_cores = num_cores,
  preproc = preproc,
  par_method = par_method
)


# The file name is "xgb_", the core count, and a time stamp.
save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))

sessioninfo::session_info()

# When run non-interactively, quit without saving the workspace.
if (!interactive()) {
  q("no")
}

--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_10_expensive.R:
--------------------------------------------------------------------------------
# Benchmark script: times `tune_grid()` for an xgboost classification model
# under one parallel-processing configuration (see the settings below) and
# saves the elapsed time, together with those settings, to an .RData file.

library(tidymodels)
library(nycflights13)
library(doMC)
library(rlang)
library(xgboost)
library(vctrs)
library(embed)
library(rstanarm)

## -----------------------------------------------------------------------------

num_resamples <- 5
num_grid <- 10
num_cores <- 10
preproc <- "expensive preprocessing"
par_method <- "everything"

## -----------------------------------------------------------------------------

set.seed(123)

flight_data <-
  flights %>%
  mutate(
    # Convert the arrival delay to a factor
    arr_delay =
ifelse(arr_delay >= 30, "late", "on_time"),
    arr_delay = factor(arr_delay),
    # We will use the date (not date-time) in the recipe below
    date = as.Date(time_hour)
  ) %>%
  # Include the weather data
  inner_join(weather, by = c("origin", "time_hour")) %>%
  # Only retain the specific columns we will use
  select(dep_time, flight, origin, dest, air_time, distance,
         carrier, date, arr_delay, time_hour) %>%
  # Exclude missing data
  na.omit() %>%
  # For creating models, it is better to have qualitative columns
  # encoded as factors (instead of character strings)
  mutate_if(is.character, as.factor) %>%
  # Keep a random subset of 4000 rows
  sample_n(4000)

## -----------------------------------------------------------------------------
# Recipe: `flight` and `time_hour` get the "ID" role, day-of-week, month and US
# holiday features are derived from `date` (which is then removed), `flight` is
# converted to a factor and encoded against the outcome with
# step_lencode_bayes(), nominal predictors become dummy variables, and
# zero-variance predictors are dropped.

flights_rec <-
  recipe(arr_delay ~ ., data = flight_data) %>%
  update_role(flight, time_hour, new_role = "ID") %>%
  step_date(date, features = c("dow", "month")) %>%
  step_holiday(date, holidays = timeDate::listHolidays("US")) %>%
  step_rm(date) %>%
  step_mutate(flight = as.factor(flight)) %>%
  step_lencode_bayes(flight, outcome = vars(arr_delay)) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors())


## -----------------------------------------------------------------------------
# Boosted-tree classifier on the xgboost engine; six parameters are tuned.

xgboost_spec <-
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
             loss_reduction = tune(), sample_size = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

## -----------------------------------------------------------------------------
# Any label other than "no preprocessing" puts the recipe inside the workflow
# and resamples the raw data; the `else` branch builds the workflow from
# columns instead of a recipe.

if (preproc != "no preprocessing") {
  xgboost_workflow <-
    workflow() %>%
    add_recipe(flights_rec) %>%
    add_model(xgboost_spec)

  # Seed the RNG immediately before the bootstrap resamples are drawn
  set.seed(33)
  bt <- bootstraps(flight_data, times = num_resamples)
} else {
  xgboost_workflow <-
    workflow() %>%
    add_variables(arr_delay, predictors =
c(everything())) %>%
    add_model(xgboost_spec)

  set.seed(33)
  bt <- bootstraps(preproc_data, times = num_resamples)
}

## -----------------------------------------------------------------------------
# Maximum-entropy grid of `num_grid` candidates with `trees` ranging over
# 100-2000. `extract_parameter_set_dials()` is used because the `parameters()`
# method for workflows is deprecated.

set.seed(22)
xgboost_grid <-
  xgboost_workflow %>%
  extract_parameter_set_dials() %>%
  update(trees = trees(c(100, 2000))) %>%
  grid_max_entropy(size = num_grid)

## -----------------------------------------------------------------------------
# Register the doMC backend only when more than one core is requested.

if (num_cores > 1) {
  registerDoMC(cores = num_cores)
}

## -----------------------------------------------------------------------------
# Time the grid search; the tuning results themselves are discarded.

roc_res <- metric_set(roc_auc)

ctrl <- control_grid(parallel_over = par_method)

grid_time <- system.time({
  set.seed(99)
  xgboost_workflow %>%
    tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
})

## -----------------------------------------------------------------------------
# One-row summary: elapsed time (third element of system.time()) and settings.

times <- tibble::tibble(
  elapsed = grid_time[3],
  num_resamples = num_resamples,
  num_grid = num_grid,
  num_cores = num_cores,
  preproc = preproc,
  par_method = par_method
)


# The file name is "xgb_", the core count, and a time stamp.
save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))

sessioninfo::session_info()

# When run non-interactively, quit without saving the workspace.
if (!interactive()) {
  q("no")
}

--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_10_with.R:
--------------------------------------------------------------------------------
# Benchmark script: times `tune_grid()` for an xgboost classification model
# under one parallel-processing configuration (see the settings below) and
# saves the elapsed time, together with those settings, to an .RData file.

library(tidymodels)
library(nycflights13)
library(doMC)
library(rlang)
library(xgboost)
library(vctrs)

## -----------------------------------------------------------------------------
# Settings for this run. `preproc` labels the results and selects the workflow
# built below; `par_method` is passed to control_grid(parallel_over = ).

num_resamples <- 5
num_grid <- 10
num_cores <- 10
preproc <- "light preprocessing"
par_method <- "everything"

## -----------------------------------------------------------------------------
# Seed the RNG so that sample_n() below draws the same 4000 rows on every run.

set.seed(123)

flight_data <-
  flights %>%
  mutate(
    # Convert the arrival delay to a factor
    arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
    arr_delay = factor(arr_delay),
    # We will use the date (not date-time) in the recipe below
    date = as.Date(time_hour)
  ) %>%
  # Include the weather data
  inner_join(weather, by = c("origin", "time_hour")) %>%
  # Only retain the specific columns we will use
  select(dep_time, flight, origin, dest, air_time, distance,
         carrier, date, arr_delay, time_hour) %>%
  # Exclude missing data
  na.omit() %>%
  # For creating models, it is better to have qualitative columns
  # encoded as factors (instead of character strings)
  mutate_if(is.character, as.factor) %>%
  sample_n(4000)

## -----------------------------------------------------------------------------
# Recipe: `flight` and `time_hour` get the "ID" role, day-of-week, month and US
# holiday features are derived from `date` (which is then removed), nominal
# predictors become dummy variables, and zero-variance predictors are dropped.

flights_rec <-
  recipe(arr_delay ~ ., data = flight_data) %>%
  update_role(flight, time_hour, new_role = "ID") %>%
  step_date(date, features = c("dow", "month")) %>%
  step_holiday(date, holidays = timeDate::listHolidays("US")) %>%
  step_rm(date) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors())

# Recipe applied to the data up front, keeping only predictors and outcome.
# With the "light preprocessing" label this object is not used further below.
preproc_data <-
  flights_rec %>%
  prep() %>%
  juice(all_predictors(), all_outcomes())

## -----------------------------------------------------------------------------
# Boosted-tree classifier on the xgboost engine; six parameters are tuned.

xgboost_spec <-
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
             loss_reduction = tune(), sample_size = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

## -----------------------------------------------------------------------------
# Any label other than "no preprocessing" (as here) puts the recipe inside the
# workflow and resamples the raw data.

if (preproc != "no preprocessing") {
  xgboost_workflow <-
    workflow() %>%
    add_recipe(flights_rec) %>%
    add_model(xgboost_spec)

  set.seed(33)
  bt <- bootstraps(flight_data, times = num_resamples)
} else {
  xgboost_workflow <-
    workflow() %>%
    add_variables(arr_delay, predictors = c(everything())) %>%
    add_model(xgboost_spec)

  set.seed(33)
  bt <- bootstraps(preproc_data, times = num_resamples)
}

## -----------------------------------------------------------------------------
# Maximum-entropy grid of `num_grid` candidates with `trees` ranging over
# 100-2000. `extract_parameter_set_dials()` is used because the `parameters()`
# method for workflows is deprecated.

set.seed(22)
xgboost_grid <-
  xgboost_workflow %>%
  extract_parameter_set_dials() %>%
  update(trees = trees(c(100, 2000))) %>%
  grid_max_entropy(size = num_grid)

## -----------------------------------------------------------------------------
# Register the doMC backend only when more than one core is requested.

if (num_cores > 1) {
  registerDoMC(cores = num_cores)
}

## -----------------------------------------------------------------------------
# Time the grid search; the tuning results themselves are discarded.

roc_res <- metric_set(roc_auc)

ctrl <- control_grid(parallel_over = par_method)

grid_time <- system.time({
  set.seed(99)
  xgboost_workflow %>%
    tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
})

## -----------------------------------------------------------------------------
# One-row summary: elapsed time (third element of system.time()) and settings.

times <- tibble::tibble(
  elapsed = grid_time[3],
  num_resamples = num_resamples,
  num_grid = num_grid,
  num_cores = num_cores,
  preproc = preproc,
  par_method = par_method
)


# The file name is "xgb_", the core count, and a time stamp.
save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))

sessioninfo::session_info()

# When run non-interactively, quit without saving the workspace.
if (!interactive()) {
  q("no")
}

--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_10_without.R:
--------------------------------------------------------------------------------
library(tidymodels)
library(nycflights13)
library(doMC)
library(rlang)
library(xgboost)
library(vctrs)

## -----------------------------------------------------------------------------
# Settings for this run. `preproc` labels the results and selects the workflow
# built below; `par_method` is passed to control_grid(parallel_over = ).

num_resamples <- 5
num_grid <- 10
num_cores <- 10
preproc <- "no preprocessing"
par_method <- "everything"

## -----------------------------------------------------------------------------
# Seed the RNG so that sample_n() below draws the same 4000 rows on every run.

set.seed(123)

flight_data <-
  flights %>%
  mutate(
    # Convert the arrival delay to a factor
    arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
    arr_delay = factor(arr_delay),
    # We will use the date (not date-time) in the recipe below
    date = as.Date(time_hour)
  ) %>%
  # Include the weather data
  inner_join(weather, by = c("origin", "time_hour")) %>%
  # Only retain the specific columns we will use
  select(dep_time, flight, origin, dest, air_time, distance,
         carrier, date, arr_delay, time_hour) %>%
  # Exclude missing data
  na.omit() %>%
  # For creating models, it is better to have qualitative columns
  # encoded as factors (instead of character strings)
  mutate_if(is.character, as.factor) %>%
  sample_n(4000)

## -----------------------------------------------------------------------------
# Recipe: `flight` and `time_hour` get the "ID" role, day-of-week, month and US
# holiday features are derived from `date` (which is then removed), nominal
# predictors become dummy variables, and zero-variance predictors are dropped.

flights_rec <-
  recipe(arr_delay ~ ., data = flight_data) %>%
  update_role(flight, time_hour, new_role = "ID") %>%
  step_date(date, features = c("dow", "month")) %>%
  step_holiday(date, holidays = timeDate::listHolidays("US")) %>%
  step_rm(date) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors())

# Recipe applied to the data up front, keeping only predictors and outcome.
preproc_data <-
  flights_rec %>%
  prep() %>%
  juice(all_predictors(), all_outcomes())

## -----------------------------------------------------------------------------
# Boosted-tree classifier on the xgboost engine; six parameters are tuned.

xgboost_spec <-
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
             loss_reduction = tune(), sample_size = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

## -----------------------------------------------------------------------------
# With the "no preprocessing" label the `else` branch runs: the workflow takes
# its columns directly from `preproc_data`, which is also what is resampled, so
# the workflow being tuned contains no recipe.

if (preproc != "no preprocessing") {
  xgboost_workflow <-
    workflow() %>%
    add_recipe(flights_rec) %>%
    add_model(xgboost_spec)

  set.seed(33)
  bt <- bootstraps(flight_data, times = num_resamples)
} else {
  xgboost_workflow <-
    workflow() %>%
    add_variables(arr_delay, predictors = c(everything())) %>%
    add_model(xgboost_spec)

  set.seed(33)
  bt <- bootstraps(preproc_data, times = num_resamples)
}

## -----------------------------------------------------------------------------
# Maximum-entropy grid of `num_grid` candidates with `trees` ranging over
# 100-2000. `extract_parameter_set_dials()` is used because the `parameters()`
# method for workflows is deprecated.

set.seed(22)
xgboost_grid <-
  xgboost_workflow %>%
  extract_parameter_set_dials() %>%
  update(trees = trees(c(100, 2000))) %>%
  grid_max_entropy(size = num_grid)

## -----------------------------------------------------------------------------
# Register the doMC backend only when more than one core is requested.

if (num_cores > 1) {
  registerDoMC(cores = num_cores)
}

## -----------------------------------------------------------------------------
# Time the grid search; the tuning results themselves are discarded.

roc_res <- metric_set(roc_auc)

ctrl <- control_grid(parallel_over = par_method)

grid_time <- system.time({
  set.seed(99)
  xgboost_workflow %>%
    tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
})

## -----------------------------------------------------------------------------
# One-row summary: elapsed time (third element of system.time()) and settings.

times <- tibble::tibble(
  elapsed = grid_time[3],
  num_resamples = num_resamples,
  num_grid = num_grid,
  num_cores = num_cores,
  preproc = preproc,
  par_method = par_method
)


# The file name is "xgb_", the core count, and a time stamp.
save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))

sessioninfo::session_info()

# When run non-interactively, quit without saving the workspace.
if (!interactive()) {
  q("no")
}

--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_15_expensive.R:
--------------------------------------------------------------------------------
# Benchmark script: times `tune_grid()` for an xgboost classification model
# under one parallel-processing configuration (see the settings below) and
# saves the elapsed time, together with those settings, to an .RData file.

library(tidymodels)
library(nycflights13)
library(doMC)
library(rlang)
library(xgboost)
library(vctrs)
library(embed)
library(rstanarm)

## -----------------------------------------------------------------------------
# Settings for this run. `preproc` labels the results and selects the workflow
# built below; `par_method` is passed to control_grid(parallel_over = ).

num_resamples <- 5
num_grid <- 10
num_cores <- 15
preproc <- "expensive preprocessing"
par_method <- "everything"

## -----------------------------------------------------------------------------
# Seed the RNG so that sample_n() below draws the same 4000 rows on every run.

set.seed(123)

flight_data <-
  flights %>%
  mutate(
    # Convert the arrival delay to a factor
    arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
    arr_delay = factor(arr_delay),
    # We will use the date (not date-time) in the recipe below
    date = as.Date(time_hour)
  ) %>%
  # Include the weather data
  inner_join(weather, by = c("origin", "time_hour")) %>%
  # Only retain the specific columns we will use
  select(dep_time, flight, origin, dest, air_time, distance,
         carrier, date, arr_delay, time_hour) %>%
  # Exclude missing data
  na.omit() %>%
  # For creating models, it is better to have qualitative columns
  # encoded as factors (instead of character strings)
  mutate_if(is.character, as.factor) %>%
  sample_n(4000)

## -----------------------------------------------------------------------------
# Recipe: `flight` and `time_hour` get the "ID" role, day-of-week, month and US
# holiday features are derived from `date` (which is then removed), `flight` is
# converted to a factor and encoded against the outcome with
# step_lencode_bayes(), nominal predictors become dummy variables, and
# zero-variance predictors are dropped.

flights_rec <-
  recipe(arr_delay ~ ., data = flight_data) %>%
  update_role(flight, time_hour, new_role = "ID") %>%
  step_date(date, features = c("dow", "month")) %>%
  step_holiday(date, holidays = timeDate::listHolidays("US")) %>%
  step_rm(date) %>%
  step_mutate(flight = as.factor(flight)) %>%
  step_lencode_bayes(flight, outcome = vars(arr_delay)) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors())


## -----------------------------------------------------------------------------
# Boosted-tree classifier on the xgboost engine; six parameters are tuned.

xgboost_spec <-
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
             loss_reduction = tune(), sample_size = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

## -----------------------------------------------------------------------------
# Any label other than "no preprocessing" puts the recipe inside the workflow
# and resamples the raw data.
# NOTE: `preproc_data` is never created in this script, so the `else` branch
# would fail in a fresh session if `preproc` were changed to "no preprocessing".

if (preproc != "no preprocessing") {
  xgboost_workflow <-
    workflow() %>%
    add_recipe(flights_rec) %>%
    add_model(xgboost_spec)

  set.seed(33)
  bt <- bootstraps(flight_data, times = num_resamples)
} else {
  xgboost_workflow <-
    workflow() %>%
    add_variables(arr_delay, predictors = c(everything())) %>%
    add_model(xgboost_spec)

  set.seed(33)
  bt <- bootstraps(preproc_data, times = num_resamples)
}

## -----------------------------------------------------------------------------
# Maximum-entropy grid of `num_grid` candidates with `trees` ranging over
# 100-2000. `extract_parameter_set_dials()` is used because the `parameters()`
# method for workflows is deprecated.

set.seed(22)
xgboost_grid <-
  xgboost_workflow %>%
  extract_parameter_set_dials() %>%
  update(trees = trees(c(100, 2000))) %>%
  grid_max_entropy(size = num_grid)

## -----------------------------------------------------------------------------
# Register the doMC backend only when more than one core is requested.

if (num_cores > 1) {
  registerDoMC(cores = num_cores)
}

## -----------------------------------------------------------------------------
# Time the grid search; the tuning results themselves are discarded.

roc_res <- metric_set(roc_auc)

ctrl <- control_grid(parallel_over = par_method)

grid_time <- system.time({
  set.seed(99)
  xgboost_workflow %>%
    tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
})

## -----------------------------------------------------------------------------
# One-row summary: elapsed time (third element of system.time()) and settings.

times <- tibble::tibble(
  elapsed = grid_time[3],
  num_resamples = num_resamples,
  num_grid = num_grid,
  num_cores = num_cores,
  preproc = preproc,
  par_method = par_method
)


# The file name is "xgb_", the core count, and a time stamp.
save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))

sessioninfo::session_info()

# When run non-interactively, quit without saving the workspace.
if (!interactive()) {
  q("no")
}

--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_15_with.R:
--------------------------------------------------------------------------------
# Benchmark script: times `tune_grid()` for an xgboost classification model
# under one parallel-processing configuration (see the settings below) and
# saves the elapsed time, together with those settings, to an .RData file.

library(tidymodels)
library(nycflights13)
library(doMC)
library(rlang)
library(xgboost)
library(vctrs)

## -----------------------------------------------------------------------------

num_resamples <- 5
num_grid <- 10
num_cores <- 15
preproc <- "light preprocessing"
par_method <- "everything"

## -----------------------------------------------------------------------------

set.seed(123)

flight_data <-
  flights %>%
  mutate(
    # Convert the arrival delay to a factor
    arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
    arr_delay = factor(arr_delay),
    # We will use the date (not date-time) in the recipe below
    date = as.Date(time_hour)
  ) %>%
  # Include the weather data
  inner_join(weather, by = c("origin", "time_hour")) %>%
  # Only retain the specific columns we will use
  select(dep_time, flight, origin, dest, air_time, distance,
         carrier, date, arr_delay, time_hour) %>%
  # Exclude missing data
  na.omit() %>%
  # For creating models, it is better to have qualitative columns
  # encoded as factors (instead of character strings)
  mutate_if(is.character, as.factor) %>%
  sample_n(4000)

## -----------------------------------------------------------------------------

flights_rec <-
  recipe(arr_delay ~ ., data = flight_data) %>%
update_role(flight, time_hour, new_role = "ID") %>%
  step_date(date, features = c("dow", "month")) %>%
  step_holiday(date, holidays = timeDate::listHolidays("US")) %>%
  step_rm(date) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors())

# Recipe applied to the data up front, keeping only predictors and outcome.
preproc_data <-
  flights_rec %>%
  prep() %>%
  juice(all_predictors(), all_outcomes())

## -----------------------------------------------------------------------------
# Boosted-tree classifier on the xgboost engine; six parameters are tuned.

xgboost_spec <-
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
             loss_reduction = tune(), sample_size = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

## -----------------------------------------------------------------------------
# Any label other than "no preprocessing" puts the recipe inside the workflow
# and resamples the raw data; otherwise the workflow takes its columns directly
# from `preproc_data`, which is then what gets resampled.

if (preproc != "no preprocessing") {
  xgboost_workflow <-
    workflow() %>%
    add_recipe(flights_rec) %>%
    add_model(xgboost_spec)

  set.seed(33)
  bt <- bootstraps(flight_data, times = num_resamples)
} else {
  xgboost_workflow <-
    workflow() %>%
    add_variables(arr_delay, predictors = c(everything())) %>%
    add_model(xgboost_spec)

  set.seed(33)
  bt <- bootstraps(preproc_data, times = num_resamples)
}

## -----------------------------------------------------------------------------
# Maximum-entropy grid of `num_grid` candidates with `trees` ranging over
# 100-2000. `extract_parameter_set_dials()` is used because the `parameters()`
# method for workflows is deprecated.

set.seed(22)
xgboost_grid <-
  xgboost_workflow %>%
  extract_parameter_set_dials() %>%
  update(trees = trees(c(100, 2000))) %>%
  grid_max_entropy(size = num_grid)

## -----------------------------------------------------------------------------
# Register the doMC backend only when more than one core is requested.

if (num_cores > 1) {
  registerDoMC(cores = num_cores)
}

## -----------------------------------------------------------------------------
# Time the grid search; the tuning results themselves are discarded.

roc_res <- metric_set(roc_auc)

ctrl <- control_grid(parallel_over = par_method)

grid_time <- system.time({
  set.seed(99)
  xgboost_workflow %>%
tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
})

## -----------------------------------------------------------------------------

times <- tibble::tibble(
  elapsed = grid_time[3],
  num_resamples = num_resamples,
  num_grid = num_grid,
  num_cores = num_cores,
  preproc = preproc,
  par_method = par_method
)


save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))

sessioninfo::session_info()

if (!interactive()) {
  q("no")
}

--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_15_without.R:
--------------------------------------------------------------------------------
# Benchmark script: times `tune_grid()` for an xgboost classification model
# under one parallel-processing configuration (see the settings below) and
# saves the elapsed time, together with those settings, to an .RData file.

library(tidymodels)
library(nycflights13)
library(doMC)
library(rlang)
library(xgboost)
library(vctrs)

## -----------------------------------------------------------------------------
# Settings for this run. `preproc` labels the results and selects the workflow
# built below; `par_method` is passed to control_grid(parallel_over = ).

num_resamples <- 5
num_grid <- 10
num_cores <- 15
preproc <- "no preprocessing"
par_method <- "everything"

## -----------------------------------------------------------------------------
# Seed the RNG so that sample_n() below draws the same 4000 rows on every run.

set.seed(123)

flight_data <-
  flights %>%
  mutate(
    # Convert the arrival delay to a factor
    arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
    arr_delay = factor(arr_delay),
    # We will use the date (not date-time) in the recipe below
    date = as.Date(time_hour)
  ) %>%
  # Include the weather data
  inner_join(weather, by = c("origin", "time_hour")) %>%
  # Only retain the specific columns we will use
  select(dep_time, flight, origin, dest, air_time, distance,
         carrier, date, arr_delay, time_hour) %>%
  # Exclude missing data
  na.omit() %>%
  # For creating models, it is better to have qualitative columns
  # encoded as factors (instead of character strings)
  mutate_if(is.character, as.factor) %>%
  sample_n(4000)

## -----------------------------------------------------------------------------
# Recipe: `flight` and `time_hour` get the "ID" role, day-of-week, month and US
# holiday features are derived from `date` (which is then removed), nominal
# predictors become dummy variables, and zero-variance predictors are dropped.

flights_rec <-
  recipe(arr_delay ~ ., data = flight_data) %>%
  update_role(flight, time_hour, new_role = "ID") %>%
  step_date(date, features = c("dow", "month")) %>%
  step_holiday(date, holidays = timeDate::listHolidays("US")) %>%
  step_rm(date) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors())

# Recipe applied to the data up front, keeping only predictors and outcome.
preproc_data <-
  flights_rec %>%
  prep() %>%
  juice(all_predictors(), all_outcomes())

## -----------------------------------------------------------------------------
# Boosted-tree classifier on the xgboost engine; six parameters are tuned.

xgboost_spec <-
  boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
             loss_reduction = tune(), sample_size = tune()) %>%
  set_mode("classification") %>%
  set_engine("xgboost")

## -----------------------------------------------------------------------------
# With the "no preprocessing" label the `else` branch runs: the workflow takes
# its columns directly from `preproc_data`, which is also what is resampled, so
# the workflow being tuned contains no recipe.

if (preproc != "no preprocessing") {
  xgboost_workflow <-
    workflow() %>%
    add_recipe(flights_rec) %>%
    add_model(xgboost_spec)

  set.seed(33)
  bt <- bootstraps(flight_data, times = num_resamples)
} else {
  xgboost_workflow <-
    workflow() %>%
    add_variables(arr_delay, predictors = c(everything())) %>%
    add_model(xgboost_spec)

  set.seed(33)
  bt <- bootstraps(preproc_data, times = num_resamples)
}

## -----------------------------------------------------------------------------
# Maximum-entropy grid of `num_grid` candidates with `trees` ranging over
# 100-2000. `extract_parameter_set_dials()` is used because the `parameters()`
# method for workflows is deprecated.

set.seed(22)
xgboost_grid <-
  xgboost_workflow %>%
  extract_parameter_set_dials() %>%
  update(trees = trees(c(100, 2000))) %>%
  grid_max_entropy(size = num_grid)

## -----------------------------------------------------------------------------
# Register the doMC backend only when more than one core is requested.

if (num_cores > 1) {
  registerDoMC(cores = num_cores)
}

## -----------------------------------------------------------------------------
# Time the grid search; the tuning results themselves are discarded.

roc_res <- metric_set(roc_auc)

ctrl <- control_grid(parallel_over = par_method)

grid_time <- system.time({
  set.seed(99)
  xgboost_workflow %>%
    tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
})

## -----------------------------------------------------------------------------
# One-row summary: elapsed time (third element of system.time()) and settings.

times <- tibble::tibble(
  elapsed = grid_time[3],
  num_resamples = num_resamples,
  num_grid = num_grid,
  num_cores = num_cores,
  preproc = preproc,
  par_method = par_method
)


# The file name is "xgb_", the core count, and a time stamp.
save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))

sessioninfo::session_info()

# When run non-interactively, quit without saving the workspace.
if (!interactive()) {
  q("no")
}

--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_20_expensive.R:
--------------------------------------------------------------------------------
library(tidymodels)
library(nycflights13)
library(doMC)
library(rlang)
library(xgboost)
library(vctrs)
library(embed)
library(rstanarm)

## -----------------------------------------------------------------------------

num_resamples <- 5
num_grid <- 10
num_cores <- 20
preproc <- "expensive preprocessing"
par_method <- "everything"

## -----------------------------------------------------------------------------

set.seed(123)

flight_data <-
  flights %>%
  mutate(
    # Convert the arrival delay to a factor
    arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
    arr_delay = factor(arr_delay),
    # We will use the date (not date-time) in the recipe below
    date = as.Date(time_hour)
  ) %>%
  # Include the weather data
  inner_join(weather, by = c("origin",
"time_hour")) %>% 33 | # Only retain the specific columns we will use 34 | select(dep_time, flight, origin, dest, air_time, distance, 35 | carrier, date, arr_delay, time_hour) %>% 36 | # Exclude missing data 37 | na.omit() %>% 38 | # For creating models, it is better to have qualitative columns 39 | # encoded as factors (instead of character strings) 40 | mutate_if(is.character, as.factor) %>% 41 | sample_n(4000) 42 | 43 | ## ----------------------------------------------------------------------------- 44 | 45 | flights_rec <- 46 | recipe(arr_delay ~ ., data = flight_data) %>% 47 | update_role(flight, time_hour, new_role = "ID") %>% 48 | step_date(date, features = c("dow", "month")) %>% 49 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 50 | step_rm(date) %>% 51 | step_mutate(flight = as.factor(flight)) %>% 52 | step_lencode_bayes(flight, outcome = vars(arr_delay)) %>% 53 | step_dummy(all_nominal_predictors()) %>% 54 | step_zv(all_predictors()) 55 | 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | 
xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/everything_05_20_with.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | 8 | ## ----------------------------------------------------------------------------- 9 | 10 | num_resamples <- 5 11 | num_grid <- 10 12 | num_cores <- 20 13 | preproc <- "light preprocessing" 14 | par_method <- "everything" 15 | 16 | ## ----------------------------------------------------------------------------- 17 | 18 | set.seed(123) 19 | 20 | flight_data <- 21 | flights %>% 22 | mutate( 23 | # Convert the arrival delay to a factor 24 | 
arr_delay = ifelse(arr_delay >= 30, "late", "on_time"), 25 | arr_delay = factor(arr_delay), 26 | # We will use the date (not date-time) in the recipe below 27 | date = as.Date(time_hour) 28 | ) %>% 29 | # Include the weather data 30 | inner_join(weather, by = c("origin", "time_hour")) %>% 31 | # Only retain the specific columns we will use 32 | select(dep_time, flight, origin, dest, air_time, distance, 33 | carrier, date, arr_delay, time_hour) %>% 34 | # Exclude missing data 35 | na.omit() %>% 36 | # For creating models, it is better to have qualitative columns 37 | # encoded as factors (instead of character strings) 38 | mutate_if(is.character, as.factor) %>% 39 | sample_n(4000) 40 | 41 | ## ----------------------------------------------------------------------------- 42 | 43 | flights_rec <- 44 | recipe(arr_delay ~ ., data = flight_data) %>% 45 | update_role(flight, time_hour, new_role = "ID") %>% 46 | step_date(date, features = c("dow", "month")) %>% 47 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 48 | step_rm(date) %>% 49 | step_dummy(all_nominal_predictors()) %>% 50 | step_zv(all_predictors()) 51 | 52 | preproc_data <- 53 | flights_rec %>% 54 | prep() %>% 55 | juice(all_predictors(), all_outcomes()) 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = 
c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/everything_05_20_without.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | 8 | ## ----------------------------------------------------------------------------- 9 | 10 | num_resamples <- 5 11 | num_grid <- 10 12 | num_cores <- 20 13 | preproc <- "no 
preprocessing" 14 | par_method <- "everything" 15 | 16 | ## ----------------------------------------------------------------------------- 17 | 18 | set.seed(123) 19 | 20 | flight_data <- 21 | flights %>% 22 | mutate( 23 | # Convert the arrival delay to a factor 24 | arr_delay = ifelse(arr_delay >= 30, "late", "on_time"), 25 | arr_delay = factor(arr_delay), 26 | # We will use the date (not date-time) in the recipe below 27 | date = as.Date(time_hour) 28 | ) %>% 29 | # Include the weather data 30 | inner_join(weather, by = c("origin", "time_hour")) %>% 31 | # Only retain the specific columns we will use 32 | select(dep_time, flight, origin, dest, air_time, distance, 33 | carrier, date, arr_delay, time_hour) %>% 34 | # Exclude missing data 35 | na.omit() %>% 36 | # For creating models, it is better to have qualitative columns 37 | # encoded as factors (instead of character strings) 38 | mutate_if(is.character, as.factor) %>% 39 | sample_n(4000) 40 | 41 | ## ----------------------------------------------------------------------------- 42 | 43 | flights_rec <- 44 | recipe(arr_delay ~ ., data = flight_data) %>% 45 | update_role(flight, time_hour, new_role = "ID") %>% 46 | step_date(date, features = c("dow", "month")) %>% 47 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 48 | step_rm(date) %>% 49 | step_dummy(all_nominal_predictors()) %>% 50 | step_zv(all_predictors()) 51 | 52 | preproc_data <- 53 | flights_rec %>% 54 | prep() %>% 55 | juice(all_predictors(), all_outcomes()) 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow 
<- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/everything_times.RData: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/everything_times.RData -------------------------------------------------------------------------------- /extras/parallel_times/resamples_05_01_with.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | 8 | ## ----------------------------------------------------------------------------- 9 | 10 | num_resamples <- 5 11 | num_grid <- 10 12 | num_cores <- 1 13 | preproc <- "light preprocessing" 14 | par_method <- "resamples" 15 | 16 | ## ----------------------------------------------------------------------------- 17 | 18 | set.seed(123) 19 | 20 | flight_data <- 21 | flights %>% 22 | mutate( 23 | # Convert the arrival delay to a factor 24 | arr_delay = ifelse(arr_delay >= 30, "late", "on_time"), 25 | arr_delay = factor(arr_delay), 26 | # We will use the date (not date-time) in the recipe below 27 | date = as.Date(time_hour) 28 | ) %>% 29 | # Include the weather data 30 | inner_join(weather, by = c("origin", "time_hour")) %>% 31 | # Only retain the specific columns we will use 32 | select(dep_time, flight, origin, dest, air_time, distance, 33 | carrier, date, arr_delay, time_hour) %>% 34 | # Exclude missing data 35 | na.omit() %>% 36 | # For creating models, it is better to have qualitative columns 37 | # encoded as factors (instead of character strings) 38 | mutate_if(is.character, as.factor) %>% 39 | sample_n(4000) 40 | 41 | ## ----------------------------------------------------------------------------- 42 | 43 | flights_rec <- 44 | recipe(arr_delay ~ ., data = flight_data) %>% 45 | update_role(flight, time_hour, new_role = "ID") %>% 46 | step_date(date, features = c("dow", "month")) %>% 47 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 48 | step_rm(date) %>% 49 | 
step_dummy(all_nominal_predictors()) %>% 50 | step_zv(all_predictors()) 51 | 52 | preproc_data <- 53 | flights_rec %>% 54 | prep() %>% 55 | juice(all_predictors(), all_outcomes()) 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 
115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/resamples_05_01_without.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | 8 | ## ----------------------------------------------------------------------------- 9 | 10 | num_resamples <- 5 11 | num_grid <- 10 12 | num_cores <- 1 13 | preproc <- "no preprocessing" 14 | par_method <- "resamples" 15 | 16 | ## ----------------------------------------------------------------------------- 17 | 18 | set.seed(123) 19 | 20 | flight_data <- 21 | flights %>% 22 | mutate( 23 | # Convert the arrival delay to a factor 24 | arr_delay = ifelse(arr_delay >= 30, "late", "on_time"), 25 | arr_delay = factor(arr_delay), 26 | # We will use the date (not date-time) in the recipe below 27 | date = as.Date(time_hour) 28 | ) %>% 29 | # Include the weather data 30 | inner_join(weather, by = c("origin", "time_hour")) %>% 31 | # Only retain the specific columns we will use 32 | select(dep_time, flight, origin, dest, air_time, distance, 33 | carrier, date, arr_delay, time_hour) %>% 34 | # Exclude missing data 35 | na.omit() %>% 36 | # For creating models, it is better to have qualitative columns 37 | # encoded as factors (instead of character strings) 38 | mutate_if(is.character, as.factor) %>% 39 | sample_n(4000) 40 | 41 | ## ----------------------------------------------------------------------------- 42 | 43 | flights_rec <- 44 | 
recipe(arr_delay ~ ., data = flight_data) %>% 45 | update_role(flight, time_hour, new_role = "ID") %>% 46 | step_date(date, features = c("dow", "month")) %>% 47 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 48 | step_rm(date) %>% 49 | step_dummy(all_nominal_predictors()) %>% 50 | step_zv(all_predictors()) 51 | 52 | preproc_data <- 53 | flights_rec %>% 54 | prep() %>% 55 | juice(all_predictors(), all_outcomes()) 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | 
set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/resamples_05_02_with.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | 8 | ## ----------------------------------------------------------------------------- 9 | 10 | num_resamples <- 5 11 | num_grid <- 10 12 | num_cores <- 2 13 | preproc <- "light preprocessing" 14 | par_method <- "resamples" 15 | 16 | ## ----------------------------------------------------------------------------- 17 | 18 | set.seed(123) 19 | 20 | flight_data <- 21 | flights %>% 22 | mutate( 23 | # Convert the arrival delay to a factor 24 | arr_delay = ifelse(arr_delay >= 30, "late", "on_time"), 25 | arr_delay = factor(arr_delay), 26 | # We will use the date (not date-time) in the recipe below 27 | date = as.Date(time_hour) 28 | ) %>% 29 | # Include the weather data 30 | inner_join(weather, by = c("origin", "time_hour")) %>% 31 | # Only retain the specific columns we will use 32 | select(dep_time, flight, origin, dest, air_time, distance, 33 | carrier, date, arr_delay, time_hour) %>% 34 | # Exclude missing data 35 | na.omit() %>% 36 | # For creating models, it is better to have 
qualitative columns 37 | # encoded as factors (instead of character strings) 38 | mutate_if(is.character, as.factor) %>% 39 | sample_n(4000) 40 | 41 | ## ----------------------------------------------------------------------------- 42 | 43 | flights_rec <- 44 | recipe(arr_delay ~ ., data = flight_data) %>% 45 | update_role(flight, time_hour, new_role = "ID") %>% 46 | step_date(date, features = c("dow", "month")) %>% 47 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 48 | step_rm(date) %>% 49 | step_dummy(all_nominal_predictors()) %>% 50 | step_zv(all_predictors()) 51 | 52 | preproc_data <- 53 | flights_rec %>% 54 | prep() %>% 55 | juice(all_predictors(), all_outcomes()) 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | 
registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/resamples_05_02_without.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | 8 | ## ----------------------------------------------------------------------------- 9 | 10 | num_resamples <- 5 11 | num_grid <- 10 12 | num_cores <- 2 13 | preproc <- "no preprocessing" 14 | par_method <- "resamples" 15 | 16 | ## ----------------------------------------------------------------------------- 17 | 18 | set.seed(123) 19 | 20 | flight_data <- 21 | flights %>% 22 | mutate( 23 | # Convert the arrival delay to a factor 24 | arr_delay = ifelse(arr_delay >= 30, "late", "on_time"), 25 | arr_delay = factor(arr_delay), 26 | # We will use the date (not date-time) in the recipe below 27 | date = as.Date(time_hour) 28 | ) %>% 29 | # Include the weather data 30 | inner_join(weather, by = c("origin", 
"time_hour")) %>% 31 | # Only retain the specific columns we will use 32 | select(dep_time, flight, origin, dest, air_time, distance, 33 | carrier, date, arr_delay, time_hour) %>% 34 | # Exclude missing data 35 | na.omit() %>% 36 | # For creating models, it is better to have qualitative columns 37 | # encoded as factors (instead of character strings) 38 | mutate_if(is.character, as.factor) %>% 39 | sample_n(4000) 40 | 41 | ## ----------------------------------------------------------------------------- 42 | 43 | flights_rec <- 44 | recipe(arr_delay ~ ., data = flight_data) %>% 45 | update_role(flight, time_hour, new_role = "ID") %>% 46 | step_date(date, features = c("dow", "month")) %>% 47 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 48 | step_rm(date) %>% 49 | step_dummy(all_nominal_predictors()) %>% 50 | step_zv(all_predictors()) 51 | 52 | preproc_data <- 53 | flights_rec %>% 54 | prep() %>% 55 | juice(all_predictors(), all_outcomes()) 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid 
<- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/resamples_05_03_with.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | 8 | ## ----------------------------------------------------------------------------- 9 | 10 | num_resamples <- 5 11 | num_grid <- 10 12 | num_cores <- 3 13 | preproc <- "light preprocessing" 14 | par_method <- "resamples" 15 | 16 | ## ----------------------------------------------------------------------------- 17 | 18 | set.seed(123) 19 | 20 | flight_data <- 21 | flights %>% 22 | mutate( 23 | # Convert the arrival delay to a factor 24 | arr_delay = 
ifelse(arr_delay >= 30, "late", "on_time"), 25 | arr_delay = factor(arr_delay), 26 | # We will use the date (not date-time) in the recipe below 27 | date = as.Date(time_hour) 28 | ) %>% 29 | # Include the weather data 30 | inner_join(weather, by = c("origin", "time_hour")) %>% 31 | # Only retain the specific columns we will use 32 | select(dep_time, flight, origin, dest, air_time, distance, 33 | carrier, date, arr_delay, time_hour) %>% 34 | # Exclude missing data 35 | na.omit() %>% 36 | # For creating models, it is better to have qualitative columns 37 | # encoded as factors (instead of character strings) 38 | mutate_if(is.character, as.factor) %>% 39 | sample_n(4000) 40 | 41 | ## ----------------------------------------------------------------------------- 42 | 43 | flights_rec <- 44 | recipe(arr_delay ~ ., data = flight_data) %>% 45 | update_role(flight, time_hour, new_role = "ID") %>% 46 | step_date(date, features = c("dow", "month")) %>% 47 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 48 | step_rm(date) %>% 49 | step_dummy(all_nominal_predictors()) %>% 50 | step_zv(all_predictors()) 51 | 52 | preproc_data <- 53 | flights_rec %>% 54 | prep() %>% 55 | juice(all_predictors(), all_outcomes()) 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = 
c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/resamples_05_03_without.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | 8 | ## ----------------------------------------------------------------------------- 9 | 10 | num_resamples <- 5 11 | num_grid <- 10 12 | num_cores <- 3 13 | preproc <- "no 
preprocessing" 14 | par_method <- "resamples" 15 | 16 | ## ----------------------------------------------------------------------------- 17 | 18 | set.seed(123) 19 | 20 | flight_data <- 21 | flights %>% 22 | mutate( 23 | # Convert the arrival delay to a factor 24 | arr_delay = ifelse(arr_delay >= 30, "late", "on_time"), 25 | arr_delay = factor(arr_delay), 26 | # We will use the date (not date-time) in the recipe below 27 | date = as.Date(time_hour) 28 | ) %>% 29 | # Include the weather data 30 | inner_join(weather, by = c("origin", "time_hour")) %>% 31 | # Only retain the specific columns we will use 32 | select(dep_time, flight, origin, dest, air_time, distance, 33 | carrier, date, arr_delay, time_hour) %>% 34 | # Exclude missing data 35 | na.omit() %>% 36 | # For creating models, it is better to have qualitative columns 37 | # encoded as factors (instead of character strings) 38 | mutate_if(is.character, as.factor) %>% 39 | sample_n(4000) 40 | 41 | ## ----------------------------------------------------------------------------- 42 | 43 | flights_rec <- 44 | recipe(arr_delay ~ ., data = flight_data) %>% 45 | update_role(flight, time_hour, new_role = "ID") %>% 46 | step_date(date, features = c("dow", "month")) %>% 47 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 48 | step_rm(date) %>% 49 | step_dummy(all_nominal_predictors()) %>% 50 | step_zv(all_predictors()) 51 | 52 | preproc_data <- 53 | flights_rec %>% 54 | prep() %>% 55 | juice(all_predictors(), all_outcomes()) 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 
69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | -------------------------------------------------------------------------------- /extras/parallel_times/resamples_05_04_with.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(nycflights13) 3 | 
library(doMC) 4 | library(rlang) 5 | library(xgboost) 6 | library(vctrs) 7 | 8 | ## ----------------------------------------------------------------------------- 9 | 10 | num_resamples <- 5 11 | num_grid <- 10 12 | num_cores <- 4 13 | preproc <- "light preprocessing" 14 | par_method <- "resamples" 15 | 16 | ## ----------------------------------------------------------------------------- 17 | 18 | set.seed(123) 19 | 20 | flight_data <- 21 | flights %>% 22 | mutate( 23 | # Convert the arrival delay to a factor 24 | arr_delay = ifelse(arr_delay >= 30, "late", "on_time"), 25 | arr_delay = factor(arr_delay), 26 | # We will use the date (not date-time) in the recipe below 27 | date = as.Date(time_hour) 28 | ) %>% 29 | # Include the weather data 30 | inner_join(weather, by = c("origin", "time_hour")) %>% 31 | # Only retain the specific columns we will use 32 | select(dep_time, flight, origin, dest, air_time, distance, 33 | carrier, date, arr_delay, time_hour) %>% 34 | # Exclude missing data 35 | na.omit() %>% 36 | # For creating models, it is better to have qualitative columns 37 | # encoded as factors (instead of character strings) 38 | mutate_if(is.character, as.factor) %>% 39 | sample_n(4000) 40 | 41 | ## ----------------------------------------------------------------------------- 42 | 43 | flights_rec <- 44 | recipe(arr_delay ~ ., data = flight_data) %>% 45 | update_role(flight, time_hour, new_role = "ID") %>% 46 | step_date(date, features = c("dow", "month")) %>% 47 | step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 48 | step_rm(date) %>% 49 | step_dummy(all_nominal_predictors()) %>% 50 | step_zv(all_predictors()) 51 | 52 | preproc_data <- 53 | flights_rec %>% 54 | prep() %>% 55 | juice(all_predictors(), all_outcomes()) 56 | 57 | ## ----------------------------------------------------------------------------- 58 | 59 | xgboost_spec <- 60 | boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 61 | loss_reduction = 
tune(), sample_size = tune()) %>% 62 | set_mode("classification") %>% 63 | set_engine("xgboost") 64 | 65 | ## ----------------------------------------------------------------------------- 66 | 67 | if (preproc != "no preprocessing") { 68 | xgboost_workflow <- 69 | workflow() %>% 70 | add_recipe(flights_rec) %>% 71 | add_model(xgboost_spec) 72 | 73 | set.seed(33) 74 | bt <- bootstraps(flight_data, times = num_resamples) 75 | } else { 76 | xgboost_workflow <- 77 | workflow() %>% 78 | add_variables(arr_delay, predictors = c(everything())) %>% 79 | add_model(xgboost_spec) 80 | 81 | set.seed(33) 82 | bt <- bootstraps(preproc_data, times = num_resamples) 83 | } 84 | 85 | ## ----------------------------------------------------------------------------- 86 | 87 | set.seed(22) 88 | xgboost_grid <- 89 | xgboost_workflow %>% 90 | parameters() %>% 91 | update(trees = trees(c(100, 2000))) %>% 92 | grid_max_entropy(size = num_grid) 93 | 94 | ## ----------------------------------------------------------------------------- 95 | 96 | if (num_cores > 1) { 97 | registerDoMC(cores=num_cores) 98 | } 99 | 100 | ## ----------------------------------------------------------------------------- 101 | 102 | roc_res <- metric_set(roc_auc) 103 | 104 | ctrl <- control_grid(parallel_over = par_method) 105 | 106 | grid_time <- system.time({ 107 | set.seed(99) 108 | xgboost_workflow %>% 109 | tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl) 110 | }) 111 | 112 | ## ----------------------------------------------------------------------------- 113 | 114 | times <- tibble::tibble( 115 | elapsed = grid_time[3], 116 | num_resamples = num_resamples, 117 | num_grid = num_grid, 118 | num_cores = num_cores, 119 | preproc = preproc, 120 | par_method = par_method 121 | ) 122 | 123 | 124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))) 125 | 126 | sessioninfo::session_info() 127 | 128 | if (!interactive()) { 129 | q("no") 130 | } 131 | 132 | 
--------------------------------------------------------------------------------
/extras/parallel_times/resamples_05_04_without.R:
--------------------------------------------------------------------------------
library(tidymodels)
library(nycflights13)
library(doMC)
library(rlang)
library(xgboost)
library(vctrs)

## -----------------------------------------------------------------------------
## Settings that define this benchmark run

num_resamples <- 5
num_grid <- 10
num_cores <- 4
preproc <- "no preprocessing"
par_method <- "resamples"

## -----------------------------------------------------------------------------
## Modeling data: 4,000 randomly sampled flights with complete records

set.seed(123)

flight_data <-
  flights %>%
  mutate(
    # Two-level outcome: arrivals at least 30 minutes behind schedule are "late"
    arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
    # The recipe below works on the calendar date, not the date-time
    date = as.Date(time_hour)
  ) %>%
  # Keep only flights that have a matching weather record
  inner_join(weather, by = c("origin", "time_hour")) %>%
  # Columns used downstream (predictors, outcome, and ID columns)
  select(
    dep_time, flight, origin, dest, air_time, distance,
    carrier, date, arr_delay, time_hour
  ) %>%
  # Drop rows with any missing value
  na.omit() %>%
  # Character columns become factors before modeling
  mutate_if(is.character, as.factor) %>%
  sample_n(4000)

## -----------------------------------------------------------------------------
## Recipe, built one step at a time, plus a fully preprocessed copy of the data

flights_rec <- recipe(arr_delay ~ ., data = flight_data)
flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
flights_rec <- step_date(flights_rec, date, features = c("dow", "month"))
flights_rec <- step_holiday(flights_rec, date, holidays = timeDate::listHolidays("US"))
flights_rec <- step_rm(flights_rec, date)
flights_rec <- step_dummy(flights_rec, all_nominal_predictors())
flights_rec <- step_zv(flights_rec, all_predictors())

preproc_data <- juice(prep(flights_rec), all_predictors(), all_outcomes())

## -----------------------------------------------------------------------------
## Boosted tree classifier with six tunable parameters

xgboost_spec <- boost_tree(
  trees = tune(), min_n = tune(), tree_depth = tune(),
  learn_rate = tune(), loss_reduction = tune(), sample_size = tune()
)
xgboost_spec <- set_mode(xgboost_spec, "classification")
xgboost_spec <- set_engine(xgboost_spec, "xgboost")

## -----------------------------------------------------------------------------
## Workflow and bootstrap resamples; what gets resampled depends on `preproc`

skip_recipe <- preproc == "no preprocessing"

if (skip_recipe) {
  # Data were already run through the recipe, so the workflow only names columns
  xgboost_workflow <- add_variables(workflow(), arr_delay, predictors = c(everything()))
  xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)

  set.seed(33)
  bt <- bootstraps(preproc_data, times = num_resamples)
} else {
  # The recipe is re-estimated inside every resample
  xgboost_workflow <- add_recipe(workflow(), flights_rec)
  xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)

  set.seed(33)
  bt <- bootstraps(flight_data, times = num_resamples)
}

## -----------------------------------------------------------------------------
## Candidate parameter grid (number of trees restricted to 100-2000)

set.seed(22)
xgboost_params <- parameters(xgboost_workflow)
xgboost_params <- update(xgboost_params, trees = trees(c(100, 2000)))
xgboost_grid <- grid_max_entropy(xgboost_params, size = num_grid)

## -----------------------------------------------------------------------------
## A parallel backend is registered only when more than one core is requested

if (num_cores > 1) registerDoMC(cores = num_cores)

## -----------------------------------------------------------------------------
## Time the grid search

roc_res <- metric_set(roc_auc)

ctrl <- control_grid(parallel_over = par_method)

grid_time <- system.time({
  set.seed(99)
  tune_grid(xgboost_workflow, bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
})

## -----------------------------------------------------------------------------
## Store the third element of the timing (elapsed) next to the run settings

times <- tibble::tibble(
  elapsed = grid_time[3],
  num_resamples = num_resamples,
  num_grid = num_grid,
  num_cores = num_cores,
  preproc = preproc,
  par_method = par_method
)

out_file <- paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))
save(times, file = out_file)

sessioninfo::session_info()

if (!interactive()) {
  q("no")
}

--------------------------------------------------------------------------------
/extras/parallel_times/resamples_05_05_with.R:
--------------------------------------------------------------------------------
library(tidymodels)
library(nycflights13)
library(doMC)
library(rlang)
library(xgboost)
library(vctrs)

## -----------------------------------------------------------------------------
## Settings that define this benchmark run

num_resamples <- 5
num_grid <- 10
num_cores <- 5
preproc <- "light preprocessing"
par_method <- "resamples"

## -----------------------------------------------------------------------------
## Modeling data: 4,000 randomly sampled flights with complete records

set.seed(123)

flight_data <-
  flights %>%
  mutate(
    # Two-level outcome: arrivals at least 30 minutes behind schedule are "late"
    arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
    # The recipe below works on the calendar date, not the date-time
    date = as.Date(time_hour)
  ) %>%
  # Keep only flights that have a matching weather record
  inner_join(weather, by = c("origin", "time_hour")) %>%
  # Columns used downstream (predictors, outcome, and ID columns)
  select(
    dep_time, flight, origin, dest, air_time, distance,
    carrier, date, arr_delay, time_hour
  ) %>%
  # Drop rows with any missing value
  na.omit() %>%
  # Character columns become factors before modeling
  mutate_if(is.character, as.factor) %>%
  sample_n(4000)

## -----------------------------------------------------------------------------
## Recipe, built one step at a time, plus a fully preprocessed copy of the data

flights_rec <- recipe(arr_delay ~ ., data = flight_data)
flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
flights_rec <- step_date(flights_rec, date, features = c("dow", "month"))
flights_rec <- step_holiday(flights_rec, date, holidays = timeDate::listHolidays("US"))
flights_rec <- step_rm(flights_rec, date)
flights_rec <- step_dummy(flights_rec, all_nominal_predictors())
flights_rec <- step_zv(flights_rec, all_predictors())

preproc_data <- juice(prep(flights_rec), all_predictors(), all_outcomes())

## -----------------------------------------------------------------------------
## Boosted tree classifier with six tunable parameters

xgboost_spec <- boost_tree(
  trees = tune(), min_n = tune(), tree_depth = tune(),
  learn_rate = tune(), loss_reduction = tune(), sample_size = tune()
)
xgboost_spec <- set_mode(xgboost_spec, "classification")
xgboost_spec <- set_engine(xgboost_spec, "xgboost")

## -----------------------------------------------------------------------------
## Workflow and bootstrap resamples; what gets resampled depends on `preproc`

skip_recipe <- preproc == "no preprocessing"

if (skip_recipe) {
  # Data were already run through the recipe, so the workflow only names columns
  xgboost_workflow <- add_variables(workflow(), arr_delay, predictors = c(everything()))
  xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)

  set.seed(33)
  bt <- bootstraps(preproc_data, times = num_resamples)
} else {
  # The recipe is re-estimated inside every resample
  xgboost_workflow <- add_recipe(workflow(), flights_rec)
  xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)

  set.seed(33)
  bt <- bootstraps(flight_data, times = num_resamples)
}

## -----------------------------------------------------------------------------
## Candidate parameter grid (number of trees restricted to 100-2000)

set.seed(22)
xgboost_params <- parameters(xgboost_workflow)
xgboost_params <- update(xgboost_params, trees = trees(c(100, 2000)))
xgboost_grid <- grid_max_entropy(xgboost_params, size = num_grid)

## -----------------------------------------------------------------------------
## A parallel backend is registered only when more than one core is requested

if (num_cores > 1) registerDoMC(cores = num_cores)

## -----------------------------------------------------------------------------
## Time the grid search

roc_res <- metric_set(roc_auc)

ctrl <- control_grid(parallel_over = par_method)

grid_time <- system.time({
  set.seed(99)
  tune_grid(xgboost_workflow, bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
})

## -----------------------------------------------------------------------------
## Store the third element of the timing (elapsed) next to the run settings

times <- tibble::tibble(
  elapsed = grid_time[3],
  num_resamples = num_resamples,
  num_grid = num_grid,
  num_cores = num_cores,
  preproc = preproc,
  par_method = par_method
)

out_file <- paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))
save(times, file = out_file)

sessioninfo::session_info()

if (!interactive()) {
  q("no")
}

--------------------------------------------------------------------------------
/extras/parallel_times/resamples_05_05_without.R:
--------------------------------------------------------------------------------
library(tidymodels)
library(nycflights13)
library(doMC)
library(rlang)
library(xgboost)
library(vctrs)

## -----------------------------------------------------------------------------
## Settings that define this benchmark run

num_resamples <- 5
num_grid <- 10
num_cores <- 5
preproc <- "no preprocessing"
par_method <- "resamples"

## -----------------------------------------------------------------------------
## Modeling data: 4,000 randomly sampled flights with complete records

set.seed(123)

flight_data <-
  flights %>%
  mutate(
    # Two-level outcome: arrivals at least 30 minutes behind schedule are "late"
    arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
    # The recipe below works on the calendar date, not the date-time
    date = as.Date(time_hour)
  ) %>%
  # Keep only flights that have a matching weather record
  inner_join(weather, by = c("origin", "time_hour")) %>%
  # Columns used downstream (predictors, outcome, and ID columns)
  select(
    dep_time, flight, origin, dest, air_time, distance,
    carrier, date, arr_delay, time_hour
  ) %>%
  # Drop rows with any missing value
  na.omit() %>%
  # Character columns become factors before modeling
  mutate_if(is.character, as.factor) %>%
  sample_n(4000)

## -----------------------------------------------------------------------------
## Recipe, built one step at a time, plus a fully preprocessed copy of the data

flights_rec <- recipe(arr_delay ~ ., data = flight_data)
flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
flights_rec <- step_date(flights_rec, date, features = c("dow", "month"))
flights_rec <- step_holiday(flights_rec, date, holidays = timeDate::listHolidays("US"))
flights_rec <- step_rm(flights_rec, date)
flights_rec <- step_dummy(flights_rec, all_nominal_predictors())
flights_rec <- step_zv(flights_rec, all_predictors())

preproc_data <- juice(prep(flights_rec), all_predictors(), all_outcomes())

## -----------------------------------------------------------------------------
## Boosted tree classifier with six tunable parameters

xgboost_spec <- boost_tree(
  trees = tune(), min_n = tune(), tree_depth = tune(),
  learn_rate = tune(), loss_reduction = tune(), sample_size = tune()
)
xgboost_spec <- set_mode(xgboost_spec, "classification")
xgboost_spec <- set_engine(xgboost_spec, "xgboost")

## -----------------------------------------------------------------------------
## Workflow and bootstrap resamples; what gets resampled depends on `preproc`

skip_recipe <- preproc == "no preprocessing"

if (skip_recipe) {
  # Data were already run through the recipe, so the workflow only names columns
  xgboost_workflow <- add_variables(workflow(), arr_delay, predictors = c(everything()))
  xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)

  set.seed(33)
  bt <- bootstraps(preproc_data, times = num_resamples)
} else {
  # The recipe is re-estimated inside every resample
  xgboost_workflow <- add_recipe(workflow(), flights_rec)
  xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)

  set.seed(33)
  bt <- bootstraps(flight_data, times = num_resamples)
}

## -----------------------------------------------------------------------------
## Candidate parameter grid (number of trees restricted to 100-2000)

set.seed(22)
xgboost_params <- parameters(xgboost_workflow)
xgboost_params <- update(xgboost_params, trees = trees(c(100, 2000)))
xgboost_grid <- grid_max_entropy(xgboost_params, size = num_grid)

## -----------------------------------------------------------------------------
## A parallel backend is registered only when more than one core is requested

if (num_cores > 1) registerDoMC(cores = num_cores)

## -----------------------------------------------------------------------------
## Time the grid search

roc_res <- metric_set(roc_auc)

ctrl <- control_grid(parallel_over = par_method)

grid_time <- system.time({
  set.seed(99)
  tune_grid(xgboost_workflow, bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
})

## -----------------------------------------------------------------------------
## Store the third element of the timing (elapsed) next to the run settings

times <- tibble::tibble(
  elapsed = grid_time[3],
  num_resamples = num_resamples,
  num_grid = num_grid,
  num_cores = num_cores,
  preproc = preproc,
  par_method = par_method
)

out_file <- paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))
save(times, file = out_file)

sessioninfo::session_info()

if (!interactive()) {
  q("no")
}

--------------------------------------------------------------------------------
/extras/parallel_times/resamples_times.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/resamples_times.RData
--------------------------------------------------------------------------------
/extras/parallel_times/runs.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# Runs each benchmark script in a fresh `R CMD BATCH --vanilla` session, in the
# fixed order listed below, pausing 20 seconds between consecutive runs.
# A failing run does not stop the ones that follow.

is_first=1

for script in \
  everything_05_03_expensive.R \
  everything_05_10_expensive.R \
  resamples_05_04_without.R \
  resamples_05_03_without.R \
  everything_05_04_without.R \
  everything_05_01_expensive.R \
  everything_05_02_with.R \
  everything_05_03_without.R \
  everything_05_15_without.R \
  everything_05_20_with.R \
  resamples_05_01_with.R \
  resamples_05_05_expensive.R \
  everything_05_15_with.R \
  everything_05_10_without.R \
  resamples_05_02_without.R \
  resamples_05_02_expensive.R \
  everything_05_20_without.R \
  resamples_05_04_with.R \
  everything_05_01_with.R \
  resamples_05_03_with.R \
  resamples_05_05_without.R \
  everything_05_10_with.R \
  resamples_05_01_without.R \
  everything_05_05_with.R \
  everything_05_05_without.R \
  everything_05_03_with.R \
  everything_05_05_expensive.R \
  resamples_05_01_expensive.R \
  everything_05_02_without.R \
  everything_05_15_expensive.R \
  everything_05_02_expensive.R \
  resamples_05_02_with.R \
  resamples_05_03_expensive.R \
  resamples_05_04_expensive.R \
  resamples_05_05_with.R \
  everything_05_20_expensive.R \
  everything_05_04_expensive.R \
  everything_05_04_with.R \
  everything_05_01_without.R
do
  # The pause goes between runs only: none before the first, none after the last
  if [ "$is_first" -eq 0 ]; then
    sleep 20
  fi
  is_first=0

  R CMD BATCH --vanilla "$script"
done
--------------------------------------------------------------------------------
/extras/parallel_times/tune_iter_times_everything.R:
--------------------------------------------------------------------------------
#remotes::install_github("tidymodels/tune@monitor-execution-times")
# This will try to write to ~/tmp
#
# Tunes a random forest on the `cells` data with a 10-worker PSOCK cluster
# (parallel_over = "everything"), then gathers the timing files found in ~/tmp
# into one data frame with a row ordering and labels suited to plotting.
library(tidymodels)
library(doParallel)
cl <- makePSOCKcluster(10)
registerDoParallel(cl)

options(width = 120)

data(cells)
cells <- cells %>% select(-case)

set.seed(6735)
folds <- vfold_cv(cells, v = 5)


cell_rec <-
  recipe(class ~ ., data = cells) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_ica(all_numeric_predictors(), num_comp = 30)

rf_mod <-
  rand_forest(mtry = tune(), min_n = tune(), trees = 50) %>%
  set_engine("ranger") %>%
  set_mode("classification")

# Timing files currently present in ~/tmp
timing_files <- function() {
  list.files("~/tmp", pattern = "^time", full.names = TRUE)
}

# Remember what is already there so that files left behind by an earlier run
# (the resamples variant of this script reads the same directory and pattern)
# are not mixed into this run's timings.
# NOTE(review): assumes the instrumented tune branch does not clean ~/tmp itself.
old_files <- timing_files()
old_mtimes <- file.mtime(old_files)

# Use a space-filling design with 7 points
set.seed(3254)
rf_res <- tryCatch(
  tune_grid(rf_mod, cell_rec, resamples = folds, grid = 7,
            control = control_grid(parallel_over = "everything")),
  # Shut the workers down even when tuning fails, and fall back to the
  # sequential backend so nothing keeps pointing at the closed cluster
  finally = {
    stopCluster(cl)
    registerDoSEQ()
  }
)


# Keep files that are new or were rewritten since the snapshot; the comparison
# is NA for files that did not exist before, and those are kept as well
f_names <- timing_files()
unchanged <- file.mtime(f_names) == old_mtimes[match(f_names, old_files)]
f_names <- f_names[is.na(unchanged) | !unchanged]

if (length(f_names) == 0) {
  stop("No timing files were written to ~/tmp during this run.")
}

# Each file is expected to hold an object called `res`; load it into a private
# environment so that restoring the file cannot overwrite anything in this script
read_timing <- function(path) {
  env <- new.env()
  load(path, envir = env)
  env$res
}

timings <- bind_rows(lapply(f_names, read_timing))


# The misspelled object name is kept as-is because other code may refer to it
everyting_times <-
  timings %>%
  mutate(
    label = ifelse(mod_iter == 0, "preprocess", "model"),
    label = factor(label, levels = rev(c("preprocess", "model"))),
    # Replace raw process ids with consecutive "worker <k>" labels
    pid = factor(format(pid)),
    pid = paste("worker", format(as.numeric(pid)))
  ) %>%
  arrange(pid, id, label)

--------------------------------------------------------------------------------
/extras/parallel_times/tune_iter_times_resamples.R:
--------------------------------------------------------------------------------
#remotes::install_github("tidymodels/tune@monitor-execution-times")
# This will try to write to ~/tmp
#
# Tunes a random forest on the `cells` data with a 10-worker PSOCK cluster
# (parallel_over = "resamples"), then gathers the timing files found in ~/tmp
# into one data frame with a row ordering and labels suited to plotting.
library(tidymodels)
library(doParallel)
cl <- makePSOCKcluster(10)
registerDoParallel(cl)

options(width = 120)

data(cells)
cells <- cells %>% select(-case)

set.seed(6735)
folds <- vfold_cv(cells, v = 5)


cell_rec <-
  recipe(class ~ ., data = cells) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_ica(all_numeric_predictors(), num_comp = 30)

rf_mod <-
  rand_forest(mtry = tune(), min_n = tune(), trees = 50) %>%
  set_engine("ranger") %>%
  set_mode("classification")

# Timing files currently present in ~/tmp
timing_files <- function() {
  list.files("~/tmp", pattern = "^time", full.names = TRUE)
}

# Remember what is already there so that files left behind by an earlier run
# (the "everything" variant of this script reads the same directory and pattern)
# are not mixed into this run's timings.
# NOTE(review): assumes the instrumented tune branch does not clean ~/tmp itself.
old_files <- timing_files()
old_mtimes <- file.mtime(old_files)

# Use a space-filling design with 7 points
set.seed(3254)
rf_res <- tryCatch(
  tune_grid(rf_mod, cell_rec, resamples = folds, grid = 7,
            control = control_grid(parallel_over = "resamples")),
  # Shut the workers down even when tuning fails, and fall back to the
  # sequential backend so nothing keeps pointing at the closed cluster
  finally = {
    stopCluster(cl)
    registerDoSEQ()
  }
)


# Keep files that are new or were rewritten since the snapshot; the comparison
# is NA for files that did not exist before, and those are kept as well
f_names <- timing_files()
unchanged <- file.mtime(f_names) == old_mtimes[match(f_names, old_files)]
f_names <- f_names[is.na(unchanged) | !unchanged]

if (length(f_names) == 0) {
  stop("No timing files were written to ~/tmp during this run.")
}

# Each file is expected to hold an object called `res`; load it into a private
# environment so that restoring the file cannot overwrite anything in this script
read_timing <- function(path) {
  env <- new.env()
  load(path, envir = env)
  env$res
}

timings <- bind_rows(lapply(f_names, read_timing))


resamples_times <-
  timings %>%
  mutate(
    label = ifelse(mod_iter == 0, "preprocess", "model"),
    label = factor(label, levels = rev(c("preprocess", "model"))),
    # Replace raw process ids with consecutive "worker <k>" labels
    pid = factor(format(pid)),
    pid = paste("worker", format(as.numeric(pid))),
    # Combined "<resample id> / <worker>" label
    id_alt = paste(id, "/", pid)
  ) %>%
  arrange(pid, id, label)


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_10_2020_10_28_17_49_51.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_10_2020_10_28_21_28_17.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_10_2020_10_28_22_43_51.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_10_2020_10_28_22_43_51.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_15_2020_10_28_20_51_49.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_15_2020_10_28_20_51_49.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_15_2020_10_28_21_23_27.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_15_2020_10_28_21_23_27.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_15_2020_10_29_00_52_42.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_15_2020_10_29_00_52_42.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_1_2020_10_28_20_29_50.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_1_2020_10_28_20_29_50.RData 
-------------------------------------------------------------------------------- /extras/parallel_times/xgb_1_2020_10_28_21_13_01.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_1_2020_10_28_21_13_01.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_1_2020_10_28_22_26_43.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_1_2020_10_28_22_26_43.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_1_2020_10_28_23_01_51.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_1_2020_10_28_23_01_51.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_1_2020_10_29_00_23_51.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_1_2020_10_29_00_23_51.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_1_2020_10_29_04_04_38.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_1_2020_10_29_04_04_38.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_20_2020_10_28_20_55_16.RData: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_20_2020_10_28_20_55_16.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_20_2020_10_28_22_01_02.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_20_2020_10_28_22_01_02.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_20_2020_10_29_03_01_36.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_20_2020_10_29_03_01_36.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_2_2020_10_28_20_41_12.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_2_2020_10_28_20_41_12.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_2_2020_10_28_21_39_20.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_2_2020_10_28_21_39_20.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_2_2020_10_28_21_57_32.RData: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_2_2020_10_28_21_57_32.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_2_2020_10_29_00_35_18.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_2_2020_10_29_00_35_18.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_2_2020_10_29_02_04_38.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_2_2020_10_29_02_04_38.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_2_2020_10_29_02_15_36.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_2_2020_10_29_02_15_36.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_3_2020_10_28_17_30_51.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_3_2020_10_28_17_30_51.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_3_2020_10_28_18_05_27.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_3_2020_10_28_18_05_27.RData 
-------------------------------------------------------------------------------- /extras/parallel_times/xgb_3_2020_10_28_20_48_31.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_3_2020_10_28_20_48_31.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_3_2020_10_28_22_34_29.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_3_2020_10_28_22_34_29.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_3_2020_10_28_23_21_46.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_3_2020_10_28_23_21_46.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_3_2020_10_29_02_28_28.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_3_2020_10_29_02_28_28.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_4_2020_10_28_17_57_37.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_4_2020_10_28_17_57_37.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_4_2020_10_28_18_11_50.RData: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_4_2020_10_28_18_11_50.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_4_2020_10_28_22_08_45.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_4_2020_10_28_22_08_45.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_4_2020_10_29_02_41_00.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_4_2020_10_29_02_41_00.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_4_2020_10_29_03_40_17.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_4_2020_10_29_03_40_17.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_4_2020_10_29_03_46_38.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_4_2020_10_29_03_46_38.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_5_2020_10_28_21_20_10.RData: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_5_2020_10_28_21_20_10.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_5_2020_10_28_22_39_01.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_5_2020_10_28_22_39_01.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_5_2020_10_28_23_08_07.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_5_2020_10_28_23_08_07.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_5_2020_10_28_23_14_28.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_5_2020_10_28_23_14_28.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_5_2020_10_28_23_54_07.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_5_2020_10_28_23_54_07.RData -------------------------------------------------------------------------------- /extras/parallel_times/xgb_5_2020_10_29_02_45_30.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_5_2020_10_29_02_45_30.RData 
# extras/sa_2d_plot.R

# Print one plot per simulated-annealing iteration, showing the search path on
# top of a pre-computed ROC AUC surface.
#
# Arguments:
#   sa_obj   - object accepted by `collect_metrics()`; its metrics must contain
#              the columns `.iter`, `cost` and `rbf_sigma`. Rows with
#              `.iter == 0` are drawn as the initial points.
#   history  - data frame with one row per evaluated candidate; the columns
#              `.iter`, `results`, `.config` and `.parent` are read here and in
#              `current_param_path()`. Any columns mapped by the plot
#              aesthetics (`rbf_sigma`, `cost`) must also be present.
#   large_sa - object accepted by `collect_metrics()` whose metrics contain
#              `cost`, `rbf_sigma` and `mean`; used for the background raster.
#   path     - kept for backward compatibility; nothing in this function reads
#              it.
#
# Returns `invisible(NULL)`; the function is called for the side effect of
# printing the plots to the active graphics device.
sa_2d_plot <- function(sa_obj, history, large_sa, path = tempdir()) {
  # Collect each set of metrics once and reuse it below.
  sa_metrics <- sa_obj %>% collect_metrics()
  surface_metrics <- large_sa %>% collect_metrics()

  # Axis limits cover both the search results and the background surface. The
  # range is extended on the transformed (log) scale and then back-transformed.
  range_data <-
    sa_metrics %>%
    select(cost, rbf_sigma) %>%
    bind_rows(
      surface_metrics %>%
        select(cost, rbf_sigma)
    ) %>%
    mutate(
      cost = log2(cost),
      rbf_sigma = log10(rbf_sigma)
    )
  x_rng <- 10^extendrange(range_data$rbf_sigma)
  y_rng <- 2^extendrange(range_data$cost)

  init <-
    sa_metrics %>%
    select(.iter, cost, rbf_sigma) %>%
    arrange(.iter) %>%
    filter(.iter == 0)

  ## -----------------------------------------------------------------------------

  large_plot <-
    surface_metrics %>%
    ggplot(aes(x = rbf_sigma, y = cost)) +
    geom_raster(aes(fill = mean), show.legend = FALSE) +
    scale_x_log10(labels = fmt_dcimals(2), limits = x_rng) +
    scale_y_continuous(trans = "log2", labels = fmt_dcimals(2), limits = y_rng) +
    scale_fill_distiller(palette = "Blues") +
    theme_minimal() +
    theme(
      legend.position = "bottom",
      legend.key.width = grid::unit(2, "cm"),
      plot.title = element_text(hjust = 0.5)
    ) +
    # NOTE(review): `title.position` is normally an argument of a `guide_*()`
    # function rather than of `guides()`. The fill legend is already hidden via
    # `show.legend = FALSE`, so this presumably has no visible effect; confirm
    # before removing.
    guides(title.position = "bottom") +
    # The trailing newlines in the x label pad the space under the axis.
    labs(x = "rbf_sigma\n\n\n\n", title = "ROC AUC surface") +
    coord_fixed(ratio = 1/2.5)

  base_plot <-
    large_plot +
    geom_point(data = init, pch = 4, cex = 4)

  ## -----------------------------------------------------------------------------

  num_init <- nrow(init)

  # `a:b` counts backwards when b < a, so bail out explicitly when `history`
  # holds nothing beyond the initial rows.
  if (nrow(history) <= num_init) {
    return(invisible(NULL))
  }

  # Subtitle text: break the two-word result labels over two lines. This does
  # not depend on the loop index, so it is computed once.
  subtitles <- gsub(" suboptimal", "\nsuboptimal", history$results)
  subtitles <- gsub(" best", "\nbest", subtitles)

  for (i in seq(num_init + 1, nrow(history))) {
    current_iter <- history$.iter[i]
    current_res <- current_param_path(history, current_iter)
    current_best <- current_res %>% dplyr::filter(results == "new best")

    ttl <- paste0("Iteration ", current_iter)

    # Horizontal position of the subtitle encodes the type of result, from
    # left (restart) to right (new best).
    text_just <-
      case_when(
        history$results[i] == "restart from best" ~ 0.00,
        history$results[i] == "discard suboptimal" ~ 0.25,
        history$results[i] == "accept suboptimal" ~ 0.50,
        history$results[i] == "better suboptimal" ~ 0.75,
        history$results[i] == "new best" ~ 1.00
      )

    new_plot <-
      base_plot +
      # Highlight the most recent point on the path.
      geom_point(
        data = current_res %>% slice(n()),
        size = 3,
        col = "green"
      ) +
      geom_path(
        data = current_res,
        alpha = .5,
        arrow = arrow(length = unit(0.1, "inches"))
      ) +
      ggtitle(ttl, subtitle = subtitles[i]) +
      theme(plot.subtitle = element_text(hjust = text_just))

    if (nrow(current_best) > 0) {
      new_plot <-
        new_plot +
        geom_point(data = current_best, size = 1/3)
    }
    print(new_plot)
  }
  invisible(NULL)
}

# Reconstruct the chain of candidates that leads to the last row of `x` at or
# before iteration `iter`.
#
# `x` must have the columns `.iter`, `.config` and `.parent`. Starting from the
# last retained row, the function repeatedly looks up the row(s) whose
# `.config` equals the current row's `.parent`, stopping when there is no match
# (a missing `.parent` matches nothing). The rows are returned ordered from the
# start of the chain to the most recent candidate.
current_param_path <- function(x, iter) {
  x <-
    x %>%
    dplyr::filter(.iter <= iter)
  ind <- nrow(x)
  param_path <- ind
  while (length(ind) > 0) {
    ind <- which(x$.config == x$.parent[ind])
    param_path <- c(param_path, ind)
  }
  # Indices were collected newest-first; reverse them for plotting as a path.
  x %>% dplyr::slice(rev(param_path))
}
# ---- extras/submodels/with_submodel_trick.R ---------------------------------
# Timing script: tunes a C5.0 boosted tree over trees = 1..100 on the `cells`
# data, first before any parallel backend is registered and then after
# registering doMC with 10 cores. tic()/toc() print the elapsed time of each
# run.

library(tidymodels)
library(tictoc)
library(doMC)

## -----------------------------------------------------------------------------

data(cells)
# Drop the `case` column so it is not used as a predictor by `class ~ .`.
cells <- cells %>% select(-case)
set.seed(33)
cell_folds <- vfold_cv(cells)
roc_res <- metric_set(roc_auc)

## -----------------------------------------------------------------------------

c5_spec <-
  boost_tree(trees = tune()) %>%
  set_engine("C5.0") %>%
  set_mode("classification")

# Run 1: no parallel backend has been registered by this script yet.
tic()
set.seed(2)
c5_spec %>%
  tune_grid(
    class ~ .,
    resamples = cell_folds,
    grid = data.frame(trees = 1:100),
    metrics = roc_res
  )
toc()

## -----------------------------------------------------------------------------

registerDoMC(cores = 10)

# Run 2: identical call and seed, now with the doMC backend registered.
tic()
set.seed(2)
c5_spec %>%
  tune_grid(
    class ~ .,
    resamples = cell_folds,
    grid = data.frame(trees = 1:100),
    metrics = roc_res
  )
toc()

## -----------------------------------------------------------------------------

# Record package versions alongside the timings, then exit without saving the
# workspace.
sessioninfo::session_info()

q("no")


# ---- extras/submodels/without_submodel_trick.R ------------------------------
# Same timing script as above. NOTE(review): the install line below suggests
# this variant is meant to be run against a parsnip branch with the submodel
# optimization disabled -- confirm before comparing timings.

# remotes::install_github("tidymodels/parsnip@no-submodel-trick")
library(tidymodels)
library(tictoc)
library(doMC)

## -----------------------------------------------------------------------------

data(cells)
# Drop the `case` column so it is not used as a predictor by `class ~ .`.
cells <- cells %>% select(-case)
set.seed(33)
cell_folds <- vfold_cv(cells)
roc_res <- metric_set(roc_auc)

## -----------------------------------------------------------------------------

c5_spec <-
  boost_tree(trees = tune()) %>%
  set_engine("C5.0") %>%
  set_mode("classification")

# Run 1: no parallel backend has been registered by this script yet.
tic()
set.seed(2)
c5_spec %>%
  tune_grid(
    class ~ .,
    resamples = cell_folds,
    grid = data.frame(trees = 1:100),
    metrics = roc_res
  )
toc()

## -----------------------------------------------------------------------------

registerDoMC(cores = 10)

# Run 2: identical call and seed, now with the doMC backend registered.
tic()
set.seed(2)
c5_spec %>%
  tune_grid(
    class ~ .,
    resamples = cell_folds,
    grid = data.frame(trees = 1:100),
    metrics = roc_res
  )
toc()

## -----------------------------------------------------------------------------

# Record package versions alongside the timings, then exit without saving the
# workspace.
sessioninfo::session_info()

q("no")


# ---- extras/verify_results.R ------------------------------------------------
# These functions make sure that our results have not changed so that the interpretation
# in the text is not incorrect.
# Shared implementation of the verify_consistent_*() checks.
#
# Loads `file` into a private environment, fetches the object named `object`
# from it, and compares it to `x` with `all.equal()` using a tolerance of 0.01.
# Aborts with the list of differences when the comparison is not TRUE.
#
# Arguments:
#   x      - freshly computed results to check.
#   file   - path to the .RData file holding the reference results.
#   object - name of the reference object stored in `file`.
#   label  - human-readable name of the results, used in the error message.
#
# Returns `invisible(NULL)` when the results match.
.verify_consistent_results <- function(x, file, object, label) {
  # Load into a throwaway environment so nothing else stored in the file can
  # shadow local variables, and so a missing object is reported directly
  # instead of being looked up in the enclosing environments.
  stored <- new.env(parent = emptyenv())
  load(file, envir = stored)
  if (!exists(object, envir = stored, inherits = FALSE)) {
    rlang::abort(paste0("Object `", object, "` was not found in '", file, "'."))
  }

  check <- all.equal(x, get(object, envir = stored), tolerance = 0.01)
  if (!isTRUE(check)) {
    # all.equal() returns a character vector describing each difference.
    msg <- paste0("These ", label, " results don't match the previous values:\n")
    msg <- paste0(msg, paste0(check, collapse = "\n"))
    rlang::abort(msg)
  }
  invisible(NULL)
}

# Check Bayesian optimization results against the stored reference values.
# Aborts if `x` differs from `svm_bo_metrics` beyond the tolerance; otherwise
# returns `invisible(NULL)`.
verify_consistent_bo <- function(x) {
  # initial results generated on 2022-02-16
  .verify_consistent_results(
    x,
    file = "RData/svm_bo_metrics.RData",
    object = "svm_bo_metrics",
    label = "Bayesian optimization"
  )
}


# Check simulated annealing results against the stored reference values.
# Aborts if `x` differs from `svm_sa_metrics` beyond the tolerance; otherwise
# returns `invisible(NULL)`.
verify_consistent_sa <- function(x) {
  # initial results generated on 2022-02-16
  .verify_consistent_results(
    x,
    file = "RData/svm_sa_metrics.RData",
    object = "svm_sa_metrics",
    label = "simulated annealing"
  )
}
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/figures/introduction-modeling-process-1.pdf -------------------------------------------------------------------------------- /figures/tidyverse-cricket-plot-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/figures/tidyverse-cricket-plot-1.pdf -------------------------------------------------------------------------------- /figures/tidyverse-interaction-plots-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/figures/tidyverse-interaction-plots-1.pdf -------------------------------------------------------------------------------- /images/cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/images/cover.png -------------------------------------------------------------------------------- /images/error.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/images/error.png -------------------------------------------------------------------------------- /images/note.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/images/note.png -------------------------------------------------------------------------------- /images/robot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/images/robot.png -------------------------------------------------------------------------------- /images/rstudio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/images/rstudio.png -------------------------------------------------------------------------------- /images/tip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/images/tip.png -------------------------------------------------------------------------------- /images/warning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/images/warning.png -------------------------------------------------------------------------------- /issue_template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report or feature request 3 | about: Describe a bug you've seen or make a suggestion 4 | --- 5 | 6 | # PLEASE READ: Making a new issue for _Tidy Modeling with R_ 7 | 8 | 9 | Thanks for giving us feedback on the book. Please follow these guidelines: 10 | 11 | Please state the version of the book by referencing the **version** and **date** on the book's first page (under the title and authors). 12 | 13 | 14 | ## Comments or questions on the content 15 | 16 | Please tell us where exactly the issue is (e.g. chapter, section, figure number, etc.) and, whenever possible, copy/paste the text in question using `>` in the issue. 17 | For example: 18 | 19 | > In 0.0.1.9000 (2020-03-05), capitalization in article/book names is inconsistent.
Also, sometimes initials are used for names, sometimes not: 20 | 21 | > Agresti, Alan. 2012. Categorical Data Analysis. Wiley-Interscience. 22 | > Altman, D. 1991. “Categorising Continuous Variables.” British Journal of Cancer, no. 5:975. 23 | 24 | ## Potential Bugs 25 | 26 | Since the repo and data are publicly available, it should be entirely possible to create a minimal reprex (reproducible example). 27 | 28 | The goal of a reprex is to make it as easy as possible for me to recreate your problem so that I can fix it: please help me help you! 29 | 30 | If you've never heard of a reprex before, start by reading "[What is a reprex](https://github.com/tidyverse/reprex#what-is-a-reprex)", and follow the advice further down that page. 31 | 32 | 33 | ## Contributions 34 | 35 | These details are in `contributing.md` -------------------------------------------------------------------------------- /latex_extras/preamble.tex: -------------------------------------------------------------------------------- 1 | % Begin preamble.tex ----------------------------------------------------------- 2 | 3 | % ------------------------------------------------------------------------------ 4 | % size based on "Statistical Rethinking" style files 5 | 6 | \usepackage[paperwidth=7.67in,paperheight=10.67in,layoutwidth=7in,layoutheight=10in,text={5.5in,8.5in},left=0.65in,top=0.75in,headheight=0.25in,headsep=0.4in,footskip=0.4in,showcrop, layouthoffset=0.33in, layoutvoffset=0.33in]{geometry} 7 | 8 | % ------------------------------------------------------------------------------ 9 | % headers 10 | 11 | \usepackage{fancyhdr} 12 | 13 | \pagestyle{fancy} 14 | \fancyhf{} 15 | \fancyhead[CO]{\nouppercase{\emph{\rightmark}}} 16 | \fancyhead[CE]{\nouppercase{\emph{\leftmark}}} 17 | \fancyhead[RO]{\thepage} 18 | \fancyhead[LE]{\thepage} 19 | % no line in header or footer 20 | \renewcommand{\headrulewidth}{0pt} 21 | 22 | % Code chunk mods ------------------------------------------------------------- 
% Redefine the Highlighting environment (used for highlighted code chunks) as a
% Verbatim environment set in \small, with \, { and } kept as command
% characters so the highlighting macros inside it still work.
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\},fontsize=\small}
% Typeset code comments in typewriter font using a mid-gray color.
\renewcommand{\CommentTok}[1]{\textcolor[rgb]{0.41,0.41,0.41}{\texttt{#1}}}

% Render custom blocks ---------------------------------------------------------
% See https://github.com/rstudio/bookdown/issues/420 ---------------------------

% kframe: a framed environment whose contents are set on a `shadecolor`
% background via \colorbox and \MakeFramed. When it is started in inner
% horizontal mode the contents are additionally wrapped in a minipage of width
% \columnwidth, which is closed again at the end of the environment.
\makeatletter
\newenvironment{kframe}{%
\medskip{}
\setlength{\fboxsep}{.8em}
 \def\at@end@of@kframe{}%
 \ifinner\ifhmode%
  \def\at@end@of@kframe{\end{minipage}}%
  \begin{minipage}{\columnwidth}%
 \fi\fi%
 \def\FrameCommand##1{\hskip\@totalleftmargin \hskip-\fboxsep
 \colorbox{shadecolor}{##1}\hskip-\fboxsep
     % There is no \\@totalrightmargin, so:
     \hskip-\linewidth \hskip-\@totalleftmargin \hskip\columnwidth}%
 \MakeFramed {\advance\hsize-\width
   \@totalleftmargin\z@ \linewidth\hsize
   \@setminipage}}%
 {\par\unskip\endMakeFramed%
 \at@end@of@kframe}
\makeatother

% If a Shaded environment is already defined, replace it with kframe; if it is
% not defined, do nothing.
\makeatletter
\@ifundefined{Shaded}{
}{\renewenvironment{Shaded}{\begin{kframe}}{\end{kframe}}}
\makeatother

% rmdblock{<name>}: a one-item list whose bullet is the image images/<name>,
% scaled to a width of 3em, with the block contents set inside a kframe.
\newenvironment{rmdblock}[1]
  {
  \begin{itemize}
  \renewcommand{\labelitemi}{
    \raisebox{-.7\height}[0pt][0pt]{
      {\setkeys{Gin}{width=3em,keepaspectratio}\includegraphics{images/#1}}
    }
  }
  \setlength{\fboxsep}{1em}
  \begin{kframe}
  \item
  }
  {
  \end{kframe}
  \end{itemize}
  }
% Named wrappers around rmdblock; each passes its own name as the image name.
% NOTE(review): rmdcaution and rmdimportant look for images/caution and
% images/important. Confirm those image files exist before using these two
% environments.
\newenvironment{rmdnote}
  {\begin{rmdblock}{note}}
  {\end{rmdblock}}
\newenvironment{rmdcaution}
  {\begin{rmdblock}{caution}}
  {\end{rmdblock}}
\newenvironment{rmdimportant}
  {\begin{rmdblock}{important}}
  {\end{rmdblock}}
\newenvironment{rmdtip}
  {\begin{rmdblock}{tip}}
  {\end{rmdblock}}
\newenvironment{rmdwarning}
  {\begin{rmdblock}{warning}}
  {\end{rmdblock}}


% End preamble.tex
------------------------------------------------------------- 88 | -------------------------------------------------------------------------------- /premade/addin.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/addin.gif -------------------------------------------------------------------------------- /premade/ames.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/ames.png -------------------------------------------------------------------------------- /premade/bad-workflow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/bad-workflow.pdf -------------------------------------------------------------------------------- /premade/bootstraps.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/bootstraps.pdf -------------------------------------------------------------------------------- /premade/crawford.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/crawford.png -------------------------------------------------------------------------------- /premade/data-science-model.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/data-science-model.graffle -------------------------------------------------------------------------------- /premade/data-science-model.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/data-science-model.pdf -------------------------------------------------------------------------------- /premade/data-science-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/data-science-model.png -------------------------------------------------------------------------------- /premade/dot_rr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/dot_rr.png -------------------------------------------------------------------------------- /premade/exp_improve.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/exp_improve.gif -------------------------------------------------------------------------------- /premade/good-proper-workflows.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/good-proper-workflows.graffle -------------------------------------------------------------------------------- /premade/mitchell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/mitchell.png -------------------------------------------------------------------------------- /premade/modeling-process.graffle: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/modeling-process.graffle -------------------------------------------------------------------------------- /premade/modeling-process.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/modeling-process.pdf -------------------------------------------------------------------------------- /premade/modeling-process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/modeling-process.png -------------------------------------------------------------------------------- /premade/morphology.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/morphology.png -------------------------------------------------------------------------------- /premade/northridge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/northridge.png -------------------------------------------------------------------------------- /premade/proper-workflow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/proper-workflow.pdf -------------------------------------------------------------------------------- /premade/recipes-process.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/recipes-process.graffle 
-------------------------------------------------------------------------------- /premade/recipes-process.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/recipes-process.pdf -------------------------------------------------------------------------------- /premade/resampling-details.graffle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/resampling-details.graffle -------------------------------------------------------------------------------- /premade/resampling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/resampling.pdf -------------------------------------------------------------------------------- /premade/roc_surface.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/roc_surface.png -------------------------------------------------------------------------------- /premade/rolling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/rolling.pdf -------------------------------------------------------------------------------- /premade/three-CV-iter.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/three-CV-iter.pdf -------------------------------------------------------------------------------- /premade/three-CV.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/three-CV.pdf -------------------------------------------------------------------------------- /premade/timberland.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/timberland.png -------------------------------------------------------------------------------- /premade/validation-alt.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/validation-alt.pdf -------------------------------------------------------------------------------- /premade/validation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/validation.pdf -------------------------------------------------------------------------------- /race_results.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/race_results.mp4 -------------------------------------------------------------------------------- /references.Rmd: -------------------------------------------------------------------------------- 1 | # REFERENCES {-} 2 | 3 | -------------------------------------------------------------------------------- /sa_search.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/sa_search.mp4 -------------------------------------------------------------------------------- /style.css: 
-------------------------------------------------------------------------------- 1 | 2 | .rmdnote, .rmdtip, .rmdwarning { 3 | padding: 1em 1em 1em 4em; 4 | margin-bottom: 10px; 5 | background: #f5f5f5 5px center/3em no-repeat; 6 | } 7 | 8 | .rmdnote { 9 | background-image: url("images/note.png"); 10 | } 11 | .rmdtip { 12 | background-image: url("images/tip.png"); 13 | } 14 | .rmdwarning { 15 | background-image: url("images/warning.png"); 16 | } 17 | 18 | --------------------------------------------------------------------------------