├── .Rbuildignore
├── .gitattributes
├── .github
    ├── .gitignore
    └── workflows
    │   ├── bookdown.yaml
    │   └── lock.yaml
├── .gitignore
├── 01-software-modeling.Rmd
├── 02-tidyverse.Rmd
├── 03-base-r.Rmd
├── 04-ames.Rmd
├── 05-data-spending.Rmd
├── 06-fitting-models.Rmd
├── 07-the-model-workflow.Rmd
├── 08-feature-engineering.Rmd
├── 09-judging-model-effectiveness.Rmd
├── 10-resampling.Rmd
├── 11-comparing-models.Rmd
├── 12-tuning-parameters.Rmd
├── 13-grid-search.Rmd
├── 14-iterative-search.Rmd
├── 15-workflow-sets.Rmd
├── 16-dimensionality-reduction.Rmd
├── 17-encoding-categorical-data.Rmd
├── 18-explaining-models-and-predictions.Rmd
├── 19-when-should-you-trust-predictions.Rmd
├── 20-ensemble-models.Rmd
├── 21-inferential-analysis.Rmd
├── DESCRIPTION
├── LICENSE.md
├── RData
    ├── Chicago_2020.RData
    ├── bears_at_home.RData
    ├── concrete_mixtures.RData
    ├── concrete_results.RData
    ├── dry_beans.RData
    ├── lm_fit.RData
    ├── mlp_times.RData
    ├── plm_resids.RData
    ├── post_intervals.RData
    ├── rda_fit.RData
    ├── resampling.RData
    ├── sa_history.RData
    ├── search_examples.RData
    ├── svm_bo_metrics.RData
    ├── svm_large.RData
    └── svm_sa_metrics.RData
├── README.Rmd
├── README.md
├── TMwR.Rproj
├── TMwR.bib
├── TMwR.css
├── _bookdown.yml
├── _common.R
├── _output.yaml
├── ames_snippets.R
├── bo_search.mp4
├── chi.csv
├── code_of_conduct.md
├── contributing.md
├── contributors.csv
├── displaced.Rmd
├── extras
    ├── affy_plm_ggplot.R
    ├── ames_posterior_intervals.R
    ├── ames_sf.R
    ├── bo_3panel_plot.R
    ├── cells_svm_large.R
    ├── cells_svm_large.Rout
    ├── dry_beans.R
    ├── nonlinear_function.R
    ├── parallel_times
    │   ├── collect.R
    │   ├── everything_05_01_expensive.R
    │   ├── everything_05_01_expensive.Rout
    │   ├── everything_05_01_with.R
    │   ├── everything_05_01_with.Rout
    │   ├── everything_05_01_without.R
    │   ├── everything_05_01_without.Rout
    │   ├── everything_05_02_expensive.R
    │   ├── everything_05_02_expensive.Rout
    │   ├── everything_05_02_with.R
    │   ├── everything_05_02_with.Rout
    │   ├── everything_05_02_without.R
    │   ├── everything_05_02_without.Rout
    │   ├── everything_05_03_expensive.R
    │   ├── everything_05_03_expensive.Rout
    │   ├── everything_05_03_with.R
    │   ├── everything_05_03_with.Rout
    │   ├── everything_05_03_without.R
    │   ├── everything_05_03_without.Rout
    │   ├── everything_05_04_expensive.R
    │   ├── everything_05_04_expensive.Rout
    │   ├── everything_05_04_with.R
    │   ├── everything_05_04_with.Rout
    │   ├── everything_05_04_without.R
    │   ├── everything_05_04_without.Rout
    │   ├── everything_05_05_expensive.R
    │   ├── everything_05_05_expensive.Rout
    │   ├── everything_05_05_with.R
    │   ├── everything_05_05_with.Rout
    │   ├── everything_05_05_without.R
    │   ├── everything_05_05_without.Rout
    │   ├── everything_05_10_expensive.R
    │   ├── everything_05_10_expensive.Rout
    │   ├── everything_05_10_with.R
    │   ├── everything_05_10_with.Rout
    │   ├── everything_05_10_without.R
    │   ├── everything_05_10_without.Rout
    │   ├── everything_05_15_expensive.R
    │   ├── everything_05_15_expensive.Rout
    │   ├── everything_05_15_with.R
    │   ├── everything_05_15_with.Rout
    │   ├── everything_05_15_without.R
    │   ├── everything_05_15_without.Rout
    │   ├── everything_05_20_expensive.R
    │   ├── everything_05_20_expensive.Rout
    │   ├── everything_05_20_with.R
    │   ├── everything_05_20_with.Rout
    │   ├── everything_05_20_without.R
    │   ├── everything_05_20_without.Rout
    │   ├── everything_times.RData
    │   ├── resamples_05_01_expensive.R
    │   ├── resamples_05_01_expensive.Rout
    │   ├── resamples_05_01_with.R
    │   ├── resamples_05_01_with.Rout
    │   ├── resamples_05_01_without.R
    │   ├── resamples_05_01_without.Rout
    │   ├── resamples_05_02_expensive.R
    │   ├── resamples_05_02_expensive.Rout
    │   ├── resamples_05_02_with.R
    │   ├── resamples_05_02_with.Rout
    │   ├── resamples_05_02_without.R
    │   ├── resamples_05_02_without.Rout
    │   ├── resamples_05_03_expensive.R
    │   ├── resamples_05_03_expensive.Rout
    │   ├── resamples_05_03_with.R
    │   ├── resamples_05_03_with.Rout
    │   ├── resamples_05_03_without.R
    │   ├── resamples_05_03_without.Rout
    │   ├── resamples_05_04_expensive.R
    │   ├── resamples_05_04_expensive.Rout
    │   ├── resamples_05_04_with.R
    │   ├── resamples_05_04_with.Rout
    │   ├── resamples_05_04_without.R
    │   ├── resamples_05_04_without.Rout
    │   ├── resamples_05_05_expensive.R
    │   ├── resamples_05_05_expensive.Rout
    │   ├── resamples_05_05_with.R
    │   ├── resamples_05_05_with.Rout
    │   ├── resamples_05_05_without.R
    │   ├── resamples_05_05_without.Rout
    │   ├── resamples_times.RData
    │   ├── runs.sh
    │   ├── tune_iter_times_everything.R
    │   ├── tune_iter_times_resamples.R
    │   ├── xgb_10_2020_10_28_17_49_51.RData
    │   ├── xgb_10_2020_10_28_21_28_17.RData
    │   ├── xgb_10_2020_10_28_22_43_51.RData
    │   ├── xgb_15_2020_10_28_20_51_49.RData
    │   ├── xgb_15_2020_10_28_21_23_27.RData
    │   ├── xgb_15_2020_10_29_00_52_42.RData
    │   ├── xgb_1_2020_10_28_20_29_50.RData
    │   ├── xgb_1_2020_10_28_21_13_01.RData
    │   ├── xgb_1_2020_10_28_22_26_43.RData
    │   ├── xgb_1_2020_10_28_23_01_51.RData
    │   ├── xgb_1_2020_10_29_00_23_51.RData
    │   ├── xgb_1_2020_10_29_04_04_38.RData
    │   ├── xgb_20_2020_10_28_20_55_16.RData
    │   ├── xgb_20_2020_10_28_22_01_02.RData
    │   ├── xgb_20_2020_10_29_03_01_36.RData
    │   ├── xgb_2_2020_10_28_20_41_12.RData
    │   ├── xgb_2_2020_10_28_21_39_20.RData
    │   ├── xgb_2_2020_10_28_21_57_32.RData
    │   ├── xgb_2_2020_10_29_00_35_18.RData
    │   ├── xgb_2_2020_10_29_02_04_38.RData
    │   ├── xgb_2_2020_10_29_02_15_36.RData
    │   ├── xgb_3_2020_10_28_17_30_51.RData
    │   ├── xgb_3_2020_10_28_18_05_27.RData
    │   ├── xgb_3_2020_10_28_20_48_31.RData
    │   ├── xgb_3_2020_10_28_22_34_29.RData
    │   ├── xgb_3_2020_10_28_23_21_46.RData
    │   ├── xgb_3_2020_10_29_02_28_28.RData
    │   ├── xgb_4_2020_10_28_17_57_37.RData
    │   ├── xgb_4_2020_10_28_18_11_50.RData
    │   ├── xgb_4_2020_10_28_22_08_45.RData
    │   ├── xgb_4_2020_10_29_02_41_00.RData
    │   ├── xgb_4_2020_10_29_03_40_17.RData
    │   ├── xgb_4_2020_10_29_03_46_38.RData
    │   ├── xgb_5_2020_10_28_21_20_10.RData
    │   ├── xgb_5_2020_10_28_22_39_01.RData
    │   ├── xgb_5_2020_10_28_23_08_07.RData
    │   ├── xgb_5_2020_10_28_23_14_28.RData
    │   ├── xgb_5_2020_10_28_23_54_07.RData
    │   ├── xgb_5_2020_10_29_02_45_30.RData
    │   └── xgb_times.RData
    ├── sa_2d_plot.R
    ├── submodels
    │   ├── with_submodel_trick.R
    │   ├── with_submodel_trick.Rout
    │   ├── without_submodel_trick.R
    │   └── without_submodel_trick.Rout
    └── verify_results.R
├── figures
    ├── .DS_Store
    ├── introduction-cricket-plot-1.svg
    ├── introduction-descr-examples-1.pdf
    ├── introduction-descr-examples-1.png
    ├── introduction-interaction-plots-1.svg
    ├── introduction-modeling-process-1.pdf
    ├── introduction-modeling-process-1.svg
    ├── tidyverse-cricket-plot-1.pdf
    ├── tidyverse-cricket-plot-1.svg
    ├── tidyverse-interaction-plots-1.pdf
    └── tidyverse-interaction-plots-1.svg
├── images
    ├── cover.png
    ├── error.png
    ├── note.png
    ├── robot.png
    ├── rstudio.png
    ├── tip.png
    └── warning.png
├── index.Rmd
├── issue_template.md
├── latex_extras
    └── preamble.tex
├── pre-proc-table.Rmd
├── premade
    ├── addin.gif
    ├── ames.png
    ├── bad-workflow.pdf
    ├── bad-workflow.svg
    ├── bootstraps.pdf
    ├── bootstraps.svg
    ├── crawford.png
    ├── data-science-model.graffle
    ├── data-science-model.pdf
    ├── data-science-model.png
    ├── data-science-model.svg
    ├── dot_rr.png
    ├── exp_improve.gif
    ├── good-proper-workflows.graffle
    ├── mitchell.png
    ├── modeling-process.graffle
    ├── modeling-process.pdf
    ├── modeling-process.png
    ├── modeling-process.svg
    ├── morphology.png
    ├── morphology.svg
    ├── northridge.png
    ├── proper-workflow.pdf
    ├── proper-workflow.svg
    ├── recipes-process.graffle
    ├── recipes-process.pdf
    ├── recipes-process.svg
    ├── resampling-details.graffle
    ├── resampling.pdf
    ├── resampling.svg
    ├── roc_surface.png
    ├── rolling.pdf
    ├── rolling.svg
    ├── three-CV-iter.pdf
    ├── three-CV-iter.svg
    ├── three-CV.pdf
    ├── three-CV.svg
    ├── timberland.png
    ├── validation-alt.pdf
    ├── validation-alt.svg
    ├── validation.pdf
    └── validation.svg
├── race_results.mp4
├── references.Rmd
├── sa_search.mp4
└── style.css


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | ^README\.Rmd$
4 | ^CODE_OF_CONDUCT\.md$
5 | ^\.github$
6 | ^LICENSE\.md$
7 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.Rmd linguist-detectable
2 | 


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/.github/workflows/bookdown.yaml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     branches:
 4 |       - main
 5 |   pull_request:
 6 |     branches:
 7 |       - main
 8 |   workflow_dispatch:    
 9 | 
10 | name: bookdown
11 | 
12 | env:
13 |   isExtPR: ${{ github.event.pull_request.head.repo.fork == true }}
14 | 
15 | jobs:
16 |   build:
17 |     runs-on: ubuntu-latest
18 |     env:
19 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
20 |     steps:
21 |       - uses: actions/checkout@v2
22 | 
23 |       - uses: r-lib/actions/setup-pandoc@v2
24 |       
25 |       - name: Install system dependencies
26 |         run: |
27 |           sudo apt-get update -qq
28 |           sudo apt-get install -y ffmpeg libavfilter-dev
29 | 
30 |       - uses: r-lib/actions/setup-r@v2
31 |         with:
32 |           use-public-rspm: true
33 | 
34 |       - uses: r-lib/actions/setup-r-dependencies@v2
35 | 
36 |       - name: Build site
37 |         run: Rscript -e 'bookdown::render_book("index.Rmd", quiet = TRUE)'
38 | 
39 |       - name: Deploy to Netlify
40 |         if: contains(env.isExtPR, 'false')
41 |         id: netlify-deploy
42 |         uses: nwtgck/actions-netlify@v1.1
43 |         with:
44 |           publish-dir: './_book'
45 |           production-branch: main
46 |           github-token: ${{ secrets.GITHUB_TOKEN }}
47 |           deploy-message:
48 |             'Deploy from GHA: ${{ github.event.pull_request.title || github.event.head_commit.message }} (${{ github.sha }})'
49 |           enable-pull-request-comment: false
50 |           enable-commit-comment: false
51 |         env:
52 |           NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
53 |           NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
54 |         timeout-minutes: 1
55 | 


--------------------------------------------------------------------------------
/.github/workflows/lock.yaml:
--------------------------------------------------------------------------------
 1 | name: 'Lock Threads'
 2 | 
 3 | on:
 4 |   schedule:
 5 |     - cron: '0 0 * * *'
 6 | 
 7 | jobs:
 8 |   lock:
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       - uses: dessant/lock-threads@v2
12 |         with:
13 |           github-token: ${{ github.token }}
14 |           issue-lock-inactive-days: '14'
15 | #          issue-exclude-labels: ''
16 | #          issue-lock-labels: 'outdated'
17 |           issue-lock-comment: >
18 |             This issue has been automatically locked. If you believe you have
19 |             found a related problem, please file a new issue (with a reprex:
20 |             <https://reprex.tidyverse.org>) and link to this issue.
21 |           issue-lock-reason: ''
22 |           pr-lock-inactive-days: '14'
23 | #          pr-exclude-labels: 'wip'
24 |           pr-lock-labels: ''
25 |           pr-lock-comment: >
26 |             This pull request has been automatically locked. If you believe you
27 |             have found a related problem, please file a new issue (with a reprex:
28 |             <https://reprex.tidyverse.org>) and link to this issue.
29 |           pr-lock-reason: ''
30 | #          process-only: 'issues'
31 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .Rproj.user
 2 | .Rhistory
 3 | .RData
 4 | .Ruserdata
 5 | .DS_Store
 6 | _book
 7 | _main.*
 8 | libs
 9 | figures
10 | _bookdown_files
11 | figures/introduction-cricket-plot-1.svg
12 | figures/introduction-descr-examples-1.pdf
13 | figures/introduction-interaction-plots-1.svg
14 | figures/introduction-modeling-process-1.pdf
15 | figures/tidyverse-cricket-plot-1.pdf
16 | figures/tidyverse-cricket-plot-1.svg
17 | figures/tidyverse-interaction-plots-1.pdf
18 | figures/tidyverse-interaction-plots-1.svg
19 | extras/iowa_highway.shx
20 | extras/iowa_highway.shp
21 | files_for_print*
22 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: TMwR
 2 | Title: Tidy Modeling with R.
 3 | Version: 1.0.0
 4 | Authors@R: c(
 5 |     person("Max", "Kuhn", , "max@rstudio.com", role = c("aut", "cre"),
 6 |            comment = c(ORCID = "0000-0003-2402-136X")),
 7 |     person("Julia", "Silge", , "julia.silge@rstudio.com", role = "aut",
 8 |            comment = c(ORCID = "0000-0002-3671-836X"))
 9 |   )
10 | License: CC BY-NC-SA 4.0
11 | URL: https://github.com/tidymodels/TMwR, https://www.tmwr.org/
12 | Depends: 
13 |     R (>= 4.0.0)
14 | Imports:
15 |     applicable,
16 |     av,
17 |     baguette,
18 |     beans,
19 |     bestNormalize,
20 |     bookdown,
21 |     broom,
22 |     censored,
23 |     corrplot,
24 |     corrr,
25 |     Cubist,
26 |     DALEXtra,
27 |     dials (>= 0.0.9),
28 |     dimRed,
29 |     discrim,
30 |     doMC,
31 |     dplyr,
32 |     earth,
33 |     embed (>= 0.1.5),
34 |     fastICA,
35 |     finetune (>= 0.1.1),
36 |     forcats,
37 |     ggforce,
38 |     ggplot2,
39 |     glmnet,
40 |     gridExtra,
41 |     infer,
42 |     kableExtra (>= 1.2.1),
43 |     kernlab,
44 |     kknn,
45 |     klaR,
46 |     knitr,
47 |     learntidymodels,
48 |     lime,
49 |     lme4,
50 |     lubridate,
51 |     mda,
52 |     mixOmics,
53 |     modeldata,
54 |     multilevelmod,
55 |     nlme (>= 3.1-157),
56 |     nnet,
57 |     parsnip,
58 |     patchwork,
59 |     pillar (>= 1.6.6),
60 |     poissonreg,
61 |     prettyunits,
62 |     probably,
63 |     pscl,
64 |     purrr,
65 |     ranger,
66 |     recipes (>= 1.0.8),
67 |     rlang,
68 |     rmarkdown,
69 |     rpart,
70 |     rsample (>= 1.2.0),
71 |     rstanarm,
72 |     rules,
73 |     sessioninfo,
74 |     stacks (>= 0.2.1),
75 |     stringr,
76 |     svglite,
77 |     text2vec,
78 |     textrecipes,
79 |     themis,
80 |     tibble (>= 3.1.0),
81 |     tidymodels (>= 1.1.0),
82 |     tidyposterior (>= 0.0.3),
83 |     tidyverse,
84 |     tune (>= 0.1.3),
85 |     uwot,
86 |     workflows (>= 0.2.2),
87 |     workflowsets (>= 0.0.1),
88 |     xgboost,
89 |     yardstick
90 | Remotes:
91 |     tidymodels/learntidymodels
92 | biocViews: mixOmics
93 | Encoding: UTF-8
94 | SystemRequirements: FFmpeg (>= 3.2); with at least libx264 and lame (mp3)
95 |     drivers. Debian/Ubuntu: libavfilter-dev, Fedora/CentOS: ffmpeg-devel
96 |     (via https://rpmfusion.org), MacOS Homebrew: ffmpeg.
97 | 


--------------------------------------------------------------------------------
/RData/Chicago_2020.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/Chicago_2020.RData


--------------------------------------------------------------------------------
/RData/bears_at_home.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/bears_at_home.RData


--------------------------------------------------------------------------------
/RData/concrete_mixtures.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/concrete_mixtures.RData


--------------------------------------------------------------------------------
/RData/concrete_results.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/concrete_results.RData


--------------------------------------------------------------------------------
/RData/dry_beans.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/dry_beans.RData


--------------------------------------------------------------------------------
/RData/lm_fit.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/lm_fit.RData


--------------------------------------------------------------------------------
/RData/mlp_times.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/mlp_times.RData


--------------------------------------------------------------------------------
/RData/plm_resids.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/plm_resids.RData


--------------------------------------------------------------------------------
/RData/post_intervals.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/post_intervals.RData


--------------------------------------------------------------------------------
/RData/rda_fit.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/rda_fit.RData


--------------------------------------------------------------------------------
/RData/resampling.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/resampling.RData


--------------------------------------------------------------------------------
/RData/sa_history.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/sa_history.RData


--------------------------------------------------------------------------------
/RData/search_examples.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/search_examples.RData


--------------------------------------------------------------------------------
/RData/svm_bo_metrics.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/svm_bo_metrics.RData


--------------------------------------------------------------------------------
/RData/svm_large.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/svm_large.RData


--------------------------------------------------------------------------------
/RData/svm_sa_metrics.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/RData/svm_sa_metrics.RData


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
 1 | # TMwR
 2 | 
 3 | [![Build Status](https://github.com/tidymodels/TMwR/workflows/bookdown/badge.svg)](https://github.com/tidymodels/TMwR/actions)
 4 | 
 5 | ```{r, include = FALSE}
 6 | knitr::opts_chunk$set(
 7 |   collapse = TRUE,
 8 |   comment = "#>",
 9 |   out.width = "100%"
10 | )
11 | ```
12 | 
13 | 
14 | This repository contains the source for [_Tidy Modeling with R_](https://tmwr.org). The purpose of this book is to demonstrate how the [tidyverse](https://www.tidyverse.org/) and [tidymodels](https://www.tidymodels.org/) can be used to produce high quality models.
15 | 
16 | # Reproducing the book or results
17 | 
18 | First, you'll need to install the required packages. To do this, first install the `remotes` package:
19 | 
20 | ``` r
21 | install.packages("remotes")
22 | ```
23 | 
24 | Then use this to install what you need to create the book: 
25 | 
26 | ``` r
27 | remotes::install_github("tidymodels/TMwR")
28 | ```
29 | 
30 | Although we rigorously try to use the current CRAN versions of all packages, the code above may install some development versions. 
31 | 
32 | The content is created using the `bookdown` package. To compile the book, use:
33 | 
34 | ```r
35 | bookdown::render_book("index.Rmd", "bookdown::gitbook")
36 | ```
37 | 
38 | This will create the HTML files in a directory called `_book`. Although we are in the process of publishing a print version of this work with O'Reilly, we do _not_ currently support building to a PDF version.
39 | 
40 | 
41 | # Contributing
42 | 
43 | Please note that this work is written under a [Contributor Code of Conduct](code_of_conduct.md) and the online version is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-nc-sa/4.0/). By participating in this project (for example, by submitting an [issue](https://github.com/tidymodels/TMwR/issues) with suggestions or edits) you agree to abide by its terms. Instructions for making contributions can be found in the [`contributing.md`](contributing.md) file.
44 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # TMwR
 2 | 
 3 | [![Build Status](https://github.com/tidymodels/TMwR/workflows/bookdown/badge.svg)](https://github.com/tidymodels/TMwR/actions)
 4 | 
 5 | 
 6 | 
 7 | 
 8 | This repository contains the source for [_Tidy Modeling with R_](https://tmwr.org). The purpose of this book is to demonstrate how the [tidyverse](https://www.tidyverse.org/) and [tidymodels](https://www.tidymodels.org/) can be used to produce high quality models.
 9 | 
10 | # Reproducing the book or results
11 | 
12 | First, you'll need to install the required packages. To do this, first install the `remotes` package:
13 | 
14 | ``` r
15 | install.packages("remotes")
16 | ```
17 | 
18 | Then use this to install what you need to create the book: 
19 | 
20 | ``` r
21 | remotes::install_github("tidymodels/TMwR")
22 | ```
23 | 
24 | Although we rigorously try to use the current CRAN versions of all packages, the code above may install some development versions. 
25 | 
26 | The content is created using the `bookdown` package. To compile the book, use:
27 | 
28 | ```r
29 | bookdown::render_book("index.Rmd", "bookdown::gitbook")
30 | ```
31 | 
32 | This will create the HTML files in a directory called `_book`. Although we are in the process of publishing a print version of this work with O'Reilly, we do _not_ currently support building to a PDF version.
33 | 
34 | 
35 | # Contributing
36 | 
37 | Please note that this work is written under a [Contributor Code of Conduct](code_of_conduct.md) and the online version is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-nc-sa/4.0/). By participating in this project (for example, by submitting an [issue](https://github.com/tidymodels/TMwR/issues) with suggestions or edits) you agree to abide by its terms. Instructions for making contributions can be found in the [`contributing.md`](contributing.md) file.
38 | 


--------------------------------------------------------------------------------
/TMwR.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: No
 4 | SaveWorkspace: No
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: knitr
13 | LaTeX: pdfLaTeX
14 | 
15 | BuildType: Website
16 | 


--------------------------------------------------------------------------------
/_bookdown.yml:
--------------------------------------------------------------------------------
 1 | new_session: yes
 2 | 
 3 | rmd_files: [
 4 |   "index.Rmd",
 5 |   
 6 |   "01-software-modeling.Rmd",
 7 |   "02-tidyverse.Rmd",
 8 |   "03-base-r.Rmd",
 9 |   
10 |   "04-ames.Rmd",
11 |   "05-data-spending.Rmd",
12 |   "06-fitting-models.Rmd",
13 |   "07-the-model-workflow.Rmd",
14 |   "08-feature-engineering.Rmd",
15 |   "09-judging-model-effectiveness.Rmd",
16 |   
17 |   "10-resampling.Rmd",
18 |   "11-comparing-models.Rmd",
19 |   "12-tuning-parameters.Rmd",
20 |   "13-grid-search.Rmd",
21 |   "14-iterative-search.Rmd",
22 |   "15-workflow-sets.Rmd",
23 |   
24 |   "16-dimensionality-reduction.Rmd",
25 |   "17-encoding-categorical-data.Rmd",
26 |   "18-explaining-models-and-predictions.Rmd",
27 |   "19-when-should-you-trust-predictions.Rmd",
28 |   "20-ensemble-models.Rmd",
29 |   "21-inferential-analysis.Rmd",
30 |   
31 |   "pre-proc-table.Rmd",
32 |   "references.Rmd"
33 | ]
34 | 
35 | before_chapter_script: "_common.R"
36 | 


--------------------------------------------------------------------------------
/_common.R:
--------------------------------------------------------------------------------
 1 | options(digits = 4, width = 84)  # printed significant digits and console line width
 2 | options(dplyr.print_min = 6, dplyr.print_max = 6)  # row limits used when printing tibbles
 3 | options(cli.width = 85)
 4 | options(crayon.enabled = FALSE)  # presumably keeps ANSI color codes out of rendered output -- confirm
 5 | # Default knitr chunk options for every document that sources this script.
 6 | knitr::opts_chunk$set(
 7 |   comment = "#>",
 8 |   collapse = TRUE,
 9 |   fig.align = 'center',
10 |   tidy = FALSE
11 | )
12 | 
13 | 
14 | 
15 | theme_transparent <- function(...) {
16 |   # theme_bw() with see-through backgrounds and the legend placed on top;
17 |   # `...` is handed straight to ggplot2::theme_bw().
18 |   clear_rect <- ggplot2::element_rect(fill = "transparent", colour = NA)
19 |   
20 |   thm <- ggplot2::theme_bw(...)
21 |   thm$panel.background  <- clear_rect
22 |   thm$plot.background   <- clear_rect
23 |   thm$legend.background <- clear_rect
24 |   thm$legend.key        <- clear_rect
25 |   thm$legend.position   <- "top"
26 |   
27 |   thm
28 | }
29 | 
30 | library(ggplot2)
31 | theme_set(theme_transparent())  # make the transparent theme the default for subsequent plots
32 | 
33 | tmwr_version <- function() {
34 |   # Book version label: the Version field of ./DESCRIPTION plus today's date.
35 |   version_field <- read.dcf("DESCRIPTION")[1, "Version"]
36 |   paste0("Version ", version_field, " (", Sys.Date(), ")")
37 | }
38 | 
39 | pkg <- function(x) {
40 |   # Uses the argument as written (it is never evaluated), so pkg(dplyr) needs no quotes.
41 |   pkg_name <- as.character(match.call()$x)
42 |   paste0('<span class="pkg">', pkg_name, '</span>')
43 | }
44 | 
45 | is_new_version <- function(x, path) {
46 |   # Returns TRUE when `x` differs from the object of the same name cached in the
47 |   # .RData file at `path`, or when no usable cached copy exists; FALSE otherwise.
48 |   # `x` must be passed as a bare object name because that name is looked up in
49 |   # the cache file.
50 |   nm <- as.character(match.call()$x)
51 |   if (!file.exists(path)) {
52 |     return(TRUE)
53 |   }
54 |   
55 |   # Load into a scratch environment so cached objects cannot overwrite this
56 |   # function's locals, and so a cache that lacks `nm` is not silently resolved
57 |   # from the caller's workspace (which would compare `x` with itself).
58 |   cache <- new.env(parent = emptyenv())
59 |   load(path, envir = cache)
60 |   if (!exists(nm, envir = cache, inherits = FALSE)) {
61 |     return(TRUE)
62 |   }
63 |   prev <- get(nm, envir = cache, inherits = FALSE)
64 |   
65 |   # parsnip model fits have an elapsed time and this will change from run-to-run.
66 |   # We'll remove that to check for a new version. Same for workflows. 
67 |   if (inherits(prev, "model_fit")) {
68 |     x$elapsed <- NA
69 |     prev$elapsed <- NA
70 |   }
71 |   # Class check instead of the unexported workflows:::is_workflow().
72 |   if (inherits(prev, "workflow")) {
73 |     x$fit$fit$elapsed <- NA
74 |     prev$fit$fit$elapsed <- NA
75 |   }
76 |   
77 |   !isTRUE(all.equal(x, prev))
78 | }
79 | 
80 | 


--------------------------------------------------------------------------------
/_output.yaml:
--------------------------------------------------------------------------------
 1 | bookdown::gitbook:
 2 |   css: [style.css, TMwR.css]
 3 |   dev: png
 4 |   config:
 5 |     toc:
 6 |       collapse: section
 7 |       before: |
 8 |         <li><strong><a href="./">Tidy Modeling with R</a></strong></li>
 9 |     edit:
10 |       link: https://github.com/tidymodels/TMwR/edit/main/%s
11 |       text: "Edit"
12 |     fontsettings: null  
13 |     sharing: no
14 | 
15 | bookdown::pdf_book:
16 |   latex_engine: pdflatex
17 |   citation_package: natbib
18 |   includes:
19 |     in_header: latex_extras/preamble.tex
20 |   keep_tex: yes
21 |   highlight: tango
22 | 
23 | 


--------------------------------------------------------------------------------
/ames_snippets.R:
--------------------------------------------------------------------------------
 1 | # Any changes to this code should trigger changes to the end-of-chapter summary
 2 | # sections (that include these in code chunks)
 3 | 
 4 | library(tidymodels)
 5 | data(ames)
 6 | ames <- mutate(ames, Sale_Price = log10(Sale_Price))  # work with sale price on the log10 scale
 7 | # Seeded split with 80% of rows for training, stratified on the outcome.
 8 | set.seed(502)
 9 | ames_split <- initial_split(ames, prop = 0.80, strata = Sale_Price)
10 | ames_train <- training(ames_split)
11 | ames_test  <-  testing(ames_split)
12 | # Preprocessing recipe for the linear model, defined on the training set.
13 | ames_rec <- 
14 |   recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 
15 |            Latitude + Longitude, data = ames_train) %>%
16 |   step_log(Gr_Liv_Area, base = 10) %>%  # log10 of living area
17 |   step_other(Neighborhood, threshold = 0.01) %>%  # pool infrequent neighborhoods
18 |   step_dummy(all_nominal_predictors()) %>%  # indicator columns for nominal predictors
19 |   step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) %>%  # living area x building-type dummies
20 |   step_ns(Latitude, Longitude, deg_free = 20)  # spline terms for location
21 | # Linear regression using the "lm" engine.
22 | lm_model <- linear_reg() %>% set_engine("lm")
23 | 
24 | lm_wflow <- 
25 |   workflow() %>% 
26 |   add_model(lm_model) %>% 
27 |   add_recipe(ames_rec)
28 | 
29 | # cached in RData/lm_fit.RData
30 | # lm_fit <- fit(lm_wflow, ames_train)
31 | # Random forest regression (1000 trees, "ranger" engine) on the same predictors.
32 | rf_model <- 
33 |   rand_forest(trees = 1000) %>% 
34 |   set_engine("ranger") %>% 
35 |   set_mode("regression")
36 | 
37 | rf_wflow <- 
38 |   workflow() %>% 
39 |   add_formula(
40 |     Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 
41 |       Latitude + Longitude) %>% 
42 |   add_model(rf_model) 
43 | # Seeded 10-fold cross-validation folds built from the training set.
44 | set.seed(1001)
45 | ames_folds <- vfold_cv(ames_train, v = 10)
46 | 
47 | # cached in RData/resampling.RData from Ch 10
48 | # rf_res <- rf_wflow %>% fit_resamples(resamples = ames_folds, control = keep_pred)
49 | 


--------------------------------------------------------------------------------
/bo_search.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/bo_search.mp4


--------------------------------------------------------------------------------
/contributing.md:
--------------------------------------------------------------------------------
 1 | # Contributing
 2 | 
 3 | Thank you for your interest in contributing. This file contains what you need to know to help. 
 4 | 
 5 | - For __questions and discussions__ about tidymodels packages, modeling, and machine learning, please [post on RStudio Community](https://rstd.io/tidymodels-community).
 6 | 
 7 | If you have a __contribution__, please fork the repo and make a pull request (PR). If these terms are unfamiliar to you, take a look at [_Happy Git and GitHub for the useR_](https://happygitwithr.com/). It might be helpful to start a GitHub issue to discuss your proposed changes before putting a lot of effort into them. 
 8 | 
 9 | If you make significant changes, include the phrase "I assign the copyright of this contribution to the authors listed in the `DESCRIPTION` file".
10 | 
11 | __If you find a bug__, please make an issue or pull request. Since all of the data and code are available, we will require a minimal reprex (reproducible example). The goal of a reprex is to make it as easy as possible for us to recreate your problem so that we can fix it. If you've never heard of a reprex before, start by reading "[What is a reprex](https://github.com/tidyverse/reprex#what-is-a-reprex)", and follow the advice further down that page. 
12 | 
13 | ## Formatting
14 | 
15 |  * All code chunks have labels that are concise but descriptive. They should also make good figure names. Look at each chapter's chunk names; we keep a common prefix for each chapter. Names should use `-` to separate words.
16 |  
17 |  * Figures should have transparent backgrounds and legends (if any) on top.
18 |  
19 |  * Please use US spellings (e.g. "color" instead of "colour"). 
20 |  
21 |  * Do not break lines within sentences or paragraphs. 
22 |  
23 |  * Adhere as best as possible to the [`tidyverse` style guide](https://style.tidyverse.org/). 
24 |  
25 |  * Please avoid adding new package dependencies. If that can't be avoided, add them to the DESCRIPTION file. 
26 |  
27 | 


--------------------------------------------------------------------------------
/contributors.csv:
--------------------------------------------------------------------------------
 1 | login,n,name,blog
 2 | arisp99,1,NA,NA
 3 | bradisbrad,1,Brad Hill,www.bradisblogging.com
 4 | bryceroney,1,Bryce Roney,NA
 5 | cedricbatailler,1,Cedric Batailler,cedricbatailler.me
 6 | czeildi,1,Ildikó Czeller,https://ildiczeller.com/
 7 | davidkane9,1,David Kane,www.davidkane.info
 8 | DavZim,1,NA,https://davzim.github.io/
 9 | DCharIAA,2,NA,NA
10 | dcossyleon,5,Desirée De Leon,https://tinystats.github.io/teacups-giraffes-and-statistics/
11 | EmilHvitfeldt,3,Emil Hvitfeldt,https://www.emilhvitfeldt.com/
12 | emilopezcano,2,Emilio,http://emilio.lcano.com
13 | Fgazzelloni,1,Fgazzelloni,https://www.linkedin.com/in/fgazzelloni/
14 | hfrick,5,Hannah Frick,http://www.frick.ws
15 | hlynurhallgrims,2,Hlynur,NA
16 | howardbaek,3,Howard Baek,http://insidethetv.rbind.io/
17 | jaeyk,1,Jae Yeon Kim,https://jaeyk.github.io/
18 | jdtrat,1,Jonathan D. Trattner,https://www.jdtrat.com
19 | jmgirard,1,Jeffrey Girard,https://www.jmgirard.com
20 | JohnPickering,1,John W Pickering,NA
21 | jonthegeek,10,Jon Harmon,http://jonthegeek.com
22 | joseph-rickert,2,Joseph B. Rickert,http://www.rstudio.com
23 | juliasilge,238,Julia Silge,https://juliasilge.com
24 | maxdrohde,2,Maximilian Rohde,maximilianrohde.com
25 | michaelgrund,1,Michael Grund,
26 | MikeJohnPage,1,NA,NA
27 | mine-cetinkaya-rundel,1,Mine Cetinkaya-Rundel,http://mine-cr.com
28 | mmhamdy,1,Mohammed Hamdy,NA
29 | nattalides,1,NA,NA
30 | PursuitOfDataScience,1,Y. Yu,https://youzhi.netlify.app/
31 | riazhedayati,1,Riaz Hedayati,NA
32 | RobWiederstein,1,Rob Wiederstein,www.robwiederstein.org
33 | scottyd22,2,Scott,datascott.com
34 | simonschoe,1,Simon Schölzel,NA
35 | tagasimon,1,Simon Sayz,https://simonsayz.xyz
36 | thrkng,2,NA,NA
37 | tmstauss,4,Tanner Stauss,www.linkedin.com/in/tanner-stauss
38 | tonyelhabr,1,Tony ElHabr,https://tonyelhabr.rbind.io/
39 | topepo,389,Max Kuhn,NA
40 | x1o,3,Dmitry Zotikov,NA
41 | xiaochi-liu,3,Xiaochi,xiaochi.rbind.io
42 | zachbogart,1,Zach Bogart,zachbogart.com
43 | 


--------------------------------------------------------------------------------
/extras/affy_plm_ggplot.R:
--------------------------------------------------------------------------------
 1 | library(tidyverse)
 2 | library(affyPLM)
 3 | library(AmpAffyExample)
 4 | 
 5 | # ------------------------------------------------------------------------------
 6 | 
 7 | data(AmpData)
 8 | sampleNames(AmpData) <- c("N1","Good Quality","Poor Quality","A1","A2","A3")
 9 | 
10 | Pset1 <- fitPLM(AmpData)
11 | 
12 | # ------------------------------------------------------------------------------
13 | 
14 | # Taken from the image method for PLMset objects 
15 | pm.index <- unlist(affy::indexProbes(Pset1, "pm", row.names(coefs(Pset1))))
16 | rows <-  Pset1@nrow
17 | cols <-  Pset1@ncol
18 | pm.x.locs <- pm.index %% rows
19 | pm.x.locs[pm.x.locs == 0] <- rows
20 | pm.y.locs <- pm.index %/% rows + 1
21 | 
22 | # ------------------------------------------------------------------------------
23 | 
24 | plm_resids <- 
25 |   tibble::as_tibble(Pset1@residuals$PM.resid) %>% 
26 |   mutate(
27 |     probe = rownames(Pset1@residuals$PM.resid),
28 |     x = pm.x.locs,
29 |     y = pm.y.locs
30 |   ) %>% 
31 |   pivot_longer(cols = c(1:6), names_to = "Sample", values_to = "Intensity") %>% 
32 |   dplyr::filter(Sample %in% c("Good Quality", "Poor Quality"))
33 | 
34 | # ------------------------------------------------------------------------------
35 | 
36 | save(plm_resids, file = "RData/plm_resids.RData")
37 | 
38 |   


--------------------------------------------------------------------------------
/extras/ames_posterior_intervals.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(doMC)
  3 | library(tidyposterior)
  4 | library(workflowsets)
  5 | library(rstanarm)
  6 | theme_set(theme_bw())
  7 | 
  8 | data(ames, package = "modeldata")
  9 | 
 10 | ames <- mutate(ames, Sale_Price = log10(Sale_Price))
 11 | 
 12 | set.seed(123)
 13 | ames_split <- initial_split(ames, prop = 0.80, strata = Sale_Price)
 14 | ames_train <- training(ames_split)
 15 | ames_test  <-  testing(ames_split)
 16 | 
 17 | crs <- parallel::detectCores()
 18 | 
 19 | registerDoMC(cores = crs)
 20 | 
 21 | ## -----------------------------------------------------------------------------
 22 | 
 23 | set.seed(55)
 24 | ames_folds <- vfold_cv(ames_train, v = 10, repeats = 10)
 25 | 
 26 | lm_model <- linear_reg() %>% set_engine("lm")
 27 | 
 28 | rf_model <-
 29 |   rand_forest(trees = 1000) %>%
 30 |   set_engine("ranger") %>%
 31 |   set_mode("regression")
 32 | 
 33 | # ------------------------------------------------------------------------------
 34 | 
 35 | basic_rec <- 
 36 |   recipe(Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + Bldg_Type + 
 37 |            Latitude + Longitude, data = ames_train) %>%
 38 |   step_log(Gr_Liv_Area, base = 10) %>% 
 39 |   step_other(Neighborhood, threshold = 0.01) %>% 
 40 |   step_dummy(all_nominal_predictors())
 41 | 
 42 | interaction_rec <- 
 43 |   basic_rec %>% 
 44 |   step_interact( ~ Gr_Liv_Area:starts_with("Bldg_Type_") ) 
 45 | 
 46 | spline_rec <- 
 47 |   interaction_rec %>% 
 48 |   step_ns(Latitude, Longitude, deg_free = 50)
 49 | 
 50 | preproc <- 
 51 |   list(basic = basic_rec, 
 52 |        interact = interaction_rec, 
 53 |        splines = spline_rec,
 54 |        formula = Sale_Price ~ Neighborhood + Gr_Liv_Area + Year_Built + 
 55 |          Bldg_Type + Latitude + Longitude
 56 |   )
 57 | 
 58 | models <- list(lm = lm_model, lm = lm_model, lm = lm_model, rf = rf_model)
 59 | 
 60 | four_models <- 
 61 |   workflow_set(preproc, models, cross = FALSE)
 62 | four_models
 63 | 
 64 | posteriors <- NULL
 65 | 
 66 | for(i in 11:100) {
 67 |   if (i %% 10 == 0) cat(i, "... ")
 68 |   # Use only the first `i` of the 100 resamples (10 repeats of 10-fold CV)
 69 |   tmp_rset <- rsample:::df_reconstruct(ames_folds %>% slice(1:i), ames_folds)
 70 | 
 71 |   four_resamples <- 
 72 |     four_models %>% 
 73 |     workflow_map("fit_resamples", seed = 1, resamples = tmp_rset)
 74 | 
 75 |   ## -----------------------------------------------------------------------------
 76 |   # Leave two cores free, but never request fewer than one chain/core
 77 |   rsq_anova <-
 78 |     perf_mod(
 79 |       four_resamples,
 80 |       prior_intercept = student_t(df = 1),
 81 |       chains = max(1, crs - 2),
 82 |       iter = 5000,
 83 |       seed = 2,
 84 |       cores = max(1, crs - 2),
 85 |       refresh = 0
 86 |     )
 87 |   # Posterior difference between the spline and basic linear-model workflows
 88 |   rsq_diff <-
 89 |     contrast_models(rsq_anova,
 90 |                     list_1 = "splines_lm",
 91 |                     list_2 = "basic_lm",
 92 |                     seed = 3) %>%
 93 |     as_tibble() %>%
 94 |     mutate(label = paste(format(1:100)[i], "resamples"), resamples = i)
 95 | 
 96 |   posteriors <- bind_rows(posteriors, rsq_diff)
 97 | 
 98 |   rm(rsq_diff)
 99 | 
100 | }
101 | 
102 | ## -----------------------------------------------------------------------------
103 | 
104 | # ggplot(posteriors, aes(x = difference)) +
105 | #   geom_histogram(bins = 30) +
106 | #   facet_wrap(~label)
107 | # 
108 | # ggplot(posteriors, aes(x = difference)) +
109 | #   geom_line(stat = "density", trim = FALSE) +
110 | #   facet_wrap(~label)
111 | 
112 | intervals <-
113 |   posteriors %>%
114 |   group_by(resamples) %>%
115 |   summarize(
116 |     mean = mean(difference),
117 |     lower = quantile(difference, probs = 0.05),
118 |     upper = quantile(difference, probs = 0.95),
119 |     .groups = "drop"
120 |   ) %>%
121 |   # `.groups = "drop"` already ungrouped; smooth each summary across `resamples`
122 |   mutate(
123 |     mean = predict(loess(mean ~ resamples, span = .15)),
124 |     lower = predict(loess(lower ~ resamples, span = .15)),
125 |     upper = predict(loess(upper ~ resamples, span = .15))
126 |   )
127 | 
128 | save(intervals, file = "RData/post_intervals.RData")
129 | 
130 | # ggplot(intervals,
131 | #        aes(x = resamples, y = mean)) +
132 | #   geom_path() +
133 | #   geom_ribbon(aes(ymin = lower, ymax = upper), fill = "red", alpha = .1) +
134 | #   labs(y = expression(paste("Mean difference in ", R^2)),
135 | #        x = "Number of Resamples (repeated 10-fold cross-validation)")
136 | # 
137 | 
138 | 


--------------------------------------------------------------------------------
/extras/cells_svm_large.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(rayshader)
  3 | library(doMC)
  4 | registerDoMC(cores = parallel::detectCores(logical = TRUE))
  5 | 
  6 | ## -----------------------------------------------------------------------------
  7 | 
  8 | data(cells)
  9 | cells <- cells %>% select(-case)
 10 | set.seed(33)
 11 | cell_folds <- vfold_cv(cells)
 12 | roc_res <- metric_set(roc_auc)
 13 | 
 14 | ## -----------------------------------------------------------------------------
 15 | 
 16 | svm_rec <- 
 17 |   recipe(class ~ ., data = cells) %>%
 18 |   step_YeoJohnson(all_numeric_predictors()) %>%
 19 |   step_normalize(all_numeric_predictors())
 20 | 
 21 | svm_spec <- 
 22 |   svm_rbf(cost = tune(), rbf_sigma = tune()) %>% 
 23 |   set_engine("kernlab") %>% 
 24 |   set_mode("classification")
 25 | 
 26 | svm_wflow <- 
 27 |   workflow() %>% 
 28 |   add_model(svm_spec) %>% 
 29 |   add_recipe(svm_rec)
 30 | 
 31 | svm_param <- 
 32 |   svm_wflow %>% 
 33 |   parameters() %>% 
 34 |   update(
 35 |     cost = cost(c(-10, 5)),
 36 |     rbf_sigma = rbf_sigma(c(-7, -1))
 37 |   )
 38 | 
 39 | ## -----------------------------------------------------------------------------
 40 | 
 41 | large_grid <- grid_regular(svm_param, levels = 50)
 42 | 
 43 | set.seed(2)
 44 | svm_large <- 
 45 |   svm_wflow %>% 
 46 |   tune_grid(resamples = cell_folds, grid = large_grid, metrics = roc_res)
 47 | 
 48 | ## -----------------------------------------------------------------------------
 49 | 
 50 | if (interactive()) {
 51 | 
 52 |   svm_roc <-   
 53 |     svm_large %>% 
 54 |     collect_metrics()
 55 |   
 56 |   large_plot <-
 57 |     svm_roc %>% 
 58 |     ggplot(aes(x = rbf_sigma, y = cost)) + 
 59 |     geom_raster(aes(fill = mean)) + 
 60 |     geom_point(data = top_n(svm_roc, 1, mean)) + 
 61 |     scale_x_log10() + 
 62 |     scale_y_continuous(trans = "log2") +
 63 |     scale_fill_distiller(palette = "Blues") +
 64 |     theme_minimal() + 
 65 |     theme(
 66 |       legend.position = "bottom",
 67 |       legend.key.width = grid::unit(2, "cm"),
 68 |       plot.title = element_text(hjust = 0.5)
 69 |     ) + 
 70 |     guides(title.position = "bottom") + 
 71 |     labs(x = "rbf_sigma\n\n\n\n", title = "ROC AUC surface") + 
 72 |     coord_fixed(ratio = 1/2.5)
 73 |   # Write the 2D surface to disk; `agg_png()` lives in ragg, which is not attached above
 74 |   ragg::agg_png("roc_surface.png", height = 4 * 480, width = 4 * 480, res = 72 * 3, scaling = 1)
 75 |   print(large_plot)
 76 |   dev.off()
 77 |   
 78 |   
 79 |   
 80 |   plot_gg(
 81 |     large_plot,
 82 |     multicore = FALSE,
 83 |     raytrace = TRUE,
 84 |     width = 7,
 85 |     height = 7,
 86 |     scale = 300,
 87 |     windowsize = c(1400, 1400),
 88 |     zoom = 1,
 89 |     phi = 30,
 90 |     theta = 30
 91 |   )
 92 |   
 93 | }
 94 | 
 95 | ## -----------------------------------------------------------------------------
 96 | 
 97 | sessioninfo::session_info()
 98 | 
 99 | ## -----------------------------------------------------------------------------
100 | 
101 | save(svm_large, file = "../RData/svm_large.RData")
102 | 


--------------------------------------------------------------------------------
/extras/dry_beans.R:
--------------------------------------------------------------------------------
 1 | library(tidymodels)
 2 | library(RWeka)
 3 | library(janitor)
 4 | 
 5 | dry_beans <- 
 6 |   read.arff(url("https://www.muratkoklu.com/datasets/vtdhnd02.php")) %>% 
 7 |   dplyr::rename(AspectRatio = AspectRation) %>% 
 8 |   clean_names() %>% 
 9 |   as_tibble() %>% 
10 |   mutate(class = tolower(as.character(class)),
11 |          class = factor(class))
12 | # Put an underscore before a trailing digit 1-4 in each column name
13 | names(dry_beans) <- gsub("([1-4]$)", "_\\1", names(dry_beans), perl = TRUE)
14 | 
15 | save(dry_beans, file = "RData/dry_beans.RData", compress = "xz", version = 2)
16 | 
17 | 


--------------------------------------------------------------------------------
/extras/nonlinear_function.R:
--------------------------------------------------------------------------------
 1 | nonlin_function <- function(x, error = TRUE) {
 2 |   # Simulate a single R^2-like value at `x` from a spline fit to the Ames
 3 |   # Longitude data; `error = FALSE` draws with zero standard deviation (no noise)
 4 |   data(ames, package = "modeldata")
 5 |   rec <- 
 6 |     recipe(Sale_Price ~ Longitude, data = ames) %>% 
 7 |     step_log(Sale_Price, skip = TRUE) %>% 
 8 |     step_range(Longitude) %>% 
 9 |     prep()
10 |   # NOTE(review): the recipe logs Sale_Price and the formula below applies
11 |   # log10() as well -- presumably intentional for scaling; confirm
12 |   # Natural-spline fit on the range-scaled Longitude (so `x` is presumably on that scale)
13 |   f <- lm(log10(Sale_Price) ~ splines::ns(Longitude, df = 12), data = juice(rec))
14 |   p <- predict(f, newdata = data.frame(Longitude = x), se.fit = TRUE)
15 |   err <- p$se.fit
16 |   if (!error) {
17 |     err <- 0
18 |   }
19 |   res <- rnorm(1, mean = p$fit, sd = err)
20 |   # Rescale by 8/10 and clamp to [0, 1] to get an R^2-like value
21 |   res <- (8 * res)/10
22 |   res <- max(res, 0)
23 |   res <- min(res, 1)
24 |   res
25 | }
26 | 


--------------------------------------------------------------------------------
/extras/parallel_times/collect.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(lubridate)
  3 | 
  4 | ## -----------------------------------------------------------------------------
  5 | 
  6 | get_date <- function(x) {
  7 |    # File names are expected to look like
  8 |    # "xgb_<cores>_<Y>_<m>_<d>_<H>_<M>_<S>.RData"; fields 3-8 hold the time stamp
  9 |    pieces <- strsplit(basename(x), "_")
 10 |    stamp <- map(pieces, ~ gsub("\\.RData", "", .x[3:8]))
 11 |    stamp <- map_chr(stamp, paste0, collapse = "-")
 12 |    ymd_hms(stamp)
 13 | }
 14 | 
 15 | get_times <- function(x) {
 16 |    # `load()` restores the saved `times` object into this function's scope
 17 |    load(x)
 18 |    stamped <- mutate(times, date = get_date(x))
 19 | 
 20 |    # timings tagged with the date parsed from the file name
 21 |    stamped
 22 | }
 23 | 
 24 | ## -----------------------------------------------------------------------------
 25 | 
 26 | rdata <-
 27 |    list.files(path = "extras/parallel_times/",
 28 |               pattern = "\\.RData",
 29 |               full.names = TRUE)
 30 | rdata <- rdata[!grepl("xgb_times", rdata)]
 31 | rdata <- rdata[!grepl("logging_data", rdata)]
 32 | 
 33 | all_times <-  map_dfr(rdata, get_times) 
 34 | 
 35 | seq <- 
 36 |    all_times %>% 
 37 |    filter(num_cores == 1) %>% 
 38 |    dplyr::rename(seq_time = elapsed) %>% 
 39 |    select(-num_cores, -date) 
 40 | 
 41 | times <- 
 42 |    full_join(all_times, seq, 
 43 |              by = c("num_resamples", "num_grid", "preproc", "par_method")) %>% 
 44 |    mutate(
 45 |       time_per_fit = elapsed/(num_grid * num_resamples),
 46 |       speed_up = seq_time/elapsed,
 47 |       preprocessing = gsub(" preprocessing", "", preproc),
 48 |       preprocessing = ifelse(preprocessing == "no", "none", preprocessing),
 49 |       preprocessing = factor(preprocessing, levels = c("none", "light", "expensive")),
 50 |       parallel_over = par_method
 51 |    )
 52 | 
 53 | if (interactive()) {
 54 | 
 55 | 
 56 |    ggplot(times, aes(x = num_cores, y = elapsed, col = parallel_over, shape = parallel_over)) + 
 57 |       geom_point() + 
 58 |       geom_line() +
 59 |       facet_wrap(~ preprocessing) + 
 60 |       labs(x = "Number of Workers", y = "Execution Time (s)") + 
 61 |       scale_y_log10() + 
 62 |       theme_bw() + 
 63 |       theme(legend.position = "top")
 64 |    
 65 |    times %>% 
 66 |       filter(preprocessing == "none") %>% 
 67 |       ggplot(aes(x = num_cores, y = speed_up, col = preprocessing, shape = preprocessing)) + 
 68 |       geom_abline(lty = 1) + 
 69 |       geom_point() + 
 70 |       geom_line() +
 71 |       facet_wrap(~ par_method) + 
 72 |       coord_obs_pred() +
 73 |       labs(x = "Number of Workers", y = "Speed-up", 
 74 |            title = "5 resamples, 10 grid points") + 
 75 |       theme_bw() + 
 76 |       theme(legend.position = "top")
 77 |    
 78 |    times %>% 
 79 |       filter(preprocessing != "expensive") %>% 
 80 |       ggplot(aes(x = num_cores, y = speed_up, col = preprocessing, shape = preprocessing)) + 
 81 |       geom_abline(lty = 1) + 
 82 |       geom_point() + 
 83 |       geom_line() +
 84 |       facet_wrap(~ par_method) + 
 85 |       coord_obs_pred() +
 86 |       labs(x = "Number of Workers", y = "Speed-up", 
 87 |            title = "5 resamples, 10 grid points") + 
 88 |       theme_bw() + 
 89 |       theme(legend.position = "top")
 90 | 
 91 |    
 92 |    ggplot(times, aes(x = num_cores, y = speed_up, col = parallel_over, shape = parallel_over)) + 
 93 |       geom_abline(lty = 1) + 
 94 |       geom_point() + 
 95 |       geom_line() +
 96 |       facet_wrap(~ preprocessing) + 
 97 |       coord_obs_pred() +
 98 |       labs(x = "Number of Workers", y = "Speed-up", 
 99 |            title = "5 resamples, 10 grid points") + 
100 |       theme_bw() + 
101 |       theme(legend.position = "top")
102 |    
103 | }
104 | 
105 | save(times, file = "extras/parallel_times/xgb_times.RData")
106 | 
107 | # r_files <- list.files(path = ".", pattern = "R$")
108 | # r_files <- r_files[r_files != "collect.R"]
109 | # r_files <- r_files[r_files != "template.R"]
110 | # r_files <- paste0("R CMD BATCH --vanilla ", r_files, "\nsleep 20\n")
111 | # cat(sample(r_files), sep = "")
112 | # Only quit in batch mode so an interactive session (and its plots) survives
113 | if (!interactive()) q("no")
114 | 
115 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_01_expensive.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | library(embed)
  8 | library(rstanarm)
  9 | 
 10 | ## -----------------------------------------------------------------------------
 11 | 
 12 | num_resamples <- 5
 13 | num_grid <- 10
 14 | num_cores <- 1
 15 | preproc <- "expensive preprocessing"
 16 | par_method <- "everything"
 17 |  
 18 | ## -----------------------------------------------------------------------------
 19 | 
 20 | set.seed(123)
 21 | 
 22 | flight_data <- 
 23 |   flights %>% 
 24 |   mutate(
 25 |     # Convert the arrival delay to a factor
 26 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 27 |     arr_delay = factor(arr_delay),
 28 |     # We will use the date (not date-time) in the recipe below
 29 |     date = as.Date(time_hour)
 30 |   ) %>% 
 31 |   # Include the weather data
 32 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 33 |   # Only retain the specific columns we will use
 34 |   select(dep_time, flight, origin, dest, air_time, distance, 
 35 |          carrier, date, arr_delay, time_hour) %>% 
 36 |   # Exclude missing data
 37 |   na.omit() %>% 
 38 |   # For creating models, it is better to have qualitative columns
 39 |   # encoded as factors (instead of character strings)
 40 |   mutate_if(is.character, as.factor) %>% 
 41 |   sample_n(4000)
 42 | 
 43 | ## -----------------------------------------------------------------------------
 44 | 
 45 | flights_rec <- 
 46 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 47 |   update_role(flight, time_hour, new_role = "ID") %>% 
 48 |   step_date(date, features = c("dow", "month")) %>% 
 49 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 50 |   step_rm(date) %>% 
 51 |   step_mutate(flight = as.factor(flight)) %>% 
 52 |   step_lencode_bayes(flight, outcome = vars(arr_delay)) %>% 
 53 |   step_dummy(all_nominal_predictors()) %>% 
 54 |   step_zv(all_predictors()) 
 55 | 
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | 
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | 
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>% 
 71 |     add_model(xgboost_spec) 
 72 |   # The recipe is part of the workflow, so resample the unprocessed data
 73 |   set.seed(33)
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>% 
 79 |     add_model(xgboost_spec) 
 80 |   # NOTE(review): `preproc_data` is not created in this script, so this branch would fail if reached
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | 
 87 | set.seed(22)
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   parameters() %>% 
 91 |   update(trees = trees(c(100, 2000))) %>% 
 92 |   grid_max_entropy(size = num_grid)
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | 
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | 
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | 
106 | grid_time <- system.time({
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })
111 | 
112 | ## -----------------------------------------------------------------------------
113 | 
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | 
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | 
126 | sessioninfo::session_info()
127 | 
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_01_with.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | 
 10 | num_resamples <- 5
 11 | num_grid <- 10
 12 | num_cores <- 1
 13 | preproc <- "light preprocessing"
 14 | par_method <- "everything"
 15 |  
 16 | ## -----------------------------------------------------------------------------
 17 | 
 18 | set.seed(123)
 19 | 
 20 | flight_data <- 
 21 |   flights %>% 
 22 |   mutate(
 23 |     # Convert the arrival delay to a factor
 24 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 25 |     arr_delay = factor(arr_delay),
 26 |     # We will use the date (not date-time) in the recipe below
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Include the weather data
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Only retain the specific columns we will use
 32 |   select(dep_time, flight, origin, dest, air_time, distance, 
 33 |          carrier, date, arr_delay, time_hour) %>% 
 34 |   # Exclude missing data
 35 |   na.omit() %>% 
 36 |   # For creating models, it is better to have qualitative columns
 37 |   # encoded as factors (instead of character strings)
 38 |   mutate_if(is.character, as.factor) %>% 
 39 |   sample_n(4000)
 40 | 
 41 | ## -----------------------------------------------------------------------------
 42 | 
 43 | flights_rec <- 
 44 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 45 |   update_role(flight, time_hour, new_role = "ID") %>% 
 46 |   step_date(date, features = c("dow", "month")) %>% 
 47 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 48 |   step_rm(date) %>% 
 49 |   step_dummy(all_nominal_predictors()) %>% 
 50 |   step_zv(all_predictors())
 51 | 
 52 | preproc_data <- 
 53 |   flights_rec %>% 
 54 |   prep() %>% 
 55 |   juice(all_predictors(), all_outcomes())
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | 
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | 
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>% 
 71 |     add_model(xgboost_spec) 
 72 |   # The recipe is part of the workflow, so resample the unprocessed data
 73 |   set.seed(33)
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>% 
 79 |     add_model(xgboost_spec) 
 80 |   # No recipe in the workflow: resample the data already processed by `flights_rec`
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | 
 87 | set.seed(22)
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   parameters() %>% 
 91 |   update(trees = trees(c(100, 2000))) %>% 
 92 |   grid_max_entropy(size = num_grid)
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | 
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | 
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | 
106 | grid_time <- system.time({
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })
111 | 
112 | ## -----------------------------------------------------------------------------
113 | 
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | 
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | 
126 | sessioninfo::session_info()
127 | 
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_01_without.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | 
 10 | num_resamples <- 5
 11 | num_grid <- 10
 12 | num_cores <- 1
 13 | preproc <- "no preprocessing"
 14 | par_method <- "everything"
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | 
 18 | set.seed(123)
 19 | 
 20 | flight_data <- 
 21 |   flights %>% 
 22 |   mutate(
 23 |     # Convert the arrival delay to a factor
 24 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 25 |     arr_delay = factor(arr_delay),
 26 |     # We will use the date (not date-time) in the recipe below
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Include the weather data
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Only retain the specific columns we will use
 32 |   select(dep_time, flight, origin, dest, air_time, distance, 
 33 |          carrier, date, arr_delay, time_hour) %>% 
 34 |   # Exclude missing data
 35 |   na.omit() %>% 
 36 |   # For creating models, it is better to have qualitative columns
 37 |   # encoded as factors (instead of character strings)
 38 |   mutate_if(is.character, as.factor) %>% 
 39 |   sample_n(4000)
 40 | 
 41 | ## -----------------------------------------------------------------------------
 42 | 
 43 | flights_rec <- 
 44 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 45 |   update_role(flight, time_hour, new_role = "ID") %>% 
 46 |   step_date(date, features = c("dow", "month")) %>% 
 47 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 48 |   step_rm(date) %>% 
 49 |   step_dummy(all_nominal_predictors()) %>% 
 50 |   step_zv(all_predictors())
 51 | 
 52 | preproc_data <- 
 53 |   flights_rec %>% 
 54 |   prep() %>% 
 55 |   juice(all_predictors(), all_outcomes())
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | 
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | 
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>% 
 71 |     add_model(xgboost_spec) 
 72 |   # The recipe is part of the workflow, so resample the unprocessed data
 73 |   set.seed(33)
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>% 
 79 |     add_model(xgboost_spec) 
 80 |   # No recipe in the workflow: resample the data already processed by `flights_rec`
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | 
 87 | set.seed(22)
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   parameters() %>% 
 91 |   update(trees = trees(c(100, 2000))) %>% 
 92 |   grid_max_entropy(size = num_grid)
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | 
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | 
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | 
106 | grid_time <- system.time({
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })
111 | 
112 | ## -----------------------------------------------------------------------------
113 | 
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | 
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | 
126 | sessioninfo::session_info()
127 | 
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_02_expensive.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | library(embed)
  8 | library(rstanarm)
  9 | 
 10 | ## -----------------------------------------------------------------------------
 11 | 
 12 | num_resamples <- 5
 13 | num_grid      <- 10
 14 | num_cores     <- 2
 15 | preproc       <- "expensive preprocessing"
 16 | par_method    <- "everything"
 17 | 
 18 | ## -----------------------------------------------------------------------------
 19 | ## Draw a reproducible 4,000-row modeling sample from the NYC flights data.
 20 | 
 21 | set.seed(123)
 22 | 
 23 | flight_data <- 
 24 |   flights %>% 
 25 |   mutate(
 26 |     # Two-level outcome: "late" when the arrival delay is 30 minutes or more
 27 |     arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
 28 |     # The recipe below works from the calendar date, not the date-time
 29 |     date = as.Date(time_hour)
 30 |   ) %>% 
 31 |   # Join against the weather table on origin airport and hour
 32 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 33 |   # Keep only the columns used for modeling
 34 |   select(
 35 |     dep_time, flight, origin, dest, air_time, distance, 
 36 |     carrier, date, arr_delay, time_hour
 37 |   ) %>% 
 38 |   # Drop any row containing a missing value
 39 |   na.omit() %>% 
 40 |   # Qualitative columns are stored as factors rather than character strings
 41 |   mutate_if(is.character, as.factor) %>% 
 42 |   sample_n(4000)
 43 | 
 44 | ## -----------------------------------------------------------------------------
 45 | ## Recipe for the "expensive" variant, built one step at a time. `flight` is
 46 | ## given the "ID" role, then re-coded as a factor and passed to
 47 | ## step_lencode_bayes() with `arr_delay` as the outcome.
 48 | 
 49 | us_holidays <- timeDate::listHolidays("US")
 50 | 
 51 | flights_rec <- recipe(arr_delay ~ ., data = flight_data)
 52 | flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
 53 | flights_rec <- step_date(flights_rec, date, features = c("dow", "month"))
 54 | flights_rec <- step_holiday(flights_rec, date, holidays = us_holidays)
 55 | flights_rec <- step_rm(flights_rec, date)
 56 | flights_rec <- step_mutate(flights_rec, flight = as.factor(flight))
 57 | flights_rec <- step_lencode_bayes(flights_rec, flight, outcome = vars(arr_delay))
 58 | flights_rec <- step_dummy(flights_rec, all_nominal_predictors())
 59 | flights_rec <- step_zv(flights_rec, all_predictors())
 60 | 
 61 | ## -----------------------------------------------------------------------------
 62 | ## Boosted-tree classifier (xgboost engine) with six tunable hyperparameters.
 63 | 
 64 | xgboost_spec <- boost_tree(
 65 |   trees = tune(), min_n = tune(), tree_depth = tune(),
 66 |   learn_rate = tune(), loss_reduction = tune(), sample_size = tune()
 67 | )
 68 | xgboost_spec <- set_mode(xgboost_spec, "classification")
 69 | xgboost_spec <- set_engine(xgboost_spec, "xgboost")
 70 | 
 71 | ## -----------------------------------------------------------------------------
 72 | ## Recipe-based workflow resampled on `flight_data`, or a variables-only
 73 | ## workflow resampled on `preproc_data` when `preproc` is "no preprocessing".
 74 | ## This script never creates `preproc_data`; with the `preproc` value set
 75 | ## above, only the recipe branch is evaluated.
 76 | 
 77 | uses_recipe <- preproc != "no preprocessing"
 78 | 
 79 | xgboost_workflow <- 
 80 |   if (uses_recipe) {
 81 |     add_model(add_recipe(workflow(), flights_rec), xgboost_spec)
 82 |   } else {
 83 |     add_model(
 84 |       add_variables(workflow(), outcomes = arr_delay, predictors = c(everything())),
 85 |       xgboost_spec
 86 |     )
 87 |   }
 88 | 
 89 | set.seed(33)
 90 | boot_rs <- 
 91 |   if (uses_recipe) {
 92 |     bootstraps(flight_data, times = num_resamples)
 93 |   } else {
 94 |     bootstraps(preproc_data, times = num_resamples)
 95 |   }
 96 | 
 97 | ## -----------------------------------------------------------------------------
 98 | ## Max-entropy grid of `num_grid` candidates; `trees` is given the range
 99 | ## 100 to 2000.
100 | 
101 | set.seed(22)
102 | xgboost_param <- parameters(xgboost_workflow)
103 | xgboost_param <- update(xgboost_param, trees = trees(c(100, 2000)))
104 | xgboost_grid  <- grid_max_entropy(xgboost_param, size = num_grid)
105 | 
106 | ## -----------------------------------------------------------------------------
107 | ## A doMC backend is registered only when more than one core is requested.
108 | 
109 | if (num_cores > 1) registerDoMC(cores = num_cores)
110 | 
111 | ## -----------------------------------------------------------------------------
112 | ## Time the grid search. The tuning result itself is not stored.
113 | 
114 | auc_metric <- metric_set(roc_auc)
115 | grid_ctrl  <- control_grid(parallel_over = par_method)
116 | 
117 | grid_time <- system.time({
118 |   set.seed(99)
119 |   tune_grid(
120 |     xgboost_workflow,
121 |     resamples = boot_rs,
122 |     grid = xgboost_grid,
123 |     metrics = auc_metric,
124 |     control = grid_ctrl
125 |   )
126 | })
127 | 
128 | ## -----------------------------------------------------------------------------
129 | ## One-row record of the run, saved to a time-stamped .RData file.
130 | 
131 | times <- tibble::tibble(
132 |   elapsed       = grid_time[3],
133 |   num_resamples = num_resamples,
134 |   num_grid      = num_grid,
135 |   num_cores     = num_cores,
136 |   preproc       = preproc,
137 |   par_method    = par_method
138 | )
139 | 
140 | stamp <- format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")
141 | save(times, file = paste0("xgb_", num_cores, stamp))
142 | 
143 | sessioninfo::session_info()
144 | 
145 | if (!interactive()) q("no")
146 | 
147 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_02_with.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | 
 10 | num_resamples <- 5
 11 | num_grid      <- 10
 12 | num_cores     <- 2
 13 | preproc       <- "light preprocessing"
 14 | par_method    <- "everything"
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | ## Draw a reproducible 4,000-row modeling sample from the NYC flights data.
 18 | 
 19 | set.seed(123)
 20 | 
 21 | flight_data <- 
 22 |   flights %>% 
 23 |   mutate(
 24 |     # Two-level outcome: "late" when the arrival delay is 30 minutes or more
 25 |     arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
 26 |     # The recipe below works from the calendar date, not the date-time
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Join against the weather table on origin airport and hour
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Keep only the columns used for modeling
 32 |   select(
 33 |     dep_time, flight, origin, dest, air_time, distance, 
 34 |     carrier, date, arr_delay, time_hour
 35 |   ) %>% 
 36 |   # Drop any row containing a missing value
 37 |   na.omit() %>% 
 38 |   # Qualitative columns are stored as factors rather than character strings
 39 |   mutate_if(is.character, as.factor) %>% 
 40 |   sample_n(4000)
 41 | 
 42 | ## -----------------------------------------------------------------------------
 43 | ## Recipe for the "light" variant, built one step at a time.
 44 | 
 45 | us_holidays <- timeDate::listHolidays("US")
 46 | 
 47 | flights_rec <- recipe(arr_delay ~ ., data = flight_data)
 48 | flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
 49 | flights_rec <- step_date(flights_rec, date, features = c("dow", "month"))
 50 | flights_rec <- step_holiday(flights_rec, date, holidays = us_holidays)
 51 | flights_rec <- step_rm(flights_rec, date)
 52 | flights_rec <- step_dummy(flights_rec, all_nominal_predictors())
 53 | flights_rec <- step_zv(flights_rec, all_predictors())
 54 | 
 55 | # Processed copy of the sample (predictors and outcome only). With the
 56 | # `preproc` value set above, the branch that resamples it is not evaluated.
 57 | preproc_data <- juice(prep(flights_rec), all_predictors(), all_outcomes())
 58 | 
 59 | ## -----------------------------------------------------------------------------
 60 | ## Boosted-tree classifier (xgboost engine) with six tunable hyperparameters.
 61 | 
 62 | xgboost_spec <- boost_tree(
 63 |   trees = tune(), min_n = tune(), tree_depth = tune(),
 64 |   learn_rate = tune(), loss_reduction = tune(), sample_size = tune()
 65 | )
 66 | xgboost_spec <- set_mode(xgboost_spec, "classification")
 67 | xgboost_spec <- set_engine(xgboost_spec, "xgboost")
 68 | 
 69 | ## -----------------------------------------------------------------------------
 70 | ## Recipe-based workflow resampled on `flight_data`, or a variables-only
 71 | ## workflow resampled on `preproc_data` when `preproc` is "no preprocessing".
 72 | 
 73 | uses_recipe <- preproc != "no preprocessing"
 74 | 
 75 | xgboost_workflow <- 
 76 |   if (uses_recipe) {
 77 |     add_model(add_recipe(workflow(), flights_rec), xgboost_spec)
 78 |   } else {
 79 |     add_model(
 80 |       add_variables(workflow(), outcomes = arr_delay, predictors = c(everything())),
 81 |       xgboost_spec
 82 |     )
 83 |   }
 84 | 
 85 | set.seed(33)
 86 | boot_rs <- 
 87 |   if (uses_recipe) {
 88 |     bootstraps(flight_data, times = num_resamples)
 89 |   } else {
 90 |     bootstraps(preproc_data, times = num_resamples)
 91 |   }
 92 | 
 93 | ## -----------------------------------------------------------------------------
 94 | ## Max-entropy grid of `num_grid` candidates; `trees` is given the range
 95 | ## 100 to 2000.
 96 | 
 97 | set.seed(22)
 98 | xgboost_param <- parameters(xgboost_workflow)
 99 | xgboost_param <- update(xgboost_param, trees = trees(c(100, 2000)))
100 | xgboost_grid  <- grid_max_entropy(xgboost_param, size = num_grid)
101 | 
102 | ## -----------------------------------------------------------------------------
103 | ## A doMC backend is registered only when more than one core is requested.
104 | 
105 | if (num_cores > 1) registerDoMC(cores = num_cores)
106 | 
107 | ## -----------------------------------------------------------------------------
108 | ## Time the grid search. The tuning result itself is not stored.
109 | 
110 | auc_metric <- metric_set(roc_auc)
111 | grid_ctrl  <- control_grid(parallel_over = par_method)
112 | 
113 | grid_time <- system.time({
114 |   set.seed(99)
115 |   tune_grid(
116 |     xgboost_workflow,
117 |     resamples = boot_rs,
118 |     grid = xgboost_grid,
119 |     metrics = auc_metric,
120 |     control = grid_ctrl
121 |   )
122 | })
123 | 
124 | ## -----------------------------------------------------------------------------
125 | ## One-row record of the run, saved to a time-stamped .RData file.
126 | 
127 | times <- tibble::tibble(
128 |   elapsed       = grid_time[3],
129 |   num_resamples = num_resamples,
130 |   num_grid      = num_grid,
131 |   num_cores     = num_cores,
132 |   preproc       = preproc,
133 |   par_method    = par_method
134 | )
135 | 
136 | stamp <- format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")
137 | save(times, file = paste0("xgb_", num_cores, stamp))
138 | 
139 | sessioninfo::session_info()
140 | 
141 | if (!interactive()) q("no")
142 | 
143 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_02_without.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | 
 10 | num_resamples <- 5
 11 | num_grid      <- 10
 12 | num_cores     <- 2
 13 | preproc       <- "no preprocessing"
 14 | par_method    <- "everything"
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | ## Draw a reproducible 4,000-row modeling sample from the NYC flights data.
 18 | 
 19 | set.seed(123)
 20 | 
 21 | flight_data <- 
 22 |   flights %>% 
 23 |   mutate(
 24 |     # Two-level outcome: "late" when the arrival delay is 30 minutes or more
 25 |     arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
 26 |     # The recipe below works from the calendar date, not the date-time
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Join against the weather table on origin airport and hour
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Keep only the columns used for modeling
 32 |   select(
 33 |     dep_time, flight, origin, dest, air_time, distance, 
 34 |     carrier, date, arr_delay, time_hour
 35 |   ) %>% 
 36 |   # Drop any row containing a missing value
 37 |   na.omit() %>% 
 38 |   # Qualitative columns are stored as factors rather than character strings
 39 |   mutate_if(is.character, as.factor) %>% 
 40 |   sample_n(4000)
 41 | 
 42 | ## -----------------------------------------------------------------------------
 43 | ## Recipe built one step at a time. With the `preproc` value set above it is
 44 | ## applied once, up front, to create `preproc_data`, and is not part of the
 45 | ## tuned workflow.
 46 | 
 47 | us_holidays <- timeDate::listHolidays("US")
 48 | 
 49 | flights_rec <- recipe(arr_delay ~ ., data = flight_data)
 50 | flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
 51 | flights_rec <- step_date(flights_rec, date, features = c("dow", "month"))
 52 | flights_rec <- step_holiday(flights_rec, date, holidays = us_holidays)
 53 | flights_rec <- step_rm(flights_rec, date)
 54 | flights_rec <- step_dummy(flights_rec, all_nominal_predictors())
 55 | flights_rec <- step_zv(flights_rec, all_predictors())
 56 | 
 57 | # Processed copy of the sample (predictors and outcome only)
 58 | preproc_data <- juice(prep(flights_rec), all_predictors(), all_outcomes())
 59 | 
 60 | ## -----------------------------------------------------------------------------
 61 | ## Boosted-tree classifier (xgboost engine) with six tunable hyperparameters.
 62 | 
 63 | xgboost_spec <- boost_tree(
 64 |   trees = tune(), min_n = tune(), tree_depth = tune(),
 65 |   learn_rate = tune(), loss_reduction = tune(), sample_size = tune()
 66 | )
 67 | xgboost_spec <- set_mode(xgboost_spec, "classification")
 68 | xgboost_spec <- set_engine(xgboost_spec, "xgboost")
 69 | 
 70 | ## -----------------------------------------------------------------------------
 71 | ## Recipe-based workflow resampled on `flight_data`, or a variables-only
 72 | ## workflow resampled on `preproc_data` when `preproc` is "no preprocessing".
 73 | 
 74 | uses_recipe <- preproc != "no preprocessing"
 75 | 
 76 | xgboost_workflow <- 
 77 |   if (uses_recipe) {
 78 |     add_model(add_recipe(workflow(), flights_rec), xgboost_spec)
 79 |   } else {
 80 |     add_model(
 81 |       add_variables(workflow(), outcomes = arr_delay, predictors = c(everything())),
 82 |       xgboost_spec
 83 |     )
 84 |   }
 85 | 
 86 | set.seed(33)
 87 | boot_rs <- 
 88 |   if (uses_recipe) {
 89 |     bootstraps(flight_data, times = num_resamples)
 90 |   } else {
 91 |     bootstraps(preproc_data, times = num_resamples)
 92 |   }
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | ## Max-entropy grid of `num_grid` candidates; `trees` is given the range
 96 | ## 100 to 2000.
 97 | 
 98 | set.seed(22)
 99 | xgboost_param <- parameters(xgboost_workflow)
100 | xgboost_param <- update(xgboost_param, trees = trees(c(100, 2000)))
101 | xgboost_grid  <- grid_max_entropy(xgboost_param, size = num_grid)
102 | 
103 | ## -----------------------------------------------------------------------------
104 | ## A doMC backend is registered only when more than one core is requested.
105 | 
106 | if (num_cores > 1) registerDoMC(cores = num_cores)
107 | 
108 | ## -----------------------------------------------------------------------------
109 | ## Time the grid search. The tuning result itself is not stored.
110 | 
111 | auc_metric <- metric_set(roc_auc)
112 | grid_ctrl  <- control_grid(parallel_over = par_method)
113 | 
114 | grid_time <- system.time({
115 |   set.seed(99)
116 |   tune_grid(
117 |     xgboost_workflow,
118 |     resamples = boot_rs,
119 |     grid = xgboost_grid,
120 |     metrics = auc_metric,
121 |     control = grid_ctrl
122 |   )
123 | })
124 | 
125 | ## -----------------------------------------------------------------------------
126 | ## One-row record of the run, saved to a time-stamped .RData file.
127 | 
128 | times <- tibble::tibble(
129 |   elapsed       = grid_time[3],
130 |   num_resamples = num_resamples,
131 |   num_grid      = num_grid,
132 |   num_cores     = num_cores,
133 |   preproc       = preproc,
134 |   par_method    = par_method
135 | )
136 | 
137 | stamp <- format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")
138 | save(times, file = paste0("xgb_", num_cores, stamp))
139 | 
140 | sessioninfo::session_info()
141 | 
142 | if (!interactive()) q("no")
143 | 
144 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_03_expensive.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | library(embed)
  8 | library(rstanarm)
  9 | 
 10 | ## -----------------------------------------------------------------------------
 11 | 
 12 | num_resamples <- 5
 13 | num_grid      <- 10
 14 | num_cores     <- 3
 15 | preproc       <- "expensive preprocessing"
 16 | par_method    <- "everything"
 17 | 
 18 | ## -----------------------------------------------------------------------------
 19 | ## Draw a reproducible 4,000-row modeling sample from the NYC flights data.
 20 | 
 21 | set.seed(123)
 22 | 
 23 | flight_data <- 
 24 |   flights %>% 
 25 |   mutate(
 26 |     # Two-level outcome: "late" when the arrival delay is 30 minutes or more
 27 |     arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
 28 |     # The recipe below works from the calendar date, not the date-time
 29 |     date = as.Date(time_hour)
 30 |   ) %>% 
 31 |   # Join against the weather table on origin airport and hour
 32 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 33 |   # Keep only the columns used for modeling
 34 |   select(
 35 |     dep_time, flight, origin, dest, air_time, distance, 
 36 |     carrier, date, arr_delay, time_hour
 37 |   ) %>% 
 38 |   # Drop any row containing a missing value
 39 |   na.omit() %>% 
 40 |   # Qualitative columns are stored as factors rather than character strings
 41 |   mutate_if(is.character, as.factor) %>% 
 42 |   sample_n(4000)
 43 | 
 44 | ## -----------------------------------------------------------------------------
 45 | ## Recipe for the "expensive" variant, built one step at a time. `flight` is
 46 | ## given the "ID" role, then re-coded as a factor and passed to
 47 | ## step_lencode_bayes() with `arr_delay` as the outcome.
 48 | 
 49 | us_holidays <- timeDate::listHolidays("US")
 50 | 
 51 | flights_rec <- recipe(arr_delay ~ ., data = flight_data)
 52 | flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
 53 | flights_rec <- step_date(flights_rec, date, features = c("dow", "month"))
 54 | flights_rec <- step_holiday(flights_rec, date, holidays = us_holidays)
 55 | flights_rec <- step_rm(flights_rec, date)
 56 | flights_rec <- step_mutate(flights_rec, flight = as.factor(flight))
 57 | flights_rec <- step_lencode_bayes(flights_rec, flight, outcome = vars(arr_delay))
 58 | flights_rec <- step_dummy(flights_rec, all_nominal_predictors())
 59 | flights_rec <- step_zv(flights_rec, all_predictors())
 60 | 
 61 | ## -----------------------------------------------------------------------------
 62 | ## Boosted-tree classifier (xgboost engine) with six tunable hyperparameters.
 63 | 
 64 | xgboost_spec <- boost_tree(
 65 |   trees = tune(), min_n = tune(), tree_depth = tune(),
 66 |   learn_rate = tune(), loss_reduction = tune(), sample_size = tune()
 67 | )
 68 | xgboost_spec <- set_mode(xgboost_spec, "classification")
 69 | xgboost_spec <- set_engine(xgboost_spec, "xgboost")
 70 | 
 71 | ## -----------------------------------------------------------------------------
 72 | ## Recipe-based workflow resampled on `flight_data`, or a variables-only
 73 | ## workflow resampled on `preproc_data` when `preproc` is "no preprocessing".
 74 | ## This script never creates `preproc_data`; with the `preproc` value set
 75 | ## above, only the recipe branch is evaluated.
 76 | 
 77 | uses_recipe <- preproc != "no preprocessing"
 78 | 
 79 | xgboost_workflow <- 
 80 |   if (uses_recipe) {
 81 |     add_model(add_recipe(workflow(), flights_rec), xgboost_spec)
 82 |   } else {
 83 |     add_model(
 84 |       add_variables(workflow(), outcomes = arr_delay, predictors = c(everything())),
 85 |       xgboost_spec
 86 |     )
 87 |   }
 88 | 
 89 | set.seed(33)
 90 | boot_rs <- 
 91 |   if (uses_recipe) {
 92 |     bootstraps(flight_data, times = num_resamples)
 93 |   } else {
 94 |     bootstraps(preproc_data, times = num_resamples)
 95 |   }
 96 | 
 97 | ## -----------------------------------------------------------------------------
 98 | ## Max-entropy grid of `num_grid` candidates; `trees` is given the range
 99 | ## 100 to 2000.
100 | 
101 | set.seed(22)
102 | xgboost_param <- parameters(xgboost_workflow)
103 | xgboost_param <- update(xgboost_param, trees = trees(c(100, 2000)))
104 | xgboost_grid  <- grid_max_entropy(xgboost_param, size = num_grid)
105 | 
106 | ## -----------------------------------------------------------------------------
107 | ## A doMC backend is registered only when more than one core is requested.
108 | 
109 | if (num_cores > 1) registerDoMC(cores = num_cores)
110 | 
111 | ## -----------------------------------------------------------------------------
112 | ## Time the grid search. The tuning result itself is not stored.
113 | 
114 | auc_metric <- metric_set(roc_auc)
115 | grid_ctrl  <- control_grid(parallel_over = par_method)
116 | 
117 | grid_time <- system.time({
118 |   set.seed(99)
119 |   tune_grid(
120 |     xgboost_workflow,
121 |     resamples = boot_rs,
122 |     grid = xgboost_grid,
123 |     metrics = auc_metric,
124 |     control = grid_ctrl
125 |   )
126 | })
127 | 
128 | ## -----------------------------------------------------------------------------
129 | ## One-row record of the run, saved to a time-stamped .RData file.
130 | 
131 | times <- tibble::tibble(
132 |   elapsed       = grid_time[3],
133 |   num_resamples = num_resamples,
134 |   num_grid      = num_grid,
135 |   num_cores     = num_cores,
136 |   preproc       = preproc,
137 |   par_method    = par_method
138 | )
139 | 
140 | stamp <- format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")
141 | save(times, file = paste0("xgb_", num_cores, stamp))
142 | 
143 | sessioninfo::session_info()
144 | 
145 | if (!interactive()) q("no")
146 | 
147 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_03_with.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | 
 10 | num_resamples <- 5
 11 | num_grid      <- 10
 12 | num_cores     <- 3
 13 | preproc       <- "light preprocessing"
 14 | par_method    <- "everything"
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | ## Draw a reproducible 4,000-row modeling sample from the NYC flights data.
 18 | 
 19 | set.seed(123)
 20 | 
 21 | flight_data <- 
 22 |   flights %>% 
 23 |   mutate(
 24 |     # Two-level outcome: "late" when the arrival delay is 30 minutes or more
 25 |     arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
 26 |     # The recipe below works from the calendar date, not the date-time
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Join against the weather table on origin airport and hour
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Keep only the columns used for modeling
 32 |   select(
 33 |     dep_time, flight, origin, dest, air_time, distance, 
 34 |     carrier, date, arr_delay, time_hour
 35 |   ) %>% 
 36 |   # Drop any row containing a missing value
 37 |   na.omit() %>% 
 38 |   # Qualitative columns are stored as factors rather than character strings
 39 |   mutate_if(is.character, as.factor) %>% 
 40 |   sample_n(4000)
 41 | 
 42 | ## -----------------------------------------------------------------------------
 43 | ## Recipe for the "light" variant, built one step at a time.
 44 | 
 45 | us_holidays <- timeDate::listHolidays("US")
 46 | 
 47 | flights_rec <- recipe(arr_delay ~ ., data = flight_data)
 48 | flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
 49 | flights_rec <- step_date(flights_rec, date, features = c("dow", "month"))
 50 | flights_rec <- step_holiday(flights_rec, date, holidays = us_holidays)
 51 | flights_rec <- step_rm(flights_rec, date)
 52 | flights_rec <- step_dummy(flights_rec, all_nominal_predictors())
 53 | flights_rec <- step_zv(flights_rec, all_predictors())
 54 | 
 55 | # Processed copy of the sample (predictors and outcome only). With the
 56 | # `preproc` value set above, the branch that resamples it is not evaluated.
 57 | preproc_data <- juice(prep(flights_rec), all_predictors(), all_outcomes())
 58 | 
 59 | ## -----------------------------------------------------------------------------
 60 | ## Boosted-tree classifier (xgboost engine) with six tunable hyperparameters.
 61 | 
 62 | xgboost_spec <- boost_tree(
 63 |   trees = tune(), min_n = tune(), tree_depth = tune(),
 64 |   learn_rate = tune(), loss_reduction = tune(), sample_size = tune()
 65 | )
 66 | xgboost_spec <- set_mode(xgboost_spec, "classification")
 67 | xgboost_spec <- set_engine(xgboost_spec, "xgboost")
 68 | 
 69 | ## -----------------------------------------------------------------------------
 70 | ## Recipe-based workflow resampled on `flight_data`, or a variables-only
 71 | ## workflow resampled on `preproc_data` when `preproc` is "no preprocessing".
 72 | 
 73 | uses_recipe <- preproc != "no preprocessing"
 74 | 
 75 | xgboost_workflow <- 
 76 |   if (uses_recipe) {
 77 |     add_model(add_recipe(workflow(), flights_rec), xgboost_spec)
 78 |   } else {
 79 |     add_model(
 80 |       add_variables(workflow(), outcomes = arr_delay, predictors = c(everything())),
 81 |       xgboost_spec
 82 |     )
 83 |   }
 84 | 
 85 | set.seed(33)
 86 | boot_rs <- 
 87 |   if (uses_recipe) {
 88 |     bootstraps(flight_data, times = num_resamples)
 89 |   } else {
 90 |     bootstraps(preproc_data, times = num_resamples)
 91 |   }
 92 | 
 93 | ## -----------------------------------------------------------------------------
 94 | ## Max-entropy grid of `num_grid` candidates; `trees` is given the range
 95 | ## 100 to 2000.
 96 | 
 97 | set.seed(22)
 98 | xgboost_param <- parameters(xgboost_workflow)
 99 | xgboost_param <- update(xgboost_param, trees = trees(c(100, 2000)))
100 | xgboost_grid  <- grid_max_entropy(xgboost_param, size = num_grid)
101 | 
102 | ## -----------------------------------------------------------------------------
103 | ## A doMC backend is registered only when more than one core is requested.
104 | 
105 | if (num_cores > 1) registerDoMC(cores = num_cores)
106 | 
107 | ## -----------------------------------------------------------------------------
108 | ## Time the grid search. The tuning result itself is not stored.
109 | 
110 | auc_metric <- metric_set(roc_auc)
111 | grid_ctrl  <- control_grid(parallel_over = par_method)
112 | 
113 | grid_time <- system.time({
114 |   set.seed(99)
115 |   tune_grid(
116 |     xgboost_workflow,
117 |     resamples = boot_rs,
118 |     grid = xgboost_grid,
119 |     metrics = auc_metric,
120 |     control = grid_ctrl
121 |   )
122 | })
123 | 
124 | ## -----------------------------------------------------------------------------
125 | ## One-row record of the run, saved to a time-stamped .RData file.
126 | 
127 | times <- tibble::tibble(
128 |   elapsed       = grid_time[3],
129 |   num_resamples = num_resamples,
130 |   num_grid      = num_grid,
131 |   num_cores     = num_cores,
132 |   preproc       = preproc,
133 |   par_method    = par_method
134 | )
135 | 
136 | stamp <- format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")
137 | save(times, file = paste0("xgb_", num_cores, stamp))
138 | 
139 | sessioninfo::session_info()
140 | 
141 | if (!interactive()) q("no")
142 | 
143 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_03_without.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | 
 10 | num_resamples <- 5
 11 | num_grid      <- 10
 12 | num_cores     <- 3
 13 | preproc       <- "no preprocessing"
 14 | par_method    <- "everything"
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | ## Draw a reproducible 4,000-row modeling sample from the NYC flights data.
 18 | 
 19 | set.seed(123)
 20 | 
 21 | flight_data <- 
 22 |   flights %>% 
 23 |   mutate(
 24 |     # Two-level outcome: "late" when the arrival delay is 30 minutes or more
 25 |     arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
 26 |     # The recipe below works from the calendar date, not the date-time
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Join against the weather table on origin airport and hour
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Keep only the columns used for modeling
 32 |   select(
 33 |     dep_time, flight, origin, dest, air_time, distance, 
 34 |     carrier, date, arr_delay, time_hour
 35 |   ) %>% 
 36 |   # Drop any row containing a missing value
 37 |   na.omit() %>% 
 38 |   # Qualitative columns are stored as factors rather than character strings
 39 |   mutate_if(is.character, as.factor) %>% 
 40 |   sample_n(4000)
 41 | 
 42 | ## -----------------------------------------------------------------------------
 43 | ## Recipe built one step at a time. With the `preproc` value set above it is
 44 | ## applied once, up front, to create `preproc_data`, and is not part of the
 45 | ## tuned workflow.
 46 | 
 47 | us_holidays <- timeDate::listHolidays("US")
 48 | 
 49 | flights_rec <- recipe(arr_delay ~ ., data = flight_data)
 50 | flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
 51 | flights_rec <- step_date(flights_rec, date, features = c("dow", "month"))
 52 | flights_rec <- step_holiday(flights_rec, date, holidays = us_holidays)
 53 | flights_rec <- step_rm(flights_rec, date)
 54 | flights_rec <- step_dummy(flights_rec, all_nominal_predictors())
 55 | flights_rec <- step_zv(flights_rec, all_predictors())
 56 | 
 57 | # Processed copy of the sample (predictors and outcome only)
 58 | preproc_data <- juice(prep(flights_rec), all_predictors(), all_outcomes())
 59 | 
 60 | ## -----------------------------------------------------------------------------
 61 | ## Boosted-tree classifier (xgboost engine) with six tunable hyperparameters.
 62 | 
 63 | xgboost_spec <- boost_tree(
 64 |   trees = tune(), min_n = tune(), tree_depth = tune(),
 65 |   learn_rate = tune(), loss_reduction = tune(), sample_size = tune()
 66 | )
 67 | xgboost_spec <- set_mode(xgboost_spec, "classification")
 68 | xgboost_spec <- set_engine(xgboost_spec, "xgboost")
 69 | 
 70 | ## -----------------------------------------------------------------------------
 71 | ## Recipe-based workflow resampled on `flight_data`, or a variables-only
 72 | ## workflow resampled on `preproc_data` when `preproc` is "no preprocessing".
 73 | 
 74 | uses_recipe <- preproc != "no preprocessing"
 75 | 
 76 | xgboost_workflow <- 
 77 |   if (uses_recipe) {
 78 |     add_model(add_recipe(workflow(), flights_rec), xgboost_spec)
 79 |   } else {
 80 |     add_model(
 81 |       add_variables(workflow(), outcomes = arr_delay, predictors = c(everything())),
 82 |       xgboost_spec
 83 |     )
 84 |   }
 85 | 
 86 | set.seed(33)
 87 | boot_rs <- 
 88 |   if (uses_recipe) {
 89 |     bootstraps(flight_data, times = num_resamples)
 90 |   } else {
 91 |     bootstraps(preproc_data, times = num_resamples)
 92 |   }
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | ## Max-entropy grid of `num_grid` candidates; `trees` is given the range
 96 | ## 100 to 2000.
 97 | 
 98 | set.seed(22)
 99 | xgboost_param <- parameters(xgboost_workflow)
100 | xgboost_param <- update(xgboost_param, trees = trees(c(100, 2000)))
101 | xgboost_grid  <- grid_max_entropy(xgboost_param, size = num_grid)
102 | 
103 | ## -----------------------------------------------------------------------------
104 | ## A doMC backend is registered only when more than one core is requested.
105 | 
106 | if (num_cores > 1) registerDoMC(cores = num_cores)
107 | 
108 | ## -----------------------------------------------------------------------------
109 | ## Time the grid search. The tuning result itself is not stored.
110 | 
111 | auc_metric <- metric_set(roc_auc)
112 | grid_ctrl  <- control_grid(parallel_over = par_method)
113 | 
114 | grid_time <- system.time({
115 |   set.seed(99)
116 |   tune_grid(
117 |     xgboost_workflow,
118 |     resamples = boot_rs,
119 |     grid = xgboost_grid,
120 |     metrics = auc_metric,
121 |     control = grid_ctrl
122 |   )
123 | })
124 | 
125 | ## -----------------------------------------------------------------------------
126 | ## One-row record of the run, saved to a time-stamped .RData file.
127 | 
128 | times <- tibble::tibble(
129 |   elapsed       = grid_time[3],
130 |   num_resamples = num_resamples,
131 |   num_grid      = num_grid,
132 |   num_cores     = num_cores,
133 |   preproc       = preproc,
134 |   par_method    = par_method
135 | )
136 | 
137 | stamp <- format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")
138 | save(times, file = paste0("xgb_", num_cores, stamp))
139 | 
140 | sessioninfo::session_info()
141 | 
142 | if (!interactive()) q("no")
143 | 
144 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_04_expensive.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | library(embed)
  8 | library(rstanarm)
  9 | 
 10 | ## -----------------------------------------------------------------------------
 11 | 
 12 | num_resamples <- 5
 13 | num_grid <- 10
 14 | num_cores <- 4
 15 | preproc <- "expensive preprocessing"
 16 | par_method <- "everything"
 17 | 
 18 | ## -----------------------------------------------------------------------------
 19 | 
 20 | set.seed(123)
 21 | 
 22 | flight_data <- 
 23 |   flights %>% 
 24 |   mutate(
 25 |     # Convert the arrival delay to a factor
 26 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 27 |     arr_delay = factor(arr_delay),
 28 |     # We will use the date (not date-time) in the recipe below
 29 |     date = as.Date(time_hour)
 30 |   ) %>% 
 31 |   # Include the weather data
 32 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 33 |   # Only retain the specific columns we will use
 34 |   select(dep_time, flight, origin, dest, air_time, distance, 
 35 |          carrier, date, arr_delay, time_hour) %>% 
 36 |   # Exclude missing data
 37 |   na.omit() %>% 
 38 |   # For creating models, it is better to have qualitative columns
 39 |   # encoded as factors (instead of character strings)
 40 |   mutate_if(is.character, as.factor) %>% 
 41 |   sample_n(4000)
 42 | 
 43 | ## -----------------------------------------------------------------------------
 44 | 
 45 | flights_rec <- 
 46 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 47 |   update_role(flight, time_hour, new_role = "ID") %>% 
 48 |   step_date(date, features = c("dow", "month")) %>% 
 49 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 50 |   step_rm(date) %>% 
 51 |   step_mutate(flight = as.factor(flight)) %>% 
 52 |   step_lencode_bayes(flight, outcome = vars(arr_delay)) %>% 
 53 |   step_dummy(all_nominal_predictors()) %>% 
 54 |   step_zv(all_predictors()) 
 55 | 
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | 
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # Choose workflow and resamples from `preproc`. The else branch preps the recipe itself because no `preproc_data` object is created before this point in the script.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>% 
 71 |     add_model(xgboost_spec) 
 72 | 
 73 |   set.seed(33)   # same seed in both branches
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>% 
 79 |     add_model(xgboost_spec) 
 80 |   preproc_data <- flights_rec %>% prep() %>% juice(all_predictors(), all_outcomes())   # prepped before set.seed() so the recipe's RNG use cannot shift the bootstrap draw
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Max-entropy grid of `num_grid` candidates over the workflow's tuning parameters, with the `trees` range overridden to 100-2000.
 87 | set.seed(22)
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   parameters() %>% 
 91 |   update(trees = trees(c(100, 2000))) %>% 
 92 |   grid_max_entropy(size = num_grid)
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register the doMC parallel backend only when more than one core was requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time the grid search. roc_auc is the only metric; parallel_over is taken from `par_method`.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | # The tune_grid() result is not assigned; only the system.time() measurement is kept.
106 | grid_time <- system.time({
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row record of this run: the third element of the system.time() result (elapsed) plus the run settings.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # Saved as "xgb_<num_cores>_<YYYY_mm_dd_HH_MM_SS>.RData"; preproc and par_method are not part of the file name.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | # Print session/package details.
126 | sessioninfo::session_info()
127 | # In non-interactive runs, quit R without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_04_with.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | # Run settings; each value is also recorded in the `times` tibble saved at the end of the script.
 10 | num_resamples <- 5   # bootstrap resamples (`times` in bootstraps())
 11 | num_grid <- 10   # size passed to grid_max_entropy()
 12 | num_cores <- 4   # workers for registerDoMC(); registration is skipped unless > 1
 13 | preproc <- "light preprocessing"   # any value except "no preprocessing" selects the recipe workflow
 14 | par_method <- "everything"   # forwarded to control_grid(parallel_over = )
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | # Modeling data: `flights` with a two-level outcome and a Date column, joined to `weather`, trimmed to ten columns, NA rows dropped, then subsampled.
 18 | set.seed(123)
 19 | # The seed above fixes the sample_n(4000) draw at the end of this pipeline.
 20 | flight_data <- 
 21 |   flights %>% 
 22 |   mutate(
 23 |     # Recode arrival delay as a two-level factor: "late" when arr_delay >= 30, otherwise "on_time"
 24 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 25 |     arr_delay = factor(arr_delay),
 26 |     # We will use the date (not date-time) in the recipe below
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Inner join to weather on origin and time_hour. NOTE(review): the columns kept by select() below all look like flight fields, so this join may only filter rows -- confirm
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Only retain the specific columns we will use
 32 |   select(dep_time, flight, origin, dest, air_time, distance, 
 33 |          carrier, date, arr_delay, time_hour) %>% 
 34 |   # Exclude missing data
 35 |   na.omit() %>% 
 36 |   # For creating models, it is better to have qualitative columns
 37 |   # encoded as factors (instead of character strings)
 38 |   mutate_if(is.character, as.factor) %>% 
 39 |   sample_n(4000)
 40 | 
 41 | ## -----------------------------------------------------------------------------
 42 | # Recipe: flight and time_hour get the "ID" role; date is expanded into dow/month and US-holiday features and then removed; nominal predictors are dummy-coded and step_zv() is applied to all predictors.
 43 | flights_rec <- 
 44 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 45 |   update_role(flight, time_hour, new_role = "ID") %>% 
 46 |   step_date(date, features = c("dow", "month")) %>% 
 47 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 48 |   step_rm(date) %>% 
 49 |   step_dummy(all_nominal_predictors()) %>% 
 50 |   step_zv(all_predictors())
 51 | # Prepped data (predictor and outcome columns only); in this script only the "no preprocessing" branch further down resamples it.
 52 | preproc_data <- 
 53 |   flights_rec %>% 
 54 |   prep() %>% 
 55 |   juice(all_predictors(), all_outcomes())
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classifier on the xgboost engine; six arguments are left as tune() placeholders.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # Choose workflow and resamples from `preproc`: recipe + raw flight_data, or add_variables() + the already-prepped preproc_data.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>% 
 71 |     add_model(xgboost_spec) 
 72 | 
 73 |   set.seed(33)   # same seed in both branches
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>% 
 79 |     add_model(xgboost_spec) 
 80 |   
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Max-entropy grid of `num_grid` candidates over the workflow's tuning parameters, with the `trees` range overridden to 100-2000.
 87 | set.seed(22)
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   parameters() %>% 
 91 |   update(trees = trees(c(100, 2000))) %>% 
 92 |   grid_max_entropy(size = num_grid)
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register the doMC parallel backend only when more than one core was requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time the grid search. roc_auc is the only metric; parallel_over is taken from `par_method`.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | # The tune_grid() result is not assigned; only the system.time() measurement is kept.
106 | grid_time <- system.time({
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row record of this run: the third element of the system.time() result (elapsed) plus the run settings.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # Saved as "xgb_<num_cores>_<YYYY_mm_dd_HH_MM_SS>.RData"; preproc and par_method are not part of the file name.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | # Print session/package details.
126 | sessioninfo::session_info()
127 | # In non-interactive runs, quit R without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_04_without.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | # Run settings; each value is also recorded in the `times` tibble saved at the end of the script.
 10 | num_resamples <- 5   # bootstrap resamples (`times` in bootstraps())
 11 | num_grid <- 10   # size passed to grid_max_entropy()
 12 | num_cores <- 4   # workers for registerDoMC(); registration is skipped unless > 1
 13 | preproc <- "no preprocessing"   # this exact value selects the add_variables() branch, which resamples preproc_data
 14 | par_method <- "everything"   # forwarded to control_grid(parallel_over = )
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | # Modeling data: `flights` with a two-level outcome and a Date column, joined to `weather`, trimmed to ten columns, NA rows dropped, then subsampled.
 18 | set.seed(123)
 19 | # The seed above fixes the sample_n(4000) draw at the end of this pipeline.
 20 | flight_data <- 
 21 |   flights %>% 
 22 |   mutate(
 23 |     # Recode arrival delay as a two-level factor: "late" when arr_delay >= 30, otherwise "on_time"
 24 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 25 |     arr_delay = factor(arr_delay),
 26 |     # We will use the date (not date-time) in the recipe below
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Inner join to weather on origin and time_hour. NOTE(review): the columns kept by select() below all look like flight fields, so this join may only filter rows -- confirm
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Only retain the specific columns we will use
 32 |   select(dep_time, flight, origin, dest, air_time, distance, 
 33 |          carrier, date, arr_delay, time_hour) %>% 
 34 |   # Exclude missing data
 35 |   na.omit() %>% 
 36 |   # For creating models, it is better to have qualitative columns
 37 |   # encoded as factors (instead of character strings)
 38 |   mutate_if(is.character, as.factor) %>% 
 39 |   sample_n(4000)
 40 | 
 41 | ## -----------------------------------------------------------------------------
 42 | # Recipe: flight and time_hour get the "ID" role; date is expanded into dow/month and US-holiday features and then removed; nominal predictors are dummy-coded and step_zv() is applied to all predictors.
 43 | flights_rec <- 
 44 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 45 |   update_role(flight, time_hour, new_role = "ID") %>% 
 46 |   step_date(date, features = c("dow", "month")) %>% 
 47 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 48 |   step_rm(date) %>% 
 49 |   step_dummy(all_nominal_predictors()) %>% 
 50 |   step_zv(all_predictors())
 51 | # Prepped data (predictor and outcome columns only); in this script only the "no preprocessing" branch further down resamples it.
 52 | preproc_data <- 
 53 |   flights_rec %>% 
 54 |   prep() %>% 
 55 |   juice(all_predictors(), all_outcomes())
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classifier on the xgboost engine; six arguments are left as tune() placeholders.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # Choose workflow and resamples from `preproc`: recipe + raw flight_data, or add_variables() + the already-prepped preproc_data.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>% 
 71 |     add_model(xgboost_spec) 
 72 | 
 73 |   set.seed(33)   # same seed in both branches
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>% 
 79 |     add_model(xgboost_spec) 
 80 |   
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Max-entropy grid of `num_grid` candidates over the workflow's tuning parameters, with the `trees` range overridden to 100-2000.
 87 | set.seed(22)
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   parameters() %>% 
 91 |   update(trees = trees(c(100, 2000))) %>% 
 92 |   grid_max_entropy(size = num_grid)
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register the doMC parallel backend only when more than one core was requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time the grid search. roc_auc is the only metric; parallel_over is taken from `par_method`.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | # The tune_grid() result is not assigned; only the system.time() measurement is kept.
106 | grid_time <- system.time({
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row record of this run: the third element of the system.time() result (elapsed) plus the run settings.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # Saved as "xgb_<num_cores>_<YYYY_mm_dd_HH_MM_SS>.RData"; preproc and par_method are not part of the file name.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | # Print session/package details.
126 | sessioninfo::session_info()
127 | # In non-interactive runs, quit R without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_05_expensive.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | library(embed)
  8 | library(rstanarm)
  9 | 
 10 | ## -----------------------------------------------------------------------------
 11 | # Run settings; each value is also recorded in the `times` tibble saved at the end of the script.
 12 | num_resamples <- 5   # bootstrap resamples (`times` in bootstraps())
 13 | num_grid <- 10   # size passed to grid_max_entropy()
 14 | num_cores <- 5   # workers for registerDoMC(); registration is skipped unless > 1
 15 | preproc <- "expensive preprocessing"   # any value except "no preprocessing" selects the recipe workflow
 16 | par_method <- "everything"   # forwarded to control_grid(parallel_over = )
 17 | 
 18 | ## -----------------------------------------------------------------------------
 19 | # Modeling data: `flights` with a two-level outcome and a Date column, joined to `weather`, trimmed to ten columns, NA rows dropped, then subsampled.
 20 | set.seed(123)
 21 | # The seed above fixes the sample_n(4000) draw at the end of this pipeline.
 22 | flight_data <- 
 23 |   flights %>% 
 24 |   mutate(
 25 |     # Recode arrival delay as a two-level factor: "late" when arr_delay >= 30, otherwise "on_time"
 26 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 27 |     arr_delay = factor(arr_delay),
 28 |     # We will use the date (not date-time) in the recipe below
 29 |     date = as.Date(time_hour)
 30 |   ) %>% 
 31 |   # Inner join to weather on origin and time_hour. NOTE(review): the columns kept by select() below all look like flight fields, so this join may only filter rows -- confirm
 32 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 33 |   # Only retain the specific columns we will use
 34 |   select(dep_time, flight, origin, dest, air_time, distance, 
 35 |          carrier, date, arr_delay, time_hour) %>% 
 36 |   # Exclude missing data
 37 |   na.omit() %>% 
 38 |   # For creating models, it is better to have qualitative columns
 39 |   # encoded as factors (instead of character strings)
 40 |   mutate_if(is.character, as.factor) %>% 
 41 |   sample_n(4000)
 42 | 
 43 | ## -----------------------------------------------------------------------------
 44 | # Recipe: flight and time_hour get the "ID" role; date is expanded into dow/month and US-holiday features and then removed; flight is made a factor and passed to step_lencode_bayes() with arr_delay as the outcome; nominal predictors are dummy-coded and step_zv() is applied to all predictors.
 45 | flights_rec <- 
 46 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 47 |   update_role(flight, time_hour, new_role = "ID") %>% 
 48 |   step_date(date, features = c("dow", "month")) %>% 
 49 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 50 |   step_rm(date) %>% 
 51 |   step_mutate(flight = as.factor(flight)) %>% 
 52 |   step_lencode_bayes(flight, outcome = vars(arr_delay)) %>% 
 53 |   step_dummy(all_nominal_predictors()) %>% 
 54 |   step_zv(all_predictors()) 
 55 | # NOTE(review): flight carries the "ID" role, so its encoded value presumably never reaches the model; the encoding step looks like it exists for its computational cost -- confirm
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classifier on the xgboost engine; six arguments are left as tune() placeholders.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # Choose workflow and resamples from `preproc`. The else branch preps the recipe itself because this script creates no top-level `preproc_data` object.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>% 
 71 |     add_model(xgboost_spec) 
 72 | 
 73 |   set.seed(33)   # same seed in both branches
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>% 
 79 |     add_model(xgboost_spec) 
 80 |   preproc_data <- flights_rec %>% prep() %>% juice(all_predictors(), all_outcomes())   # prepped before set.seed() so the recipe's RNG use cannot shift the bootstrap draw
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Max-entropy grid of `num_grid` candidates over the workflow's tuning parameters, with the `trees` range overridden to 100-2000.
 87 | set.seed(22)
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   parameters() %>% 
 91 |   update(trees = trees(c(100, 2000))) %>% 
 92 |   grid_max_entropy(size = num_grid)
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register the doMC parallel backend only when more than one core was requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time the grid search. roc_auc is the only metric; parallel_over is taken from `par_method`.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | # The tune_grid() result is not assigned; only the system.time() measurement is kept.
106 | grid_time <- system.time({
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row record of this run: the third element of the system.time() result (elapsed) plus the run settings.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # Saved as "xgb_<num_cores>_<YYYY_mm_dd_HH_MM_SS>.RData"; preproc and par_method are not part of the file name.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | # Print session/package details.
126 | sessioninfo::session_info()
127 | # In non-interactive runs, quit R without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_05_with.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | # Run settings; each value is also recorded in the `times` tibble saved at the end of the script.
 10 | num_resamples <- 5   # bootstrap resamples (`times` in bootstraps())
 11 | num_grid <- 10   # size passed to grid_max_entropy()
 12 | num_cores <- 5   # workers for registerDoMC(); registration is skipped unless > 1
 13 | preproc <- "light preprocessing"   # any value except "no preprocessing" selects the recipe workflow
 14 | par_method <- "everything"   # forwarded to control_grid(parallel_over = )
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | # Modeling data: `flights` with a two-level outcome and a Date column, joined to `weather`, trimmed to ten columns, NA rows dropped, then subsampled.
 18 | set.seed(123)
 19 | # The seed above fixes the sample_n(4000) draw at the end of this pipeline.
 20 | flight_data <- 
 21 |   flights %>% 
 22 |   mutate(
 23 |     # Recode arrival delay as a two-level factor: "late" when arr_delay >= 30, otherwise "on_time"
 24 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 25 |     arr_delay = factor(arr_delay),
 26 |     # We will use the date (not date-time) in the recipe below
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Inner join to weather on origin and time_hour. NOTE(review): the columns kept by select() below all look like flight fields, so this join may only filter rows -- confirm
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Only retain the specific columns we will use
 32 |   select(dep_time, flight, origin, dest, air_time, distance, 
 33 |          carrier, date, arr_delay, time_hour) %>% 
 34 |   # Exclude missing data
 35 |   na.omit() %>% 
 36 |   # For creating models, it is better to have qualitative columns
 37 |   # encoded as factors (instead of character strings)
 38 |   mutate_if(is.character, as.factor) %>% 
 39 |   sample_n(4000)
 40 | 
 41 | ## -----------------------------------------------------------------------------
 42 | # Recipe: flight and time_hour get the "ID" role; date is expanded into dow/month and US-holiday features and then removed; nominal predictors are dummy-coded and step_zv() is applied to all predictors.
 43 | flights_rec <- 
 44 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 45 |   update_role(flight, time_hour, new_role = "ID") %>% 
 46 |   step_date(date, features = c("dow", "month")) %>% 
 47 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 48 |   step_rm(date) %>% 
 49 |   step_dummy(all_nominal_predictors()) %>% 
 50 |   step_zv(all_predictors())
 51 | # Prepped data (predictor and outcome columns only); in this script only the "no preprocessing" branch further down resamples it.
 52 | preproc_data <- 
 53 |   flights_rec %>% 
 54 |   prep() %>% 
 55 |   juice(all_predictors(), all_outcomes())
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classifier on the xgboost engine; six arguments are left as tune() placeholders.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # Choose workflow and resamples from `preproc`: recipe + raw flight_data, or add_variables() + the already-prepped preproc_data.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>% 
 71 |     add_model(xgboost_spec) 
 72 | 
 73 |   set.seed(33)   # same seed in both branches
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>% 
 79 |     add_model(xgboost_spec) 
 80 |   
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Max-entropy grid of `num_grid` candidates over the workflow's tuning parameters, with the `trees` range overridden to 100-2000.
 87 | set.seed(22)
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   parameters() %>% 
 91 |   update(trees = trees(c(100, 2000))) %>% 
 92 |   grid_max_entropy(size = num_grid)
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register the doMC parallel backend only when more than one core was requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time the grid search. roc_auc is the only metric; parallel_over is taken from `par_method`.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | # The tune_grid() result is not assigned; only the system.time() measurement is kept.
106 | grid_time <- system.time({
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row record of this run: the third element of the system.time() result (elapsed) plus the run settings.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # Saved as "xgb_<num_cores>_<YYYY_mm_dd_HH_MM_SS>.RData"; preproc and par_method are not part of the file name.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | # Print session/package details.
126 | sessioninfo::session_info()
127 | # In non-interactive runs, quit R without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_05_without.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | # Run settings; each value is also recorded in the `times` tibble saved at the end of the script.
 10 | num_resamples <- 5   # bootstrap resamples (`times` in bootstraps())
 11 | num_grid <- 10   # size passed to grid_max_entropy()
 12 | num_cores <- 5   # workers for registerDoMC(); registration is skipped unless > 1
 13 | preproc <- "no preprocessing"   # this exact value selects the add_variables() branch, which resamples preproc_data
 14 | par_method <- "everything"   # forwarded to control_grid(parallel_over = )
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | # Modeling data: `flights` with a two-level outcome and a Date column, joined to `weather`, trimmed to ten columns, NA rows dropped, then subsampled.
 18 | set.seed(123)
 19 | # The seed above fixes the sample_n(4000) draw at the end of this pipeline.
 20 | flight_data <- 
 21 |   flights %>% 
 22 |   mutate(
 23 |     # Recode arrival delay as a two-level factor: "late" when arr_delay >= 30, otherwise "on_time"
 24 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 25 |     arr_delay = factor(arr_delay),
 26 |     # We will use the date (not date-time) in the recipe below
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Inner join to weather on origin and time_hour. NOTE(review): the columns kept by select() below all look like flight fields, so this join may only filter rows -- confirm
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Only retain the specific columns we will use
 32 |   select(dep_time, flight, origin, dest, air_time, distance, 
 33 |          carrier, date, arr_delay, time_hour) %>% 
 34 |   # Exclude missing data
 35 |   na.omit() %>% 
 36 |   # For creating models, it is better to have qualitative columns
 37 |   # encoded as factors (instead of character strings)
 38 |   mutate_if(is.character, as.factor) %>% 
 39 |   sample_n(4000)
 40 | 
 41 | ## -----------------------------------------------------------------------------
 42 | # Recipe: flight and time_hour get the "ID" role; date is expanded into dow/month and US-holiday features and then removed; nominal predictors are dummy-coded and step_zv() is applied to all predictors.
 43 | flights_rec <- 
 44 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 45 |   update_role(flight, time_hour, new_role = "ID") %>% 
 46 |   step_date(date, features = c("dow", "month")) %>% 
 47 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 48 |   step_rm(date) %>% 
 49 |   step_dummy(all_nominal_predictors()) %>% 
 50 |   step_zv(all_predictors())
 51 | # Prepped data (predictor and outcome columns only); in this script only the "no preprocessing" branch further down resamples it.
 52 | preproc_data <- 
 53 |   flights_rec %>% 
 54 |   prep() %>% 
 55 |   juice(all_predictors(), all_outcomes())
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classifier on the xgboost engine; six arguments are left as tune() placeholders.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # Choose workflow and resamples from `preproc`: recipe + raw flight_data, or add_variables() + the already-prepped preproc_data.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>% 
 71 |     add_model(xgboost_spec) 
 72 | 
 73 |   set.seed(33)   # same seed in both branches
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>% 
 79 |     add_model(xgboost_spec) 
 80 |   
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Max-entropy grid of `num_grid` candidates over the workflow's tuning parameters, with the `trees` range overridden to 100-2000.
 87 | set.seed(22)
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   parameters() %>% 
 91 |   update(trees = trees(c(100, 2000))) %>% 
 92 |   grid_max_entropy(size = num_grid)
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register the doMC parallel backend only when more than one core was requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time the grid search. roc_auc is the only metric; parallel_over is taken from `par_method`.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | # The tune_grid() result is not assigned; only the system.time() measurement is kept.
106 | grid_time <- system.time({
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row record of this run: the third element of the system.time() result (elapsed) plus the run settings.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # Saved as "xgb_<num_cores>_<YYYY_mm_dd_HH_MM_SS>.RData"; preproc and par_method are not part of the file name.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | # Print session/package details.
126 | sessioninfo::session_info()
127 | # In non-interactive runs, quit R without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_10_expensive.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | library(embed)
  8 | library(rstanarm)
  9 | 
 10 | ## -----------------------------------------------------------------------------
 11 | # Run settings; each value is also recorded in the `times` tibble saved at the end of the script.
 12 | num_resamples <- 5   # bootstrap resamples (`times` in bootstraps())
 13 | num_grid <- 10   # size passed to grid_max_entropy()
 14 | num_cores <- 10   # workers for registerDoMC(); registration is skipped unless > 1
 15 | preproc <- "expensive preprocessing"   # any value except "no preprocessing" selects the recipe workflow
 16 | par_method <- "everything"   # forwarded to control_grid(parallel_over = )
 17 | 
 18 | ## -----------------------------------------------------------------------------
 19 | # Modeling data: `flights` with a two-level outcome and a Date column, joined to `weather`, trimmed to ten columns, NA rows dropped, then subsampled.
 20 | set.seed(123)
 21 | # The seed above fixes the sample_n(4000) draw at the end of this pipeline.
 22 | flight_data <- 
 23 |   flights %>% 
 24 |   mutate(
 25 |     # Recode arrival delay as a two-level factor: "late" when arr_delay >= 30, otherwise "on_time"
 26 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 27 |     arr_delay = factor(arr_delay),
 28 |     # We will use the date (not date-time) in the recipe below
 29 |     date = as.Date(time_hour)
 30 |   ) %>% 
 31 |   # Inner join to weather on origin and time_hour. NOTE(review): the columns kept by select() below all look like flight fields, so this join may only filter rows -- confirm
 32 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 33 |   # Only retain the specific columns we will use
 34 |   select(dep_time, flight, origin, dest, air_time, distance, 
 35 |          carrier, date, arr_delay, time_hour) %>% 
 36 |   # Exclude missing data
 37 |   na.omit() %>% 
 38 |   # For creating models, it is better to have qualitative columns
 39 |   # encoded as factors (instead of character strings)
 40 |   mutate_if(is.character, as.factor) %>% 
 41 |   sample_n(4000)
 42 | 
 43 | ## -----------------------------------------------------------------------------
 44 | # Recipe: flight and time_hour get the "ID" role; date is expanded into dow/month and US-holiday features and then removed; flight is made a factor and passed to step_lencode_bayes() with arr_delay as the outcome; nominal predictors are dummy-coded and step_zv() is applied to all predictors.
 45 | flights_rec <- 
 46 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 47 |   update_role(flight, time_hour, new_role = "ID") %>% 
 48 |   step_date(date, features = c("dow", "month")) %>% 
 49 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 50 |   step_rm(date) %>% 
 51 |   step_mutate(flight = as.factor(flight)) %>% 
 52 |   step_lencode_bayes(flight, outcome = vars(arr_delay)) %>% 
 53 |   step_dummy(all_nominal_predictors()) %>% 
 54 |   step_zv(all_predictors()) 
 55 | # NOTE(review): flight carries the "ID" role, so its encoded value presumably never reaches the model; the encoding step looks like it exists for its computational cost -- confirm
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classifier on the xgboost engine; six arguments are left as tune() placeholders.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # Choose workflow and resamples from `preproc`. The else branch preps the recipe itself because this script creates no top-level `preproc_data` object.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>% 
 71 |     add_model(xgboost_spec) 
 72 | 
 73 |   set.seed(33)   # same seed in both branches
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>% 
 79 |     add_model(xgboost_spec) 
 80 |   preproc_data <- flights_rec %>% prep() %>% juice(all_predictors(), all_outcomes())   # prepped before set.seed() so the recipe's RNG use cannot shift the bootstrap draw
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Max-entropy grid of `num_grid` candidates over the workflow's tuning parameters, with the `trees` range overridden to 100-2000.
 87 | set.seed(22)
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   parameters() %>% 
 91 |   update(trees = trees(c(100, 2000))) %>% 
 92 |   grid_max_entropy(size = num_grid)
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register the doMC parallel backend only when more than one core was requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time the grid search. roc_auc is the only metric; parallel_over is taken from `par_method`.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | # The tune_grid() result is not assigned; only the system.time() measurement is kept.
106 | grid_time <- system.time({
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row record of this run: the third element of the system.time() result (elapsed) plus the run settings.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # Saved as "xgb_<num_cores>_<YYYY_mm_dd_HH_MM_SS>.RData"; preproc and par_method are not part of the file name.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | # Print session/package details.
126 | sessioninfo::session_info()
127 | # In non-interactive runs, quit R without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_10_with.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | # Run settings; each value is also recorded in the `times` tibble saved at the end of the script.
 10 | num_resamples <- 5   # bootstrap resamples (`times` in bootstraps())
 11 | num_grid <- 10   # size passed to grid_max_entropy()
 12 | num_cores <- 10   # workers for registerDoMC(); registration is skipped unless > 1
 13 | preproc <- "light preprocessing"   # any value except "no preprocessing" selects the recipe workflow
 14 | par_method <- "everything"   # forwarded to control_grid(parallel_over = )
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | # Modeling data: `flights` with a two-level outcome and a Date column, joined to `weather`, trimmed to ten columns, NA rows dropped, then subsampled.
 18 | set.seed(123)
 19 | # The seed above fixes the sample_n(4000) draw at the end of this pipeline.
 20 | flight_data <- 
 21 |   flights %>% 
 22 |   mutate(
 23 |     # Recode arrival delay as a two-level factor: "late" when arr_delay >= 30, otherwise "on_time"
 24 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 25 |     arr_delay = factor(arr_delay),
 26 |     # We will use the date (not date-time) in the recipe below
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Inner join to weather on origin and time_hour. NOTE(review): the columns kept by select() below all look like flight fields, so this join may only filter rows -- confirm
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Only retain the specific columns we will use
 32 |   select(dep_time, flight, origin, dest, air_time, distance, 
 33 |          carrier, date, arr_delay, time_hour) %>% 
 34 |   # Exclude missing data
 35 |   na.omit() %>% 
 36 |   # For creating models, it is better to have qualitative columns
 37 |   # encoded as factors (instead of character strings)
 38 |   mutate_if(is.character, as.factor) %>% 
 39 |   sample_n(4000)
 40 | 
 41 | ## -----------------------------------------------------------------------------
 42 | # Recipe: flight and time_hour get the "ID" role; date is expanded into dow/month and US-holiday features and then removed; nominal predictors are dummy-coded and step_zv() is applied to all predictors.
 43 | flights_rec <- 
 44 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 45 |   update_role(flight, time_hour, new_role = "ID") %>% 
 46 |   step_date(date, features = c("dow", "month")) %>% 
 47 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 48 |   step_rm(date) %>% 
 49 |   step_dummy(all_nominal_predictors()) %>% 
 50 |   step_zv(all_predictors())
 51 | # Prepped data (predictor and outcome columns only); in this script only the "no preprocessing" branch further down resamples it.
 52 | preproc_data <- 
 53 |   flights_rec %>% 
 54 |   prep() %>% 
 55 |   juice(all_predictors(), all_outcomes())
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classifier on the xgboost engine; six arguments are left as tune() placeholders.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # Choose workflow and resamples from `preproc`: recipe + raw flight_data, or add_variables() + the already-prepped preproc_data.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>% 
 71 |     add_model(xgboost_spec) 
 72 | 
 73 |   set.seed(33)   # same seed in both branches
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>% 
 79 |     add_model(xgboost_spec) 
 80 |   
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Candidate grid: `num_grid` hyperparameter combinations from a maximum-entropy design.
 87 | set.seed(22)  # presumably the design is built with random numbers; seeded for repeatability
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   extract_parameter_set_dials() %>%   # was parameters(), whose workflow method is deprecated
 91 |   update(trees = trees(c(100, 2000))) %>%   # search `trees` over the range 100 to 2000
 92 |   grid_max_entropy(size = num_grid)  # NOTE(review): newer dials reportedly deprecates this for grid_space_filling(); kept so grids match earlier runs
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register a doMC parallel backend only when more than one core is requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time one full grid search; candidates are scored with ROC AUC only.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | 
106 | grid_time <- system.time({  # the timed region is just the seed call and tune_grid()
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })  # the tuning result is not assigned; only the timing is kept
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row summary of this run: elapsed time plus the settings that produced it.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],  # third element of the system.time() result is the elapsed time
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # File name is "xgb_<num_cores>_<timestamp>.RData"; preproc and par_method are not part of it.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | 
126 | sessioninfo::session_info()
127 | # When run non-interactively, quit R without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_10_without.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | # Settings for this benchmark run; every value is also stored in the saved `times` tibble.
 10 | num_resamples <- 5  # bootstrap resamples, passed to bootstraps(times = )
 11 | num_grid <- 10  # candidate points, passed to grid_max_entropy(size = )
 12 | num_cores <- 10  # workers for registerDoMC(); also embedded in the output file name
 13 | preproc <- "no preprocessing"  # label that picks the workflow branch below
 14 | par_method <- "everything"  # passed to control_grid(parallel_over = )
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | # Build the modeling set: recode the outcome, keep complete rows, then draw 4000 rows.
 18 | set.seed(123)  # seeds the RNG ahead of the random draw made by sample_n() below
 19 | 
 20 | flight_data <- 
 21 |   flights %>% 
 22 |   mutate(
 23 |     # Recode the arrival delay as a two-level factor: >= 30 is "late", smaller values "on_time"
 24 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 25 |     arr_delay = factor(arr_delay),
 26 |     # We will use the date (not date-time) in the recipe below
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Join the weather data. NOTE(review): select() below seems to keep no weather columns, so this likely only filters rows; confirm
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Only retain the specific columns we will use
 32 |   select(dep_time, flight, origin, dest, air_time, distance, 
 33 |          carrier, date, arr_delay, time_hour) %>% 
 34 |   # Exclude missing data
 35 |   na.omit() %>% 
 36 |   # For creating models, it is better to have qualitative columns
 37 |   # encoded as factors (instead of character strings)
 38 |   mutate_if(is.character, as.factor) %>% 
 39 |   sample_n(4000)
 40 | 
 41 | ## -----------------------------------------------------------------------------
 42 | # Preprocessing recipe: date and holiday features, dummy variables, zero-variance filter.
 43 | flights_rec <- 
 44 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 45 |   update_role(flight, time_hour, new_role = "ID") %>%   # these two get an "ID" role instead of being predictors
 46 |   step_date(date, features = c("dow", "month")) %>% 
 47 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 48 |   step_rm(date) %>%   # drop the raw date after the date/holiday steps
 49 |   step_dummy(all_nominal_predictors()) %>% 
 50 |   step_zv(all_predictors())
 51 | # Processed copy of flight_data (predictors and outcome); referenced only in the else branch below.
 52 | preproc_data <- 
 53 |   flights_rec %>% 
 54 |   prep() %>% 
 55 |   juice(all_predictors(), all_outcomes())
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classification model (xgboost engine); six arguments are marked for tuning.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # Choose the workflow and the data to resample based on the `preproc` label.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>%   # recipe travels with the workflow; raw flight_data is resampled
 71 |     add_model(xgboost_spec) 
 72 | 
 73 |   set.seed(33)  # same seed as the other branch
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>%   # no recipe: columns of preproc_data are used directly
 79 |     add_model(xgboost_spec) 
 80 |   
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Candidate grid: `num_grid` hyperparameter combinations from a maximum-entropy design.
 87 | set.seed(22)  # presumably the design is built with random numbers; seeded for repeatability
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   extract_parameter_set_dials() %>%   # was parameters(), whose workflow method is deprecated
 91 |   update(trees = trees(c(100, 2000))) %>%   # search `trees` over the range 100 to 2000
 92 |   grid_max_entropy(size = num_grid)  # NOTE(review): newer dials reportedly deprecates this for grid_space_filling(); kept so grids match earlier runs
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register a doMC parallel backend only when more than one core is requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time one full grid search; candidates are scored with ROC AUC only.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | 
106 | grid_time <- system.time({  # the timed region is just the seed call and tune_grid()
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })  # the tuning result is not assigned; only the timing is kept
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row summary of this run: elapsed time plus the settings that produced it.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],  # third element of the system.time() result is the elapsed time
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # File name is "xgb_<num_cores>_<timestamp>.RData"; preproc and par_method are not part of it.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | 
126 | sessioninfo::session_info()
127 | # When run non-interactively, quit R without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_15_expensive.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | library(embed)
  8 | library(rstanarm)
  9 | 
 10 | ## -----------------------------------------------------------------------------
 11 | # Settings for this benchmark run; every value is also stored in the saved `times` tibble.
 12 | num_resamples <- 5  # bootstrap resamples, passed to bootstraps(times = )
 13 | num_grid <- 10  # candidate points, passed to grid_max_entropy(size = )
 14 | num_cores <- 15  # workers for registerDoMC(); also embedded in the output file name
 15 | preproc <- "expensive preprocessing"  # label that picks the workflow branch below
 16 | par_method <- "everything"  # passed to control_grid(parallel_over = )
 17 | 
 18 | ## -----------------------------------------------------------------------------
 19 | # Build the modeling set: recode the outcome, keep complete rows, then draw 4000 rows.
 20 | set.seed(123)  # seeds the RNG ahead of the random draw made by sample_n() below
 21 | 
 22 | flight_data <- 
 23 |   flights %>% 
 24 |   mutate(
 25 |     # Recode the arrival delay as a two-level factor: >= 30 is "late", smaller values "on_time"
 26 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 27 |     arr_delay = factor(arr_delay),
 28 |     # We will use the date (not date-time) in the recipe below
 29 |     date = as.Date(time_hour)
 30 |   ) %>% 
 31 |   # Join the weather data. NOTE(review): select() below seems to keep no weather columns, so this likely only filters rows; confirm
 32 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 33 |   # Only retain the specific columns we will use
 34 |   select(dep_time, flight, origin, dest, air_time, distance, 
 35 |          carrier, date, arr_delay, time_hour) %>% 
 36 |   # Exclude missing data
 37 |   na.omit() %>% 
 38 |   # For creating models, it is better to have qualitative columns
 39 |   # encoded as factors (instead of character strings)
 40 |   mutate_if(is.character, as.factor) %>% 
 41 |   sample_n(4000)
 42 | 
 43 | ## -----------------------------------------------------------------------------
 44 | # Recipe for the "expensive preprocessing" run: the usual steps plus a Bayesian encoding of `flight`.
 45 | flights_rec <- 
 46 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 47 |   update_role(flight, time_hour, new_role = "ID") %>%   # these two get an "ID" role instead of being predictors
 48 |   step_date(date, features = c("dow", "month")) %>% 
 49 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 50 |   step_rm(date) %>%   # drop the raw date after the date/holiday steps
 51 |   step_mutate(flight = as.factor(flight)) %>%   # presumably because the encoding step below expects a factor
 52 |   step_lencode_bayes(flight, outcome = vars(arr_delay)) %>%   # NOTE(review): flight has the "ID" role, so the encoded column is probably not a predictor; confirm intent
 53 |   step_dummy(all_nominal_predictors()) %>% 
 54 |   step_zv(all_predictors()) 
 55 | 
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classification model (xgboost engine); six arguments are marked for tuning.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # Choose the workflow and the data to resample based on the `preproc` label.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>%   # recipe travels with the workflow; raw flight_data is resampled
 71 |     add_model(xgboost_spec) 
 72 | 
 73 |   set.seed(33)  # same seed as the other branch
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>% 
 79 |     add_model(xgboost_spec) 
 80 |   
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)  # not reached with this script's preproc value; preproc_data is not created in this file
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Candidate grid: `num_grid` hyperparameter combinations from a maximum-entropy design.
 87 | set.seed(22)  # presumably the design is built with random numbers; seeded for repeatability
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   extract_parameter_set_dials() %>%   # was parameters(), whose workflow method is deprecated
 91 |   update(trees = trees(c(100, 2000))) %>%   # search `trees` over the range 100 to 2000
 92 |   grid_max_entropy(size = num_grid)  # NOTE(review): newer dials reportedly deprecates this for grid_space_filling(); kept so grids match earlier runs
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register a doMC parallel backend only when more than one core is requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time one full grid search; candidates are scored with ROC AUC only.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | 
106 | grid_time <- system.time({  # the timed region is just the seed call and tune_grid()
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })  # the tuning result is not assigned; only the timing is kept
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row summary of this run: elapsed time plus the settings that produced it.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],  # third element of the system.time() result is the elapsed time
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # File name is "xgb_<num_cores>_<timestamp>.RData"; preproc and par_method are not part of it.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | 
126 | sessioninfo::session_info()
127 | # When run non-interactively, quit R without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_15_with.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | # Settings for this benchmark run; every value is also stored in the saved `times` tibble.
 10 | num_resamples <- 5  # bootstrap resamples, passed to bootstraps(times = )
 11 | num_grid <- 10  # candidate points, passed to grid_max_entropy(size = )
 12 | num_cores <- 15  # workers for registerDoMC(); also embedded in the output file name
 13 | preproc <- "light preprocessing"  # label that picks the workflow branch below
 14 | par_method <- "everything"  # passed to control_grid(parallel_over = )
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | # Build the modeling set: recode the outcome, keep complete rows, then draw 4000 rows.
 18 | set.seed(123)  # seeds the RNG ahead of the random draw made by sample_n() below
 19 | 
 20 | flight_data <- 
 21 |   flights %>% 
 22 |   mutate(
 23 |     # Recode the arrival delay as a two-level factor: >= 30 is "late", smaller values "on_time"
 24 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 25 |     arr_delay = factor(arr_delay),
 26 |     # We will use the date (not date-time) in the recipe below
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Join the weather data. NOTE(review): select() below seems to keep no weather columns, so this likely only filters rows; confirm
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Only retain the specific columns we will use
 32 |   select(dep_time, flight, origin, dest, air_time, distance, 
 33 |          carrier, date, arr_delay, time_hour) %>% 
 34 |   # Exclude missing data
 35 |   na.omit() %>% 
 36 |   # For creating models, it is better to have qualitative columns
 37 |   # encoded as factors (instead of character strings)
 38 |   mutate_if(is.character, as.factor) %>% 
 39 |   sample_n(4000)
 40 | 
 41 | ## -----------------------------------------------------------------------------
 42 | # Preprocessing recipe: date and holiday features, dummy variables, zero-variance filter.
 43 | flights_rec <- 
 44 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 45 |   update_role(flight, time_hour, new_role = "ID") %>%   # these two get an "ID" role instead of being predictors
 46 |   step_date(date, features = c("dow", "month")) %>% 
 47 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 48 |   step_rm(date) %>%   # drop the raw date after the date/holiday steps
 49 |   step_dummy(all_nominal_predictors()) %>% 
 50 |   step_zv(all_predictors())
 51 | # Processed copy of flight_data (predictors and outcome); referenced only in the else branch below.
 52 | preproc_data <- 
 53 |   flights_rec %>% 
 54 |   prep() %>% 
 55 |   juice(all_predictors(), all_outcomes())
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classification model (xgboost engine); six arguments are marked for tuning.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # Choose the workflow and the data to resample based on the `preproc` label.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>%   # recipe travels with the workflow; raw flight_data is resampled
 71 |     add_model(xgboost_spec) 
 72 | 
 73 |   set.seed(33)  # same seed as the other branch
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>%   # no recipe: columns of preproc_data are used directly
 79 |     add_model(xgboost_spec) 
 80 |   
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Candidate grid: `num_grid` hyperparameter combinations from a maximum-entropy design.
 87 | set.seed(22)  # presumably the design is built with random numbers; seeded for repeatability
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   extract_parameter_set_dials() %>%   # was parameters(), whose workflow method is deprecated
 91 |   update(trees = trees(c(100, 2000))) %>%   # search `trees` over the range 100 to 2000
 92 |   grid_max_entropy(size = num_grid)  # NOTE(review): newer dials reportedly deprecates this for grid_space_filling(); kept so grids match earlier runs
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register a doMC parallel backend only when more than one core is requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time one full grid search; candidates are scored with ROC AUC only.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | 
106 | grid_time <- system.time({  # the timed region is just the seed call and tune_grid()
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })  # the tuning result is not assigned; only the timing is kept
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row summary of this run: elapsed time plus the settings that produced it.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],  # third element of the system.time() result is the elapsed time
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # File name is "xgb_<num_cores>_<timestamp>.RData"; preproc and par_method are not part of it.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | 
126 | sessioninfo::session_info()
127 | # When run non-interactively, quit R without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_15_without.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | # Settings for this benchmark run; every value is also stored in the saved `times` tibble.
 10 | num_resamples <- 5  # bootstrap resamples, passed to bootstraps(times = )
 11 | num_grid <- 10  # candidate points, passed to grid_max_entropy(size = )
 12 | num_cores <- 15  # workers for registerDoMC(); also embedded in the output file name
 13 | preproc <- "no preprocessing"  # label that picks the workflow branch below
 14 | par_method <- "everything"  # passed to control_grid(parallel_over = )
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | # Build the modeling set: recode the outcome, keep complete rows, then draw 4000 rows.
 18 | set.seed(123)  # seeds the RNG ahead of the random draw made by sample_n() below
 19 | 
 20 | flight_data <- 
 21 |   flights %>% 
 22 |   mutate(
 23 |     # Recode the arrival delay as a two-level factor: >= 30 is "late", smaller values "on_time"
 24 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 25 |     arr_delay = factor(arr_delay),
 26 |     # We will use the date (not date-time) in the recipe below
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Join the weather data. NOTE(review): select() below seems to keep no weather columns, so this likely only filters rows; confirm
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Only retain the specific columns we will use
 32 |   select(dep_time, flight, origin, dest, air_time, distance, 
 33 |          carrier, date, arr_delay, time_hour) %>% 
 34 |   # Exclude missing data
 35 |   na.omit() %>% 
 36 |   # For creating models, it is better to have qualitative columns
 37 |   # encoded as factors (instead of character strings)
 38 |   mutate_if(is.character, as.factor) %>% 
 39 |   sample_n(4000)
 40 | 
 41 | ## -----------------------------------------------------------------------------
 42 | # Preprocessing recipe: date and holiday features, dummy variables, zero-variance filter.
 43 | flights_rec <- 
 44 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 45 |   update_role(flight, time_hour, new_role = "ID") %>%   # these two get an "ID" role instead of being predictors
 46 |   step_date(date, features = c("dow", "month")) %>% 
 47 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 48 |   step_rm(date) %>%   # drop the raw date after the date/holiday steps
 49 |   step_dummy(all_nominal_predictors()) %>% 
 50 |   step_zv(all_predictors())
 51 | # Processed copy of flight_data (predictors and outcome); referenced only in the else branch below.
 52 | preproc_data <- 
 53 |   flights_rec %>% 
 54 |   prep() %>% 
 55 |   juice(all_predictors(), all_outcomes())
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classification model (xgboost engine); six arguments are marked for tuning.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # Choose the workflow and the data to resample based on the `preproc` label.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>%   # recipe travels with the workflow; raw flight_data is resampled
 71 |     add_model(xgboost_spec) 
 72 | 
 73 |   set.seed(33)  # same seed as the other branch
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>%   # no recipe: columns of preproc_data are used directly
 79 |     add_model(xgboost_spec) 
 80 |   
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Candidate grid: `num_grid` hyperparameter combinations from a maximum-entropy design.
 87 | set.seed(22)  # presumably the design is built with random numbers; seeded for repeatability
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   extract_parameter_set_dials() %>%   # was parameters(), whose workflow method is deprecated
 91 |   update(trees = trees(c(100, 2000))) %>%   # search `trees` over the range 100 to 2000
 92 |   grid_max_entropy(size = num_grid)  # NOTE(review): newer dials reportedly deprecates this for grid_space_filling(); kept so grids match earlier runs
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register a doMC parallel backend only when more than one core is requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time one full grid search; candidates are scored with ROC AUC only.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | 
106 | grid_time <- system.time({  # the timed region is just the seed call and tune_grid()
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })  # the tuning result is not assigned; only the timing is kept
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row summary of this run: elapsed time plus the settings that produced it.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],  # third element of the system.time() result is the elapsed time
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # File name is "xgb_<num_cores>_<timestamp>.RData"; preproc and par_method are not part of it.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | 
126 | sessioninfo::session_info()
127 | # When run non-interactively, quit R without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_20_expensive.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | library(embed)
  8 | library(rstanarm)
  9 | 
 10 | ## -----------------------------------------------------------------------------
 11 | # Settings for this benchmark run; every value is also stored in the saved `times` tibble.
 12 | num_resamples <- 5  # bootstrap resamples, passed to bootstraps(times = )
 13 | num_grid <- 10  # candidate points, passed to grid_max_entropy(size = )
 14 | num_cores <- 20  # workers for registerDoMC(); also embedded in the output file name
 15 | preproc <- "expensive preprocessing"  # label that picks the workflow branch below
 16 | par_method <- "everything"  # passed to control_grid(parallel_over = )
 17 | 
 18 | ## -----------------------------------------------------------------------------
 19 | # Build the modeling set: recode the outcome, keep complete rows, then draw 4000 rows.
 20 | set.seed(123)  # seeds the RNG ahead of the random draw made by sample_n() below
 21 | 
 22 | flight_data <- 
 23 |   flights %>% 
 24 |   mutate(
 25 |     # Recode the arrival delay as a two-level factor: >= 30 is "late", smaller values "on_time"
 26 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 27 |     arr_delay = factor(arr_delay),
 28 |     # We will use the date (not date-time) in the recipe below
 29 |     date = as.Date(time_hour)
 30 |   ) %>% 
 31 |   # Join the weather data. NOTE(review): select() below seems to keep no weather columns, so this likely only filters rows; confirm
 32 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 33 |   # Only retain the specific columns we will use
 34 |   select(dep_time, flight, origin, dest, air_time, distance, 
 35 |          carrier, date, arr_delay, time_hour) %>% 
 36 |   # Exclude missing data
 37 |   na.omit() %>% 
 38 |   # For creating models, it is better to have qualitative columns
 39 |   # encoded as factors (instead of character strings)
 40 |   mutate_if(is.character, as.factor) %>% 
 41 |   sample_n(4000)
 42 | 
 43 | ## -----------------------------------------------------------------------------
 44 | # Recipe for the "expensive preprocessing" run: the usual steps plus a Bayesian encoding of `flight`.
 45 | flights_rec <- 
 46 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 47 |   update_role(flight, time_hour, new_role = "ID") %>%   # these two get an "ID" role instead of being predictors
 48 |   step_date(date, features = c("dow", "month")) %>% 
 49 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 50 |   step_rm(date) %>%   # drop the raw date after the date/holiday steps
 51 |   step_mutate(flight = as.factor(flight)) %>%   # presumably because the encoding step below expects a factor
 52 |   step_lencode_bayes(flight, outcome = vars(arr_delay)) %>%   # NOTE(review): flight has the "ID" role, so the encoded column is probably not a predictor; confirm intent
 53 |   step_dummy(all_nominal_predictors()) %>% 
 54 |   step_zv(all_predictors()) 
 55 | 
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classification model (xgboost engine); six arguments are marked for tuning.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # Choose the workflow and the data to resample based on the `preproc` label.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>%   # recipe travels with the workflow; raw flight_data is resampled
 71 |     add_model(xgboost_spec) 
 72 | 
 73 |   set.seed(33)  # same seed as the other branch
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>% 
 79 |     add_model(xgboost_spec) 
 80 |   
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)  # not reached with this script's preproc value; preproc_data is not created in this file
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Candidate grid: `num_grid` hyperparameter combinations from a maximum-entropy design.
 87 | set.seed(22)  # presumably the design is built with random numbers; seeded for repeatability
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   extract_parameter_set_dials() %>%   # was parameters(), whose workflow method is deprecated
 91 |   update(trees = trees(c(100, 2000))) %>%   # search `trees` over the range 100 to 2000
 92 |   grid_max_entropy(size = num_grid)  # NOTE(review): newer dials reportedly deprecates this for grid_space_filling(); kept so grids match earlier runs
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register a doMC parallel backend only when more than one core is requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time one full grid search; candidates are scored with ROC AUC only.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | 
106 | grid_time <- system.time({  # the timed region is just the seed call and tune_grid()
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })  # the tuning result is not assigned; only the timing is kept
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row summary of this run: elapsed time plus the settings that produced it.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],  # third element of the system.time() result is the elapsed time
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # File name is "xgb_<num_cores>_<timestamp>.RData"; preproc and par_method are not part of it.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | 
126 | sessioninfo::session_info()
127 | # When run non-interactively, quit R without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_20_with.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | # Settings for this benchmark run; every value is also stored in the saved `times` tibble.
 10 | num_resamples <- 5  # bootstrap resamples, passed to bootstraps(times = )
 11 | num_grid <- 10  # candidate points, passed to grid_max_entropy(size = )
 12 | num_cores <- 20  # workers for registerDoMC(); also embedded in the output file name
 13 | preproc <- "light preprocessing"  # label that picks the workflow branch below
 14 | par_method <- "everything"  # passed to control_grid(parallel_over = )
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | # Build the modeling set: recode the outcome, keep complete rows, then draw 4000 rows.
 18 | set.seed(123)  # seeds the RNG ahead of the random draw made by sample_n() below
 19 | 
 20 | flight_data <- 
 21 |   flights %>% 
 22 |   mutate(
 23 |     # Recode the arrival delay as a two-level factor: >= 30 is "late", smaller values "on_time"
 24 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 25 |     arr_delay = factor(arr_delay),
 26 |     # We will use the date (not date-time) in the recipe below
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Join the weather data. NOTE(review): select() below seems to keep no weather columns, so this likely only filters rows; confirm
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Only retain the specific columns we will use
 32 |   select(dep_time, flight, origin, dest, air_time, distance, 
 33 |          carrier, date, arr_delay, time_hour) %>% 
 34 |   # Exclude missing data
 35 |   na.omit() %>% 
 36 |   # For creating models, it is better to have qualitative columns
 37 |   # encoded as factors (instead of character strings)
 38 |   mutate_if(is.character, as.factor) %>% 
 39 |   sample_n(4000)
 40 | 
 41 | ## -----------------------------------------------------------------------------
 42 | # Preprocessing recipe: date and holiday features, dummy variables, zero-variance filter.
 43 | flights_rec <- 
 44 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 45 |   update_role(flight, time_hour, new_role = "ID") %>%   # these two get an "ID" role instead of being predictors
 46 |   step_date(date, features = c("dow", "month")) %>% 
 47 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 48 |   step_rm(date) %>%   # drop the raw date after the date/holiday steps
 49 |   step_dummy(all_nominal_predictors()) %>% 
 50 |   step_zv(all_predictors())
 51 | # Processed copy of flight_data (predictors and outcome); referenced only in the else branch below.
 52 | preproc_data <- 
 53 |   flights_rec %>% 
 54 |   prep() %>% 
 55 |   juice(all_predictors(), all_outcomes())
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classification model (xgboost engine); six arguments are marked for tuning.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # Choose the workflow and the data to resample based on the `preproc` label.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>%   # recipe travels with the workflow; raw flight_data is resampled
 71 |     add_model(xgboost_spec) 
 72 | 
 73 |   set.seed(33)  # same seed as the other branch
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>%   # no recipe: columns of preproc_data are used directly
 79 |     add_model(xgboost_spec) 
 80 |   
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Candidate grid: `num_grid` hyperparameter combinations from a maximum-entropy design.
 87 | set.seed(22)  # presumably the design is built with random numbers; seeded for repeatability
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   extract_parameter_set_dials() %>%   # was parameters(), whose workflow method is deprecated
 91 |   update(trees = trees(c(100, 2000))) %>%   # search `trees` over the range 100 to 2000
 92 |   grid_max_entropy(size = num_grid)  # NOTE(review): newer dials reportedly deprecates this for grid_space_filling(); kept so grids match earlier runs
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register a doMC parallel backend only when more than one core is requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time one full grid search; candidates are scored with ROC AUC only.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | 
106 | grid_time <- system.time({  # the timed region is just the seed call and tune_grid()
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })  # the tuning result is not assigned; only the timing is kept
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row summary of this run: elapsed time plus the settings that produced it.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],  # third element of the system.time() result is the elapsed time
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # File name is "xgb_<num_cores>_<timestamp>.RData"; preproc and par_method are not part of it.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | 
126 | sessioninfo::session_info()
127 | # When run non-interactively, quit R without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_05_20_without.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | # Settings for this benchmark run; every value is also stored in the saved `times` tibble.
 10 | num_resamples <- 5  # bootstrap resamples, passed to bootstraps(times = )
 11 | num_grid <- 10  # candidate points, passed to grid_max_entropy(size = )
 12 | num_cores <- 20  # workers for registerDoMC(); also embedded in the output file name
 13 | preproc <- "no preprocessing"  # label that picks the workflow branch below
 14 | par_method <- "everything"  # passed to control_grid(parallel_over = )
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | # Build the modeling set: recode the outcome, keep complete rows, then draw 4000 rows.
 18 | set.seed(123)  # seeds the RNG ahead of the random draw made by sample_n() below
 19 | 
 20 | flight_data <- 
 21 |   flights %>% 
 22 |   mutate(
 23 |     # Recode the arrival delay as a two-level factor: >= 30 is "late", smaller values "on_time"
 24 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 25 |     arr_delay = factor(arr_delay),
 26 |     # We will use the date (not date-time) in the recipe below
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Join the weather data. NOTE(review): select() below seems to keep no weather columns, so this likely only filters rows; confirm
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Only retain the specific columns we will use
 32 |   select(dep_time, flight, origin, dest, air_time, distance, 
 33 |          carrier, date, arr_delay, time_hour) %>% 
 34 |   # Exclude missing data
 35 |   na.omit() %>% 
 36 |   # For creating models, it is better to have qualitative columns
 37 |   # encoded as factors (instead of character strings)
 38 |   mutate_if(is.character, as.factor) %>% 
 39 |   sample_n(4000)
 40 | 
 41 | ## -----------------------------------------------------------------------------
 42 | # Preprocessing recipe: date and holiday features, dummy variables, zero-variance filter.
 43 | flights_rec <- 
 44 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 45 |   update_role(flight, time_hour, new_role = "ID") %>%   # these two get an "ID" role instead of being predictors
 46 |   step_date(date, features = c("dow", "month")) %>% 
 47 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 48 |   step_rm(date) %>%   # drop the raw date after the date/holiday steps
 49 |   step_dummy(all_nominal_predictors()) %>% 
 50 |   step_zv(all_predictors())
 51 | # Processed copy of flight_data (predictors and outcome); referenced only in the else branch below.
 52 | preproc_data <- 
 53 |   flights_rec %>% 
 54 |   prep() %>% 
 55 |   juice(all_predictors(), all_outcomes())
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classification model (xgboost engine); six arguments are marked for tuning.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # Choose the workflow and the data to resample based on the `preproc` label.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>%   # recipe travels with the workflow; raw flight_data is resampled
 71 |     add_model(xgboost_spec) 
 72 | 
 73 |   set.seed(33)  # same seed as the other branch
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>%   # no recipe: columns of preproc_data are used directly
 79 |     add_model(xgboost_spec) 
 80 |   
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | 
 87 | set.seed(22)
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   parameters() %>% 
 91 |   update(trees = trees(c(100, 2000))) %>% 
 92 |   grid_max_entropy(size = num_grid)
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | 
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | 
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | 
106 | grid_time <- system.time({
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })
111 | 
112 | ## -----------------------------------------------------------------------------
113 | 
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | 
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | 
126 | sessioninfo::session_info()
127 | 
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/everything_times.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/everything_times.RData


--------------------------------------------------------------------------------
/extras/parallel_times/resamples_05_01_with.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | ## Timing run: measures how long tune_grid() takes to evaluate an xgboost
 10 | ## grid over bootstrap resamples of a nycflights13 sample. The five settings
 11 | ## below configure this particular run and are stored with the result.
 12 | 
 13 | num_resamples <- 5   # number of bootstrap resamples
 14 | num_grid <- 10       # number of candidate parameter sets
 15 | num_cores <- 1       # doMC workers are registered only when this exceeds 1
 16 | preproc <- "light preprocessing"  # "no preprocessing" selects the pre-processed data branch
 17 | par_method <- "resamples"         # handed to control_grid(parallel_over = )
 18 | 
 19 | ## -----------------------------------------------------------------------------
 20 | ## Modeling data: each verb is applied in turn to `flight_data`
 21 | 
 22 | set.seed(123)
 23 | 
 24 | flight_data <- mutate(
 25 |   flights,
 26 |   # Two-level outcome: "late" when arr_delay is at least 30, otherwise "on_time"
 27 |   arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
 28 |   # The recipe below works with the date rather than the date-time
 29 |   date = as.Date(time_hour)
 30 | )
 31 | # Bring in the weather data
 32 | flight_data <- inner_join(flight_data, weather, by = c("origin", "time_hour"))
 33 | # Keep just the columns that are used later on
 34 | flight_data <- select(
 35 |   flight_data,
 36 |   dep_time, flight, origin, dest, air_time, distance,
 37 |   carrier, date, arr_delay, time_hour
 38 | )
 39 | # Remove rows with missing values
 40 | flight_data <- na.omit(flight_data)
 41 | # Qualitative columns are stored as factors instead of character strings
 42 | flight_data <- mutate_if(flight_data, is.character, as.factor)
 43 | # Random subset of 4000 rows (seeded above)
 44 | flight_data <- sample_n(flight_data, 4000)
 45 | 
 46 | ## -----------------------------------------------------------------------------
 47 | ## Recipe: ID roles, date features, holiday indicators, dummy variables
 48 | 
 49 | flights_rec <- recipe(arr_delay ~ ., data = flight_data)
 50 | flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
 51 | flights_rec <- step_date(flights_rec, date, features = c("dow", "month"))
 52 | flights_rec <- step_holiday(flights_rec, date, holidays = timeDate::listHolidays("US"))
 53 | flights_rec <- step_rm(flights_rec, date)
 54 | flights_rec <- step_dummy(flights_rec, all_nominal_predictors())
 55 | flights_rec <- step_zv(flights_rec, all_predictors())
 56 | 
 57 | # Processed predictors and outcome; only the "no preprocessing" branch below
 58 | # resamples this data set
 59 | preproc_data <- juice(prep(flights_rec), all_predictors(), all_outcomes())
 60 | 
 61 | ## -----------------------------------------------------------------------------
 62 | ## Boosted tree classifier with six tuning parameters
 63 | 
 64 | xgboost_spec <- boost_tree(
 65 |   trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
 66 |   loss_reduction = tune(), sample_size = tune()
 67 | )
 68 | xgboost_spec <- set_mode(xgboost_spec, "classification")
 69 | xgboost_spec <- set_engine(xgboost_spec, "xgboost")
 70 | 
 71 | ## -----------------------------------------------------------------------------
 72 | ## Workflow and resamples. "no preprocessing" fits on the already-processed
 73 | ## data; any other setting carries the recipe in the workflow and resamples the
 74 | ## unprocessed data.
 75 | 
 76 | if (preproc == "no preprocessing") {
 77 |   xgboost_workflow <- add_variables(workflow(), arr_delay, predictors = c(everything()))
 78 |   xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)
 79 | 
 80 |   set.seed(33)
 81 |   bt <- bootstraps(preproc_data, times = num_resamples)
 82 | } else {
 83 |   xgboost_workflow <- add_recipe(workflow(), flights_rec)
 84 |   xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)
 85 | 
 86 |   set.seed(33)
 87 |   bt <- bootstraps(flight_data, times = num_resamples)
 88 | }
 89 | 
 90 | ## -----------------------------------------------------------------------------
 91 | ## Max-entropy grid over the workflow's parameters, with `trees` given the
 92 | ## range 100 to 2000
 93 | 
 94 | set.seed(22)
 95 | xgboost_grid <- grid_max_entropy(
 96 |   update(parameters(xgboost_workflow), trees = trees(c(100, 2000))),
 97 |   size = num_grid
 98 | )
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | ## Parallel backend (skipped for a single core)
102 | 
103 | if (num_cores > 1) registerDoMC(cores = num_cores)
104 | 
105 | ## -----------------------------------------------------------------------------
106 | ## Timed grid search
107 | 
108 | roc_res <- metric_set(roc_auc)
109 | ctrl <- control_grid(parallel_over = par_method)
110 | 
111 | # Only the elapsed time is of interest; the tuning results are not stored
112 | grid_time <- system.time({
113 |   set.seed(99)
114 |   tune_grid(xgboost_workflow, bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
115 | })
116 | 
117 | ## -----------------------------------------------------------------------------
118 | ## Save the elapsed time along with the settings of this run
119 | 
120 | times <- tibble::tibble(
121 |   elapsed = grid_time["elapsed"],
122 |   num_resamples = num_resamples,
123 |   num_grid = num_grid,
124 |   num_cores = num_cores,
125 |   preproc = preproc,
126 |   par_method = par_method
127 | )
128 | 
129 | # File name: "xgb_", the core count, then a timestamp
130 | save(
131 |   times,
132 |   file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))
133 | )
134 | 
135 | sessioninfo::session_info()
136 | 
137 | # Batch runs end the session here without saving the workspace
138 | if (!interactive()) q("no")
139 | 
140 | 


--------------------------------------------------------------------------------
/extras/parallel_times/resamples_05_01_without.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | ## Timing run: measures how long tune_grid() takes to evaluate an xgboost
 10 | ## grid over bootstrap resamples of a nycflights13 sample. The five settings
 11 | ## below configure this particular run and are stored with the result.
 12 | 
 13 | num_resamples <- 5   # number of bootstrap resamples
 14 | num_grid <- 10       # number of candidate parameter sets
 15 | num_cores <- 1       # doMC workers are registered only when this exceeds 1
 16 | preproc <- "no preprocessing"     # "no preprocessing" selects the pre-processed data branch
 17 | par_method <- "resamples"         # handed to control_grid(parallel_over = )
 18 | 
 19 | ## -----------------------------------------------------------------------------
 20 | ## Modeling data: each verb is applied in turn to `flight_data`
 21 | 
 22 | set.seed(123)
 23 | 
 24 | flight_data <- mutate(
 25 |   flights,
 26 |   # Two-level outcome: "late" when arr_delay is at least 30, otherwise "on_time"
 27 |   arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
 28 |   # The recipe below works with the date rather than the date-time
 29 |   date = as.Date(time_hour)
 30 | )
 31 | # Bring in the weather data
 32 | flight_data <- inner_join(flight_data, weather, by = c("origin", "time_hour"))
 33 | # Keep just the columns that are used later on
 34 | flight_data <- select(
 35 |   flight_data,
 36 |   dep_time, flight, origin, dest, air_time, distance,
 37 |   carrier, date, arr_delay, time_hour
 38 | )
 39 | # Remove rows with missing values
 40 | flight_data <- na.omit(flight_data)
 41 | # Qualitative columns are stored as factors instead of character strings
 42 | flight_data <- mutate_if(flight_data, is.character, as.factor)
 43 | # Random subset of 4000 rows (seeded above)
 44 | flight_data <- sample_n(flight_data, 4000)
 45 | 
 46 | ## -----------------------------------------------------------------------------
 47 | ## Recipe: ID roles, date features, holiday indicators, dummy variables
 48 | 
 49 | flights_rec <- recipe(arr_delay ~ ., data = flight_data)
 50 | flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
 51 | flights_rec <- step_date(flights_rec, date, features = c("dow", "month"))
 52 | flights_rec <- step_holiday(flights_rec, date, holidays = timeDate::listHolidays("US"))
 53 | flights_rec <- step_rm(flights_rec, date)
 54 | flights_rec <- step_dummy(flights_rec, all_nominal_predictors())
 55 | flights_rec <- step_zv(flights_rec, all_predictors())
 56 | 
 57 | # Processed predictors and outcome; only the "no preprocessing" branch below
 58 | # resamples this data set
 59 | preproc_data <- juice(prep(flights_rec), all_predictors(), all_outcomes())
 60 | 
 61 | ## -----------------------------------------------------------------------------
 62 | ## Boosted tree classifier with six tuning parameters
 63 | 
 64 | xgboost_spec <- boost_tree(
 65 |   trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
 66 |   loss_reduction = tune(), sample_size = tune()
 67 | )
 68 | xgboost_spec <- set_mode(xgboost_spec, "classification")
 69 | xgboost_spec <- set_engine(xgboost_spec, "xgboost")
 70 | 
 71 | ## -----------------------------------------------------------------------------
 72 | ## Workflow and resamples. "no preprocessing" fits on the already-processed
 73 | ## data; any other setting carries the recipe in the workflow and resamples the
 74 | ## unprocessed data.
 75 | 
 76 | if (preproc == "no preprocessing") {
 77 |   xgboost_workflow <- add_variables(workflow(), arr_delay, predictors = c(everything()))
 78 |   xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)
 79 | 
 80 |   set.seed(33)
 81 |   bt <- bootstraps(preproc_data, times = num_resamples)
 82 | } else {
 83 |   xgboost_workflow <- add_recipe(workflow(), flights_rec)
 84 |   xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)
 85 | 
 86 |   set.seed(33)
 87 |   bt <- bootstraps(flight_data, times = num_resamples)
 88 | }
 89 | 
 90 | ## -----------------------------------------------------------------------------
 91 | ## Max-entropy grid over the workflow's parameters, with `trees` given the
 92 | ## range 100 to 2000
 93 | 
 94 | set.seed(22)
 95 | xgboost_grid <- grid_max_entropy(
 96 |   update(parameters(xgboost_workflow), trees = trees(c(100, 2000))),
 97 |   size = num_grid
 98 | )
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | ## Parallel backend (skipped for a single core)
102 | 
103 | if (num_cores > 1) registerDoMC(cores = num_cores)
104 | 
105 | ## -----------------------------------------------------------------------------
106 | ## Timed grid search
107 | 
108 | roc_res <- metric_set(roc_auc)
109 | ctrl <- control_grid(parallel_over = par_method)
110 | 
111 | # Only the elapsed time is of interest; the tuning results are not stored
112 | grid_time <- system.time({
113 |   set.seed(99)
114 |   tune_grid(xgboost_workflow, bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
115 | })
116 | 
117 | ## -----------------------------------------------------------------------------
118 | ## Save the elapsed time along with the settings of this run
119 | 
120 | times <- tibble::tibble(
121 |   elapsed = grid_time["elapsed"],
122 |   num_resamples = num_resamples,
123 |   num_grid = num_grid,
124 |   num_cores = num_cores,
125 |   preproc = preproc,
126 |   par_method = par_method
127 | )
128 | 
129 | # File name: "xgb_", the core count, then a timestamp
130 | save(
131 |   times,
132 |   file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))
133 | )
134 | 
135 | sessioninfo::session_info()
136 | 
137 | # Batch runs end the session here without saving the workspace
138 | if (!interactive()) q("no")
139 | 
140 | 


--------------------------------------------------------------------------------
/extras/parallel_times/resamples_05_02_with.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | ## Timing run: measures how long tune_grid() takes to evaluate an xgboost
 10 | ## grid over bootstrap resamples of a nycflights13 sample. The five settings
 11 | ## below configure this particular run and are stored with the result.
 12 | 
 13 | num_resamples <- 5   # number of bootstrap resamples
 14 | num_grid <- 10       # number of candidate parameter sets
 15 | num_cores <- 2       # doMC workers are registered only when this exceeds 1
 16 | preproc <- "light preprocessing"  # "no preprocessing" selects the pre-processed data branch
 17 | par_method <- "resamples"         # handed to control_grid(parallel_over = )
 18 | 
 19 | ## -----------------------------------------------------------------------------
 20 | ## Modeling data: each verb is applied in turn to `flight_data`
 21 | 
 22 | set.seed(123)
 23 | 
 24 | flight_data <- mutate(
 25 |   flights,
 26 |   # Two-level outcome: "late" when arr_delay is at least 30, otherwise "on_time"
 27 |   arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
 28 |   # The recipe below works with the date rather than the date-time
 29 |   date = as.Date(time_hour)
 30 | )
 31 | # Bring in the weather data
 32 | flight_data <- inner_join(flight_data, weather, by = c("origin", "time_hour"))
 33 | # Keep just the columns that are used later on
 34 | flight_data <- select(
 35 |   flight_data,
 36 |   dep_time, flight, origin, dest, air_time, distance,
 37 |   carrier, date, arr_delay, time_hour
 38 | )
 39 | # Remove rows with missing values
 40 | flight_data <- na.omit(flight_data)
 41 | # Qualitative columns are stored as factors instead of character strings
 42 | flight_data <- mutate_if(flight_data, is.character, as.factor)
 43 | # Random subset of 4000 rows (seeded above)
 44 | flight_data <- sample_n(flight_data, 4000)
 45 | 
 46 | ## -----------------------------------------------------------------------------
 47 | ## Recipe: ID roles, date features, holiday indicators, dummy variables
 48 | 
 49 | flights_rec <- recipe(arr_delay ~ ., data = flight_data)
 50 | flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
 51 | flights_rec <- step_date(flights_rec, date, features = c("dow", "month"))
 52 | flights_rec <- step_holiday(flights_rec, date, holidays = timeDate::listHolidays("US"))
 53 | flights_rec <- step_rm(flights_rec, date)
 54 | flights_rec <- step_dummy(flights_rec, all_nominal_predictors())
 55 | flights_rec <- step_zv(flights_rec, all_predictors())
 56 | 
 57 | # Processed predictors and outcome; only the "no preprocessing" branch below
 58 | # resamples this data set
 59 | preproc_data <- juice(prep(flights_rec), all_predictors(), all_outcomes())
 60 | 
 61 | ## -----------------------------------------------------------------------------
 62 | ## Boosted tree classifier with six tuning parameters
 63 | 
 64 | xgboost_spec <- boost_tree(
 65 |   trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
 66 |   loss_reduction = tune(), sample_size = tune()
 67 | )
 68 | xgboost_spec <- set_mode(xgboost_spec, "classification")
 69 | xgboost_spec <- set_engine(xgboost_spec, "xgboost")
 70 | 
 71 | ## -----------------------------------------------------------------------------
 72 | ## Workflow and resamples. "no preprocessing" fits on the already-processed
 73 | ## data; any other setting carries the recipe in the workflow and resamples the
 74 | ## unprocessed data.
 75 | 
 76 | if (preproc == "no preprocessing") {
 77 |   xgboost_workflow <- add_variables(workflow(), arr_delay, predictors = c(everything()))
 78 |   xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)
 79 | 
 80 |   set.seed(33)
 81 |   bt <- bootstraps(preproc_data, times = num_resamples)
 82 | } else {
 83 |   xgboost_workflow <- add_recipe(workflow(), flights_rec)
 84 |   xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)
 85 | 
 86 |   set.seed(33)
 87 |   bt <- bootstraps(flight_data, times = num_resamples)
 88 | }
 89 | 
 90 | ## -----------------------------------------------------------------------------
 91 | ## Max-entropy grid over the workflow's parameters, with `trees` given the
 92 | ## range 100 to 2000
 93 | 
 94 | set.seed(22)
 95 | xgboost_grid <- grid_max_entropy(
 96 |   update(parameters(xgboost_workflow), trees = trees(c(100, 2000))),
 97 |   size = num_grid
 98 | )
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | ## Parallel backend (skipped for a single core)
102 | 
103 | if (num_cores > 1) registerDoMC(cores = num_cores)
104 | 
105 | ## -----------------------------------------------------------------------------
106 | ## Timed grid search
107 | 
108 | roc_res <- metric_set(roc_auc)
109 | ctrl <- control_grid(parallel_over = par_method)
110 | 
111 | # Only the elapsed time is of interest; the tuning results are not stored
112 | grid_time <- system.time({
113 |   set.seed(99)
114 |   tune_grid(xgboost_workflow, bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
115 | })
116 | 
117 | ## -----------------------------------------------------------------------------
118 | ## Save the elapsed time along with the settings of this run
119 | 
120 | times <- tibble::tibble(
121 |   elapsed = grid_time["elapsed"],
122 |   num_resamples = num_resamples,
123 |   num_grid = num_grid,
124 |   num_cores = num_cores,
125 |   preproc = preproc,
126 |   par_method = par_method
127 | )
128 | 
129 | # File name: "xgb_", the core count, then a timestamp
130 | save(
131 |   times,
132 |   file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))
133 | )
134 | 
135 | sessioninfo::session_info()
136 | 
137 | # Batch runs end the session here without saving the workspace
138 | if (!interactive()) q("no")
139 | 
140 | 


--------------------------------------------------------------------------------
/extras/parallel_times/resamples_05_02_without.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | ## Timing run: measures how long tune_grid() takes to evaluate an xgboost
 10 | ## grid over bootstrap resamples of a nycflights13 sample. The five settings
 11 | ## below configure this particular run and are stored with the result.
 12 | 
 13 | num_resamples <- 5   # number of bootstrap resamples
 14 | num_grid <- 10       # number of candidate parameter sets
 15 | num_cores <- 2       # doMC workers are registered only when this exceeds 1
 16 | preproc <- "no preprocessing"     # "no preprocessing" selects the pre-processed data branch
 17 | par_method <- "resamples"         # handed to control_grid(parallel_over = )
 18 | 
 19 | ## -----------------------------------------------------------------------------
 20 | ## Modeling data: each verb is applied in turn to `flight_data`
 21 | 
 22 | set.seed(123)
 23 | 
 24 | flight_data <- mutate(
 25 |   flights,
 26 |   # Two-level outcome: "late" when arr_delay is at least 30, otherwise "on_time"
 27 |   arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
 28 |   # The recipe below works with the date rather than the date-time
 29 |   date = as.Date(time_hour)
 30 | )
 31 | # Bring in the weather data
 32 | flight_data <- inner_join(flight_data, weather, by = c("origin", "time_hour"))
 33 | # Keep just the columns that are used later on
 34 | flight_data <- select(
 35 |   flight_data,
 36 |   dep_time, flight, origin, dest, air_time, distance,
 37 |   carrier, date, arr_delay, time_hour
 38 | )
 39 | # Remove rows with missing values
 40 | flight_data <- na.omit(flight_data)
 41 | # Qualitative columns are stored as factors instead of character strings
 42 | flight_data <- mutate_if(flight_data, is.character, as.factor)
 43 | # Random subset of 4000 rows (seeded above)
 44 | flight_data <- sample_n(flight_data, 4000)
 45 | 
 46 | ## -----------------------------------------------------------------------------
 47 | ## Recipe: ID roles, date features, holiday indicators, dummy variables
 48 | 
 49 | flights_rec <- recipe(arr_delay ~ ., data = flight_data)
 50 | flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
 51 | flights_rec <- step_date(flights_rec, date, features = c("dow", "month"))
 52 | flights_rec <- step_holiday(flights_rec, date, holidays = timeDate::listHolidays("US"))
 53 | flights_rec <- step_rm(flights_rec, date)
 54 | flights_rec <- step_dummy(flights_rec, all_nominal_predictors())
 55 | flights_rec <- step_zv(flights_rec, all_predictors())
 56 | 
 57 | # Processed predictors and outcome; only the "no preprocessing" branch below
 58 | # resamples this data set
 59 | preproc_data <- juice(prep(flights_rec), all_predictors(), all_outcomes())
 60 | 
 61 | ## -----------------------------------------------------------------------------
 62 | ## Boosted tree classifier with six tuning parameters
 63 | 
 64 | xgboost_spec <- boost_tree(
 65 |   trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
 66 |   loss_reduction = tune(), sample_size = tune()
 67 | )
 68 | xgboost_spec <- set_mode(xgboost_spec, "classification")
 69 | xgboost_spec <- set_engine(xgboost_spec, "xgboost")
 70 | 
 71 | ## -----------------------------------------------------------------------------
 72 | ## Workflow and resamples. "no preprocessing" fits on the already-processed
 73 | ## data; any other setting carries the recipe in the workflow and resamples the
 74 | ## unprocessed data.
 75 | 
 76 | if (preproc == "no preprocessing") {
 77 |   xgboost_workflow <- add_variables(workflow(), arr_delay, predictors = c(everything()))
 78 |   xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)
 79 | 
 80 |   set.seed(33)
 81 |   bt <- bootstraps(preproc_data, times = num_resamples)
 82 | } else {
 83 |   xgboost_workflow <- add_recipe(workflow(), flights_rec)
 84 |   xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)
 85 | 
 86 |   set.seed(33)
 87 |   bt <- bootstraps(flight_data, times = num_resamples)
 88 | }
 89 | 
 90 | ## -----------------------------------------------------------------------------
 91 | ## Max-entropy grid over the workflow's parameters, with `trees` given the
 92 | ## range 100 to 2000
 93 | 
 94 | set.seed(22)
 95 | xgboost_grid <- grid_max_entropy(
 96 |   update(parameters(xgboost_workflow), trees = trees(c(100, 2000))),
 97 |   size = num_grid
 98 | )
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | ## Parallel backend (skipped for a single core)
102 | 
103 | if (num_cores > 1) registerDoMC(cores = num_cores)
104 | 
105 | ## -----------------------------------------------------------------------------
106 | ## Timed grid search
107 | 
108 | roc_res <- metric_set(roc_auc)
109 | ctrl <- control_grid(parallel_over = par_method)
110 | 
111 | # Only the elapsed time is of interest; the tuning results are not stored
112 | grid_time <- system.time({
113 |   set.seed(99)
114 |   tune_grid(xgboost_workflow, bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
115 | })
116 | 
117 | ## -----------------------------------------------------------------------------
118 | ## Save the elapsed time along with the settings of this run
119 | 
120 | times <- tibble::tibble(
121 |   elapsed = grid_time["elapsed"],
122 |   num_resamples = num_resamples,
123 |   num_grid = num_grid,
124 |   num_cores = num_cores,
125 |   preproc = preproc,
126 |   par_method = par_method
127 | )
128 | 
129 | # File name: "xgb_", the core count, then a timestamp
130 | save(
131 |   times,
132 |   file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))
133 | )
134 | 
135 | sessioninfo::session_info()
136 | 
137 | # Batch runs end the session here without saving the workspace
138 | if (!interactive()) q("no")
139 | 
140 | 


--------------------------------------------------------------------------------
/extras/parallel_times/resamples_05_03_with.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | ## Timing run: measures how long tune_grid() takes to evaluate an xgboost
 10 | ## grid over bootstrap resamples of a nycflights13 sample. The five settings
 11 | ## below configure this particular run and are stored with the result.
 12 | 
 13 | num_resamples <- 5   # number of bootstrap resamples
 14 | num_grid <- 10       # number of candidate parameter sets
 15 | num_cores <- 3       # doMC workers are registered only when this exceeds 1
 16 | preproc <- "light preprocessing"  # "no preprocessing" selects the pre-processed data branch
 17 | par_method <- "resamples"         # handed to control_grid(parallel_over = )
 18 | 
 19 | ## -----------------------------------------------------------------------------
 20 | ## Modeling data: each verb is applied in turn to `flight_data`
 21 | 
 22 | set.seed(123)
 23 | 
 24 | flight_data <- mutate(
 25 |   flights,
 26 |   # Two-level outcome: "late" when arr_delay is at least 30, otherwise "on_time"
 27 |   arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
 28 |   # The recipe below works with the date rather than the date-time
 29 |   date = as.Date(time_hour)
 30 | )
 31 | # Bring in the weather data
 32 | flight_data <- inner_join(flight_data, weather, by = c("origin", "time_hour"))
 33 | # Keep just the columns that are used later on
 34 | flight_data <- select(
 35 |   flight_data,
 36 |   dep_time, flight, origin, dest, air_time, distance,
 37 |   carrier, date, arr_delay, time_hour
 38 | )
 39 | # Remove rows with missing values
 40 | flight_data <- na.omit(flight_data)
 41 | # Qualitative columns are stored as factors instead of character strings
 42 | flight_data <- mutate_if(flight_data, is.character, as.factor)
 43 | # Random subset of 4000 rows (seeded above)
 44 | flight_data <- sample_n(flight_data, 4000)
 45 | 
 46 | ## -----------------------------------------------------------------------------
 47 | ## Recipe: ID roles, date features, holiday indicators, dummy variables
 48 | 
 49 | flights_rec <- recipe(arr_delay ~ ., data = flight_data)
 50 | flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
 51 | flights_rec <- step_date(flights_rec, date, features = c("dow", "month"))
 52 | flights_rec <- step_holiday(flights_rec, date, holidays = timeDate::listHolidays("US"))
 53 | flights_rec <- step_rm(flights_rec, date)
 54 | flights_rec <- step_dummy(flights_rec, all_nominal_predictors())
 55 | flights_rec <- step_zv(flights_rec, all_predictors())
 56 | 
 57 | # Processed predictors and outcome; only the "no preprocessing" branch below
 58 | # resamples this data set
 59 | preproc_data <- juice(prep(flights_rec), all_predictors(), all_outcomes())
 60 | 
 61 | ## -----------------------------------------------------------------------------
 62 | ## Boosted tree classifier with six tuning parameters
 63 | 
 64 | xgboost_spec <- boost_tree(
 65 |   trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
 66 |   loss_reduction = tune(), sample_size = tune()
 67 | )
 68 | xgboost_spec <- set_mode(xgboost_spec, "classification")
 69 | xgboost_spec <- set_engine(xgboost_spec, "xgboost")
 70 | 
 71 | ## -----------------------------------------------------------------------------
 72 | ## Workflow and resamples. "no preprocessing" fits on the already-processed
 73 | ## data; any other setting carries the recipe in the workflow and resamples the
 74 | ## unprocessed data.
 75 | 
 76 | if (preproc == "no preprocessing") {
 77 |   xgboost_workflow <- add_variables(workflow(), arr_delay, predictors = c(everything()))
 78 |   xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)
 79 | 
 80 |   set.seed(33)
 81 |   bt <- bootstraps(preproc_data, times = num_resamples)
 82 | } else {
 83 |   xgboost_workflow <- add_recipe(workflow(), flights_rec)
 84 |   xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)
 85 | 
 86 |   set.seed(33)
 87 |   bt <- bootstraps(flight_data, times = num_resamples)
 88 | }
 89 | 
 90 | ## -----------------------------------------------------------------------------
 91 | ## Max-entropy grid over the workflow's parameters, with `trees` given the
 92 | ## range 100 to 2000
 93 | 
 94 | set.seed(22)
 95 | xgboost_grid <- grid_max_entropy(
 96 |   update(parameters(xgboost_workflow), trees = trees(c(100, 2000))),
 97 |   size = num_grid
 98 | )
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | ## Parallel backend (skipped for a single core)
102 | 
103 | if (num_cores > 1) registerDoMC(cores = num_cores)
104 | 
105 | ## -----------------------------------------------------------------------------
106 | ## Timed grid search
107 | 
108 | roc_res <- metric_set(roc_auc)
109 | ctrl <- control_grid(parallel_over = par_method)
110 | 
111 | # Only the elapsed time is of interest; the tuning results are not stored
112 | grid_time <- system.time({
113 |   set.seed(99)
114 |   tune_grid(xgboost_workflow, bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
115 | })
116 | 
117 | ## -----------------------------------------------------------------------------
118 | ## Save the elapsed time along with the settings of this run
119 | 
120 | times <- tibble::tibble(
121 |   elapsed = grid_time["elapsed"],
122 |   num_resamples = num_resamples,
123 |   num_grid = num_grid,
124 |   num_cores = num_cores,
125 |   preproc = preproc,
126 |   par_method = par_method
127 | )
128 | 
129 | # File name: "xgb_", the core count, then a timestamp
130 | save(
131 |   times,
132 |   file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))
133 | )
134 | 
135 | sessioninfo::session_info()
136 | 
137 | # Batch runs end the session here without saving the workspace
138 | if (!interactive()) q("no")
139 | 
140 | 


--------------------------------------------------------------------------------
/extras/parallel_times/resamples_05_03_without.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | ## Timing run: measures how long tune_grid() takes to evaluate an xgboost
 10 | ## grid over bootstrap resamples of a nycflights13 sample. The five settings
 11 | ## below configure this particular run and are stored with the result.
 12 | 
 13 | num_resamples <- 5   # number of bootstrap resamples
 14 | num_grid <- 10       # number of candidate parameter sets
 15 | num_cores <- 3       # doMC workers are registered only when this exceeds 1
 16 | preproc <- "no preprocessing"     # "no preprocessing" selects the pre-processed data branch
 17 | par_method <- "resamples"         # handed to control_grid(parallel_over = )
 18 | 
 19 | ## -----------------------------------------------------------------------------
 20 | ## Modeling data: each verb is applied in turn to `flight_data`
 21 | 
 22 | set.seed(123)
 23 | 
 24 | flight_data <- mutate(
 25 |   flights,
 26 |   # Two-level outcome: "late" when arr_delay is at least 30, otherwise "on_time"
 27 |   arr_delay = factor(ifelse(arr_delay >= 30, "late", "on_time")),
 28 |   # The recipe below works with the date rather than the date-time
 29 |   date = as.Date(time_hour)
 30 | )
 31 | # Bring in the weather data
 32 | flight_data <- inner_join(flight_data, weather, by = c("origin", "time_hour"))
 33 | # Keep just the columns that are used later on
 34 | flight_data <- select(
 35 |   flight_data,
 36 |   dep_time, flight, origin, dest, air_time, distance,
 37 |   carrier, date, arr_delay, time_hour
 38 | )
 39 | # Remove rows with missing values
 40 | flight_data <- na.omit(flight_data)
 41 | # Qualitative columns are stored as factors instead of character strings
 42 | flight_data <- mutate_if(flight_data, is.character, as.factor)
 43 | # Random subset of 4000 rows (seeded above)
 44 | flight_data <- sample_n(flight_data, 4000)
 45 | 
 46 | ## -----------------------------------------------------------------------------
 47 | ## Recipe: ID roles, date features, holiday indicators, dummy variables
 48 | 
 49 | flights_rec <- recipe(arr_delay ~ ., data = flight_data)
 50 | flights_rec <- update_role(flights_rec, flight, time_hour, new_role = "ID")
 51 | flights_rec <- step_date(flights_rec, date, features = c("dow", "month"))
 52 | flights_rec <- step_holiday(flights_rec, date, holidays = timeDate::listHolidays("US"))
 53 | flights_rec <- step_rm(flights_rec, date)
 54 | flights_rec <- step_dummy(flights_rec, all_nominal_predictors())
 55 | flights_rec <- step_zv(flights_rec, all_predictors())
 56 | 
 57 | # Processed predictors and outcome; only the "no preprocessing" branch below
 58 | # resamples this data set
 59 | preproc_data <- juice(prep(flights_rec), all_predictors(), all_outcomes())
 60 | 
 61 | ## -----------------------------------------------------------------------------
 62 | ## Boosted tree classifier with six tuning parameters
 63 | 
 64 | xgboost_spec <- boost_tree(
 65 |   trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(),
 66 |   loss_reduction = tune(), sample_size = tune()
 67 | )
 68 | xgboost_spec <- set_mode(xgboost_spec, "classification")
 69 | xgboost_spec <- set_engine(xgboost_spec, "xgboost")
 70 | 
 71 | ## -----------------------------------------------------------------------------
 72 | ## Workflow and resamples. "no preprocessing" fits on the already-processed
 73 | ## data; any other setting carries the recipe in the workflow and resamples the
 74 | ## unprocessed data.
 75 | 
 76 | if (preproc == "no preprocessing") {
 77 |   xgboost_workflow <- add_variables(workflow(), arr_delay, predictors = c(everything()))
 78 |   xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)
 79 | 
 80 |   set.seed(33)
 81 |   bt <- bootstraps(preproc_data, times = num_resamples)
 82 | } else {
 83 |   xgboost_workflow <- add_recipe(workflow(), flights_rec)
 84 |   xgboost_workflow <- add_model(xgboost_workflow, xgboost_spec)
 85 | 
 86 |   set.seed(33)
 87 |   bt <- bootstraps(flight_data, times = num_resamples)
 88 | }
 89 | 
 90 | ## -----------------------------------------------------------------------------
 91 | ## Max-entropy grid over the workflow's parameters, with `trees` given the
 92 | ## range 100 to 2000
 93 | 
 94 | set.seed(22)
 95 | xgboost_grid <- grid_max_entropy(
 96 |   update(parameters(xgboost_workflow), trees = trees(c(100, 2000))),
 97 |   size = num_grid
 98 | )
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | ## Parallel backend (skipped for a single core)
102 | 
103 | if (num_cores > 1) registerDoMC(cores = num_cores)
104 | 
105 | ## -----------------------------------------------------------------------------
106 | ## Timed grid search
107 | 
108 | roc_res <- metric_set(roc_auc)
109 | ctrl <- control_grid(parallel_over = par_method)
110 | 
111 | # Only the elapsed time is of interest; the tuning results are not stored
112 | grid_time <- system.time({
113 |   set.seed(99)
114 |   tune_grid(xgboost_workflow, bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
115 | })
116 | 
117 | ## -----------------------------------------------------------------------------
118 | ## Save the elapsed time along with the settings of this run
119 | 
120 | times <- tibble::tibble(
121 |   elapsed = grid_time["elapsed"],
122 |   num_resamples = num_resamples,
123 |   num_grid = num_grid,
124 |   num_cores = num_cores,
125 |   preproc = preproc,
126 |   par_method = par_method
127 | )
128 | 
129 | # File name: "xgb_", the core count, then a timestamp
130 | save(
131 |   times,
132 |   file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData"))
133 | )
134 | 
135 | sessioninfo::session_info()
136 | 
137 | # Batch runs end the session here without saving the workspace
138 | if (!interactive()) q("no")
139 | 
140 | 


--------------------------------------------------------------------------------
/extras/parallel_times/resamples_05_04_with.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | # Settings for this run; all five are stored in the timing record saved at the end.
 10 | num_resamples <- 5
 11 | num_grid <- 10
 12 | num_cores <- 4
 13 | preproc <- "light preprocessing"
 14 | par_method <- "resamples"
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | # Build the modeling data from the nycflights13 `flights` and `weather` tables.
 18 | set.seed(123)
 19 | # The seed set above fixes the 4000-row random sample taken at the end of the pipe.
 20 | flight_data <- 
 21 |   flights %>% 
 22 |   mutate(
 23 |     # Convert the arrival delay to a two-level factor: "late" (>= 30) or "on_time"
 24 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 25 |     arr_delay = factor(arr_delay),
 26 |     # We will use the date (not date-time) in the recipe below
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Inner-join the weather data (rows without a weather match are dropped)
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Only retain the specific columns we will use
 32 |   select(dep_time, flight, origin, dest, air_time, distance, 
 33 |          carrier, date, arr_delay, time_hour) %>% 
 34 |   # Exclude missing data
 35 |   na.omit() %>% 
 36 |   # For creating models, it is better to have qualitative columns
 37 |   # encoded as factors (instead of character strings)
 38 |   mutate_if(is.character, as.factor) %>% 
 39 |   sample_n(4000)
 40 | 
 41 | ## -----------------------------------------------------------------------------
 42 | # Recipe: flight and time_hour become ID columns; date yields dow/month/holidays.
 43 | flights_rec <- 
 44 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 45 |   update_role(flight, time_hour, new_role = "ID") %>% 
 46 |   step_date(date, features = c("dow", "month")) %>% 
 47 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 48 |   step_rm(date) %>% 
 49 |   step_dummy(all_nominal_predictors()) %>% 
 50 |   step_zv(all_predictors())
 51 | # The recipe applied to flight_data; only the "no preprocessing" branch uses this.
 52 | preproc_data <- 
 53 |   flights_rec %>% 
 54 |   prep() %>% 
 55 |   juice(all_predictors(), all_outcomes())
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classifier (xgboost engine); six parameters are flagged for tuning.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # preproc is "light preprocessing" here, so the recipe branch below is the one used.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>% 
 71 |     add_model(xgboost_spec) 
 72 | # Bootstrap resamples of the raw (un-preprocessed) data.
 73 |   set.seed(33)
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>% 
 79 |     add_model(xgboost_spec) 
 80 |   # Bootstrap resamples of the already preprocessed data.
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Max-entropy grid with num_grid candidates; the trees range is set to 100-2000.
 87 | set.seed(22)
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   parameters() %>% 
 91 |   update(trees = trees(c(100, 2000))) %>% 
 92 |   grid_max_entropy(size = num_grid)
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register the doMC parallel backend unless a single core was requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time the grid search; ROC AUC is the only metric computed.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | 
106 | grid_time <- system.time({
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row record of the run; grid_time[3] is the elapsed (wall-clock) time.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # Save under a name built from "xgb_", the core count, and the current time stamp.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | # Print session and package details for the record.
126 | sessioninfo::session_info()
127 | # When run non-interactively (e.g., R CMD BATCH), quit without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/resamples_05_04_without.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | # Settings for this run; all five are stored in the timing record saved at the end.
 10 | num_resamples <- 5
 11 | num_grid <- 10
 12 | num_cores <- 4
 13 | preproc <- "no preprocessing"
 14 | par_method <- "resamples"
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | # Build the modeling data from the nycflights13 `flights` and `weather` tables.
 18 | set.seed(123)
 19 | # The seed set above fixes the 4000-row random sample taken at the end of the pipe.
 20 | flight_data <- 
 21 |   flights %>% 
 22 |   mutate(
 23 |     # Convert the arrival delay to a two-level factor: "late" (>= 30) or "on_time"
 24 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 25 |     arr_delay = factor(arr_delay),
 26 |     # We will use the date (not date-time) in the recipe below
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Inner-join the weather data (rows without a weather match are dropped)
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Only retain the specific columns we will use
 32 |   select(dep_time, flight, origin, dest, air_time, distance, 
 33 |          carrier, date, arr_delay, time_hour) %>% 
 34 |   # Exclude missing data
 35 |   na.omit() %>% 
 36 |   # For creating models, it is better to have qualitative columns
 37 |   # encoded as factors (instead of character strings)
 38 |   mutate_if(is.character, as.factor) %>% 
 39 |   sample_n(4000)
 40 | 
 41 | ## -----------------------------------------------------------------------------
 42 | # Recipe: flight and time_hour become ID columns; date yields dow/month/holidays.
 43 | flights_rec <- 
 44 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 45 |   update_role(flight, time_hour, new_role = "ID") %>% 
 46 |   step_date(date, features = c("dow", "month")) %>% 
 47 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 48 |   step_rm(date) %>% 
 49 |   step_dummy(all_nominal_predictors()) %>% 
 50 |   step_zv(all_predictors())
 51 | # The recipe applied to flight_data; this is the data resampled in this script.
 52 | preproc_data <- 
 53 |   flights_rec %>% 
 54 |   prep() %>% 
 55 |   juice(all_predictors(), all_outcomes())
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classifier (xgboost engine); six parameters are flagged for tuning.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # preproc is "no preprocessing" here, so the else branch below is the one used.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>% 
 71 |     add_model(xgboost_spec) 
 72 | # Bootstrap resamples of the raw (un-preprocessed) data.
 73 |   set.seed(33)
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>% 
 79 |     add_model(xgboost_spec) 
 80 |   # Bootstrap resamples of the already preprocessed data.
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Max-entropy grid with num_grid candidates; the trees range is set to 100-2000.
 87 | set.seed(22)
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   parameters() %>% 
 91 |   update(trees = trees(c(100, 2000))) %>% 
 92 |   grid_max_entropy(size = num_grid)
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register the doMC parallel backend unless a single core was requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time the grid search; ROC AUC is the only metric computed.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | 
106 | grid_time <- system.time({
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row record of the run; grid_time[3] is the elapsed (wall-clock) time.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # Save under a name built from "xgb_", the core count, and the current time stamp.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | # Print session and package details for the record.
126 | sessioninfo::session_info()
127 | # When run non-interactively (e.g., R CMD BATCH), quit without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/resamples_05_05_with.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | # Settings for this run; all five are stored in the timing record saved at the end.
 10 | num_resamples <- 5
 11 | num_grid <- 10
 12 | num_cores <- 5
 13 | preproc <- "light preprocessing"
 14 | par_method <- "resamples"
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | # Build the modeling data from the nycflights13 `flights` and `weather` tables.
 18 | set.seed(123)
 19 | # The seed set above fixes the 4000-row random sample taken at the end of the pipe.
 20 | flight_data <- 
 21 |   flights %>% 
 22 |   mutate(
 23 |     # Convert the arrival delay to a two-level factor: "late" (>= 30) or "on_time"
 24 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 25 |     arr_delay = factor(arr_delay),
 26 |     # We will use the date (not date-time) in the recipe below
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Inner-join the weather data (rows without a weather match are dropped)
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Only retain the specific columns we will use
 32 |   select(dep_time, flight, origin, dest, air_time, distance, 
 33 |          carrier, date, arr_delay, time_hour) %>% 
 34 |   # Exclude missing data
 35 |   na.omit() %>% 
 36 |   # For creating models, it is better to have qualitative columns
 37 |   # encoded as factors (instead of character strings)
 38 |   mutate_if(is.character, as.factor) %>% 
 39 |   sample_n(4000)
 40 | 
 41 | ## -----------------------------------------------------------------------------
 42 | # Recipe: flight and time_hour become ID columns; date yields dow/month/holidays.
 43 | flights_rec <- 
 44 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 45 |   update_role(flight, time_hour, new_role = "ID") %>% 
 46 |   step_date(date, features = c("dow", "month")) %>% 
 47 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 48 |   step_rm(date) %>% 
 49 |   step_dummy(all_nominal_predictors()) %>% 
 50 |   step_zv(all_predictors())
 51 | # The recipe applied to flight_data; only the "no preprocessing" branch uses this.
 52 | preproc_data <- 
 53 |   flights_rec %>% 
 54 |   prep() %>% 
 55 |   juice(all_predictors(), all_outcomes())
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classifier (xgboost engine); six parameters are flagged for tuning.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # preproc is "light preprocessing" here, so the recipe branch below is the one used.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>% 
 71 |     add_model(xgboost_spec) 
 72 | # Bootstrap resamples of the raw (un-preprocessed) data.
 73 |   set.seed(33)
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>% 
 79 |     add_model(xgboost_spec) 
 80 |   # Bootstrap resamples of the already preprocessed data.
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Max-entropy grid with num_grid candidates; the trees range is set to 100-2000.
 87 | set.seed(22)
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   parameters() %>% 
 91 |   update(trees = trees(c(100, 2000))) %>% 
 92 |   grid_max_entropy(size = num_grid)
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register the doMC parallel backend unless a single core was requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time the grid search; ROC AUC is the only metric computed.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | 
106 | grid_time <- system.time({
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row record of the run; grid_time[3] is the elapsed (wall-clock) time.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # Save under a name built from "xgb_", the core count, and the current time stamp.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | # Print session and package details for the record.
126 | sessioninfo::session_info()
127 | # When run non-interactively (e.g., R CMD BATCH), quit without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/resamples_05_05_without.R:
--------------------------------------------------------------------------------
  1 | library(tidymodels)
  2 | library(nycflights13)
  3 | library(doMC)
  4 | library(rlang)
  5 | library(xgboost)
  6 | library(vctrs)
  7 | 
  8 | ## -----------------------------------------------------------------------------
  9 | # Settings for this run; all five are stored in the timing record saved at the end.
 10 | num_resamples <- 5
 11 | num_grid <- 10
 12 | num_cores <- 5
 13 | preproc <- "no preprocessing"
 14 | par_method <- "resamples"
 15 | 
 16 | ## -----------------------------------------------------------------------------
 17 | # Build the modeling data from the nycflights13 `flights` and `weather` tables.
 18 | set.seed(123)
 19 | # The seed set above fixes the 4000-row random sample taken at the end of the pipe.
 20 | flight_data <- 
 21 |   flights %>% 
 22 |   mutate(
 23 |     # Convert the arrival delay to a two-level factor: "late" (>= 30) or "on_time"
 24 |     arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
 25 |     arr_delay = factor(arr_delay),
 26 |     # We will use the date (not date-time) in the recipe below
 27 |     date = as.Date(time_hour)
 28 |   ) %>% 
 29 |   # Inner-join the weather data (rows without a weather match are dropped)
 30 |   inner_join(weather, by = c("origin", "time_hour")) %>% 
 31 |   # Only retain the specific columns we will use
 32 |   select(dep_time, flight, origin, dest, air_time, distance, 
 33 |          carrier, date, arr_delay, time_hour) %>% 
 34 |   # Exclude missing data
 35 |   na.omit() %>% 
 36 |   # For creating models, it is better to have qualitative columns
 37 |   # encoded as factors (instead of character strings)
 38 |   mutate_if(is.character, as.factor) %>% 
 39 |   sample_n(4000)
 40 | 
 41 | ## -----------------------------------------------------------------------------
 42 | # Recipe: flight and time_hour become ID columns; date yields dow/month/holidays.
 43 | flights_rec <- 
 44 |   recipe(arr_delay ~ ., data = flight_data) %>% 
 45 |   update_role(flight, time_hour, new_role = "ID") %>% 
 46 |   step_date(date, features = c("dow", "month")) %>% 
 47 |   step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
 48 |   step_rm(date) %>% 
 49 |   step_dummy(all_nominal_predictors()) %>% 
 50 |   step_zv(all_predictors())
 51 | # The recipe applied to flight_data; this is the data resampled in this script.
 52 | preproc_data <- 
 53 |   flights_rec %>% 
 54 |   prep() %>% 
 55 |   juice(all_predictors(), all_outcomes())
 56 | 
 57 | ## -----------------------------------------------------------------------------
 58 | # Boosted-tree classifier (xgboost engine); six parameters are flagged for tuning.
 59 | xgboost_spec <- 
 60 |   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
 61 |              loss_reduction = tune(), sample_size = tune()) %>% 
 62 |   set_mode("classification") %>% 
 63 |   set_engine("xgboost") 
 64 | 
 65 | ## -----------------------------------------------------------------------------
 66 | # preproc is "no preprocessing" here, so the else branch below is the one used.
 67 | if (preproc != "no preprocessing") {
 68 |   xgboost_workflow <- 
 69 |     workflow() %>% 
 70 |     add_recipe(flights_rec) %>% 
 71 |     add_model(xgboost_spec) 
 72 | # Bootstrap resamples of the raw (un-preprocessed) data.
 73 |   set.seed(33)
 74 |   bt <- bootstraps(flight_data, times = num_resamples)
 75 | } else {
 76 |   xgboost_workflow <- 
 77 |     workflow() %>% 
 78 |     add_variables(arr_delay, predictors = c(everything())) %>% 
 79 |     add_model(xgboost_spec) 
 80 |   # Bootstrap resamples of the already preprocessed data.
 81 |   set.seed(33)
 82 |   bt <- bootstraps(preproc_data, times = num_resamples)
 83 | }
 84 | 
 85 | ## -----------------------------------------------------------------------------
 86 | # Max-entropy grid with num_grid candidates; the trees range is set to 100-2000.
 87 | set.seed(22)
 88 | xgboost_grid <- 
 89 |   xgboost_workflow %>% 
 90 |   parameters() %>% 
 91 |   update(trees = trees(c(100, 2000))) %>% 
 92 |   grid_max_entropy(size = num_grid)
 93 | 
 94 | ## -----------------------------------------------------------------------------
 95 | # Register the doMC parallel backend unless a single core was requested.
 96 | if (num_cores > 1) {
 97 |   registerDoMC(cores=num_cores)
 98 | }
 99 | 
100 | ## -----------------------------------------------------------------------------
101 | # Time the grid search; ROC AUC is the only metric computed.
102 | roc_res <- metric_set(roc_auc)
103 | 
104 | ctrl <- control_grid(parallel_over = par_method)
105 | 
106 | grid_time <- system.time({
107 |   set.seed(99)
108 |   xgboost_workflow %>%
109 |     tune_grid(bt, grid = xgboost_grid, metrics = roc_res, control = ctrl)
110 | })
111 | 
112 | ## -----------------------------------------------------------------------------
113 | # One-row record of the run; grid_time[3] is the elapsed (wall-clock) time.
114 | times <- tibble::tibble(
115 |   elapsed = grid_time[3],
116 |   num_resamples = num_resamples,
117 |   num_grid = num_grid,
118 |   num_cores = num_cores,
119 |   preproc = preproc,
120 |   par_method = par_method
121 | )
122 | 
123 | # Save under a name built from "xgb_", the core count, and the current time stamp.
124 | save(times, file = paste0("xgb_", num_cores, format(Sys.time(), "_%Y_%m_%d_%H_%M_%S.RData")))
125 | # Print session and package details for the record.
126 | sessioninfo::session_info()
127 | # When run non-interactively (e.g., R CMD BATCH), quit without saving the workspace.
128 | if (!interactive()) {
129 |   q("no")
130 | }
131 | 
132 | 


--------------------------------------------------------------------------------
/extras/parallel_times/resamples_times.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/resamples_times.RData


--------------------------------------------------------------------------------
/extras/parallel_times/runs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | 
 3 | R CMD BATCH --vanilla everything_05_03_expensive.R
 4 | sleep 20
 5 | R CMD BATCH --vanilla everything_05_10_expensive.R
 6 | sleep 20
 7 | R CMD BATCH --vanilla resamples_05_04_without.R
 8 | sleep 20
 9 | R CMD BATCH --vanilla resamples_05_03_without.R
10 | sleep 20
11 | R CMD BATCH --vanilla everything_05_04_without.R
12 | sleep 20
13 | R CMD BATCH --vanilla everything_05_01_expensive.R
14 | sleep 20
15 | R CMD BATCH --vanilla everything_05_02_with.R
16 | sleep 20
17 | R CMD BATCH --vanilla everything_05_03_without.R
18 | sleep 20
19 | R CMD BATCH --vanilla everything_05_15_without.R
20 | sleep 20
21 | R CMD BATCH --vanilla everything_05_20_with.R
22 | sleep 20
23 | R CMD BATCH --vanilla resamples_05_01_with.R
24 | sleep 20
25 | R CMD BATCH --vanilla resamples_05_05_expensive.R
26 | sleep 20
27 | R CMD BATCH --vanilla everything_05_15_with.R
28 | sleep 20
29 | R CMD BATCH --vanilla everything_05_10_without.R
30 | sleep 20
31 | R CMD BATCH --vanilla resamples_05_02_without.R
32 | sleep 20
33 | R CMD BATCH --vanilla resamples_05_02_expensive.R
34 | sleep 20
35 | R CMD BATCH --vanilla everything_05_20_without.R
36 | sleep 20
37 | R CMD BATCH --vanilla resamples_05_04_with.R
38 | sleep 20
39 | R CMD BATCH --vanilla everything_05_01_with.R
40 | sleep 20
41 | R CMD BATCH --vanilla resamples_05_03_with.R
42 | sleep 20
43 | R CMD BATCH --vanilla resamples_05_05_without.R
44 | sleep 20
45 | R CMD BATCH --vanilla everything_05_10_with.R
46 | sleep 20
47 | R CMD BATCH --vanilla resamples_05_01_without.R
48 | sleep 20
49 | R CMD BATCH --vanilla everything_05_05_with.R
50 | sleep 20
51 | R CMD BATCH --vanilla everything_05_05_without.R
52 | sleep 20
53 | R CMD BATCH --vanilla everything_05_03_with.R
54 | sleep 20
55 | R CMD BATCH --vanilla everything_05_05_expensive.R
56 | sleep 20
57 | R CMD BATCH --vanilla resamples_05_01_expensive.R
58 | sleep 20
59 | R CMD BATCH --vanilla everything_05_02_without.R
60 | sleep 20
61 | R CMD BATCH --vanilla everything_05_15_expensive.R
62 | sleep 20
63 | R CMD BATCH --vanilla everything_05_02_expensive.R
64 | sleep 20
65 | R CMD BATCH --vanilla resamples_05_02_with.R
66 | sleep 20
67 | R CMD BATCH --vanilla resamples_05_03_expensive.R
68 | sleep 20
69 | R CMD BATCH --vanilla resamples_05_04_expensive.R
70 | sleep 20
71 | R CMD BATCH --vanilla resamples_05_05_with.R
72 | sleep 20
73 | R CMD BATCH --vanilla everything_05_20_expensive.R
74 | sleep 20
75 | R CMD BATCH --vanilla everything_05_04_expensive.R
76 | sleep 20
77 | R CMD BATCH --vanilla everything_05_04_with.R
78 | sleep 20
79 | R CMD BATCH --vanilla everything_05_01_without.R
80 | 


--------------------------------------------------------------------------------
/extras/parallel_times/tune_iter_times_everything.R:
--------------------------------------------------------------------------------
 1 | #remotes::install_github("tidymodels/tune@monitor-execution-times")
 2 | # This will try to write to ~/tmp
 3 | library(tidymodels)
 4 | library(doParallel)
 5 | # Start ten PSOCK workers and register them for the tune_grid() run below.
 6 | cl <- makePSOCKcluster(10)
 7 | registerDoParallel(cl)
 8 | 
 9 | options(width = 120)
10 | 
11 | # Modeling data: the `cells` data set without its `case` column.
12 | data(cells)
13 | cells <- cells %>% select(-case)
14 | 
15 | # Seeded 5-fold cross-validation.
16 | set.seed(6735)
17 | folds <- vfold_cv(cells, v = 5)
18 | 
19 | # Preprocessing: normalize the numeric predictors, then compute 30 ICA components.
20 | cell_rec <-
21 |   recipe(class ~ ., data = cells) %>%
22 |   step_normalize(all_numeric_predictors()) %>%
23 |   step_ica(all_numeric_predictors(), num_comp = 30)
24 | 
25 | # Random forest (ranger engine, 50 trees) with mtry and min_n marked for tuning.
26 | rf_mod <-
27 |   rand_forest(mtry = tune(), min_n = tune(), trees = 50) %>%
28 |   set_engine("ranger") %>%
29 |   set_mode("classification")
30 | 
31 | # Use a space-filling design with 7 points
32 | set.seed(3254)
33 | rf_res <- tune_grid(rf_mod, cell_rec, resamples = folds, grid = 7,
34 |                      control = control_grid(parallel_over = "everything"))
35 | 
36 | # The workers are only needed for tune_grid(). Shut them down so they do not
37 | # outlive this script when it is sourced in an interactive session, and fall
38 | # back to sequential execution for any later foreach loops.
39 | parallel::stopCluster(cl)
40 | foreach::registerDoSEQ()
41 | 
42 | # Timing files in ~/tmp (presumably written by the instrumented tune branch).
43 | # NOTE(review): files left over from an earlier run also match this pattern and
44 | # would be combined with the new ones -- clear ~/tmp beforehand; TODO confirm.
45 | f_names <- list.files("~/tmp", pattern = "^time", full.names = TRUE)
46 | 
47 | if (length(f_names) == 0) {
48 |   stop("No timing files matching '^time' were found in ~/tmp.", call. = FALSE)
49 | }
50 | 
51 | # Each file is expected to hold an object named `res`. Load every file into its
52 | # own environment (rather than the global one) and stack the results in one step.
53 | timings <- bind_rows(
54 |   lapply(f_names, function(f_name) {
55 |     file_env <- new.env()
56 |     load(f_name, envir = file_env)
57 |     file_env$res
58 |   })
59 | )
60 | 
61 | # Label each timing as preprocessing (mod_iter == 0) or model fitting, and
62 | # replace the process ids by "worker <k>" labels.
63 | # NOTE(review): the object name is spelled `everyting_times`; it is kept as is
64 | # because other files may refer to it by this name.
65 | everyting_times <-
66 |   timings %>%
67 |   mutate(
68 |     label = ifelse(mod_iter == 0, "preprocess", "model"),
69 |     label = factor(label, levels = rev(c("preprocess", "model"))),
70 |     pid = factor(format(pid)),
71 |     pid = paste("worker", format(as.numeric(pid)))
72 |   ) %>%
73 |   arrange(pid, id, label)


--------------------------------------------------------------------------------
/extras/parallel_times/tune_iter_times_resamples.R:
--------------------------------------------------------------------------------
 1 | #remotes::install_github("tidymodels/tune@monitor-execution-times")
 2 | # This will try to write to ~/tmp
 3 | library(tidymodels)
 4 | library(doParallel)
 5 | # Start ten PSOCK workers and register them for the tune_grid() run below.
 6 | cl <- makePSOCKcluster(10)
 7 | registerDoParallel(cl)
 8 | 
 9 | options(width = 120)
10 | 
11 | # Modeling data: the `cells` data set without its `case` column.
12 | data(cells)
13 | cells <- cells %>% select(-case)
14 | 
15 | # Seeded 5-fold cross-validation.
16 | set.seed(6735)
17 | folds <- vfold_cv(cells, v = 5)
18 | 
19 | # Preprocessing: normalize the numeric predictors, then compute 30 ICA components.
20 | cell_rec <-
21 |   recipe(class ~ ., data = cells) %>%
22 |   step_normalize(all_numeric_predictors()) %>%
23 |   step_ica(all_numeric_predictors(), num_comp = 30)
24 | 
25 | # Random forest (ranger engine, 50 trees) with mtry and min_n marked for tuning.
26 | rf_mod <-
27 |   rand_forest(mtry = tune(), min_n = tune(), trees = 50) %>%
28 |   set_engine("ranger") %>%
29 |   set_mode("classification")
30 | 
31 | # Use a space-filling design with 7 points
32 | set.seed(3254)
33 | rf_res <- tune_grid(rf_mod, cell_rec, resamples = folds, grid = 7,
34 |                      control = control_grid(parallel_over = "resamples"))
35 | 
36 | # The workers are only needed for tune_grid(). Shut them down so they do not
37 | # outlive this script when it is sourced in an interactive session, and fall
38 | # back to sequential execution for any later foreach loops.
39 | parallel::stopCluster(cl)
40 | foreach::registerDoSEQ()
41 | 
42 | # Timing files in ~/tmp (presumably written by the instrumented tune branch).
43 | # NOTE(review): files left over from an earlier run also match this pattern and
44 | # would be combined with the new ones -- clear ~/tmp beforehand; TODO confirm.
45 | f_names <- list.files("~/tmp", pattern = "^time", full.names = TRUE)
46 | 
47 | if (length(f_names) == 0) {
48 |   stop("No timing files matching '^time' were found in ~/tmp.", call. = FALSE)
49 | }
50 | 
51 | # Each file is expected to hold an object named `res`. Load every file into its
52 | # own environment (rather than the global one) and stack the results in one step.
53 | timings <- bind_rows(
54 |   lapply(f_names, function(f_name) {
55 |     file_env <- new.env()
56 |     load(f_name, envir = file_env)
57 |     file_env$res
58 |   })
59 | )
60 | 
61 | # Label each timing as preprocessing (mod_iter == 0) or model fitting, replace
62 | # the process ids by "worker <k>" labels, and add a combined "<id> / <worker>"
63 | # identifier.
64 | resamples_times <-
65 |   timings %>%
66 |   mutate(
67 |     label = ifelse(mod_iter == 0, "preprocess", "model"),
68 |     label = factor(label, levels = rev(c("preprocess", "model"))),
69 |     pid = factor(format(pid)),
70 |     pid = paste("worker", format(as.numeric(pid))),
71 |     id_alt = paste(id, "/", pid)
72 |   ) %>%
73 |   arrange(pid, id, label)


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_10_2020_10_28_17_49_51.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_10_2020_10_28_17_49_51.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_10_2020_10_28_21_28_17.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_10_2020_10_28_21_28_17.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_10_2020_10_28_22_43_51.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_10_2020_10_28_22_43_51.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_15_2020_10_28_20_51_49.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_15_2020_10_28_20_51_49.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_15_2020_10_28_21_23_27.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_15_2020_10_28_21_23_27.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_15_2020_10_29_00_52_42.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_15_2020_10_29_00_52_42.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_1_2020_10_28_20_29_50.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_1_2020_10_28_20_29_50.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_1_2020_10_28_21_13_01.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_1_2020_10_28_21_13_01.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_1_2020_10_28_22_26_43.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_1_2020_10_28_22_26_43.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_1_2020_10_28_23_01_51.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_1_2020_10_28_23_01_51.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_1_2020_10_29_00_23_51.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_1_2020_10_29_00_23_51.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_1_2020_10_29_04_04_38.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_1_2020_10_29_04_04_38.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_20_2020_10_28_20_55_16.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_20_2020_10_28_20_55_16.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_20_2020_10_28_22_01_02.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_20_2020_10_28_22_01_02.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_20_2020_10_29_03_01_36.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_20_2020_10_29_03_01_36.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_2_2020_10_28_20_41_12.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_2_2020_10_28_20_41_12.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_2_2020_10_28_21_39_20.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_2_2020_10_28_21_39_20.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_2_2020_10_28_21_57_32.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_2_2020_10_28_21_57_32.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_2_2020_10_29_00_35_18.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_2_2020_10_29_00_35_18.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_2_2020_10_29_02_04_38.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_2_2020_10_29_02_04_38.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_2_2020_10_29_02_15_36.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_2_2020_10_29_02_15_36.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_3_2020_10_28_17_30_51.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_3_2020_10_28_17_30_51.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_3_2020_10_28_18_05_27.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_3_2020_10_28_18_05_27.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_3_2020_10_28_20_48_31.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_3_2020_10_28_20_48_31.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_3_2020_10_28_22_34_29.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_3_2020_10_28_22_34_29.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_3_2020_10_28_23_21_46.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_3_2020_10_28_23_21_46.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_3_2020_10_29_02_28_28.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_3_2020_10_29_02_28_28.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_4_2020_10_28_17_57_37.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_4_2020_10_28_17_57_37.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_4_2020_10_28_18_11_50.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_4_2020_10_28_18_11_50.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_4_2020_10_28_22_08_45.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_4_2020_10_28_22_08_45.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_4_2020_10_29_02_41_00.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_4_2020_10_29_02_41_00.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_4_2020_10_29_03_40_17.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_4_2020_10_29_03_40_17.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_4_2020_10_29_03_46_38.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_4_2020_10_29_03_46_38.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_5_2020_10_28_21_20_10.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_5_2020_10_28_21_20_10.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_5_2020_10_28_22_39_01.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_5_2020_10_28_22_39_01.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_5_2020_10_28_23_08_07.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_5_2020_10_28_23_08_07.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_5_2020_10_28_23_14_28.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_5_2020_10_28_23_14_28.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_5_2020_10_28_23_54_07.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_5_2020_10_28_23_54_07.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_5_2020_10_29_02_45_30.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_5_2020_10_29_02_45_30.RData


--------------------------------------------------------------------------------
/extras/parallel_times/xgb_times.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/extras/parallel_times/xgb_times.RData


--------------------------------------------------------------------------------
/extras/sa_2d_plot.R:
--------------------------------------------------------------------------------
  1 | # Print one plot per simulated annealing iteration on top of the ROC AUC surface.
  2 | #
  3 | # sa_obj:   object whose collect_metrics() output has .iter, cost, and rbf_sigma.
  4 | # history:  data frame with .iter, .config, .parent, and results columns.
  5 | # large_sa: object whose collect_metrics() output has cost, rbf_sigma, and mean;
  6 | #           it is drawn as the background raster.
  7 | # path:     not used in the body; kept so existing calls keep working.
  8 | #
  9 | # Returns NULL invisibly; each plot is emitted with print().
 10 | sa_2d_plot <- function(sa_obj, history, large_sa, path = tempdir()) {
 11 |   # Axis limits span both result sets, padded on the log2/log10 scales.
 12 |   range_data <-
 13 |     sa_obj %>%
 14 |     collect_metrics() %>%
 15 |     select(cost, rbf_sigma) %>%
 16 |     bind_rows(
 17 |       large_sa %>%
 18 |         collect_metrics() %>%
 19 |         select(cost, rbf_sigma)
 20 |     ) %>%
 21 |     mutate(
 22 |       cost = log2(cost),
 23 |       rbf_sigma = log10(rbf_sigma)
 24 |     )
 25 |   x_rng <- 10^extendrange(range_data$rbf_sigma)
 26 |   y_rng <- 2^extendrange(range_data$cost)
 27 |   
 28 |   # Initial candidates are the rows with .iter == 0.
 29 |   init <-
 30 |     sa_obj %>%
 31 |     collect_metrics() %>%
 32 |     select(.iter, cost, rbf_sigma) %>%
 33 |     arrange(.iter) %>%
 34 |     filter(.iter == 0)
 35 |   
 36 |   ## -----------------------------------------------------------------------------
 37 |   
 38 |   base_plot <-
 39 |     large_sa %>%
 40 |     collect_metrics() %>%
 41 |     ggplot(aes(x = rbf_sigma, y = cost)) +
 42 |     geom_raster(aes(fill = mean), show.legend = FALSE) +
 43 |     scale_x_log10(labels = fmt_dcimals(2), limits = x_rng) +
 44 |     scale_y_continuous(trans = "log2", labels = fmt_dcimals(2), limits = y_rng) +
 45 |     scale_fill_distiller(palette = "Blues") +
 46 |     theme_minimal() +
 47 |     theme(
 48 |       legend.position = "bottom",
 49 |       legend.key.width = grid::unit(2, "cm"),
 50 |       plot.title = element_text(hjust = 0.5)
 51 |     ) +
 52 |     guides(title.position = "bottom") +
 53 |     labs(x = "rbf_sigma\n\n\n\n", title = "ROC AUC surface") +
 54 |     coord_fixed(ratio = 1/2.5) +
 55 |     geom_point(data = init, pch = 4, cex = 4)
 56 |   
 57 |   ## -----------------------------------------------------------------------------
 58 |   
 59 |   # Subtitles put "suboptimal"/"best" on a second line; computed once up front.
 60 |   subtitles <- gsub(" suboptimal", "\nsuboptimal", history$results)
 61 |   subtitles <- gsub(" best", "\nbest", subtitles)
 62 |   
 63 |   # seq() with length.out yields an empty loop when `history` has no rows past
 64 |   # the initial ones, whereas `(num_init + 1):nrow(history)` would count down.
 65 |   num_init <- nrow(init)
 66 |   iter_rows <- seq(num_init + 1, length.out = max(nrow(history) - num_init, 0))
 67 |   
 68 |   for (i in iter_rows) {
 69 |     current_iter <- history$.iter[i]
 70 |     current_res <- current_param_path(history, current_iter)
 71 |     current_best <- current_res %>% dplyr::filter(results == "new best")
 72 |     
 73 |     # The subtitle's horizontal position depends on the outcome of this step.
 74 |     text_just <-
 75 |       case_when(
 76 |         history$results[i] == "restart from best"  ~ 0.00,
 77 |         history$results[i] == "discard suboptimal" ~ 0.25,
 78 |         history$results[i] == "accept suboptimal"  ~ 0.50,
 79 |         history$results[i] == "better suboptimal"  ~ 0.75,
 80 |         history$results[i] == "new best"           ~ 1.00
 81 |       )
 82 |     
 83 |     new_plot <-
 84 |       base_plot +
 85 |       geom_point(
 86 |         data = current_res %>% slice(n()),
 87 |         size = 3,
 88 |         col = "green"
 89 |       ) +
 90 |       geom_path(
 91 |         data = current_res,
 92 |         alpha = .5,
 93 |         arrow = arrow(length = unit(0.1, "inches"))
 94 |       ) +
 95 |       ggtitle(paste0("Iteration ", current_iter), subtitle = subtitles[i]) +
 96 |       theme(plot.subtitle = element_text(hjust = text_just))
 97 |     
 98 |     # Rows on the current path labelled "new best" get a small extra dot.
 99 |     if (nrow(current_best) > 0) {
100 |       new_plot <- new_plot + geom_point(data = current_best, size = 1/3)
101 |     }
102 |     print(new_plot)
103 |   }
104 |   invisible(NULL)
105 | }
106 | 
107 | current_param_path <- function(x, iter) {
108 |   # Keep rows up to `iter`, then walk from the last row back through the
109 |   # .parent -> .config links; rows come back in reverse order of that walk.
110 |   upto <- dplyr::filter(x, .iter <= iter)
111 |   idx <- nrow(upto)
112 |   chain <- idx
113 |   while (length(idx) > 0) {
114 |     idx <- which(upto$.config == upto$.parent[idx])
115 |     chain <- c(chain, idx)
116 |   }
117 |   dplyr::slice(upto, rev(chain))
118 | }
119 | 


--------------------------------------------------------------------------------
/extras/submodels/with_submodel_trick.R:
--------------------------------------------------------------------------------
 1 | library(tidymodels)
 2 | library(tictoc)
 3 | library(doMC)
 4 | 
 5 | ## -----------------------------------------------------------------------------
 6 | # Load `cells`, drop its `case` column, and set up the resamples and metric set.
 7 | data(cells)
 8 | cells <- cells %>% select(-case)
 9 | set.seed(33)
10 | cell_folds <- vfold_cv(cells)
11 | roc_res <- metric_set(roc_auc)
12 | 
13 | ## -----------------------------------------------------------------------------
14 | # C5.0 boosted classification tree with the number of trees marked for tuning.
15 | c5_spec <- 
16 |   boost_tree(trees = tune()) %>% 
17 |   set_engine("C5.0") %>% 
18 |   set_mode("classification")
19 | # Time a grid search over trees = 1:100 before registerDoMC() is called.
20 | tic()
21 | set.seed(2)
22 | c5_spec %>%
23 |   tune_grid(
24 |     class ~ .,
25 |     resamples = cell_folds,
26 |     grid = data.frame(trees = 1:100),
27 |     metrics = roc_res
28 |   )
29 | toc()
30 | 
31 | ## -----------------------------------------------------------------------------
32 | # Register 10 doMC cores, then time the identical grid search again.
33 | registerDoMC(cores = 10)
34 | 
35 | tic()
36 | set.seed(2)
37 | c5_spec %>%
38 |   tune_grid(
39 |     class ~ .,
40 |     resamples = cell_folds,
41 |     grid = data.frame(trees = 1:100),
42 |     metrics = roc_res
43 |   )
44 | toc()
45 | 
46 | ## -----------------------------------------------------------------------------
47 | # Print the session details and quit without saving the workspace.
48 | sessioninfo::session_info()
49 | 
50 | q("no")
51 | 
52 | 


--------------------------------------------------------------------------------
/extras/submodels/without_submodel_trick.R:
--------------------------------------------------------------------------------
 1 | # remotes::install_github("tidymodels/parsnip@no-submodel-trick")
 2 | library(tidymodels)
 3 | library(tictoc)
 4 | library(doMC)
 5 | 
 6 | ## -----------------------------------------------------------------------------
 7 | # Load `cells`, drop its `case` column, and set up the resamples and metric set.
 8 | data(cells)
 9 | cells <- cells %>% select(-case)
10 | set.seed(33)
11 | cell_folds <- vfold_cv(cells)
12 | roc_res <- metric_set(roc_auc)
13 | 
14 | ## -----------------------------------------------------------------------------
15 | # C5.0 boosted classification tree with the number of trees marked for tuning.
16 | c5_spec <- 
17 |   boost_tree(trees = tune()) %>% 
18 |   set_engine("C5.0") %>% 
19 |   set_mode("classification")
20 | # Time a grid search over trees = 1:100 before registerDoMC() is called.
21 | tic()
22 | set.seed(2)
23 | c5_spec %>%
24 |   tune_grid(
25 |     class ~ .,
26 |     resamples = cell_folds,
27 |     grid = data.frame(trees = 1:100),
28 |     metrics = roc_res
29 |   )
30 | toc()
31 | 
32 | ## -----------------------------------------------------------------------------
33 | # Register 10 doMC cores, then time the identical grid search again.
34 | registerDoMC(cores = 10)
35 | 
36 | tic()
37 | set.seed(2)
38 | c5_spec %>%
39 |   tune_grid(
40 |     class ~ .,
41 |     resamples = cell_folds,
42 |     grid = data.frame(trees = 1:100),
43 |     metrics = roc_res
44 |   )
45 | toc()
46 | 
47 | ## -----------------------------------------------------------------------------
48 | # Print the session details and quit without saving the workspace.
49 | sessioninfo::session_info()
50 | 
51 | q("no")
52 | 
53 | 


--------------------------------------------------------------------------------
/extras/verify_results.R:
--------------------------------------------------------------------------------
 1 | # These functions check that our results have not changed, so that the
 2 | # interpretation in the text remains correct.
 3 | 
 4 | verify_consistent_bo <- function(x) {
 5 |   # Abort unless `x` matches the stored results (generated on 2022-02-16); they are
 6 |   # loaded into their own environment so the file's contents cannot overwrite `x`.
 7 |   stored <- new.env(parent = emptyenv())
 8 |   load("RData/svm_bo_metrics.RData", envir = stored)
 9 |   bo_check <- all.equal(x, stored$svm_bo_metrics, tolerance = 0.01)
10 |   if (!isTRUE(bo_check)) {
11 |     msg <- "These Bayesian optimization results don't match the previous values:\n"
12 |     rlang::abort(paste0(msg, paste0(bo_check, collapse = "\n")))
13 |   }
14 |   invisible(NULL)
15 | }
16 | 
17 | verify_consistent_sa <- function(x) {
18 |   # Abort unless `x` matches the stored results (generated on 2022-02-16); they are
19 |   # loaded into their own environment so the file's contents cannot overwrite `x`.
20 |   stored <- new.env(parent = emptyenv())
21 |   load("RData/svm_sa_metrics.RData", envir = stored)
22 |   sa_check <- all.equal(x, stored$svm_sa_metrics, tolerance = 0.01)
23 |   if (!isTRUE(sa_check)) {
24 |     msg <- "These simulated annealing results don't match the previous values:\n"
25 |     rlang::abort(paste0(msg, paste0(sa_check, collapse = "\n")))
26 |   }
27 |   invisible(NULL)
28 | }


--------------------------------------------------------------------------------
/figures/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/figures/.DS_Store


--------------------------------------------------------------------------------
/figures/introduction-descr-examples-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/figures/introduction-descr-examples-1.pdf


--------------------------------------------------------------------------------
/figures/introduction-descr-examples-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/figures/introduction-descr-examples-1.png


--------------------------------------------------------------------------------
/figures/introduction-modeling-process-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/figures/introduction-modeling-process-1.pdf


--------------------------------------------------------------------------------
/figures/tidyverse-cricket-plot-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/figures/tidyverse-cricket-plot-1.pdf


--------------------------------------------------------------------------------
/figures/tidyverse-interaction-plots-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/figures/tidyverse-interaction-plots-1.pdf


--------------------------------------------------------------------------------
/images/cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/images/cover.png


--------------------------------------------------------------------------------
/images/error.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/images/error.png


--------------------------------------------------------------------------------
/images/note.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/images/note.png


--------------------------------------------------------------------------------
/images/robot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/images/robot.png


--------------------------------------------------------------------------------
/images/rstudio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/images/rstudio.png


--------------------------------------------------------------------------------
/images/tip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/images/tip.png


--------------------------------------------------------------------------------
/images/warning.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/images/warning.png


--------------------------------------------------------------------------------
/issue_template.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report or feature request
 3 | about: Describe a bug you've seen or make a suggestion
 4 | ---
 5 | 
 6 | # PLEASE READ: Making a new issue for _Tidy Modeling with R_
 7 | 
 8 | 
 9 | Thanks for giving us feedback on the book. Please follow these guidelines:
10 | 
11 | Please state the version of the book by referencing the **version** and **date** on the book's first page (under the title and authors). 
12 | 
13 | 
14 | ## Comments or questions on the content
15 | 
16 | Please tell us where exactly the issue is (e.g. chapter, section, figure number, etc.) and, whenever possible, copy/paste the text in question using `>` in the issue. 
17 | For example: 
18 | 
19 | >  In 0.0.1.9000 (2020-03-05), capitalization in article/book names is inconsistent. Also, sometimes initials are used for names, sometimes not: 
20 | 
21 | > Agresti, Alan. 2012. Categorical Data Analysis. Wiley-Interscience.
22 | > Altman, D. 1991. “Categorising Continuous Variables.” British Journal of Cancer, no. 5:975.
23 | 
24 | ## Potential Bugs
25 | 
26 | Since the repo and data are publicly available, it should be entirely possible to create a minimal reprex (reproducible example).
27 | 
28 | The goal of a reprex is to make it as easy as possible for me to recreate your problem so that I can fix it: please help me help you!
29 | 
30 | If you've never heard of a reprex before, start by reading "[What is a reprex](https://github.com/tidyverse/reprex#what-is-a-reprex)", and follow the advice further down that page.
31 | 
32 | 
33 | ## Contributions
34 | 
35 | Details on how to contribute are in `contributing.md`.


--------------------------------------------------------------------------------
/latex_extras/preamble.tex:
--------------------------------------------------------------------------------
 1 | % Begin preamble.tex -----------------------------------------------------------
 2 | 
 3 | % ------------------------------------------------------------------------------
 4 | % size based on "Statistical Rethinking" style files
 5 | 
 6 | \usepackage[paperwidth=7.67in,paperheight=10.67in,layoutwidth=7in,layoutheight=10in,text={5.5in,8.5in},left=0.65in,top=0.75in,headheight=0.25in,headsep=0.4in,footskip=0.4in,showcrop, layouthoffset=0.33in, layoutvoffset=0.33in]{geometry}
 7 | 
 8 | % ------------------------------------------------------------------------------
 9 | % headers
10 | % fancyhdr supplies the "fancy" page style and the \fancyhead/\fancyhf commands.
11 | \usepackage{fancyhdr}
12 | % Odd pages: \rightmark centered, page number right; even pages: \leftmark centered, page number left.
13 | \pagestyle{fancy}
14 | \fancyhf{}
15 | \fancyhead[CO]{\nouppercase{\emph{\rightmark}}}
16 | \fancyhead[CE]{\nouppercase{\emph{\leftmark}}}
17 | \fancyhead[RO]{\thepage}
18 | \fancyhead[LE]{\thepage}
19 | % no line in header or footer
20 | \renewcommand{\headrulewidth}{0pt}
21 | 
22 | % Code chunk mods: \small highlighted code, gray \texttt comment tokens --------
23 | \DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\},fontsize=\small}
24 | \renewcommand{\CommentTok}[1]{\textcolor[rgb]{0.41,0.41,0.41}{\texttt{#1}}}
25 | 
26 | % Render custom blocks ---------------------------------------------------------
27 | % See https://github.com/rstudio/bookdown/issues/420 ---------------------------
28 | % kframe wraps its contents in \MakeFramed with a shadecolor \colorbox as the frame command.
29 | \makeatletter
30 | \newenvironment{kframe}{%
31 | \medskip{}
32 | \setlength{\fboxsep}{.8em}
33 |  \def\at@end@of@kframe{}%
34 |  \ifinner\ifhmode%
35 |   \def\at@end@of@kframe{\end{minipage}}%
36 |   \begin{minipage}{\columnwidth}%
37 |  \fi\fi%
38 |  \def\FrameCommand##1{\hskip\@totalleftmargin \hskip-\fboxsep
39 |  \colorbox{shadecolor}{##1}\hskip-\fboxsep
40 |      % There is no \\@totalrightmargin, so:
41 |      \hskip-\linewidth \hskip-\@totalleftmargin \hskip\columnwidth}%
42 |  \MakeFramed {\advance\hsize-\width
43 |    \@totalleftmargin\z@ \linewidth\hsize
44 |    \@setminipage}}%
45 |  {\par\unskip\endMakeFramed%
46 |  \at@end@of@kframe}
47 | \makeatother
48 | % If a Shaded environment is already defined, redefine it to use kframe.
49 | \makeatletter
50 | \@ifundefined{Shaded}{
51 | }{\renewenvironment{Shaded}{\begin{kframe}}{\end{kframe}}}
52 | \makeatother
53 | % rmdblock{<name>}: one-item list whose bullet is images/<name> (3em wide), with the text in a kframe; the rmd* environments below wrap it.
54 | \newenvironment{rmdblock}[1]
55 |   {
56 |   \begin{itemize}
57 |   \renewcommand{\labelitemi}{
58 |     \raisebox{-.7\height}[0pt][0pt]{
59 |       {\setkeys{Gin}{width=3em,keepaspectratio}\includegraphics{images/#1}}
60 |     }
61 |   }
62 |   \setlength{\fboxsep}{1em}
63 |   \begin{kframe}
64 |   \item
65 |   }
66 |   {
67 |   \end{kframe}
68 |   \end{itemize}
69 |   }
70 | \newenvironment{rmdnote}
71 |   {\begin{rmdblock}{note}}
72 |   {\end{rmdblock}}
73 | \newenvironment{rmdcaution}
74 |   {\begin{rmdblock}{caution}}
75 |   {\end{rmdblock}}
76 | \newenvironment{rmdimportant}
77 |   {\begin{rmdblock}{important}}
78 |   {\end{rmdblock}}
79 | \newenvironment{rmdtip}
80 |   {\begin{rmdblock}{tip}}
81 |   {\end{rmdblock}}
82 | \newenvironment{rmdwarning}
83 |   {\begin{rmdblock}{warning}}
84 |   {\end{rmdblock}}
85 | 
86 | 
87 | % End preamble.tex -------------------------------------------------------------
88 | 


--------------------------------------------------------------------------------
/premade/addin.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/addin.gif


--------------------------------------------------------------------------------
/premade/ames.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/ames.png


--------------------------------------------------------------------------------
/premade/bad-workflow.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/bad-workflow.pdf


--------------------------------------------------------------------------------
/premade/bootstraps.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/bootstraps.pdf


--------------------------------------------------------------------------------
/premade/crawford.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/crawford.png


--------------------------------------------------------------------------------
/premade/data-science-model.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/data-science-model.graffle


--------------------------------------------------------------------------------
/premade/data-science-model.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/data-science-model.pdf


--------------------------------------------------------------------------------
/premade/data-science-model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/data-science-model.png


--------------------------------------------------------------------------------
/premade/dot_rr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/dot_rr.png


--------------------------------------------------------------------------------
/premade/exp_improve.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/exp_improve.gif


--------------------------------------------------------------------------------
/premade/good-proper-workflows.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/good-proper-workflows.graffle


--------------------------------------------------------------------------------
/premade/mitchell.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/mitchell.png


--------------------------------------------------------------------------------
/premade/modeling-process.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/modeling-process.graffle


--------------------------------------------------------------------------------
/premade/modeling-process.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/modeling-process.pdf


--------------------------------------------------------------------------------
/premade/modeling-process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/modeling-process.png


--------------------------------------------------------------------------------
/premade/morphology.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/morphology.png


--------------------------------------------------------------------------------
/premade/northridge.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/northridge.png


--------------------------------------------------------------------------------
/premade/proper-workflow.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/proper-workflow.pdf


--------------------------------------------------------------------------------
/premade/recipes-process.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/recipes-process.graffle


--------------------------------------------------------------------------------
/premade/recipes-process.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/recipes-process.pdf


--------------------------------------------------------------------------------
/premade/resampling-details.graffle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/resampling-details.graffle


--------------------------------------------------------------------------------
/premade/resampling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/resampling.pdf


--------------------------------------------------------------------------------
/premade/roc_surface.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/roc_surface.png


--------------------------------------------------------------------------------
/premade/rolling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/rolling.pdf


--------------------------------------------------------------------------------
/premade/three-CV-iter.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/three-CV-iter.pdf


--------------------------------------------------------------------------------
/premade/three-CV.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/three-CV.pdf


--------------------------------------------------------------------------------
/premade/timberland.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/timberland.png


--------------------------------------------------------------------------------
/premade/validation-alt.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/validation-alt.pdf


--------------------------------------------------------------------------------
/premade/validation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/premade/validation.pdf


--------------------------------------------------------------------------------
/race_results.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/race_results.mp4


--------------------------------------------------------------------------------
/references.Rmd:
--------------------------------------------------------------------------------
1 | # REFERENCES {-}
2 | 
3 | 


--------------------------------------------------------------------------------
/sa_search.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidymodels/TMwR/6e8b6a9dd9c6aaffd9b119911ae247144bd5c4d8/sa_search.mp4


--------------------------------------------------------------------------------
/style.css:
--------------------------------------------------------------------------------
 1 | /* Callout boxes: gray background; 4em left padding leaves room for the 3em-wide icon placed 5px from the left. */
 2 | .rmdnote, .rmdtip, .rmdwarning {
 3 |   padding: 1em 1em 1em 4em;
 4 |   margin-bottom: 10px;
 5 |   background: #f5f5f5 5px center/3em no-repeat;
 6 | }
 7 | /* Each callout class supplies its own icon as the background image. */
 8 | .rmdnote {
 9 |   background-image: url("images/note.png");
10 | }
11 | .rmdtip {
12 |   background-image: url("images/tip.png");
13 | }
14 | .rmdwarning {
15 |   background-image: url("images/warning.png");
16 | }
17 | 
18 | 


--------------------------------------------------------------------------------