├── .Rbuildignore
├── .github
│   ├── .gitignore
│   ├── CODEOWNERS
│   ├── CODE_OF_CONDUCT.md
│   └── workflows
│       ├── R-CMD-check-hard.yaml
│       ├── R-CMD-check.yaml
│       ├── lock.yaml
│       ├── pkgdown.yaml
│       ├── pr-commands.yaml
│       └── test-coverage.yaml
├── .gitignore
├── .vscode
│   ├── extensions.json
│   └── settings.json
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R
│   ├── Chicago.R
│   ├── Smithsonian.R
│   ├── ad_data.R
│   ├── ames.R
│   ├── attrition.R
│   ├── biomass.R
│   ├── bivariate.R
│   ├── car_prices.R
│   ├── cat_adoption.R
│   ├── cells.R
│   ├── check_times.R
│   ├── chem_proc_yield.R
│   ├── churn.R
│   ├── concrete.R
│   ├── covers.R
│   ├── credit_data.R
│   ├── crickets.R
│   ├── deliveries.R
│   ├── drinks.R
│   ├── fine_foods.R
│   ├── grants.R
│   ├── hepatic_injury_qsar.R
│   ├── hotel_rates.R
│   ├── hpc_cv.R
│   ├── hpc_data.R
│   ├── ischemic_stroke.R
│   ├── leaf_id_flavia.R
│   ├── lending_club.R
│   ├── meats.R
│   ├── modeldata-package.R
│   ├── oils.R
│   ├── parabolic.R
│   ├── pathology.R
│   ├── pd_speech.R
│   ├── penguins.R
│   ├── permeability_qsar.R
│   ├── sacremento.R
│   ├── scat.R
│   ├── simulations.R
│   ├── solubility.R
│   ├── stackoverflow.R
│   ├── steroidogenic_toxicity.R
│   ├── tate_text.R
│   ├── taxi.R
│   ├── two_class_dat.R
│   └── wa_churn.R
├── README.Rmd
├── README.md
├── _pkgdown.yml
├── air.toml
├── codecov.yml
├── data-raw
│   ├── animal-shelter-intakes-and-outcomes.csv
│   ├── cat_adoption.R
│   ├── chem_proc_yield.R
│   ├── hepatic_injury_qsar.R
│   ├── hotel_rates.R
│   ├── ischemic_stroke.R
│   ├── leaf_id_flavia.R
│   ├── permeability_qsar.R
│   ├── prep_datasets.R
│   ├── steroidogenic_toxicity.R
│   └── taxi.R
├── data
│   ├── Chicago.rda
│   ├── Sacramento.RData
│   ├── Smithsonian.RData
│   ├── ad_data.RData
│   ├── ames.rda
│   ├── attrition.RData
│   ├── biomass.RData
│   ├── bivariate.RData
│   ├── car_prices.RData
│   ├── cat_adoption.rda
│   ├── cells.RData
│   ├── check_times.rda
│   ├── chem_proc_yield.rda
│   ├── concrete.RData
│   ├── covers.RData
│   ├── credit_data.RData
│   ├── crickets.rda
│   ├── datalist
│   ├── deliveries.rda
│   ├── drinks.rda
│   ├── grants.rda
│   ├── hepatic_injury_qsar.rda
│   ├── hotel_rates.rda
│   ├── hpc_cv.rda
│   ├── hpc_data.RData
│   ├── ischemic_stroke.rda
│   ├── leaf_id_flavia.rda
│   ├── lending_club.rda
│   ├── meats.RData
│   ├── mlc_churn.RData
│   ├── oils.RData
│   ├── parabolic.rda
│   ├── pathology.rda
│   ├── pd_speech.rda
│   ├── penguins.rda
│   ├── permeability_qsar.rda
│   ├── scat.RData
│   ├── small_fine_foods.RData
│   ├── solubility_test.rda
│   ├── stackoverflow.rda
│   ├── steroidogenic_toxicity.rda
│   ├── tate_text.rda
│   ├── taxi.rda
│   ├── two_class_dat.RData
│   ├── two_class_example.rda
│   └── wa_churn.rda
├── man
│   ├── Chicago.Rd
│   ├── Sacramento.Rd
│   ├── Smithsonian.Rd
│   ├── ad_data.Rd
│   ├── ames.Rd
│   ├── attrition.Rd
│   ├── biomass.Rd
│   ├── bivariate.Rd
│   ├── car_prices.Rd
│   ├── cat_adoption.Rd
│   ├── cells.Rd
│   ├── check_times.Rd
│   ├── chem_proc_yield.Rd
│   ├── concrete.Rd
│   ├── covers.Rd
│   ├── credit_data.Rd
│   ├── crickets.Rd
│   ├── deliveries.Rd
│   ├── drinks.Rd
│   ├── figures
│   │   └── lifecycle-deprecated.svg
│   ├── grants.Rd
│   ├── hepatic_injury_qsar.Rd
│   ├── hotel_rates.Rd
│   ├── hpc_cv.Rd
│   ├── hpc_data.Rd
│   ├── ischemic_stroke.Rd
│   ├── leaf_id_flavia.Rd
│   ├── lending_club.Rd
│   ├── meats.Rd
│   ├── mlc_churn.Rd
│   ├── modeldata-package.Rd
│   ├── oils.Rd
│   ├── parabolic.Rd
│   ├── pathology.Rd
│   ├── pd_speech.Rd
│   ├── penguins.Rd
│   ├── permeability_qsar.Rd
│   ├── rmd
│   │   └── ames.md
│   ├── scat.Rd
│   ├── sim_classification.Rd
│   ├── small_fine_foods.Rd
│   ├── solubility_test.Rd
│   ├── stackoverflow.Rd
│   ├── steroidogenic_toxicity.Rd
│   ├── tate_text.Rd
│   ├── taxi.Rd
│   ├── two_class_dat.Rd
│   ├── two_class_example.Rd
│   └── wa_churn.Rd
├── modeldata.Rproj
└── tests
    ├── testthat.R
    └── testthat
        ├── _snaps
        │   └── simulations.md
        └── test-simulations.R

--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
^.*\.Rproj$
^\.Rproj\.user$
^LICENSE\.md$
^\.github$
^_pkgdown\.yml$
^docs$
^pkgdown$
^README\.Rmd$
^codecov\.yml$
^CODE_OF_CONDUCT\.md$
^data-raw$
^revdep$
^.Rhistory$
^[\.]?air\.toml$
^\.vscode$

--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
*.html

--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
# CODEOWNERS for modeldata
# https://www.tidyverse.org/development/understudies
.github/CODEOWNERS @topepo @juliasilge

--------------------------------------------------------------------------------
/.github/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
  community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or advances of
  any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
  without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at codeofconduct@posit.co.
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series of
actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within the
community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
<https://www.contributor-covenant.org/version/2/1/code_of_conduct.html>.

Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion).

For answers to common questions about this code of conduct, see the FAQ at
<https://www.contributor-covenant.org/faq>. Translations are available at
<https://www.contributor-covenant.org/translations>.

[homepage]: https://www.contributor-covenant.org

--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check-hard.yaml:
--------------------------------------------------------------------------------
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
#
# NOTE: This workflow only directly installs "hard" dependencies, i.e. Depends,
# Imports, and LinkingTo dependencies. Notably, Suggests dependencies are never
# installed, with the exception of testthat, knitr, and rmarkdown. The cache is
# never used to avoid accidentally restoring a cache containing a suggested
# dependency.
on:
  push:
    branches: [main, master]
  pull_request:

name: R-CMD-check-hard.yaml

permissions: read-all

jobs:
  check-no-suggests:
    runs-on: ${{ matrix.config.os }}

    name: ${{ matrix.config.os }} (${{ matrix.config.r }})

    strategy:
      fail-fast: false
      matrix:
        config:
          - {os: ubuntu-latest, r: 'release'}

    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
      R_KEEP_PKG_SOURCE: yes

    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-pandoc@v2

      - uses: r-lib/actions/setup-r@v2
        with:
          r-version: ${{ matrix.config.r }}
          http-user-agent: ${{ matrix.config.http-user-agent }}
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          dependencies: '"hard"'
          cache: false
          extra-packages: |
            any::rcmdcheck
            any::testthat
            any::knitr
            any::rmarkdown
          needs: check

      - uses: r-lib/actions/check-r-package@v2
        with:
          upload-snapshots: true
          build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")'

--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
#
# NOTE: This workflow is overkill for most R packages and
# check-standard.yaml is likely a better choice.
# usethis::use_github_action("check-standard") will install it.
on:
  push:
    branches: [main, master]
  pull_request:

name: R-CMD-check.yaml

permissions: read-all

jobs:
  R-CMD-check:
    runs-on: ${{ matrix.config.os }}

    name: ${{ matrix.config.os }} (${{ matrix.config.r }})

    strategy:
      fail-fast: false
      matrix:
        config:
          - {os: macos-latest, r: 'release'}

          - {os: windows-latest, r: 'release'}
          # use 4.0 or 4.1 to check with rtools40's older compiler
          - {os: windows-latest, r: 'oldrel-4'}

          - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
          - {os: ubuntu-latest, r: 'release'}
          - {os: ubuntu-latest, r: 'oldrel-1'}
          - {os: ubuntu-latest, r: 'oldrel-2'}
          - {os: ubuntu-latest, r: 'oldrel-3'}
          - {os: ubuntu-latest, r: 'oldrel-4'}

    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
      R_KEEP_PKG_SOURCE: yes

    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-pandoc@v2

      - uses: r-lib/actions/setup-r@v2
        with:
          r-version: ${{ matrix.config.r }}
          http-user-agent: ${{ matrix.config.http-user-agent }}
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::rcmdcheck
          needs: check

      - uses: r-lib/actions/check-r-package@v2
        with:
          upload-snapshots: true
          build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")'

--------------------------------------------------------------------------------
/.github/workflows/lock.yaml:
--------------------------------------------------------------------------------
name: 'Lock Threads'

on:
  schedule:
    - cron: '0 0 * * *'

jobs:
  lock:
    runs-on: ubuntu-latest
    steps:
      - uses: dessant/lock-threads@v2
        with:
          github-token: ${{ github.token }}
          issue-lock-inactive-days: '14'
          # issue-exclude-labels: ''
          # issue-lock-labels: 'outdated'
          issue-lock-comment: >
            This issue has been automatically locked. If you believe you have
            found a related problem, please file a new issue (with a reprex:
            <https://reprex.tidyverse.org/>) and link to this issue.
          issue-lock-reason: ''
          pr-lock-inactive-days: '14'
          # pr-exclude-labels: 'wip'
          pr-lock-labels: ''
          pr-lock-comment: >
            This pull request has been automatically locked. If you believe you
            have found a related problem, please file a new issue (with a reprex:
            <https://reprex.tidyverse.org/>) and link to this issue.
          pr-lock-reason: ''
          # process-only: 'issues'

--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
  push:
    branches: [main, master]
  pull_request:
  release:
    types: [published]
  workflow_dispatch:

name: pkgdown.yaml

permissions: read-all

jobs:
  pkgdown:
    runs-on: ubuntu-latest
    # Only restrict concurrency for non-PR jobs
    concurrency:
      group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-pandoc@v2

      - uses: r-lib/actions/setup-r@v2
        with:
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::pkgdown, local::.
          needs: website

      - name: Build site
        run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
        shell: Rscript {0}

      - name: Deploy to GitHub pages 🚀
        if: github.event_name != 'pull_request'
        uses: JamesIves/github-pages-deploy-action@v4.5.0
        with:
          clean: false
          branch: gh-pages
          folder: docs

--------------------------------------------------------------------------------
/.github/workflows/pr-commands.yaml:
--------------------------------------------------------------------------------
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
  issue_comment:
    types: [created]

name: pr-commands.yaml

permissions: read-all

jobs:
  document:
    if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/document') }}
    name: document
    runs-on: ubuntu-latest
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/pr-fetch@v2
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}

      - uses: r-lib/actions/setup-r@v2
        with:
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::roxygen2
          needs: pr-document

      - name: Document
        run: roxygen2::roxygenise()
        shell: Rscript {0}

      - name: commit
        run: |
          git config --local user.name "$GITHUB_ACTOR"
          git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
          git add man/\* NAMESPACE
          git commit -m 'Document'

      - uses: r-lib/actions/pr-push@v2
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}

  style:
    if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/style') }}
    name: style
    runs-on: ubuntu-latest
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/pr-fetch@v2
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}

      - uses: r-lib/actions/setup-r@v2

      - name: Install dependencies
        run: install.packages("styler")
        shell: Rscript {0}

      - name: Style
        run: styler::style_pkg()
        shell: Rscript {0}
      - name: commit
        run: |
          git config --local user.name "$GITHUB_ACTOR"
          git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
          git add \*.R
          git commit -m 'Style'

      - uses: r-lib/actions/pr-push@v2
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}

--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
  push:
    branches: [main, master]
  pull_request:

name: test-coverage.yaml

permissions: read-all

jobs:
  test-coverage:
    runs-on: ubuntu-latest
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-r@v2
        with:
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::covr, any::xml2
          needs: coverage

      - name: Test coverage
        run: |
          cov <- covr::package_coverage(
            quiet = FALSE,
            clean = FALSE,
            install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package")
          )
          print(cov)
          covr::to_cobertura(cov)
        shell: Rscript {0}

      - uses: codecov/codecov-action@v5
        with:
          # Fail if error if not on PR, or if on PR and token is given
          fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }}
          files: ./cobertura.xml
          plugins: noop
          disable_search: true
          token: ${{ secrets.CODECOV_TOKEN }}

      - name: Show testthat output
        if: always()
        run: |
          ## --------------------------------------------------------------------
          find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true
        shell: bash

      - name: Upload test results
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: coverage-test-failures
          path: ${{ runner.temp }}/package

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# History files
.Rhistory
.Rapp.history

# Session Data files
.RData

# Example code in package build process
*-Ex.R

# Output files from R CMD build
/*.tar.gz

# Output files from R CMD check
/*.Rcheck/

# RStudio files
.Rproj.user/

# produced vignettes
vignettes/*.html
vignettes/*.pdf

# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth

# knitr and R markdown default cache directories
/*_cache/
/cache/

# Temporary files created by R markdown
*.utf8.md
*.knit.md

# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
rsconnect/
.DS_Store

--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
{
  "recommendations": [
    "Posit.air-vscode"
  ]
}

--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
{
  "[r]": {
    "editor.formatOnSave": true,
    "editor.defaultFormatter": "Posit.air-vscode"
  }
}

--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
Package: modeldata
Title: Data Sets Useful for Modeling Examples
Version: 1.4.0.9000
Authors@R: c(
    person("Max", "Kuhn", , "max@posit.co", role = c("aut", "cre")),
    person("Posit Software, PBC", role = c("cph", "fnd"),
           comment = c(ROR = "03wc8by49"))
  )
Description: Data sets used for demonstrating or testing model-related
    packages are contained in this package.
License: MIT + file LICENSE
URL: https://modeldata.tidymodels.org,
    https://github.com/tidymodels/modeldata
BugReports: https://github.com/tidymodels/modeldata/issues
Depends:
    R (>= 4.1)
Imports:
    dplyr,
    MASS,
    purrr,
    rlang,
    tibble
Suggests:
    covr,
    ggplot2,
    testthat (>= 3.0.0)
Config/Needs/website: tidyverse/tidytemplate, tidymodels/tidymodels
Config/testthat/edition: 3
Config/usethis/last-upkeep: 2025-04-27
Encoding: UTF-8
LazyData: true
LazyDataCompression: xz
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
YEAR: 2025
COPYRIGHT HOLDER: modeldata authors

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
# MIT License

Copyright (c) 2025 modeldata authors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
# Generated by roxygen2: do not edit by hand

export(sim_classification)
export(sim_logistic)
export(sim_multinomial)
export(sim_noise)
export(sim_regression)
importFrom(stats,rnorm)
importFrom(stats,runif)

--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
# modeldata (development version)

* Added a new regression simulation method via `method = "worley_1987"`.

* Transitioned from the magrittr pipe to the base R pipe.

# modeldata 1.4.0

* Added the `cat_adoption` data set.

# modeldata 1.3.0

* Added the `deliveries` data set.

# modeldata 1.2.0

* New data sets

  - `chem_proc_yield` (regression)
  - `hepatic_injury_qsar` (ordinal classification)
  - `hotel_rates` (regression)
  - `ischemic_stroke` (classification)
  - `leaf_id_flavia` (classification)
  - `permeability_qsar` (regression)
  - `steroidogenic_toxicity` (classification)
  - `taxi` (classification)

* The simulation equation for Hooker (2004) was slightly incorrect and has been corrected.

# modeldata 1.1.0

* Added a `keep_truth` argument to the supervised simulation functions. This retains the column that defines the error-free simulated value of the outcome. This numeric column is called `.truth`.

* New simulation functions were added:

  * `sim_logistic()` and `sim_multinomial()` were added.

  * A method for Hooker (2004) was added for `sim_regression()`.

# modeldata 1.0.1

* Small update to fix HTML for CRAN.

# modeldata 1.0.0

* Added a set of regression and classification simulation functions (#273).

* Remove OkCupid data, including text data, because of privacy concerns.

# modeldata 0.1.1

* Add Tate Gallery modern artwork metadata.

* Deprecate OkCupid data, including text data, because of concerns about such data, such as the ability to identify individuals.

# modeldata 0.1.0

* Add the grant acceptance data from Kuhn and Johnson (2013) (_Applied Predictive Modeling_).

* The `crickets` data from Chapter 3 of [`tmwr.org`](https://www.tmwr.org/base-r.html#an-example) were added.

# modeldata 0.0.2

* The bivariate dataset was missing; this has been corrected (@mdogucu, #5).

* The [Ames](https://github.com/topepo/AmesHousing) and [penguin](https://github.com/allisonhorst/palmerpenguins) data sets were added.

# modeldata 0.0.1

* Added a `NEWS.md` file to track changes to the package.
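A minimal usage sketch for the simulation functions listed in NEWS.md and
exported in NAMESPACE above. The `method` and `keep_truth` arguments are taken
from the release notes; the `num_samples` and `num_vars` argument names are
assumptions, not confirmed by the files in this dump:

    library(modeldata)
    library(dplyr)

    # Regression simulation using the Worley (1987) method added in the
    # development version; keep_truth = TRUE retains the error-free outcome
    # in a numeric `.truth` column.
    reg_sim <- sim_regression(
      num_samples = 500,  # assumed argument name
      method = "worley_1987",
      keep_truth = TRUE
    )

    # The classification simulators follow the same pattern.
    cls_sim <- sim_classification(num_samples = 500)

    # sim_noise() appends uninformative predictors to a simulated set.
    noisy <- bind_cols(
      cls_sim,
      sim_noise(num_samples = nrow(cls_sim), num_vars = 10)
    )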
--------------------------------------------------------------------------------
/R/Chicago.R:
--------------------------------------------------------------------------------
#' Chicago ridership data
#'
#' @details These data are from Kuhn and Johnson (2020) and contain an
#' _abbreviated_ training set for modeling the number of people (in thousands)
#' who enter the Clark and Lake L station.
#'
#' The `date` column corresponds to the current date. The columns with station
#' names (`Austin` through `California`) are a _sample_ of the columns used in
#' the original analysis (for file size reasons). These are 14-day lag
#' variables (i.e., `date - 14 days`). There are columns related to weather and
#' sports team schedules.
#'
#' The station at 35th and Archer is contained in the column `Archer_35th` to
#' make it a valid R column name.
#'
#'
#' @name Chicago
#' @aliases Chicago stations
#' @docType data
#' @return \item{Chicago}{a tibble} \item{stations}{a vector of station names}
#'
#' @source Kuhn and Johnson (2020), _Feature Engineering and Selection_,
#' Chapman and Hall/CRC. \url{https://bookdown.org/max/FES/} and
#' \url{https://github.com/topepo/FES}
#'
#'
#' @keywords datasets
#' @examples
#' data(Chicago)
#' str(Chicago)
#' stations
NULL
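A short sketch of how `Chicago` and `stations` fit together, as described in
the details above; the outcome column name `ridership` is an assumption, not
shown in this file:

    library(modeldata)
    library(dplyr)

    # `stations` holds the names of the sampled 14-day lag predictor columns,
    # so they can be selected as a group alongside the date and the outcome.
    chicago_small <- Chicago |>
      select(ridership, date, all_of(stations))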
--------------------------------------------------------------------------------
/R/Smithsonian.R:
--------------------------------------------------------------------------------
#' Smithsonian museums
#'
#' Geocodes for the Smithsonian museums (circa 2018).
#'
#' @name Smithsonian
#' @aliases Smithsonian
#' @docType data
#' @return \item{Smithsonian}{a tibble}
#'
#' @source https://en.wikipedia.org/wiki/List_of_Smithsonian_museums
#'
#' @keywords datasets
#' @examples
#' data(Smithsonian)
#' str(Smithsonian)
NULL

--------------------------------------------------------------------------------
/R/ad_data.R:
--------------------------------------------------------------------------------
#' Alzheimer's disease data
#'
#' @details
#' Craig-Schapiro et al. (2011) describe a clinical study of 333 patients,
#' including some with mild (but well-characterized) cognitive impairment as
#' well as healthy individuals. CSF samples were taken from all subjects. The
#' goal of the study was to determine if subjects in the early stages of
#' impairment could be differentiated from cognitively healthy individuals.
#' Data collected on each subject included:
#' \itemize{
#' \item Demographic characteristics such as age and gender
#' \item Apolipoprotein E genotype
#' \item Protein measurements of Abeta, Tau, and a phosphorylated version of Tau (called pTau)
#' \item Protein measurements of 124 exploratory biomarkers, and
#' \item Clinical dementia scores
#' }
#'
#' For these analyses, we have converted the scores to two classes: impaired
#' and healthy. The goal of this analysis is to create classification models
#' using the demographic and assay data to predict which patients have early
#' stages of disease.
#'
#' @name ad_data
#' @aliases ad_data
#' @docType data
#' @return \item{ad_data}{a tibble}
#'
#' @source
#' Kuhn, M., Johnson, K. (2013) *Applied Predictive Modeling*, Springer.
#'
#' Craig-Schapiro R, Kuhn M, Xiong C, Pickering EH, Liu J, Misko TP, et al.
#' (2011) Multiplexed Immunoassay Panel Identifies Novel CSF Biomarkers for
#' Alzheimer's Disease Diagnosis and Prognosis. PLoS ONE 6(4): e18850.
#'
#'
#' @keywords datasets
#' @examples
#' data(ad_data)
#' str(ad_data)
NULL

--------------------------------------------------------------------------------
/R/ames.R:
--------------------------------------------------------------------------------
#' Ames Housing Data
#'
#' A data set from De Cock (2011) where 82 fields were recorded for 2,930
#' properties in Ames, IA. This version is copied from the `AmesHousing`
#' package but does not include a few quality columns that appear to be
#' outcomes rather than predictors.
#'
#' See the source links below for more information as well as
#' `?AmesHousing::make_ames`.
#'
#' @includeRmd man/rmd/ames.md details
#'
#' @name ames
#' @aliases ames
#' @docType data
#' @return \item{ames}{a tibble}
#' @source De Cock, D. (2011). "Ames, Iowa: Alternative to the Boston Housing Data as an End of Semester Regression Project," \emph{Journal of Statistics Education}, Volume 19, Number 3.
#'
#' \url{http://jse.amstat.org/v19n3/decock/DataDocumentation.txt}
#'
#' \url{http://jse.amstat.org/v19n3/decock.pdf}
#' @keywords datasets
#' @examples
#' data(ames)
#' str(ames)
NULL

--------------------------------------------------------------------------------
/R/attrition.R:
--------------------------------------------------------------------------------
#' Job attrition
#'
#' @details These data are from the IBM Watson Analytics Lab.
#' The website describes the data with \dQuote{Uncover the
#' factors that lead to employee attrition and explore important
#' questions such as \sQuote{show me a breakdown of distance
#' from home by job role and attrition} or \sQuote{compare
#' average monthly income by education and attrition}. This is a
#' fictional data set created by IBM data scientists.}. There
#' are 1470 rows.
#'
#' @name attrition
#' @aliases attrition
#' @docType data
#' @return \item{attrition}{a data frame}
#'
#' @source The IBM Watson Analytics Lab website https://www.ibm.com/communities/analytics/watson-analytics-blog/hr-employee-attrition/
#'
#'
#' @keywords datasets
#' @examples
#' data(attrition)
#' str(attrition)
NULL

--------------------------------------------------------------------------------
/R/biomass.R:
--------------------------------------------------------------------------------
#' Biomass data
#'
#' Ghugare et al (2013) contains a data set where different biomass fuels are
#' characterized by the amount of certain molecules (carbon, hydrogen, oxygen,
#' nitrogen, and sulfur) and the corresponding higher heating value (HHV).
#' These data are from their Table S.2 of the Supplementary Materials.
#'
#' @name biomass
#' @aliases biomass
#' @docType data
#' @return \item{biomass}{a data frame}
#'
#' @source Ghugare, S. B., Tiwary, S., Elangovan, V., and Tambe, S. S. (2013).
#' Prediction of Higher Heating Value of Solid Biomass Fuels Using Artificial
#' Intelligence Formalisms. *BioEnergy Research*, 1-12.
#'
#' @keywords datasets
#' @examples
#' data(biomass)
#' str(biomass)
NULL

--------------------------------------------------------------------------------
/R/bivariate.R:
--------------------------------------------------------------------------------
#' Example bivariate classification data
#'
#' @details These data are a simplified version of the segmentation data contained
#' in `caret`. There are three columns: `A` and `B` are predictors and the column
#' `Class` is a factor with levels "One" and "Two". There are three data sets:
#' one for training (n = 1009), validation (n = 300), and testing (n = 710).
#'
#' @name bivariate
#' @aliases bivariate_train bivariate_test bivariate_val
#' @docType data
#' @return \item{bivariate_train, bivariate_test, bivariate_val}{tibbles}
#'
#' @keywords datasets
#' @examples
#' data(bivariate)
#' str(bivariate_train)
#' str(bivariate_val)
#' str(bivariate_test)
NULL

--------------------------------------------------------------------------------
/R/car_prices.R:
--------------------------------------------------------------------------------
#' Kelley Blue Book resale data for 2005 model year GM cars
#'
#' Kuiper (2008) collected Kelley Blue Book resale data for 804 GM cars (2005 model year).
#'
#' @name car_prices
#' @docType data
#' @return \item{car_prices}{data frame of the suggested retail price (column \code{Price}) and various
#' characteristics of each car (columns \code{Mileage}, \code{Cylinder}, \code{Doors}, \code{Cruise},
#' \code{Sound}, \code{Leather}, \code{Buick}, \code{Cadillac}, \code{Chevy}, \code{Pontiac}, \code{Saab},
#' \code{Saturn}, \code{convertible}, \code{coupe}, \code{hatchback}, \code{sedan} and \code{wagon})}
#' @source Kuiper, S. (2008). Introduction to Multiple Regression: How Much Is Your Car Worth?,
#' \emph{Journal of Statistics Education}, Vol. 16
#' \url{http://jse.amstat.org/jse_archive.htm#2008}.
#' @keywords datasets
#' @examples
#' data(car_prices)
#' str(car_prices)
NULL

--------------------------------------------------------------------------------
/R/cat_adoption.R:
--------------------------------------------------------------------------------
#' Cat Adoption
#'
#' @description
#' A subset of the cats at the animal shelter in Long Beach, California, USA.
#'
#' @return tibble
#' @aliases cat_adoption
#' @name cat_adoption
#' @docType data
#' @details
#'
#' A data frame with 2257 rows and 19 columns:
#' \describe{
#' \item{time}{The time the cat spent at the shelter.}
#' \item{event}{The event of interest is the cat being homed or returned to
#' its original location (i.e., owner or community). The non-event is the cat
#' being transferred to another shelter or dying. Zero indicates a non-event
#' (censored), and one corresponds to the event occurring.}
#' \item{sex}{The sex of the cat.}
#' \item{neutered}{Whether the cat is neutered.}
#' \item{intake_condition}{The intake condition of the cat.}
#' \item{intake_type}{The type of intake.}
#' \item{latitude}{Latitude of the intersection/cross street of intake or capture.}
#' \item{longitude}{Longitude of the intersection/cross street of intake or capture.}
#' \item{black,brown,brown_tabby,calico,cream,gray,gray_tabby,orange,orange_tabby,tan,tortie,white}{Indicators for the color/pattern of the cat's fur.}
#' }
#' @source
#' <https://data.longbeach.gov/explore/dataset/animal-shelter-intakes-and-outcomes/information/>
#'
#' on 2024-06-17
#'
#' @examples
#' str(cat_adoption)
#' @keywords datasets
NULL
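The `time`/`event` encoding described above follows the usual right-censoring
convention (0 = censored, 1 = event), so the data drop directly into a survival
analysis. A sketch, assuming the survival package (not a dependency of
modeldata) is installed:

    library(modeldata)
    library(survival)

    # event = 1: homed or returned; event = 0: transferred or died (censored).
    # This matches Surv()'s default 0/1 status coding.
    fit <- survfit(Surv(time, event) ~ sex, data = cat_adoption)
    summary(fit, times = c(30, 60, 90))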
--------------------------------------------------------------------------------
/R/cells.R:
--------------------------------------------------------------------------------
#' Cell body segmentation
#'
#' Hill, LaPan, Li and Haney (2007) develop models to predict which cells in a
#' high content screen were well segmented. The data consists of 119 imaging
#' measurements on 2019 cells. The original analysis used 1009 for training and
#' 1010 as a test set (see the column called \code{case}).
#'
#' The outcome class is contained in a factor variable called \code{class} with
#' levels "PS" for poorly segmented and "WS" for well segmented.
#'
#' The raw data used in the paper can be found at the BioMed Central website.
#' The version contained in \code{cells} is modified. First, several discrete
#' versions of some of the predictors (with the suffix "Status") were removed.
#' Second, there are several skewed predictors with minimum values of zero
#' (that would benefit from some transformation, such as the log). A constant
#' value of 1 was added to these fields: \code{avg_inten_ch_2},
#' \code{fiber_align_2_ch_3}, \code{fiber_align_2_ch_4}, \code{spot_fiber_count_ch_4} and
#' \code{total_inten_ch_2}.
#'
#' @name cells
#' @docType data
#' @return \item{cells}{a tibble}
#' @source Hill, LaPan, Li and Haney (2007). Impact of image segmentation on
#' high-content screening data quality for SK-BR-3 cells, \emph{BMC
#' Bioinformatics}, Vol. 8, pg. 340,
#' \url{https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-8-340}.
#' @keywords datasets
#' @examples
#' data(cells)
#' str(cells)
NULL

--------------------------------------------------------------------------------
/R/check_times.R:
--------------------------------------------------------------------------------
#' Execution time data
#'
#' These data were collected from the CRAN web page for 13,626 R
#' packages. The time to complete the standard package checking
#' routine was collected. In some cases, the package checking
#' process is stopped due to errors and these data are treated as
#' censored; this affects less than 1 percent of the packages.
#'
#' The associated package source code was downloaded and parsed
#' to create predictors, including
#'
#' * `authors`: The number of authors in the author field.
#' * `imports`: The number of imported packages.
#' * `suggests`: The number of packages suggested.
#' * `depends`: The number of hard dependencies.
#' * `Roxygen`: a binary indicator for whether Roxygen was used
#'    for documentation.
#' * `gh`: a binary indicator for whether the URL field contained
#'    a GitHub link.
#' * `rforge`: a binary indicator for whether the URL field
#'    contained a link to R-forge.
#' * `descr`: The number of characters (or, in some cases, bytes)
#'    in the description field.
#' * `r_count`: The number of R files in the R directory.
#' * `r_size`: The total disk size of the R files.
#' * `ns_import`: Estimated number of imported functions or methods.
#' * `ns_export`: Estimated number of exported functions or methods.
#' * `s3_methods`: Estimated number of S3 methods.
#' * `s4_methods`: Estimated number of S4 methods.
#' * `doc_count`: How many Rmd or Rnw files in the vignettes
#'    directory.
#' * `doc_size`: The disk size of the Rmd or Rnw files.
#' * `src_count`: The number of files in the `src` directory.
#' * `src_size`: The size on disk of files in the `src` directory.
#' * `data_count`: The number of files in the `data` directory.
#' * `data_size`: The size on disk of files in the `data` directory.
#' * `testthat_count`: The number of files in the `testthat`
#'    directory.
#' * `testthat_size`: The size on disk of files in the `testthat`
#'    directory.
#' * `check_time`: The time (in seconds) to run `R CMD check`
#'    using the `r-devel-windows-ix86+x86_64` flavor.
#' * `status`: An indicator for whether the tests completed.
#'
#' Data were collected on 2019-01-20.
#' @name check_times
#' @aliases check_times
#' @docType data
#' @return \item{check_times}{a data frame}
#'
#' @source CRAN
#'
#' @keywords datasets
#' @examples
#' data(check_times)
#' str(check_times)
NULL

--------------------------------------------------------------------------------
/R/chem_proc_yield.R:
--------------------------------------------------------------------------------
#' Chemical manufacturing process data set
#'
#' @description
#' A data set that models yield as a function of biological material predictors
#' and chemical structure predictors.
#'
#' @name chem_proc_yield
#' @aliases chem_proc_yield
#' @docType data
#' @return \item{chem_proc_yield}{a tibble}
#'
#' @details
#' This data set contains information about a chemical manufacturing
#' process, in which the goal is to understand the relationship between
#' the process and the resulting final product yield. Raw material in
#' this process is put through a sequence of 27 steps to generate the
#' final pharmaceutical product. The starting material is generated from
#' a biological unit and has a range of quality and characteristics. The
#' objective in this project was to develop a model to predict percent
#' yield of the manufacturing process. The data set consisted of 177
#' samples of biological material for which 57 characteristics were
#' measured. Of the 57 characteristics, there were 12 measurements of
#' the biological starting material, and 45 measurements of the
#' manufacturing process. The process variables included measurements
#' such as temperature, drying time, washing time, and concentrations of
#' by-products at various steps. Some of the process measurements can
#' be controlled, while others are observed.
#' Predictors are continuous, count, and categorical; some are correlated,
#' and some contain missing values. Samples are not independent because
#' sets of samples come from the same batch of biological starting material.
#'
#' Columns:
#' \itemize{
#' \item \code{yield}: numeric
#' \item \code{bio_material_01} - \code{bio_material_12}: numeric
#' \item \code{man_proc_01} - \code{man_proc_45}: numeric
#' }
#' @source
#' Kuhn, Max, and Kjell Johnson. _Applied predictive modeling_. New York:
#' Springer, 2013.
#'
#' @examples
#' data(chem_proc_yield)
#' str(chem_proc_yield)
#'
NULL

--------------------------------------------------------------------------------
/R/churn.R:
--------------------------------------------------------------------------------
#' Customer churn data
#'
#' A data set from the MLC++ machine learning software for modeling customer
#' churn. There are 19 predictors, mostly numeric: `state` (categorical),
#' `account_length`, `area_code`, `international_plan` (yes/no),
#' `voice_mail_plan` (yes/no), `number_vmail_messages`,
#' `total_day_minutes`, `total_day_calls`, `total_day_charge`,
#' `total_eve_minutes`, `total_eve_calls`, `total_eve_charge`,
#' `total_night_minutes`, `total_night_calls`,
#' `total_night_charge`, `total_intl_minutes`,
#' `total_intl_calls`, `total_intl_charge`, and
#' `number_customer_service_calls`.
#'
#' The outcome is contained in a column called `churn` (also yes/no).
#' A note in one of the source files states that the data are "artificial based
#' on claims similar to real world".
#'
#' @name mlc_churn
#' @aliases mlc_churn
#' @docType data
#' @return \item{mlc_churn}{a tibble}
#' @source Originally at `http://www.sgi.com/tech/mlc/`
#' @keywords datasets
#' @examples
#' data(mlc_churn)
#' str(mlc_churn)
NULL

--------------------------------------------------------------------------------
/R/concrete.R:
--------------------------------------------------------------------------------
#' Compressive strength of concrete mixtures
#'
#' Yeh (2006) describes an aggregated data set for experimental designs used to
#' test the compressive strength of concrete mixtures. The data are used by
#' Kuhn and Johnson (2013).
#'
#'
#' @name concrete
#' @aliases concrete
#' @docType data
#' @return \item{concrete}{a tibble}
#' @keywords datasets
#' @source
#' Yeh I (2006). "Analysis of Strength of Concrete Using Design of Experiments
#' and Neural Networks." *Journal of Materials in Civil Engineering*, 18, 597-604.
#'
#' Kuhn, M., Johnson, K. (2013) *Applied Predictive Modeling*, Springer.
#' @examples
#' data(concrete)
#' str(concrete)
NULL

--------------------------------------------------------------------------------
/R/covers.R:
--------------------------------------------------------------------------------
#' Raw cover type data
#'
#' These data are raw data describing different forest cover types
#' from the UCI Machine Learning Database (see link below). There is one
#' column in the data that has a few different pieces of textual
#' information (of variable lengths).
#'
#' @name covers
#' @aliases covers
#' @docType data
#' @return \item{covers}{a data frame}
#'
#' @source https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info
#'
#' @keywords datasets
#' @examples
#' data(covers)
#' str(covers)
NULL

--------------------------------------------------------------------------------
/R/credit_data.R:
--------------------------------------------------------------------------------
#' Credit data
#'
#' These data are from the website of Dr. Lluís A. Belanche Muñoz by way of a
#' github repository of Dr. Gaston Sanchez. One data point with a missing
#' outcome was removed from the original data.
#'
#' @name credit_data
#' @aliases credit_data
#' @docType data
#' @return \item{credit_data}{a data frame}
#'
#' @source https://github.com/gastonstat/CreditScoring,
#' http://bit.ly/2kkBFrk
#'
#' @keywords datasets
#' @examples
#' data(credit_data)
#' str(credit_data)
NULL

--------------------------------------------------------------------------------
/R/crickets.R:
--------------------------------------------------------------------------------
#' Rates of Cricket Chirps
#'
#' These data are from McDonald (2009), by way of Mangiafico (2015), on
#' the relationship between the ambient temperature and the rate of cricket
#' chirps per minute. Data were collected for two species of the genus _Oecanthus_: _O. exclamationis_
#' and _O. niveus_. The data are contained in a data frame called `crickets` with
#' a total of 31 data points.
#'
#' @name crickets
#' @aliases crickets
#' @docType data
#' @return \item{crickets}{a tibble}
#' @source Mangiafico, S. 2015. "An R Companion for the Handbook of Biological
#' Statistics." \url{https://rcompanion.org/handbook/}.
#'
#' McDonald, J. 2009. _Handbook of Biological Statistics_. Sparky House Publishing.
#' @keywords datasets
#' @examples
#' data(crickets)
#' str(crickets)
NULL

--------------------------------------------------------------------------------
/R/deliveries.R:
--------------------------------------------------------------------------------
#' Food Delivery Time Data
#'
#' @details
#' These data are from a study of food delivery times in minutes (i.e., the time from the
#' initial order to receiving the food) for a single restaurant. The data
#' contains 10,012 orders from a specific restaurant. The predictors include:
#' \itemize{
#' \item The time, in decimal hours, of the order.
#' \item The day of the week for the order.
#' \item The approximate distance in miles between the restaurant and the delivery
#' location.
#' \item A set of 27 predictors that count the number of distinct menu items
#' in the order.
#' }
#'
#' No times are censored.
#'
#' @name deliveries
#' @aliases deliveries
#' @docType data
#' @return \item{deliveries}{a tibble}
#'
#' @keywords datasets
#' @examples
#' data(deliveries)
#' str(deliveries)
NULL
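A quick way to look at the order-time effect described above; a sketch only,
since the column names (`time_to_delivery`, `hour`, `day`) are assumptions not
shown in this file (ggplot2 is listed in Suggests):

    library(modeldata)
    library(ggplot2)

    # Delivery time in minutes against the decimal hour of the order, with a
    # separate trend line per day of the week.
    ggplot(deliveries, aes(hour, time_to_delivery, col = day)) +
      geom_smooth(se = FALSE)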
--------------------------------------------------------------------------------
/R/drinks.R:
--------------------------------------------------------------------------------
#' Sample time series data
#'
#' @details Drink sales. The exact name of the series from FRED is:
#' "Merchant Wholesalers, Except Manufacturers' Sales Branches and Offices
#' Sales: Nondurable Goods: Beer, Wine, and Distilled Alcoholic Beverages Sales"
#'
#' @name drinks
#' @aliases drinks
#' @docType data
#' @return \item{drinks}{a tibble}
#'
#' @source The Federal Reserve Bank of St. Louis website https://fred.stlouisfed.org/series/S4248SM144NCEN
#'
#' @keywords datasets
#' @examples
#' data(drinks)
#' str(drinks)
NULL

--------------------------------------------------------------------------------
/R/fine_foods.R:
--------------------------------------------------------------------------------
#' Fine foods example data
#'
#' @details
#' These data are from Amazon, who describe it as "This dataset consists of
#' reviews of fine foods from amazon. The data span a period of more than 10
#' years, including all ~500,000 reviews up to October 2012. Reviews include
#' product and user information, ratings, and a plaintext review."
#'
#' A subset of the data are contained here and are split into a training and
#' test set. The training set sampled 10 products and retained all of their
#' individual reviews. Since the reviews within these products are correlated,
#' we recommend resampling the data using a leave-one-product-out approach. The
#' test set sampled 500 products that were not included in the training set
#' and selected a single review at random for each.
#'
#' There is a column for the product, a column for the text of the review, and
#' a factor column for a class variable. The outcome is whether the reviewer
#' gave the product a 5-star rating or not.
#'
#' @name small_fine_foods
#' @aliases small_fine_foods training_data testing_data
#' @docType data
#' @return \item{training_data,testing_data}{tibbles}
#'
#' @source https://snap.stanford.edu/data/web-FineFoods.html
#'
#'
#' @keywords datasets
#' @examples
#' data(small_fine_foods)
#' str(training_data)
#' str(testing_data)
NULL
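The leave-one-product-out resampling recommended above maps onto grouped
cross-validation. A sketch, assuming the product column is named `product` and
using the rsample package (not a dependency of modeldata):

    library(modeldata)
    library(rsample)

    # Each resample holds out all reviews for one group of products, so the
    # correlated reviews of a product never span the analysis and assessment
    # sets. With 10 products and v = 10, this is leave-one-product-out.
    folds <- group_vfold_cv(training_data, group = product, v = 10)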
--------------------------------------------------------------------------------
/R/grants.R:
--------------------------------------------------------------------------------
#' Grant acceptance data
#'
#' A data set related to the success or failure of academic grants.
#'
#' The data are discussed in Kuhn and Johnson (2013):
#'
#' "These data are from a 2011 Kaggle competition sponsored by the University
#' of Melbourne where there was interest in predicting whether or not a grant
#' application would be accepted. Since public funding of grants had decreased
#' over time, triaging grant applications based on their likelihood of success
#' could be important for estimating the amount of potential funding to the
#' university. In addition to predicting grant success, the university sought
#' to understand factors that were important in predicting success."
#'
#' The data ranged from 2005 to 2008 and the data spending strategy was
#' driven by the date of the grant. Kuhn and Johnson (2013) describe:
#'
#' "The compromise taken here is to build models on the pre-2008 data and
#' tune them by evaluating a random sample of 2,075 grants from 2008. Once the
#' optimal parameters are determined, the final model is built using these
#' parameters and the entire training set (i.e., the data prior to 2008 and the
#' additional 2,075 grants). A small holdout set of 518 grants from 2008 will
#' be used to ensure that no gross methodology errors occur from repeatedly
#' evaluating the 2008 data during model tuning. In the text, this set of
#' samples is called the 2008 holdout set. This small set of year 2008
#' grants will be referred to as the test set and will not be evaluated until
#' a set of candidate models are identified."
#'
#' To emulate this, `grants_other` contains the training (pre-2008, n = 6,633)
#' and holdout/validation data (2008, n = 1,557). `grants_test` has 518 grant
#' samples from 2008. The object `grants_2008` is an integer vector that can
#' be used to separate the modeling and holdout/validation sets.
#'
#'
#' @name grants
#' @aliases grants_other grants_test grants_2008
#' @docType data
#' @return \item{grants_other,grants_test,grants_2008}{two tibbles and an integer
#' vector of data points used for training}
#' @source Kuhn and Johnson (2013). _Applied Predictive Modeling_. Springer.
#' @keywords datasets
#' @examples
#' data(grants)
#' str(grants_other)
#' str(grants_test)
#' str(grants_2008)
NULL
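A sketch of the split described above, using plain subsetting. It assumes that
`grants_2008` indexes the rows of `grants_other` that form the 2008
holdout/validation set; check that orientation against the data before use:

    library(modeldata)

    # Assumed: grants_2008 marks the 2008 holdout/validation rows.
    grants_val   <- grants_other[grants_2008, ]
    grants_train <- grants_other[-grants_2008, ]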
38 | #' 39 | #' @examples 40 | #' data(hepatic_injury_qsar) 41 | #' str(hepatic_injury_qsar) 42 | #' 43 | NULL 44 | -------------------------------------------------------------------------------- /R/hotel_rates.R: -------------------------------------------------------------------------------- 1 | #' Daily Hotel Rate Data 2 | #' 3 | #' @description 4 | #' A data set to predict the average daily rate for a hotel in Lisbon, Portugal. 5 | #' 6 | #' @name hotel_rates 7 | #' @aliases hotel_rates 8 | #' @docType data 9 | #' 10 | #' @details 11 | #' 12 | #' Data are originally described in Antonio, de Almeida, and Nunes (2019). 13 | #' This version of the data is filtered for one hotel (the "Resort Hotel") and 14 | #' is intended as a regression data set for predicting the average daily rate for 15 | #' a room. The data are post-2016; the 2016 data were used to compute a predictor 16 | #' for the historical daily rates. See the `hotel_rates.R` file in the 17 | #' `data-raw` directory of the package to understand other filters used when 18 | #' creating this version of the data. 19 | #' 20 | #' The `agent` and `company` fields were changed from random characters to use 21 | #' a set of random names. 22 | #' 23 | #' The outcome column is `avg_price_per_room`. 24 | #' 25 | #' ## License 26 | #' 27 | #' No license was given for the data; see the reference below for the source. 28 | #' 29 | #' @source 30 | #' \url{https://github.com/rfordatascience/tidytuesday/tree/master/data/2020/2020-02-11} 31 | #' 32 | #' @references 33 | #' Antonio, N., de Almeida, A., and Nunes, L. (2019). Hotel booking demand 34 | #' datasets. _Data in Brief_, 22, 41-49. 35 | #' 36 | #' @keywords datasets 37 | #' @examples 38 | #' \dontrun{ 39 | #' str(hotel_rates) 40 | #' } 41 | NULL 42 | -------------------------------------------------------------------------------- /R/hpc_cv.R: -------------------------------------------------------------------------------- 1 | #' Class probability predictions 2 | #' 3 | #' @details This data frame contains the predicted classes and 4 | #' class probabilities for a linear discriminant analysis model fit 5 | #' to the HPC data set from Kuhn and Johnson (2013). These data are 6 | #' the assessment sets from a 10-fold cross-validation scheme. The 7 | #' data contain columns for the true class (`obs`), the class 8 | #' prediction (`pred`), and columns for each class probability 9 | #' (columns `VF`, `F`, `M`, and `L`). Additionally, a column for 10 | #' the resample indicator is included. 11 | #' 12 | #' @name hpc_cv 13 | #' @aliases hpc_cv 14 | #' @docType data 15 | #' @return \item{hpc_cv}{a data frame} 16 | #' 17 | #' @source Kuhn, M., Johnson, K. (2013) *Applied Predictive 18 | #' Modeling*, Springer 19 | #' 20 | #' @keywords datasets 21 | #' @examples 22 | #' data(hpc_cv) 23 | #' str(hpc_cv) 24 | NULL 25 | -------------------------------------------------------------------------------- /R/hpc_data.R: -------------------------------------------------------------------------------- 1 | #' High-performance computing system data 2 | #' 3 | #' Kuhn and Johnson (2013) describe a data set where characteristics of unix 4 | #' jobs were used to classify their completion times as either very fast 5 | #' (1 min or less, `VF`), fast (1–5 min, `F`), moderate (5–30 min, `M`), or 6 | #' long (greater than 30 min, `L`). 7 | #' 8 | #' 9 | #' @name hpc_data 10 | #' @aliases hpc_data 11 | #' @docType data 12 | #' @return \item{hpc_data}{a tibble} 13 | #' @keywords datasets 14 | #' @source 15 | #' Kuhn, M., Johnson, K.
(2013) *Applied Predictive Modeling*, Springer. 16 | #' @examples 17 | #' 18 | #' data(hpc_data) 19 | #' str(hpc_data) 20 | NULL 21 | -------------------------------------------------------------------------------- /R/ischemic_stroke.R: -------------------------------------------------------------------------------- 1 | #' Clinical data used to predict ischemic stroke 2 | #' 3 | #' @description 4 | #' A data set to predict a binary outcome using imaging and patient data. 5 | #' 6 | #' @name ischemic_stroke 7 | #' @aliases ischemic_stroke 8 | #' @docType data 9 | #' @return \item{ischemic_stroke}{a tibble} 10 | #' 11 | #' @details 12 | #' These data were gathered to predict patient risk for ischemic stroke. A 13 | #' historical set of patients with a range of carotid artery blockages were 14 | #' selected. The data consisted of 126 patients, 44 of whom had blockages 15 | #' greater than 70%. All patients had undergone Computed Tomography Angiography 16 | #' (CTA) to generate a detailed three-dimensional visualization and 17 | #' characterization of the blockage. These images were then analyzed in order to 18 | #' compute several features related to the disease, including percent stenosis, 19 | #' arterial wall thickness, and tissue characteristics such as lipid-rich 20 | #' necrotic core and calcification. 21 | #' 22 | #' The group of patients in this study also had follow-up information on 23 | #' whether or not a stroke occurred at a subsequent point in time. The data for 24 | #' each patient also included commonly collected clinical characteristics for 25 | #' risk of stroke such as whether or not the patient had atrial fibrillation, 26 | #' coronary artery disease, and a history of smoking. Demographics of gender and 27 | #' age were included as well. These readily available risk factors can be 28 | #' thought of as another potentially useful predictor set that can be evaluated. 29 | #' In fact, this set of predictors should be evaluated first to assess their 30 | #' ability to predict stroke since these predictors are easy to collect, are 31 | #' acquired at patient presentation, and do not require an expensive imaging 32 | #' technique.
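#'
#' As a minimal sketch of that baseline comparison (an illustration, not part
#' of the original study), a logistic regression could be fit on the clinical
#' predictors listed below:
#' \preformatted{
#' data(ischemic_stroke)
#' # baseline model using only the readily available clinical risk factors
#' baseline_fit <- glm(
#'   I(stroke == "yes") ~ age + male + smoking_history +
#'     atrial_fibrillation + coronary_artery_disease,
#'   data = ischemic_stroke,
#'   family = binomial
#' )
#' }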
33 | #' 34 | #' Columns: 35 | #' \itemize{ 36 | #' \item \code{stroke}: factor (levels: 'yes' and 'no') 37 | #' \item \code{nascet_scale}: numeric 38 | #' \item \code{calc_vol}: numeric 39 | #' \item \code{calc_vol_prop}: numeric 40 | #' \item \code{matx_vol}: numeric 41 | #' \item \code{matx_vol_prop}: numeric 42 | #' \item \code{lrnc_vol}: numeric 43 | #' \item \code{lrnc_vol_prop}: numeric 44 | #' \item \code{max_calc_area}: numeric 45 | #' \item \code{max_calc_area_prop}: numeric 46 | #' \item \code{max_dilation_by_area}: numeric 47 | #' \item \code{max_matx_area}: numeric 48 | #' \item \code{max_matx_area_prop}: numeric 49 | #' \item \code{max_lrnc_area}: numeric 50 | #' \item \code{max_lrnc_area_prop}: numeric 51 | #' \item \code{max_max_wall_thickness}: numeric 52 | #' \item \code{max_remodeling_ratio}: numeric 53 | #' \item \code{max_stenosis_by_area}: numeric 54 | #' \item \code{max_wall_area}: numeric 55 | #' \item \code{wall_vol}: numeric 56 | #' \item \code{max_stenosis_by_diameter}: numeric 57 | #' \item \code{age}: integer 58 | #' \item \code{male}: integer 59 | #' \item \code{smoking_history}: integer 60 | #' \item \code{atrial_fibrillation}: integer 61 | #' \item \code{coronary_artery_disease}: integer 62 | #' \item \code{diabetes_history}: integer 63 | #' \item \code{hypercholesterolemia_history}: integer 64 | #' \item \code{hypertension_history}: integer 65 | #' } 66 | #' @source 67 | #' Kuhn, Max, and Kjell Johnson. _Feature Engineering and Selection: A Practical 68 | #' Approach for Predictive Models_. Chapman and Hall/CRC, 2019. 69 | #' 70 | #' @examples 71 | #' data(ischemic_stroke) 72 | #' str(ischemic_stroke) 73 | #' 74 | NULL 75 | -------------------------------------------------------------------------------- /R/leaf_id_flavia.R: -------------------------------------------------------------------------------- 1 | #' Leaf identification data (Flavia) 2 | #' 3 | #' @description 4 | #' Image analysis of leaves to predict species. 5 | #' 6 | #' @name leaf_id_flavia 7 | #' @aliases leaf_id_flavia 8 | #' @docType data 9 | #' @return \item{leaf_id_flavia}{a data frame} 10 | #' 11 | #' @details 12 | #' From the original manuscript: "The Flavia dataset contains 1907 leaf images. 13 | #' There are 32 different species and each has 50-77 images. Scanners and 14 | #' digital cameras are used to acquire the leaf images on a plain background. 15 | #' The isolated leaf images contain blades only, without a petiole. These leaf 16 | #' images are collected from the most common plants in Yangtze Delta, 17 | #' China. Those leaves were sampled on the campus of the Nanjing University and 18 | #' the Sun Yat-Sen arboretum, Nanking, China." 19 | #' 20 | #' The reference below has detailed information on the features used for 21 | #' prediction.
22 | #' 23 | #' Columns: 24 | #' \itemize{ 25 | #' \item \code{species}: factor (32 levels) 26 | #' \item \code{apex}: factor (9 levels) 27 | #' \item \code{base}: factor (6 levels) 28 | #' \item \code{shape}: factor (5 levels) 29 | #' \item \code{denate_edge}: factor (levels: 'no' and 'yes') 30 | #' \item \code{lobed_edge}: factor (levels: 'no' and 'yes') 31 | #' \item \code{smooth_edge}: factor (levels: 'no' and 'yes') 32 | #' \item \code{toothed_edge}: factor (levels: 'no' and 'yes') 33 | #' \item \code{undulate_edge}: factor (levels: 'no' and 'yes') 34 | #' \item \code{outlying_polar}: numeric 35 | #' \item \code{skewed_polar}: numeric 36 | #' \item \code{clumpy_polar}: numeric 37 | #' \item \code{sparse_polar}: numeric 38 | #' \item \code{striated_polar}: numeric 39 | #' \item \code{convex_polar}: numeric 40 | #' \item \code{skinny_polar}: numeric 41 | #' \item \code{stringy_polar}: numeric 42 | #' \item \code{monotonic_polar}: numeric 43 | #' \item \code{outlying_contour}: numeric 44 | #' \item \code{skewed_contour}: numeric 45 | #' \item \code{clumpy_contour}: numeric 46 | #' \item \code{sparse_contour}: numeric 47 | #' \item \code{striated_contour}: numeric 48 | #' \item \code{convex_contour}: numeric 49 | #' \item \code{skinny_contour}: numeric 50 | #' \item \code{stringy_contour}: numeric 51 | #' \item \code{monotonic_contour}: numeric 52 | #' \item \code{num_max_ponits}: numeric 53 | #' \item \code{num_min_points}: numeric 54 | #' \item \code{diameter}: numeric 55 | #' \item \code{area}: numeric 56 | #' \item \code{perimeter}: numeric 57 | #' \item \code{physiological_length}: numeric 58 | #' \item \code{physiological_width}: numeric 59 | #' \item \code{aspect_ratio}: numeric 60 | #' \item \code{rectangularity}: numeric 61 | #' \item \code{circularity}: numeric 62 | #' \item \code{compactness}: numeric 63 | #' \item \code{narrow_factor}: numeric 64 | #' \item \code{perimeter_ratio_diameter}: numeric 65 | #' \item \code{perimeter_ratio_length}: numeric 66 | #' \item \code{perimeter_ratio_lw}: numeric 67 | #' \item \code{num_convex_points}: numeric 68 | #' \item \code{perimeter_convexity}: numeric 69 | #' \item \code{area_convexity}: numeric 70 | #' \item \code{area_ratio_convexity}: numeric 71 | #' \item \code{equivalent_diameter}: numeric 72 | #' \item \code{eccentriciry}: numeric 73 | #' \item \code{contrast}: numeric 74 | #' \item \code{correlation_texture}: numeric 75 | #' \item \code{inverse_difference_moments}: numeric 76 | #' \item \code{entropy}: numeric 77 | #' \item \code{mean_red_val}: numeric 78 | #' \item \code{mean_green_val}: numeric 79 | #' \item \code{mean_blue_val}: numeric 80 | #' \item \code{std_red_val}: numeric 81 | #' \item \code{std_green_val}: numeric 82 | #' \item \code{std_blue_val}: numeric 83 | #' \item \code{correlation}: numeric 84 | #' } 85 | #' @source 86 | #' Lakshika, Jayani PG, and Thiyanga S. Talagala. "Computer-aided interpretable 87 | #' features for leaf image classification." _arXiv preprint_ arXiv:2106.08077 88 | #' (2021). 89 | #' 90 | #' \url{https://github.com/SMART-Research/leaffeatures_paper} 91 | #' 92 | #' @examples 93 | #' data(leaf_id_flavia) 94 | #' str(leaf_id_flavia) 95 | #' 96 | NULL 97 | -------------------------------------------------------------------------------- /R/lending_club.R: -------------------------------------------------------------------------------- 1 | #' Loan data 2 | #' 3 | #' @details These data were downloaded from the Lending Club 4 | #' access site (see below) and are from the first quarter of 2016. 
5 | #' A subset of the rows and variables are included here. The 6 | #' outcome is in the variable `Class` and is either "good" (meaning 7 | #' that the loan was fully paid back or currently on-time) or "bad" 8 | #' (charged off, defaulted, or 21-120 days late). A data dictionary 9 | #' can be found on the source website. 10 | #' 11 | #' @name lending_club 12 | #' @aliases lending_club 13 | #' @docType data 14 | #' @return \item{lending_club}{a data frame} 15 | #' 16 | #' @source Lending Club Statistics https://www.lendingclub.com/info/download-data.action 17 | #' 18 | #' @keywords datasets 19 | #' @examples 20 | #' data(lending_club) 21 | #' str(lending_club) 22 | NULL 23 | -------------------------------------------------------------------------------- /R/meats.R: -------------------------------------------------------------------------------- 1 | #' Fat, water and protein content of meat samples 2 | #' 3 | #' "These data are recorded on a Tecator Infratec Food and Feed Analyzer 4 | #' working in the wavelength range 850 - 1050 nm by the Near Infrared 5 | #' Transmission (NIT) principle. Each sample contains finely chopped pure meat 6 | #' with different moisture, fat and protein contents. 7 | #' 8 | #' If results from these data are used in a publication we want you to mention 9 | #' the instrument and company name (Tecator) in the publication. In addition, 10 | #' please send a preprint of your article to: 11 | #' 12 | #' Karin Thente, Tecator AB, Box 70, S-263 21 Hoganas, Sweden 13 | #' 14 | #' The data are available in the public domain with no responsibility from the 15 | #' original data source. The data can be redistributed as long as this 16 | #' permission note is attached." 17 | #' 18 | #' "For each meat sample the data consists of a 100 channel spectrum of 19 | #' absorbances and the contents of moisture (water), fat and protein. The 20 | #' absorbance is -log10 of the transmittance measured by the spectrometer. The 21 | #' three contents, measured in percent, are determined by analytic chemistry." 22 | #' 23 | #' Included here are the training, monitoring and test sets.
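#'
#' As a quick illustration of the absorbance definition above (this assumes
#' the 100 spectral channels are the first 100 columns; check `str(meats)`
#' first):
#' \preformatted{
#' data(meats)
#' # absorbance = -log10(transmittance), so transmittance can be recovered as:
#' transmittance <- 10^(-meats[, 1:100])
#' }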
24 | #' 25 | #' 26 | #' @name meats 27 | #' @aliases meats 28 | #' @docType data 29 | #' @return \item{meats}{a tibble} 30 | #' @keywords datasets 31 | #' @examples 32 | #' 33 | #' data(meats) 34 | #' str(meats) 35 | NULL 36 | -------------------------------------------------------------------------------- /R/modeldata-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | ## usethis namespace: start 5 | #' @importFrom stats rnorm runif 6 | ## usethis namespace: end 7 | NULL 8 | 9 | 10 | # needed for simulation docs 11 | utils::globalVariables( 12 | c( 13 | ".", 14 | "linear_pred", 15 | "non_linear_1", 16 | "non_linear_2", 17 | "non_linear_3", 18 | "outcome", 19 | "predictor_01", 20 | "predictor_02", 21 | "predictor_03", 22 | "predictor_04", 23 | "predictor_05", 24 | "predictor_06", 25 | "predictor_07", 26 | "predictor_08", 27 | "predictor_09", 28 | "predictor_10", 29 | "predictor_11", 30 | "predictor_12", 31 | "predictor_13", 32 | "predictor_14", 33 | "predictor_15", 34 | "predictor_16", 35 | "predictor_17", 36 | "predictor_18", 37 | "predictor_19", 38 | "predictor_20", 39 | "rand", 40 | "true_prob", 41 | "two_factor_1", 42 | "two_factor_2", 43 | ".truth", 44 | ".linear_pred", 45 | ".rand" 46 | ) 47 | ) 48 | -------------------------------------------------------------------------------- /R/oils.R: -------------------------------------------------------------------------------- 1 | #' Fatty acid composition of commercial oils 2 | #' 3 | #' Fatty acid concentrations of commercial oils were measured using gas 4 | #' chromatography. The data are used to predict the type of oil. Note that 5 | #' only the known oils are in the data set. Also, the authors state that there 6 | #' are 95 samples of known oils. However, we count 96 in Table 1 (pgs. 33-35). 7 | #' 8 | #' 9 | #' @name oils 10 | #' @aliases oils 11 | #' @docType data 12 | #' @return \item{oils}{a tibble} 13 | #' @source Brodnjak-Voncina et al. (2005). Multivariate data analysis in 14 | #' classification of vegetable oils characterized by the content of fatty 15 | #' acids, \emph{Chemometrics and Intelligent Laboratory Systems}, Vol. 16 | #' 75:31-45. 17 | #' @keywords datasets 18 | #' @examples 19 | #' data(oils) 20 | #' str(oils) 21 | NULL 22 | -------------------------------------------------------------------------------- /R/parabolic.R: -------------------------------------------------------------------------------- 1 | #' Parabolic class boundary data 2 | #' 3 | #' @details These data were simulated. There are two correlated predictors and 4 | #' two classes in the factor outcome. 5 | #' 6 | #' @name parabolic 7 | #' @aliases parabolic 8 | #' @docType data 9 | #' @return \item{parabolic}{a data frame} 10 | #' 11 | #' @keywords datasets 12 | #' @examples 13 | #' data(parabolic) 14 | #' str(parabolic) 15 | NULL 16 | -------------------------------------------------------------------------------- /R/pathology.R: -------------------------------------------------------------------------------- 1 | #' Liver pathology data 2 | #' 3 | #' @details These data have the results of an _x_-ray examination 4 | #' to determine whether the liver is abnormal or not (in the `scan` 5 | #' column) versus the more extensive pathology results that 6 | #' approximate the truth (in `pathology`). 7 | #' 8 | #' @name pathology 9 | #' @aliases pathology 10 | #' @docType data 11 | #' @return \item{pathology}{a data frame} 12 | #' 13 | #' @source Altman, D.G., Bland, J.M.
(1994) ``Diagnostic tests 1: 14 | #' sensitivity and specificity,'' *British Medical Journal*, 15 | #' vol 308, 1552. 16 | #' 17 | #' 18 | #' @keywords datasets 19 | #' @examples 20 | #' data(pathology) 21 | #' str(pathology) 22 | NULL 23 | -------------------------------------------------------------------------------- /R/pd_speech.R: -------------------------------------------------------------------------------- 1 | #' Parkinson's disease speech classification data set 2 | #' 3 | #' @details From the UCI ML archive, the description is "The data used in this 4 | #' study were gathered from 188 patients with PD (107 men and 81 women) with 5 | #' ages ranging from 33 to 87 (65.1 p/m 10.9) at the Department of Neurology 6 | #' in Cerrahpaşa Faculty of Medicine, Istanbul University. The control group 7 | #' consists of 64 healthy individuals (23 men and 41 women) with ages varying 8 | #' between 41 and 82 (61.1 p/m 8.9). During the data collection process, 9 | #' the microphone is set to 44.1 KHz and following the physician's examination, 10 | #' the sustained phonation of the vowel `/a/` was collected from each subject 11 | #' with three repetitions." 12 | #' 13 | #' The data here are averaged over the replicates. 14 | #' 15 | #' @name pd_speech 16 | #' @aliases pd_speech 17 | #' @docType data 18 | #' @return \item{pd_speech}{a data frame} 19 | #' 20 | #' @source UCI ML repository (data) https://archive.ics.uci.edu/ml/datasets/Parkinson%27s+Disease+Classification#, 21 | #' 22 | #' Sakar et al (2019), "A comparative analysis of speech signal processing 23 | #' algorithms for Parkinson’s disease classification and the use of the tunable 24 | #' Q-factor wavelet transform", _Applied Soft Computing_, V74, pg 255-263. 25 | #' 26 | #' @keywords datasets 27 | #' @examples 28 | #' data(pd_speech) 29 | #' str(pd_speech) 30 | NULL 31 | -------------------------------------------------------------------------------- /R/penguins.R: -------------------------------------------------------------------------------- 1 | #' Palmer Station penguin data 2 | #' 3 | #' A data set from Gorman, Williams, and Fraser (2014) containing measurements 4 | #' from different types of penguins. This version of the data was retrieved from 5 | #' Allison Horst's `palmerpenguins` package on 2020-06-22. 6 | #' 7 | #' @name penguins 8 | #' @aliases penguins 9 | #' @docType data 10 | #' @return \item{penguins}{a tibble} 11 | #' @source Gorman KB, Williams TD, Fraser WR (2014) Ecological Sexual Dimorphism 12 | #' and Environmental Variability within a Community of Antarctic Penguins 13 | #' (_Genus Pygoscelis_). PLoS ONE 9(3): e90081. 14 | #' \doi{10.1371/journal.pone.0090081} 15 | #' 16 | #' \url{https://github.com/allisonhorst/palmerpenguins} 17 | #' @keywords datasets 18 | #' @examples 19 | #' data(penguins) 20 | #' str(penguins) 21 | NULL 22 | -------------------------------------------------------------------------------- /R/permeability_qsar.R: -------------------------------------------------------------------------------- 1 | #' Predicting permeability from chemical information 2 | #' 3 | #' @description 4 | #' A quantitative structure-activity relationship (QSAR) data set to predict 5 | #' when a molecule can permeate cells. 6 | #' 7 | #' @name permeability_qsar 8 | #' @aliases permeability_qsar 9 | #' @docType data 10 | #' @return \item{permeability_qsar}{a data frame} 11 | #' 12 | #' @details 13 | #' This pharmaceutical data set was used to develop a model for predicting 14 | #' compounds' permeability. 
In short, permeability is the measure of a 15 | #' molecule's ability to cross a membrane. The body, for example, has notable 16 | #' membranes between the body and brain, known as the blood-brain barrier, and 17 | #' between the gut and body in the intestines. These membranes help the body 18 | #' guard critical regions from receiving undesirable or detrimental substances. 19 | #' For an orally taken drug to be effective in the brain, it first must pass 20 | #' through the intestinal wall and then must pass through the blood-brain 21 | #' barrier in order to be present for the desired neurological target. 22 | #' Therefore, a compound's ability to permeate relevant biological membranes 23 | #' is critically important to understand early in the drug discovery process. 24 | #' Compounds that appear to be effective for a particular disease in research 25 | #' screening experiments but appear to be poorly permeable may need to be 26 | #' altered in order to improve permeability, and thus the compound's ability to 27 | #' reach the desired target. Identifying permeability problems can help guide 28 | #' chemists towards better molecules. 29 | #' 30 | #' Permeability assays such as PAMPA and Caco-2 have been developed to help 31 | #' measure compounds' permeability (Kansy et al, 1998). These screens are 32 | #' effective at quantifying a compound's permeability, but the assay is 33 | #' expensive and labor intensive. Given a sufficient number of compounds that have 34 | #' been screened, we could develop a predictive model for permeability in an 35 | #' attempt to potentially reduce the need for the assay. In this project there 36 | #' were 165 unique compounds; 1107 molecular fingerprints were determined for 37 | #' each. A molecular fingerprint is a binary sequence of numbers that 38 | #' represents the presence or absence of a specific molecular sub-structure. 39 | #' The response is highly skewed, the predictors are sparse (15.5% are present), 40 | #' and many predictors are strongly associated. 41 | #' 42 | #' Columns: 43 | #' \itemize{ 44 | #' \item \code{permeability}: numeric 45 | #' \item \code{chem_fp_0001} - \code{chem_fp_1107}: numeric 46 | #' } 47 | #' 48 | #' @source 49 | #' Kuhn, Max, and Kjell Johnson. _Applied predictive modeling_. New York: 50 | #' Springer, 2013. 51 | #' 52 | #' @examples 53 | #' data(permeability_qsar) 54 | #' str(permeability_qsar) 55 | #' 56 | NULL 57 | -------------------------------------------------------------------------------- /R/sacremento.R: -------------------------------------------------------------------------------- 1 | #' Sacramento CA home prices 2 | #' 3 | #' This data frame contains house and sale price data for 932 homes in 4 | #' Sacramento, CA. The original data were obtained from the website for the 5 | #' SpatialKey software. From their website: "The Sacramento real estate 6 | #' transactions file is a list of 985 real estate transactions in the 7 | #' Sacramento area reported over a five-day period, as reported by the 8 | #' Sacramento Bee." Google was used to fill in missing/incorrect data.
9 | #' 10 | #' 11 | #' @name Sacramento 12 | #' @docType data 13 | #' @return \item{Sacramento}{a tibble} 14 | #' @source SpatialKey website: 15 | #' \url{https://support.spatialkey.com/spatialkey-sample-csv-data/} 16 | #' @keywords datasets 17 | #' @examples 18 | #' data(Sacramento) 19 | #' str(Sacramento) 20 | NULL 21 | -------------------------------------------------------------------------------- /R/scat.R: -------------------------------------------------------------------------------- 1 | #' Morphometric data on scat 2 | #' 3 | #' Reid (2015) collected data on animal feces in coastal California. The data 4 | #' consist of DNA verified species designations as well as fields related to 5 | #' the time and place of the collection and the scat itself. The data are on 6 | #' the three main species (bobcat, coyote, and gray fox). 7 | #' 8 | #' 9 | #' @name scat 10 | #' @aliases scat 11 | #' @docType data 12 | #' @return \item{scat}{a tibble} 13 | #' @source Reid, R. E. B. (2015). A morphometric modeling approach to 14 | #' distinguishing among bobcat, coyote and gray fox scats. \emph{Wildlife 15 | #' Biology}, 21(5), 254-262 16 | #' @keywords datasets 17 | #' @examples 18 | #' data(scat) 19 | #' str(scat) 20 | NULL 21 | -------------------------------------------------------------------------------- /R/solubility.R: -------------------------------------------------------------------------------- 1 | #' Solubility predictions from MARS model 2 | #' 3 | #' @details For the solubility data in Kuhn and Johnson (2013), 4 | #' these data are the test set results for the MARS model. The 5 | #' observed solubility (in column `solubility`) and the model 6 | #' results (`prediction`) are contained in the data. 7 | #' 8 | #' @name solubility_test 9 | #' @aliases solubility_test 10 | #' @docType data 11 | #' @return \item{solubility_test}{a data frame} 12 | #' 13 | #' @source Kuhn, M., Johnson, K. (2013) *Applied Predictive 14 | #' Modeling*, Springer 15 | #' 16 | #' @keywords datasets 17 | #' @examples 18 | #' data(solubility_test) 19 | #' str(solubility_test) 20 | NULL 21 | -------------------------------------------------------------------------------- /R/stackoverflow.R: -------------------------------------------------------------------------------- 1 | #' Annual Stack Overflow Developer Survey Data 2 | #' 3 | #' @details These data are a collection of 5,594 data points collected on 4 | #' developers. These data could be used to try to predict who works remotely 5 | #' (as used in the source listed below). 6 | #' 7 | #' @name stackoverflow 8 | #' @aliases stackoverflow 9 | #' @docType data 10 | #' @return \item{stackoverflow}{a tibble} 11 | #' 12 | #' @source 13 | #' Julia Silge, _Supervised Machine Learning Case Studies in R_ 14 | #' 15 | #' `https://supervised-ml-course.netlify.com/chapter2` 16 | #' 17 | #' Raw data: `https://insights.stackoverflow.com/survey/` 18 | #' @keywords datasets 19 | #' @examples 20 | #' data(stackoverflow) 21 | #' str(stackoverflow) 22 | NULL 23 | -------------------------------------------------------------------------------- /R/steroidogenic_toxicity.R: -------------------------------------------------------------------------------- 1 | #' Predicting steroidogenic toxicity with assay data 2 | #' 3 | #' @description 4 | #' A set of _in vitro_ assays is used to quantify the risk of reproductive 5 | #' toxicity via the disruption of steroidogenic pathways.
6 | #' 7 | #' @name steroidogenic_toxicity 8 | #' @aliases steroidogenic_toxicity 9 | #' @docType data 10 | #' @return A tibble with columns 11 | #' - `class`: factor (levels: 'toxic' and 'nontoxic') 12 | #' - `cyp_11a1`: numeric 13 | #' - `cyp_11b1`: numeric 14 | #' - `cyp_11b2`: numeric 15 | #' - `cyp_17a1`: numeric 16 | #' - `cyp_19a1`: numeric 17 | #' - `cyp_21a1`: numeric 18 | #' - `hsd3b2`: numeric 19 | #' - `star`: numeric 20 | #' - `progesterone`: numeric 21 | #' - `testosterone`: numeric 22 | #' - `dhea`: numeric 23 | #' - `cortisol`: numeric 24 | #' 25 | #' @details 26 | #' H295R cells were used to measure the effect with two sets of assay results. 27 | #' The first includes a set of protein measurements on cytochrome P450 enzymes 28 | #' ("cyp"s), STAR, and 3BHSD2. The second includes hormone measurements for 29 | #' DHEA, progesterone, testosterone, and cortisol. 30 | #' 31 | #' Columns: 32 | #' \itemize{ 33 | #' \item \code{class}: factor (levels: 'toxic' and 'nontoxic') 34 | #' \item \code{cyp_11a1}: numeric 35 | #' \item \code{cyp_11b1}: numeric 36 | #' \item \code{cyp_11b2}: numeric 37 | #' \item \code{cyp_17a1}: numeric 38 | #' \item \code{cyp_19a1}: numeric 39 | #' \item \code{cyp_21a1}: numeric 40 | #' \item \code{hsd3b2}: numeric 41 | #' \item \code{star}: numeric 42 | #' \item \code{progesterone}: numeric 43 | #' \item \code{testosterone}: numeric 44 | #' \item \code{dhea}: numeric 45 | #' \item \code{cortisol}: numeric 46 | #' } 47 | #' 48 | #' @source 49 | #' Maglich, J. M., Kuhn, M., Chapin, R. E., & Pletcher, M. T. (2014). More than 50 | #' just hormones: H295R cells as predictors of reproductive toxicity. 51 | #' _Reproductive Toxicology_, 45, 77-86. 52 | #' 53 | #' @examples 54 | #' data(steroidogenic_toxicity) 55 | #' str(steroidogenic_toxicity) 56 | #' 57 | NULL 58 | -------------------------------------------------------------------------------- /R/tate_text.R: -------------------------------------------------------------------------------- 1 | #' Tate Gallery modern artwork metadata 2 | #' 3 | #' Metadata such as artist, title, and year created for recent artworks owned 4 | #' by the Tate Gallery. Only artworks created during or after 1990 are 5 | #' included, and the metadata source was last updated in 2014. The Tate Gallery 6 | #' provides these data but requests users to be respectful of their 7 | #' [guidelines for use](https://github.com/tategallery/collection#usage-guidelines-for-open-data). 8 | #' 9 | #' @name tate_text 10 | #' @aliases tate_text 11 | #' @docType data 12 | #' @return \item{tate_text}{a tibble} 13 | #' 14 | #' @source \itemize{ 15 | #' \item \url{https://github.com/tategallery/collection} 16 | #' \item \url{https://www.tate.org.uk/} 17 | #' } 18 | #' 19 | #' @keywords datasets 20 | #' @examples 21 | #' data(tate_text) 22 | #' str(tate_text) 23 | NULL 24 | -------------------------------------------------------------------------------- /R/taxi.R: -------------------------------------------------------------------------------- 1 | #' Chicago taxi data set 2 | #' 3 | #' @description 4 | #' 5 | #' A data set containing information on a subset of taxi trips in the city 6 | #' of Chicago in 2022. 7 | #' 8 | #' @name taxi 9 | #' @aliases taxi 10 | #' @docType data 11 | #' 12 | #' @return tibble 13 | #' 14 | #' @details 15 | #' 16 | #' The source data are originally described on the linked City of Chicago 17 | #' data portal.
The data exported here are a pre-processed subset motivated by 18 | #' the modeling problem of predicting whether a rider will tip or not. 19 | #' 20 | #' \describe{ 21 | #' \item{tip}{Whether the rider left a tip. A factor with levels 22 | #' "yes" and "no".} 23 | #' \item{distance}{The trip distance, in odometer miles.} 24 | #' \item{company}{The taxi company, as a factor. Companies that occurred 25 | #' few times were binned as "other".} 26 | #' \item{local}{Whether the trip's starting and ending locations are in the 27 | #' same community. See the source data for community area values.} 28 | #' \item{dow}{The day of the week in which the trip began, as a 29 | #' factor.} 30 | #' \item{month}{The month in which the trip began, as a factor.} 31 | #' \item{hour}{The hour of the day in which the trip began, as a 32 | #' numeric.} 33 | #' } 34 | #' 35 | #' @source 36 | #' 37 | #' \url{https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew} 38 | #' 39 | #' @examples 40 | #' \donttest{ 41 | #' taxi 42 | #' } 43 | NULL 44 | -------------------------------------------------------------------------------- /R/two_class_dat.R: -------------------------------------------------------------------------------- 1 | #' Two class data 2 | #' 3 | #' @details These are artificial data with two predictors (`A` and `B`) and 4 | #' a factor outcome variable (`Class`). 5 | #' 6 | #' @name two_class_dat 7 | #' @aliases two_class_dat 8 | #' @docType data 9 | #' @return \item{two_class_dat}{a data frame} 10 | #' 11 | #' @keywords datasets 12 | #' @examples 13 | #' data(two_class_dat) 14 | #' str(two_class_dat) 15 | NULL 16 | 17 | #' Two class predictions 18 | #' 19 | #' @details These data are a test set from a model built for two 20 | #' classes ("Class1" and "Class2"). There are columns for the true 21 | #' and predicted classes and columns for the probabilities for each 22 | #' class. 23 | #' 24 | #' @name two_class_example 25 | #' @aliases two_class_example 26 | #' @docType data 27 | #' @return \item{two_class_example}{a data frame} 28 | #' 29 | #' @keywords datasets 30 | #' @examples 31 | #' data(two_class_example) 32 | #' str(two_class_example) 33 | NULL 34 | -------------------------------------------------------------------------------- /R/wa_churn.R: -------------------------------------------------------------------------------- 1 | #' Watson churn data 2 | #' 3 | #' @details These data were downloaded from the IBM Watson site 4 | #' (see below) in September 2018. The data contain a factor for 5 | #' whether a customer churned or not. Alternatively, since the `tenure` 6 | #' column contains information on how long the customer has had an 7 | #' account, a survival analysis can be done on this column using the 8 | #' `churn` outcome as the censoring information. A 9 | #' data dictionary can be found on the source website.
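#'
#' A minimal sketch of that survival analysis (an illustration; it assumes
#' the event level of `churn` is "Yes"):
#' \preformatted{
#' library(survival)
#' data(wa_churn)
#' # Kaplan-Meier curve of account tenure, censored by the churn outcome
#' km_fit <- survfit(Surv(tenure, churn == "Yes") ~ 1, data = wa_churn)
#' }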
10 | #' 11 | #' @name wa_churn 12 | #' @aliases wa_churn 13 | #' @docType data 14 | #' @return \item{wa_churn}{a data frame} 15 | #' 16 | #' @source IBM Watson Analytics https://ibm.co/2sOvyvy 17 | #' 18 | #' @keywords datasets 19 | #' @examples 20 | #' data(wa_churn) 21 | #' str(wa_churn) 22 | NULL 23 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r} 8 | #| include: false 9 | knitr::opts_chunk$set( 10 | collapse = TRUE, 11 | comment = "#>", 12 | fig.path = "man/figures/README-", 13 | out.width = "100%" 14 | ) 15 | ``` 16 | 17 | # modeldata 18 | 19 | 20 | [![CRAN status](https://www.r-pkg.org/badges/version/modeldata)](https://CRAN.R-project.org/package=modeldata) 21 | [![R-CMD-check](https://github.com/tidymodels/modeldata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/tidymodels/modeldata/actions/workflows/R-CMD-check.yaml) 22 | [![lifecycle](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://lifecycle.r-lib.org/articles/stages.html) 23 | 24 | 25 | `modeldata` contains data sets used in documentation and testing for tidymodels packages. The package also contains a suite of simulation functions for classification and regression data. 26 | 27 | ## Installation 28 | 29 | You can install the released version of modeldata from [CRAN](https://CRAN.R-project.org) with: 30 | 31 | ``` r 32 | install.packages("modeldata") 33 | ``` 34 | 35 | And the development version from [GitHub](https://github.com/) with: 36 | 37 | ``` r 38 | # install.packages("pak") 39 | pak::pak("tidymodels/modeldata") 40 | ``` 41 | 42 | ## Contributing 43 | 44 | This project is released with a [Contributor Code of Conduct](https://contributor-covenant.org/version/2/1/CODE_OF_CONDUCT.html). By contributing to this project, you agree to abide by its terms. 45 | 46 | - For questions and discussions about tidymodels packages, modeling, and machine learning, please [post on RStudio Community](https://forum.posit.co/new-topic?category_id=15&tags=tidymodels,question). 47 | 48 | - If you think you have encountered a bug, please [submit an issue](https://github.com/tidymodels/modeldata/issues). 49 | 50 | - Either way, learn how to create and share a [reprex](https://reprex.tidyverse.org/articles/articles/learn-reprex.html) (a minimal, reproducible example), to clearly communicate about your code. 51 | 52 | - Check out further details on [contributing guidelines for tidymodels packages](https://www.tidymodels.org/contribute/) and [how to get help](https://www.tidymodels.org/help/). 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # modeldata 5 | 6 | 7 | 8 | [![CRAN 9 | status](https://www.r-pkg.org/badges/version/modeldata)](https://CRAN.R-project.org/package=modeldata) 10 | [![R-CMD-check](https://github.com/tidymodels/modeldata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/tidymodels/modeldata/actions/workflows/R-CMD-check.yaml) 11 | [![lifecycle](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://lifecycle.r-lib.org/articles/stages.html) 12 | 13 | 14 | `modeldata` contains data sets used in documentation and testing for 15 | tidymodels packages. 
The package also contains a suite of simulation 16 | functions for classification and regression data. 17 | 18 | ## Installation 19 | 20 | You can install the released version of modeldata from 21 | [CRAN](https://CRAN.R-project.org) with: 22 | 23 | ``` r 24 | install.packages("modeldata") 25 | ``` 26 | 27 | And the development version from [GitHub](https://github.com/) with: 28 | 29 | ``` r 30 | # install.packages("pak") 31 | pak::pak("tidymodels/modeldata") 32 | ``` 33 | 34 | ## Contributing 35 | 36 | This project is released with a [Contributor Code of 37 | Conduct](https://contributor-covenant.org/version/2/1/CODE_OF_CONDUCT.html). 38 | By contributing to this project, you agree to abide by its terms. 39 | 40 | - For questions and discussions about tidymodels packages, modeling, and 41 | machine learning, please [post on RStudio 42 | Community](https://forum.posit.co/new-topic?category_id=15&tags=tidymodels,question). 43 | 44 | - If you think you have encountered a bug, please [submit an 45 | issue](https://github.com/tidymodels/modeldata/issues). 46 | 47 | - Either way, learn how to create and share a 48 | [reprex](https://reprex.tidyverse.org/articles/articles/learn-reprex.html) 49 | (a minimal, reproducible example), to clearly communicate about your 50 | code. 51 | 52 | - Check out further details on [contributing guidelines for tidymodels 53 | packages](https://www.tidymodels.org/contribute/) and [how to get 54 | help](https://www.tidymodels.org/help/). 55 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://modeldata.tidymodels.org 2 | 3 | template: 4 | package: tidytemplate 5 | bootstrap: 5 6 | bslib: 7 | danger: "#CA225E" 8 | primary: "#CA225E" 9 | includes: 10 | in_header: | 11 | 12 | 13 | development: 14 | mode: auto 15 | 16 | -------------------------------------------------------------------------------- /air.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/air.toml -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /data-raw/cat_adoption.R: -------------------------------------------------------------------------------- 1 | ## code to prepare `cat_adoption` dataset goes here 2 | 3 | library(tidyverse) 4 | library(janitor) 5 | library(recipes) 6 | library(survival) 7 | 8 | # ------------------------------------------------------------------------------ 9 | 10 | # data from 11 | # https://data.longbeach.gov/explore/dataset/animal-shelter-intakes-and-outcomes/information/ 12 | # 13 | # We spoke with the Long Beach animal shelter on 2024-06-12 and they had some 14 | # information about the outcomes. We're looking to define the event as being 15 | # homed by the Long Beach animal shelter. 16 | # 17 | # `"community cat"` and `"shelter, neuter, return"` are animals brought in by a 18 | # community member (or field officer) for care. 
They are not owned by anyone 19 | # but are fed and given shelter by different people in the community. We count 20 | # these as observed events. 21 | # 22 | # `"homefirst"` was a program where the pet was adopted with the promise of 23 | # getting them spayed/neutered. We count these as observed events. 24 | # 25 | # `"rescue"`, `"return to rescue"`, `"transport"`, and `"transfer"` mean 26 | # that they went to a different organization that 27 | # works to home them. These are censored data. 28 | # 29 | # We also talked about how long, after intake, they would be considered in the 30 | # "risk set" of animals that could be adopted. This depends on the situation. 31 | # Some animals in poor health need more time to recover and/or be treated. 32 | # There is also a 3-5 day period to give the original owner (if any) time to 33 | # claim it. 34 | # 35 | # For our analysis, we will not include any animals that were at the center 36 | # for <= 1 week. 37 | 38 | # ------------------------------------------------------------------------------ 39 | 40 | harmonize_colors <- function(x) { 41 | x <- gsub("mut", "", x) 42 | 43 | x <- gsub("pt", "point", x) 44 | x <- gsub("(brn)|(br )", "brown ", x) 45 | x <- gsub("dil", "dilute", x) 46 | x <- gsub("org", "orange", x) 47 | x <- gsub("rd", "red", x) 48 | x <- gsub("slvr", "silver", x) 49 | x <- gsub("(crm )|(cr )", "cream ", x) 50 | x <- gsub("(slvr)|(sl)", "silver", x) 51 | x <- gsub("choc ", "chocolate ", x) 52 | x <- gsub("(lc )|(li )", "lilac ", x) 53 | x <- gsub("l-c", "lilac_cream", x, fixed = TRUE) 54 | x <- gsub("(bl )", "blue ", x) 55 | x <- gsub("^(y )", "yellow ", x) 56 | x <- gsub("(blk)|(bc)|(bk)", "black", x) 57 | 58 | # fur patterns 59 | x <- gsub("brind$", "brindle", x) 60 | x <- gsub("tab$", "tabby", x) 61 | 62 | # Things that are still unclear 63 | x <- gsub("b-c", "", x, fixed = TRUE) # "brown-cream"?
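# "s-t" below is another ambiguous color code; it is dropped the same way as "b-c"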
64 | x <- gsub("s-t", "", x, fixed = TRUE) 65 | 66 | x <- trimws(x, which = "both") 67 | gsub("[[:space:]]+", "_", x) 68 | } 69 | 70 | raw <- read_csv("data-raw/animal-shelter-intakes-and-outcomes.csv") %>% 71 | clean_names() %>% 72 | filter( 73 | animal_type == "CAT" & 74 | !is.na(outcome_type) & 75 | intake_is_dead == "Alive on Intake" & 76 | primary_color != "UNKNOWN" & 77 | secondary_color != "UNKNOWN" 78 | ) %>% 79 | filter( 80 | # These animals would not have been up for being homed 81 | !(outcome_type %in% c("DISPOSAL", "EUTHANASIA", "MISSING", "DUPLICATE")) 82 | ) %>% 83 | # There are multiple rows for some animals; take most recent 84 | arrange(animal_id, outcome_date) %>% 85 | slice_head(by = c(animal_id), n = 1) 86 | 87 | event_list <- 88 | c( 89 | "adoption", 90 | "community cat", 91 | "foster", 92 | "foster to adopt", 93 | "homefirst", 94 | "return to owner", 95 | "return to wild habitat", 96 | "shelter, neuter, return", 97 | "trap, neuter, release" 98 | ) 99 | 100 | other_list <- 101 | c("died", "rescue", "return to rescue", "transfer", "transport") 102 | 103 | cats <- raw %>% 104 | mutate( 105 | across(where(is.character), tolower), 106 | time = as.numeric(difftime(outcome_date, intake_date, units = "days")), 107 | time = if_else(time < 0, NA_real_, time), 108 | time = if_else(time < 1, 1, time), 109 | event = if_else(outcome_type %in% event_list, 1, 0), 110 | ) %>% 111 | filter(outcome_type %in% c(event_list, other_list) & time > 7) %>% 112 | select( 113 | time, 114 | event, 115 | contains("color"), 116 | sex, 117 | intake_condition, 118 | intake_type, 119 | jurisdiction, 120 | latitude, 121 | longitude, 122 | animal_id 123 | ) %>% 124 | mutate( 125 | neutered = case_when( 126 | sex %in% c("neutered", "spayed") ~ "yes", 127 | sex == "unknown" ~ "unknown", 128 | TRUE ~ "no" 129 | ), 130 | sex = case_when( 131 | sex == "neutered" ~ "male", 132 | sex == "spayed" ~ "female", 133 | TRUE ~ sex 134 | ), 135 | # clean up color labels 136 | primary_color = harmonize_colors(primary_color), 137 | secondary_color = harmonize_colors(secondary_color), 138 | # underscores 139 | intake_condition = gsub("age/weight", "age_or_weight", intake_condition), 140 | intake_condition = gsub("[[:space:]]+", "_", intake_condition), 141 | intake_type = gsub("i/i", "i_i", intake_type, fixed = TRUE), 142 | intake_type = gsub("[[:punct:]]", "", intake_type), 143 | intake_type = gsub("[[:space:]]+", "_", intake_type), 144 | jurisdiction = gsub("[[:space:]]+", "_", jurisdiction) 145 | ) 146 | 147 | # Make indicators for color (which also contains pattern) 148 | col_names <- function(var, lvl, ...) 
{ 149 | lvl 150 | } 151 | cats_with_color_dummies <- cats %>% 152 | recipe() %>% 153 | step_dummy_multi_choice( 154 | ends_with("color"), 155 | threshold = 0.0, 156 | naming = col_names 157 | ) %>% 158 | step_other(intake_condition, intake_type, threshold = 0.02) %>% 159 | step_zv() %>% 160 | prep() %>% 161 | bake(new_data = NULL) 162 | 163 | col_counts <- map_int(cats_with_color_dummies %>% select(-(1:10)), sum) 164 | col_count_rm <- names(col_counts)[col_counts <= 20] 165 | 166 | cat_adoption <- 167 | cats_with_color_dummies %>% 168 | select(-all_of(col_count_rm)) %>% 169 | select(-animal_id, -jurisdiction) %>% 170 | relocate(time, event) %>% 171 | relocate(neutered, .after = sex) 172 | 173 | usethis::use_data(cat_adoption) 174 | -------------------------------------------------------------------------------- /data-raw/chem_proc_yield.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(janitor) 3 | library(AppliedPredictiveModeling) 4 | 5 | # ------------------------------------------------------------------------------ 6 | 7 | tidymodels_prefer() 8 | theme_set(theme_bw()) 9 | options(pillar.advice = FALSE, pillar.min_title_chars = Inf) 10 | 11 | # ------------------------------------------------------------------------------ 12 | 13 | data(ChemicalManufacturingProcess) 14 | 15 | chem_proc_yield <- 16 | ChemicalManufacturingProcess %>% 17 | clean_names() %>% 18 | rename_with( 19 | .cols = starts_with("manufacturing_process"), 20 | ~ gsub("manufacturing_process", "man_proc_", .x) 21 | ) %>% 22 | rename_with( 23 | .cols = starts_with("biological_material"), 24 | ~ gsub("biological_material", "bio_material_", .x) 25 | ) %>% 26 | as_tibble() 27 | 28 | # ------------------------------------------------------------------------------ 29 | 30 | usethis::use_data(chem_proc_yield) 31 | -------------------------------------------------------------------------------- /data-raw/hepatic_injury_qsar.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(janitor) 3 | library(AppliedPredictiveModeling) 4 | 5 | # ------------------------------------------------------------------------------ 6 | 7 | tidymodels_prefer() 8 | theme_set(theme_bw()) 9 | options(pillar.advice = FALSE, pillar.min_title_chars = Inf) 10 | 11 | # ------------------------------------------------------------------------------ 12 | 13 | data(hepatic) 14 | 15 | names(bio) <- recipes::names0(ncol(bio), "bio_assay_") 16 | names(chem) <- recipes::names0(ncol(chem), "chem_fp_") 17 | 18 | hepatic_injury_qsar <- 19 | bind_cols(bio, chem) %>% 20 | mutate( 21 | class = tolower(as.character(injury)), 22 | class = factor(class, ordered = TRUE, levels = c("none", "mild", "severe")) 23 | ) %>% 24 | as_tibble() %>% 25 | relocate(class) 26 | 27 | # ------------------------------------------------------------------------------ 28 | 29 | usethis::use_data(hepatic_injury_qsar, overwrite = TRUE) 30 | -------------------------------------------------------------------------------- /data-raw/hotel_rates.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(readr) 3 | library(janitor) 4 | library(textrecipes) 5 | library(lubridate) 6 | library(randomNames) 7 | 8 | # ------------------------------------------------------------------------------ 9 | 10 | tidymodels_prefer() 11 | options(pillar.advice = FALSE, pillar.min_title_chars = Inf) 12 | 13 | # 
------------------------------------------------------------------------------ 14 | 15 | # See "Hotel booking demand datasets" 16 | # https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=%22Hotel+booking+demand+datasets%22 17 | hotel_raw <- 18 | readr::read_csv( 19 | "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv" 20 | ) %>% 21 | as_tibble() %>% 22 | mutate( 23 | arrival_date = paste( 24 | arrival_date_year, 25 | arrival_date_month, 26 | arrival_date_day_of_month, 27 | sep = "_" 28 | ), 29 | arrival_date = ymd(arrival_date), 30 | arrival_date_num = decimal_date(arrival_date), 31 | 32 | market_segment = gsub( 33 | "TA/TO", 34 | "_travel_agent", 35 | market_segment, 36 | fixed = TRUE 37 | ), 38 | market_segment = gsub("TA", "_travel_agent", market_segment), 39 | market_segment = gsub("[[:space:]]", "", market_segment), 40 | 41 | meal = case_when( 42 | meal == "BB" ~ "Bed and Breakfast", 43 | meal == "HB" ~ "breakfast and one other meal", 44 | meal == "FB" ~ "breakfast lunch and dinner", 45 | TRUE ~ "no meal package" 46 | ), 47 | 48 | near_christmas = arrival_date_month == "December" & 49 | arrival_date_day_of_month <= 26 & 50 | arrival_date_day_of_month >= 24, 51 | near_christmas = as.numeric(near_christmas), 52 | near_new_years = (arrival_date_month == "December" & 53 | arrival_date_day_of_month >= 30) | 54 | (arrival_date_month == "January" & arrival_date_day_of_month <= 2), 55 | near_new_years = as.numeric(near_new_years) 56 | ) 57 | 58 | # ------------------------------------------------------------------------------ 59 | # instead of codes, use random names for agents and companies. Stratify by 60 | # ethnicity to avoid overlap 61 | 62 | agents <- tibble(agent = unique(hotel_raw$agent)) 63 | 64 | set.seed(1) 65 | agents$fake_name <- 66 | randomNames( 67 | nrow(agents), 68 | name.order = "first.last", 69 | name.sep = "_", 70 | ethnicity = c(1:2, 4:6), # reserve 3 for company names 71 | sample.with.replacement = FALSE 72 | ) 73 | agents$fake_name <- gsub("[[:punct:]]", "_", tolower(agents$fake_name)) 74 | agents$fake_name <- gsub("[[:space:]]", "_", tolower(agents$fake_name)) 75 | agents$fake_name[agents$agent == "NULL"] <- "not_applicable" 76 | 77 | hotel_raw <- 78 | left_join(hotel_raw, agents, by = "agent") %>% 79 | mutate(agent = fake_name) %>% 80 | select(-fake_name) 81 | 82 | ### 83 | 84 | companies <- tibble(company = unique(hotel_raw$company)) 85 | 86 | set.seed(2) 87 | companies$fake_name <- 88 | randomNames( 89 | nrow(companies), 90 | ethnicity = 3, 91 | which.names = "last", 92 | sample.with.replacement = FALSE 93 | ) 94 | companies$fake_name <- gsub("[[:punct:]]", "_", tolower(companies$fake_name)) 95 | companies$fake_name <- gsub("[[:space:]]", "_", tolower(companies$fake_name)) 96 | types <- c("_llc", "_inc", "_and_company", "_pbc") 97 | types <- sample(types, nrow(companies), replace = TRUE) 98 | companies$fake_name <- paste0(companies$fake_name, types) 99 | companies$fake_name[companies$company == "NULL"] <- "not_applicable" 100 | 101 | hotel_raw <- 102 | left_join(hotel_raw, companies, by = "company") %>% 103 | mutate(company = fake_name) %>% 104 | select(-fake_name) 105 | 106 | # ------------------------------------------------------------------------------ 107 | # version for regression analysis 108 | 109 | hotel_rates_all <- 110 | hotel_raw %>% 111 | filter( 112 | is_canceled == 0 & 113 | adr > 15 & 114 | adr < 2000 & 115 | hotel == "Resort Hotel" & 116 | reservation_status == "Check-Out" & 117 | deposit_type == 
"No Deposit" & 118 | !(market_segment %in% c("Complementary", "Undefined")) 119 | ) %>% 120 | select( 121 | -reservation_status, 122 | -is_canceled, 123 | avg_price_per_room = adr, 124 | -reservation_status_date, 125 | -hotel, 126 | -arrival_date_month, 127 | -deposit_type 128 | ) %>% 129 | mutate(year_day = yday(arrival_date)) %>% 130 | relocate(avg_price_per_room) %>% 131 | recipe() %>% 132 | step_clean_levels(all_nominal()) %>% 133 | prep() %>% 134 | bake(new_data = NULL) 135 | 136 | # ------------------------------------------------------------------------------ 137 | # pull off first year of data to compute historical ADR by day 138 | 139 | year_2016_data <- 140 | hotel_rates_all %>% 141 | filter(arrival_date <= min(arrival_date) + years(1)) 142 | 143 | year_2016_stats <- 144 | year_2016_data %>% 145 | summarize( 146 | hist_adr_raw = mean(avg_price_per_room), 147 | hist_bookings = n(), 148 | .by = year_day 149 | ) %>% 150 | arrange(year_day) 151 | 152 | year_2016_stats$historical_adr <- 153 | loess( 154 | hist_adr_raw ~ year_day, 155 | data = year_2016_stats, 156 | span = .1, 157 | degree = 2 158 | )$fitted 159 | 160 | # Add a value for the leap year 161 | year_2016_stats_leap <- 162 | tibble( 163 | year_day = 366, 164 | historical_adr = year_2016_stats$hist_adr_raw[nrow(year_2016_stats)] 165 | ) 166 | 167 | year_2016_stats <- 168 | bind_rows(year_2016_stats, year_2016_stats_leap) %>% 169 | select(year_day, historical_adr) 170 | 171 | hotel_rates <- 172 | hotel_rates_all %>% 173 | filter(arrival_date > min(arrival_date) + years(1)) %>% 174 | left_join(year_2016_stats, by = "year_day") %>% 175 | arrange(arrival_date) %>% 176 | select( 177 | -arrival_date_year, 178 | -arrival_date_week_number, 179 | -arrival_date_day_of_month, 180 | -year_day 181 | ) 182 | 183 | usethis::use_data(hotel_rates, overwrite = TRUE) 184 | -------------------------------------------------------------------------------- /data-raw/ischemic_stroke.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(janitor) 3 | library(forcats) 4 | 5 | # ------------------------------------------------------------------------------ 6 | 7 | tidymodels_prefer() 8 | theme_set(theme_bw()) 9 | options(pillar.advice = FALSE, pillar.min_title_chars = Inf) 10 | 11 | # ------------------------------------------------------------------------------ 12 | 13 | load(url( 14 | "https://github.com/topepo/FES/raw/06812c48a21882808403cee338b8312fdbd35a46/Data_Sets/Ischemic_Stroke/stroke_data.RData" 15 | )) 16 | 17 | ischemic_stroke <- 18 | bind_rows(stroke_train, stroke_test) %>% 19 | clean_names() %>% 20 | rename(male = sex, nascet_scale = nascet) %>% 21 | mutate( 22 | stroke = ifelse(stroke == "Y", "yes", "no"), 23 | stroke = factor(stroke, levels = c("yes", "no")) 24 | ) %>% 25 | as_tibble() 26 | 27 | # ------------------------------------------------------------------------------ 28 | 29 | usethis::use_data(ischemic_stroke) 30 | -------------------------------------------------------------------------------- /data-raw/leaf_id_flavia.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(janitor) 3 | library(readr) 4 | 5 | # ------------------------------------------------------------------------------ 6 | 7 | tidymodels_prefer() 8 | theme_set(theme_bw()) 9 | options(pillar.advice = FALSE, pillar.min_title_chars = Inf) 10 | 11 | # ------------------------------------------------------------------------------ 12 | 
13 | flavia_url <- 14 | "https://github.com/SMART-Research/leaffeatures_paper/raw/65ffd8c8b926b8df3f499c9224d6073975db5c3c/data_all_with_label_flavia_with_species.csv" 15 | 16 | leaf_id_flavia <- 17 | read_csv(flavia_url) %>% 18 | clean_names() %>% 19 | mutate( 20 | species = gsub("([[:punct:]])|([[:space:]])", "_", tolower(species)), 21 | shape = tolower(shape_label), 22 | apex = if_else(!is.na(apex), tolower(apex), "none"), 23 | base = if_else(!is.na(base), tolower(base), "none"), 24 | edge_type = tolower(edge_type), 25 | edge_type_2 = tolower(edge_type_2), 26 | edge_type_2 = ifelse(is.na(edge_type_2), "", edge_type_2), 27 | edges = map2_chr(edge_type, edge_type_2, ~ paste(.x, .y, sep = "_")), 28 | edges = gsub("_$", "", edges), 29 | denate_edge = ifelse( 30 | edge_type == "denate" | edge_type_2 == "denate", 31 | "yes", 32 | "no" 33 | ), 34 | lobed_edge = ifelse( 35 | edge_type == "lobed" | edge_type_2 == "lobed", 36 | "yes", 37 | "no" 38 | ), 39 | smooth_edge = ifelse( 40 | edge_type == "smooth" | edge_type_2 == "smooth", 41 | "yes", 42 | "no" 43 | ), 44 | toothed_edge = ifelse( 45 | edge_type == "toothed" | edge_type_2 == "toothed", 46 | "yes", 47 | "no" 48 | ), 49 | undulate_edge = ifelse( 50 | edge_type == "undulate" | edge_type_2 == "undulate", 51 | "yes", 52 | "no" 53 | ), 54 | across(where(is.character), factor) 55 | ) %>% 56 | select(-id, -cx, -cy, -shape_label, -edges, -edge_type_2, -edge_type) %>% 57 | rename(narrow_factor = nf) %>% 58 | rename_with(~ gsub("_g_", "_green_", .x)) %>% 59 | rename_with(~ gsub("_b_", "_blue_", .x)) %>% 60 | rename_with(~ gsub("_r_", "_red_", .x)) %>% 61 | rename_with(~ gsub("^no_of_", "num_", .x)) %>% 62 | relocate( 63 | species, 64 | apex, 65 | base, 66 | shape, 67 | denate_edge, 68 | lobed_edge, 69 | smooth_edge, 70 | toothed_edge, 71 | undulate_edge 72 | ) 73 | 74 | # ------------------------------------------------------------------------------ 75 | 76 | usethis::use_data(leaf_id_flavia) 77 | -------------------------------------------------------------------------------- /data-raw/permeability_qsar.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(janitor) 3 | library(AppliedPredictiveModeling) 4 | 5 | # ------------------------------------------------------------------------------ 6 | 7 | tidymodels_prefer() 8 | theme_set(theme_bw()) 9 | options(pillar.advice = FALSE, pillar.min_title_chars = Inf) 10 | 11 | # ------------------------------------------------------------------------------ 12 | 13 | data("permeability") 14 | 15 | fingerprints <- as.data.frame(fingerprints) 16 | 17 | names(fingerprints) <- recipes::names0(ncol(fingerprints), "chem_fp_") 18 | 19 | permeability_qsar <- 20 | fingerprints %>% 21 | mutate( 22 | permeability = permeability[, 1] 23 | ) %>% 24 | as_tibble() %>% 25 | relocate(permeability) 26 | 27 | # ------------------------------------------------------------------------------ 28 | 29 | usethis::use_data(permeability_qsar) 30 | -------------------------------------------------------------------------------- /data-raw/prep_datasets.R: -------------------------------------------------------------------------------- 1 | ## code to prepare `tate_text` 2 | 3 | library(tidyverse) 4 | artwork <- read_csv( 5 | "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-12/artwork.csv" 6 | ) 7 | 8 | tate_text <- artwork %>% 9 | filter(year >= 1990, artistRole == "artist") %>% 10 | select(id, artist, title, medium, year) %>% 11 | 
mutate(across(c(artist, medium), as.factor)) %>% 12 | arrange(year, artist) 13 | 14 | usethis::use_data(tate_text, overwrite = TRUE) 15 | -------------------------------------------------------------------------------- /data-raw/steroidogenic_toxicity.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(janitor) 3 | library(readr) 4 | 5 | # ------------------------------------------------------------------------------ 6 | 7 | tidymodels_prefer() 8 | theme_set(theme_bw()) 9 | options(pillar.advice = FALSE, pillar.min_title_chars = Inf) 10 | 11 | # ------------------------------------------------------------------------------ 12 | 13 | steroidogenic_toxicity <- 14 | read_delim( 15 | "https://github.com/topepo/steroidogenic_tox/raw/master/data.txt", 16 | delim = "\t" 17 | ) %>% 18 | clean_names() %>% 19 | rename_with(~ gsub("cyp", "cyp_", .x)) %>% 20 | select(-compound) %>% 21 | mutate(class = factor(class, levels = c("toxic", "nontoxic"))) 22 | 23 | # ------------------------------------------------------------------------------ 24 | 25 | usethis::use_data(steroidogenic_toxicity, overwrite = TRUE) 26 | -------------------------------------------------------------------------------- /data-raw/taxi.R: -------------------------------------------------------------------------------- 1 | ## code to prepare `taxi` dataset goes here 2 | 3 | library(tidyverse) 4 | library(tidymodels) 5 | library(janitor) 6 | 7 | # https://data.cityofchicago.org/Transportation/Taxi-Trips-2022/npd7-ywjz 8 | taxi_raw <- read_csv( 9 | "https://data.cityofchicago.org/api/views/e55j-2ewb/rows.csv?accessType=DOWNLOAD" 10 | ) |> 11 | clean_names() 12 | 13 | set.seed(1234) 14 | 15 | taxi_med <- taxi_raw |> 16 | filter(!is.na(tips), payment_type != "Cash") |> 17 | drop_na() |> 18 | slice_sample(n = 20000) |> 19 | mutate( 20 | tip = if_else(tips > 0, "yes", "no") |> factor(levels = c("yes", "no")), 21 | trip_start = mdy_hms(trip_start_timestamp), 22 | local = if_else( 23 | pickup_community_area == dropoff_community_area, 24 | "yes", 25 | "no" 26 | ) |> 27 | factor(levels = c("yes", "no")), 28 | pickup_community_area = factor(pickup_community_area), 29 | dropoff_community_area = factor(dropoff_community_area) 30 | ) 31 | 32 | taxi_rec_base <- recipe(tip ~ ., data = taxi_med) |> 33 | step_date( 34 | trip_start, 35 | features = c("dow", "month"), 36 | keep_original_cols = TRUE 37 | ) |> 38 | step_time( 39 | trip_start, 40 | features = c("hour", "minute"), 41 | keep_original_cols = TRUE 42 | ) |> 43 | step_other(company) |> 44 | step_rm( 45 | trip_start_timestamp, 46 | trip_end_timestamp, 47 | taxi_id, 48 | tips, 49 | trip_start, 50 | trip_start_minute, 51 | contains("census"), 52 | contains("centroid"), 53 | contains("community_area") 54 | ) %>% 55 | step_rename( 56 | id := trip_id, 57 | duration = trip_seconds, 58 | distance = trip_miles, 59 | total_cost = trip_total, 60 | dow = trip_start_dow, 61 | month = trip_start_month, 62 | hour = trip_start_hour 63 | ) 64 | 65 | taxi <- prep(taxi_rec_base) |> 66 | bake(new_data = NULL) |> 67 | relocate(tip) 68 | 69 | taxi <- taxi |> 70 | mutate(month = factor(month, levels = c("Jan", "Feb", "Mar", "Apr"))) |> 71 | select(-c(id, duration, fare, tolls, extras, total_cost, payment_type)) |> 72 | drop_na() |> 73 | slice_sample(n = 10000) 74 | 75 | usethis::use_data(taxi, overwrite = TRUE) 76 | -------------------------------------------------------------------------------- /data/Chicago.rda: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/Chicago.rda -------------------------------------------------------------------------------- /data/Sacramento.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/Sacramento.RData -------------------------------------------------------------------------------- /data/Smithsonian.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/Smithsonian.RData -------------------------------------------------------------------------------- /data/ad_data.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/ad_data.RData -------------------------------------------------------------------------------- /data/ames.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/ames.rda -------------------------------------------------------------------------------- /data/attrition.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/attrition.RData -------------------------------------------------------------------------------- /data/biomass.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/biomass.RData -------------------------------------------------------------------------------- /data/bivariate.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/bivariate.RData -------------------------------------------------------------------------------- /data/car_prices.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/car_prices.RData -------------------------------------------------------------------------------- /data/cat_adoption.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/cat_adoption.rda -------------------------------------------------------------------------------- /data/cells.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/cells.RData -------------------------------------------------------------------------------- /data/check_times.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/check_times.rda 
-------------------------------------------------------------------------------- /data/chem_proc_yield.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/chem_proc_yield.rda -------------------------------------------------------------------------------- /data/concrete.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/concrete.RData -------------------------------------------------------------------------------- /data/covers.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/covers.RData -------------------------------------------------------------------------------- /data/credit_data.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/credit_data.RData -------------------------------------------------------------------------------- /data/crickets.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/crickets.rda -------------------------------------------------------------------------------- /data/datalist: -------------------------------------------------------------------------------- 1 | ad_data 2 | ames 3 | attrition 4 | biomass 5 | bivariate: bivariate_test bivariate_train bivariate_val 6 | car_prices 7 | cells 8 | check_times 9 | Chicago: Chicago stations 10 | concrete 11 | covers 12 | credit_data 13 | crickets 14 | deliveries 15 | drinks 16 | grants: grants_2008 grants_other grants_test 17 | hpc_cv 18 | hpc_data 19 | lending_club 20 | meats 21 | mlc_churn 22 | oils 23 | parabolic 24 | pathology 25 | pd_speech 26 | penguins 27 | Sacramento 28 | scat 29 | small_fine_foods: testing_data training_data 30 | Smithsonian 31 | solubility_test 32 | stackoverflow 33 | tate_text 34 | two_class_dat 35 | two_class_example 36 | wa_churn 37 | chem_proc_yield 38 | permeability_qsar 39 | steroidogenic_toxicity 40 | leaf_id_flavia 41 | ischemic_stroke 42 | hepatic_injury_qsar 43 | -------------------------------------------------------------------------------- /data/deliveries.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/deliveries.rda -------------------------------------------------------------------------------- /data/drinks.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/drinks.rda -------------------------------------------------------------------------------- /data/grants.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/grants.rda -------------------------------------------------------------------------------- /data/hepatic_injury_qsar.rda: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/hepatic_injury_qsar.rda -------------------------------------------------------------------------------- /data/hotel_rates.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/hotel_rates.rda -------------------------------------------------------------------------------- /data/hpc_cv.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/hpc_cv.rda -------------------------------------------------------------------------------- /data/hpc_data.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/hpc_data.RData -------------------------------------------------------------------------------- /data/ischemic_stroke.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/ischemic_stroke.rda -------------------------------------------------------------------------------- /data/leaf_id_flavia.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/leaf_id_flavia.rda -------------------------------------------------------------------------------- /data/lending_club.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/lending_club.rda -------------------------------------------------------------------------------- /data/meats.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/meats.RData -------------------------------------------------------------------------------- /data/mlc_churn.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/mlc_churn.RData -------------------------------------------------------------------------------- /data/oils.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/oils.RData -------------------------------------------------------------------------------- /data/parabolic.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/parabolic.rda -------------------------------------------------------------------------------- /data/pathology.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/pathology.rda 
-------------------------------------------------------------------------------- /data/pd_speech.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/pd_speech.rda -------------------------------------------------------------------------------- /data/penguins.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/penguins.rda -------------------------------------------------------------------------------- /data/permeability_qsar.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/permeability_qsar.rda -------------------------------------------------------------------------------- /data/scat.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/scat.RData -------------------------------------------------------------------------------- /data/small_fine_foods.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/small_fine_foods.RData -------------------------------------------------------------------------------- /data/solubility_test.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/solubility_test.rda -------------------------------------------------------------------------------- /data/stackoverflow.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/stackoverflow.rda -------------------------------------------------------------------------------- /data/steroidogenic_toxicity.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/steroidogenic_toxicity.rda -------------------------------------------------------------------------------- /data/tate_text.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/tate_text.rda -------------------------------------------------------------------------------- /data/taxi.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/taxi.rda -------------------------------------------------------------------------------- /data/two_class_dat.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/two_class_dat.RData -------------------------------------------------------------------------------- /data/two_class_example.rda: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/two_class_example.rda -------------------------------------------------------------------------------- /data/wa_churn.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/wa_churn.rda -------------------------------------------------------------------------------- /man/Chicago.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Chicago.R 3 | \docType{data} 4 | \name{Chicago} 5 | \alias{Chicago} 6 | \alias{stations} 7 | \title{Chicago ridership data} 8 | \source{ 9 | Kuhn and Johnson (2020), \emph{Feature Engineering and Selection}, 10 | Chapman and Hall/CRC. \url{https://bookdown.org/max/FES/} and 11 | \url{https://github.com/topepo/FES} 12 | } 13 | \value{ 14 | \item{Chicago}{a tibble} \item{stations}{a vector of station names} 15 | } 16 | \description{ 17 | Chicago ridership data 18 | } 19 | \details{ 20 | These data are from Kuhn and Johnson (2020) and contain an 21 | \emph{abbreviated} training set for modeling the number of people (in thousands) 22 | who enter the Clark and Lake L station. 23 | 24 | The \code{date} column corresponds to the current date. The columns with station 25 | names (\code{Austin} through \code{California}) are a \emph{sample} of the columns used in 26 | the original analysis (for file size reasons). These are 14-day lag 27 | variables (i.e. \verb{date - 14 days}). There are columns related to weather and 28 | sports team schedules. 29 | 30 | The station at 35th and Archer is contained in the column \code{Archer_35th} to 31 | make it a valid R column name. 32 | } 33 | \examples{ 34 | data(Chicago) 35 | str(Chicago) 36 | stations 37 | } 38 | \keyword{datasets} 39 | -------------------------------------------------------------------------------- /man/Sacramento.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sacremento.R 3 | \docType{data} 4 | \name{Sacramento} 5 | \alias{Sacramento} 6 | \title{Sacramento CA home prices} 7 | \source{ 8 | SpatialKey website: 9 | \url{https://support.spatialkey.com/spatialkey-sample-csv-data/} 10 | } 11 | \value{ 12 | \item{Sacramento}{a tibble} 13 | } 14 | \description{ 15 | This data frame contains house and sale price data for 932 homes in 16 | Sacramento, CA. The original data were obtained from the website for the 17 | SpatialKey software. From their website: "The Sacramento real estate 18 | transactions file is a list of 985 real estate transactions in the 19 | Sacramento area reported over a five-day period, as reported by the 20 | Sacramento Bee." Google was used to fill in missing/incorrect data.
21 | } 22 | \examples{ 23 | data(Sacramento) 24 | str(Sacramento) 25 | } 26 | \keyword{datasets} 27 | -------------------------------------------------------------------------------- /man/Smithsonian.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Smithsonian.R 3 | \docType{data} 4 | \name{Smithsonian} 5 | \alias{Smithsonian} 6 | \title{Smithsonian museums} 7 | \source{ 8 | https://en.wikipedia.org/wiki/List_of_Smithsonian_museums 9 | } 10 | \value{ 11 | \item{Smithsonian}{a tibble} 12 | } 13 | \description{ 14 | Geocodes for the Smithsonian museums (circa 2018). 15 | } 16 | \examples{ 17 | data(Smithsonian) 18 | str(Smithsonian) 19 | } 20 | \keyword{datasets} 21 | -------------------------------------------------------------------------------- /man/ad_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ad_data.R 3 | \docType{data} 4 | \name{ad_data} 5 | \alias{ad_data} 6 | \title{Alzheimer's disease data} 7 | \source{ 8 | Kuhn, M., Johnson, K. (2013) \emph{Applied Predictive Modeling}, Springer. 9 | 10 | Craig-Schapiro R, Kuhn M, Xiong C, Pickering EH, Liu J, Misko TP, et al. 11 | (2011) Multiplexed Immunoassay Panel Identifies Novel CSF Biomarkers for 12 | Alzheimer's Disease Diagnosis and Prognosis. PLoS ONE 6(4): e18850. 13 | } 14 | \value{ 15 | \item{ad_data}{a tibble} 16 | } 17 | \description{ 18 | Alzheimer's disease data 19 | } 20 | \details{ 21 | Craig-Schapiro et al. (2011) describe a clinical study of 333 patients, 22 | including some with mild (but well-characterized) cognitive impairment as 23 | well as healthy individuals. CSF samples were taken from all subjects. The 24 | goal of the study was to determine if subjects in the early stages of 25 | impairment could be differentiated from cognitively healthy individuals. 26 | Data collected on each subject included: 27 | \itemize{ 28 | \item Demographic characteristics such as age and gender 29 | \item Apolipoprotein E genotype 30 | \item Protein measurements of Abeta, Tau, and a phosphorylated version of Tau (called pTau) 31 | \item Protein measurements of 124 exploratory biomarkers, and 32 | \item Clinical dementia scores 33 | } 34 | 35 | For these analyses, we have converted the scores to two classes: impaired 36 | and healthy. The goal of this analysis is to create classification models 37 | using the demographic and assay data to predict which patients have early 38 | stages of disease. 39 | } 40 | \examples{ 41 | data(ad_data) 42 | str(ad_data) 43 | } 44 | \keyword{datasets} 45 | -------------------------------------------------------------------------------- /man/ames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ames.R 3 | \docType{data} 4 | \name{ames} 5 | \alias{ames} 6 | \title{Ames Housing Data} 7 | \source{ 8 | De Cock, D. (2011). "Ames, Iowa: Alternative to the Boston Housing Data as an End of Semester Regression Project," \emph{Journal of Statistics Education}, Volume 19, Number 3.
9 | 10 | \url{http://jse.amstat.org/v19n3/decock/DataDocumentation.txt} 11 | 12 | \url{http://jse.amstat.org/v19n3/decock.pdf} 13 | } 14 | \value{ 15 | \item{ames}{a tibble} 16 | } 17 | \description{ 18 | A data set from De Cock (2011) in which 82 fields were recorded for 2,930 19 | properties in Ames, IA. This version is copied from the \code{AmesHousing} package 20 | but does not include a few quality columns that appear to be outcomes 21 | rather than predictors. 22 | } 23 | \details{ 24 | See the links in the sources below for more information, as well as 25 | \code{?AmesHousing::make_ames}. 26 | 27 | For these data, the training materials typically use: 28 | 29 | \if{html}{\out{<div class="sourceCode">
}}\preformatted{library(tidymodels) 30 | 31 | set.seed(4595) 32 | data_split <- initial_split(ames, strata = "Sale_Price") 33 | ames_train <- training(data_split) 34 | ames_test <- testing(data_split) 35 | 36 | set.seed(2453) 37 | ames_folds <- vfold_cv(ames_train) 38 | }\if{html}{\out{</div>
}} 39 | } 40 | \examples{ 41 | data(ames) 42 | str(ames) 43 | } 44 | \keyword{datasets} 45 | -------------------------------------------------------------------------------- /man/attrition.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/attrition.R 3 | \docType{data} 4 | \name{attrition} 5 | \alias{attrition} 6 | \title{Job attrition} 7 | \source{ 8 | The IBM Watson Analytics Lab website https://www.ibm.com/communities/analytics/watson-analytics-blog/hr-employee-attrition/ 9 | } 10 | \value{ 11 | \item{attrition}{a data frame} 12 | } 13 | \description{ 14 | Job attrition 15 | } 16 | \details{ 17 | These data are from the IBM Watson Analytics Lab. 18 | The website describes the data with \dQuote{Uncover the 19 | factors that lead to employee attrition and explore important 20 | questions such as \sQuote{show me a breakdown of distance 21 | from home by job role and attrition} or \sQuote{compare 22 | average monthly income by education and attrition}. This is a 23 | fictional data set created by IBM data scientists.}. There 24 | are 1470 rows. 25 | } 26 | \examples{ 27 | data(attrition) 28 | str(attrition) 29 | } 30 | \keyword{datasets} 31 | -------------------------------------------------------------------------------- /man/biomass.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/biomass.R 3 | \docType{data} 4 | \name{biomass} 5 | \alias{biomass} 6 | \title{Biomass data} 7 | \source{ 8 | Ghugare, S. B., Tiwary, S., Elangovan, V., and Tambe, S. S. (2013). 9 | Prediction of Higher Heating Value of Solid Biomass Fuels Using Artificial 10 | Intelligence Formalisms. \emph{BioEnergy Research}, 1-12. 11 | } 12 | \value{ 13 | \item{biomass}{a data frame} 14 | } 15 | \description{ 16 | Ghugare et al. (2013) contains a data set where different biomass fuels are 17 | characterized by the amount of certain molecules (carbon, hydrogen, oxygen, 18 | nitrogen, and sulfur) and the corresponding higher heating value (HHV). 19 | These data are from their Table S.2 of the Supplementary Materials. 20 | } 21 | \examples{ 22 | data(biomass) 23 | str(biomass) 24 | } 25 | \keyword{datasets} 26 | -------------------------------------------------------------------------------- /man/bivariate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bivariate.R 3 | \docType{data} 4 | \name{bivariate} 5 | \alias{bivariate} 6 | \alias{bivariate_train} 7 | \alias{bivariate_test} 8 | \alias{bivariate_val} 9 | \title{Example bivariate classification data} 10 | \value{ 11 | \item{bivariate_train, bivariate_test, bivariate_val}{tibbles} 12 | } 13 | \description{ 14 | Example bivariate classification data 15 | } 16 | \details{ 17 | These data are a simplified version of the segmentation data contained 18 | in \code{caret}. There are three columns: \code{A} and \code{B} are predictors and the column 19 | \code{Class} is a factor with levels "One" and "Two". There are three data sets: 20 | one for training (n = 1009), validation (n = 300), and testing (n = 710).
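As a minimal usage sketch (assuming only that the modeldata package is
installed; loading \code{bivariate} attaches all three tibbles):

\preformatted{library(modeldata)
data(bivariate)
nrow(bivariate_train)  # 1009
nrow(bivariate_val)    # 300
nrow(bivariate_test)   # 710
}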
21 | } 22 | \examples{ 23 | data(bivariate) 24 | str(bivariate_train) 25 | str(bivariate_val) 26 | str(bivariate_test) 27 | } 28 | \keyword{datasets} 29 | -------------------------------------------------------------------------------- /man/car_prices.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/car_prices.R 3 | \docType{data} 4 | \name{car_prices} 5 | \alias{car_prices} 6 | \title{Kelly Blue Book resale data for 2005 model year GM cars} 7 | \source{ 8 | Kuiper, S. (2008). Introduction to Multiple Regression: How Much Is Your Car Worth?, 9 | \emph{Journal of Statistics Education}, Vol. 16 10 | \url{http://jse.amstat.org/jse_archive.htm#2008}. 11 | } 12 | \value{ 13 | \item{car_prices}{data frame of the suggested retail price (column \code{Price}) and various 14 | characteristics of each car (columns \code{Mileage}, \code{Cylinder}, \code{Doors}, \code{Cruise}, 15 | \code{Sound}, \code{Leather}, \code{Buick}, \code{Cadillac}, \code{Chevy}, \code{Pontiac}, \code{Saab}, 16 | \code{Saturn}, \code{convertible}, \code{coupe}, \code{hatchback}, \code{sedan} and \code{wagon})} 17 | } 18 | \description{ 19 | Kuiper (2008) collected Kelly Blue Book resale data for 804 GM cars (2005 model year). 20 | } 21 | \examples{ 22 | data(car_prices) 23 | str(car_prices) 24 | } 25 | \keyword{datasets} 26 | -------------------------------------------------------------------------------- /man/cat_adoption.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cat_adoption.R 3 | \docType{data} 4 | \name{cat_adoption} 5 | \alias{cat_adoption} 6 | \title{Cat Adoption} 7 | \source{ 8 | \url{https://data.longbeach.gov/explore/dataset/animal-shelter-intakes-and-outcomes/information/} 9 | on 2024-06-17 10 | } 11 | \value{ 12 | tibble 13 | } 14 | \description{ 15 | A subset of the cats at the animal shelter in Long Beach, California, USA. 16 | } 17 | \details{ 18 | A data frame with 2257 rows and 19 columns: 19 | \describe{ 20 | \item{time}{The time the cat spent at the shelter.} 21 | \item{event}{The event of interest is the cat being homed or returned to 22 | its original location (i.e., owner or community). The non-event is the cat 23 | being transferred to another shelter or dying.
Zero indicates a non-event 24 | (censored), and one corresponds to the event occurring.} 25 | \item{sex}{The sex of the cat.} 26 | \item{neutered}{Whether the cat is neutered.} 27 | \item{intake_condition}{The intake condition of the cat.} 28 | \item{intake_type}{The type of intake.} 29 | \item{latitude}{Latitude of the intersection/cross street of intake or capture.} 30 | \item{longitude}{Longitude of the intersection/cross street of intake or capture.} 31 | \item{black,brown,brown_tabby,calico,cream,gray,gray_tabby,orange,orange_tabby,tan,tortie,white}{Indicators for the color/pattern of the cat's fur.} 32 | } 33 | } 34 | \examples{ 35 | str(cat_adoption) 36 | } 37 | \keyword{datasets} 38 | -------------------------------------------------------------------------------- /man/cells.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cells.R 3 | \docType{data} 4 | \name{cells} 5 | \alias{cells} 6 | \title{Cell body segmentation} 7 | \source{ 8 | Hill, LaPan, Li and Haney (2007). Impact of image segmentation on 9 | high-content screening data quality for SK-BR-3 cells, \emph{BMC 10 | Bioinformatics}, Vol. 8, pg. 340, 11 | \url{https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-8-340}. 12 | } 13 | \value{ 14 | \item{cells}{a tibble} 15 | } 16 | \description{ 17 | Hill, LaPan, Li and Haney (2007) develop models to predict which cells in a 18 | high content screen were well segmented. The data consists of 119 imaging 19 | measurements on 2019 cells. The original analysis used 1009 for training and 1010 20 | as a test set (see the column called \code{case}). 21 | } 22 | \details{ 23 | The outcome class is contained in a factor variable called \code{class} with 24 | levels "PS" for poorly segmented and "WS" for well segmented. 25 | 26 | The raw data used in the paper can be found at the Biomedcentral website. 27 | The version 28 | contained in \code{cells} is modified. First, several discrete 29 | versions of some of the predictors (with the suffix "Status") were removed. 30 | Second, there are several skewed predictors with minimum values of zero 31 | (that would benefit from some transformation, such as the log). A constant 32 | value of 1 was added to these fields: \code{avg_inten_ch_2}, 33 | \code{fiber_align_2_ch_3}, \code{fiber_align_2_ch_4}, \code{spot_fiber_count_ch_4} and 34 | \code{total_inten_ch_2}. 35 | } 36 | \examples{ 37 | data(cells) 38 | str(cells) 39 | } 40 | \keyword{datasets} 41 | -------------------------------------------------------------------------------- /man/check_times.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/check_times.R 3 | \docType{data} 4 | \name{check_times} 5 | \alias{check_times} 6 | \title{Execution time data} 7 | \source{ 8 | CRAN 9 | } 10 | \value{ 11 | \item{check_times}{a data frame} 12 | } 13 | \description{ 14 | These data were collected from the CRAN web page for 13,626 R 15 | packages. The time to complete the standard package checking 16 | routine was collected. In some cases, the package checking 17 | process is stopped due to errors and these data are treated as 18 | censored; fewer than 1 percent of the check times are censored.
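As a hedged sketch of setting these data up for censored regression (the
\code{check_time} and \code{status} columns are described in the details
below; we assume \code{status == 1} marks a completed, uncensored check):

\preformatted{library(survival)
data(check_times, package = "modeldata")
# right-censored response: time plus completion indicator
surv_times <- Surv(check_times$check_time, event = check_times$status)
}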
19 | } 20 | \details{ 21 | The associated package source code was 22 | downloaded and parsed to create predictors, including 23 | \itemize{ 24 | \item \code{authors}: The number of authors in the author field. 25 | \item \code{imports}: The number of imported packages. 26 | \item \code{suggests}: The number of packages suggested. 27 | \item \code{depends}: The number of hard dependencies. 28 | \item \code{Roxygen}: a binary indicator for whether Roxygen was used 29 | for documentation. 30 | \item \code{gh}: a binary indicator for whether the URL field contained 31 | a GitHub link. 32 | \item \code{rforge}: a binary indicator for whether the URL field 33 | contained a link to R-forge. 34 | \item \code{descr}: The number of characters (or, in some cases, bytes) 35 | in the description field. 36 | \item \code{r_count}: The number of R files in the R directory. 37 | \item \code{r_size}: The total disk size of the R files. 38 | \item \code{ns_import}: Estimated number of imported functions or methods. 39 | \item \code{ns_export}: Estimated number of exported functions or methods. 40 | \item \code{s3_methods}: Estimated number of S3 methods. 41 | \item \code{s4_methods}: Estimated number of S4 methods. 42 | \item \code{doc_count}: How many Rmd or Rnw files in the vignettes 43 | directory. 44 | \item \code{doc_size}: The disk size of the Rmd or Rnw files. 45 | \item \code{src_count}: The number of files in the \code{src} directory. 46 | \item \code{src_size}: The size on disk of files in the \code{src} directory. 47 | \item \code{data_count}: The number of files in the \code{data} directory. 48 | \item \code{data_size}: The size on disk of files in the \code{data} directory. 49 | \item \code{testthat_count}: The number of files in the \code{testthat} 50 | directory. 51 | \item \code{testthat_size}: The size on disk of files in the \code{testthat} 52 | directory. 53 | \item \code{check_time}: The time (in seconds) to run \verb{R CMD check} 54 | using the "r-devel-windows-ix86+x86_64" flavor. 55 | \item \code{status}: An indicator for whether the tests completed. 56 | } 57 | 58 | Data were collected on 2019-01-20. 59 | } 60 | \examples{ 61 | data(check_times) 62 | str(check_times) 63 | } 64 | \keyword{datasets} 65 | -------------------------------------------------------------------------------- /man/chem_proc_yield.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chem_proc_yield.R 3 | \docType{data} 4 | \name{chem_proc_yield} 5 | \alias{chem_proc_yield} 6 | \title{Chemical manufacturing process data set} 7 | \source{ 8 | Kuhn, Max, and Kjell Johnson. \emph{Applied predictive modeling}. New York: 9 | Springer, 2013. 10 | } 11 | \value{ 12 | \item{chem_proc_yield}{a tibble} 13 | } 14 | \description{ 15 | A data set that models yield as a function of biological material predictors 16 | and chemical structure predictors. 17 | } 18 | \details{ 19 | This data set contains information about a chemical manufacturing 20 | process, in which the goal is to understand the relationship between 21 | the process and the resulting final product yield. Raw material in 22 | this process is put through a sequence of 27 steps to generate the 23 | final pharmaceutical product. The starting material is generated from 24 | a biological unit and has a range of quality and characteristics.
The 25 | objective in this project was to develop a model to predict percent 26 | yield of the manufacturing process. The data set consisted of 177 27 | samples of biological material for which 57 characteristics were 28 | measured. Of the 57 characteristics, there were 12 measurements of 29 | the biological starting material, and 45 measurements of the 30 | manufacturing process. The process variables included measurements 31 | such as temperature, drying time, washing time, and concentrations of 32 | by-products at various steps. Some of the process measurements can 33 | be controlled, while others are observed. Predictors are continuous, 34 | count, and categorical; some are correlated, and some contain missing 35 | values. Samples are not independent because sets of samples come from 36 | the same batch of biological starting material. 37 | 38 | Columns: 39 | \itemize{ 40 | \item \code{yield}: numeric 41 | \item \code{bio_material_01} - \code{bio_material_12}: numeric 42 | \item \code{man_proc_01} - \code{man_proc_45}: numeric 43 | } 44 | } 45 | \examples{ 46 | data(chem_proc_yield) 47 | str(chem_proc_yield) 48 | 49 | } 50 | -------------------------------------------------------------------------------- /man/concrete.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/concrete.R 3 | \docType{data} 4 | \name{concrete} 5 | \alias{concrete} 6 | \title{Compressive strength of concrete mixtures} 7 | \source{ 8 | Yeh I (2006). "Analysis of Strength of Concrete Using Design of Experiments 9 | and Neural Networks." \emph{Journal of Materials in Civil Engineering}, 18, 597-604. 10 | 11 | Kuhn, M., Johnson, K. (2013) \emph{Applied Predictive Modeling}, Springer. 12 | } 13 | \value{ 14 | \item{concrete}{a tibble} 15 | } 16 | \description{ 17 | Yeh (2006) describes an aggregated data set for experimental designs used to 18 | test the compressive strength of concrete mixtures. The data are used by 19 | Kuhn and Johnson (2013). 20 | } 21 | \examples{ 22 | data(concrete) 23 | str(concrete) 24 | } 25 | \keyword{datasets} 26 | -------------------------------------------------------------------------------- /man/covers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/covers.R 3 | \docType{data} 4 | \name{covers} 5 | \alias{covers} 6 | \title{Raw cover type data} 7 | \source{ 8 | https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info 9 | } 10 | \value{ 11 | \item{covers}{a data frame} 12 | } 13 | \description{ 14 | These data are raw data describing different types of forest cover-types 15 | from the UCI Machine Learning Database (see link below). There is one 16 | column in the data that has a few different pieces of textual 17 | information (of variable lengths).
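A rough sketch of mining that text column (the column name, assumed here to
be \code{description}, should be checked with \code{str(covers)}):

\preformatted{data(covers, package = "modeldata")
# flag rows whose free text mentions "pine"
grepl("pine", covers$description, ignore.case = TRUE)
}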
18 | } 19 | \examples{ 20 | data(covers) 21 | str(covers) 22 | } 23 | \keyword{datasets} 24 | -------------------------------------------------------------------------------- /man/credit_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/credit_data.R 3 | \docType{data} 4 | \name{credit_data} 5 | \alias{credit_data} 6 | \title{Credit data} 7 | \source{ 8 | https://github.com/gastonstat/CreditScoring, 9 | http://bit.ly/2kkBFrk 10 | } 11 | \value{ 12 | \item{credit_data}{a data frame} 13 | } 14 | \description{ 15 | These data are from the website of Dr. Lluís A. Belanche Muñoz by way of a 16 | github repository of Dr. Gaston Sanchez. One data point with a missing outcome 17 | was removed from the original data. 18 | } 19 | \examples{ 20 | data(credit_data) 21 | str(credit_data) 22 | } 23 | \keyword{datasets} 24 | -------------------------------------------------------------------------------- /man/crickets.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/crickets.R 3 | \docType{data} 4 | \name{crickets} 5 | \alias{crickets} 6 | \title{Rates of Cricket Chirps} 7 | \source{ 8 | Mangiafico, S. 2015. "An R Companion for the Handbook of Biological 9 | Statistics." \url{https://rcompanion.org/handbook/}. 10 | 11 | McDonald, J. 2009. \emph{Handbook of Biological Statistics}. Sparky House Publishing. 12 | } 13 | \value{ 14 | \item{crickets}{a tibble} 15 | } 16 | \description{ 17 | These data are from McDonald (2009), by way of Mangiafico (2015), on 18 | the relationship between the ambient temperature and the rate of cricket 19 | chirps per minute. Data were collected for two species of the genus \emph{Oecanthus}: \emph{O. exclamationis} 20 | and \emph{O. niveus}. The data are contained in a data frame called \code{crickets} with 21 | a total of 31 data points. 22 | } 23 | \examples{ 24 | data(crickets) 25 | str(crickets) 26 | } 27 | \keyword{datasets} 28 | -------------------------------------------------------------------------------- /man/deliveries.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deliveries.R 3 | \docType{data} 4 | \name{deliveries} 5 | \alias{deliveries} 6 | \title{Food Delivery Time Data} 7 | \value{ 8 | \item{deliveries}{a tibble} 9 | } 10 | \description{ 11 | Food Delivery Time Data 12 | } 13 | \details{ 14 | These data are from a study of food delivery times in minutes (i.e., the time from the 15 | initial order to receiving the food) for a single restaurant. The data 16 | contain 10,012 orders. The predictors include: 17 | \itemize{ 18 | \item The time, in decimal hours, of the order. 19 | \item The day of the week for the order. 20 | \item The approximate distance in miles between the restaurant and the delivery 21 | location. 22 | \item A set of 27 predictors that count the number of distinct menu items 23 | in the order. 24 | } 25 | 26 | No times are censored.
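A minimal modeling sketch (the outcome column is assumed here to be named
\code{time_to_delivery}; confirm with \code{str(deliveries)}):

\preformatted{library(tidymodels)
data(deliveries, package = "modeldata")
set.seed(991)
delivery_split <- initial_split(deliveries, strata = time_to_delivery)
delivery_train <- training(delivery_split)
delivery_test  <- testing(delivery_split)
}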
27 | } 28 | \examples{ 29 | data(deliveries) 30 | str(deliveries) 31 | } 32 | \keyword{datasets} 33 | -------------------------------------------------------------------------------- /man/drinks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/drinks.R 3 | \docType{data} 4 | \name{drinks} 5 | \alias{drinks} 6 | \title{Sample time series data} 7 | \source{ 8 | The Federal Reserve Bank of St. Louis website https://fred.stlouisfed.org/series/S4248SM144NCEN 9 | } 10 | \value{ 11 | \item{drinks}{a tibble} 12 | } 13 | \description{ 14 | Sample time series data 15 | } 16 | \details{ 17 | Drink sales. The exact name of the series from FRED is: 18 | "Merchant Wholesalers, Except Manufacturers' Sales Branches and Offices 19 | Sales: Nondurable Goods: Beer, Wine, and Distilled Alcoholic Beverages Sales" 20 | } 21 | \examples{ 22 | data(drinks) 23 | str(drinks) 24 | } 25 | \keyword{datasets} 26 | -------------------------------------------------------------------------------- /man/figures/lifecycle-deprecated.svg: -------------------------------------------------------------------------------- 1 | [SVG badge: "lifecycle: deprecated" (markup stripped)] -------------------------------------------------------------------------------- /man/grants.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/grants.R 3 | \docType{data} 4 | \name{grants} 5 | \alias{grants} 6 | \alias{grants_other} 7 | \alias{grants_test} 8 | \alias{grants_2008} 9 | \title{Grant acceptance data} 10 | \source{ 11 | Kuhn and Johnson (2013). \emph{Applied Predictive Modeling}. Springer. 12 | } 13 | \value{ 14 | \item{grants_other,grants_test,grants_2008}{two tibbles and an integer 15 | vector of data points used for training} 16 | } 17 | \description{ 18 | A data set related to the success or failure of academic grants. 19 | } 20 | \details{ 21 | The data are discussed in Kuhn and Johnson (2013): 22 | 23 | "These data are from a 2011 Kaggle competition sponsored by the University 24 | of Melbourne where there was interest in predicting whether or not a grant 25 | application would be accepted. Since public funding of grants had decreased 26 | over time, triaging grant applications based on their likelihood of success 27 | could be important for estimating the amount of potential funding to the 28 | university. In addition to predicting grant success, the university sought 29 | to understand factors that were important in predicting success." 30 | 31 | The data ranged from 2005 to 2008 and the data spending strategy was 32 | driven by the date of the grant. Kuhn and Johnson (2013) describe: 33 | 34 | "The compromise taken here is to build models on the pre-2008 data and 35 | tune them by evaluating a random sample of 2,075 grants from 2008. Once the 36 | optimal parameters are determined, the final model is built using these 37 | parameters and the entire training set (i.e., the data prior to 2008 and the 38 | additional 2,075 grants). A small holdout set of 518 grants from 2008 will 39 | be used to ensure that no gross methodology errors occur from repeatedly 40 | evaluating the 2008 data during model tuning. In the text, this set of 41 | samples is called the 2008 holdout set.
This small set of year 2008 42 | grants will be referred to as the test set and will not be evaluated until a 43 | set of candidate models is identified." 44 | 45 | To emulate this, \code{grants_other} contains the training (pre-2008, n = 6,633) 46 | and holdout/validation data (2008, n = 1,557). \code{grants_test} has 518 grant 47 | samples from 2008. The object \code{grants_2008} is an integer vector that can 48 | be used to separate the modeling data from the holdout/validation sets. 49 | } 50 | \examples{ 51 | data(grants) 52 | str(grants_other) 53 | str(grants_test) 54 | str(grants_2008) 55 | } 56 | \keyword{datasets} 57 | -------------------------------------------------------------------------------- /man/hepatic_injury_qsar.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hepatic_injury_qsar.R 3 | \docType{data} 4 | \name{hepatic_injury_qsar} 5 | \alias{hepatic_injury_qsar} 6 | \title{Predicting hepatic injury from chemical information} 7 | \source{ 8 | Kuhn, Max, and Kjell Johnson. \emph{Applied predictive modeling}. New York: 9 | Springer, 2013. 10 | } 11 | \value{ 12 | \item{hepatic_injury_qsar}{a tibble} 13 | } 14 | \description{ 15 | A quantitative structure-activity relationship (QSAR) data set to predict 16 | when a molecule has risk associated with liver function. 17 | } 18 | \details{ 19 | This data set was used to develop a model for predicting compounds' 20 | probability of causing hepatic injury (i.e. liver damage). This data set 21 | consisted of 281 unique compounds; 376 predictors were measured or computed 22 | for each. The response was categorical (either "none", "mild", or "severe"), 23 | and was highly unbalanced. 24 | 25 | This kind of response often occurs in pharmaceutical data because companies 26 | steer away from creating molecules that have undesirable characteristics. 27 | Therefore, well-behaved molecules often greatly outnumber undesirable 28 | molecules. The predictors consisted of measurements from 184 biological 29 | screens and 192 chemical feature predictors. The biological predictors 30 | represent activity for each screen and take values between 0 and 10 with a 31 | mode of 4. The chemical feature predictors represent counts of important 32 | sub-structures as well as measures of physical properties that are thought to 33 | be associated with hepatic injury. 34 | 35 | Columns: 36 | \itemize{ 37 | \item \code{class}: ordered factor (levels: 'none', 'mild', and 'severe') 38 | \item \code{bio_assay_001} - \code{bio_assay_184}: numeric 39 | \item \code{chem_fp_001} - \code{chem_fp_192}: numeric 40 | } 41 | } 42 | \examples{ 43 | data(hepatic_injury_qsar) 44 | str(hepatic_injury_qsar) 45 | 46 | } 47 | -------------------------------------------------------------------------------- /man/hotel_rates.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hotel_rates.R 3 | \docType{data} 4 | \name{hotel_rates} 5 | \alias{hotel_rates} 6 | \title{Daily Hotel Rate Data} 7 | \source{ 8 | \url{https://github.com/rfordatascience/tidytuesday/tree/master/data/2020/2020-02-11} 9 | } 10 | \description{ 11 | A data set to predict the average daily rate for a hotel in Lisbon, Portugal. 12 | } 13 | \details{ 14 | Data are originally described in Antonio, de Almeida, and Nunes (2019).
15 | This version of the data is filtered for one hotel (the "Resort Hotel") and 16 | is intended as a regression data set for predicting the average daily rate for 17 | a room. The data are post-2016; the 2016 data were used to create a predictor 18 | for the historical daily rates. See the \code{hotel_rates.R} file in the 19 | \code{data-raw} directory of the package to understand other filters used when 20 | creating this version of the data. 21 | 22 | The \code{agent} and \code{company} fields were changed from random characters to use 23 | a set of random names. 24 | 25 | The outcome column is \code{avg_price_per_room}. 26 | \subsection{License}{ 27 | 28 | No license was given for the data; see the reference below for source. 29 | } 30 | } 31 | \examples{ 32 | \dontrun{ 33 | str(hotel_rates) 34 | } 35 | } 36 | \references{ 37 | Antonio, N., de Almeida, A., and Nunes, L. (2019). Hotel booking demand 38 | datasets. \emph{Data in Brief}, 22, 41-49. 39 | } 40 | \keyword{datasets} 41 | -------------------------------------------------------------------------------- /man/hpc_cv.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hpc_cv.R 3 | \docType{data} 4 | \name{hpc_cv} 5 | \alias{hpc_cv} 6 | \title{Class probability predictions} 7 | \source{ 8 | Kuhn, M., Johnson, K. (2013) \emph{Applied Predictive 9 | Modeling}, Springer 10 | } 11 | \value{ 12 | \item{hpc_cv}{a data frame} 13 | } 14 | \description{ 15 | Class probability predictions 16 | } 17 | \details{ 18 | This data frame contains the predicted classes and 19 | class probabilities for a linear discriminant analysis model fit 20 | to the HPC data set from Kuhn and Johnson (2013). These data are 21 | the assessment sets from a 10-fold cross-validation scheme. The 22 | data contain columns for the true class (\code{obs}), the class 23 | prediction (\code{pred}), and columns for each class probability 24 | (columns \code{VF}, \code{F}, \code{M}, and \code{L}). Additionally, a column for 25 | the resample indicator is included. 26 | } 27 | \examples{ 28 | data(hpc_cv) 29 | str(hpc_cv) 30 | } 31 | \keyword{datasets} 32 | -------------------------------------------------------------------------------- /man/hpc_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hpc_data.R 3 | \docType{data} 4 | \name{hpc_data} 5 | \alias{hpc_data} 6 | \title{High-performance computing system data} 7 | \source{ 8 | Kuhn, M., Johnson, K. (2013) \emph{Applied Predictive Modeling}, Springer. 9 | } 10 | \value{ 11 | \item{hpc_data}{a tibble} 12 | } 13 | \description{ 14 | Kuhn and Johnson (2013) describe a data set where characteristics of unix 15 | jobs were used to classify their completion times as either very fast 16 | (1 min or less, \code{VF}), fast (1–5 min, \code{F}), moderate (5–30 min, \code{M}), or 17 | long (greater than 30 min, \code{L}).
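A quick sketch of inspecting the four-way class balance (assuming the
outcome column is named \code{class}):

\preformatted{data(hpc_data, package = "modeldata")
table(hpc_data$class)  # counts for VF, F, M, and L
}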
18 | } 19 | \examples{ 20 | 21 | data(hpc_data) 22 | str(hpc_data) 23 | } 24 | \keyword{datasets} 25 | -------------------------------------------------------------------------------- /man/ischemic_stroke.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ischemic_stroke.R 3 | \docType{data} 4 | \name{ischemic_stroke} 5 | \alias{ischemic_stroke} 6 | \title{Clinical data used to predict ischemic stroke} 7 | \source{ 8 | Kuhn, Max, and Kjell Johnson. \emph{Feature Engineering and Selection: A Practical 9 | Approach for Predictive Models}. Chapman and Hall/CRC, 2019. 10 | } 11 | \value{ 12 | \item{ischemic_stroke}{a tibble} 13 | } 14 | \description{ 15 | A data set to predict a binary outcome using imaging and patient data. 16 | } 17 | \details{ 18 | These data were gathered to predict patient risk for ischemic stroke. A 19 | historical set of patients with a range of carotid artery blockages were 20 | selected. The data consisted of 126 patients, 44 of which had blockages 21 | greater than 70\%. All patients had undergone Computed Tomography Angiography 22 | (CTA) to generate a detailed three-dimensional visualization and 23 | characterization of the blockage. These images were then analyzed in order to 24 | compute several features related to the disease, including: percent stenosis, 25 | arterial wall thickness, and tissue characteristics such as lipid-rich 26 | necrotic core and calcification. 27 | 28 | The group of patients in this study also had follow-up information on 29 | whether or not a stroke occurred at a subsequent point in time. The data for 30 | each patient also included commonly collected clinical characteristics for 31 | risk of stroke such as whether or not the patient had atrial fibrillation, 32 | coronary artery disease, and a history of smoking. Demographics of gender and 33 | age were included as well. These readily available risk factors can be 34 | thought of as another potentially useful predictor set that can be evaluated. 35 | In fact, this set of predictors should be evaluated first to assess their 36 | ability to predict stroke since these predictors are easy to collect, are 37 | acquired at patient presentation, and do not require an expensive imaging 38 | technique. 
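A hedged sketch of that "clinical predictors first" screen (tidymodels; the
predictor names are taken from the column list below):

\preformatted{library(tidymodels)
data(ischemic_stroke, package = "modeldata")
clinical_rec <- recipe(
  stroke ~ age + male + smoking_history + atrial_fibrillation +
    coronary_artery_disease + diabetes_history +
    hypercholesterolemia_history + hypertension_history,
  data = ischemic_stroke
)
}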
39 | 40 | Columns: 41 | \itemize{ 42 | \item \code{stroke}: factor (levels: 'yes' and 'no') 43 | \item \code{nascet_scale}: numeric 44 | \item \code{calc_vol}: numeric 45 | \item \code{calc_vol_prop}: numeric 46 | \item \code{matx_vol}: numeric 47 | \item \code{matx_vol_prop}: numeric 48 | \item \code{lrnc_vol}: numeric 49 | \item \code{lrnc_vol_prop}: numeric 50 | \item \code{max_calc_area}: numeric 51 | \item \code{max_calc_area_prop}: numeric 52 | \item \code{max_dilation_by_area}: numeric 53 | \item \code{max_matx_area}: numeric 54 | \item \code{max_matx_area_prop}: numeric 55 | \item \code{max_lrnc_area}: numeric 56 | \item \code{max_lrnc_area_prop}: numeric 57 | \item \code{max_max_wall_thickness}: numeric 58 | \item \code{max_remodeling_ratio}: numeric 59 | \item \code{max_stenosis_by_area}: numeric 60 | \item \code{max_wall_area}: numeric 61 | \item \code{wall_vol}: numeric 62 | \item \code{max_stenosis_by_diameter}: numeric 63 | \item \code{age}: integer 64 | \item \code{male}: integer 65 | \item \code{smoking_history}: integer 66 | \item \code{atrial_fibrillation}: integer 67 | \item \code{coronary_artery_disease}: integer 68 | \item \code{diabetes_history}: integer 69 | \item \code{hypercholesterolemia_history}: integer 70 | \item \code{hypertension_history}: integer 71 | } 72 | } 73 | \examples{ 74 | data(ischemic_stroke) 75 | str(ischemic_stroke) 76 | 77 | } 78 | -------------------------------------------------------------------------------- /man/leaf_id_flavia.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/leaf_id_flavia.R 3 | \docType{data} 4 | \name{leaf_id_flavia} 5 | \alias{leaf_id_flavia} 6 | \title{Leaf identification data (Flavia)} 7 | \source{ 8 | Lakshika, Jayani PG, and Thiyanga S. Talagala. "Computer-aided interpretable 9 | features for leaf image classification." \emph{arXiv preprint} arXiv:2106.08077 10 | (2021). 11 | 12 | \url{https://github.com/SMART-Research/leaffeatures_paper} 13 | } 14 | \value{ 15 | \item{leaf_id_flavia}{a data frame} 16 | } 17 | \description{ 18 | Image analysis of leaves to predict species. 19 | } 20 | \details{ 21 | From the original manuscript: "The Flavia dataset contains 1907 leaf images. 22 | There are 32 different species and each has 50-77 images. Scanners and 23 | digital cameras are used to acquire the leaf images on a plain background. 24 | The isolated leaf images contain blades only, without a petiole. These leaf 25 | images are collected from the most common plants in Yangtze Delta, 26 | China. Those leaves were sampled on the campus of the Nanjing University and 27 | the Sun Yat-Sen arboretum, Nanking, China." 28 | 29 | The reference below has detailed information on the features used for 30 | prediction.
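A small sketch of the prediction target (the \code{species} factor listed
below):

\preformatted{data(leaf_id_flavia, package = "modeldata")
nlevels(leaf_id_flavia$species)  # 32 species
}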
31 | 32 | Columns: 33 | \itemize{ 34 | \item \code{species}: factor (32 levels) 35 | \item \code{apex}: factor (9 levels) 36 | \item \code{base}: factor (6 levels) 37 | \item \code{shape}: factor (5 levels) 38 | \item \code{denate_edge}: factor (levels: 'no' and 'yes') 39 | \item \code{lobed_edge}: factor (levels: 'no' and 'yes') 40 | \item \code{smooth_edge}: factor (levels: 'no' and 'yes') 41 | \item \code{toothed_edge}: factor (levels: 'no' and 'yes') 42 | \item \code{undulate_edge}: factor (levels: 'no' and 'yes') 43 | \item \code{outlying_polar}: numeric 44 | \item \code{skewed_polar}: numeric 45 | \item \code{clumpy_polar}: numeric 46 | \item \code{sparse_polar}: numeric 47 | \item \code{striated_polar}: numeric 48 | \item \code{convex_polar}: numeric 49 | \item \code{skinny_polar}: numeric 50 | \item \code{stringy_polar}: numeric 51 | \item \code{monotonic_polar}: numeric 52 | \item \code{outlying_contour}: numeric 53 | \item \code{skewed_contour}: numeric 54 | \item \code{clumpy_contour}: numeric 55 | \item \code{sparse_contour}: numeric 56 | \item \code{striated_contour}: numeric 57 | \item \code{convex_contour}: numeric 58 | \item \code{skinny_contour}: numeric 59 | \item \code{stringy_contour}: numeric 60 | \item \code{monotonic_contour}: numeric 61 | \item \code{num_max_ponits}: numeric 62 | \item \code{num_min_points}: numeric 63 | \item \code{diameter}: numeric 64 | \item \code{area}: numeric 65 | \item \code{perimeter}: numeric 66 | \item \code{physiological_length}: numeric 67 | \item \code{physiological_width}: numeric 68 | \item \code{aspect_ratio}: numeric 69 | \item \code{rectangularity}: numeric 70 | \item \code{circularity}: numeric 71 | \item \code{compactness}: numeric 72 | \item \code{narrow_factor}: numeric 73 | \item \code{perimeter_ratio_diameter}: numeric 74 | \item \code{perimeter_ratio_length}: numeric 75 | \item \code{perimeter_ratio_lw}: numeric 76 | \item \code{num_convex_points}: numeric 77 | \item \code{perimeter_convexity}: numeric 78 | \item \code{area_convexity}: numeric 79 | \item \code{area_ratio_convexity}: numeric 80 | \item \code{equivalent_diameter}: numeric 81 | \item \code{eccentriciry}: numeric 82 | \item \code{contrast}: numeric 83 | \item \code{correlation_texture}: numeric 84 | \item \code{inverse_difference_moments}: numeric 85 | \item \code{entropy}: numeric 86 | \item \code{mean_red_val}: numeric 87 | \item \code{mean_green_val}: numeric 88 | \item \code{mean_blue_val}: numeric 89 | \item \code{std_red_val}: numeric 90 | \item \code{std_green_val}: numeric 91 | \item \code{std_blue_val}: numeric 92 | \item \code{correlation}: numeric 93 | } 94 | } 95 | \examples{ 96 | data(leaf_id_flavia) 97 | str(leaf_id_flavia) 98 | 99 | } 100 | -------------------------------------------------------------------------------- /man/lending_club.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lending_club.R 3 | \docType{data} 4 | \name{lending_club} 5 | \alias{lending_club} 6 | \title{Loan data} 7 | \source{ 8 | Lending Club Statistics https://www.lendingclub.com/info/download-data.action 9 | } 10 | \value{ 11 | \item{lending_club}{a data frame} 12 | } 13 | \description{ 14 | Loan data 15 | } 16 | \details{ 17 | These data were downloaded from the Lending Club 18 | access site (see below) and are from the first quarter of 2016. 19 | A subset of the rows and variables are included here. 
The 20 | outcome is in the variable \code{Class} and is either "good" (meaning 21 | that the loan was fully paid back or currently on-time) or "bad" 22 | (charged off, defaulted, or 21-120 days late). A data dictionary 23 | can be found on the source website. 24 | } 25 | \examples{ 26 | data(lending_club) 27 | str(lending_club) 28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /man/meats.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/meats.R 3 | \docType{data} 4 | \name{meats} 5 | \alias{meats} 6 | \title{Fat, water and protein content of meat samples} 7 | \value{ 8 | \item{meats}{a tibble} 9 | } 10 | \description{ 11 | "These data are recorded on a Tecator Infratec Food and Feed Analyzer 12 | working in the wavelength range 850 - 1050 nm by the Near Infrared 13 | Transmission (NIT) principle. Each sample contains finely chopped pure meat 14 | with different moisture, fat and protein contents. 15 | } 16 | \details{ 17 | If results from these data are used in a publication we want you to mention 18 | the instrument and company name (Tecator) in the publication. In addition, 19 | please send a preprint of your article to: 20 | 21 | Karin Thente, Tecator AB, Box 70, S-263 21 Hoganas, Sweden 22 | 23 | The data are available in the public domain with no responsibility from the 24 | original data source. The data can be redistributed as long as this 25 | permission note is attached." 26 | 27 | "For each meat sample the data consists of a 100 channel spectrum of 28 | absorbances and the contents of moisture (water), fat and protein. The 29 | absorbance is -log10 of the transmittance measured by the spectrometer. The 30 | three contents, measured in percent, are determined by analytic chemistry." 31 | 32 | Included here are the training, monitoring and test sets. 33 | } 34 | \examples{ 35 | 36 | data(meats) 37 | str(meats) 38 | } 39 | \keyword{datasets} 40 | -------------------------------------------------------------------------------- /man/mlc_churn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/churn.R 3 | \docType{data} 4 | \name{mlc_churn} 5 | \alias{mlc_churn} 6 | \title{Customer churn data} 7 | \source{ 8 | Originally at \verb{http://www.sgi.com/tech/mlc/} 9 | } 10 | \value{ 11 | \item{mlc_churn}{a tibble} 12 | } 13 | \description{ 14 | A data set from the MLC++ machine learning software for modeling customer 15 | churn. There are 19 predictors, mostly numeric: \code{state} (categorical), 16 | \code{account_length}, \code{area_code}, \code{international_plan} (yes/no), 17 | \code{voice_mail_plan} (yes/no), \code{number_vmail_messages}, 18 | \code{total_day_minutes}, \code{total_day_calls}, \code{total_day_charge}, 19 | \code{total_eve_minutes}, \code{total_eve_calls}, \code{total_eve_charge}, 20 | \code{total_night_minutes}, \code{total_night_calls}, 21 | \code{total_night_charge}, \code{total_intl_minutes}, 22 | \code{total_intl_calls}, \code{total_intl_charge}, and 23 | \code{number_customer_service_calls}. 24 | } 25 | \details{ 26 | The outcome is contained in a column called \code{churn} (also yes/no). 27 | A note in one of the source files states that the data are "artificial based 28 | on claims similar to real world".
29 | } 30 | \examples{ 31 | data(mlc_churn) 32 | str(mlc_churn) 33 | } 34 | \keyword{datasets} 35 | -------------------------------------------------------------------------------- /man/modeldata-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeldata-package.R 3 | \docType{package} 4 | \name{modeldata-package} 5 | \alias{modeldata} 6 | \alias{modeldata-package} 7 | \title{modeldata: Data Sets Useful for Modeling Examples} 8 | \description{ 9 | Data sets used for demonstrating or testing model-related packages are contained in this package. 10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://modeldata.tidymodels.org} 15 | \item \url{https://github.com/tidymodels/modeldata} 16 | \item Report bugs at \url{https://github.com/tidymodels/modeldata/issues} 17 | } 18 | 19 | } 20 | \author{ 21 | \strong{Maintainer}: Max Kuhn \email{max@posit.co} 22 | 23 | Other contributors: 24 | \itemize{ 25 | \item Posit Software, PBC (03wc8by49) [copyright holder, funder] 26 | } 27 | 28 | } 29 | \keyword{internal} 30 | -------------------------------------------------------------------------------- /man/oils.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/oils.R 3 | \docType{data} 4 | \name{oils} 5 | \alias{oils} 6 | \title{Fatty acid composition of commercial oils} 7 | \source{ 8 | Brodnjak-Voncina et al. (2005). Multivariate data analysis in 9 | classification of vegetable oils characterized by the content of fatty 10 | acids, \emph{Chemometrics and Intelligent Laboratory Systems}, Vol. 11 | 75:31-45. 12 | } 13 | \value{ 14 | \item{oils}{a tibble} 15 | } 16 | \description{ 17 | Fatty acid concentrations of commercial oils were measured using gas 18 | chromatography. The data are used to predict the type of oil. Note that 19 | only the known oils are in the data set. Also, the authors state that there 20 | are 95 samples of known oils. However, we count 96 in Table 1 (pgs. 33-35). 21 | } 22 | \examples{ 23 | data(oils) 24 | str(oils) 25 | } 26 | \keyword{datasets} 27 | -------------------------------------------------------------------------------- /man/parabolic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/parabolic.R 3 | \docType{data} 4 | \name{parabolic} 5 | \alias{parabolic} 6 | \title{Parabolic class boundary data} 7 | \value{ 8 | \item{parabolic}{a data frame} 9 | } 10 | \description{ 11 | Parabolic class boundary data 12 | } 13 | \details{ 14 | These data were simulated. There are two correlated predictors and 15 | two classes in the factor outcome. 16 | } 17 | \examples{ 18 | data(parabolic) 19 | str(parabolic) 20 | } 21 | \keyword{datasets} 22 | -------------------------------------------------------------------------------- /man/pathology.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pathology.R 3 | \docType{data} 4 | \name{pathology} 5 | \alias{pathology} 6 | \title{Liver pathology data} 7 | \source{ 8 | Altman, D.G., Bland, J.M. (1994) ``Diagnostic tests 1: 9 | sensitivity and specificity,'' \emph{British Medical Journal}, 10 | vol 308, 1552.
11 | } 12 | \value{ 13 | \item{pathology}{a data frame} 14 | } 15 | \description{ 16 | Liver pathology data 17 | } 18 | \details{ 19 | These data have the results of an \emph{x}-ray examination 20 | to determine whether the liver is abnormal or not (in the \code{scan} 21 | column) versus the more extensive pathology results that 22 | approximate the truth (in \code{pathology}). 23 | } 24 | \examples{ 25 | data(pathology) 26 | str(pathology) 27 | } 28 | \keyword{datasets} 29 | -------------------------------------------------------------------------------- /man/pd_speech.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pd_speech.R 3 | \docType{data} 4 | \name{pd_speech} 5 | \alias{pd_speech} 6 | \title{Parkinson's disease speech classification data set} 7 | \source{ 8 | UCI ML repository (data) https://archive.ics.uci.edu/ml/datasets/Parkinson\%27s+Disease+Classification#, 9 | 10 | Sakar et al (2019), "A comparative analysis of speech signal processing 11 | algorithms for Parkinson’s disease classification and the use of the tunable 12 | Q-factor wavelet transform", \emph{Applied Soft Computing}, V74, pg 255-263. 13 | } 14 | \value{ 15 | \item{pd_speech}{a data frame} 16 | } 17 | \description{ 18 | Parkinson's disease speech classification data set 19 | } 20 | \details{ 21 | From the UCI ML archive, the description is "The data used in this 22 | study were gathered from 188 patients with PD (107 men and 81 women) with 23 | ages ranging from 33 to 87 (65.1 p/m 10.9) at the Department of Neurology 24 | in Cerrahpaşa Faculty of Medicine, Istanbul University. The control group 25 | consists of 64 healthy individuals (23 men and 41 women) with ages varying 26 | between 41 and 82 (61.1 p/m 8.9). During the data collection process, 27 | the microphone is set to 44.1 KHz and following the physician's examination, 28 | the sustained phonation of the vowel \verb{/a/} was collected from each subject 29 | with three repetitions." 30 | 31 | The data here are averaged over the replicates. 32 | } 33 | \examples{ 34 | data(pd_speech) 35 | str(pd_speech) 36 | } 37 | \keyword{datasets} 38 | -------------------------------------------------------------------------------- /man/penguins.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/penguins.R 3 | \docType{data} 4 | \name{penguins} 5 | \alias{penguins} 6 | \title{Palmer Station penguin data} 7 | \source{ 8 | Gorman KB, Williams TD, Fraser WR (2014) Ecological Sexual Dimorphism 9 | and Environmental Variability within a Community of Antarctic Penguins 10 | (\emph{Genus Pygoscelis}). PLoS ONE 9(3): e90081. 11 | \doi{10.1371/journal.pone.0090081} 12 | 13 | \url{https://github.com/allisonhorst/palmerpenguins} 14 | } 15 | \value{ 16 | \item{penguins}{a tibble} 17 | } 18 | \description{ 19 | A data set from Gorman, Williams, and Fraser (2014) containing measurements 20 | from different types of penguins. This version of the data was retrieved from 21 | Allison Horst's \code{palmerpenguins} package on 2020-06-22.
22 | } 23 | \examples{ 24 | data(penguins) 25 | str(penguins) 26 | } 27 | \keyword{datasets} 28 | -------------------------------------------------------------------------------- /man/permeability_qsar.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/permeability_qsar.R 3 | \docType{data} 4 | \name{permeability_qsar} 5 | \alias{permeability_qsar} 6 | \title{Predicting permeability from chemical information} 7 | \source{ 8 | Kuhn, Max, and Kjell Johnson. \emph{Applied predictive modeling}. New York: 9 | Springer, 2013. 10 | } 11 | \value{ 12 | \item{permeability_qsar}{a data frame} 13 | } 14 | \description{ 15 | A quantitative structure-activity relationship (QSAR) data set to predict 16 | when a molecule can permeate cells. 17 | } 18 | \details{ 19 | This pharmaceutical data set was used to develop a model for predicting 20 | compounds' permeability. In short, permeability is the measure of a 21 | molecule's ability to cross a membrane. The body, for example, has notable 22 | membranes between the body and brain, known as the blood-brain barrier, and 23 | between the gut and body in the intestines. These membranes help the body 24 | guard critical regions from receiving undesirable or detrimental substances. 25 | For an orally taken drug to be effective in the brain, it first must pass 26 | through the intestinal wall and then must pass through the blood-brain 27 | barrier in order to be present for the desired neurological target. 28 | Therefore, a compound's ability to permeate relevant biological membranes 29 | is critically important to understand early in the drug discovery process. 30 | Compounds that appear to be effective for a particular disease in research 31 | screening experiments, but appear to be poorly permeable, may need to be 32 | altered in order to improve permeability, and thus the compound's ability to 33 | reach the desired target. Identifying permeability problems can help guide 34 | chemists towards better molecules. 35 | 36 | Permeability assays such as PAMPA and Caco-2 have been developed to help 37 | measure compounds' permeability (Kansy et al, 1998). These screens are 38 | effective at quantifying a compound's permeability, but the assay is 39 | expensive and labor intensive. Given a sufficient number of compounds that have 40 | been screened, we could develop a predictive model for permeability in an 41 | attempt to potentially reduce the need for the assay. In this project there 42 | were 165 unique compounds; 1107 molecular fingerprints were determined for 43 | each. A molecular fingerprint is a binary sequence of numbers that 44 | represents the presence or absence of a specific molecular sub-structure. 45 | The response is highly skewed, the predictors are sparse (15.5\% are present), 46 | and many predictors are strongly associated.
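A quick sketch that checks those two properties, assuming the column layout listed below (the \code{permeability} outcome followed by the binary fingerprint columns):

```r
# A sketch: verify the sparsity and skewness described above
data(permeability_qsar, package = "modeldata")
mean(as.matrix(permeability_qsar[, -1]))  # proportion of fingerprint bits present (about 0.155)
hist(permeability_qsar$permeability)      # the response is highly skewed
```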
47 | 48 | Columns: 49 | \itemize{ 50 | \item \code{permeability}: numeric 51 | \item \code{chem_fp_0001} - \code{chem_fp_1107}: numeric 52 | } 53 | } 54 | \examples{ 55 | data(permeability_qsar) 56 | str(permeability_qsar) 57 | 58 | } 59 | -------------------------------------------------------------------------------- /man/rmd/ames.md: -------------------------------------------------------------------------------- 1 | For these data, the training materials typically use: 2 | 3 | ```r 4 | library(tidymodels) 5 | 6 | set.seed(4595) 7 | data_split <- initial_split(ames, strata = "Sale_Price") 8 | ames_train <- training(data_split) 9 | ames_test <- testing(data_split) 10 | 11 | set.seed(2453) 12 | ames_folds <- vfold_cv(ames_train) 13 | ``` 14 | 15 | -------------------------------------------------------------------------------- /man/scat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scat.R 3 | \docType{data} 4 | \name{scat} 5 | \alias{scat} 6 | \title{Morphometric data on scat} 7 | \source{ 8 | Reid, R. E. B. (2015). A morphometric modeling approach to 9 | distinguishing among bobcat, coyote and gray fox scats. \emph{Wildlife 10 | Biology}, 21(5), 254-262 11 | } 12 | \value{ 13 | \item{scat}{a tibble} 14 | } 15 | \description{ 16 | Reid (2015) collected data on animal feces in coastal California. The data 17 | consist of DNA verified species designations as well as fields related to 18 | the time and place of the collection and the scat itself. The data are on 19 | the three main species. 20 | } 21 | \examples{ 22 | data(scat) 23 | str(scat) 24 | } 25 | \keyword{datasets} 26 | -------------------------------------------------------------------------------- /man/sim_classification.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/simulations.R 3 | \name{sim_classification} 4 | \alias{sim_classification} 5 | \alias{sim_regression} 6 | \alias{sim_noise} 7 | \alias{sim_logistic} 8 | \alias{sim_multinomial} 9 | \title{Simulate datasets} 10 | \usage{ 11 | sim_classification( 12 | num_samples = 100, 13 | method = "caret", 14 | intercept = -5, 15 | num_linear = 10, 16 | keep_truth = FALSE 17 | ) 18 | 19 | sim_regression( 20 | num_samples = 100, 21 | method = "sapp_2014_1", 22 | std_dev = NULL, 23 | factors = FALSE, 24 | keep_truth = FALSE 25 | ) 26 | 27 | sim_noise( 28 | num_samples, 29 | num_vars, 30 | cov_type = "exchangeable", 31 | outcome = "none", 32 | num_classes = 2, 33 | cov_param = 0 34 | ) 35 | 36 | sim_logistic(num_samples, eqn, correlation = 0, keep_truth = FALSE) 37 | 38 | sim_multinomial( 39 | num_samples, 40 | eqn_1, 41 | eqn_2, 42 | eqn_3, 43 | correlation = 0, 44 | keep_truth = FALSE 45 | ) 46 | } 47 | \arguments{ 48 | \item{num_samples}{Number of data points to simulate.} 49 | 50 | \item{method}{A character string for the simulation method. For 51 | classification, the single current option is "caret". For regression, 52 | values can be \code{"sapp_2014_1"}, \code{"sapp_2014_2"}, \code{"van_der_laan_2007_1"}, 53 | \code{"van_der_laan_2007_2"}, \code{"hooker_2004"}, or \code{"worley_1987"}.
See Details 54 | below.} 55 | 56 | \item{intercept}{The intercept for the linear predictor.} 57 | 58 | \item{num_linear}{Number of diminishing linear effects.} 59 | 60 | \item{keep_truth}{A logical: should the true outcome value be retained for 61 | the data? If so, the column name is \code{.truth}.} 62 | 63 | \item{std_dev}{Gaussian distribution standard deviation for residuals. 64 | Default values are shown below in Details.} 65 | 66 | \item{factors}{A single logical for whether the binary indicators should be 67 | encoded as factors or not.} 68 | 69 | \item{num_vars}{Number of noise predictors to create.} 70 | 71 | \item{cov_type}{The multivariate normal correlation structure of the 72 | predictors. Possible values are "exchangeable" and "toeplitz".} 73 | 74 | \item{outcome}{A single character string for what type of independent outcome 75 | should be simulated (if any). The default value of "none" produces no extra 76 | columns. Using "classification" will generate a \code{class} column with 77 | \code{num_classes} values, equally distributed. A value of "regression" results 78 | in an \code{outcome} column that contains independent standard normal values.} 79 | 80 | \item{num_classes}{When \code{outcome = "classification"}, the number of classes 81 | to simulate.} 82 | 83 | \item{cov_param}{A single numeric value for the exchangeable correlation 84 | value or the base of the Toeplitz structure. See Details below.} 85 | 86 | \item{eqn, eqn_1, eqn_2, eqn_3}{An R expression or (one-sided) formula that 87 | only involves variables \code{A} and \code{B}, used to compute the linear 88 | predictor. External objects should not be used as symbols; see the examples 89 | below on how to use external objects in the equations.} 90 | 91 | \item{correlation}{A single numeric value for the correlation between variables 92 | \code{A} and \code{B}.} 93 | } 94 | \description{ 95 | These functions can be used to generate simulated data for supervised 96 | (classification and regression) and unsupervised modeling applications. 97 | } 98 | \details{ 99 | \subsection{Specific Regression and Classification methods}{ 100 | 101 | These functions provide several supervised simulation methods (and one 102 | unsupervised). Each \code{method} is described below: 103 | \subsection{\code{method = "caret"}}{ 104 | 105 | This is a simulated classification problem with two classes, originally 106 | implemented in \code{\link[caret:twoClassSim]{caret::twoClassSim()}} with all numeric predictors. The 107 | predictors are simulated in different sets. First, two multivariate normal 108 | predictors (denoted here as \code{two_factor_1} and \code{two_factor_2}) are created 109 | with a correlation of about 0.65. They change the log-odds using main 110 | effects and an interaction: 111 | 112 | \preformatted{ intercept - 4 * two_factor_1 + 4 * two_factor_2 + 2 * two_factor_1 * two_factor_2 } 113 | 114 | The intercept is a parameter for the simulation and can be used to control 115 | the amount of class imbalance. 116 | 117 | The second set of effects is linear with coefficients that alternate signs 118 | and have a sequence of values between 2.5 and 0.25. For example, if there 119 | were four predictors in this set, their contribution to the log-odds would 120 | be 121 | 122 | \preformatted{ -2.5 * linear_1 + 1.75 * linear_2 - 1.00 * linear_3 + 0.25 * linear_4} 123 | 124 | (Note that these column names may change based on the value of \code{num_linear}).
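As a short illustration, the alternating, diminishing coefficient sequence above can be reproduced directly; this is a sketch only (the simulation computes the coefficients internally):

```r
# Illustrative: the signs alternate and the magnitudes run from 2.5 down to 0.25
num_linear <- 4
coefs <- seq(2.5, 0.25, length.out = num_linear) *
  rep(c(-1, 1), length.out = num_linear)
coefs
#> [1] -2.50  1.75 -1.00  0.25
```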
125 | 126 | The third set is a nonlinear function of a single predictor ranging between 127 | \verb{[0, 1]} called \code{non_linear_1} here: 128 | 129 | \preformatted{ (non_linear_1^3) + 2 * exp(-6 * (non_linear_1 - 0.3)^2) } 130 | 131 | The fourth set of informative predictors is copied from one of Friedman's 132 | systems and uses two more predictors (\code{non_linear_2} and \code{non_linear_3}): 133 | 134 | \preformatted{ 2 * sin(non_linear_2 * non_linear_3) } 135 | 136 | All of these effects are added up to model the log-odds. 137 | } 138 | 139 | \subsection{\code{method = "sapp_2014_1"}}{ 140 | 141 | This regression simulation is from Sapp et al. (2014). There are 20 142 | independent Gaussian random predictors with mean zero and a variance of 9. 143 | The prediction equation is: 144 | 145 | \preformatted{ 146 | predictor_01 + sin(predictor_02) + log(abs(predictor_03)) + 147 | predictor_04^2 + predictor_05 * predictor_06 + 148 | ifelse(predictor_07 * predictor_08 * predictor_09 < 0, 1, 0) + 149 | ifelse(predictor_10 > 0, 1, 0) + predictor_11 * ifelse(predictor_11 > 0, 1, 0) + 150 | sqrt(abs(predictor_12)) + cos(predictor_13) + 2 * predictor_14 + abs(predictor_15) + 151 | ifelse(predictor_16 < -1, 1, 0) + predictor_17 * ifelse(predictor_17 < -1, 1, 0) - 152 | 2 * predictor_18 - predictor_19 * predictor_20 153 | } 154 | 155 | The error is Gaussian with mean zero and variance 9. 156 | } 157 | 158 | \subsection{\code{method = "sapp_2014_2"}}{ 159 | 160 | This regression simulation is also from Sapp et al. (2014). There are 200 161 | independent Gaussian predictors with mean zero and variance 16. The 162 | prediction equation has an intercept of one and identical linear effects of 163 | \code{log(abs(predictor))}. 164 | 165 | The error is Gaussian with mean zero and variance 25. 166 | } 167 | 168 | \subsection{\code{method = "van_der_laan_2007_1"}}{ 169 | 170 | This is a regression simulation from van der Laan et al. (2007) with ten 171 | random Bernoulli variables that have a 40\% probability of being a value of 172 | one. The true regression equation is: 173 | 174 | \preformatted{ 175 | 2 * predictor_01 * predictor_10 + 4 * predictor_02 * predictor_07 + 176 | 3 * predictor_04 * predictor_05 - 5 * predictor_06 * predictor_10 + 177 | 3 * predictor_08 * predictor_09 + predictor_01 * predictor_02 * predictor_04 - 178 | 2 * predictor_07 * (1 - predictor_06) * predictor_02 * predictor_09 - 179 | 4 * (1 - predictor_10) * predictor_01 * (1 - predictor_04) 180 | } 181 | 182 | The error term is standard normal. 183 | } 184 | 185 | \subsection{\code{method = "van_der_laan_2007_2"}}{ 186 | 187 | This is another regression simulation from van der Laan et al. (2007) with 188 | twenty Gaussian predictors with mean zero and variance 16. The prediction equation is: 189 | 190 | \preformatted{ 191 | predictor_01 * predictor_02 + predictor_10^2 - predictor_03 * predictor_17 - 192 | predictor_15 * predictor_04 + predictor_09 * predictor_05 + predictor_19 - 193 | predictor_20^2 + predictor_09 * predictor_08 194 | } 195 | 196 | The error term is also Gaussian with mean zero and variance 16.
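A brief usage sketch for this method (the seed and sample size here are arbitrary):

```r
# A sketch: simulate from the van der Laan et al. (2007) system described above
set.seed(2007)
vdl2 <- sim_regression(250, method = "van_der_laan_2007_2")
dim(vdl2)  # 250 rows; the outcome column plus 20 predictors
```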
197 | } 198 | 199 | \subsection{\code{method = "hooker_2004"}}{ 200 | 201 | Hooker (2004) and Sorokina \emph{et al} (2008) used the following: 202 | 203 | \preformatted{ 204 | pi ^ (predictor_01 * predictor_02) * sqrt(2 * predictor_03) - 205 | asin(predictor_04) + log(predictor_03 + predictor_05) - 206 | (predictor_09 / predictor_10) * sqrt(predictor_07 / predictor_08) - 207 | predictor_02 * predictor_07 208 | } 209 | 210 | Predictors 1, 2, 3, 6, 7, and 9 are standard uniform while the others are 211 | uniform on \verb{[0.6, 1.0]}. The errors are normal with mean zero and default 212 | standard deviation of 0.25. 213 | } 214 | 215 | \subsection{\code{method = "worley_1987"}}{ 216 | 217 | The simulation system from Worley (1987) is based on a mechanistic model for 218 | the flow rate of liquids from two aquifers positioned vertically (i.e., 219 | the "upper" and "lower" aquifers). There are two sets of predictors: 220 | \itemize{ 221 | \item the borehole radius (\code{radius_borehole} from 0.05 to 0.15) and length 222 | (\code{length_borehole} from 1,120 to 1,680). 223 | \item the radius of effect that the system has on collecting water 224 | (\code{radius_influence} from 100 to 50,000) 225 | } 226 | 227 | and physical properties: 228 | \itemize{ 229 | \item \code{transmissibility_upper_aq} 230 | \item \code{potentiometric_upper_aq} 231 | \item \code{transmissibility_lower_aq} 232 | \item \code{potentiometric_lower_aq} 233 | \item \code{conductivity_borehole} 234 | } 235 | 236 | A multiplicative error structure is used; the mechanistic equation is 237 | multiplied by an exponentiated Gaussian random error. 238 | 239 | The references give feasible ranges for each of these variables. See also 240 | Morris \emph{et al} (1993). 241 | } 242 | 243 | } 244 | 245 | \subsection{\code{sim_noise()}}{ 246 | 247 | This function simulates a number of random normal variables with mean zero. 248 | The values can be independent if \code{cov_param = 0}. Otherwise the values are 249 | multivariate normal with non-diagonal covariance matrices. For 250 | \code{cov_type = "exchangeable"}, the structure has unit variances and covariances 251 | of \code{cov_param}. With \code{cov_type = "toeplitz"}, the covariances have an 252 | exponential pattern that decays as the distance between predictor indices grows. 253 | } 254 | 255 | \subsection{Logistic simulation}{ 256 | 257 | \code{sim_logistic()} provides a flexible interface to simulating a logistic 258 | regression model with two multivariate normal variables \code{A} and \code{B} (with 259 | zero mean, unit variances and correlation determined by the \code{correlation} 260 | argument). 261 | 262 | For example, using \code{eqn = A + B} would specify that the true probability of 263 | the event was 264 | 265 | \preformatted{ 266 | prob = 1 / (1 + exp(-(A + B))) 267 | } 268 | 269 | The class levels for the outcome column are \code{"one"} and \code{"two"}. 270 | } 271 | 272 | \subsection{Multinomial simulation}{ 273 | 274 | \code{sim_multinomial()} can generate data with classes \code{"one"}, \code{"two"}, and 275 | \code{"three"} based on the values in arguments \code{eqn_1}, \code{eqn_2}, and \code{eqn_3}, 276 | respectively. Like \code{\link[=sim_logistic]{sim_logistic()}}, these equations use predictors \code{A} and 277 | \code{B}. 278 | 279 | The individual equations are evaluated and exponentiated. After this, their 280 | values are, for each row of data, normalized to add up to one.
These 281 | probabilities are then passed to \code{\link[stats:Multinom]{stats::rmultinom()}} to generate the outcome 282 | values. 283 | } 284 | } 285 | \examples{ 286 | set.seed(1) 287 | sim_regression(100) 288 | sim_classification(100) 289 | 290 | # Flexible logistic regression simulation 291 | if (rlang::is_installed("ggplot2")) { 292 | library(dplyr) 293 | library(ggplot2) 294 | 295 | sim_logistic(1000, ~ .1 + 2 * A - 3 * B + 1 * A * B, correlation = .7) |> 296 | ggplot(aes(A, B, col = class)) + 297 | geom_point(alpha = 1/2) + 298 | coord_equal() 299 | 300 | f_xor <- ~ 10 * xor(A > 0, B < 0) 301 | # or 302 | f_xor <- rlang::expr(10 * xor(A > 0, B < 0)) 303 | 304 | sim_logistic(1000, f_xor, keep_truth = TRUE) |> 305 | ggplot(aes(A, B, col = class)) + 306 | geom_point(alpha = 1/2) + 307 | coord_equal() + 308 | theme_bw() 309 | } 310 | 311 | ## How to use external symbols: 312 | 313 | a_coef <- 2 314 | # splice the value in using rlang's !! operator 315 | lp_eqn <- rlang::expr(!!a_coef * A + B) 316 | lp_eqn 317 | sim_logistic(5, lp_eqn) 318 | 319 | # Flexible multinomial regression simulation 320 | if (rlang::is_installed("ggplot2")) { 321 | sim_multinomial(1000, ~ A + B, ~ A - B, ~ -A + 0.5 * B) |> ggplot(aes(A, B, col = class)) + geom_point(alpha = 1/2) + coord_equal() # a sketch; the three equations are arbitrary choices 322 | } 323 | } 324 | \references{ 325 | Hooker, G. (2004, August). Discovering additive structure in black box 326 | functions. In \emph{Proceedings of the tenth ACM SIGKDD international conference 327 | on Knowledge discovery and data mining} (pp. 575-580). 328 | DOI: 10.1145/1014052.1014122 329 | 330 | Morris, M. D., Mitchell, T. J., and Ylvisaker, D. (1993). Bayesian design 331 | and analysis of computer experiments: use of derivatives in surface 332 | prediction. \emph{Technometrics}, 35(3), 243-255. 333 | 334 | Sapp, S., van der Laan, M. J., and Canny, J. (2014). Subsemble: an ensemble 335 | method for combining subset-specific algorithm fits. \emph{Journal of applied 336 | statistics}, 41(6), 1247-1259. DOI: 10.1080/02664763.2013.864263 337 | 338 | Sorokina, D., Caruana, R., Riedewald, M., and Fink, D. (2008, July). Detecting 339 | statistical interactions with additive groves of trees. In \emph{Proceedings of 340 | the 25th international conference on Machine learning} (pp. 1000-1007). 341 | DOI: 10.1145/1390156.1390282 342 | 343 | Van der Laan, M. J., Polley, E. C., and Hubbard, A. E. (2007). Super learner. 344 | \emph{Statistical applications in genetics and molecular biology}, 6(1). 345 | DOI: 10.2202/1544-6115.1309. 346 | 347 | Worley, B. A. (1987). Deterministic uncertainty analysis (No. ORNL-6428). Oak 348 | Ridge National Lab. (ORNL), Oak Ridge, TN. 349 | } 350 | -------------------------------------------------------------------------------- /man/small_fine_foods.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fine_foods.R 3 | \docType{data} 4 | \name{small_fine_foods} 5 | \alias{small_fine_foods} 6 | \alias{training_data} 7 | \alias{testing_data} 8 | \title{Fine foods example data} 9 | \source{ 10 | https://snap.stanford.edu/data/web-FineFoods.html 11 | } 12 | \value{ 13 | \item{training_data,testing_data}{tibbles} 14 | } 15 | \description{ 16 | Fine foods example data 17 | } 18 | \details{ 19 | These data are from Amazon, who describe it as "This dataset consists of 20 | reviews of fine foods from amazon. The data span a period of more than 10 21 | years, including all ~500,000 reviews up to October 2012. Reviews include 22 | product and user information, ratings, and a plaintext review."
23 | 24 | A subset of the data are contained here and are split into a training and 25 | test set. The training set sampled 10 products and retained all of their 26 | individual reviews. Since the reviews within these products are correlated, 27 | we recommend resampling the data using a leave-one-product-out approach. The 28 | test set sampled 500 products that were not included in the training set 29 | and selected a single review at random for each. 30 | 31 | There is a column for the product, a column for the text of the review, and 32 | a factor column for a class variable. The outcome is whether the reviewer 33 | gave the product a 5-star rating or not. 34 | } 35 | \examples{ 36 | data(small_fine_foods) 37 | str(training_data) 38 | str(testing_data) 39 | } 40 | \keyword{datasets} 41 | -------------------------------------------------------------------------------- /man/solubility_test.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/solubility.R 3 | \docType{data} 4 | \name{solubility_test} 5 | \alias{solubility_test} 6 | \title{Solubility predictions from MARS model} 7 | \source{ 8 | Kuhn, M., Johnson, K. (2013) \emph{Applied Predictive 9 | Modeling}, Springer 10 | } 11 | \value{ 12 | \item{solubility_test}{a data frame} 13 | } 14 | \description{ 15 | Solubility predictions from MARS model 16 | } 17 | \details{ 18 | For the solubility data in Kuhn and Johnson (2013), 19 | these data are the test set results for the MARS model. The 20 | observed solubility (in column \code{solubility}) and the model 21 | results (\code{prediction}) are contained in the data. 22 | } 23 | \examples{ 24 | data(solubility_test) 25 | str(solubility_test) 26 | } 27 | \keyword{datasets} 28 | -------------------------------------------------------------------------------- /man/stackoverflow.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stackoverflow.R 3 | \docType{data} 4 | \name{stackoverflow} 5 | \alias{stackoverflow} 6 | \title{Annual Stack Overflow Developer Survey Data} 7 | \source{ 8 | Julia Silge, \emph{Supervised Machine Learning Case Studies in R} 9 | 10 | \verb{https://supervised-ml-course.netlify.com/chapter2} 11 | 12 | Raw data: \verb{https://insights.stackoverflow.com/survey/} 13 | } 14 | \value{ 15 | \item{stackoverflow}{a tibble} 16 | } 17 | \description{ 18 | Annual Stack Overflow Developer Survey Data 19 | } 20 | \details{ 21 | These data are a collection of 5,594 data points collected on 22 | developers. These data could be used to try to predict who works remotely 23 | (as used in the source listed below). 24 | } 25 | \examples{ 26 | data(stackoverflow) 27 | str(stackoverflow) 28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /man/steroidogenic_toxicity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/steroidogenic_toxicity.R 3 | \docType{data} 4 | \name{steroidogenic_toxicity} 5 | \alias{steroidogenic_toxicity} 6 | \title{Predicting steroidogenic toxicity with assay data} 7 | \source{ 8 | Maglich, J. M., Kuhn, M., Chapin, R. E., & Pletcher, M. T. (2014). More than 9 | just hormones: H295R cells as predictors of reproductive toxicity. 
10 | \emph{Reproductive Toxicology}, 45, 77-86. 11 | } 12 | \value{ 13 | A tibble with columns 14 | \itemize{ 15 | \item \code{class}: factor (levels: 'toxic' and 'nontoxic') 16 | \item \code{cyp_11a1}: numeric 17 | \item \code{cyp_11b1}: numeric 18 | \item \code{cyp_11b2}: numeric 19 | \item \code{cyp_17a1}: numeric 20 | \item \code{cyp_19a1}: numeric 21 | \item \code{cyp_21a1}: numeric 22 | \item \code{hsd3b2}: numeric 23 | \item \code{star}: numeric 24 | \item \code{progesterone}: numeric 25 | \item \code{testosterone}: numeric 26 | \item \code{dhea}: numeric 27 | \item \code{cortisol}: numeric 28 | } 29 | } 30 | \description{ 31 | A set of \emph{in vitro} assays is used to quantify the risk of reproductive 32 | toxicity via the disruption of steroidogenic pathways. 33 | } 34 | \details{ 35 | H295R cells were used to measure the effect with two sets of assay results. 36 | The first includes a set of protein measurements on cytochrome P450 enzymes 37 | ("cyp"s), STAR, and 3BHSD2. The second includes hormone measurements for 38 | DHEA, progesterone, testosterone, and cortisol. 39 | 40 | Columns: 41 | \itemize{ 42 | \item \code{class}: factor (levels: 'toxic' and 'nontoxic') 43 | \item \code{cyp_11a1}: numeric 44 | \item \code{cyp_11b1}: numeric 45 | \item \code{cyp_11b2}: numeric 46 | \item \code{cyp_17a1}: numeric 47 | \item \code{cyp_19a1}: numeric 48 | \item \code{cyp_21a1}: numeric 49 | \item \code{hsd3b2}: numeric 50 | \item \code{star}: numeric 51 | \item \code{progesterone}: numeric 52 | \item \code{testosterone}: numeric 53 | \item \code{dhea}: numeric 54 | \item \code{cortisol}: numeric 55 | } 56 | } 57 | \examples{ 58 | data(steroidogenic_toxicity) 59 | str(steroidogenic_toxicity) 60 | 61 | } 62 | -------------------------------------------------------------------------------- /man/tate_text.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tate_text.R 3 | \docType{data} 4 | \name{tate_text} 5 | \alias{tate_text} 6 | \title{Tate Gallery modern artwork metadata} 7 | \source{ 8 | \itemize{ 9 | \item \url{https://github.com/tategallery/collection} 10 | \item \url{https://www.tate.org.uk/} 11 | } 12 | } 13 | \value{ 14 | \item{tate_text}{a tibble} 15 | } 16 | \description{ 17 | Metadata such as artist, title, and year created for recent artworks owned 18 | by the Tate Gallery. Only artworks created during or after 1990 are 19 | included, and the metadata source was last updated in 2014. The Tate Gallery 20 | provides these data but requests users to be respectful of their 21 | \href{https://github.com/tategallery/collection#usage-guidelines-for-open-data}{guidelines for use}. 22 | } 23 | \examples{ 24 | data(tate_text) 25 | str(tate_text) 26 | } 27 | \keyword{datasets} 28 | -------------------------------------------------------------------------------- /man/taxi.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/taxi.R 3 | \docType{data} 4 | \name{taxi} 5 | \alias{taxi} 6 | \title{Chicago taxi data set} 7 | \source{ 8 | \url{https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew} 9 | } 10 | \value{ 11 | tibble 12 | } 13 | \description{ 14 | A data set containing information on a subset of taxi trips in the city 15 | of Chicago in 2022.
16 | } 17 | \details{ 18 | The source data are originally described on the linked City of Chicago 19 | data portal. The data exported here are a pre-processed subset motivated by 20 | the modeling problem of predicting whether a rider will tip or not. 21 | 22 | \describe{ 23 | \item{tip}{Whether the rider left a tip. A factor with levels 24 | "yes" and "no".} 25 | \item{distance}{The trip distance, in odometer miles.} 26 | \item{company}{The taxi company, as a factor. Companies that occurred 27 | few times were binned as "other".} 28 | \item{local}{Whether the trip's starting and ending locations are in the 29 | same community. See the source data for community area values.} 30 | \item{dow}{The day of the week in which the trip began, as a 31 | factor.} 32 | \item{month}{The month in which the trip began, as a factor.} 33 | \item{hour}{The hour of the day in which the trip began, as a 34 | numeric.} 35 | } 36 | } 37 | \examples{ 38 | \donttest{ 39 | taxi 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /man/two_class_dat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/two_class_dat.R 3 | \docType{data} 4 | \name{two_class_dat} 5 | \alias{two_class_dat} 6 | \title{Two class data} 7 | \value{ 8 | \item{two_class_dat}{a data frame} 9 | } 10 | \description{ 11 | Two class data 12 | } 13 | \details{ 14 | These are artificial data with two predictors (\code{A} and \code{B}) and 15 | a factor outcome variable (\code{Class}). 16 | } 17 | \examples{ 18 | data(two_class_dat) 19 | str(two_class_dat) 20 | } 21 | \keyword{datasets} 22 | -------------------------------------------------------------------------------- /man/two_class_example.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/two_class_dat.R 3 | \docType{data} 4 | \name{two_class_example} 5 | \alias{two_class_example} 6 | \title{Two class predictions} 7 | \value{ 8 | \item{two_class_example}{a data frame} 9 | } 10 | \description{ 11 | Two class predictions 12 | } 13 | \details{ 14 | These data are a test set from a model built for two 15 | classes ("Class1" and "Class2"). There are columns for the true 16 | and predicted classes and columns for the probabilities of each 17 | class. 18 | } 19 | \examples{ 20 | data(two_class_example) 21 | str(two_class_example) 22 | } 23 | \keyword{datasets} 24 | -------------------------------------------------------------------------------- /man/wa_churn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wa_churn.R 3 | \docType{data} 4 | \name{wa_churn} 5 | \alias{wa_churn} 6 | \title{Watson churn data} 7 | \source{ 8 | IBM Watson Analytics https://ibm.co/2sOvyvy 9 | } 10 | \value{ 11 | \item{wa_churn}{a data frame} 12 | } 13 | \description{ 14 | Watson churn data 15 | } 16 | \details{ 17 | These data were downloaded from the IBM Watson site 18 | (see below) in September 2018. The data contain a factor for 19 | whether a customer churned or not. Alternatively, the \code{tenure} 20 | column presumably contains information on how long the customer 21 | has had an account. A survival analysis can be done on this 22 | column using the \code{churn} outcome as the censoring information, as sketched below.
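A minimal sketch of that survival analysis, assuming the \code{churn} factor uses a "Yes" level for churned customers (check \code{levels(wa_churn$churn)} first):

```r
# A sketch: Kaplan-Meier curve for account tenure, treating churn as the event.
# The "Yes" level is an assumption; confirm with levels(wa_churn$churn).
library(survival)
data(wa_churn, package = "modeldata")
km <- survfit(Surv(tenure, churn == "Yes") ~ 1, data = wa_churn)
plot(km, xlab = "tenure", ylab = "retention probability")
```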
A 23 | data dictionary can be found on the source website. 24 | } 25 | \examples{ 26 | data(wa_churn) 27 | str(wa_churn) 28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /modeldata.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(modeldata) 3 | 4 | test_check("modeldata") 5 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/simulations.md: -------------------------------------------------------------------------------- 1 | # classification simulation 2 | 3 | Code 4 | sim_classification(5, method = "potato") 5 | Condition 6 | Error in `sim_classification()`: 7 | ! `method` must be one of "caret", not "potato". 8 | 9 | # sapp_2014_1 simulation 10 | 11 | Code 12 | sim_regression(5, method = "potato") 13 | Condition 14 | Error in `sim_regression()`: 15 | ! `method` must be one of "sapp_2014_1", "sapp_2014_2", "van_der_laan_2007_1", "van_der_laan_2007_2", "hooker_2004", or "worley_1987", not "potato". 16 | 17 | # multinomial simulation 18 | 19 | i In index: 1. 20 | Caused by error in `.f()`: 21 | ! 
The model equations should only use variables/objects `A` and `B` 22 | 23 | -------------------------------------------------------------------------------- /tests/testthat/test-simulations.R: -------------------------------------------------------------------------------- 1 | test_that("classification simulation", { 2 | set.seed(1) 3 | dat_1 <- sim_classification(500, num_linear = 0) 4 | dat_2 <- sim_classification(10, num_linear = 11) 5 | dat_3 <- sim_classification(1000, num_linear = 1, intercept = 50) 6 | dat_4 <- sim_classification(500, num_linear = 0, keep_truth = TRUE) 7 | 8 | expect_equal( 9 | names(dat_1), 10 | c( 11 | "class", 12 | "two_factor_1", 13 | "two_factor_2", 14 | "non_linear_1", 15 | "non_linear_2", 16 | "non_linear_3" 17 | ) 18 | ) 19 | expect_equal( 20 | names(dat_2), 21 | c( 22 | "class", 23 | "two_factor_1", 24 | "two_factor_2", 25 | "non_linear_1", 26 | "non_linear_2", 27 | "non_linear_3", 28 | modeldata:::names0(11, "linear_") 29 | ) 30 | ) 31 | expect_equal( 32 | names(dat_3), 33 | c( 34 | "class", 35 | "two_factor_1", 36 | "two_factor_2", 37 | "non_linear_1", 38 | "non_linear_2", 39 | "non_linear_3", 40 | "linear_1" 41 | ) 42 | ) 43 | expect_equal( 44 | names(dat_4), 45 | c( 46 | "class", 47 | "two_factor_1", 48 | "two_factor_2", 49 | "non_linear_1", 50 | "non_linear_2", 51 | "non_linear_3", 52 | ".truth" 53 | ) 54 | ) 55 | expect_equal(nrow(dat_1), 500) 56 | expect_equal(nrow(dat_2), 10) 57 | expect_equal(nrow(dat_3), 1000) 58 | expect_true(all(vapply(dat_1[, -1], is.numeric, logical(1)))) 59 | 60 | expect_equal(sum(dat_3 == "class_2"), 0) 61 | expect_equal(levels(dat_3$class), paste0("class_", 1:2)) 62 | expect_snapshot( 63 | error = TRUE, 64 | sim_classification(5, method = "potato") 65 | ) 66 | }) 67 | 68 | test_that("sapp_2014_1 simulation", { 69 | set.seed(1) 70 | dat_1 <- sim_regression(10, method = "sapp_2014_1") 71 | dat_2 <- sim_regression(10, method = "sapp_2014_1", keep_truth = TRUE) 72 | expect_equal(names(dat_1), c("outcome", modeldata:::names0(20, "predictor_"))) 73 | expect_equal( 74 | names(dat_2), 75 | c("outcome", modeldata:::names0(20, "predictor_"), ".truth") 76 | ) 77 | expect_equal(nrow(dat_1), 10) 78 | expect_true(all(vapply(dat_1, is.numeric, logical(1)))) 79 | expect_snapshot( 80 | error = TRUE, 81 | sim_regression(5, method = "potato") 82 | ) 83 | }) 84 | 85 | test_that("sapp_2014_2 simulation", { 86 | set.seed(1) 87 | dat_1 <- sim_regression(10, method = "sapp_2014_2") 88 | dat_2 <- sim_regression(10, method = "sapp_2014_2", keep_truth = TRUE) 89 | expect_equal( 90 | names(dat_1), 91 | c("outcome", modeldata:::names0(200, "predictor_")) 92 | ) 93 | expect_equal( 94 | names(dat_2), 95 | c("outcome", modeldata:::names0(200, "predictor_"), ".truth") 96 | ) 97 | expect_equal(nrow(dat_1), 10) 98 | expect_true(all(vapply(dat_1, is.numeric, logical(1)))) 99 | }) 100 | 101 | test_that("van_der_laan_2007_1 simulation", { 102 | set.seed(1) 103 | dat_1 <- sim_regression(10, method = "van_der_laan_2007_1") 104 | dat_2 <- sim_regression(10, method = "van_der_laan_2007_1", factors = TRUE) 105 | dat_3 <- sim_regression(10, method = "van_der_laan_2007_1", keep_truth = TRUE) 106 | expect_equal(names(dat_1), c("outcome", modeldata:::names0(10, "predictor_"))) 107 | expect_equal( 108 | names(dat_3), 109 | c("outcome", modeldata:::names0(10, "predictor_"), ".truth") 110 | ) 111 | expect_equal(nrow(dat_1), 10) 112 | expect_true(all(vapply(dat_1, is.numeric, logical(1)))) 113 | expect_true(all(vapply(dat_1[, -1], is.integer, logical(1)))) 114 | 
expect_true(all(vapply(dat_2[, -1], is.factor, logical(1)))) 115 | expect_equal(levels(dat_2$predictor_01), c("yes", "no")) 116 | }) 117 | 118 | test_that("van_der_laan_2007_2 simulation", { 119 | set.seed(1) 120 | dat_1 <- sim_regression(10, method = "van_der_laan_2007_2") 121 | dat_2 <- sim_regression(10, method = "van_der_laan_2007_2", keep_truth = TRUE) 122 | expect_equal(names(dat_1), c("outcome", modeldata:::names0(20, "predictor_"))) 123 | expect_equal( 124 | names(dat_2), 125 | c("outcome", modeldata:::names0(20, "predictor_"), ".truth") 126 | ) 127 | expect_equal(nrow(dat_1), 10) 128 | expect_true(all(vapply(dat_1, is.numeric, logical(1)))) 129 | }) 130 | 131 | test_that("hooker_2004 simulation", { 132 | set.seed(1) 133 | dat_1 <- sim_regression(10, method = "hooker_2004") 134 | dat_2 <- sim_regression(10, method = "hooker_2004", keep_truth = TRUE) 135 | expect_equal(names(dat_1), c("outcome", modeldata:::names0(10, "predictor_"))) 136 | expect_equal( 137 | names(dat_2), 138 | c("outcome", modeldata:::names0(10, "predictor_"), ".truth") 139 | ) 140 | expect_equal(nrow(dat_1), 10) 141 | expect_true(all(vapply(dat_1, is.numeric, logical(1)))) 142 | }) 143 | 144 | 145 | test_that("noise simulation", { 146 | set.seed(1) 147 | dat_1 <- sim_noise(1000, num_vars = 10) 148 | dat_2 <- sim_noise(1000, num_vars = 3, cov_param = .5) 149 | dat_3 <- sim_noise(1000, num_vars = 3, cov_type = "toeplitz", cov_param = .99) 150 | dat_4 <- sim_noise(10, num_vars = 3, outcome = "classification") 151 | dat_5 <- sim_noise( 152 | 10, 153 | num_vars = 3, 154 | outcome = "classification", 155 | num_classes = 10 156 | ) 157 | dat_6 <- sim_noise(10, num_vars = 3, outcome = "regression") 158 | 159 | expect_equal(names(dat_1), modeldata:::names0(10, "noise_")) 160 | expect_equal(names(dat_2), modeldata:::names0(3, "noise_")) 161 | expect_equal(nrow(dat_1), 1000) 162 | expect_equal(nrow(dat_4), 10) 163 | 164 | expect_true(all(vapply(dat_1, is.numeric, logical(1)))) 165 | expect_true(all(vapply(dat_1[, -1], is.numeric, logical(1)))) 166 | expect_true(is.factor(dat_5$class)) 167 | expect_true(all(vapply(dat_6, is.numeric, logical(1)))) 168 | 169 | cor_1 <- cor(dat_1)[upper.tri(cor(dat_1))] 170 | expect_true(all(cor_1 <= 0.1 & cor_1 >= -0.1)) 171 | 172 | cor_2 <- cor(dat_2)[upper.tri(cor(dat_2))] 173 | expect_true(all(cor_2 <= 0.6 & cor_2 >= 0.4)) 174 | 175 | cor_3 <- cor(dat_3)[upper.tri(cor(dat_3))] 176 | expect_true(all(cor_3 >= 0.95)) 177 | 178 | expect_equal(levels(dat_4$class), paste0("class_", 1:2)) 179 | expect_equal(levels(dat_5$class), modeldata:::names0(10, "class_")) 180 | }) 181 | 182 | 183 | test_that("logistic simulation", { 184 | set.seed(1) 185 | dat_1 <- sim_logistic(10, ~A) 186 | dat_2 <- sim_logistic(10, rlang::expr(~B), keep_truth = TRUE) 187 | expect_equal(names(dat_1), c(LETTERS[1:2], "class")) 188 | expect_equal(names(dat_2), c(LETTERS[1:2], ".linear_pred", ".truth", "class")) 189 | expect_equal(nrow(dat_1), 10) 190 | }) 191 | 192 | 193 | test_that("multinomial simulation", { 194 | expect_snapshot_error(sim_multinomial(10, ~ A + C, ~B, ~ A + B)) 195 | set.seed(1) 196 | dat_1 <- sim_multinomial(10, ~A, ~B, ~ A + B) 197 | dat_2 <- sim_multinomial(10, ~A, ~B, ~ A + B, keep_truth = TRUE) 198 | expect_equal(names(dat_1), c(LETTERS[1:2], "class")) 199 | expect_equal( 200 | names(dat_2), 201 | c(LETTERS[1:2], "class", ".truth_one", ".truth_two", ".truth_three") 202 | ) 203 | expect_equal(nrow(dat_1), 10) 204 | }) 205 | --------------------------------------------------------------------------------