├── .Rbuildignore
├── .github
│   ├── .gitignore
│   ├── CODEOWNERS
│   ├── CODE_OF_CONDUCT.md
│   └── workflows
│       ├── R-CMD-check-hard.yaml
│       ├── R-CMD-check.yaml
│       ├── lock.yaml
│       ├── pkgdown.yaml
│       ├── pr-commands.yaml
│       └── test-coverage.yaml
├── .gitignore
├── .vscode
│   ├── extensions.json
│   └── settings.json
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── R
│   ├── Chicago.R
│   ├── Smithsonian.R
│   ├── ad_data.R
│   ├── ames.R
│   ├── attrition.R
│   ├── biomass.R
│   ├── bivariate.R
│   ├── car_prices.R
│   ├── cat_adoption.R
│   ├── cells.R
│   ├── check_times.R
│   ├── chem_proc_yield.R
│   ├── churn.R
│   ├── concrete.R
│   ├── covers.R
│   ├── credit_data.R
│   ├── crickets.R
│   ├── deliveries.R
│   ├── drinks.R
│   ├── fine_foods.R
│   ├── grants.R
│   ├── hepatic_injury_qsar.R
│   ├── hotel_rates.R
│   ├── hpc_cv.R
│   ├── hpc_data.R
│   ├── ischemic_stroke.R
│   ├── leaf_id_flavia.R
│   ├── lending_club.R
│   ├── meats.R
│   ├── modeldata-package.R
│   ├── oils.R
│   ├── parabolic.R
│   ├── pathology.R
│   ├── pd_speech.R
│   ├── penguins.R
│   ├── permeability_qsar.R
│   ├── sacremento.R
│   ├── scat.R
│   ├── simulations.R
│   ├── solubility.R
│   ├── stackoverflow.R
│   ├── steroidogenic_toxicity.R
│   ├── tate_text.R
│   ├── taxi.R
│   ├── two_class_dat.R
│   └── wa_churn.R
├── README.Rmd
├── README.md
├── _pkgdown.yml
├── air.toml
├── codecov.yml
├── data-raw
│   ├── animal-shelter-intakes-and-outcomes.csv
│   ├── cat_adoption.R
│   ├── chem_proc_yield.R
│   ├── hepatic_injury_qsar.R
│   ├── hotel_rates.R
│   ├── ischemic_stroke.R
│   ├── leaf_id_flavia.R
│   ├── permeability_qsar.R
│   ├── prep_datasets.R
│   ├── steroidogenic_toxicity.R
│   └── taxi.R
├── data
│   ├── Chicago.rda
│   ├── Sacramento.RData
│   ├── Smithsonian.RData
│   ├── ad_data.RData
│   ├── ames.rda
│   ├── attrition.RData
│   ├── biomass.RData
│   ├── bivariate.RData
│   ├── car_prices.RData
│   ├── cat_adoption.rda
│   ├── cells.RData
│   ├── check_times.rda
│   ├── chem_proc_yield.rda
│   ├── concrete.RData
│   ├── covers.RData
│   ├── credit_data.RData
│   ├── crickets.rda
│   ├── datalist
│   ├── deliveries.rda
│   ├── drinks.rda
│   ├── grants.rda
│   ├── hepatic_injury_qsar.rda
│   ├── hotel_rates.rda
│   ├── hpc_cv.rda
│   ├── hpc_data.RData
│   ├── ischemic_stroke.rda
│   ├── leaf_id_flavia.rda
│   ├── lending_club.rda
│   ├── meats.RData
│   ├── mlc_churn.RData
│   ├── oils.RData
│   ├── parabolic.rda
│   ├── pathology.rda
│   ├── pd_speech.rda
│   ├── penguins.rda
│   ├── permeability_qsar.rda
│   ├── scat.RData
│   ├── small_fine_foods.RData
│   ├── solubility_test.rda
│   ├── stackoverflow.rda
│   ├── steroidogenic_toxicity.rda
│   ├── tate_text.rda
│   ├── taxi.rda
│   ├── two_class_dat.RData
│   ├── two_class_example.rda
│   └── wa_churn.rda
├── man
│   ├── Chicago.Rd
│   ├── Sacramento.Rd
│   ├── Smithsonian.Rd
│   ├── ad_data.Rd
│   ├── ames.Rd
│   ├── attrition.Rd
│   ├── biomass.Rd
│   ├── bivariate.Rd
│   ├── car_prices.Rd
│   ├── cat_adoption.Rd
│   ├── cells.Rd
│   ├── check_times.Rd
│   ├── chem_proc_yield.Rd
│   ├── concrete.Rd
│   ├── covers.Rd
│   ├── credit_data.Rd
│   ├── crickets.Rd
│   ├── deliveries.Rd
│   ├── drinks.Rd
│   ├── figures
│   │   └── lifecycle-deprecated.svg
│   ├── grants.Rd
│   ├── hepatic_injury_qsar.Rd
│   ├── hotel_rates.Rd
│   ├── hpc_cv.Rd
│   ├── hpc_data.Rd
│   ├── ischemic_stroke.Rd
│   ├── leaf_id_flavia.Rd
│   ├── lending_club.Rd
│   ├── meats.Rd
│   ├── mlc_churn.Rd
│   ├── modeldata-package.Rd
│   ├── oils.Rd
│   ├── parabolic.Rd
│   ├── pathology.Rd
│   ├── pd_speech.Rd
│   ├── penguins.Rd
│   ├── permeability_qsar.Rd
│   ├── rmd
│   │   └── ames.md
│   ├── scat.Rd
│   ├── sim_classification.Rd
│   ├── small_fine_foods.Rd
│   ├── solubility_test.Rd
│   ├── stackoverflow.Rd
│   ├── steroidogenic_toxicity.Rd
│   ├── tate_text.Rd
│   ├── taxi.Rd
│   ├── two_class_dat.Rd
│   ├── two_class_example.Rd
│   └── wa_churn.Rd
├── modeldata.Rproj
└── tests
    ├── testthat.R
    └── testthat
        ├── _snaps
        │   └── simulations.md
        └── test-simulations.R

--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
^.*\.Rproj$
^\.Rproj\.user$
^LICENSE\.md$
^\.github$
^_pkgdown\.yml$
^docs$
^pkgdown$
^README\.Rmd$
^codecov\.yml$
^CODE_OF_CONDUCT\.md$
^data-raw$
^revdep$
^.Rhistory$
^[\.]?air\.toml$
^\.vscode$

--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
*.html

--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
# CODEOWNERS for modeldata
# https://www.tidyverse.org/development/understudies
.github/CODEOWNERS @topepo @juliasilge

--------------------------------------------------------------------------------
/.github/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
  community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or advances of
  any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
  without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at codeofconduct@posit.co.
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series of
actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within the
community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
<https://www.contributor-covenant.org/version/2/1/code_of_conduct.html>.

Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion).

For answers to common questions about this code of conduct, see the FAQ at
<https://www.contributor-covenant.org/faq>. Translations are available at
<https://www.contributor-covenant.org/translations>.

[homepage]: https://www.contributor-covenant.org

--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check-hard.yaml:
--------------------------------------------------------------------------------
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
#
# NOTE: This workflow only directly installs "hard" dependencies, i.e. Depends,
# Imports, and LinkingTo dependencies. Notably, Suggests dependencies are never
# installed, with the exception of testthat, knitr, and rmarkdown. The cache is
# never used to avoid accidentally restoring a cache containing a suggested
# dependency.
on:
  push:
    branches: [main, master]
  pull_request:

name: R-CMD-check-hard.yaml

permissions: read-all

jobs:
  check-no-suggests:
    runs-on: ${{ matrix.config.os }}

    name: ${{ matrix.config.os }} (${{ matrix.config.r }})

    strategy:
      fail-fast: false
      matrix:
        config:
          - {os: ubuntu-latest, r: 'release'}

    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
      R_KEEP_PKG_SOURCE: yes

    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-pandoc@v2

      - uses: r-lib/actions/setup-r@v2
        with:
          r-version: ${{ matrix.config.r }}
          http-user-agent: ${{ matrix.config.http-user-agent }}
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          dependencies: '"hard"'
          cache: false
          extra-packages: |
            any::rcmdcheck
            any::testthat
            any::knitr
            any::rmarkdown
          needs: check

      - uses: r-lib/actions/check-r-package@v2
        with:
          upload-snapshots: true
          build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")'

--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
#
# NOTE: This workflow is overkill for most R packages and
# check-standard.yaml is likely a better choice.
# usethis::use_github_action("check-standard") will install it.
on:
  push:
    branches: [main, master]
  pull_request:

name: R-CMD-check.yaml

permissions: read-all

jobs:
  R-CMD-check:
    runs-on: ${{ matrix.config.os }}

    name: ${{ matrix.config.os }} (${{ matrix.config.r }})

    strategy:
      fail-fast: false
      matrix:
        config:
          - {os: macos-latest, r: 'release'}

          - {os: windows-latest, r: 'release'}
          # use 4.0 or 4.1 to check with rtools40's older compiler
          - {os: windows-latest, r: 'oldrel-4'}

          - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
          - {os: ubuntu-latest, r: 'release'}
          - {os: ubuntu-latest, r: 'oldrel-1'}
          - {os: ubuntu-latest, r: 'oldrel-2'}
          - {os: ubuntu-latest, r: 'oldrel-3'}
          - {os: ubuntu-latest, r: 'oldrel-4'}

    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
      R_KEEP_PKG_SOURCE: yes

    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-pandoc@v2

      - uses: r-lib/actions/setup-r@v2
        with:
          r-version: ${{ matrix.config.r }}
          http-user-agent: ${{ matrix.config.http-user-agent }}
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::rcmdcheck
          needs: check

      - uses: r-lib/actions/check-r-package@v2
        with:
          upload-snapshots: true
          build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")'

--------------------------------------------------------------------------------
/.github/workflows/lock.yaml:
--------------------------------------------------------------------------------
name: 'Lock Threads'

on:
  schedule:
    - cron: '0 0 * * *'

jobs:
  lock:
    runs-on: ubuntu-latest
    steps:
      - uses: dessant/lock-threads@v2
        with:
          github-token: ${{ github.token }}
          issue-lock-inactive-days: '14'
          # issue-exclude-labels: ''
          # issue-lock-labels: 'outdated'
          issue-lock-comment: >
            This issue has been automatically locked. If you believe you have
            found a related problem, please file a new issue (with a reprex:
            <https://reprex.tidyverse.org/>) and link to this issue.
          issue-lock-reason: ''
          pr-lock-inactive-days: '14'
          # pr-exclude-labels: 'wip'
          pr-lock-labels: ''
          pr-lock-comment: >
            This pull request has been automatically locked. If you believe you
            have found a related problem, please file a new issue (with a reprex:
            <https://reprex.tidyverse.org/>) and link to this issue.
          pr-lock-reason: ''
          # process-only: 'issues'

--------------------------------------------------------------------------------
/.github/workflows/pkgdown.yaml:
--------------------------------------------------------------------------------
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
  push:
    branches: [main, master]
  pull_request:
  release:
    types: [published]
  workflow_dispatch:

name: pkgdown.yaml

permissions: read-all

jobs:
  pkgdown:
    runs-on: ubuntu-latest
    # Only restrict concurrency for non-PR jobs
    concurrency:
      group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-pandoc@v2

      - uses: r-lib/actions/setup-r@v2
        with:
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::pkgdown, local::.
          needs: website

      - name: Build site
        run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
        shell: Rscript {0}

      - name: Deploy to GitHub pages 🚀
        if: github.event_name != 'pull_request'
        uses: JamesIves/github-pages-deploy-action@v4.5.0
        with:
          clean: false
          branch: gh-pages
          folder: docs

--------------------------------------------------------------------------------
/.github/workflows/pr-commands.yaml:
--------------------------------------------------------------------------------
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
  issue_comment:
    types: [created]

name: pr-commands.yaml

permissions: read-all

jobs:
  document:
    if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/document') }}
    name: document
    runs-on: ubuntu-latest
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/pr-fetch@v2
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}

      - uses: r-lib/actions/setup-r@v2
        with:
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::roxygen2
          needs: pr-document

      - name: Document
        run: roxygen2::roxygenise()
        shell: Rscript {0}

      - name: commit
        run: |
          git config --local user.name "$GITHUB_ACTOR"
          git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
          git add man/\* NAMESPACE
          git commit -m 'Document'

      - uses: r-lib/actions/pr-push@v2
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}

  style:
    if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/style') }}
    name: style
    runs-on: ubuntu-latest
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/pr-fetch@v2
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}

      - uses: r-lib/actions/setup-r@v2

      - name: Install dependencies
        run: install.packages("styler")
        shell: Rscript {0}

      - name: Style
        run: styler::style_pkg()
        shell: Rscript {0}
      - name: commit
        run: |
          git config --local user.name "$GITHUB_ACTOR"
          git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com"
          git add \*.R
          git commit -m 'Style'

      - uses: r-lib/actions/pr-push@v2
        with:
          repo-token: ${{ secrets.GITHUB_TOKEN }}

--------------------------------------------------------------------------------
/.github/workflows/test-coverage.yaml:
--------------------------------------------------------------------------------
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
  push:
    branches: [main, master]
  pull_request:

name: test-coverage.yaml

permissions: read-all

jobs:
  test-coverage:
    runs-on: ubuntu-latest
    env:
      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

    steps:
      - uses: actions/checkout@v4

      - uses: r-lib/actions/setup-r@v2
        with:
          use-public-rspm: true

      - uses: r-lib/actions/setup-r-dependencies@v2
        with:
          extra-packages: any::covr, any::xml2
          needs: coverage

      - name: Test coverage
        run: |
          cov <- covr::package_coverage(
            quiet = FALSE,
            clean = FALSE,
            install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package")
          )
          print(cov)
          covr::to_cobertura(cov)
        shell: Rscript {0}

      - uses: codecov/codecov-action@v5
        with:
          # Fail if error if not on PR, or if on PR and token is given
          fail_ci_if_error: ${{ github.event_name != 'pull_request' || secrets.CODECOV_TOKEN }}
          files: ./cobertura.xml
          plugins: noop
          disable_search: true
          token: ${{ secrets.CODECOV_TOKEN }}

      - name: Show testthat output
        if: always()
        run: |
          ## --------------------------------------------------------------------
          find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true
        shell: bash

      - name: Upload test results
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: coverage-test-failures
          path: ${{ runner.temp }}/package

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# History files
.Rhistory
.Rapp.history

# Session Data files
.RData

# Example code in package build process
*-Ex.R

# Output files from R CMD build
/*.tar.gz

# Output files from R CMD check
/*.Rcheck/

# RStudio files
.Rproj.user/

# produced vignettes
vignettes/*.html
vignettes/*.pdf

# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
.httr-oauth

# knitr and R markdown default cache directories
/*_cache/
/cache/

# Temporary files created by R markdown
*.utf8.md
*.knit.md

# Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html
rsconnect/
.DS_Store

--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
{
  "recommendations": [
    "Posit.air-vscode"
  ]
}

--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
{
  "[r]": {
    "editor.formatOnSave": true,
    "editor.defaultFormatter": "Posit.air-vscode"
  }
}

--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
Package: modeldata
Title: Data Sets Useful for Modeling Examples
Version: 1.4.0.9000
Authors@R: c(
    person("Max", "Kuhn", , "max@posit.co", role = c("aut", "cre")),
    person("Posit Software, PBC", role = c("cph", "fnd"),
           comment = c(ROR = "03wc8by49"))
  )
Description: Data sets used for demonstrating or testing model-related
    packages are contained in this package.
License: MIT + file LICENSE
URL: https://modeldata.tidymodels.org,
    https://github.com/tidymodels/modeldata
BugReports: https://github.com/tidymodels/modeldata/issues
Depends:
    R (>= 4.1)
Imports:
    dplyr,
    MASS,
    purrr,
    rlang,
    tibble
Suggests:
    covr,
    ggplot2,
    testthat (>= 3.0.0)
Config/Needs/website: tidyverse/tidytemplate, tidymodels/tidymodels
Config/testthat/edition: 3
Config/usethis/last-upkeep: 2025-04-27
Encoding: UTF-8
LazyData: true
LazyDataCompression: xz
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
YEAR: 2025
COPYRIGHT HOLDER: modeldata authors

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
# MIT License

Copyright (c) 2025 modeldata authors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
# Generated by roxygen2: do not edit by hand

export(sim_classification)
export(sim_logistic)
export(sim_multinomial)
export(sim_noise)
export(sim_regression)
importFrom(stats,rnorm)
importFrom(stats,runif)

--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
# modeldata (development version)

* Added a new regression simulation method via `method = "worley_1987"`.

* Transitioned from the magrittr pipe to the base R pipe.

# modeldata 1.4.0

* Added the `cat_adoption` data set.

# modeldata 1.3.0

* Added the `deliveries` data set.

# modeldata 1.2.0

* New data sets

  - `chem_proc_yield` (regression)
  - `hepatic_injury_qsar` (ordinal classification)
  - `hotel_rates` (regression)
  - `ischemic_stroke` (classification)
  - `leaf_id_flavia` (classification)
  - `permeability_qsar` (regression)
  - `steroidogenic_toxicity` (classification)
  - `taxi` (classification)

* The simulation equation for Hooker (2004) was slightly incorrect and has been corrected.

# modeldata 1.1.0

* Added a `keep_truth` argument to the supervised simulation functions. This retains the column that defines the error-free simulated value of the outcome. This numeric column is called `.truth`.

* New simulation functions were added:

  * `sim_logistic()` and `sim_multinomial()` were added.

  * A method for Hooker (2004) was added for `sim_regression()`.

# modeldata 1.0.1

* Small update to fix HTML for CRAN.

# modeldata 1.0.0

* Added a set of regression and classification simulation functions (#273).

* Remove OkCupid data, including text data, because of privacy concerns.

# modeldata 0.1.1

* Add Tate Gallery modern artwork metadata.

* Deprecate OkCupid data, including text data, because of concerns about such data, such as the ability to identify individuals.

# modeldata 0.1.0

* Add the grant acceptance data from Kuhn and Johnson (2013) (_Applied Predictive Modeling_).

* The `crickets` data from Chapter 3 of [`tmwr.org`](https://www.tmwr.org/base-r.html#an-example) were added.

# modeldata 0.0.2

* The bivariate dataset was missing; this has been corrected (@mdogucu, #5).

* The [Ames](https://github.com/topepo/AmesHousing) and [penguin](https://github.com/allisonhorst/palmerpenguins) data sets were added.

# modeldata 0.0.1

* Added a `NEWS.md` file to track changes to the package.
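A minimal usage sketch for the simulation functions listed in NEWS.md and
exported in NAMESPACE above. The `method` and `keep_truth` arguments are taken
from the release notes; the `num_samples` and `num_vars` argument names are
assumptions, not confirmed by the files in this dump:

    library(modeldata)
    library(dplyr)

    # Regression simulation using the Worley (1987) method added in the
    # development version; keep_truth = TRUE retains the error-free outcome
    # in a numeric `.truth` column.
    reg_sim <- sim_regression(
      num_samples = 500,  # assumed argument name
      method = "worley_1987",
      keep_truth = TRUE
    )

    # The classification simulators follow the same pattern.
    cls_sim <- sim_classification(num_samples = 500)

    # sim_noise() appends uninformative predictors to a simulated set.
    noisy <- bind_cols(
      cls_sim,
      sim_noise(num_samples = nrow(cls_sim), num_vars = 10)
    )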
--------------------------------------------------------------------------------
/R/Chicago.R:
--------------------------------------------------------------------------------
#' Chicago ridership data
#'
#' @details These data are from Kuhn and Johnson (2020) and contain an
#' _abbreviated_ training set for modeling the number of people (in thousands)
#' who enter the Clark and Lake L station.
#'
#' The `date` column corresponds to the current date. The columns with station
#' names (`Austin` through `California`) are a _sample_ of the columns used in
#' the original analysis (for file size reasons). These are 14-day lag
#' variables (i.e., `date - 14 days`). There are columns related to weather and
#' sports team schedules.
#'
#' The station at 35th and Archer is contained in the column `Archer_35th` to
#' make it a valid R column name.
#'
#'
#' @name Chicago
#' @aliases Chicago stations
#' @docType data
#' @return \item{Chicago}{a tibble} \item{stations}{a vector of station names}
#'
#' @source Kuhn and Johnson (2020), _Feature Engineering and Selection_,
#' Chapman and Hall/CRC. \url{https://bookdown.org/max/FES/} and
#' \url{https://github.com/topepo/FES}
#'
#'
#' @keywords datasets
#' @examples
#' data(Chicago)
#' str(Chicago)
#' stations
NULL
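A short sketch of how `Chicago` and `stations` fit together, as described in
the details above; the outcome column name `ridership` is an assumption, not
shown in this file:

    library(modeldata)
    library(dplyr)

    # `stations` holds the names of the sampled 14-day lag predictor columns,
    # so they can be selected as a group alongside the date and the outcome.
    chicago_small <- Chicago |>
      select(ridership, date, all_of(stations))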
--------------------------------------------------------------------------------
/R/Smithsonian.R:
--------------------------------------------------------------------------------
#' Smithsonian museums
#'
#' Geocodes for the Smithsonian museums (circa 2018).
#'
#' @name Smithsonian
#' @aliases Smithsonian
#' @docType data
#' @return \item{Smithsonian}{a tibble}
#'
#' @source https://en.wikipedia.org/wiki/List_of_Smithsonian_museums
#'
#' @keywords datasets
#' @examples
#' data(Smithsonian)
#' str(Smithsonian)
NULL

--------------------------------------------------------------------------------
/R/ad_data.R:
--------------------------------------------------------------------------------
#' Alzheimer's disease data
#'
#' @details
#' Craig-Schapiro et al. (2011) describe a clinical study of 333 patients,
#' including some with mild (but well-characterized) cognitive impairment as
#' well as healthy individuals. CSF samples were taken from all subjects. The
#' goal of the study was to determine if subjects in the early stages of
#' impairment could be differentiated from cognitively healthy individuals.
#' Data collected on each subject included:
#' \itemize{
#' \item Demographic characteristics such as age and gender
#' \item Apolipoprotein E genotype
#' \item Protein measurements of Abeta, Tau, and a phosphorylated version of Tau (called pTau)
#' \item Protein measurements of 124 exploratory biomarkers, and
#' \item Clinical dementia scores
#' }
#'
#' For these analyses, we have converted the scores to two classes: impaired
#' and healthy. The goal of this analysis is to create classification models
#' using the demographic and assay data to predict which patients have early
#' stages of disease.
#'
#' @name ad_data
#' @aliases ad_data
#' @docType data
#' @return \item{ad_data}{a tibble}
#'
#' @source
#' Kuhn, M., Johnson, K. (2013) *Applied Predictive Modeling*, Springer.
#'
#' Craig-Schapiro R, Kuhn M, Xiong C, Pickering EH, Liu J, Misko TP, et al.
#' (2011) Multiplexed Immunoassay Panel Identifies Novel CSF Biomarkers for
#' Alzheimer's Disease Diagnosis and Prognosis. PLoS ONE 6(4): e18850.
#'
#'
#' @keywords datasets
#' @examples
#' data(ad_data)
#' str(ad_data)
NULL

--------------------------------------------------------------------------------
/R/ames.R:
--------------------------------------------------------------------------------
#' Ames Housing Data
#'
#' A data set from De Cock (2011) where 82 fields were recorded for 2,930
#' properties in Ames, IA. This version is copied from the `AmesHousing`
#' package but does not include a few quality columns that appear to be
#' outcomes rather than predictors.
#'
#' See the source links below for more information as well as
#' `?AmesHousing::make_ames`.
#'
#' @includeRmd man/rmd/ames.md details
#'
#' @name ames
#' @aliases ames
#' @docType data
#' @return \item{ames}{a tibble}
#' @source De Cock, D. (2011). "Ames, Iowa: Alternative to the Boston Housing Data as an End of Semester Regression Project," \emph{Journal of Statistics Education}, Volume 19, Number 3.
#'
#' \url{http://jse.amstat.org/v19n3/decock/DataDocumentation.txt}
#'
#' \url{http://jse.amstat.org/v19n3/decock.pdf}
#' @keywords datasets
#' @examples
#' data(ames)
#' str(ames)
NULL

--------------------------------------------------------------------------------
/R/attrition.R:
--------------------------------------------------------------------------------
#' Job attrition
#'
#' @details These data are from the IBM Watson Analytics Lab.
#' The website describes the data with \dQuote{Uncover the
#' factors that lead to employee attrition and explore important
#' questions such as \sQuote{show me a breakdown of distance
#' from home by job role and attrition} or \sQuote{compare
#' average monthly income by education and attrition}. This is a
#' fictional data set created by IBM data scientists.}. There
#' are 1470 rows.
#'
#' @name attrition
#' @aliases attrition
#' @docType data
#' @return \item{attrition}{a data frame}
#'
#' @source The IBM Watson Analytics Lab website https://www.ibm.com/communities/analytics/watson-analytics-blog/hr-employee-attrition/
#'
#'
#' @keywords datasets
#' @examples
#' data(attrition)
#' str(attrition)
NULL

--------------------------------------------------------------------------------
/R/biomass.R:
--------------------------------------------------------------------------------
#' Biomass data
#'
#' Ghugare et al (2013) contains a data set where different biomass fuels are
#' characterized by the amount of certain molecules (carbon, hydrogen, oxygen,
#' nitrogen, and sulfur) and the corresponding higher heating value (HHV).
#' These data are from their Table S.2 of the Supplementary Materials.
#'
#' @name biomass
#' @aliases biomass
#' @docType data
#' @return \item{biomass}{a data frame}
#'
#' @source Ghugare, S. B., Tiwary, S., Elangovan, V., and Tambe, S. S. (2013).
#' Prediction of Higher Heating Value of Solid Biomass Fuels Using Artificial
#' Intelligence Formalisms. *BioEnergy Research*, 1-12.
#'
#' @keywords datasets
#' @examples
#' data(biomass)
#' str(biomass)
NULL

--------------------------------------------------------------------------------
/R/bivariate.R:
--------------------------------------------------------------------------------
#' Example bivariate classification data
#'
#' @details These data are a simplified version of the segmentation data contained
#' in `caret`. There are three columns: `A` and `B` are predictors and the column
#' `Class` is a factor with levels "One" and "Two". There are three data sets:
#' one for training (n = 1009), validation (n = 300), and testing (n = 710).
#'
#' @name bivariate
#' @aliases bivariate_train bivariate_test bivariate_val
#' @docType data
#' @return \item{bivariate_train, bivariate_test, bivariate_val}{tibbles}
#'
#' @keywords datasets
#' @examples
#' data(bivariate)
#' str(bivariate_train)
#' str(bivariate_val)
#' str(bivariate_test)
NULL

--------------------------------------------------------------------------------
/R/car_prices.R:
--------------------------------------------------------------------------------
#' Kelley Blue Book resale data for 2005 model year GM cars
#'
#' Kuiper (2008) collected Kelley Blue Book resale data for 804 GM cars (2005 model year).
#'
#' @name car_prices
#' @docType data
#' @return \item{car_prices}{data frame of the suggested retail price (column \code{Price}) and various
#' characteristics of each car (columns \code{Mileage}, \code{Cylinder}, \code{Doors}, \code{Cruise},
#' \code{Sound}, \code{Leather}, \code{Buick}, \code{Cadillac}, \code{Chevy}, \code{Pontiac}, \code{Saab},
#' \code{Saturn}, \code{convertible}, \code{coupe}, \code{hatchback}, \code{sedan} and \code{wagon})}
#' @source Kuiper, S. (2008). Introduction to Multiple Regression: How Much Is Your Car Worth?,
#' \emph{Journal of Statistics Education}, Vol. 16
#' \url{http://jse.amstat.org/jse_archive.htm#2008}.
#' @keywords datasets
#' @examples
#' data(car_prices)
#' str(car_prices)
NULL

--------------------------------------------------------------------------------
/R/cat_adoption.R:
--------------------------------------------------------------------------------
#' Cat Adoption
#'
#' @description
#' A subset of the cats at the animal shelter in Long Beach, California, USA.
#'
#' @return tibble
#' @aliases cat_adoption
#' @name cat_adoption
#' @docType data
#' @details
#'
#' A data frame with 2257 rows and 19 columns:
#' \describe{
#' \item{time}{The time the cat spent at the shelter.}
#' \item{event}{The event of interest is the cat being homed or returned to
#' its original location (i.e., owner or community). The non-event is the cat
#' being transferred to another shelter or dying. Zero indicates a non-event
#' (censored), and one corresponds to the event occurring.}
#' \item{sex}{The sex of the cat.}
#' \item{neutered}{Whether the cat is neutered.}
#' \item{intake_condition}{The intake condition of the cat.}
#' \item{intake_type}{The type of intake.}
#' \item{latitude}{Latitude of the intersection/cross street of intake or capture.}
#' \item{longitude}{Longitude of the intersection/cross street of intake or capture.}
#' \item{black,brown,brown_tabby,calico,cream,gray,gray_tabby,orange,orange_tabby,tan,tortie,white}{Indicators for the color/pattern of the cat's fur.}
#' }
#' @source
#' <https://data.longbeach.gov/explore/dataset/animal-shelter-intakes-and-outcomes/information/>
#'
#' on 2024-06-17
#'
#' @examples
#' str(cat_adoption)
#' @keywords datasets
NULL
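The `time`/`event` encoding described above follows the usual right-censoring
convention (0 = censored, 1 = event), so the data drop directly into a survival
analysis. A sketch, assuming the survival package (not a dependency of
modeldata) is installed:

    library(modeldata)
    library(survival)

    # event = 1: homed or returned; event = 0: transferred or died (censored).
    # This matches Surv()'s default 0/1 status coding.
    fit <- survfit(Surv(time, event) ~ sex, data = cat_adoption)
    summary(fit, times = c(30, 60, 90))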
--------------------------------------------------------------------------------
/R/cells.R:
--------------------------------------------------------------------------------
#' Cell body segmentation
#'
#' Hill, LaPan, Li and Haney (2007) develop models to predict which cells in a
#' high content screen were well segmented. The data consists of 119 imaging
#' measurements on 2019 cells. The original analysis used 1009 for training and
#' 1010 as a test set (see the column called \code{case}).
#'
#' The outcome class is contained in a factor variable called \code{class} with
#' levels "PS" for poorly segmented and "WS" for well segmented.
#'
#' The raw data used in the paper can be found at the BioMed Central website.
#' The version contained in \code{cells} is modified. First, several discrete
#' versions of some of the predictors (with the suffix "Status") were removed.
#' Second, there are several skewed predictors with minimum values of zero
#' (that would benefit from some transformation, such as the log). A constant
#' value of 1 was added to these fields: \code{avg_inten_ch_2},
#' \code{fiber_align_2_ch_3}, \code{fiber_align_2_ch_4}, \code{spot_fiber_count_ch_4} and
#' \code{total_inten_ch_2}.
#'
#' @name cells
#' @docType data
#' @return \item{cells}{a tibble}
#' @source Hill, LaPan, Li and Haney (2007). Impact of image segmentation on
#' high-content screening data quality for SK-BR-3 cells, \emph{BMC
#' Bioinformatics}, Vol. 8, pg. 340,
#' \url{https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-8-340}.
#' @keywords datasets
#' @examples
#' data(cells)
#' str(cells)
NULL

--------------------------------------------------------------------------------
/R/check_times.R:
--------------------------------------------------------------------------------
#' Execution time data
#'
#' These data were collected from the CRAN web page for 13,626 R
#' packages. The time to complete the standard package checking
#' routine was collected. In some cases, the package checking
#' process is stopped due to errors and these data are treated as
#' censored; this affects less than 1 percent of the packages.
#'
#' The associated package source code was downloaded and parsed
#' to create predictors, including
#'
#' * `authors`: The number of authors in the author field.
#' * `imports`: The number of imported packages.
#' * `suggests`: The number of packages suggested.
#' * `depends`: The number of hard dependencies.
#' * `Roxygen`: a binary indicator for whether Roxygen was used
#'    for documentation.
#' * `gh`: a binary indicator for whether the URL field contained
#'    a GitHub link.
#' * `rforge`: a binary indicator for whether the URL field
#'    contained a link to R-forge.
#' * `descr`: The number of characters (or, in some cases, bytes)
#'    in the description field.
#' * `r_count`: The number of R files in the R directory.
#' * `r_size`: The total disk size of the R files.
#' * `ns_import`: Estimated number of imported functions or methods.
#' * `ns_export`: Estimated number of exported functions or methods.
#' * `s3_methods`: Estimated number of S3 methods.
#' * `s4_methods`: Estimated number of S4 methods.
#' * `doc_count`: How many Rmd or Rnw files in the vignettes
#'    directory.
#' * `doc_size`: The disk size of the Rmd or Rnw files.
#' * `src_count`: The number of files in the `src` directory.
#' * `src_size`: The size on disk of files in the `src` directory.
#' * `data_count`: The number of files in the `data` directory.
#' * `data_size`: The size on disk of files in the `data` directory.
#' * `testthat_count`: The number of files in the `testthat`
#'    directory.
#' * `testthat_size`: The size on disk of files in the `testthat`
#'    directory.
#' * `check_time`: The time (in seconds) to run `R CMD check`
#'    using the `r-devel-windows-ix86+x86_64` flavor.
#' * `status`: An indicator for whether the tests completed.
#'
#' Data were collected on 2019-01-20.
#' @name check_times
#' @aliases check_times
#' @docType data
#' @return \item{check_times}{a data frame}
#'
#' @source CRAN
#'
#' @keywords datasets
#' @examples
#' data(check_times)
#' str(check_times)
NULL

--------------------------------------------------------------------------------
/R/chem_proc_yield.R:
--------------------------------------------------------------------------------
#' Chemical manufacturing process data set
#'
#' @description
#' A data set that models yield as a function of biological material predictors
#' and chemical structure predictors.
#'
#' @name chem_proc_yield
#' @aliases chem_proc_yield
#' @docType data
#' @return \item{chem_proc_yield}{a tibble}
#'
#' @details
#' This data set contains information about a chemical manufacturing
#' process, in which the goal is to understand the relationship between
#' the process and the resulting final product yield. Raw material in
#' this process is put through a sequence of 27 steps to generate the
#' final pharmaceutical product. The starting material is generated from
#' a biological unit and has a range of quality and characteristics. The
#' objective in this project was to develop a model to predict percent
#' yield of the manufacturing process. The data set consisted of 177
#' samples of biological material for which 57 characteristics were
#' measured. Of the 57 characteristics, there were 12 measurements of
#' the biological starting material, and 45 measurements of the
#' manufacturing process. The process variables included measurements
#' such as temperature, drying time, washing time, and concentrations of
#' by-products at various steps. Some of the process measurements can
#' be controlled, while others are observed.
#' Predictors are continuous, count, and categorical; some are correlated,
#' and some contain missing values. Samples are not independent because
#' sets of samples come from the same batch of biological starting material.
#'
#' Columns:
#' \itemize{
#' \item \code{yield}: numeric
#' \item \code{bio_material_01} - \code{bio_material_12}: numeric
#' \item \code{man_proc_01} - \code{man_proc_45}: numeric
#' }
#' @source
#' Kuhn, Max, and Kjell Johnson. _Applied predictive modeling_. New York:
#' Springer, 2013.
#'
#' @examples
#' data(chem_proc_yield)
#' str(chem_proc_yield)
#'
NULL

--------------------------------------------------------------------------------
/R/churn.R:
--------------------------------------------------------------------------------
#' Customer churn data
#'
#' A data set from the MLC++ machine learning software for modeling customer
#' churn. There are 19 predictors, mostly numeric: `state` (categorical),
#' `account_length`, `area_code`, `international_plan` (yes/no),
#' `voice_mail_plan` (yes/no), `number_vmail_messages`,
#' `total_day_minutes`, `total_day_calls`, `total_day_charge`,
#' `total_eve_minutes`, `total_eve_calls`, `total_eve_charge`,
#' `total_night_minutes`, `total_night_calls`,
#' `total_night_charge`, `total_intl_minutes`,
#' `total_intl_calls`, `total_intl_charge`, and
#' `number_customer_service_calls`.
#'
#' The outcome is contained in a column called `churn` (also yes/no).
#' A note in one of the source files states that the data are "artificial based
#' on claims similar to real world".
#'
#' @name mlc_churn
#' @aliases mlc_churn
#' @docType data
#' @return \item{mlc_churn}{a tibble}
#' @source Originally at `http://www.sgi.com/tech/mlc/`
#' @keywords datasets
#' @examples
#' data(mlc_churn)
#' str(mlc_churn)
NULL

--------------------------------------------------------------------------------
/R/concrete.R:
--------------------------------------------------------------------------------
#' Compressive strength of concrete mixtures
#'
#' Yeh (2006) describes an aggregated data set for experimental designs used to
#' test the compressive strength of concrete mixtures. The data are used by
#' Kuhn and Johnson (2013).
#'
#'
#' @name concrete
#' @aliases concrete
#' @docType data
#' @return \item{concrete}{a tibble}
#' @keywords datasets
#' @source
#' Yeh I (2006). "Analysis of Strength of Concrete Using Design of Experiments
#' and Neural Networks." *Journal of Materials in Civil Engineering*, 18, 597-604.
#'
#' Kuhn, M., Johnson, K. (2013) *Applied Predictive Modeling*, Springer.
#' @examples
#' data(concrete)
#' str(concrete)
NULL

--------------------------------------------------------------------------------
/R/covers.R:
--------------------------------------------------------------------------------
#' Raw cover type data
#'
#' These data are raw data describing different forest cover types
#' from the UCI Machine Learning Database (see link below). There is one
#' column in the data that has a few different pieces of textual
#' information (of variable lengths).
#'
#' @name covers
#' @aliases covers
#' @docType data
#' @return \item{covers}{a data frame}
#'
#' @source https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info
#'
#' @keywords datasets
#' @examples
#' data(covers)
#' str(covers)
NULL

--------------------------------------------------------------------------------
/R/credit_data.R:
--------------------------------------------------------------------------------
#' Credit data
#'
#' These data are from the website of Dr. Lluís A. Belanche Muñoz by way of a
#' github repository of Dr. Gaston Sanchez. One data point with a missing
#' outcome was removed from the original data.
#'
#' @name credit_data
#' @aliases credit_data
#' @docType data
#' @return \item{credit_data}{a data frame}
#'
#' @source https://github.com/gastonstat/CreditScoring,
#' http://bit.ly/2kkBFrk
#'
#' @keywords datasets
#' @examples
#' data(credit_data)
#' str(credit_data)
NULL

--------------------------------------------------------------------------------
/R/crickets.R:
--------------------------------------------------------------------------------
#' Rates of Cricket Chirps
#'
#' These data are from McDonald (2009), by way of Mangiafico (2015), on
#' the relationship between the ambient temperature and the rate of cricket
#' chirps per minute. Data were collected for two species of the genus _Oecanthus_: _O. exclamationis_
#' and _O. niveus_. The data are contained in a data frame called `crickets` with
#' a total of 31 data points.
#'
#' @name crickets
#' @aliases crickets
#' @docType data
#' @return \item{crickets}{a tibble}
#' @source Mangiafico, S. 2015. "An R Companion for the Handbook of Biological
#' Statistics." \url{https://rcompanion.org/handbook/}.
#'
#' McDonald, J. 2009. _Handbook of Biological Statistics_. Sparky House Publishing.
#' @keywords datasets
#' @examples
#' data(crickets)
#' str(crickets)
NULL

--------------------------------------------------------------------------------
/R/deliveries.R:
--------------------------------------------------------------------------------
#' Food Delivery Time Data
#'
#' @details
#' These data are from a study of food delivery times in minutes (i.e., the time from the
#' initial order to receiving the food) for a single restaurant. The data
#' contains 10,012 orders from a specific restaurant. The predictors include:
#' \itemize{
#' \item The time, in decimal hours, of the order.
#' \item The day of the week for the order.
#' \item The approximate distance in miles between the restaurant and the delivery
#' location.
#' \item A set of 27 predictors that count the number of distinct menu items
#' in the order.
#' }
#'
#' No times are censored.
#'
#' @name deliveries
#' @aliases deliveries
#' @docType data
#' @return \item{deliveries}{a tibble}
#'
#' @keywords datasets
#' @examples
#' data(deliveries)
#' str(deliveries)
NULL
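A quick way to look at the order-time effect described above; a sketch only,
since the column names (`time_to_delivery`, `hour`, `day`) are assumptions not
shown in this file (ggplot2 is listed in Suggests):

    library(modeldata)
    library(ggplot2)

    # Delivery time in minutes against the decimal hour of the order, with a
    # separate trend line per day of the week.
    ggplot(deliveries, aes(hour, time_to_delivery, col = day)) +
      geom_smooth(se = FALSE)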
--------------------------------------------------------------------------------
/R/drinks.R:
--------------------------------------------------------------------------------
#' Sample time series data
#'
#' @details Drink sales. The exact name of the series from FRED is:
#' "Merchant Wholesalers, Except Manufacturers' Sales Branches and Offices
#' Sales: Nondurable Goods: Beer, Wine, and Distilled Alcoholic Beverages Sales"
#'
#' @name drinks
#' @aliases drinks
#' @docType data
#' @return \item{drinks}{a tibble}
#'
#' @source The Federal Reserve Bank of St. Louis website https://fred.stlouisfed.org/series/S4248SM144NCEN
#'
#' @keywords datasets
#' @examples
#' data(drinks)
#' str(drinks)
NULL

--------------------------------------------------------------------------------
/R/fine_foods.R:
--------------------------------------------------------------------------------
#' Fine foods example data
#'
#' @details
#' These data are from Amazon, who describe it as "This dataset consists of
#' reviews of fine foods from amazon. The data span a period of more than 10
#' years, including all ~500,000 reviews up to October 2012. Reviews include
#' product and user information, ratings, and a plaintext review."
#'
#' A subset of the data are contained here and are split into a training and
#' test set. The training set sampled 10 products and retained all of their
#' individual reviews. Since the reviews within these products are correlated,
#' we recommend resampling the data using a leave-one-product-out approach. The
#' test set sampled 500 products that were not included in the training set
#' and selected a single review at random for each.
#'
#' There is a column for the product, a column for the text of the review, and
#' a factor column for a class variable. The outcome is whether the reviewer
#' gave the product a 5-star rating or not.
#'
#' @name small_fine_foods
#' @aliases small_fine_foods training_data testing_data
#' @docType data
#' @return \item{training_data,testing_data}{tibbles}
#'
#' @source https://snap.stanford.edu/data/web-FineFoods.html
#'
#'
#' @keywords datasets
#' @examples
#' data(small_fine_foods)
#' str(training_data)
#' str(testing_data)
NULL
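The leave-one-product-out resampling recommended above maps onto grouped
cross-validation. A sketch, assuming the product column is named `product` and
using the rsample package (not a dependency of modeldata):

    library(modeldata)
    library(rsample)

    # Each resample holds out all reviews for one group of products, so the
    # correlated reviews of a product never span the analysis and assessment
    # sets. With 10 products and v = 10, this is leave-one-product-out.
    folds <- group_vfold_cv(training_data, group = product, v = 10)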
--------------------------------------------------------------------------------
/R/grants.R:
--------------------------------------------------------------------------------
#' Grant acceptance data
#'
#' A data set related to the success or failure of academic grants.
#'
#' The data are discussed in Kuhn and Johnson (2013):
#'
#' "These data are from a 2011 Kaggle competition sponsored by the University
#' of Melbourne where there was interest in predicting whether or not a grant
#' application would be accepted. Since public funding of grants had decreased
#' over time, triaging grant applications based on their likelihood of success
#' could be important for estimating the amount of potential funding to the
#' university. In addition to predicting grant success, the university sought
#' to understand factors that were important in predicting success."
#'
#' The data ranged from 2005 to 2008 and the data spending strategy was
#' driven by the date of the grant. Kuhn and Johnson (2013) describe:
#'
#' "The compromise taken here is to build models on the pre-2008 data and
#' tune them by evaluating a random sample of 2,075 grants from 2008. Once the
#' optimal parameters are determined, the final model is built using these
#' parameters and the entire training set (i.e., the data prior to 2008 and the
#' additional 2,075 grants). A small holdout set of 518 grants from 2008 will
#' be used to ensure that no gross methodology errors occur from repeatedly
#' evaluating the 2008 data during model tuning. In the text, this set of
#' samples is called the 2008 holdout set. This small set of year 2008
#' grants will be referred to as the test set and will not be evaluated until
#' a set of candidate models are identified."
#'
#' To emulate this, `grants_other` contains the training (pre-2008, n = 6,633)
#' and holdout/validation data (2008, n = 1,557). `grants_test` has 518 grant
#' samples from 2008. The object `grants_2008` is an integer vector that can
#' be used to separate the modeling and holdout/validation sets.
#'
#'
#' @name grants
#' @aliases grants_other grants_test grants_2008
#' @docType data
#' @return \item{grants_other,grants_test,grants_2008}{two tibbles and an integer
#' vector of data points used for training}
#' @source Kuhn and Johnson (2013). _Applied Predictive Modeling_. Springer.
#' @keywords datasets
#' @examples
#' data(grants)
#' str(grants_other)
#' str(grants_test)
#' str(grants_2008)
NULL
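A sketch of the split described above, using plain subsetting. It assumes that
`grants_2008` indexes the rows of `grants_other` that form the 2008
holdout/validation set; check that orientation against the data before use:

    library(modeldata)

    # Assumed: grants_2008 marks the 2008 holdout/validation rows.
    grants_val   <- grants_other[grants_2008, ]
    grants_train <- grants_other[-grants_2008, ]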
38 | #' 39 | #' @examples 40 | #' data(hepatic_injury_qsar) 41 | #' str(hepatic_injury_qsar) 42 | #' 43 | NULL 44 | -------------------------------------------------------------------------------- /R/hotel_rates.R: -------------------------------------------------------------------------------- 1 | #' Daily Hotel Rate Data 2 | #' 3 | #' @description 4 | #' A data set to predict the average daily rate for a hotel in Lisbon, Portugal. 5 | #' 6 | #' @name hotel_rates 7 | #' @aliases hotel_rates 8 | #' @docType data 9 | #' 10 | #' @details 11 | #' 12 | #' Data are originally described in Antonio, de Almeida, and Nunes (2019). 13 | #' This version of the data is filtered for one hotel (the "Resort Hotel") and 14 | #' is intended as a regression data set for predicting the average daily rate for 15 | #' a room. The data are post-2016; the 2016 data were used to compute a predictor 16 | #' for the historical daily rates. See the `hotel_rates.R` file in the 17 | #' `data-raw` directory of the package to understand other filters used when 18 | #' creating this version of the data. 19 | #' 20 | #' The `agent` and `company` fields were changed from random characters to use 21 | #' a set of random names. 22 | #' 23 | #' The outcome column is `avg_price_per_room`. 24 | #' 25 | #' ## License 26 | #' 27 | #' No license was given for the data; see the reference below for the source. 28 | #' 29 | #' @source 30 | #' \url{https://github.com/rfordatascience/tidytuesday/tree/master/data/2020/2020-02-11} 31 | #' 32 | #' @references 33 | #' Antonio, N., de Almeida, A., and Nunes, L. (2019). Hotel booking demand 34 | #' datasets. _Data in Brief_, 22, 41-49. 35 | #' 36 | #' @keywords datasets 37 | #' @examples 38 | #' \dontrun{ 39 | #' str(hotel_rates) 40 | #' } 41 | NULL 42 | -------------------------------------------------------------------------------- /R/hpc_cv.R: -------------------------------------------------------------------------------- 1 | #' Class probability predictions 2 | #' 3 | #' @details This data frame contains the predicted classes and 4 | #' class probabilities for a linear discriminant analysis model fit 5 | #' to the HPC data set from Kuhn and Johnson (2013). These data are 6 | #' the assessment sets from a 10-fold cross-validation scheme. The 7 | #' data contain columns for the true class (`obs`), the class 8 | #' prediction (`pred`), and columns for each class probability 9 | #' (columns `VF`, `F`, `M`, and `L`). Additionally, a column for 10 | #' the resample indicator is included. 11 | #' 12 | #' @name hpc_cv 13 | #' @aliases hpc_cv 14 | #' @docType data 15 | #' @return \item{hpc_cv}{a data frame} 16 | #' 17 | #' @source Kuhn, M., Johnson, K. (2013) *Applied Predictive 18 | #' Modeling*, Springer 19 | #' 20 | #' @keywords datasets 21 | #' @examples 22 | #' data(hpc_cv) 23 | #' str(hpc_cv) 24 | NULL 25 | -------------------------------------------------------------------------------- /R/hpc_data.R: -------------------------------------------------------------------------------- 1 | #' High-performance computing system data 2 | #' 3 | #' Kuhn and Johnson (2013) describe a data set where characteristics of unix 4 | #' jobs were used to classify their completion times as either very fast 5 | #' (1 min or less, `VF`), fast (1–5 min, `F`), moderate (5–30 min, `M`), or 6 | #' long (greater than 30 min, `L`). 7 | #' 8 | #' 9 | #' @name hpc_data 10 | #' @aliases hpc_data 11 | #' @docType data 12 | #' @return \item{hpc_data}{a tibble} 13 | #' @keywords datasets 14 | #' @source 15 | #' Kuhn, M., Johnson, K.
(2013) *Applied Predictive Modeling*, Springer. 16 | #' @examples 17 | #' 18 | #' data(hpc_data) 19 | #' str(hpc_data) 20 | NULL 21 | -------------------------------------------------------------------------------- /R/ischemic_stroke.R: -------------------------------------------------------------------------------- 1 | #' Clinical data used to predict ischemic stroke 2 | #' 3 | #' @description 4 | #' A data set to predict a binary outcome using imaging and patient data. 5 | #' 6 | #' @name ischemic_stroke 7 | #' @aliases ischemic_stroke 8 | #' @docType data 9 | #' @return \item{ischemic_stroke}{a tibble} 10 | #' 11 | #' @details 12 | #' These data were gathered to predict patient risk for ischemic stroke. A 13 | #' historical set of patients with a range of carotid artery blockages were 14 | #' selected. The data consisted of 126 patients, 44 of whom had blockages 15 | #' greater than 70%. All patients had undergone Computed Tomography Angiography 16 | #' (CTA) to generate a detailed three-dimensional visualization and 17 | #' characterization of the blockage. These images were then analyzed in order to 18 | #' compute several features related to the disease, including percent stenosis, 19 | #' arterial wall thickness, and tissue characteristics such as lipid-rich 20 | #' necrotic core and calcification. 21 | #' 22 | #' The group of patients in this study also had follow-up information on 23 | #' whether or not a stroke occurred at a subsequent point in time. The data for 24 | #' each patient also included commonly collected clinical characteristics for 25 | #' risk of stroke such as whether or not the patient had atrial fibrillation, 26 | #' coronary artery disease, and a history of smoking. Demographics of gender and 27 | #' age were included as well. These readily available risk factors can be 28 | #' thought of as another potentially useful predictor set that can be evaluated. 29 | #' In fact, this set of predictors should be evaluated first to assess their 30 | #' ability to predict stroke since these predictors are easy to collect, are 31 | #' acquired at patient presentation, and do not require an expensive imaging 32 | #' technique.
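#'
#' As a minimal sketch of that baseline comparison (an illustration, not part
#' of the original study), a logistic regression could be fit on the clinical
#' predictors listed below:
#' \preformatted{
#' data(ischemic_stroke)
#' # baseline model using only the readily available clinical risk factors
#' baseline_fit <- glm(
#'   I(stroke == "yes") ~ age + male + smoking_history +
#'     atrial_fibrillation + coronary_artery_disease,
#'   data = ischemic_stroke,
#'   family = binomial
#' )
#' }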
33 | #' 34 | #' Columns: 35 | #' \itemize{ 36 | #' \item \code{stroke}: factor (levels: 'yes' and 'no') 37 | #' \item \code{nascet_scale}: numeric 38 | #' \item \code{calc_vol}: numeric 39 | #' \item \code{calc_vol_prop}: numeric 40 | #' \item \code{matx_vol}: numeric 41 | #' \item \code{matx_vol_prop}: numeric 42 | #' \item \code{lrnc_vol}: numeric 43 | #' \item \code{lrnc_vol_prop}: numeric 44 | #' \item \code{max_calc_area}: numeric 45 | #' \item \code{max_calc_area_prop}: numeric 46 | #' \item \code{max_dilation_by_area}: numeric 47 | #' \item \code{max_matx_area}: numeric 48 | #' \item \code{max_matx_area_prop}: numeric 49 | #' \item \code{max_lrnc_area}: numeric 50 | #' \item \code{max_lrnc_area_prop}: numeric 51 | #' \item \code{max_max_wall_thickness}: numeric 52 | #' \item \code{max_remodeling_ratio}: numeric 53 | #' \item \code{max_stenosis_by_area}: numeric 54 | #' \item \code{max_wall_area}: numeric 55 | #' \item \code{wall_vol}: numeric 56 | #' \item \code{max_stenosis_by_diameter}: numeric 57 | #' \item \code{age}: integer 58 | #' \item \code{male}: integer 59 | #' \item \code{smoking_history}: integer 60 | #' \item \code{atrial_fibrillation}: integer 61 | #' \item \code{coronary_artery_disease}: integer 62 | #' \item \code{diabetes_history}: integer 63 | #' \item \code{hypercholesterolemia_history}: integer 64 | #' \item \code{hypertension_history}: integer 65 | #' } 66 | #' @source 67 | #' Kuhn, Max, and Kjell Johnson. _Feature Engineering and Selection: A Practical 68 | #' Approach for Predictive Models_. Chapman and Hall/CRC, 2019. 69 | #' 70 | #' @examples 71 | #' data(ischemic_stroke) 72 | #' str(ischemic_stroke) 73 | #' 74 | NULL 75 | -------------------------------------------------------------------------------- /R/leaf_id_flavia.R: -------------------------------------------------------------------------------- 1 | #' Leaf identification data (Flavia) 2 | #' 3 | #' @description 4 | #' Image analysis of leaves to predict species. 5 | #' 6 | #' @name leaf_id_flavia 7 | #' @aliases leaf_id_flavia 8 | #' @docType data 9 | #' @return \item{leaf_id_flavia}{a data frame} 10 | #' 11 | #' @details 12 | #' From the original manuscript: "The Flavia dataset contains 1907 leaf images. 13 | #' There are 32 different species and each has 50-77 images. Scanners and 14 | #' digital cameras are used to acquire the leaf images on a plain background. 15 | #' The isolated leaf images contain blades only, without a petiole. These leaf 16 | #' images are collected from the most common plants in Yangtze Delta, 17 | #' China. Those leaves were sampled on the campus of the Nanjing University and 18 | #' the Sun Yat-Sen arboretum, Nanking, China." 19 | #' 20 | #' The reference below has detailed information on the features used for 21 | #' prediction.
22 | #' 23 | #' Columns: 24 | #' \itemize{ 25 | #' \item \code{species}: factor (32 levels) 26 | #' \item \code{apex}: factor (9 levels) 27 | #' \item \code{base}: factor (6 levels) 28 | #' \item \code{shape}: factor (5 levels) 29 | #' \item \code{denate_edge}: factor (levels: 'no' and 'yes') 30 | #' \item \code{lobed_edge}: factor (levels: 'no' and 'yes') 31 | #' \item \code{smooth_edge}: factor (levels: 'no' and 'yes') 32 | #' \item \code{toothed_edge}: factor (levels: 'no' and 'yes') 33 | #' \item \code{undulate_edge}: factor (levels: 'no' and 'yes') 34 | #' \item \code{outlying_polar}: numeric 35 | #' \item \code{skewed_polar}: numeric 36 | #' \item \code{clumpy_polar}: numeric 37 | #' \item \code{sparse_polar}: numeric 38 | #' \item \code{striated_polar}: numeric 39 | #' \item \code{convex_polar}: numeric 40 | #' \item \code{skinny_polar}: numeric 41 | #' \item \code{stringy_polar}: numeric 42 | #' \item \code{monotonic_polar}: numeric 43 | #' \item \code{outlying_contour}: numeric 44 | #' \item \code{skewed_contour}: numeric 45 | #' \item \code{clumpy_contour}: numeric 46 | #' \item \code{sparse_contour}: numeric 47 | #' \item \code{striated_contour}: numeric 48 | #' \item \code{convex_contour}: numeric 49 | #' \item \code{skinny_contour}: numeric 50 | #' \item \code{stringy_contour}: numeric 51 | #' \item \code{monotonic_contour}: numeric 52 | #' \item \code{num_max_ponits}: numeric 53 | #' \item \code{num_min_points}: numeric 54 | #' \item \code{diameter}: numeric 55 | #' \item \code{area}: numeric 56 | #' \item \code{perimeter}: numeric 57 | #' \item \code{physiological_length}: numeric 58 | #' \item \code{physiological_width}: numeric 59 | #' \item \code{aspect_ratio}: numeric 60 | #' \item \code{rectangularity}: numeric 61 | #' \item \code{circularity}: numeric 62 | #' \item \code{compactness}: numeric 63 | #' \item \code{narrow_factor}: numeric 64 | #' \item \code{perimeter_ratio_diameter}: numeric 65 | #' \item \code{perimeter_ratio_length}: numeric 66 | #' \item \code{perimeter_ratio_lw}: numeric 67 | #' \item \code{num_convex_points}: numeric 68 | #' \item \code{perimeter_convexity}: numeric 69 | #' \item \code{area_convexity}: numeric 70 | #' \item \code{area_ratio_convexity}: numeric 71 | #' \item \code{equivalent_diameter}: numeric 72 | #' \item \code{eccentriciry}: numeric 73 | #' \item \code{contrast}: numeric 74 | #' \item \code{correlation_texture}: numeric 75 | #' \item \code{inverse_difference_moments}: numeric 76 | #' \item \code{entropy}: numeric 77 | #' \item \code{mean_red_val}: numeric 78 | #' \item \code{mean_green_val}: numeric 79 | #' \item \code{mean_blue_val}: numeric 80 | #' \item \code{std_red_val}: numeric 81 | #' \item \code{std_green_val}: numeric 82 | #' \item \code{std_blue_val}: numeric 83 | #' \item \code{correlation}: numeric 84 | #' } 85 | #' @source 86 | #' Lakshika, Jayani PG, and Thiyanga S. Talagala. "Computer-aided interpretable 87 | #' features for leaf image classification." _arXiv preprint_ arXiv:2106.08077 88 | #' (2021). 89 | #' 90 | #' \url{https://github.com/SMART-Research/leaffeatures_paper} 91 | #' 92 | #' @examples 93 | #' data(leaf_id_flavia) 94 | #' str(leaf_id_flavia) 95 | #' 96 | NULL 97 | -------------------------------------------------------------------------------- /R/lending_club.R: -------------------------------------------------------------------------------- 1 | #' Loan data 2 | #' 3 | #' @details These data were downloaded from the Lending Club 4 | #' access site (see below) and are from the first quarter of 2016. 
5 | #' A subset of the rows and variables are included here. The 6 | #' outcome is in the variable `Class` and is either "good" (meaning 7 | #' that the loan was fully paid back or currently on-time) or "bad" 8 | #' (charged off, defaulted, or 21-120 days late). A data dictionary 9 | #' can be found on the source website. 10 | #' 11 | #' @name lending_club 12 | #' @aliases lending_club 13 | #' @docType data 14 | #' @return \item{lending_club}{a data frame} 15 | #' 16 | #' @source Lending Club Statistics https://www.lendingclub.com/info/download-data.action 17 | #' 18 | #' @keywords datasets 19 | #' @examples 20 | #' data(lending_club) 21 | #' str(lending_club) 22 | NULL 23 | -------------------------------------------------------------------------------- /R/meats.R: -------------------------------------------------------------------------------- 1 | #' Fat, water and protein content of meat samples 2 | #' 3 | #' "These data are recorded on a Tecator Infratec Food and Feed Analyzer 4 | #' working in the wavelength range 850 - 1050 nm by the Near Infrared 5 | #' Transmission (NIT) principle. Each sample contains finely chopped pure meat 6 | #' with different moisture, fat and protein contents. 7 | #' 8 | #' If results from these data are used in a publication we want you to mention 9 | #' the instrument and company name (Tecator) in the publication. In addition, 10 | #' please send a preprint of your article to: 11 | #' 12 | #' Karin Thente, Tecator AB, Box 70, S-263 21 Hoganas, Sweden 13 | #' 14 | #' The data are available in the public domain with no responsibility from the 15 | #' original data source. The data can be redistributed as long as this 16 | #' permission note is attached." 17 | #' 18 | #' "For each meat sample the data consists of a 100 channel spectrum of 19 | #' absorbances and the contents of moisture (water), fat and protein. The 20 | #' absorbance is -log10 of the transmittance measured by the spectrometer. The 21 | #' three contents, measured in percent, are determined by analytic chemistry." 22 | #' 23 | #' Included here are the training, monitoring and test sets.
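#'
#' As a quick illustration of the absorbance definition above (this assumes
#' the 100 spectral channels are the first 100 columns; check `str(meats)`
#' first):
#' \preformatted{
#' data(meats)
#' # absorbance = -log10(transmittance), so transmittance can be recovered as:
#' transmittance <- 10^(-meats[, 1:100])
#' }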
24 | #' 25 | #' 26 | #' @name meats 27 | #' @aliases meats 28 | #' @docType data 29 | #' @return \item{meats}{a tibble} 30 | #' @keywords datasets 31 | #' @examples 32 | #' 33 | #' data(meats) 34 | #' str(meats) 35 | NULL 36 | -------------------------------------------------------------------------------- /R/modeldata-package.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | 4 | ## usethis namespace: start 5 | #' @importFrom stats rnorm runif 6 | ## usethis namespace: end 7 | NULL 8 | 9 | 10 | # needed for simulation docs 11 | utils::globalVariables( 12 | c( 13 | ".", 14 | "linear_pred", 15 | "non_linear_1", 16 | "non_linear_2", 17 | "non_linear_3", 18 | "outcome", 19 | "predictor_01", 20 | "predictor_02", 21 | "predictor_03", 22 | "predictor_04", 23 | "predictor_05", 24 | "predictor_06", 25 | "predictor_07", 26 | "predictor_08", 27 | "predictor_09", 28 | "predictor_10", 29 | "predictor_11", 30 | "predictor_12", 31 | "predictor_13", 32 | "predictor_14", 33 | "predictor_15", 34 | "predictor_16", 35 | "predictor_17", 36 | "predictor_18", 37 | "predictor_19", 38 | "predictor_20", 39 | "rand", 40 | "true_prob", 41 | "two_factor_1", 42 | "two_factor_2", 43 | ".truth", 44 | ".linear_pred", 45 | ".rand" 46 | ) 47 | ) 48 | -------------------------------------------------------------------------------- /R/oils.R: -------------------------------------------------------------------------------- 1 | #' Fatty acid composition of commercial oils 2 | #' 3 | #' Fatty acid concentrations of commercial oils were measured using gas 4 | #' chromatography. The data are used to predict the type of oil. Note that 5 | #' only the known oils are in the data set. Also, the authors state that there 6 | #' are 95 samples of known oils. However, we count 96 in Table 1 (pgs. 33-35). 7 | #' 8 | #' 9 | #' @name oils 10 | #' @aliases oils 11 | #' @docType data 12 | #' @return \item{oils}{a tibble} 13 | #' @source Brodnjak-Voncina et al. (2005). Multivariate data analysis in 14 | #' classification of vegetable oils characterized by the content of fatty 15 | #' acids, \emph{Chemometrics and Intelligent Laboratory Systems}, Vol. 16 | #' 75:31-45. 17 | #' @keywords datasets 18 | #' @examples 19 | #' data(oils) 20 | #' str(oils) 21 | NULL 22 | -------------------------------------------------------------------------------- /R/parabolic.R: -------------------------------------------------------------------------------- 1 | #' Parabolic class boundary data 2 | #' 3 | #' @details These data were simulated. There are two correlated predictors and 4 | #' two classes in the factor outcome. 5 | #' 6 | #' @name parabolic 7 | #' @aliases parabolic 8 | #' @docType data 9 | #' @return \item{parabolic}{a data frame} 10 | #' 11 | #' @keywords datasets 12 | #' @examples 13 | #' data(parabolic) 14 | #' str(parabolic) 15 | NULL 16 | -------------------------------------------------------------------------------- /R/pathology.R: -------------------------------------------------------------------------------- 1 | #' Liver pathology data 2 | #' 3 | #' @details These data have the results of an _x_-ray examination 4 | #' to determine whether the liver is abnormal or not (in the `scan` 5 | #' column) versus the more extensive pathology results that 6 | #' approximate the truth (in `pathology`). 7 | #' 8 | #' @name pathology 9 | #' @aliases pathology 10 | #' @docType data 11 | #' @return \item{pathology}{a data frame} 12 | #' 13 | #' @source Altman, D.G., Bland, J.M.
(1994) ``Diagnostic tests 1: 14 | #' sensitivity and specificity,'' *British Medical Journal*, 15 | #' vol 308, 1552. 16 | #' 17 | #' 18 | #' @keywords datasets 19 | #' @examples 20 | #' data(pathology) 21 | #' str(pathology) 22 | NULL 23 | -------------------------------------------------------------------------------- /R/pd_speech.R: -------------------------------------------------------------------------------- 1 | #' Parkinson's disease speech classification data set 2 | #' 3 | #' @details From the UCI ML archive, the description is "The data used in this 4 | #' study were gathered from 188 patients with PD (107 men and 81 women) with 5 | #' ages ranging from 33 to 87 (65.1 p/m 10.9) at the Department of Neurology 6 | #' in Cerrahpaşa Faculty of Medicine, Istanbul University. The control group 7 | #' consists of 64 healthy individuals (23 men and 41 women) with ages varying 8 | #' between 41 and 82 (61.1 p/m 8.9). During the data collection process, 9 | #' the microphone is set to 44.1 KHz and following the physician's examination, 10 | #' the sustained phonation of the vowel `/a/` was collected from each subject 11 | #' with three repetitions." 12 | #' 13 | #' The data here are averaged over the replicates. 14 | #' 15 | #' @name pd_speech 16 | #' @aliases pd_speech 17 | #' @docType data 18 | #' @return \item{pd_speech}{a data frame} 19 | #' 20 | #' @source UCI ML repository (data) https://archive.ics.uci.edu/ml/datasets/Parkinson%27s+Disease+Classification#, 21 | #' 22 | #' Sakar et al (2019), "A comparative analysis of speech signal processing 23 | #' algorithms for Parkinson’s disease classification and the use of the tunable 24 | #' Q-factor wavelet transform", _Applied Soft Computing_, V74, pg 255-263. 25 | #' 26 | #' @keywords datasets 27 | #' @examples 28 | #' data(pd_speech) 29 | #' str(pd_speech) 30 | NULL 31 | -------------------------------------------------------------------------------- /R/penguins.R: -------------------------------------------------------------------------------- 1 | #' Palmer Station penguin data 2 | #' 3 | #' A data set from Gorman, Williams, and Fraser (2014) containing measurements 4 | #' from different types of penguins. This version of the data was retrieved from 5 | #' Allison Horst's `palmerpenguins` package on 2020-06-22. 6 | #' 7 | #' @name penguins 8 | #' @aliases penguins 9 | #' @docType data 10 | #' @return \item{penguins}{a tibble} 11 | #' @source Gorman KB, Williams TD, Fraser WR (2014) Ecological Sexual Dimorphism 12 | #' and Environmental Variability within a Community of Antarctic Penguins 13 | #' (_Genus Pygoscelis_). PLoS ONE 9(3): e90081. 14 | #' \doi{10.1371/journal.pone.0090081} 15 | #' 16 | #' \url{https://github.com/allisonhorst/palmerpenguins} 17 | #' @keywords datasets 18 | #' @examples 19 | #' data(penguins) 20 | #' str(penguins) 21 | NULL 22 | -------------------------------------------------------------------------------- /R/permeability_qsar.R: -------------------------------------------------------------------------------- 1 | #' Predicting permeability from chemical information 2 | #' 3 | #' @description 4 | #' A quantitative structure-activity relationship (QSAR) data set to predict 5 | #' when a molecule can permeate cells. 6 | #' 7 | #' @name permeability_qsar 8 | #' @aliases permeability_qsar 9 | #' @docType data 10 | #' @return \item{permeability_qsar}{a data frame} 11 | #' 12 | #' @details 13 | #' This pharmaceutical data set was used to develop a model for predicting 14 | #' compounds' permeability. 
In short, permeability is the measure of a 15 | #' molecule's ability to cross a membrane. The body, for example, has notable 16 | #' membranes between the body and brain, known as the blood-brain barrier, and 17 | #' between the gut and body in the intestines. These membranes help the body 18 | #' guard critical regions from receiving undesirable or detrimental substances. 19 | #' For an orally taken drug to be effective in the brain, it first must pass 20 | #' through the intestinal wall and then must pass through the blood-brain 21 | #' barrier in order to be present for the desired neurological target. 22 | #' Therefore, a compound's ability to permeate relevant biological membranes 23 | #' is critically important to understand early in the drug discovery process. 24 | #' Compounds that appear to be effective for a particular disease in research 25 | #' screening experiments but appear to be poorly permeable may need to be 26 | #' altered in order to improve permeability, and thus the compound's ability to 27 | #' reach the desired target. Identifying permeability problems can help guide 28 | #' chemists towards better molecules. 29 | #' 30 | #' Permeability assays such as PAMPA and Caco-2 have been developed to help 31 | #' measure compounds' permeability (Kansy et al, 1998). These screens are 32 | #' effective at quantifying a compound's permeability, but the assay is 33 | #' expensive and labor intensive. Given a sufficient number of compounds that have 34 | #' been screened, we could develop a predictive model for permeability in an 35 | #' attempt to potentially reduce the need for the assay. In this project there 36 | #' were 165 unique compounds; 1107 molecular fingerprints were determined for 37 | #' each. A molecular fingerprint is a binary sequence of numbers that 38 | #' represents the presence or absence of a specific molecular sub-structure. 39 | #' The response is highly skewed, the predictors are sparse (15.5% are present), 40 | #' and many predictors are strongly associated. 41 | #' 42 | #' Columns: 43 | #' \itemize{ 44 | #' \item \code{permeability}: numeric 45 | #' \item \code{chem_fp_0001} - \code{chem_fp_1107}: numeric 46 | #' } 47 | #' 48 | #' @source 49 | #' Kuhn, Max, and Kjell Johnson. _Applied predictive modeling_. New York: 50 | #' Springer, 2013. 51 | #' 52 | #' @examples 53 | #' data(permeability_qsar) 54 | #' str(permeability_qsar) 55 | #' 56 | NULL 57 | -------------------------------------------------------------------------------- /R/sacremento.R: -------------------------------------------------------------------------------- 1 | #' Sacramento CA home prices 2 | #' 3 | #' This data frame contains house and sale price data for 932 homes in 4 | #' Sacramento, CA. The original data were obtained from the website for the 5 | #' SpatialKey software. From their website: "The Sacramento real estate 6 | #' transactions file is a list of 985 real estate transactions in the 7 | #' Sacramento area reported over a five-day period, as reported by the 8 | #' Sacramento Bee." Google was used to fill in missing/incorrect data.
9 | #' 10 | #' 11 | #' @name Sacramento 12 | #' @docType data 13 | #' @return \item{Sacramento}{a tibble} 14 | #' @source SpatialKey website: 15 | #' \url{https://support.spatialkey.com/spatialkey-sample-csv-data/} 16 | #' @keywords datasets 17 | #' @examples 18 | #' data(Sacramento) 19 | #' str(Sacramento) 20 | NULL 21 | -------------------------------------------------------------------------------- /R/scat.R: -------------------------------------------------------------------------------- 1 | #' Morphometric data on scat 2 | #' 3 | #' Reid (2015) collected data on animal feces in coastal California. The data 4 | #' consist of DNA verified species designations as well as fields related to 5 | #' the time and place of the collection and the scat itself. The data are on 6 | #' the three main species (bobcat, coyote, and gray fox). 7 | #' 8 | #' 9 | #' @name scat 10 | #' @aliases scat 11 | #' @docType data 12 | #' @return \item{scat}{a tibble} 13 | #' @source Reid, R. E. B. (2015). A morphometric modeling approach to 14 | #' distinguishing among bobcat, coyote and gray fox scats. \emph{Wildlife 15 | #' Biology}, 21(5), 254-262 16 | #' @keywords datasets 17 | #' @examples 18 | #' data(scat) 19 | #' str(scat) 20 | NULL 21 | -------------------------------------------------------------------------------- /R/solubility.R: -------------------------------------------------------------------------------- 1 | #' Solubility predictions from MARS model 2 | #' 3 | #' @details For the solubility data in Kuhn and Johnson (2013), 4 | #' these data are the test set results for the MARS model. The 5 | #' observed solubility (in column `solubility`) and the model 6 | #' results (`prediction`) are contained in the data. 7 | #' 8 | #' @name solubility_test 9 | #' @aliases solubility_test 10 | #' @docType data 11 | #' @return \item{solubility_test}{a data frame} 12 | #' 13 | #' @source Kuhn, M., Johnson, K. (2013) *Applied Predictive 14 | #' Modeling*, Springer 15 | #' 16 | #' @keywords datasets 17 | #' @examples 18 | #' data(solubility_test) 19 | #' str(solubility_test) 20 | NULL 21 | -------------------------------------------------------------------------------- /R/stackoverflow.R: -------------------------------------------------------------------------------- 1 | #' Annual Stack Overflow Developer Survey Data 2 | #' 3 | #' @details These data are a collection of 5,594 data points collected on 4 | #' developers. These data could be used to try to predict who works remotely 5 | #' (as used in the source listed below). 6 | #' 7 | #' @name stackoverflow 8 | #' @aliases stackoverflow 9 | #' @docType data 10 | #' @return \item{stackoverflow}{a tibble} 11 | #' 12 | #' @source 13 | #' Julia Silge, _Supervised Machine Learning Case Studies in R_ 14 | #' 15 | #' `https://supervised-ml-course.netlify.com/chapter2` 16 | #' 17 | #' Raw data: `https://insights.stackoverflow.com/survey/` 18 | #' @keywords datasets 19 | #' @examples 20 | #' data(stackoverflow) 21 | #' str(stackoverflow) 22 | NULL 23 | -------------------------------------------------------------------------------- /R/steroidogenic_toxicity.R: -------------------------------------------------------------------------------- 1 | #' Predicting steroidogenic toxicity with assay data 2 | #' 3 | #' @description 4 | #' A set of _in vitro_ assays is used to quantify the risk of reproductive 5 | #' toxicity via the disruption of steroidogenic pathways.
6 | #' 7 | #' @name steroidogenic_toxicity 8 | #' @aliases steroidogenic_toxicity 9 | #' @docType data 10 | #' @return A tibble with columns 11 | #' - `class`: factor (levels: 'toxic' and 'nontoxic') 12 | #' - `cyp_11a1`: numeric 13 | #' - `cyp_11b1`: numeric 14 | #' - `cyp_11b2`: numeric 15 | #' - `cyp_17a1`: numeric 16 | #' - `cyp_19a1`: numeric 17 | #' - `cyp_21a1`: numeric 18 | #' - `hsd3b2`: numeric 19 | #' - `star`: numeric 20 | #' - `progesterone`: numeric 21 | #' - `testosterone`: numeric 22 | #' - `dhea`: numeric 23 | #' - `cortisol`: numeric 24 | #' 25 | #' @details 26 | #' H295R cells were used to measure the effect with two sets of assay results. 27 | #' The first includes a set of protein measurements on cytochrome P450 enzymes 28 | #' ("cyp"s), STAR, and 3BHSD2. The second includes hormone measurements for 29 | #' DHEA, progesterone, testosterone, and cortisol. 30 | #' 31 | #' Columns: 32 | #' \itemize{ 33 | #' \item \code{class}: factor (levels: 'toxic' and 'nontoxic') 34 | #' \item \code{cyp_11a1}: numeric 35 | #' \item \code{cyp_11b1}: numeric 36 | #' \item \code{cyp_11b2}: numeric 37 | #' \item \code{cyp_17a1}: numeric 38 | #' \item \code{cyp_19a1}: numeric 39 | #' \item \code{cyp_21a1}: numeric 40 | #' \item \code{hsd3b2}: numeric 41 | #' \item \code{star}: numeric 42 | #' \item \code{progesterone}: numeric 43 | #' \item \code{testosterone}: numeric 44 | #' \item \code{dhea}: numeric 45 | #' \item \code{cortisol}: numeric 46 | #' } 47 | #' 48 | #' @source 49 | #' Maglich, J. M., Kuhn, M., Chapin, R. E., & Pletcher, M. T. (2014). More than 50 | #' just hormones: H295R cells as predictors of reproductive toxicity. 51 | #' _Reproductive Toxicology_, 45, 77-86. 52 | #' 53 | #' @examples 54 | #' data(steroidogenic_toxicity) 55 | #' str(steroidogenic_toxicity) 56 | #' 57 | NULL 58 | -------------------------------------------------------------------------------- /R/tate_text.R: -------------------------------------------------------------------------------- 1 | #' Tate Gallery modern artwork metadata 2 | #' 3 | #' Metadata such as artist, title, and year created for recent artworks owned 4 | #' by the Tate Gallery. Only artworks created during or after 1990 are 5 | #' included, and the metadata source was last updated in 2014. The Tate Gallery 6 | #' provides these data but requests users to be respectful of their 7 | #' [guidelines for use](https://github.com/tategallery/collection#usage-guidelines-for-open-data). 8 | #' 9 | #' @name tate_text 10 | #' @aliases tate_text 11 | #' @docType data 12 | #' @return \item{tate_text}{a tibble} 13 | #' 14 | #' @source \itemize{ 15 | #' \item \url{https://github.com/tategallery/collection} 16 | #' \item \url{https://www.tate.org.uk/} 17 | #' } 18 | #' 19 | #' @keywords datasets 20 | #' @examples 21 | #' data(tate_text) 22 | #' str(tate_text) 23 | NULL 24 | -------------------------------------------------------------------------------- /R/taxi.R: -------------------------------------------------------------------------------- 1 | #' Chicago taxi data set 2 | #' 3 | #' @description 4 | #' 5 | #' A data set containing information on a subset of taxi trips in the city 6 | #' of Chicago in 2022. 7 | #' 8 | #' @name taxi 9 | #' @aliases taxi 10 | #' @docType data 11 | #' 12 | #' @return tibble 13 | #' 14 | #' @details 15 | #' 16 | #' The source data are originally described on the linked City of Chicago 17 | #' data portal.
The data exported here are a pre-processed subset motivated by 18 | #' the modeling problem of predicting whether a rider will tip or not. 19 | #' 20 | #' \describe{ 21 | #' \item{tip}{Whether the rider left a tip. A factor with levels 22 | #' "yes" and "no".} 23 | #' \item{distance}{The trip distance, in odometer miles.} 24 | #' \item{company}{The taxi company, as a factor. Companies that occurred 25 | #' few times were binned as "other".} 26 | #' \item{local}{Whether the trip's starting and ending locations are in the 27 | #' same community. See the source data for community area values.} 28 | #' \item{dow}{The day of the week in which the trip began, as a 29 | #' factor.} 30 | #' \item{month}{The month in which the trip began, as a factor.} 31 | #' \item{hour}{The hour of the day in which the trip began, as a 32 | #' numeric.} 33 | #' } 34 | #' 35 | #' @source 36 | #' 37 | #' \url{https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew} 38 | #' 39 | #' @examples 40 | #' \donttest{ 41 | #' taxi 42 | #' } 43 | NULL 44 | -------------------------------------------------------------------------------- /R/two_class_dat.R: -------------------------------------------------------------------------------- 1 | #' Two class data 2 | #' 3 | #' @details These are artificial data with two predictors (`A` and `B`) and 4 | #' a factor outcome variable (`Class`). 5 | #' 6 | #' @name two_class_dat 7 | #' @aliases two_class_dat 8 | #' @docType data 9 | #' @return \item{two_class_dat}{a data frame} 10 | #' 11 | #' @keywords datasets 12 | #' @examples 13 | #' data(two_class_dat) 14 | #' str(two_class_dat) 15 | NULL 16 | 17 | #' Two class predictions 18 | #' 19 | #' @details These data are a test set from a model built for two 20 | #' classes ("Class1" and "Class2"). There are columns for the true 21 | #' and predicted classes and columns for the probabilities for each 22 | #' class. 23 | #' 24 | #' @name two_class_example 25 | #' @aliases two_class_example 26 | #' @docType data 27 | #' @return \item{two_class_example}{a data frame} 28 | #' 29 | #' @keywords datasets 30 | #' @examples 31 | #' data(two_class_example) 32 | #' str(two_class_example) 33 | NULL 34 | -------------------------------------------------------------------------------- /R/wa_churn.R: -------------------------------------------------------------------------------- 1 | #' Watson churn data 2 | #' 3 | #' @details These data were downloaded from the IBM Watson site 4 | #' (see below) in September 2018. The data contain a factor for 5 | #' whether a customer churned or not. Alternatively, since the `tenure` 6 | #' column contains information on how long the customer has had an 7 | #' account, a survival analysis can be done on this column using the 8 | #' `churn` outcome as the censoring information. A 9 | #' data dictionary can be found on the source website.
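#'
#' A minimal sketch of that survival analysis (an illustration; it assumes
#' the event level of `churn` is "Yes"):
#' \preformatted{
#' library(survival)
#' data(wa_churn)
#' # Kaplan-Meier curve of account tenure, censored by the churn outcome
#' km_fit <- survfit(Surv(tenure, churn == "Yes") ~ 1, data = wa_churn)
#' }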
10 | #' 11 | #' @name wa_churn 12 | #' @aliases wa_churn 13 | #' @docType data 14 | #' @return \item{wa_churn}{a data frame} 15 | #' 16 | #' @source IBM Watson Analytics https://ibm.co/2sOvyvy 17 | #' 18 | #' @keywords datasets 19 | #' @examples 20 | #' data(wa_churn) 21 | #' str(wa_churn) 22 | NULL 23 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r} 8 | #| include: false 9 | knitr::opts_chunk$set( 10 | collapse = TRUE, 11 | comment = "#>", 12 | fig.path = "man/figures/README-", 13 | out.width = "100%" 14 | ) 15 | ``` 16 | 17 | # modeldata 18 | 19 | 20 | [![CRAN status](https://www.r-pkg.org/badges/version/modeldata)](https://CRAN.R-project.org/package=modeldata) 21 | [![R-CMD-check](https://github.com/tidymodels/modeldata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/tidymodels/modeldata/actions/workflows/R-CMD-check.yaml) 22 | [![lifecycle](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://lifecycle.r-lib.org/articles/stages.html) 23 | 24 | 25 | `modeldata` contains data sets used in documentation and testing for tidymodels packages. The package also contains a suite of simulation functions for classification and regression data. 26 | 27 | ## Installation 28 | 29 | You can install the released version of modeldata from [CRAN](https://CRAN.R-project.org) with: 30 | 31 | ``` r 32 | install.packages("modeldata") 33 | ``` 34 | 35 | And the development version from [GitHub](https://github.com/) with: 36 | 37 | ``` r 38 | # install.packages("pak") 39 | pak::pak("tidymodels/modeldata") 40 | ``` 41 | 42 | ## Contributing 43 | 44 | This project is released with a [Contributor Code of Conduct](https://contributor-covenant.org/version/2/1/CODE_OF_CONDUCT.html). By contributing to this project, you agree to abide by its terms. 45 | 46 | - For questions and discussions about tidymodels packages, modeling, and machine learning, please [post on RStudio Community](https://forum.posit.co/new-topic?category_id=15&tags=tidymodels,question). 47 | 48 | - If you think you have encountered a bug, please [submit an issue](https://github.com/tidymodels/modeldata/issues). 49 | 50 | - Either way, learn how to create and share a [reprex](https://reprex.tidyverse.org/articles/articles/learn-reprex.html) (a minimal, reproducible example), to clearly communicate about your code. 51 | 52 | - Check out further details on [contributing guidelines for tidymodels packages](https://www.tidymodels.org/contribute/) and [how to get help](https://www.tidymodels.org/help/). 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # modeldata 5 | 6 | 7 | 8 | [![CRAN 9 | status](https://www.r-pkg.org/badges/version/modeldata)](https://CRAN.R-project.org/package=modeldata) 10 | [![R-CMD-check](https://github.com/tidymodels/modeldata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/tidymodels/modeldata/actions/workflows/R-CMD-check.yaml) 11 | [![lifecycle](https://img.shields.io/badge/lifecycle-stable-brightgreen.svg)](https://lifecycle.r-lib.org/articles/stages.html) 12 | 13 | 14 | `modeldata` contains data sets used in documentation and testing for 15 | tidymodels packages. 
The package also contains a suite of simulation 16 | functions for classification and regression data. 17 | 18 | ## Installation 19 | 20 | You can install the released version of modeldata from 21 | [CRAN](https://CRAN.R-project.org) with: 22 | 23 | ``` r 24 | install.packages("modeldata") 25 | ``` 26 | 27 | And the development version from [GitHub](https://github.com/) with: 28 | 29 | ``` r 30 | # install.packages("pak") 31 | pak::pak("tidymodels/modeldata") 32 | ``` 33 | 34 | ## Contributing 35 | 36 | This project is released with a [Contributor Code of 37 | Conduct](https://contributor-covenant.org/version/2/1/CODE_OF_CONDUCT.html). 38 | By contributing to this project, you agree to abide by its terms. 39 | 40 | - For questions and discussions about tidymodels packages, modeling, and 41 | machine learning, please [post on RStudio 42 | Community](https://forum.posit.co/new-topic?category_id=15&tags=tidymodels,question). 43 | 44 | - If you think you have encountered a bug, please [submit an 45 | issue](https://github.com/tidymodels/modeldata/issues). 46 | 47 | - Either way, learn how to create and share a 48 | [reprex](https://reprex.tidyverse.org/articles/articles/learn-reprex.html) 49 | (a minimal, reproducible example), to clearly communicate about your 50 | code. 51 | 52 | - Check out further details on [contributing guidelines for tidymodels 53 | packages](https://www.tidymodels.org/contribute/) and [how to get 54 | help](https://www.tidymodels.org/help/). 55 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://modeldata.tidymodels.org 2 | 3 | template: 4 | package: tidytemplate 5 | bootstrap: 5 6 | bslib: 7 | danger: "#CA225E" 8 | primary: "#CA225E" 9 | includes: 10 | in_header: | 11 | 12 | 13 | development: 14 | mode: auto 15 | 16 | -------------------------------------------------------------------------------- /air.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/air.toml -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 1% 9 | informational: true 10 | patch: 11 | default: 12 | target: auto 13 | threshold: 1% 14 | informational: true 15 | -------------------------------------------------------------------------------- /data-raw/cat_adoption.R: -------------------------------------------------------------------------------- 1 | ## code to prepare `cat_adoption` dataset goes here 2 | 3 | library(tidyverse) 4 | library(janitor) 5 | library(recipes) 6 | library(survival) 7 | 8 | # ------------------------------------------------------------------------------ 9 | 10 | # data from 11 | # https://data.longbeach.gov/explore/dataset/animal-shelter-intakes-and-outcomes/information/ 12 | # 13 | # We spoke with the Long Beach animal shelter on 2024-06-12 and they had some 14 | # information about the outcomes. We're looking to define the event as being 15 | # homed by the Long Beach animal shelter. 16 | # 17 | # `"community cat"` and `"shelter, neuter, return"` are animals brought in by a 18 | # community member (or field officer) for care. 
They are not owned by anyone 19 | # but are fed and given shelter by different people in the community. We count 20 | # these as observed events. 21 | # 22 | # `"homefirst"` was a program where the pet was adopted with the promise of 23 | # getting them spayed/neutered. We count these as observed events. 24 | # 25 | # `"rescue"`, `"return to rescue"`, `"transport"`, and `"transfer"` mean 26 | # that they went to a different organization that 27 | # works to home them. These are censored data. 28 | # 29 | # We also talked about how long, after intake, they would be considered in the 30 | # "risk set" of animals that could be adopted. This depends on the situation. 31 | # Some animals in poor health need more time to recover and/or be treated. 32 | # There is also a 3-5 day period to give the original owner (if any) time to 33 | # claim it. 34 | # 35 | # For our analysis, we will not include any animals that were at the center 36 | # for <= 1 week. 37 | 38 | # ------------------------------------------------------------------------------ 39 | 40 | harmonize_colors <- function(x) { 41 | x <- gsub("mut", "", x) 42 | 43 | x <- gsub("pt", "point", x) 44 | x <- gsub("(brn)|(br )", "brown ", x) 45 | x <- gsub("dil", "dilute", x) 46 | x <- gsub("org", "orange", x) 47 | x <- gsub("rd", "red", x) 48 | x <- gsub("slvr", "silver", x) 49 | x <- gsub("(crm )|(cr )", "cream ", x) 50 | x <- gsub("(slvr)|(sl)", "silver", x) 51 | x <- gsub("choc ", "chocolate ", x) 52 | x <- gsub("(lc )|(li )", "lilac ", x) 53 | x <- gsub("l-c", "lilac_cream", x, fixed = TRUE) 54 | x <- gsub("(bl )", "blue ", x) 55 | x <- gsub("^(y )", "yellow ", x) 56 | x <- gsub("(blk)|(bc)|(bk)", "black", x) 57 | 58 | # fur patterns 59 | x <- gsub("brind$", "brindle", x) 60 | x <- gsub("tab$", "tabby", x) 61 | 62 | # Things that are still unclear 63 | x <- gsub("b-c", "", x, fixed = TRUE) # "brown-cream"?
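# "s-t" below is another ambiguous color code; it is dropped the same way as "b-c"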
64 | x <- gsub("s-t", "", x, fixed = TRUE) 65 | 66 | x <- trimws(x, which = "both") 67 | gsub("[[:space:]]+", "_", x) 68 | } 69 | 70 | raw <- read_csv("data-raw/animal-shelter-intakes-and-outcomes.csv") %>% 71 | clean_names() %>% 72 | filter( 73 | animal_type == "CAT" & 74 | !is.na(outcome_type) & 75 | intake_is_dead == "Alive on Intake" & 76 | primary_color != "UNKNOWN" & 77 | secondary_color != "UNKNOWN" 78 | ) %>% 79 | filter( 80 | # These animals would not have been up for being homed 81 | !(outcome_type %in% c("DISPOSAL", "EUTHANASIA", "MISSING", "DUPLICATE")) 82 | ) %>% 83 | # There are multiple rows for some animals; take most recent 84 | arrange(animal_id, outcome_date) %>% 85 | slice_head(by = c(animal_id), n = 1) 86 | 87 | event_list <- 88 | c( 89 | "adoption", 90 | "community cat", 91 | "foster", 92 | "foster to adopt", 93 | "homefirst", 94 | "return to owner", 95 | "return to wild habitat", 96 | "shelter, neuter, return", 97 | "trap, neuter, release" 98 | ) 99 | 100 | other_list <- 101 | c("died", "rescue", "return to rescue", "transfer", "transport") 102 | 103 | cats <- raw %>% 104 | mutate( 105 | across(where(is.character), tolower), 106 | time = as.numeric(difftime(outcome_date, intake_date, units = "days")), 107 | time = if_else(time < 0, NA_real_, time), 108 | time = if_else(time < 1, 1, time), 109 | event = if_else(outcome_type %in% event_list, 1, 0), 110 | ) %>% 111 | filter(outcome_type %in% c(event_list, other_list) & time > 7) %>% 112 | select( 113 | time, 114 | event, 115 | contains("color"), 116 | sex, 117 | intake_condition, 118 | intake_type, 119 | jurisdiction, 120 | latitude, 121 | longitude, 122 | animal_id 123 | ) %>% 124 | mutate( 125 | neutered = case_when( 126 | sex %in% c("neutered", "spayed") ~ "yes", 127 | sex == "unknown" ~ "unknown", 128 | TRUE ~ "no" 129 | ), 130 | sex = case_when( 131 | sex == "neutered" ~ "male", 132 | sex == "spayed" ~ "female", 133 | TRUE ~ sex 134 | ), 135 | # clean up color labels 136 | primary_color = harmonize_colors(primary_color), 137 | secondary_color = harmonize_colors(secondary_color), 138 | # underscores 139 | intake_condition = gsub("age/weight", "age_or_weight", intake_condition), 140 | intake_condition = gsub("[[:space:]]+", "_", intake_condition), 141 | intake_type = gsub("i/i", "i_i", intake_type, fixed = TRUE), 142 | intake_type = gsub("[[:punct:]]", "", intake_type), 143 | intake_type = gsub("[[:space:]]+", "_", intake_type), 144 | jurisdiction = gsub("[[:space:]]+", "_", jurisdiction) 145 | ) 146 | 147 | # Make indicators for color (which also contains pattern) 148 | col_names <- function(var, lvl, ...) 
{ 149 | lvl 150 | } 151 | cats_with_color_dummies <- cats %>% 152 | recipe() %>% 153 | step_dummy_multi_choice( 154 | ends_with("color"), 155 | threshold = 0.0, 156 | naming = col_names 157 | ) %>% 158 | step_other(intake_condition, intake_type, threshold = 0.02) %>% 159 | step_zv() %>% 160 | prep() %>% 161 | bake(new_data = NULL) 162 | 163 | col_counts <- map_int(cats_with_color_dummies %>% select(-(1:10)), sum) 164 | col_count_rm <- names(col_counts)[col_counts <= 20] 165 | 166 | cat_adoption <- 167 | cats_with_color_dummies %>% 168 | select(-all_of(col_count_rm)) %>% 169 | select(-animal_id, -jurisdiction) %>% 170 | relocate(time, event) %>% 171 | relocate(neutered, .after = sex) 172 | 173 | usethis::use_data(cat_adoption) 174 | -------------------------------------------------------------------------------- /data-raw/chem_proc_yield.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(janitor) 3 | library(AppliedPredictiveModeling) 4 | 5 | # ------------------------------------------------------------------------------ 6 | 7 | tidymodels_prefer() 8 | theme_set(theme_bw()) 9 | options(pillar.advice = FALSE, pillar.min_title_chars = Inf) 10 | 11 | # ------------------------------------------------------------------------------ 12 | 13 | data(ChemicalManufacturingProcess) 14 | 15 | chem_proc_yield <- 16 | ChemicalManufacturingProcess %>% 17 | clean_names() %>% 18 | rename_with( 19 | .cols = starts_with("manufacturing_process"), 20 | ~ gsub("manufacturing_process", "man_proc_", .x) 21 | ) %>% 22 | rename_with( 23 | .cols = starts_with("biological_material"), 24 | ~ gsub("biological_material", "bio_material_", .x) 25 | ) %>% 26 | as_tibble() 27 | 28 | # ------------------------------------------------------------------------------ 29 | 30 | usethis::use_data(chem_proc_yield) 31 | -------------------------------------------------------------------------------- /data-raw/hepatic_injury_qsar.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(janitor) 3 | library(AppliedPredictiveModeling) 4 | 5 | # ------------------------------------------------------------------------------ 6 | 7 | tidymodels_prefer() 8 | theme_set(theme_bw()) 9 | options(pillar.advice = FALSE, pillar.min_title_chars = Inf) 10 | 11 | # ------------------------------------------------------------------------------ 12 | 13 | data(hepatic) 14 | 15 | names(bio) <- recipes::names0(ncol(bio), "bio_assay_") 16 | names(chem) <- recipes::names0(ncol(chem), "chem_fp_") 17 | 18 | hepatic_injury_qsar <- 19 | bind_cols(bio, chem) %>% 20 | mutate( 21 | class = tolower(as.character(injury)), 22 | class = factor(class, ordered = TRUE, levels = c("none", "mild", "severe")) 23 | ) %>% 24 | as_tibble() %>% 25 | relocate(class) 26 | 27 | # ------------------------------------------------------------------------------ 28 | 29 | usethis::use_data(hepatic_injury_qsar, overwrite = TRUE) 30 | -------------------------------------------------------------------------------- /data-raw/hotel_rates.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(readr) 3 | library(janitor) 4 | library(textrecipes) 5 | library(lubridate) 6 | library(randomNames) 7 | 8 | # ------------------------------------------------------------------------------ 9 | 10 | tidymodels_prefer() 11 | options(pillar.advice = FALSE, pillar.min_title_chars = Inf) 12 | 13 | # 
------------------------------------------------------------------------------ 14 | 15 | # See "Hotel booking demand datasets" 16 | # https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=%22Hotel+booking+demand+datasets%22 17 | hotel_raw <- 18 | readr::read_csv( 19 | "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-02-11/hotels.csv" 20 | ) %>% 21 | as_tibble() %>% 22 | mutate( 23 | arrival_date = paste( 24 | arrival_date_year, 25 | arrival_date_month, 26 | arrival_date_day_of_month, 27 | sep = "_" 28 | ), 29 | arrival_date = ymd(arrival_date), 30 | arrival_date_num = decimal_date(arrival_date), 31 | 32 | market_segment = gsub( 33 | "TA/TO", 34 | "_travel_agent", 35 | market_segment, 36 | fixed = TRUE 37 | ), 38 | market_segment = gsub("TA", "_travel_agent", market_segment), 39 | market_segment = gsub("[[:space:]]", "", market_segment), 40 | 41 | meal = case_when( 42 | meal == "BB" ~ "Bed and Breakfast", 43 | meal == "HB" ~ "breakfast and one other meal", 44 | meal == "FB" ~ "breakfast lunch and dinner", 45 | TRUE ~ "no meal package" 46 | ), 47 | 48 | near_christmas = arrival_date_month == "December" & 49 | arrival_date_day_of_month <= 26 & 50 | arrival_date_day_of_month >= 24, 51 | near_christmas = as.numeric(near_christmas), 52 | near_new_years = (arrival_date_month == "December" & 53 | arrival_date_day_of_month >= 30) | 54 | (arrival_date_month == "January" & arrival_date_day_of_month <= 2), 55 | near_new_years = as.numeric(near_new_years) 56 | ) 57 | 58 | # ------------------------------------------------------------------------------ 59 | # instead of codes, use random names for agents and companies. Stratify by 60 | # ethnicity to avoid overlap 61 | 62 | agents <- tibble(agent = unique(hotel_raw$agent)) 63 | 64 | set.seed(1) 65 | agents$fake_name <- 66 | randomNames( 67 | nrow(agents), 68 | name.order = "first.last", 69 | name.sep = "_", 70 | ethnicity = c(1:2, 4:6), # reserve 3 for company names 71 | sample.with.replacement = FALSE 72 | ) 73 | agents$fake_name <- gsub("[[:punct:]]", "_", tolower(agents$fake_name)) 74 | agents$fake_name <- gsub("[[:space:]]", "_", tolower(agents$fake_name)) 75 | agents$fake_name[agents$agent == "NULL"] <- "not_applicable" 76 | 77 | hotel_raw <- 78 | left_join(hotel_raw, agents, by = "agent") %>% 79 | mutate(agent = fake_name) %>% 80 | select(-fake_name) 81 | 82 | ### 83 | 84 | companies <- tibble(company = unique(hotel_raw$company)) 85 | 86 | set.seed(2) 87 | companies$fake_name <- 88 | randomNames( 89 | nrow(companies), 90 | ethnicity = 3, 91 | which.names = "last", 92 | sample.with.replacement = FALSE 93 | ) 94 | companies$fake_name <- gsub("[[:punct:]]", "_", tolower(companies$fake_name)) 95 | companies$fake_name <- gsub("[[:space:]]", "_", tolower(companies$fake_name)) 96 | types <- c("_llc", "_inc", "_and_company", "_pbc") 97 | types <- sample(types, nrow(companies), replace = TRUE) 98 | companies$fake_name <- paste0(companies$fake_name, types) 99 | companies$fake_name[companies$company == "NULL"] <- "not_applicable" 100 | 101 | hotel_raw <- 102 | left_join(hotel_raw, companies, by = "company") %>% 103 | mutate(company = fake_name) %>% 104 | select(-fake_name) 105 | 106 | # ------------------------------------------------------------------------------ 107 | # version for regression analysis 108 | 109 | hotel_rates_all <- 110 | hotel_raw %>% 111 | filter( 112 | is_canceled == 0 & 113 | adr > 15 & 114 | adr < 2000 & 115 | hotel == "Resort Hotel" & 116 | reservation_status == "Check-Out" & 117 | deposit_type == 
"No Deposit" & 118 | !(market_segment %in% c("Complementary", "Undefined")) 119 | ) %>% 120 | select( 121 | -reservation_status, 122 | -is_canceled, 123 | avg_price_per_room = adr, 124 | -reservation_status_date, 125 | -hotel, 126 | -arrival_date_month, 127 | -deposit_type 128 | ) %>% 129 | mutate(year_day = yday(arrival_date)) %>% 130 | relocate(avg_price_per_room) %>% 131 | recipe() %>% 132 | step_clean_levels(all_nominal()) %>% 133 | prep() %>% 134 | bake(new_data = NULL) 135 | 136 | # ------------------------------------------------------------------------------ 137 | # pull off first year of data to compute historical ADR by day 138 | 139 | year_2016_data <- 140 | hotel_rates_all %>% 141 | filter(arrival_date <= min(arrival_date) + years(1)) 142 | 143 | year_2016_stats <- 144 | year_2016_data %>% 145 | summarize( 146 | hist_adr_raw = mean(avg_price_per_room), 147 | hist_bookings = n(), 148 | .by = year_day 149 | ) %>% 150 | arrange(year_day) 151 | 152 | year_2016_stats$historical_adr <- 153 | loess( 154 | hist_adr_raw ~ year_day, 155 | data = year_2016_stats, 156 | span = .1, 157 | degree = 2 158 | )$fitted 159 | 160 | # Add a value for the leap year 161 | year_2016_stats_leap <- 162 | tibble( 163 | year_day = 366, 164 | historical_adr = year_2016_stats$hist_adr_raw[nrow(year_2016_stats)] 165 | ) 166 | 167 | year_2016_stats <- 168 | bind_rows(year_2016_stats, year_2016_stats_leap) %>% 169 | select(year_day, historical_adr) 170 | 171 | hotel_rates <- 172 | hotel_rates_all %>% 173 | filter(arrival_date > min(arrival_date) + years(1)) %>% 174 | left_join(year_2016_stats, by = "year_day") %>% 175 | arrange(arrival_date) %>% 176 | select( 177 | -arrival_date_year, 178 | -arrival_date_week_number, 179 | -arrival_date_day_of_month, 180 | -year_day 181 | ) 182 | 183 | usethis::use_data(hotel_rates, overwrite = TRUE) 184 | -------------------------------------------------------------------------------- /data-raw/ischemic_stroke.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(janitor) 3 | library(forcats) 4 | 5 | # ------------------------------------------------------------------------------ 6 | 7 | tidymodels_prefer() 8 | theme_set(theme_bw()) 9 | options(pillar.advice = FALSE, pillar.min_title_chars = Inf) 10 | 11 | # ------------------------------------------------------------------------------ 12 | 13 | load(url( 14 | "https://github.com/topepo/FES/raw/06812c48a21882808403cee338b8312fdbd35a46/Data_Sets/Ischemic_Stroke/stroke_data.RData" 15 | )) 16 | 17 | ischemic_stroke <- 18 | bind_rows(stroke_train, stroke_test) %>% 19 | clean_names() %>% 20 | rename(male = sex, nascet_scale = nascet) %>% 21 | mutate( 22 | stroke = ifelse(stroke == "Y", "yes", "no"), 23 | stroke = factor(stroke, levels = c("yes", "no")) 24 | ) %>% 25 | as_tibble() 26 | 27 | # ------------------------------------------------------------------------------ 28 | 29 | usethis::use_data(ischemic_stroke) 30 | -------------------------------------------------------------------------------- /data-raw/leaf_id_flavia.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(janitor) 3 | library(readr) 4 | 5 | # ------------------------------------------------------------------------------ 6 | 7 | tidymodels_prefer() 8 | theme_set(theme_bw()) 9 | options(pillar.advice = FALSE, pillar.min_title_chars = Inf) 10 | 11 | # ------------------------------------------------------------------------------ 12 | 
13 | flavia_url <- 14 | "https://github.com/SMART-Research/leaffeatures_paper/raw/65ffd8c8b926b8df3f499c9224d6073975db5c3c/data_all_with_label_flavia_with_species.csv" 15 | 16 | leaf_id_flavia <- 17 | read_csv(flavia_url) %>% 18 | clean_names() %>% 19 | mutate( 20 | species = gsub("([[:punct:]])|([[:space:]])", "_", tolower(species)), 21 | shape = tolower(shape_label), 22 | apex = if_else(!is.na(apex), tolower(apex), "none"), 23 | base = if_else(!is.na(base), tolower(base), "none"), 24 | edge_type = tolower(edge_type), 25 | edge_type_2 = tolower(edge_type_2), 26 | edge_type_2 = ifelse(is.na(edge_type_2), "", edge_type_2), 27 | edges = map2_chr(edge_type, edge_type_2, ~ paste(.x, .y, sep = "_")), 28 | edges = gsub("_$", "", edges), 29 | denate_edge = ifelse( 30 | edge_type == "denate" | edge_type_2 == "denate", 31 | "yes", 32 | "no" 33 | ), 34 | lobed_edge = ifelse( 35 | edge_type == "lobed" | edge_type_2 == "lobed", 36 | "yes", 37 | "no" 38 | ), 39 | smooth_edge = ifelse( 40 | edge_type == "smooth" | edge_type_2 == "smooth", 41 | "yes", 42 | "no" 43 | ), 44 | toothed_edge = ifelse( 45 | edge_type == "toothed" | edge_type_2 == "toothed", 46 | "yes", 47 | "no" 48 | ), 49 | undulate_edge = ifelse( 50 | edge_type == "undulate" | edge_type_2 == "undulate", 51 | "yes", 52 | "no" 53 | ), 54 | across(where(is.character), factor) 55 | ) %>% 56 | select(-id, -cx, -cy, -shape_label, -edges, -edge_type_2, -edge_type) %>% 57 | rename(narrow_factor = nf) %>% 58 | rename_with(~ gsub("_g_", "_green_", .x)) %>% 59 | rename_with(~ gsub("_b_", "_blue_", .x)) %>% 60 | rename_with(~ gsub("_r_", "_red_", .x)) %>% 61 | rename_with(~ gsub("^no_of_", "num_", .x)) %>% 62 | relocate( 63 | species, 64 | apex, 65 | base, 66 | shape, 67 | denate_edge, 68 | lobed_edge, 69 | smooth_edge, 70 | toothed_edge, 71 | undulate_edge 72 | ) 73 | 74 | # ------------------------------------------------------------------------------ 75 | 76 | usethis::use_data(leaf_id_flavia) 77 | -------------------------------------------------------------------------------- /data-raw/permeability_qsar.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(janitor) 3 | library(AppliedPredictiveModeling) 4 | 5 | # ------------------------------------------------------------------------------ 6 | 7 | tidymodels_prefer() 8 | theme_set(theme_bw()) 9 | options(pillar.advice = FALSE, pillar.min_title_chars = Inf) 10 | 11 | # ------------------------------------------------------------------------------ 12 | 13 | data("permeability") 14 | 15 | fingerprints <- as.data.frame(fingerprints) 16 | 17 | names(fingerprints) <- recipes::names0(ncol(fingerprints), "chem_fp_") 18 | 19 | permeability_qsar <- 20 | fingerprints %>% 21 | mutate( 22 | permeability = permeability[, 1] 23 | ) %>% 24 | as_tibble() %>% 25 | relocate(permeability) 26 | 27 | # ------------------------------------------------------------------------------ 28 | 29 | usethis::use_data(permeability_qsar) 30 | -------------------------------------------------------------------------------- /data-raw/prep_datasets.R: -------------------------------------------------------------------------------- 1 | ## code to prepare `tate_text` 2 | 3 | library(tidyverse) 4 | artwork <- read_csv( 5 | "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-01-12/artwork.csv" 6 | ) 7 | 8 | tate_text <- artwork %>% 9 | filter(year >= 1990, artistRole == "artist") %>% 10 | select(id, artist, title, medium, year) %>% 11 | 
mutate(across(c(artist, medium), as.factor)) %>% 12 | arrange(year, artist) 13 | 14 | usethis::use_data(tate_text, overwrite = TRUE) 15 | -------------------------------------------------------------------------------- /data-raw/steroidogenic_toxicity.R: -------------------------------------------------------------------------------- 1 | library(tidymodels) 2 | library(janitor) 3 | library(readr) 4 | 5 | # ------------------------------------------------------------------------------ 6 | 7 | tidymodels_prefer() 8 | theme_set(theme_bw()) 9 | options(pillar.advice = FALSE, pillar.min_title_chars = Inf) 10 | 11 | # ------------------------------------------------------------------------------ 12 | 13 | steroidogenic_toxicity <- 14 | read_delim( 15 | "https://github.com/topepo/steroidogenic_tox/raw/master/data.txt", 16 | delim = "\t" 17 | ) %>% 18 | clean_names() %>% 19 | rename_with(~ gsub("cyp", "cyp_", .x)) %>% 20 | select(-compound) %>% 21 | mutate(class = factor(class, levels = c("toxic", "nontoxic"))) 22 | 23 | # ------------------------------------------------------------------------------ 24 | 25 | usethis::use_data(steroidogenic_toxicity, overwrite = TRUE) 26 | -------------------------------------------------------------------------------- /data-raw/taxi.R: -------------------------------------------------------------------------------- 1 | ## code to prepare `taxi` dataset goes here 2 | 3 | library(tidyverse) 4 | library(tidymodels) 5 | library(janitor) 6 | 7 | # https://data.cityofchicago.org/Transportation/Taxi-Trips-2022/npd7-ywjz 8 | taxi_raw <- read_csv( 9 | "https://data.cityofchicago.org/api/views/e55j-2ewb/rows.csv?accessType=DOWNLOAD" 10 | ) |> 11 | clean_names() 12 | 13 | set.seed(1234) 14 | 15 | taxi_med <- taxi_raw |> 16 | filter(!is.na(tips), payment_type != "Cash") |> 17 | drop_na() |> 18 | slice_sample(n = 20000) |> 19 | mutate( 20 | tip = if_else(tips > 0, "yes", "no") |> factor(levels = c("yes", "no")), 21 | trip_start = mdy_hms(trip_start_timestamp), 22 | local = if_else( 23 | pickup_community_area == dropoff_community_area, 24 | "yes", 25 | "no" 26 | ) |> 27 | factor(levels = c("yes", "no")), 28 | pickup_community_area = factor(pickup_community_area), 29 | dropoff_community_area = factor(dropoff_community_area) 30 | ) 31 | 32 | taxi_rec_base <- recipe(tip ~ ., data = taxi_med) |> 33 | step_date( 34 | trip_start, 35 | features = c("dow", "month"), 36 | keep_original_cols = TRUE 37 | ) |> 38 | step_time( 39 | trip_start, 40 | features = c("hour", "minute"), 41 | keep_original_cols = TRUE 42 | ) |> 43 | step_other(company) |> 44 | step_rm( 45 | trip_start_timestamp, 46 | trip_end_timestamp, 47 | taxi_id, 48 | tips, 49 | trip_start, 50 | trip_start_minute, 51 | contains("census"), 52 | contains("centroid"), 53 | contains("community_area") 54 | ) %>% 55 | step_rename( 56 | id := trip_id, 57 | duration = trip_seconds, 58 | distance = trip_miles, 59 | total_cost = trip_total, 60 | dow = trip_start_dow, 61 | month = trip_start_month, 62 | hour = trip_start_hour 63 | ) 64 | 65 | taxi <- prep(taxi_rec_base) |> 66 | bake(new_data = NULL) |> 67 | relocate(tip) 68 | 69 | taxi <- taxi |> 70 | mutate(month = factor(month, levels = c("Jan", "Feb", "Mar", "Apr"))) |> 71 | select(-c(id, duration, fare, tolls, extras, total_cost, payment_type)) |> 72 | drop_na() |> 73 | slice_sample(n = 10000) 74 | 75 | usethis::use_data(taxi, overwrite = TRUE) 76 | -------------------------------------------------------------------------------- /data/Chicago.rda: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/Chicago.rda -------------------------------------------------------------------------------- /data/Sacramento.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/Sacramento.RData -------------------------------------------------------------------------------- /data/Smithsonian.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/Smithsonian.RData -------------------------------------------------------------------------------- /data/ad_data.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/ad_data.RData -------------------------------------------------------------------------------- /data/ames.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/ames.rda -------------------------------------------------------------------------------- /data/attrition.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/attrition.RData -------------------------------------------------------------------------------- /data/biomass.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/biomass.RData -------------------------------------------------------------------------------- /data/bivariate.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/bivariate.RData -------------------------------------------------------------------------------- /data/car_prices.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/car_prices.RData -------------------------------------------------------------------------------- /data/cat_adoption.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/cat_adoption.rda -------------------------------------------------------------------------------- /data/cells.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/cells.RData -------------------------------------------------------------------------------- /data/check_times.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/check_times.rda 
-------------------------------------------------------------------------------- /data/chem_proc_yield.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/chem_proc_yield.rda -------------------------------------------------------------------------------- /data/concrete.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/concrete.RData -------------------------------------------------------------------------------- /data/covers.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/covers.RData -------------------------------------------------------------------------------- /data/credit_data.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/credit_data.RData -------------------------------------------------------------------------------- /data/crickets.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/crickets.rda -------------------------------------------------------------------------------- /data/datalist: -------------------------------------------------------------------------------- 1 | ad_data 2 | ames 3 | attrition 4 | biomass 5 | bivariate: bivariate_test bivariate_train bivariate_val 6 | car_prices 7 | cells 8 | check_times 9 | Chicago: Chicago stations 10 | concrete 11 | covers 12 | credit_data 13 | crickets 14 | deliveries 15 | drinks 16 | grants: grants_2008 grants_other grants_test 17 | hpc_cv 18 | hpc_data 19 | lending_club 20 | meats 21 | mlc_churn 22 | oils 23 | parabolic 24 | pathology 25 | pd_speech 26 | penguins 27 | Sacramento 28 | scat 29 | small_fine_foods: testing_data training_data 30 | Smithsonian 31 | solubility_test 32 | stackoverflow 33 | tate_text 34 | two_class_dat 35 | two_class_example 36 | wa_churn 37 | chem_proc_yield 38 | permeability_qsar 39 | steroidogenic_toxicity 40 | leaf_id_flavia 41 | ischemic_stroke 42 | hepatic_injury_qsar 43 | -------------------------------------------------------------------------------- /data/deliveries.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/deliveries.rda -------------------------------------------------------------------------------- /data/drinks.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/drinks.rda -------------------------------------------------------------------------------- /data/grants.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/grants.rda -------------------------------------------------------------------------------- /data/hepatic_injury_qsar.rda: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/hepatic_injury_qsar.rda -------------------------------------------------------------------------------- /data/hotel_rates.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/hotel_rates.rda -------------------------------------------------------------------------------- /data/hpc_cv.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/hpc_cv.rda -------------------------------------------------------------------------------- /data/hpc_data.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/hpc_data.RData -------------------------------------------------------------------------------- /data/ischemic_stroke.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/ischemic_stroke.rda -------------------------------------------------------------------------------- /data/leaf_id_flavia.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/leaf_id_flavia.rda -------------------------------------------------------------------------------- /data/lending_club.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/lending_club.rda -------------------------------------------------------------------------------- /data/meats.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/meats.RData -------------------------------------------------------------------------------- /data/mlc_churn.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/mlc_churn.RData -------------------------------------------------------------------------------- /data/oils.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/oils.RData -------------------------------------------------------------------------------- /data/parabolic.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/parabolic.rda -------------------------------------------------------------------------------- /data/pathology.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/pathology.rda 
-------------------------------------------------------------------------------- /data/pd_speech.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/pd_speech.rda -------------------------------------------------------------------------------- /data/penguins.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/penguins.rda -------------------------------------------------------------------------------- /data/permeability_qsar.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/permeability_qsar.rda -------------------------------------------------------------------------------- /data/scat.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/scat.RData -------------------------------------------------------------------------------- /data/small_fine_foods.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/small_fine_foods.RData -------------------------------------------------------------------------------- /data/solubility_test.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/solubility_test.rda -------------------------------------------------------------------------------- /data/stackoverflow.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/stackoverflow.rda -------------------------------------------------------------------------------- /data/steroidogenic_toxicity.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/steroidogenic_toxicity.rda -------------------------------------------------------------------------------- /data/tate_text.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/tate_text.rda -------------------------------------------------------------------------------- /data/taxi.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/taxi.rda -------------------------------------------------------------------------------- /data/two_class_dat.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/two_class_dat.RData -------------------------------------------------------------------------------- /data/two_class_example.rda: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/two_class_example.rda -------------------------------------------------------------------------------- /data/wa_churn.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidymodels/modeldata/5c22676a2deab18170885be4dd6e55068a669bb3/data/wa_churn.rda -------------------------------------------------------------------------------- /man/Chicago.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Chicago.R 3 | \docType{data} 4 | \name{Chicago} 5 | \alias{Chicago} 6 | \alias{stations} 7 | \title{Chicago ridership data} 8 | \source{ 9 | Kuhn and Johnson (2020), \emph{Feature Engineering and Selection}, 10 | Chapman and Hall/CRC. \url{https://bookdown.org/max/FES/} and 11 | \url{https://github.com/topepo/FES} 12 | } 13 | \value{ 14 | \item{Chicago}{a tibble} \item{stations}{a vector of station names} 15 | } 16 | \description{ 17 | Chicago ridership data 18 | } 19 | \details{ 20 | These data are from Kuhn and Johnson (2020) and contain an 21 | \emph{abbreviated} training set for modeling the number of people (in thousands) 22 | who enter the Clark and Lake L station. 23 | 24 | The \code{date} column corresponds to the current date. The columns with station 25 | names (\code{Austin} through \code{California}) are a \emph{sample} of the columns used in 26 | the original analysis (for file size reasons). These are 14-day lag 27 | variables (i.e. \verb{date - 14 days}). There are columns related to weather and 28 | sports team schedules. 29 | 30 | The station at 35th and Archer is contained in the column \code{Archer_35th} to 31 | make it a valid R column name. 32 | } 33 | \examples{ 34 | data(Chicago) 35 | str(Chicago) 36 | stations 37 | } 38 | \keyword{datasets} 39 | -------------------------------------------------------------------------------- /man/Sacramento.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sacremento.R 3 | \docType{data} 4 | \name{Sacramento} 5 | \alias{Sacramento} 6 | \title{Sacramento CA home prices} 7 | \source{ 8 | SpatialKey website: 9 | \url{https://support.spatialkey.com/spatialkey-sample-csv-data/} 10 | } 11 | \value{ 12 | \item{Sacramento}{a tibble} 13 | } 14 | \description{ 15 | This data frame contains house and sale price data for 932 homes in 16 | Sacramento, CA. The original data were obtained from the website for the 17 | SpatialKey software. From their website: "The Sacramento real estate 18 | transactions file is a list of 985 real estate transactions in the 19 | Sacramento area reported over a five-day period, as reported by the 20 | Sacramento Bee." Google was used to fill in missing/incorrect data.
21 | } 22 | \examples{ 23 | data(Sacramento) 24 | str(Sacramento) 25 | } 26 | \keyword{datasets} 27 | -------------------------------------------------------------------------------- /man/Smithsonian.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/Smithsonian.R 3 | \docType{data} 4 | \name{Smithsonian} 5 | \alias{Smithsonian} 6 | \title{Smithsonian museums} 7 | \source{ 8 | https://en.wikipedia.org/wiki/List_of_Smithsonian_museums 9 | } 10 | \value{ 11 | \item{Smithsonian}{a tibble} 12 | } 13 | \description{ 14 | Geocodes for the Smithsonian museums (circa 2018). 15 | } 16 | \examples{ 17 | data(Smithsonian) 18 | str(Smithsonian) 19 | } 20 | \keyword{datasets} 21 | -------------------------------------------------------------------------------- /man/ad_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ad_data.R 3 | \docType{data} 4 | \name{ad_data} 5 | \alias{ad_data} 6 | \title{Alzheimer's disease data} 7 | \source{ 8 | Kuhn, M., Johnson, K. (2013) \emph{Applied Predictive Modeling}, Springer. 9 | 10 | Craig-Schapiro R, Kuhn M, Xiong C, Pickering EH, Liu J, Misko TP, et al. 11 | (2011) Multiplexed Immunoassay Panel Identifies Novel CSF Biomarkers for 12 | Alzheimer's Disease Diagnosis and Prognosis. PLoS ONE 6(4): e18850. 13 | } 14 | \value{ 15 | \item{ad_data}{a tibble} 16 | } 17 | \description{ 18 | Alzheimer's disease data 19 | } 20 | \details{ 21 | Craig-Schapiro et al. (2011) describe a clinical study of 333 patients, 22 | including some with mild (but well-characterized) cognitive impairment as 23 | well as healthy individuals. CSF samples were taken from all subjects. The 24 | goal of the study was to determine if subjects in the early stages of 25 | impairment could be differentiated from cognitively healthy individuals. 26 | Data collected on each subject included: 27 | \itemize{ 28 | \item Demographic characteristics such as age and gender 29 | \item Apolipoprotein E genotype 30 | \item Protein measurements of Abeta, Tau, and a phosphorylated version of Tau (called pTau) 31 | \item Protein measurements of 124 exploratory biomarkers, and 32 | \item Clinical dementia scores 33 | } 34 | 35 | For these analyses, we have converted the scores to two classes: impaired 36 | and healthy. The goal of this analysis is to create classification models 37 | using the demographic and assay data to predict which patients have early 38 | stages of disease. 39 | } 40 | \examples{ 41 | data(ad_data) 42 | str(ad_data) 43 | } 44 | \keyword{datasets} 45 | -------------------------------------------------------------------------------- /man/ames.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ames.R 3 | \docType{data} 4 | \name{ames} 5 | \alias{ames} 6 | \title{Ames Housing Data} 7 | \source{ 8 | De Cock, D. (2011). "Ames, Iowa: Alternative to the Boston Housing Data as an End of Semester Regression Project," \emph{Journal of Statistics Education}, Volume 19, Number 3.
9 | 10 | \url{http://jse.amstat.org/v19n3/decock/DataDocumentation.txt} 11 | 12 | \url{http://jse.amstat.org/v19n3/decock.pdf} 13 | } 14 | \value{ 15 | \item{ames}{a tibble} 16 | } 17 | \description{ 18 | A data set from De Cock (2011) in which 82 fields were recorded for 2,930 19 | properties in Ames, IA. This version is copied from the \code{AmesHousing} package 20 | but does not include a few quality columns that appear to be outcomes 21 | rather than predictors. 22 | } 23 | \details{ 24 | See the links in the sources below for more information, as well as 25 | \code{?AmesHousing::make_ames}. 26 | 27 | For these data, the training materials typically use: 28 | 29 | \if{html}{\out{<div class="sourceCode">
}}\preformatted{library(tidymodels) 30 | 31 | set.seed(4595) 32 | data_split <- initial_split(ames, strata = "Sale_Price") 33 | ames_train <- training(data_split) 34 | ames_test <- testing(data_split) 35 | 36 | set.seed(2453) 37 | ames_folds <- vfold_cv(ames_train) 38 | }\if{html}{\out{</div>
}} 39 | } 40 | \examples{ 41 | data(ames) 42 | str(ames) 43 | } 44 | \keyword{datasets} 45 | -------------------------------------------------------------------------------- /man/attrition.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/attrition.R 3 | \docType{data} 4 | \name{attrition} 5 | \alias{attrition} 6 | \title{Job attrition} 7 | \source{ 8 | The IBM Watson Analytics Lab website https://www.ibm.com/communities/analytics/watson-analytics-blog/hr-employee-attrition/ 9 | } 10 | \value{ 11 | \item{attrition}{a data frame} 12 | } 13 | \description{ 14 | Job attrition 15 | } 16 | \details{ 17 | These data are from the IBM Watson Analytics Lab. 18 | The website describes the data with \dQuote{Uncover the 19 | factors that lead to employee attrition and explore important 20 | questions such as \sQuote{show me a breakdown of distance 21 | from home by job role and attrition} or \sQuote{compare 22 | average monthly income by education and attrition}. This is a 23 | fictional data set created by IBM data scientists.}. There 24 | are 1470 rows. 25 | } 26 | \examples{ 27 | data(attrition) 28 | str(attrition) 29 | } 30 | \keyword{datasets} 31 | -------------------------------------------------------------------------------- /man/biomass.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/biomass.R 3 | \docType{data} 4 | \name{biomass} 5 | \alias{biomass} 6 | \title{Biomass data} 7 | \source{ 8 | Ghugare, S. B., Tiwary, S., Elangovan, V., and Tambe, S. S. (2013). 9 | Prediction of Higher Heating Value of Solid Biomass Fuels Using Artificial 10 | Intelligence Formalisms. \emph{BioEnergy Research}, 1-12. 11 | } 12 | \value{ 13 | \item{biomass}{a data frame} 14 | } 15 | \description{ 16 | Ghugare et al. (2013) contains a data set where different biomass fuels are 17 | characterized by the amount of certain molecules (carbon, hydrogen, oxygen, 18 | nitrogen, and sulfur) and the corresponding higher heating value (HHV). 19 | These data are from their Table S.2 of the Supplementary Materials. 20 | } 21 | \examples{ 22 | data(biomass) 23 | str(biomass) 24 | } 25 | \keyword{datasets} 26 | -------------------------------------------------------------------------------- /man/bivariate.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bivariate.R 3 | \docType{data} 4 | \name{bivariate} 5 | \alias{bivariate} 6 | \alias{bivariate_train} 7 | \alias{bivariate_test} 8 | \alias{bivariate_val} 9 | \title{Example bivariate classification data} 10 | \value{ 11 | \item{bivariate_train, bivariate_test, bivariate_val}{tibbles} 12 | } 13 | \description{ 14 | Example bivariate classification data 15 | } 16 | \details{ 17 | These data are a simplified version of the segmentation data contained 18 | in \code{caret}. There are three columns: \code{A} and \code{B} are predictors and the column 19 | \code{Class} is a factor with levels "One" and "Two". There are three data sets: 20 | one for training (n = 1009), validation (n = 300), and testing (n = 710).
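As a minimal usage sketch (assuming only that the modeldata package is
installed; loading \code{bivariate} attaches all three tibbles):

\preformatted{library(modeldata)
data(bivariate)
nrow(bivariate_train)  # 1009
nrow(bivariate_val)    # 300
nrow(bivariate_test)   # 710
}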
21 | } 22 | \examples{ 23 | data(bivariate) 24 | str(bivariate_train) 25 | str(bivariate_val) 26 | str(bivariate_test) 27 | } 28 | \keyword{datasets} 29 | -------------------------------------------------------------------------------- /man/car_prices.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/car_prices.R 3 | \docType{data} 4 | \name{car_prices} 5 | \alias{car_prices} 6 | \title{Kelly Blue Book resale data for 2005 model year GM cars} 7 | \source{ 8 | Kuiper, S. (2008). Introduction to Multiple Regression: How Much Is Your Car Worth?, 9 | \emph{Journal of Statistics Education}, Vol. 16 10 | \url{http://jse.amstat.org/jse_archive.htm#2008}. 11 | } 12 | \value{ 13 | \item{car_prices}{data frame of the suggested retail price (column \code{Price}) and various 14 | characteristics of each car (columns \code{Mileage}, \code{Cylinder}, \code{Doors}, \code{Cruise}, 15 | \code{Sound}, \code{Leather}, \code{Buick}, \code{Cadillac}, \code{Chevy}, \code{Pontiac}, \code{Saab}, 16 | \code{Saturn}, \code{convertible}, \code{coupe}, \code{hatchback}, \code{sedan} and \code{wagon})} 17 | } 18 | \description{ 19 | Kuiper (2008) collected Kelly Blue Book resale data for 804 GM cars (2005 model year). 20 | } 21 | \examples{ 22 | data(car_prices) 23 | str(car_prices) 24 | } 25 | \keyword{datasets} 26 | -------------------------------------------------------------------------------- /man/cat_adoption.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cat_adoption.R 3 | \docType{data} 4 | \name{cat_adoption} 5 | \alias{cat_adoption} 6 | \title{Cat Adoption} 7 | \source{ 8 | \url{https://data.longbeach.gov/explore/dataset/animal-shelter-intakes-and-outcomes/information/} 9 | on 2024-06-17 10 | } 11 | \value{ 12 | tibble 13 | } 14 | \description{ 15 | A subset of the cats at the animal shelter in Long Beach, California, USA. 16 | } 17 | \details{ 18 | A data frame with 2257 rows and 19 columns: 19 | \describe{ 20 | \item{time}{The time the cat spent at the shelter.} 21 | \item{event}{The event of interest is the cat being homed or returned to 22 | its original location (i.e., owner or community). The non-event is the cat 23 | being transferred to another shelter or dying.
Zero indicates a non-event 24 | (censored), and one corresponds to the event occurring.} 25 | \item{sex}{The sex of the cat.} 26 | \item{neutered}{Whether the cat is neutered.} 27 | \item{intake_condition}{The intake condition of the cat.} 28 | \item{intake_type}{The type of intake.} 29 | \item{latitude}{Latitude of the intersection/cross street of intake or capture.} 30 | \item{longitude}{Longitude of the intersection/cross street of intake or capture.} 31 | \item{black,brown,brown_tabby,calico,cream,gray,gray_tabby,orange,orange_tabby,tan,tortie,white}{Indicators for the color/pattern of the cat's fur.} 32 | } 33 | } 34 | \examples{ 35 | str(cat_adoption) 36 | } 37 | \keyword{datasets} 38 | -------------------------------------------------------------------------------- /man/cells.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/cells.R 3 | \docType{data} 4 | \name{cells} 5 | \alias{cells} 6 | \title{Cell body segmentation} 7 | \source{ 8 | Hill, LaPan, Li and Haney (2007). Impact of image segmentation on 9 | high-content screening data quality for SK-BR-3 cells, \emph{BMC 10 | Bioinformatics}, Vol. 8, pg. 340, 11 | \url{https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-8-340}. 12 | } 13 | \value{ 14 | \item{cells}{a tibble} 15 | } 16 | \description{ 17 | Hill, LaPan, Li and Haney (2007) develop models to predict which cells in a 18 | high content screen were well segmented. The data consists of 119 imaging 19 | measurements on 2019 cells. The original analysis used 1009 for training and 1010 20 | as a test set (see the column called \code{case}). 21 | } 22 | \details{ 23 | The outcome class is contained in a factor variable called \code{class} with 24 | levels "PS" for poorly segmented and "WS" for well segmented. 25 | 26 | The raw data used in the paper can be found at the Biomedcentral website. 27 | The version 28 | contained in \code{cells} is modified. First, several discrete 29 | versions of some of the predictors (with the suffix "Status") were removed. 30 | Second, there are several skewed predictors with minimum values of zero 31 | (that would benefit from some transformation, such as the log). A constant 32 | value of 1 was added to these fields: \code{avg_inten_ch_2}, 33 | \code{fiber_align_2_ch_3}, \code{fiber_align_2_ch_4}, \code{spot_fiber_count_ch_4} and 34 | \code{total_inten_ch_2}. 35 | } 36 | \examples{ 37 | data(cells) 38 | str(cells) 39 | } 40 | \keyword{datasets} 41 | -------------------------------------------------------------------------------- /man/check_times.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/check_times.R 3 | \docType{data} 4 | \name{check_times} 5 | \alias{check_times} 6 | \title{Execution time data} 7 | \source{ 8 | CRAN 9 | } 10 | \value{ 11 | \item{check_times}{a data frame} 12 | } 13 | \description{ 14 | These data were collected from the CRAN web page for 13,626 R 15 | packages. The time to complete the standard package checking 16 | routine was collected. In some cases, the package checking 17 | process is stopped due to errors and these data are treated as 18 | censored; fewer than 1 percent of the check times are censored.
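As a hedged sketch of setting these data up for censored regression (the
\code{check_time} and \code{status} columns are described in the details
below; we assume \code{status == 1} marks a completed, uncensored check):

\preformatted{library(survival)
data(check_times, package = "modeldata")
# right-censored response: time plus completion indicator
surv_times <- Surv(check_times$check_time, event = check_times$status)
}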
19 | } 20 | \details{ 21 | The associated package source code was 22 | downloaded and parsed to create predictors, including 23 | \itemize{ 24 | \item \code{authors}: The number of authors in the author field. 25 | \item \code{imports}: The number of imported packages. 26 | \item \code{suggests}: The number of packages suggested. 27 | \item \code{depends}: The number of hard dependencies. 28 | \item \code{Roxygen}: a binary indicator for whether Roxygen was used 29 | for documentation. 30 | \item \code{gh}: a binary indicator for whether the URL field contained 31 | a GitHub link. 32 | \item \code{rforge}: a binary indicator for whether the URL field 33 | contained a link to R-forge. 34 | \item \code{descr}: The number of characters (or, in some cases, bytes) 35 | in the description field. 36 | \item \code{r_count}: The number of R files in the R directory. 37 | \item \code{r_size}: The total disk size of the R files. 38 | \item \code{ns_import}: Estimated number of imported functions or methods. 39 | \item \code{ns_export}: Estimated number of exported functions or methods. 40 | \item \code{s3_methods}: Estimated number of S3 methods. 41 | \item \code{s4_methods}: Estimated number of S4 methods. 42 | \item \code{doc_count}: How many Rmd or Rnw files in the vignettes 43 | directory. 44 | \item \code{doc_size}: The disk size of the Rmd or Rnw files. 45 | \item \code{src_count}: The number of files in the \code{src} directory. 46 | \item \code{src_size}: The size on disk of files in the \code{src} directory. 47 | \item \code{data_count}: The number of files in the \code{data} directory. 48 | \item \code{data_size}: The size on disk of files in the \code{data} directory. 49 | \item \code{testthat_count}: The number of files in the \code{testthat} 50 | directory. 51 | \item \code{testthat_size}: The size on disk of files in the \code{testthat} 52 | directory. 53 | \item \code{check_time}: The time (in seconds) to run \verb{R CMD check} 54 | using the "r-devel-windows-ix86+x86_64" flavor. 55 | \item \code{status}: An indicator for whether the tests completed. 56 | } 57 | 58 | Data were collected on 2019-01-20. 59 | } 60 | \examples{ 61 | data(check_times) 62 | str(check_times) 63 | } 64 | \keyword{datasets} 65 | -------------------------------------------------------------------------------- /man/chem_proc_yield.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/chem_proc_yield.R 3 | \docType{data} 4 | \name{chem_proc_yield} 5 | \alias{chem_proc_yield} 6 | \title{Chemical manufacturing process data set} 7 | \source{ 8 | Kuhn, Max, and Kjell Johnson. \emph{Applied predictive modeling}. New York: 9 | Springer, 2013. 10 | } 11 | \value{ 12 | \item{chem_proc_yield}{a tibble} 13 | } 14 | \description{ 15 | A data set that models yield as a function of biological material predictors 16 | and chemical structure predictors. 17 | } 18 | \details{ 19 | This data set contains information about a chemical manufacturing 20 | process, in which the goal is to understand the relationship between 21 | the process and the resulting final product yield. Raw material in 22 | this process is put through a sequence of 27 steps to generate the 23 | final pharmaceutical product. The starting material is generated from 24 | a biological unit and has a range of quality and characteristics.
The 25 | objective in this project was to develop a model to predict percent 26 | yield of the manufacturing process. The data set consisted of 177 27 | samples of biological material for which 57 characteristics were 28 | measured. Of the 57 characteristics, there were 12 measurements of 29 | the biological starting material, and 45 measurements of the 30 | manufacturing process. The process variables included measurements 31 | such as temperature, drying time, washing time, and concentrations of 32 | by-products at various steps. Some of the process measurements can 33 | be controlled, while others are observed. Predictors are continuous, 34 | count, and categorical; some are correlated, and some contain missing 35 | values. Samples are not independent because sets of samples come from 36 | the same batch of biological starting material. 37 | 38 | Columns: 39 | \itemize{ 40 | \item \code{yield}: numeric 41 | \item \code{bio_material_01} - \code{bio_material_12}: numeric 42 | \item \code{man_proc_01} - \code{man_proc_45}: numeric 43 | } 44 | } 45 | \examples{ 46 | data(chem_proc_yield) 47 | str(chem_proc_yield) 48 | 49 | } 50 | -------------------------------------------------------------------------------- /man/concrete.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/concrete.R 3 | \docType{data} 4 | \name{concrete} 5 | \alias{concrete} 6 | \title{Compressive strength of concrete mixtures} 7 | \source{ 8 | Yeh I (2006). "Analysis of Strength of Concrete Using Design of Experiments 9 | and Neural Networks." \emph{Journal of Materials in Civil Engineering}, 18, 597-604. 10 | 11 | Kuhn, M., Johnson, K. (2013) \emph{Applied Predictive Modeling}, Springer. 12 | } 13 | \value{ 14 | \item{concrete}{a tibble} 15 | } 16 | \description{ 17 | Yeh (2006) describes an aggregated data set for experimental designs used to 18 | test the compressive strength of concrete mixtures. The data are used by 19 | Kuhn and Johnson (2013). 20 | } 21 | \examples{ 22 | data(concrete) 23 | str(concrete) 24 | } 25 | \keyword{datasets} 26 | -------------------------------------------------------------------------------- /man/covers.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/covers.R 3 | \docType{data} 4 | \name{covers} 5 | \alias{covers} 6 | \title{Raw cover type data} 7 | \source{ 8 | https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.info 9 | } 10 | \value{ 11 | \item{covers}{a data frame} 12 | } 13 | \description{ 14 | These data are raw data describing different types of forest cover-types 15 | from the UCI Machine Learning Database (see link below). There is one 16 | column in the data that has a few different pieces of textual 17 | information (of variable lengths).
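A rough sketch of mining that text column (the column name, assumed here to
be \code{description}, should be checked with \code{str(covers)}):

\preformatted{data(covers, package = "modeldata")
# flag rows whose free text mentions "pine"
grepl("pine", covers$description, ignore.case = TRUE)
}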
18 | } 19 | \examples{ 20 | data(covers) 21 | str(covers) 22 | } 23 | \keyword{datasets} 24 | -------------------------------------------------------------------------------- /man/credit_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/credit_data.R 3 | \docType{data} 4 | \name{credit_data} 5 | \alias{credit_data} 6 | \title{Credit data} 7 | \source{ 8 | https://github.com/gastonstat/CreditScoring, 9 | http://bit.ly/2kkBFrk 10 | } 11 | \value{ 12 | \item{credit_data}{a data frame} 13 | } 14 | \description{ 15 | These data are from the website of Dr. Lluís A. Belanche Muñoz by way of a 16 | github repository of Dr. Gaston Sanchez. One data point with a missing outcome 17 | was removed from the original data. 18 | } 19 | \examples{ 20 | data(credit_data) 21 | str(credit_data) 22 | } 23 | \keyword{datasets} 24 | -------------------------------------------------------------------------------- /man/crickets.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/crickets.R 3 | \docType{data} 4 | \name{crickets} 5 | \alias{crickets} 6 | \title{Rates of Cricket Chirps} 7 | \source{ 8 | Mangiafico, S. 2015. "An R Companion for the Handbook of Biological 9 | Statistics." \url{https://rcompanion.org/handbook/}. 10 | 11 | McDonald, J. 2009. \emph{Handbook of Biological Statistics}. Sparky House Publishing. 12 | } 13 | \value{ 14 | \item{crickets}{a tibble} 15 | } 16 | \description{ 17 | These data are from McDonald (2009), by way of Mangiafico (2015), on 18 | the relationship between the ambient temperature and the rate of cricket 19 | chirps per minute. Data were collected for two species of the genus \emph{Oecanthus}: \emph{O. exclamationis} 20 | and \emph{O. niveus}. The data are contained in a data frame called \code{crickets} with 21 | a total of 31 data points. 22 | } 23 | \examples{ 24 | data(crickets) 25 | str(crickets) 26 | } 27 | \keyword{datasets} 28 | -------------------------------------------------------------------------------- /man/deliveries.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/deliveries.R 3 | \docType{data} 4 | \name{deliveries} 5 | \alias{deliveries} 6 | \title{Food Delivery Time Data} 7 | \value{ 8 | \item{deliveries}{a tibble} 9 | } 10 | \description{ 11 | Food Delivery Time Data 12 | } 13 | \details{ 14 | These data are from a study of food delivery times in minutes (i.e., the time from the 15 | initial order to receiving the food) for a single restaurant. The data 16 | contain 10,012 orders. The predictors include: 17 | \itemize{ 18 | \item The time, in decimal hours, of the order. 19 | \item The day of the week for the order. 20 | \item The approximate distance in miles between the restaurant and the delivery 21 | location. 22 | \item A set of 27 predictors that count the number of distinct menu items 23 | in the order. 24 | } 25 | 26 | No times are censored.
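A minimal modeling sketch (the outcome column is assumed here to be named
\code{time_to_delivery}; confirm with \code{str(deliveries)}):

\preformatted{library(tidymodels)
data(deliveries, package = "modeldata")
set.seed(991)
delivery_split <- initial_split(deliveries, strata = time_to_delivery)
delivery_train <- training(delivery_split)
delivery_test  <- testing(delivery_split)
}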
27 | } 28 | \examples{ 29 | data(deliveries) 30 | str(deliveries) 31 | } 32 | \keyword{datasets} 33 | -------------------------------------------------------------------------------- /man/drinks.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/drinks.R 3 | \docType{data} 4 | \name{drinks} 5 | \alias{drinks} 6 | \title{Sample time series data} 7 | \source{ 8 | The Federal Reserve Bank of St. Louis website https://fred.stlouisfed.org/series/S4248SM144NCEN 9 | } 10 | \value{ 11 | \item{drinks}{a tibble} 12 | } 13 | \description{ 14 | Sample time series data 15 | } 16 | \details{ 17 | Drink sales. The exact name of the series from FRED is: 18 | "Merchant Wholesalers, Except Manufacturers' Sales Branches and Offices 19 | Sales: Nondurable Goods: Beer, Wine, and Distilled Alcoholic Beverages Sales" 20 | } 21 | \examples{ 22 | data(drinks) 23 | str(drinks) 24 | } 25 | \keyword{datasets} 26 | -------------------------------------------------------------------------------- /man/figures/lifecycle-deprecated.svg: -------------------------------------------------------------------------------- 1 | [SVG badge: "lifecycle: deprecated" (markup stripped)] -------------------------------------------------------------------------------- /man/grants.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/grants.R 3 | \docType{data} 4 | \name{grants} 5 | \alias{grants} 6 | \alias{grants_other} 7 | \alias{grants_test} 8 | \alias{grants_2008} 9 | \title{Grant acceptance data} 10 | \source{ 11 | Kuhn and Johnson (2013). \emph{Applied Predictive Modeling}. Springer. 12 | } 13 | \value{ 14 | \item{grants_other,grants_test,grants_2008}{two tibbles and an integer 15 | vector of data points used for training} 16 | } 17 | \description{ 18 | A data set related to the success or failure of academic grants. 19 | } 20 | \details{ 21 | The data are discussed in Kuhn and Johnson (2013): 22 | 23 | "These data are from a 2011 Kaggle competition sponsored by the University 24 | of Melbourne where there was interest in predicting whether or not a grant 25 | application would be accepted. Since public funding of grants had decreased 26 | over time, triaging grant applications based on their likelihood of success 27 | could be important for estimating the amount of potential funding to the 28 | university. In addition to predicting grant success, the university sought 29 | to understand factors that were important in predicting success." 30 | 31 | The data ranged from 2005 to 2008 and the data spending strategy was 32 | driven by the date of the grant. Kuhn and Johnson (2013) describe: 33 | 34 | "The compromise taken here is to build models on the pre-2008 data and 35 | tune them by evaluating a random sample of 2,075 grants from 2008. Once the 36 | optimal parameters are determined, the final model is built using these 37 | parameters and the entire training set (i.e., the data prior to 2008 and the 38 | additional 2,075 grants). A small holdout set of 518 grants from 2008 will 39 | be used to ensure that no gross methodology errors occur from repeatedly 40 | evaluating the 2008 data during model tuning. In the text, this set of 41 | samples is called the 2008 holdout set.
This small set of year 2008 42 | grants will be referred to as the test set and will not be evaluated until a 43 | set of candidate models is identified." 44 | 45 | To emulate this, \code{grants_other} contains the training (pre-2008, n = 6,633) 46 | and holdout/validation data (2008, n = 1,557). \code{grants_test} has 518 grant 47 | samples from 2008. The object \code{grants_2008} is an integer vector that can 48 | be used to separate the modeling data from the holdout/validation sets. 49 | } 50 | \examples{ 51 | data(grants) 52 | str(grants_other) 53 | str(grants_test) 54 | str(grants_2008) 55 | } 56 | \keyword{datasets} 57 | -------------------------------------------------------------------------------- /man/hepatic_injury_qsar.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hepatic_injury_qsar.R 3 | \docType{data} 4 | \name{hepatic_injury_qsar} 5 | \alias{hepatic_injury_qsar} 6 | \title{Predicting hepatic injury from chemical information} 7 | \source{ 8 | Kuhn, Max, and Kjell Johnson. \emph{Applied predictive modeling}. New York: 9 | Springer, 2013. 10 | } 11 | \value{ 12 | \item{hepatic_injury_qsar}{a tibble} 13 | } 14 | \description{ 15 | A quantitative structure-activity relationship (QSAR) data set to predict 16 | when a molecule has risk associated with liver function. 17 | } 18 | \details{ 19 | This data set was used to develop a model for predicting compounds' 20 | probability of causing hepatic injury (i.e. liver damage). This data set 21 | consisted of 281 unique compounds; 376 predictors were measured or computed 22 | for each. The response was categorical (either "none", "mild", or "severe"), 23 | and was highly unbalanced. 24 | 25 | This kind of response often occurs in pharmaceutical data because companies 26 | steer away from creating molecules that have undesirable characteristics. 27 | Therefore, well-behaved molecules often greatly outnumber undesirable 28 | molecules. The predictors consisted of measurements from 184 biological 29 | screens and 192 chemical feature predictors. The biological predictors 30 | represent activity for each screen and take values between 0 and 10 with a 31 | mode of 4. The chemical feature predictors represent counts of important 32 | sub-structures as well as measures of physical properties that are thought to 33 | be associated with hepatic injury. 34 | 35 | Columns: 36 | \itemize{ 37 | \item \code{class}: ordered factor (levels: 'none', 'mild', and 'severe') 38 | \item \code{bio_assay_001} - \code{bio_assay_184}: numeric 39 | \item \code{chem_fp_001} - \code{chem_fp_192}: numeric 40 | } 41 | } 42 | \examples{ 43 | data(hepatic_injury_qsar) 44 | str(hepatic_injury_qsar) 45 | 46 | } 47 | -------------------------------------------------------------------------------- /man/hotel_rates.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hotel_rates.R 3 | \docType{data} 4 | \name{hotel_rates} 5 | \alias{hotel_rates} 6 | \title{Daily Hotel Rate Data} 7 | \source{ 8 | \url{https://github.com/rfordatascience/tidytuesday/tree/master/data/2020/2020-02-11} 9 | } 10 | \description{ 11 | A data set to predict the average daily rate for a hotel in Lisbon, Portugal. 12 | } 13 | \details{ 14 | Data are originally described in Antonio, de Almeida, and Nunes (2019).
15 | This version of the data is filtered for one hotel (the "Resort Hotel") and 16 | is intended as a regression data set for predicting the average daily rate for 17 | a room. The data are post-2016; the 2016 data were used to create a predictor 18 | for the historical daily rates. See the \code{hotel_rates.R} file in the 19 | \code{data-raw} directory of the package to understand other filters used when 20 | creating this version of the data. 21 | 22 | The \code{agent} and \code{company} fields were changed from random characters to use 23 | a set of random names. 24 | 25 | The outcome column is \code{avg_price_per_room}. 26 | \subsection{License}{ 27 | 28 | No license was given for the data; see the reference below for source. 29 | } 30 | } 31 | \examples{ 32 | \dontrun{ 33 | str(hotel_rates) 34 | } 35 | } 36 | \references{ 37 | Antonio, N., de Almeida, A., and Nunes, L. (2019). Hotel booking demand 38 | datasets. \emph{Data in Brief}, 22, 41-49. 39 | } 40 | \keyword{datasets} 41 | -------------------------------------------------------------------------------- /man/hpc_cv.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hpc_cv.R 3 | \docType{data} 4 | \name{hpc_cv} 5 | \alias{hpc_cv} 6 | \title{Class probability predictions} 7 | \source{ 8 | Kuhn, M., Johnson, K. (2013) \emph{Applied Predictive 9 | Modeling}, Springer 10 | } 11 | \value{ 12 | \item{hpc_cv}{a data frame} 13 | } 14 | \description{ 15 | Class probability predictions 16 | } 17 | \details{ 18 | This data frame contains the predicted classes and 19 | class probabilities for a linear discriminant analysis model fit 20 | to the HPC data set from Kuhn and Johnson (2013). These data are 21 | the assessment sets from a 10-fold cross-validation scheme. The 22 | data contain columns for the true class (\code{obs}), the class 23 | prediction (\code{pred}), and columns for each class probability 24 | (columns \code{VF}, \code{F}, \code{M}, and \code{L}). Additionally, a column for 25 | the resample indicator is included. 26 | } 27 | \examples{ 28 | data(hpc_cv) 29 | str(hpc_cv) 30 | } 31 | \keyword{datasets} 32 | -------------------------------------------------------------------------------- /man/hpc_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hpc_data.R 3 | \docType{data} 4 | \name{hpc_data} 5 | \alias{hpc_data} 6 | \title{High-performance computing system data} 7 | \source{ 8 | Kuhn, M., Johnson, K. (2013) \emph{Applied Predictive Modeling}, Springer. 9 | } 10 | \value{ 11 | \item{hpc_data}{a tibble} 12 | } 13 | \description{ 14 | Kuhn and Johnson (2013) describe a data set where characteristics of unix 15 | jobs were used to classify their completion times as either very fast 16 | (1 min or less, \code{VF}), fast (1–5 min, \code{F}), moderate (5–30 min, \code{M}), or 17 | long (greater than 30 min, \code{L}).
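A quick sketch of inspecting the four-way class balance (assuming the
outcome column is named \code{class}):

\preformatted{data(hpc_data, package = "modeldata")
table(hpc_data$class)  # counts for VF, F, M, and L
}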
18 | } 19 | \examples{ 20 | 21 | data(hpc_data) 22 | str(hpc_data) 23 | } 24 | \keyword{datasets} 25 | -------------------------------------------------------------------------------- /man/ischemic_stroke.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ischemic_stroke.R 3 | \docType{data} 4 | \name{ischemic_stroke} 5 | \alias{ischemic_stroke} 6 | \title{Clinical data used to predict ischemic stroke} 7 | \source{ 8 | Kuhn, Max, and Kjell Johnson. \emph{Feature Engineering and Selection: A Practical 9 | Approach for Predictive Models}. Chapman and Hall/CRC, 2019. 10 | } 11 | \value{ 12 | \item{ischemic_stroke}{a tibble} 13 | } 14 | \description{ 15 | A data set to predict a binary outcome using imaging and patient data. 16 | } 17 | \details{ 18 | These data were gathered to predict patient risk for ischemic stroke. A 19 | historical set of patients with a range of carotid artery blockages were 20 | selected. The data consisted of 126 patients, 44 of which had blockages 21 | greater than 70\%. All patients had undergone Computed Tomography Angiography 22 | (CTA) to generate a detailed three-dimensional visualization and 23 | characterization of the blockage. These images were then analyzed in order to 24 | compute several features related to the disease, including: percent stenosis, 25 | arterial wall thickness, and tissue characteristics such as lipid-rich 26 | necrotic core and calcification. 27 | 28 | The group of patients in this study also had follow-up information on 29 | whether or not a stroke occurred at a subsequent point in time. The data for 30 | each patient also included commonly collected clinical characteristics for 31 | risk of stroke such as whether or not the patient had atrial fibrillation, 32 | coronary artery disease, and a history of smoking. Demographics of gender and 33 | age were included as well. These readily available risk factors can be 34 | thought of as another potentially useful predictor set that can be evaluated. 35 | In fact, this set of predictors should be evaluated first to assess their 36 | ability to predict stroke since these predictors are easy to collect, are 37 | acquired at patient presentation, and do not require an expensive imaging 38 | technique. 
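A hedged sketch of that "clinical predictors first" screen (tidymodels; the
predictor names are taken from the column list below):

\preformatted{library(tidymodels)
data(ischemic_stroke, package = "modeldata")
clinical_rec <- recipe(
  stroke ~ age + male + smoking_history + atrial_fibrillation +
    coronary_artery_disease + diabetes_history +
    hypercholesterolemia_history + hypertension_history,
  data = ischemic_stroke
)
}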
39 | 40 | Columns: 41 | \itemize{ 42 | \item \code{stroke}: factor (levels: 'yes' and 'no') 43 | \item \code{nascet_scale}: numeric 44 | \item \code{calc_vol}: numeric 45 | \item \code{calc_vol_prop}: numeric 46 | \item \code{matx_vol}: numeric 47 | \item \code{matx_vol_prop}: numeric 48 | \item \code{lrnc_vol}: numeric 49 | \item \code{lrnc_vol_prop}: numeric 50 | \item \code{max_calc_area}: numeric 51 | \item \code{max_calc_area_prop}: numeric 52 | \item \code{max_dilation_by_area}: numeric 53 | \item \code{max_matx_area}: numeric 54 | \item \code{max_matx_area_prop}: numeric 55 | \item \code{max_lrnc_area}: numeric 56 | \item \code{max_lrnc_area_prop}: numeric 57 | \item \code{max_max_wall_thickness}: numeric 58 | \item \code{max_remodeling_ratio}: numeric 59 | \item \code{max_stenosis_by_area}: numeric 60 | \item \code{max_wall_area}: numeric 61 | \item \code{wall_vol}: numeric 62 | \item \code{max_stenosis_by_diameter}: numeric 63 | \item \code{age}: integer 64 | \item \code{male}: integer 65 | \item \code{smoking_history}: integer 66 | \item \code{atrial_fibrillation}: integer 67 | \item \code{coronary_artery_disease}: integer 68 | \item \code{diabetes_history}: integer 69 | \item \code{hypercholesterolemia_history}: integer 70 | \item \code{hypertension_history}: integer 71 | } 72 | } 73 | \examples{ 74 | data(ischemic_stroke) 75 | str(ischemic_stroke) 76 | 77 | } 78 | -------------------------------------------------------------------------------- /man/leaf_id_flavia.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/leaf_id_flavia.R 3 | \docType{data} 4 | \name{leaf_id_flavia} 5 | \alias{leaf_id_flavia} 6 | \title{Leaf identification data (Flavia)} 7 | \source{ 8 | Lakshika, Jayani PG, and Thiyanga S. Talagala. "Computer-aided interpretable 9 | features for leaf image classification." \emph{arXiv preprint} arXiv:2106.08077 10 | (2021). 11 | 12 | \url{https://github.com/SMART-Research/leaffeatures_paper} 13 | } 14 | \value{ 15 | \item{leaf_id_flavia}{a data frame} 16 | } 17 | \description{ 18 | Image analysis of leaves to predict species. 19 | } 20 | \details{ 21 | From the original manuscript: "The Flavia dataset contains 1907 leaf images. 22 | There are 32 different species and each has 50-77 images. Scanners and 23 | digital cameras are used to acquire the leaf images on a plain background. 24 | The isolated leaf images contain blades only, without a petiole. These leaf 25 | images are collected from the most common plants in Yangtze Delta, 26 | China. Those leaves were sampled on the campus of the Nanjing University and 27 | the Sun Yat-Sen arboretum, Nanking, China." 28 | 29 | The reference below has detailed information on the features used for 30 | prediction.
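A small sketch of the prediction target (the \code{species} factor listed
below):

\preformatted{data(leaf_id_flavia, package = "modeldata")
nlevels(leaf_id_flavia$species)  # 32 species
}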
31 | 32 | Columns: 33 | \itemize{ 34 | \item \code{species}: factor (32 levels) 35 | \item \code{apex}: factor (9 levels) 36 | \item \code{base}: factor (6 levels) 37 | \item \code{shape}: factor (5 levels) 38 | \item \code{denate_edge}: factor (levels: 'no' and 'yes') 39 | \item \code{lobed_edge}: factor (levels: 'no' and 'yes') 40 | \item \code{smooth_edge}: factor (levels: 'no' and 'yes') 41 | \item \code{toothed_edge}: factor (levels: 'no' and 'yes') 42 | \item \code{undulate_edge}: factor (levels: 'no' and 'yes') 43 | \item \code{outlying_polar}: numeric 44 | \item \code{skewed_polar}: numeric 45 | \item \code{clumpy_polar}: numeric 46 | \item \code{sparse_polar}: numeric 47 | \item \code{striated_polar}: numeric 48 | \item \code{convex_polar}: numeric 49 | \item \code{skinny_polar}: numeric 50 | \item \code{stringy_polar}: numeric 51 | \item \code{monotonic_polar}: numeric 52 | \item \code{outlying_contour}: numeric 53 | \item \code{skewed_contour}: numeric 54 | \item \code{clumpy_contour}: numeric 55 | \item \code{sparse_contour}: numeric 56 | \item \code{striated_contour}: numeric 57 | \item \code{convex_contour}: numeric 58 | \item \code{skinny_contour}: numeric 59 | \item \code{stringy_contour}: numeric 60 | \item \code{monotonic_contour}: numeric 61 | \item \code{num_max_ponits}: numeric 62 | \item \code{num_min_points}: numeric 63 | \item \code{diameter}: numeric 64 | \item \code{area}: numeric 65 | \item \code{perimeter}: numeric 66 | \item \code{physiological_length}: numeric 67 | \item \code{physiological_width}: numeric 68 | \item \code{aspect_ratio}: numeric 69 | \item \code{rectangularity}: numeric 70 | \item \code{circularity}: numeric 71 | \item \code{compactness}: numeric 72 | \item \code{narrow_factor}: numeric 73 | \item \code{perimeter_ratio_diameter}: numeric 74 | \item \code{perimeter_ratio_length}: numeric 75 | \item \code{perimeter_ratio_lw}: numeric 76 | \item \code{num_convex_points}: numeric 77 | \item \code{perimeter_convexity}: numeric 78 | \item \code{area_convexity}: numeric 79 | \item \code{area_ratio_convexity}: numeric 80 | \item \code{equivalent_diameter}: numeric 81 | \item \code{eccentriciry}: numeric 82 | \item \code{contrast}: numeric 83 | \item \code{correlation_texture}: numeric 84 | \item \code{inverse_difference_moments}: numeric 85 | \item \code{entropy}: numeric 86 | \item \code{mean_red_val}: numeric 87 | \item \code{mean_green_val}: numeric 88 | \item \code{mean_blue_val}: numeric 89 | \item \code{std_red_val}: numeric 90 | \item \code{std_green_val}: numeric 91 | \item \code{std_blue_val}: numeric 92 | \item \code{correlation}: numeric 93 | } 94 | } 95 | \examples{ 96 | data(leaf_id_flavia) 97 | str(leaf_id_flavia) 98 | 99 | } 100 | -------------------------------------------------------------------------------- /man/lending_club.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/lending_club.R 3 | \docType{data} 4 | \name{lending_club} 5 | \alias{lending_club} 6 | \title{Loan data} 7 | \source{ 8 | Lending Club Statistics https://www.lendingclub.com/info/download-data.action 9 | } 10 | \value{ 11 | \item{lending_club}{a data frame} 12 | } 13 | \description{ 14 | Loan data 15 | } 16 | \details{ 17 | These data were downloaded from the Lending Club 18 | access site (see below) and are from the first quarter of 2016. 19 | A subset of the rows and variables are included here. 
The 20 | outcome is in the variable \code{Class} and is either "good" (meaning 21 | that the loan was fully paid back or currently on-time) or "bad" 22 | (charged off, defaulted, or 21-120 days late). A data dictionary 23 | can be found on the source website. 24 | } 25 | \examples{ 26 | data(lending_club) 27 | str(lending_club) 28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /man/meats.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/meats.R 3 | \docType{data} 4 | \name{meats} 5 | \alias{meats} 6 | \title{Fat, water and protein content of meat samples} 7 | \value{ 8 | \item{meats}{a tibble} 9 | } 10 | \description{ 11 | "These data are recorded on a Tecator Infratec Food and Feed Analyzer 12 | working in the wavelength range 850 - 1050 nm by the Near Infrared 13 | Transmission (NIT) principle. Each sample contains finely chopped pure meat 14 | with different moisture, fat and protein contents. 15 | } 16 | \details{ 17 | If results from these data are used in a publication we want you to mention 18 | the instrument and company name (Tecator) in the publication. In addition, 19 | please send a preprint of your article to: 20 | 21 | Karin Thente, Tecator AB, Box 70, S-263 21 Hoganas, Sweden 22 | 23 | The data are available in the public domain with no responsibility from the 24 | original data source. The data can be redistributed as long as this 25 | permission note is attached." 26 | 27 | "For each meat sample the data consists of a 100 channel spectrum of 28 | absorbances and the contents of moisture (water), fat and protein. The 29 | absorbance is -log10 of the transmittance measured by the spectrometer. The 30 | three contents, measured in percent, are determined by analytic chemistry." 31 | 32 | Included here are the training, monitoring and test sets. 33 | } 34 | \examples{ 35 | 36 | data(meats) 37 | str(meats) 38 | } 39 | \keyword{datasets} 40 | -------------------------------------------------------------------------------- /man/mlc_churn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/churn.R 3 | \docType{data} 4 | \name{mlc_churn} 5 | \alias{mlc_churn} 6 | \title{Customer churn data} 7 | \source{ 8 | Originally at \verb{http://www.sgi.com/tech/mlc/} 9 | } 10 | \value{ 11 | \item{mlc_churn}{a tibble} 12 | } 13 | \description{ 14 | A data set from the MLC++ machine learning software for modeling customer 15 | churn. There are 19 predictors, mostly numeric: \code{state} (categorical), 16 | \code{account_length}, \code{area_code}, \code{international_plan} (yes/no), 17 | \code{voice_mail_plan} (yes/no), \code{number_vmail_messages}, 18 | \code{total_day_minutes}, \code{total_day_calls}, \code{total_day_charge}, 19 | \code{total_eve_minutes}, \code{total_eve_calls}, \code{total_eve_charge}, 20 | \code{total_night_minutes}, \code{total_night_calls}, 21 | \code{total_night_charge}, \code{total_intl_minutes}, 22 | \code{total_intl_calls}, \code{total_intl_charge}, and 23 | \code{number_customer_service_calls}. 24 | } 25 | \details{ 26 | The outcome is contained in a column called \code{churn} (also yes/no). 27 | A note in one of the source files states that the data are "artificial based 28 | on claims similar to real world".
29 | } 30 | \examples{ 31 | data(mlc_churn) 32 | str(mlc_churn) 33 | } 34 | \keyword{datasets} 35 | -------------------------------------------------------------------------------- /man/modeldata-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/modeldata-package.R 3 | \docType{package} 4 | \name{modeldata-package} 5 | \alias{modeldata} 6 | \alias{modeldata-package} 7 | \title{modeldata: Data Sets Useful for Modeling Examples} 8 | \description{ 9 | Data sets used for demonstrating or testing model-related packages are contained in this package. 10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://modeldata.tidymodels.org} 15 | \item \url{https://github.com/tidymodels/modeldata} 16 | \item Report bugs at \url{https://github.com/tidymodels/modeldata/issues} 17 | } 18 | 19 | } 20 | \author{ 21 | \strong{Maintainer}: Max Kuhn \email{max@posit.co} 22 | 23 | Other contributors: 24 | \itemize{ 25 | \item Posit Software, PBC (03wc8by49) [copyright holder, funder] 26 | } 27 | 28 | } 29 | \keyword{internal} 30 | -------------------------------------------------------------------------------- /man/oils.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/oils.R 3 | \docType{data} 4 | \name{oils} 5 | \alias{oils} 6 | \title{Fatty acid composition of commercial oils} 7 | \source{ 8 | Brodnjak-Voncina et al. (2005). Multivariate data analysis in 9 | classification of vegetable oils characterized by the content of fatty 10 | acids, \emph{Chemometrics and Intelligent Laboratory Systems}, Vol. 11 | 75:31-45. 12 | } 13 | \value{ 14 | \item{oils}{a tibble} 15 | } 16 | \description{ 17 | Fatty acid concentrations of commercial oils were measured using gas 18 | chromatography. The data are used to predict the type of oil. Note that 19 | only the known oils are in the data set. Also, the authors state that there 20 | are 95 samples of known oils. However, we count 96 in Table 1 (pgs. 33-35). 21 | } 22 | \examples{ 23 | data(oils) 24 | str(oils) 25 | } 26 | \keyword{datasets} 27 | -------------------------------------------------------------------------------- /man/parabolic.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/parabolic.R 3 | \docType{data} 4 | \name{parabolic} 5 | \alias{parabolic} 6 | \title{Parabolic class boundary data} 7 | \value{ 8 | \item{parabolic}{a data frame} 9 | } 10 | \description{ 11 | Parabolic class boundary data 12 | } 13 | \details{ 14 | These data were simulated. There are two correlated predictors and 15 | two classes in the factor outcome. 16 | } 17 | \examples{ 18 | data(parabolic) 19 | str(parabolic) 20 | } 21 | \keyword{datasets} 22 | -------------------------------------------------------------------------------- /man/pathology.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pathology.R 3 | \docType{data} 4 | \name{pathology} 5 | \alias{pathology} 6 | \title{Liver pathology data} 7 | \source{ 8 | Altman, D.G., Bland, J.M. (1994) ``Diagnostic tests 1: 9 | sensitivity and specificity,'' \emph{British Medical Journal}, 10 | vol 308, 1552.
11 | } 12 | \value{ 13 | \item{pathology}{a data frame} 14 | } 15 | \description{ 16 | Liver pathology data 17 | } 18 | \details{ 19 | These data have the results of an \emph{x}-ray examination 20 | to determine whether the liver is abnormal or not (in the \code{scan} 21 | column) versus the more extensive pathology results that 22 | approximate the truth (in \code{pathology}). 23 | } 24 | \examples{ 25 | data(pathology) 26 | str(pathology) 27 | } 28 | \keyword{datasets} 29 | -------------------------------------------------------------------------------- /man/pd_speech.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/pd_speech.R 3 | \docType{data} 4 | \name{pd_speech} 5 | \alias{pd_speech} 6 | \title{Parkinson's disease speech classification data set} 7 | \source{ 8 | UCI ML repository (data) https://archive.ics.uci.edu/ml/datasets/Parkinson\%27s+Disease+Classification#, 9 | 10 | Sakar et al (2019), "A comparative analysis of speech signal processing 11 | algorithms for Parkinson’s disease classification and the use of the tunable 12 | Q-factor wavelet transform", \emph{Applied Soft Computing}, V74, pg 255-263. 13 | } 14 | \value{ 15 | \item{pd_speech}{a data frame} 16 | } 17 | \description{ 18 | Parkinson's disease speech classification data set 19 | } 20 | \details{ 21 | From the UCI ML archive, the description is "The data used in this 22 | study were gathered from 188 patients with PD (107 men and 81 women) with 23 | ages ranging from 33 to 87 (65.1 p/m 10.9) at the Department of Neurology 24 | in Cerrahpaşa Faculty of Medicine, Istanbul University. The control group 25 | consists of 64 healthy individuals (23 men and 41 women) with ages varying 26 | between 41 and 82 (61.1 p/m 8.9). During the data collection process, 27 | the microphone is set to 44.1 KHz and following the physician's examination, 28 | the sustained phonation of the vowel \verb{/a/} was collected from each subject 29 | with three repetitions." 30 | 31 | The data here are averaged over the replicates. 32 | } 33 | \examples{ 34 | data(pd_speech) 35 | str(pd_speech) 36 | } 37 | \keyword{datasets} 38 | -------------------------------------------------------------------------------- /man/penguins.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/penguins.R 3 | \docType{data} 4 | \name{penguins} 5 | \alias{penguins} 6 | \title{Palmer Station penguin data} 7 | \source{ 8 | Gorman KB, Williams TD, Fraser WR (2014) Ecological Sexual Dimorphism 9 | and Environmental Variability within a Community of Antarctic Penguins 10 | (\emph{Genus Pygoscelis}). PLoS ONE 9(3): e90081. 11 | \doi{10.1371/journal.pone.0090081} 12 | 13 | \url{https://github.com/allisonhorst/palmerpenguins} 14 | } 15 | \value{ 16 | \item{penguins}{a tibble} 17 | } 18 | \description{ 19 | A data set from Gorman, Williams, and Fraser (2014) containing measurements 20 | from different types of penguins. This version of the data was retrieved from 21 | Allison Horst's \code{palmerpenguins} package on 2020-06-22.
22 | } 23 | \examples{ 24 | data(penguins) 25 | str(penguins) 26 | } 27 | \keyword{datasets} 28 | -------------------------------------------------------------------------------- /man/permeability_qsar.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/permeability_qsar.R 3 | \docType{data} 4 | \name{permeability_qsar} 5 | \alias{permeability_qsar} 6 | \title{Predicting permeability from chemical information} 7 | \source{ 8 | Kuhn, Max, and Kjell Johnson. \emph{Applied predictive modeling}. New York: 9 | Springer, 2013. 10 | } 11 | \value{ 12 | \item{permeability_qsar}{a data frame} 13 | } 14 | \description{ 15 | A quantitative structure-activity relationship (QSAR) data set to predict 16 | when a molecule can permeate cells. 17 | } 18 | \details{ 19 | This pharmaceutical data set was used to develop a model for predicting 20 | compounds' permeability. In short, permeability is the measure of a 21 | molecule's ability to cross a membrane. The body, for example, has notable 22 | membranes between the body and brain, known as the blood-brain barrier, and 23 | between the gut and body in the intestines. These membranes help the body 24 | guard critical regions from receiving undesirable or detrimental substances. 25 | For an orally taken drug to be effective in the brain, it first must pass 26 | through the intestinal wall and then must pass through the blood-brain 27 | barrier in order to be present for the desired neurological target. 28 | Therefore, a compound's ability to permeate relevant biological membranes 29 | is critically important to understand early in the drug discovery process. 30 | Compounds that appear to be effective for a particular disease in research 31 | screening experiments, but appear to be poorly permeable, may need to be 32 | altered in order to improve permeability, and thus the compound's ability to 33 | reach the desired target. Identifying permeability problems can help guide 34 | chemists towards better molecules. 35 | 36 | Permeability assays such as PAMPA and Caco-2 have been developed to help 37 | measure compounds' permeability (Kansy et al, 1998). These screens are 38 | effective at quantifying a compound's permeability, but the assay is 39 | expensive and labor intensive. Given a sufficient number of compounds that have 40 | been screened, we could develop a predictive model for permeability in an 41 | attempt to potentially reduce the need for the assay. In this project there 42 | were 165 unique compounds; 1107 molecular fingerprints were determined for 43 | each. A molecular fingerprint is a binary sequence of numbers that 44 | represents the presence or absence of a specific molecular sub-structure. 45 | The response is highly skewed, the predictors are sparse (15.5\% are present), 46 | and many predictors are strongly associated.
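A quick sketch that checks those two properties, assuming the column layout listed below (the \code{permeability} outcome followed by the binary fingerprint columns):

```r
# A sketch: verify the sparsity and skewness described above
data(permeability_qsar, package = "modeldata")
mean(as.matrix(permeability_qsar[, -1]))  # proportion of fingerprint bits present (about 0.155)
hist(permeability_qsar$permeability)      # the response is highly skewed
```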
47 | 48 | Columns: 49 | \itemize{ 50 | \item \code{permeability}: numeric 51 | \item \code{chem_fp_0001} - \code{chem_fp_1107}: numeric 52 | } 53 | } 54 | \examples{ 55 | data(permeability_qsar) 56 | str(permeability_qsar) 57 | 58 | } 59 | -------------------------------------------------------------------------------- /man/rmd/ames.md: -------------------------------------------------------------------------------- 1 | For these data, the training materials typically use: 2 | 3 | ```r 4 | library(tidymodels) 5 | 6 | set.seed(4595) 7 | data_split <- initial_split(ames, strata = "Sale_Price") 8 | ames_train <- training(data_split) 9 | ames_test <- testing(data_split) 10 | 11 | set.seed(2453) 12 | ames_folds <- vfold_cv(ames_train) 13 | ``` 14 | 15 | -------------------------------------------------------------------------------- /man/scat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/scat.R 3 | \docType{data} 4 | \name{scat} 5 | \alias{scat} 6 | \title{Morphometric data on scat} 7 | \source{ 8 | Reid, R. E. B. (2015). A morphometric modeling approach to 9 | distinguishing among bobcat, coyote and gray fox scats. \emph{Wildlife 10 | Biology}, 21(5), 254-262 11 | } 12 | \value{ 13 | \item{scat}{a tibble} 14 | } 15 | \description{ 16 | Reid (2015) collected data on animal feces in coastal California. The data 17 | consist of DNA verified species designations as well as fields related to 18 | the time and place of the collection and the scat itself. The data are on 19 | the three main species. 20 | } 21 | \examples{ 22 | data(scat) 23 | str(scat) 24 | } 25 | \keyword{datasets} 26 | -------------------------------------------------------------------------------- /man/sim_classification.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/simulations.R 3 | \name{sim_classification} 4 | \alias{sim_classification} 5 | \alias{sim_regression} 6 | \alias{sim_noise} 7 | \alias{sim_logistic} 8 | \alias{sim_multinomial} 9 | \title{Simulate datasets} 10 | \usage{ 11 | sim_classification( 12 | num_samples = 100, 13 | method = "caret", 14 | intercept = -5, 15 | num_linear = 10, 16 | keep_truth = FALSE 17 | ) 18 | 19 | sim_regression( 20 | num_samples = 100, 21 | method = "sapp_2014_1", 22 | std_dev = NULL, 23 | factors = FALSE, 24 | keep_truth = FALSE 25 | ) 26 | 27 | sim_noise( 28 | num_samples, 29 | num_vars, 30 | cov_type = "exchangeable", 31 | outcome = "none", 32 | num_classes = 2, 33 | cov_param = 0 34 | ) 35 | 36 | sim_logistic(num_samples, eqn, correlation = 0, keep_truth = FALSE) 37 | 38 | sim_multinomial( 39 | num_samples, 40 | eqn_1, 41 | eqn_2, 42 | eqn_3, 43 | correlation = 0, 44 | keep_truth = FALSE 45 | ) 46 | } 47 | \arguments{ 48 | \item{num_samples}{Number of data points to simulate.} 49 | 50 | \item{method}{A character string for the simulation method. For 51 | classification, the single current option is "caret". For regression, 52 | values can be \code{"sapp_2014_1"}, \code{"sapp_2014_2"}, \code{"van_der_laan_2007_1"}, 53 | \code{"van_der_laan_2007_2"}, \code{"hooker_2004"}, or \code{"worley_1987"}.
See Details 54 | below.} 55 | 56 | \item{intercept}{The intercept for the linear predictor.} 57 | 58 | \item{num_linear}{Number of diminishing linear effects.} 59 | 60 | \item{keep_truth}{A logical: should the true outcome value be retained for 61 | the data? If so, the column name is \code{.truth}.} 62 | 63 | \item{std_dev}{Gaussian distribution standard deviation for residuals. 64 | Default values are shown below in Details.} 65 | 66 | \item{factors}{A single logical for whether the binary indicators should be 67 | encoded as factors or not.} 68 | 69 | \item{num_vars}{Number of noise predictors to create.} 70 | 71 | \item{cov_type}{The multivariate normal correlation structure of the 72 | predictors. Possible values are "exchangeable" and "toeplitz".} 73 | 74 | \item{outcome}{A single character string for what type of independent outcome 75 | should be simulated (if any). The default value of "none" produces no extra 76 | columns. Using "classification" will generate a \code{class} column with 77 | \code{num_classes} values, equally distributed. A value of "regression" results 78 | in an \code{outcome} column that contains independent standard normal values.} 79 | 80 | \item{num_classes}{When \code{outcome = "classification"}, the number of classes 81 | to simulate.} 82 | 83 | \item{cov_param}{A single numeric value for the exchangeable correlation 84 | value or the base of the Toeplitz structure. See Details below.} 85 | 86 | \item{eqn, eqn_1, eqn_2, eqn_3}{An R expression or (one-sided) formula that 87 | only involves variables \code{A} and \code{B}, used to compute the linear 88 | predictor. External objects should not be used as symbols; see the examples 89 | below on how to use external objects in the equations.} 90 | 91 | \item{correlation}{A single numeric value for the correlation between variables 92 | \code{A} and \code{B}.} 93 | } 94 | \description{ 95 | These functions can be used to generate simulated data for supervised 96 | (classification and regression) and unsupervised modeling applications. 97 | } 98 | \details{ 99 | \subsection{Specific Regression and Classification methods}{ 100 | 101 | These functions provide several supervised simulation methods (and one 102 | unsupervised). Each \code{method} is described below: 103 | \subsection{\code{method = "caret"}}{ 104 | 105 | This is a simulated classification problem with two classes, originally 106 | implemented in \code{\link[caret:twoClassSim]{caret::twoClassSim()}} with all numeric predictors. The 107 | predictors are simulated in different sets. First, two multivariate normal 108 | predictors (denoted here as \code{two_factor_1} and \code{two_factor_2}) are created 109 | with a correlation of about 0.65. They change the log-odds using main 110 | effects and an interaction: 111 | 112 | \preformatted{ intercept - 4 * two_factor_1 + 4 * two_factor_2 + 2 * two_factor_1 * two_factor_2 } 113 | 114 | The intercept is a parameter for the simulation and can be used to control 115 | the amount of class imbalance. 116 | 117 | The second set of effects is linear with coefficients that alternate signs 118 | and have a sequence of values between 2.5 and 0.25. For example, if there 119 | were four predictors in this set, their contribution to the log-odds would 120 | be 121 | 122 | \preformatted{ -2.5 * linear_1 + 1.75 * linear_2 - 1.00 * linear_3 + 0.25 * linear_4} 123 | 124 | (Note that these column names may change based on the value of \code{num_linear}).
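As a short illustration, the alternating, diminishing coefficient sequence above can be reproduced directly; this is a sketch only (the simulation computes the coefficients internally):

```r
# Illustrative: the signs alternate and the magnitudes run from 2.5 down to 0.25
num_linear <- 4
coefs <- seq(2.5, 0.25, length.out = num_linear) *
  rep(c(-1, 1), length.out = num_linear)
coefs
#> [1] -2.50  1.75 -1.00  0.25
```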
125 | 126 | The third set is a nonlinear function of a single predictor ranging between 127 | \verb{[0, 1]} called \code{non_linear_1} here: 128 | 129 | \preformatted{ (non_linear_1^3) + 2 * exp(-6 * (non_linear_1 - 0.3)^2) } 130 | 131 | The fourth set of informative predictors is copied from one of Friedman's 132 | systems and uses two more predictors (\code{non_linear_2} and \code{non_linear_3}): 133 | 134 | \preformatted{ 2 * sin(non_linear_2 * non_linear_3) } 135 | 136 | All of these effects are added up to model the log-odds. 137 | } 138 | 139 | \subsection{\code{method = "sapp_2014_1"}}{ 140 | 141 | This regression simulation is from Sapp et al. (2014). There are 20 142 | independent Gaussian random predictors with mean zero and a variance of 9. 143 | The prediction equation is: 144 | 145 | \preformatted{ 146 | predictor_01 + sin(predictor_02) + log(abs(predictor_03)) + 147 | predictor_04^2 + predictor_05 * predictor_06 + 148 | ifelse(predictor_07 * predictor_08 * predictor_09 < 0, 1, 0) + 149 | ifelse(predictor_10 > 0, 1, 0) + predictor_11 * ifelse(predictor_11 > 0, 1, 0) + 150 | sqrt(abs(predictor_12)) + cos(predictor_13) + 2 * predictor_14 + abs(predictor_15) + 151 | ifelse(predictor_16 < -1, 1, 0) + predictor_17 * ifelse(predictor_17 < -1, 1, 0) - 152 | 2 * predictor_18 - predictor_19 * predictor_20 153 | } 154 | 155 | The error is Gaussian with mean zero and variance 9. 156 | } 157 | 158 | \subsection{\code{method = "sapp_2014_2"}}{ 159 | 160 | This regression simulation is also from Sapp et al. (2014). There are 200 161 | independent Gaussian predictors with mean zero and variance 16. The 162 | prediction equation has an intercept of one and identical linear effects of 163 | \code{log(abs(predictor))}. 164 | 165 | The error is Gaussian with mean zero and variance 25. 166 | } 167 | 168 | \subsection{\code{method = "van_der_laan_2007_1"}}{ 169 | 170 | This is a regression simulation from van der Laan et al. (2007) with ten 171 | random Bernoulli variables that have a 40\% probability of being a value of 172 | one. The true regression equation is: 173 | 174 | \preformatted{ 175 | 2 * predictor_01 * predictor_10 + 4 * predictor_02 * predictor_07 + 176 | 3 * predictor_04 * predictor_05 - 5 * predictor_06 * predictor_10 + 177 | 3 * predictor_08 * predictor_09 + predictor_01 * predictor_02 * predictor_04 - 178 | 2 * predictor_07 * (1 - predictor_06) * predictor_02 * predictor_09 - 179 | 4 * (1 - predictor_10) * predictor_01 * (1 - predictor_04) 180 | } 181 | 182 | The error term is standard normal. 183 | } 184 | 185 | \subsection{\code{method = "van_der_laan_2007_2"}}{ 186 | 187 | This is another regression simulation from van der Laan et al. (2007) with 188 | twenty Gaussian predictors with mean zero and variance 16. The prediction equation is: 189 | 190 | \preformatted{ 191 | predictor_01 * predictor_02 + predictor_10^2 - predictor_03 * predictor_17 - 192 | predictor_15 * predictor_04 + predictor_09 * predictor_05 + predictor_19 - 193 | predictor_20^2 + predictor_09 * predictor_08 194 | } 195 | 196 | The error term is also Gaussian with mean zero and variance 16.
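A brief usage sketch for this method (the seed and sample size here are arbitrary):

```r
# A sketch: simulate from the van der Laan et al. (2007) system described above
set.seed(2007)
vdl2 <- sim_regression(250, method = "van_der_laan_2007_2")
dim(vdl2)  # 250 rows; the outcome column plus 20 predictors
```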
197 | } 198 | 199 | \subsection{\code{method = "hooker_2004"}}{ 200 | 201 | Hooker (2004) and Sorokina \emph{et al} (2008) used the following: 202 | 203 | \preformatted{ 204 | pi ^ (predictor_01 * predictor_02) * sqrt(2 * predictor_03) - 205 | asin(predictor_04) + log(predictor_03 + predictor_05) - 206 | (predictor_09 / predictor_10) * sqrt(predictor_07 / predictor_08) - 207 | predictor_02 * predictor_07 208 | } 209 | 210 | Predictors 1, 2, 3, 6, 7, and 9 are standard uniform while the others are 211 | uniform on \verb{[0.6, 1.0]}. The errors are normal with mean zero and default 212 | standard deviation of 0.25. 213 | } 214 | 215 | \subsection{\code{method = "worley_1987"}}{ 216 | 217 | The simulation system from Worley (1987) is based on a mechanistic model for 218 | the flow rate of liquids from two aquifers positioned vertically (i.e., 219 | the "upper" and "lower" aquifers). There are two sets of predictors: 220 | \itemize{ 221 | \item the borehole radius (\code{radius_borehole} from 0.05 to 0.15) and length 222 | (\code{length_borehole} from 1,120 to 1,680). 223 | \item the radius of effect that the system has on collecting water 224 | (\code{radius_influence} from 100 to 50,000) 225 | } 226 | 227 | and physical properties: 228 | \itemize{ 229 | \item \code{transmissibility_upper_aq} 230 | \item \code{potentiometric_upper_aq} 231 | \item \code{transmissibility_lower_aq} 232 | \item \code{potentiometric_lower_aq} 233 | \item \code{conductivity_borehole} 234 | } 235 | 236 | A multiplicative error structure is used; the mechanistic equation is 237 | multiplied by an exponentiated Gaussian random error. 238 | 239 | The references give feasible ranges for each of these variables. See also 240 | Morris \emph{et al} (1993). 241 | } 242 | 243 | } 244 | 245 | \subsection{\code{sim_noise()}}{ 246 | 247 | This function simulates a number of random normal variables with mean zero. 248 | The values can be independent if \code{cov_param = 0}. Otherwise the values are 249 | multivariate normal with non-diagonal covariance matrices. For 250 | \code{cov_type = "exchangeable"}, the structure has unit variances and covariances 251 | of \code{cov_param}. With \code{cov_type = "toeplitz"}, the covariances have an 252 | exponential pattern that decays as the distance between predictor indices grows. 253 | } 254 | 255 | \subsection{Logistic simulation}{ 256 | 257 | \code{sim_logistic()} provides a flexible interface to simulating a logistic 258 | regression model with two multivariate normal variables \code{A} and \code{B} (with 259 | zero mean, unit variances and correlation determined by the \code{correlation} 260 | argument). 261 | 262 | For example, using \code{eqn = A + B} would specify that the true probability of 263 | the event was 264 | 265 | \preformatted{ 266 | prob = 1 / (1 + exp(-(A + B))) 267 | } 268 | 269 | The class levels for the outcome column are \code{"one"} and \code{"two"}. 270 | } 271 | 272 | \subsection{Multinomial simulation}{ 273 | 274 | \code{sim_multinomial()} can generate data with classes \code{"one"}, \code{"two"}, and 275 | \code{"three"} based on the values in arguments \code{eqn_1}, \code{eqn_2}, and \code{eqn_3}, 276 | respectively. Like \code{\link[=sim_logistic]{sim_logistic()}}, these equations use predictors \code{A} and 277 | \code{B}. 278 | 279 | The individual equations are evaluated and exponentiated. After this, their 280 | values are, for each row of data, normalized to add up to one.
These 281 | probabilities are then passed to \code{\link[stats:Multinom]{stats::rmultinom()}} to generate the outcome 282 | values. 283 | } 284 | } 285 | \examples{ 286 | set.seed(1) 287 | sim_regression(100) 288 | sim_classification(100) 289 | 290 | # Flexible logistic regression simulation 291 | if (rlang::is_installed("ggplot2")) { 292 | library(dplyr) 293 | library(ggplot2) 294 | 295 | sim_logistic(1000, ~ .1 + 2 * A - 3 * B + 1 * A * B, correlation = .7) |> 296 | ggplot(aes(A, B, col = class)) + 297 | geom_point(alpha = 1/2) + 298 | coord_equal() 299 | 300 | f_xor <- ~ 10 * xor(A > 0, B < 0) 301 | # or 302 | f_xor <- rlang::expr(10 * xor(A > 0, B < 0)) 303 | 304 | sim_logistic(1000, f_xor, keep_truth = TRUE) |> 305 | ggplot(aes(A, B, col = class)) + 306 | geom_point(alpha = 1/2) + 307 | coord_equal() + 308 | theme_bw() 309 | } 310 | 311 | ## How to use external symbols: 312 | 313 | a_coef <- 2 314 | # splice the value in using rlang's !! operator 315 | lp_eqn <- rlang::expr(!!a_coef * A + B) 316 | lp_eqn 317 | sim_logistic(5, lp_eqn) 318 | 319 | # Flexible multinomial regression simulation 320 | if (rlang::is_installed("ggplot2")) { 321 | sim_multinomial(1000, ~ A + B, ~ A - B, ~ -A + 0.5 * B) |> ggplot(aes(A, B, col = class)) + geom_point(alpha = 1/2) + coord_equal() # a sketch; the three equations are arbitrary choices 322 | } 323 | } 324 | \references{ 325 | Hooker, G. (2004, August). Discovering additive structure in black box 326 | functions. In \emph{Proceedings of the tenth ACM SIGKDD international conference 327 | on Knowledge discovery and data mining} (pp. 575-580). 328 | DOI: 10.1145/1014052.1014122 329 | 330 | Morris, M. D., Mitchell, T. J., and Ylvisaker, D. (1993). Bayesian design 331 | and analysis of computer experiments: use of derivatives in surface 332 | prediction. \emph{Technometrics}, 35(3), 243-255. 333 | 334 | Sapp, S., van der Laan, M. J., and Canny, J. (2014). Subsemble: an ensemble 335 | method for combining subset-specific algorithm fits. \emph{Journal of applied 336 | statistics}, 41(6), 1247-1259. DOI: 10.1080/02664763.2013.864263 337 | 338 | Sorokina, D., Caruana, R., Riedewald, M., and Fink, D. (2008, July). Detecting 339 | statistical interactions with additive groves of trees. In \emph{Proceedings of 340 | the 25th international conference on Machine learning} (pp. 1000-1007). 341 | DOI: 10.1145/1390156.1390282 342 | 343 | Van der Laan, M. J., Polley, E. C., and Hubbard, A. E. (2007). Super learner. 344 | \emph{Statistical applications in genetics and molecular biology}, 6(1). 345 | DOI: 10.2202/1544-6115.1309. 346 | 347 | Worley, B. A. (1987). Deterministic uncertainty analysis (No. ORNL-6428). Oak 348 | Ridge National Lab. (ORNL), Oak Ridge, TN. 349 | } 350 | -------------------------------------------------------------------------------- /man/small_fine_foods.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/fine_foods.R 3 | \docType{data} 4 | \name{small_fine_foods} 5 | \alias{small_fine_foods} 6 | \alias{training_data} 7 | \alias{testing_data} 8 | \title{Fine foods example data} 9 | \source{ 10 | https://snap.stanford.edu/data/web-FineFoods.html 11 | } 12 | \value{ 13 | \item{training_data,testing_data}{tibbles} 14 | } 15 | \description{ 16 | Fine foods example data 17 | } 18 | \details{ 19 | These data are from Amazon, who describe it as "This dataset consists of 20 | reviews of fine foods from amazon. The data span a period of more than 10 21 | years, including all ~500,000 reviews up to October 2012. Reviews include 22 | product and user information, ratings, and a plaintext review."
23 | 24 | A subset of the data are contained here and are split into a training and 25 | test set. The training set sampled 10 products and retained all of their 26 | individual reviews. Since the reviews within these products are correlated, 27 | we recommend resampling the data using a leave-one-product-out approach. The 28 | test set sampled 500 products that were not included in the training set 29 | and selected a single review at random for each. 30 | 31 | There is a column for the product, a column for the text of the review, and 32 | a factor column for a class variable. The outcome is whether the reviewer 33 | gave the product a 5-star rating or not. 34 | } 35 | \examples{ 36 | data(small_fine_foods) 37 | str(training_data) 38 | str(testing_data) 39 | } 40 | \keyword{datasets} 41 | -------------------------------------------------------------------------------- /man/solubility_test.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/solubility.R 3 | \docType{data} 4 | \name{solubility_test} 5 | \alias{solubility_test} 6 | \title{Solubility predictions from MARS model} 7 | \source{ 8 | Kuhn, M., Johnson, K. (2013) \emph{Applied Predictive 9 | Modeling}, Springer 10 | } 11 | \value{ 12 | \item{solubility_test}{a data frame} 13 | } 14 | \description{ 15 | Solubility predictions from MARS model 16 | } 17 | \details{ 18 | For the solubility data in Kuhn and Johnson (2013), 19 | these data are the test set results for the MARS model. The 20 | observed solubility (in column \code{solubility}) and the model 21 | results (\code{prediction}) are contained in the data. 22 | } 23 | \examples{ 24 | data(solubility_test) 25 | str(solubility_test) 26 | } 27 | \keyword{datasets} 28 | -------------------------------------------------------------------------------- /man/stackoverflow.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stackoverflow.R 3 | \docType{data} 4 | \name{stackoverflow} 5 | \alias{stackoverflow} 6 | \title{Annual Stack Overflow Developer Survey Data} 7 | \source{ 8 | Julia Silge, \emph{Supervised Machine Learning Case Studies in R} 9 | 10 | \verb{https://supervised-ml-course.netlify.com/chapter2} 11 | 12 | Raw data: \verb{https://insights.stackoverflow.com/survey/} 13 | } 14 | \value{ 15 | \item{stackoverflow}{a tibble} 16 | } 17 | \description{ 18 | Annual Stack Overflow Developer Survey Data 19 | } 20 | \details{ 21 | These data are a collection of 5,594 data points collected on 22 | developers. These data could be used to try to predict who works remotely 23 | (as used in the source listed below). 24 | } 25 | \examples{ 26 | data(stackoverflow) 27 | str(stackoverflow) 28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /man/steroidogenic_toxicity.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/steroidogenic_toxicity.R 3 | \docType{data} 4 | \name{steroidogenic_toxicity} 5 | \alias{steroidogenic_toxicity} 6 | \title{Predicting steroidogenic toxicity with assay data} 7 | \source{ 8 | Maglich, J. M., Kuhn, M., Chapin, R. E., & Pletcher, M. T. (2014). More than 9 | just hormones: H295R cells as predictors of reproductive toxicity. 
10 | \emph{Reproductive Toxicology}, 45, 77-86. 11 | } 12 | \value{ 13 | A tibble with columns 14 | \itemize{ 15 | \item \code{class}: factor (levels: 'toxic' and 'nontoxic') 16 | \item \code{cyp_11a1}: numeric 17 | \item \code{cyp_11b1}: numeric 18 | \item \code{cyp_11b2}: numeric 19 | \item \code{cyp_17a1}: numeric 20 | \item \code{cyp_19a1}: numeric 21 | \item \code{cyp_21a1}: numeric 22 | \item \code{hsd3b2}: numeric 23 | \item \code{star}: numeric 24 | \item \code{progesterone}: numeric 25 | \item \code{testosterone}: numeric 26 | \item \code{dhea}: numeric 27 | \item \code{cortisol}: numeric 28 | } 29 | } 30 | \description{ 31 | A set of \emph{in vitro} assays is used to quantify the risk of reproductive 32 | toxicity via the disruption of steroidogenic pathways. 33 | } 34 | \details{ 35 | H295R cells were used to measure the effect with two sets of assay results. 36 | The first includes a set of protein measurements on cytochrome P450 enzymes 37 | ("cyp"s), STAR, and 3BHSD2. The second includes hormone measurements for 38 | DHEA, progesterone, testosterone, and cortisol. 39 | 40 | Columns: 41 | \itemize{ 42 | \item \code{class}: factor (levels: 'toxic' and 'nontoxic') 43 | \item \code{cyp_11a1}: numeric 44 | \item \code{cyp_11b1}: numeric 45 | \item \code{cyp_11b2}: numeric 46 | \item \code{cyp_17a1}: numeric 47 | \item \code{cyp_19a1}: numeric 48 | \item \code{cyp_21a1}: numeric 49 | \item \code{hsd3b2}: numeric 50 | \item \code{star}: numeric 51 | \item \code{progesterone}: numeric 52 | \item \code{testosterone}: numeric 53 | \item \code{dhea}: numeric 54 | \item \code{cortisol}: numeric 55 | } 56 | } 57 | \examples{ 58 | data(steroidogenic_toxicity) 59 | str(steroidogenic_toxicity) 60 | 61 | } 62 | -------------------------------------------------------------------------------- /man/tate_text.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/tate_text.R 3 | \docType{data} 4 | \name{tate_text} 5 | \alias{tate_text} 6 | \title{Tate Gallery modern artwork metadata} 7 | \source{ 8 | \itemize{ 9 | \item \url{https://github.com/tategallery/collection} 10 | \item \url{https://www.tate.org.uk/} 11 | } 12 | } 13 | \value{ 14 | \item{tate_text}{a tibble} 15 | } 16 | \description{ 17 | Metadata such as artist, title, and year created for recent artworks owned 18 | by the Tate Gallery. Only artworks created during or after 1990 are 19 | included, and the metadata source was last updated in 2014. The Tate Gallery 20 | provides these data but requests users to be respectful of their 21 | \href{https://github.com/tategallery/collection#usage-guidelines-for-open-data}{guidelines for use}. 22 | } 23 | \examples{ 24 | data(tate_text) 25 | str(tate_text) 26 | } 27 | \keyword{datasets} 28 | -------------------------------------------------------------------------------- /man/taxi.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/taxi.R 3 | \docType{data} 4 | \name{taxi} 5 | \alias{taxi} 6 | \title{Chicago taxi data set} 7 | \source{ 8 | \url{https://data.cityofchicago.org/Transportation/Taxi-Trips/wrvz-psew} 9 | } 10 | \value{ 11 | tibble 12 | } 13 | \description{ 14 | A data set containing information on a subset of taxi trips in the city 15 | of Chicago in 2022.
16 | } 17 | \details{ 18 | The source data are originally described on the linked City of Chicago 19 | data portal. The data exported here are a pre-processed subset motivated by 20 | the modeling problem of predicting whether a rider will tip or not. 21 | 22 | \describe{ 23 | \item{tip}{Whether the rider left a tip. A factor with levels 24 | "yes" and "no".} 25 | \item{distance}{The trip distance, in odometer miles.} 26 | \item{company}{The taxi company, as a factor. Companies that occurred 27 | few times were binned as "other".} 28 | \item{local}{Whether the trip's starting and ending locations are in the 29 | same community. See the source data for community area values.} 30 | \item{dow}{The day of the week in which the trip began, as a 31 | factor.} 32 | \item{month}{The month in which the trip began, as a factor.} 33 | \item{hour}{The hour of the day in which the trip began, as a 34 | numeric.} 35 | } 36 | } 37 | \examples{ 38 | \donttest{ 39 | taxi 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /man/two_class_dat.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/two_class_dat.R 3 | \docType{data} 4 | \name{two_class_dat} 5 | \alias{two_class_dat} 6 | \title{Two class data} 7 | \value{ 8 | \item{two_class_dat}{a data frame} 9 | } 10 | \description{ 11 | Two class data 12 | } 13 | \details{ 14 | These are artificial data with two predictors (\code{A} and \code{B}) and 15 | a factor outcome variable (\code{Class}). 16 | } 17 | \examples{ 18 | data(two_class_dat) 19 | str(two_class_dat) 20 | } 21 | \keyword{datasets} 22 | -------------------------------------------------------------------------------- /man/two_class_example.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/two_class_dat.R 3 | \docType{data} 4 | \name{two_class_example} 5 | \alias{two_class_example} 6 | \title{Two class predictions} 7 | \value{ 8 | \item{two_class_example}{a data frame} 9 | } 10 | \description{ 11 | Two class predictions 12 | } 13 | \details{ 14 | These data are a test set from a model built for two 15 | classes ("Class1" and "Class2"). There are columns for the true 16 | and predicted classes and columns for the probabilities of each 17 | class. 18 | } 19 | \examples{ 20 | data(two_class_example) 21 | str(two_class_example) 22 | } 23 | \keyword{datasets} 24 | -------------------------------------------------------------------------------- /man/wa_churn.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wa_churn.R 3 | \docType{data} 4 | \name{wa_churn} 5 | \alias{wa_churn} 6 | \title{Watson churn data} 7 | \source{ 8 | IBM Watson Analytics https://ibm.co/2sOvyvy 9 | } 10 | \value{ 11 | \item{wa_churn}{a data frame} 12 | } 13 | \description{ 14 | Watson churn data 15 | } 16 | \details{ 17 | These data were downloaded from the IBM Watson site 18 | (see below) in September 2018. The data contain a factor for 19 | whether a customer churned or not. Alternatively, the \code{tenure} 20 | column presumably contains information on how long the customer 21 | has had an account. A survival analysis can be done on this 22 | column using the \code{churn} outcome as the censoring information, as sketched below.
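A minimal sketch of that survival analysis, assuming the \code{churn} factor uses a "Yes" level for churned customers (check \code{levels(wa_churn$churn)} first):

```r
# A sketch: Kaplan-Meier curve for account tenure, treating churn as the event.
# The "Yes" level is an assumption; confirm with levels(wa_churn$churn).
library(survival)
data(wa_churn, package = "modeldata")
km <- survfit(Surv(tenure, churn == "Yes") ~ 1, data = wa_churn)
plot(km, xlab = "tenure", ylab = "retention probability")
```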
A 23 | data dictionary can be found on the source website. 24 | } 25 | \examples{ 26 | data(wa_churn) 27 | str(wa_churn) 28 | } 29 | \keyword{datasets} 30 | -------------------------------------------------------------------------------- /modeldata.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(modeldata) 3 | 4 | test_check("modeldata") 5 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/simulations.md: -------------------------------------------------------------------------------- 1 | # classification simulation 2 | 3 | Code 4 | sim_classification(5, method = "potato") 5 | Condition 6 | Error in `sim_classification()`: 7 | ! `method` must be one of "caret", not "potato". 8 | 9 | # sapp_2014_1 simulation 10 | 11 | Code 12 | sim_regression(5, method = "potato") 13 | Condition 14 | Error in `sim_regression()`: 15 | ! `method` must be one of "sapp_2014_1", "sapp_2014_2", "van_der_laan_2007_1", "van_der_laan_2007_2", "hooker_2004", or "worley_1987", not "potato". 16 | 17 | # multinomial simulation 18 | 19 | i In index: 1. 20 | Caused by error in `.f()`: 21 | ! 
The model equations should only use variables/objects `A` and `B` 22 | 23 | -------------------------------------------------------------------------------- /tests/testthat/test-simulations.R: -------------------------------------------------------------------------------- 1 | test_that("classification simulation", { 2 | set.seed(1) 3 | dat_1 <- sim_classification(500, num_linear = 0) 4 | dat_2 <- sim_classification(10, num_linear = 11) 5 | dat_3 <- sim_classification(1000, num_linear = 1, intercept = 50) 6 | dat_4 <- sim_classification(500, num_linear = 0, keep_truth = TRUE) 7 | 8 | expect_equal( 9 | names(dat_1), 10 | c( 11 | "class", 12 | "two_factor_1", 13 | "two_factor_2", 14 | "non_linear_1", 15 | "non_linear_2", 16 | "non_linear_3" 17 | ) 18 | ) 19 | expect_equal( 20 | names(dat_2), 21 | c( 22 | "class", 23 | "two_factor_1", 24 | "two_factor_2", 25 | "non_linear_1", 26 | "non_linear_2", 27 | "non_linear_3", 28 | modeldata:::names0(11, "linear_") 29 | ) 30 | ) 31 | expect_equal( 32 | names(dat_3), 33 | c( 34 | "class", 35 | "two_factor_1", 36 | "two_factor_2", 37 | "non_linear_1", 38 | "non_linear_2", 39 | "non_linear_3", 40 | "linear_1" 41 | ) 42 | ) 43 | expect_equal( 44 | names(dat_4), 45 | c( 46 | "class", 47 | "two_factor_1", 48 | "two_factor_2", 49 | "non_linear_1", 50 | "non_linear_2", 51 | "non_linear_3", 52 | ".truth" 53 | ) 54 | ) 55 | expect_equal(nrow(dat_1), 500) 56 | expect_equal(nrow(dat_2), 10) 57 | expect_equal(nrow(dat_3), 1000) 58 | expect_true(all(vapply(dat_1[, -1], is.numeric, logical(1)))) 59 | 60 | expect_equal(sum(dat_3 == "class_2"), 0) 61 | expect_equal(levels(dat_3$class), paste0("class_", 1:2)) 62 | expect_snapshot( 63 | error = TRUE, 64 | sim_classification(5, method = "potato") 65 | ) 66 | }) 67 | 68 | test_that("sapp_2014_1 simulation", { 69 | set.seed(1) 70 | dat_1 <- sim_regression(10, method = "sapp_2014_1") 71 | dat_2 <- sim_regression(10, method = "sapp_2014_1", keep_truth = TRUE) 72 | expect_equal(names(dat_1), c("outcome", modeldata:::names0(20, "predictor_"))) 73 | expect_equal( 74 | names(dat_2), 75 | c("outcome", modeldata:::names0(20, "predictor_"), ".truth") 76 | ) 77 | expect_equal(nrow(dat_1), 10) 78 | expect_true(all(vapply(dat_1, is.numeric, logical(1)))) 79 | expect_snapshot( 80 | error = TRUE, 81 | sim_regression(5, method = "potato") 82 | ) 83 | }) 84 | 85 | test_that("sapp_2014_2 simulation", { 86 | set.seed(1) 87 | dat_1 <- sim_regression(10, method = "sapp_2014_2") 88 | dat_2 <- sim_regression(10, method = "sapp_2014_2", keep_truth = TRUE) 89 | expect_equal( 90 | names(dat_1), 91 | c("outcome", modeldata:::names0(200, "predictor_")) 92 | ) 93 | expect_equal( 94 | names(dat_2), 95 | c("outcome", modeldata:::names0(200, "predictor_"), ".truth") 96 | ) 97 | expect_equal(nrow(dat_1), 10) 98 | expect_true(all(vapply(dat_1, is.numeric, logical(1)))) 99 | }) 100 | 101 | test_that("van_der_laan_2007_1 simulation", { 102 | set.seed(1) 103 | dat_1 <- sim_regression(10, method = "van_der_laan_2007_1") 104 | dat_2 <- sim_regression(10, method = "van_der_laan_2007_1", factors = TRUE) 105 | dat_3 <- sim_regression(10, method = "van_der_laan_2007_1", keep_truth = TRUE) 106 | expect_equal(names(dat_1), c("outcome", modeldata:::names0(10, "predictor_"))) 107 | expect_equal( 108 | names(dat_3), 109 | c("outcome", modeldata:::names0(10, "predictor_"), ".truth") 110 | ) 111 | expect_equal(nrow(dat_1), 10) 112 | expect_true(all(vapply(dat_1, is.numeric, logical(1)))) 113 | expect_true(all(vapply(dat_1[, -1], is.integer, logical(1)))) 114 | 
expect_true(all(vapply(dat_2[, -1], is.factor, logical(1)))) 115 | expect_equal(levels(dat_2$predictor_01), c("yes", "no")) 116 | }) 117 | 118 | test_that("van_der_laan_2007_2 simulation", { 119 | set.seed(1) 120 | dat_1 <- sim_regression(10, method = "van_der_laan_2007_2") 121 | dat_2 <- sim_regression(10, method = "van_der_laan_2007_2", keep_truth = TRUE) 122 | expect_equal(names(dat_1), c("outcome", modeldata:::names0(20, "predictor_"))) 123 | expect_equal( 124 | names(dat_2), 125 | c("outcome", modeldata:::names0(20, "predictor_"), ".truth") 126 | ) 127 | expect_equal(nrow(dat_1), 10) 128 | expect_true(all(vapply(dat_1, is.numeric, logical(1)))) 129 | }) 130 | 131 | test_that("hooker_2004 simulation", { 132 | set.seed(1) 133 | dat_1 <- sim_regression(10, method = "hooker_2004") 134 | dat_2 <- sim_regression(10, method = "hooker_2004", keep_truth = TRUE) 135 | expect_equal(names(dat_1), c("outcome", modeldata:::names0(10, "predictor_"))) 136 | expect_equal( 137 | names(dat_2), 138 | c("outcome", modeldata:::names0(10, "predictor_"), ".truth") 139 | ) 140 | expect_equal(nrow(dat_1), 10) 141 | expect_true(all(vapply(dat_1, is.numeric, logical(1)))) 142 | }) 143 | 144 | 145 | test_that("noise simulation", { 146 | set.seed(1) 147 | dat_1 <- sim_noise(1000, num_vars = 10) 148 | dat_2 <- sim_noise(1000, num_vars = 3, cov_param = .5) 149 | dat_3 <- sim_noise(1000, num_vars = 3, cov_type = "toeplitz", cov_param = .99) 150 | dat_4 <- sim_noise(10, num_vars = 3, outcome = "classification") 151 | dat_5 <- sim_noise( 152 | 10, 153 | num_vars = 3, 154 | outcome = "classification", 155 | num_classes = 10 156 | ) 157 | dat_6 <- sim_noise(10, num_vars = 3, outcome = "regression") 158 | 159 | expect_equal(names(dat_1), modeldata:::names0(10, "noise_")) 160 | expect_equal(names(dat_2), modeldata:::names0(3, "noise_")) 161 | expect_equal(nrow(dat_1), 1000) 162 | expect_equal(nrow(dat_4), 10) 163 | 164 | expect_true(all(vapply(dat_1, is.numeric, logical(1)))) 165 | expect_true(all(vapply(dat_1[, -1], is.numeric, logical(1)))) 166 | expect_true(is.factor(dat_5$class)) 167 | expect_true(all(vapply(dat_6, is.numeric, logical(1)))) 168 | 169 | cor_1 <- cor(dat_1)[upper.tri(cor(dat_1))] 170 | expect_true(all(cor_1 <= 0.1 & cor_1 >= -0.1)) 171 | 172 | cor_2 <- cor(dat_2)[upper.tri(cor(dat_2))] 173 | expect_true(all(cor_2 <= 0.6 & cor_2 >= 0.4)) 174 | 175 | cor_3 <- cor(dat_3)[upper.tri(cor(dat_3))] 176 | expect_true(all(cor_3 >= 0.95)) 177 | 178 | expect_equal(levels(dat_4$class), paste0("class_", 1:2)) 179 | expect_equal(levels(dat_5$class), modeldata:::names0(10, "class_")) 180 | }) 181 | 182 | 183 | test_that("logistic simulation", { 184 | set.seed(1) 185 | dat_1 <- sim_logistic(10, ~A) 186 | dat_2 <- sim_logistic(10, rlang::expr(~B), keep_truth = TRUE) 187 | expect_equal(names(dat_1), c(LETTERS[1:2], "class")) 188 | expect_equal(names(dat_2), c(LETTERS[1:2], ".linear_pred", ".truth", "class")) 189 | expect_equal(nrow(dat_1), 10) 190 | }) 191 | 192 | 193 | test_that("multinomial simulation", { 194 | expect_snapshot_error(sim_multinomial(10, ~ A + C, ~B, ~ A + B)) 195 | set.seed(1) 196 | dat_1 <- sim_multinomial(10, ~A, ~B, ~ A + B) 197 | dat_2 <- sim_multinomial(10, ~A, ~B, ~ A + B, keep_truth = TRUE) 198 | expect_equal(names(dat_1), c(LETTERS[1:2], "class")) 199 | expect_equal( 200 | names(dat_2), 201 | c(LETTERS[1:2], "class", ".truth_one", ".truth_two", ".truth_three") 202 | ) 203 | expect_equal(nrow(dat_1), 10) 204 | }) 205 | --------------------------------------------------------------------------------