├── .Rbuildignore
├── .github
    ├── .gitignore
    └── workflows
    │   └── R-CMD-check.yaml
├── .gitignore
├── CRAN-SUBMISSION
├── DESCRIPTION
├── LICENSE
├── LICENSE.md
├── MLDataR.Rproj
├── NAMESPACE
├── NEWS.md
├── R
    ├── PreDiabetes.R
    ├── care_home_incidents.R
    ├── csgo.R
    ├── diabetes_data.R
    ├── heartdisease.R
    ├── long_stayers.R
    ├── stroke_classification.R
    └── thyroid_disease.R
├── README.md
├── cran-comments.md
├── data
    ├── PreDiabetes.rda
    ├── care_home_incidents.rda
    ├── csgo.rda
    ├── diabetes_data.rda
    ├── heartdisease.rda
    ├── long_stayers.rda
    ├── stroke_classification.rda
    └── thyroid_disease.rda
├── man
    ├── PreDiabetes.Rd
    ├── care_home_incidents.Rd
    ├── csgo.Rd
    ├── diabetes_data.Rd
    ├── figures
    │   └── mldataR.png
    ├── heartdisease.Rd
    ├── long_stayers.Rd
    ├── stroke_classification.Rd
    └── thyroid_disease.Rd
└── vignettes
    ├── .gitignore
    ├── MLDataR.Rmd
    └── mldataR.png


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^.*\.Rproj$
 2 | ^\.Rproj\.user$
 3 | ^LICENSE\.md$
 4 | ^\.github$
 5 | ^CRAN-RELEASE$
 6 | ^README.*\.md$
 7 | inst/examples/knitr-.*.pdf
 8 | inst/examples/child/knitr-.*.pdf
 9 | inst/examples/child/knitr-.*\.md
10 | inst/examples/figure
11 | inst/examples/cache
12 | ^\.travis\.yml$
13 | cran-comments.md
14 | ^doc$
15 | ^Meta$
16 | ^CODE_OF_CONDUCT\.md$
17 | ^CRAN-SUBMISSION$
18 | 


--------------------------------------------------------------------------------
/.github/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | 


--------------------------------------------------------------------------------
/.github/workflows/R-CMD-check.yaml:
--------------------------------------------------------------------------------
  1 | # NOTE: This workflow is overkill for most R packages
  2 | # check-standard.yaml is likely a better choice
  3 | # usethis::use_github_action("check-standard") will install it.
  4 | #
  5 | # For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag.
  6 | # https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions
  7 | on:
  8 |   push:
  9 |     branches:
 10 |       - main
 11 |       - master
 12 |   pull_request:
 13 |     branches:
 14 |       - main
 15 |       - master
 16 | 
 17 | name: R-CMD-check
 18 | 
 19 | jobs:
 20 |   R-CMD-check:
 21 |     runs-on: ${{ matrix.config.os }}
 22 | 
 23 |     name: ${{ matrix.config.os }} (${{ matrix.config.r }})
 24 | 
 25 |     strategy:
 26 |       fail-fast: false
 27 |       matrix:
 28 |         config:
 29 |           - {os: windows-latest, r: '4.3', rspm: "https://packagemanager.rstudio.com/cran/latest"}
 30 |         
 31 |     env:
 32 |       RSPM: ${{ matrix.config.rspm }}
 33 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
 34 | 
 35 |     steps:
 36 |       - uses: actions/checkout@v2
 37 | 
 38 |       - uses: r-lib/actions/setup-r@v2
 39 |         id: install-r
 40 |         with:
 41 |           r-version: ${{ matrix.config.r }}
 42 |           http-user-agent: ${{ matrix.config.http-user-agent }}
 43 | 
 44 |       - uses: r-lib/actions/setup-pandoc@v2
 45 | 
 46 |       - name: Install pak and query dependencies
 47 |         run: |
 48 |           install.packages("pak", repos = "https://r-lib.github.io/p/pak/dev/")
 49 |           saveRDS(pak::pkg_deps("local::.", dependencies = TRUE), ".github/r-depends.rds")
 50 |         shell: Rscript {0}
 51 | 
 52 |       - name: Restore R package cache
 53 |         uses: actions/cache@v2
 54 |         with:
 55 |           path: |
 56 |             ${{ env.R_LIBS_USER }}/*
 57 |             !${{ env.R_LIBS_USER }}/pak
 58 |           key: ${{ matrix.config.os }}-${{ steps.install-r.outputs.installed-r-version }}-1-${{ hashFiles('.github/r-depends.rds') }}
 59 |           restore-keys: ${{ matrix.config.os }}-${{ steps.install-r.outputs.installed-r-version }}-1-
 60 | 
 61 |       - name: Install system dependencies
 62 |         if: runner.os == 'Linux'
 63 |         run: |
 64 |           pak::local_system_requirements(execute = TRUE)
 65 |           pak::pkg_system_requirements("rcmdcheck", execute = TRUE)
 66 |         shell: Rscript {0}
 67 | 
 68 |       - name: Install dependencies
 69 |         run: |
 70 |           pak::local_install_dev_deps(upgrade = TRUE)
 71 |           pak::pkg_install("rcmdcheck")
 72 |         shell: Rscript {0}
 73 | 
 74 |       - name: Session info
 75 |         run: |
 76 |           options(width = 100)
 77 |           pkgs <- installed.packages()[, "Package"]
 78 |           sessioninfo::session_info(pkgs, include_base = TRUE)
 79 |         shell: Rscript {0}
 80 | 
 81 |       - name: Check
 82 |         env:
 83 |           _R_CHECK_CRAN_INCOMING_: false
 84 |         run: |
 85 |           options(crayon.enabled = TRUE)
 86 |           rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check")
 87 |         shell: Rscript {0}
 88 | 
 89 |       - name: Show testthat output
 90 |         if: always()
 91 |         run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true
 92 |         shell: bash
 93 | 
 94 |       - name: Upload check results
 95 |         if: failure()
 96 |         uses: actions/upload-artifact@main
 97 |         with:
 98 |           name: ${{ matrix.config.os }}-r${{ matrix.config.r }}-results
 99 |           path: check
100 | 
101 |       - name: Don't use tar from old Rtools to store the cache
102 |         if: ${{ runner.os == 'Windows' && startsWith(steps.install-r.outputs.installed-r-version, '3.6' ) }}
103 |         shell: bash
104 |         run: echo "C:/Program Files/Git/usr/bin" >> $GITHUB_PATH
105 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .Rproj.user
 2 | .Rhistory
 3 | .RData
 4 | .Ruserdata
 5 | .all-contributorsrc
 6 | inst/doc
 7 | README.md
 8 | doc
 9 | Meta
10 | /doc/
11 | /Meta/
12 | .DS_Store
13 | 


--------------------------------------------------------------------------------
/CRAN-SUBMISSION:
--------------------------------------------------------------------------------
1 | Version: 1.0.1
2 | Date: 2022-10-03 14:44:53 UTC
3 | SHA: 7e8b5a3e0657d5e0e8293d10071529c01fc0af97
4 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: MLDataR
 2 | Type: Package
 3 | Title: Collection of Machine Learning Datasets for Supervised Machine Learning
 4 | Version: 1.0.1
 5 | Authors@R: c(
 6 |           person("Gary", "Hutson", , "hutsons-hacks@outlook.com", c("aut", "cre"), 
 7 |           comment = c(ORCID="0000-0003-3534-6143")),
 8 |           person("Asif", "Laldin", , "laldin.asif@gmail.com", c("aut")),
 9 |           person("Isabella", "Velásquez", , "ivelasq@gmail.com", c("aut"))
10 |           )
11 | Maintainer: Gary Hutson <hutsons-hacks@outlook.com>
12 | Description: Contains a collection of datasets for working with machine learning tasks.
13 |     It will contain datasets for supervised machine learning Jiang (2020) <doi:10.1016/j.beth.2020.05.002> and will include datasets for classification and regression.
14 |     The aim of this package is to use data generated around health and other domains.
15 | License: MIT + file LICENSE
16 | Encoding: UTF-8
17 | LazyData: true
18 | BugReports: https://github.com/StatsGary/MLDataR/issues
19 | Imports:
20 |     ConfusionTableR,
21 |     dplyr,
22 |     parsnip, 
23 |     rsample,
24 |     recipes,
25 |     workflows,
26 |     ranger, caret, varhandle,
27 |     OddsPlotty,
28 |     ggplot2
29 | RoxygenNote: 7.1.2
30 | Suggests: 
31 |     rmarkdown,
32 |     knitr
33 | VignetteBuilder: knitr
34 | Depends: 
35 |     R (>= 2.10)
36 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2021
2 | COPYRIGHT HOLDER: MLDataR authors
3 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | Copyright (c) 2021 MLDataR authors
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MLDataR.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 | 
3 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # NEWS - MLDataR
 2 | The news for the package is contained hereunder:
 3 | 
 4 | ## 0.1.1 - Initial build release
 5 | Initial data items released are:
 6 | - Diabetes data for ML predictive modelling
 7 | - Heart Disease data for ML supervised classification
 8 | - Thyroid Disease data for ML supervised classification
 9 | 
10 | ## 0.1.2 - Added further datasets
11 | Datasets added:
12 | - Care home incidents dataset - classification dataset to predict care home incidents based on features collected via the care home. Aim is to predict failing care homes, over care homes that meet inspection
13 | - Pre-diabetes - regression model to predict time between pre diabetes and diabetes, based off patient features. 
14 | 
15 | ## 0.1.3 - Added long_stayers dataset
16 | Added the long stayers dataset (`long_stayers`) for working with NHS inpatients.
17 | 
18 | ## 1.0.1 - Fixed issue with patient age in Thyroid dataset
19 | Fixed an issue with the patient age in the Thyroid dataset. 
20 | 


--------------------------------------------------------------------------------
/R/PreDiabetes.R:
--------------------------------------------------------------------------------
 1 | #' PreDiabetes dataset
 2 | #' @docType data
 3 | #' @keywords prediabetes machine learning regression
 4 | #' @format A data frame with 3059 rows and 9 variables:
 5 | #' \describe{
 6 | #'   \item{Age}{age of the patient presenting with diabetes}
 7 | #'   \item{Sex}{sex of the patient with diabetes}
 8 | #'   \item{IMD_Decile}{Index of Multiple Deprivation Decile}
 9 | #'   \item{BMI}{Body Mass Index of patient}
10 | #'   \item{Age_PreDiabetes}{age at pre diabetes diagnosis}
11 | #'   \item{HbA1C}{average blood glucose mmol/mol}
12 | #'   \item{Time_Pre_To_Diabetes}{time in years between pre-diabetes and diabetes diagnosis}
13 | #'   \item{Age_Diabetes}{age at diabetes diagnosis}
14 | #'   \item{PreDiabetes_Checks_Before_Diabetes}{number of pre-diabetes related primary care appointments before diabetes diagnosis}
15 | #'
16 | #' }
17 | #' @source Generated by Asif Laldin \email{a.laldin@nhs.net}, Jan-2022
18 | #' @examples
19 | #' library(dplyr)
20 | #' data(PreDiabetes)
21 | #' # Inspect the structure of the pre-diabetes data
22 | #' diabetes_data <- PreDiabetes %>%
23 | #'  glimpse()
24 | "PreDiabetes"
25 | 


--------------------------------------------------------------------------------
/R/care_home_incidents.R:
--------------------------------------------------------------------------------
 1 | #' Care Home Incidents
 2 | #' @description An NHS patient safety incidents dataset (\url{https://www.england.nhs.uk/patient-safety/report-patient-safety-incident/}) that has been synthetically generated against real data
 3 | #' @docType data
 4 | #' @keywords care home incidents supervised machine learning classification
 5 | #' @format A data frame with 1216 rows and 12 variables:
 6 | #' \describe{
 7 | #'   \item{CareHomeFail}{a binary indicator to specify whether a certain care home is failing}
 8 | #'   \item{WeightLoss}{aggregation of incidents indicating weight loss in patient}
 9 | #'   \item{Medication}{medication missed aggregation}
10 | #'   \item{Falls}{Recorded number of patient falls}
11 | #'   \item{Choking}{Number of patient choking incidents}
12 | #'   \item{UnexpectedDeaths}{unexpected deaths in the care home}
13 | #'   \item{Bruising}{Number of bruising incidents in the care home}
14 | #'   \item{Absconsion}{Absconding from the care home setting}
15 | #'   \item{ResidentAbuseByResident}{Abuse conducted by one care home resident against another}
16 | #'   \item{ResidentAbuseByStaff}{Incidents of resident abuse by staff}
17 | #'   \item{ResidentAbuseOnStaff}{Incidents of residents abusing staff}
18 | #'   \item{Wounds}{Unexplained wounds against staff}
19 | #'   }
20 | #'
21 | #' @source Collected by Gary Hutson \email{hutsons-hacks@outlook.com}, Jan-2022
22 | #' @examples
23 | #' library(dplyr)
24 | #' data(care_home_incidents)
25 | #' # Convert the care home failure label to a factor
26 | #' ch_incs <- care_home_incidents %>%
27 | #'  mutate(CareHomeFail = as.factor(CareHomeFail))
28 | #'  ch_incs %>% glimpse()
29 | #'  # Check factor
30 | #'  factor(ch_incs$CareHomeFail)
31 | "care_home_incidents"
32 | 


--------------------------------------------------------------------------------
/R/csgo.R:
--------------------------------------------------------------------------------
 1 | #' csgo
 2 | #' @docType data
 3 | #' @keywords CounterStrike Global Offensive eSports
 4 | #' @format A data frame with 1,133 rows and 17 variables:
 5 | #' \describe{
 6 | #'   \item{map}{Map on which the match was played}
 7 | #'   \item{day}{Day of the month}
 8 | #'   \item{month}{Month of the year}
 9 | #'   \item{year}{Year}
10 | #'   \item{date}{Date of match DD/MM/YYYY}
11 | #'   \item{wait_time_s}{Time waited to find match}
12 | #'   \item{match_time_s}{Total match length in seconds}
13 | #'   \item{team_a_rounds}{Number of rounds played as Team A}
14 | #'   \item{team_b_rounds}{Number of rounds played as Team B}
15 | #'   \item{ping}{Maximum ping in milliseconds; the signal that's sent from one computer to another on the same network}
16 | #'   \item{kills}{Number of kills accumulated in match; max 5 per round}
17 | #'   \item{assists}{Number of assists accumulated in match; an assist is inflicting more than 50 percent damage on an opponent who is then killed by another player; max 5 per round}
18 | #'   \item{deaths}{Number of times player died during match; max 1 per round}
19 | #'   \item{mvps}{Most Valuable Player award}
20 | #'   \item{hs_percent}{Percentage of kills that were a result from a shot to opponent's head}
21 | #'   \item{points}{Number of points accumulated during match. Points are gained from kills, assists, bomb defuses & bomb plants. Points are lost for suicide and friendly kills}
22 | #'   \item{result}{The result of the match, Win, Loss, Draw}
23 | #'}
24 | #' @source Extracted by Asif Laldin \email{a.laldin@nhs.net}, March-2019
25 | 
26 | "csgo"
27 | 


--------------------------------------------------------------------------------
/R/diabetes_data.R:
--------------------------------------------------------------------------------
 1 | #' Diabetes datasets
 2 | #' @docType data
 3 | #' @keywords diabetes machine learning classification
 4 | #' @format A data frame with 520 rows and 17 variables:
 5 | #' \describe{
 6 | #'   \item{Age}{age of the patient presenting with diabetes}
 7 | #'   \item{Gender}{gender of the patient with diabetes}
 8 | #'   \item{ExcessUrination}{if the patient has a history of excessive urination}
 9 | #'   \item{Polydipsia}{abnormal thirst, accompanied by the excessive intake of water or fluid}
10 | #'   \item{WeightLossSudden}{Sudden weight loss that has recently occurred}
11 | #'   \item{Fatigue}{Fatigue or weakness}
12 | #'   \item{Polyphagia}{excessive or extreme hunger}
13 | #'   \item{GenitalThrush}{patient has thrush fungus on or near their genital region}
14 | #'   \item{BlurredVision}{history of blurred vision}
15 | #'   \item{Itching}{skin itching}
16 | #'   \item{Irritability}{general irritability and mood issues}
17 | #'   \item{DelayHealing}{delayed healing of wounds}
18 | #'   \item{PartialPsoriasis}{partial psoriasis on the body}
19 | #'   \item{MuscleStiffness}{stiffness of the muscles}
20 | #'   \item{Alopecia}{scalp alopecia and hair shedding}
21 | #'   \item{Obesity}{Classified as obese}
22 | #'   \item{DiabeticClass}{Class label to indicate whether the patient is diabetic or not}
23 | #' }
24 | #' @source Collected by Gary Hutson \email{hutsons-hacks@outlook.com}, Dec-2021
25 | #' @examples
26 | #' library(dplyr)
27 | #' data(diabetes_data)
28 | #' # Convert diabetes data to factor'
29 | #' diabetes_data <- diabetes_data %>%
30 | #'  glimpse() %>%
31 | #'  mutate(DiabeticClass = as.factor(DiabeticClass))
32 | #'  is.factor(diabetes_data$DiabeticClass)
33 | "diabetes_data"
34 | 


--------------------------------------------------------------------------------
/R/heartdisease.R:
--------------------------------------------------------------------------------
 1 | #' Heart disease dataset
 2 | #'
 3 | #' The dataset is to be used with a supervised classification ML model to classify heart disease.
 4 | #' @docType data
 5 | #' @keywords heart disease heart disease machine learning classification
 6 | #' @format A data frame with 918 rows and 10 variables:
 7 | #' \describe{
 8 | #'   \item{Age}{age of the patient presenting with heart disease}
 9 | #'   \item{Sex}{gender of the patient}
10 | #'   \item{RestingBP}{blood pressure for resting heart beat}
11 | #'   \item{Cholesterol}{Cholesterol reading}
12 | #'   \item{FastingBS}{blood sample of glucose after a patient fasts \url{https://www.diabetes.co.uk/diabetes_care/fasting-blood-sugar-levels.html}}
13 | #'   \item{RestingECG}{Resting electrocardiogram (ECG) result; can be an indicator of previous myocardial infarction, i.e. heart attack}
14 | #'   \item{MaxHR}{Maximum heart rate}
15 | #'   \item{Angina}{chest pain caused by decreased blood flow \url{https://www.nhs.uk/conditions/angina/}}
16 | #'   \item{HeartPeakReading}{reading at the peak of the heart rate}
17 | #'   \item{HeartDisease}{the classification label of whether patient has heart disease or not}
18 | #'
19 | #' }
20 | #' @source Collected by Gary Hutson \email{hutsons-hacks@outlook.com}, Dec-2021
21 | #' @examples
22 | #' library(dplyr)
23 | #' library(ConfusionTableR)
24 | #' data(heartdisease)
25 | #'
26 | #' # Convert the heart disease label to a factor
27 | #' hd <- heartdisease %>%
28 | #'  glimpse() %>%
29 | #'  mutate(HeartDisease = as.factor(HeartDisease))
30 | #' # Check that the label is now a factor
31 | #'  is.factor(hd$HeartDisease)
32 | #'  # Dummy encoding
33 | #' # Get categorical columns
34 | #' hd_cat <- hd  %>%
35 | #'  dplyr::select_if(is.character)
36 | #'
37 | #'  # Dummy encode the categorical variables
38 | #'  # Specify the columns to encode
39 | #'  cols <- c("RestingECG", "Angina", "Sex")
40 | #'  # Dummy encode using dummy_encoder in ConfusionTableR package
41 | #'  coded <- ConfusionTableR::dummy_encoder(hd_cat, cols, remove_original = TRUE)
42 | #' coded <- coded %>%
43 | #'     select(RestingECG_ST, RestingECG_LVH, Angina=Angina_Y,
44 | #'     Sex=Sex_F)
45 | #' # Remove column names we have encoded from original data frame
46 | #' hd_one <- hd[,!names(hd) %in% cols]
47 | #' # Bind the numerical data on to the categorical data
48 | #' hd_final <- bind_cols(coded, hd_one)
49 | #' # Output the final encoded data frame for the ML task
50 | #' glimpse(hd_final)
51 | "heartdisease"
52 | 


--------------------------------------------------------------------------------
/R/long_stayers.R:
--------------------------------------------------------------------------------
 1 | #' Long stayers dataset
 2 | #' @description classification dataset of long staying patients.
 3 | #' Contains patients who have been registered as an inpatient for longer than 7 days length of stay \url{https://www.england.nhs.uk/south/wp-content/uploads/sites/6/2016/12/rig-reviewing-stranded-patients-hospital.pdf}.
 4 | #' @docType data
 5 | #' @keywords long stay patient stranded NHS
 6 | #' @format A data frame with 768 rows and 9 variables:
 7 | #' \describe{
 8 | #'   \item{stranded.label}{binary classification label indicating whether \strong{stranded = 1} or \strong{not stranded=0}}
 9 | #'   \item{age}{age of the patient}
10 | #'   \item{care.home.referral}{flag indicating whether referred from a private care home - \strong{1=Care Home Referral} and \strong{0=Not a care home referral}}
11 | #'   \item{medicallysafe}{flag indicating whether they are medically safe for discharge - \strong{1=Medically safe} and \strong{0=Not medically safe}}
12 | #'   \item{hcop}{flag indicating health care for older person triage - \strong{1=Yes triaged from HCOP} and \strong{0=Triaged from different department}}
13 | #'   \item{mental_health_care}{flag indicating whether they require mental health care - \strong{1=MH assistance needed} and \strong{0=No history of mental health}}
14 | #'   \item{periods_of_previous_care}{Count of the number of times they have been in hospital in last 12 months}
15 | #'   \item{admit_date}{date the patient was admitted as an inpatient}
16 | #'   \item{frailty_index}{indicates the type of frailty - nominal variable}
17 | #' }
18 | #' @source Prepared, acquired and adapted by Gary Hutson \email{hutsons-hacks@outlook.com}, Dec-2021. Synthetic data, based off live patient data from various NHS secondary health care trusts.
19 | #' @examples
20 | #' library(dplyr)
21 | #' library(ggplot2)
22 | #' library(caret)
23 | #' library(rsample)
24 | #' library(varhandle)
25 | #'
26 | #' data("long_stayers")
27 | #' glimpse(long_stayers)
28 | #' # Examine class imbalance
29 | #' prop.table(table(long_stayers$stranded.label))
30 | #' # Feature engineering
31 | #' long_stayers <- long_stayers %>%
32 | #' dplyr::mutate(stranded.label=factor(stranded.label)) %>%
33 | #'  dplyr::select(everything(), -c(admit_date))
34 | #'  # Feature encoding
35 | #'  cats <- select_if(long_stayers, is.character)
36 | #'  cat_dummy <- varhandle::to.dummy(cats$frailty_index, "frail_ind")
37 | #' #Converts the frailty index column to dummy encoding and sets a column called "frail_ind" prefix
38 | #'cat_dummy <- cat_dummy %>%
39 | #'  as.data.frame() %>%
40 | #'  dplyr::select(-frail_ind.No_index_item) #Drop the field of interest
41 | #'long_stayers <- long_stayers %>%
42 | #'  dplyr::select(-frailty_index) %>%
43 | #'  bind_cols(cat_dummy) %>% na.omit(.)
44 | #' # Split the data (seed set first so the random split is reproducible)
45 | #' set.seed(123)
46 | #' split <- rsample::initial_split(long_stayers, prop = 3/4)
47 | #' train <- rsample::training(split)
48 | #' test <- rsample::testing(split)
49 | #' glm_class_mod <- caret::train(factor(stranded.label) ~ ., data = train,
50 | #'                              method = "glm")
51 | #' print(glm_class_mod)
52 | #' # Predict the probabilities
53 | #' preds <- predict(glm_class_mod, newdata = test) # Predict class
54 | #' pred_prob <- predict(glm_class_mod, newdata = test, type="prob") #Predict probs
55 | #'
56 | #'predicted <- data.frame(preds, pred_prob)
57 | #' test <- test %>%
58 | #'  bind_cols(predicted) %>%
59 | #'  dplyr::rename(pred_class=preds)
60 | #' #Evaluate with ConfusionTableR
61 | #' library(ConfusionTableR)
62 | #' cm <- ConfusionTableR::binary_class_cm(test$stranded.label, test$pred_class, positive="Stranded")
63 | #' cm$record_level_cm
64 | #' # Visualise odds ratio
65 | #' library(OddsPlotty)
66 | #'
67 | #' plotty <- OddsPlotty::odds_plot(glm_class_mod$finalModel,
68 | #'                                title = "Odds Plot ",
69 | #'                                subtitle = "Showing odds of patient stranded",
70 | #'                                point_col = "#00f2ff",
71 | #'                                error_bar_colour = "black",
72 | #'                                point_size = .5,
73 | #'                                error_bar_width = .8,
74 | #'                                h_line_color = "red")
75 | #' print(plotty)
76 | 
77 | "long_stayers"
78 | 


--------------------------------------------------------------------------------
/R/stroke_classification.R:
--------------------------------------------------------------------------------
 1 | #' Stroke Classification dataset
 2 | #' @description This dataset has been obtained from a Stroke department within the NHS and is a traditional supervised ML classification dataset
 3 | #' @docType data
 4 | #' @keywords stroke
 5 | #' @format A data frame with 5110 rows and 11 variables:
 6 | #' \describe{
 7 | #'   \item{pat_id}{unique patient identifier index}
 8 | #'   \item{stroke}{outcome variable as a flag - 1 for stroke and 0 for no stroke}
 9 | #'   \item{gender}{patient gender description}
10 | #'   \item{age}{age of the patient}
11 | #'   \item{hypertension}{binary flag to indicate whether patient has hypertension: \url{https://www.nhs.uk/conditions/high-blood-pressure-hypertension/}}
12 | #'   \item{heart_disease}{binary flag to indicate whether patient has heart disease: 1 or no heart disease history: 0}
13 | #'   \item{work_related_stress}{binary flag to indicate whether patient has history of work related stress}
14 | #'   \item{urban_residence}{binary flag indicating whether patient lives in an urban area or not}
15 | #'   \item{avg_glucose_level}{average blood glucose readings of the patient}
16 | #'   \item{bmi}{body mass index of the patient: \url{https://www.nhs.uk/live-well/healthy-weight/bmi-calculator/}}
17 | #'   \item{smokes}{binary flag to indicate if the patient smokes - 1 for current smoker and 0 for smoking cessation}
18 | #'
19 | #'
20 | #' }
21 | #' @source Prepared and compiled by Gary Hutson \email{hutsons-hacks@outlook.com}, Apr-2022.
22 | "stroke_classification"
23 | 


--------------------------------------------------------------------------------
/R/thyroid_disease.R:
--------------------------------------------------------------------------------
  1 | #' Thyroid disease dataset
  2 | #' @description The dataset is to be used with a supervised classification ML model to classify thyroid disease.
  3 | #' The dataset was sourced and adapted from the UCI Machine Learning repository \url{https://archive.ics.uci.edu/ml/index.php}.
  4 | #' @docType data
  5 | #' @keywords thyroid disease
  6 | #' @format A data frame with 3772 rows and 28 variables:
  7 | #' \describe{
  8 | #'   \item{ThryroidClass}{binary classification label indicating whether \strong{sick = 1} or \strong{negative=0}}
  9 | #'   \item{patient_age}{age of the patient}
 10 | #'   \item{patient_gender}{flag indicating gender of patient - \strong{1=Female} and \strong{0=Male}}
 11 | #'   \item{presc_thyroxine}{flag to indicate whether thyroxine replacement prescribed \strong{1=Thyroxine prescribed}}
 12 | #'   \item{queried_why_on_thyroxine}{flag to indicate query has been actioned}
 13 | #'   \item{presc_anthyroid_meds}{flag to indicate whether anti-thyroid medicine has been prescribed}
 14 | #'   \item{sick}{flag to indicate sickness due to thyroxine depletion or over activity}
 15 | #'   \item{pregnant}{flag to indicate whether the patient is pregnant}
 16 | #'   \item{thyroid_surgery}{flag to indicate whether the patient has had thyroid surgery}
 17 | #'   \item{radioactive_iodine_therapyI131}{indicates whether patient has had radioactive iodine treatment: \url{https://www.nhs.uk/conditions/thyroid-cancer/treatment/}}
 18 | #'   \item{query_hypothyroid}{flag to indicate under active thyroid query \url{https://www.nhs.uk/conditions/underactive-thyroid-hypothyroidism/}}
 19 | #'   \item{query_hyperthyroid}{flag to indicate over active thyroid query \url{https://www.nhs.uk/conditions/overactive-thyroid-hyperthyroidism/}}
 20 | #'   \item{lithium}{Lithium carbonate administered to decrease the level of thyroid hormones}
 21 | #'   \item{goitre}{flag to indicate swelling of the thyroid gland \url{https://www.nhs.uk/conditions/goitre/}}
 22 | #'   \item{tumor}{flag to indicate a tumor}
 23 | #'   \item{hypopituitarism}{flag to indicate a diagnosed under active pituitary gland}
 24 | #'   \item{psych_condition}{indicates whether a patient has a psychological condition}
 25 | #'   \item{TSH_measured}{a TSH level lower than normal indicates there is usually more than enough thyroid hormone in the body and may indicate hyperthyroidism}
 26 | #'   \item{TSH_reading}{the reading result of the TSH blood test}
 27 | #'   \item{T3_measured}{linked to TSH reading - when free triiodothyronine rise above normal this indicates hyperthyroidism}
 28 | #'   \item{T3_reading}{the reading result of the T3 blood test looking for above normal levels of free triiodothyronine}
 29 | #'   \item{T4_measured}{free thyroxine, also known as T4, is used with T3 and TSH tests to diagnose hyperthyroidism}
 30 | #'   \item{T4_reading}{the reading result of the T4 test}
 31 | #'   \item{thyrox_util_rate_T4U_measured}{flag indicating the thyroxine utilisation rate \url{https://pubmed.ncbi.nlm.nih.gov/1685967/}}
 32 | #'   \item{thyrox_util_rate_T4U_reading}{the result of the test}
 33 | #'   \item{FTI_measured}{flag to indicate measurement on the Free Thyroxine Index (FTI)\url{https://endocrinology.testcatalog.org/show/FRTUP}}
 34 | #'   \item{FTI_reading}{the result of the test mentioned above}
 35 | #'   \item{ref_src}{[nominal] indicating the referral source of the patient}
 36 | #' }
 37 | #' @source Prepared and adapted by Gary Hutson \email{hutsons-hacks@outlook.com}, Dec-2021 and sourced from Garavan Institute and J. Ross Quinlan.
 38 | #' @references Thyroid disease records supplied by the Garavan Institute and J. Ross Quinlan.
 39 | #' @examples
 40 | #' library(dplyr)
 41 | #' library(ConfusionTableR)
 42 | #' library(parsnip)
 43 | #' library(rsample)
 44 | #' library(recipes)
 45 | #' library(ranger)
 46 | #' library(workflows)
 47 | #' data("thyroid_disease")
 48 | #' td <- thyroid_disease
 49 | #' # Create a factor of the class label to use in ML model
 50 | #' td$ThryroidClass <- as.factor(td$ThryroidClass)
 51 | #' # Check the structure of the data to make sure factor has been created
 52 | #' str(td)
 53 | #' # Remove missing values, or choose more advanced imputation option
 54 | #' td <- td[complete.cases(td),]
 55 | #' #Drop the column for referral source
 56 | #' td <- td %>%
 57 | #'  dplyr::select(-ref_src)
 58 | #' # Analyse class imbalance
 59 | #' class_imbalance <- prop.table(table(td$ThryroidClass))
 60 | #' class_imbalance
 61 | #' #Divide the data into a training test split
 62 | #' set.seed(123)
 63 | #' split <- rsample::initial_split(td, prop=3/4)
 64 | #' train_data <- rsample::training(split)
 65 | #' test_data <- rsample::testing(split)
 66 | #' # Create recipe to normalise the predictors and drop zero-variance ones
 67 | #' set.seed(123)
 68 | #'td_recipe <-
 69 | #'  recipe(ThryroidClass ~ ., data=train_data) %>%
 70 | #'   step_normalize(all_predictors()) %>%
 71 | #'   step_zv(all_predictors())
 72 | #' # Instantiate the model
 73 | #' set.seed(123)
 74 | #' rf_mod <-
 75 | #'   parsnip::rand_forest() %>%
 76 | #'   set_engine("ranger") %>%
 77 | #'   set_mode("classification")
 78 | #' # Create the model workflow
 79 | #' td_wf <-
 80 | #'   workflow() %>%
 81 | #'   workflows::add_model(rf_mod) %>%
 82 | #'   workflows::add_recipe(td_recipe)
 83 | #'# Fit the workflow to our training data
 84 | #' set.seed(123)
 85 | #' td_rf_fit <-
 86 | #'   td_wf %>%
 87 | #'   fit(data = train_data)
 88 | #' # Extract the fitted data
 89 | #' td_fitted <- td_rf_fit %>%
 90 | #'    extract_fit_parsnip()
 91 | #' # Predict on the test set with the model fitted on the training set to see model performance
 92 | #' class_pred <- predict(td_rf_fit, test_data)
 93 | #' td_preds <- test_data %>%
 94 | #' bind_cols(class_pred)
 95 | #' # Convert both to factors
 96 | #' td_preds$.pred_class <- as.factor(td_preds$.pred_class)
 97 | #' td_preds$ThryroidClass <- as.factor(td_preds$ThryroidClass)
 98 | #' # Evaluate the data with ConfusionTableR
 99 | #' cm <- ConfusionTableR::binary_class_cm(td_preds$ThryroidClass ,
100 | #'                                        td_preds$.pred_class,
101 | #'                                        positive="sick")
102 | #' #View Confusion matrix
103 | #' cm$confusion_matrix
104 | #' #View record level
105 | #' cm$record_level_cm
106 | "thyroid_disease"
107 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # MLDataR
 2 | 
 3 | <p><a href="https://hutsons-hacks.info/"><img src = "man/figures/mldataR.png" width = "200px" height = "150px" align="right"></a></p>
 4 | 
 5 |  <!-- badges: start -->
 6 |   [![](https://cranlogs.r-pkg.org/badges/MLDataR)](https://cran.r-project.org/package=MLDataR)
 7 |    [![CRAN status](https://www.r-pkg.org/badges/version/MLDataR)](https://CRAN.R-project.org/package=MLDataR) 
 8 |   ![GitHub last commit](https://img.shields.io/github/last-commit/StatsGary/MLDataR)
 9 |   ![GitHub Repo stars](https://img.shields.io/github/stars/StatsGary/MLDataR?label=MLDataR%20Stars)
10 |   [![Downloads](https://cranlogs.r-pkg.org/badges/grand-total/MLDataR)](https://cran.r-project.org/package=MLDataR)
11 |   [![license](https://img.shields.io/github/license/mashape/apistatus.svg)](LICENSE.md)
12 |   <!-- badges: end -->
13 | 
14 | A collection of Machine Learning datasets for health care and beyond.
15 | 
16 | ## Installing the package from GitHub
17 | 
18 | Here, I will use the package remotes to install the package:
19 | 
20 | ``` r
21 | # install.packages("remotes") # if not already installed
22 | remotes::install_github("https://github.com/StatsGary/MLDataR")
23 | library(MLDataR)
24 | 
25 | ```
26 | ## Installing the package from CRAN
27 | 
28 | To install from CRAN, use the below command:
29 | ``` r
30 | install.packages("MLDataR")
31 | 
32 | ```
33 | 
34 | ## Loading the package from CRAN
35 | 
36 | To load the package from CRAN, use the following:
37 | 
38 | ``` r
39 | library(MLDataR)
40 | ```
41 | 
42 | ## Datasets included
43 | 
44 | The package currently has eight example datasets, and more are being added every week. The datasets contained in the package are:
45 | 
46 | - **Counter Strike Global Offensive** - supervised machine learning regression and classification data set to predict score or match outcome. 
47 | - **Diabetes disease prediction** - supervised machine learning classification dataset to enable the prediction of diabetic patients. 
48 | - **Diabetes onset prediction** - supervised machine learning regression dataset to enable prediction of the age at which a pre-diabetic will develop diabetes 
49 | - **Failing Care Home classification** - classification supervised machine learning dataset to predict a failing care home by selected Datix incidents. UK Datix service. 
50 | - **Heart disease prediction** - supervised machine learning classification dataset to enable the prediction of heart disease using a number of key outcome features. Anonymised from the British Heart Foundation example records. 
51 | - **Long stayers prediction** - supervised machine learning classification dataset to enable the prediction of a patient staying in hospital longer than 7 days. Extracted from stranded patients extract and anonymised for training and research purposes. Nottingham University Hospitals. 
52 | - **Stroke Classification** - supervised machine learning classification dataset to enable the prediction of a stroke in an unseen patient, using past observations in the training set.
53 | - **Thyroid disease prediction** - supervised machine learning classification dataset to allow for the prediction of thyroid disease utilising historic patient records. Garavan Institute - see references in markdown files supporting package. 
54 | 
55 | ## Further developments
56 | 
57 | More datasets are being added, so look out for the next version of this package. 
58 | 
59 | ## Closing remarks
60 | 
61 | It has been fun putting this package together and I hope you find it useful. If you find any issues using the package, please raise a GitHub issue and I will address it as soon as possible. Thanks and I hope you enjoy using it.
62 | 


--------------------------------------------------------------------------------
/cran-comments.md:
--------------------------------------------------------------------------------
 1 | ## Test environments
 2 | * local windows 10, R 4.0.3
 3 | * Windows, R devel 2020-09-09 r79174, on Win-builder
 4 | * Ubuntu 16.04.6 LTS (on travis-ci), R 4.0.2
 5 | 
 6 | * GitHub actions:
 7 |   * Mac OS x 10.15.7, R 4.0.3
 8 |   * Windows Server x64 2019, R 4.0.3
 9 |   * Windows Server x64 latest, R 3.6.3
10 |   * Ubuntu 16.04.07, R-devel 2020-11-27 r79522
11 |   * Ubuntu 16.04.07, R 4.0.3
12 |   * Ubuntu 16.04.07, R 3.6.3
13 |   * Ubuntu 16.04.07, R 3.5.3
14 |   * Ubuntu 16.04.07, R 3.4.4
15 | 
16 | * r-hub:
17 |   * Ubuntu Linux 16.04 LTS, R-release, GCC
18 |   * Fedora Linux, R-devel, clang, gfortran
19 |   * Windows Server 2008 R2 SP1, R-devel, 32/64 bit
20 | 
21 | ## R CMD check results
22 | There were no ERRORs or WARNINGs. 
23 | 
24 | ## Downstream dependencies
25 | There are currently no downstream dependencies for this package to my knowledge.
26 | 


--------------------------------------------------------------------------------
/data/PreDiabetes.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/data/PreDiabetes.rda


--------------------------------------------------------------------------------
/data/care_home_incidents.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/data/care_home_incidents.rda


--------------------------------------------------------------------------------
/data/csgo.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/data/csgo.rda


--------------------------------------------------------------------------------
/data/diabetes_data.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/data/diabetes_data.rda


--------------------------------------------------------------------------------
/data/heartdisease.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/data/heartdisease.rda


--------------------------------------------------------------------------------
/data/long_stayers.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/data/long_stayers.rda


--------------------------------------------------------------------------------
/data/stroke_classification.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/data/stroke_classification.rda


--------------------------------------------------------------------------------
/data/thyroid_disease.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/data/thyroid_disease.rda


--------------------------------------------------------------------------------
/man/PreDiabetes.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/PreDiabetes.R
 3 | \docType{data}
 4 | \name{PreDiabetes}
 5 | \alias{PreDiabetes}
 6 | \title{PreDiabetes dataset}
 7 | \format{
 8 | A data frame with 3059 rows and 9 variables:
 9 | \describe{
10 |   \item{Age}{age of the patient presenting with diabetes}
11 |   \item{Sex}{sex of the patient with diabetes}
12 |   \item{IMD_Decile}{Index of Multiple Deprivation Decile}
13 |   \item{BMI}{Body Mass Index of patient}
14 |   \item{Age_PreDiabetes}{age at pre diabetes diagnosis}
15 |   \item{HbA1C}{average blood glucose mmol/mol}
16 |   \item{Time_Pre_To_Diabetes}{time in years between pre-diabetes and diabetes diagnosis}
17 |   \item{Age_Diabetes}{age at diabetes diagnosis}
18 |   \item{PreDiabetes_Checks_Before_Diabetes}{number of pre-diabetes related primary care appointments before diabetes diagnosis}
19 | 
20 | }
21 | }
22 | \source{
23 | Generated by Asif Laldin \email{a.laldin@nhs.net}, Jan-2022
24 | }
25 | \usage{
26 | PreDiabetes
27 | }
28 | \description{
29 | PreDiabetes dataset
30 | }
31 | \examples{
32 | library(dplyr)
33 | data(PreDiabetes)
34 | # Take a quick look at the structure of the data
35 | diabetes_data <- PreDiabetes \%>\%
36 |  glimpse()
37 | }
38 | \keyword{learning}
39 | \keyword{machine}
40 | \keyword{prediabetes}
41 | \keyword{regression}
42 | 


--------------------------------------------------------------------------------
/man/care_home_incidents.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/care_home_incidents.R
 3 | \docType{data}
 4 | \name{care_home_incidents}
 5 | \alias{care_home_incidents}
 6 | \title{Care Home Incidents}
 7 | \format{
 8 | A data frame with 1216 rows and 12 variables:
 9 | \describe{
10 |   \item{CareHomeFail}{a binary indicator to specify whether a certain care home is failing}
11 |   \item{WeightLoss}{aggregation of incidents indicating weight loss in patient}
12 |   \item{Medication}{aggregation of missed medication incidents}
13 |   \item{Falls}{Recorded number of patient falls}
14 |   \item{Choking}{Number of patient choking incidents}
15 |   \item{UnexpectedDeaths}{unexpected deaths in the care home}
16 |   \item{Bruising}{Number of bruising incidents in the care home}
17 |   \item{Absconsion}{Absconding from the care home setting}
18 |   \item{ResidentAbuseByResident}{Abuse conducted by one care home resident against another}
19 |   \item{ResidentAbuseByStaff}{Incidents of resident abuse by staff}
20 |   \item{ResidentAbuseOnStaff}{Incidents of residents abusing staff}
21 |   \item{Wounds}{Unexplained wounds against staff}
22 |   }
23 | }
24 | \source{
25 | Collected by Gary Hutson \email{hutsons-hacks@outlook.com}, Jan-2022
26 | }
27 | \usage{
28 | care_home_incidents
29 | }
30 | \description{
31 | An NHS patient safety incidents dataset (\url{https://www.england.nhs.uk/patient-safety/report-patient-safety-incident/}) that has been synthetically generated against real data
32 | }
33 | \examples{
34 | library(dplyr)
35 | data(care_home_incidents)
36 | # Convert the CareHomeFail label to a factor
37 | ch_incs <- care_home_incidents \%>\%
38 |  mutate(CareHomeFail = as.factor(CareHomeFail))
39 |  ch_incs \%>\% glimpse()
40 |  # Check factor
41 |  factor(ch_incs$CareHomeFail)
42 | }
43 | \keyword{care}
44 | \keyword{classification}
45 | \keyword{home}
46 | \keyword{incidents}
47 | \keyword{learning}
48 | \keyword{machine}
49 | \keyword{supervised}
50 | 


--------------------------------------------------------------------------------
/man/csgo.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/csgo.R
 3 | \docType{data}
 4 | \name{csgo}
 5 | \alias{csgo}
 6 | \title{csgo}
 7 | \format{
 8 | A data frame with 1,133 rows and 17 variables:
 9 | \describe{
10 |   \item{map}{Map on which the match was played}
11 |   \item{day}{Day of the month}
12 |   \item{month}{Month of the year}
13 |   \item{year}{Year}
14 |   \item{date}{Date of match DD/MM/YYYY}
15 |   \item{wait_time_s}{Time waited to find match}
16 |   \item{match_time_s}{Total match length in seconds}
17 |   \item{team_a_rounds}{Number of rounds played as Team A}
18 |   \item{team_b_rounds}{Number of rounds played as Team B}
19 |   \item{ping}{Maximum ping in milliseconds;the signal that's sent from one computer to another on the same network}
20 |   \item{kills}{Number of kills accumulated in match; max 5 per round}
21 |   \item{assists}{Number of assists accumulated in a match; an assist is awarded for inflicting more than 50 percent damage on an opponent who is then killed by another player; max 5 per round}
22 |   \item{deaths}{Number of times player died during match;max 1 per round}
23 |   \item{mvps}{Most Valuable Player award}
24 |   \item{hs_percent}{Percentage of kills that were a result from a shot to opponent's head}
25 |   \item{points}{Number of points accumulated during match. Points are gained from kills, assists, bomb defuses & bomb plants. Points are lost for suicide and friendly kills}
26 |   \item{result}{The result of the match, Win, Loss, Draw}
27 | }
28 | }
29 | \source{
30 | Extracted by Asif Laldin \email{a.laldin@nhs.net}, March-2019
31 | }
32 | \usage{
33 | csgo
34 | }
35 | \description{
36 | csgo
37 | }
38 | \keyword{CounterStrike}
39 | \keyword{Global}
40 | \keyword{Offensive}
41 | \keyword{eSports}
42 | 


--------------------------------------------------------------------------------
/man/diabetes_data.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/diabetes_data.R
 3 | \docType{data}
 4 | \name{diabetes_data}
 5 | \alias{diabetes_data}
 6 | \title{Diabetes datasets}
 7 | \format{
 8 | A data frame with 520 rows and 17 variables:
 9 | \describe{
10 |   \item{Age}{age of the patient presenting with diabetes}
11 |   \item{Gender}{gender of the patient with diabetes}
12 |   \item{ExcessUrination}{if the patient has a history of excessive urination}
13 |   \item{Polydipsia}{abnormal thirst, accompanied by the excessive intake of water or fluid}
14 |   \item{WeightLossSudden}{Sudden weight loss that has recently occurred}
15 |   \item{Fatigue}{Fatigue or weakness}
16 |   \item{Polyphagia}{excessive or extreme hunger}
17 |   \item{GenitalThrush}{patient has thrush fungus on or near their genital region}
18 |   \item{BlurredVision}{history of blurred vision}
19 |   \item{Itching}{skin itching}
20 |   \item{Irritability}{general irritability and mood issues}
21 |   \item{DelayHealing}{delayed healing of wounds}
22 |   \item{PartialPsoriasis}{partial psoriasis on the body}
23 |   \item{MuscleStiffness}{stiffness of the muscles}
24 |   \item{Alopecia}{scalp alopecia and hair shedding}
25 |   \item{Obesity}{Classified as obese}
26 |   \item{DiabeticClass}{Class label to indicate whether the patient is diabetic or not}
27 | }
28 | }
29 | \source{
30 | Collected by Gary Hutson \email{hutsons-hacks@outlook.com}, Dec-2021
31 | }
32 | \usage{
33 | diabetes_data
34 | }
35 | \description{
36 | Diabetes datasets
37 | }
38 | \examples{
39 | library(dplyr)
40 | data(diabetes_data)
41 | # Convert the DiabeticClass label to a factor
42 | diabetes_data <- diabetes_data \%>\%
43 |  glimpse() \%>\%
44 |  mutate(DiabeticClass = as.factor(DiabeticClass))
45 |  is.factor(diabetes_data$DiabeticClass)
46 | }
47 | \keyword{classification}
48 | \keyword{diabetes}
49 | \keyword{learning}
50 | \keyword{machine}
51 | 


--------------------------------------------------------------------------------
/man/figures/mldataR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/man/figures/mldataR.png


--------------------------------------------------------------------------------
/man/heartdisease.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/heartdisease.R
 3 | \docType{data}
 4 | \name{heartdisease}
 5 | \alias{heartdisease}
 6 | \title{Heart disease dataset}
 7 | \format{
 8 | A data frame with 918 rows and 10 variables:
 9 | \describe{
10 |   \item{Age}{age of the patient presenting with heart disease}
11 |   \item{Sex}{gender of the patient}
12 |   \item{RestingBP}{blood pressure for resting heart beat}
13 |   \item{Cholesterol}{Cholesterol reading}
14 |   \item{FastingBS}{blood sample of glucose after a patient fasts \url{https://www.diabetes.co.uk/diabetes_care/fasting-blood-sugar-levels.html}}
15 |   \item{RestingECG}{Resting electrocardiogram (ECG) result - an indicator of previous myocardial infarction, i.e. heart attack}
16 |   \item{MaxHR}{Maximum heart rate}
17 |   \item{Angina}{chest pain caused by decreased blood flow \url{https://www.nhs.uk/conditions/angina/}}
18 |   \item{HeartPeakReading}{reading at the peak of the heart rate}
19 |   \item{HeartDisease}{the classification label of whether patient has heart disease or not}
20 | 
21 | }
22 | }
23 | \source{
24 | Collected by Gary Hutson \email{hutsons-hacks@outlook.com}, Dec-2021
25 | }
26 | \usage{
27 | heartdisease
28 | }
29 | \description{
30 | The dataset is to be used with a supervised classification ML model to classify heart disease.
31 | }
32 | \examples{
33 | library(dplyr)
34 | library(ConfusionTableR)
35 | data(heartdisease)
36 | 
37 | # Convert the HeartDisease label to a factor
38 | hd <- heartdisease \%>\%
39 |  glimpse() \%>\%
40 |  mutate(HeartDisease = as.factor(HeartDisease))
41 | # Check that the label is now a factor
42 |  is.factor(hd$HeartDisease)
43 |  # Dummy encoding
44 | # Get categorical columns
45 | hd_cat <- hd  \%>\%
46 |  dplyr::select_if(is.character)
47 |  # Dummy encode the categorical variables
48 |  # Specify the columns to encode
49 |  cols <- c("RestingECG", "Angina", "Sex")
50 |  # Dummy encode using dummy_encoder in ConfusionTableR package
51 |  coded <- ConfusionTableR::dummy_encoder(hd_cat, cols, remove_original = TRUE)
52 | coded <- coded \%>\%
53 |     select(RestingECG_ST, RestingECG_LVH, Angina=Angina_Y,
54 |     Sex=Sex_F)
55 | # Remove column names we have encoded from original data frame
56 | hd_one <- hd[,!names(hd) \%in\% cols]
57 | # Bind the numerical data on to the categorical data
58 | hd_final <- bind_cols(coded, hd_one)
59 | # Output the final encoded data frame for the ML task
60 | glimpse(hd_final)
61 | }
62 | \keyword{classification}
63 | \keyword{disease}
64 | \keyword{heart}
65 | \keyword{learning}
66 | \keyword{machine}
67 | 


--------------------------------------------------------------------------------
/man/long_stayers.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/long_stayers.R
 3 | \docType{data}
 4 | \name{long_stayers}
 5 | \alias{long_stayers}
 6 | \title{Long stayers dataset}
 7 | \format{
 8 | A data frame with 768 rows and 9 variables:
 9 | \describe{
10 |   \item{stranded.label}{binary classification label indicating whether \strong{stranded = 1} or \strong{not stranded=0}}
11 |   \item{age}{age of the patient}
12 |   \item{care.home.referral}{flag indicating whether referred from a private care home - \strong{1=Care Home Referral} and \strong{0=Not a care home referral}}
13 |   \item{medicallysafe}{flag indicating whether they are medically safe for discharge - \strong{1=Medically safe} and \strong{0=Not medically safe}}
14 |   \item{hcop}{flag indicating health care for older person triage - \strong{1=Yes triaged from HCOP} and \strong{0=Triaged from different department}}
15 |   \item{mental_health_care}{flag indicating whether they require mental health care - \strong{1=MH assistance needed} and \strong{0=No history of mental health}}
16 |   \item{periods_of_previous_care}{Count of the number of times they have been in hospital in last 12 months}
17 |   \item{admit_date}{date the patient was admitted as an inpatient}
18 |   \item{frailty_index}{indicates the type of frailty - nominal variable}
19 | }
20 | }
21 | \source{
22 | Prepared, acquired and adapted by Gary Hutson \email{hutsons-hacks@outlook.com}, Dec-2021. Synthetic data, based off live patient data from various NHS secondary health care trusts.
23 | }
24 | \usage{
25 | long_stayers
26 | }
27 | \description{
28 | classification dataset of long staying patients.
29 | Contains patients who have been registered as an inpatient for longer than 7 days length of stay \url{https://www.england.nhs.uk/south/wp-content/uploads/sites/6/2016/12/rig-reviewing-stranded-patients-hospital.pdf}.
30 | }
31 | \examples{
32 | library(dplyr)
33 | library(ggplot2)
34 | library(caret)
35 | library(rsample)
36 | library(varhandle)
37 | data("long_stayers")
38 | glimpse(long_stayers)
39 | # Examine class imbalance
40 | prop.table(table(long_stayers$stranded.label))
41 | # Feature engineering
42 | long_stayers <- long_stayers \%>\%
43 | dplyr::mutate(stranded.label=factor(stranded.label)) \%>\%
44 |  dplyr::select(everything(), -c(admit_date))
45 |  # Feature encoding
46 |  cats <- select_if(long_stayers, is.character)
47 |  cat_dummy <- varhandle::to.dummy(cats$frailty_index, "frail_ind")
48 | #Converts the frailty index column to dummy encoding and sets a column called "frail_ind" prefix
49 | cat_dummy <- cat_dummy \%>\%
50 |  as.data.frame() \%>\%
51 |  dplyr::select(-frail_ind.No_index_item) #Drop the field of interest
52 | long_stayers <- long_stayers \%>\%
53 |  dplyr::select(-frailty_index) \%>\%
54 |  bind_cols(cat_dummy) \%>\% na.omit(.)
55 | # Split the data
56 | split <- rsample::initial_split(long_stayers, prop = 3/4)
57 | train <- rsample::training(split)
58 | test <- rsample::testing(split)
59 | set.seed(123)
60 | glm_class_mod <- caret::train(factor(stranded.label) ~ ., data = train,
61 |                              method = "glm")
62 | print(glm_class_mod)
63 | # Predict the probabilities
64 | preds <- predict(glm_class_mod, newdata = test) # Predict class
65 | pred_prob <- predict(glm_class_mod, newdata = test, type="prob") #Predict probs
66 | 
67 | predicted <- data.frame(preds, pred_prob)
68 | test <- test \%>\%
69 |  bind_cols(predicted) \%>\%
70 |  dplyr::rename(pred_class=preds)
71 | #Evaluate with ConfusionTableR
72 | library(ConfusionTableR)
73 | cm <- ConfusionTableR::binary_class_cm(test$stranded.label, test$pred_class, positive="Stranded")
74 | cm$record_level_cm
75 | # Visualise odds ratios
76 | library(OddsPlotty)
77 | plotty <- OddsPlotty::odds_plot(glm_class_mod$finalModel,
78 |                                title = "Odds Plot ",
79 |                                subtitle = "Showing odds of patient stranded",
80 |                                point_col = "#00f2ff",
81 |                                error_bar_colour = "black",
82 |                                point_size = .5,
83 |                                error_bar_width = .8,
84 |                                h_line_color = "red")
85 | print(plotty)
86 | }
87 | \keyword{NHS}
88 | \keyword{long}
89 | \keyword{patient}
90 | \keyword{stay}
91 | \keyword{stranded}
92 | 


--------------------------------------------------------------------------------
/man/stroke_classification.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/stroke_classification.R
 3 | \docType{data}
 4 | \name{stroke_classification}
 5 | \alias{stroke_classification}
 6 | \title{Stroke Classification dataset}
 7 | \format{
 8 | A data frame with 5110 rows and 11 variables:
 9 | \describe{
10 |   \item{pat_id}{unique patient identifier index}
11 |   \item{stroke}{outcome variable as a flag - 1 for stroke and 0 for no stroke}
12 |   \item{gender}{patient gender description}
13 |   \item{age}{age of the patient}
14 |   \item{hypertension}{binary flag to indicate whether patient has hypertension: \url{https://www.nhs.uk/conditions/high-blood-pressure-hypertension/}}
15 |   \item{heart_disease}{binary flag to indicate whether patient has heart disease: 1 or no heart disease history: 0}
16 |   \item{work_related_stress}{binary flag to indicate whether patient has history of work related stress}
17 |   \item{urban_residence}{binary flag indicating whether patient lives in an urban area or not}
18 |   \item{avg_glucose_level}{average blood glucose readings of the patient}
19 |   \item{bmi}{body mass index of the patient: \url{https://www.nhs.uk/live-well/healthy-weight/bmi-calculator/}}
20 |   \item{smokes}{binary flag to indicate if the patient smokes - 1 for current smoker and 0 for smoking cessation}
21 | 
22 | }
23 | }
24 | \source{
25 | Prepared and compiled by Gary Hutson \email{hutsons-hacks@outlook.com}, Apr-2022.
26 | }
27 | \usage{
28 | stroke_classification
29 | }
30 | \description{
31 | This dataset has been obtained from a Stroke department within the NHS and is a traditional supervised ML classification dataset
32 | }
33 | \keyword{stoke}
34 | 


--------------------------------------------------------------------------------
/man/thyroid_disease.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/thyroid_disease.R
  3 | \docType{data}
  4 | \name{thyroid_disease}
  5 | \alias{thyroid_disease}
  6 | \title{Thyroid disease dataset}
  7 | \format{
  8 | A data frame with 3772 rows and 28 variables:
  9 | \describe{
 10 |   \item{ThryroidClass}{binary classification label indicating whether \strong{sick = 1} or \strong{negative=0}}
 11 |   \item{patient_age}{age of the patient}
 12 |   \item{patient_gender}{flag indicating gender of patient - \strong{1=Female} and \strong{0=Male}}
 13 |   \item{presc_thyroxine}{flag to indicate whether thyroxine replacement prescribed \strong{1=Thyroxine prescribed}}
 14 |   \item{queried_why_on_thyroxine}{flag to indicate query has been actioned}
 15 |   \item{presc_anthyroid_meds}{flag to indicate whether anti-thyroid medicine has been prescribed}
 16 |   \item{sick}{flag to indicate sickness due to thyroxine depletion or over activity}
 17 |   \item{pregnant}{flag to indicate whether the patient is pregnant}
 18 |   \item{thyroid_surgery}{flag to indicate whether the patient has had thyroid surgery}
 19 |   \item{radioactive_iodine_therapyI131}{indicates whether patient has had radioactive iodine treatment: \url{https://www.nhs.uk/conditions/thyroid-cancer/treatment/}}
 20 |   \item{query_hypothyroid}{flag to indicate under active thyroid query \url{https://www.nhs.uk/conditions/underactive-thyroid-hypothyroidism/}}
 21 |   \item{query_hyperthyroid}{flag to indicate over active thyroid query \url{https://www.nhs.uk/conditions/overactive-thyroid-hyperthyroidism/}}
 22 |   \item{lithium}{Lithium carbonate administered to decrease the level of thyroid hormones}
 23 |   \item{goitre}{flag to indicate swelling of the thyroid gland \url{https://www.nhs.uk/conditions/goitre/}}
 24 |   \item{tumor}{flag to indicate a tumor}
 25 |   \item{hypopituitarism}{flag to indicate diagnosed hypopituitarism (an under active pituitary gland)}
 26 |   \item{psych_condition}{indicates whether a patient has a psychological condition}
 27 |   \item{TSH_measured}{a TSH level lower than normal indicates there is usually more than enough thyroid hormone in the body and may indicate hyperthyroidism}
 28 |   \item{TSH_reading}{the reading result of the TSH blood test}
 29 |   \item{T3_measured}{linked to TSH reading - when free triiodothyronine rise above normal this indicates hyperthyroidism}
 30 |   \item{T3_reading}{the reading result of the T3 blood test looking for above normal levels of free triiodothyronine}
 31 |   \item{T4_measured}{free thyroxine, also known as T4, is used with T3 and TSH tests to diagnose hyperthyroidism}
 32 |   \item{T4_reading}{the reading result of the T4 test}
 33 |   \item{thyrox_util_rate_T4U_measured}{flag indicating the thyroxine utilisation rate \url{https://pubmed.ncbi.nlm.nih.gov/1685967/}}
 34 |   \item{thyrox_util_rate_T4U_reading}{the result of the test}
 35 |   \item{FTI_measured}{flag to indicate measurement on the Free Thyroxine Index (FTI)\url{https://endocrinology.testcatalog.org/show/FRTUP}}
 36 |   \item{FTI_reading}{the result of the test mentioned above}
 37 |   \item{ref_src}{[nominal] indicating the referral source of the patient}
 38 | }
 39 | }
 40 | \source{
 41 | Prepared and adapted by Gary Hutson \email{hutsons-hacks@outlook.com}, Dec-2021 and sourced from the Garavan Institute and J. Ross Quinlan.
 42 | }
 43 | \usage{
 44 | thyroid_disease
 45 | }
 46 | \description{
 47 | The dataset is to be used with a supervised classification ML model to classify thyroid disease.
 48 | The dataset was sourced and adapted from the UCI Machine Learning repository \url{https://archive.ics.uci.edu/ml/index.php}.
 49 | }
 50 | \examples{
 51 | library(dplyr)
 52 | library(ConfusionTableR)
 53 | library(parsnip)
 54 | library(rsample)
 55 | library(recipes)
 56 | library(ranger)
 57 | library(workflows)
 58 | data("thyroid_disease")
 59 | td <- thyroid_disease
 60 | # Create a factor of the class label to use in ML model
 61 | td$ThryroidClass <- as.factor(td$ThryroidClass)
 62 | # Check the structure of the data to make sure factor has been created
 63 | str(td)
 64 | # Remove missing values, or choose a more advanced imputation option
 65 | td <- td[complete.cases(td),]
 66 | #Drop the column for referral source
 67 | td <- td \%>\%
 68 |  dplyr::select(-ref_src)
 69 | # Analyse class imbalance
 70 | class_imbalance <- prop.table(table(td$ThryroidClass))
 71 | class_imbalance
 72 | #Divide the data into a training test split
 73 | set.seed(123)
 74 | split <- rsample::initial_split(td, prop=3/4)
 75 | train_data <- rsample::training(split)
 76 | test_data <- rsample::testing(split)
 77 | # Create recipe to normalise the predictors and drop zero-variance ones
 78 | set.seed(123)
 79 | td_recipe <-
 80 |  recipe(ThryroidClass ~ ., data=train_data) \%>\%
 81 |   step_normalize(all_predictors()) \%>\%
 82 |   step_zv(all_predictors())
 83 | # Instantiate the model
 84 | set.seed(123)
 85 | rf_mod <-
 86 |   parsnip::rand_forest() \%>\%
 87 |   set_engine("ranger") \%>\%
 88 |   set_mode("classification")
 89 | # Create the model workflow
 90 | td_wf <-
 91 |   workflow() \%>\%
 92 |   workflows::add_model(rf_mod) \%>\%
 93 |   workflows::add_recipe(td_recipe)
 94 | # Fit the workflow to our training data
 95 | set.seed(123)
 96 | td_rf_fit <-
 97 |   td_wf \%>\%
 98 |   fit(data = train_data)
 99 | # Extract the fitted data
100 | td_fitted <- td_rf_fit \%>\%
101 |    extract_fit_parsnip()
102 | # Predict the held-out test set with the fitted workflow
103 | class_pred <- predict(td_rf_fit, test_data)
104 | td_preds <- test_data \%>\%
105 |   bind_cols(class_pred)
106 | # Convert both the predictions and the truth to factors
107 | td_preds$.pred_class <- as.factor(td_preds$.pred_class)
108 | td_preds$ThryroidClass <- as.factor(td_preds$ThryroidClass)
109 | # Evaluate with ConfusionTableR: predicted labels first, ground truth second
110 | cm <- ConfusionTableR::binary_class_cm(td_preds$.pred_class,
111 |                                        td_preds$ThryroidClass,
112 |                                        positive="sick")
113 | # View the confusion matrix
114 | cm$confusion_matrix
115 | # View the record-level summary
116 | cm$record_level_cm
117 | }
118 | \references{
119 | Thyroid disease records supplied by the Garavan Institute and J. Ross Quinlan.
120 | }
121 | \keyword{disease}
122 | \keyword{thyroid}
123 | 


--------------------------------------------------------------------------------
/vignettes/.gitignore:
--------------------------------------------------------------------------------
1 | *.html
2 | *.R
3 | 


--------------------------------------------------------------------------------
/vignettes/MLDataR.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "MLDataR - A Package for ML datasets"
  3 | output: rmarkdown::html_vignette
  4 | vignette: >
  5 |   %\VignetteIndexEntry{MLDataR}
  6 |   %\VignetteEngine{knitr::rmarkdown}
  7 |   %\VignetteEncoding{UTF-8}
  8 | ---
  9 | 
 10 | ```{r, include = FALSE}
 11 | knitr::opts_chunk$set(
 12 |   collapse = TRUE,
 13 |   comment = "#>",
 14 |   fig.height= 5, 
 15 |   fig.width=7
 16 | )
 17 | ```
 18 | 
 19 | <p><a href="https://github.com/StatsGary/MLDataR"><img src = "mldataR.png" width = "125px" height = "150px" align="right"></a></p>
 20 | 
 21 | ```{r setup, include = FALSE, echo=FALSE}
 22 | library(MLDataR)
 23 | library(dplyr)
 24 | library(ConfusionTableR)
 25 | library(parsnip)
 26 | library(rsample)
 27 | library(recipes)
 28 | library(ranger)
 29 | library(workflows)
 30 | library(caret)
 31 | 
 32 | ```
 33 | 
 34 | 
 35 | ## Installing the MLDataR package
 36 | To install the package use the below instructions:
 37 | 
 38 | ```{r install_MLDataR}
 39 | # install.packages("MLDataR")
 40 | library(MLDataR)
 41 | 
 42 | ```
 43 | 
 44 | ## What datasets are included
 45 | 
 46 | The current list of data sets are:
 47 | 
 48 | - **Diabetes disease prediction** - supervised machine learning classification dataset to enable the prediction of diabetic patients
 49 | - **Diabetes onset prediction** - supervised machine learning regression dataset to enable prediction of the age at which a pre-diabetic will develop diabetes 
 50 | - **Heart disease prediction** - supervised machine learning classification dataset to enable the prediction of heart disease using a number of key outcome features
 51 | - **Long stayers prediction** - supervised machine learning classification dataset to enable the prediction of a patient staying in hospital longer than 7 days.
 52 | - **Thyroid disease prediction** - supervised machine learning classification dataset to allow for the prediction of thyroid disease utilising historic patient records
 53 | - **Failing Care Home classification** - classification supervised machine learning dataset to predict a failing care home by selected Datix incidents
 54 | - **Counter Strike Global Offensive** - supervised machine learning regression and classification data set to predict score or match outcome. 
 55 | 
 56 | More and more data sets are being added, and it is my mission to have more than 50 example datasets by the end of 2022. 
 57 | 
 58 | ## Thyroid Disease dataset
 59 | 
 60 | I will first work with the Thyroid disease dataset and inspect the variables in the data:
 61 | 
 62 | ```{r thyroid_data}
 63 | 
 64 | glimpse(MLDataR::thyroid_disease)
 65 | 
 66 | ```
 67 | 
 68 | As you can see this dataset has 28 columns and 3,772 rows. The help file for the dataset documents what each of the columns means. The next task is to use this dataset to create an ML model in TidyModels.
 69 | 
 70 | ## Create TidyModels recipe to model the thyroid dataset
 71 | 
 72 | This will show how to create and implement the dataset in TidyModels for a supervised ML classification task. 
 73 | 
 74 | ### Data preparation
 75 | 
 76 | The first step will be to do the data preparation steps:
 77 | 
 78 | ```{r data_prep}
 79 | data("thyroid_disease")
 80 | td <- thyroid_disease
 81 | # The class label must be a factor for a classification model
 82 | td[["ThryroidClass"]] <- as.factor(td[["ThryroidClass"]])
 83 | # Inspect the structure to confirm the conversion
 84 | str(td)
 85 | ```
 86 | 
 87 | Next I will remove the rows with missing values. You could try an imputation method here instead, such as MICE; however, for speed of development and of building the vignette, I will leave this for you to look into:
 88 | 
 89 | ```{r remove_nulls}
 90 | # Keep complete rows only (imputation is a more advanced alternative)
 91 | td <- td[complete.cases(td), ]
 92 | # Drop the referral source column
 93 | td <-
 94 |   dplyr::select(td, -ref_src)
 95 | 
 96 | ```
 97 | 
 98 | ### Split the data
 99 | 
100 | Next I will partition the data into a training and testing split, so I can evaluate how well the model performs on the testing set:
101 | 
102 | ```{r splitting}
103 | # Seed first, then hold out a quarter of the rows for testing
104 | set.seed(123)
105 | split <- rsample::initial_split(td, prop = 0.75)
106 | train_data <- split %>% rsample::training()
107 | test_data <- split %>% rsample::testing()
108 | 
109 | ```
110 | 
111 | ### Create a recipe with preprocessing steps
112 | 
113 | After I have split the data it is time to prepare a recipe for the preprocessing steps, here I will use the recipes package:
114 | 
115 | 
116 | ```{r create_recipe}
117 | # Build the recipe step by step: normalise, then drop zero-variance columns
118 | td_recipe <- recipe(ThryroidClass ~ ., data = train_data)
119 | td_recipe <- step_normalize(td_recipe, all_predictors())
120 | td_recipe <- step_zv(td_recipe, all_predictors())
121 | 
122 | print(td_recipe)
123 | ```
124 | 
125 | This recipe declares `ThryroidClass` as the outcome variable, then uses a normalise step to centre and scale all the numerical predictor variables, and finally removes any zero-variance predictors from the data.
126 | 
127 | ### Getting modelling with Parsnip
128 | 
129 | We come to the modelling step of the exercise. Here I will instantiate a random forest model for the modelling task at hand:
130 | 
131 | 
132 | ```{r random_forest_model}
133 | set.seed(123)
134 | # Random forest specification: ranger engine, classification mode
135 | rf_mod <- parsnip::rand_forest()
136 | rf_mod <- set_engine(rf_mod, "ranger")
137 | rf_mod <- set_mode(rf_mod, "classification")
138 | 
139 | 
140 | ```
141 | 
142 | ### Create the model workflow
143 | 
144 | [Tidymodels](https://www.tidymodels.org/) uses the concept of workflows to stitch the ML pipeline together, so I will now create the workflow and then fit the model:
145 | 
146 | ```{r creating_workflow}
147 | # Bundle the model specification and the recipe into one workflow
148 | td_wf <- workflow()
149 | td_wf <- workflows::add_model(td_wf, rf_mod)
150 | td_wf <- workflows::add_recipe(td_wf, td_recipe)
151 | 
152 | print(td_wf)
153 | # Train the whole workflow on the training data (seeded for reproducibility)
154 | set.seed(123)
155 | td_rf_fit <- fit(
156 |   td_wf,
157 |   data = train_data
158 | )
159 | # Pull the underlying parsnip model fit out of the trained workflow
160 | td_fitted <- extract_fit_parsnip(td_rf_fit)
161 | 
162 | ```
163 | ### Make predictions and evaluate with ConfusionTableR
164 | 
165 | The final step, before deploying this live, would be to make predictions on the test set and then evaluate with the ConfusionTableR package:
166 | 
167 | ```{r make_preds_and_evaluate}
168 | # Score the held-out test set with the fitted workflow
169 | class_pred <- predict(td_rf_fit, test_data)
170 | # Attach the predicted class to the test rows
171 | td_preds <- bind_cols(test_data, class_pred)
172 | # Make sure predictions and truth are both factors
173 | td_preds[[".pred_class"]] <- as.factor(td_preds[[".pred_class"]])
174 | td_preds[["ThryroidClass"]] <- as.factor(td_preds[["ThryroidClass"]])
175 | 
176 | str(td_preds)
177 | 
178 | # Confusion matrix via ConfusionTableR: predictions first, truth second
179 | cm <- binary_class_cm(td_preds[[".pred_class"]],
180 |                       td_preds[["ThryroidClass"]],
181 |                       positive = "sick")
182 | 
183 | 
184 | 
185 | ```
186 | 
187 | Final step is to view the Confusion Matrix and collapse down for storage in a database to model accuracy drift over time:
188 | 
189 | ```{r modelling_preds}
190 | #View Confusion matrix
191 | cm$confusion_matrix
192 | #View record level
193 | cm$record_level_cm
194 | 
195 | ```
196 | 
197 | That is an example of how to model the Thyroid dataset, and random forest ensembles are giving us good estimates of the model performance. The Kappa level is also excellent, meaning that the model has a high likelihood of being good in practice. 
198 | 
199 | ## Diabetes dataset
200 | The diabetes dataset can be loaded from the package with ease also:
201 | 
202 | ```{r diabetes}
203 | glimpse(MLDataR::diabetes_data)
204 | ```
205 | The dataset has a number of variables that are common among people with diabetes; however, some dummy encoding of the Yes / No variables would be needed to make a model work.
206 | 
207 | This is another example of a dataset that you could build an ML model on.
208 | 
209 | ## Heart disease prediction
210 | 
211 | Another dataset in the package is the heart disease dataset. To load and work with this dataset you could use the following:
212 | 
213 | ```{r load_in_heart}
214 | data(heartdisease)
215 | # Convert the HeartDisease outcome column to a factor
216 | hd <- mutate(heartdisease, HeartDisease = as.factor(HeartDisease))
217 | # Confirm the conversion worked
218 | is.factor(hd[["HeartDisease"]])
219 | ```
220 | 
221 | ### Dummy encode the dataset
222 | The [ConfusionTableR](https://CRAN.R-project.org/package=ConfusionTableR) package has a `dummy_encoder` function baked into the package. To code up the dummy variables you could use an approach similar to below:
223 | 
224 | ```{r dummy_encode}
225 | # Collect the character (categorical) columns
226 | hd_cat <- hd %>%
227 |   dplyr::select(where(is.character))
228 | # Names of the categorical variables to dummy encode
229 | cols <- c("RestingECG", "Angina", "Sex")
230 | # Dummy encode using dummy_encoder in the ConfusionTableR package
231 | coded <- ConfusionTableR::dummy_encoder(hd_cat, cols, remove_original = TRUE)
232 | # Keep the selected indicator columns, renaming Angina_Y and Sex_F
233 | coded <- coded %>%
234 |   select(RestingECG_ST, RestingECG_LVH, Angina = Angina_Y, Sex = Sex_F)
235 | # Remove the columns we have encoded from the original data frame
236 | hd_one <- hd[, !names(hd) %in% cols]
237 | # Bind the encoded columns on to the remaining columns
238 | hd_final <- bind_cols(coded, hd_one)
239 | # Output the final encoded data frame for the ML task
240 | glimpse(hd_final)
241 | ```
242 | 
243 | The data is now ready for modelling in the same fashion as we saw with the thyroid dataset.
244 | 
245 | ## Long stayers
246 | This is a dataset for long stay patients and has been created off the back of real NHS data. Load in the data and the required packages:
247 | 
248 | ```{r ls_one}
249 | library(MLDataR)
250 | library(dplyr)
251 | library(ggplot2)
252 | library(caret)
253 | library(rsample)
254 | library(varhandle)
255 | 
256 | data("long_stayers")
257 | glimpse(long_stayers)
258 | 
259 | ```
260 | 
261 | Do some feature engineering on the dataset:
262 | 
263 | ```{r ls_two}
264 | # Make the outcome a factor and drop the admission date
265 | long_stayers <- long_stayers %>%
266 |   dplyr::mutate(stranded.label = factor(stranded.label)) %>%
267 |   dplyr::select(-admit_date)
268 | cats <- long_stayers %>% select_if(is.character)
269 | # Dummy encode the frailty index; new columns carry the "frail_ind" prefix
270 | cat_dummy <- varhandle::to.dummy(cats$frailty_index, "frail_ind")
271 | # Drop the frail_ind.No_index_item column from the encoded frame
272 | cat_dummy <- as.data.frame(cat_dummy)
273 | cat_dummy <- dplyr::select(cat_dummy, -frail_ind.No_index_item)
274 | # Swap the original frailty index for the encoded columns, then drop NA rows
275 | long_stayers <- dplyr::select(long_stayers, -frailty_index)
276 | long_stayers <- bind_cols(long_stayers, cat_dummy)
277 | long_stayers <- na.omit(long_stayers)
278 | ```
279 | 
280 | Then we will split and model the data. This uses the CARET package to do the modelling:
281 | 
282 | ```{r ls_three}
283 | # Seed before the random split so the partition and the fit are reproducible
284 | set.seed(123)
285 | split <- rsample::initial_split(long_stayers, prop = 3/4)
286 | train <- rsample::training(split)
287 | test <- rsample::testing(split)
288 | glm_class_mod <- caret::train(factor(stranded.label) ~ ., data = train,
289 |                               method = "glm")
290 | print(glm_class_mod)
291 | ```
292 | 
293 | Next, we will make predictions on the model:
294 | 
295 | ```{r ls_four}
296 | # Seed before the random split so this chunk gives the same partition and model on every run
297 | set.seed(123)
298 | split <- rsample::initial_split(long_stayers, prop = 3/4)
299 | train <- rsample::training(split)
300 | test <- rsample::testing(split)
301 | glm_class_mod <- caret::train(factor(stranded.label) ~ ., data = train,
302 |                               method = "glm")
303 | print(glm_class_mod)
304 | ```
305 | 
306 | Predicting on the test set to do the evaluation:
307 | 
308 | ```{r ls_five}
309 | # Predicted class labels for the test rows
310 | preds <- predict(glm_class_mod, newdata = test)
311 | # Predicted class probabilities for the same rows
312 | pred_prob <- predict(glm_class_mod, newdata = test, type = "prob")
313 | 
314 | # Attach the predictions to the test data, naming the class column pred_class
315 | predicted <- data.frame(preds, pred_prob)
316 | test <- bind_cols(test, predicted)
317 | test <- dplyr::rename(test, pred_class = preds)
318 | 
319 | glimpse(test)
320 | ```
321 | 
322 | Finally, we can evaluate with the ConfusionTableR package and use the OddsPlotty package to visualise the odds ratios:
323 | 
324 | ```{r ls_six}
325 | library(ConfusionTableR)
326 | # binary_class_cm() takes the predicted labels first, then the ground truth
327 | cm <- ConfusionTableR::binary_class_cm(test$pred_class, test$stranded.label, positive="Stranded")
328 | cm$record_level_cm
329 | library(OddsPlotty)
330 | plotty <- OddsPlotty::odds_plot(glm_class_mod$finalModel,
331 |                                 title = "Odds Plot ",
332 |                                 subtitle = "Showing odds of patient stranded",
333 |                                 point_col = "#00f2ff",
334 |                                 error_bar_colour = "black",
335 |                                 point_size = .5,
336 |                                 error_bar_width = .8,
337 |                                 h_line_color = "red")
338 | print(plotty)
339 | 
340 | ```
341 | 
342 | 
343 | ## What's on the horizon?
344 | If you have a dataset and it is dying to be included in this package please reach out to me [`@StatsGary`](https://twitter.com/StatsGary) and I would be happy to add you to the list of collaborators.
345 | 
346 | I will be aiming to add an additional 30+ datasets to this package. All of which are at various stages of documentation, so the first version of this package will be released with the three core datasets, with more being added each additional version of the package. 
347 | 
348 | Please keep watching the package [GitHub](https://github.com/StatsGary/MLDataR), and make sure you install the latest updates of the package, when they are available. 
349 | 


--------------------------------------------------------------------------------
/vignettes/mldataR.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/vignettes/mldataR.png


--------------------------------------------------------------------------------