├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ └── R-CMD-check.yaml ├── .gitignore ├── CRAN-SUBMISSION ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── MLDataR.Rproj ├── NAMESPACE ├── NEWS.md ├── R ├── PreDiabetes.R ├── care_home_incidents.R ├── csgo.R ├── diabetes_data.R ├── heartdisease.R ├── long_stayers.R ├── stroke_classification.R └── thyroid_disease.R ├── README.md ├── cran-comments.md ├── data ├── PreDiabetes.rda ├── care_home_incidents.rda ├── csgo.rda ├── diabetes_data.rda ├── heartdisease.rda ├── long_stayers.rda ├── stroke_classification.rda └── thyroid_disease.rda ├── man ├── PreDiabetes.Rd ├── care_home_incidents.Rd ├── csgo.Rd ├── diabetes_data.Rd ├── figures │ └── mldataR.png ├── heartdisease.Rd ├── long_stayers.Rd ├── stroke_classification.Rd └── thyroid_disease.Rd └── vignettes ├── .gitignore ├── MLDataR.Rmd └── mldataR.png /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^\.github$ 5 | ^CRAN-RELEASE$ 6 | ^README.*\.md$ 7 | inst/examples/knitr-.*.pdf 8 | inst/examples/child/knitr-.*.pdf 9 | inst/examples/child/knitr-.*\.md 10 | inst/examples/figure 11 | inst/examples/cache 12 | ^\.travis\.yml$ 13 | cran-comments.md 14 | ^doc$ 15 | ^Meta$ 16 | ^CODE_OF_CONDUCT\.md$ 17 | ^CRAN-SUBMISSION$ 18 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # NOTE: This workflow is overkill for most R packages 2 | # check-standard.yaml is likely a better choice 3 | # usethis::use_github_action("check-standard") will install it. 
4 | # 5 | # For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. 6 | # https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions 7 | on: 8 | push: 9 | branches: 10 | - main 11 | - master 12 | pull_request: 13 | branches: 14 | - main 15 | - master 16 | 17 | name: R-CMD-check 18 | 19 | jobs: 20 | R-CMD-check: 21 | runs-on: ${{ matrix.config.os }} 22 | 23 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 24 | 25 | strategy: 26 | fail-fast: false 27 | matrix: 28 | config: 29 | - {os: windows-latest, r: '4.3', rspm: "https://packagemanager.rstudio.com/cran/latest"} 30 | 31 | env: 32 | RSPM: ${{ matrix.config.rspm }} 33 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 34 | 35 | steps: 36 | - uses: actions/checkout@v2 37 | 38 | - uses: r-lib/actions/setup-r@v2 39 | id: install-r 40 | with: 41 | r-version: ${{ matrix.config.r }} 42 | http-user-agent: ${{ matrix.config.http-user-agent }} 43 | 44 | - uses: r-lib/actions/setup-pandoc@v2 45 | 46 | - name: Install pak and query dependencies 47 | run: | 48 | install.packages("pak", repos = "https://r-lib.github.io/p/pak/dev/") 49 | saveRDS(pak::pkg_deps("local::.", dependencies = TRUE), ".github/r-depends.rds") 50 | shell: Rscript {0} 51 | 52 | - name: Restore R package cache 53 | uses: actions/cache@v2 54 | with: 55 | path: | 56 | ${{ env.R_LIBS_USER }}/* 57 | !${{ env.R_LIBS_USER }}/pak 58 | key: ${{ matrix.config.os }}-${{ steps.install-r.outputs.installed-r-version }}-1-${{ hashFiles('.github/r-depends.rds') }} 59 | restore-keys: ${{ matrix.config.os }}-${{ steps.install-r.outputs.installed-r-version }}-1- 60 | 61 | - name: Install system dependencies 62 | if: runner.os == 'Linux' 63 | run: | 64 | pak::local_system_requirements(execute = TRUE) 65 | pak::pkg_system_requirements("rcmdcheck", execute = TRUE) 66 | shell: Rscript {0} 67 | 68 | - name: Install dependencies 69 | run: | 70 | pak::local_install_dev_deps(upgrade = TRUE) 71 | 
pak::pkg_install("rcmdcheck") 72 | shell: Rscript {0} 73 | 74 | - name: Session info 75 | run: | 76 | options(width = 100) 77 | pkgs <- installed.packages()[, "Package"] 78 | sessioninfo::session_info(pkgs, include_base = TRUE) 79 | shell: Rscript {0} 80 | 81 | - name: Check 82 | env: 83 | _R_CHECK_CRAN_INCOMING_: false 84 | run: | 85 | options(crayon.enabled = TRUE) 86 | rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") 87 | shell: Rscript {0} 88 | 89 | - name: Show testthat output 90 | if: always() 91 | run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true 92 | shell: bash 93 | 94 | - name: Upload check results 95 | if: failure() 96 | uses: actions/upload-artifact@main 97 | with: 98 | name: ${{ matrix.config.os }}-r${{ matrix.config.r }}-results 99 | path: check 100 | 101 | - name: Don't use tar from old Rtools to store the cache 102 | if: ${{ runner.os == 'Windows' && startsWith(steps.install-r.outputs.installed-r-version, '3.6' ) }} 103 | shell: bash 104 | run: echo "C:/Program Files/Git/usr/bin" >> $GITHUB_PATH 105 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | .all-contributorsrc 6 | inst/doc 7 | README.md 8 | doc 9 | Meta 10 | /doc/ 11 | /Meta/ 12 | .DS_Store 13 | -------------------------------------------------------------------------------- /CRAN-SUBMISSION: -------------------------------------------------------------------------------- 1 | Version: 1.0.1 2 | Date: 2022-10-03 14:44:53 UTC 3 | SHA: 7e8b5a3e0657d5e0e8293d10071529c01fc0af97 4 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: MLDataR 2 | Type: Package 3 | Title: Collection of Machine Learning Datasets for 
Supervised Machine Learning 4 | Version: 1.0.1 5 | Authors@R: c( 6 | person("Gary", "Hutson", , "hutsons-hacks@outlook.com", c("aut", "cre"), 7 | comment = c(ORCID="0000-0003-3534-6143")), 8 | person("Asif", "Laldin", , "laldin.asif@gmail.com", c("aut")), 9 | person("Isabella", "Velásquez", , "ivelasq@gmail.com", c("aut")) 10 | ) 11 | Maintainer: Gary Hutson 12 | Description: Contains a collection of datasets for working with machine learning tasks. 13 | It will contain datasets for supervised machine learning Jiang (2020) and will include datasets for classification and regression. 14 | The aim of this package is to use data generated around health and other domains. 15 | License: MIT + file LICENSE 16 | Encoding: UTF-8 17 | LazyData: true 18 | BugReports: https://github.com/StatsGary/MLDataR/issues 19 | Imports: 20 | ConfusionTableR, 21 | dplyr, 22 | parsnip, 23 | rsample, 24 | recipes, 25 | workflows, 26 | ranger, caret, varhandle, 27 | OddsPlotty, 28 | ggplot2 29 | RoxygenNote: 7.1.2 30 | Suggests: 31 | rmarkdown, 32 | knitr 33 | VignetteBuilder: knitr 34 | Depends: 35 | R (>= 2.10) 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2021 2 | COPYRIGHT HOLDER: MLDataR authors 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2021 MLDataR authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, 
subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MLDataR.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # NEWS - MLDataR 2 | The news for the package is contained hereunder: 3 | 4 | ## -0.1.1 - Initial build release 5 | Initial data items released are: 6 | - Diabetes data for ML predictive modelling 7 | - Heart Disease data for ML supervised classification 8 | - Thyroid Disease data 
for ML supervised classification 9 | 10 | ## 0.1.2 - Added further datasets 11 | Datasets added: 12 | - Care home incidents dataset - classification dataset to predict care home incidents based on features collected via the care home. Aim is to predict failing care homes, over care homes that meet inspection 13 | - Pre-diabetes - regression model to predict time between pre diabetes and diabetes, based off patient features. 14 | 15 | ## 0.1.3 - Added long_waiters dataset 16 | Added the long waiters dataset for working with NHS inpatients. 17 | 18 | ## 1.0.1 - Fixed issue with patient age in Thyroid dataset 19 | Fixed an issue with the patient age in the Thyroid dataset. 20 | -------------------------------------------------------------------------------- /R/PreDiabetes.R: -------------------------------------------------------------------------------- 1 | #' PreDiabetes dataset 2 | #' @docType data 3 | #' @keywords prediabetes machine learning regression 4 | #' @format A data frame with 3059 rows and 9 variables: 5 | #' \describe{ 6 | #' \item{Age}{age of the patient presenting with diabetes} 7 | #' \item{Sex}{sex of the patient with diabetes} 8 | #' \item{IMD_Decile}{Index of Multiple Deprivation Decile} 9 | #' \item{BMI}{Body Mass Index of patient} 10 | #' \item{Age_PreDiabetes}{age at pre diabetes diagnosis} 11 | #' \item{HbA1C}{average blood glucose mmol/mol} 12 | #' \item{Time_Pre_To_Diabetes}{time in years between pre-diabetes and diabetes diagnosis} 13 | #' \item{Age_Diabetes}{age at diabetes diagnosis} 14 | #' \item{PreDiabetes_Checks_Before_Diabetes}{number of pre-diabetes related primary care appointments before diabetes diagnosis} 15 | #' 16 | #' } 17 | #' @source Generated by Asif Laldin \email{a.laldin@nhs.net}, Jan-2022 18 | #' @examples 19 | #' library(dplyr) 20 | #' data(PreDiabetes) 21 | #' # Convert diabetes data to factor' 22 | #' diabetes_data <- PreDiabetes %>% 23 | #' glimpse() 24 | "PreDiabetes" 25 | 
-------------------------------------------------------------------------------- /R/care_home_incidents.R: -------------------------------------------------------------------------------- 1 | #' Care Home Incidents 2 | #' @description an NHS patient safety incidents dataset (\url{https://www.england.nhs.uk/patient-safety/report-patient-safety-incident/}) that has been synthetically generated against real data 3 | #' @docType data 4 | #' @keywords care home incidents supervised machine learning classification 5 | #' @format A data frame with 1216 rows and 12 variables: 6 | #' \describe{ 7 | #' \item{CareHomeFail}{a binary indicator to specify whether a certain care home is failing} 8 | #' \item{WeightLoss}{aggregation of incidents indicating weight loss in patient} 9 | #' \item{Medication}{aggregation of missed medication incidents} 10 | #' \item{Falls}{Recorded number of patient falls} 11 | #' \item{Choking}{Number of patient choking incidents} 12 | #' \item{UnexpectedDeaths}{unexpected deaths in the care home} 13 | #' \item{Bruising}{Number of bruising incidents in the care home} 14 | #' \item{Absconsion}{Absconding from the care home setting} 15 | #' \item{ResidentAbuseByResident}{Abuse conducted by one care home resident against another} 16 | #' \item{ResidentAbuseByStaff}{Incidents of resident abuse by staff} 17 | #' \item{ResidentAbuseOnStaff}{Incidents of residents abusing staff} 18 | #' \item{Wounds}{Unexplained wounds against staff} 19 | #' } 20 | 21 | #' @source Collected by Gary Hutson \email{hutsons-hacks@outlook.com}, Jan-2022 22 | #' @examples 23 | #' library(dplyr) 24 | #' data(care_home_incidents) 25 | #' # Convert the CareHomeFail label to a factor 26 | #' ch_incs <- care_home_incidents %>% 27 | #' mutate(CareHomeFail = as.factor(CareHomeFail)) 28 | #' ch_incs %>% glimpse() 29 | #' # Check factor 30 | #' factor(ch_incs$CareHomeFail) 31 | "care_home_incidents" 32 | -------------------------------------------------------------------------------- /R/csgo.R: 
-------------------------------------------------------------------------------- 1 | #' csgo 2 | #' @docType data 3 | #' @keywords CounterStrike Global Offensive eSports 4 | #' @format A data frame with 1,133 rows and 17 variables: 5 | #' \describe{ 6 | #' \item{map}{Map on which the match was played} 7 | #' \item{day}{Day of the month} 8 | #' \item{month}{Month of the year} 9 | #' \item{year}{Year} 10 | #' \item{date}{Date of match DD/MM/YYYY} 11 | #' \item{wait_time_s}{Time waited to find match} 12 | #' \item{match_time_s}{Total match length in seconds} 13 | #' \item{team_a_rounds}{Number of rounds played as Team A} 14 | #' \item{team_b_rounds}{Number of rounds played as Team B} 15 | #' \item{ping}{Maximum ping in milliseconds; the signal that's sent from one computer to another on the same network} 16 | #' \item{kills}{Number of kills accumulated in match; max 5 per round} 17 | #' \item{assists}{Number of assists accumulated in a match; an assist is awarded for inflicting more than 50 percent damage on an opponent who is then killed by another player; max 5 per round} 18 | #' \item{deaths}{Number of times player died during match; max 1 per round} 19 | #' \item{mvps}{Most Valuable Player award} 20 | #' \item{hs_percent}{Percentage of kills that were a result from a shot to opponent's head} 21 | #' \item{points}{Number of points accumulated during match. Points are gained from kills, assists, bomb defuses & bomb plants. 
Points are lost for suicide and friendly kills} 22 | #' \item{result}{The result of the match, Win, Loss, Draw} 23 | #'} 24 | #' @source Extracted by Asif Laldin \email{a.laldin@nhs.net}, March-2019 25 | 26 | "csgo" 27 | -------------------------------------------------------------------------------- /R/diabetes_data.R: -------------------------------------------------------------------------------- 1 | #' Diabetes datasets 2 | #' @docType data 3 | #' @keywords diabetes machine learning classification 4 | #' @format A data frame with 520 rows and 17 variables: 5 | #' \describe{ 6 | #' \item{Age}{age of the patient presenting with diabetes} 7 | #' \item{Gender}{gender of the patient with diabetes} 8 | #' \item{ExcessUrination}{if the patient has a history of excessive urination} 9 | #' \item{Polydipsia}{abnormal thirst, accompanied by the excessive intake of water or fluid} 10 | #' \item{WeightLossSudden}{Sudden weight loss that has recently occurred} 11 | #' \item{Fatigue}{Fatigue or weakness} 12 | #' \item{Polyphagia}{excessive or extreme hunger} 13 | #' \item{GenitalThrush}{patient has thrush fungus on or near their genital region} 14 | #' \item{BlurredVision}{history of blurred vision} 15 | #' \item{Itching}{skin itching} 16 | #' \item{Irritability}{general irritability and mood issues} 17 | #' \item{DelayHealing}{delayed healing of wounds} 18 | #' \item{PartialPsoriasis}{partial psoriasis on the body} 19 | #' \item{MuscleStiffness}{stiffness of the muscles} 20 | #' \item{Alopecia}{scalp alopecia and hair shedding} 21 | #' \item{Obesity}{Classified as obese} 22 | #' \item{DiabeticClass}{Class label to indicate whether the patient is diabetic or not} 23 | #' } 24 | #' @source Collected by Gary Hutson \email{hutsons-hacks@outlook.com}, Dec-2021 25 | #' @examples 26 | #' library(dplyr) 27 | #' data(diabetes_data) 28 | #' # Convert the DiabeticClass label to a factor 29 | #' diabetes_data <- diabetes_data %>% 30 | #' glimpse() %>% 31 | #' mutate(DiabeticClass = 
as.factor(DiabeticClass)) 32 | #' is.factor(diabetes_data$DiabeticClass) 33 | "diabetes_data" 34 | -------------------------------------------------------------------------------- /R/heartdisease.R: -------------------------------------------------------------------------------- 1 | #' Heart disease dataset 2 | #' 3 | #' The dataset is to be used with a supervised classification ML model to classify heart disease. 4 | #' @docType data 5 | #' @keywords heart disease heart disease machine learning classification 6 | #' @format A data frame with 918 rows and 10 variables: 7 | #' \describe{ 8 | #' \item{Age}{age of the patient presenting with heart disease} 9 | #' \item{Sex}{gender of the patient} 10 | #' \item{RestingBP}{blood pressure for resting heart beat} 11 | #' \item{Cholesterol}{Cholesterol reading} 12 | #' \item{FastingBS}{blood sample of glucose after a patient fasts \url{https://www.diabetes.co.uk/diabetes_care/fasting-blood-sugar-levels.html}} 13 | #' \item{RestingECG}{Resting echocardiography is an indicator of previous myocardial infarction e.g. 
heart attack} 14 | #' \item{MaxHR}{Maximum heart rate} 15 | #' \item{Angina}{chest pain caused by decreased blood flow \url{https://www.nhs.uk/conditions/angina/}} 16 | #' \item{HeartPeakReading}{reading at the peak of the heart rate} 17 | #' \item{HeartDisease}{the classification label of whether patient has heart disease or not} 18 | #' 19 | #' } 20 | #' @source Collected by Gary Hutson \email{hutsons-hacks@outlook.com}, Dec-2021 21 | #' @examples 22 | #' library(dplyr) 23 | #' library(ConfusionTableR) 24 | #' data(heartdisease) 25 | #' 26 | #' # Convert the HeartDisease label to a factor 27 | #' hd <- heartdisease %>% 28 | #' glimpse() %>% 29 | #' mutate(HeartDisease = as.factor(HeartDisease)) 30 | #' # Check that the label is now a factor 31 | #' is.factor(hd$HeartDisease) 32 | #' # Dummy encoding 33 | #' # Get categorical columns 34 | #' hd_cat <- hd %>% 35 | #' dplyr::select_if(is.character) 36 | 37 | #' # Dummy encode the categorical variables 38 | #' # Specify the columns to encode 39 | #' cols <- c("RestingECG", "Angina", "Sex") 40 | #' # Dummy encode using dummy_encoder in ConfusionTableR package 41 | #' coded <- ConfusionTableR::dummy_encoder(hd_cat, cols, remove_original = TRUE) 42 | #' coded <- coded %>% 43 | #' select(RestingECG_ST, RestingECG_LVH, Angina=Angina_Y, 44 | #' Sex=Sex_F) 45 | #' # Remove column names we have encoded from original data frame 46 | #' hd_one <- hd[,!names(hd) %in% cols] 47 | #' # Bind the numerical data on to the categorical data 48 | #' hd_final <- bind_cols(coded, hd_one) 49 | #' # Output the final encoded data frame for the ML task 50 | #' glimpse(hd_final) 51 | "heartdisease" 52 | -------------------------------------------------------------------------------- /R/long_stayers.R: -------------------------------------------------------------------------------- 1 | #' Long stayers dataset 2 | #' @description classification dataset of long staying patients. 
3 | #' Contains patients who have been registered as an inpatient with a length of stay longer than 7 days \url{https://www.england.nhs.uk/south/wp-content/uploads/sites/6/2016/12/rig-reviewing-stranded-patients-hospital.pdf}. 4 | #' @docType data 5 | #' @keywords long stay patient stranded NHS 6 | #' @format A data frame with 768 rows and 9 variables: 7 | #' \describe{ 8 | #' \item{stranded.label}{binary classification label indicating whether \strong{stranded = 1} or \strong{not stranded=0}} 9 | #' \item{age}{age of the patient} 10 | #' \item{care.home.referral}{flag indicating whether referred from a private care home - \strong{1=Care Home Referral} and \strong{0=Not a care home referral}} 11 | #' \item{medicallysafe}{flag indicating whether they are medically safe for discharge - \strong{1=Medically safe} and \strong{0=Not medically safe}} 12 | #' \item{hcop}{flag indicating health care for older person triage - \strong{1=Yes triaged from HCOP} and \strong{0=Triaged from different department}} 13 | #' \item{mental_health_care}{flag indicating whether they require mental health care - \strong{1=MH assistance needed} and \strong{0=No history of mental health}} 14 | #' \item{periods_of_previous_care}{Count of the number of times they have been in hospital in the last 12 months} 15 | #' \item{admit_date}{date the patient was admitted as an inpatient} 16 | #' \item{frailty_index}{indicates the type of frailty - nominal variable} 17 | #' } 18 | #' @source Prepared, acquired and adapted by Gary Hutson \email{hutsons-hacks@outlook.com}, Dec-2021. Synthetic data, based off live patient data from various NHS secondary health care trusts. 
19 | #' @examples 20 | #' library(dplyr) 21 | #' library(ggplot2) 22 | #' library(caret) 23 | #' library(rsample) 24 | #' library(varhandle) 25 | 26 | #' data("long_stayers") 27 | #' glimpse(long_stayers) 28 | #' # Examine class imbalance 29 | #' prop.table(table(long_stayers$stranded.label)) 30 | #' # Feature engineering 31 | #' long_stayers <- long_stayers %>% 32 | #' dplyr::mutate(stranded.label=factor(stranded.label)) %>% 33 | #' dplyr::select(everything(), -c(admit_date)) 34 | #' # Feature encoding 35 | #' cats <- select_if(long_stayers, is.character) 36 | #' cat_dummy <- varhandle::to.dummy(cats$frailty_index, "frail_ind") 37 | #' #Converts the frailty index column to dummy encoding and sets a column called "frail_ind" prefix 38 | #'cat_dummy <- cat_dummy %>% 39 | #' as.data.frame() %>% 40 | #' dplyr::select(-frail_ind.No_index_item) #Drop the field of interest 41 | #'long_stayers <- long_stayers %>% 42 | #' dplyr::select(-frailty_index) %>% 43 | #' bind_cols(cat_dummy) %>% na.omit(.) 
44 | #' # Split the data 45 | #' split <- rsample::initial_split(long_stayers, prop = 3/4) 46 | #' train <- rsample::training(split) 47 | #' test <- rsample::testing(split) 48 | #' set.seed(123) 49 | #' glm_class_mod <- caret::train(factor(stranded.label) ~ ., data = train, 50 | #' method = "glm") 51 | #' print(glm_class_mod) 52 | #' # Predict the probabilities 53 | #' preds <- predict(glm_class_mod, newdata = test) # Predict class 54 | #' pred_prob <- predict(glm_class_mod, newdata = test, type="prob") #Predict probs 55 | #' 56 | #'predicted <- data.frame(preds, pred_prob) 57 | #' test <- test %>% 58 | #' bind_cols(predicted) %>% 59 | #' dplyr::rename(pred_class=preds) 60 | #' #Evaluate with ConfusionTableR 61 | #' library(ConfusionTableR) 62 | #' cm <- ConfusionTableR::binary_class_cm(test$stranded.label, test$pred_class, positive="Stranded") 63 | #' cm$record_level_cm 64 | #' # Visualise odds ratio 65 | #' library(OddsPlotty) 66 | 67 | #' plotty <- OddsPlotty::odds_plot(glm_class_mod$finalModel, 68 | #' title = "Odds Plot ", 69 | #' subtitle = "Showing odds of patient stranded", 70 | #' point_col = "#00f2ff", 71 | #' error_bar_colour = "black", 72 | #' point_size = .5, 73 | #' error_bar_width = .8, 74 | #' h_line_color = "red") 75 | #' print(plotty) 76 | 77 | "long_stayers" 78 | -------------------------------------------------------------------------------- /R/stroke_classification.R: -------------------------------------------------------------------------------- 1 | #' Stroke Classification dataset 2 | #' @description This dataset has been obtained from a Stroke department within the NHS and is a traditional supervised ML classification dataset 3 | #' @docType data 4 | #' @keywords stroke 5 | #' @format A data frame with 5110 rows and 11 variables: 6 | #' \describe{ 7 | #' \item{pat_id}{unique patient identifier index} 8 | #' \item{stroke}{outcome variable as a flag - 1 for stroke and 0 for no stroke} 9 | #' \item{gender}{patient gender description} 10 | #' 
\item{age}{age of the patient} 11 | #' \item{hypertension}{binary flag to indicate whether patient has hypertension: \url{https://www.nhs.uk/conditions/high-blood-pressure-hypertension/}} 12 | #' \item{heart_disease}{binary flag to indicate whether patient has heart disease: 1 or no heart disease history: 0} 13 | #' \item{work_related_stress}{binary flag to indicate whether patient has history of work related stress} 14 | #' \item{urban_residence}{binary flag indicating whether patient lives in an urban area or not} 15 | #' \item{avg_glucose_level}{average blood glucose readings of the patient} 16 | #' \item{bmi}{body mass index of the patient: \url{https://www.nhs.uk/live-well/healthy-weight/bmi-calculator/}} 17 | #' \item{smokes}{binary flag to indicate if the patient smokes - 1 for current smoker and 0 for smoking cessation} 18 | #' 19 | # 20 | #' } 21 | #' @source Prepared and compiled by Gary Hutson \email{hutsons-hacks@outlook.com}, Apr-2022. 22 | "stroke_classification" 23 | -------------------------------------------------------------------------------- /R/thyroid_disease.R: -------------------------------------------------------------------------------- 1 | #' Thyroid disease dataset 2 | #' @description The dataset is to be used with a supervised classification ML model to classify thyroid disease. 3 | #' The dataset was sourced and adapted from the UCI Machine Learning repository \url{https://archive.ics.uci.edu/ml/index.php}. 
4 | #' @docType data 5 | #' @keywords thyroid disease 6 | #' @format A data frame with 3772 rows and 28 variables: 7 | #' \describe{ 8 | #' \item{ThryroidClass}{binary classification label indicating whether \strong{sick = 1} or \strong{negative=0}} 9 | #' \item{patient_age}{age of the patient} 10 | #' \item{patient_gender}{flag indicating gender of patient - \strong{1=Female} and \strong{0=Male}} 11 | #' \item{presc_thyroxine}{flag to indicate whether thyroxine replacement prescribed \strong{1=Thyroxine prescribed}} 12 | #' \item{queried_why_on_thyroxine}{flag to indicate query has been actioned} 13 | #' \item{presc_anthyroid_meds}{flag to indicate whether anti-thyroid medicine has been prescribed} 14 | #' \item{sick}{flag to indicate sickness due to thyroxine depletion or over activity} 15 | #' \item{pregnant}{flag to indicate whether the patient is pregnant} 16 | #' \item{thyroid_surgery}{flag to indicate whether the patient has had thyroid surgery} 17 | #' \item{radioactive_iodine_therapyI131}{indicates whether patient has had radioactive iodine treatment: \url{https://www.nhs.uk/conditions/thyroid-cancer/treatment/}} 18 | #' \item{query_hypothyroid}{flag to indicate under active thyroid query \url{https://www.nhs.uk/conditions/underactive-thyroid-hypothyroidism/}} 19 | #' \item{query_hyperthyroid}{flag to indicate over active thyroid query \url{https://www.nhs.uk/conditions/overactive-thyroid-hyperthyroidism/}} 20 | #' \item{lithium}{Lithium carbonate administered to decrease the level of thyroid hormones} 21 | #' \item{goitre}{flag to indicate swelling of the thyroid gland \url{https://www.nhs.uk/conditions/goitre/}} 22 | #' \item{tumor}{flag to indicate a tumor} 23 | #' \item{hypopituitarism}{flag to indicate a diagnosed under active thyroid} 24 | #' \item{psych_condition}{indicates whether a patient has a psychological condition} 25 | #' \item{TSH_measured}{a TSH level lower than normal indicates there is usually more than enough thyroid hormone in the body 
and may indicate hyperthyroidism} 26 | #' \item{TSH_reading}{the reading result of the TSH blood test} 27 | #' \item{T3_measured}{linked to TSH reading - when free triiodothyronine rises above normal this indicates hyperthyroidism} 28 | #' \item{T3_reading}{the reading result of the T3 blood test looking for above normal levels of free triiodothyronine} 29 | #' \item{T4_measured}{free thyroxine, also known as T4, is used with T3 and TSH tests to diagnose hyperthyroidism} 30 | #' \item{T4_reading}{the reading result of the T4 test} 31 | #' \item{thyrox_util_rate_T4U_measured}{flag indicating the thyroxine utilisation rate \url{https://pubmed.ncbi.nlm.nih.gov/1685967/}} 32 | #' \item{thyrox_util_rate_T4U_reading}{the result of the test} 33 | #' \item{FTI_measured}{flag to indicate measurement on the Free Thyroxine Index (FTI) \url{https://endocrinology.testcatalog.org/show/FRTUP}} 34 | #' \item{FTI_reading}{the result of the test mentioned above} 35 | #' \item{ref_src}{[nominal] indicating the referral source of the patient} 36 | #' } 37 | #' @source Prepared and adapted by Gary Hutson \email{hutsons-hacks@outlook.com}, Dec-2021 and sourced from Garavan Institute and J. Ross Quinlan. 38 | #' @references Thyroid disease records supplied by the Garavan Institute and J. Ross Quinlan. 
39 | #' @examples 40 | #' library(dplyr) 41 | #' library(ConfusionTableR) 42 | #' library(parsnip) 43 | #' library(rsample) 44 | #' library(recipes) 45 | #' library(ranger) 46 | #' library(workflows) 47 | #' data("thyroid_disease") 48 | #' td <- thyroid_disease 49 | #' # Create a factor of the class label to use in ML model 50 | #' td$ThryroidClass <- as.factor(td$ThryroidClass) 51 | #' # Check the structure of the data to make sure factor has been created 52 | #' str(td) 53 | #' # Remove missing values, or choose more advaced imputation option 54 | #' td <- td[complete.cases(td),] 55 | #' #Drop the column for referral source 56 | #' td <- td %>% 57 | #' dplyr::select(-ref_src) 58 | #' # Analyse class imbalance 59 | #' class_imbalance <- prop.table(table(td$ThryroidClass)) 60 | #' class_imbalance 61 | #' #Divide the data into a training test split 62 | #' set.seed(123) 63 | #' split <- rsample::initial_split(td, prop=3/4) 64 | #' train_data <- rsample::training(split) 65 | #' test_data <- rsample::testing(split) 66 | #' # Create recipe to upsample and normalise 67 | #' set.seed(123) 68 | #'td_recipe <- 69 | #' recipe(ThryroidClass ~ ., data=train_data) %>% 70 | #' step_normalize(all_predictors()) %>% 71 | #' step_zv(all_predictors()) 72 | #' # Instantiate the model 73 | #' set.seed(123) 74 | #' rf_mod <- 75 | #' parsnip::rand_forest() %>% 76 | #' set_engine("ranger") %>% 77 | #' set_mode("classification") 78 | #' # Create the model workflow 79 | #' td_wf <- 80 | #' workflow() %>% 81 | #' workflows::add_model(rf_mod) %>% 82 | #' workflows::add_recipe(td_recipe) 83 | #'# Fit the workflow to our training data 84 | #' set.seed(123) 85 | #' td_rf_fit <- 86 | #' td_wf %>% 87 | #' fit(data = train_data) 88 | #' # Extract the fitted data 89 | #' td_fitted <- td_rf_fit %>% 90 | #' extract_fit_parsnip() 91 | #' # Predict the test set on the training set to see model performance 92 | #' class_pred <- predict(td_rf_fit, test_data) 93 | #' td_preds <- test_data %>% 94 | #' 
bind_cols(class_pred) 95 | #' # Convert both to factors 96 | #' td_preds$.pred_class <- as.factor(td_preds$.pred_class) 97 | #' td_preds$ThryroidClass <- as.factor(td_preds$ThryroidClass) 98 | #' # Evaluate the data with ConfusionTableR 99 | #' cm <- ConfusionTableR::binary_class_cm(td_preds$ThryroidClass , 100 | #' td_preds$.pred_class, 101 | #' positive="sick") 102 | #' #View Confusion matrix 103 | #' cm$confusion_matrix 104 | #' #View record level 105 | #' cm$record_level_cm 106 | "thyroid_disease" 107 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MLDataR 2 | 3 |

4 | 5 | 6 | [![](https://cranlogs.r-pkg.org/badges/MLDataR)](https://cran.r-project.org/package=MLDataR) 7 | [![CRAN status](https://www.r-pkg.org/badges/version/MLDataR)](https://CRAN.R-project.org/package=MLDataR) 8 | ![GitHub last commit](https://img.shields.io/github/last-commit/StatsGary/MLDataR) 9 | ![GitHub Repo stars](https://img.shields.io/github/stars/StatsGary/MLDataR?label=MLDataR%20Stars) 10 | [![Downloads](https://cranlogs.r-pkg.org/badges/grand-total/MLDataR)](https://cran.r-project.org/package=MLDataR) 11 | [![license](https://img.shields.io/github/license/mashape/apistatus.svg)](https://github.com/ald0405/SangerTools/blob/master/LICENSE) 12 | 13 | 14 | A collection of Machine Learning datasets for health care and beyond. 15 | 16 | ## Installing the package from GitHub 17 | 18 | Here, I will use the remotes package to install MLDataR: 19 | 20 | ``` r 21 | # install.packages("remotes") # if not already installed 22 | remotes::install_github("https://github.com/StatsGary/MLDataR") 23 | library(MLDataR) 24 | 25 | ``` 26 | ## Installing the package from CRAN 27 | 28 | To install from CRAN, use the below command: 29 | ``` r 30 | install.packages("MLDataR") 31 | 32 | ``` 33 | 34 | ## Loading the package 35 | 36 | Once installed, load the package with the following: 37 | 38 | ``` r 39 | library(MLDataR) 40 | ``` 41 | 42 | ## Datasets included 43 | 44 | The package currently includes the example datasets listed below, and more are being added over time. The datasets contained in the package are: 45 | 46 | - **Counter Strike Global Offensive** - supervised machine learning regression and classification data set to predict score or match outcome. 47 | - **Diabetes disease prediction** - supervised machine learning classification dataset to enable the prediction of diabetic patients.
48 | - **Diabetes onset prediction** - supervised machine learning regression dataset to enable prediction of the age at which a pre-diabetic will develop diabetes. 49 | - **Failing Care Home classification** - classification supervised machine learning dataset to predict a failing care home by selected Datix incidents. UK Datix service. 50 | - **Heart disease prediction** - supervised machine learning classification dataset to enable the prediction of heart disease using a number of key outcome features. Anonymised from the British Heart Foundation example records. 51 | - **Long stayers prediction** - supervised machine learning classification dataset to enable the prediction of a patient staying in hospital longer than 7 days. Extracted from stranded patients extract and anonymised for training and research purposes. Nottingham University Hospitals. 52 | - **Stroke Classification** - supervised machine learning classification dataset to enable the prediction of a stroke in an unseen patient, using past observations in the training set. 53 | - **Thyroid disease prediction** - supervised machine learning classification dataset to allow for the prediction of thyroid disease utilising historic patient records. Garavan Institute - see references in markdown files supporting package. 54 | 55 | ## Further developments 56 | 57 | More datasets are being added, so look out for the next version of this package. 58 | 59 | ## Closing remarks 60 | 61 | It has been fun putting this package together and I hope you find it useful. If you find any issues using the package, please raise a GitHub issue and I will address it as soon as possible. Thanks and I hope you enjoy using it.
62 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Test environments 2 | * local windows 10, R 4.0.3 3 | * Windows, R devel 2020-09-09 r79174, on Win-builder 4 | * Ubuntu 16.04.6 LTS (on travis-ci), R 4.0.2 5 | 6 | * GitHub actions: 7 | * Mac OS x 10.15.7, R 4.0.3 8 | * Windows Server x64 2019, R 4.0.3 9 | * Windows Server x64 latest, R 3.6.3 10 | * Ubuntu 16.04.07, R-devel 2020-11-27 r79522 11 | * Ubuntu 16.04.07, R 4.0.3 12 | * Ubuntu 16.04.07, R 3.6.3 13 | * Ubuntu 16.04.07, R 3.5.3 14 | * Ubuntu 16.04.07, R 3.4.4 15 | 16 | * r-hub: 17 | * Ubuntu Linux 16.04 LTS, R-release, GCC 18 | * Fedora Linux, R-devel, clang, gfortran 19 | * Windows Server 2008 R2 SP1, R-devel, 32/64 bit 20 | 21 | ## R CMD check results 22 | There were no ERRORs or WARNINGs. 23 | 24 | ## Downstream dependencies 25 | There are currently no downstream dependencies for this package to my knowledge. 
26 | -------------------------------------------------------------------------------- /data/PreDiabetes.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/data/PreDiabetes.rda -------------------------------------------------------------------------------- /data/care_home_incidents.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/data/care_home_incidents.rda -------------------------------------------------------------------------------- /data/csgo.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/data/csgo.rda -------------------------------------------------------------------------------- /data/diabetes_data.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/data/diabetes_data.rda -------------------------------------------------------------------------------- /data/heartdisease.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/data/heartdisease.rda -------------------------------------------------------------------------------- /data/long_stayers.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/data/long_stayers.rda -------------------------------------------------------------------------------- /data/stroke_classification.rda: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/data/stroke_classification.rda -------------------------------------------------------------------------------- /data/thyroid_disease.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/data/thyroid_disease.rda -------------------------------------------------------------------------------- /man/PreDiabetes.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/PreDiabetes.R 3 | \docType{data} 4 | \name{PreDiabetes} 5 | \alias{PreDiabetes} 6 | \title{PreDiabetes dataset} 7 | \format{ 8 | A data frame with 3059 rows and 9 variables: 9 | \describe{ 10 | \item{Age}{age of the patient presenting with diabetes} 11 | \item{Sex}{sex of the patient with diabetes} 12 | \item{IMD_Decile}{Index of Multiple Deprivation Decile} 13 | \item{BMI}{Body Mass Index of patient} 14 | \item{Age_PreDiabetes}{age at pre diabetes diagnosis} 15 | \item{HbA1C}{average blood glucose mmol/mol} 16 | \item{Time_Pre_To_Diabetes}{time in years between pre-diabetes and diabetes diagnosis} 17 | \item{Age_Diabetes}{age at diabetes diagnosis} 18 | \item{PreDiabetes_Checks_Before_Diabetes}{number of pre-diabetes related primary care appointments before diabetes diagnosis} 19 | 20 | } 21 | } 22 | \source{ 23 | Generated by Asif Laldin \email{a.laldin@nhs.net}, Jan-2022 24 | } 25 | \usage{ 26 | PreDiabetes 27 | } 28 | \description{ 29 | PreDiabetes dataset 30 | } 31 | \examples{ 32 | library(dplyr) 33 | data(PreDiabetes) 34 | # Convert diabetes data to factor' 35 | diabetes_data <- PreDiabetes \%>\% 36 | glimpse() 37 | } 38 | \keyword{learning} 39 | \keyword{machine} 40 | \keyword{prediabetes} 
41 | \keyword{regression} 42 | -------------------------------------------------------------------------------- /man/care_home_incidents.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/care_home_incidents.R 3 | \docType{data} 4 | \name{care_home_incidents} 5 | \alias{care_home_incidents} 6 | \title{Care Home Incidents} 7 | \format{ 8 | A data frame with 1216 rows and 12 variables: 9 | \describe{ 10 | \item{CareHomeFail}{a binary indicator to specify whether a certain care home is failing} 11 | \item{WeightLoss}{aggregation of incidents indicating weight loss in patient} 12 | \item{Medication}{medication missed aggregaation} 13 | \item{Falls}{Recorded number of patient falls} 14 | \item{Choking}{Number of patient choking incidents} 15 | \item{UnexpectedDeaths}{unexpected deaths in the care home} 16 | \item{Bruising}{Number of bruising incidents in the care home} 17 | \item{Absconsion}{Absconding from the care home setting} 18 | \item{ResidentAbuseByResident}{Abuse conducted by one care home resident against another} 19 | \item{ResidentAbuseByStaff}{Incidents of resident abuse by staff} 20 | \item{ResidentAbuseOnStaff}{Incidents of residents abusing staff} 21 | \item{Wounds}{Unexplained wounds against staff} 22 | } 23 | } 24 | \source{ 25 | Collected by Gary Hutson \email{hutsons-hacks@outlook.com}, Jan-2022 26 | } 27 | \usage{ 28 | care_home_incidents 29 | } 30 | \description{ 31 | a NHS patient safety incidents dataset: \url{https://www.england.nhs.uk/patient-safety/report-patient-safety-incident/} dataset that has been synthetically generated against real data 32 | } 33 | \examples{ 34 | library(dplyr) 35 | data(care_home_incidents) 36 | # Convert diabetes data to factor' 37 | ch_incs <- care_home_incidents \%>\% 38 | mutate(CareHomeFail = as.factor(CareHomeFail)) 39 | ch_incs \%>\% glimpse() 40 | # Check factor 41 | 
factor(ch_incs$CareHomeFail) 42 | } 43 | \keyword{care} 44 | \keyword{classification} 45 | \keyword{home} 46 | \keyword{incidents} 47 | \keyword{learning} 48 | \keyword{machine} 49 | \keyword{supervised} 50 | -------------------------------------------------------------------------------- /man/csgo.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/csgo.R 3 | \docType{data} 4 | \name{csgo} 5 | \alias{csgo} 6 | \title{csgo} 7 | \format{ 8 | A data frame with 1,133 rows and 17 variables: 9 | \describe{ 10 | \item{map}{Map on which the match was played} 11 | \item{day}{Day of the month} 12 | \item{month}{Month of the year} 13 | \item{year}{Year} 14 | \item{date}{Date of match DD/MM/YYYY} 15 | \item{wait_time_s}{Time waited to find match} 16 | \item{match_time_s}{Total match length in seconds} 17 | \item{team_a_rounds}{Number of rounds played as Team A} 18 | \item{team_b_rounds}{Number of rounds played as Team B} 19 | \item{ping}{Maximum ping in milliseconds;the signal that's sent from one computer to another on the same network} 20 | \item{kills}{Number of kills accumulated in match; max 5 per round} 21 | \item{assists}{Number of assists accumulated in a match,inflicting oppononent with more than 50 percent damage,who is then killed by another player accumulated in match max 5 per round} 22 | \item{deaths}{Number of times player died during match;max 1 per round} 23 | \item{mvps}{Most Valuable Player award} 24 | \item{hs_percent}{Percentage of kills that were a result from a shot to opponent's head} 25 | \item{points}{Number of points accumulated during match. Apoints are gained from kills, assists,bomb defuses & bomb plants. 
Points are lost for sucicide and friendly kills} 26 | \item{result}{The result of the match, Win, Loss, Draw} 27 | } 28 | } 29 | \source{ 30 | Extracted by Asif Laldin \email{a.laldin@nhs.net}, March-2019 31 | } 32 | \usage{ 33 | csgo 34 | } 35 | \description{ 36 | csgo 37 | } 38 | \keyword{CounterStrike} 39 | \keyword{Global} 40 | \keyword{Offensive} 41 | \keyword{eSports} 42 | -------------------------------------------------------------------------------- /man/diabetes_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/diabetes_data.R 3 | \docType{data} 4 | \name{diabetes_data} 5 | \alias{diabetes_data} 6 | \title{Diabetes datasets} 7 | \format{ 8 | A data frame with 520 rows and 17 variables: 9 | \describe{ 10 | \item{Age}{age of the patient presenting with diabetes} 11 | \item{Gender}{gender of the patient with diabetes} 12 | \item{ExcessUrination}{if the patient has a history of excessive urination} 13 | \item{Polydipsia}{abnormal thurst, accompanied by the excessive intake of water or fluid} 14 | \item{WeightLossSudden}{Sudden weight loss that has recently occured} 15 | \item{Fatigue}{Fatigue or weakness} 16 | \item{Polyphagia}{excessive or extreme hunger} 17 | \item{GenitalThrush}{patient has thrush fungus on or near their genital region} 18 | \item{BlurredVision}{history of blurred vision} 19 | \item{Itching}{skin itching} 20 | \item{Irritability}{general irritability and mood issues} 21 | \item{DelayHealing}{delayed healing of wounds} 22 | \item{PartialPsoriasis}{partial psoriasis on the body} 23 | \item{MuscleStiffness}{stiffness of the muscles} 24 | \item{Alopecia}{scalp alopecia and hair shedding} 25 | \item{Obesity}{Classified as obese} 26 | \item{DiabeticClass}{Class label to indicate whether the patient is diabetic or not} 27 | } 28 | } 29 | \source{ 30 | Collected by Gary Hutson \email{hutsons-hacks@outlook.com}, Dec-2021 31 
| } 32 | \usage{ 33 | diabetes_data 34 | } 35 | \description{ 36 | Diabetes datasets 37 | } 38 | \examples{ 39 | library(dplyr) 40 | data(diabetes_data) 41 | # Convert diabetes data to factor' 42 | diabetes_data <- diabetes_data \%>\% 43 | glimpse() \%>\% 44 | mutate(DiabeticClass = as.factor(DiabeticClass)) 45 | is.factor(diabetes_data$DiabeticClass) 46 | } 47 | \keyword{classification} 48 | \keyword{diabetes} 49 | \keyword{learning} 50 | \keyword{machine} 51 | -------------------------------------------------------------------------------- /man/figures/mldataR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/man/figures/mldataR.png -------------------------------------------------------------------------------- /man/heartdisease.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/heartdisease.R 3 | \docType{data} 4 | \name{heartdisease} 5 | \alias{heartdisease} 6 | \title{Heart disease dataset} 7 | \format{ 8 | A data frame with 918 rows and 10 variables: 9 | \describe{ 10 | \item{Age}{age of the patient presenting with heart disease} 11 | \item{Sex}{gender of the patient} 12 | \item{RestingBP}{blood pressure for resting heart beat} 13 | \item{Cholesterol}{Cholesterol reading} 14 | \item{FastingBS}{blood sample of glucose after a patient fasts \url{https://www.diabetes.co.uk/diabetes_care/fasting-blood-sugar-levels.html}} 15 | \item{RestingECG}{Resting echocardiography is an indicator of previous myocardial infarction e.g. 
heart attack} 16 | \item{MaxHR}{Maximum heart rate} 17 | \item{Angina}{chest pain caused by decreased flood flow \url{https://www.nhs.uk/conditions/angina/}} 18 | \item{HeartPeakReading}{reading at the peak of the heart rate} 19 | \item{HeartDisease}{the classification label of whether patient has heart disease or not} 20 | 21 | } 22 | } 23 | \source{ 24 | Collected by Gary Hutson \email{hutsons-hacks@outlook.com}, Dec-2021 25 | } 26 | \usage{ 27 | heartdisease 28 | } 29 | \description{ 30 | The dataset is to be used with a supervised classification ML model to classify heart disease. 31 | } 32 | \examples{ 33 | library(dplyr) 34 | library(ConfusionTableR) 35 | data(heartdisease) 36 | 37 | # Convert diabetes data to factor' 38 | hd <- heartdisease \%>\% 39 | glimpse() \%>\% 40 | mutate(HeartDisease = as.factor(HeartDisease)) 41 | # Check that the label is now a factor 42 | is.factor(hd$HeartDisease) 43 | # Dummy encoding 44 | # Get categorical columns 45 | hd_cat <- hd \%>\% 46 | dplyr::select_if(is.character) 47 | # Dummy encode the categorical variables 48 | # Specify the columns to encode 49 | cols <- c("RestingECG", "Angina", "Sex") 50 | # Dummy encode using dummy_encoder in ConfusionTableR package 51 | coded <- ConfusionTableR::dummy_encoder(hd_cat, cols, remove_original = TRUE) 52 | coded <- coded \%>\% 53 | select(RestingECG_ST, RestingECG_LVH, Angina=Angina_Y, 54 | Sex=Sex_F) 55 | # Remove column names we have encoded from original data frame 56 | hd_one <- hd[,!names(hd) \%in\% cols] 57 | # Bind the numerical data on to the categorical data 58 | hd_final <- bind_cols(coded, hd_one) 59 | # Output the final encoded data frame for the ML task 60 | glimpse(hd_final) 61 | } 62 | \keyword{classification} 63 | \keyword{disease} 64 | \keyword{heart} 65 | \keyword{learning} 66 | \keyword{machine} 67 | -------------------------------------------------------------------------------- /man/long_stayers.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/long_stayers.R 3 | \docType{data} 4 | \name{long_stayers} 5 | \alias{long_stayers} 6 | \title{Long stayers dataset} 7 | \format{ 8 | A data frame with 768 rows and 9 variables: 9 | \describe{ 10 | \item{stranded.label}{binary classification label indicating whether \strong{stranded = 1} or \strong{not stranded=0}} 11 | \item{age}{age of the patient} 12 | \item{care.home.referral}{flag indicating whether referred from a private care home - \strong{1=Care Home Referral} and \strong{0=Not a care home referral}} 13 | \item{medicallysafe}{flag indicating whether they are medically safe for discharge - \strong{1=Medically safe} and \strong{0=Not medically safe}} 14 | \item{hcop}{flag indicating health care for older person triage - \strong{1=Yes triaged from HCOP} and \strong{0=Triaged from different department}} 15 | \item{mental_health_care}{flag indicating whether they require mental health care - \strong{1=MH assistance needed} and \strong{0=No history of mental health}} 16 | \item{periods_of_previous_care}{Count of the number of times they have been in hospital in last 12 months} 17 | \item{admit_date}{date the patient was admitted as an inpatient} 18 | \item{frailty_index}{indicates the type of frailty - nominal variable} 19 | } 20 | } 21 | \source{ 22 | Prepared, acquired and adatped by Gary Hutson \email{hutsons-hacks@outlook.com}, Dec-2021. Synthetic data, based off live patient data from various NHS secondary health care trusts. 23 | } 24 | \usage{ 25 | long_stayers 26 | } 27 | \description{ 28 | classification dataset of long staying patients. 29 | Contains patients who have been registered as an inpatient for longer than 7 days length of stay \url{https://www.england.nhs.uk/south/wp-content/uploads/sites/6/2016/12/rig-reviewing-stranded-patients-hospital.pdf}. 
30 | } 31 | \examples{ 32 | library(dplyr) 33 | library(ggplot2) 34 | library(caret) 35 | library(rsample) 36 | library(varhandle) 37 | data("long_stayers") 38 | glimpse(long_stayers) 39 | # Examine class imbalance 40 | prop.table(table(long_stayers$stranded.label)) 41 | # Feature engineering 42 | long_stayers <- long_stayers \%>\% 43 | dplyr::mutate(stranded.label=factor(stranded.label)) \%>\% 44 | dplyr::select(everything(), -c(admit_date)) 45 | # Feature encoding 46 | cats <- select_if(long_stayers, is.character) 47 | cat_dummy <- varhandle::to.dummy(cats$frailty_index, "frail_ind") 48 | #Converts the frailty index column to dummy encoding and sets a column called "frail_ind" prefix 49 | cat_dummy <- cat_dummy \%>\% 50 | as.data.frame() \%>\% 51 | dplyr::select(-frail_ind.No_index_item) #Drop the field of interest 52 | long_stayers <- long_stayers \%>\% 53 | dplyr::select(-frailty_index) \%>\% 54 | bind_cols(cat_dummy) \%>\% na.omit(.) 55 | # Split the data 56 | split <- rsample::initial_split(long_stayers, prop = 3/4) 57 | train <- rsample::training(split) 58 | test <- rsample::testing(split) 59 | set.seed(123) 60 | glm_class_mod <- caret::train(factor(stranded.label) ~ ., data = train, 61 | method = "glm") 62 | print(glm_class_mod) 63 | # Predict the probabilities 64 | preds <- predict(glm_class_mod, newdata = test) # Predict class 65 | pred_prob <- predict(glm_class_mod, newdata = test, type="prob") #Predict probs 66 | 67 | predicted <- data.frame(preds, pred_prob) 68 | test <- test \%>\% 69 | bind_cols(predicted) \%>\% 70 | dplyr::rename(pred_class=preds) 71 | #Evaluate with ConfusionTableR 72 | library(ConfusionTableR) 73 | cm <- ConfusionTableR::binary_class_cm(test$stranded.label, test$pred_class, positive="Stranded") 74 | cm$record_level_cm 75 | # Visualise odds ration 76 | library(OddsPlotty) 77 | plotty <- OddsPlotty::odds_plot(glm_class_mod$finalModel, 78 | title = "Odds Plot ", 79 | subtitle = "Showing odds of patient stranded", 80 | point_col = 
"#00f2ff", 81 | error_bar_colour = "black", 82 | point_size = .5, 83 | error_bar_width = .8, 84 | h_line_color = "red") 85 | print(plotty) 86 | } 87 | \keyword{NHS} 88 | \keyword{long} 89 | \keyword{patient} 90 | \keyword{stay} 91 | \keyword{stranded} 92 | -------------------------------------------------------------------------------- /man/stroke_classification.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stroke_classification.R 3 | \docType{data} 4 | \name{stroke_classification} 5 | \alias{stroke_classification} 6 | \title{Stroke Classification dataset} 7 | \format{ 8 | A data frame with 5110 rows and 11 variables: 9 | \describe{ 10 | \item{pat_id}{unique patient identifier index} 11 | \item{stroke}{outcome variable as a flag - 1 for stroke and 0 for no stroke} 12 | \item{gender}{patient gender description} 13 | \item{age}{age of the patient} 14 | \item{hypertension}{binary flag to indicate whether patient has hypertension: \url{https://www.nhs.uk/conditions/high-blood-pressure-hypertension/}} 15 | \item{heart_disease}{binary flag to indicate whether patient has heart disease: 1 or no heart disease history: 0} 16 | \item{work_related_stress}{binary flag to indicate whether patient has history of work related stress} 17 | \item{urban_residence}{binary flag indicating whether patient lives in an urban area or not} 18 | \item{avg_glucose_level}{average blood glucose readings of the patient} 19 | \item{bmi}{body mass index of the patient: \url{https://www.nhs.uk/live-well/healthy-weight/bmi-calculator/}} 20 | \item{smokes}{binary flag to indicate if the patient smokes - 1 for current smoker and 0 for smoking cessation} 21 | 22 | } 23 | } 24 | \source{ 25 | Prepared and compiled by Gary Hutson \email{hutsons-hacks@outlook.com}, Apr-2022. 
26 | } 27 | \usage{ 28 | stroke_classification 29 | } 30 | \description{ 31 | This dataset has been obtained from a Stoke department within the NHS and is a traditional supervised ML classification dataset 32 | } 33 | \keyword{stoke} 34 | -------------------------------------------------------------------------------- /man/thyroid_disease.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/thyroid_disease.R 3 | \docType{data} 4 | \name{thyroid_disease} 5 | \alias{thyroid_disease} 6 | \title{Thyroid disease dataset} 7 | \format{ 8 | A data frame with 3772 rows and 28 variables: 9 | \describe{ 10 | \item{ThryroidClass}{binary classification label indicating whether \strong{sick = 1} or \strong{negative=0}} 11 | \item{patient_age}{age of the patient} 12 | \item{patient_gender}{flag indicating gender of patient - \strong{1=Female} and \strong{0=Male}} 13 | \item{presc_thyroxine}{flag to indicate whether thyroxine replacement prescribed \strong{1=Thyroxine prescribed}} 14 | \item{queried_why_on_thyroxine}{flag to indicate query has been actioned} 15 | \item{presc_anthyroid_meds}{flag to indicate whether anti-thyroid medicine has been prescribed} 16 | \item{sick}{flag to indicate sickness due to thyroxine depletion or over activity} 17 | \item{pregnant}{flag to indicate whether the patient is pregnant} 18 | \item{thyroid_surgery}{flag to indicate whether the patient has had thyroid surgery} 19 | \item{radioactive_iodine_therapyI131}{indicates whether patient has had radioactive iodine treatment: \url{https://www.nhs.uk/conditions/thyroid-cancer/treatment/}} 20 | \item{query_hypothyroid}{flag to indicate under active thyroid query \url{https://www.nhs.uk/conditions/underactive-thyroid-hypothyroidism/}} 21 | \item{query_hyperthyroid}{flag to indicate over active thyroid query \url{https://www.nhs.uk/conditions/overactive-thyroid-hyperthyroidism/}} 22 | 
\item{lithium}{Lithium carbonate administered to decrease the level of thyroid hormones} 23 | \item{goitre}{flag to indicate swelling of the thyroid gland \url{https://www.nhs.uk/conditions/goitre/}} 24 | \item{tumor}{flag to indicate a tumor} 25 | \item{hypopituitarism}{flag to indicate a diagnosed under active thyroid} 26 | \item{psych_condition}{indicates whether a patient has a psychological condition} 27 | \item{TSH_measured}{a TSH level lower than normal indicates there is usually more than enough thyroid hormone in the body and may indicate hyperthyroidism} 28 | \item{TSH_reading}{the reading result of the TSH blood test} 29 | \item{T3_measured}{linked to TSH reading - when free triiodothyronine rise above normal this indicates hyperthyroidism} 30 | \item{T3_reading}{the reading result of the T3 blood test looking for above normal levels of free triiodothyronine} 31 | \item{T4_measured}{free thyroxine, also known as T4, is used with T3 and TSH tests to diagnose hyperthyroidism} 32 | \item{T4_reading}{the reading result of th T4 test} 33 | \item{thyrox_util_rate_T4U_measured}{flag indicating the thyroxine utilisation rate \url{https://pubmed.ncbi.nlm.nih.gov/1685967/}} 34 | \item{thyrox_util_rate_T4U_reading}{the result of the test} 35 | \item{FTI_measured}{flag to indicate measurement on the Free Thyroxine Index (FTI)\url{https://endocrinology.testcatalog.org/show/FRTUP}} 36 | \item{FTI_reading}{the result of the test mentioned above} 37 | \item{ref_src}{[nominal] indicating the referral source of the patient} 38 | } 39 | } 40 | \source{ 41 | Prepared and adatped by Gary Hutson \email{hutsons-hacks@outlook.com}, Dec-2021 and sourced from Garavan Institute and J. Ross Quinlan. 42 | } 43 | \usage{ 44 | thyroid_disease 45 | } 46 | \description{ 47 | The dataset is to be used with a supervised classification ML model to classify thyroid disease. 
48 | The dataset was sourced and adapted from the UCI Machine Learning repository \url{https://archive.ics.uci.edu/ml/index.php}. 49 | } 50 | \examples{ 51 | library(dplyr) 52 | library(ConfusionTableR) 53 | library(parsnip) 54 | library(rsample) 55 | library(recipes) 56 | library(ranger) 57 | library(workflows) 58 | data("thyroid_disease") 59 | td <- thyroid_disease 60 | # Create a factor of the class label to use in ML model 61 | td$ThryroidClass <- as.factor(td$ThryroidClass) 62 | # Check the structure of the data to make sure factor has been created 63 | str(td) 64 | # Remove missing values, or choose more advaced imputation option 65 | td <- td[complete.cases(td),] 66 | #Drop the column for referral source 67 | td <- td \%>\% 68 | dplyr::select(-ref_src) 69 | # Analyse class imbalance 70 | class_imbalance <- prop.table(table(td$ThryroidClass)) 71 | class_imbalance 72 | #Divide the data into a training test split 73 | set.seed(123) 74 | split <- rsample::initial_split(td, prop=3/4) 75 | train_data <- rsample::training(split) 76 | test_data <- rsample::testing(split) 77 | # Create recipe to upsample and normalise 78 | set.seed(123) 79 | td_recipe <- 80 | recipe(ThryroidClass ~ ., data=train_data) \%>\% 81 | step_normalize(all_predictors()) \%>\% 82 | step_zv(all_predictors()) 83 | # Instantiate the model 84 | set.seed(123) 85 | rf_mod <- 86 | parsnip::rand_forest() \%>\% 87 | set_engine("ranger") \%>\% 88 | set_mode("classification") 89 | # Create the model workflow 90 | td_wf <- 91 | workflow() \%>\% 92 | workflows::add_model(rf_mod) \%>\% 93 | workflows::add_recipe(td_recipe) 94 | # Fit the workflow to our training data 95 | set.seed(123) 96 | td_rf_fit <- 97 | td_wf \%>\% 98 | fit(data = train_data) 99 | # Extract the fitted data 100 | td_fitted <- td_rf_fit \%>\% 101 | extract_fit_parsnip() 102 | # Predict the test set on the training set to see model performance 103 | class_pred <- predict(td_rf_fit, test_data) 104 | td_preds <- test_data \%>\% 105 | 
bind_cols(class_pred) 106 | # Convert both to factors 107 | td_preds$.pred_class <- as.factor(td_preds$.pred_class) 108 | td_preds$ThryroidClass <- as.factor(td_preds$ThryroidClass) 109 | # Evaluate the data with ConfusionTableR 110 | cm <- ConfusionTableR::binary_class_cm(td_preds$ThryroidClass , 111 | td_preds$.pred_class, 112 | positive="sick") 113 | #View Confusion matrix 114 | cm$confusion_matrix 115 | #View record level 116 | cm$record_level_cm 117 | } 118 | \references{ 119 | Thyroid disease records supplied by the Garavan Institute and J. Ross Quinlan. 120 | } 121 | \keyword{disease} 122 | \keyword{thyroid} 123 | -------------------------------------------------------------------------------- /vignettes/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.R 3 | -------------------------------------------------------------------------------- /vignettes/MLDataR.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "MLDataR - A Package for ML datasets" 3 | output: rmarkdown::html_vignette 4 | vignette: > 5 | %\VignetteIndexEntry{MLDataR} 6 | %\VignetteEngine{knitr::rmarkdown} --> %\VignetteEngine{rmarkdown::render} 7 | %\VignetteEncoding{UTF-8} 8 | --- 9 | 10 | ```{r, include = FALSE} 11 | knitr::opts_chunk$set( 12 | collapse = TRUE, 13 | comment = "#>", 14 | fig.height= 5, 15 | fig.width=7 16 | ) 17 | ``` 18 | 19 |

20 | 21 | ```{r setup, include = FALSE, echo=FALSE} 22 | library(MLDataR) 23 | library(dplyr) 24 | library(ConfusionTableR) 25 | library(parsnip) 26 | library(rsample) 27 | library(recipes) 28 | library(ranger) 29 | library(workflows) 30 | library(caret) 31 | 32 | ``` 33 | 34 | 35 | ## Installing the MLDataR package 36 | To install the package use the below instructions: 37 | 38 | ```{r install_MLDataR} 39 | # install.packages("MLDataR") 40 | library(MLDataR) 41 | 42 | ``` 43 | 44 | ## What datasets are included 45 | 46 | The current list of datasets is: 47 | 48 | - **Diabetes disease prediction** - supervised machine learning classification dataset to enable the prediction of diabetic patients 49 | - **Diabetes onset prediction** - supervised machine learning regression dataset to enable prediction of the age at which a pre-diabetic will develop diabetes 50 | - **Heart disease prediction** - supervised machine learning classification dataset to enable the prediction of heart disease using a number of key outcome features 51 | - **Long stayers prediction** - supervised machine learning classification dataset to enable the prediction of a patient staying in hospital longer than 7 days. 52 | - **Thyroid disease prediction** - supervised machine learning classification dataset to allow for the prediction of thyroid disease utilising historic patient records 53 | - **Failing Care Home classification** - classification supervised machine learning dataset to predict a failing care home by selected Datix incidents 54 | - **Counter Strike Global Offensive** - supervised machine learning regression and classification data set to predict score or match outcome. 55 | 56 | More and more data sets are being added, and it is my mission to have more than 50 example datasets by the end of 2022.
57 | 58 | ## Thyroid Disease dataset 59 | 60 | I will first work with the Thyroid disease dataset and inspect the variables in the data: 61 | 62 | ```{r thyroid_data} 63 | 64 | glimpse(MLDataR::thyroid_disease) 65 | 66 | ``` 67 | 68 | As you can see this dataset has 28 columns and 3,772 rows. The help file for the dataset documents what each one of the variables means. The next task is to use this dataset to create an ML model in TidyModels. 69 | 70 | ## Create TidyModels recipe to model the thyroid dataset 71 | 72 | This will show how to create and implement the dataset in TidyModels for a supervised ML classification task. 73 | 74 | ### Data preparation 75 | 76 | The first step will be to do the data preparation steps: 77 | 78 | ```{r data_prep} 79 | data("thyroid_disease") 80 | td <- thyroid_disease 81 | # Create a factor of the class label to use in ML model 82 | td$ThryroidClass <- as.factor(td$ThryroidClass) 83 | # Check the structure of the data to make sure factor has been created 84 | str(td) 85 | ``` 86 | 87 | Next I will remove the rows with missing values. You could try an imputation method such as MICE here instead; however, for speed of development and of building the vignette, I will leave this for you to look into: 88 | 89 | ```{r remove_nulls} 90 | # Remove missing values, or choose more advanced imputation option 91 | td <- td[complete.cases(td),] 92 | #Drop the column for referral source 93 | td <- td %>% 94 | dplyr::select(-ref_src) 95 | 96 | ``` 97 | 98 | ### Split the data 99 | 100 | Next I will partition the data into a training and testing split, so I can evaluate how well the model performs on the testing set: 101 | 102 | ```{r splitting} 103 | #Divide the data into a training test split 104 | set.seed(123) 105 | split <- rsample::initial_split(td, prop=3/4) 106 | train_data <- rsample::training(split) 107 | test_data <- rsample::testing(split) 108 | 109 | ``` 110 | 111 | ### Create a recipe with preprocessing steps 112 | 113 | After I have split the
data it is time to prepare a recipe for the preprocessing steps; here I will use the recipes package: 114 | 115 | 116 | ```{r create_recipe} 117 | td_recipe <- 118 | recipe(ThryroidClass ~ ., data=train_data) %>% 119 | step_zv(all_predictors()) %>% # drop zero variance columns first so scaling never divides by a zero standard deviation 120 | step_normalize(all_numeric_predictors()) # centre and scale the numeric predictors only 121 | 122 | print(td_recipe) 123 | ``` 124 | 125 | This recipe models the outcome variable `ThryroidClass` against all of the other columns. It first removes any zero variance predictors and then uses a normalise step to centre and scale all the numerical predictor variables. 126 | 127 | ### Getting modelling with Parsnip 128 | 129 | We come to the modelling step of the exercise. Here I will instantiate a random forest model for the modelling task at hand: 130 | 131 | 132 | ```{r random_forest_model} 133 | set.seed(123) 134 | rf_mod <- 135 | parsnip::rand_forest() %>% 136 | set_engine("ranger") %>% 137 | set_mode("classification") 138 | 139 | 140 | ``` 141 | 142 | ### Create the model workflow 143 | 144 | [Tidymodels](https://www.tidymodels.org/) uses the concept of workflows to stitch the ML pipeline together, so I will now create the workflow and then fit the model: 145 | 146 | ```{r creating_workflow} 147 | td_wf <- 148 | workflow() %>% 149 | workflows::add_model(rf_mod) %>% 150 | workflows::add_recipe(td_recipe) 151 | 152 | print(td_wf) 153 | # Fit the workflow to our training data 154 | set.seed(123) 155 | td_rf_fit <- 156 | td_wf %>% 157 | fit(data = train_data) 158 | # Extract the fitted parsnip model from the workflow 159 | td_fitted <- td_rf_fit %>% 160 | extract_fit_parsnip() 161 | 162 | ``` 163 | ### Make predictions and evaluate with ConfusionTableR 164 | 165 | The final step, before deploying this live, would be to make predictions on the test set and then evaluate with the ConfusionTableR package: 166 | 167 | ```{r make_preds_and_evaluate} 168 | # Predict on the test set with the workflow fitted on the training data to see model performance 169 | class_pred <- predict(td_rf_fit, test_data) 170 | td_preds <- test_data %>% 171 | bind_cols(class_pred)
172 | # Convert both to factors 173 | td_preds$.pred_class <- as.factor(td_preds$.pred_class) 174 | td_preds$ThryroidClass <- as.factor(td_preds$ThryroidClass) 175 | 176 | str(td_preds) 177 | 178 | # Evaluate the data with ConfusionTableR 179 | cm <- binary_class_cm(td_preds$.pred_class, 180 | td_preds$ThryroidClass, 181 | positive="sick") 182 | 183 | 184 | 185 | ``` 186 | 187 | The final step is to view the confusion matrix and the record level output, which is collapsed down for storage in a database so that model accuracy drift can be tracked over time: 188 | 189 | ```{r modelling_preds} 190 | #View Confusion matrix 191 | cm$confusion_matrix 192 | #View record level 193 | cm$record_level_cm 194 | 195 | ``` 196 | 197 | That is an example of how to model the Thyroid dataset, and random forest ensembles are giving us good estimates of the model performance. The Kappa level is also excellent, meaning that the model has a high likelihood of being good in practice. 198 | 199 | ## Diabetes dataset 200 | The diabetes dataset can also be loaded from the package with ease: 201 | 202 | ```{r diabetes} 203 | glimpse(MLDataR::diabetes_data) 204 | ``` 205 | The dataset has a number of variables that are common among people with diabetes; however, some dummy encoding of the Yes / No variables would be needed to make a model work. 206 | 207 | This is another example of a dataset that you could build an ML model on. 208 | 209 | ## Heart disease prediction 210 | 211 | The next dataset in the package is the heart disease dataset. To load and work with this dataset you could use the following: 212 | 213 | ```{r load_in_heart} 214 | data(heartdisease) 215 | # Convert the HeartDisease outcome to a factor 216 | hd <- heartdisease %>% 217 | mutate(HeartDisease = as.factor(HeartDisease)) 218 | is.factor(hd$HeartDisease) 219 | ``` 220 | 221 | ### Dummy encode the dataset 222 | The [ConfusionTableR](https://CRAN.R-project.org/package=ConfusionTableR) package has a `dummy_encoder` function baked into the package.
To code up the dummy variables you could use an approach similar to below: 223 | 224 | ```{r dummy_encode} 225 | # Keep only the character (categorical) columns 226 | hd_cat <- hd %>% 227 | dplyr::select_if(is.character) 228 | # Names of the categorical columns to dummy encode 229 | cols <- c("RestingECG", "Angina", "Sex") 230 | # Dummy encode using dummy_encoder in ConfusionTableR package 231 | coded <- ConfusionTableR::dummy_encoder(hd_cat, cols, remove_original = TRUE) 232 | coded <- coded %>% 233 | select(RestingECG_ST, RestingECG_LVH, Angina=Angina_Y, 234 | Sex=Sex_F) # keep four of the dummy columns, renaming Angina_Y and Sex_F 235 | # Remove the columns we have encoded from the original data frame 236 | hd_one <- hd[,!names(hd) %in% cols] 237 | # Bind the encoded columns on to the remaining columns of the original data 238 | hd_final <- bind_cols(coded, hd_one) 239 | # Output the final encoded data frame for the ML task 240 | glimpse(hd_final) 241 | ``` 242 | 243 | The data is now ready for modelling in the same fashion as we saw with the thyroid dataset. 244 | 245 | ## Long stayers 246 | This is a dataset for long stay patients and has been created off the back of real NHS data.
Load in the data and the required packages: 247 | 248 | ```{r ls_one} 249 | library(MLDataR) 250 | library(dplyr) 251 | library(ggplot2) 252 | library(caret) 253 | library(rsample) 254 | library(varhandle) 255 | 256 | data("long_stayers") 257 | glimpse(long_stayers) 258 | 259 | ``` 260 | 261 | Do some feature engineering on the dataset: 262 | 263 | ```{r ls_two} 264 | long_stayers <- long_stayers %>% 265 | dplyr::mutate(stranded.label=factor(stranded.label)) %>% 266 | dplyr::select(everything(), -c(admit_date)) 267 | 268 | cats <- select_if(long_stayers, is.character) 269 | cat_dummy <- varhandle::to.dummy(cats$frailty_index, "frail_ind") 270 | #Converts the frailty index column to dummy encoding, prefixing each new column name with "frail_ind" 271 | cat_dummy <- cat_dummy %>% 272 | as.data.frame() %>% 273 | dplyr::select(-frail_ind.No_index_item) #Drop the No_index_item dummy column (presumably so it acts as the baseline level; confirm) 274 | # Drop the frailty index from the long stayers data frame, bind on the new dummy encoded columns and remove rows with missing values 275 | long_stayers <- long_stayers %>% 276 | dplyr::select(-frailty_index) %>% 277 | bind_cols(cat_dummy) %>% na.omit(.) 278 | ``` 279 | 280 | Then we will split and model the data.
This uses the caret package to do the modelling: 281 | 282 | ```{r ls_three} 283 | set.seed(123) # Seed before splitting so the train/test partition is reproducible 284 | split <- rsample::initial_split(long_stayers, prop = 3/4) 285 | train <- rsample::training(split) 286 | test <- rsample::testing(split) 287 | 288 | glm_class_mod <- caret::train(factor(stranded.label) ~ ., data = train, 289 | method = "glm") 290 | print(glm_class_mod) 291 | ``` 292 | 306 | Predicting on the test set to do the evaluation: 307 | 308 | ```{r ls_five} 309 | preds <- predict(glm_class_mod, newdata = test) # Predict class 310 | pred_prob <- predict(glm_class_mod, newdata = test, type="prob") #Predict probs 311 | 312 | # Join prediction on to actual test data frame and evaluate in confusion matrix 313 | 314 | predicted <- data.frame(preds, pred_prob) 315 | test <- test %>% 316 | bind_cols(predicted) %>% 317 | dplyr::rename(pred_class=preds) 318 | 319 | glimpse(test) 320 | ``` 321 | 322 | Finally, we can evaluate with the ConfusionTableR package and use the OddsPlotty package to visualise the odds ratios: 323 | 324 | ```{r ls_six} 325 | library(ConfusionTableR) 326 | cm <- ConfusionTableR::binary_class_cm(test$pred_class, test$stranded.label, positive="Stranded") # predicted labels first, ground truth second 327 | cm$record_level_cm 328 | 329 | library(OddsPlotty) 330 | plotty <- OddsPlotty::odds_plot(glm_class_mod$finalModel, 331 | title = "Odds Plot ", 332 | subtitle = "Showing odds of patient stranded", 333 | point_col = "#00f2ff", 334 | error_bar_colour = "black", 335 | point_size = .5, 336 | error_bar_width = .8, 337 | h_line_color = "red") 338 | print(plotty) 339 | 340 | ``` 341 | 342 | 343 | ## What's on the
horizon? 344 | If you have a dataset that is dying to be included in this package, please reach out to me [`@StatsGary`](https://twitter.com/StatsGary) and I would be happy to add you to the list of collaborators. 345 | 346 | I will be aiming to add an additional 30+ datasets to this package, all of which are at various stages of documentation, so the first version of this package will be released with the three core datasets, with more being added in each subsequent version of the package. 347 | 348 | Please keep watching the package [GitHub](https://github.com/StatsGary/MLDataR), and make sure you install the latest updates of the package when they are available. 349 | -------------------------------------------------------------------------------- /vignettes/mldataR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/StatsGary/MLDataR/8a5338bf44a5d646b2cb28720b7b46a3ca8a45ec/vignettes/mldataR.png --------------------------------------------------------------------------------