├── .gitignore ├── 2022-04 ├── finalfit_demo.Rmd └── melanoma_factored.rda ├── 2022-05 └── geom_bar_vs_col.R ├── 2022-06 └── reprex_demo.Rmd ├── 2022-08 ├── global_map.r └── reshape_pivot.rmd ├── 2022-09 └── plot_improvements.rmd ├── 2022-10 └── stringr_demo.rmd ├── 2022-11 └── join_demo.r ├── 2023-03 └── tableone_finalfit.rmd ├── 2023-08 ├── data_cleaning.Rmd ├── demo_ufo_sightings_advanced.csv ├── demo_ufo_sightings_gap.csv ├── demo_ufo_sightings_original.csv ├── parameter_report.R └── reports_automated.Rmd ├── 2023-10 ├── data_orig │ └── data_orig │ │ ├── excel │ │ ├── all_months.xlsx │ │ ├── february_project_dataset.xlsx │ │ ├── january_project_dataset.xlsx │ │ └── march_project_dataset.xlsx │ │ ├── february_project_dataset.csv │ │ ├── february_test_dataset.csv │ │ ├── january_project_dataset.csv │ │ └── march_project_dataset.csv ├── dataclean2.Rmd └── read_multiple_spreadsheets.R ├── 2023-11 ├── quarto.sh └── quarto_1.qmd ├── 2024-02 ├── 2020-02_venn-diagram_transcript.json ├── 2020-02_venn-diagram_transcript.srt ├── 2020-02_venn-diagram_transcript.txt ├── consort_diagram.pdf ├── factors.R ├── flowchart.md ├── flowchart.qmd ├── flowchart_files │ └── figure-commonmark │ │ └── unnamed-chunk-2-1.png ├── venn_diagram.rmd └── venn_diagrams-link.txt ├── 2024-03 ├── app.R ├── factors.R ├── factors_transcripts.txt ├── ggplot_efficient.qmd ├── ggplot_efficient_transcript.txt ├── licodata_raw.csv └── shiny_transcript.txt ├── 2024-04 ├── DAG_demo.qmd ├── dag_transcript.txt ├── gt_tables.qmd └── gt_transcript.txt ├── 2024-05 ├── fuzzy_match.qmd └── fuzzy_match_transcript.txt ├── 2024-08 ├── tidymodels.qmd └── tidymodels_transcript.txt ├── 2024-09 ├── tidytext.qmd └── tidytext_transcript.txt ├── 2024-11 ├── tidymodels_resample.qmd └── tidymodels_resample_transcript.txt ├── README.md ├── healthyr_demos.Rproj └── video_thumbnails ├── DAG.jpg ├── GT.jpg ├── consort.PNG ├── data_clean.png ├── data_clean2.jpg ├── finalfit.png ├── forcats.png ├── fuzzy.jpg ├── geom_bar.png ├── 
globalmap_demo.png ├── join_demo.png ├── plot_function.PNG ├── plot_improvements.png ├── plotting_stat.PNG ├── quarto_1.PNG ├── quarto_1.jpg ├── quarto_2.PNG ├── quarto_2.jpg ├── report_parameters.png ├── reprex.png ├── reshape_data.png ├── shiny.PNG ├── spreadsheets.PNG ├── spreadsheets.jpg ├── stringr.png ├── tableone_finalfit.png ├── tidymodels1.PNG ├── tidymodels2.PNG ├── tidytext.PNG └── venn.jpg /.gitignore: -------------------------------------------------------------------------------- 1 | .Rhistory 2 | .DS_Store 3 | .Rproj.user 4 | *html -------------------------------------------------------------------------------- /2022-04/finalfit_demo.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Export results" 3 | output: 4 | html_document: default 5 | pdf_document: default 6 | word_document: default 7 | --- 8 | 9 | ```{r setup, include=FALSE} 10 | #+ setup, include=FALSE 11 | knitr::opts_chunk$set( 12 | echo = TRUE, 13 | error = TRUE, 14 | message = FALSE, 15 | warning = FALSE 16 | ) 17 | library(tidyverse) 18 | library(finalfit) 19 | library(knitr) 20 | 21 | load("melanoma_factored.rda") 22 | ``` 23 | 24 | # Section 1: Introduction 25 | 26 | ```{r} 27 | mydata %>% 28 | summary_factorlist("ulcer.factor", c("sex.factor", "age.factor")) 29 | ``` 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /2022-04/melanoma_factored.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/2022-04/melanoma_factored.rda -------------------------------------------------------------------------------- /2022-05/geom_bar_vs_col.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | eurovision <- 
readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-17/eurovision.csv') 3 | 4 | ## Summary of variables ---- 5 | eurovision %>% 6 | count(winner) 7 | 8 | eurovision %>% 9 | count(artist_country, sort = TRUE) 10 | 11 | ## Plotting - geom_bar vs geom_col ---- 12 | eurovision %>% 13 | ggplot(aes(x = artist_country, fill = winner)) + 14 | geom_bar(position = "fill") + 15 | coord_flip() 16 | 17 | ## Further plot using geom_col ---- 18 | eurovision %>% 19 | count(artist_country, winner) %>% 20 | group_by(artist_country) %>% 21 | mutate(nn = sum(n), 22 | prop = n / nn) %>% 23 | mutate(prop = if_else(!winner, 0, prop)) %>% 24 | ggplot(aes(y = fct_reorder(artist_country, prop), x = n, fill = winner)) + 25 | geom_col(position = "fill") -------------------------------------------------------------------------------- /2022-06/reprex_demo.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Minimal reproducible example - demo" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | library(tidyverse) 9 | library(palmerpenguins) 10 | penguins = penguins 11 | ``` 12 | 13 | ```{r} 14 | 15 | penguins %>% 16 | mutate(species = fct_relevel(species, "Adelie", "Gentoo")) %>% 17 | ggplot(aes(x=species)) + 18 | geom_bar() 19 | 20 | ``` 21 | 22 | ```{r} 23 | 24 | penguins %>% 25 | mutate(species = fct_relevel(species, "Adelie", "Gentoo")) %>% 26 | ggplot(aes(x=species, y=flipper_length_mm)) + 27 | geom_jitter() + 28 | geom_boxplot(fill=NA) 29 | 30 | ``` 31 | 32 | ```{r} 33 | 34 | penguins %>% 35 | group_by(species) %>% 36 | summarise(flipper_mean = mean(flipper_length_mm)) 37 | 38 | ``` 39 | 40 | ```{r} 41 | 42 | library(tidyverse) 43 | 44 | example_data = tibble(species = c("a", "b", "c", "a", "b", "c"), 45 | flipper = c(1:5, NA)) 46 | 47 | example_data %>% 48 | group_by(species) %>% 49 | summarise(flipper_mean = mean(flipper)) 
50 | 51 | ``` -------------------------------------------------------------------------------- /2022-08/global_map.r: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(ggthemes) 3 | library(countrycode) 4 | 5 | wwc_outcomes = readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-09/wwc_outcomes.csv") 6 | team_lookup = read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-09/codes.csv") 7 | 8 | 9 | # Explore data 10 | ## Count years 11 | 12 | wwc_outcomes %>% 13 | count(year) 14 | 15 | ## Games per country 16 | wwc_outcomes %>% 17 | filter(year == 1991) %>% 18 | count(team) 19 | 20 | ## Teams per year 21 | wwc_outcomes %>% 22 | group_by(year) %>% 23 | distinct(team, .keep_all = TRUE) %>% 24 | count(team) %>% 25 | count(year) 26 | 27 | ### Riinu's version 28 | wwc_outcomes %>% 29 | distinct(year, team) %>% 30 | count(year) 31 | 32 | ### Understand round 33 | wwc_outcomes %>% 34 | distinct(round) 35 | 36 | ### Niall - who won each year? 37 | wwc_outcomes %>% 38 | filter(round == "Final") %>% 39 | filter(win_status == "Won") 40 | 41 | ## Mapping 42 | ### Create summary table 43 | countries_takenpart = wwc_outcomes %>% 44 | distinct(team, year) %>% 45 | count(team) 46 | 47 | ### Change 3 letter codes to country names 48 | countries_takenpart %>% 49 | mutate(country = countrycode(team, origin = "iso3c", destination = "country.name")) 50 | #### Countries in our dataset are not ISO coded, so we will use a separate look-up. 51 | 52 | ### Join look-up 53 | countries_takenpart = left_join(countries_takenpart, team_lookup, by = c("team" = "team")) 54 | 55 | ## Can we map yet?! 56 | world_map = map_data("world") %>% 57 | filter(! 
long > 180) 58 | 59 | countries = world_map %>% 60 | distinct(region) %>% 61 | rowid_to_column() 62 | 63 | ### Add our data 64 | mycountries = left_join(countries, countries_takenpart, by = c("region" = "country")) 65 | 66 | ### Check our data 67 | mycountries %>% 68 | filter(!is.na(n)) 69 | ### 30 countries - we have lost 6 countries. 70 | 71 | ### Do an anti-join! 72 | anti_join(countries_takenpart, countries, by = c("country" = "region")) 73 | 74 | ### Solve mismatches 75 | countries_takenpart = countries_takenpart %>% 76 | mutate(country = case_when( 77 | country == "China PR" ~ "China", 78 | country == "Ivory Coast (Côte d'Ivoire)" ~ "Ivory Coast", 79 | country == "Chinese Taipei" ~ "Taiwan", 80 | country == "United States" ~ "USA", 81 | TRUE ~ country 82 | )) 83 | 84 | # Recreate 85 | mycountries = left_join(countries, countries_takenpart, by = c("region" = "country")) 86 | 87 | ### Map 88 | mycountries %>% 89 | ggplot(aes(fill = n, map_id = region)) + 90 | geom_map(map = world_map) + 91 | expand_limits(x = world_map$long, y = world_map$lat) + 92 | coord_map("moll") + 93 | theme_map() 94 | -------------------------------------------------------------------------------- /2022-08/reshape_pivot.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "HealtyR demo: Pivot" 3 | format: html 4 | editor: visual 5 | --- 6 | 7 | ```{r} 8 | library(tidyverse) 9 | 10 | # Load the datasets 11 | # Datasets from Tidy Tuesday 12 | colony <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-11/colony.csv') 13 | stressor <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-01-11/stressor.csv') 14 | 15 | # Start with wide dataset for this demo 16 | # Data showing percentage of bee colonies impacted by various stressors over time 17 | wide_stressor = stressor %>% 18 | pivot_wider(names_from = months, values_from = stress_pct) 
19 | ``` 20 | 21 | ## Make data long 22 | ```{r} 23 | long_stressor = wide_stressor %>% 24 | pivot_longer(4:7, 25 | names_to = "months", values_to = "stress_pct") 26 | 27 | long_stressor = wide_stressor %>% 28 | pivot_longer(c("January-March", "April-June", 29 | "July-September", "October-December"), 30 | names_to = "months", values_to = "stress_pct") 31 | 32 | long_stressor = wide_stressor %>% 33 | pivot_longer(all_of(c("January-March", "April-June", 34 | "July-September", "October-December")), 35 | names_to = "months", values_to = "stress_pct") 36 | 37 | long_stressor = wide_stressor %>% 38 | pivot_longer(contains("-"), 39 | names_to = "months", values_to = "stress_pct") 40 | ``` 41 | 42 | ## Explore / summarise data 43 | ```{r} 44 | # For each stressor average percentage of columns affected per year 45 | year_stressor_pct = long_stressor %>% 46 | group_by(year, stressor) %>% 47 | summarise(pct_mean = mean(stress_pct, na.rm = TRUE)) 48 | ``` 49 | 50 | ## Pivot wider 51 | ```{r} 52 | year_stressor_wide = year_stressor_pct %>% 53 | pivot_wider(names_from = "stressor", values_from = "pct_mean") 54 | ``` 55 | -------------------------------------------------------------------------------- /2022-09/plot_improvements.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Plot Improvements" 3 | output: html_document 4 | date: "2022-09-08" 5 | --- 6 | 7 | # Take A Sad Plot and Make it Better 8 | 9 | ```{r setup, include=FALSE} 10 | knitr::opts_chunk$set(echo = TRUE) 11 | ``` 12 | 13 | ## Load libraries 14 | 15 | ```{r} 16 | library(tidyverse) 17 | library(scales) 18 | library(palmerpenguins) 19 | library(ggbeeswarm) 20 | ``` 21 | 22 | ## Dataset 23 | 24 | ```{r} 25 | finalfit::missing_glimpse(penguins) 26 | penguins = penguins %>% 27 | drop_na(species, sex, body_mass_g) 28 | ``` 29 | 30 | ## Sad Plot 31 | 32 | ```{r} 33 | penguins %>% 34 | ggplot(aes(x = species, y = body_mass_g)) + 35 | geom_boxplot() + 36 | 
facet_grid(.~island) 37 | ``` 38 | 39 | 40 | ## Sad Plot + raw data 41 | 42 | ```{r} 43 | penguins %>% 44 | ggplot(aes(x = species, y = body_mass_g)) + 45 | geom_boxplot(outlier.shape = NA) + 46 | geom_jitter(alpha = 0.2) + 47 | facet_grid(.~island) 48 | 49 | # Checkout geom_boxplot2 50 | ## https://stackoverflow.com/questions/5677885/ignore-outliers-in-ggplot2-boxplot 51 | ``` 52 | ## Sad Plot + raw data + remove grey background 53 | 54 | ```{r} 55 | penguins %>% 56 | ggplot(aes(x = species, y = body_mass_g)) + 57 | geom_boxplot(outlier.shape = NA) + 58 | geom_jitter(alpha = 0.2) + 59 | facet_grid(.~island) + 60 | theme_bw() 61 | ``` 62 | 63 | ## Sad Plot + raw data + remove grey background + axis labels 64 | 65 | ```{r} 66 | library(ggtext) 67 | penguins %>% 68 | ggplot(aes(x = species, y = body_mass_g)) + 69 | geom_boxplot(outlier.shape = NA) + 70 | geom_jitter(alpha = 0.2) + 71 | # labs(x = "Species", y = "Body mass (g)") + 72 | labs(x = "Species", y = "Body mass (g^(2))") + 73 | facet_grid(. ~ island) + 74 | theme_bw() + 75 | theme(axis.title.y = element_markdown()) 76 | ``` 77 | 78 | 79 | 80 | ## Sad Plot + raw data + remove grey background + axis labels + remove empty factor levels 81 | 82 | ```{r} 83 | forcats::fct_drop 84 | penguins %>% 85 | ggplot(aes(x = species, y = body_mass_g)) + 86 | geom_boxplot(outlier.shape = NA) + 87 | geom_jitter(alpha = 0.2) + 88 | labs(x = "Species", y = "Body mass (g^(2))") + 89 | facet_grid(. 
~ island, scales = "free_x") + 90 | theme_bw() + 91 | theme(axis.title.y = element_markdown()) 92 | ``` 93 | 94 | # Sad plot + change whisker length 95 | 96 | ```{r} 97 | penguins %>% 98 | ggplot(aes(x = species, y = body_mass_g)) + 99 | geom_boxplot(coef = 2) + 100 | facet_grid(.~island) 101 | ``` 102 | 103 | ## Hints 104 | 105 | * Add the raw data - colour per species - change colour palatte 106 | * Remove the outliers 107 | * Remove the grey background 108 | * Fix the axes lables - remove legend 109 | * Add the mean 110 | 111 | ```{r} 112 | penguins %>% 113 | ggplot(aes(x = species, y = body_mass_g)) + 114 | geom_boxplot(fill = NA, outlier.shape = NA, alpha = 0.2) + # remove outliers 115 | facet_grid(.~island, scales = "free_x", space = "free_x") + # free x axis 116 | geom_jitter(aes(colour = species), alpha = 0.5, width = 0.1, size = 3) + # jitter 117 | labs(y = "Body Mass (g)", x = "Penguin Species", caption = "Source: Palmerpenguins") + # Axis captions 118 | stat_summary(fun = mean, 119 | geom = "point", shape = 20, size = 5, color = "black", fill = "black") + # Add mean 120 | scale_colour_viridis_d() + # Change colour scheme 121 | theme_bw() + # Remove grey background 122 | theme(legend.position = "none") # Remove legend - note: setting the theme should be done after removing background theme 123 | ``` 124 | -------------------------------------------------------------------------------- /2022-10/stringr_demo.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "stringr_demo" 3 | output: html_document 4 | date: "2022-10-06" 5 | --- 6 | 7 | ```{r setup, include=FALSE} 8 | knitr::opts_chunk$set(echo = TRUE) 9 | ``` 10 | 11 | ```{r} 12 | library(tidyverse) 13 | library(stringr) # already part of tidyverse 14 | library(lubridate) 15 | 16 | # Create the dataset 17 | df = tibble(subjid = 1:50, 18 | age = round(runif(50, 5, 100)), 19 | sex = sample(c("Male", "Female"), size=50, replace=TRUE, prob=c(0.49,0.50)), 20 
| country = sample(c("England", "Scotland", "Wales", "N. Ireland"), size=50, replace=TRUE, prob=c(0.7, 0.16, 0.09, 0.05)), 21 | adm_date = sample(seq(ymd('2019-01-01'), ymd('2022-10-01'), by = "day"), size = 50, replace=TRUE), 22 | heart_rate = round(runif(50, 45, 150)), 23 | oxy_sat = round(runif(50, 90, 100)), 24 | test_pos = sample(c("Yes", "No", "Unknown"), size=50, replace=TRUE, prob=c(0.33,0.60, 0.07)), 25 | medication = c("Patient received 500mg x2 paracetamol", 26 | "Morphine; Amoxicillin", 27 | "Paracetamol 500 mg x 2", 28 | "Insulin", 29 | "paracetimol", 30 | "meropenem, diazapam, paracetamol", 31 | "Paracetamol when needed", 32 | "Citalopram & paractamol", 33 | "Ibuprofen", 34 | "adenosine/paracetamol", 35 | "patient to take 2 tablets paracetamol 500mg every 4 hours", 36 | "heparin, amoxicillin", 37 | "cephalexin", 38 | "paracetamol for pain", 39 | NA, 40 | "Insulin", 41 | "Asprin", 42 | "Codine", 43 | "patient received paracetimol at 4am", 44 | "500mg x2 Ibuprofen & 500mg x2 paracetamol", 45 | "Cephalexin, Sertraline, Atenolol", 46 | "paracetomol", 47 | "Insulin, folic acid", 48 | "Multivitamins", 49 | "paracetamol/ibuprofin", 50 | "patient started taking parcetamol on day 2 of admission", 51 | "Glucose", 52 | "paracetamol for pain", 53 | "Insulin&atenolol", 54 | "paracetamol", 55 | "diazipam", 56 | "paracetamol 500mg x2 at 7am", 57 | "Lactulose", 58 | "Lactulose and multivitamins", 59 | "unknown", 60 | "11:00 paracetamol, 13:00 ibuprofen, 15:00 paracetimol", 61 | "Insulin", 62 | "meropenem then paracetaml", 63 | "unknown", 64 | "Saline solution", 65 | "10am Insulin", 66 | "Asprin", 67 | "Lorazepam", 68 | NA, 69 | "Laculose&Paracetamol", 70 | NA, 71 | "Insulin", 72 | "glucose", 73 | "Citalopram & paractamol", 74 | "16.30 paracetamol 500mg")) 75 | ``` 76 | 77 | # Stringr to change case 78 | ```{r} 79 | df = df %>% 80 | mutate(medication = str_to_upper(df$medication)) 81 | 82 | df = df %>% 83 | mutate(medication = str_to_title(df$medication)) 84 | 85 | df 
= df %>% 86 | mutate(medication = str_to_lower(df$medication)) 87 | ``` 88 | 89 | # Extracting information 90 | ```{r} 91 | #Length of character strings 92 | str_length(string = df$medication) 93 | 94 | #Count the character strings with specific pattern 95 | str_count(string = df$medication, pattern = "insulin") 96 | 97 | #Identify strings with pattern 98 | str_which(df$medication, pattern = "insulin") 99 | str_subset(string = df$medication, pattern = "insulin") 100 | 101 | #Presence or absence of a pattern 102 | df %>% 103 | filter(str_detect(df$medication, pattern = "insulin")) 104 | 105 | 106 | df = df %>% 107 | mutate(insulin = str_count(df$medication, "insulin")) 108 | df 109 | 110 | ``` 111 | 112 | # Paracetamol 113 | hint: typos paracetimol, paracetomol, paractamol, parcetamol, paracetaml 114 | 115 | ```{r} 116 | 117 | #Length of character strings 118 | str_length(string = df$medication) 119 | 120 | #Count the character strings with specific pattern 121 | str_count(string = df$medication, pattern = "par[ac]") 122 | 123 | #Identify strings with pattern 124 | str_which(df$medication, pattern = "par[ac]") 125 | str_subset(string = df$medication, pattern = "par[ac]") 126 | 127 | #Presence or absence of a pattern 128 | df %>% 129 | filter(str_detect(df$medication, pattern = "par[ac]")) 130 | 131 | 132 | df = df %>% 133 | mutate(insulin = str_count(df$medication, "par[ac]")) 134 | df 135 | 136 | ``` 137 | 138 | # Pattern matching with regular expressions 139 | ## parac[e]*t[iao]m[o]*l 140 | 141 | # str_replace 142 | 143 | ```{r} 144 | df$medication 145 | str_replace(df$medication, "parac[e]*t[iao]m[o]*l", "paracetamol") 146 | 147 | 148 | str_replace_all(df$medication, "parac[e]*t[iao]m[o]*l", "paracetamol") 149 | ``` 150 | 151 | 152 | 153 | 154 | ```{r} 155 | df = df %>% 156 | mutate(medication = str_replace_all(df$medication, pattern = "parac[e]*t[iao]m[o]*l", replacement = "paracetamol")) 157 | ``` 158 | 159 | 
-------------------------------------------------------------------------------- /2022-11/join_demo.r: -------------------------------------------------------------------------------- 1 | # November 2022 2 | # HealthyR Clinic 3 | # *_join() demo 4 | 5 | library(tidyverse) 6 | library(lubridate) 7 | 8 | # The data sets being used 9 | 10 | df1 = tibble(subjid = 1:10, 11 | dob = sample(seq(ymd('1950-01-01'), ymd('2002-01-01'), by = "day"), 12 | size = 10, replace=TRUE), 13 | country = sample(c("England", "Scotland", "Wales", "N. Ireland"), 14 | size=10, replace=TRUE, prob=c(0.7, 0.16, 0.09, 0.05)), 15 | sex = c("female", "male", "male", "male", "female", "other", "female", 16 | "female", "male", "male")) 17 | 18 | df2 = tibble(id = 3:12, 19 | height = round(runif(10, 150, 190)), 20 | weight = round(runif(10, 50, 100)), 21 | test_result = round(runif(10, 40, 80)), 22 | gender = c("male", "male", "female", "other", "female", "female", 23 | "male", "male", "male", "female")) 24 | 25 | df3 = tibble(subjid = 3:12, 26 | test_result = round(runif(10, 60, 100)), 27 | job = c("doctor", "lawyer", "police officer", "teacher", "accountant", 28 | "scientist", "lecturer", "artist", "author", "engineer"), 29 | fav_animal = c("dog", "penguin", "giraffe", "armadillo", "elephant", 30 | "gorilla", "Guinea pig", "sloth", "lion", "parrot")) 31 | 32 | 33 | extra_patient = tibble(subjid = 13, 34 | dob = ymd("1997-03-11"), 35 | country = "Scotland", 36 | sex = "Female", 37 | height = 173, 38 | weight = 70, 39 | test_result_1 = 65, 40 | test_result_2 = 89, 41 | job = "data scientist") 42 | 43 | 44 | 45 | # Q1 - Join df1 and df2 using the subject id column 46 | ## Only rows in common 47 | df1 %>% 48 | inner_join(df2, by = c("subjid" = "id")) 49 | 50 | ## Keep all of df1 51 | df1 %>% 52 | left_join(df2, by = c("subjid" = "id")) 53 | 54 | ## Full join 55 | tmp = df1 %>% 56 | full_join(df2, by = c("subjid" = "id")) 57 | 58 | ## How to use a base R function with %$% 59 | library(magrittr) 60 | tmp 
%>% 61 | drop_na() %$% 62 | identical(sex, gender) 63 | 64 | # Q2 - Join on multiple columns 65 | ## Full join 66 | df1 %>% 67 | full_join(df2, by = c("subjid" = "id", 68 | "sex" = "gender")) 69 | 70 | 71 | # Q3 - Join multiple datasets 72 | df1 %>% 73 | full_join(df2, by = c("subjid" = "id")) %>% 74 | full_join(df3, by = c("subjid"), suffix = c(".day1", ".discharge")) 75 | 76 | 77 | # Q4 - What would you do about the column that has the same name? 78 | 79 | 80 | # Q5 - how would you join everything except the favourite animal column? 81 | df1 %>% 82 | full_join(df2, by = c("subjid" = "id")) %>% 83 | full_join(df3, by = c("subjid"), suffix = c(".day1", ".discharge")) %>% 84 | select(-fav_animal) 85 | 86 | # Q6 - Filtering joins - which individuals are not in the third dataset? 87 | df1 %>% 88 | anti_join(df2, by = c("subjid" = "id")) 89 | 90 | # Q7 - What would you do if you wanted to add on an extra row of data 91 | df4 = df1 %>% 92 | slice(1:4) 93 | df5 = df1 %>% 94 | slice(5:10) 95 | 96 | df4 %>% 97 | select(-sex) %>% 98 | bind_rows(df5) 99 | 100 | df4 = df4 %>% 101 | select(-sex) 102 | rbind(df4, df5) # error 103 | 104 | # Q8 watch when joining factors 105 | df4 = df1 %>% 106 | slice(1:4) %>% 107 | mutate(sex = factor(sex)) 108 | 109 | df5 = df1 %>% 110 | slice(5:10) %>% 111 | mutate(sex = factor(sex)) 112 | 113 | df4$sex %>% levels() 114 | df5$sex %>% levels() 115 | 116 | df4 %>% 117 | full_join(df5) %$% 118 | levels(sex) 119 | 120 | 121 | df6 = tibble(subjid = 3:12, 122 | test_result = round(runif(10, 60, 100)), 123 | job = c("doctor", "lawyer", "police officer", "teacher", "accountant", 124 | "scientist", "lecturer", "artist", "author", "engineer"), 125 | fav_animal2 = c("doggy", "penguin", "giraffe", "armadillo", "elephant", 126 | "gorilla", "Guinea pig", "sloth", "lion", "parrot")) 127 | 128 | df3 %>% 129 | full_join(df6, by = c("fav_animal" = "fav_animal2")) 130 | -------------------------------------------------------------------------------- 
/2023-03/tableone_finalfit.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Table One Demo" 3 | author: "HealthyR" 4 | date: "2023-03-23" 5 | output: 6 | pdf_document: default 7 | html_document: default 8 | word_document: default 9 | --- 10 | 11 | ```{r setup, include=FALSE} 12 | knitr::opts_chunk$set( 13 | echo = FALSE, 14 | message = FALSE, 15 | warning = FALSE 16 | ) 17 | ``` 18 | 19 | ```{r} 20 | library(tidyverse) 21 | library(finalfit) 22 | ``` 23 | 24 | ```{r} 25 | melanoma = boot::melanoma 26 | 27 | missing = tibble(time = c(NA,NA), 28 | status = c(1,3), 29 | sex = c(NA,NA), 30 | age = c(65,55), 31 | year = c(NA,NA), 32 | thickness = c(1.5,NA), 33 | ulcer = c(NA, 1)) 34 | 35 | melanoma_missing = melanoma %>% 36 | bind_rows(missing) 37 | ``` 38 | 39 | 40 | #Task 1 41 | Have a quick look at your data 42 | 43 | ```{r} 44 | melanoma_missing %>% 45 | glimpse() 46 | 47 | melanoma_missing %>% 48 | ff_glimpse() 49 | ``` 50 | 51 | # Recode 52 | ```{r} 53 | # Recode to factors 54 | melanoma_missing = melanoma_missing %>% 55 | mutate(sex = factor(sex) %>% fct_recode("Female" = "0", 56 | "Male" = "1"), 57 | ulcer = factor(ulcer) %>% fct_recode("Absent" = "0", 58 | "Present" = "1")) 59 | 60 | melanoma_missing %>% 61 | glimpse() 62 | 63 | melanoma_missing %>% 64 | ff_glimpse() 65 | ``` 66 | 67 | # Label variables 68 | ```{r} 69 | melanoma_missing = melanoma_missing %>% 70 | mutate(age = ff_label(age, "Age (years)"), 71 | sex = ff_label(sex, "Sex (at birth)")) 72 | ``` 73 | 74 | 75 | # Task 2 76 | Research Question: Is there an association between presence of ulceration and death from melanoma 77 | 78 | Create an example of table 1 79 | 80 | ```{r} 81 | dependent = "ulcer" 82 | explanatory = c("age", "sex") 83 | melanoma_missing %>% 84 | summary_factorlist(dependent, explanatory) 85 | ``` 86 | 87 | # Task 3 88 | Use the missing data dataset - how can you change your table to show/hide missingness 89 | 90 | ```{r} 91 
| dependent = "ulcer" 92 | explanatory = c("age", "sex") 93 | melanoma_missing %>% 94 | summary_factorlist(dependent, explanatory, 95 | na_include = TRUE, 96 | add_col_totals = TRUE, add_row_totals = TRUE, 97 | include_row_missing_col = FALSE) 98 | ``` 99 | 100 | # Task 4 101 | Can you change the labels? Can you add a label for the dependent variable? 102 | 103 | ```{r} 104 | dependent = "ulcer" 105 | explanatory = c("age", "sex") 106 | melanoma_missing %>% 107 | summary_factorlist(dependent, explanatory, 108 | na_include = TRUE, 109 | add_col_totals = TRUE, add_row_totals = TRUE, 110 | add_dependent_label = TRUE) 111 | ``` 112 | 113 | # Task 5 114 | Do you include p-values? 115 | 116 | ```{r} 117 | dependent = "ulcer" 118 | explanatory = c("age", "sex") 119 | melanoma_missing %>% 120 | summary_factorlist(dependent, explanatory, 121 | na_include = TRUE, 122 | add_col_totals = TRUE, add_row_totals = TRUE, 123 | add_dependent_label = TRUE, 124 | p = TRUE) 125 | ``` 126 | 127 | # Task 6 128 | 129 | Export 130 | 131 | ```{r} 132 | dependent = "ulcer" 133 | explanatory = c("age", "sex") 134 | melanoma_missing %>% 135 | summary_factorlist(dependent, explanatory, 136 | na_include = TRUE, 137 | add_col_totals = TRUE, add_row_totals = TRUE, 138 | add_dependent_label = TRUE, 139 | p = TRUE) %>% 140 | knitr::kable(row.names = FALSE, align = c("l", "r", "r", "l", "r", "r", "r", "r")) 141 | ``` 142 | -------------------------------------------------------------------------------- /2023-08/data_cleaning.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Cleaning" 3 | author: "HealthyR" 4 | date: "2023-08-02" 5 | output: html_document 6 | --- 7 | ```{r setup, include=FALSE} 8 | knitr::opts_chunk$set(echo = TRUE) 9 | ``` 10 | ```{r} 11 | library(tidyverse) 12 | library(lubridate) 13 | library(finalfit) 14 | ``` 15 | ## Load in data 16 | ```{r} 17 | ufo_orig = read_csv("demo_ufo_sightings_original.csv") 18 | ``` 19 | 
```{r} 20 | ufo_orig = ufo_orig %>% 21 | mutate(reported_date_time = ymd_hms(reported_date_time), 22 | reported_date_time_utc = dmy_hm(reported_date_time_utc)) 23 | ``` 24 | ## Check for outliers 25 | ```{r} 26 | summary_glimpse = ufo_orig %>% ff_glimpse() 27 | write_csv(x = summary_glimpse[[2]], file = "summary_glimpse.csv") 28 | ``` 29 | ```{r} 30 | ufo_orig %>% 31 | ggplot(aes(x = reported_date_time_utc)) + 32 | geom_histogram(bins = 200) 33 | p = ufo_orig %>% 34 | ggplot(aes(x = reported_date_time_utc, y = 1)) + 35 | geom_point() 36 | # plotly::ggplotly(p) 37 | ``` 38 | ## Clean data 39 | ```{r} 40 | ## I'm surprised this worked: 41 | ufo_orig %>% 42 | filter(reported_date_time_utc != "2200-09-07 03:25:00") 43 | ## Used to have to be this: 44 | ufo_orig %>% 45 | filter(reported_date_time_utc != ymd_hms("2200-09-07 03:25:00")) 46 | ## Another option: 47 | ufo_orig %>% 48 | filter(reported_date_time_utc <= today()) 49 | ``` 50 | -------------------------------------------------------------------------------- /2023-08/parameter_report.R: -------------------------------------------------------------------------------- 1 | library(rmarkdown) 2 | library(tidyverse) 3 | 4 | # Copy and paste of code and changing parameter, somewhat simplifies the method 5 | # run knit in script rather than manual click 6 | # e.g 7 | 8 | # Europe 9 | render("reports_automated.Rmd", 10 | params = list(my_continent = "Europe"), 11 | output_file = "automated_report_Europe.pdf") 12 | 13 | # Asia 14 | render("reports_automated.Rmd", 15 | params = list(my_continent = "Asia"), 16 | output_file = "automated_report_Asia.pdf") 17 | 18 | # Automate this step using purrr from tidyverse 19 | 20 | continents = c("Americas", "Europe", "Asia", "Africa", "Oceania") 21 | render_all = tibble(my_continent = continents) %>% 22 | mutate(output_file = paste0("reports_automated/reports_automated_", my_continent, ".pdf"), 23 | params = map(my_continent, ~list(my_continent = .))) %>% 24 | select(params, 
output_file) 25 | 26 | render_all %>% 27 | pwalk(render, input = "reports_automated.Rmd") 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /2023-08/reports_automated.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "A report of Women's Football World Cup participants" 3 | author: "HealthyR" 4 | date: "17-August 2023" 5 | output: pdf_document 6 | --- 7 | ```{r setup, include=FALSE} 8 | knitr::opts_chunk$set( 9 | echo = FALSE, 10 | message = FALSE, 11 | warning = FALSE 12 | ) 13 | library(tidyverse) 14 | library(ggthemes) 15 | library(countrycode) 16 | 17 | wwc_outcomes = readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-09/wwc_outcomes.csv") 18 | team_lookup = read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-09/codes.csv") 19 | 20 | 21 | ## Mapping 22 | ### Create summary table 23 | countries_takenpart = wwc_outcomes %>% 24 | distinct(team, year) %>% 25 | count(team) 26 | 27 | ### Change 3 letter codes to country names 28 | countries_takenpart %>% 29 | mutate(country = countrycode(team, origin = "iso3c", destination = "country.name")) 30 | #### Countries in our dataset are not ISO coded, so we will use a separate look-up. 31 | 32 | ### Join look-up 33 | countries_takenpart = left_join(countries_takenpart, team_lookup, by = c("team" = "team")) 34 | 35 | ## Can we map yet?! 36 | world_map = map_data("world") %>% 37 | filter(! 
long > 180) %>% 38 | mutate(continent = countrycode(region, 39 | origin = "country.name", 40 | destination = "continent")) %>% 41 | mutate(continent = if_else(region == "Russia", "Asia", continent)) 42 | 43 | countries = world_map %>% 44 | distinct(region) %>% 45 | rowid_to_column() 46 | 47 | ### Add our data 48 | mycountries = left_join(countries, countries_takenpart, by = c("region" = "country")) 49 | 50 | ### Check our data 51 | mycountries %>% 52 | filter(!is.na(n)) 53 | ### 30 countries - we have lost 6 countries. 54 | 55 | ### Do an anti-join! 56 | anti_join(countries_takenpart, countries, by = c("country" = "region")) 57 | 58 | ### Solve mismatches 59 | countries_takenpart = countries_takenpart %>% 60 | mutate(country = case_when( 61 | country == "China PR" ~ "China", 62 | country == "Ivory Coast (Côte d'Ivoire)" ~ "Ivory Coast", 63 | country == "Chinese Taipei" ~ "Taiwan", 64 | country == "United States" ~ "USA", 65 | TRUE ~ country 66 | )) 67 | 68 | # Recreate 69 | mycountries = left_join(countries, countries_takenpart, by = c("region" = "country")) %>% 70 | mutate(continent = countrycode(region, 71 | origin = "country.name", 72 | destination = "continent")) %>% 73 | mutate(continent = if_else(region == "Russia", "Asia", continent)) 74 | ``` 75 | 76 | ### Map 77 | 78 | ```{r} 79 | 80 | mymap = filter(world_map, continent == "Europe") 81 | 82 | mycountries %>% 83 | filter(continent == "Europe") %>% 84 | ggplot(aes(fill = n, map_id = region)) + 85 | geom_map(map = mymap) + 86 | expand_limits(x = mymap$long, y = mymap$lat) + 87 | coord_map("moll") + 88 | theme_map() 89 | ``` 90 | -------------------------------------------------------------------------------- /2023-10/data_orig/data_orig/excel/all_months.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/2023-10/data_orig/data_orig/excel/all_months.xlsx 
-------------------------------------------------------------------------------- /2023-10/data_orig/data_orig/excel/february_project_dataset.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/2023-10/data_orig/data_orig/excel/february_project_dataset.xlsx -------------------------------------------------------------------------------- /2023-10/data_orig/data_orig/excel/january_project_dataset.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/2023-10/data_orig/data_orig/excel/january_project_dataset.xlsx -------------------------------------------------------------------------------- /2023-10/data_orig/data_orig/excel/march_project_dataset.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/2023-10/data_orig/data_orig/excel/march_project_dataset.xlsx -------------------------------------------------------------------------------- /2023-10/data_orig/data_orig/february_project_dataset.csv: -------------------------------------------------------------------------------- 1 | date,value 2 | 2020-02-01,0.4443970008287579 3 | 2020-02-02,0.35089914267882705 4 | 2020-02-03,0.6738898695912212 5 | -------------------------------------------------------------------------------- /2023-10/data_orig/data_orig/february_test_dataset.csv: -------------------------------------------------------------------------------- 1 | date,value 2 | 2020-02-01,0.4443970008287579 3 | 2020-02-02,0.35089914267882705 4 | 2020-02-03,0.6738898695912212 5 | -------------------------------------------------------------------------------- 
/2023-10/data_orig/data_orig/january_project_dataset.csv: -------------------------------------------------------------------------------- 1 | date,value 2 | 2020-01-01,0.5584279303438962 3 | 2020-01-02,0.9876351354178041 4 | 2020-01-03,0.20507181785069406 5 | -------------------------------------------------------------------------------- /2023-10/data_orig/data_orig/march_project_dataset.csv: -------------------------------------------------------------------------------- 1 | date,value 2 | 2020-03-01,0.4248365715611726 3 | 2020-03-02,0.26431114482693374 4 | 2020-03-03,0.11800351133570075 5 | -------------------------------------------------------------------------------- /2023-10/dataclean2.Rmd: -------------------------------------------------------------------------------- 1 | # Extract year and hour values from date 2 | 3 | ```{r} 4 | ufo = ufo_orig %>% 5 | mutate(year = year(reported_date_time), .after = reported_date_time) %>% 6 | mutate(hour = hour(reported_date_time), .after = year) 7 | 8 | ufo %>% 9 | count(hour, sort = TRUE) 10 | ``` 11 | 12 | # Check for duplicates 13 | 14 | ```{r} 15 | # library(janitor) - recommended but not used in this demo 16 | 17 | # ufo %>% 18 | # distinct(posted_date, .keep_all = TRUE) %>% 19 | # arrange(posted_date) 20 | 21 | ufo = ufo %>% 22 | distinct() 23 | ``` 24 | 25 | 26 | ```{r} 27 | udo = ufo %>% 28 | mutate(shape = na_if(shape, "unknown")) 29 | 30 | # %>% 31 | # # mutate(shape = if_else(shape == "unknown" & another_column == "remove", NA, shape) %>% 32 | # count(shape, sort = TRUE) 33 | ``` 34 | 35 | 36 | 37 | 38 | ```{r} 39 | library(janitor) 40 | 41 | ufo = ufo %>% 42 | clean_names() 43 | 44 | 45 | tibble(`date time` = 1, `animal-measurement` = 1, `Hello this is column name` = 1) %>% 46 | clean_names() %>% 47 | rename(hello = hello_this_is_column_name) 48 | ``` 49 | -------------------------------------------------------------------------------- /2023-10/read_multiple_spreadsheets.R: 
-------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(fs) 3 | library(readxl) 4 | 5 | # reading files in manually 6 | jan_df = read_csv("data_orig/january_project_dataset.csv") 7 | feb_df = read_csv("data_orig/february_project_dataset.csv") 8 | march_df = read_csv("data_orig/march_project_dataset.csv") 9 | 10 | # bind 11 | all_months = bind_rows(jan_df, feb_df, march_df) 12 | 13 | rm(jan_df, feb_df, march_df) 14 | 15 | # all files at once 16 | all_months2 = read_csv(c("data_orig/january_project_dataset.csv", 17 | "data_orig/february_project_dataset.csv", 18 | "data_orig/march_project_dataset.csv")) 19 | 20 | 21 | # using filesystem (fs) 22 | all_months3 = dir_ls("data_orig", glob = "*project*.csv") %>% 23 | read_csv() 24 | 25 | # example 26 | 27 | test = read_csv(c("CopyOfdata_orig/CopyOffebruary_project_dataset.csv", 28 | "CopyOfdata_orig/february_project_dataset.csv")) 29 | 30 | tst1 = read_csv("CopyOfdata_orig/CopyOffebruary_project_dataset.csv") 31 | tst2 = read_csv("CopyOfdata_orig/february_project_dataset.csv") 32 | 33 | full_test = full_join(tst1, tst2) 34 | 35 | ## Excel 36 | 37 | # multiple different files - first sheet 38 | 39 | excel_all = dir_ls("data_orig/excel/", glob = "*project*.xlsx") %>% 40 | map_dfr(~ read_excel(.)) 41 | 42 | 43 | all_sheets = excel_sheets("data_orig/excel/all_months.xlsx") %>% 44 | map_dfr(~ read_excel("data_orig/excel/all_months.xlsx", 45 | sheet = .)) 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /2023-11/quarto.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # changing name of output file, need to add both --to and --put arguments 3 | quarto render 08_exporting.qmd --to pdf --output exporting.pdf 4 | # render teacher version 5 | quarto render 08_exporting.qmd --to pdf --output exporting_teacher.pdf --profile teacher 6 | 
-------------------------------------------------------------------------------- /2023-11/quarto_1.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Markdown and exporting your plots/tables" 3 | author: 'Riinu Pius' 4 | date: today 5 | format: 6 | html: 7 | code-fold: true 8 | code-tools: true 9 | pdf: default 10 | docx: default 11 | --- 12 | ​ 13 | # Data and set-up 14 | ​ 15 | Loading the packages: 16 | ​ 17 | ```{r message=FALSE, warning=FALSE} 18 | ​ 19 | library(tidyverse) 20 | library(knitr) 21 | library(gapminder) 22 | ​ 23 | ``` 24 | ​ 25 | We are using the gapminder dataset. This is a document I'm working in, this text will be nicely printed. 26 | ​ 27 | ```{r} 28 | ​ 29 | mydata = gapminder %>% 30 | mutate(gdpPercap = round(gdpPercap)) %>% 31 | mutate(pop_millions = round(pop/1e6, 1)) %>% 32 | select(-pop) 33 | ​ 34 | ``` 35 | ​ 36 | A random sample of rows in the dataset: 37 | ​ 38 | ```{r} 39 | ​ 40 | mydata %>% 41 | sample_n(10) %>% 42 | kable() 43 | ​ 44 | ``` 45 | ​ 46 | Number of variables: `r mydata %>% ncol()`. 47 | ​ 48 | Total number of observations: `r mydata %>% nrow()`. 
49 | ​ 50 | # Plotting 51 | ​ 52 | Results can be seen in \@fig-rplot-1 53 | ​ 54 | ```{r} 55 | #| label: fig-rplot 56 | #| fig-cap: "gapminder GDP per capita vs life expectancy globally" 57 | #| fig-subcap: 58 | #| - "Original plot" 59 | #| - "New plot" 60 | #| layout-ncol: 2 61 | ​ 62 | ​ 63 | mydata %>% 64 | filter(year == 2007) %>% 65 | ggplot(aes(x = gdpPercap/1000, #divide by 1000 to tidy the x-axis 66 | y = lifeExp, 67 | colour = continent, 68 | size = pop_millions)) + 69 | geom_point(shape = 1) + 70 | theme_bw() 71 | ​ 72 | mydata %>% 73 | filter(year == 2007) %>% 74 | ggplot(aes(x = gdpPercap/1000, #divide by 1000 to tidy the x-axis 75 | y = lifeExp, 76 | colour = continent, 77 | size = pop_millions)) + 78 | geom_point(shape = 1) + 79 | theme_bw() + 80 | scale_colour_brewer() 81 | ``` 82 | ​ 83 | ```{=html} 84 | 87 | ``` 88 | ::: {.content-visible when-format="pdf"} 89 | # Solutions 90 | ​ 91 | This how we solve this. Don't show it to students!!!! You're doing a great job. 92 | ::: 93 | -------------------------------------------------------------------------------- /2024-02/consort_diagram.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/2024-02/consort_diagram.pdf -------------------------------------------------------------------------------- /2024-02/factors.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(finalfit) 3 | 4 | # Dataset from medicaldata::licorice_gargle 5 | licodata_raw = read_csv("licodata_raw.csv") 6 | 7 | # reading in 0,1, recoding 8 | licodata = licodata_raw %>% 9 | mutate(preOp_gender.factor = preOp_gender %>% 10 | factor() %>% 11 | fct_recode("Male" = "0", 12 | "Female" = "1"), 13 | preOp_asa.factor = preOp_asa %>% 14 | factor() %>% 15 | fct_recode("a normal healthy patient" = "1", 16 | "a patient with mild systemic 
disease" = "2", 17 | "a patient with severe systemic disease" = "3"), 18 | treat.factor = treat %>% 19 | factor() %>% 20 | fct_recode("Sugar 5g" = "0", 21 | "Licorice 0.5g" = "1"), 22 | preOp_smoking.factor = preOp_smoking %>% 23 | factor() %>% 24 | fct_recode("Current" = "1", 25 | "Past" = "2", 26 | "Never" = "3"), 27 | 28 | pod1am_throatPain.factor = pod1am_throatPain %>% 29 | factor() %>% 30 | fct_collapse("No Pain" = "0", 31 | other_level = "Pain"), 32 | pod1am_cough.factor = pod1am_cough %>% 33 | factor() %>% 34 | fct_collapse("No Cough" = "0", 35 | other_level = "Cough"), 36 | pacu30min_swallowPain.factor = pacu30min_swallowPain %>% 37 | factor() %>% 38 | fct_collapse("No Pain" = "0", 39 | other_level = "Pain")) 40 | 41 | licodata %>% 42 | count(treat, treat.factor) 43 | 44 | licodata %>% 45 | ggplot(aes(preOp_asa.factor)) + 46 | geom_bar() 47 | 48 | licodata %>% 49 | finalfit("pod1am_throatPain", c("treat.factor")) 50 | 51 | licodata = licodata %>% 52 | mutate(treat = treat.factor, 53 | preOp_gender = preOp_gender.factor, 54 | preOp_asa = preOp_asa.factor) %>% 55 | select(-ends_with(".factor")) 56 | 57 | licodata %>% 58 | write_csv("licodata.csv") 59 | 60 | # reading in CSV 61 | licodata_csv = read_csv("licodata.csv") 62 | 63 | licodata_csv %>% 64 | finalfit("pod1am_throatPain", c("treat")) 65 | 66 | licodata %>% 67 | drop_na(pod1am_throatPain) %>% 68 | ggplot(aes(preOp_asa, fill = factor(pod1am_throatPain))) + 69 | geom_bar(position = "fill") + 70 | scale_fill_brewer(palette = "BuGn") 71 | 72 | 73 | licodata = licodata %>% 74 | mutate(preOp_asa = fct_collapse(preOp_asa, 75 | "Healthy" = "a normal healthy patient", 76 | "Systemic disease" = c("a patient with mild systemic disease", 77 | "a patient with severe systemic disease"))) 78 | 79 | 80 | licodata = licodata %>% 81 | mutate(pod1am_throatPain2 = pod1am_throatPain %>% 82 | factor() %>% 83 | fct_collapse( 84 | "No pain" = "0", 85 | other_level = "Pain")) 86 | 87 | licodata %>% 88 | 
drop_na(pod1am_throatPain2) %>% 89 | ggplot(aes(preOp_asa, fill = pod1am_throatPain2)) + 90 | geom_bar(position = "fill") + 91 | scale_fill_brewer(palette = "BuGn") 92 | -------------------------------------------------------------------------------- /2024-02/flowchart.md: -------------------------------------------------------------------------------- 1 | # Licorice trial consort diagram 2 | 3 | # The Licorice Gargle Dataset 4 | 5 | These are data from a study by Ruetzler et al. ‘A Randomized, 6 | Double-Blind Comparison of Licorice Versus Sugar-Water Gargle for 7 | Prevention of Postoperative Sore Throat and Postextubation Coughing’. 8 | Anesth Analg 2013; 117: 614 – 21. 9 | 10 | Postoperative sore throat is a common and annoying complication of 11 | endotracheal intubation. This study tested the hypothesis that gargling 12 | with licorice solution immediately before induction of anesthesia 13 | prevents sore throat and postextubation coughing in patients intubated 14 | with double-lumen tubes. 15 | 16 | # Data cleaning 17 | 18 | Data dictionary: 19 | https://higgi13425.github.io/medicaldata/reference/licorice_gargle.html 20 | 21 | The publicly available dataset only includes final 22 | analysis-ready/complete patients. 
To demonstrate the making of a consort 23 | diagram we’ve randomly created three new variables of exclusions: 24 | 25 | - eligibility 26 | - age \> 70 years 27 | - BMI outwith 18.5 - 30 28 | - intervention 29 | - did not receive intervention 30 | - withdrew consent 31 | - lost to follow up 32 | - died 33 | - refused assessment 34 | 35 | ``` r 36 | library(tidyverse) 37 | library(consort) 38 | library(medicaldata) 39 | 40 | licodata = medicaldata::licorice_gargle %>% 41 | rowid_to_column("patient_id") %>% 42 | # make treatment var from 0,1 to factor 43 | mutate(randomisation = treat %>% 44 | factor() %>% 45 | fct_recode("Sugar" = "0", 46 | "Licorice" = "1")) %>% 47 | # assess eligigibility 48 | mutate(eligibility = case_when(preOp_age > 70 ~ "Age 70+", 49 | ! between(preOp_calcBMI, 18.5, 30) ~ "BMI not 18.5 - 30", 50 | .default = NA), 51 | # randomly generate intervention failed and lost to follow up variables 52 | intervention = sample(c("Did not receive intervention", "Withdrew consent",NA), 53 | size = 235, 54 | replace = TRUE, 55 | prob = c(0.1, 0.1, 0.9)), 56 | followup = sample(c("Died", "Refused Assessment", NA), 57 | size = 235, 58 | replace = TRUE, 59 | prob = c(0.1, 0.2, 0.7))) %>% 60 | mutate(randomisation = if_else(is.na(eligibility), randomisation, NA)) %>% 61 | mutate(intervention = if_else(is.na(eligibility), intervention, NA)) %>% 62 | mutate(followup = if_else(is.na(intervention), followup, NA)) 63 | ``` 64 | 65 | # CONSORT (Consolidated Standards of Reporting Trials) diagram 66 | 67 | ``` r 68 | p_cons = consort_plot(licodata, 69 | order = list(patient_id = "Population", 70 | eligibility = "Excluded", 71 | randomisation = "Randomised", 72 | intervention = "Excluded", 73 | patient_id = "Received treatment", 74 | followup = "Lost to follow-up", 75 | patient_id = "Final analysis"), 76 | side_box = c("eligibility", "intervention", "followup"), 77 | allocation = "randomisation", 78 | cex = 0.8, 79 | text_width = 30) 80 | 81 | p_cons 82 | ``` 83 | 84 | 
![](flowchart_files/figure-commonmark/unnamed-chunk-2-1.png) 85 | 86 | ## Exporting 87 | 88 | ``` r 89 | plot(p_cons, grViz = TRUE) |> 90 | DiagrammeRsvg::export_svg() |> 91 | charToRaw() |> 92 | rsvg::rsvg_pdf("consort_diagram.pdf") 93 | ``` 94 | -------------------------------------------------------------------------------- /2024-02/flowchart.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Licorice trial consort diagram" 3 | editor: visual 4 | format: 5 | gfm: default 6 | html: 7 | code-fold: true 8 | docx: 9 | echo: false 10 | pdf: 11 | echo: false 12 | execute: 13 | warning: false 14 | editor_options: 15 | chunk_output_type: console 16 | --- 17 | 18 | # The Licorice Gargle Dataset 19 | 20 | These are data from a study by Ruetzler et al. 'A Randomized, Double-Blind Comparison of Licorice Versus Sugar-Water Gargle for Prevention of Postoperative Sore Throat and Postextubation Coughing'. Anesth Analg 2013; 117: 614 -- 21. 21 | 22 | Postoperative sore throat is a common and annoying complication of endotracheal intubation. This study tested the hypothesis that gargling with licorice solution immediately before induction of anesthesia prevents sore throat and postextubation coughing in patients intubated with double-lumen tubes. 23 | 24 | # Data cleaning 25 | 26 | Data dictionary: https://higgi13425.github.io/medicaldata/reference/licorice_gargle.html 27 | 28 | The publicly available dataset only includes final analysis-ready/complete patients. 
To demonstrate the making of a consort diagram we've randomly created three new variables of exclusions: 29 | 30 | - eligibility 31 | - age \> 70 years 32 | - BMI outwith 18.5 - 30 33 | - intervention 34 | - did not receive intervention 35 | - withdrew consent 36 | - lost to follow up 37 | - died 38 | - refused assessment 39 | 40 | ```{r} 41 | library(tidyverse) 42 | library(consort) 43 | library(medicaldata) 44 | 45 | licodata = medicaldata::licorice_gargle %>% 46 | rowid_to_column("patient_id") %>% 47 | # make treatment var from 0,1 to factor 48 | mutate(randomisation = treat %>% 49 | factor() %>% 50 | fct_recode("Sugar" = "0", 51 | "Licorice" = "1")) %>% 52 | # assess eligigibility 53 | mutate(eligibility = case_when(preOp_age > 70 ~ "Age 70+", 54 | ! between(preOp_calcBMI, 18.5, 30) ~ "BMI not 18.5 - 30", 55 | .default = NA), 56 | # randomly generate intervention failed and lost to follow up variables 57 | intervention = sample(c("Did not receive intervention", "Withdrew consent",NA), 58 | size = 235, 59 | replace = TRUE, 60 | prob = c(0.1, 0.1, 0.9)), 61 | followup = sample(c("Died", "Refused Assessment", NA), 62 | size = 235, 63 | replace = TRUE, 64 | prob = c(0.1, 0.2, 0.7))) %>% 65 | mutate(randomisation = if_else(is.na(eligibility), randomisation, NA)) %>% 66 | mutate(intervention = if_else(is.na(eligibility), intervention, NA)) %>% 67 | mutate(followup = if_else(is.na(intervention), followup, NA)) 68 | ``` 69 | 70 | # CONSORT (Consolidated Standards of Reporting Trials) diagram 71 | 72 | ```{r} 73 | p_cons = consort_plot(licodata, 74 | order = list(patient_id = "Population", 75 | eligibility = "Excluded", 76 | randomisation = "Randomised", 77 | intervention = "Excluded", 78 | patient_id = "Received treatment", 79 | followup = "Lost to follow-up", 80 | patient_id = "Final analysis"), 81 | side_box = c("eligibility", "intervention", "followup"), 82 | allocation = "randomisation", 83 | cex = 0.8, 84 | text_width = 30) 85 | 86 | p_cons 87 | ``` 88 | 89 | ## 
Exporting 90 | 91 | ```{r} 92 | plot(p_cons, grViz = TRUE) |> 93 | DiagrammeRsvg::export_svg() |> 94 | charToRaw() |> 95 | rsvg::rsvg_pdf("consort_diagram.pdf") 96 | ``` 97 | 98 | -------------------------------------------------------------------------------- /2024-02/flowchart_files/figure-commonmark/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/2024-02/flowchart_files/figure-commonmark/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /2024-02/venn_diagram.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Venn Diagram" 3 | author: "HealthyR" 4 | date: "2024-01-16" 5 | output: 6 | pdf_document: default 7 | html_document: default 8 | --- 9 | 10 | ```{r setup, include=FALSE} 11 | knitr::opts_chunk$set(echo = TRUE) 12 | ``` 13 | 14 | ### Data prep 15 | 16 | ```{r} 17 | library(tidyverse) 18 | library(janitor) 19 | library(UpSetR) 20 | library(ggvenn) 21 | 22 | # clean names 23 | heart_df = read_csv("heart_df.csv")%>% 24 | clean_names() 25 | 26 | # ggvenn needs a logical format 27 | heart_df = heart_df %>% 28 | mutate(diabetes = as.logical(diabetes), 29 | smoking = as.logical(smoking), 30 | obesity = as.logical(obesity), 31 | alcohol_consumption = as.logical(alcohol_consumption), 32 | medication_use = as.logical(medication_use)) 33 | 34 | ``` 35 | 36 | Task 1: do some plotting and counts of obesity, smoking, and diabetes. Geom_bar(), count() etc. 
37 | 38 | 39 | ```{r} 40 | heart_df %>% 41 | ggplot(aes(x=factor(heart_attack_risk), fill=diabetes))+ 42 | geom_bar() 43 | ``` 44 | 45 | Task 2: investigate ggvenn() - plot a 2 variable venn diagram 46 | 47 | ```{r} 48 | heart_df %>% 49 | ggvenn(columns = c("diabetes", "smoking")) 50 | ``` 51 | 52 | Task 3: change auto_scale, fill_color, set_name_size, text_size 53 | 54 | ```{r} 55 | heart_df %>% 56 | ggvenn(columns = c("diabetes", "smoking"), 57 | auto_scale = TRUE, 58 | fill_color = c("purple", "green"), 59 | fill_alpha = 0.5 60 | ``` 61 | 62 | Task 4: plot a 3 variable venn diagram 63 | 64 | ```{r} 65 | heart_df %>% 66 | ggvenn(columns = c("diabetes", "smoking", "obesity", "alcohol_consumption"), 67 | auto_scale = FALSE, 68 | fill_color = c("purple", "green", "orange", "pink"), 69 | fill_alpha = 0.5) 70 | ``` 71 | 72 | 73 | Task 5: Upset plots 74 | 75 | ```{r} 76 | 77 | # too many variables for a venn diagram 78 | upset_data = heart_df %>% 79 | select(diabetes, obesity, smoking, alcohol_consumption, medication_use) %>% 80 | as.data.frame() 81 | 82 | upset_data = upset_data %>% 83 | mutate(across(.cols = everything(), ~as.integer(.))) %>% # turn logical variable to int 84 | rename(Diabetes = diabetes, 85 | Obesity = obesity, 86 | Smoking = smoking, 87 | `Alcohol Consumption` = alcohol_consumption, 88 | `Medication` = medication_use) 89 | ``` 90 | 91 | 92 | ```{r} 93 | # Make an upset plot 94 | upset(upset_data, order.by = "freq", 95 | mainbar.y.label = "Nuber of patients", 96 | main.bar.color = "lightblue", 97 | sets.bar.color = c("orange", "purple", "green", "pink", "blue"), 98 | #matrix.color = c("orange", "purple", "green", "pink", "blue"), 99 | att.color = c("orange", "purple", "green", "pink", "blue")) 100 | ``` 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | ### ANSWERS: 124 | 125 | ### Data Exploration 126 | 127 | ```{r} 128 | heart_df %>% count(diabetes) 129 | 
heart_df %>% count(smoking) 130 | heart_df %>% count(obesity) 131 | 132 | heart_df %>% 133 | ggplot(aes(x = heart_attack_risk, fill = diabetes))+ 134 | geom_bar() 135 | 136 | heart_df %>% 137 | ggplot(aes(x = heart_attack_risk, fill = smoking))+ 138 | geom_bar() 139 | 140 | heart_df %>% 141 | ggplot(aes(x = heart_attack_risk, fill = obesity))+ 142 | geom_bar() 143 | ``` 144 | 145 | ### Venn Diagrams 146 | 147 | ```{r} 148 | 149 | # ggvenn needs a logical format 150 | 151 | 152 | heart_df %>% 153 | ggvenn(columns = c("diabetes", "smoking"), 154 | auto_scale = TRUE) 155 | 156 | 157 | # make the plot look nicer 158 | heart_df %>% 159 | rename(Diabetes = diabetes, 160 | Smoking = smoking) %>% 161 | ggvenn(columns = c("Diabetes", "Smoking"), 162 | auto_scale = TRUE, 163 | fill_color = c("forestgreen", "deepskyblue"), 164 | set_name_size = 4, 165 | text_size = 3) 166 | 167 | # three variables 168 | heart_df %>% 169 | rename(Diabetes = diabetes, 170 | Smoking = smoking, 171 | Obesity = obesity) %>% 172 | ggvenn(columns = c("Diabetes", "Smoking", "Obesity"), 173 | fill_color = c("forestgreen", "deepskyblue", "coral"), 174 | set_name_size = 4, 175 | text_size = 3) 176 | 177 | # four variables - too complicated! 
178 | heart_df %>% 179 | ggvenn(columns = c("diabetes", "smoking", "obesity", "alcohol_consumption"), 180 | fill_color = c("forestgreen", "deepskyblue", "coral", "violetred"), 181 | set_name_size = 4, 182 | text_size = 3) 183 | ``` 184 | 185 | ### Upset Plot 186 | 187 | ```{r} 188 | 189 | 190 | upset_data = heart_df %>% 191 | select(diabetes, obesity, smoking, alcohol_consumption, medication_use) %>% 192 | as.data.frame() 193 | 194 | upset_data = upset_data %>% 195 | mutate(across(.cols = everything(), ~as.integer(.))) %>% 196 | rename(Diabetes = diabetes, 197 | Obesity = obesity, 198 | Smoking = smoking, 199 | `Alcohol Consumption` = alcohol_consumption, 200 | `Medication` = medication_use) 201 | 202 | upset(upset_data, order.by = "freq") 203 | 204 | 205 | upset(upset_data, order.by = "freq", 206 | main.bar.color = "plum", 207 | sets.bar.color = "steelblue", 208 | mainbar.y.label = "Number of Patients") 209 | ``` 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | -------------------------------------------------------------------------------- /2024-02/venn_diagrams-link.txt: -------------------------------------------------------------------------------- 1 | https://media.ed.ac.uk/playlist/dedicated/1_ccvd72lg/1_iw9sulkc 2 | -------------------------------------------------------------------------------- /2024-03/app.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | library(tidyverse) 3 | library(finalfit) 4 | 5 | # Dataset from medicaldata::licorice_gargle 6 | licodata_raw = read_csv("licodata_raw.csv") 7 | 8 | # reading in 0,1, recoding 9 | licodata = licodata_raw %>% 10 | mutate(preOp_gender.factor = preOp_gender %>% 11 | factor() %>% 12 | fct_recode("Male" = "0", 13 | "Female" = "1"), 14 | preOp_asa.factor = preOp_asa %>% 15 | factor() %>% 16 | fct_recode("a normal healthy patient" = "1", 17 | "a patient with mild systemic disease" = "2", 18 | "a patient with severe systemic disease" = "3"), 19 | 
treat.factor = treat %>% 20 | factor() %>% 21 | fct_recode("Sugar 5g" = "0", 22 | "Licorice 0.5g" = "1") %>% 23 | ff_label("Treatment"), 24 | preOp_smoking.factor = preOp_smoking %>% 25 | factor() %>% 26 | fct_recode("Current" = "1", 27 | "Past" = "2", 28 | "Never" = "3"), 29 | 30 | pod1am_throatPain.factor = pod1am_throatPain %>% 31 | factor() %>% 32 | fct_collapse("No Pain" = "0", 33 | other_level = "Pain") %>% ff_label("Throat Pain"), 34 | pod1am_cough.factor = pod1am_cough %>% 35 | factor() %>% 36 | fct_collapse("No Cough" = "0", 37 | other_level = "Cough") %>% 38 | ff_label("Cough"), 39 | 40 | pacu30min_swallowPain.factor = pacu30min_swallowPain %>% 41 | factor() %>% 42 | fct_collapse("No Pain" = "0", 43 | other_level = "Pain") %>% 44 | ff_label("Swallow Pain")) 45 | 46 | # Define UI for application that draws a histogram 47 | ui <- fluidPage( 48 | 49 | # Application title 50 | titlePanel("Licorice Data"), 51 | 52 | # Sidebar with a slider input for number of bins 53 | sidebarLayout( 54 | sidebarPanel( 55 | radioButtons("radio", label = h3("Variable Select"), 56 | choices = list("Throat Pain" = "pod1am_throatPain.factor", 57 | "Swallow Pain (30 min post-op)" = "pacu30min_swallowPain.factor", 58 | "Cough" = "pod1am_cough.factor"), 59 | selected = "pacu30min_swallowPain.factor") 60 | ), 61 | 62 | # Show a plot of the generated distribution 63 | mainPanel( 64 | plotOutput("barplot_output"), 65 | tableOutput("table_lico") 66 | ) 67 | ) 68 | ) 69 | 70 | # Define server logic required to draw a histogram 71 | server <- function(input, output) { 72 | 73 | output$barplot_output <- renderPlot({ 74 | licodata %>% 75 | drop_na(!!sym(input$radio)) %>% 76 | ggplot(aes(x = treat.factor, fill = !!sym(input$radio)))+ 77 | geom_bar() + 78 | theme_bw() + 79 | scale_fill_brewer() 80 | 81 | }) 82 | 83 | output$table_lico <- renderTable({ 84 | 85 | licodata %>% 86 | summary_factorlist(dependent = input$radio, 87 | explanatory = "treat.factor") 88 | }) 89 | } 90 | 91 | # Run the 
application 92 | shinyApp(ui = ui, server = server) 93 | 94 | # advanced examples: 95 | # https://github.com/riinuots/shinyviz 96 | # https://github.com/ewenharrison/shinyfit 97 | # https://argoshare.is.ed.ac.uk/tbs/ 98 | -------------------------------------------------------------------------------- /2024-03/factors.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(finalfit) 3 | 4 | # Dataset from medicaldata::licorice_gargle 5 | licodata_raw = read_csv("licodata_raw.csv") 6 | 7 | # reading in 0,1, recoding 8 | licodata = licodata_raw %>% 9 | mutate(preOp_gender.factor = preOp_gender %>% 10 | factor() %>% 11 | fct_recode("Male" = "0", 12 | "Female" = "1"), 13 | preOp_asa.factor = preOp_asa %>% 14 | factor() %>% 15 | fct_recode("a normal healthy patient" = "1", 16 | "a patient with mild systemic disease" = "2", 17 | "a patient with severe systemic disease" = "3"), 18 | treat.factor = treat %>% 19 | factor() %>% 20 | fct_recode("Sugar 5g" = "0", 21 | "Licorice 0.5g" = "1"), 22 | preOp_smoking.factor = preOp_smoking %>% 23 | factor() %>% 24 | fct_recode("Current" = "1", 25 | "Past" = "2", 26 | "Never" = "3"), 27 | 28 | pod1am_throatPain.factor = pod1am_throatPain %>% 29 | factor() %>% 30 | fct_collapse("No Pain" = "0", 31 | other_level = "Pain"), 32 | pod1am_cough.factor = pod1am_cough %>% 33 | factor() %>% 34 | fct_collapse("No Cough" = "0", 35 | other_level = "Cough"), 36 | pacu30min_swallowPain.factor = pacu30min_swallowPain %>% 37 | factor() %>% 38 | fct_collapse("No Pain" = "0", 39 | other_level = "Pain")) 40 | 41 | licodata %>% 42 | count(treat, treat.factor) 43 | 44 | licodata %>% 45 | ggplot(aes(preOp_asa.factor)) + 46 | geom_bar() 47 | 48 | licodata %>% 49 | finalfit("pod1am_throatPain", c("treat.factor")) 50 | 51 | licodata = licodata %>% 52 | mutate(treat = treat.factor, 53 | preOp_gender = preOp_gender.factor, 54 | preOp_asa = preOp_asa.factor) %>% 55 | select(-ends_with(".factor")) 56 
| 57 | licodata %>% 58 | write_csv("licodata.csv") 59 | 60 | # reading in CSV 61 | licodata_csv = read_csv("licodata.csv") 62 | 63 | licodata_csv %>% 64 | finalfit("pod1am_throatPain", c("treat")) 65 | 66 | licodata %>% 67 | drop_na(pod1am_throatPain) %>% 68 | ggplot(aes(preOp_asa, fill = factor(pod1am_throatPain))) + 69 | geom_bar(position = "fill") + 70 | scale_fill_brewer(palette = "BuGn") 71 | 72 | 73 | licodata = licodata %>% 74 | mutate(preOp_asa = fct_collapse(preOp_asa, 75 | "Healthy" = "a normal healthy patient", 76 | "Systemic disease" = c("a patient with mild systemic disease", 77 | "a patient with severe systemic disease"))) 78 | 79 | 80 | licodata = licodata %>% 81 | mutate(pod1am_throatPain2 = pod1am_throatPain %>% 82 | factor() %>% 83 | fct_collapse( 84 | "No pain" = "0", 85 | other_level = "Pain")) 86 | 87 | licodata %>% 88 | drop_na(pod1am_throatPain2) %>% 89 | ggplot(aes(preOp_asa, fill = pod1am_throatPain2)) + 90 | geom_bar(position = "fill") + 91 | scale_fill_brewer(palette = "BuGn") 92 | -------------------------------------------------------------------------------- /2024-03/ggplot_efficient.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "HealthyR demo: efficient ggplotting" 3 | format: html 4 | editor: visual 5 | execute: 6 | echo: true 7 | warning: false 8 | editor_options: 9 | chunk_output_type: console 10 | --- 11 | 12 | # Dataset 13 | 14 | We are using the same dataset as our last few demos - licorice gargle RCT. 
15 | 16 | ```{r} 17 | #| echo: false 18 | library(tidyverse) 19 | 20 | # Load dataset 21 | licodata_raw = medicaldata::licorice_gargle 22 | 23 | # reading in 0,1, recoding 24 | licodata = licodata_raw %>% 25 | mutate(preOp_gender.factor = preOp_gender %>% 26 | factor() %>% 27 | fct_recode("Male" = "0", 28 | "Female" = "1"), 29 | preOp_asa.factor = preOp_asa %>% 30 | factor() %>% 31 | fct_recode("a normal healthy patient" = "1", 32 | "a patient with mild systemic disease" = "2", 33 | "a patient with severe systemic disease" = "3"), 34 | treat.factor = treat %>% 35 | factor() %>% 36 | fct_recode("Sugar 5g" = "0", 37 | "Licorice 0.5g" = "1"), 38 | preOp_smoking.factor = preOp_smoking %>% 39 | factor() %>% 40 | fct_recode("Current" = "1", 41 | "Past" = "2", 42 | "Never" = "3") 43 | ) 44 | ``` 45 | 46 | # Plot of smokers 47 | 48 | ```{r} 49 | barplot_count = function(df, var, xlab = ""){ 50 | df %>% 51 | ggplot(aes(x = {{var}} %>% 52 | fct_infreq() %>% 53 | fct_rev())) + 54 | geom_bar(fill = "seagreen3", 55 | colour = "seagreen4") + 56 | coord_flip() + 57 | theme_classic() + 58 | geom_label(aes(label = after_stat(count / sum(count)) %>% scales::percent(1)), 59 | stat = "count", 60 | hjust = 1, 61 | size = 5) + 62 | ylab("N Patients") + 63 | xlab(xlab) + 64 | scale_y_continuous(expand = c(0, 0)) 65 | } 66 | 67 | ``` 68 | 69 | ```{r} 70 | 71 | barplot_count(licodata, preOp_smoking.factor, "Smoking") 72 | 73 | barplot_count(licodata, treat.factor, "Treatment") 74 | 75 | barplot_count(licodata, preOp_asa.factor, "ASA score") 76 | 77 | barplot_count(licodata, preOp_gender.factor, "Gender") 78 | 79 | ``` 80 | -------------------------------------------------------------------------------- /2024-03/ggplot_efficient_transcript.txt: -------------------------------------------------------------------------------- 1 | Hello and welcome to another healthy r demo. Today's demo is on efficient ggplotting. 
What we mean by that is sometimes you might want to make a similar plot multiple times. It's like the same plot of multiple different variables and you might end up copy pasting a lot of code. We will show you how to create your own plotting function that will reduce what you need to copy paste. We are continuing using the same licorice gargle dataset that we've been using in the past a few demos. Stella, could you please just execute the first chunk? This code is copied from the factors demo a couple of weeks ago. Should look familiar. If it doesn't, the recording of the demo is up there. Maybe click on the dataset as well so that people who haven't seen it can see that. Can you go to the very end of the dataset? Because that's where the new variables are. So the babis we'll be plotting is the sex of the patients ASA. That's the general health factor, whether they received the treatment or the placebo which was a sugar water, and whether they were smoker. We will basically make four plots in a very efficient way. Can we please jump back to the Q empty mark? Yes, just hide that chunk. Firstly, we'll do a normal ggplot. A chunk please. And piped into GG plot, the aesthetic word plot in the underscore smoking factor. Yeah, factor we're using is bar. Yeah, that's the pop plot. And we'll now quickly apply a few modifications on the pop plot. U, I'm wondering, just based on the size of your screen, which is quite a good size screen, I think it might make sense to send the plot to the plot window so we can show people how to do that in the setting. The next to render, next to render, yeah, chunk output in console, this is confusing but to remove output, if you send output to the console, the console sends plots to the plot window. You can close the outline. You can see where it says outline, close that. Okay, that just because we have enough space for the plot and code to be side by side, we'll do a couple of nice modifications. 
I like to give my bar plots colourful outer edge. I'll show you what that means inside the par function. Please write fill equals seagreen3. 3, run that. Now do colour equals seagreen4. four. Fill is inside the bar and colour is the outer border. Obviously you could write a hundreds of different colour names in there or you could use it hex code, which is a universal way to denote colours. We've done that. This is what I've decided to go with today. Please add on the coord_flip, we're going to flip the coordinates. This is useful for long labels you want to read because now the words never past and current people can read them horizontally. They often fit better. I like plots and add on the theme classic as well, please. Okay, another modification we're going to do is we're going to put the actual count on the bars because at the moment you can see that there's probably like a 70 something non smokers and past smokers and 80 something current smokers. But the way to add on to the plot, this is quite a geom_ label. I'll tell you what the cool aesthetic label equals. After underscore stat. Yeah, like that. Not quoted. Brackets. Exactly. Yeah. After the aesthetic bracket, there's a lot of brackets, dot equals and this time count is quoted. Obviously, I copied this from online that we would you look at that in the past. People might have seen dot or you might have counted up your data before and then added it on some other way. This is the way to do it that they added in two years ago and they've deprecated dot, frankly was very confusing. This is the way to do it. Now, since we've been quite fast, I might as well assure you that you can also do calculations in there. For example, if you wanted to present the percentage count, go inside there please. After count, do divide, divide some brackets count. Yeah. Run that. You can see you can present the percentage rather than the actual count as well. If you pipe that into the scale sales percent. Yeah, Exactly. 
Look at that like that. Do I need non numeric? Yeah. Yeah. I think one of the brackets is Missing. Yeah, put bracket there. And then probably remove one from the end after. Yeah, let's try that. See inside percent, if you just type a number one, that gives an example of what you want the position to be, look at that. How cool is that, right? Finally, finally, please add on a Y lab and xlab, so plus ylab equals quoted n patients. It always confuses me because once you do coordinate the flip, it still knows. Yeah. Anyway, run it. Smokers or smoking. Yeah. Run that. Yeah. I almost always get it, the X and Y, but you know, it's trial and you can see what the But yeah. Right. Let's I think this is a fairly good looking plot. Now I want to make four of these. You could obviously just copy paste those ten or so lines. But then the issue with that is if you then end up making a change, you have to change it in all those four places so it makes your a bit lengthy, unnecessarily lengthy. We're just going to show you how to efficiently make that into a function and then copy it as a function. We'll show you how to do that. Please add the first line where it says licodata. Yeah, type in bar plot underscore. This is the name of the new. It could be any, it could be Stella. This is something you made up. This is not a function that already exists. Equals function brackets. Yeah, inside the function brackets, we will put two arguments. One is we'll call it DF data frame, comma VAR variable. Again, these are names that you made up, so you could call it something else. We're going to do curly brackets. Oh can you just do? The curly bracket needs to come immediately after the Yeah, exactly. I'm not even sure about that space, but basically, Yeah. Yeah, yeah, we're going to change Licodata to DF. So we're going to use whatever data frame is passed on from the argument. We're also going to change from pre smoking factor to Var. 
Now the only tricky bit here is that, that var inside aesthetic X needs to be in double curly brackets. Have you ever done double curly brackets before? Stella? No. Yeah. If you now this chunk, nothing happens. Well, if you remove the plot, can you remove the plot using the brush? Yeah, you run that chunk, this is a good one. I was going to demonstrate that to run that chunk. Normally R knows very well where the chunk is, but if you run it in the middle of the chunk, it doesn't capture the functions to the cursor. Has to be behind the ply bracket or on the first line. If you run that, it doesn't error. Nothing seemingly happens. But what happens is if you look at the environment and there's a new section called functions. If you click on that, you can see that this is what you've defined and this is now available for you to use throughout your document. Let's please now add in the new in that chunk. Use that function, type in bar plus, pluck as arguments, use licodata and the name of the smoking variable. It's not quoted. The reason we had to use double curly brackets, if you built the function that expected a quoted variable, then you wouldn't use double curly brackets to capture it inside the function. But because ggplot and mutate and dplyr and group by work on non quoted variables, that's where the double curly brackets come in for function building. Yeah, look at that. Still has now created the same plot using a single line, which means she can now copy that line and change the name of the variable to do all of the other plots, so I don't remember what the name was, but yeah, treat factor maybe? Yeah, pre-op ASA, treat and gender, that works. And that obviously these four lines are much neater than if she had copied all of the lines above. And if she makes a change above, now you have to remember to rerun the function again. 
And then you can do one obvious issue is that the label we had made the labelling to smoking inside when you run a function and easiest thing to do, ggplot allows you to overwrite a label. You could Stella go back down and where it says treatment treat factor plus X lab. Yeah, you could do further ggplot modifications. Yeah, it treatment. You could also, if you wanted to change the scale or colour colour trick, I'll show you colour differently. But if you were using a palette or something like that, she's now changed the treatment able. She could add that on. However, maybe to make it just a little bit neater instead of adding it on, which is absolutely fine, let's add it as an additional argument to the function. Yeah, exactly. We currently have two arguments, one is the data frame tibble, and the other one is the viable name. Add another final argument called X lab equals and make it empty by default, just empty brackets. Right now inside. Yeah. Let's just leave it there where it says smoking, Replace smoking with X lab. Exactly. That's exactly how it should be. It doesn't need the curly brackets because it's getting a normal string and it's not getting like a special ggplot variable. If you re run that again, nothing happens. Bar you can see if you run the smoking factor, the run that it takes away the X lab which sometimes is good, never past and current, they're not very clear but sometimes your variable is self explanatory. It might say never smoked, in which case you don't want the lab. But if you no added in as a third argument smoking, it reappears and similar. And now you can add that into all of the other ones as well. When you're used to using final fit, you're used to using FF label and then final fit plots and tables will automatically take the label off the column and use the label. Now GGlot is not capable of doing that. Ggplot will never automatically grab the label from the. Column ff_label or hmisc label or something like that. 
With ggplot, you do have to either manually label your variables or you could make your variable names nice and capitalised, but I would recommend that that's very confusing. Right. Stella very quickly. I don't know what this one is supposed to be. I don't actually. Yeah. Anyone want to show out what the acronym sounds for? It's how fit the patient is, but I can't remember what the acronym sounds for. You can you can just Google the ASA score, objective assessment of a patient's overall health based on five classes. Even that definition doesn't tell you what the letters stand for. Yeah. There. Right. Does anyone have any questions about this? Any suggestions for changes, requests for changes based on what we've just shown you? Do you want to render that file to see what it looks like? You see that still? Yeah. Yeah, we can. Obviously, in quiet options as we showed in I think November demos, you could hide the code and only adding nice text if you were sending this to someone who doesn't do. But basically this is how we quickly, quite efficiently created for plots. There are a couple of neat maybe tricks and changes I have noted that I'm happy to show you. But if anyone can think of any other things they would like to see do with these plots. Stella. If you jump back into the thing, go to barplot count again, please. A function I quite often use when I do barplot is fact in frequency. You put that around the double curly brackets. Type in FCT. Underscore infrequency. Yeah, exactly. Make the brackets. It's a lot of brackets but it's worth it. Okay. Save it. To run the function, you either have to run the whole chunk or you need to move your cursor to the e, to the first or last line. If you run into the middle, the function gets confused. Now go and create the ASA score plot again. For example, you can see that factor in frequency orders your bars based on the frequency they appear in the dataset. 
By default, if it's a factor, it gets plotted as the factor levels, which makes sense for ordinal variables. If it's not a factor, it will automatically plot it alphabetically. Factoring frequency enables you to tell it to do that, I often reverse it. I often like to have the common one at the top. If you now add in another thing, we should have probably piped actually. Yeah, I'm usually when I add in the single Yeah. Pipe that into factor in frequency and then pipe that into factor reverse and reverse. Just yeah, try that now we get that at the top, ggplot has a bit of an issue that it sometimes doesn't add enough space at the end of to see how the 57% is slightly cut off. There's two ways of doing that. We could increase the plot, which frankly I do a quite often or we can move the labels in. I think today I'm going to show moving the labels in, but if you definitely want to have a label that is slightly outward, is also possible to just extend the plot to move the tables inside the bars. Geom_label, after_stat count, hjust equals and change it to one. That's horizontal justification that moves the labels around one on the plot together. Yeah, see that moves the labels to be inside the bars. You can horizontal justification is number between -1 and 1. Try minus one. Where it put it there just to remind myself. Okay. Minus one is put it too far. To the minus one puts it Yeah. On the other side. Too far. Yeah. Okay. Zero is if you want it at the end of the bar, minus word puts it even further. And one is if you want it to end count. Yeah, I think one made sense here in this case that solves our plot area issue. I like when bar plots of start from zero, I always add on at the bottom of where it says x label scale continuous and expand equals 00 and run that. Because you can see GG plot automatically adds padding and I like to especially remove the padding from the zero. 
Sometimes, especially if I do have labels that are extending, I will remove the padding from the one end only and add some padding at the other end. All right. You could also make the labels a bit bigger by going back to just do hjust comma, size equals 5. 5 is as good as numbers. Yeah, so now the labels are big. Has anyone thought of any other requests or questions they would want to see since we're doing that like a ggplot demo? Well, that is a demo that we actually kept to 30 minutes. Then I'm going to finish the demo recording and then we will take the one question, the clinic question we had, or if anyone thought of any other questions. I'll stop that. Thanks for watching. -------------------------------------------------------------------------------- /2024-03/licodata_raw.csv: -------------------------------------------------------------------------------- 1 | preOp_gender,preOp_asa,preOp_calcBMI,preOp_age,preOp_mallampati,preOp_smoking,preOp_pain,treat,intraOp_surgerySize,extubation_cough,pacu30min_cough,pacu30min_throatPain,pacu30min_swallowPain,pacu90min_cough,pacu90min_throatPain,postOp4hour_cough,postOp4hour_throatPain,pod1am_cough,pod1am_throatPain 2 | 0,3,32.98,67,2,1,0,1,2,0,0,0,0,0,0,0,0,0,0 3 | 0,2,23.66,76,2,2,0,1,1,0,0,0,0,0,0,0,0,0,0 4 | 0,2,26.83,58,2,1,0,1,2,0,0,0,0,0,0,0,0,0,0 5 | 0,2,28.39,59,2,1,0,1,3,0,0,0,0,0,0,0,0,0,0 6 | 0,1,30.45,73,1,2,0,1,2,0,0,0,0,0,0,0,0,0,0 7 | 0,2,35.49,61,3,1,0,1,3,0,0,0,0,0,0,0,0,0,0 8 | 0,3,25.5,66,1,1,0,1,3,0,0,0,0,0,0,0,0,0,0 9 | 0,2,31.1,61,2,1,0,1,1,0,0,0,0,0,0,0,0,0,0 10 | 0,3,21.22,83,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0 11 | 0,3,27.16,69,2,3,0,1,2,0,0,0,0,0,0,0,0,0,0 12 | 0,2,19.08,31,1,1,0,1,2,0,0,0,0,0,0,0,0,0,0 13 | 0,2,25.68,72,2,2,0,1,1,0,0,0,0,0,0,0,0,0,0 14 | 0,2,27.64,66,2,3,0,1,1,0,0,0,0,0,0,0,0,0,0 15 | 0,2,23.5,30,1,3,0,1,2,0,0,0,0,0,0,0,0,0,0 16 | 0,3,20.2,69,2,3,0,1,2,0,0,0,0,0,0,0,0,0,0 17 | 0,2,29.03,29,2,1,0,1,2,0,0,0,0,0,0,0,0,0,0 18 | 0,3,26.3,71,1,3,0,1,2,0,0,0,0,0,0,0,0,0,0 19 | 
0,2,32.32,58,2,3,0,1,1,0,0,0,0,0,0,0,0,0,0 20 | 0,2,30.07,66,2,2,0,1,2,0,0,0,0,0,0,0,0,0,0 21 | 0,1,34.33,46,1,2,0,1,1,0,0,0,0,0,0,0,0,0,0 22 | 0,1,26.29,55,2,1,0,1,1,0,0,0,0,0,0,0,0,0,0 23 | 0,1,19.88,30,2,1,0,1,2,1,0,0,0,0,0,0,0,0,0 24 | 0,1,25.62,50,2,2,0,1,2,1,0,0,0,0,0,0,0,0,0 25 | 0,1,22.84,22,3,3,0,1,2,1,0,0,0,0,0,0,0,0,0 26 | 0,3,27.78,68,3,2,0,1,2,1,0,0,0,0,0,0,0,0,0 27 | 0,2,25.5,72,2,3,0,1,2,0,1,0,0,0,0,0,0,0,0 28 | 0,1,20.68,33,1,3,0,1,1,0,0,0,0,1,0,0,0,0,0 29 | 0,1,24.49,19,1,2,0,1,1,0,1,0,0,1,0,1,0,0,0 30 | 0,2,26.23,36,2,1,0,1,1,0,0,0,0,0,0,0,0,1,0 31 | 0,1,20.94,24,1,3,0,1,1,0,0,0,0,0,0,0,0,1,0 32 | 0,2,21.16,30,1,1,0,1,2,0,0,0,0,0,0,1,0,1,0 33 | 0,3,25.95,29,2,3,0,1,1,0,0,0,0,0,0,1,0,1,0 34 | 0,1,16.72,27,2,1,0,1,2,2,0,0,0,0,0,1,0,1,0 35 | 0,3,32.61,59,2,2,0,1,2,1,0,0,0,0,0,1,0,1,0 36 | 0,2,28.98,68,2,2,0,1,2,1,0,0,0,0,0,1,0,1,0 37 | 0,3,18.73,53,3,2,0,1,2,0,0,0,0,1,0,1,0,1,0 38 | 0,2,23.14,67,1,2,0,1,2,0,0,0,0,1,0,2,0,2,0 39 | 0,2,26.37,64,2,2,0,1,2,0,0,0,0,0,0,0,0,0,2 40 | 0,2,25.38,73,2,3,0,1,2,0,0,0,0,0,0,0,0,0,3 41 | 0,2,31.18,76,2,1,0,1,1,0,1,0,0,0,0,0,0,1,1 42 | 0,2,24.38,28,2,1,0,1,2,0,0,0,2,1,0,1,0,1,0 43 | 0,2,31.64,49,2,1,0,1,2,0,0,0,0,0,0,0,1,0,0 44 | 0,2,29.38,57,1,1,0,1,2,0,0,0,0,0,0,1,1,0,0 45 | 0,3,29.39,63,1,3,0,1,1,0,0,0,0,0,0,1,1,1,0 46 | 0,1,27.38,52,2,1,0,1,2,0,0,0,0,1,0,1,3,1,0 47 | 0,2,22.98,59,2,1,0,1,2,0,0,0,0,0,0,0,3,0,2 48 | 0,3,29.01,64,3,3,0,1,2,0,0,0,0,0,0,0,2,0,2 49 | 0,2,27.77,71,2,2,0,1,1,0,0,0,0,0,0,0,2,0,1 50 | 0,2,25.01,55,1,3,0,1,2,0,0,0,0,0,0,1,2,1,3 51 | 0,2,28.06,64,3,1,0,1,3,0,0,0,0,0,0,1,2,1,1 52 | 0,3,27.04,65,1,2,0,1,3,1,0,0,0,0,0,1,3,1,2 53 | 1,2,24.62,53,2,1,0,1,1,0,0,0,0,0,0,0,0,0,0 54 | 1,2,21.05,44,1,1,0,1,2,0,0,0,0,0,0,0,0,0,0 55 | 1,2,22.68,63,2,1,0,1,2,0,0,0,0,0,0,0,0,0,0 56 | 1,1,20.32,69,1,1,0,1,2,0,0,0,0,0,0,0,0,0,0 57 | 1,2,24.74,53,1,3,0,1,1,0,0,0,0,0,0,0,0,0,0 58 | 1,3,27.24,55,1,3,0,1,2,0,0,0,0,0,0,0,0,0,0 59 | 1,1,22.66,50,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0 60 | 
1,3,22.99,56,2,1,0,1,2,0,0,0,0,0,0,0,0,0,0 61 | 1,3,30.41,65,3,2,0,1,1,0,0,0,0,0,0,0,0,0,0 62 | 1,2,28.93,60,2,3,0,1,1,0,0,0,0,0,0,0,0,0,0 63 | 1,2,22.86,44,1,2,0,1,2,0,0,0,0,0,0,0,0,0,0 64 | 1,3,22.66,63,1,2,0,1,2,0,0,0,0,0,0,0,0,0,0 65 | 1,3,25.71,62,2,2,0,1,2,0,0,0,0,0,0,0,0,0,0 66 | 1,1,22.77,41,1,3,0,1,2,0,0,0,0,0,0,0,0,0,0 67 | 1,1,18.82,29,1,3,0,1,2,0,0,0,0,0,0,0,0,0,0 68 | 1,2,25.91,63,2,2,0,1,2,0,0,0,0,0,0,0,0,0,0 69 | 1,2,23.23,81,2,3,0,1,2,0,0,0,0,0,0,0,0,0,0 70 | 1,1,18.75,63,1,1,0,1,2,0,0,0,0,0,0,0,0,0,0 71 | 1,2,24.61,73,2,2,0,1,2,1,0,0,0,0,0,0,0,0,0 72 | 1,3,26.85,50,3,1,0,1,2,1,0,0,0,0,0,0,0,0,0 73 | 1,3,16.38,42,2,3,0,1,2,1,0,0,0,0,0,0,0,0,0 74 | 1,3,25.26,73,3,3,0,1,2,1,0,0,0,0,0,0,0,0,0 75 | 1,3,29.64,57,2,3,0,1,1,2,0,0,0,0,0,0,0,0,0 76 | 1,2,23.51,71,2,2,0,1,2,1,1,0,0,0,0,1,0,0,0 77 | 1,2,36.33,56,1,3,0,1,2,0,0,0,0,0,0,0,0,2,0 78 | 1,3,30.48,65,2,3,0,1,2,0,0,0,0,0,0,1,0,1,0 79 | 1,2,19.4,73,2,2,0,1,2,0,0,0,0,0,0,1,0,1,0 80 | 1,2,21.2,48,3,1,0,1,3,0,1,0,0,0,0,1,0,2,0 81 | 1,1,17.72,48,1,3,0,1,1,1,1,0,0,0,0,1,0,1,0 82 | 1,2,28.06,60,1,1,0,1,2,0,0,0,0,0,0,0,0,0,1 83 | 1,2,33.67,65,2,3,0,1,2,0,0,0,0,0,0,0,0,0,1 84 | 1,1,32.05,52,2,2,0,1,2,0,0,0,0,0,0,0,0,1,2 85 | 1,2,18.36,47,2,1,0,1,2,0,1,0,0,1,0,0,0,1,1 86 | 1,2,22.06,60,1,3,0,1,2,1,0,0,0,0,0,1,0,1,1 87 | 1,1,23.03,39,1,3,0,1,2,0,1,0,0,1,0,1,0,1,2 88 | 1,3,25.61,66,2,2,0,1,3,1,0,0,2,0,0,0,0,0,0 89 | 1,1,19.63,46,2,1,0,1,1,1,0,0,2,0,0,0,0,0,0 90 | 1,2,28.34,53,2,1,0,1,2,0,0,0,0,0,0,0,2,0,0 91 | 1,2,30.12,48,2,3,0,1,1,2,0,0,0,0,0,1,2,0,0 92 | 1,1,20.03,27,1,3,0,1,2,1,0,0,0,0,0,0,1,1,0 93 | 1,2,24.16,68,3,3,0,1,1,0,0,0,0,0,0,1,1,1,0 94 | 1,2,27.85,66,1,1,0,1,2,0,0,0,0,0,0,0,2,0,1 95 | 1,2,26.84,72,2,2,0,1,1,0,0,0,0,0,0,0,2,0,3 96 | 1,2,27.24,66,2,3,0,1,2,0,0,0,0,0,1,0,1,0,1 97 | 0,2,29.73,61,2,1,0,1,2,0,0,1,1,0,0,0,0,0,0 98 | 0,3,26.81,45,1,2,0,1,3,0,0,1,1,0,0,0,0,0,0 99 | 0,2,26.15,72,2,2,0,1,2,0,0,1,1,0,0,0,0,0,0 100 | 0,2,26.22,68,2,2,0,1,1,1,0,1,1,0,0,0,0,0,0 101 | 
0,2,20.52,21,1,1,0,1,1,1,1,1,1,1,0,0,0,0,0 102 | 0,1,28.71,47,2,1,0,1,2,0,1,2,2,0,0,0,0,1,0 103 | 0,3,34.11,68,2,2,0,1,2,1,1,1,0,1,1,0,0,0,0 104 | 0,3,29.41,47,2,1,0,1,2,2,1,3,0,0,2,0,0,0,1 105 | 0,2,31.28,80,3,2,0,1,1,0,0,1,1,0,1,0,0,0,0 106 | 0,2,24.61,47,3,1,0,1,2,1,1,1,2,1,1,1,0,0,0 107 | 0,2,27.34,75,2,2,0,1,3,0,0,1,1,0,1,0,0,3,0 108 | 0,2,24.34,63,2,2,0,1,2,0,0,2,0,0,0,0,2,0,1 109 | 0,3,22.15,63,2,1,0,1,3,0,1,1,1,1,0,0,1,0,2 110 | 0,2,27.38,72,2,2,0,1,2,1,1,3,6,1,1,1,1,0,0 111 | 0,3,20.55,54,2,1,0,1,2,0,1,2,2,1,2,2,2,1,0 112 | 0,2,18.47,66,2,1,0,1,2,1,0,1,1,0,1,0,1,0,1 113 | 0,2,28.63,66,1,1,0,1,2,1,1,1,1,0,1,0,2,0,1 114 | 0,2,24.58,70,2,2,0,1,3,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA 115 | 1,2,22.48,76,2,3,0,1,1,0,0,1,1,0,0,0,0,0,0 116 | 1,2,29.39,51,1,2,0,1,2,0,0,1,1,0,0,0,0,0,0 117 | 1,2,19.2,46,1,3,0,1,1,0,0,1,1,0,0,0,0,0,0 118 | 1,2,20.31,68,2,1,0,1,2,0,0,1,1,1,1,0,0,0,0 119 | 1,2,26.12,68,1,3,0,1,2,1,1,4,4,1,3,1,1,1,1 120 | 0,2,32.37,74,2,3,0,0,1,0,0,0,0,0,0,0,0,0,0 121 | 0,1,25.76,62,1,1,0,0,2,0,0,0,0,0,0,0,0,0,0 122 | 0,1,24.45,65,2,1,0,0,2,0,0,0,0,0,0,0,0,0,0 123 | 0,2,28.09,55,2,1,0,0,2,0,0,0,0,0,0,0,0,0,0 124 | 0,1,30.27,69,2,2,0,0,3,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA 125 | 0,1,20.34,55,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0 126 | 0,3,28.08,66,2,1,0,0,1,0,0,0,0,0,0,0,0,0,0 127 | 0,2,22.65,70,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0 128 | 0,1,20.05,31,3,1,0,0,1,0,0,0,0,0,0,0,0,0,0 129 | 0,2,27.16,63,2,2,0,0,2,0,0,0,0,0,0,0,0,0,0 130 | 0,1,20.11,28,1,3,0,0,2,0,0,0,0,0,0,0,0,0,0 131 | 0,1,20.06,21,2,1,0,0,1,0,0,0,0,0,0,0,0,0,0 132 | 0,3,22.68,67,2,3,0,0,2,0,0,0,0,0,0,0,0,0,0 133 | 0,2,27.02,70,1,3,1,0,2,0,0,0,0,0,0,0,0,0,0 134 | 0,2,30.42,70,2,2,0,0,2,2,0,0,0,0,0,0,0,0,0 135 | 0,2,26.03,66,2,1,0,0,2,2,0,0,0,0,0,0,0,0,0 136 | 0,1,20.05,18,1,3,0,0,1,0,1,0,0,0,0,0,0,0,0 137 | 0,2,24.34,65,1,1,0,0,2,0,1,0,0,0,0,0,0,0,0 138 | 0,3,18.31,22,2,3,0,0,1,0,0,0,0,0,0,0,0,1,0 139 | 0,2,31.21,65,2,1,0,0,2,0,0,0,0,0,0,0,0,2,0 140 | 0,2,27.76,40,2,3,0,0,2,0,0,0,0,0,0,0,0,1,0 141 | 
0,2,30.04,54,3,3,0,0,2,2,0,0,0,0,0,0,0,1,0 142 | 0,2,28.03,68,2,3,0,0,2,1,0,0,0,0,0,0,0,1,0 143 | 0,2,23.04,62,2,1,0,0,2,0,0,0,0,1,0,0,0,1,0 144 | 0,1,21.33,30,3,3,0,0,2,0,0,0,0,0,0,1,0,1,0 145 | 0,2,30.76,36,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0 146 | 0,3,20.57,60,2,1,0,0,2,1,0,0,0,0,0,1,0,1,0 147 | 0,2,25.25,65,1,3,0,0,2,1,0,0,0,0,0,1,0,1,0 148 | 0,2,30.86,45,2,1,0,0,2,0,0,0,0,1,0,1,0,1,0 149 | 0,2,27.13,73,2,2,0,0,2,1,0,0,0,1,0,2,0,2,0 150 | 0,2,19.2,59,1,3,0,0,2,0,0,0,0,0,0,0,0,0,1 151 | 0,2,30.9,48,3,2,0,0,2,0,1,0,0,0,0,0,0,0,1 152 | 0,3,29.95,45,1,1,0,0,2,0,0,0,0,0,0,1,0,1,1 153 | 0,3,27.66,66,2,2,0,0,2,0,0,0,0,1,0,1,0,1,2 154 | 0,2,29.05,73,2,1,0,0,2,1,0,0,0,1,0,1,0,3,2 155 | 0,3,27.77,69,1,1,0,0,2,0,0,0,0,0,0,1,1,1,0 156 | 0,3,31.35,53,2,2,0,0,3,0,0,0,0,0,0,1,2,1,0 157 | 0,2,24.38,65,2,1,0,0,1,0,0,0,0,0,0,0,1,0,1 158 | 0,2,26.73,70,1,1,0,0,2,0,0,0,0,0,0,1,2,1,1 159 | 0,2,33.56,66,3,1,0,0,2,0,0,0,0,0,0,1,2,1,2 160 | 0,2,33.22,73,3,1,0,0,2,0,1,0,0,0,0,1,1,1,1 161 | 1,3,23.34,63,2,1,0,0,2,0,0,0,0,0,0,0,0,0,0 162 | 1,3,31.53,61,3,3,0,0,2,0,0,0,0,0,0,0,0,0,0 163 | 1,2,25.85,66,2,3,0,0,2,0,0,0,0,0,0,0,0,0,0 164 | 1,1,20.83,44,1,3,0,0,1,0,0,0,0,0,0,0,0,0,0 165 | 1,3,17.01,65,1,3,0,0,2,0,0,0,0,0,0,0,0,0,0 166 | 1,2,26.3,64,1,3,0,0,1,0,0,0,0,0,0,0,0,0,0 167 | 1,1,20.32,34,1,1,0,0,2,1,0,0,0,0,0,0,0,0,0 168 | 1,2,30.44,70,3,3,0,0,2,1,0,0,0,0,0,0,0,0,0 169 | 1,2,26.57,63,2,2,0,0,2,1,0,0,0,0,0,0,0,0,0 170 | 1,1,21.37,32,1,2,0,0,2,2,0,0,0,0,0,0,0,0,0 171 | 1,2,22.27,84,2,3,0,0,2,1,0,0,0,0,0,0,0,0,0 172 | 1,2,22.45,64,2,3,0,0,2,0,1,0,0,0,0,0,0,0,0 173 | 1,1,26.49,48,2,3,0,0,2,1,1,0,0,0,0,0,0,0,0 174 | 1,2,24.8,58,2,3,0,0,1,0,0,0,0,0,0,1,0,0,0 175 | 1,2,23.63,81,2,1,0,0,2,0,0,0,0,0,0,0,0,1,0 176 | 1,2,25.04,73,2,1,0,0,2,0,0,0,0,0,0,0,0,1,0 177 | 1,3,30.8,67,3,1,0,0,2,0,0,0,0,0,0,0,0,1,0 178 | 1,3,31.57,60,4,2,0,0,2,0,0,0,0,0,0,0,0,1,0 179 | 1,2,30.49,79,2,3,0,0,2,0,0,0,0,0,0,1,0,1,0 180 | 1,2,26.56,53,2,1,0,0,2,0,0,0,0,0,0,1,0,1,0 181 | 1,2,21.61,35,1,1,0,0,1,0,0,0,0,0,0,1,0,1,0 
182 | 1,3,34.11,68,3,1,0,0,2,2,0,0,0,0,0,0,0,0,1 183 | 1,2,19.4,26,1,2,0,0,1,0,0,0,1,0,0,0,0,0,0 184 | 1,2,26.18,62,2,2,0,0,2,0,0,0,0,0,0,0,1,0,0 185 | 1,2,28.08,69,2,1,0,0,2,2,0,0,0,0,0,0,1,1,0 186 | 1,2,28.12,60,1,2,0,0,3,0,0,0,0,0,0,0,4,0,1 187 | 1,2,25.26,73,1,2,0,0,2,0,0,0,0,0,0,0,1,0,1 188 | 1,2,18.87,66,2,3,0,0,1,0,0,0,0,0,0,0,5,0,2 189 | 1,3,23.88,79,2,3,0,0,1,1,0,0,0,0,0,0,3,0,2 190 | 1,3,23.23,86,2,3,0,0,1,1,0,0,0,0,0,0,1,0,2 191 | 1,3,29.76,42,2,2,0,0,2,0,0,0,0,0,0,0,2,1,1 192 | 1,2,25.25,56,2,1,0,0,2,0,0,0,0,0,0,1,1,1,1 193 | 1,3,20.76,48,1,3,0,0,2,0,0,0,0,0,0,1,1,1,1 194 | 1,2,26.93,38,2,1,0,0,2,1,0,0,0,1,1,1,2,1,2 195 | 0,1,26.51,52,1,2,0,0,2,2,1,2,2,0,0,0,0,0,0 196 | 0,2,23.32,84,3,2,0,0,2,0,0,3,3,0,0,0,0,1,0 197 | 0,3,21.8,80,1,3,0,0,1,0,0,2,4,0,2,0,0,0,0 198 | 0,3,27.76,66,3,2,0,0,3,2,0,2,2,0,2,0,0,0,0 199 | 0,2,27.15,67,2,2,0,0,2,1,0,4,8,0,2,0,0,0,0 200 | 0,3,25.69,82,2,3,0,0,2,0,0,2,2,0,2,0,0,0,1 201 | 0,2,28.41,54,3,2,0,0,2,0,0,2,4,0,2,0,1,0,0 202 | 0,3,19.93,65,1,1,0,0,2,0,0,2,2,0,2,0,1,0,0 203 | 0,2,22.06,81,2,2,0,0,2,1,1,2,2,0,1,0,1,0,0 204 | 0,2,30.07,66,2,2,0,0,2,1,1,2,2,0,1,0,1,0,0 205 | 0,1,23.2,46,1,1,0,0,2,1,1,3,1,1,2,1,2,0,0 206 | 0,2,20.68,45,2,1,0,0,2,1,1,1,1,1,1,1,2,0,0 207 | 0,2,26.26,76,2,1,0,0,3,0,0,3,3,0,2,0,2,1,0 208 | 0,1,27.47,45,2,2,0,0,2,0,0,3,5,0,2,0,2,0,3 209 | 0,2,29.91,84,2,2,0,0,2,0,0,1,3,0,2,0,1,0,1 210 | 0,1,27.76,45,2,1,0,0,2,0,0,2,4,0,2,0,1,0,1 211 | 0,2,31.18,64,2,2,0,0,3,0,0,2,2,0,2,0,1,0,2 212 | 0,3,32.18,54,2,1,0,0,2,0,0,2,2,0,2,0,3,0,1 213 | 0,3,27.43,73,3,2,0,0,2,2,0,2,6,0,2,0,3,0,1 214 | 0,2,22.15,81,1,3,0,0,1,1,1,3,4,0,2,0,2,0,1 215 | 0,2,22.44,42,2,1,0,0,2,1,1,1,1,1,1,0,1,0,1 216 | 0,2,22.84,38,2,1,0,0,2,1,1,2,2,1,1,1,1,0,1 217 | 0,3,25.77,62,2,1,0,0,2,1,1,2,4,1,2,1,2,0,1 218 | 0,2,31.64,54,2,2,0,0,2,1,1,3,2,1,1,1,1,0,2 219 | 0,2,28.36,62,3,3,0,0,2,1,1,6,10,1,4,0,3,1,2 220 | 0,3,26.37,63,1,2,0,0,1,0,0,2,1,0,1,1,1,1,1 221 | 0,1,23.04,19,1,1,0,0,1,1,1,3,6,1,3,1,2,1,2 222 | 
0,3,26.54,67,2,2,0,0,3,2,2,4,7,2,3,2,1,1,2 223 | 0,2,26.53,55,2,3,0,0,3,1,1,4,4,1,2,1,4,1,4 224 | 0,2,23.37,33,2,1,0,0,3,2,2,5,7,1,4,1,5,1,2 225 | 0,2,23.81,43,2,1,0,0,2,1,1,3,6,1,3,1,3,1,2 226 | 0,3,33.39,69,3,2,0,0,2,2,2,3,3,1,4,2,3,2,2 227 | 1,2,25.9,35,2,3,0,0,2,1,1,4,9,1,4,0,0,0,0 228 | 1,2,16.42,67,1,3,0,0,2,0,0,1,1,0,1,1,2,0,0 229 | 1,2,20.08,66,2,3,0,0,1,0,0,4,4,0,4,0,1,1,0 230 | 1,3,17.01,43,2,2,1,0,2,0,0,6,5,0,4,0,3,0,2 231 | 1,2,27.01,83,2,1,0,0,2,1,1,3,5,1,4,1,3,0,2 232 | 1,2,33.43,55,2,2,0,0,2,0,0,3,3,0,1,0,1,1,1 233 | 1,3,26.08,55,2,2,0,0,3,1,0,4,4,1,4,1,1,1,1 234 | 1,2,21.97,43,2,2,0,0,1,2,1,4,8,1,6,1,6,2,6 235 | 1,1,21.67,44,1,2,0,0,1,2,2,3,3,1,2,2,7,2,3 236 | 1,3,15.6,23,2,3,0,0,2,2,1,4,2,2,2,1,1,1,1 237 | -------------------------------------------------------------------------------- /2024-03/shiny_transcript.txt: -------------------------------------------------------------------------------- 1 | I mentioned previously, Shiny is what we're going to be covering today. Shiny is a package that can be used in R, that is for making interactive web applications. They really useful for being able to share your results afterwards. And they can be made fairly simply, which is quite exciting. So I can't, just for the record, I can't really see the chat. So if anybody has any questions that they've typed, if anybody could show that to me, I'll keep eye to say that and I'll let you know. So you just a square rectangle in the middle of your screen? At the moment. Yeah, that was yeah, it should be gone. Yeah. Okay. So what I'm going to start with just now is just going to show you how you can get the R studio or the posit example of shiny web app, which will set us up nicely for this demo. Right now we've just got the usual R Studio IDE which has currently three panels, but about to be four panels that maybe recognise if you've used R before, if you've not used R before, you might not recognise the set up. But in the corner here. 
You scroll down to Shiny web, it opens up this, we'll call this demo. And then we'll just create this file here. Then that it's created a repository here called Healthy Demo One. In there, it's got this file app. And that's what's opened up here in our top left panel. Just for the sake of the demo, I'm going to quickly click Run App. Just so that you can see how quick and easy it is to create an app using Shiny, which is this package here. And then we'll talk through each of the parts and then we'll create our own. So I'm just going to quickly click Run app. I'm going to assume you can't see that pop up. I'm just going to share my full screen instead of just that. Good s again, there we go. So this is the app, sorry, it'll disappear in a second. This is the app that was just created with those lines of code. It's basically giving us this output of histogram showing the time to up the frequency of the time to eruptions. The interactive part here is that we can choose the number of bins that we want for our histogram. If we want some really granular data, we can scroll it all the way up to 50, and you can see that there's now 50 bars on our histogram, but if we don't really care about that, we can scroll it down. There is pretty cool. It's reactive. As soon as we change it, it updates just like, so cool. Okay, back to the code and how that works. It basically requires three components. It requires the UI, which is the user interface, what the user interacts with, with the app. And then it requires the server, which is if the UI is the front end, the server the back end, it's where all the coding and the work happens, the calculations. And then there's this final bit of code which is combining the UI and the server together to create your app. What we're going to do for this session is we are going to take this out of the box app that Posit has given us and we're going to create something that we want to create. 
Instead what I'm going to do is we're going to use the dataset that we have been using for the past couple of demos. In factor script, I'm just going to copy over the Factoring code that we used in the previous demos. Don't worry if you're a bit unsure about what this is showing. We've got a previous demo about how to work with factors, but basically we're using this data set here, which is a randomised control trial. Make bigger for looking at the outcome. Having throat pain, swallow pain, or a cough after thoratic surgery. And basically the randomised element is some patients got a licorice gargle and then some had a sugar gargle. That's the data that we're going to be using here. This is just quickly turning the variables into factors where we want them to be factors. Great, what I recommend first, before we try and create anything interactive, we're just going to create a static plot that we will eventually turn interactive. But we just want to make sure that we can get our data working. We're going to do a plot. Plots are more fun and pretty simple to do. Obviously, I should show shiny. Can get, there's lots of different examples. They can go from really basic things that we looked at with the eruptions, but then they can also become much more complicated. This example here, if it loads, there you go, can get much more complicated with colours and sizes and things like that. But we're just going to start with a basic plot. What I'm going to try and get out of this app is I'm going to plot the numbers of patients receiving the licorice gargle versus the sugar gargle as their treatment. Hopefully, I'm going to be able to change the outcome that I look at for the fill of the plot. But first I'm going to, by creating a basic plot, I'm going to start with Licodata. Do GG plot create our aesthetics? Our x axis is going to be treat factor, which is whether or not they got the licorice gargle. 
And our fill, I will start with just this outcome here about whether or not they had throat pain at 01:00 A.M. post operation. There we go. And I'll just add bar that, see if that works quickly. Well, I've not run Licodata. Instead I'll actually just click Run App and see if it works. Okay. So that is, this error here is saying that the CSV file that we're using isn't in the directory that we have just created. And that's because it's not, this is the data here that we're trying to use. It's looking for it in this directory which has the app that we're running, but it's not there. So I'm just going to move that to here. It should work. There we go. We can see that plot worked now is no longer connected to our interactive slider, but that's not what we were trying to do just then. We can see that we have created a plot having isn't necessarily that useful. It can be quite tricky when it raises the other levels off of the zero value. I might try and get rid of that quickly. Change the colour scheme because I'm true to riinu, I will make the background white as well. Skill, so this is just go, aesthetics unnecessary but just copy this, I just run again. And it's basically just prompted me that I should save before, that we update it. And there we go. It's gotten rid of the NAs and it's changed the colour scheme. You can change it however you like, but we'll just leave it there just now. Great. The next thing I'm going to do before update it, can you show where you ran the factoring script that you copied over? So that's been pasted at the top of that app file? Yes. So that sits out with the UI server functions. Yes. So good. So it all kind of Yeah, I'm not actually quite sure how that works. Could you explain, just so you can run any other scripts that you want within the file, but not included in the UI or the server portion. And the reason that that's important is it will only run once per session. 
Anything that was in the server block will run multiple times as people refresh the app or change the number of bins and the histogram, et cetera. If you've got something which is quite heavy going in the processing, computationally expensive, if you've got like a big data set that you're processing, it's best to have that running out with the server function which say showed because it will only run once in order to prepare the data rather than running frequently. Thank you. Great. Next we're going to change the UI so that we can then make it interact with the plot that we have just created. This is the UI here, not quite fitting there. This is the slider that we were using on the previous app. It starts, it goes 1-50 The value was the default value that it starts at, But we don't want a slider for the app that we are doing. What we want is some select option. Once again, I'm just going to show you shiny widgets here. These are all the options that you can use for an input. You can have multiple choices. You can type a number. Here's like a select option. The sliders that we saw, Any text here, depending on the app that you're making, change. What kind of input would you want to use for this example? I think we would be best suited to either some select box or radio buttons. Because I'm only going to make a few options, I'm going to use radio buttons. But if you had lots of options that you were selecting from a select box might look nicer to scroll down because you wouldn't want them all to automatically be displayed like this. I'm going to see code here, there we go in the UI, and it's this radio buttons that I would like going back here, I'm going to replace the slider with radio buttons. It can be quite picky with the brackets there just indented to make it look nicer. If I run this again, you'll be able to see that it's changed again. They're not connected, but we've got our radio buttons there, which is closer to what we are going to want next. 
I'm going to change the input to be what I want. Instead of this, I don't want to say choice. I want it to say throat pain, which is going to be from this variable here instead of one. Need it to be quoted. Choice to swallow pain. This first bit is the label, This is what we want it to say. You could be more detailed if you want swallow pain 01:00 A.M. you can have it say whatever you want it to say. That's not actually what swallow pain is, 30 minutes quote that and choice three, what call J. The selected is which one we want to start on. Which there might be an informative choice in the app that you're trying to make but shouldn't really matter. We will just start with this one fun. There we go. I will reload that app so you can see what it looks like, changed. You saw that it started on the swallow pain, but again, they're not connected yet. So it doesn't really matter it started on this title. I don't want anymore, this not being this title here is something else I would like to change here. That's the main title of the panel. I will change to Licorice Data. That other label, the smaller label three is just saying it's like a third header size. And I'll call this variable select once again. You see there was her have changed which looks good to getting close to what we want. In our UI, we have the radio buttons is the one part of the input and then the other bit is going to be our output in the main panel is going to be out plot that we're going to create. This is what we're saying we're going to call, I'm going to call it instead barplot_output. Then now we have to go back to here. And our output that we're creating is going to reflect here what we've said, we're going to call it. There we go. Now we're almost there with creating our interactive plot. But what we need to do is say in the fill, we want to be able to change what the colour is. We want to choose between either throat pain or swallow pain, or having a cough. We don't want this. 
Instead our input, which is what we're getting from the radio buttons, we want it to be input. We've currently called it radio here, we will call it radio and then the same here. That's intuitively what I automatically think I want. But this isn't going to quite work. I'll just run it to show the error. Basically, this is just, it's no longer doing the fill that we want. That's because this here is, it's keeping the quotations on. It's coming in with quotations. That doesn't really work well in the tidyverse in ggplot. We don't want the quotations round these variables here. The way to fix this is using this operator. Hadley Wickham calls it bang bang in reference to the two exclamation, exclamation marks. Then this is basically used for unquoting and turning the text into the symbol that we need it to. It's a bit strange, but it works. I'm putting the same there. Now save it, and we'll give it a go and see if it's works. There we go. So it doesn't actually look that different, but you can see that it is changing each time, which is what we wanted. Just the there we go. You could see how you would be able to implement that in various different ways if you had different plots that you wanted to show or different results. We've done that. I'm going to try now because we've got some spare time adding a different element to your output. We've got the, but what if we want a table underneath as well? Well, what we're going to do there is we are going to add another output in our UI main panel. This time we're going to call it a table output. We're going to call it anything we want to, but we'll call it tableOutput Don't forget that you have to add that, comma in there. Now we just have to create tableOutput. table_lico essentially. We put that in here. Again, we're creating our output, we called it table_lico, this time we're doing Render Table. 
Instead of render, you can see there all the different options that come with shiny if you wanted to just be a text output, plot and image. But we're going to use Render Table. Okay, we're going to use a finalfit table, summary factor list. Okay, so the dependent explanatory. For the explanatory we'll do taked up factor again for the input, we are going to again take the input from our radio buttons. Each time we change it, we change is we want it to be our interactive variable here. However, with final that and summary factualist we actually want the quoted variables. Instead of unquoting them, we're just going to put in the input that we get from our radio ones here without the bang bang function there. Then we're just going to save that and run it. There we go. We've got a table underneath that changes as we click through the variable we have, which I think is pretty cool. And all that was done in not that many lines of code. The actual app itself, starting on line 45 down to 90. In 45 lines of code. We've got an interactive app. Yeah, pretty easy. I think you can see how you could make it more complicated and more complex. There's apps where you could have a model that you're trying to send out to people and you can choose inputs for the model and see what maybe your predicted value is. Things like that can get more complicated at its core. Shiny is always just these three parts is pretty easy to get something functioning and fun and cool. You can customise it whatever way you would like. Yeah. Does anyone have any burning questions? It can be quite a lot and it can be a lot take in, but just playing around is quite a good way to be able to understand what's happening here. This would also work. What's nice about as always, not really a shiny demo, but can use a label here, that's a work. Oh right. Yes. Stupid. That's because it's a factor that needs the FF label. Not, not what I was doing. Sorry. There we go. Now the label is showed up there. Yeah. Cool. Okay. 
If there aren't any questions specifically about shiny, I will pause the recording and we can see if any other questions have come up. You want me to show three examples with the code of how we use it? Yeah, that'd be good. Could take 2 minutes. Is that showing? Okay. Yeah. Just to give you a couple of ideas about how this can be used. This is shiny Viz app and we can post the code for each of these. This automatically takes a dataset, excuse me, from Red Cap. It was last downloaded on the 29th this morning at 04:00 in the morning. Takes that dataset, does some pre processing, and then makes it available through a shiny app. This is Global Cohort Study, which is ongoing at the moment and allows in real time an exploration of the data that's coming in. The data don't matter, but this is country income level split by BMI and what type of surgery the individual was having, et cetera, and how urgent it was. This is really useful both in terms of exploring hypotheses, but also in assessing data validity or missingness, et cetera. And she's added various different things that you can use to alter that. You can take that underlying code and apply it to whatever it to set you want. I'll link to that, but it's in renews called Shiny is. The next one is called Shiny Fit. This is one that I did in the past. Rather than just looking at the variables stratified by different variables and counting them up. This is actually a regression. So that this is the final fit function as shiny app, you can put a dataset into it. You can specify your dependent variable here, mortality for instance. Then you can pick explanatory variables from the dataset, just as anyone who has used the final fit function. So these add into a final fit table with a univarable and a multivariable analysis. You can include model metrics. This is using the various different widgets which Sarah very nicely showed. You can subset the data on a particular available et cetera. 
Then you can have another tab which shows the plot, A tab which shows the summary factor list, and a tab that shows the glimpse of the data there you can share that somebody could do fully formed regression analysis with that. Then finally, the transplant benefits score, one that we did. This takes quite a complicated prediction score and uses various widgets in order to predict the benefit from transplant. And the details don't matter, but you can show that you can combine various different Widgets with an underlying dataset in order to give outputs could be used in order to guide care, I guess if you have MHR approval, et cetera. So I'll add the three Github repositories which describe those three example apps. I'll put it in the chat, but we will also put it beside the recording. Yeah, yeah, those are much more real life examples like you might not necessarily just have a plot that you want to change the variable of, but you can see how well that they could be applied in a real science communication capacity. Yeah, when I was playing around with Shy to decide to what to show for this demo, I tried to create one that was, it got a bit too complicated. But it was doing a similar thing. Not a similar thing at all. But it was like using a model to decide whether or not the person would be predicted to have throat pain. If they were like had severe disease, a high BMI, were quite old, if they're a current smoker, they might have throat pain, but otherwise, yeah. Anyway, yeah, lots of different ways that you could. Yeah, cool. If no one has any shiny questions, I will stop the recording here and I think someone has a clinic question that we can try and help with now. 
-------------------------------------------------------------------------------- /2024-04/DAG_demo.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "DAG demo" 3 | format: html 4 | editor: visual 5 | --- 6 | 7 | ```{r} 8 | # install.packages("ggdag") 9 | library(ggdag) 10 | library(tidyverse) 11 | ``` 12 | 1. Basic DAG 13 | 14 | node_size text_size 15 | 16 | ```{r} 17 | 18 | dagify( 19 | outcome ~ exposure) %>% 20 | ggdag() 21 | 22 | ``` 23 | 2. We are investigating the causal relationship between social media use and poor mental health outcomes 24 | 25 | ```{r} 26 | dagify( 27 | mental_health ~ social_media_use + other_things, 28 | other_things ~ yet_more_things) %>% 29 | ggdag() 30 | ``` 31 | 32 | 2. Add confounders: age and personality type Add mediator: sleep duration 33 | 34 | ```{r} 35 | dagify( 36 | mental_health ~ social_media_use + age + personality + sleep_duration, 37 | sleep_duration ~ social_media_use, 38 | social_media_use ~ age + personality 39 | ) %>% 40 | ggdag() 41 | ``` 42 | 43 | 3. 
Coordinates 44 | 45 | manually set coords time ordered 46 | ```{r} 47 | coords <- list( 48 | x = c(mental_health = 5, social_media_use = 0, age = 2.5, personality = 2.5, sleep_duration = 2.5), 49 | y = c(mental_health = 1, social_media_use = 1, age = 4, personality = 6, sleep_duration = -1) 50 | ) 51 | dagify( 52 | mental_health ~ social_media_use + age + personality + sleep_duration, 53 | sleep_duration ~ social_media_use, 54 | social_media_use ~ age + personality, 55 | 56 | coords = time_ordered_coords() 57 | ) %>% 58 | ggdag() 59 | ``` 60 | 61 | ```{r} 62 | coords <- list( 63 | x = c(mental_health = 5, social_media_use = 0, age = 2.5, personality = 2.5, sleep_duration = 2.5), 64 | y = c(mental_health = 1, social_media_use = 1, age = 4, personality = 6, sleep_duration = -1) 65 | ) 66 | dagify( 67 | mental_health ~ social_media_use + age + personality + sleep_duration, 68 | sleep_duration ~ social_media_use, 69 | social_media_use ~ age + personality, 70 | 71 | coords = coords, 72 | 73 | exposure = "social_media_use", 74 | outcome = "mental_health", 75 | 76 | labels = c(mental_health = "mental health", 77 | social_media_use = "social media use", 78 | age = "age", 79 | personality = "personality", 80 | sleep_duration = "sleep duration") 81 | ) %>% 82 | ggdag_status(use_labels = "label", 83 | text = FALSE) 84 | 85 | ``` 86 | 87 | 4. Add labels and status 88 | 89 | Within dagify set exposure = "", outcome = "", labels = c( ="") 90 | 91 | try ggdag() and ggdag_status() 92 | 93 | 5. 
Add theme 94 | 95 | ```{r} 96 | dagify( 97 | mental_health ~ social_media_use + age + personality + sleep_duration, 98 | sleep_duration ~ social_media_use, 99 | social_media_use ~ age + personality, 100 | 101 | coords = coords, 102 | 103 | exposure = "social_media_use", 104 | outcome = "mental_health", 105 | 106 | labels = c(mental_health = "mental health", 107 | social_media_use = "social media use", 108 | age = "age", 109 | personality = "personality", 110 | sleep_duration = "sleep duration") 111 | ) %>% 112 | ggdag_status(use_labels = "label", 113 | text = FALSE)+ 114 | theme_dag() 115 | ``` 116 | ```{r} 117 | dagify( 118 | mental_health ~ social_media_use + age + personality + sleep_duration, 119 | sleep_duration ~ social_media_use, 120 | social_media_use ~ age + personality, 121 | 122 | coords = coords, 123 | 124 | exposure = "social_media_use", 125 | outcome = "mental_health", 126 | 127 | labels = c(mental_health = "mental health", 128 | social_media_use = "social media use", 129 | age = "age", 130 | personality = "personality", 131 | sleep_duration = "sleep duration") 132 | ) %>% 133 | ggdag_adjustment_set() 134 | ``` 135 | 136 | 137 | 6. Dag Usability 138 | 139 | open paths (adjust_for) minimal adjustment set 140 | 141 | ```{r} 142 | dagify( 143 | poor_mental_health ~ social_media + age + personality + sleep, 144 | social_media ~ age + personality, 145 | sleep ~ social_media, 146 | exposure = "social_media", 147 | outcome = "poor_mental_health", 148 | coords = coords, 149 | labels = c( 150 | poor_mental_health = "Poor Mental Health", 151 | social_media = "Social Media Usage", 152 | age = "Age (years)", 153 | personality = "Personality Type", 154 | sleep = "Sleep (hours per night)")) %>% 155 | as_tidy_dagitty() %>% 156 | ggplot(aes(x, y, xend = xend, yend = yend)) + 157 | geom_dag_edges() + 158 | geom_dag_point()+ 159 | #geom_dag_text()+ 160 | geom_text(aes(label=label, x=x, y=y), color = "blue") 161 | 162 | ``` 163 | 164 | 7. 
Additional customisations in ggplot 165 | 166 | e.g geom_text themes colour palettes 167 | 168 | ```{r} 169 | 170 | dagify( 171 | poor_mental_health ~ social_media + age + personality + sleep, 172 | social_media ~ age + personality, 173 | sleep ~ social_media, 174 | exposure = "social_media", 175 | outcome = "poor_mental_health", 176 | coords = coords, 177 | labels = c( 178 | poor_mental_health = "Poor Mental Health", 179 | social_media = "Social Media Usage", 180 | age = "Age (years)", 181 | personality = "Personality Type", 182 | sleep = "Sleep (hours per night)")) %>% 183 | as_tidy_dagitty() %>% 184 | 185 | ggplot(aes(x, y, xend = xend, yend = yend)) + 186 | geom_dag_edges() + 187 | geom_dag_point()+ 188 | geom_dag_text()+ 189 | 190 | 191 | 192 | ``` 193 | 194 | ### ANSWERS 195 | 196 | basic - 197 | 198 | ```{r} 199 | set.seed(1) 200 | dagify( 201 | poor_mental_health ~ social_media + age + personality + sleep, 202 | social_media ~ age + personality, 203 | sleep ~ social_media, 204 | exposure = "social_media", 205 | outcome = "poor_mental_health") %>% 206 | ggdag(node_size = 20, 207 | text_size = 3) 208 | ``` 209 | 210 | add coordinates - 211 | 212 | ```{r} 213 | coords <- list( 214 | x = c(social_media = 1, poor_mental_health = 10, age = 3, personality = 5, sleep = 7), 215 | y = c(social_media = 5, poor_mental_health = 5, age = 1, personality = 1, sleep = 7) 216 | ) 217 | ``` 218 | 219 | ```{r} 220 | basic_dag = dagify( 221 | poor_mental_health ~ social_media + age + personality + sleep, 222 | social_media ~ age + personality, 223 | sleep ~ social_media, 224 | exposure = "social_media", 225 | outcome = "poor_mental_health", 226 | coords = coords) 227 | 228 | basic_dag %>% 229 | ggdag() 230 | 231 | 232 | basic_dag = dagify( 233 | poor_mental_health ~ social_media + age + personality + sleep, 234 | social_media ~ age + personality, 235 | sleep ~ social_media, 236 | exposure = "social_media", 237 | outcome = "poor_mental_health", 238 | coords = 
time_ordered_coords()) 239 | 240 | basic_dag %>% 241 | ggdag() 242 | ``` 243 | 244 | add labels and status - 245 | 246 | ```{r} 247 | 248 | basic_dag = dagify( 249 | poor_mental_health ~ social_media + age + personality + sleep, 250 | social_media ~ age + personality, 251 | sleep ~ social_media, 252 | exposure = "social_media", 253 | outcome = "poor_mental_health", 254 | coords = coords, 255 | labels = c( 256 | poor_mental_health = "Poor Mental Health", 257 | social_media = "Social Media Usage", 258 | age = "Age (years)", 259 | personality = "Personality Type", 260 | sleep = "Sleep (hours per night)")) 261 | 262 | basic_dag %>% 263 | ggdag(use_labels = "label", text = F) 264 | 265 | basic_dag %>% 266 | ggdag_status(use_labels = "label", text = F) 267 | ``` 268 | 269 | add theme - 270 | 271 | ```{r} 272 | basic_dag %>% 273 | ggdag(use_labels = "label", text = F) + 274 | theme_dag() 275 | 276 | 277 | basic_dag %>% 278 | ggdag(use_labels = "label", text = F) + 279 | theme_dag_grid() 280 | 281 | basic_dag %>% 282 | ggdag(use_labels = "label", text = F) + 283 | theme_classic() 284 | ``` 285 | 286 | DAG features 287 | 288 | ```{r} 289 | # what are the open pathways 290 | basic_dag %>% 291 | ggdag_paths() 292 | 293 | basic_dag %>% 294 | ggdag_paths(adjust_for = "age") 295 | 296 | 297 | 298 | # what adjustment set do I need to use? 
299 | basic_dag %>% 300 | ggdag_adjustment_set(shadow = TRUE) 301 | 302 | basic_dag %>% 303 | dag_adjustment_sets() 304 | 305 | 306 | basic_dag %>% 307 | ggdag_parents("poor_mental_health") 308 | 309 | basic_dag %>% 310 | ggdag_dseparated() 311 | ``` 312 | 313 | Additional customization 314 | 315 | ```{r} 316 | dagify( 317 | poor_mental_health ~ social_media + age + personality + sleep, 318 | social_media ~ age + personality, 319 | sleep ~ social_media, 320 | exposure = "social_media", 321 | outcome = "poor_mental_health", 322 | coords = coords, 323 | labels = c( 324 | poor_mental_health = "Poor Mental Health", 325 | social_media = "Social Media Usage", 326 | age = "Age (years)", 327 | personality = "Personality Type", 328 | sleep = "Sleep (hours per night)")) %>% 329 | as_tidy_dagitty() %>% 330 | 331 | 332 | mutate(extra = case_when(name == "age" ~ "Confounder", 333 | name == "personality" ~ "Confounder", 334 | name == "poor_mental_health" ~ "Outcome", 335 | name == "sleep" ~ "Mediator", 336 | name == "social_media" ~ "Exposure")) %>% 337 | 338 | ggplot(aes(x, y, xend = xend, yend = yend)) + 339 | geom_dag_edges() + 340 | geom_dag_point(size = 20, aes(colour = extra))+ 341 | # geom_dag_text(size = 2.4)+ 342 | theme_void()+ 343 | scale_color_brewer(palette = "Set3")+ 344 | # theme(legend.position = "none")+ 345 | geom_text(aes(label = label, x =x, y = y ), size = 3) 346 | ``` 347 | 348 | edit colours in ggplot: https://stackoverflow.com/questions/71441472/change-color-for-specific-nodes-in-ggdag 349 | -------------------------------------------------------------------------------- /2024-04/dag_transcript.txt: -------------------------------------------------------------------------------- 1 | There we go. I think that's recording. Hi, everyone. Good morning. Welcome to another healthy live demo and clinic. Today, we're going to be talking about dags and using a package called ggdag, which works quite nicely. 
There is, for making dags in general, a really useful web application called dagitty which you can also code within R. But today, we're going to be talking about ggdag because it follows quite nicely with the kind of ggplot, tidyverse type coding style. Today we're going to have Neil who's sharing his screen and he's going to be doing the live coding for us. Neil, if you could open the document Dag underscored demo up at the top, it's a quarto document, a few more up yet that one there. There we go. That's perfect. Now for anyone who is new to R, we've just got our posit setup, which is a cloud based version of our studio, which has our four screens that we like our top left, is our script. We're using a quart document, which is where we're going to be typing our code. Bottom left is the console, top right is in the environment, where data will get stored, any kind of functions we make, those sorts of things and then our bottom right is what we can do a bunch of things, but it's where our files. We can do our files, and also where we'll get our help tab and thing. We're going to be focusing on the top left panel, which is our quarto document, which is similar to our Markdown, for anyone that used it before. But yes. We're going to be talking about dags. So in the top chunk, there should be Can you toggle between source and visual quickly because I feel like it's not showing the package. Yeah, there we are. The two packages that we're going to be using today is ggdag and also briefly the tidyverse. So that's why we've installed. That's why we're calling library ggdag and tidy verse there. We have those packages ready. So if you could just run that chunk for me, Neil. That's perfect. And so I'm just going to start by taking you through how you make a basic dag using GG dag. So we start off by using this function called dagify, which is basically what is used to make your dag. 
And then it follows the syntax of specifying your outcome, and then you do a little tilde sign to your exposure. And that is basically going to create a dag that has the thing you're pointing at. And then the thing that's doing the pointing is in that kind of setup. If you run that chunk and then we put it into ggdag, which is what creates the dag. If you run this chunk and then we'll see what it creates. Oh. There. So you can see that it's created a really, really basic dag, but ultimately that's what a dag is. It's trying to identify a causal pathway between an exposure and outcome. It can be thought of as a bit like a brain experiment of what you think your causal pathway will look like. It's good to do this before you kind jump into any kind of analysis so that you can identify factors that you think will be important in your in your experiment. It's good to think about these kind of things with domain experts as well so that you can kind of get a really full picture. But yeah, that's a bit too much theory behind dags. Let's talk about ggdag a bit more. Sarah can I ask one quick question about that. Yes. I noticed that this doesn't need. So it knows to treat this as a text, which is the label for the node of the dag. Yes. So this isn't linked to any kind of data right now. So it's just those bits there are you specify and it knows that you're talking about these kind of y, the nodes and the text on top of the nodes is what you put. So it's important to note that you can't have bases in those names. Right now, it has to be no spaces. So if you wanted to do a longer word, you'd have to do things like underscore underneath it, but later on I will show how you can add a label so that it can be more of like a easily readable human readable type label as opposed to just your no space label that you'd have to have here. So one thing to note is, if you run that again, that chunk there, It's different. Every time. That's because of the randomness. 
We could overcome that by using set seed so that every time that R generates this plot, it uses the same random numbers that it's using to generate that. But later on, I'm going to show you how you can actually just manually set the coordinates, which is sometimes a little bit more useful when you're doing something like a dag because sometimes the It can get quite messy quite easily, so it's good to put things where you want them to put them. But it's just to note that every time you run that, it'll be different unless you set the seed or set the coordinates. Okay, so that was just a bit about the syntax of how to use ggdag. But now I'm going to put it into a bit more of a health care context because a lot of us might be doing kind of research in kind of health care sector. So I'm going to be pitching to you that we're going to be do we want to look at the causal pathway between something like social media use and a poor mental health outcome. So that's our experiment that we're going to be doing. So could you copy the code from above and then change outcome and exposure to just a term like social media use and for mental health outcomes. Remembering that these are just short labels. We can't use spaces and things. Yeah, that looks good. Yeah. Perfect. If you run that, see what it looks like. Great. Now you can see exposure and outcome have been replaced with social media and mental health. Here we're saying, we think with increased social media or we might hypothesise that with increased social media use, this can cause poor mental health. And But we know that this isn't going to be the only thing on that pathway. There's lots of other variables, be that confounders and mediators that we're going to want to take into account and that we're going to want to have on our dag so that we can put that in our study, and we can say, look at all these things that we're actually going to be taking into account. 
So to do that, what you need to do is So following the syntax of the thing that's getting pointed at to the things that are doing the pointing. You can similar to using glm in R, you can use the plus sign and add on in that formula syntax, all of the other things that are going to be pointing at the exposure. Yeah, except no space. Yeah. And then if you want to be having things pointing other things, you would do a comma and then go on the line below and then do that again. Yes, exactly. More things. So if you scroll down slightly, I've created a space, and I'm going to pitch to you that I think we should add. This is something that could be up for debate. There's lots of different confounders and mediators and colliders, et cetera that you might want to take into account in your dag. But to keep it relatively simple for this demonstration, what I'm going to get you to add is, I think age Is an important confounder, and also personality type is an important compounder. And these are things that maybe you want to measure for a dataset if you're about to go out and actively collect, or maybe you have a dataset and you already have these variables. So these are the things that I want you to add as a cono, a compounder being something that impacts both the exposure and outcome. And then I also want you to add sleep duration, except I am thinking that this as opposed to a confounder, might be a mediator in that I think Increased social media use could lead to shorter sleep duration and shorter sleep duration can ultimately lead to poorer mental health. So we would want the arrows to be pointing differently to a confounder. So after this, I have mental health is the sort of outcome. And on this side, I want to put all the things that are pointing to that outcome. Yes, so we want to have social media, age, personality and sleep or sleep, whatever we would call it. 
Then we want to do a comma and go in the line below because we also want to have sleep duration that points at that's being pointed at by social media use Is a confounder is I'm trying I'm trying to imagine what this is going to look like. Okay? Yes. That is one thing about using ggdag. Sometimes it's nice to maybe have it drawn out in paper or maybe even dagitty, which is the online web app I was talking about at the start. And So right now we're going to have social media use, age, personality, and sleep duration pointing at mental health. But we also want to have social media pointing at sleep duration to make it that mediator pathway. But what we also need now is age and personality to point at social media use because we think they're confounders, and they'll be pointing at both the mental health and social media. Exactly. Like that. Yeah. Let's run it and give it a go. Yes, so a bit of a funny shape, but this is what we want. If you can see that you've got social media use, pointing at mental health. We've got personality and age, pointing at both as confounders, and then we've got that mediating pathway of social media, pointing at sleep duration. Increase social media use, causing sleep, sleep, causing poor mental health outcomes. This is the dag that we've put in, the bag that we're looking for, maybe not quite the layout of the dag. It's a bit tricky to see. If you scroll down again, what we're going to do now is we're going to set the coordinates. If you copy that chunk and paste it in here. So if you hit F one on again you'll see all the things that we put into it. And if you scroll down slightly, you'll be able to see an example of setting coordinates, there. You can manually set your coordinates there. So you would set a list of the coordinates, and then you would within dagify add that coord or label. So if we copy over the coords list and then we can manually change it to how we want it to be. I do it inside the same chunks, like this? 
Yeah, you can do. Yes. This is basically setting the x and y coordinates. Exactly, you want to change the x and y coordinates for each of our variables. Then we can have a think about where we want them to be. Okay. This is just like a relative coordinate. We might want. You can make up wherever you want it to be. But for example, you might want social media used to sit at 0,1 on the y axis one. Then you would want mental health to maybe be 10,1 or two or 5,1. Then you might want age to be kind of above. So if you do that and set it or something on that yeah, exactly. Personality, similarly somewhere like three and maybe If I put minus one, that'll go below exposed. I'll just change the axis on the y axis and make it lower. The confounders above and the mediator below like that. That's a good idea. But you can really customise whatever way you want it to look. But, just add coords equals. And then run that whole chunk and see what it looks like, brilliant. That looks much easier to read, I think, and it's basically just gone exactly where you specified on the x and y. One cool thing that I learned while doing a bit of research before this topic is I think it's quite useful to specify exactly where you want it to go. But if you start getting a really complicated dag, the important thing to remember about dags is It's time sensitive in that your exposure always has to come before your outcome just for that causality to happen. So there's always a time order to your dag. So instead of manually setting the coords, you can also use a function called time_ordered_coords So if you try putting that in there, where you've got coords equal, just to show people what it looks like. Instead of coords if you take it away and time underscore, and then there is ordered coords. So if you hit that, and then just run this chunk of code now. 
Instead of manually setting, it's basically just It's taken into account when if you have an arrow pointing at something, it has to come first as such. So it's given a time ordered to it, which I think can be quite helpful if you've got lots of different variables. It can neatly order it for you quite quickly. But manually setting is a simpler way to get things exactly where you want them to be. Right. So now we have our positions for our nodes. If you copy and paste that chunk and then maybe turn it back into the coordinates that you set, we'll customise a bit further. Okay. Brilliant. So what we can do now is so you'll see if you scroll up in the help tab that we have slightly. You'll see that we can set exposure and outcome. So that's what we're going to do now so that we can tell our dag which one is our exposure and which one is our outcome? Because right now, it's all just see of the same. So within dagify again, if you add exposure and outcome, So our exposure is the social media use. I think it has to be quoted this time, though. Brilliant. Then if you run that, nothing has changed because we have not told ggdag to really it now knows what the exposure and the outcome is, but we've not asked it to plot it in a different way. For that, you would want to use the function ggdag_status() instead of ggdag() If you add that in there. Sorry Sarah. I missed that. What was it called? ggdag_status() There. So now you can see it's coloured the exposure and the outcome for us. The next thing I don't really like about this dag is that we can't have any spaces in our labels. Also, the labels are too big for the nodes. So it's a bit hard to read. You can't really see social media use at all and things like that. What we're going to do is we're going to set our labels within dagify again in the help tab you'll see that there's the labels there that we can set. We're going to do that. 
This, we're just going to use the combined function and basically then set the name that we've given our node and then equals, and then we're going to quote exactly what we want it to be instead. We can use spaces in whatever we want within the label. So. Yeah, that was good. And then, yep, we'll do that for the rest of them. So social media use. Perfect. Now within the GG Dag status function, we have to say, gosh, I think it's labels equals, gosh, I try running that. No, I didn't work. Labels equal label. That didn't work either. Try going on the help tab of ggdag status Okay, so it's use underscore labels, sorry. No, I just labels. Use labels equals true. I think it's labels equal and I think it's text in quotes. Okay. Right. Try use labels and encodes labels. Fifth times the charm. Label. There we go. That's it. That's just given us a little label on top of each. Now if we say text equals false, it'll get rid of the text on top of the node. There we are. That's looking a little bit neater, no. Scroll down, see what the next. Yes. If you copy and paste down that chunk of code into that empty chunk. X and Y axis, we've set in order to give the positions of our nodes. But we don't actually need to see the X and Y axis on our plot because they don't mean anything. So if you add theme underscore Dag, not as a not within ggdag status, but if you add, do a plus sign, not a pipe, a plus sign like ggplot Yeah, there we are. And then do theme underscore Dag and then run that. It just gets rid of the gets rid of the background, and it just gives us a blank slate, which I think probably looks better, easier to read, and the coordinates didn't really mean anything anyway, so it was nice to get rid of those themed dag is just like gives you kind of like out of the box dag settings from the package ggdag, which I think looks quite good. So there we have a dag. We can see our causal pathway that we're trying to investigate is social media use towards mental health. 
We've got our confounders at the top, which are impacting both social media use and mental health, and we've got our sleep duration as a mediator at the bottom. But now I'm just going to talk through some of the extra functionality that ggdag as a package has just to help you prepare for your analysis or building any models afterwards. Now that you've got your shiny fancy dag. So yeah, if you copy and paste that into the chunk below, Neil. if you get rid of the ggdag status and the dag and just leave us with. This is our dag that's not been plotted currently. And if you instead pipe that into gg underscore open paths, or maybe its just gg_paths(), actually, sorry, not open paths. And then run that. Yeah. So now, ggdag has identified the four open paths that we have between our exposure and our outcome. So it helps identify the things that you might have to take into account if you're trying to assess the true impact that social media use has on mental health because right now, we can see that there is an impact that age and personality and sleep duration will have. In order to identify the adjustment set that you might want to use in your model, ggdag also has a function called handle name ggdag_adjustment_set, I think. So if you want to use that one, give that one to go, we'll see what it shows us. Yeah, there. And so this is basically showing us that in order for us to assess the impact that social media use has on mental health, we're going to have to adjust for personality and age. And that's our minimum adjustment set that we're going to have to do. It's telling us not to adjust for sleep, and that is correct because we've suggested that we think sleep is a mediator. We think that social media use impacts sleep, which impacts mental health. And that mediating pathway, if we adjust for sleep duration, it will impact the suggested impact that social media use has on mental health. 
And we want to be able to get that kind of, like, true estimate, I guess, so you wouldn't adjust for a mediator in this case. Yeah. I think that's a pretty neat feature of ggdag. It can be quite helpful, particularly if you have a more of a complex dag, it can help identify the variables that you need to adjust for. If you have a pathway that has multiple variables in it to block that pathway, you wouldn't necessarily have to adjust for everything in it. So this can be a useful function for identifying what needs to be adjusted for and what doesn't in order for you to estimate that causal pathway. Yeah. So those were kind of the most useful features that I found within ggdag. No worries Rose. Thank you for coming. There is a bit of kind of like additional customizations that I've planned for, but time is running on a bit. So if anyone has any clinic questions, it might be worth stopping that here and jumping into that instead So I'll pause the recording now, and then we can have a chat about that. And then we can maybe come back to this if we need to. But yeah, that's ggdag, everybody. I hope you enjoyed that. 
-------------------------------------------------------------------------------- /2024-04/gt_tables.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "gt tables in Quarto dashboards" 3 | format: html 4 | editor: visual 5 | execute: 6 | echo: false 7 | warning: false 8 | --- 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(gt) 13 | library(emoji) 14 | eurovision <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-05-17/eurovision.csv") %>% 15 | filter(section == "final") 16 | # select(year, country = artist_country, rank) 17 | ``` 18 | 19 | ```{r} 20 | emoji_list = emojis # emoji lookup table from the emoji package (names + keywords) 21 | 22 | eurovision %>% 23 | mutate(winner = if_else(rank == 1, emoji("check"), emoji("cross_mark"))) %>% 24 | gt() %>% 25 | data_color(columns = rank, 26 | rows = rank < 4, 27 | palette = c("gold", "#C0C0C0", "#CD7F32")) %>% # gold, silver, bronze 28 | opt_interactive( 29 | use_pagination = TRUE, 30 | use_pagination_info = TRUE, 31 | use_sorting = TRUE, 32 | use_search = TRUE, 33 | use_filters = TRUE, 34 | use_resizers = TRUE, 35 | use_highlight = TRUE, 36 | use_compact_mode = FALSE, 37 | use_text_wrapping = TRUE, 38 | use_page_size_select = FALSE, 39 | page_size_default = 20, 40 | page_size_values = c(10, 25, 50, 100), 41 | pagination_type = c("numbers", "jump", "simple") 42 | ) 43 | ``` 44 | -------------------------------------------------------------------------------- /2024-04/gt_transcript.txt: 1 | Hi, everyone. Welcome to another healthy demo. Today, we'll be showing you how to put interactive GT tables into a quarto dashboard. We've done a few sessions on quarto before. We will recap one or two things about quarto. But if you haven't used quarto before and you missed the quarto live demos, they are available on our website. Two quarter demos even. GT is a package for making tables. GT stands for grammar of tables.
The syntax looks a little bit similar to ggplot, which is the grammar of graphics. This is the grammar of tables, In this demo, we're only going to do an interactive table. An interactive table means that it only works in an HTML document, it could be shared as a website or it could be viewed online. GT is also very good with PDFs and word documents, but this is not covered in this demo. Stella has opened up the posit Cloud demo that I've set up. We be going back to a dataset we used, I think maybe even two years ago, the Eurovision dataset. Stella, could you please just run the first code chunk that I've set up there. We can close all of those warnings and messages. Click on the dataset please. We'll have a quick look and remind ourselves what we're working with. The original the dataset comes with 20 something columns, I've already picked out the four columns we'll be working with today just to make that example tidy. We have a year, a country and I've filtered it for just the final dataset actually comes with semi final results as well, and there's a rank. Do you want to click on the word rank, just to sort it, and we can see that the winner column, which technically is is not necessary because it just shows you whether rank is one, but we have now convinced that of the data that seems to work. If you click on rank again, We can see that no one like whose rank isn't one isn't the winner. Make sense. Please go back to the GT document. Next, could you remove the winner in the select statement. Just take winner out because, like I said, it is covered in rank. Thank you. Could you know please render the document because it is a GT document we're working in Let's have a look at the GT output. And scroll down. Yeah, this is reading. Normally when you're making a dashboard, you definitely don't want the package start up messages, showing up on a dashboard and you usually want to hide code as well. 
I've now been using quarto for over a year and, I still can't remember how to do this. I always Google it. Stella, could you please open a new tab and Google quarto output options. Yeah, execution options. That's perfect. Yeah. Copy the echo false on the two lines between the title and Jupyter. Echo means show R code so Echo is going to take that away and go back to the execution options file now, and we also want to hide the warnings and messages. Look at the table, scroll down just a little bit. Warning. I think we also want to do warning: false? If you now go back to the quarto document and, exactly warning, semicolon false. Now render the document again. You can press you can press the render button, or you can do control shift, K. K stands for knit, but it's the same shortcut for rendering. Can you open the Is that. Perfect. That's worked. You can see it's hid all the code and it's hid all the warnings and messages, and there's nothing else because we haven't put in a table yet. Let's put in a GT table. I would do a new. Do a new code chunk, please. You can insert it with that button exactly and eurovision piped into gt. That's it. Render. What was the shortcut again? Command K. Yeah. Can use it. We have a GT table. Now this is not much different to a kable. A kable function is what we maybe used previously. However, GT has several benefits. Namely, it's very easy to make it interactive. Now go back. At the end of GT, do another pipe. The slight difference between, I said gt is similar to ggplot, which is grammar of graphics. After you start the ggplot, you have to use the plus to add things on. Because gt is a much newer package. ggplot was made before the pipe existed in R, but because gt is a newer package, it uses the pipe instead of the plus. Stella, can you please now type in Opt - OPT underscore, interactive. Yeah, that's perfect. Render again. Let's see what that does.
You can see it now put it into an HTML table where you can instead of just printing all 900 rows, by default, it gives you ten row pages. It also has a sorting ability. If you now clicked on year or country or rank, it would get alphabetically or numerically. You've already got a slightly interactive table in a dashboard, a very basic dashboard, with no other elements. Let's now make this table a lot more exciting. To do that, can you save the document? I assume that the error there - what that error coming? What does it say when you click on that? No, no, nothing. I look. Just a space. Funny. Can you move your cursor on to options Interactive, please? And try and figure out how to do F one function F one. That's different in the keyboard. Perfect. That it. What happens when you manage to push F one or function one or something like that in code the Help tab comes up. The Option interactive function has a lot of useful arguments. Could you just keep scrolling down, please? Yeah, I actually usage. What I do is I copy everything. I leave the data in active out because I always keep it active. Copy everything from use page nation to page type, and put them inside your opt interactive. And perfect. Thanks for that indentation. I render the document again. I think this has this hasn't changed any of the default. Now that we know what the arguments are called and we copied them in with the default values, we can start changing them. For example, we are using page nation. However, page size default, I usually make it more than ten, so for example, change that to 20. Sorting sorting means when you click on the column, it orders it alphabetically on numerically, that's switched on by default. However, search and filters are both useful. I almost, switch on search and filters, please. Yeah. Now render. Let's see what we've done. Very efficient. You kept the E from false, so you only had to write in TRU. Gotta make those small savings. Add up. We changed a few things. 
We made the page made it to display 20. Obviously, you can make it whatever you want. Search bar at the top, sees from all columns. If you type in front, We only have France in a single column, but if it was a word that you were searching for that appeared in multiple or maybe delete France and search for number four, we can see that it searches from all columns. Filter means that it's searching from a spic delete the four now, so we can get rid of that. Filter means that you can search for a specific column. If you want to see ranks four on or year 2004 only or something like that. That's basically made your Table more interactive. Other two options that I sometimes use if you jump back to option is both following to the resizes and highlights. So I switch both of those on as well. In. If you now hover over your table, you can see that it highlights the row you're looking a small things also the resize means that you can manually resize column width now. It doesn't make sense in this example because they're very small, but if you have columns with longer text and stuff, you could expand them. That's what we've done here. This is how I do the exciting part is still coming. This is the basic setup of an interactive TT table is that could use sell, please just highlight what we've done. We pipe the dataset, which in this case is urovision. H. To the TT function, which gives you a static table. Then we pipe it into opt interactive, which either have to remember that's called opt interactive or you can Google interactive TT table. Then once I have opt interactive, I jump into the Help tab to copy in these extra arguments that are convenient to be changed. Most of I'll admit that haven't used every single one, but they are quite self explanatory, but there's also further explanations in the Help tab, if you wanted to. Know what's going on. But the exciting is colours and emerges, just like let's really make the table pop. 
What we now want to do is indicate whether someone is a winner with an emoji. We're going to add a new column. Where you do Eurovision pipe into GT, between that, add a mutate, please. Mutate winner equals if_else(rank == 1, Now do emoji() Yeah bracket. Quote quote. And my mind just went blank. What is the check mark? Just try tick. Let's write that. The next after this is true. If rank is one, you get a tick and now go to the side and do emoji cross. If those don't work, we'll keep thinking about it. Pipe that in there and just render. Couldn't find the emoji tick. Emojis are a bit of a pain to work with because they come in so many different databases. I can never remember which database includes them with which names. But to figure out what emojis are available to us. Go to add a new line before the eurovision in that same chunk. Type in emoji list. Equals. The second thing in that list, select the second thing. Delete the bracket to surround that line alone. Now click on that list. These are the 4,700 different emojis we have to choose from. If you put tick into the search, see what we have. The green tick mark is what I was looking for. Try correct, type in correct. What's it called? Typing green? I do have a note of this, but I wanted to show you my workings and I have honestly forgotten what I what it was called. Check Check. Yeah, maybe it's check. Yes, check mark button. If you now scroll to the right where it says keywords. Keywords. We have to We have to use any one of these keywords. Check was the one, not tick. If you could type in check instead of tick. Go. I'll show you how to when you render it, it can take a little while. So to test it let's look at the. Let's find that fine let's look at great, great. If you look for try and sort it based on winner, you can see that everyone has a one has a green check.
The green check, I should really remember this because I usually use the green and red checks when I'm doing a study overview, has had site to collaborate a completed data entry check check check and I might have different checks based on different parts of data entry. I do that Is green check. Red cross quite a lot. That's why I wanted to show that in the Eurovision example. If you now jump back to the thinking. Because rendering it can take a little while, especially once you have multiple tables and plots and stuff, you can execute this interactively. If you just now do control dent or command just run it like that, you will see, why is that a church now? Cross, oh yeah, it is slightly random. Sometimes cross gets interpreted as. Go back to the emoji list and put in cross. This has happened to me before. We need to find the keyword that exists in the cross, but doesn't exist in any of the other cons in the columns. What is it? Maybe cancel. No, cross Mark button. No. I cancel a different one. Which one was it? No, yeah. Cancel is the third one. Yeah, and that's not in the church. I try that. Random like that. You can also make this. This is working pretty fast. But if you had a bigger table and I'm testing some basic functionality out, I would add in a. Can you pipe eurovision into a slice and do slice one colon ten. That way, we're only doing ten lines run that. Greats, but now it's giving us another random cross. How do we get the red cross? Jump back into the emojilist Can we figure out how to make it consistently give us a red cross? This one. I wonder if cross Mark will work or will it try cross Mark. Until the next time it changes. It's funny because I used the red cross a lot. I had seen the church thing, but anyway. I don't want to open my cheat of the results yet because I think it's good to show how I actually go about finding the right out of the right emoji out of the 4700. 
As you said, now that I put in the slice, it just takes the first ten rows of the data. In this case, the example is fast anyway. Sarah, Yeah, that's a good question. Does it always look in the keyword column or could you type in the full name? Could you try, try the full name. Where's the full name? The full names you need? Name cross Mark. Let's try Cross Mark. Yeah, I think it is the keyboard. I wonder if there's a different function in the same package. Away, let's move on. Let's move on. We got it to work. I know we've gone working, if a. The final thing I want to show you. I was just saying this example is quite quick, but the reason I wanted to reduce it to ten lines just for a while we're building the final example of colouring in certain cells is that that can be at just a tiny bit lower. Now we're ready, we have a fast thing to add in. What we're now going to add in is data colour. Where it says, sell, please pipe that into data underscore colour. Yeah. Some of you may have realised that ggplot accepts both British and American spelling because the author of ggplot Hadley Wickham is from New Zealand. However, GT only accepts American spelling. But used the auto completely cut the right thing, but I tend to write the colour with the U and then again any way. Run that, see what happens. Just execute that. Ugly looking thing. You don't ever want to colour the whole table. Do you I don't even understand why that row is black and stuff. Inside data colour, adding an argument that says column equals rank. Let's only colour the rank column. Yeah like that. I think it cool. Similarly, firstly, we can give it a better palette. If you just type in palette, equals varieties with an I need be the name of the palette needs to be quoted. Try that. Try ordering by rank so we can see how the colours, click on that. We can also add in reverse equal if you want it the other way around. Mm hm. I think it looks the same because I had just ordered it by that. 
No, no, I was loading. That's what I mean once we get into colouring every single row, the example will start rendering much lower than any of the previous things we're doing. If you now order by rank, you can see that the yellow ones have gone to the top ranks and that's the reverse equals true. However, even I find that I don't normally colour every single cell. I tend to colour outliers or special node. Let's now colour the win top three instead. How to colour the top three is off the column, adding a new line that says row row equals, rank is greater, sorry rank smaller than less than four. Comma. Let's try that. Yeah. We might need to expand our example. Could you do the add 20 rows into the slice. Hopefully, we'll get more values in there and by rank. Now we call it two and three. Now the palette no longer makes sense. Let's change it to gold silver and bronze. Yeah and do I combine function. C. You can see that RS automatically recognise cold silver and bronze. You can see that it doesn't recognise bronze. The way to tackle that is Google Bronze Hex code. Yeah, exactly. Now copy that take including the hash. A Hex code as a universal way to indicate a colour and you can see Rs already recognise that hex code. Same for silver, please Google silver Hex code. Hey, run that. Because we don't have any number one. I set I think it sets the scale 2-3, and also we need to take out the slice now at least we have working code and I'm confident that we will get three numbers we'll run it like that. Good. Click on rank. Obviously, Bronze is number one, no, we need to take out the reverse. Thanks and Mary. The reverse made sense for the previous palette we were using. Yeah, great. And you now maybe render the whole, the checks are no longer blue green, they're black. Check marks, honestly. Render the whole document, let's see what we have. Yeah, rank. You can see that our golden bars match up with green checks. Yes. That's everything I wanted to show. 
Does anyone have any questions or suggestions for edits Stella can try and live code? Obviously, this is just an example, but I would normally do row. I would indicate outliers, or you can also do missing values. You could do is na, indicate missing values or something like that. Finally, while you're all thinking about your questions, gt is extremely flexible. You can change the font. You can change the data colour change the highlighting like the cell colour, but you could keep the cell white and change make the number red or name red. You can colour one column based on another column. It doesn't have to be one value, you can colour one column based on its multiple surrounding columns. Yeah. Lots of things. Does anyone have any questions suggestions? Who here has used GT before? Anyone has anyone used GT before? No. Can I ask a quick question. You've been able to filter by rows, if you want all the ones to come up or if you were looking for just the year 2004. But if you had 30 columns, is there a way that you could I know you don't in this example, but could you filter just a column you're looking for, what does it look like when there's too many columns to fit the page? I guesses. Good question. You can't filter for select columns. But St, let's try this. Could you go up and remove the select statement? You know where we read in the dataset, just comment that out and the pipe as we. Perfect. Run that. Now render the whole document again and we'll get we'll see what happens when we have 18 columns. What did it happen? Does it just display like the - oh! You can scroll right. That's pretty cool, isn't it? Yeah. No, I don't think you can select columns like that, but it does look quite clever, you don't have to worry about. In this example, the resizing of the columns might be useful as well so you can make the URL, like that. That's good. Can you freeze if you have loads of rows, but you want to see what the column name was. Is there a way that you can freeze it? 
I don't know let's google that. That's a good question. GT table Interactive. Do you have freeze header or I think this is how it looks like. What about the first question? I know it sounded slightly different, but it also might not. This isn't the exact same question either. No easily you don't have any other questions. In that case, I'm going to finish the demo. Thanks very much for coming. After we do a demo, there's also the clinic part, anyone can ask any question about anything, not related to what we've just shown, but we don't recall that. The next demo, I believe is 3 weeks time. It's directed acyclic graphs. -------------------------------------------------------------------------------- /2024-05/fuzzy_match.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Fuzzy Match" 3 | format: html 4 | editor: visual 5 | --- 6 | ```{r} 7 | library(stringdist) 8 | library(fuzzyjoin) 9 | library(tidyverse) 10 | set.seed(1) # fixed seed so the simulated values below are reproducible 11 | df1 = tibble( # reference cohort: cleanly spelt names + demographics/comorbidities 12 | name = c("John Doe", "Jane Smith", "Michael Johnson", "Emily Davis", "Brian J. Jones", 13 | "Patricia Miller", "Robert Wilson", "Linda Taylor", "David Anderson", "Barbara Thomas", 14 | "James Jackson", "Mary White", "Richard Harris", "Susan Martin", "Joseph Thompson", 15 | "Margaret Garcia", "Thomas Martinez", "Elizabeth Robinson", "Charles Clark", "Sarah Lewis"), 16 | age = sample(20:80, 20, replace = TRUE), 17 | bmi = round(runif(20, 18.5, 40), 1), 18 | diabetes = sample(c("Yes", "No"), 20, replace = TRUE), 19 | hypertension = sample(c("Yes", "No"), 20, replace = TRUE), 20 | asthma = sample(c("Yes", "No"), 20, replace = TRUE) 21 | ) 22 | df2 = tibble( # same patients, but name has deliberate typos/case/nickname variants (e.g. "Micheal", "DAVID ANDERSON", "Liz") for the fuzzy-matching demo 23 | name = c("John Doe", "Jane Smith", "Micheal Johnson", "Emily Davis", "Brian Jones", 24 | "Patricia Miller", "Robert Wilson", "Linda Taylor", "DAVID ANDERSON", "Barbara Thomes", "Mr Jackson", "Mary White", "Richard Harris", "Susan Martin", "Joseph Thompson", 25 | "Margaret Garica", "Thomas Martinez", "Liz Robinson", "Charlie Clark", "Sarah Lewis"), 26 | oxy_sat = round(runif(20, 90, 100), 1), 27 | medication1 = c("Paracetamol", "Amoxicillin", "Metformin", "Insulin", "Acetaminophen", "Paracetemol", "Insulin","Amoxicillin","paracetamol","Paracetamol", "Amoxicillin", "Metformin", "Insulin", "Acetaminophen","Paracetmol", "Diphenhydramine", "Ibuprofen", "Insulin", "Acetaminophen", "Paracetemol") # contains deliberate spelling variants of Paracetamol 28 | ) 29 | ``` 30 | # Joining 31 | - inner join or left join 32 | ```{r} 33 | df3 = df1 %>% 34 | left_join(df2, by = join_by(name)) 35 | ``` 36 | # Anti-join 37 | - what does not match? 38 | - Data wrangling 39 | ```{r} 40 | df4 = df1 %>% 41 | anti_join(df2, by = join_by(name)) 42 | ``` 43 | # Regex - for common mistakes 44 | - Could also be done via look-up tables - e.g.
different names for drugs 45 | - str_detect() (Paracet_mol) 46 | ```{r} 47 | str_detect(df2$medication1, "Paracet.*") # logical vector: which entries contain "Paracet" 48 | df5 = df2 %>% 49 | mutate(medication1 = tolower(medication1), # or str_to_lower() 50 | medication1 = if_else(str_detect(medication1, "paracet.*"), 51 | "paracetamol", 52 | medication1)) 53 | 54 | ``` 55 | # Difference Score 56 | - stringdist() 57 | - stringdistmatrix() 58 | - stringsim() 59 | - amatch() 60 | - compares differences (substitutions, deletions, additions) not synonyms/similar words 61 | - information on distance method in helptab for stringdist-metrics 62 | ```{r} 63 | stringdist(df1$name, df2$name, method = "osa") # element-wise distances between paired names 64 | stringdistmatrix(df1$name, df2$name) 65 | stringsim(df1$name, df2$name) 66 | ``` 67 | # Fuzzy matching 68 | - stringdist_inner_join() 69 | - stringdist\_\*\_join() 70 | - case sensitive? 71 | - maximum distance? 72 | ```{r} 73 | df6 = df1 %>% 74 | stringdist_left_join(df2, by = "name", 75 | method = "soundex", 76 | max_dist = 3, 77 | ignore_case = TRUE) 78 | ``` 79 | - 80 | - 81 | # ANSWERS 82 | # Joining 83 | ```{r} 84 | df1 %>% inner_join(df2, by = join_by(name)) # explicit key, consistent with the chunks above 85 | ``` 86 | # Anti-join 87 | ```{r} 88 | df1 %>% anti_join(df2, by = join_by(name)) # rows of df1 with no exact match in df2 89 | df2 %>% anti_join(df1, by = join_by(name)) # rows of df2 with no exact match in df1 90 | ``` 91 | # Regex - for common mistakes 92 | Also can be done via look-up tables 93 | ```{r} 94 | df2 %>% 95 | mutate(medication1 = case_when(str_detect(medication1, "(?i)paracet[a-z]*mol") ~ "Paracetamol", 96 | .default = medication1)) 97 | ``` 98 | # Difference Score 99 | ```{r} 100 | df1$name %>% stringdist(df2$name) 101 | df1$name %>% stringdistmatrix(df2$name) 102 | df1$name %>% stringsim(df2$name) 103 | df1$name %>% amatch(df2$name, maxDist = 5) # closest match in the list 104 | ``` 105 | # Fuzzy matching 106 | ```{r} 107 | df1 %>% stringdist_inner_join(df2, 108 | ignore_case = FALSE, 109 | distance_col = "distance", 110 | max_dist = 5) 111 | ``` 112 | --------------------------------------------------------------------------------
/2024-05/fuzzy_match_transcript.txt: -------------------------------------------------------------------------------- 1 | There we go without further ado. Hi, everybody. Welcome back to another healthy live demo. Thanks for sticking with us through the time change. I'm glad it looks like most people have been okay with that. Today, we're going to be doing a demo on fuzzy matching. We're going to be talking about it in doing fuzzy matching in different ways and different ways that you might want to match non perfect strings. I'm here today with Ewen Harrison, Professor Ewen Harrison, who is sharing his screen and he's going to be the one that's going to be live coding for us, and I'm going to be talking through some tasks and approaches that you can take. Ewen, if you would be able to in the files tab, there should be a quarto document called Fuzzy Match. That one right there. Can people see? You can zoom in a little bit. If you make the files and environment slightly smaller, yeah. That looks big if anyone want it bigger, let us know. But Anyone that's not super familiar with the posit Cloud, interface. This is just what typical R studio might look like when it's locally on your machine, but we're using a cloud space. It's got the usual four panels, although you and has currently minimised the console so that we can look at the top left panel, which is the script. That's the console down below, which you can minimise by clicking that button there. On the right, we've got our files and then the environment just above. We can switch through a bunch of different tabs on the bottom right but files, that's what's up at the moment. Great. Today, for our fuzzy matching. Instead of using an external dataset, I've created two data frames which are slightly smaller so that we're going to be able to look at the exact differences. 
But of course, these techniques can be used on bigger datasets though as you'll see, it can get quite tricky if you try and do things that are a bit too big. So it's always worth being aware with what task you're trying to tackle. If you just run the first chunk in, it loads the libraries we need, and then also creates those datasets. Perfect. Yes. The two data sets. If you just have a look through them, you'll see that they've both got names, but they have slightly different information in them otherwise. The first data frame maybe has some more combitis, their age, BMI, things like that. And the second data frame has some medication that the patient is on and also oxygen saturation. Basically, what we're going to do is we're going to try and join these datasets together. So we're going to start with that and then we will see what happens after there and we can discuss the different approaches in terms of if things aren't exact matches. So, if you wouldn't mind trying to just join the datasets either an inner join or a left join, just so that we can see how that works. You can hear me. I'm going to use tidyverse principles to join the two datasets. In this box, I'm going to put data frame one and I'm copying them so I get the names right. Data Frame two. I'm just going to run them to make sure that I've got that syntax. There's data frame one coming up. Data Frame two coming up. Now I've been tipped with inner join or left. Left join is what I would normally start with. I'm I'm using the tiny verse version of the pipe, which is control shift and enter. I'm going to I'll just use the exact name there. I'm going to put that in like that. I'm just going to run that like that. Now I get this thing saying joining by join by name. If I was being proper about it, I would include that in there so that I'm specifying how it gets joined. That's really important if you have got common names across the two datasets, which isn't the case here. 
The first thing I can do when I look at this is just see what the size of it is. This is 20 by eight, and this is 20 long, and this is long and this is two columns 34, five, six, seven. It looks like it might have worked, but I'm guessing it hasn't. Yeah. It has the concept of a left join is it's taking all of the rows within the left or the first data frame that we're using. In this instance, it's data frame one. And it is then taking the matches from data frame two and putting them together. So if you within that join that you did, if you scroll along to there, you'll see that there's some A in the data frame from the variables that didn't have an exact match from the first data frame. Yeah, exactly. Because we know this is a demo on fuzzy matching. This was done on purpose. I didn't want them all to match. On useful thing I use for finding out the variables that might not be matching is using anti join, which is follows similar syntax to an inner join or a left join or a join, except it uses anti join instead. What it does is it shows the rows that aren't full matches between the two data frames. That's a great thing to do in this situation. Yeah. Here we've got the eight rows from data frame one, which didn't have a match in data frame two. It's a quick tip here that you can see, these are eight rows that didn't fit into my join, and then you can start investigating that might be. An important thing, I think to note is that in this instance, you might then depending what the variable is and use some data wrangling to fix the spelling mistakes as opposed to doing what we're going to do over the course of this demo is try to do fuzzy joining basing on similarities. You could correct your data frame. If there's few enough mistakes, for example, obviously can get quite tricky if you have thousands of rows and you can't necessarily individually fix them all. 
But just something to bear in mind that I would recommend doing data wrangling and cleaning and those things first. Before we go into other fuzzy joints. One thing to help in this kind of thing is regular expressions. These can be quite tricky. We have done a demo on string R in the past and that covered a lot of regular expressions. I thought we could quickly try and show an example of a regular expression before we move on. No by the names, but by the medications that in data frame two. There is a lot of spelling mistakes of the word paracetamol. Yeah. It's just this is a simple spelling mistake that I did in in the script, there's always just y, it's that letter, that or MO or L or nothing at all. This is a simple spelling mistake. But I guess this is also to show that it can quickly get quite complicated and it can be hard to encompass all of the possible variations that you might get from a spelling mistake in a regular expression. But, if you wanted to give that a go with using a mutate take to change the variable medication and use string string R. STR detect with an I was going to suggest a case when, but if you had another suggestion. And you're testing me. Okay. Let me see if I can get it working first of all, so I'm just going to use old style string detect medication. I don't know if something like that. That's detecting all of them that have parasite. I don't carry regular expressions in my head, and I really need to google like crazy when I'm doing this I find it quite difficult. But this is a regular expression that takes past and then takes any other character, which is the any number of times following that, and string de pe a tru false. It's a logical function based on whether the conditions of this regular expression have been fulfilled or not. The first one is true and the sixth one is true, and if we go back to our dataset, the first one is true and the sixth one is true. There's a way of identifying our parast. 
Now, let's put this up here she wants me to do something different. What I could do is just take everything that started with paracet and then replace it with the correct spelling of paracetamol. That's probably what I would do. I know there's capitalization as well. I would do a two lower case for all of these strings as well as trimming white space at each at the beginning and the end of the string, just as a general starting point for any of these string manipulations. Why do you just tell me what you want me to do say that so we don't get bogged down in mine? No, I think that that kind of solution is exactly right. So if you just do a mutate and then do exactly what you're saying, like a case win or FL. They'll do FL. I don't use titvers FL with the This works as What's the variable name medicate. I'm using mutate because I want to either make a new column or change a column, make a new variable change a variable, so I'm mutating the data frame, piping in. I want to change medication one. I've got FL, and I'm going to put in my condition here. Into the condition of FLS, but I'm going to take out my old style data frame and dollar because that's all automatically passed to this. I think stop if this is wrong. This is going to give me by true false is true falses. Then the second argument in FLS is what to do if it's true, if it's true, I want to say ParacetAmol, if it's false, I just want to pass back to me medication one. We'll put that into data frame five. I haven't got an error. I bring up data frame five. What I'm hoping now is all of the paracetamols are correct. There may be because I have This regular expression will apply case. This is only going to select the paracetamols that are starting with a capital P. It's not going to select out the paracetamols that start with a small p. If I had this is still wrong because as a factor, this is going to be seen as a different thing. I may want to do a two or something like that first. Right. That's perfect. 
I think that is This is a vaguely simple spelling mistake, but it just goes to show how complicated. It can become quite quickly. Like an says, with regular expressions, it's usually a lot of frantic googling on remembering the logic that you can use. It can be hard to encompass lots of different mistakes and all the possibilities of mistakes that people could use. Instead of doing something like regular expressions. We sometimes talk about lookup tables, which is also maybe another option in terms of if it's not a spelling mistake, but goes by a different drug name for example. You could use a lookup table and switch things out that way. But instead of doing what we're going to now talk about is or unless we're going to fix this error for. Let's that to talk about what we came here for is a bit of fuzzy matching. We're going to start by Using these functions called stringdist, stringdistmatrix and stringsim It's all about calculating a distance score between two strings. And This is a way of looking into the differences between two strings based on additions, deletions, and substitutions and those kind of things. That's how R is going to be taking two strings and calculating how different they are. It's important to note that in these methods, it's not going to understand, for instance, the synonyms of a word or how similar. We know the words to be it'll basically just look at the the differences in terms of the actual characters. For example, the word hard and herd, as in herd, will be more similar than hard and harder despite those two being much more similar words. Just because it takes one substitution to become hard, but two additions to become harder. Sorry for taking long to do that. That's all right. Okay. So, what we're going to have a go at we'll start with the string first, which basically what we're going to have to take. We're going to want to grab the name column from both of the data frames. I was doing the old fashioned dollar sign grab of the con. 
But yet having a look through the Help is definitely useful. And is it medications or going to names, sorry. So in your own research, it might be more likely to be matching on different things, but we're here, the spelling mistakes and the not quite exact matches are within the names. There we go. That was nice and simple and the output there, you can see that it's for each of the match of the names within those two columns. It's given you a different score. So zero is an exact match all the way up to well, however many characters are within the name. Say that John Doe and John Doe exact James Smith and James Smith were exact. But Michael Johnson was slightly different in the switch around of the A. Yeah. Now we can try the same thing, but if we try a matrix instead. This example, it's taking the first one and matching it with the first one in the other data frame. But if we want to look at the distance score between all of the different options, you can do the exact same thing but use the function stirngdistmatrix There is a bit more of a complicated output, but you'll see it's for each of the data frames we've got. So the best match in the middle. The two data frames that I made happened to have essentially the match in the same order, but they wouldn't necessarily be that way. This is a way that you could. That's just another way that you could get a matrix to have a check of that. The third one that I gave the string similarity, it's very similar to the string distance, but it gives you a score on 0-1 score. With this time, one being the perfect match. I'm just looking at there's obviously different methods that can be used for the matching. OSA is the default and the one that people can get taught is the Jaro, whats the second guy's name. Winkler, is it? Which is how many letters are the same, how many let many transpositions are there that are different and then a little equation that changes them on a scale of zero to one to give a match. Yes. 
Yeah, you can change in the method. You can change the different one to what you might want. Yeah, exactly. You might want to use the one you're more familiar with than just the default because that's not always what you want to do. Perfect. Now we've calculated distance scores, we can see how similar our words are. But what we might actually want to do is to join our data frames. Now, I'm going to caveat by saying, you have to be very careful to do fuzzy matching, fuzzy joining. It's not something that you might want to do on a large scale if you can't actually go back and double check things. But for the sake of completeness of this demo, I thought it'd be nice to show you some pretty simple functions that you can use to join two data frames based on not exact matches. We're going to be using this package, which is called fuzzy join and it uses similar syntax to inter join folder and left join, like they do in the tiny verse except it is using string dis at the start. If we have a goal at joining the two data frames based on the t we go, use that one. And use strings join instead. Is the by syntax the same? I think so. We of going over my I'll get rid of this just. Joining by name anyway seemed to work last time. So you'll see on the Help tab that there was a default for the maximum distance. That's set at two. If there is any differences that were lower than two, it wouldn't join based on that. You can modify that to whatever you want. That's why some of the variables still were NAs because the algorithm didn't match them closely enough to match them within this specification that we put. We can make it higher or lower. It's all about finding a happy medium so that you don't match things that are too distantly related and that it can there can be quite a lot of overlaps eventually. But also, you want to make sure that it encompasses enough of your variations there. Michael Johnson and Michael Johnson have been matched because let's just go back to this. 
Here's the distances. This is a distance of one, but this is a distance of three, and we've set a default distance of two, it'll match anything which has got a distance of one and not match anything that's a distance over one, which is why we see row three, that is now matched, but row five has not matched. As always with these things, I've never done this before. But if we now go to Max dist equals three, then Maybe it has to be over. That has that changed? Interesting. Brian is still not coming in. Dunno? Could it be the method that it's using? Probably the method will be different to the Because you made this one so big, the distance between John Doe and Jane Smith or John Doe and Mr. Jackson is enough that has joined them and given you extra entries within your data set. So that's why you have to be careful with how much of a distance you allow in your join. So that so or at that time. Brian is now in. But David Anderson so line nine. David Anderson. All the letters are different because they're all, well, They're all capitals, one, two, three, so that's the distance of 11 cause they're all caps. So in this one, this example, there is you can ignore underscore case within the strings join and by default. Yeah, if you change that to true, that will know ignore the case. It's still different. Interesting. That's very interesting because ignoring the case, it should be an exact match. It's got David Anderson that time. Maybe I didn't do it right. Let me just make sure the by term is correct. Okay, so that's that's back to three. It's not matching Brian Jones Until we get to six. Yes Bizarre. I'm not quite sure, especially because now we've made sure to match the method that we were using. Okay. But anyway, that is I guess, an example of why fuzzy matching can be quite complicated and quite tricky, and you have to make sure you are aware of what you're doing. 
Things like data wrangling, data cleaning, probably the most important things to do at the beginning so that you limit how much Fuzzy matching you need to do. But it's always good to know that these things are possible and an option and can be quite useful things like regular expressions can get quite complicated, quite fast. Similarity scores can give you useful insights into your data and can help inform decisions that you make going forward. Yeah, that's pretty much all I had planned unless anyone has any question about fuzzy matching, In one of our COVID projects, we matched the drugs of 350,000 patients against BNF standard drugs using a fuzzy join. The drugs have been typed in by research nurses. There was lots of mistakes. We picked a threshold and just said if it's close enough to the BNF standard using a lookup table. Then call it that. It took it took maybe three or four days to run that, running on a server, running away, but worked very well in terms of improving the matching of the particular drugs with it was something like a 1% false positive and you're just trading off the true positive versus false positive. I think that thing in terms of matching on drugs and maybe like hospital names or things like that are more likely to come up in people's research. Definitely, but it can be computationally. There's a question coming up saying, you ever use longest not sure of the difference between that and string. I'm not sure about you. Do you want to unmute and speak about it claire. Or you maybe can't. No mic. No mics. Well, I mean, stringdist is just a way of applying a number of different matching algorithms or distance methods to a string, and there are different distance methods that start very simply and then get much more complicated. The Jaro algorithm, jaro which is the jW one, as I say, just counts letters that are different and counts up transpositions. That's why I was looking at this because for this, it would say Brian is all correct. 
Then I think it would count up one, two, three, four, five, six, seven, as wrong. And not consider them transpositions, but it depends on how transpositions are usually just one letter apart. So that would count that up as seven, and then the The winkler part of it was to emphasise the start of the word. If there was a prefix match, then that upped the matching score because a lot of words people get right at the start and then get wrong at the end. I tried to wait earlier letters in the word compared with late letters in the word. String just provides a number of different ways of capturing the difference between two strings, and in different situations you might want to capture them in different ways depending on what the task you were trying to do. I'm guessing longest common substring clear is looking at two strings and then looking for the well I'm just rearranging the words of what you've said, but but capturing the longest common subset. For paracetamol with that error at the end, Paracet would be the longest substring, which would be seven characters log. I don't know if that I don't know if that is option. Do you know if that's an option in this? Yeah, the metric it talks three all the different options. Yeah, there you go. Longest common substr. We we do that? That looks like it's given us a perfect match. You just have to watch these, don't you because this might be. You just need to watch the units of these the longest common substring is defined as the longest string that can be obtained by pairing characters from A and B while keeping the order of the characters in. That's what we said. The LCS distance is defined as the number of characters. The distance is one minus the match on a scale of 01. Okay. Yeah, looks good. Always really good to think about the different options. Because the simpler one while, sometimes easier to understand, might not be the best option for your da. Yeah. 
Square is also saying sound to create phonetic variations can be useful. That sounds really good. Interesting to know whether that worked for names and that names are sometimes pronounced differently. But there's a perfect match on that as well. With no false positives. You know there's false positives if you get multiple if you get multiple rows. If this is more than 20, you've matched more than one from dataframe 1 to dataframe 2. Great. Brilliant. Okay. I was a really useful point for discussion. Thank you, Claire. I will stop the recording now, but thanks everyone for listening in to our fuzzy matching. -------------------------------------------------------------------------------- /2024-08/tidymodels.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Tidymodels" 3 | format: html 4 | editor: visual 5 | --- 6 | 7 | # Tidymodels Demo 8 | 9 | Tidymodels is a collection of packages for modelling in R. One of the main developers is Max Kuhn, who developed the Caret package, and tidymodels uses a lot of this functionality, in combination with other packages for modelling. Tidymodels utilises tidyverse-style programming. 
10 | 11 | ### Helpful links: 12 | 13 | https://www.tmwr.org/ - tidymodels book 14 | 15 | https://www.tidymodels.org/find/recipes/ - preprocessing options 16 | 17 | https://www.tmwr.org/pre-proc-table.html - example preprocessing recipe 18 | 19 | https://www.tidymodels.org/find/parsnip/ - model specification options 20 | 21 | ## Data 22 | 23 | ```{r} 24 | library(tidyverse) 25 | library(tidymodels) 26 | 27 | diabetes = read_csv("diabetes_binary_health_indicators.csv") %>% 28 | sample_n(5000) 29 | 30 | diabetes = diabetes %>% 31 | mutate(across(c("Diabetes_binary", "HighBP", "HighChol", "CholCheck", "Smoker", "Stroke", 32 | "HeartDiseaseorAttack", "PhysActivity", "Fruits", "Veggies", 33 | "HvyAlcoholConsump", "AnyHealthcare", "NoDocbcCost", "DiffWalk", "Sex"), as.factor)) %>% 34 | mutate(added = 1, 35 | subjid = 1:n(), 36 | Diabetes_binary = case_when(Diabetes_binary == "1" ~ "Yes", 37 | Diabetes_binary == "0" ~ "No", 38 | .default = NA) %>% fct_relevel("Yes")) 39 | 40 | 41 | diabetes %>% 42 | glimpse() 43 | 44 | diabetes = diabetes %>% 45 | janitor::clean_names() 46 | 47 | ``` 48 | 49 | ## Data Split 50 | 51 | ```{r} 52 | split = diabetes %>% initial_split(prop = 0.8) 53 | diabetes_train = training(split) 54 | diabetes_test = testing(split) 55 | ``` 56 | 57 | ## Recipes 58 | 59 | https://www.tidymodels.org/find/recipes/ 60 | 61 | - remove columns with zero variance 62 | - create dummy variable 63 | - change role - update_role() 64 | 65 | ```{r} 66 | diabetes_recipe = 67 | recipe(diabetes_binary ~ ., data = diabetes_train) %>% 68 | step_zv() %>% 69 | step_ 70 | ``` 71 | 72 | ## Specifications 73 | 74 | https://www.tidymodels.org/find/parsnip/ 75 | 76 | Start with decision tree 77 | 78 | Change to logistic regression? 
79 | 80 | ```{r} 81 | 82 | diabetes_tree_spec = decision_tree(cost_complexity = 0.0001) %>% 83 | set_engine("rpart") %>% 84 | set_mode("classification") 85 | 86 | ``` 87 | 88 | ## Workflows 89 | 90 | Use workflow() 91 | 92 | ```{r} 93 | diabetes_workflow = 94 | workflow() %>% 95 | add_recipe(_) %>% 96 | add_model(_) 97 | 98 | diabetes_fit = 99 | fit(_) 100 | ``` 101 | 102 | ## Data Metrics 103 | 104 | roc_auc() roc_curve() conf_mat() 105 | 106 | ```{r} 107 | # predict with augment() 108 | augment(diabetes_fit, new_data = diabetes_train) 109 | 110 | 111 | 112 | ``` 113 | 114 | ## Evaluate Test Set 115 | 116 | last_fit() Using workflow and data split 117 | 118 | ```{r} 119 | 120 | ``` 121 | 122 | # ANSWERS 123 | 124 | ## Recipes 125 | 126 | https://www.tidymodels.org/find/recipes/ 127 | 128 | - remove columns with zero variance 129 | 130 | - create dummy variable 131 | 132 | - normalise 133 | 134 | - change role 135 | 136 | ```{r} 137 | diabetes_recipe = 138 | recipe(diabetes_binary ~ ., data = diabetes_train) %>% 139 | step_zv() %>% 140 | step_dummy() %>% 141 | update_role(subjid, new_role = "id") 142 | 143 | diabetes_recipe %>% prep() %>% juice() 144 | ``` 145 | 146 | ## Specifications 147 | 148 | https://www.tidymodels.org/find/parsnip/ 149 | 150 | Start with decision tree 151 | 152 | Change to logistic regression? 
153 | 154 | ```{r} 155 | diabetes_tree_spec = decision_tree(cost_complexity = 0.0001) %>% 156 | set_engine("rpart") %>% 157 | set_mode("classification") 158 | 159 | diabetes_lr_spec = logistic_reg() %>% 160 | set_engine("glm") %>% 161 | set_mode("classification") 162 | ``` 163 | 164 | ## Workflows 165 | 166 | Use workflow() 167 | 168 | ```{r} 169 | diabetes_workflow = 170 | workflow() %>% 171 | add_recipe(diabetes_recipe) %>% 172 | add_model(diabetes_tree_spec) 173 | 174 | diabetes_fit = 175 | fit(diabetes_workflow, diabetes_train) 176 | ``` 177 | 178 | ## Data Metrics 179 | 180 | ```{r} 181 | # predict with augment() 182 | augment(diabetes_fit, new_data = diabetes_train)%>% 183 | roc_auc(truth = diabetes_binary, .pred_Yes) 184 | 185 | augment(diabetes_fit, new_data = diabetes_train)%>% 186 | roc_curve(truth = diabetes_binary, .pred_Yes) %>% 187 | autoplot() 188 | 189 | augment(diabetes_fit, new_data = diabetes_train)%>% 190 | conf_mat(truth = diabetes_binary, .pred_class) %>% 191 | autoplot(type = "heatmap") 192 | ``` 193 | 194 | ## Evaluate 195 | 196 | ```{r} 197 | diabetes_fit_final = last_fit(diabetes_workflow, split) 198 | 199 | collect_metrics(diabetes_fit_final) 200 | ``` 201 | 202 | ## Resamples 203 | 204 | ```{r} 205 | diabetes_folds = bootstraps(diabetes_train) 206 | 207 | 208 | resample_diabetes = 209 | fit_resamples(diabetes_workflow, diabetes_folds, metrics = metric_set(accuracy, roc_auc, brier_class), 210 | control = control_resamples(save_pred = TRUE)) 211 | 212 | resample_diabetes %>% 213 | collect_predictions() %>% 214 | roc_auc(truth = diabetes_binary, .pred_Yes) 215 | 216 | resample_diabetes %>% 217 | collect_predictions() %>% 218 | roc_curve(truth = diabetes_binary, .pred_Yes) %>% 219 | autoplot() 220 | ``` 221 | -------------------------------------------------------------------------------- /2024-09/tidytext.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Tidytext Demo" 3 | author: 
"HealthyR" 4 | format: html 5 | editor: visual 6 | --- 7 | ## Tidytext 8 | 9 | ```{r} 10 | library(tidyverse) 11 | library(tidytext) 12 | library(taylor) 13 | tswift = taylor_album_songs 14 | ``` 15 | ## Explore the data 16 | ```{r} 17 | # library(finalfit) 18 | # missing_glimpse(tswift) 19 | ``` 20 | ## Unnest the lyrics tibbles 21 | ```{r} 22 | tswift = 23 | taylor_album_songs %>% 24 | unnest(lyrics) 25 | ``` 26 | unnest_tokens() 27 | ```{r} 28 | # Use unnest_tokens() to split the lyrics column into individual tokens 29 | tswift = tswift %>% 30 | unnest_tokens(output = word, input = lyric) 31 | # tswift %>% 32 | # unnest_tokens(output = word, input = lyric, token = "ngrams", n = 2) 33 | ``` 34 | ## Word frequencies 35 | How frequent is each word? What are the most frequently used words? 36 | ```{r} 37 | tswift %>% 38 | count(word, sort = TRUE) 39 | ``` 40 | ## Stop words 41 | Here you can access many different stopwords in a tidy format, in different languages/sources. 42 | ```{r} 43 | stopwords = get_stopwords(language = "en") 44 | ``` 45 | ## Remove the stop words from the taylor swift lyrics 46 | ```{r} 47 | tswift = tswift %>% 48 | filter(!word %in% stopwords$word) 49 | # tswift %>% 50 | # anti_join(stopwords, by = join_by(word)) 51 | ``` 52 | ## Sentiment 53 | ```{r} 54 | sentiments = get_sentiments("bing") 55 | sentiments %>% count(word, sort = TRUE) 56 | sentiments %>% 57 | filter(word %in% c("envious", "enviously", "enviousness")) 58 | sentiments = sentiments %>% 59 | filter(!c(word == "envious" & sentiment == "positive")) %>% 60 | filter(!c(word == "enviously" & sentiment == "positive")) %>% 61 | filter(!c(word == "enviousness" & sentiment == "positive")) 62 | ``` 63 | ## Join the token sentiments to the Taylor Swift lyrics 64 | ```{r} 65 | sentiments %>% count(word, sort = TRUE) 66 | tswift = left_join(tswift, sentiments, by = join_by(word)) 67 | ``` 68 | Make a plot to visualise which albums use more positive vs negative words 69 | ```{r} 70 | tswift 
%>% 71 | drop_na(sentiment) %>% 72 | group_by(album_name) %>% 73 | count(sentiment, word, sort = TRUE) %>% 74 | slice_max(n, n = 15) %>% 75 | ungroup() %>% 76 | ggplot(aes(n, fct_reorder(word, n), fill = album_name)) + 77 | geom_col(show.legend = FALSE) + 78 | facet_wrap(~album_name, scales = "free") + 79 | labs(x = "n", y = NULL) 80 | ``` 81 | # Answers 82 | ## Explore 83 | ```{r} 84 | tswift %>% 85 | count(album_name) 86 | ``` 87 | ## Unnest tokens 88 | ```{r} 89 | tswift = 90 | taylor_album_songs %>% 91 | unnest(lyrics) 92 | tidy_taylor = tswift %>% 93 | unnest_tokens(word, lyric) 94 | # also bigrams 95 | tidy_taylor2 = tswift %>% 96 | unnest_tokens(bigram, lyric, token = "ngrams", n = 2) 97 | ``` 98 | ## Word frequencies 99 | What are the most frequently used words? 100 | ```{r} 101 | tidy_taylor %>% 102 | count(word, sort = TRUE) 103 | ``` 104 | ## Stop words 105 | Here you can access many different stopwords in a tidy format, in different languages/sources. 106 | ```{r} 107 | get_stopwords(language = "en") 108 | ``` 109 | Remove the stop words from the words used in Taylor Swift songs 110 | ```{r} 111 | tidy_taylor %>% 112 | anti_join(get_stopwords(language = "en")) %>% 113 | count(word, sort = TRUE) 114 | ``` 115 | ## Sentiment 116 | ```{r} 117 | get_sentiments("bing") 118 | ``` 119 | ```{r} 120 | taylor_sentiment = tidy_taylor %>% 121 | inner_join(get_sentiments("bing")) 122 | ``` 123 | ```{r} 124 | # most common positive and negative words used 125 | words_count = taylor_sentiment %>% 126 | count(word, sentiment) 127 | top_words <- words_count %>% 128 | group_by(sentiment) %>% 129 | slice_max(n, n = 10) %>% 130 | ungroup() %>% 131 | mutate(word = reorder(word, n)) 132 | ggplot(top_words, aes(n, word, fill = sentiment)) + 133 | geom_col(show.legend = FALSE) + 134 | facet_wrap(~ sentiment, scales = "free") 135 | ``` 136 | Lexicons are not foolproof tools: e.g. 
"like" is being over represented as a positive word 137 | ```{r} 138 | taylor_sentiment %>% 139 | count(album_name, sentiment) 140 | taylor_sentiment %>% 141 | ggplot(aes(x = album_name, fill = sentiment)) + 142 | geom_bar(position = "fill") + 143 | coord_flip() 144 | ``` 145 | -------------------------------------------------------------------------------- /2024-11/tidymodels_resample.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Tidymodels" 3 | format: html 4 | editor: visual 5 | --- 6 | 7 | # Tidymodels Demo 8 | 9 | Tidymodels is a collection of packages for modelling in R. One of the main developers is Max Kuhn, who developed the Caret package, and tidymodels uses a lot of this functionality, in combination with other packages for modelling. Tidymodels utilises tidyverse-style programming. 10 | 11 | ### Helpful links: 12 | 13 | https://www.tmwr.org/ - tidymodels book 14 | 15 | https://www.tidymodels.org/find/recipes/ - preprocessing options 16 | 17 | https://www.tmwr.org/pre-proc-table.html - example preprocessing recipe 18 | 19 | https://www.tidymodels.org/find/parsnip/ - model specification options 20 | 21 | ## Data 22 | 23 | ```{r} 24 | library(tidyverse) 25 | library(tidymodels) 26 | 27 | diabetes = read_csv("diabetes_binary_health_indicators.csv") %>% 28 | sample_n(5000) 29 | 30 | diabetes = diabetes %>% 31 | mutate(across(c("Diabetes_binary", "HighBP", "HighChol", "CholCheck", "Smoker", "Stroke", 32 | "HeartDiseaseorAttack", "PhysActivity", "Fruits", "Veggies", 33 | "HvyAlcoholConsump", "AnyHealthcare", "NoDocbcCost", "DiffWalk", "Sex"), as.factor)) %>% 34 | mutate(added = 1, 35 | subjid = 1:n(), 36 | Diabetes_binary = case_when(Diabetes_binary == "1" ~ "Yes", 37 | Diabetes_binary == "0" ~ "No", 38 | .default = NA) %>% fct_relevel("Yes")) 39 | 40 | 41 | diabetes %>% 42 | glimpse() 43 | 44 | diabetes = diabetes %>% 45 | janitor::clean_names() 46 | 47 | ``` 48 | 49 | ## Data Split 50 | 51 | ```{r} 
52 | split = diabetes %>% initial_split(prop = 0.8) 53 | diabetes_train = training(split) 54 | diabetes_test = testing(split) 55 | ``` 56 | 57 | ## Recipes 58 | 59 | https://www.tidymodels.org/find/recipes/ 60 | 61 | - remove columns with zero variance 62 | - create dummy variable 63 | - change role - update_role() 64 | 65 | ```{r} 66 | diabetes_recipe = 67 | recipe(diabetes_binary ~ ., data = diabetes_train) %>% 68 | step_zv() %>% 69 | step_dummy() %>% 70 | update_role(subjid, new_role = "id") 71 | 72 | diabetes_recipe %>% prep() %>% juice() 73 | ``` 74 | 75 | ## Specifications 76 | 77 | https://www.tidymodels.org/find/parsnip/ 78 | 79 | Start with decision tree 80 | 81 | Change to logistic regression? 82 | 83 | ```{r} 84 | 85 | diabetes_tree_spec = decision_tree(cost_complexity = 0.0001) %>% 86 | set_engine("rpart") %>% 87 | set_mode("classification") 88 | 89 | diabetes_lr_spec = logistic_reg() %>% 90 | set_engine("glm") %>% 91 | set_mode("classification") 92 | 93 | ``` 94 | 95 | ## Workflows 96 | 97 | Use workflow() 98 | 99 | ```{r} 100 | diabetes_workflow = 101 | workflow() %>% 102 | add_recipe(diabetes_recipe) %>% 103 | add_model(diabetes_tree_spec) 104 | 105 | diabetes_fit = 106 | fit(diabetes_workflow, diabetes_train) 107 | ``` 108 | 109 | ## Data Metrics 110 | 111 | roc_auc() roc_curve() conf_mat() 112 | 113 | ```{r} 114 | # predict with augment() 115 | augment(diabetes_fit, new_data = diabetes_train)%>% 116 | roc_auc(truth = diabetes_binary, .pred_Yes) 117 | 118 | augment(diabetes_fit, new_data = diabetes_train)%>% 119 | roc_curve(truth = diabetes_binary, .pred_Yes) %>% 120 | autoplot() 121 | 122 | augment(diabetes_fit, new_data = diabetes_train)%>% 123 | conf_mat(truth = diabetes_binary, .pred_class) %>% 124 | autoplot(type = "heatmap") 125 | 126 | ``` 127 | 128 | ## Evaluate Test Set 129 | 130 | last_fit() Using workflow and data split 131 | 132 | ```{r} 133 | diabetes_fit_final = last_fit(diabetes_workflow, split) 134 | 135 | 
collect_metrics(diabetes_fit_final) 136 | ``` 137 | 138 | ## Resamples 139 | 140 | You can only use your test set once! What can you do instead? Resampling simulates how well your model does when exposed to new data 141 | 142 | ### Folds 143 | 144 | https://rsample.tidymodels.org/reference/ Fitting with the folds, we don't retain the parameters, only the performance to evaluate 145 | 146 | ```{r} 147 | set.seed(1) 148 | # create folds or bootstraps 149 | ``` 150 | 151 | # Fit resamples 152 | 153 | ```{r} 154 | # Decision tree 155 | 156 | set.seed(2) 157 | resample_diabetes_tree = 158 | fit_resamples(diabetes_workflow, 159 | diabetes_folds, 160 | metrics = metric_set(sens, spec, accuracy, roc_auc, brier_class), 161 | control = control_resamples(save_pred = TRUE)) 162 | 163 | resample_diabetes_tree %>% unnest(.metrics) 164 | 165 | # Logistic Regression 166 | 167 | set.seed(2) 168 | resample_diabetes_lr = 169 | fit_resamples() 170 | ``` 171 | 172 | # Evaluate 173 | 174 | collect_metrics() 175 | 176 | ```{r} 177 | 178 | ``` 179 | 180 | collect_predictions() roc_auc() autoplot() 181 | 182 | ```{r} 183 | # tree 184 | 185 | resample_diabetes_tree %>% 186 | collect_predictions() %>% 187 | roc_auc(truth = diabetes_binary, .pred_Yes) 188 | 189 | resample_diabetes_tree %>% 190 | collect_predictions() %>% 191 | roc_curve(truth = diabetes_binary, .pred_Yes) %>% 192 | autoplot() 193 | 194 | 195 | # logistic regression 196 | 197 | ``` 198 | 199 | # Added challenge 200 | 201 | ```{r} 202 | # get them on the same plot? 
203 | resample_diabetes_tree %>% 204 | unnest(.predictions) %>% 205 | mutate(model = "tree") %>% 206 | bind_rows(resample_diabetes_lr %>% 207 | unnest(.predictions) %>% 208 | mutate(model = "glm")) %>% 209 | group_by(model) %>% 210 | roc_curve(truth = diabetes_binary, .pred_Yes) %>% 211 | autoplot() 212 | 213 | ``` 214 | 215 | # ANSWERS 216 | 217 | ## Recipes 218 | 219 | https://www.tidymodels.org/find/recipes/ 220 | 221 | - remove columns with zero variance 222 | 223 | - create dummy variable 224 | 225 | - normalise 226 | 227 | - change role 228 | 229 | ```{r} 230 | diabetes_recipe = 231 | recipe(diabetes_binary ~ ., data = diabetes_train) %>% 232 | step_zv() %>% 233 | step_dummy() %>% 234 | update_role(subjid, new_role = "id") 235 | 236 | diabetes_recipe %>% prep() %>% juice() 237 | ``` 238 | 239 | ## Specifications 240 | 241 | https://www.tidymodels.org/find/parsnip/ 242 | 243 | Start with decision tree 244 | 245 | Change to logistic regression? 246 | 247 | ```{r} 248 | diabetes_tree_spec = decision_tree(cost_complexity = 0.0001) %>% 249 | set_engine("rpart") %>% 250 | set_mode("classification") 251 | 252 | diabetes_lr_spec = logistic_reg() %>% 253 | set_engine("glm") %>% 254 | set_mode("classification") 255 | ``` 256 | 257 | ## Workflows 258 | 259 | Use workflow() 260 | 261 | ```{r} 262 | 263 | # decision tree 264 | diabetes_workflow_tree = 265 | workflow() %>% 266 | add_recipe(diabetes_recipe) %>% 267 | add_model(diabetes_tree_spec) 268 | 269 | diabetes_fit_tree = 270 | fit(diabetes_workflow_tree, diabetes_train) 271 | 272 | # logistic regression 273 | diabetes_workflow_lr = 274 | workflow() %>% 275 | add_recipe(diabetes_recipe) %>% 276 | add_model(diabetes_lr_spec) 277 | 278 | diabetes_fit_lr = 279 | fit(diabetes_workflow_lr, diabetes_train) 280 | ``` 281 | 282 | ## Data Metrics 283 | 284 | ```{r} 285 | # tree 286 | 287 | # predict with augment() 288 | augment(diabetes_fit_tree, new_data = diabetes_train)%>% 289 | roc_auc(truth = diabetes_binary, .pred_Yes) 
290 | 291 | augment(diabetes_fit_tree, new_data = diabetes_train)%>% 292 | roc_curve(truth = diabetes_binary, .pred_Yes) %>% 293 | autoplot() 294 | 295 | augment(diabetes_fit_tree, new_data = diabetes_train)%>% 296 | conf_mat(truth = diabetes_binary, .pred_class) %>% 297 | autoplot(type = "heatmap") 298 | 299 | 300 | # logistic reg 301 | 302 | # predict with augment() 303 | augment(diabetes_fit_lr, new_data = diabetes_train)%>% 304 | roc_auc(truth = diabetes_binary, .pred_Yes) 305 | 306 | augment(diabetes_fit_lr, new_data = diabetes_train)%>% 307 | roc_curve(truth = diabetes_binary, .pred_Yes) %>% 308 | autoplot() 309 | 310 | augment(diabetes_fit_lr, new_data = diabetes_train)%>% 311 | conf_mat(truth = diabetes_binary, .pred_class) %>% 312 | autoplot(type = "heatmap") 313 | ``` 314 | 315 | ## Evaluate 316 | 317 | ```{r} 318 | # tree 319 | 320 | diabetes_fit_final_tree = last_fit(diabetes_workflow_tree, split) 321 | 322 | collect_metrics(diabetes_fit_final_tree) 323 | 324 | # log reg 325 | 326 | diabetes_fit_final_lr = last_fit(diabetes_workflow_lr, split) 327 | 328 | collect_metrics(diabetes_fit_final_lr) 329 | ``` 330 | 331 | ## Resamples 332 | 333 | You can only use your test set once! What can you do instead? 
Resampling simulates how well your model does when exposed to new data 334 | 335 | ### Folds 336 | 337 | https://rsample.tidymodels.org/reference/ Fitting with the folds, we don't retain the parameters, only the performance to evaluate 338 | 339 | ```{r} 340 | set.seed(1) 341 | diabetes_folds = vfold_cv(diabetes_train) 342 | ``` 343 | 344 | # Fit resamples 345 | 346 | ```{r} 347 | # Decision tree 348 | 349 | set.seed(2) 350 | resample_diabetes_tree = 351 | fit_resamples(diabetes_workflow_tree, 352 | diabetes_folds, 353 | metrics = metric_set(sens, spec, accuracy, roc_auc, brier_class), 354 | control = control_resamples(save_pred = TRUE)) 355 | 356 | resample_diabetes_tree %>% unnest(.metrics) 357 | 358 | # Logistic Regression 359 | 360 | set.seed(2) 361 | resample_diabetes_lr = 362 | fit_resamples(diabetes_workflow_lr, 363 | diabetes_folds, 364 | metrics = metric_set(sens, spec, accuracy, roc_auc, brier_class), 365 | control = control_resamples(save_pred = TRUE)) 366 | ``` 367 | 368 | # Evaluate 369 | 370 | ```{r} 371 | resample_diabetes_tree %>% collect_metrics() 372 | 373 | resample_diabetes_lr %>% collect_metrics() 374 | ``` 375 | 376 | ```{r} 377 | 378 | # tree 379 | 380 | resample_diabetes_tree %>% 381 | collect_predictions() %>% 382 | roc_auc(truth = diabetes_binary, .pred_Yes) 383 | 384 | resample_diabetes_tree %>% 385 | collect_predictions() %>% 386 | roc_curve(truth = diabetes_binary, .pred_Yes) %>% 387 | autoplot() 388 | 389 | 390 | # logistic regression 391 | 392 | resample_diabetes_lr %>% 393 | collect_predictions() %>% 394 | roc_auc(truth = diabetes_binary, .pred_Yes) 395 | 396 | resample_diabetes_lr %>% 397 | collect_predictions() %>% 398 | roc_curve(truth = diabetes_binary, .pred_Yes) %>% 399 | autoplot() 400 | ``` 401 | 402 | ```{r} 403 | # get them on the same plot? 
404 | resample_diabetes_tree %>% 405 | unnest(.predictions) %>% 406 | mutate(model = "tree") %>% 407 | bind_rows(resample_diabetes_lr %>% 408 | unnest(.predictions) %>% 409 | mutate(model = "glm")) %>% 410 | group_by(model) %>% 411 | roc_curve(truth = diabetes_binary, .pred_Yes) %>% 412 | autoplot() 413 | 414 | ``` 415 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Live R demos 2 | 3 | More information on the HealthyR: R for Health Data Science courses, upcoming live demos, clinics, and free resources can be found at https://healthyr.surgicalinformatics.org/ 4 | 5 | *Internal note: the demo bank lives in healthyr_admin.* 6 | 7 | ## library(finalfit) - quickly create elegant final results tables and plots 8 | 9 | This video covers: 10 | * demographics table 11 | * variable labels 12 | * odds ratio plots 13 | 14 | [melanoma dataset and R Markdown document (2022-04 folder in this repository)](2022-04) 15 | 16 | 17 | [![Watch the video](video_thumbnails/finalfit.png)](https://media.ed.ac.uk/media/HealthyR+demoA+finalfit.org/1_ed9ajpct) 18 | 19 | Presented by Riinu Pius and Ewen Harrison, recorded in April 2022. 20 | 21 | ## bar plots - geom_bar() or geom_col()? 22 | 23 | This video covers: 24 | * How to approach a brand new dataset: viewing data, variable names, and counts 25 | * Plotting uncounted, case-level data with geom_bar() 26 | * Summarising data and then plotting aggregated data with geom_col() 27 | * Reordering factors in geom_col() 28 | 29 | [R Script, including dataset loading (2022-05 folder in this repository)](2022-05) 30 | 31 | [![Watch the video](video_thumbnails/geom_bar.png)](https://media.ed.ac.uk/media/HealthyR%20demo%3A%20geom_bar()%20vs%20geom_col()/1_49w03lqk) 32 | 33 | Presented by Riinu Pius and Ewen Harrison, recorded in May 2022. 
34 | 35 | ## Reproducible example - reprex 36 | 37 | This video covers: 38 | * Generating dummy/example datasets with tibble() 39 | * Reproducing real dataset issues in an example dataset 40 | * Sharing minimal reproducible examples 41 | 42 | [R Markdown document including dataset loading (2022-06 folder in this repository)](2022-06) 43 | 44 | [![Watch the video](video_thumbnails/reprex.png)](https://media.ed.ac.uk/media/HealthyR%20demo%3A%20Reproducible%20examples%20(reprex)/1_zo8af9bg) 45 | 46 | Presented by Riinu Pius and Mathew Thorpe, Recorded in June 2022. 47 | 48 | ## Create a global map 49 | 50 | This video covers: 51 | * Exploring a brand new dataset 52 | * Creating a global map, customised based on your data 53 | * Troubleshooting issues based on mismatched countries 54 | 55 | [R Script including dataset loading (2022-08 folder in this repository)](2022-08) 56 | 57 | [![Watch the video](video_thumbnails/globalmap_demo.png)](https://media.ed.ac.uk/media/HealthyR%20demo%3A%20Global%20Map/1_sz1b44q4) 58 | 59 | Presented by Riinu Pius and Ewen Harrison, Recorded in August 2022. 60 | 61 | ## Reshape Data 62 | 63 | This video covers: 64 | * Prepare your data in a format suitable for analysis 65 | * Pivot longer and Pivot wider 66 | 67 | [R Script including dataset loading (2022-08 folder in this repository)](2022-08) 68 | 69 | [![Watch the video](video_thumbnails/reshape_data.png)](https://media.ed.ac.uk/media/HealthyR%3A%20Reshaping%20Data%20(pivot)/1_wmix0222) 70 | 71 | Presented by Riinu Pius, Ewen Harrison and Sarah Elliot, Recorded in August 2022. 
72 | 73 | ## Plot Improvements 74 | 75 | This video covers: 76 | * Exploring a new data set 77 | * How to create plots using ggplot() 78 | * Edit plot to create high quality visualisations 79 | 80 | [R Script including dataset loading (2022-09 folder in this repository)](2022-09) 81 | 82 | [![Watch the video](video_thumbnails/plot_improvements.png)](https://media.ed.ac.uk/media/HealthyR%20demo%3A%20Plot%20Improvements/1_obw1zgc0) 83 | 84 | Presented by Ewen Harrison and Sarah Elliot, Recorded in September 2022. 85 | 86 | ## Stringr 87 | 88 | This video covers: 89 | * Using stringr 90 | * Free text manipulation 91 | * Regular Expressions 92 | 93 | [R Script including dataset loading (2022-10 folder in this repository)](2022-10) 94 | 95 | [![Watch the video](video_thumbnails/stringr.png)](https://media.ed.ac.uk/media/HealthyR%20demo%3A%20stringR/1_2ksr312b) 96 | 97 | Presented by Ewen Harrison and Sarah Elliot, Recorded in October 2022. 98 | 99 | ## Joining datasets 100 | 101 | This video covers: 102 | * *_join() from the tidyverse 103 | * Mutating joins, such as left, right, inner, full 104 | * anti_join() and semi_join() 105 | 106 | [R Script including dataset loading (2022-11 folder in this repository)](2022-11) 107 | 108 | [![Watch the video](video_thumbnails/join_demo.png)](https://media.ed.ac.uk/media/HealthyR%20demo%3A%20joining%20datasets/1_wd239kz6) 109 | 110 | Presented by Ewen Harrison and Sarah Elliot, Recorded in November 2022. 
111 | 112 | ## Table One - finalfit 113 | 114 | This video covers: 115 | * Explore a new dataset 116 | * Create a summary table with variables of interest to investigate confounding 117 | * Edit this output to create a publishable table 118 | 119 | [R Script including dataset loading (2023-03 folder in this repository)](2023-03) 120 | 121 | [![Watch the video](video_thumbnails/tableone_finalfit.png)](https://media.ed.ac.uk/media/HealthyR%20demo%3A%20Create%20table%20one/1_ivx5th4k) 122 | 123 | Presented by Ewen Harrison and Sarah Elliot, Recorded in March 2023. 124 | 125 | ## Cleaning Data 126 | 127 | This video covers: 128 | * Formatting variables in R 129 | * Working with dates 130 | * Out of range values - summarising, visuals, plotly 131 | * Filter or remove variables 132 | 133 | [R Script including dataset loading (2023-08 folder in this repository)](2023-08) 134 | 135 | [![Watch the video](video_thumbnails/data_clean.png)](https://media.ed.ac.uk/media/HealthyR%20demo%3A%20cleaning%20data%201/1_oypymgwo) 136 | 137 | Presented by Ewen Harrison, Riinu Pius and Sarah Elliot, Recorded in August 2023. 138 | 139 | ## Automatic Parameterised Reports 140 | 141 | This video covers: 142 | * Producing multiple PDFs automatically 143 | * Utilising 'Knit with parameters' within the Posit interface 144 | * Purrr for iteration - using map() and pwalk() 145 | 146 | [R Script including dataset loading (2023-08 folder in this repository)](2023-08) 147 | 148 | [![Watch the video](video_thumbnails/report_parameters.png)](https://media.ed.ac.uk/media/HealthyR%3A%20produce%20multiple%20reports/1_nx07syit) 149 | 150 | Presented by Riinu Pius and Sarah Elliot, Recorded in August 2023. 
151 | 152 | 153 | ## Data Cleaning 2 154 | 155 | This video covers: 156 | * Cleaning column names 157 | * Consistency of NAs 158 | * Duplicate entries 159 | 160 | [R Script including dataset loading (2023-10 folder in this repository)](2023-10) 161 | 162 | [![Watch the video](video_thumbnails/data_clean2.jpg)](https://media.ed.ac.uk/media/HealthyR%20Demo%3A%20Data%20Cleaning%202/1_yclggyiq) 163 | 164 | Presented by Riinu Pius and Sarah Elliot, Recorded in October 2023. 165 | 166 | ## Reading Multiple Spreadsheets 167 | 168 | This video covers: 169 | * Reading in CSV and Excel files into R 170 | * Working with multiple sheets in one file 171 | * The difference between binding rows and joining dataframes 172 | 173 | [R Script including dataset loading (2023-10 folder in this repository)](2023-10) 174 | 175 | [![Watch the video](video_thumbnails/spreadsheets.jpg)](https://media.ed.ac.uk/media/HealthyR%20Demo%3A%20Reading%20multiple%20spreadsheets/1_qtbbw95l) 176 | 177 | Presented by Riinu Pius and Sarah Elliot, Recorded in October 2023. 178 | 179 | ## Quarto 1 180 | 181 | This video covers: 182 | * What is Quarto? 183 | * Quarto Projects and Documents 184 | * Executable Options 185 | * Conditional Content 186 | 187 | [R Script including dataset loading (2023-11 folder in this repository)](2023-11) 188 | 189 | [![Watch the video](video_thumbnails/quarto_1.jpg)](https://media.ed.ac.uk/media/HealthyR%20Demo%3A%20Quarto%20Part%201%20/1_ii6htaj3) 190 | 191 | Presented by Riinu Pius and Sarah Elliot, Recorded in November 2023. 192 | 193 | 194 | ## Quarto 2 195 | 196 | This video covers: 197 | * Creating a website 198 | * Publishing a website 199 | * Quarto tips 200 | 201 | [R Script including dataset loading (2023-11 folder in this repository)](2023-11) 202 | 203 | [![Watch the video](video_thumbnails/quarto_2.jpg)](https://media.ed.ac.uk/media/HealthyR%20Demo%3A%20Quarto%20Part%202/1_g0ox06lh) 204 | 205 | Presented by Riinu Pius and Sarah Elliot, Recorded in November 2023. 
206 | 207 | ## Venn Diagrams and Upset Plot 208 | 209 | This video covers: 210 | * Venn diagrams 211 | * Upset plots 212 | * Editing diagrams 213 | 214 | [R Script including dataset loading (2024-02 folder in this repository)](2024-02) 215 | 216 | [![Watch the video](video_thumbnails/venn.jpg)](https://media.ed.ac.uk/media/HealthyR%20Demo%3A%20Venn%20Diagrams%20and%20Upset%20Plots/1_iw9sulkc) 217 | 218 | Presented by Neil Clark and Sarah Elliot, Recorded in February 2024. 219 | 220 | 221 | ## Consort/Flow diagrams 222 | 223 | This video covers: 224 | * Data cleaning 225 | * Consort diagrams with library(consort) 226 | * PDFs and word documents 227 | 228 | 229 | [R Script including dataset loading (2024-02 folder in this repository)](2024-02) 230 | 231 | [![Watch the video](video_thumbnails/consort.PNG)](https://media.ed.ac.uk/media/HealthyR%20Demo%3A%20Venn%20Diagrams%20and%20Upset%20Plots/1_iw9sulkc) 232 | 233 | Presented by Riinu Pius and Sarah Elliot, Recorded in February 2024. 234 | 235 | ## Forcats 236 | 237 | This video covers: 238 | * Forcats - a tidyverse package 239 | * Factor manipulation 240 | * Finalfit 241 | 242 | [R Script including dataset loading (2024-03 folder in this repository)](2024-03) 243 | 244 | [![Watch the video](video_thumbnails/forcats.png)](https://media.ed.ac.uk/media/HealthyR%20Demo%3A%20Forcats/1_kuu9moku) 245 | 246 | Presented by Riinu Pius and Sarah Elliot, Recorded in February 2024. 247 | 248 | ## What is Shiny? 249 | 250 | This video covers: 251 | * How to make an interactive web app 252 | * UI and server 253 | * Advanced Examples 254 | 255 | [R Script including dataset loading (2024-03 folder in this repository)](2024-03) 256 | 257 | [![Watch the video](video_thumbnails/shiny.PNG)](https://media.ed.ac.uk/media/HealthyR%20Demo%3A%20What%20is%20shiny/1_v1pw3kk7) 258 | 259 | Presented by Ewen Harrison and Sarah Elliot, Recorded in March 2024. 
260 | 261 | ## Efficient Plotting 262 | 263 | This video covers: 264 | * Custom plot themes 265 | * How to create a function using your theme 266 | * ggplot 267 | 268 | [R Script including dataset loading (2024-03 folder in this repository)](2024-03) 269 | 270 | [![Watch the video](video_thumbnails/plot_function.PNG)](https://media.ed.ac.uk/media/HealthyR%20Demo%3A%20Efficient%20Plotting%20Functions/1_00x2r9j2) 271 | 272 | Presented by Riinu Pius and Stella Rhode, Recorded in March 2024. 273 | 274 | ## GT and dashboards 275 | 276 | This video covers: 277 | * library(gt) 278 | * emojis 279 | * interactive quarto dashboards 280 | 281 | [R Script including dataset loading (2024-04 folder in this repository)](2024-04) 282 | 283 | [![Watch the video](video_thumbnails/GT.jpg)](https://media.ed.ac.uk/playlist/dedicated/1_ccvd72lg/1_qebu9s7b) 284 | 285 | Presented by Riinu Pius and Stella Rhode, Recorded in April 2024. 286 | 287 | ## Directed Acyclic Graphs (DAGs) 288 | 289 | This video covers: 290 | * Dagitty 291 | * ggdag 292 | * confounders, mediators, exposures, outcomes etc. 293 | 294 | [R Script including dataset loading (2024-04 folder in this repository)](2024-04) 295 | 296 | [![Watch the video](video_thumbnails/DAG.jpg)](https://media.ed.ac.uk/playlist/dedicated/1_ccvd72lg/1_nkwfec1p) 297 | 298 | Presented by Sarah Elliot and Neil Clark, Recorded in April 2024. 299 | 300 | ## Fuzzy Matching 301 | 302 | This video covers: 303 | * regular Expressions 304 | * Joining data sets 305 | * Joining datasets on imperfect matches 306 | 307 | [R Script including dataset loading (2024-05 folder in this repository)](2024-05) 308 | 309 | [![Watch the video](video_thumbnails/fuzzy.jpg)](https://media.ed.ac.uk/playlist/dedicated/1_ccvd72lg/1_udk39v05) 310 | 311 | Presented by Sarah Elliot and Ewen Harrison, Recorded in May 2024. 
312 | 313 | ## Tidymodels 314 | This video covers: 315 | * Tidymodels framework 316 | * Workflows 317 | * Logistic regression and Decision Trees 318 | 319 | [R Script including dataset loading (2024-08 folder in this repository)](2024-08) 320 | 321 | [![Watch the video](video_thumbnails/tidymodels1.PNG)](https://media.ed.ac.uk/media/HealthyR%20demo%3A%20Tidymodels%20Part%201/1_i8tebh8l) 322 | 323 | Presented by Sarah Elliot and Ewen Harrison, Recorded in August 2024. 324 | 325 | ## Tidytext 326 | This video covers: 327 | * Free text analysis 328 | * Taylor swift lyrics 329 | 330 | [R Script including dataset loading (2024-09 folder in this repository)](2024-09) 331 | 332 | [![Watch the video](video_thumbnails/tidytext.PNG)](https://media.ed.ac.uk/media/HealthyR%20demo%3A%20tidytext/1_8nrmkdlg) 333 | 334 | Presented by Sarah Elliot and Ewen Harrison, Recorded in September 2024. 335 | 336 | ## Plotting edits 337 | This video covers: 338 | * ggplot() 339 | * Aesthetic changes to visualisations 340 | * Statistical results in plots 341 | 342 | [R Script including dataset loading (2024-09 folder in this repository)](2024-09) 343 | 344 | [![Watch the video](video_thumbnails/plotting_stat.PNG)](https://media.ed.ac.uk/media/HealthyR%20Demo%3A%20Plotting%20edits%20/1_hd8osn0d) 345 | 346 | Presented by Sarah Elliot and Neil Clark, Recorded in October 2024. 347 | 348 | ## Tidymodels Part 2: Resampling 349 | This video covers: 350 | * Tidymodels 351 | * Resamples/bootstraps 352 | * Model metrics 353 | 354 | [R Script including dataset loading (2024-11 folder in this repository)](2024-11) 355 | 356 | [![Watch the video](video_thumbnails/tidymodels2.PNG)](https://media.ed.ac.uk/media/HealthtR%20Demo%3A%20Tidymodels%20Part%202/1_eppxvpop) 357 | 358 | Presented by Sarah Elliot and Neil Clark, Recorded in November 2024. 
359 | -------------------------------------------------------------------------------- /healthyr_demos.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /video_thumbnails/DAG.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/DAG.jpg -------------------------------------------------------------------------------- /video_thumbnails/GT.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/GT.jpg -------------------------------------------------------------------------------- /video_thumbnails/consort.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/consort.PNG -------------------------------------------------------------------------------- /video_thumbnails/data_clean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/data_clean.png -------------------------------------------------------------------------------- /video_thumbnails/data_clean2.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/data_clean2.jpg -------------------------------------------------------------------------------- /video_thumbnails/finalfit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/finalfit.png -------------------------------------------------------------------------------- /video_thumbnails/forcats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/forcats.png -------------------------------------------------------------------------------- /video_thumbnails/fuzzy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/fuzzy.jpg -------------------------------------------------------------------------------- /video_thumbnails/geom_bar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/geom_bar.png -------------------------------------------------------------------------------- /video_thumbnails/globalmap_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/globalmap_demo.png -------------------------------------------------------------------------------- /video_thumbnails/join_demo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/join_demo.png -------------------------------------------------------------------------------- /video_thumbnails/plot_function.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/plot_function.PNG -------------------------------------------------------------------------------- /video_thumbnails/plot_improvements.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/plot_improvements.png -------------------------------------------------------------------------------- /video_thumbnails/plotting_stat.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/plotting_stat.PNG -------------------------------------------------------------------------------- /video_thumbnails/quarto_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/quarto_1.PNG -------------------------------------------------------------------------------- /video_thumbnails/quarto_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/quarto_1.jpg 
-------------------------------------------------------------------------------- /video_thumbnails/quarto_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/quarto_2.PNG -------------------------------------------------------------------------------- /video_thumbnails/quarto_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/quarto_2.jpg -------------------------------------------------------------------------------- /video_thumbnails/report_parameters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/report_parameters.png -------------------------------------------------------------------------------- /video_thumbnails/reprex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/reprex.png -------------------------------------------------------------------------------- /video_thumbnails/reshape_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/reshape_data.png -------------------------------------------------------------------------------- /video_thumbnails/shiny.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/shiny.PNG -------------------------------------------------------------------------------- /video_thumbnails/spreadsheets.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/spreadsheets.PNG -------------------------------------------------------------------------------- /video_thumbnails/spreadsheets.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/spreadsheets.jpg -------------------------------------------------------------------------------- /video_thumbnails/stringr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/stringr.png -------------------------------------------------------------------------------- /video_thumbnails/tableone_finalfit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/tableone_finalfit.png -------------------------------------------------------------------------------- /video_thumbnails/tidymodels1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/tidymodels1.PNG -------------------------------------------------------------------------------- /video_thumbnails/tidymodels2.PNG: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/tidymodels2.PNG -------------------------------------------------------------------------------- /video_thumbnails/tidytext.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/tidytext.PNG -------------------------------------------------------------------------------- /video_thumbnails/venn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SurgicalInformatics/healthyr_demos/7ba3f559703d5e647042a7a1e16980526f46d17f/video_thumbnails/venn.jpg --------------------------------------------------------------------------------