├── .gitignore ├── Arsenal August 2020 ├── RUG_making_tables_r.pptx ├── Readme.md ├── r_make_tables.Rmd ├── r_make_tables.docx ├── r_make_tables_rendered.docx └── reference.docx ├── August 2020 Puzzle ├── Example datasets.xlsx ├── Martina Cusinato │ ├── AUG 2020_PUZZLE1_jocelyn.R │ ├── AUG 2020_PUZZLE_2_Ben.R │ └── vbls_all_new2.csv └── ReadMe.md ├── Dates.R ├── December 2020 Puzzle └── Interactions.R ├── February 2021 └── how_to_learn_R.md ├── January 2021 └── 20210122-RUG-ConstrainedOpt.R ├── July 2020 Puzzle ├── July2020PuzzleSolutions.Rmd ├── July2020PuzzleSolutions.html ├── JulyPuzzle.zip └── results.png ├── Plots March 2020 ├── Amy │ └── GGplot example AM.R ├── Calum │ ├── cascade_plot.R │ └── data.csv ├── Lauren │ ├── Days_hours.gif │ ├── hours and day.R │ └── session start heatmap.R ├── Martina │ ├── PLOT html_doc.Rmd │ ├── PLOT html_doc.html │ ├── PLOT notebook.Rmd │ └── PLOT notebook.nb.html └── README.md └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | _publish.R 5 | _book 6 | _bookdown_files 7 | rsconnect 8 | .DS_Store 9 | -------------------------------------------------------------------------------- /Arsenal August 2020/RUG_making_tables_r.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/Arsenal August 2020/RUG_making_tables_r.pptx -------------------------------------------------------------------------------- /Arsenal August 2020/Readme.md: -------------------------------------------------------------------------------- 1 | Sham Lal presented on the use of the `arsenal` package 2 | 3 | His excellent presentation can be found here: https://drive.google.com/file/d/1Ra8Rz3hJttoAGrU46jpGIsyaS9C7RO4n/view?usp=sharing 4 | -------------------------------------------------------------------------------- /Arsenal August 
2020/r_make_tables.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R tables easy to make?" 3 | subtitle: "RUG LSHTM" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | word_document: 7 | reference_docx: reference.docx 8 | toc: Yes 9 | --- 10 | 11 | 12 | ```{r setup, include = F, echo = F} 13 | 14 | # Chunk options ---- 15 | 16 | # include = FALSE prevents code and results from appearing in the finished file. R Markdown still runs the code in the chunk, and the results can be used by other chunks. 17 | # echo = FALSE prevents code, but not the results from appearing in the finished file. This is a useful way to embed figures. 18 | # message = FALSE prevents messages that are generated by code from appearing in the finished output 19 | # warning = FALSE prevents warnings that are generated by code from appearing in the finished output. 20 | # fig.cap = "..." adds a caption to graphical results. 21 | # eval = FALSE does not run the code chunk 22 | 23 | # global options ---- 24 | # > code chunk options ---- 25 | 26 | # knitr::opts_chunk$set( 27 | # comment = '', include = F, message = F, fig.width = 6, fig.height = 6) 28 | 29 | knitr::opts_chunk$set('label') 30 | 31 | 32 | # Packages ---- 33 | 34 | # install.packages(c("tidyverse", "kableExtra", "arsenal", "finalfit", "tidyr", "broom")) 35 | # optional extra package emojis devtools::install_github("hadley/emo") 36 | 37 | 38 | 39 | ``` 40 | 41 | ```{r libraries, include = F, echo = F} 42 | 43 | library(tidyverse) 44 | library(kableExtra) 45 | library(arsenal) 46 | library(finalfit) 47 | 48 | library(tidyr) # for neater code output 49 | library(broom) # for tidying other models 50 | 51 | ``` 52 | 53 | ```{r data_prep, include = F, message = F, warning = F, eval = T} 54 | 55 | # Ensure data is cleaned as much as possible and organised correctly (i.e. variables stored as needed e.g. 
factors, integers, double, characters) 56 | 57 | glimpse(colon_s) 58 | 59 | dim(colon_s) 60 | 61 | head(colon_s, 10) 62 | 63 | colon_s <- colon_s 64 | 65 | 66 | ``` 67 | 68 | # Table 1a: simple one-way table with default *tableby* options `r emo::ji("poop")` 69 | 70 | 71 | ```{r table1a_i, include = T, message = F, warning = F, eval = T} 72 | 73 | table1a_i <- arsenal::tableby(~ age + age.factor + sex.factor + obstruct.factor + nodes, 74 | data = colon_s) 75 | 76 | summary(table1a_i, text = T) 77 | 78 | ``` 79 | 80 | ## Table 1a(ii): formatted `r emo::ji("smile")` 81 | 82 | ```{r table1a_ii, include = T, message = F, warning = F, results = 'asis', eval = T} 83 | 84 | summary(table1a_i, text = T) %>% 85 | kable(format = "pandoc", 86 | caption = "Formatted table", 87 | padding = 0, 88 | label = "", 89 | align = c("l" , "c"), 90 | col_order = order 91 | ) 92 | 93 | ``` 94 | 95 | ## Table 1b: add/modify human readable variable labels `r emo::ji("+1")``r emo::ji("+1")` 96 | 97 | ```{r table1b_code, include = T, message = F, warning = F, results = 'asis', echo = F, eval = T} 98 | 99 | # See all variables labels and factor level names 100 | 101 | labels(colon_s) %>% 102 | unlist() 103 | 104 | # Change variable names and factor level names 105 | 106 | labels(colon_s) <- c(age = 'Age, yrs', sex = "Gender") 107 | 108 | attr(colon_s$sex.factor, 'label') <- 'Gender' 109 | 110 | attr(colon_s$age.factor, 'label') <- 'Age group' 111 | 112 | colon_s$age.factor <- forcats::fct_relevel(colon_s$age.factor, rev) 113 | 114 | ``` 115 | 116 | ```{r table1b_labels, include = T, message = F, warning = F, results = 'asis', eval = T} 117 | 118 | table1b <- tableby(~ age + age.factor + sex.factor + obstruct.factor + nodes, 119 | data = colon_s) 120 | 121 | summary(table1b, text = T) %>% 122 | kable(format = "pandoc", 123 | caption = "Formatted table", 124 | padding = 0, 125 | label = "", 126 | align = c("l" , "c"), 127 | col_order = order 128 | ) 129 | 130 | 131 | ``` 132 | 133 | ## Table 1c: 
change default statistics and formatting `r emo::ji("+1")``r emo::ji("+1")``r emo::ji("+1")` 134 | 135 | ```{r table1c_options, include = T, message = F, warning = F, results = 'asis', eval = T} 136 | 137 | tableby(~ age + age.factor + sex.factor + obstruct.factor + nodes, 138 | data = colon_s, 139 | digits = 1, 140 | digits.pct = 1, 141 | control = tableby.control(cat.stats = c("countpct", "N", "Nmiss2"), 142 | total = T, test = F, 143 | numeric.stats = c("meansd", "medianq1q3", "range", "N", "Nmiss2"), 144 | stats.labels = list(Nmiss2 = "Number missing") 145 | ) 146 | ) %>% 147 | summary(text = T) %>% 148 | kable(format = "pandoc", 149 | caption = "Formatted table", 150 | padding = 0, 151 | label = "", 152 | align = c("l" , "c"), 153 | col_order = order 154 | ) 155 | 156 | 157 | ``` 158 | 159 | ## Table 1d: more than one group `r emo::ji("flex")``r emo::ji("fist")`` 160 | 161 | ```{r table1d_options, include = T, message = F, warning = F, results = 'asis', eval = T} 162 | 163 | tableby(mort_5yr ~ age + age.factor + sex.factor + obstruct.factor + nodes + perfor.factor, 164 | data = colon_s, 165 | digits = 1, 166 | digits.pct = 1, 167 | control = tableby.control(cat.stats = c("countrowpct", "N", "Nmiss2"), # now with row percentages 168 | total = T, test = T, 169 | numeric.stats = c("meansd", "medianq1q3", "range", "N", "Nmiss2"), 170 | stats.labels = list(Nmiss2 = "Number missing") 171 | ) 172 | ) %>% 173 | summary(text = T) %>% 174 | kable(format = "pandoc", 175 | caption = "Formatted table", 176 | padding = 0, 177 | label = "", 178 | align = c("l" , "c"), 179 | col_order = order 180 | ) 181 | 182 | 183 | ``` 184 | 185 | 186 | ## Table 1e: change default statistics, more than one group `r emo::ji("+1")``r emo::ji("+1")` 187 | 188 | ```{r table1e, include = T, message = F, warning = F, results = 'asis', eval = T} 189 | 190 | tableby(mort_5yr ~ age + age.factor + sex.factor + obstruct.factor + nodes + perfor.factor, 191 | data = colon_s, 192 | digits = 1, 193 | 
digits.pct = 1, 194 | control = tableby.control(cat.stats = c("countrowpct", "N", "Nmiss2"), # now with row percentages 195 | numeric.stats = c("meansd", "medianq1q3", "range", "N", "Nmiss2"), 196 | stats.labels = list(Nmiss2 = "Number miss"), 197 | total = T, test = F 198 | ) 199 | ) %>% 200 | summary(text = T) %>% 201 | kable(format = "pandoc", 202 | caption = "Formatted table", 203 | padding = 0, 204 | label = "", 205 | align = c("l" , "c"), 206 | col_order = order 207 | ) 208 | 209 | 210 | ``` 211 | 212 | ## Table 1f: Set global table options and create dataframe `r emo::ji("+1")` 213 | 214 | ```{r table_by global options, highlight = T, strip.white = T} 215 | 216 | # Set global table options 217 | 218 | mycontrols <- tableby.control(test=FALSE, total=FALSE, 219 | numeric.test="kwt", cat.test="chisq", 220 | numeric.stats=c("meansd", "medianq1q3", "range", "N", "Nmiss2"), 221 | cat.stats=c("countpct", "N", "Nmiss2"), 222 | stats.labels=list(N = 'Count', 223 | median = 'Median', 224 | q1q3 = 'Q1,Q3'), 225 | digits = 1, 226 | digits.pct = 1 227 | ) 228 | 229 | # Create a table as data frame 230 | 231 | table_1f <- tableby(mort_5yr ~ age + age.factor + sex.factor, 232 | data = colon_s, 233 | control = mycontrols) %>% 234 | summary(text = NULL) # set to null to remove any text formatting 235 | 236 | print(as.data.frame(table_1f)) 237 | 238 | ``` 239 | 240 | *Other possibilities* 241 | + Subsets of groups (e.g. 
pregnant women) 242 | * Weighted estimates 243 | * Apply custom statistical tests for certain variables 244 | * Additional groupings by strata 245 | * Full list here: 246 | 247 | # Table 2: Odds ratio table 248 | 249 | ```{r table 2, include = F,highlight = T, strip.white = T} 250 | 251 | # FinalFirst package 252 | 253 | explanatory = c("age.factor", "sex.factor", "obstruct.factor", "perfor.factor") 254 | 255 | dependent = "mort_5yr" 256 | 257 | model_output1 <- colon_s %>% 258 | finalfit(dependent, explanatory, metrics = F) 259 | 260 | ``` 261 | 262 | ```{r table_2out, include = T,highlight = T, strip.white = T} 263 | 264 | model_output1 %>% 265 | kable(format = "pandoc", 266 | caption = "", 267 | padding = 0, 268 | label = "", 269 | align = c("l" , "c"), 270 | col_order = order, 271 | row.names = NA, 272 | col.names = NA 273 | ) 274 | ``` 275 | 276 | *Lots of features and customisations* 277 | * Full list here: 278 | -------------------------------------------------------------------------------- /Arsenal August 2020/r_make_tables.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/Arsenal August 2020/r_make_tables.docx -------------------------------------------------------------------------------- /Arsenal August 2020/r_make_tables_rendered.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/Arsenal August 2020/r_make_tables_rendered.docx -------------------------------------------------------------------------------- /Arsenal August 2020/reference.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/Arsenal August 2020/reference.docx 
-------------------------------------------------------------------------------- /August 2020 Puzzle/Example datasets.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/August 2020 Puzzle/Example datasets.xlsx -------------------------------------------------------------------------------- /August 2020 Puzzle/Martina Cusinato/AUG 2020_PUZZLE1_jocelyn.R: -------------------------------------------------------------------------------- 1 | #********************************************************************************************************** 2 | # WORKFLOW AND NOTES: 3 | ## 1. Inspect and save the raw data in a raw_data_excel 4 | 5 | ## 2. Transform each of the excel tabs I will use in CSV and file them. 6 | ## I normally file these under a subdirectory called raw_data_csv, but for now will save them in the working directory (note file location for csv) 7 | ## I prefer changing to csv before importing because it is pretty easy and ensures consistency in the way characters are entered. 8 | ## I do this by clicking 'save as' and then changing the format to CSV. Each tab or each file needs to be saved separately. 9 | 10 | ## 3. I don't know how this data will be used... so I show two options 11 | ### a. Import and 'clean' for one file, and this can be repeated for the others. 12 | ### b. Import and clean (as above and merge all at the end) - I do this with a function. 13 | 14 | ## Note: Although I use a function for these kinds of tasks, I normally first load and clean each dataset separately, 15 | ## or at least a large proportion of the files, to see where the problems could be. 16 | 17 | #********************************************************************************************************** 18 | # MAIN CONCLUSIONS AFTER INSPECTION OF RAW DATA 19 | ## Files are all cross-tabulations.
20 | ## for each file, the number of variables/categories should be equal to the number of rows, plus one row for 'Totals'. (so no column for totals) 21 | ## No missing data. Each cell with data has a numerical value. 22 | ## Plenty of empty rows and columns 23 | ## The number of variables and/or catgories si not consitent accorss all the files/loctions. 24 | 25 | #********************************************************************************************************** 26 | # LIBRARIES 27 | library(tidyverse) 28 | # Will use janitor once and will call it then. 29 | 30 | #********************************************************************************************************** 31 | # LOAD A SINGLE FILE #### 32 | #********************************************************************************************************** 33 | 34 | # Load data 35 | (data <- read_csv("location1.csv", 36 | # skip the first rows and keep the column names 37 | skip = 3, col_names = TRUE, 38 | # make all the variables character 39 | # I like to import everythign char unless I know there will be no typos 40 | col_types = cols(.default = "c")) 41 | ) 42 | 43 | # STEP 1 44 | # Fix the vbl names and remove the column for propotions as those can be calculated later. 45 | # I prefer to use the given names and change them at the end after data merging if that is needed. 46 | (data1 <- data %>% 47 | # This removes the proportion columns. 48 | select(-starts_with("X")) %>% 49 | # This is a quick fix for the names starting with numbers 50 | # I chose to remove the first col (X1) because I know after first inspection that the variable names should equate the row 'names' 51 | # I use janitor here because is less labour intensive. 52 | janitor::clean_names() %>% 53 | # shortens the names a bit while keeping enough characters so there are no 2 cols with the same name (e.g. 
missing) 54 | set_names(., ~str_sub(.x, 1,7)) 55 | ) 56 | 57 | # STEP 2 58 | # Now I Fix the rows that were imported with NA as those were empty. 59 | # Here need to be careful. could use 'complete.cases' (or janitor:: remove_empty) but in some cases the NA is displaced (cells merged) and replaced by '0'. 60 | # this means that I can't just remove rows that are not-complete. 61 | (data2 <- data1 %>% 62 | mutate(na.per.row = rowSums(is.na(data1))) %>% 63 | # I could filter only those with one NA, but want to be conservative in case other daset have more problems here. 64 | # so, I keep onl those where the total na is half of the numbere of variables. 65 | filter(na.per.row <= ncol(data1)/2) %>% 66 | select(-na.per.row) 67 | ) 68 | 69 | # STEP 3 70 | # Add the column with the variables/categories. 71 | (data3 <- bind_cols(row.name = c("Count", names(data2), "total"), data2) 72 | ) 73 | 74 | # ADDITIONAL 75 | # Here you can remove the 'count' and 'total' rows. and also change the NA for transgender from NA to '0'. 76 | # Here you can also change the names 77 | # If you do this for all 3 files and ensure names are consistent (even if you don't have the same columns for all files), 78 | # you can use dplyr::bind_rows() to merge them all (assuming you want them stacked / long format). 79 | # When using bind_row or bind_cols for this, I like to use the option '.id' so I get a variable indicating the dataset the data comes from. 80 | 81 | 82 | #********************************************************************************************************** 83 | # LOAD 3 FILES #### 84 | #********************************************************************************************************** 85 | 86 | # Using the same process I generate a function. 87 | # Note I did the above for al the individual files bu show only one. 88 | rm(list = ls()) 89 | 90 | # Store the name of the csv files to load. (change the path if you put them in a folder. 
For now they are in my wd)
files_list <- c("location1.csv",
                "location2.csv",
                "location3.csv")

# Read one cross-tabulation CSV and return it as a rectangular character tibble.
#
# Steps: skip the 3 junk header rows, drop the unnamed proportion columns
# (imported as "X..."), clean the column names, drop rows that are mostly
# empty, and prepend a 'row.name' column so the row labels survive the import.
#
# @param file Path to a CSV exported from one of the Excel tabs.
# @return A tibble of character columns with a leading 'row.name' column
#   whose first and last entries are "count" and "total".
load_data <- function(file) {
  raw <- read_csv(file,
                  skip = 3, col_names = TRUE,
                  # import everything as character to avoid bad type guesses
                  col_types = cols(.default = "c"))

  named <- raw %>%
    # drop the proportion columns; proportions can be recomputed later
    select(-starts_with("X")) %>%
    janitor::clean_names()

  # Drop rows that are mostly NA. Merged cells can displace a single NA
  # (sometimes replaced by '0'), so we cannot simply require complete cases;
  # instead keep rows with at most half the columns missing.
  kept <- named %>%
    mutate(na.per.row = rowSums(is.na(named))) %>%
    filter(na.per.row <= ncol(named) / 2) %>%
    select(-na.per.row)

  # Label the rows: the variable names equate to the row 'names', with a
  # leading "count" row and a trailing "total" row. Returned explicitly
  # (the original relied on the invisible value of an assignment).
  bind_cols(row.name = c("count", names(kept), "total"), kept)
}

# This merges everything into one dataset; the '.id' argument records which
# file (location) each row came from.
data_all <- map_df(files_list, load_data, .id = "location")
# Alternatively merge as a list and then inspect elements with 'pluck'.
data_all_l <- map(files_list, load_data)
# To see location 3, for example:
data_all_l %>% pluck(3)


#**********************************************************************************************************
# HOUSEKEEPING? ####
#**********************************************************************************************************

# At this point I rename the variables by column and row.
# By inspection I know where the two variables named 'missing' and
# 'missing_1' are; I just need to be careful there.
# I'll work from the rectangular data created in the previous step.

# Note: the following steps can be reduced, but I prefer to 'see' them
# step by step at first.

# STEP 1
# First get rid of the 'count' and 'total' rows.
data_all1 <- data_all %>%
  filter(!row.name %in% c("count", "total"))

# STEP 2 ROW NAMES
# I normally want to establish my variable names in a codebook, but I also find this an easier way...
138 | # I extract the names of all the variables form the variable 'row.name' because this will have all the variables even if it is repeated. 139 | # I want the unique values because I will use join and need unique 'keys' (key will be vbl name in this case) 140 | row.vbls <- as.data.frame(unique(data_all1$row.name)) 141 | # Save this in an CSV 142 | write_csv(row.vbls, path = "vbls_all.csv") 143 | # Open the csv generated before put the new name next to each old name, then save as csv [not shown here] 144 | # Load and print so 145 | row.vbls <- read_csv("vbls_all_new2.csv") 146 | row.vbls 147 | # Now that I have the old and the nw name I will merge this with my data 148 | # I do a right_join instead of left because I want the first two rows to be printed on the left of the screen 149 | (data_all2 <- right_join(row.vbls, data_all1, by = c("old.name" = "row.name"))) 150 | # I use 'view' and check that the match is ok 151 | # I also check I still have 32 rows (no duplication occurred) 152 | 153 | # STEP 2 COL NAMES 154 | # Now that I checked that the re-name is OK I delet the col for old name. 155 | data_all3 <- data_all2 %>% select(-old.name) 156 | # Extract the variable names and matche them with the new names 157 | # Check that the number and the name is correct. See it preserved the order. 
158 | (col.vbls <- as.data.frame(names(data_all3)) %>% 159 | rename(old.name = "names(data_all3)") %>% 160 | left_join(row.vbls) 161 | ) 162 | # Pull the names out as vector 163 | col.vbls <- col.vbls %>% 164 | filter(!is.na(new.name)) %>% 165 | pull(new.name) 166 | # Re-names 167 | names(data_all3) <- c("new.name", "location", col.vbls) 168 | (data_all4 <- data_all3 %>% 169 | select(location, new.name, 170 | starts_with("age_"), 171 | starts_with("sex"), 172 | starts_with("ms"), 173 | starts_with("wk")) 174 | ) 175 | # Pending 1: fix the NA value for non-binary gender 176 | # Pending 2: format the variable as numerical (not char) 177 | 178 | #***************************************************** 179 | 180 | 181 | -------------------------------------------------------------------------------- /August 2020 Puzzle/Martina Cusinato/AUG 2020_PUZZLE_2_Ben.R: -------------------------------------------------------------------------------- 1 | # I first transform each of the excel files/tabs I will use in CSV and file them. 2 | # I normally file these under subdirectory calles raw_data_csv, but for now will save them in the working directory (note file location for csv) 3 | # I prefer changing to csv before importing because it is pretty easy and ensures consistency in the way characters are entered. 4 | # I do this by clincking 'save as' and then chaning the format as CSV. Each tab or each file needs to be saved separately. 5 | 6 | # Load libraries 7 | library(tidyverse) 8 | library(janitor) 9 | 10 | # Load the csv 11 | data1 <- read_csv("referencetablesworkbook2.csv", 12 | # skip the first rows and keep the column names 13 | skip = 4, col_names = TRUE, 14 | # make all the variables character or remove the code below for R to 'guess' though risky - in my view 15 | col_types = cols(.default = "c") 16 | # the code below selects the columns you wanted. 
17 | )[ ,c(1:5,31:35)] %>% 18 | janitor::clean_names() %>% 19 | rename(cause = "x1", 20 | sex = "x2", 21 | geo.type = "x3", 22 | area.code = "x4", 23 | area.name = "x5") %>% 24 | select(-x33) 25 | 26 | head(data1) 27 | # Then use filter or slice to get different subsets of info. 28 | 29 | -------------------------------------------------------------------------------- /August 2020 Puzzle/Martina Cusinato/vbls_all_new2.csv: -------------------------------------------------------------------------------- 1 | old.name,new.name 2 | x15_19,age_15_19 3 | x20_29,age_20_29 4 | x30_39,age_30_39 5 | x40_49,age_40_49 6 | x50_59,age_50_59 7 | x60_69,age_60_69 8 | female,sex_f 9 | male,sex_m 10 | transgender_non_binary,sex_nb 11 | married_or_living_together,ms_marr 12 | never_married,ms_never 13 | other,ms_other 14 | missing,age_miss 15 | missing_1,ms_miss 16 | employed,wk_emp 17 | student_retired_unemployed,wk_unemp -------------------------------------------------------------------------------- /August 2020 Puzzle/ReadMe.md: -------------------------------------------------------------------------------- 1 | # August 2020 puzzle 2 | 3 | ## Ben's problem: reading excel spreadsheets efficiently into R: relatively simple example. 4 | 5 | I find I have recently had to read data typically from multi-sheet files, often from ONS. Format is variously in .xls , .xlsx. or (single-sheet files) .csv. I usually want data only on a few columns and selected rows. The software I have found for .xls/.xlxs files (readxl) seems to require me to count the lines before the data proper starts or specify the entire range of the data, and tends to leave numerical columns as characters. OK to sort out myself once or twice, but I thought that better guessing by software might be possible. Ideally I thought it would also be good to be able to select columns and sub-groups of rows (by value of one of the columns) at the time of reading – but perhaps those are best selected as separate stages. 
6 | 7 | Example: COVID deaths from ONS by Local Authority (I also needed this by MSOA, but we will leave that), which are on Table 2 of: 8 | 9 | https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fbirthsdeathsandmarriages%2fdeaths%2fdatasets%2fdeathsinvolvingcovid19bylocalareaanddeprivation%2f1march2020to30june2020/referencetablesworkbook2.xlsx 10 | 11 | I only needed the first 5 character/factor columns and the last 4 numerical ones (and only for COVID-19, both sexes combined). One issue with this file, and a common one generally, is that column names in the excel file are effectively spread over two rows (super- and sub-heading; here Month and deaths/rate/etc). 12 | 13 | ## Jocelyn's problem: reading excel spreadsheets efficiently into R: complicated example 14 | 15 | See the Excel workbook above 16 | 17 | What makes it complicated is: 18 | 19 | 1. Sheets within a workbook 20 | 2. Excel formatting (merged cells) 21 | 3. Multiple column titles in each sheet 22 | 4. Column titles for each sheet can be distinct 23 | 5. Rows for each sheet can be distinct 24 | 6.
Non-specific column titles (Missing) 25 | -------------------------------------------------------------------------------- /Dates.R: -------------------------------------------------------------------------------- 1 | 2 | # Make some simple date to play with 3 | d <- data.frame(sdate = c("2019-12-20", "2018-12-20")) 4 | d$date <- as.Date(d$sdate, format = "%Y-%m-%d") 5 | head(d) 6 | class(d$date) 7 | class(d$sdate) 8 | 9 | # Obviously this won't work 10 | d$datebefore <- ifelse(d$date >= "2019-12-19", d$date, "2019-12-19") 11 | head(d) 12 | class(d$datebefore) 13 | class(as.Date("2019-12-19", format = "%Y-%m-%d")) 14 | 15 | # This doesn't work 16 | d$datebefore <- ifelse(d$date >= as.Date("2019-12-19"), 17 | d$date, 18 | as.Date("2019-12-19")) 19 | head(d) 20 | class(d$datebefore) 21 | 22 | # Using if_else in dplyr 23 | library(dplyr) 24 | d$datebefore <- if_else(d$date >= as.Date("2019-12-19"), 25 | d$date, 26 | as.Date("2019-12-19")) 27 | head(d) 28 | class(d$datebefore) 29 | 30 | -------------------------------------------------------------------------------- /December 2020 Puzzle/Interactions.R: -------------------------------------------------------------------------------- 1 | # RUGs Interaction Puzzle 2 | # 21 Jan 2021 3 | # Calum Davey 4 | 5 | library(multcomp) # (1) 6 | library(Epi) # (2) 7 | library(emmeans) # (3) 8 | 9 | # Generate simple dataset with interaction 10 | # ======================================== 11 | 12 | N <- 1000 13 | data2 <- data.frame( 14 | f = rbinom(n = N, size=1, prob=0.5), # A factor exposure (e.g. treatment) 15 | x = rnorm(n = N, 10, 2), # A continuous exposure (e.g. 
age) 16 | e = rnorm(n = N, 0, 5)) # Random error 17 | 18 | # Set the model parameters 19 | Alpha = 10 20 | Beta1 = .2 21 | Beta2 = 1.3 22 | Beta3 = 1.4 23 | 24 | # Generate the dependent variable 25 | data2$y = Alpha + Beta1*data2$f + Beta2*data2$x + Beta3*data2$f*data2$x + data2$e 26 | 27 | # Reclass f as factor 28 | data2$f <- as.factor(data2$f) 29 | 30 | # Fit the model 31 | # ============= 32 | 33 | model <- lm(y ~ f*x, data = data2) 34 | summary(model) 35 | 36 | # Return the interaction 37 | # ====================== 38 | 39 | # (1) Calum + Will + Saravanakumar 40 | confint(multcomp::glht(model, linfct="x + f1:x = 0")) 41 | 42 | # (2) Antonio 43 | model2 <- lm(y ~ x:f + f, data=data2) 44 | Epi::ci.lin(model2) 45 | 46 | # (3) Paul, https://stats.idre.ucla.edu/r/seminars/interactions-r/#s4a 47 | emmeans::emtrends(model, ~ f, var = "x") 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /February 2021/how_to_learn_R.md: -------------------------------------------------------------------------------- 1 | ## General resources 2 | 3 | (Julia) Just adding to chat some resources I'd really recommend for learning R, if you are within the RStudio + Tidyverse paradigm: 4 | 5 | - https://r4ds.had.co.nz/ 6 | - https://rstudio.com/resources/cheatsheets/ 7 | - https://www.statlearning.com/ (not specific to Tidyverse, excellent as a practical open textbook walkthrough on various ML methods) 8 | - https://otexts.com/fpp2/ 9 | 10 | (Julia) At some point I will probably compile the above into a running list on GitHub (like this one https://github.com/7j7j/lshtm-rd-resources), maybe we should set this up with RUsers group (https://github.com/lshtm-rug) and/or CHIL-R (LSHTM health economists') GitHub (https://github.com/LSHTM-CHIL) 11 | 12 | There's also this R for Stata Users book but I have not used it myself: https://www.springer.com/gp/book/9781441913173 13 | 14 | (Naomi) "writing R is like writing a paper, you wouldn't submit 
your first draft" [write the first draft in notes only, and then a rough version, and improve from there] 15 | 16 | French book https://link.springer.com/book/10.1007/978-2-8178-0535-1 17 | 18 | Interesting presentation from Jeroen Ooms on assessing the quality of open source projects in general and R packages in particular https://rstudio.com/resources/rstudioglobal-2021/monitoring-health-and-impact-of-open-source-projects/ 19 | 20 | For absolute beginners (not this group) SWIRL is useful https://swirlstats.com/ 21 | 22 | I also use this all the time - https://www.r-graph-gallery.com/ 23 | 24 | Good overview of packages in different domains - which I only just became aware of and wish I'd found earlier https://cran.r-project.org/web/views/ 25 | 26 | ## Data wrangling 27 | 28 | (Julia) What I mean by data wrangling is how you go from relatively unstructured raw data to your specific analytical spine (https://en.wikipedia.org/wiki/Data_wrangling) 29 | 30 | Other resources for this in R: 31 | https://www.experfy.com/training/courses/data-wrangling-in-r (I used some of my PhD training funds for this) 32 | 33 | Alternatives that are probably good too: 34 | - https://www.pluralsight.com/paths/data-wrangling-with-r 35 | - http://uc-r.github.io/data_wrangling 36 | - https://www.edx.org/course/data-science-wrangling 37 | - https://www.udemy.com/course/learn-data-wrangling-with-r/ 38 | 39 | ... and others exist 40 | 41 | 42 | ## Multiple imputation 43 | 44 | Rdocumentation.org in general is a great resource, e.g. 
for mice as Andrei is discussing: https://www.rdocumentation.org/packages/mice/versions/3.6.0/topics/mice 45 | 46 | (Example of a vignette with mice: https://amices.org/mice/ -- great package for missing data multiple imputation in general) 47 | 48 | ## Community 49 | 50 | Also really recommend peer learning resources like R Ladies London: https://www.meetup.com/rladies-london/ 51 | 52 | and similarly across R/Python/Julia etc: https://www.meetup.com/London-Data-Science-ODSC/ 53 | 54 | Per Andrei's point, nice example of StackExchange thread: https://stats.stackexchange.com/questions/25025/how-to-build-a-linear-mixed-effects-model-in-r 55 | 56 | Example NHS R conference: https://nhsrcommunity.com/events/nhs-r-virtual-conference-2020/ 57 | 58 | Also as a PhD student you can use the stats Helpdesk to get some input 59 | 60 | Blogs, e.g. https://www.r-bloggers.com/ 61 | 62 | Naomi and Sam's great software chatter initiative: https://github.com/cmmid/SoftwareChatter 63 | 64 | another great resource is the Recon project (partly run by Thubaut Jombart) which focuses on R and epidemics: https://www.reconlearn.org/ (lots of cases studies and resources here) 65 | 66 | ## Loops 67 | 68 | (Julia) Oh man, I'm still quite bad at loops but some resources with vignette code where purrr and dplyr in Tidyverse are specifically meant for this quite often (where 'across' and 'apply' are lifesavers): 69 | https://bookdown.org/ansellbr/WEHI_tidyR_course_book/automating-your-work.html 70 | 71 | https://datacarpentry.org/semester-biology/materials/for-loops-R/ 72 | 73 | https://dplyr.tidyverse.org/articles/programming.html 74 | 75 | Working with new packages: 76 | 77 | - Copy own function, add a break to see what it is doing 78 | - Use fake data 79 | 80 | - apply https://www.r-bloggers.com/2009/09/r-function-of-the-day-tapply-2/ 81 | - map https://www.rdocumentation.org/packages/purrr/versions/0.2.5/topics/map 82 | 83 | 84 | ## Other languages 85 | Reticulate is a package by RStudio to 
call Python from R (interoperability between the two languages):
# Booth test function for 2D optimization demos.
# Global minimum is 0, attained at (x1, x2) = (1, 3).
# Vectorized: accepts equal-length numeric vectors for x1 and x2.
booth <- function(x1, x2) {
  term1 <- x1 + 2 * x2 - 7
  term2 <- 2 * x1 + x2 - 5
  term1^2 + term2^2
}
outer(xseq, xseq, function(x, y) (cbind(x, y) %*% t(Cmat)) <= 0) 71 | image(xseq, xseq, ccontour, col = adjustcolor(c(2, 3), .5), 72 | breaks = c(-1, .5, 1), xlab = "x1", ylab = "x2") 73 | contour(xseq, xseq, y, add = T, levels = 0:12 * 200) 74 | points(resbooth$par[1], resbooth$par[2], pch = 16, cex = 2) 75 | 76 | # Performing constrained optimization 77 | constr_res <- constrOptim(c(-10, -10), f = booth_vec, grad = NULL, 78 | ui = -Cmat, ci = 0) 79 | constr_res 80 | 81 | # Adding it to the plot 82 | points(constr_res$par[1], constr_res$par[2], pch = 16, cex = 2) 83 | text(constr_res$par[1], constr_res$par[2], 84 | sprintf("(%1.1f,%1.1f)", constr_res$par[1], constr_res$par[2]), 85 | pos = 3, offset = 1) 86 | 87 | 88 | #-------------------------------------- 89 | # Part 2 - Where I talk about my work: constrained splines 90 | #-------------------------------------- 91 | 92 | # Flexible way of doing nonlinear regression 93 | 94 | #----- Let's simulate a function with data 95 | set.seed(1) 96 | x <- sort(c(runif(98, 0, 10), 0, 10)) 97 | y <- (x - 3)^2 + rnorm(100, 0, 10) 98 | 99 | # Plot 100 | plot(x, y, main = "Scatterplot of x and y", col = grey(0.7), pch = 19) 101 | lines(x, (x - 3)^2, lty = 2, lwd = 2) 102 | 103 | #----- What are splines ? 
104 | # Splines are polynomial bases applied to x 105 | spline_mat <- dlnm::ps(x, int = T) 106 | str(spline_mat) 107 | 108 | # We now have 10 variables called bases 109 | matplot(x, spline_mat, type = "l", ylab = "Spline bases") 110 | 111 | # To approximate the function, these bases are used as new variables in a regression model 112 | lmres <- lm(y ~ 0 + spline_mat) 113 | coefs <- coef(lmres) 114 | 115 | # I multiply each basis by its associated coefficient 116 | coefmat <- mapply("*", as.data.frame(spline_mat), coefs) 117 | 118 | matplot(x, coefmat, type = "l", ylab = "Spline basis * coef", ylim = range(coefs)) 119 | points(x[apply(spline_mat, 2, which.max)], coefs, type = "h", col = 1:6) 120 | text(x[apply(spline_mat, 2, which.max)], coefs, round(coefs, 1), pos = 3, col = 1:6, xpd = T) 121 | 122 | # Resulting curve: we add all bases * coef 123 | lm_curve <- spline_mat %*% coefs 124 | lines(x, lm_curve, lwd = 2) 125 | 126 | # Let's compare to the true curve 127 | plot(x, y, main = "Scatterplot of x and y", col = grey(0.7), pch = 19) 128 | lines(x, (x-3)^2, lty = 2, lwd = 2) 129 | lines(x, lm_curve, lwd = 2, col = 2) 130 | 131 | #----- What does this have to do with optimization? 
132 | 133 | # Fitting a regression model is an optimization problem: minimizing least squares 134 | # Let's define a least squares function 135 | lsfun <- function(coefs) sum((y - spline_mat %*% coefs)^2) 136 | # Optimization is easier with the gradient of the function 137 | lsgr <- function(coefs) -2 * t(spline_mat) %*% y + 2 * crossprod(spline_mat) %*% coefs 138 | 139 | # we can use this function in a general purpose optimizer 140 | opt <- optim(rep(0, 10), fn = lsfun, gr = lsgr, method = "BFGS") 141 | 142 | opt$par 143 | coefs 144 | 145 | opt_curve <- spline_mat %*% opt$par 146 | lines(x, opt_curve, col = 4, lty = 2, lwd = 2) 147 | 148 | #----- Constrained splines 149 | # We want to fit a curve that is monotone increasing 150 | # This mean that the spline coefficients must also increase (the beauty of B-splines) 151 | # e.g. if we use coefs 152 | ex_coef <- 1:10 153 | ex_coefmat <- mapply("*", as.data.frame(spline_mat), ex_coef) 154 | ex_curve <- spline_mat %*% ex_coef 155 | 156 | # We plot these coefs with the resulting curve: we obtain a straight curve 157 | matplot(x, ex_coefmat, type = "l", ylab = "Spline basis * coef", 158 | ylim = range(ex_coef)) 159 | points(x[apply(spline_mat, 2, which.max)], ex_coef, type = "h", col = 1:6) 160 | text(x[apply(spline_mat, 2, which.max)], ex_coef, round(ex_coef, 1), 161 | pos = 3, col = 1:6, xpd = T) 162 | lines(x, ex_curve, lwd = 2) 163 | 164 | # Another example but with coefficient increasing quadraticaly 165 | ex2_coef <- (1:10)^2 166 | ex2_coefmat <- mapply("*", as.data.frame(spline_mat), ex2_coef) 167 | ex2_curve <- spline_mat %*% ex2_coef 168 | 169 | # We now obtain nice quadratic curve 170 | matplot(x, ex2_coefmat, type = "l", ylab = "Spline basis * coef", 171 | ylim = range(ex2_coef)) 172 | points(x[apply(spline_mat, 2, which.max)], ex2_coef, type = "h", col = 1:6) 173 | text(x[apply(spline_mat, 2, which.max)], ex2_coef, round(ex2_coef, 1), 174 | pos = 3, col = 1:6, xpd = T) 175 | lines(x, ex2_curve, lwd = 2) 176 | 
# To obtain an increasing curve we optimize least-squares
"n") 225 | 226 | #----- Convex curve 227 | # The true curve is actually convex 228 | # We can force our splines to be convex 229 | # This means constraining the second differences 230 | Cmat2 <- diff(diag(10), diff = 2) 231 | Cmat2 232 | 233 | # Now let's use it in the QP 234 | qpres2 <- solve.QP(Dmat, dvec, t(Cmat2)) 235 | 236 | qpcurve2 <- spline_mat %*% qpres2$solution 237 | 238 | plot(x, y, main = "Scatterplot of x and y", col = grey(0.7), pch = 19) 239 | lines(x, (x-3)^2, lty = 2, lwd = 2) 240 | lines(x, opt_curve, lwd = 2, col = 4) 241 | lines(x, qpcurve2, lwd = 2, col = 6) 242 | legend("topleft", c("True", "Unconstrained", "Convex"), 243 | lwd = 2, col = c(1, 4, 6), lty = c(2, 1, 1), bty = "n") 244 | 245 | 246 | 247 | ################# 248 | # Some material 249 | ################# 250 | 251 | #----- To understand the basic concepts: 252 | # Boyd, S., Vandenberghe, L., 2004. Convex Optimization, 1 edition. ed. Cambridge University Press, Cambridge, UK; New York. 253 | 254 | #----- For details about the different optimization algorithms: 255 | # Nocedal, J., Wright, S., 2006. Numerical Optimization. Springer Science & Business Media. 
256 | 257 | # CRAN task view on optimization: https://cran.r-project.org/web/views/Optimization.html 258 | 259 | #----- Some useful functions for specific applications 260 | # Linear objective functions (linear programming): lpSolve 261 | # Quadratic objective functions (quadratic programming): quadprog 262 | # Non-linear least-squares: nls/nls2 263 | # Stochastic optimization useful for non-convex problems: GA -------------------------------------------------------------------------------- /July 2020 Puzzle/July2020PuzzleSolutions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "July 2020 Puzzle" 3 | author: "Calum Davey" 4 | date: "25/07/2020" 5 | output: 6 | html_document: 7 | theme: readable 8 | editor_options: 9 | chunk_output_type: console 10 | --- 11 | 12 | ```{r setup, include=FALSE} 13 | knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE) 14 | ``` 15 | 16 | ## Puzzle 17 | 18 | I have a dataset with 5 different x values and the lower limit of the difference from x = 8. (limit always = 0 when x = 8) 19 | 20 | I would like a function to get the lowest x where limit < 0.05 and all of the previous limits in the sequence to 8 are also < 0.05. 21 | 22 | I don't mind if the function returns a single value or a dataset with one row. Example datasets and expected answers are below. 
# FUNCTION fweird WITH ARGUMENTS:
# - x: VALUES
# - limit: CORRESPONDING DECIMALS
# - thr: THRESHOLD FOR DECIMAL
# - res: RETURN VALUE ("val") OR POSITION ("ind")
# NB: RETURNS NA IF NONE IS ELIGIBLE
# Fixes: error() is not a base R function (replaced with stop());
# the threshold was hard-coded as 0.05, silently ignoring 'thr'.
fweird <- function(x, limit, thr = 0.05, res = "val") {
  # CHECK CONSISTENCY OF ARGUMENTS AND THEIR VALUE
  if (length(x) != length(limit))
    stop("'x' and 'limit' must have the same length")
  res <- match.arg(res, c("val", "ind"))

  # IDENTIFY WHICH DECIMALS ARE AT OR ABOVE THE THRESHOLD
  # (now honours the 'thr' argument instead of a hard-coded 0.05)
  not <- which(limit >= thr)

  # IDENTIFY THE INDEX: the element just before the first violation,
  # the last element if nothing violates, or NA if the very first does
  # (scalar if/else instead of nested ifelse(), which is for vectors)
  if (length(not) == 0L) {
    ind <- length(x)
  } else if (not[1] == 1L) {
    ind <- NA_integer_
  } else {
    ind <- not[1] - 1L
  }

  # RETURN NA IF NONE IS ELIGIBLE
  if (is.na(ind)) return(NA)

  # RETURN EITHER THE VALUE OR THE INDEX
  if (res == "val") x[ind] else ind
}
# Alex's solution: count the leading rows whose limit stays below the
# threshold, then return the x value at that position.
# cumsum(data$limit >= threshold) is 0 until the first violation; sign()
# turns it into 0/1, so sum(1 - sign(...)) counts rows before the first
# violation (or all rows if there is none).
# Fix: the body now uses the 'threshold' argument — it was hard-coded to
# 0.05, so any user-supplied threshold was silently ignored.
puzzlefunction <- function(data, threshold = 0.05) {
  index <- sum(1 - sign(cumsum(data$limit >= threshold)))
  return(data$x[index])
}
times = 1000 178 | ) 179 | 180 | barplot(summary(m)$median, 181 | names = c( 'Calum', 'Antonio', 'Darren', 'Clemence', 'Alex'), 182 | ylab = 'Median time in ns') 183 | 184 | ``` 185 | 186 | -------------------------------------------------------------------------------- /July 2020 Puzzle/JulyPuzzle.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/July 2020 Puzzle/JulyPuzzle.zip -------------------------------------------------------------------------------- /July 2020 Puzzle/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/July 2020 Puzzle/results.png -------------------------------------------------------------------------------- /Plots March 2020/Amy/GGplot example AM.R: -------------------------------------------------------------------------------- 1 | 2 | #----------------------# 3 | # GGplot show and tell # 4 | #----------------------# 5 | 6 | require(tidyverse) 7 | require(broom) 8 | require(ggforce) 9 | 10 | #------# 11 | # Data # 12 | #------# 13 | 14 | set.seed(123) 15 | 16 | df1 <- data.frame( weight = rnorm(500), 17 | sex = runif(500)>0.5, 18 | SES = rnorm(500), 19 | yob = sample(1:10, 500, replace = TRUE), 20 | trt1 = runif(500)>0.7, 21 | trt2 = runif(500)>0.2, 22 | trt3 = runif(500)>0.9, 23 | pa = runif(500)>0.5, 24 | staph = runif(500)>0.4) 25 | 26 | 27 | df1$y <- with(df1, 0.3*weight + 0.5*sex - 0.2*SES + 28 | 0.1*yob + 0.01*trt1 + 0.8*trt2 - 0.4*trt3 -0.5*pa + rnorm(500, sd = 0.7) ) 29 | 30 | #-------------------------------# 31 | # Model + point estimates + CI # 32 | #-------------------------------# 33 | 34 | mod1 <- lm(y ~ weight + sex + SES + yob + trt1 + trt2 + trt3 + pa + staph, data = df1) 35 | 36 | # Pt estimates + confidence intervals 37 | 38 | coefs <- tidy(mod1, conf.int = TRUE) 
39 | coefs <- subset(coefs, term!="(Intercept)") # remove intercept 40 | 41 | # Add nicer names 42 | 43 | coefs$names <- c( "Weight (kg)", "Sex = FEMALE", 44 | "SES (IMD z-score)", "Year of birth (unit = 1 year)", 45 | "Treatment A", "Treatment B", "Treatment C", 46 | "Bronchitis", "Pneumonia") 47 | 48 | # Group the variables 49 | 50 | coefs$type = c("Weight at age 1 year", 51 | rep("Demographics", 3), 52 | rep("Clinical", 3), 53 | rep("Infections", 2)) 54 | 55 | 56 | # Gather point estimates and CIs 57 | 58 | coefs$display_ests <-sprintf("%.2f [%.2f, %.2f]" , coefs$estimate, coefs$conf.low, coefs$conf.high) 59 | 60 | #-------------------# 61 | # Plot 1: no groups # 62 | #-------------------# 63 | 64 | 65 | 66 | ggplot( coefs, aes(estimate, names )) + 67 | geom_point(aes())+ 68 | geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0, size = 0.5)+ 69 | geom_text(aes(x = Inf, label = display_ests), vjust = -0.5, colour = "black", hjust = -0.1, size = 3.5) + 70 | coord_cartesian(clip = "off") + 71 | geom_vline(xintercept = 0, linetype = 2)+ 72 | theme_bw(base_size = 12)+ 73 | theme(plot.margin = unit(c(0,4,0,0), "cm")) + 74 | labs(x = "Lung function (FEV1%)",y = NULL, title = NULL)+ 75 | facet_col(~type, scale = "free_y", space = "free") 76 | 77 | 78 | #------------------------# 79 | # Plot 2: order by group # 80 | #------------------------# 81 | 82 | 83 | coefs$type <- factor(coefs$type, ordered = TRUE, levels = c("Weight at age 1 year", "Demographics", "Clinical", "Infections")) 84 | coefs <- coefs[ order(coefs$type, coefs$estimate), ] 85 | coefs$names <- factor( coefs$names, ordered = TRUE, levels = rev(coefs$names) ) 86 | 87 | 88 | ggplot( coefs, aes(estimate, names )) + 89 | geom_point()+ 90 | geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0, size = 0.5)+ 91 | geom_text(aes(x = Inf, label = display_ests), vjust = -0.5, colour = "black", hjust = -0.1, size = 3.5) + 92 | coord_cartesian(clip = "off") + 93 | geom_vline(xintercept = 0, 
linetype = 2)+ 94 | theme_bw(base_size = 12)+ 95 | theme(plot.margin = unit(c(0,4,0,0), "cm")) + 96 | theme(axis.title.x = element_text(face = "bold", size = 15)) + 97 | labs(x = "Lung function (FEV1%)",y = NULL, title = NULL)+ 98 | facet_col(~type, scale = "free_y", space = "free") 99 | 100 | 101 | #-----------------------------# 102 | # Plot 3: Colour by std error # 103 | #-----------------------------# 104 | 105 | ggplot( coefs, aes(estimate, names )) + 106 | geom_point(aes( colour = 1/std.error))+ 107 | geom_errorbarh(aes(xmin = conf.low, xmax = conf.high, colour = 1/std.error), height = 0, size = 0.5)+ 108 | geom_text(aes(x = Inf, label = display_ests), vjust = -0.5, colour = "black", hjust = -0.1, size = 3.5) + 109 | coord_cartesian(clip = "off") + 110 | geom_vline(xintercept = 0, linetype = 2)+ 111 | theme_bw(base_size = 12)+ 112 | theme(plot.margin = unit(c(0,4,0,0), "cm")) + 113 | theme(axis.title.x = element_text(face = "bold", size = 15)) + 114 | labs(x = "Lung function (FEV1%)",y = NULL, title = NULL)+ 115 | facet_col(~type, scale = "free_y", space = "free") + 116 | scale_colour_gradient(low = "grey60", high = "black") + 117 | theme(legend.position = "none") 118 | 119 | -------------------------------------------------------------------------------- /Plots March 2020/Calum/cascade_plot.R: -------------------------------------------------------------------------------- 1 | # HIV treatment serrated cascade plot 2 | # Calum Daveu 3 | # LSHTM 4 | # 26 MAR 2020 5 | 6 | # Change working directory 7 | setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) 8 | 9 | # Read the data 10 | d <- read.csv('data.csv') 11 | 12 | # Set some arbitrary x-values 13 | xs <- c(0.01,.1,0.01,.1,.11,.2,.11,.2,.21,.3,.21,.3,.31,.4,.31,.4) 14 | 15 | # Set values for the point types 16 | pt <- rep(c(1,19),each=2,times=nrow(d)/4) 17 | 18 | # Create the plot and the points 19 | plot(xs, d$p, 20 | ylim=c(0,1), xlim=c(0.01,.4), yaxt="n", xaxt="n", pch=pt, bty="n", 21 | xlab="", 
ylab="Means of RDS-adjusted proportions in all women", cex.lab=.7) 22 | 23 | # Add the sloping lines 24 | for (r in c(1,3,5,7,9,11,13,15)){ 25 | lines(xs[r:(r+1)], d$p[r:(r+1)], type="l", lwd=1.7) 26 | } 27 | 28 | # Add the vertical lines 29 | for (r in c(1:16)){ 30 | lines(c(xs[r],xs[r]),c(0,d$p[r]), lwd=.7) 31 | } 32 | 33 | # Add horizontal line at bottom 34 | lines(c(0.01,.4), c(0,0)) 35 | 36 | # Add the 90:90:90 lines 37 | base_hiv <- mean(d[d$outcome=='HIV' & d$year==2013,'p']) 38 | nnn <- c(base_hiv*.9, # First 90 for diagnosis 39 | base_hiv*.9*.9, # Second 90 for treatment 40 | base_hiv*.9*.9*.9) # Third 90 for suppression 41 | 42 | lines(c(.11,.2), c(nnn[1],nnn[1]), lty=5, lwd=.7) 43 | lines(c(.21,.3), c(nnn[2],nnn[2]), lty=5, lwd=.7) 44 | lines(c(.31,.4), c(nnn[3],nnn[3]), lty=5, lwd=.7) 45 | 46 | text(.2-.01,nnn[1]+.02,labels="90%", cex=.7) 47 | text(.3-.01,nnn[2]+.02,labels="81%", cex=.7) 48 | text(.4-.01,nnn[3]+.02,labels="73%", cex=.7) 49 | 50 | # Add the dates 51 | lines(c(xs[1],xs[1]),c(d$p[1],d$p[1]+.08), lty=3, lwd=.7) 52 | lines(c(xs[2],xs[2]),c(d$p[2],d$p[2]+.08), lty=3, lwd=.7) 53 | 54 | text(xs[1]+0.0125,d$p[1]+.1,labels="2013", cex=.6) 55 | text(xs[2]+0.0125,d$p[2]+.1,labels="2016", cex=.6) 56 | 57 | # Add axes and labels 58 | ap <- c(0,.2,.4,.6,.8,1) 59 | axis(2, at=ap, lab=paste0(ap * 100, "%"), las=TRUE, cex.axis=0.7) 60 | 61 | text(c(.05, .15, .25, .35, .55,.65),c(-.0193), 62 | labels=c("HIV +ve", "Know +ve", "On ART", "vl<1000c/ml"), cex=.7) 63 | 64 | # Add legend 65 | legend(0,.9,c("Sisters only arm", "Enhanced Sisters arm"),pch=c(1,19), bty="n", cex=.7) 66 | 67 | # Add title 68 | mytitle = "HIV-treatment cascade across 14 sites in Zimbabwe" 69 | mysubtitle = "Change between baseline and endline in each arm of SAPPH-IRe trial" 70 | mtext(side=3, line=2, at=-0.07, adj=0, cex=1, mytitle) 71 | mtext(side=3, line=1, at=-0.07, adj=0, cex=0.7, mysubtitle) -------------------------------------------------------------------------------- /Plots 
March 2020/Calum/data.csv: -------------------------------------------------------------------------------- 1 | "","year","outcome","arm","p" 2 | "1",2013,"HIV",0,0.606 3 | "2",2016,"HIV",0,0.594 4 | "3",2013,"HIV",1,0.556 5 | "4",2016,"HIV",1,0.578 6 | "5",2013,"Know pos.",0,0.399 7 | "6",2016,"Know pos.",0,0.464 8 | "7",2013,"Know pos.",1,0.351 9 | "8",2016,"Know pos.",1,0.462 10 | "9",2013,"On ART",0,0.278 11 | "10",2016,"On ART",0,0.385 12 | "11",2013,"On ART",1,0.23 13 | "12",2016,"On ART",1,0.398 14 | "13",2013,"Supp.",0,0.215 15 | "14",2016,"Supp.",0,0.333 16 | "15",2013,"Supp.",1,0.177 17 | "16",2016,"Supp.",1,0.343 18 | -------------------------------------------------------------------------------- /Plots March 2020/Lauren/Days_hours.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/Plots March 2020/Lauren/Days_hours.gif -------------------------------------------------------------------------------- /Plots March 2020/Lauren/hours and day.R: -------------------------------------------------------------------------------- 1 | 2 | hour_day <- read_dta("~/PhD work/DrinkLess app/16th April data all of it/hour_day.dta") 3 | View(hour_day) 4 | 5 | setwd("C:/Users/lsh1703883/Documents/PhD work/DrinkLess app/16th April data all of it/gganimate") 6 | 7 | install.packages("tidyverse") 8 | library(gganimate) 9 | 10 | p <- ggplot( 11 | hour_day, 12 | aes(x= hour, y= n, colour=factor(sincedownload_cat)) 13 | ) + 14 | geom_line() + 15 | scale_color_viridis_d() + 16 | labs(title= 'Number of Session from Day 1 to Day 30', x = " Hour", y = "Number of sessions") + 17 | theme(legend.position="top") + geom_point() + transition_reveal(sincedownload_cat) 18 | 19 | final_animation<-animate(p,100,fps = 20,duration = 30, width = 950, height = 750, renderer = gifski_renderer()) 20 | 21 | anim_save("./Days_hours.gif",animation=final_animation) 22 | 23 | 24 | 
View(gapminder) 25 | 26 | 27 | 28 | library(gapminder) 29 | 30 | ggplot(gapminder, aes(gdpPercap, lifeExp, size = pop, colour = country)) + 31 | geom_point(alpha = 0.7, show.legend = FALSE) + 32 | scale_colour_manual(values = country_colors) + 33 | scale_size(range = c(2, 12)) + 34 | scale_x_log10() + 35 | facet_wrap(~continent) + 36 | # Here comes the gganimate specific bits 37 | labs(title = 'Year: {frame_time}', x = 'GDP per capita', y = 'life expectancy') + 38 | transition_time(year) + 39 | ease_aes('linear') -------------------------------------------------------------------------------- /Plots March 2020/Lauren/session start heatmap.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | library(rayshader) 4 | 5 | library(viridis) 6 | 7 | setwd("C:/Users/lsh1703883/Documents/PhD work/DrinkLess app/16th April data all of it") 8 | getwd() 9 | 10 | # heatmap of when session starts 11 | 12 | library(haven) 13 | session_start_heatmap <- read_dta("session start heatmap.dta") 14 | View(session_start_heatmap) 15 | 16 | session_start_heatmap$day <- factor(session_start_heatmap$day, 17 | levels = c(1,2,3,4,5,6,0), 18 | labels = c( "Mon", "Tues","Wed","Thur","Fri","Sat","Sun")) 19 | 20 | names(session_start_heatmap)[4] <- "sessions" 21 | 22 | ggplot(session_start_heatmap) + 23 | geom_tile(aes(x=day, y=hour, fill=n, color=n),size=1,color="black") + 24 | ggtitle("Heatmap of when sessions begin") + 25 | scale_fill_gradient2(low="white", high="black", guide="colorbar") + 26 | labs(caption = "May 2017 to Jan 2019, excluding 21st August 2018") + 27 | theme(axis.text = element_text(size = 12), 28 | title = element_text(size = 12,face="bold"), 29 | panel.border= element_rect(size=2,color="black",fill=NA)) -> 30 | session_gg 31 | 32 | 33 | #filename_movie = tempfile() 34 | 35 | phivechalf = 30 + 60 * 1/(1 + exp(seq(-7, 20, length.out = 180)/2)) 36 | phivecfull = c(phivechalf, rev(phivechalf)) 37 | thetavec = 60 * 
sin(seq(0,359,length.out = 360) * pi/180) 38 | zoomvec = 0.45 + 0.2 * 1/(1 + exp(seq(-5, 20, length.out = 180))) 39 | zoomvecfull = c(zoomvec, rev(zoomvec)) 40 | 41 | 42 | plot_gg(session_gg, multicore = TRUE, width = 6, height = 5.5, scale = 300, 43 | background = "#afceff",shadowcolor = "#3a4f70") 44 | 45 | render_movie(filename = "session_graph.mp4", type = "custom", 46 | frames = 360, phi = phivecfull, zoom = zoomvecfull, theta = thetavec) -------------------------------------------------------------------------------- /Plots March 2020/Martina/PLOT html_doc.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "PLOTS" 3 | output: 4 | html_document: 5 | code_folding: show 6 | toc: TRUE 7 | toc_float: 8 | collapsed: TRUE 9 | --- 10 | ### Overview 11 | 12 | - **Study design:** Analytical, prospective dynamic cohort study. 13 | - **Setting:** The data was obtained from a pragmatic, multicentre, individually randomized clinical trial. 14 | - **Study population:** Adults (>18years), HIV-positive patients with no evidence of tuberculosis co-infection, admitted to medical wards at Zomba Central or Edendale Hospital between February 2015 and January 2018, and who provided consent to participate in the parent study. 15 | - **Aim: ** To assess whether ART-status, as indicator of (in)appropriate HIV treatment accessibility and management, is leading to higher death rates in hospitalised patients (with no evidence of tuberculosis co-infection) in two main referral hospitals in Malawi and South Africa. 16 | 17 | - **Outcome:** All-cause 56-day (8-week) mortality. 18 | - **Main exposure:** ‘ART-status’. 
A composite variable with three categories, based on ART at admission (yes/no) and CD4 level at admission (low/high) if patient is under ART 19 | + **(1) ART-High CD4** HIV-patients who have started ART and have a high count of CD4 at admission 20 | + **(2) ART-Low CD4** HIV-patients who have started ART and have a low count of CD4 at admission 21 | + **(3) No ART** HIV-patients who have not started ART prior admission 22 | 23 | ### Dependencies & global settings 24 | 25 | ```{r message=FALSE, warning=FALSE} 26 | # Packages needed 27 | library(tidyverse) 28 | library(gridExtra) 29 | library(knitr) 30 | library(survminer) 31 | library(survival) 32 | library(ggthemes) 33 | 34 | # Global settings 35 | knitr::opts_chunk$set(echo = TRUE) 36 | options(width = 100) 37 | 38 | ``` 39 | 40 | ### Upload / Prepare data 41 | 42 | ```{r} 43 | hiv <- read.csv(file="~/hiv/hiv07sep.csv", header = TRUE, sep = ",") 44 | 45 | hiv$cd4count <-as.numeric(hiv$cd4count) 46 | hiv$art3 <-as.factor(hiv$art3) 47 | hiv$art2 <-as.factor(hiv$art2) 48 | hiv$art1 <-as.factor(hiv$art1) 49 | hiv$hivdx00 <-as.numeric(hiv$hivdx00) 50 | hiv$age <-as.numeric(hiv$age) 51 | hiv$artyears2 <-as.numeric(hiv$artyears2) 52 | hiv$hiv_art_c2 <-as.factor(hiv$hiv_art_c2) 53 | 54 | # change the labels for fupreason 55 | levels(hiv$fupreason) 56 | # Initially used plyr::revalue because I incorrecly imported this as a factor 57 | # I want to correct this now so I can use dplyr only 58 | # So, I collaps two levels first and then recode the rest. 
59 | hiv$fupreason2 <- fct_collapse(hiv$fupreason, Censored = c("2.withdrew","3.lost_fup")) 60 | levels(hiv$fupreason2) 61 | hiv$fupreason2 <- fct_recode(hiv$fupreason2, Outcome = "1.death", Complete = "4.complete") 62 | # checking relabeling worked 63 | table(hiv$fupreason, hiv$fupreason2, useNA = "always") 64 | ``` 65 | 66 | ### Plots settings 67 | 68 | ```{r} 69 | fill <- "slategray" 70 | line <- "#1F3552" 71 | cate <- c("1.ART/highCD4" = "steelblue4", 72 | "2.ART/lowCD4" = "skyblue3", 73 | "3.noART" = "Grey60") 74 | 75 | t <-theme(panel.background = element_rect(fill = "white", colour = "grey"), 76 | panel.grid.minor.y = element_blank(), 77 | panel.grid.major.y = element_line(colour = "grey", size = 0.15), 78 | 79 | axis.ticks.x = element_blank(), 80 | axis.ticks.y = element_blank(), 81 | 82 | axis.text.x = element_text(colour = "grey27", size = 9), 83 | axis.title.y = element_text(colour = "grey27", size = 10), 84 | 85 | plot.margin = unit(c(1, 3, 1, 3), "cm"), 86 | plot.background = element_rect(colour = "black", fill = "grey90", size = 0.1)) 87 | ``` 88 | 89 | ### Box plot 1 - Basic 90 | 91 | Time LHIV (years) over exposure 92 | 93 | ```{r} 94 | ggplot(data = hiv, 95 | mapping = aes(x = artstatus1, y = hivdxyears)) + 96 | # add the wiskers 97 | stat_boxplot(geom ='errorbar', width = 0.5, colour = line) + 98 | # add this layer at the end so it shows over the error bar line 99 | geom_boxplot(fill = fill, colour = line, width = 0.5, 100 | outlier.colour = line, outlier.shape = 16) + 101 | # labels for the axes and axes labels 102 | scale_x_discrete(name = " ", labels = c('ART/highCD4\n(N=812)', 103 | 'ART/lowCD4\n(N=682)', 104 | 'No-ART\n(N=544)')) + 105 | scale_y_continuous(name = "Years LHIV", breaks = seq(0, 28, 2)) + 106 | # add the theme 107 | t 108 | ``` 109 | 110 | ### Box plot 2 - Adding elements 111 | 112 | CD4 count (cells/ml) by time of HIV diagnosis amongst no-ART 113 | 114 | ```{r} 115 | ggplot(data = subset(hiv, !is.na(art3)), 116 | mapping = 
aes(x = art3, y = cd4count)) + 117 | # add the whiskers 118 | stat_boxplot(geom ='errorbar', width = 0.3, colour = line) + 119 | # add the line used to dichotomise main exposure 120 | geom_hline(yintercept = 264, linetype = "dotted", color = "tomato4", size = 0.5) + 121 | # add this layer at the end so it shows over the dotted line 122 | geom_boxplot(fill = fill, colour = line, width = 0.3, 123 | outlier.colour = line, outlier.shape = 20) + 124 | # labels for the axes and axes labels 125 | scale_x_discrete(name = " ", labels = c('Diagnosis \nbefore admission \n(N=226)', 126 | 'Diagnosis \nat admission \n(N=322)')) + 127 | scale_y_continuous(name = "CD4 cells/ml", breaks = seq(0, 1800, 250)) + 128 | # additional text for the label 129 | annotate("text", x = 2.4, y = 269, label = "264\ncells/ml", size = 3 ) + 130 | # add the theme 131 | t 132 | # save the plot 133 | ggsave("cd4.png", width = 15, height = 10, units = "cm") 134 | ``` 135 | 136 | ### Scatterplot 1 - Override global 137 | 138 | Time LHIV on time on ART.
139 | 140 | 141 | 142 | ```{r} 143 | ggplot(data = hiv, 144 | mapping = aes(x = hivdxyears, y = artyears2, colour = factor(artstatus1))) + 145 | geom_point() + 146 | # use the colors defined in "settings" and define new labels for the strata 147 | scale_colour_manual(values = cate, 148 | breaks = c("1.ART/highCD4", 149 | "2.ART/lowCD4", 150 | "3.noART"), 151 | labels = c("ART/highCD4 (N=788)", 152 | "ART/lowCD4 (N=671)", 153 | "No-ART (n=218)")) + 154 | # labels and breaks for the axes 155 | scale_x_continuous(name = "Years LHIV", 156 | breaks = seq(0, 30, 5)) + 157 | scale_y_continuous(name = "Years on ART", 158 | breaks = seq(0, 30, 5)) + 159 | ## same scale for both axes 160 | coord_equal() + 161 | # using the same theme but overriding some parameters and adding others 162 | # note I have to use axis.title instead of axis.text to override 163 | t + 164 | theme(axis.title.x = element_text(colour = "Grey27", size = 10), 165 | 166 | legend.key = element_blank(), 167 | legend.title = element_blank(), 168 | legend.position = c(0.02, 0.97), 169 | legend.justification = c(0, 1), 170 | legend.text = element_text(size = 8), 171 | 172 | plot.margin = unit(c(1.2, 1.2, 1, 1), "cm") 173 | ) 174 | ``` 175 | 176 | ### Scatterplot 2 - Faceting 177 | 178 | Follow-up time by main exposure and censoring 179 | 180 | ```{r} 181 | # saving an object with the labels for the graph 182 | fupreason2.labs <- c('Outcome' = "Outcome (N=311)", 183 | 'Censored' = "Censored <56d (N=38)", 184 | 'Complete' = "Complete (n=1712)") 185 | 186 | ggplot(data = hiv, 187 | mapping = aes(x= artstatus1, y=fuptime2)) + 188 | geom_dotplot(binaxis = "y", stackdir = "center", 189 | stackratio = 1.5, dotsize = 0.5, 190 | colour = line) + 191 | # faceting 192 | facet_wrap(~ fupreason2, 193 | ncol = 3, 194 | labeller = labeller(fupreason2 = fupreason2.labs)) + 195 | # labels and breaks for the axes 196 | scale_x_discrete(name = " ", labels = c('ART\nhighCD4', 197 | 'ART\nlowCD4', 198 | 'No\nART')) + 199 | 
scale_y_continuous(name = "Follow-up time (days)", 200 | breaks = seq(0, 70, 15)) + 201 | # using the same theme but overiding some parameters and adding others 202 | # again using axis.title instead of axis.text to override 203 | t + 204 | theme(panel.grid.major.x = element_line(colour = "grey", size = 0.15), 205 | axis.title.x = element_text(colour = "Grey27", size = 8), 206 | plot.margin = unit(c(1, 1, 1, 1), "cm")) 207 | ``` 208 | 209 | ### Survival curve 210 | 211 | ```{r} 212 | # survival::survfit --> creates the survfit object needed for ggsurvplot 213 | fit1 <-survfit(Surv(fuptime2, died) ~ artstatus1, 214 | data = hiv) 215 | 216 | # survminer::ggsurvplot creates the plot 217 | ggsurvplot(fit1, 218 | censor.shape = "|", censor.size = 4, 219 | size = 0.5, 220 | palette = c("steelblue4","skyblue3","Grey60"), 221 | conf.int = TRUE, 222 | 223 | risk.table = TRUE, 224 | risk.table.y.text = FALSE, 225 | risk.table.height = 0.3, # this is the proportion of hight it takes 226 | risk.table.fontsize = 4, 227 | 228 | legend.title = " ", 229 | legend.labs = c('ART/highCD4', 'ART/lowCD4', 'No-ART'), 230 | legend = "top", 231 | 232 | ylim = c(0.67, 1.00), 233 | xlim = c(0,56), 234 | break.time.by = 7, 235 | xlab = "Time in days", 236 | ggtheme = theme_igray() 237 | ) 238 | 239 | 240 | ``` 241 | 242 | -------------------------------------------------------------------------------- /Plots March 2020/Martina/PLOT html_doc.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/Plots March 2020/Martina/PLOT html_doc.html -------------------------------------------------------------------------------- /Plots March 2020/Martina/PLOT notebook.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "PLOTS" 3 | output: 4 | html_notebook: 5 | toc: TRUE 6 | --- 7 | 8 | ### Overview 9 | 10 | - **Study design:** 
Analytical, prospective dynamic cohort study. 11 | - **Setting:** The data was obtained from a pragmatic, multicentre, individually randomized clinical trial. 12 | - **Study population:** Adults (>18years), HIV-positive patients with no evidence of tuberculosis co-infection, admitted to medical wards at Zomba Central or Edendale Hospital between February 2015 and January 2018, and who provided consent to participate in the parent study. 13 | - **Aim: ** To assess whether ART-status, as indicator of (in)appropriate HIV treatment accessibility and management, is leading to higher death rates in hospitalised patients (with no evidence of tuberculosis co-infection) in two main referral hospitals in Malawi and South Africa. 14 | 15 | - **Outcome:** All-cause 56-day (8-week) mortality. 16 | - **Main exposure:** ‘ART-status’. A composite variable with three categories, based on ART at admission (yes/no) and CD4 level at admission (low/high) if patient is under ART 17 | + **(1) ART-High CD4** HIV-patients who have started ART and have a high count of CD4 at admission 18 | + **(2) ART-Low CD4** HIV-patients who have started ART and have a low count of CD4 at admission 19 | + **(3) No ART** HIV-patients who have not started ART prior admission 20 | 21 | ### Dependencies & global settings 22 | 23 | ```{r message=FALSE, warning=FALSE} 24 | # Packages needed 25 | library(tidyverse) 26 | library(gridExtra) 27 | library(knitr) 28 | library(survminer) 29 | library(survival) 30 | library(ggthemes) 31 | 32 | # Global settings 33 | knitr::opts_chunk$set(echo = TRUE) 34 | options(width = 100) 35 | 36 | ``` 37 | 38 | ### Upload / Prepare data 39 | 40 | ```{r} 41 | hiv <- read.csv(file="~/hiv/hiv07sep.csv", header = TRUE, sep = ",") 42 | 43 | hiv$cd4count <-as.numeric(hiv$cd4count) 44 | hiv$art3 <-as.factor(hiv$art3) 45 | hiv$art2 <-as.factor(hiv$art2) 46 | hiv$art1 <-as.factor(hiv$art1) 47 | hiv$hivdx00 <-as.numeric(hiv$hivdx00) 48 | hiv$age <-as.numeric(hiv$age) 49 | hiv$artyears2 
<-as.numeric(hiv$artyears2) 50 | hiv$hiv_art_c2 <-as.factor(hiv$hiv_art_c2) 51 | 52 | # change the labels for fupreason 53 | levels(hiv$fupreason) 54 | # Initially used plyr::revalue because I incorrecly imported this as a factor 55 | # I want to correct this now so I can use dplyr only 56 | # So, I collaps two levels first and then recode the rest. 57 | hiv$fupreason2 <- fct_collapse(hiv$fupreason, Censored = c("2.withdrew","3.lost_fup")) 58 | levels(hiv$fupreason2) 59 | hiv$fupreason2 <- fct_recode(hiv$fupreason2, Outcome = "1.death", Complete = "4.complete") 60 | # checking relabeling worked 61 | table(hiv$fupreason, hiv$fupreason2, useNA = "always") 62 | ``` 63 | 64 | ### Plots settings 65 | 66 | ```{r} 67 | fill <- "slategray" 68 | line <- "#1F3552" 69 | cate <- c("1.ART/highCD4" = "steelblue4", 70 | "2.ART/lowCD4" = "skyblue3", 71 | "3.noART" = "Grey60") 72 | 73 | t <-theme(panel.background = element_rect(fill = "white", colour = "grey"), 74 | panel.grid.minor.y = element_blank(), 75 | panel.grid.major.y = element_line(colour = "grey", size = 0.15), 76 | 77 | axis.ticks.x = element_blank(), 78 | axis.ticks.y = element_blank(), 79 | 80 | axis.text.x = element_text(colour = "grey27", size = 9), 81 | axis.title.y = element_text(colour = "grey27", size = 10), 82 | 83 | plot.margin = unit(c(1, 3, 1, 3), "cm"), 84 | plot.background = element_rect(colour = "black", fill = "grey90", size = 0.1)) 85 | ``` 86 | 87 | ### Box plot 1 - Basic 88 | 89 | Time LHIV (years) over exposure 90 | 91 | ```{r} 92 | ggplot(data = hiv, 93 | mapping = aes(x = artstatus1, y = hivdxyears)) + 94 | # add the wiskers 95 | stat_boxplot(geom ='errorbar', width = 0.5, colour = line) + 96 | # add this layer at the end so it shows over the error bar line 97 | geom_boxplot(fill = fill, colour = line, width = 0.5, 98 | outlier.colour = line, outlier.shape = 16) + 99 | # labels for the axes and axes labels 100 | scale_x_discrete(name = " ", labels = c('ART/highCD4\n(N=812)', 101 | 
'ART/lowCD4\n(N=682)', 102 | 'No-ART\n(N=544)')) + 103 | scale_y_continuous(name = "Years LHIV", breaks = seq(0, 28, 2)) + 104 | # add the theme 105 | t 106 | ``` 107 | 108 | 109 | ### Box plot 2 - Adding elements 110 | 111 | CD4 count (cells/ml) by time of HIV diagnosis amongst no-ART 112 | 113 | ```{r} 114 | ggplot(data = subset(hiv, !is.na(art3)), 115 | mapping = aes(x = art3, y = cd4count)) + 116 | # add the wiskers 117 | stat_boxplot(geom ='errorbar', width = 0.3, colour = line) + 118 | # add the line used to dicotomise main exposure 119 | geom_hline(yintercept = 264, linetype = "dotted", color = "tomato4", size = 0.5) + 120 | # add this layer at the end so it shows over the dotted line 121 | geom_boxplot(fill = fill, colour = line, width = 0.3, 122 | outlier.colour = line, outlier.shape = 20) + 123 | # labels for the axes and axes labels 124 | scale_x_discrete(name = " ", labels = c('Diagnosis \nbefore admission \n(N=226)', 125 | 'Diagnosis \nat admission \n(N=322)')) + 126 | scale_y_continuous(name = "CD4 cells/ml", breaks = seq(0, 1800, 250)) + 127 | # additional text for the label 128 | annotate("text", x = 2.4, y = 269, label = "264\ncells/ml", size = 3 ) + 129 | # add the theme 130 | t 131 | # save the plot 132 | ggsave("cd4.png", width = 15, height = 10, units = "cm") 133 | ``` 134 | 135 | ### Scatterplot 1 - Overide global settings 136 | 137 | Time LHIV on time on ART. 
138 | 139 | ```{r} 140 | ggplot(data = hiv, 141 | mapping = aes(x = hivdxyears, y = artyears2, colour = factor(artstatus1))) + 142 | geom_point() + 143 | # use the colors defined in "settings" and define new labels for the strata 144 | scale_colour_manual(values = cate, 145 | breaks = c("1.ART/highCD4", 146 | "2.ART/lowCD4", 147 | "3.noART"), 148 | labels = c("ART/highCD4 (N=788)", 149 | "ART/lowCD4 (N=671)", 150 | "No-ART (n=218)")) + 151 | # labels and breaks for the axes 152 | scale_x_continuous(name = "Years LHIV", 153 | breaks = seq(0, 30, 5)) + 154 | scale_y_continuous(name = "Years on ART", 155 | breaks = seq(0, 30, 5)) + 156 | ## same scale for both axes 157 | coord_equal() + 158 | # using the same theme but overriding some parameters and adding others 159 | # note I have to use axis.title instead of axis.text to override 160 | t + 161 | theme(axis.title.x = element_text(colour = "Grey27", size = 10), 162 | 163 | legend.key = element_blank(), 164 | legend.title = element_blank(), 165 | legend.position = c(0.02, 0.97), 166 | legend.justification = c(0, 1), 167 | legend.text = element_text(size = 8), 168 | 169 | plot.margin = unit(c(1.2, 1.2, 1, 1), "cm") 170 | ) 171 | ``` 172 | 173 | ### Scatterplot 2 - Faceting 174 | 175 | Follow-up time by main exposure and censoring 176 | 177 | ```{r} 178 | # saving an object with the labels for the graph 179 | fupreason2.labs <- c('Outcome' = "Outcome (N=311)", 180 | 'Censored' = "Censored <56d (N=38)", 181 | 'Complete' = "Complete (n=1712)") 182 | 183 | ggplot(data = hiv, 184 | mapping = aes(x= artstatus1, y=fuptime2)) + 185 | geom_dotplot(binaxis = "y", stackdir = "center", 186 | stackratio = 1.5, dotsize = 0.5, 187 | colour = line) + 188 | # faceting 189 | facet_wrap(~ fupreason2, 190 | ncol = 3, 191 | labeller = labeller(fupreason2 = fupreason2.labs)) + 192 | # labels and breaks for the axes 193 | scale_x_discrete(name = " ", labels = c('ART\nhighCD4', 194 | 'ART\nlowCD4', 195 | 'No\nART')) + 196 | 
scale_y_continuous(name = "Follow-up time (days)", 197 | breaks = seq(0, 70, 15)) + 198 | # using the same theme but overiding some parameters and adding others 199 | # again using axis.title instead of axis.text to override 200 | t + 201 | theme(panel.grid.major.x = element_line(colour = "grey", size = 0.15), 202 | axis.title.x = element_text(colour = "Grey27", size = 8), 203 | plot.margin = unit(c(1, 1, 1, 1), "cm")) 204 | ``` 205 | 206 | ### Survival curves 207 | 208 | ```{r} 209 | # survival::survfit --> creates the survfit object needed for ggsurvplot 210 | fit1 <-survfit(Surv(fuptime2, died) ~ artstatus1, 211 | data = hiv) 212 | 213 | # survminer::ggsurvplot creates the plot 214 | ggsurvplot(fit1, 215 | censor.shape = "|", censor.size = 4, 216 | size = 0.5, 217 | palette = c("steelblue4","skyblue3","Grey60"), 218 | conf.int = TRUE, 219 | 220 | risk.table = TRUE, 221 | risk.table.y.text = FALSE, 222 | risk.table.height = 0.3, # this is the proportion of hight it takes 223 | risk.table.fontsize = 4, 224 | 225 | legend.title = " ", 226 | legend.labs = c('ART/highCD4', 'ART/lowCD4', 'No-ART'), 227 | legend = "top", 228 | 229 | ylim = c(0.67, 1.00), 230 | xlim = c(0,56), 231 | break.time.by = 7, 232 | xlab = "Time in days", 233 | ggtheme = theme_igray() 234 | ) 235 | 236 | 237 | ``` 238 | 239 | -------------------------------------------------------------------------------- /Plots March 2020/Martina/PLOT notebook.nb.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/Plots March 2020/Martina/PLOT notebook.nb.html -------------------------------------------------------------------------------- /Plots March 2020/README.md: -------------------------------------------------------------------------------- 1 | # RUG meeting 27 Mar 2020 2 | Amy MacDougall, Carl Pearson, Julia Shen, Lauren Bell, Martina Cusinato, and Calum Davey presented their 
plots. 3 | 4 | Carl's materials can be found [here](https://gitlab.com/cabp_LSHTM/denvax). Particularly, in the R/zzz.R file (ignore the code surrounding the functions - that's just to deal with the code only optionally loading) and in the inst/extdata and inst/app folders. 5 | 6 | Julia's materials can be found [here](https://github.com/7j7j/LSHTM-R-polar-plot) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RUG 2 | Repo for the LSHTM R Users Group (RUG) 3 | --------------------------------------------------------------------------------