├── .gitignore ├── Arsenal August 2020 ├── RUG_making_tables_r.pptx ├── Readme.md ├── r_make_tables.Rmd ├── r_make_tables.docx ├── r_make_tables_rendered.docx └── reference.docx ├── August 2020 Puzzle ├── Example datasets.xlsx ├── Martina Cusinato │ ├── AUG 2020_PUZZLE1_jocelyn.R │ ├── AUG 2020_PUZZLE_2_Ben.R │ └── vbls_all_new2.csv └── ReadMe.md ├── Dates.R ├── December 2020 Puzzle └── Interactions.R ├── February 2021 └── how_to_learn_R.md ├── January 2021 └── 20210122-RUG-ConstrainedOpt.R ├── July 2020 Puzzle ├── July2020PuzzleSolutions.Rmd ├── July2020PuzzleSolutions.html ├── JulyPuzzle.zip └── results.png ├── Plots March 2020 ├── Amy │ └── GGplot example AM.R ├── Calum │ ├── cascade_plot.R │ └── data.csv ├── Lauren │ ├── Days_hours.gif │ ├── hours and day.R │ └── session start heatmap.R ├── Martina │ ├── PLOT html_doc.Rmd │ ├── PLOT html_doc.html │ ├── PLOT notebook.Rmd │ └── PLOT notebook.nb.html └── README.md └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | _publish.R 5 | _book 6 | _bookdown_files 7 | rsconnect 8 | .DS_Store 9 | -------------------------------------------------------------------------------- /Arsenal August 2020/RUG_making_tables_r.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/Arsenal August 2020/RUG_making_tables_r.pptx -------------------------------------------------------------------------------- /Arsenal August 2020/Readme.md: -------------------------------------------------------------------------------- 1 | Sham Lal presented on the use of the `arsenal` package 2 | 3 | His excellent presentation can be found here: https://drive.google.com/file/d/1Ra8Rz3hJttoAGrU46jpGIsyaS9C7RO4n/view?usp=sharing 4 | -------------------------------------------------------------------------------- /Arsenal August 
2020/r_make_tables.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R tables easy to make?" 3 | subtitle: "RUG LSHTM" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | word_document: 7 | reference_docx: reference.docx 8 | toc: Yes 9 | --- 10 | 11 | 12 | ```{r setup, include = F, echo = F} 13 | 14 | # Chunk options ---- 15 | 16 | # include = FALSE prevents code and results from appearing in the finished file. R Markdown still runs the code in the chunk, and the results can be used by other chunks. 17 | # echo = FALSE prevents code, but not the results from appearing in the finished file. This is a useful way to embed figures. 18 | # message = FALSE prevents messages that are generated by code from appearing in the finished output 19 | # warning = FALSE prevents warnings that are generated by code from appearing in the finished output. 20 | # fig.cap = "..." adds a caption to graphical results. 21 | # eval = FALSE does not run the code chunk 22 | 23 | # global options ---- 24 | # > code chunk options ---- 25 | 26 | # knitr::opts_chunk$set( 27 | # comment = '', include = F, message = F, fig.width = 6, fig.height = 6) 28 | 29 | knitr::opts_chunk$set('label') 30 | 31 | 32 | # Packages ---- 33 | 34 | # install.packages(c("tidyverse", "kableExtra", "arsenal", "finalfit", "tidyr", "broom")) 35 | # optional extra package emojis devtools::install_github("hadley/emo") 36 | 37 | 38 | 39 | ``` 40 | 41 | ```{r libraries, include = F, echo = F} 42 | 43 | library(tidyverse) 44 | library(kableExtra) 45 | library(arsenal) 46 | library(finalfit) 47 | 48 | library(tidyr) # for neater code output 49 | library(broom) # for tidying other models 50 | 51 | ``` 52 | 53 | ```{r data_prep, include = F, message = F, warning = F, eval = T} 54 | 55 | # Ensure data is cleaned as much as possible and organised correctly (i.e. variables stored as needed e.g. 
factors, integers, double, characters) 56 | 57 | glimpse(colon_s) 58 | 59 | dim(colon_s) 60 | 61 | head(colon_s, 10) 62 | 63 | colon_s <- colon_s 64 | 65 | 66 | ``` 67 | 68 | # Table 1a: simple one-way table with default *tableby* options `r emo::ji("poop")` 69 | 70 | 71 | ```{r table1a_i, include = T, message = F, warning = F, eval = T} 72 | 73 | table1a_i <- arsenal::tableby(~ age + age.factor + sex.factor + obstruct.factor + nodes, 74 | data = colon_s) 75 | 76 | summary(table1a_i, text = T) 77 | 78 | ``` 79 | 80 | ## Table 1a(ii): formatted `r emo::ji("smile")` 81 | 82 | ```{r table1a_ii, include = T, message = F, warning = F, results = 'asis', eval = T} 83 | 84 | summary(table1a_i, text = T) %>% 85 | kable(format = "pandoc", 86 | caption = "Formatted table", 87 | padding = 0, 88 | label = "", 89 | align = c("l" , "c"), 90 | col_order = order 91 | ) 92 | 93 | ``` 94 | 95 | ## Table 1b: add/modify human readable variable labels `r emo::ji("+1")``r emo::ji("+1")` 96 | 97 | ```{r table1b_code, include = T, message = F, warning = F, results = 'asis', echo = F, eval = T} 98 | 99 | # See all variables labels and factor level names 100 | 101 | labels(colon_s) %>% 102 | unlist() 103 | 104 | # Change variable names and factor level names 105 | 106 | labels(colon_s) <- c(age = 'Age, yrs', sex = "Gender") 107 | 108 | attr(colon_s$sex.factor, 'label') <- 'Gender' 109 | 110 | attr(colon_s$age.factor, 'label') <- 'Age group' 111 | 112 | colon_s$age.factor <- forcats::fct_relevel(colon_s$age.factor, rev) 113 | 114 | ``` 115 | 116 | ```{r table1b_labels, include = T, message = F, warning = F, results = 'asis', eval = T} 117 | 118 | table1b <- tableby(~ age + age.factor + sex.factor + obstruct.factor + nodes, 119 | data = colon_s) 120 | 121 | summary(table1b, text = T) %>% 122 | kable(format = "pandoc", 123 | caption = "Formatted table", 124 | padding = 0, 125 | label = "", 126 | align = c("l" , "c"), 127 | col_order = order 128 | ) 129 | 130 | 131 | ``` 132 | 133 | ## Table 1c: 
change default statistics and formatting `r emo::ji("+1")``r emo::ji("+1")``r emo::ji("+1")` 134 | 135 | ```{r table1c_options, include = T, message = F, warning = F, results = 'asis', eval = T} 136 | 137 | tableby(~ age + age.factor + sex.factor + obstruct.factor + nodes, 138 | data = colon_s, 139 | digits = 1, 140 | digits.pct = 1, 141 | control = tableby.control(cat.stats = c("countpct", "N", "Nmiss2"), 142 | total = T, test = F, 143 | numeric.stats = c("meansd", "medianq1q3", "range", "N", "Nmiss2"), 144 | stats.labels = list(Nmiss2 = "Number missing") 145 | ) 146 | ) %>% 147 | summary(text = T) %>% 148 | kable(format = "pandoc", 149 | caption = "Formatted table", 150 | padding = 0, 151 | label = "", 152 | align = c("l" , "c"), 153 | col_order = order 154 | ) 155 | 156 | 157 | ``` 158 | 159 | ## Table 1d: more than one group `r emo::ji("flex")``r emo::ji("fist")`` 160 | 161 | ```{r table1d_options, include = T, message = F, warning = F, results = 'asis', eval = T} 162 | 163 | tableby(mort_5yr ~ age + age.factor + sex.factor + obstruct.factor + nodes + perfor.factor, 164 | data = colon_s, 165 | digits = 1, 166 | digits.pct = 1, 167 | control = tableby.control(cat.stats = c("countrowpct", "N", "Nmiss2"), # now with row percentages 168 | total = T, test = T, 169 | numeric.stats = c("meansd", "medianq1q3", "range", "N", "Nmiss2"), 170 | stats.labels = list(Nmiss2 = "Number missing") 171 | ) 172 | ) %>% 173 | summary(text = T) %>% 174 | kable(format = "pandoc", 175 | caption = "Formatted table", 176 | padding = 0, 177 | label = "", 178 | align = c("l" , "c"), 179 | col_order = order 180 | ) 181 | 182 | 183 | ``` 184 | 185 | 186 | ## Table 1e: change default statistics, more than one group `r emo::ji("+1")``r emo::ji("+1")` 187 | 188 | ```{r table1e, include = T, message = F, warning = F, results = 'asis', eval = T} 189 | 190 | tableby(mort_5yr ~ age + age.factor + sex.factor + obstruct.factor + nodes + perfor.factor, 191 | data = colon_s, 192 | digits = 1, 193 | 
digits.pct = 1, 194 | control = tableby.control(cat.stats = c("countrowpct", "N", "Nmiss2"), # now with row percentages 195 | numeric.stats = c("meansd", "medianq1q3", "range", "N", "Nmiss2"), 196 | stats.labels = list(Nmiss2 = "Number miss"), 197 | total = T, test = F 198 | ) 199 | ) %>% 200 | summary(text = T) %>% 201 | kable(format = "pandoc", 202 | caption = "Formatted table", 203 | padding = 0, 204 | label = "", 205 | align = c("l" , "c"), 206 | col_order = order 207 | ) 208 | 209 | 210 | ``` 211 | 212 | ## Table 1f: Set global table options and create dataframe `r emo::ji("+1")` 213 | 214 | ```{r table_by global options, highlight = T, strip.white = T} 215 | 216 | # Set global table options 217 | 218 | mycontrols <- tableby.control(test=FALSE, total=FALSE, 219 | numeric.test="kwt", cat.test="chisq", 220 | numeric.stats=c("meansd", "medianq1q3", "range", "N", "Nmiss2"), 221 | cat.stats=c("countpct", "N", "Nmiss2"), 222 | stats.labels=list(N = 'Count', 223 | median = 'Median', 224 | q1q3 = 'Q1,Q3'), 225 | digits = 1, 226 | digits.pct = 1 227 | ) 228 | 229 | # Create a table as data frame 230 | 231 | table_1f <- tableby(mort_5yr ~ age + age.factor + sex.factor, 232 | data = colon_s, 233 | control = mycontrols) %>% 234 | summary(text = NULL) # set to null to remove any text formatting 235 | 236 | print(as.data.frame(table_1f)) 237 | 238 | ``` 239 | 240 | *Other possibilities* 241 | + Subsets of groups (e.g. 
pregnant women) 242 | * Weighted estimates 243 | * Apply custom statistical tests for certain variables 244 | * Additional groupings by strata 245 | * Full list here: 246 | 247 | # Table 2: Odds ratio table 248 | 249 | ```{r table 2, include = F,highlight = T, strip.white = T} 250 | 251 | # FinalFirst package 252 | 253 | explanatory = c("age.factor", "sex.factor", "obstruct.factor", "perfor.factor") 254 | 255 | dependent = "mort_5yr" 256 | 257 | model_output1 <- colon_s %>% 258 | finalfit(dependent, explanatory, metrics = F) 259 | 260 | ``` 261 | 262 | ```{r table_2out, include = T,highlight = T, strip.white = T} 263 | 264 | model_output1 %>% 265 | kable(format = "pandoc", 266 | caption = "", 267 | padding = 0, 268 | label = "", 269 | align = c("l" , "c"), 270 | col_order = order, 271 | row.names = NA, 272 | col.names = NA 273 | ) 274 | ``` 275 | 276 | *Lots of features and customisations* 277 | * Full list here: 278 | -------------------------------------------------------------------------------- /Arsenal August 2020/r_make_tables.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/Arsenal August 2020/r_make_tables.docx -------------------------------------------------------------------------------- /Arsenal August 2020/r_make_tables_rendered.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/Arsenal August 2020/r_make_tables_rendered.docx -------------------------------------------------------------------------------- /Arsenal August 2020/reference.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/Arsenal August 2020/reference.docx 
-------------------------------------------------------------------------------- /August 2020 Puzzle/Example datasets.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/August 2020 Puzzle/Example datasets.xlsx -------------------------------------------------------------------------------- /August 2020 Puzzle/Martina Cusinato/AUG 2020_PUZZLE1_jocelyn.R: -------------------------------------------------------------------------------- 1 | #********************************************************************************************************** 2 | # WORKFLOW AND NOTES: 3 | ## 1. Inspect and save the raw data in a raw_data_excel 4 | 5 | ## 2. Transform each of the excel tabs I will use in CSV and file them. 6 | ## I normally file these under a subdirectory called raw_data_csv, but for now will save them in the working directory (note file location for csv) 7 | ## I prefer changing to csv before importing because it is pretty easy and ensures consistency in the way characters are entered. 8 | ## I do this by clicking 'save as' and then changing the format to CSV. Each tab or each file needs to be saved separately. 9 | 10 | ## 3. I don't know how this data will be used... so I show two options 11 | ### a. Import and 'clean' for one file, and this can be repeated for the others. 12 | ### b. Import and clean (as above and merge all at the end) - I do this with a function. 13 | 14 | ## Note: Although I use a function for these kinds of tasks, I normally first load and clean each dataset separately, 15 | ## or at least a large proportion of the files, to see where the problems could be. 16 | 17 | #********************************************************************************************************** 18 | # MAIN CONCLUSIONS AFTER INSPECTION OF RAW DATA 19 | ## Files are all cross-tabulations.
20 | ## for each file, the number of variables/categories should be equal to the number of rows, plus one row for 'Totals'. (so no column for totals) 21 | ## No missing data. Each cell with data has a numerical value. 22 | ## Plenty of empty rows and columns 23 | ## The number of variables and/or catgories si not consitent accorss all the files/loctions. 24 | 25 | #********************************************************************************************************** 26 | # LIBRARIES 27 | library(tidyverse) 28 | # Will use janitor once and will call it then. 29 | 30 | #********************************************************************************************************** 31 | # LOAD A SINGLE FILE #### 32 | #********************************************************************************************************** 33 | 34 | # Load data 35 | (data <- read_csv("location1.csv", 36 | # skip the first rows and keep the column names 37 | skip = 3, col_names = TRUE, 38 | # make all the variables character 39 | # I like to import everythign char unless I know there will be no typos 40 | col_types = cols(.default = "c")) 41 | ) 42 | 43 | # STEP 1 44 | # Fix the vbl names and remove the column for propotions as those can be calculated later. 45 | # I prefer to use the given names and change them at the end after data merging if that is needed. 46 | (data1 <- data %>% 47 | # This removes the proportion columns. 48 | select(-starts_with("X")) %>% 49 | # This is a quick fix for the names starting with numbers 50 | # I chose to remove the first col (X1) because I know after first inspection that the variable names should equate the row 'names' 51 | # I use janitor here because is less labour intensive. 52 | janitor::clean_names() %>% 53 | # shortens the names a bit while keeping enough characters so there are no 2 cols with the same name (e.g. 
missing) 54 | set_names(., ~str_sub(.x, 1,7)) 55 | ) 56 | 57 | # STEP 2 58 | # Now I Fix the rows that were imported with NA as those were empty. 59 | # Here need to be careful. could use 'complete.cases' (or janitor:: remove_empty) but in some cases the NA is displaced (cells merged) and replaced by '0'. 60 | # this means that I can't just remove rows that are not-complete. 61 | (data2 <- data1 %>% 62 | mutate(na.per.row = rowSums(is.na(data1))) %>% 63 | # I could filter only those with one NA, but want to be conservative in case other daset have more problems here. 64 | # so, I keep onl those where the total na is half of the numbere of variables. 65 | filter(na.per.row <= ncol(data1)/2) %>% 66 | select(-na.per.row) 67 | ) 68 | 69 | # STEP 3 70 | # Add the column with the variables/categories. 71 | (data3 <- bind_cols(row.name = c("Count", names(data2), "total"), data2) 72 | ) 73 | 74 | # ADDITIONAL 75 | # Here you can remove the 'count' and 'total' rows. and also change the NA for transgender from NA to '0'. 76 | # Here you can also change the names 77 | # If you do this for all 3 files and ensure names are consistent (even if you don't have the same columns for all files), 78 | # you can use dplyr::bind_rows() to merge them all (assuming you want them stacked / long format). 79 | # When using bind_row or bind_cols for this, I like to use the option '.id' so I get a variable indicating the dataset the data comes from. 80 | 81 | 82 | #********************************************************************************************************** 83 | # LOAD 3 FILES #### 84 | #********************************************************************************************************** 85 | 86 | # Using the same process I generate a function. 87 | # Note I did the above for al the individual files bu show only one. 88 | rm(list = ls()) 89 | 90 | # Store the name of the csv files to load. (change the path if you put them in a folder. 
For now they are in my wd)
files_list <- c("location1.csv",
                "location2.csv",
                "location3.csv")

# Read one cross-tabulation CSV and return it as a rectangular character tibble.
#
# Steps: skip the 3 junk header rows, drop the unnamed proportion columns
# (imported as "X..."), clean the column names, drop rows that are mostly
# empty, and prepend a 'row.name' column so the row labels survive the import.
#
# @param file Path to a CSV exported from one of the Excel tabs.
# @return A tibble of character columns with a leading 'row.name' column
#   whose first and last entries are "count" and "total".
load_data <- function(file) {
  raw <- read_csv(file,
                  skip = 3, col_names = TRUE,
                  # import everything as character to avoid bad type guesses
                  col_types = cols(.default = "c"))

  named <- raw %>%
    # drop the proportion columns; proportions can be recomputed later
    select(-starts_with("X")) %>%
    janitor::clean_names()

  # Drop rows that are mostly NA. Merged cells can displace a single NA
  # (sometimes replaced by '0'), so we cannot simply require complete cases;
  # instead keep rows with at most half the columns missing.
  kept <- named %>%
    mutate(na.per.row = rowSums(is.na(named))) %>%
    filter(na.per.row <= ncol(named) / 2) %>%
    select(-na.per.row)

  # Label the rows: the variable names equate to the row 'names', with a
  # leading "count" row and a trailing "total" row. Returned explicitly
  # (the original relied on the invisible value of an assignment).
  bind_cols(row.name = c("count", names(kept), "total"), kept)
}

# This merges everything into one dataset; the '.id' argument records which
# file (location) each row came from.
data_all <- map_df(files_list, load_data, .id = "location")
# Alternatively merge as a list and then inspect elements with 'pluck'.
data_all_l <- map(files_list, load_data)
# To see location 3, for example:
data_all_l %>% pluck(3)


#**********************************************************************************************************
# HOUSEKEEPING? ####
#**********************************************************************************************************

# At this point I rename the variables by column and row.
# By inspection I know where the two variables named 'missing' and
# 'missing_1' are; I just need to be careful there.
# I'll work from the rectangular data created in the previous step.

# Note: the following steps can be reduced, but I prefer to 'see' them
# step by step at first.

# STEP 1
# First get rid of the 'count' and 'total' rows.
data_all1 <- data_all %>%
  filter(!row.name %in% c("count", "total"))

# STEP 2 ROW NAMES
# I normally want to establish my variable names in a codebook, but I also find this an easier way...
138 | # I extract the names of all the variables form the variable 'row.name' because this will have all the variables even if it is repeated. 139 | # I want the unique values because I will use join and need unique 'keys' (key will be vbl name in this case) 140 | row.vbls <- as.data.frame(unique(data_all1$row.name)) 141 | # Save this in an CSV 142 | write_csv(row.vbls, path = "vbls_all.csv") 143 | # Open the csv generated before put the new name next to each old name, then save as csv [not shown here] 144 | # Load and print so 145 | row.vbls <- read_csv("vbls_all_new2.csv") 146 | row.vbls 147 | # Now that I have the old and the nw name I will merge this with my data 148 | # I do a right_join instead of left because I want the first two rows to be printed on the left of the screen 149 | (data_all2 <- right_join(row.vbls, data_all1, by = c("old.name" = "row.name"))) 150 | # I use 'view' and check that the match is ok 151 | # I also check I still have 32 rows (no duplication occurred) 152 | 153 | # STEP 2 COL NAMES 154 | # Now that I checked that the re-name is OK I delet the col for old name. 155 | data_all3 <- data_all2 %>% select(-old.name) 156 | # Extract the variable names and matche them with the new names 157 | # Check that the number and the name is correct. See it preserved the order. 
158 | (col.vbls <- as.data.frame(names(data_all3)) %>% 159 | rename(old.name = "names(data_all3)") %>% 160 | left_join(row.vbls) 161 | ) 162 | # Pull the names out as vector 163 | col.vbls <- col.vbls %>% 164 | filter(!is.na(new.name)) %>% 165 | pull(new.name) 166 | # Re-names 167 | names(data_all3) <- c("new.name", "location", col.vbls) 168 | (data_all4 <- data_all3 %>% 169 | select(location, new.name, 170 | starts_with("age_"), 171 | starts_with("sex"), 172 | starts_with("ms"), 173 | starts_with("wk")) 174 | ) 175 | # Pending 1: fix the NA value for non-binary gender 176 | # Pending 2: format the variable as numerical (not char) 177 | 178 | #***************************************************** 179 | 180 | 181 | -------------------------------------------------------------------------------- /August 2020 Puzzle/Martina Cusinato/AUG 2020_PUZZLE_2_Ben.R: -------------------------------------------------------------------------------- 1 | # I first transform each of the excel files/tabs I will use in CSV and file them. 2 | # I normally file these under subdirectory calles raw_data_csv, but for now will save them in the working directory (note file location for csv) 3 | # I prefer changing to csv before importing because it is pretty easy and ensures consistency in the way characters are entered. 4 | # I do this by clincking 'save as' and then chaning the format as CSV. Each tab or each file needs to be saved separately. 5 | 6 | # Load libraries 7 | library(tidyverse) 8 | library(janitor) 9 | 10 | # Load the csv 11 | data1 <- read_csv("referencetablesworkbook2.csv", 12 | # skip the first rows and keep the column names 13 | skip = 4, col_names = TRUE, 14 | # make all the variables character or remove the code below for R to 'guess' though risky - in my view 15 | col_types = cols(.default = "c") 16 | # the code below selects the columns you wanted. 
17 | )[ ,c(1:5,31:35)] %>% 18 | janitor::clean_names() %>% 19 | rename(cause = "x1", 20 | sex = "x2", 21 | geo.type = "x3", 22 | area.code = "x4", 23 | area.name = "x5") %>% 24 | select(-x33) 25 | 26 | head(data1) 27 | # Then use filter or slice to get different subsets of info. 28 | 29 | -------------------------------------------------------------------------------- /August 2020 Puzzle/Martina Cusinato/vbls_all_new2.csv: -------------------------------------------------------------------------------- 1 | old.name,new.name 2 | x15_19,age_15_19 3 | x20_29,age_20_29 4 | x30_39,age_30_39 5 | x40_49,age_40_49 6 | x50_59,age_50_59 7 | x60_69,age_60_69 8 | female,sex_f 9 | male,sex_m 10 | transgender_non_binary,sex_nb 11 | married_or_living_together,ms_marr 12 | never_married,ms_never 13 | other,ms_other 14 | missing,age_miss 15 | missing_1,ms_miss 16 | employed,wk_emp 17 | student_retired_unemployed,wk_unemp -------------------------------------------------------------------------------- /August 2020 Puzzle/ReadMe.md: -------------------------------------------------------------------------------- 1 | # August 2020 puzzle 2 | 3 | ## Ben's problem: reading excel spreadsheets efficiently into R: relatively simple example. 4 | 5 | I find I have recently had to read data typically from multi-sheet files, often from ONS. Format is variously in .xls , .xlsx. or (single-sheet files) .csv. I usually want data only on a few columns and selected rows. The software I have found for .xls/.xlxs files (readxl) seems to require me to count the lines before the data proper starts or specify the entire range of the data, and tends to leave numerical columns as characters. OK to sort out myself once or twice, but I thought that better guessing by software might be possible. Ideally I thought it would also be good to be able to select columns and sub-groups of rows (by value of one of the columns) at the time of reading – but perhaps those are best selected as separate stages. 
6 | 7 | Example: COVID deaths from ONS by Local Authority (I also needed this by MSOA, but we will leave that), which are on Table 2 of: 8 | 9 | https://www.ons.gov.uk/file?uri=%2fpeoplepopulationandcommunity%2fbirthsdeathsandmarriages%2fdeaths%2fdatasets%2fdeathsinvolvingcovid19bylocalareaanddeprivation%2f1march2020to30june2020/referencetablesworkbook2.xlsx 10 | 11 | I only needed the first 5 character/factor columns and the last 4 numerical ones (and only for COVID-19, both sexes combined). One issue with this file, and a common one generally, is that column names in the excel file are effectively spread over two rows (super- and sub-heading; here Month and deaths/rate/etc). 12 | 13 | ## Jocelyn's problem: reading excel spreadsheets efficiently into R: complicated example 14 | 15 | See the Excel workbook above 16 | 17 | What makes it complicated is: 18 | 19 | 1. Sheets within a workbook 20 | 2. Excel formatting (merged cells) 21 | 3. Multiple column titles in each sheet 22 | 4. Column titles for each sheet can be distinct 23 | 5. Rows for each sheet can be distinct 24 | 6.
Non-specific column titles (Missing) 25 | -------------------------------------------------------------------------------- /Dates.R: -------------------------------------------------------------------------------- 1 | 2 | # Make some simple date to play with 3 | d <- data.frame(sdate = c("2019-12-20", "2018-12-20")) 4 | d$date <- as.Date(d$sdate, format = "%Y-%m-%d") 5 | head(d) 6 | class(d$date) 7 | class(d$sdate) 8 | 9 | # Obviously this won't work 10 | d$datebefore <- ifelse(d$date >= "2019-12-19", d$date, "2019-12-19") 11 | head(d) 12 | class(d$datebefore) 13 | class(as.Date("2019-12-19", format = "%Y-%m-%d")) 14 | 15 | # This doesn't work 16 | d$datebefore <- ifelse(d$date >= as.Date("2019-12-19"), 17 | d$date, 18 | as.Date("2019-12-19")) 19 | head(d) 20 | class(d$datebefore) 21 | 22 | # Using if_else in dplyr 23 | library(dplyr) 24 | d$datebefore <- if_else(d$date >= as.Date("2019-12-19"), 25 | d$date, 26 | as.Date("2019-12-19")) 27 | head(d) 28 | class(d$datebefore) 29 | 30 | -------------------------------------------------------------------------------- /December 2020 Puzzle/Interactions.R: -------------------------------------------------------------------------------- 1 | # RUGs Interaction Puzzle 2 | # 21 Jan 2021 3 | # Calum Davey 4 | 5 | library(multcomp) # (1) 6 | library(Epi) # (2) 7 | library(emmeans) # (3) 8 | 9 | # Generate simple dataset with interaction 10 | # ======================================== 11 | 12 | N <- 1000 13 | data2 <- data.frame( 14 | f = rbinom(n = N, size=1, prob=0.5), # A factor exposure (e.g. treatment) 15 | x = rnorm(n = N, 10, 2), # A continuous exposure (e.g. 
age) 16 | e = rnorm(n = N, 0, 5)) # Random error 17 | 18 | # Set the model parameters 19 | Alpha = 10 20 | Beta1 = .2 21 | Beta2 = 1.3 22 | Beta3 = 1.4 23 | 24 | # Generate the dependent variable 25 | data2$y = Alpha + Beta1*data2$f + Beta2*data2$x + Beta3*data2$f*data2$x + data2$e 26 | 27 | # Reclass f as factor 28 | data2$f <- as.factor(data2$f) 29 | 30 | # Fit the model 31 | # ============= 32 | 33 | model <- lm(y ~ f*x, data = data2) 34 | summary(model) 35 | 36 | # Return the interaction 37 | # ====================== 38 | 39 | # (1) Calum + Will + Saravanakumar 40 | confint(multcomp::glht(model, linfct="x + f1:x = 0")) 41 | 42 | # (2) Antonio 43 | model2 <- lm(y ~ x:f + f, data=data2) 44 | Epi::ci.lin(model2) 45 | 46 | # (3) Paul, https://stats.idre.ucla.edu/r/seminars/interactions-r/#s4a 47 | emmeans::emtrends(model, ~ f, var = "x") 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /February 2021/how_to_learn_R.md: -------------------------------------------------------------------------------- 1 | ## General resources 2 | 3 | (Julia) Just adding to chat some resources I'd really recommend for learning R, if you are within the RStudio + Tidyverse paradigm: 4 | 5 | - https://r4ds.had.co.nz/ 6 | - https://rstudio.com/resources/cheatsheets/ 7 | - https://www.statlearning.com/ (not specific to Tidyverse, excellent as a practical open textbook walkthrough on various ML methods) 8 | - https://otexts.com/fpp2/ 9 | 10 | (Julia) At some point I will probably compile the above into a running list on GitHub (like this one https://github.com/7j7j/lshtm-rd-resources), maybe we should set this up with RUsers group (https://github.com/lshtm-rug) and/or CHIL-R (LSHTM health economists') GitHub (https://github.com/LSHTM-CHIL) 11 | 12 | There's also this R for Stata Users book but I have not used it myself: https://www.springer.com/gp/book/9781441913173 13 | 14 | (Naomi) "writing R is like writing a paper, you wouldn't submit 
your first draft" [write the first draft in notes only, and then a rough version, and improve from there] 15 | 16 | French book https://link.springer.com/book/10.1007/978-2-8178-0535-1 17 | 18 | Interesting presentation from Jeroen Ooms on assessing the quality of open source projects in general and R packages in particular https://rstudio.com/resources/rstudioglobal-2021/monitoring-health-and-impact-of-open-source-projects/ 19 | 20 | For absolute beginners (not this group) SWIRL is useful https://swirlstats.com/ 21 | 22 | I also use this all the time - https://www.r-graph-gallery.com/ 23 | 24 | Good overview of packages in different domains - which I only just became aware of and wish I'd found earlier https://cran.r-project.org/web/views/ 25 | 26 | ## Data wrangling 27 | 28 | (Julia) What I mean by data wrangling is how you go from relatively unstructured raw data to your specific analytical spine (https://en.wikipedia.org/wiki/Data_wrangling) 29 | 30 | Other resources for this in R: 31 | https://www.experfy.com/training/courses/data-wrangling-in-r (I used some of my PhD training funds for this) 32 | 33 | Alternatives that are probably good too: 34 | - https://www.pluralsight.com/paths/data-wrangling-with-r 35 | - http://uc-r.github.io/data_wrangling 36 | - https://www.edx.org/course/data-science-wrangling 37 | - https://www.udemy.com/course/learn-data-wrangling-with-r/ 38 | 39 | ... and others exist 40 | 41 | 42 | ## Multiple imputation 43 | 44 | Rdocumentation.org in general is a great resource, e.g. 
for mice as Andrei is discussing: https://www.rdocumentation.org/packages/mice/versions/3.6.0/topics/mice 45 | 46 | (Example of a vignette with mice: https://amices.org/mice/ -- great package for missing data multiple imputation in general) 47 | 48 | ## Community 49 | 50 | Also really recommend peer learning resources like R Ladies London: https://www.meetup.com/rladies-london/ 51 | 52 | and similarly across R/Python/Julia etc: https://www.meetup.com/London-Data-Science-ODSC/ 53 | 54 | Per Andrei's point, nice example of StackExchange thread: https://stats.stackexchange.com/questions/25025/how-to-build-a-linear-mixed-effects-model-in-r 55 | 56 | Example NHS R conference: https://nhsrcommunity.com/events/nhs-r-virtual-conference-2020/ 57 | 58 | Also as a PhD student you can use the stats Helpdesk to get some input 59 | 60 | Blogs, e.g. https://www.r-bloggers.com/ 61 | 62 | Naomi and Sam's great software chatter initiative: https://github.com/cmmid/SoftwareChatter 63 | 64 | another great resource is the Recon project (partly run by Thubaut Jombart) which focuses on R and epidemics: https://www.reconlearn.org/ (lots of cases studies and resources here) 65 | 66 | ## Loops 67 | 68 | (Julia) Oh man, I'm still quite bad at loops but some resources with vignette code where purrr and dplyr in Tidyverse are specifically meant for this quite often (where 'across' and 'apply' are lifesavers): 69 | https://bookdown.org/ansellbr/WEHI_tidyR_course_book/automating-your-work.html 70 | 71 | https://datacarpentry.org/semester-biology/materials/for-loops-R/ 72 | 73 | https://dplyr.tidyverse.org/articles/programming.html 74 | 75 | Working with new packages: 76 | 77 | - Copy own function, add a break to see what it is doing 78 | - Use fake data 79 | 80 | - apply https://www.r-bloggers.com/2009/09/r-function-of-the-day-tapply-2/ 81 | - map https://www.rdocumentation.org/packages/purrr/versions/0.2.5/topics/map 82 | 83 | 84 | ## Other languages 85 | Reticulate is a package by RStudio to 
call Python from R (interoperability between the two languages):
# Booth test function for 2D optimization demos.
# Global minimum is 0, attained at (x1, x2) = (1, 3).
# Vectorized: accepts equal-length numeric vectors for x1 and x2.
booth <- function(x1, x2) {
  term1 <- x1 + 2 * x2 - 7
  term2 <- 2 * x1 + x2 - 5
  term1^2 + term2^2
}
outer(xseq, xseq, function(x, y) (cbind(x, y) %*% t(Cmat)) <= 0) 71 | image(xseq, xseq, ccontour, col = adjustcolor(c(2, 3), .5), 72 | breaks = c(-1, .5, 1), xlab = "x1", ylab = "x2") 73 | contour(xseq, xseq, y, add = T, levels = 0:12 * 200) 74 | points(resbooth$par[1], resbooth$par[2], pch = 16, cex = 2) 75 | 76 | # Performing constrained optimization 77 | constr_res <- constrOptim(c(-10, -10), f = booth_vec, grad = NULL, 78 | ui = -Cmat, ci = 0) 79 | constr_res 80 | 81 | # Adding it to the plot 82 | points(constr_res$par[1], constr_res$par[2], pch = 16, cex = 2) 83 | text(constr_res$par[1], constr_res$par[2], 84 | sprintf("(%1.1f,%1.1f)", constr_res$par[1], constr_res$par[2]), 85 | pos = 3, offset = 1) 86 | 87 | 88 | #-------------------------------------- 89 | # Part 2 - Where I talk about my work: constrained splines 90 | #-------------------------------------- 91 | 92 | # Flexible way of doing nonlinear regression 93 | 94 | #----- Let's simulate a function with data 95 | set.seed(1) 96 | x <- sort(c(runif(98, 0, 10), 0, 10)) 97 | y <- (x - 3)^2 + rnorm(100, 0, 10) 98 | 99 | # Plot 100 | plot(x, y, main = "Scatterplot of x and y", col = grey(0.7), pch = 19) 101 | lines(x, (x - 3)^2, lty = 2, lwd = 2) 102 | 103 | #----- What are splines ? 
104 | # Splines are polynomial bases applied to x 105 | spline_mat <- dlnm::ps(x, int = T) 106 | str(spline_mat) 107 | 108 | # We now have 10 variables called bases 109 | matplot(x, spline_mat, type = "l", ylab = "Spline bases") 110 | 111 | # To approximate the function, these bases are used as new variables in a regression model 112 | lmres <- lm(y ~ 0 + spline_mat) 113 | coefs <- coef(lmres) 114 | 115 | # I multiply each basis by its associated coefficient 116 | coefmat <- mapply("*", as.data.frame(spline_mat), coefs) 117 | 118 | matplot(x, coefmat, type = "l", ylab = "Spline basis * coef", ylim = range(coefs)) 119 | points(x[apply(spline_mat, 2, which.max)], coefs, type = "h", col = 1:6) 120 | text(x[apply(spline_mat, 2, which.max)], coefs, round(coefs, 1), pos = 3, col = 1:6, xpd = T) 121 | 122 | # Resulting curve: we add all bases * coef 123 | lm_curve <- spline_mat %*% coefs 124 | lines(x, lm_curve, lwd = 2) 125 | 126 | # Let's compare to the true curve 127 | plot(x, y, main = "Scatterplot of x and y", col = grey(0.7), pch = 19) 128 | lines(x, (x-3)^2, lty = 2, lwd = 2) 129 | lines(x, lm_curve, lwd = 2, col = 2) 130 | 131 | #----- What does this have to do with optimization? 
132 | 133 | # Fitting a regression model is an optimization problem: minimizing least squares 134 | # Let's define a least squares function 135 | lsfun <- function(coefs) sum((y - spline_mat %*% coefs)^2) 136 | # Optimization is easier with the gradient of the function 137 | lsgr <- function(coefs) -2 * t(spline_mat) %*% y + 2 * crossprod(spline_mat) %*% coefs 138 | 139 | # we can use this function in a general purpose optimizer 140 | opt <- optim(rep(0, 10), fn = lsfun, gr = lsgr, method = "BFGS") 141 | 142 | opt$par 143 | coefs 144 | 145 | opt_curve <- spline_mat %*% opt$par 146 | lines(x, opt_curve, col = 4, lty = 2, lwd = 2) 147 | 148 | #----- Constrained splines 149 | # We want to fit a curve that is monotone increasing 150 | # This mean that the spline coefficients must also increase (the beauty of B-splines) 151 | # e.g. if we use coefs 152 | ex_coef <- 1:10 153 | ex_coefmat <- mapply("*", as.data.frame(spline_mat), ex_coef) 154 | ex_curve <- spline_mat %*% ex_coef 155 | 156 | # We plot these coefs with the resulting curve: we obtain a straight curve 157 | matplot(x, ex_coefmat, type = "l", ylab = "Spline basis * coef", 158 | ylim = range(ex_coef)) 159 | points(x[apply(spline_mat, 2, which.max)], ex_coef, type = "h", col = 1:6) 160 | text(x[apply(spline_mat, 2, which.max)], ex_coef, round(ex_coef, 1), 161 | pos = 3, col = 1:6, xpd = T) 162 | lines(x, ex_curve, lwd = 2) 163 | 164 | # Another example but with coefficient increasing quadraticaly 165 | ex2_coef <- (1:10)^2 166 | ex2_coefmat <- mapply("*", as.data.frame(spline_mat), ex2_coef) 167 | ex2_curve <- spline_mat %*% ex2_coef 168 | 169 | # We now obtain nice quadratic curve 170 | matplot(x, ex2_coefmat, type = "l", ylab = "Spline basis * coef", 171 | ylim = range(ex2_coef)) 172 | points(x[apply(spline_mat, 2, which.max)], ex2_coef, type = "h", col = 1:6) 173 | text(x[apply(spline_mat, 2, which.max)], ex2_coef, round(ex2_coef, 1), 174 | pos = 3, col = 1:6, xpd = T) 175 | lines(x, ex2_curve, lwd = 2) 176 | 
# To obtain an increasing curve we optimize least-squares
"n") 225 | 226 | #----- Convex curve 227 | # The true curve is actually convex 228 | # We can force our splines to be convex 229 | # This means constraining the second differences 230 | Cmat2 <- diff(diag(10), diff = 2) 231 | Cmat2 232 | 233 | # Now let's use it in the QP 234 | qpres2 <- solve.QP(Dmat, dvec, t(Cmat2)) 235 | 236 | qpcurve2 <- spline_mat %*% qpres2$solution 237 | 238 | plot(x, y, main = "Scatterplot of x and y", col = grey(0.7), pch = 19) 239 | lines(x, (x-3)^2, lty = 2, lwd = 2) 240 | lines(x, opt_curve, lwd = 2, col = 4) 241 | lines(x, qpcurve2, lwd = 2, col = 6) 242 | legend("topleft", c("True", "Unconstrained", "Convex"), 243 | lwd = 2, col = c(1, 4, 6), lty = c(2, 1, 1), bty = "n") 244 | 245 | 246 | 247 | ################# 248 | # Some material 249 | ################# 250 | 251 | #----- To understand the basic concepts: 252 | # Boyd, S., Vandenberghe, L., 2004. Convex Optimization, 1 edition. ed. Cambridge University Press, Cambridge, UK; New York. 253 | 254 | #----- For details about the different optimization algorithms: 255 | # Nocedal, J., Wright, S., 2006. Numerical Optimization. Springer Science & Business Media. 
256 | 257 | # CRAN task view on optimization: https://cran.r-project.org/web/views/Optimization.html 258 | 259 | #----- Some useful functions for specific applications 260 | # Linear objective functions (linear programming): lpSolve 261 | # Quadratic objective functions (quadratic programming): quadprog 262 | # Non-linear least-squares: nls/nls2 263 | # Stochastic optimization useful for non-convex problems: GA -------------------------------------------------------------------------------- /July 2020 Puzzle/July2020PuzzleSolutions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "July 2020 Puzzle" 3 | author: "Calum Davey" 4 | date: "25/07/2020" 5 | output: 6 | html_document: 7 | theme: readable 8 | editor_options: 9 | chunk_output_type: console 10 | --- 11 | 12 | ```{r setup, include=FALSE} 13 | knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE) 14 | ``` 15 | 16 | ## Puzzle 17 | 18 | I have a dataset with 5 different x values and the lower limit of the difference from x = 8. (limit always = 0 when x = 8) 19 | 20 | I would like a function to get the lowest x where limit < 0.05 and all of the previous limits in the sequence to 8 are also < 0.05. 21 | 22 | I don't mind if the function returns a single value or a dataset with one row. Example datasets and expected answers are below. 
# FUNCTION fweird WITH ARGUMENTS:
# - x: VALUES
# - limit: CORRESPONDING DECIMALS
# - thr: THRESHOLD FOR DECIMAL
# - res: RETURN VALUE ("val") OR POSITION ("ind")
# NB: RETURNS NA IF NONE IS ELIGIBLE
# Fixes: error() is not a base R function (replaced with stop());
# the threshold was hard-coded as 0.05, silently ignoring 'thr'.
fweird <- function(x, limit, thr = 0.05, res = "val") {
  # CHECK CONSISTENCY OF ARGUMENTS AND THEIR VALUE
  if (length(x) != length(limit))
    stop("'x' and 'limit' must have the same length")
  res <- match.arg(res, c("val", "ind"))

  # IDENTIFY WHICH DECIMALS ARE AT OR ABOVE THE THRESHOLD
  # (now honours the 'thr' argument instead of a hard-coded 0.05)
  not <- which(limit >= thr)

  # IDENTIFY THE INDEX: the element just before the first violation,
  # the last element if nothing violates, or NA if the very first does
  # (scalar if/else instead of nested ifelse(), which is for vectors)
  if (length(not) == 0L) {
    ind <- length(x)
  } else if (not[1] == 1L) {
    ind <- NA_integer_
  } else {
    ind <- not[1] - 1L
  }

  # RETURN NA IF NONE IS ELIGIBLE
  if (is.na(ind)) return(NA)

  # RETURN EITHER THE VALUE OR THE INDEX
  if (res == "val") x[ind] else ind
}
# Alex's solution: count the leading rows whose limit stays below the
# threshold, then return the x value at that position.
# cumsum(data$limit >= threshold) is 0 until the first violation; sign()
# turns it into 0/1, so sum(1 - sign(...)) counts rows before the first
# violation (or all rows if there is none).
# Fix: the body now uses the 'threshold' argument — it was hard-coded to
# 0.05, so any user-supplied threshold was silently ignored.
puzzlefunction <- function(data, threshold = 0.05) {
  index <- sum(1 - sign(cumsum(data$limit >= threshold)))
  return(data$x[index])
}
times = 1000 178 | ) 179 | 180 | barplot(summary(m)$median, 181 | names = c( 'Calum', 'Antonio', 'Darren', 'Clemence', 'Alex'), 182 | ylab = 'Median time in ns') 183 | 184 | ``` 185 | 186 | -------------------------------------------------------------------------------- /July 2020 Puzzle/JulyPuzzle.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/July 2020 Puzzle/JulyPuzzle.zip -------------------------------------------------------------------------------- /July 2020 Puzzle/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/July 2020 Puzzle/results.png -------------------------------------------------------------------------------- /Plots March 2020/Amy/GGplot example AM.R: -------------------------------------------------------------------------------- 1 | 2 | #----------------------# 3 | # GGplot show and tell # 4 | #----------------------# 5 | 6 | require(tidyverse) 7 | require(broom) 8 | require(ggforce) 9 | 10 | #------# 11 | # Data # 12 | #------# 13 | 14 | set.seed(123) 15 | 16 | df1 <- data.frame( weight = rnorm(500), 17 | sex = runif(500)>0.5, 18 | SES = rnorm(500), 19 | yob = sample(1:10, 500, replace = TRUE), 20 | trt1 = runif(500)>0.7, 21 | trt2 = runif(500)>0.2, 22 | trt3 = runif(500)>0.9, 23 | pa = runif(500)>0.5, 24 | staph = runif(500)>0.4) 25 | 26 | 27 | df1$y <- with(df1, 0.3*weight + 0.5*sex - 0.2*SES + 28 | 0.1*yob + 0.01*trt1 + 0.8*trt2 - 0.4*trt3 -0.5*pa + rnorm(500, sd = 0.7) ) 29 | 30 | #-------------------------------# 31 | # Model + point estimates + CI # 32 | #-------------------------------# 33 | 34 | mod1 <- lm(y ~ weight + sex + SES + yob + trt1 + trt2 + trt3 + pa + staph, data = df1) 35 | 36 | # Pt estimates + confidence intervals 37 | 38 | coefs <- tidy(mod1, conf.int = TRUE) 
39 | coefs <- subset(coefs, term!="(Intercept)") # remove intercept 40 | 41 | # Add nicer names 42 | 43 | coefs$names <- c( "Weight (kg)", "Sex = FEMALE", 44 | "SES (IMD z-score)", "Year of birth (unit = 1 year)", 45 | "Treatment A", "Treatment B", "Treatment C", 46 | "Bronchitis", "Pneumonia") 47 | 48 | # Group the variables 49 | 50 | coefs$type = c("Weight at age 1 year", 51 | rep("Demographics", 3), 52 | rep("Clinical", 3), 53 | rep("Infections", 2)) 54 | 55 | 56 | # Gather point estimates and CIs 57 | 58 | coefs$display_ests <-sprintf("%.2f [%.2f, %.2f]" , coefs$estimate, coefs$conf.low, coefs$conf.high) 59 | 60 | #-------------------# 61 | # Plot 1: no groups # 62 | #-------------------# 63 | 64 | 65 | 66 | ggplot( coefs, aes(estimate, names )) + 67 | geom_point(aes())+ 68 | geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0, size = 0.5)+ 69 | geom_text(aes(x = Inf, label = display_ests), vjust = -0.5, colour = "black", hjust = -0.1, size = 3.5) + 70 | coord_cartesian(clip = "off") + 71 | geom_vline(xintercept = 0, linetype = 2)+ 72 | theme_bw(base_size = 12)+ 73 | theme(plot.margin = unit(c(0,4,0,0), "cm")) + 74 | labs(x = "Lung function (FEV1%)",y = NULL, title = NULL)+ 75 | facet_col(~type, scale = "free_y", space = "free") 76 | 77 | 78 | #------------------------# 79 | # Plot 2: order by group # 80 | #------------------------# 81 | 82 | 83 | coefs$type <- factor(coefs$type, ordered = TRUE, levels = c("Weight at age 1 year", "Demographics", "Clinical", "Infections")) 84 | coefs <- coefs[ order(coefs$type, coefs$estimate), ] 85 | coefs$names <- factor( coefs$names, ordered = TRUE, levels = rev(coefs$names) ) 86 | 87 | 88 | ggplot( coefs, aes(estimate, names )) + 89 | geom_point()+ 90 | geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0, size = 0.5)+ 91 | geom_text(aes(x = Inf, label = display_ests), vjust = -0.5, colour = "black", hjust = -0.1, size = 3.5) + 92 | coord_cartesian(clip = "off") + 93 | geom_vline(xintercept = 0, 
linetype = 2)+ 94 | theme_bw(base_size = 12)+ 95 | theme(plot.margin = unit(c(0,4,0,0), "cm")) + 96 | theme(axis.title.x = element_text(face = "bold", size = 15)) + 97 | labs(x = "Lung function (FEV1%)",y = NULL, title = NULL)+ 98 | facet_col(~type, scale = "free_y", space = "free") 99 | 100 | 101 | #-----------------------------# 102 | # Plot 3: Colour by std error # 103 | #-----------------------------# 104 | 105 | ggplot( coefs, aes(estimate, names )) + 106 | geom_point(aes( colour = 1/std.error))+ 107 | geom_errorbarh(aes(xmin = conf.low, xmax = conf.high, colour = 1/std.error), height = 0, size = 0.5)+ 108 | geom_text(aes(x = Inf, label = display_ests), vjust = -0.5, colour = "black", hjust = -0.1, size = 3.5) + 109 | coord_cartesian(clip = "off") + 110 | geom_vline(xintercept = 0, linetype = 2)+ 111 | theme_bw(base_size = 12)+ 112 | theme(plot.margin = unit(c(0,4,0,0), "cm")) + 113 | theme(axis.title.x = element_text(face = "bold", size = 15)) + 114 | labs(x = "Lung function (FEV1%)",y = NULL, title = NULL)+ 115 | facet_col(~type, scale = "free_y", space = "free") + 116 | scale_colour_gradient(low = "grey60", high = "black") + 117 | theme(legend.position = "none") 118 | 119 | -------------------------------------------------------------------------------- /Plots March 2020/Calum/cascade_plot.R: -------------------------------------------------------------------------------- 1 | # HIV treatment serrated cascade plot 2 | # Calum Daveu 3 | # LSHTM 4 | # 26 MAR 2020 5 | 6 | # Change working directory 7 | setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) 8 | 9 | # Read the data 10 | d <- read.csv('data.csv') 11 | 12 | # Set some arbitrary x-values 13 | xs <- c(0.01,.1,0.01,.1,.11,.2,.11,.2,.21,.3,.21,.3,.31,.4,.31,.4) 14 | 15 | # Set values for the point types 16 | pt <- rep(c(1,19),each=2,times=nrow(d)/4) 17 | 18 | # Create the plot and the points 19 | plot(xs, d$p, 20 | ylim=c(0,1), xlim=c(0.01,.4), yaxt="n", xaxt="n", pch=pt, bty="n", 21 | xlab="", 
ylab="Means of RDS-adjusted proportions in all women", cex.lab=.7) 22 | 23 | # Add the sloping lines 24 | for (r in c(1,3,5,7,9,11,13,15)){ 25 | lines(xs[r:(r+1)], d$p[r:(r+1)], type="l", lwd=1.7) 26 | } 27 | 28 | # Add the vertical lines 29 | for (r in c(1:16)){ 30 | lines(c(xs[r],xs[r]),c(0,d$p[r]), lwd=.7) 31 | } 32 | 33 | # Add horizontal line at bottom 34 | lines(c(0.01,.4), c(0,0)) 35 | 36 | # Add the 90:90:90 lines 37 | base_hiv <- mean(d[d$outcome=='HIV' & d$year==2013,'p']) 38 | nnn <- c(base_hiv*.9, # First 90 for diagnosis 39 | base_hiv*.9*.9, # Second 90 for treatment 40 | base_hiv*.9*.9*.9) # Third 90 for suppression 41 | 42 | lines(c(.11,.2), c(nnn[1],nnn[1]), lty=5, lwd=.7) 43 | lines(c(.21,.3), c(nnn[2],nnn[2]), lty=5, lwd=.7) 44 | lines(c(.31,.4), c(nnn[3],nnn[3]), lty=5, lwd=.7) 45 | 46 | text(.2-.01,nnn[1]+.02,labels="90%", cex=.7) 47 | text(.3-.01,nnn[2]+.02,labels="81%", cex=.7) 48 | text(.4-.01,nnn[3]+.02,labels="73%", cex=.7) 49 | 50 | # Add the dates 51 | lines(c(xs[1],xs[1]),c(d$p[1],d$p[1]+.08), lty=3, lwd=.7) 52 | lines(c(xs[2],xs[2]),c(d$p[2],d$p[2]+.08), lty=3, lwd=.7) 53 | 54 | text(xs[1]+0.0125,d$p[1]+.1,labels="2013", cex=.6) 55 | text(xs[2]+0.0125,d$p[2]+.1,labels="2016", cex=.6) 56 | 57 | # Add axes and labels 58 | ap <- c(0,.2,.4,.6,.8,1) 59 | axis(2, at=ap, lab=paste0(ap * 100, "%"), las=TRUE, cex.axis=0.7) 60 | 61 | text(c(.05, .15, .25, .35, .55,.65),c(-.0193), 62 | labels=c("HIV +ve", "Know +ve", "On ART", "vl<1000c/ml"), cex=.7) 63 | 64 | # Add legend 65 | legend(0,.9,c("Sisters only arm", "Enhanced Sisters arm"),pch=c(1,19), bty="n", cex=.7) 66 | 67 | # Add title 68 | mytitle = "HIV-treatment cascade across 14 sites in Zimbabwe" 69 | mysubtitle = "Change between baseline and endline in each arm of SAPPH-IRe trial" 70 | mtext(side=3, line=2, at=-0.07, adj=0, cex=1, mytitle) 71 | mtext(side=3, line=1, at=-0.07, adj=0, cex=0.7, mysubtitle) -------------------------------------------------------------------------------- /Plots 
March 2020/Calum/data.csv: -------------------------------------------------------------------------------- 1 | "","year","outcome","arm","p" 2 | "1",2013,"HIV",0,0.606 3 | "2",2016,"HIV",0,0.594 4 | "3",2013,"HIV",1,0.556 5 | "4",2016,"HIV",1,0.578 6 | "5",2013,"Know pos.",0,0.399 7 | "6",2016,"Know pos.",0,0.464 8 | "7",2013,"Know pos.",1,0.351 9 | "8",2016,"Know pos.",1,0.462 10 | "9",2013,"On ART",0,0.278 11 | "10",2016,"On ART",0,0.385 12 | "11",2013,"On ART",1,0.23 13 | "12",2016,"On ART",1,0.398 14 | "13",2013,"Supp.",0,0.215 15 | "14",2016,"Supp.",0,0.333 16 | "15",2013,"Supp.",1,0.177 17 | "16",2016,"Supp.",1,0.343 18 | -------------------------------------------------------------------------------- /Plots March 2020/Lauren/Days_hours.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/Plots March 2020/Lauren/Days_hours.gif -------------------------------------------------------------------------------- /Plots March 2020/Lauren/hours and day.R: -------------------------------------------------------------------------------- 1 | 2 | hour_day <- read_dta("~/PhD work/DrinkLess app/16th April data all of it/hour_day.dta") 3 | View(hour_day) 4 | 5 | setwd("C:/Users/lsh1703883/Documents/PhD work/DrinkLess app/16th April data all of it/gganimate") 6 | 7 | install.packages("tidyverse") 8 | library(gganimate) 9 | 10 | p <- ggplot( 11 | hour_day, 12 | aes(x= hour, y= n, colour=factor(sincedownload_cat)) 13 | ) + 14 | geom_line() + 15 | scale_color_viridis_d() + 16 | labs(title= 'Number of Session from Day 1 to Day 30', x = " Hour", y = "Number of sessions") + 17 | theme(legend.position="top") + geom_point() + transition_reveal(sincedownload_cat) 18 | 19 | final_animation<-animate(p,100,fps = 20,duration = 30, width = 950, height = 750, renderer = gifski_renderer()) 20 | 21 | anim_save("./Days_hours.gif",animation=final_animation) 22 | 23 | 24 | 
View(gapminder) 25 | 26 | 27 | 28 | library(gapminder) 29 | 30 | ggplot(gapminder, aes(gdpPercap, lifeExp, size = pop, colour = country)) + 31 | geom_point(alpha = 0.7, show.legend = FALSE) + 32 | scale_colour_manual(values = country_colors) + 33 | scale_size(range = c(2, 12)) + 34 | scale_x_log10() + 35 | facet_wrap(~continent) + 36 | # Here comes the gganimate specific bits 37 | labs(title = 'Year: {frame_time}', x = 'GDP per capita', y = 'life expectancy') + 38 | transition_time(year) + 39 | ease_aes('linear') -------------------------------------------------------------------------------- /Plots March 2020/Lauren/session start heatmap.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | library(rayshader) 4 | 5 | library(viridis) 6 | 7 | setwd("C:/Users/lsh1703883/Documents/PhD work/DrinkLess app/16th April data all of it") 8 | getwd() 9 | 10 | # heatmap of when session starts 11 | 12 | library(haven) 13 | session_start_heatmap <- read_dta("session start heatmap.dta") 14 | View(session_start_heatmap) 15 | 16 | session_start_heatmap$day <- factor(session_start_heatmap$day, 17 | levels = c(1,2,3,4,5,6,0), 18 | labels = c( "Mon", "Tues","Wed","Thur","Fri","Sat","Sun")) 19 | 20 | names(session_start_heatmap)[4] <- "sessions" 21 | 22 | ggplot(session_start_heatmap) + 23 | geom_tile(aes(x=day, y=hour, fill=n, color=n),size=1,color="black") + 24 | ggtitle("Heatmap of when sessions begin") + 25 | scale_fill_gradient2(low="white", high="black", guide="colorbar") + 26 | labs(caption = "May 2017 to Jan 2019, excluding 21st August 2018") + 27 | theme(axis.text = element_text(size = 12), 28 | title = element_text(size = 12,face="bold"), 29 | panel.border= element_rect(size=2,color="black",fill=NA)) -> 30 | session_gg 31 | 32 | 33 | #filename_movie = tempfile() 34 | 35 | phivechalf = 30 + 60 * 1/(1 + exp(seq(-7, 20, length.out = 180)/2)) 36 | phivecfull = c(phivechalf, rev(phivechalf)) 37 | thetavec = 60 * 
sin(seq(0,359,length.out = 360) * pi/180) 38 | zoomvec = 0.45 + 0.2 * 1/(1 + exp(seq(-5, 20, length.out = 180))) 39 | zoomvecfull = c(zoomvec, rev(zoomvec)) 40 | 41 | 42 | plot_gg(session_gg, multicore = TRUE, width = 6, height = 5.5, scale = 300, 43 | background = "#afceff",shadowcolor = "#3a4f70") 44 | 45 | render_movie(filename = "session_graph.mp4", type = "custom", 46 | frames = 360, phi = phivecfull, zoom = zoomvecfull, theta = thetavec) -------------------------------------------------------------------------------- /Plots March 2020/Martina/PLOT html_doc.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "PLOTS" 3 | output: 4 | html_document: 5 | code_folding: show 6 | toc: TRUE 7 | toc_float: 8 | collapsed: TRUE 9 | --- 10 | ### Overview 11 | 12 | - **Study design:** Analytical, prospective dynamic cohort study. 13 | - **Setting:** The data was obtained from a pragmatic, multicentre, individually randomized clinical trial. 14 | - **Study population:** Adults (>18years), HIV-positive patients with no evidence of tuberculosis co-infection, admitted to medical wards at Zomba Central or Edendale Hospital between February 2015 and January 2018, and who provided consent to participate in the parent study. 15 | - **Aim: ** To assess whether ART-status, as indicator of (in)appropriate HIV treatment accessibility and management, is leading to higher death rates in hospitalised patients (with no evidence of tuberculosis co-infection) in two main referral hospitals in Malawi and South Africa. 16 | 17 | - **Outcome:** All-cause 56-day (8-week) mortality. 18 | - **Main exposure:** ‘ART-status’. 
A composite variable with three categories, based on ART at admission (yes/no) and CD4 level at admission (low/high) if patient is under ART 19 | + **(1) ART-High CD4** HIV-patients who have started ART and have a high count of CD4 at admission 20 | + **(2) ART-Low CD4** HIV-patients who have started ART and have a low count of CD4 at admission 21 | + **(3) No ART** HIV-patients who have not started ART prior admission 22 | 23 | ### Dependencies & global settings 24 | 25 | ```{r message=FALSE, warning=FALSE} 26 | # Packages needed 27 | library(tidyverse) 28 | library(gridExtra) 29 | library(knitr) 30 | library(survminer) 31 | library(survival) 32 | library(ggthemes) 33 | 34 | # Global settings 35 | knitr::opts_chunk$set(echo = TRUE) 36 | options(width = 100) 37 | 38 | ``` 39 | 40 | ### Upload / Prepare data 41 | 42 | ```{r} 43 | hiv <- read.csv(file="~/hiv/hiv07sep.csv", header = TRUE, sep = ",") 44 | 45 | hiv$cd4count <-as.numeric(hiv$cd4count) 46 | hiv$art3 <-as.factor(hiv$art3) 47 | hiv$art2 <-as.factor(hiv$art2) 48 | hiv$art1 <-as.factor(hiv$art1) 49 | hiv$hivdx00 <-as.numeric(hiv$hivdx00) 50 | hiv$age <-as.numeric(hiv$age) 51 | hiv$artyears2 <-as.numeric(hiv$artyears2) 52 | hiv$hiv_art_c2 <-as.factor(hiv$hiv_art_c2) 53 | 54 | # change the labels for fupreason 55 | levels(hiv$fupreason) 56 | # Initially used plyr::revalue because I incorrecly imported this as a factor 57 | # I want to correct this now so I can use dplyr only 58 | # So, I collaps two levels first and then recode the rest. 
59 | hiv$fupreason2 <- fct_collapse(hiv$fupreason, Censored = c("2.withdrew","3.lost_fup")) 60 | levels(hiv$fupreason2) 61 | hiv$fupreason2 <- fct_recode(hiv$fupreason2, Outcome = "1.death", Complete = "4.complete") 62 | # checking relabeling worked 63 | table(hiv$fupreason, hiv$fupreason2, useNA = "always") 64 | ``` 65 | 66 | ### Plots settings 67 | 68 | ```{r} 69 | fill <- "slategray" 70 | line <- "#1F3552" 71 | cate <- c("1.ART/highCD4" = "steelblue4", 72 | "2.ART/lowCD4" = "skyblue3", 73 | "3.noART" = "Grey60") 74 | 75 | t <-theme(panel.background = element_rect(fill = "white", colour = "grey"), 76 | panel.grid.minor.y = element_blank(), 77 | panel.grid.major.y = element_line(colour = "grey", size = 0.15), 78 | 79 | axis.ticks.x = element_blank(), 80 | axis.ticks.y = element_blank(), 81 | 82 | axis.text.x = element_text(colour = "grey27", size = 9), 83 | axis.title.y = element_text(colour = "grey27", size = 10), 84 | 85 | plot.margin = unit(c(1, 3, 1, 3), "cm"), 86 | plot.background = element_rect(colour = "black", fill = "grey90", size = 0.1)) 87 | ``` 88 | 89 | ### Box plot 1 - Basic 90 | 91 | Time LHIV (years) over exposure 92 | 93 | ```{r} 94 | ggplot(data = hiv, 95 | mapping = aes(x = artstatus1, y = hivdxyears)) + 96 | # add the wiskers 97 | stat_boxplot(geom ='errorbar', width = 0.5, colour = line) + 98 | # add this layer at the end so it shows over the error bar line 99 | geom_boxplot(fill = fill, colour = line, width = 0.5, 100 | outlier.colour = line, outlier.shape = 16) + 101 | # labels for the axes and axes labels 102 | scale_x_discrete(name = " ", labels = c('ART/highCD4\n(N=812)', 103 | 'ART/lowCD4\n(N=682)', 104 | 'No-ART\n(N=544)')) + 105 | scale_y_continuous(name = "Years LHIV", breaks = seq(0, 28, 2)) + 106 | # add the theme 107 | t 108 | ``` 109 | 110 | ### Box plot 2 - Adding elements 111 | 112 | CD4 count (cells/ml) by time of HIV diagnosis amongst no-ART 113 | 114 | ```{r} 115 | ggplot(data = subset(hiv, !is.na(art3)), 116 | mapping = 
aes(x = art3, y = cd4count)) + 117 | # add the whiskers 118 | stat_boxplot(geom ='errorbar', width = 0.3, colour = line) + 119 | # add the line used to dichotomise main exposure 120 | geom_hline(yintercept = 264, linetype = "dotted", color = "tomato4", size = 0.5) + 121 | # add this layer at the end so it shows over the dotted line 122 | geom_boxplot(fill = fill, colour = line, width = 0.3, 123 | outlier.colour = line, outlier.shape = 20) + 124 | # labels for the axes and axes labels 125 | scale_x_discrete(name = " ", labels = c('Diagnosis \nbefore admission \n(N=226)', 126 | 'Diagnosis \nat admission \n(N=322)')) + 127 | scale_y_continuous(name = "CD4 cells/ml", breaks = seq(0, 1800, 250)) + 128 | # additional text for the label 129 | annotate("text", x = 2.4, y = 269, label = "264\ncells/ml", size = 3 ) + 130 | # add the theme 131 | t 132 | # save the plot 133 | ggsave("cd4.png", width = 15, height = 10, units = "cm") 134 | ``` 135 | 136 | ### Scatterplot 1 - Override global 137 | 138 | Time LHIV on time on ART.
139 | 140 | 141 | 142 | ```{r} 143 | ggplot(data = hiv, 144 | mapping = aes(x = hivdxyears, y = artyears2, colour = factor(artstatus1))) + 145 | geom_point() + 146 | # use the colors defined in "settings" and define new labels for the strata 147 | scale_colour_manual(values = cate, 148 | breaks = c("1.ART/highCD4", 149 | "2.ART/lowCD4", 150 | "3.noART"), 151 | labels = c("ART/highCD4 (N=788)", 152 | "ART/lowCD4 (N=671)", 153 | "No-ART (n=218)")) + 154 | # labels and breaks for the axes 155 | scale_x_continuous(name = "Years LHIV", 156 | breaks = seq(0, 30, 5)) + 157 | scale_y_continuous(name = "Years on ART", 158 | breaks = seq(0, 30, 5)) + 159 | ## same scale for both axes 160 | coord_equal() + 161 | # using the same theme but overriding some parameters and adding others 162 | # note I have to use axis.title instead of axis.text to override 163 | t + 164 | theme(axis.title.x = element_text(colour = "Grey27", size = 10), 165 | 166 | legend.key = element_blank(), 167 | legend.title = element_blank(), 168 | legend.position = c(0.02, 0.97), 169 | legend.justification = c(0, 1), 170 | legend.text = element_text(size = 8), 171 | 172 | plot.margin = unit(c(1.2, 1.2, 1, 1), "cm") 173 | ) 174 | ``` 175 | 176 | ### Scatterplot 2 - Faceting 177 | 178 | Follow-up time by main exposure and censoring 179 | 180 | ```{r} 181 | # saving an object with the labels for the graph 182 | fupreason2.labs <- c('Outcome' = "Outcome (N=311)", 183 | 'Censored' = "Censored <56d (N=38)", 184 | 'Complete' = "Complete (n=1712)") 185 | 186 | ggplot(data = hiv, 187 | mapping = aes(x= artstatus1, y=fuptime2)) + 188 | geom_dotplot(binaxis = "y", stackdir = "center", 189 | stackratio = 1.5, dotsize = 0.5, 190 | colour = line) + 191 | # faceting 192 | facet_wrap(~ fupreason2, 193 | ncol = 3, 194 | labeller = labeller(fupreason2 = fupreason2.labs)) + 195 | # labels and breaks for the axes 196 | scale_x_discrete(name = " ", labels = c('ART\nhighCD4', 197 | 'ART\nlowCD4', 198 | 'No\nART')) + 199 | 
scale_y_continuous(name = "Follow-up time (days)", 200 | breaks = seq(0, 70, 15)) + 201 | # using the same theme but overiding some parameters and adding others 202 | # again using axis.title instead of axis.text to override 203 | t + 204 | theme(panel.grid.major.x = element_line(colour = "grey", size = 0.15), 205 | axis.title.x = element_text(colour = "Grey27", size = 8), 206 | plot.margin = unit(c(1, 1, 1, 1), "cm")) 207 | ``` 208 | 209 | ### Survival curve 210 | 211 | ```{r} 212 | # survival::survfit --> creates the survfit object needed for ggsurvplot 213 | fit1 <-survfit(Surv(fuptime2, died) ~ artstatus1, 214 | data = hiv) 215 | 216 | # survminer::ggsurvplot creates the plot 217 | ggsurvplot(fit1, 218 | censor.shape = "|", censor.size = 4, 219 | size = 0.5, 220 | palette = c("steelblue4","skyblue3","Grey60"), 221 | conf.int = TRUE, 222 | 223 | risk.table = TRUE, 224 | risk.table.y.text = FALSE, 225 | risk.table.height = 0.3, # this is the proportion of hight it takes 226 | risk.table.fontsize = 4, 227 | 228 | legend.title = " ", 229 | legend.labs = c('ART/highCD4', 'ART/lowCD4', 'No-ART'), 230 | legend = "top", 231 | 232 | ylim = c(0.67, 1.00), 233 | xlim = c(0,56), 234 | break.time.by = 7, 235 | xlab = "Time in days", 236 | ggtheme = theme_igray() 237 | ) 238 | 239 | 240 | ``` 241 | 242 | -------------------------------------------------------------------------------- /Plots March 2020/Martina/PLOT html_doc.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/Plots March 2020/Martina/PLOT html_doc.html -------------------------------------------------------------------------------- /Plots March 2020/Martina/PLOT notebook.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "PLOTS" 3 | output: 4 | html_notebook: 5 | toc: TRUE 6 | --- 7 | 8 | ### Overview 9 | 10 | - **Study design:** 
Analytical, prospective dynamic cohort study. 11 | - **Setting:** The data was obtained from a pragmatic, multicentre, individually randomized clinical trial. 12 | - **Study population:** Adults (>18years), HIV-positive patients with no evidence of tuberculosis co-infection, admitted to medical wards at Zomba Central or Edendale Hospital between February 2015 and January 2018, and who provided consent to participate in the parent study. 13 | - **Aim: ** To assess whether ART-status, as indicator of (in)appropriate HIV treatment accessibility and management, is leading to higher death rates in hospitalised patients (with no evidence of tuberculosis co-infection) in two main referral hospitals in Malawi and South Africa. 14 | 15 | - **Outcome:** All-cause 56-day (8-week) mortality. 16 | - **Main exposure:** ‘ART-status’. A composite variable with three categories, based on ART at admission (yes/no) and CD4 level at admission (low/high) if patient is under ART 17 | + **(1) ART-High CD4** HIV-patients who have started ART and have a high count of CD4 at admission 18 | + **(2) ART-Low CD4** HIV-patients who have started ART and have a low count of CD4 at admission 19 | + **(3) No ART** HIV-patients who have not started ART prior admission 20 | 21 | ### Dependencies & global settings 22 | 23 | ```{r message=FALSE, warning=FALSE} 24 | # Packages needed 25 | library(tidyverse) 26 | library(gridExtra) 27 | library(knitr) 28 | library(survminer) 29 | library(survival) 30 | library(ggthemes) 31 | 32 | # Global settings 33 | knitr::opts_chunk$set(echo = TRUE) 34 | options(width = 100) 35 | 36 | ``` 37 | 38 | ### Upload / Prepare data 39 | 40 | ```{r} 41 | hiv <- read.csv(file="~/hiv/hiv07sep.csv", header = TRUE, sep = ",") 42 | 43 | hiv$cd4count <-as.numeric(hiv$cd4count) 44 | hiv$art3 <-as.factor(hiv$art3) 45 | hiv$art2 <-as.factor(hiv$art2) 46 | hiv$art1 <-as.factor(hiv$art1) 47 | hiv$hivdx00 <-as.numeric(hiv$hivdx00) 48 | hiv$age <-as.numeric(hiv$age) 49 | hiv$artyears2 
<-as.numeric(hiv$artyears2) 50 | hiv$hiv_art_c2 <-as.factor(hiv$hiv_art_c2) 51 | 52 | # change the labels for fupreason 53 | levels(hiv$fupreason) 54 | # Initially used plyr::revalue because I incorrecly imported this as a factor 55 | # I want to correct this now so I can use dplyr only 56 | # So, I collaps two levels first and then recode the rest. 57 | hiv$fupreason2 <- fct_collapse(hiv$fupreason, Censored = c("2.withdrew","3.lost_fup")) 58 | levels(hiv$fupreason2) 59 | hiv$fupreason2 <- fct_recode(hiv$fupreason2, Outcome = "1.death", Complete = "4.complete") 60 | # checking relabeling worked 61 | table(hiv$fupreason, hiv$fupreason2, useNA = "always") 62 | ``` 63 | 64 | ### Plots settings 65 | 66 | ```{r} 67 | fill <- "slategray" 68 | line <- "#1F3552" 69 | cate <- c("1.ART/highCD4" = "steelblue4", 70 | "2.ART/lowCD4" = "skyblue3", 71 | "3.noART" = "Grey60") 72 | 73 | t <-theme(panel.background = element_rect(fill = "white", colour = "grey"), 74 | panel.grid.minor.y = element_blank(), 75 | panel.grid.major.y = element_line(colour = "grey", size = 0.15), 76 | 77 | axis.ticks.x = element_blank(), 78 | axis.ticks.y = element_blank(), 79 | 80 | axis.text.x = element_text(colour = "grey27", size = 9), 81 | axis.title.y = element_text(colour = "grey27", size = 10), 82 | 83 | plot.margin = unit(c(1, 3, 1, 3), "cm"), 84 | plot.background = element_rect(colour = "black", fill = "grey90", size = 0.1)) 85 | ``` 86 | 87 | ### Box plot 1 - Basic 88 | 89 | Time LHIV (years) over exposure 90 | 91 | ```{r} 92 | ggplot(data = hiv, 93 | mapping = aes(x = artstatus1, y = hivdxyears)) + 94 | # add the wiskers 95 | stat_boxplot(geom ='errorbar', width = 0.5, colour = line) + 96 | # add this layer at the end so it shows over the error bar line 97 | geom_boxplot(fill = fill, colour = line, width = 0.5, 98 | outlier.colour = line, outlier.shape = 16) + 99 | # labels for the axes and axes labels 100 | scale_x_discrete(name = " ", labels = c('ART/highCD4\n(N=812)', 101 | 
'ART/lowCD4\n(N=682)', 102 | 'No-ART\n(N=544)')) + 103 | scale_y_continuous(name = "Years LHIV", breaks = seq(0, 28, 2)) + 104 | # add the theme 105 | t 106 | ``` 107 | 108 | 109 | ### Box plot 2 - Adding elements 110 | 111 | CD4 count (cells/ml) by time of HIV diagnosis amongst no-ART 112 | 113 | ```{r} 114 | ggplot(data = subset(hiv, !is.na(art3)), 115 | mapping = aes(x = art3, y = cd4count)) + 116 | # add the wiskers 117 | stat_boxplot(geom ='errorbar', width = 0.3, colour = line) + 118 | # add the line used to dicotomise main exposure 119 | geom_hline(yintercept = 264, linetype = "dotted", color = "tomato4", size = 0.5) + 120 | # add this layer at the end so it shows over the dotted line 121 | geom_boxplot(fill = fill, colour = line, width = 0.3, 122 | outlier.colour = line, outlier.shape = 20) + 123 | # labels for the axes and axes labels 124 | scale_x_discrete(name = " ", labels = c('Diagnosis \nbefore admission \n(N=226)', 125 | 'Diagnosis \nat admission \n(N=322)')) + 126 | scale_y_continuous(name = "CD4 cells/ml", breaks = seq(0, 1800, 250)) + 127 | # additional text for the label 128 | annotate("text", x = 2.4, y = 269, label = "264\ncells/ml", size = 3 ) + 129 | # add the theme 130 | t 131 | # save the plot 132 | ggsave("cd4.png", width = 15, height = 10, units = "cm") 133 | ``` 134 | 135 | ### Scatterplot 1 - Overide global settings 136 | 137 | Time LHIV on time on ART. 
138 | 139 | ```{r} 140 | ggplot(data = hiv, 141 | mapping = aes(x = hivdxyears, y = artyears2, colour = factor(artstatus1))) + 142 | geom_point() + 143 | # use the colors defined in "settings" and define new labels for the strata 144 | scale_colour_manual(values = cate, 145 | breaks = c("1.ART/highCD4", 146 | "2.ART/lowCD4", 147 | "3.noART"), 148 | labels = c("ART/highCD4 (N=788)", 149 | "ART/lowCD4 (N=671)", 150 | "No-ART (n=218)")) + 151 | # labels and breaks for the axes 152 | scale_x_continuous(name = "Years LHIV", 153 | breaks = seq(0, 30, 5)) + 154 | scale_y_continuous(name = "Years on ART", 155 | breaks = seq(0, 30, 5)) + 156 | ## same scale for both axes 157 | coord_equal() + 158 | # using the same theme but overriding some parameters and adding others 159 | # note I have to use axis.title instead of axis.text to override 160 | t + 161 | theme(axis.title.x = element_text(colour = "Grey27", size = 10), 162 | 163 | legend.key = element_blank(), 164 | legend.title = element_blank(), 165 | legend.position = c(0.02, 0.97), 166 | legend.justification = c(0, 1), 167 | legend.text = element_text(size = 8), 168 | 169 | plot.margin = unit(c(1.2, 1.2, 1, 1), "cm") 170 | ) 171 | ``` 172 | 173 | ### Scatterplot 2 - Faceting 174 | 175 | Follow-up time by main exposure and censoring 176 | 177 | ```{r} 178 | # saving an object with the labels for the graph 179 | fupreason2.labs <- c('Outcome' = "Outcome (N=311)", 180 | 'Censored' = "Censored <56d (N=38)", 181 | 'Complete' = "Complete (n=1712)") 182 | 183 | ggplot(data = hiv, 184 | mapping = aes(x= artstatus1, y=fuptime2)) + 185 | geom_dotplot(binaxis = "y", stackdir = "center", 186 | stackratio = 1.5, dotsize = 0.5, 187 | colour = line) + 188 | # faceting 189 | facet_wrap(~ fupreason2, 190 | ncol = 3, 191 | labeller = labeller(fupreason2 = fupreason2.labs)) + 192 | # labels and breaks for the axes 193 | scale_x_discrete(name = " ", labels = c('ART\nhighCD4', 194 | 'ART\nlowCD4', 195 | 'No\nART')) + 196 | 
scale_y_continuous(name = "Follow-up time (days)", 197 | breaks = seq(0, 70, 15)) + 198 | # using the same theme but overiding some parameters and adding others 199 | # again using axis.title instead of axis.text to override 200 | t + 201 | theme(panel.grid.major.x = element_line(colour = "grey", size = 0.15), 202 | axis.title.x = element_text(colour = "Grey27", size = 8), 203 | plot.margin = unit(c(1, 1, 1, 1), "cm")) 204 | ``` 205 | 206 | ### Survival curves 207 | 208 | ```{r} 209 | # survival::survfit --> creates the survfit object needed for ggsurvplot 210 | fit1 <-survfit(Surv(fuptime2, died) ~ artstatus1, 211 | data = hiv) 212 | 213 | # survminer::ggsurvplot creates the plot 214 | ggsurvplot(fit1, 215 | censor.shape = "|", censor.size = 4, 216 | size = 0.5, 217 | palette = c("steelblue4","skyblue3","Grey60"), 218 | conf.int = TRUE, 219 | 220 | risk.table = TRUE, 221 | risk.table.y.text = FALSE, 222 | risk.table.height = 0.3, # this is the proportion of hight it takes 223 | risk.table.fontsize = 4, 224 | 225 | legend.title = " ", 226 | legend.labs = c('ART/highCD4', 'ART/lowCD4', 'No-ART'), 227 | legend = "top", 228 | 229 | ylim = c(0.67, 1.00), 230 | xlim = c(0,56), 231 | break.time.by = 7, 232 | xlab = "Time in days", 233 | ggtheme = theme_igray() 234 | ) 235 | 236 | 237 | ``` 238 | 239 | -------------------------------------------------------------------------------- /Plots March 2020/Martina/PLOT notebook.nb.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calumdavey/RUG/e776fea294e1d28496b7f0c3a7b87da7b93cc0fd/Plots March 2020/Martina/PLOT notebook.nb.html -------------------------------------------------------------------------------- /Plots March 2020/README.md: -------------------------------------------------------------------------------- 1 | # RUG meeting 27 Mar 2020 2 | Amy MacDougall, Carl Pearson, Julia Shen, Lauren Bell, Martina Cusinato, and Calum Davey presented their 
plots. 3 | 4 | Carl's materials can be found [here](https://gitlab.com/cabp_LSHTM/denvax). Particularly, in the R/zzz.R file (ignore the code surrounding the functions - that's just to deal with the code only optionally loading) and in the inst/extdata and inst/app folders. 5 | 6 | Julia's materials can be found [here](https://github.com/7j7j/LSHTM-R-polar-plot) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RUG 2 | Repo for the LSHTM R Users Group (RUG) 3 | --------------------------------------------------------------------------------