├── .gitmodules ├── 01-intro.R ├── 01-intro.Rmd ├── 01-intro.md ├── 02-funprog.R ├── 02-funprog.Rmd ├── 02-funprog.md ├── 03-debug.R ├── 03-debug.Rmd ├── 03-debug.md ├── 04-perf.R ├── 04-perf.Rmd ├── 04-perf.md ├── Makefile ├── README.md ├── figs ├── covr.png ├── debugRStudio1.png ├── debugRStudio2.png ├── envex.png ├── envex.svg ├── funs.png ├── optim.png ├── profvis.png ├── sinfun.png ├── style.png └── vectimings.png ├── lineprof-example.R ├── unittesting.Rmd └── unittesting.md /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "R-parallel"] 2 | path = R-parallel 3 | url = git@github.com:lgatto/R-parallel.git 4 | [submodule "rccpp"] 5 | path = rccpp 6 | url = git@github.com:lgatto/rccpp.git 7 | -------------------------------------------------------------------------------- /01-intro.R: -------------------------------------------------------------------------------- 1 | 2 | library("formatR") 3 | tidy_source(text = "a=1+1;a # print the value 4 | matrix ( rnorm(10),5)", 5 | arrow = TRUE) 6 | 7 | 8 | head(cars) 9 | head(cars[, 1]) 10 | head(cars[, 1, drop = FALSE]) 11 | 12 | 13 | df1 <- data.frame(x = 1:3, y = LETTERS[1:3]) 14 | sapply(df1, class) 15 | df2 <- data.frame(x = 1:3, y = Sys.time() + 1:3) 16 | sapply(df2, class) 17 | 18 | 19 | lapply(df1, class) 20 | lapply(df2, class) 21 | 22 | 23 | vapply(df1, class, "1") 24 | vapply(df2, class, "1") 25 | 26 | 27 | x <- 1 28 | f <- function(x) { 29 | x <- 2 30 | x 31 | } 32 | x 33 | f(x) 34 | x 35 | 36 | 37 | e <- new.env() 38 | e$a <- 1 39 | e$b <- LETTERS[1:5] 40 | e$c <- TRUE 41 | e$d <- mean 42 | 43 | 44 | e$a <- e$b 45 | e$a <- LETTERS[1:5] 46 | 47 | 48 | e <- new.env() 49 | parent.env(e) 50 | 51 | 52 | environment() 53 | parent.env(globalenv()) 54 | parent.env(parent.env(globalenv())) 55 | 56 | 57 | globalenv() 58 | emptyenv() 59 | baseenv() 60 | 61 | 62 | search() 63 | as.environment("package:stats") 64 | 65 | 66 | ls() ## default is R_GlobalEnv 67 | ls(envir = e) 68 | ls(pos = 1) 69 | 70 | 71 | search() 72 | 73 | 74 | e1 <- new.env() 75 | e2 <- new.env() 76 | e1$a <- 1:10 77 | e2$a <- e1$a 78 | 79 | 80 | e3 <- e1 81 | e3 82 | e1 83 | identical(e1, e3) 84 | 85 | 86 | e <- new.env() 87 | e$a <- 1 88 | e$b <- 2 ## add 89 | e$a <- 10 ## modify 90 | 91 | 92 | lockEnvironment(e) 93 | e$k <- 1 94 | e$a <- 100 95 | 96 | 97 | lockBinding("a", e) 98 | e$a <- 10 99 | e$b <- 10 100 | 101 | lockEnvironment(e, bindings = TRUE) 102 | e$b <- 1 103 | 104 | 105 | e <- new.env() 106 | e$foo <- 1 107 | bar <- 2 108 | where("foo") 109 | where("bar") 110 | where("foo", env = e) 111 | where("bar", env = e) 112 | 113 | 114 | search() 115 | mean <- function(x) cat("The mean is", sum(x)/length(x), "\n") 116 | mean(1:10) 117 | base::mean(1:10) 118 | rm(mean) 119 | mean(1:10) 120 | 121 | 122 | library("fortunes") 123 | fortune(174) 124 | 125 | 126 | rm(list = ls()) 127 | x 128 | f1 <- function() x <<- 1 129 | f1() 130 | x 131 | 132 | 133 | f2 <- function() x <<- 2 134 | f2() 135 | x 136 | 137 | 138 | f3 <- function() x <- 10 139 | f3() 140 | x 141 | 142 | 143 | f4 <- function(x) x <-10 144 | f4(x) 145 | x 146 | 147 | 148 | modify <- function(x) { 149 | x$a <- 2 150 | invisible(TRUE) 151 | } 152 | 153 | 154 | x_l <- list(a = 1) 155 | modify(x_l) 156 | x_l$a 157 | 158 | 159 | x_e <- new.env() 160 | x_e$a <- 1 161 | modify(x_e) 162 | x_e$a 163 | 164 | 165 | e <- new.env() 166 | e$a <- 1 167 | e 168 | parent.env(e) 169 | 170 | parent.env(e) <- emptyenv() 171 | parent.env(e) 172 | e 173 | 174 | 175 | e <- new.env(parent.env = empty.env()) 176 | 177 | 178 | x <- 1 179 | e1 <- new.env() 180 | get("x", envir = e1) 181 | 182 | 183 | get("x", envir = e1, inherits = FALSE) 184 | 185 | 186 | e2 <- new.env(parent = emptyenv()) 187 | get("x", envir = e2) 188 | 189 | 190 | get("x", envir = e1, inherits = FALSE) 191 | 192 | 193 | e <- new.env() 194 | e$x <- 1 195 | f <- function(myenv) myenv$x <- 2 196 | f(e) 197 | e$x 198 | 199 | 200 | library("Biobase") 201 | getClass("eSet") 202 | getClass("AssayData") 203 | new("ExpressionSet") 204 | 205 | 206 | .pRolocEnv <- new.env(parent=emptyenv(), hash=TRUE) 207 | 208 | stockcol <- c("#E41A1C", "#377EB8", "#238B45", "#FF7F00", "#FFD700", "#333333", 209 | "#00CED1", "#A65628", "#F781BF", "#984EA3", "#9ACD32", "#B0C4DE", 210 | "#00008A", "#8B795E", "#FDAE6B", "#66C2A5", "#276419", "#CD8C95", 211 | "#6A51A3", "#EEAD0E", "#0000FF", "#9ACD32", "#CD6090", "#CD5B45", 212 | "#8E0152", "#808000", "#67000D", "#3F007D", "#6BAED6", "#FC9272") 213 | 214 | assign("stockcol", stockcol, envir = .pRolocEnv) 215 | 216 | getStockcol <- function() get("stockcol", envir = .pRolocEnv) 217 | 218 | setStockcol <- function(cols) { 219 | if (is.null(cols)) { 220 | assign("stockcol", stockcol, envir = .pRolocEnv) 221 | } else { 222 | assign("stockcol", cols, envir = .pRolocEnv) 223 | } 224 | } 225 | 226 | 227 | ... 228 | if (missing(col)) 229 | col <- getStockcol() 230 | ... 231 | 232 | 233 | setStockcol <- function(cols) { 234 | prevcols <- getStockcol() 235 | if (is.null(cols)) { 236 | assign("stockcol", stockcol, envir = .pRolocEnv) 237 | } else { 238 | assign("stockcol", cols, envir = .pRolocEnv) 239 | } 240 | invisible(prevcols) 241 | } 242 | 243 | 244 | library("dplyr") 245 | surveys <- read.csv("http://datacarpentry.github.io/dc_zurich/data/portal_data_joined.csv") 246 | head(surveys) 247 | 248 | surveys %>% 249 | filter(weight < 5) %>% 250 | select(species_id, sex, weight) 251 | 252 | surveys %>% 253 | mutate(weight_kg = weight / 1000) %>% 254 | filter(!is.na(weight)) %>% 255 | head 256 | 257 | surveys %>% 258 | group_by(sex) %>% 259 | tally() 260 | 261 | surveys %>% 262 | group_by(sex, species_id) %>% 263 | summarize(mean_weight = mean(weight, na.rm = TRUE)) 264 | 265 | surveys %>% 266 | group_by(sex, species_id) %>% 267 | summarize(mean_weight = mean(weight, na.rm = TRUE), 268 | min_weight = min(weight, na.rm = TRUE)) %>% 269 | filter(!is.nan(mean_weight)) 270 | 271 | 272 | quote(1:10) 273 | quote(paste(letters, LETTERS, sep = "-")) 274 | 275 | 276 | eval(quote(1 + 1)) 277 | eval(quote(1:10)) 278 | 279 | x <- 10 280 | eval(quote(x + 1)) 281 | 282 | e <- new.env() 283 | e$x <- 1 284 | eval(quote(x + 1), env = e) 285 | 286 | eval(quote(x), list(x = 30)) 287 | 288 | dfr <- data.frame(x = 1:10, y = LETTERS[1:10]) 289 | eval(quote(sum(x)), dfr) 290 | 291 | 292 | x <- 10 293 | substitute(sqrt(x)) 294 | 295 | e <- new.env() 296 | e$x <- 1 297 | substitute(sqrt(x), env = e) 298 | 299 | 300 | parse(text = "1:10") 301 | parse(file = "lineprof-example.R") 302 | 303 | 304 | x <- 123 305 | deparse(substitute(x)) 306 | 307 | 308 | foo <- "bar" 309 | as.name(foo) 310 | string <- "1:10" 311 | parse(text=string) 312 | eval(parse(text=string)) 313 | 314 | 315 | varName1 <- "varName2" 316 | assign(varName1, "123") 317 | varName1 318 | get(varName1) 319 | varName2 320 | 321 | 322 | test <- function(x) { 323 | y <- deparse(substitute(x)) 324 | print(y) 325 | print(x) 326 | } 327 | var <- c("one","two","three") 328 | test(var) 329 | 330 | -------------------------------------------------------------------------------- /01-intro.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Part I: Introduction" 3 | author: "Laurent Gatto" 4 | --- 5 | 6 | ## Overview 7 | 8 | - Coding style(s) 9 | - Interactive use and programming 10 | - Environments 11 | - Tidy data 12 | - Computing on the language 13 | 14 | ## Introduction 15 | 16 | > Computers are cheap, and thinking hurts. -- Uwe Ligges 17 | 18 | Simplicity, readability and consistency are a long way towards 19 | robust code. 20 | 21 | ## Coding style(s) 22 | 23 | Why? 24 | 25 | > Good coding style is like using correct punctuation. You can manage 26 | > without it, but it sure makes things easier to read. 27 | -- Hadley Wickham 28 | 29 | for **consistency** and **readability**. 30 | 31 | ## Which one? 32 | 33 | - [Bioconductor](http://master.bioconductor.org/developers/how-to/coding-style/) 34 | - [Hadley Wickham](http://r-pkgs.had.co.nz/style.html) 35 | - [Google](http://google.github.io/styleguide/Rguide.xml) 36 | - ... 37 | 38 | ## Examples 39 | 40 | - Place spaces around all infix operators (`=`, `+`, `-`, `<-`, etc., but *not* `:`) 41 | and after a comma (`x[i, j]`). 42 | - Spaces before `(` and after `)`; not for function. 43 | - Use `<-` rather than `=`. 44 | - Limit your code to 80 characters per line 45 | - Indentation: do not use tabs, use 2 (HW)/4 (Bioc) spaces 46 | - Function names: use verbs 47 | - Variable names: camelCaps (Bioc)/ `_` (HW) (but not a `.`) 48 | - Prefix non-exported functions with a ‘.’ (Bioc). 49 | - Class names: start with a capital 50 | - Comments: `# ` or `## ` (from emacs) 51 | 52 | ## [`formatR`](https://cran.rstudio.com/web/packages/formatR/index.html) 53 | 54 | ```{r, eval=TRUE} 55 | library("formatR") 56 | tidy_source(text = "a=1+1;a # print the value 57 | matrix ( rnorm(10),5)", 58 | arrow = TRUE) 59 | ``` 60 | 61 | ## [`BiocCheck`](http://bioconductor.org/packages/devel/bioc/html/BiocCheck.html) 62 | 63 | ``` 64 | $ R CMD BiocCheck package_1.0.0.tgz 65 | ``` 66 | 67 | ``` 68 | * Checking function lengths................ 69 | The longest function is 677 lines long 70 | The longest 5 functions are: 71 | * Checking formatting of DESCRIPTION, NAMESPACE, man pages, R source, 72 | and vignette source... 73 | * CONSIDER: Shortening lines; 616 lines (11%) are > 80 characters 74 | long. 75 | * CONSIDER: Replacing tabs with 4 spaces; 3295 lines (60%) contain 76 | tabs. 77 | * CONSIDER: Indenting lines with a multiple of 4 spaces; 162 lines 78 | (2%) are not. 79 | ``` 80 | 81 | ## Style changes over time 82 | 83 | ![Style changes over time](./figs/style.png) 84 | 85 | 86 | ## Ineractive use vs programming 87 | 88 | Moving from using R to programming R is *abstraction*, *automation*, 89 | *generalisation*. 90 | 91 | ## Interactive use vs programming: `drop` 92 | 93 | ```{r, eval=FALSE} 94 | head(cars) 95 | head(cars[, 1]) 96 | head(cars[, 1, drop = FALSE]) 97 | ``` 98 | 99 | ## Interactive use vs programming: `sapply/lapply` 100 | 101 | ```{r, eval=FALSE} 102 | df1 <- data.frame(x = 1:3, y = LETTERS[1:3]) 103 | sapply(df1, class) 104 | df2 <- data.frame(x = 1:3, y = Sys.time() + 1:3) 105 | sapply(df2, class) 106 | ``` 107 | 108 | Rather use a form where the return data structure is known... 109 | 110 | ```{r, eval=FALSE} 111 | lapply(df1, class) 112 | lapply(df2, class) 113 | ``` 114 | 115 | or that will break if the result is not what is exected 116 | 117 | ```{r, eval=FALSE} 118 | vapply(df1, class, "1") 119 | vapply(df2, class, "1") 120 | ``` 121 | 122 | ## Semantics 123 | 124 | - *pass-by-value* copy-on-modify 125 | - *pass-by-reference*: environments, S4 Reference Classes 126 | 127 | ```{r, eval=FALSE} 128 | x <- 1 129 | f <- function(x) { 130 | x <- 2 131 | x 132 | } 133 | x 134 | f(x) 135 | x 136 | ``` 137 | 138 | ## Environments 139 | 140 | ### Motivation 141 | 142 | - Data structure that enables *scoping* (see later). 143 | - Have reference semantics 144 | - Useful data structure on their own 145 | 146 | ### Definition (1) 147 | 148 | An environment associates, or *binds*, names to values in memory. 149 | Variables in an environment are hence called *bindings*. 150 | 151 | ## Creating and populate environments 152 | 153 | ```{r, eval=FALSE} 154 | e <- new.env() 155 | e$a <- 1 156 | e$b <- LETTERS[1:5] 157 | e$c <- TRUE 158 | e$d <- mean 159 | ``` 160 | 161 | ```{r, eval=FALSE} 162 | e$a <- e$b 163 | e$a <- LETTERS[1:5] 164 | ``` 165 | 166 | - Objects in environments have unique names 167 | - Objects in different environments can of course have identical names 168 | - Objects in an environment have no order 169 | - Environments have parents 170 | 171 | ## Definition (2) 172 | 173 | An environment is composed of a *frame* that contains the name-object 174 | bindings and a parent (enclosing) environment. 175 | 176 | ## Relationship between environments 177 | 178 | Every environment has a parent (enclosing) environment 179 | 180 | ```{r, eval=FALSE} 181 | e <- new.env() 182 | parent.env(e) 183 | ``` 184 | Current environment 185 | 186 | ```{r, eval=FALSE} 187 | environment() 188 | parent.env(globalenv()) 189 | parent.env(parent.env(globalenv())) 190 | ``` 191 | 192 | Noteworthy environments 193 | 194 | ```{r, eval=FALSE} 195 | globalenv() 196 | emptyenv() 197 | baseenv() 198 | ``` 199 | 200 | All parent of `R_GlobalEnv`: 201 | 202 | ```{r, eval=FALSE} 203 | search() 204 | as.environment("package:stats") 205 | ``` 206 | 207 | Listing objects in an environment 208 | 209 | ```{r, eval=FALSE} 210 | ls() ## default is R_GlobalEnv 211 | ls(envir = e) 212 | ls(pos = 1) 213 | ``` 214 | 215 | ```{r, eval=FALSE} 216 | search() 217 | ``` 218 | 219 | Note: Every time a package is loaded with `library`, it is inserted in 220 | the search path after the `R_GlobalEnv`. 221 | 222 | ## Accessors and setters 223 | 224 | - In addition to `$`, one can also use `[[`, `get` and `assign`. 225 | - To check if a name exists in an environmet (or in any or its 226 | parents), one can use `exists`. 227 | - Compare two environments with `identical` (not `==`). 228 | 229 | **Question** Are `e1` and `e2` below identical? 230 | 231 | ```{r, eval=FALSE} 232 | e1 <- new.env() 233 | e2 <- new.env() 234 | e1$a <- 1:10 235 | e2$a <- e1$a 236 | ``` 237 | 238 | What about `e1` and `e3`? 239 | 240 | ```{r, eval=FALSE} 241 | e3 <- e1 242 | e3 243 | e1 244 | identical(e1, e3) 245 | ``` 246 | 247 | ## Locking environments and bindings 248 | 249 | ```{r, eval=FALSE} 250 | e <- new.env() 251 | e$a <- 1 252 | e$b <- 2 ## add 253 | e$a <- 10 ## modify 254 | ``` 255 | 256 | Locking an environment stops from adding new bindings: 257 | 258 | ```{r, eval=FALSE} 259 | lockEnvironment(e) 260 | e$k <- 1 261 | e$a <- 100 262 | ``` 263 | 264 | Locking bindings stops from modifying bindings with en envionment: 265 | 266 | ```{r, eval=FALSE} 267 | lockBinding("a", e) 268 | e$a <- 10 269 | e$b <- 10 270 | 271 | lockEnvironment(e, bindings = TRUE) 272 | e$b <- 1 273 | ``` 274 | 275 | ## Exercise 276 | 277 | Reproduce the following environments and variables in R. 278 | 279 | ![envionments and variables exercise](./figs/envex.png) 280 | 281 | ## Where is a symbol defined? 282 | 283 | `pryr::where()` implements the regular scoping rules to find in which 284 | environment a binding is defined. 285 | 286 | ```{r, eval=FALSE} 287 | e <- new.env() 288 | e$foo <- 1 289 | bar <- 2 290 | where("foo") 291 | where("bar") 292 | where("foo", env = e) 293 | where("bar", env = e) 294 | ``` 295 | 296 | ## Lexical scoping 297 | 298 | [Lexical comes from *lexical analysis* in computer science, which is 299 | the conversion of characters (code) into a sequence of meaningful (for 300 | the computer) tokens.] 301 | 302 | **Definition**: Rules that define how R looks up values for a given name/symbol. 303 | 304 | - Objects in environments have unique names 305 | - Objects in different environments can of course have identical names. 306 | - If a name is not found in the current environment, it is looked up 307 | in the parent (enclosing) from. 308 | - If it is not found in the parent (enclosing) frame, it is looked up 309 | in the parent's parent frame, and so on... 310 | 311 | ```{r, eval=FALSE} 312 | search() 313 | mean <- function(x) cat("The mean is", sum(x)/length(x), "\n") 314 | mean(1:10) 315 | base::mean(1:10) 316 | rm(mean) 317 | mean(1:10) 318 | ``` 319 | 320 | ## Assignments 321 | 322 | - `<-` assigns/creates in the current environment 323 | 324 | - `<<-` (deep assignment) never creates/updates a variable in the 325 | current environment, but modifies an existing variable in the 326 | current or first enclosing environment where that name is defined. 327 | 328 | - If `<<-` does not find the name, it will create the variable in the 329 | global environment. 330 | 331 | ```{r, eval=TRUE} 332 | library("fortunes") 333 | fortune(174) 334 | ``` 335 | 336 | ```{r, eval=FALSE} 337 | rm(list = ls()) 338 | x 339 | f1 <- function() x <<- 1 340 | f1() 341 | x 342 | ``` 343 | 344 | ```{r, eval=FALSE} 345 | f2 <- function() x <<- 2 346 | f2() 347 | x 348 | ``` 349 | 350 | ```{r, eval=FALSE} 351 | f3 <- function() x <- 10 352 | f3() 353 | x 354 | ``` 355 | 356 | ```{r, eval=FALSE} 357 | f4 <- function(x) x <-10 358 | f4(x) 359 | x 360 | ``` 361 | 362 | ## Using environments 363 | 364 | Most environments are created when creating and calling 365 | functions. They are also used in packages as *package* and *namespace* 366 | environments. 367 | 368 | There are several reasons to create then manually. 369 | 370 | - Reference semantics 371 | - Avoiding copies 372 | - Package state 373 | - As a hashmap for fast name lookup 374 | 375 | ## Reference semantics 376 | 377 | ```{r, eval=TRUE} 378 | modify <- function(x) { 379 | x$a <- 2 380 | invisible(TRUE) 381 | } 382 | ``` 383 | 384 | ```{r, eval=FALSE} 385 | x_l <- list(a = 1) 386 | modify(x_l) 387 | x_l$a 388 | ``` 389 | 390 | ```{r, eval=FALSE} 391 | x_e <- new.env() 392 | x_e$a <- 1 393 | modify(x_e) 394 | x_e$a 395 | ``` 396 | 397 | Tip: when setting up environments, it is advised to set to parent 398 | (enclosing) environment to be `emptyenv()`, to avoid accidentally 399 | inheriting objects from somewhere else on the search path. 400 | 401 | ```{r, eval=FALSE} 402 | e <- new.env() 403 | e$a <- 1 404 | e 405 | parent.env(e) 406 | 407 | parent.env(e) <- emptyenv() 408 | parent.env(e) 409 | e 410 | ``` 411 | 412 | or directly 413 | 414 | ```{r, eval=FALSE} 415 | e <- new.env(parent.env = empty.env()) 416 | ``` 417 | 418 | ### Exercise 419 | 420 | What is going to happen when we access `"x"` in the four cases below? 421 | 422 | ```{r, eval=FALSE} 423 | x <- 1 424 | e1 <- new.env() 425 | get("x", envir = e1) 426 | ``` 427 | 428 | ```{r, eval=FALSE} 429 | get("x", envir = e1, inherits = FALSE) 430 | ``` 431 | 432 | ```{r, eval=FALSE} 433 | e2 <- new.env(parent = emptyenv()) 434 | get("x", envir = e2) 435 | ``` 436 | 437 | ```{r, eval=FALSE} 438 | get("x", envir = e1, inherits = FALSE) 439 | ``` 440 | 441 | ## Avoiding copies 442 | 443 | Since environments have reference semantics, they are not copied. 444 | When passing an environment as function argument (directly, or as part 445 | of a more complex data structure), it is **not** copied: all its 446 | values are accessible within the function and can be persistently 447 | modified. 448 | 449 | ```{r, eval=FALSE} 450 | e <- new.env() 451 | e$x <- 1 452 | f <- function(myenv) myenv$x <- 2 453 | f(e) 454 | e$x 455 | ``` 456 | 457 | This is used in the `eSet` class family to store the expression data. 458 | 459 | ```{r, eval=FALSE} 460 | library("Biobase") 461 | getClass("eSet") 462 | getClass("AssayData") 463 | new("ExpressionSet") 464 | ``` 465 | 466 | ## Preserving state in packages 467 | 468 | Explicit envirionments are also useful to preserve state or define 469 | constants-like variables in a package. One can then set getters and 470 | setters for users to access the variables within that private 471 | envionment. 472 | 473 | #### Use case 474 | 475 | Colour management in [`pRoloc`](https://github.com/lgatto/pRoloc/blob/master/R/environment.R): 476 | 477 | ```{r, eval=FALSE} 478 | .pRolocEnv <- new.env(parent=emptyenv(), hash=TRUE) 479 | 480 | stockcol <- c("#E41A1C", "#377EB8", "#238B45", "#FF7F00", "#FFD700", "#333333", 481 | "#00CED1", "#A65628", "#F781BF", "#984EA3", "#9ACD32", "#B0C4DE", 482 | "#00008A", "#8B795E", "#FDAE6B", "#66C2A5", "#276419", "#CD8C95", 483 | "#6A51A3", "#EEAD0E", "#0000FF", "#9ACD32", "#CD6090", "#CD5B45", 484 | "#8E0152", "#808000", "#67000D", "#3F007D", "#6BAED6", "#FC9272") 485 | 486 | assign("stockcol", stockcol, envir = .pRolocEnv) 487 | 488 | getStockcol <- function() get("stockcol", envir = .pRolocEnv) 489 | 490 | setStockcol <- function(cols) { 491 | if (is.null(cols)) { 492 | assign("stockcol", stockcol, envir = .pRolocEnv) 493 | } else { 494 | assign("stockcol", cols, envir = .pRolocEnv) 495 | } 496 | } 497 | ``` 498 | 499 | and in plotting functions (we will see the `missing` in more details later): 500 | 501 | ```{r, eval=FALSE} 502 | ... 503 | if (missing(col)) 504 | col <- getStockcol() 505 | ... 506 | ``` 507 | 508 | Hadley's tip: Invisibly returning the old value from 509 | 510 | ```{r, eval=FALSE} 511 | setStockcol <- function(cols) { 512 | prevcols <- getStockcol() 513 | if (is.null(cols)) { 514 | assign("stockcol", stockcol, envir = .pRolocEnv) 515 | } else { 516 | assign("stockcol", cols, envir = .pRolocEnv) 517 | } 518 | invisible(prevcols) 519 | } 520 | ``` 521 | 522 | ## Tidy data 523 | 524 | > Hadley Wickham, Tidy Data, Vol. 59, Issue 10, Sep 2014, Journal of 525 | > Statistical Software. http://www.jstatsoft.org/v59/i10. 526 | 527 | Tidy datasets are easy to manipulate, model and visualize, and have a 528 | specific structure: each variable is a column, each observation is a 529 | row, and each type of observational unit is a table. 530 | 531 | 532 | ## Tidy tools 533 | 534 | Tidy data also makes it easier to develop tidy tools for data 535 | analysis, tools that both input and output tidy datasets. 536 | 537 | - `dply::select` select columns 538 | - `dlpy::filter` select rows 539 | - `dplyr:mutate` create new columns 540 | - `dpplyr:group_by` split-apply-combine 541 | - `dlpyr:summarise` collapse each group into a single-row summary of 542 | that group 543 | - `magrittr::%>%` piping 544 | 545 | 546 | ## Examples 547 | 548 | ```{r, eval=FALSE} 549 | library("dplyr") 550 | surveys <- read.csv("http://datacarpentry.github.io/dc_zurich/data/portal_data_joined.csv") 551 | head(surveys) 552 | 553 | surveys %>% 554 | filter(weight < 5) %>% 555 | select(species_id, sex, weight) 556 | 557 | surveys %>% 558 | mutate(weight_kg = weight / 1000) %>% 559 | filter(!is.na(weight)) %>% 560 | head 561 | 562 | surveys %>% 563 | group_by(sex) %>% 564 | tally() 565 | 566 | surveys %>% 567 | group_by(sex, species_id) %>% 568 | summarize(mean_weight = mean(weight, na.rm = TRUE)) 569 | 570 | surveys %>% 571 | group_by(sex, species_id) %>% 572 | summarize(mean_weight = mean(weight, na.rm = TRUE), 573 | min_weight = min(weight, na.rm = TRUE)) %>% 574 | filter(!is.nan(mean_weight)) 575 | ``` 576 | 577 | ## Application to other data structures 578 | 579 | > Hadley Wickham (@hadleywickham) tweeted at 8:45 pm on Fri, Feb 12, 580 | > 2016: @mark_scheuerell @drob the importance of tidy data is not the 581 | > specific form, but the consistency 582 | > (https://twitter.com/hadleywickham/status/698246671629549568?s=09) 583 | 584 | - Well-formatted and well-documented `S4` class 585 | - `S4` as input -(function)-> `S4` as output 586 | 587 | ![MSnSet schematics](https://raw.githubusercontent.com/lgatto/pRoloc/master/vignettes/Figures/msnset.png) 588 | 589 | ## Computing on the language 590 | 591 | 592 | #### Quoting and evaluating expressions 593 | 594 | Quote an expression, don't evaluate it: 595 | 596 | ```{r, eval=FALSE} 597 | quote(1:10) 598 | quote(paste(letters, LETTERS, sep = "-")) 599 | ``` 600 | Evaluate an expression in a specific environment: 601 | 602 | ```{r, eval=FALSE} 603 | eval(quote(1 + 1)) 604 | eval(quote(1:10)) 605 | 606 | x <- 10 607 | eval(quote(x + 1)) 608 | 609 | e <- new.env() 610 | e$x <- 1 611 | eval(quote(x + 1), env = e) 612 | 613 | eval(quote(x), list(x = 30)) 614 | 615 | dfr <- data.frame(x = 1:10, y = LETTERS[1:10]) 616 | eval(quote(sum(x)), dfr) 617 | ``` 618 | 619 | Substitute any variables bound in `env`, but don't evaluate the 620 | expression: 621 | 622 | ```{r, eval=FALSE} 623 | x <- 10 624 | substitute(sqrt(x)) 625 | 626 | e <- new.env() 627 | e$x <- 1 628 | substitute(sqrt(x), env = e) 629 | ``` 630 | 631 | Parse, but don't evaluate an expression: 632 | 633 | ```{r, eval=FALSE} 634 | parse(text = "1:10") 635 | parse(file = "lineprof-example.R") 636 | ``` 637 | 638 | Turn an unevaluated expressions into character strings: 639 | 640 | ```{r, eval=FALSE} 641 | x <- 123 642 | deparse(substitute(x)) 643 | ``` 644 | 645 | #### Characters as variables names 646 | 647 | 648 | ```{r, eval=FALSE} 649 | foo <- "bar" 650 | as.name(foo) 651 | string <- "1:10" 652 | parse(text=string) 653 | eval(parse(text=string)) 654 | ``` 655 | 656 | 657 | And with `assign` and `get` 658 | 659 | ```{r, eval=FALSE} 660 | varName1 <- "varName2" 661 | assign(varName1, "123") 662 | varName1 663 | get(varName1) 664 | varName2 665 | ``` 666 | 667 | Using `substitute` and `deparse` 668 | 669 | ```{r, eval=FALSE} 670 | test <- function(x) { 671 | y <- deparse(substitute(x)) 672 | print(y) 673 | print(x) 674 | } 675 | var <- c("one","two","three") 676 | test(var) 677 | ``` 678 | 679 | 680 | -------------------------------------------------------------------------------- /01-intro.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Part I: Introduction" 3 | author: "Laurent Gatto" 4 | --- 5 | 6 | ## Overview 7 | 8 | - Coding style(s) 9 | - Interactive use and programming 10 | - Environments 11 | - Tidy data 12 | - Computing on the language 13 | 14 | ## Introduction 15 | 16 | > Computers are cheap, and thinking hurts. -- Uwe Ligges 17 | 18 | Simplicity, readability and consistency are a long way towards 19 | robust code. 20 | 21 | ## Coding style(s) 22 | 23 | Why? 24 | 25 | > Good coding style is like using correct punctuation. You can manage 26 | > without it, but it sure makes things easier to read. 27 | -- Hadley Wickham 28 | 29 | for **consistency** and **readability**. 30 | 31 | ## Which one? 32 | 33 | - [Bioconductor](http://master.bioconductor.org/developers/how-to/coding-style/) 34 | - [Hadley Wickham](http://r-pkgs.had.co.nz/style.html) 35 | - [Google](http://google.github.io/styleguide/Rguide.xml) 36 | - ... 37 | 38 | ## Examples 39 | 40 | - Place spaces around all infix operators (`=`, `+`, `-`, `<-`, etc., but *not* `:`) 41 | and after a comma (`x[i, j]`). 42 | - Spaces before `(` and after `)`; not for function. 43 | - Use `<-` rather than `=`. 44 | - Limit your code to 80 characters per line 45 | - Indentation: do not use tabs, use 2 (HW)/4 (Bioc) spaces 46 | - Function names: use verbs 47 | - Variable names: camelCaps (Bioc)/ `_` (HW) (but not a `.`) 48 | - Prefix non-exported functions with a ‘.’ (Bioc). 49 | - Class names: start with a capital 50 | - Comments: `# ` or `## ` (from emacs) 51 | 52 | ## [`formatR`](https://cran.rstudio.com/web/packages/formatR/index.html) 53 | 54 | 55 | ```r 56 | library("formatR") 57 | tidy_source(text = "a=1+1;a # print the value 58 | matrix ( rnorm(10),5)", 59 | arrow = TRUE) 60 | ``` 61 | 62 | ``` 63 | ## a <- 1 + 1 64 | ## a # print the value 65 | ## matrix(rnorm(10), 5) 66 | ``` 67 | 68 | ## [`BiocCheck`](http://bioconductor.org/packages/devel/bioc/html/BiocCheck.html) 69 | 70 | ``` 71 | $ R CMD BiocCheck package_1.0.0.tgz 72 | ``` 73 | 74 | ``` 75 | * Checking function lengths................ 76 | The longest function is 677 lines long 77 | The longest 5 functions are: 78 | * Checking formatting of DESCRIPTION, NAMESPACE, man pages, R source, 79 | and vignette source... 80 | * CONSIDER: Shortening lines; 616 lines (11%) are > 80 characters 81 | long. 82 | * CONSIDER: Replacing tabs with 4 spaces; 3295 lines (60%) contain 83 | tabs. 84 | * CONSIDER: Indenting lines with a multiple of 4 spaces; 162 lines 85 | (2%) are not. 86 | ``` 87 | 88 | ## Style changes over time 89 | 90 | ![Style changes over time](./figs/style.png) 91 | 92 | 93 | ## Ineractive use vs programming 94 | 95 | Moving from using R to programming R is *abstraction*, *automation*, 96 | *generalisation*. 97 | 98 | ## Interactive use vs programming: `drop` 99 | 100 | 101 | ```r 102 | head(cars) 103 | head(cars[, 1]) 104 | head(cars[, 1, drop = FALSE]) 105 | ``` 106 | 107 | ## Interactive use vs programming: `sapply/lapply` 108 | 109 | 110 | ```r 111 | df1 <- data.frame(x = 1:3, y = LETTERS[1:3]) 112 | sapply(df1, class) 113 | df2 <- data.frame(x = 1:3, y = Sys.time() + 1:3) 114 | sapply(df2, class) 115 | ``` 116 | 117 | Rather use a form where the return data structure is known... 118 | 119 | 120 | ```r 121 | lapply(df1, class) 122 | lapply(df2, class) 123 | ``` 124 | 125 | or that will break if the result is not what is exected 126 | 127 | 128 | ```r 129 | vapply(df1, class, "1") 130 | vapply(df2, class, "1") 131 | ``` 132 | 133 | ## Semantics 134 | 135 | - *pass-by-value* copy-on-modify 136 | - *pass-by-reference*: environments, S4 Reference Classes 137 | 138 | 139 | ```r 140 | x <- 1 141 | f <- function(x) { 142 | x <- 2 143 | x 144 | } 145 | x 146 | f(x) 147 | x 148 | ``` 149 | 150 | ## Environments 151 | 152 | ### Motivation 153 | 154 | - Data structure that enables *scoping* (see later). 155 | - Have reference semantics 156 | - Useful data structure on their own 157 | 158 | ### Definition (1) 159 | 160 | An environment associates, or *binds*, names to values in memory. 161 | Variables in an environment are hence called *bindings*. 162 | 163 | ## Creating and populate environments 164 | 165 | 166 | ```r 167 | e <- new.env() 168 | e$a <- 1 169 | e$b <- LETTERS[1:5] 170 | e$c <- TRUE 171 | e$d <- mean 172 | ``` 173 | 174 | 175 | ```r 176 | e$a <- e$b 177 | e$a <- LETTERS[1:5] 178 | ``` 179 | 180 | - Objects in environments have unique names 181 | - Objects in different environments can of course have identical names 182 | - Objects in an environment have no order 183 | - Environments have parents 184 | 185 | ## Definition (2) 186 | 187 | An environment is composed of a *frame* that contains the name-object 188 | bindings and a parent (enclosing) environment. 189 | 190 | ## Relationship between environments 191 | 192 | Every environment has a parent (enclosing) environment 193 | 194 | 195 | ```r 196 | e <- new.env() 197 | parent.env(e) 198 | ``` 199 | Current environment 200 | 201 | 202 | ```r 203 | environment() 204 | parent.env(globalenv()) 205 | parent.env(parent.env(globalenv())) 206 | ``` 207 | 208 | Noteworthy environments 209 | 210 | 211 | ```r 212 | globalenv() 213 | emptyenv() 214 | baseenv() 215 | ``` 216 | 217 | All parent of `R_GlobalEnv`: 218 | 219 | 220 | ```r 221 | search() 222 | as.environment("package:stats") 223 | ``` 224 | 225 | Listing objects in an environment 226 | 227 | 228 | ```r 229 | ls() ## default is R_GlobalEnv 230 | ls(envir = e) 231 | ls(pos = 1) 232 | ``` 233 | 234 | 235 | ```r 236 | search() 237 | ``` 238 | 239 | Note: Every time a package is loaded with `library`, it is inserted in 240 | the search path after the `R_GlobalEnv`. 241 | 242 | ## Accessors and setters 243 | 244 | - In addition to `$`, one can also use `[[`, `get` and `assign`. 245 | - To check if a name exists in an environmet (or in any or its 246 | parents), one can use `exists`. 247 | - Compare two environments with `identical` (not `==`). 248 | 249 | **Question** Are `e1` and `e2` below identical? 250 | 251 | 252 | ```r 253 | e1 <- new.env() 254 | e2 <- new.env() 255 | e1$a <- 1:10 256 | e2$a <- e1$a 257 | ``` 258 | 259 | What about `e1` and `e3`? 260 | 261 | 262 | ```r 263 | e3 <- e1 264 | e3 265 | e1 266 | identical(e1, e3) 267 | ``` 268 | 269 | ## Locking environments and bindings 270 | 271 | 272 | ```r 273 | e <- new.env() 274 | e$a <- 1 275 | e$b <- 2 ## add 276 | e$a <- 10 ## modify 277 | ``` 278 | 279 | Locking an environment stops from adding new bindings: 280 | 281 | 282 | ```r 283 | lockEnvironment(e) 284 | e$k <- 1 285 | e$a <- 100 286 | ``` 287 | 288 | Locking bindings stops from modifying bindings with en envionment: 289 | 290 | 291 | ```r 292 | lockBinding("a", e) 293 | e$a <- 10 294 | e$b <- 10 295 | 296 | lockEnvironment(e, bindings = TRUE) 297 | e$b <- 1 298 | ``` 299 | 300 | ## Exercise 301 | 302 | Reproduce the following environments and variables in R. 303 | 304 | ![envionments and variables exercise](./figs/envex.png) 305 | 306 | ## Where is a symbol defined? 307 | 308 | `pryr::where()` implements the regular scoping rules to find in which 309 | environment a binding is defined. 310 | 311 | 312 | ```r 313 | e <- new.env() 314 | e$foo <- 1 315 | bar <- 2 316 | where("foo") 317 | where("bar") 318 | where("foo", env = e) 319 | where("bar", env = e) 320 | ``` 321 | 322 | ## Lexical scoping 323 | 324 | [Lexical comes from *lexical analysis* in computer science, which is 325 | the conversion of characters (code) into a sequence of meaningful (for 326 | the computer) tokens.] 327 | 328 | **Definition**: Rules that define how R looks up values for a given name/symbol. 329 | 330 | - Objects in environments have unique names 331 | - Objects in different environments can of course have identical names. 332 | - If a name is not found in the current environment, it is looked up 333 | in the parent (enclosing) from. 334 | - If it is not found in the parent (enclosing) frame, it is looked up 335 | in the parent's parent frame, and so on... 336 | 337 | 338 | ```r 339 | search() 340 | mean <- function(x) cat("The mean is", sum(x)/length(x), "\n") 341 | mean(1:10) 342 | base::mean(1:10) 343 | rm(mean) 344 | mean(1:10) 345 | ``` 346 | 347 | ## Assignments 348 | 349 | - `<-` assigns/creates in the current environment 350 | 351 | - `<<-` (deep assignment) never creates/updates a variable in the 352 | current environment, but modifies an existing variable in the 353 | current or first enclosing environment where that name is defined. 354 | 355 | - If `<<-` does not find the name, it will create the variable in the 356 | global environment. 357 | 358 | 359 | ```r 360 | library("fortunes") 361 | fortune(174) 362 | ``` 363 | 364 | ``` 365 | ## 366 | ## I wish <<- had never been invented, as it makes an esoteric and dangerous 367 | ## feature of the language *seem* normal and reasonable. If you want to dumb 368 | ## down R/S into a macro language, this is the operator for you. 369 | ## -- Bill Venables 370 | ## R-help (July 2001) 371 | ``` 372 | 373 | 374 | ```r 375 | rm(list = ls()) 376 | x 377 | f1 <- function() x <<- 1 378 | f1() 379 | x 380 | ``` 381 | 382 | 383 | ```r 384 | f2 <- function() x <<- 2 385 | f2() 386 | x 387 | ``` 388 | 389 | 390 | ```r 391 | f3 <- function() x <- 10 392 | f3() 393 | x 394 | ``` 395 | 396 | 397 | ```r 398 | f4 <- function(x) x <-10 399 | f4(x) 400 | x 401 | ``` 402 | 403 | ## Using environments 404 | 405 | Most environments are created when creating and calling 406 | functions. They are also used in packages as *package* and *namespace* 407 | environments. 408 | 409 | There are several reasons to create then manually. 410 | 411 | - Reference semantics 412 | - Avoiding copies 413 | - Package state 414 | - As a hashmap for fast name lookup 415 | 416 | ## Reference semantics 417 | 418 | 419 | ```r 420 | modify <- function(x) { 421 | x$a <- 2 422 | invisible(TRUE) 423 | } 424 | ``` 425 | 426 | 427 | ```r 428 | x_l <- list(a = 1) 429 | modify(x_l) 430 | x_l$a 431 | ``` 432 | 433 | 434 | ```r 435 | x_e <- new.env() 436 | x_e$a <- 1 437 | modify(x_e) 438 | x_e$a 439 | ``` 440 | 441 | Tip: when setting up environments, it is advised to set to parent 442 | (enclosing) environment to be `emptyenv()`, to avoid accidentally 443 | inheriting objects from somewhere else on the search path. 444 | 445 | 446 | ```r 447 | e <- new.env() 448 | e$a <- 1 449 | e 450 | parent.env(e) 451 | 452 | parent.env(e) <- emptyenv() 453 | parent.env(e) 454 | e 455 | ``` 456 | 457 | or directly 458 | 459 | 460 | ```r 461 | e <- new.env(parent.env = empty.env()) 462 | ``` 463 | 464 | ### Exercise 465 | 466 | What is going to happen when we access `"x"` in the four cases below? 467 | 468 | 469 | ```r 470 | x <- 1 471 | e1 <- new.env() 472 | get("x", envir = e1) 473 | ``` 474 | 475 | 476 | ```r 477 | get("x", envir = e1, inherits = FALSE) 478 | ``` 479 | 480 | 481 | ```r 482 | e2 <- new.env(parent = emptyenv()) 483 | get("x", envir = e2) 484 | ``` 485 | 486 | 487 | ```r 488 | get("x", envir = e1, inherits = FALSE) 489 | ``` 490 | 491 | ## Avoiding copies 492 | 493 | Since environments have reference semantics, they are not copied. 494 | When passing an environment as function argument (directly, or as part 495 | of a more complex data structure), it is **not** copied: all its 496 | values are accessible within the function and can be persistently 497 | modified. 498 | 499 | 500 | ```r 501 | e <- new.env() 502 | e$x <- 1 503 | f <- function(myenv) myenv$x <- 2 504 | f(e) 505 | e$x 506 | ``` 507 | 508 | This is used in the `eSet` class family to store the expression data. 509 | 510 | 511 | ```r 512 | library("Biobase") 513 | getClass("eSet") 514 | getClass("AssayData") 515 | new("ExpressionSet") 516 | ``` 517 | 518 | ## Preserving state in packages 519 | 520 | Explicit envirionments are also useful to preserve state or define 521 | constants-like variables in a package. One can then set getters and 522 | setters for users to access the variables within that private 523 | envionment. 524 | 525 | #### Use case 526 | 527 | Colour management in [`pRoloc`](https://github.com/lgatto/pRoloc/blob/master/R/environment.R): 528 | 529 | 530 | ```r 531 | .pRolocEnv <- new.env(parent=emptyenv(), hash=TRUE) 532 | 533 | stockcol <- c("#E41A1C", "#377EB8", "#238B45", "#FF7F00", "#FFD700", "#333333", 534 | "#00CED1", "#A65628", "#F781BF", "#984EA3", "#9ACD32", "#B0C4DE", 535 | "#00008A", "#8B795E", "#FDAE6B", "#66C2A5", "#276419", "#CD8C95", 536 | "#6A51A3", "#EEAD0E", "#0000FF", "#9ACD32", "#CD6090", "#CD5B45", 537 | "#8E0152", "#808000", "#67000D", "#3F007D", "#6BAED6", "#FC9272") 538 | 539 | assign("stockcol", stockcol, envir = .pRolocEnv) 540 | 541 | getStockcol <- function() get("stockcol", envir = .pRolocEnv) 542 | 543 | setStockcol <- function(cols) { 544 | if (is.null(cols)) { 545 | assign("stockcol", stockcol, envir = .pRolocEnv) 546 | } else { 547 | assign("stockcol", cols, envir = .pRolocEnv) 548 | } 549 | } 550 | ``` 551 | 552 | and in plotting functions (we will see the `missing` in more details later): 553 | 554 | 555 | ```r 556 | ... 557 | if (missing(col)) 558 | col <- getStockcol() 559 | ... 560 | ``` 561 | 562 | Hadley's tip: Invisibly returning the old value from 563 | 564 | 565 | ```r 566 | setStockcol <- function(cols) { 567 | prevcols <- getStockcol() 568 | if (is.null(cols)) { 569 | assign("stockcol", stockcol, envir = .pRolocEnv) 570 | } else { 571 | assign("stockcol", cols, envir = .pRolocEnv) 572 | } 573 | invisible(prevcols) 574 | } 575 | ``` 576 | 577 | ## Tidy data 578 | 579 | > Hadley Wickham, Tidy Data, Vol. 59, Issue 10, Sep 2014, Journal of 580 | > Statistical Software. http://www.jstatsoft.org/v59/i10. 581 | 582 | Tidy datasets are easy to manipulate, model and visualize, and have a 583 | specific structure: each variable is a column, each observation is a 584 | row, and each type of observational unit is a table. 585 | 586 | 587 | ## Tidy tools 588 | 589 | Tidy data also makes it easier to develop tidy tools for data 590 | analysis, tools that both input and output tidy datasets. 591 | 592 | - `dply::select` select columns 593 | - `dlpy::filter` select rows 594 | - `dplyr:mutate` create new columns 595 | - `dpplyr:group_by` split-apply-combine 596 | - `dlpyr:summarise` collapse each group into a single-row summary of 597 | that group 598 | - `magrittr::%>%` piping 599 | 600 | 601 | ## Examples 602 | 603 | 604 | ```r 605 | library("dplyr") 606 | surveys <- read.csv("http://datacarpentry.github.io/dc_zurich/data/portal_data_joined.csv") 607 | head(surveys) 608 | 609 | surveys %>% 610 | filter(weight < 5) %>% 611 | select(species_id, sex, weight) 612 | 613 | surveys %>% 614 | mutate(weight_kg = weight / 1000) %>% 615 | filter(!is.na(weight)) %>% 616 | head 617 | 618 | surveys %>% 619 | group_by(sex) %>% 620 | tally() 621 | 622 | surveys %>% 623 | group_by(sex, species_id) %>% 624 | summarize(mean_weight = mean(weight, na.rm = TRUE)) 625 | 626 | surveys %>% 627 | group_by(sex, species_id) %>% 628 | summarize(mean_weight = mean(weight, na.rm = TRUE), 629 | min_weight = min(weight, na.rm = TRUE)) %>% 630 | filter(!is.nan(mean_weight)) 631 | ``` 632 | 633 | ## Application to other data structures 634 | 635 | > Hadley Wickham (@hadleywickham) tweeted at 8:45 pm on Fri, Feb 12, 636 | > 2016: @mark_scheuerell @drob the importance of tidy data is not the 637 | > specific form, but the consistency 638 | > (https://twitter.com/hadleywickham/status/698246671629549568?s=09) 639 | 640 | - Well-formatted and well-documented `S4` class 641 | - `S4` as input -(function)-> `S4` as output 642 | 643 | ![MSnSet schematics](https://raw.githubusercontent.com/lgatto/pRoloc/master/vignettes/Figures/msnset.png) 644 | 645 | ## Computing on the language 646 | 647 | 648 | #### Quoting and evaluating expressions 649 | 650 | Quote an expression, don't evaluate it: 651 | 652 | 653 | ```r 654 | quote(1:10) 655 | quote(paste(letters, LETTERS, sep = "-")) 656 | ``` 657 | Evaluate an expression in a specific environment: 658 | 659 | 660 | ```r 661 | eval(quote(1 + 1)) 662 | eval(quote(1:10)) 663 | 664 | x <- 10 665 | eval(quote(x + 1)) 666 | 667 | e <- new.env() 668 | e$x <- 1 669 | eval(quote(x + 1), env = e) 670 | 671 | eval(quote(x), list(x = 30)) 672 | 673 | dfr <- data.frame(x = 1:10, y = LETTERS[1:10]) 674 | eval(quote(sum(x)), dfr) 675 | ``` 676 | 677 | Substitute any variables bound in `env`, but don't evaluate the 678 | expression: 679 | 680 | 681 | ```r 682 | x <- 10 683 | substitute(sqrt(x)) 684 | 685 | e <- new.env() 686 | e$x <- 1 687 | substitute(sqrt(x), env = e) 688 | ``` 689 | 690 | Parse, but don't evaluate an expression: 691 | 692 | 693 | ```r 694 | parse(text = "1:10") 695 | parse(file = "lineprof-example.R") 696 | ``` 697 | 698 | Turn an unevaluated expressions into character strings: 699 | 700 | 701 | ```r 702 | x <- 123 703 | deparse(substitute(x)) 704 | ``` 705 | 706 | #### Characters as variables names 707 | 708 | 709 | 710 | ```r 711 | foo <- "bar" 712 | as.name(foo) 713 | string <- "1:10" 714 | parse(text=string) 715 | eval(parse(text=string)) 716 | ``` 717 | 718 | 719 | And with `assign` and `get` 720 | 721 | 722 | ```r 723 | varName1 <- "varName2" 724 | assign(varName1, "123") 725 | varName1 726 | get(varName1) 727 | varName2 728 | ``` 729 | 730 | Using `substitute` and `deparse` 731 | 732 | 733 | ```r 734 | test <- function(x) { 735 | y <- deparse(substitute(x)) 736 | print(y) 737 | print(x) 738 | } 739 | var <- c("one","two","three") 740 | test(var) 741 | ``` 742 | 743 | 744 | -------------------------------------------------------------------------------- /02-funprog.R: -------------------------------------------------------------------------------- 1 | 2 | f <- function(x) { 3 | y <- x + 1 4 | return(x * y) 5 | } 6 | 7 | 8 | body(f) 9 | args(f) 10 | environment(f) 11 | 12 | body(f) <- quote({ 13 | y <- x * y 14 | return(x + y) 15 | }) 16 | 17 | 18 | f <- function(x) x + y 19 | 20 | f(1) 21 | 22 | environment(f) 23 | y <- 2 24 | f(1) 25 | 26 | 27 | 28 | e <- new.env() 29 | environment(f) <- e 30 | 31 | f(1) 32 | e$y <- 10 33 | f(1) 34 | 35 | 36 | codetools::findGlobals(f) 37 | 38 | 39 | f <- function() { 40 | x <- 1 41 | y <- 2 42 | c(x, y) 43 | } 44 | f() 45 | 46 | 47 | x <- 2 48 | g <- function(){ 49 | y <- 1 50 | c(x, y) 51 | } 52 | g() 53 | 54 | 55 | x <- 1 56 | h <- function() { 57 | y <- 2 58 | i <- function() { 59 | z <- 3 60 | c(x, y, z) 61 | } 62 | i() 63 | } 64 | h() 65 | 66 | 67 | j <- function(x) { 68 | y <- 2 69 | function(){ 70 | c(x, y) 71 | } 72 | } 73 | k <- j(1) 74 | k() 75 | 76 | 77 | j <- function() { 78 | if (!exists("a")) { 79 | a <- 1 80 | } else { 81 | a <- a + 1 82 | } 83 | print(a) 84 | } 85 | j() ## First call 86 | j() ## Second call 87 | 88 | 89 | f <- function(x) { 90 | f <- function(x) { 91 | f <- function(x) { 92 | x^2 93 | } 94 | f(x) + 1 95 | } 96 | f(x) * 2 97 | } 98 | f(10) 99 | 100 | 101 | args <- list(x = 1:10, trim = 0.3) 102 | do.call(mean, args) 103 | 104 | 105 | f <- function(x = 1, y = 2) x * y 106 | f <- function(x = 1, y = x + 2) x * y 107 | 108 | 109 | f <- function(x = 1, y) { 110 | c(missing(x), missing(y)) 111 | } 112 | f() 113 | f(x = 1) 114 | 115 | 116 | plot2 <- function(...) { 117 | message("Verbose plotting...") 118 | plot(...) 119 | } 120 | 121 | f <- function(...) list(...) 122 | 123 | 124 | f1 <- function() 1 125 | f2 <- function() return(1) 126 | f3 <- function() return(invisible(1)) 127 | 128 | 129 | f1 <- function(x) { 130 | on.exit(print("!")) 131 | x + 1 132 | } 133 | 134 | f2 <- function(x) { 135 | on.exit(print("!")) 136 | stop("Error") 137 | } 138 | 139 | 140 | f3 <- function() { 141 | on.exit(print("1")) 142 | on.exit(print("2")) 143 | invisible(TRUE) 144 | } 145 | 146 | 147 | f4 <- function() { 148 | on.exit(print("1")) 149 | on.exit(print("2"), add = TRUE) 150 | invisible(TRUE) 151 | } 152 | 153 | 154 | function(x) x + y 155 | body(function(x) x + y) 156 | args(function(x) x + y) 157 | environment(function(x) x + y) 158 | 159 | 160 | make.power <- function(n) 161 | function(x) x^n 162 | 163 | 164 | cube <- make.power(3) 165 | square <- make.power(2) 166 | cube(2) 167 | square(2) 168 | environment(cube) 169 | environment(square) 170 | 171 | 172 | new_counter <- function() { 173 | i <- 0 174 | function() { 175 | i <<- i + 1 176 | i 177 | } 178 | } 179 | 180 | count1 <- new_counter() 181 | count2 <- new_counter() 182 | 183 | count1() 184 | count1() 185 | count2() 186 | 187 | environment(count1) 188 | environment(count2) 189 | environment(count1)$i 190 | environment(count2)$i 191 | 192 | 193 | colramp <- colorRampPalette(c("blue", "yellow")) 194 | colramp(5) 195 | plot(1:10, col = colramp(10), pch = 19, cex = 2, 196 | main = "colramp(10)") 197 | 198 | 199 | L <- replicate(3, matrix(rnorm(9), 3), simplify = FALSE) 200 | Reduce("+", L) 201 | try(sum(L)) 202 | 203 | 204 | Reduce("+", list(1, 2, 3), init = 10) 205 | Reduce("+", list(1, 2, 3), accumulate = TRUE) 206 | Reduce("+", list(1, 2, 3), right = TRUE, accumulate = TRUE) 207 | 208 | 209 | even <- function(x) x %% 2 == 0 210 | (y <- sample(100, 10)) 211 | Filter(even, y) 212 | Filter(Negate(even), y) 213 | 214 | 215 | Map(even, 1:3) 216 | 217 | 218 | Find(even, 10:15) 219 | Find(even, 10:15, right = TRUE) 220 | Position(Negate(even), 10:15) 221 | Position(Negate(even), 10:15, right = TRUE) 222 | 223 | 224 | (x <- 1:5) 225 | (y <- 5:1) 226 | x + y 227 | 228 | 229 | (x <- 1:6) 230 | (y <- 1:2) 231 | x+y 232 | 233 | 234 | diff1 <- function(e) { 235 | n <- length(e) 236 | interval <- rep(0, n - 1) 237 | for (i in 1:(n - 1)) 238 | interval[i] <- e[i + 1] - e[i] 239 | interval 240 | } 241 | e <- c(2, 5, 10.2, 12, 19) 242 | diff1(e) 243 | 244 | 245 | diff2 <- function(e) { 246 | n <- length(e) 247 | e[-1] - e[-n] 248 | } 249 | e <- c(2, 5, 10.2, 12, 19) 250 | diff2(e) 251 | 252 | 253 | v <- rnorm(1000) ## or a list 254 | res <- numeric(length(v)) 255 | 256 | for (i in 1:length(v)) 257 | res[i] <- f(v[i]) 258 | 259 | res <- sapply(v, f) 260 | 261 | ## if f is vectorised 262 | f(v) 263 | 264 | 265 | M <- matrix(rnorm(100), 10) 266 | apply(M, 1, function(Mrow) 'do something with Mrow') 267 | apply(M, 2, function(Mcol) 'do something with Mcol') 268 | 269 | 270 | f <- function(x, a = 1) sin(x^2)/ (a + abs(x)) 271 | x <- seq(-7, 7, 0.02 ) 272 | x0 <- seq(-2, 2, 0.02) 273 | y0 <- f(x0) 274 | y0[y0 < 0] <- 0 275 | plot(x, f(x), type = "l", main = expression(f(x) == frac(sin(x^2),(a + abs(x))))) 276 | grid() 277 | abline(v = c(-2, 2), lty = "dotted") 278 | polygon(x0, y0, col = "#00000010") 279 | 280 | 281 | f <- function(x, a = 1) sin(x^2)/ (a + abs(x)) 282 | integrate(f, lower = -2, upper = 2) 283 | 284 | 285 | lo <- c(-2, 0) 286 | hi <- c(0, 2) 287 | integrate(f, lower = lo, upper = hi) 288 | 289 | 290 | mapply(function(lo, hi) integrate(f, lo, hi)$value, 291 | lo, hi) 292 | 293 | 294 | Integrate <- Vectorize( 295 | function(fn, lower, upper) 296 | integrate(fn, lower, upper)$value, 297 | vectorize.args=c("lower", "upper") 298 | ) 299 | Integrate(f, lower=lo, upper=hi) 300 | 301 | -------------------------------------------------------------------------------- /02-funprog.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Part II: Functional programming" 3 | author: "Laurent Gatto" 4 | --- 5 | 6 | ## Content 7 | 8 | - Functions 9 | - Robust programming with functions 10 | - Scoping 11 | - Closures 12 | - High-level functions 13 | - Vectorisation 14 | 15 | # Functions 16 | 17 | Among the R's strong points, Hadley Whickham cites: 18 | 19 | > [R has] a strong foundation in functional programming. The ideas of 20 | > functional programming are well suited to solving many of the 21 | > challenges of data analysis. R provides a powerful and flexible 22 | > toolkit which allows you to write concise yet descriptive code. 23 | 24 | Also 25 | > To understand computations in R, two slogans are helpful: 26 | > - Everything that exists is an object. 27 | > - Everything that happens is a function call. 28 | > John Chambers 29 | 30 | ![Messy code hides bugs](./figs/funs.png) 31 | 32 | - Functions are a means of **abstraction**. A concept/computation is 33 | encapsulated/isolated from the rest with a function. 34 | - Functions should **do one thing**, and do it well (compute, or plot, 35 | or save, ... not all in one go). 36 | - **Side effects**: your functions should not have any (unless, of 37 | course, that is the main point of that function - plotting, write to 38 | disk, ...). Functions shouldn't make any changes in any 39 | environment. The only return their output. 40 | - **Do not use global variables**. Everything the function needs is 41 | being passed as an argument. Function must be **self-contained**. 42 | - Function streamline code and process 43 | 44 | From the `R Inferno`: 45 | 46 | Make your functions as simple as possible. Simple has many advantages: 47 | 48 | - Simple functions are likely to be human efficient: they will be easy 49 | to understand and to modify. 50 | - Simple functions are likely to be computer efficient. 51 | - Simple functions are less likely to be buggy, and bugs will be 52 | easier to fix. 53 | - (Perhaps ironically) simple functions may be more general—thinking 54 | about the heart of the matter often broadens the application. 55 | 56 | 57 | Functions can be 58 | 59 | 1. Correct. 60 | 2. An error occurs that is clearly identified. 61 | 3. An obscure error occurs. 62 | 4. An incorrect value is returned. 63 | 64 | We like category 1. Category 2 is the right behavior if the inputs do 65 | not make sense, but not if the inputs are sensible. Category 3 is an 66 | unpleasant place for your users, and possibly for you if the users 67 | have access to you. Category 4 is by far the worst place to be - the 68 | user has no reason to believe that anything is wrong. Steer clear of 69 | category 4. 70 | 71 | 72 | Finally, functions are 73 | 74 | - Easier to debug (part III) 75 | - Easier to profile (part IV) 76 | - Easier to parallelise (part IV) 77 | 78 | Functions are an central part of robust R programming. 79 | 80 | ## Function parts 81 | 82 | A function is made of 83 | - a name 84 | - some inputs (formal parameters) 85 | - a single output (return value) 86 | - a body 87 | - an environment, the map of the location of the functions variable 88 | 89 | ```{r, eval=FALSE} 90 | f <- function(x) { 91 | y <- x + 1 92 | return(x * y) 93 | } 94 | ``` 95 | 96 | And these can be accessed and modified indivdually 97 | 98 | ```{r, eval=FALSE} 99 | body(f) 100 | args(f) 101 | environment(f) 102 | 103 | body(f) <- quote({ 104 | y <- x * y 105 | return(x + y) 106 | }) 107 | ``` 108 | 109 | 110 | ## Lexical scoping 111 | 112 | - If a name is not found in a functions environment, it is looked up 113 | in the parent (enclosing) from. 114 | - If it is not found in the parent (enclosing) frame, it is looked up 115 | in the parent's parent frame, and so on... 116 | 117 | *Lexical scoping*: default behaviour, current environment, then 118 | traversing *enclosing/parent environments*. 119 | 120 | ```{r, eval=FALSE} 121 | f <- function(x) x + y 122 | 123 | f(1) 124 | 125 | environment(f) 126 | y <- 2 127 | f(1) 128 | ``` 129 | 130 | ```{r, eval=FALSE} 131 | 132 | e <- new.env() 133 | environment(f) <- e 134 | 135 | f(1) 136 | e$y <- 10 137 | f(1) 138 | ``` 139 | 140 | This is of course bad practice, we don't want to rely on global variables. 141 | 142 | ```{r, eval=FALSE} 143 | codetools::findGlobals(f) 144 | ``` 145 | 146 | ## Exercises 147 | 148 | Start by mentally running the code chunks below - what do the functions return? 149 | 150 | After testing new code chunks, don't forget to clean up your 151 | workspace, to avoid unexpected results. 152 | 153 | ```{r, eval=FALSE} 154 | f <- function() { 155 | x <- 1 156 | y <- 2 157 | c(x, y) 158 | } 159 | f() 160 | ``` 161 | 162 | ```{r, eval=FALSE} 163 | x <- 2 164 | g <- function(){ 165 | y <- 1 166 | c(x, y) 167 | } 168 | g() 169 | ``` 170 | 171 | ```{r, eval=FALSE} 172 | x <- 1 173 | h <- function() { 174 | y <- 2 175 | i <- function() { 176 | z <- 3 177 | c(x, y, z) 178 | } 179 | i() 180 | } 181 | h() 182 | ``` 183 | 184 | ```{r, eval=FALSE} 185 | j <- function(x) { 186 | y <- 2 187 | function(){ 188 | c(x, y) 189 | } 190 | } 191 | k <- j(1) 192 | k() 193 | ``` 194 | 195 | ```{r, eval=FALSE} 196 | j <- function() { 197 | if (!exists("a")) { 198 | a <- 1 199 | } else { 200 | a <- a + 1 201 | } 202 | print(a) 203 | } 204 | j() ## First call 205 | j() ## Second call 206 | ``` 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | ```{r, eval=FALSE} 217 | f <- function(x) { 218 | f <- function(x) { 219 | f <- function(x) { 220 | x^2 221 | } 222 | f(x) + 1 223 | } 224 | f(x) * 2 225 | } 226 | f(10) 227 | ``` 228 | 229 | ## More about functions 230 | 231 | - Argument matching by position or by names 232 | - Calling a function with a list of arguments 233 | 234 | ```{r, eval=FALSE} 235 | args <- list(x = 1:10, trim = 0.3) 236 | do.call(mean, args) 237 | ``` 238 | 239 | - Default arguments 240 | 241 | ```{r, eval=FALSE} 242 | f <- function(x = 1, y = 2) x * y 243 | f <- function(x = 1, y = x + 2) x * y 244 | ``` 245 | 246 | - Missing arguments 247 | 248 | ```{r, eval=FALSE} 249 | f <- function(x = 1, y) { 250 | c(missing(x), missing(y)) 251 | } 252 | f() 253 | f(x = 1) 254 | ``` 255 | 256 | - Passing non-matched parameters `...` to an inner function 257 | 258 | ```{r, eval=FALSE} 259 | plot2 <- function(...) { 260 | message("Verbose plotting...") 261 | plot(...) 262 | } 263 | 264 | f <- function(...) list(...) 265 | ``` 266 | 267 | - Return values: last statement, explicit `return`, make output 268 | `invisible` 269 | 270 | ```{r, eval=FALSE} 271 | f1 <- function() 1 272 | f2 <- function() return(1) 273 | f3 <- function() return(invisible(1)) 274 | ``` 275 | 276 | - Explicit triggers before exiting. Useful to restore global state 277 | (plotting parameters, cleaning temporary files, ...) 278 | 279 | ```{r, eval=FALSE} 280 | f1 <- function(x) { 281 | on.exit(print("!")) 282 | x + 1 283 | } 284 | 285 | f2 <- function(x) { 286 | on.exit(print("!")) 287 | stop("Error") 288 | } 289 | ``` 290 | 291 | ```{r, eval=FALSE} 292 | f3 <- function() { 293 | on.exit(print("1")) 294 | on.exit(print("2")) 295 | invisible(TRUE) 296 | } 297 | 298 | 299 | f4 <- function() { 300 | on.exit(print("1")) 301 | on.exit(print("2"), add = TRUE) 302 | invisible(TRUE) 303 | } 304 | ``` 305 | 306 | - Anonymous functions, created on-the-flight and passed to `lapply` or 307 | other high-level functions. 308 | 309 | ```{r, eval=FALSE} 310 | function(x) x + y 311 | body(function(x) x + y) 312 | args(function(x) x + y) 313 | environment(function(x) x + y) 314 | ``` 315 | 316 | ## More about scoping 317 | 318 | *Lexical scoping*: default behaviour, current environment, then 319 | traversing *enclosing/parent environments*. 320 | 321 | 322 | *Dynamic scoping*: looking up variables in the *calling environment*, 323 | used in non-standard evaluation. 324 | 325 | 326 | # Functional programming 327 | 328 | **First-class functions** - a function is a value just like any other 329 | variable. Functions can thus be used as arguments to other 330 | functions. Functions are considered *first-class citizens*. 331 | 332 | **Higher-order functions** - refers to functions that take functions 333 | as parameters (input) or return functions (output). 334 | 335 | ## Closures 336 | 337 | > "An object is data with functions. A closure is a function with 338 | > data." - John D. Cool 339 | 340 | Closures: functions written by functions. They enclose the envionment 341 | of the parent function and can access all its variables. 342 | 343 | ```{r, eval=FALSE} 344 | make.power <- function(n) 345 | function(x) x^n 346 | 347 | 348 | cube <- make.power(3) 349 | square <- make.power(2) 350 | cube(2) 351 | square(2) 352 | environment(cube) 353 | environment(square) 354 | ``` 355 | 356 | Mutable state: a counter function 357 | 358 | ```{r, eval=FALSE} 359 | new_counter <- function() { 360 | i <- 0 361 | function() { 362 | i <<- i + 1 363 | i 364 | } 365 | } 366 | 367 | count1 <- new_counter() 368 | count2 <- new_counter() 369 | 370 | count1() 371 | count1() 372 | count2() 373 | 374 | environment(count1) 375 | environment(count2) 376 | environment(count1)$i 377 | environment(count2)$i 378 | ``` 379 | 380 | Questions: 381 | 382 | - What happens of we place the code `i <- 0` and the function 383 | definition outside of a function, i.e in the global environment? 384 | 385 | - What happens if we use `<-` instead of `<<-`? 386 | 387 | The `colorRampPallette` 388 | 389 | ```{r, eval=FALSE} 390 | colramp <- colorRampPalette(c("blue", "yellow")) 391 | colramp(5) 392 | plot(1:10, col = colramp(10), pch = 19, cex = 2, 393 | main = "colramp(10)") 394 | ``` 395 | 396 | ## Functional 397 | 398 | Take a function as input or return a function as output. 399 | 400 | - `Reduce(f, x)` uses a binary function to successively combine the 401 | elements of a given vector and a possibly given initial value. 402 | 403 | ```{r, eval=FALSE} 404 | L <- replicate(3, matrix(rnorm(9), 3), simplify = FALSE) 405 | Reduce("+", L) 406 | try(sum(L)) 407 | ``` 408 | 409 | ```{r, eval=FALSE} 410 | Reduce("+", list(1, 2, 3), init = 10) 411 | Reduce("+", list(1, 2, 3), accumulate = TRUE) 412 | Reduce("+", list(1, 2, 3), right = TRUE, accumulate = TRUE) 413 | ``` 414 | 415 | - `Filter(f, x)` extracts the elements of a vector for which a 416 | predicate (logical) function gives true. 417 | 418 | - `Negate(f)` creates the negation of a given function. 419 | 420 | ```{r, eval=FALSE} 421 | even <- function(x) x %% 2 == 0 422 | (y <- sample(100, 10)) 423 | Filter(even, y) 424 | Filter(Negate(even), y) 425 | ``` 426 | 427 | - `Map(f, ...)` applies a function to the corresponding elements of 428 | given vectors. Similar to `mapply` without any attempt to simplify. 429 | 430 | ```{r, eval=FALSE} 431 | Map(even, 1:3) 432 | ``` 433 | 434 | - `Find(f, x)` and `Position(f, x)` give the first (or last elements) 435 | and its position in the vector, for which a predicate (logical) 436 | function gives true. 437 | 438 | ```{r, eval=FALSE} 439 | Find(even, 10:15) 440 | Find(even, 10:15, right = TRUE) 441 | Position(Negate(even), 10:15) 442 | Position(Negate(even), 10:15, right = TRUE) 443 | ``` 444 | 445 | ## References 446 | 447 | - R Gentleman, *R Programming for Bioinformatics*, CRC Press, 2008 448 | - `?Map`, or any other of the higher order functions 449 | - Blog post, *Higher Order Functions in R*, John Myles White 450 | http://www.johnmyleswhite.com/notebook/2010/09/23/higher-order-functions-in-r/ 451 | 452 | # Vectorisation 453 | 454 | 455 | > Many operations in R are vectorized, and understanding and using 456 | > vectorization is an essential component of becoming a proficient 457 | > programmer. - R Gentleman in *R Programming for Bioinformatics*. 458 | 459 | A *vectorised computation* is one that, when applied to a vector (of 460 | length greater than 1), automatically operates directly on all 461 | elements of the input vector. 462 | 463 | 464 | ```{r, eval=FALSE} 465 | (x <- 1:5) 466 | (y <- 5:1) 467 | x + y 468 | ``` 469 | 470 | ## Recycling rule 471 | 472 | What is `x` and `y` are of different length: the shorter vector is 473 | replicate so that its length matches the longer ones. 474 | 475 | ```{r, eval=FALSE} 476 | (x <- 1:6) 477 | (y <- 1:2) 478 | x+y 479 | ``` 480 | 481 | If the shorter vector is not an even multiple of the longer, a warning 482 | is issued. 483 | 484 | ## Example 485 | 486 | Compute difference between times of events, `e`. Given `n` events, 487 | there will be `n-1` inter-event times. `interval[i] <- e[i+1] - e[i]` 488 | 489 | Procedural implementation: 490 | 491 | ```{r, eval=FALSE} 492 | diff1 <- function(e) { 493 | n <- length(e) 494 | interval <- rep(0, n - 1) 495 | for (i in 1:(n - 1)) 496 | interval[i] <- e[i + 1] - e[i] 497 | interval 498 | } 499 | e <- c(2, 5, 10.2, 12, 19) 500 | diff1(e) 501 | ``` 502 | 503 | Vectorised implementation: 504 | 505 | ```{r, eval=FALSE} 506 | diff2 <- function(e) { 507 | n <- length(e) 508 | e[-1] - e[-n] 509 | } 510 | e <- c(2, 5, 10.2, 12, 19) 511 | diff2(e) 512 | ``` 513 | 514 | ## `*apply` functions 515 | 516 | How to apply a function, iteratively, on a set of elements? 517 | 518 | `apply(X, MARGIN, FUN, ...)` 519 | 520 | 521 | - `MARGIN` = 1 for row, 2 for cols. 522 | - `FUN` = function to apply 523 | - `...` = extra args to function. 524 | - `simplify` = should the result be simplified if possible. 525 | 526 | 527 | `*apply` functions are (generally) **NOT** faster than loops, but more 528 | succint and thus clearer. 529 | 530 | ```{r, eval=FALSE} 531 | v <- rnorm(1000) ## or a list 532 | res <- numeric(length(v)) 533 | 534 | for (i in 1:length(v)) 535 | res[i] <- f(v[i]) 536 | 537 | res <- sapply(v, f) 538 | 539 | ## if f is vectorised 540 | f(v) 541 | ``` 542 | 543 | function | use case 544 | -------|--------------------------------------- 545 | apply | matrices, arrays, data.frames 546 | lapply | lists, vectors 547 | sapply | lists, vectors 548 | vapply | with a pre-specified type of return value 549 | tapply | atomic objects, typically vectors 550 | by | similar to tapply 551 | eapply | environments 552 | mapply | multiple values 553 | rapply | recursive version of lapply 554 | esApply | `ExpressionSet`, defined in `Biobase` 555 | 556 | See also the `BiocGenerics` package for `[l|m|s|t]apply` S4 generics, 557 | as well as parallel versions in the `parallel` package (see 558 | `Performance` section). 559 | 560 | 561 | See also the `plyr` package, that offers its own flavour of `apply` 562 | functions. 563 | 564 | in/out | list | data frame | array 565 | ------------|---------|------------|--------- 566 | list | llply() | ldply() | laply() 567 | data frame | dlply() | ddply() | daply() 568 | array | alply() | adply() | aaply() 569 | 570 | 571 | ## Other functions 572 | 573 | - `replicate` - repeated evaluation of an expression 574 | - `aggregate` - compute summary statistics of data subsets 575 | - `ave` - group averages over level combinations of factors 576 | - `sweep` - sweep out array summaries 577 | 578 | ## Anonymous functions 579 | 580 | A function defined/called without being assigned to an identifier and 581 | generally passed as argument to other functions. 582 | 583 | 584 | ```{r, eval=FALSE} 585 | M <- matrix(rnorm(100), 10) 586 | apply(M, 1, function(Mrow) 'do something with Mrow') 587 | apply(M, 2, function(Mcol) 'do something with Mcol') 588 | ``` 589 | 590 | ## Use case: integration 591 | 592 | ```{r, echo=FALSE, eval=FALSE} 593 | f <- function(x, a = 1) sin(x^2)/ (a + abs(x)) 594 | x <- seq(-7, 7, 0.02 ) 595 | x0 <- seq(-2, 2, 0.02) 596 | y0 <- f(x0) 597 | y0[y0 < 0] <- 0 598 | plot(x, f(x), type = "l", main = expression(f(x) == frac(sin(x^2),(a + abs(x))))) 599 | grid() 600 | abline(v = c(-2, 2), lty = "dotted") 601 | polygon(x0, y0, col = "#00000010") 602 | ``` 603 | 604 | ![`sin(x^2)/ (a + abs(x))`](./figs/sinfun.png) 605 | 606 | The `integrate` function approximates definite integrals by 607 | adaptive quadrature. 608 | 609 | 610 | ```{r, eval=FALSE} 611 | f <- function(x, a = 1) sin(x^2)/ (a + abs(x)) 612 | integrate(f, lower = -2, upper = 2) 613 | ``` 614 | 615 | It is not vectorised. 616 | 617 | ```{r, eval=FALSE} 618 | lo <- c(-2, 0) 619 | hi <- c(0, 2) 620 | integrate(f, lower = lo, upper = hi) 621 | ``` 622 | 623 | ## How to vectorise 624 | 625 | - To vectorise a function, we can explicitly wrap it inside a helper 626 | function that will take care of argument recycling (via `rep`), then 627 | loop over the inputs and call the non-vectorised function. 628 | 629 | - To vectorise a function, we can explicitate the vectorised 630 | calculation using `mapply`. 631 | 632 | ```{r, eval=FALSE} 633 | mapply(function(lo, hi) integrate(f, lo, hi)$value, 634 | lo, hi) 635 | ``` 636 | 637 | - Create a vectorised form using `Vectorize`. It takes a function 638 | (here, an anonymous function) as input and returns a function. 639 | 640 | ```{r, eval=FALSE} 641 | Integrate <- Vectorize( 642 | function(fn, lower, upper) 643 | integrate(fn, lower, upper)$value, 644 | vectorize.args=c("lower", "upper") 645 | ) 646 | Integrate(f, lower=lo, upper=hi) 647 | ``` 648 | 649 | ## **Efficient** apply-like functions 650 | 651 | These functions combine high-level vectorised syntax for clarity 652 | **and** efficient C-level vectorised imputation (see *Performance* 653 | section). 654 | 655 | - In `base`: rowSums, rowMeans, colSums, colMeans 656 | - In `Biobase`: rowQ, rowMax, rowMin, rowMedias, ... 657 | - In `genefilter`: rowttests, rowFtests, rowSds, rowVars, ... 658 | 659 | Generalisable on other data structures, like `ExpressionSet` 660 | instances. 661 | 662 | ## Parallelisation 663 | 664 | Vectorised operations are natural candidats for parallel execution. 665 | See later, *Parallel computation* topic. 666 | 667 | ## References 668 | 669 | - R Gentleman, *R Programming for Bioinformatics*, CRC Press, 2008 670 | - Ligges and Fox, *R Help Desk, How Can I Avoid This Loop or Make It 671 | Faster?* R News, Vol 8/1. May 2008. 672 | - Grouping functions: sapply vs. lapply vs. apply. vs. tapply 673 | vs. by vs. aggregate ... http://stackoverflow.com/questions/3505701/ 674 | -------------------------------------------------------------------------------- /02-funprog.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Part II: Functional programming" 3 | author: "Laurent Gatto" 4 | --- 5 | 6 | ## Content 7 | 8 | - Functions 9 | - Robust programming with functions 10 | - Scoping 11 | - Closures 12 | - High-level functions 13 | - Vectorisation 14 | 15 | # Functions 16 | 17 | Among the R's strong points, Hadley Whickham cites: 18 | 19 | > [R has] a strong foundation in functional programming. The ideas of 20 | > functional programming are well suited to solving many of the 21 | > challenges of data analysis. R provides a powerful and flexible 22 | > toolkit which allows you to write concise yet descriptive code. 23 | 24 | Also 25 | > To understand computations in R, two slogans are helpful: 26 | > - Everything that exists is an object. 27 | > - Everything that happens is a function call. 28 | > John Chambers 29 | 30 | ![Messy code hides bugs](./figs/funs.png) 31 | 32 | - Functions are a means of **abstraction**. A concept/computation is 33 | encapsulated/isolated from the rest with a function. 34 | - Functions should **do one thing**, and do it well (compute, or plot, 35 | or save, ... not all in one go). 36 | - **Side effects**: your functions should not have any (unless, of 37 | course, that is the main point of that function - plotting, write to 38 | disk, ...). Functions shouldn't make any changes in any 39 | environment. The only return their output. 40 | - **Do not use global variables**. Everything the function needs is 41 | being passed as an argument. Function must be **self-contained**. 42 | - Function streamline code and process 43 | 44 | From the `R Inferno`: 45 | 46 | Make your functions as simple as possible. Simple has many advantages: 47 | 48 | - Simple functions are likely to be human efficient: they will be easy 49 | to understand and to modify. 50 | - Simple functions are likely to be computer efficient. 51 | - Simple functions are less likely to be buggy, and bugs will be 52 | easier to fix. 53 | - (Perhaps ironically) simple functions may be more general—thinking 54 | about the heart of the matter often broadens the application. 55 | 56 | 57 | Functions can be 58 | 59 | 1. Correct. 60 | 2. An error occurs that is clearly identified. 61 | 3. An obscure error occurs. 62 | 4. An incorrect value is returned. 63 | 64 | We like category 1. Category 2 is the right behavior if the inputs do 65 | not make sense, but not if the inputs are sensible. Category 3 is an 66 | unpleasant place for your users, and possibly for you if the users 67 | have access to you. Category 4 is by far the worst place to be - the 68 | user has no reason to believe that anything is wrong. Steer clear of 69 | category 4. 70 | 71 | 72 | Finally, functions are 73 | 74 | - Easier to debug (part III) 75 | - Easier to profile (part IV) 76 | - Easier to parallelise (part IV) 77 | 78 | Functions are an central part of robust R programming. 79 | 80 | ## Function parts 81 | 82 | A function is made of 83 | - a name 84 | - some inputs (formal parameters) 85 | - a single output (return value) 86 | - a body 87 | - an environment, the map of the location of the functions variable 88 | 89 | 90 | ```r 91 | f <- function(x) { 92 | y <- x + 1 93 | return(x * y) 94 | } 95 | ``` 96 | 97 | And these can be accessed and modified indivdually 98 | 99 | 100 | ```r 101 | body(f) 102 | args(f) 103 | environment(f) 104 | 105 | body(f) <- quote({ 106 | y <- x * y 107 | return(x + y) 108 | }) 109 | ``` 110 | 111 | 112 | ## Lexical scoping 113 | 114 | - If a name is not found in a functions environment, it is looked up 115 | in the parent (enclosing) from. 116 | - If it is not found in the parent (enclosing) frame, it is looked up 117 | in the parent's parent frame, and so on... 118 | 119 | *Lexical scoping*: default behaviour, current environment, then 120 | traversing *enclosing/parent environments*. 121 | 122 | 123 | ```r 124 | f <- function(x) x + y 125 | 126 | f(1) 127 | 128 | environment(f) 129 | y <- 2 130 | f(1) 131 | ``` 132 | 133 | 134 | ```r 135 | e <- new.env() 136 | environment(f) <- e 137 | 138 | f(1) 139 | e$y <- 10 140 | f(1) 141 | ``` 142 | 143 | This is of course bad practice, we don't want to rely on global variables. 144 | 145 | 146 | ```r 147 | codetools::findGlobals(f) 148 | ``` 149 | 150 | ## Exercises 151 | 152 | Start by mentally running the code chunks below - what do the functions return? 153 | 154 | After testing new code chunks, don't forget to clean up your 155 | workspace, to avoid unexpected results. 156 | 157 | 158 | ```r 159 | f <- function() { 160 | x <- 1 161 | y <- 2 162 | c(x, y) 163 | } 164 | f() 165 | ``` 166 | 167 | 168 | ```r 169 | x <- 2 170 | g <- function(){ 171 | y <- 1 172 | c(x, y) 173 | } 174 | g() 175 | ``` 176 | 177 | 178 | ```r 179 | x <- 1 180 | h <- function() { 181 | y <- 2 182 | i <- function() { 183 | z <- 3 184 | c(x, y, z) 185 | } 186 | i() 187 | } 188 | h() 189 | ``` 190 | 191 | 192 | ```r 193 | j <- function(x) { 194 | y <- 2 195 | function(){ 196 | c(x, y) 197 | } 198 | } 199 | k <- j(1) 200 | k() 201 | ``` 202 | 203 | 204 | ```r 205 | j <- function() { 206 | if (!exists("a")) { 207 | a <- 1 208 | } else { 209 | a <- a + 1 210 | } 211 | print(a) 212 | } 213 | j() ## First call 214 | j() ## Second call 215 | ``` 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | ```r 227 | f <- function(x) { 228 | f <- function(x) { 229 | f <- function(x) { 230 | x^2 231 | } 232 | f(x) + 1 233 | } 234 | f(x) * 2 235 | } 236 | f(10) 237 | ``` 238 | 239 | ## More about functions 240 | 241 | - Argument matching by position or by names 242 | - Calling a function with a list of arguments 243 | 244 | 245 | ```r 246 | args <- list(x = 1:10, trim = 0.3) 247 | do.call(mean, args) 248 | ``` 249 | 250 | - Default arguments 251 | 252 | 253 | ```r 254 | f <- function(x = 1, y = 2) x * y 255 | f <- function(x = 1, y = x + 2) x * y 256 | ``` 257 | 258 | - Missing arguments 259 | 260 | 261 | ```r 262 | f <- function(x = 1, y) { 263 | c(missing(x), missing(y)) 264 | } 265 | f() 266 | f(x = 1) 267 | ``` 268 | 269 | - Passing non-matched parameters `...` to an inner function 270 | 271 | 272 | ```r 273 | plot2 <- function(...) { 274 | message("Verbose plotting...") 275 | plot(...) 276 | } 277 | 278 | f <- function(...) list(...) 279 | ``` 280 | 281 | - Return values: last statement, explicit `return`, make output 282 | `invisible` 283 | 284 | 285 | ```r 286 | f1 <- function() 1 287 | f2 <- function() return(1) 288 | f3 <- function() return(invisible(1)) 289 | ``` 290 | 291 | - Explicit triggers before exiting. Useful to restore global state 292 | (plotting parameters, cleaning temporary files, ...) 293 | 294 | 295 | ```r 296 | f1 <- function(x) { 297 | on.exit(print("!")) 298 | x + 1 299 | } 300 | 301 | f2 <- function(x) { 302 | on.exit(print("!")) 303 | stop("Error") 304 | } 305 | ``` 306 | 307 | 308 | ```r 309 | f3 <- function() { 310 | on.exit(print("1")) 311 | on.exit(print("2")) 312 | invisible(TRUE) 313 | } 314 | 315 | 316 | f4 <- function() { 317 | on.exit(print("1")) 318 | on.exit(print("2"), add = TRUE) 319 | invisible(TRUE) 320 | } 321 | ``` 322 | 323 | - Anonymous functions, created on-the-flight and passed to `lapply` or 324 | other high-level functions. 325 | 326 | 327 | ```r 328 | function(x) x + y 329 | body(function(x) x + y) 330 | args(function(x) x + y) 331 | environment(function(x) x + y) 332 | ``` 333 | 334 | ## More about scoping 335 | 336 | *Lexical scoping*: default behaviour, current environment, then 337 | traversing *enclosing/parent environments*. 338 | 339 | 340 | *Dynamic scoping*: looking up variables in the *calling environment*, 341 | used in non-standard evaluation. 342 | 343 | 344 | # Functional programming 345 | 346 | **First-class functions** - a function is a value just like any other 347 | variable. Functions can thus be used as arguments to other 348 | functions. Functions are considered *first-class citizens*. 349 | 350 | **Higher-order functions** - refers to functions that take functions 351 | as parameters (input) or return functions (output). 352 | 353 | ## Closures 354 | 355 | > "An object is data with functions. A closure is a function with 356 | > data." - John D. Cool 357 | 358 | Closures: functions written by functions. They enclose the envionment 359 | of the parent function and can access all its variables. 360 | 361 | 362 | ```r 363 | make.power <- function(n) 364 | function(x) x^n 365 | 366 | 367 | cube <- make.power(3) 368 | square <- make.power(2) 369 | cube(2) 370 | square(2) 371 | environment(cube) 372 | environment(square) 373 | ``` 374 | 375 | Mutable state: a counter function 376 | 377 | 378 | ```r 379 | new_counter <- function() { 380 | i <- 0 381 | function() { 382 | i <<- i + 1 383 | i 384 | } 385 | } 386 | 387 | count1 <- new_counter() 388 | count2 <- new_counter() 389 | 390 | count1() 391 | count1() 392 | count2() 393 | 394 | environment(count1) 395 | environment(count2) 396 | environment(count1)$i 397 | environment(count2)$i 398 | ``` 399 | 400 | Questions: 401 | 402 | - What happens of we place the code `i <- 0` and the function 403 | definition outside of a function, i.e in the global environment? 404 | 405 | - What happens if we use `<-` instead of `<<-`? 406 | 407 | The `colorRampPallette` 408 | 409 | 410 | ```r 411 | colramp <- colorRampPalette(c("blue", "yellow")) 412 | colramp(5) 413 | plot(1:10, col = colramp(10), pch = 19, cex = 2, 414 | main = "colramp(10)") 415 | ``` 416 | 417 | ## Functional 418 | 419 | Take a function as input or return a function as output. 420 | 421 | - `Reduce(f, x)` uses a binary function to successively combine the 422 | elements of a given vector and a possibly given initial value. 423 | 424 | 425 | ```r 426 | L <- replicate(3, matrix(rnorm(9), 3), simplify = FALSE) 427 | Reduce("+", L) 428 | try(sum(L)) 429 | ``` 430 | 431 | 432 | ```r 433 | Reduce("+", list(1, 2, 3), init = 10) 434 | Reduce("+", list(1, 2, 3), accumulate = TRUE) 435 | Reduce("+", list(1, 2, 3), right = TRUE, accumulate = TRUE) 436 | ``` 437 | 438 | - `Filter(f, x)` extracts the elements of a vector for which a 439 | predicate (logical) function gives true. 440 | 441 | - `Negate(f)` creates the negation of a given function. 442 | 443 | 444 | ```r 445 | even <- function(x) x %% 2 == 0 446 | (y <- sample(100, 10)) 447 | Filter(even, y) 448 | Filter(Negate(even), y) 449 | ``` 450 | 451 | - `Map(f, ...)` applies a function to the corresponding elements of 452 | given vectors. Similar to `mapply` without any attempt to simplify. 453 | 454 | 455 | ```r 456 | Map(even, 1:3) 457 | ``` 458 | 459 | - `Find(f, x)` and `Position(f, x)` give the first (or last elements) 460 | and its position in the vector, for which a predicate (logical) 461 | function gives true. 462 | 463 | 464 | ```r 465 | Find(even, 10:15) 466 | Find(even, 10:15, right = TRUE) 467 | Position(Negate(even), 10:15) 468 | Position(Negate(even), 10:15, right = TRUE) 469 | ``` 470 | 471 | ## References 472 | 473 | - R Gentleman, *R Programming for Bioinformatics*, CRC Press, 2008 474 | - `?Map`, or any other of the higher order functions 475 | - Blog post, *Higher Order Functions in R*, John Myles White 476 | http://www.johnmyleswhite.com/notebook/2010/09/23/higher-order-functions-in-r/ 477 | 478 | # Vectorisation 479 | 480 | 481 | > Many operations in R are vectorized, and understanding and using 482 | > vectorization is an essential component of becoming a proficient 483 | > programmer. - R Gentleman in *R Programming for Bioinformatics*. 484 | 485 | A *vectorised computation* is one that, when applied to a vector (of 486 | length greater than 1), automatically operates directly on all 487 | elements of the input vector. 488 | 489 | 490 | 491 | ```r 492 | (x <- 1:5) 493 | (y <- 5:1) 494 | x + y 495 | ``` 496 | 497 | ## Recycling rule 498 | 499 | What is `x` and `y` are of different length: the shorter vector is 500 | replicate so that its length matches the longer ones. 501 | 502 | 503 | ```r 504 | (x <- 1:6) 505 | (y <- 1:2) 506 | x+y 507 | ``` 508 | 509 | If the shorter vector is not an even multiple of the longer, a warning 510 | is issued. 511 | 512 | ## Example 513 | 514 | Compute difference between times of events, `e`. Given `n` events, 515 | there will be `n-1` inter-event times. `interval[i] <- e[i+1] - e[i]` 516 | 517 | Procedural implementation: 518 | 519 | 520 | ```r 521 | diff1 <- function(e) { 522 | n <- length(e) 523 | interval <- rep(0, n - 1) 524 | for (i in 1:(n - 1)) 525 | interval[i] <- e[i + 1] - e[i] 526 | interval 527 | } 528 | e <- c(2, 5, 10.2, 12, 19) 529 | diff1(e) 530 | ``` 531 | 532 | Vectorised implementation: 533 | 534 | 535 | ```r 536 | diff2 <- function(e) { 537 | n <- length(e) 538 | e[-1] - e[-n] 539 | } 540 | e <- c(2, 5, 10.2, 12, 19) 541 | diff2(e) 542 | ``` 543 | 544 | ## `*apply` functions 545 | 546 | How to apply a function, iteratively, on a set of elements? 547 | 548 | `apply(X, MARGIN, FUN, ...)` 549 | 550 | 551 | - `MARGIN` = 1 for row, 2 for cols. 552 | - `FUN` = function to apply 553 | - `...` = extra args to function. 554 | - `simplify` = should the result be simplified if possible. 555 | 556 | 557 | `*apply` functions are (generally) **NOT** faster than loops, but more 558 | succint and thus clearer. 559 | 560 | 561 | ```r 562 | v <- rnorm(1000) ## or a list 563 | res <- numeric(length(v)) 564 | 565 | for (i in 1:length(v)) 566 | res[i] <- f(v[i]) 567 | 568 | res <- sapply(v, f) 569 | 570 | ## if f is vectorised 571 | f(v) 572 | ``` 573 | 574 | function | use case 575 | -------|--------------------------------------- 576 | apply | matrices, arrays, data.frames 577 | lapply | lists, vectors 578 | sapply | lists, vectors 579 | vapply | with a pre-specified type of return value 580 | tapply | atomic objects, typically vectors 581 | by | similar to tapply 582 | eapply | environments 583 | mapply | multiple values 584 | rapply | recursive version of lapply 585 | esApply | `ExpressionSet`, defined in `Biobase` 586 | 587 | See also the `BiocGenerics` package for `[l|m|s|t]apply` S4 generics, 588 | as well as parallel versions in the `parallel` package (see 589 | `Performance` section). 590 | 591 | 592 | See also the `plyr` package, that offers its own flavour of `apply` 593 | functions. 594 | 595 | in/out | list | data frame | array 596 | ------------|---------|------------|--------- 597 | list | llply() | ldply() | laply() 598 | data frame | dlply() | ddply() | daply() 599 | array | alply() | adply() | aaply() 600 | 601 | 602 | ## Other functions 603 | 604 | - `replicate` - repeated evaluation of an expression 605 | - `aggregate` - compute summary statistics of data subsets 606 | - `ave` - group averages over level combinations of factors 607 | - `sweep` - sweep out array summaries 608 | 609 | ## Anonymous functions 610 | 611 | A function defined/called without being assigned to an identifier and 612 | generally passed as argument to other functions. 613 | 614 | 615 | 616 | ```r 617 | M <- matrix(rnorm(100), 10) 618 | apply(M, 1, function(Mrow) 'do something with Mrow') 619 | apply(M, 2, function(Mcol) 'do something with Mcol') 620 | ``` 621 | 622 | ## Use case: integration 623 | 624 | 625 | 626 | ![`sin(x^2)/ (a + abs(x))`](./figs/sinfun.png) 627 | 628 | The `integrate` function approximates definite integrals by 629 | adaptive quadrature. 630 | 631 | 632 | 633 | ```r 634 | f <- function(x, a = 1) sin(x^2)/ (a + abs(x)) 635 | integrate(f, lower = -2, upper = 2) 636 | ``` 637 | 638 | It is not vectorised. 639 | 640 | 641 | ```r 642 | lo <- c(-2, 0) 643 | hi <- c(0, 2) 644 | integrate(f, lower = lo, upper = hi) 645 | ``` 646 | 647 | ## How to vectorise 648 | 649 | - To vectorise a function, we can explicitly wrap it inside a helper 650 | function that will take care of argument recycling (via `rep`), then 651 | loop over the inputs and call the non-vectorised function. 652 | 653 | - To vectorise a function, we can explicitate the vectorised 654 | calculation using `mapply`. 655 | 656 | 657 | ```r 658 | mapply(function(lo, hi) integrate(f, lo, hi)$value, 659 | lo, hi) 660 | ``` 661 | 662 | - Create a vectorised form using `Vectorize`. It takes a function 663 | (here, an anonymous function) as input and returns a function. 664 | 665 | 666 | ```r 667 | Integrate <- Vectorize( 668 | function(fn, lower, upper) 669 | integrate(fn, lower, upper)$value, 670 | vectorize.args=c("lower", "upper") 671 | ) 672 | Integrate(f, lower=lo, upper=hi) 673 | ``` 674 | 675 | ## **Efficient** apply-like functions 676 | 677 | These functions combine high-level vectorised syntax for clarity 678 | **and** efficient C-level vectorised imputation (see *Performance* 679 | section). 680 | 681 | - In `base`: rowSums, rowMeans, colSums, colMeans 682 | - In `Biobase`: rowQ, rowMax, rowMin, rowMedias, ... 683 | - In `genefilter`: rowttests, rowFtests, rowSds, rowVars, ... 684 | 685 | Generalisable on other data structures, like `ExpressionSet` 686 | instances. 687 | 688 | ## Parallelisation 689 | 690 | Vectorised operations are natural candidats for parallel execution. 691 | See later, *Parallel computation* topic. 692 | 693 | ## References 694 | 695 | - R Gentleman, *R Programming for Bioinformatics*, CRC Press, 2008 696 | - Ligges and Fox, *R Help Desk, How Can I Avoid This Loop or Make It 697 | Faster?* R News, Vol 8/1. May 2008. 698 | - Grouping functions: sapply vs. lapply vs. apply. vs. tapply 699 | vs. by vs. aggregate ... http://stackoverflow.com/questions/3505701/ 700 | -------------------------------------------------------------------------------- /03-debug.R: -------------------------------------------------------------------------------- 1 | 2 | message("This is a message for our dear users.") 3 | 4 | 5 | message("This is a message for our dear users. ", 6 | paste("Thank you for using our software", 7 | sw, "version", packageVersion(sw))) 8 | 9 | 10 | f1 <- function() { 11 | cat("I AM LOUD AND YOU CAN'T HELP IT.\n") 12 | ## do stuff 13 | invisible(TRUE) 14 | } 15 | f1() 16 | 17 | 18 | f2 <- function() { 19 | message("Sorry to interup, but...") 20 | ## do stuff 21 | invisible(TRUE) 22 | } 23 | f2() 24 | suppressMessages(f2()) 25 | 26 | 27 | f3 <- function(verbose = TRUE) { 28 | if (verbose) 29 | message("I am being verbose because you let me.") 30 | ## do stuff 31 | invisible(TRUE) 32 | } 33 | f3() 34 | f3(verbose = FALSE) 35 | 36 | 37 | warning("Do not ignore me. Somthing bad might have happened.") 38 | warning("Do not ignore me. Somthing bad might be happening.", immediate. = TRUE) 39 | 40 | 41 | f <- function(...) 42 | warning("Attention, attention, ...!", ...) 43 | f() 44 | f(call. = FALSE) 45 | 46 | 47 | warnings() 48 | last.warning 49 | 50 | 51 | option("warn") 52 | 53 | 54 | stop("This is the end, my friend.") 55 | 56 | 57 | log(c(2, 1, 0, -1, 2)); print('end') ## warning 58 | xor(c(TRUE, FALSE)); print ('end') ## error 59 | 60 | 61 | geterrmessage() 62 | 63 | 64 | ## Import the log4r package. 65 | library('log4r') 66 | 67 | ## Create a new logger object with create.logger(). 68 | logger <- create.logger() 69 | 70 | ## Set the logger's file output: currently only allows flat files. 71 | logfile(logger) <- file.path('base.log') 72 | 73 | ## Set the current level of the logger. 74 | level(logger) <- "INFO" 75 | 76 | ## Try logging messages at different priority levels. 77 | debug(logger, 'A Debugging Message') ## Won't print anything 78 | info(logger, 'An Info Message') 79 | warn(logger, 'A Warning Message') 80 | error(logger, 'An Error Message') 81 | fatal(logger, 'A Fatal Error Message') 82 | 83 | 84 | n <- 10 85 | pb <- txtProgressBar(min = 0, max = n, style = 3) 86 | for (i in 1:n) { 87 | setTxtProgressBar(pb, i) 88 | Sys.sleep(0.5) 89 | } 90 | close(pb) 91 | 92 | 93 | library("progress") 94 | pb <- progress_bar$new(total = n) 95 | for (i in 1:n) { 96 | pb$tick() 97 | Sys.sleep(0.5) 98 | } 99 | 100 | 101 | if (!condition) stop(...) 102 | 103 | 104 | stopifnot(TRUE) 105 | stopifnot(TRUE, FALSE) 106 | 107 | 108 | f <- function(x) { 109 | stopifnot(is.numeric(x), length(x) == 1) 110 | invisible(TRUE) 111 | } 112 | 113 | f(1) 114 | f("1") 115 | f(1:2) 116 | f(letters) 117 | 118 | 119 | x <- "1" 120 | library("assertthat") 121 | stopifnot(is.numeric(x)) 122 | assert_that(is.numeric(x)) 123 | assert_that(length(x) == 2) 124 | 125 | 126 | a <- sqrt(2) 127 | a * a == 2 128 | a * a - 2 129 | 130 | 131 | 1L + 2L == 3L 132 | 1.0 + 2.0 == 3.0 133 | 0.1 + 0.2 == 0.3 134 | 135 | 136 | all.equal(0.1 + 0.2, 0.3) 137 | all.equal(0.1 + 0.2, 3.0) 138 | isTRUE(all.equal(0.1 + 0.2, 3)) ## when you just want TRUE/FALSE 139 | 140 | 141 | 1 == NULL 142 | all.equal(1, NULL) 143 | identical(1, NULL) 144 | identical(1, 1.) ## TRUE in R (both are stored as doubles) 145 | all.equal(1, 1L) 146 | identical(1, 1L) ## stored as different types 147 | 148 | 149 | col_means <- function(df) { 150 | numeric <- sapply(df, is.numeric) 151 | numeric_cols <- df[, numeric] 152 | data.frame(lapply(numeric_cols, mean)) 153 | } 154 | 155 | 156 | col_means(mtcars) 157 | col_means(mtcars[, 0]) 158 | col_means(mtcars[0, ]) 159 | col_means(mtcars[, "mpg", drop = FALSE]) 160 | col_means(1:10) 161 | col_means(as.matrix(mtcars)) 162 | col_means(as.list(mtcars)) 163 | 164 | mtcars2 <- mtcars 165 | mtcars2[-1] <- lapply(mtcars2[-1], as.character) 166 | col_means(mtcars2) 167 | 168 | 169 | e <- function(i) { 170 | x <- 1:4 171 | if (i < 5) x[1:2] 172 | else x[-1:2] 173 | } 174 | f <- function() sapply(1:10, e) 175 | g <- function() f() 176 | 177 | 178 | g() 179 | traceback() 180 | 181 | 182 | debug(g) 183 | g() 184 | 185 | 186 | e <- function(i) { 187 | x <- 1:4 188 | if (i < 5) x[1:2] 189 | else x[-1:2] # oops! x[-(1:2)] 190 | } 191 | f <- function() sapply(1:10, e) 192 | g <- function() f() 193 | 194 | 195 | ## make sure you have the 'sequences' package. 196 | ## Get readFasta2, the function to debug 197 | library(devtools) 198 | install_github("lgatto/sequences") ## from github 199 | ## or 200 | install.packages("sequences") ## from CRAN 201 | library("sequences") 202 | sequences:::debugme() 203 | ## Get an example file 204 | f <- dir(system.file("extdata", package = "sequences"), 205 | full.names=TRUE, pattern = "moreDnaSeqs.fasta") 206 | ## BANG! 207 | readFasta2(f) 208 | 209 | 210 | f <- function() { 211 | x <- "1" 212 | log(x) 213 | message("x was the ", class(x), " ", x) 214 | } 215 | f() 216 | 217 | 218 | f <- function() { 219 | x <- "1" 220 | try(log(x)) 221 | message("x was the ", class(x), " ", x) 222 | } 223 | f() 224 | 225 | 226 | try({ 227 | a <- 1 228 | b <- "2" 229 | a + b 230 | }) 231 | 232 | 233 | success <- try(1 + 2) 234 | failure <- try(1 + "2", silent = TRUE) 235 | class(success) 236 | class(failure) 237 | 238 | 239 | inherits(failure, "try-error") 240 | 241 | if (inherits(failure, "try-error")) 242 | message("There was an error here.") 243 | 244 | 245 | el <- list(1:10, c(-1, 1), TRUE, "1") 246 | res <- lapply(el, log) 247 | res 248 | res <- lapply(el, function(x) try(log(x))) 249 | res 250 | 251 | 252 | default <- NULL 253 | try(default <- read.csv("possibly-bad-input.csv"), silent = TRUE) 254 | 255 | 256 | f <- function(x) 257 | if (x == 1) stop("Error!") else 1 258 | 259 | f(1) 260 | f(2) 261 | 262 | 263 | safef <- failwith(NULL, f) 264 | safef(1) 265 | safef(2) 266 | 267 | 268 | f <- function() { 269 | x <- "1" 270 | tryCatch(log(x), 271 | error = function(e) cat("There was an error!\n")) 272 | message("x was the ", class(x), " ", x) 273 | } 274 | f() 275 | 276 | 277 | show_condition <- function(code) { 278 | tryCatch(code, 279 | error = function(c) "error", 280 | warning = function(c) "warning", 281 | message = function(c) "message" 282 | ) 283 | } 284 | show_condition(stop("!")) 285 | show_condition(warning("?!")) 286 | show_condition(message("?")) 287 | show_condition(0) 288 | 289 | 290 | read.csv2 <- function(file, ...) { 291 | tryCatch(read.csv(file, ...), error = function(c) { 292 | c$message <- paste0(c$message, " (in ", file, ")") 293 | stop(c) 294 | }) 295 | } 296 | read.csv("code/dummy.csv") 297 | read.csv2("code/dummy.csv") 298 | 299 | 300 | f <- function(x = 10) { 301 | lapply(seq_len(x), function(i) { 302 | ## make an example 2x2 contingency table 303 | d <- matrix(sample(4:10, 4), nrow = 2, ncol = 2) 304 | ## will produce warning if there is a 5 or less 305 | ## in the contingency table 306 | chisq.test(d) 307 | }) 308 | } 309 | 310 | 311 | set.seed(1) 312 | f() 313 | set.seed(1) 314 | withCallingHandlers(f(), warning=function(e) recover()) 315 | 316 | 317 | f <- function() g() 318 | g <- function() h() 319 | h <- function() stop("!") 320 | 321 | tryCatch(f(), error = function(e) print(sys.calls())) 322 | withCallingHandlers(f(), error = function(e) print(sys.calls())) 323 | 324 | 325 | safelog <- function(x) { 326 | tryCatch(log(x), 327 | error = function(e) paste("an error with input", x), 328 | warning = function(e) paste("a warning with input", x)) 329 | } 330 | 331 | 332 | log(1) 333 | safelog(1) 334 | log(-1) 335 | safelog(-1) 336 | log("a") 337 | safelog("a") 338 | 339 | 340 | safelog <- function(x) { 341 | tryCatch(log(x), 342 | error = function(e) paste("an error with input", x), 343 | warning = function(e) paste("a warning with input", x)) 344 | } 345 | 346 | 347 | ## Report whenever e invoked 348 | trace(sum) 349 | hist(rnorm(100)) 350 | untrace(sum) 351 | 352 | 353 | ## Evaluate arbitrary code whenever e invoked 354 | trace(e, quote(cat("i am", i, "\n"))) 355 | ## Another way to enter browser whenver e invoked 356 | trace(e, browser) 357 | ## stop tracing 358 | untrace(e) 359 | 360 | 361 | f <- function() { 362 | ## make an example 2x2 contingency table 363 | d <- matrix(sample(4:10, 4), nrow=2, ncol=2) 364 | chisq.test(d) 365 | } 366 | set.seed(1) 367 | f() ## no warning 368 | 369 | set.seed(11) 370 | f() ## warning 371 | 372 | 373 | if (any(d < 5)) 374 | browser() 375 | 376 | 377 | as.list(body(f)) 378 | 379 | 380 | trace("f", quote(if (any(d < 5)) browser()), at = 3) 381 | 382 | 383 | f 384 | body(f) 385 | 386 | 387 | set.seed(1) 388 | f() ## normal execution 389 | 390 | set.seed(11) 391 | f() ## enters browser mode 392 | 393 | 394 | library("MSnbase") 395 | data(itraqdata) 396 | x <- itraqdata[[1]] 397 | plot(x, full=TRUE) 398 | 399 | 400 | debug(plot) 401 | plot(x, full=TRUE) 402 | 403 | 404 | trace("plot", browser, 405 | signature = c("Spectrum", "missing")) 406 | plot(x, full=TRUE) 407 | 408 | -------------------------------------------------------------------------------- /03-debug.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Part III: Debugging" 3 | author: "Laurent Gatto" 4 | --- 5 | 6 | # Overview 7 | 8 | - Defensive programming 9 | - Debbugging: techniques and tools 10 | - Condition handling 11 | - Testing 12 | 13 | # Defensive programming 14 | 15 | Before we begin with debugging, let's look at ways to prevent bugs 16 | (more at the end of this part). 17 | 18 | *Defensive programming*: 19 | - making the code work in a predicable manner 20 | - writing code that fails in a well-defined manner 21 | - if something *weird* happens, either properly deal with it, of fail 22 | quickly and loudly 23 | 24 | The level of defensiveness will depend whether you write a function 25 | for interactive of programmatic usage. 26 | 27 | ## Talk to your users 28 | 29 | ### Diagnostic messages 30 | 31 | ```{r, eval=FALSE} 32 | message("This is a message for our dear users.") 33 | ``` 34 | 35 | ```{r, eval=FALSE} 36 | message("This is a message for our dear users. ", 37 | paste("Thank you for using our software", 38 | sw, "version", packageVersion(sw))) 39 | ``` 40 | 41 | Do not use `print` or `cat`: 42 | 43 | ```{r, eval=FALSE} 44 | f1 <- function() { 45 | cat("I AM LOUD AND YOU CAN'T HELP IT.\n") 46 | ## do stuff 47 | invisible(TRUE) 48 | } 49 | f1() 50 | ``` 51 | 52 | ```{r, eval=FALSE} 53 | f2 <- function() { 54 | message("Sorry to interup, but...") 55 | ## do stuff 56 | invisible(TRUE) 57 | } 58 | f2() 59 | suppressMessages(f2()) 60 | ``` 61 | 62 | Of course, it is also possible to manually define verbosity. This 63 | makes you write more code for a feature readily available. But still 64 | better to use `message`. 65 | 66 | ```{r, eval=FALSE} 67 | f3 <- function(verbose = TRUE) { 68 | if (verbose) 69 | message("I am being verbose because you let me.") 70 | ## do stuff 71 | invisible(TRUE) 72 | } 73 | f3() 74 | f3(verbose = FALSE) 75 | ``` 76 | 77 | ### Warning 78 | 79 | > There is a problem with warnings. No one reads them. Pat Burns, in 80 | > *R inferno*. 81 | 82 | ```{r, eval=FALSE} 83 | warning("Do not ignore me. Somthing bad might have happened.") 84 | warning("Do not ignore me. Somthing bad might be happening.", immediate. = TRUE) 85 | ``` 86 | 87 | ```{r, eval=FALSE} 88 | f <- function(...) 89 | warning("Attention, attention, ...!", ...) 90 | f() 91 | f(call. = FALSE) 92 | ``` 93 | Print warnings after they have been thrown. 94 | 95 | ```{r, eval=FALSE} 96 | warnings() 97 | last.warning 98 | ``` 99 | 100 | See also to `warn` option in `?options` . 101 | 102 | ```{r, eval=FALSE} 103 | option("warn") 104 | ``` 105 | 106 | ### Error 107 | 108 | ```{r, eval=FALSE} 109 | stop("This is the end, my friend.") 110 | ``` 111 | 112 | ```{r, eval=FALSE} 113 | log(c(2, 1, 0, -1, 2)); print('end') ## warning 114 | xor(c(TRUE, FALSE)); print ('end') ## error 115 | ``` 116 | 117 | Stop also has a `call.` parameter. 118 | 119 | ```{r, eval=FALSE} 120 | geterrmessage() 121 | ``` 122 | 123 | ### Logging 124 | 125 | See for example the [`log4r`](https://github.com/johnmyleswhite/log4r) 126 | package: 127 | 128 | ```{r, eval=FALSE} 129 | ## Import the log4r package. 130 | library('log4r') 131 | 132 | ## Create a new logger object with create.logger(). 133 | logger <- create.logger() 134 | 135 | ## Set the logger's file output: currently only allows flat files. 136 | logfile(logger) <- file.path('base.log') 137 | 138 | ## Set the current level of the logger. 139 | level(logger) <- "INFO" 140 | 141 | ## Try logging messages at different priority levels. 142 | debug(logger, 'A Debugging Message') ## Won't print anything 143 | info(logger, 'An Info Message') 144 | warn(logger, 'A Warning Message') 145 | error(logger, 'An Error Message') 146 | fatal(logger, 'A Fatal Error Message') 147 | ``` 148 | 149 | ### Progress bars 150 | 151 | - `utils::txtProgressBar` function 152 | 153 | ```{r, eval=FALSE} 154 | n <- 10 155 | pb <- txtProgressBar(min = 0, max = n, style = 3) 156 | for (i in 1:n) { 157 | setTxtProgressBar(pb, i) 158 | Sys.sleep(0.5) 159 | } 160 | close(pb) 161 | ``` 162 | 163 | - [`progress`](https://github.com/gaborcsardi/progress) package 164 | 165 | ```{r, eval=FALSE} 166 | library("progress") 167 | pb <- progress_bar$new(total = n) 168 | for (i in 1:n) { 169 | pb$tick() 170 | Sys.sleep(0.5) 171 | } 172 | ``` 173 | 174 | Tip: do not over use progress bars. Ideally, a user should be 175 | confident that everything is under control and progress is made while 176 | waiting for a function to return. In my experience, a progress bar is 177 | usefull when there is a specific and/or user-defined number of 178 | iterations, such a *iterating over n files*, or *running a simulation 179 | n times*. 180 | 181 | **Question**: What about mixing progress bars and verbosity. 182 | 183 | 184 | ## KISS 185 | 186 | Keep your functions simple and stupid (and short). 187 | 188 | ## Failing fast and well 189 | 190 | > Bounds errors are ugly, nasty things that should be stamped out 191 | > whenever possible. One solution to this problem is to use the 192 | > `assert` statement. The `assert` statement tells C++, "This can 193 | > never happen, but if it does, abort the program in a nice way." One 194 | > thing you find out as you gain programming experience is that things 195 | > that can "never happen" happen with alarming frequency. So just to 196 | > make sure that things work as they are supposed to, it’s a good idea 197 | > to put lots of self checks in your program. -- Practical C++ 198 | > Programming, Steve Oualline, O'Reilly. 199 | 200 | ```{r, eval=FALSE} 201 | if (!condition) stop(...) 202 | ``` 203 | 204 | ```{r, eval=FALSE} 205 | stopifnot(TRUE) 206 | stopifnot(TRUE, FALSE) 207 | ``` 208 | 209 | For example to test input classes, lengths, ... 210 | 211 | ```{r, eval=FALSE} 212 | f <- function(x) { 213 | stopifnot(is.numeric(x), length(x) == 1) 214 | invisible(TRUE) 215 | } 216 | 217 | f(1) 218 | f("1") 219 | f(1:2) 220 | f(letters) 221 | ``` 222 | 223 | The [`assertthat`](https://github.com/hadley/assertthat) package: 224 | 225 | ```{r, eval=FALSE} 226 | x <- "1" 227 | library("assertthat") 228 | stopifnot(is.numeric(x)) 229 | assert_that(is.numeric(x)) 230 | assert_that(length(x) == 2) 231 | ``` 232 | 233 | * `assert_that()` signal an error. 234 | * `see_if()` returns a logical value, with the error message as an attribute. 235 | * `validate_that()` returns `TRUE` on success, otherwise returns the error as 236 | a string. 237 | 238 | 239 | 240 | * `is.flag(x)`: is x `TRUE` or `FALSE`? (a boolean flag) 241 | * `is.string(x)`: is x a length 1 character vector? 242 | * `has_name(x, nm)`, `x %has_name% nm`: does `x` have component `nm`? 243 | * `has_attr(x, attr)`, `x %has_attr% attr`: does `x` have attribute `attr`? 244 | * `is.count(x)`: is x a single positive integer? 245 | * `are_equal(x, y)`: are `x` and `y` equal? 246 | * `not_empty(x)`: are all dimensions of `x` greater than 0? 247 | * `noNA(x)`: is `x` free from missing values? 248 | * `is.dir(path)`: is `path` a directory? 249 | * `is.writeable(path)`/`is.readable(path)`: is `path` writeable/readable? 250 | * `has_extension(path, extension)`: does `file` have given `extension`? 251 | 252 | 253 | ## Consistency and predictability 254 | 255 | Reminder of the interactive use vs programming examples: 256 | - `[` and `drop` 257 | - `sapply`, `lapply`, `vapply` 258 | 259 | Remember also the concept of *tidy data*. 260 | 261 | ## Comparisons 262 | 263 | ### Floating point issues to be aware of 264 | 265 | R FAQ [7.31](http://cran.r-project.org/doc/FAQ/R-FAQ.html#Why-doesn_0027t-R-think-these-numbers-are-equal_003f)? 266 | 267 | 268 | ```{r, eval=FALSE} 269 | a <- sqrt(2) 270 | a * a == 2 271 | a * a - 2 272 | ``` 273 | 274 | ```{r, eval=FALSE} 275 | 1L + 2L == 3L 276 | 1.0 + 2.0 == 3.0 277 | 0.1 + 0.2 == 0.3 278 | ``` 279 | 280 | ### Floating point: how to compare 281 | 282 | - `all.equal` compares R objects for *near equality*. Takes into 283 | account whether object attributes and names ought the taken into 284 | consideration (`check.attributes` and `check.names` parameters) and 285 | tolerance, which is machine dependent. 286 | 287 | ```{r, eval=FALSE} 288 | all.equal(0.1 + 0.2, 0.3) 289 | all.equal(0.1 + 0.2, 3.0) 290 | isTRUE(all.equal(0.1 + 0.2, 3)) ## when you just want TRUE/FALSE 291 | ``` 292 | 293 | ### Exact identity 294 | 295 | `identical`: test objects for exact equality 296 | 297 | ```{r, eval=FALSE} 298 | 1 == NULL 299 | all.equal(1, NULL) 300 | identical(1, NULL) 301 | identical(1, 1.) ## TRUE in R (both are stored as doubles) 302 | all.equal(1, 1L) 303 | identical(1, 1L) ## stored as different types 304 | ``` 305 | 306 | Appropriate within `if`, `while` condition statements. (not 307 | `all.equal`, unless wrapped in `isTRUE`). 308 | 309 | ## Exercise 310 | 311 | (From [adv-r](http://adv-r.had.co.nz/Exceptions-Debugging.html#defensive-programming).) 312 | 313 | The `col_means` function computes the means of all numeric columns in 314 | a data frame. 315 | 316 | ```{r, eval=FALSE} 317 | col_means <- function(df) { 318 | numeric <- sapply(df, is.numeric) 319 | numeric_cols <- df[, numeric] 320 | data.frame(lapply(numeric_cols, mean)) 321 | } 322 | ``` 323 | 324 | Is it a robust function? What happens if there are unusual inputs. 325 | 326 | ```{r, eval=FALSE} 327 | col_means(mtcars) 328 | col_means(mtcars[, 0]) 329 | col_means(mtcars[0, ]) 330 | col_means(mtcars[, "mpg", drop = FALSE]) 331 | col_means(1:10) 332 | col_means(as.matrix(mtcars)) 333 | col_means(as.list(mtcars)) 334 | 335 | mtcars2 <- mtcars 336 | mtcars2[-1] <- lapply(mtcars2[-1], as.character) 337 | col_means(mtcars2) 338 | ``` 339 | # Debugging: techniques and tools 340 | 341 | ### Shit happens 342 | 343 | > Funding your bug is a process of confirming the many things that you 344 | > believe are true - until you find one which is not true. -- Norm Matloff 345 | 346 | #### 1. Identify the bug (the difficult part) 347 | - Something went wrong! 348 | - Where in the code does it happen? 349 | - Does it happen every time? 350 | - What input triggered it? 351 | - Report it (even if it is in your code - use github issues, for 352 | example). 353 | 354 | **Tip**: Beware of your intuition. As a scientist, do what you are 355 | used to: generate a hypotheses, *design an experiment* to test them, 356 | and record the results. 357 | 358 | #### 2. Fix it (the less difficult part) 359 | - Correct the bug. 360 | - Make sure that bug will not repeat itself! 361 | - How can we be confident that we haven't introduced new bugs? 362 | 363 | ## Tools 364 | 365 | - `print`/`cat` 366 | - `traceback()` 367 | - `browser()` 368 | - `options(error = )`, `options(warn = )` 369 | - `trace` 370 | - IDE: RStudio, StatET, emacs' ess tracebug. 371 | 372 | 373 | ### Manually 374 | 375 | Inserting `print` and `cat` statements in the code. Works, but time 376 | consuming. 377 | 378 | ### Finding the bug 379 | 380 | > Many bugs are subtle and hard to find. -- Hadley Wickham 381 | 382 | Bugs are shy, and are generally hidden, deep down in your code, to 383 | make it as difficult as possible for you to find them. 384 | 385 | ```{r, echo=TRUE} 386 | e <- function(i) { 387 | x <- 1:4 388 | if (i < 5) x[1:2] 389 | else x[-1:2] 390 | } 391 | f <- function() sapply(1:10, e) 392 | g <- function() f() 393 | ``` 394 | 395 | `traceback`: lists the sequence of calls that lead to the error 396 | 397 | ```{r, eval=FALSE} 398 | g() 399 | traceback() 400 | ``` 401 | 402 | If the source code is available (for example for `source()`d code), 403 | then traceback will display the exact location in the function, in the 404 | form `filename.R#linenum`. 405 | 406 | ### Browsing the error 407 | 408 | - Register the function for debugging: `debug(g)`. This adds a call to 409 | the `browser()` function (see also below) and the very beginning of 410 | the function `g`. 411 | 412 | - Every call to `g()` will not be run interactively. 413 | 414 | - To finish debugging: `undebug(g)`. 415 | 416 | 417 | ```{r, eval=FALSE} 418 | debug(g) 419 | g() 420 | ``` 421 | 422 | How to debug: 423 | 424 | - `n` executes the next step of the function. Use `print(n)` or 425 | `get(n)` to print/access the variable `n`. 426 | - `s` to step into the next function. If it is not a function, same as 427 | `n`. 428 | - `f` to finish execution of the current loop of function. 429 | - `c` to leave interactive debugging and continue regular execution of 430 | the function. 431 | - `Q` to stop debugging, terminate the function and return to the 432 | global workspace. 433 | - `where` print a stack trace of all active function calls. 434 | - `Enter` same as `n` (or `s`, if it was used most recently), unless 435 | `options(browserNLdisabled = TRUE)` is set. 436 | 437 | To fix a function when the source code is not directly available, use 438 | `fix(fun)`. This will open the function's source code for editing and, 439 | after saving and closing, store the updated function in the global 440 | workspace. 441 | 442 | ### Breakpoints 443 | 444 | - Add a call to `browser()` anywhere in the source code to execute the 445 | rest of the code interactively. 446 | 447 | - To run breakpoints conditionally, wrap the call to `browser()` in a 448 | condition. 449 | 450 | ### Setting options 451 | 452 | - `options(error = recover)`: display the call stack and choose where 453 | to step in. 454 | 455 | - `options(error = stop)` or `options(error = NULL)`: reset to default 456 | behaviour. 457 | 458 | - `options(warn = 'numeric')`: sets the handling of warning 459 | messages. If `warn` is negative all warnings are ignored. If `warn` 460 | is zero (the default) warnings are stored until the top-level 461 | function returns. If 10 or fewer warnings were signalled they will 462 | be printed otherwise a message saying how many were signalled. An 463 | object called `last.warning` is created and can be printed through 464 | the function `warnings`. If `warn` is one, warnings are printed as 465 | they occur. If `warn` is two or larger all warnings are turned into 466 | errors. 467 | 468 | - `options(error = dump.frames)`: like `recover` but for 469 | non-interactive use. Will create a `last.dump.rda` file in the 470 | current working directory, which can then be reloaded in an 471 | interactive session to re-inter interactive debugging (using 472 | `debugger()`). 473 | 474 | ### Debugging with IDEs 475 | 476 | - RSudio: `Show Traceback`, `Rerun with Debug` and interactive debugging. 477 | 478 | ![RStudio debugging 1](./figs/debugRStudio1.png) 479 | ![RStudio debugging 2](./figs/debugRStudio2.png) 480 | 481 | - StatET (Eclipse plugin) 482 | 483 | - [emacs ESS and tracebug](http://ess.r-project.org/Manual/ess.html#Developing-with-ESS) 484 | 485 | ### Exercise 486 | 487 | 1. Your turn - play with `traceback`, `recover` and `debug`: 488 | 489 | (Example originally by Martin Morgan and Robert Gentleman.) 490 | 491 | ```{r, echo=TRUE} 492 | e <- function(i) { 493 | x <- 1:4 494 | if (i < 5) x[1:2] 495 | else x[-1:2] # oops! x[-(1:2)] 496 | } 497 | f <- function() sapply(1:10, e) 498 | g <- function() f() 499 | ``` 500 | 501 | 2. Fix `readFasta2`. 502 | 503 | ```{r, eval=FALSE} 504 | ## make sure you have the 'sequences' package. 505 | ## Get readFasta2, the function to debug 506 | library(devtools) 507 | install_github("lgatto/sequences") ## from github 508 | ## or 509 | install.packages("sequences") ## from CRAN 510 | library("sequences") 511 | sequences:::debugme() 512 | ## Get an example file 513 | f <- dir(system.file("extdata", package = "sequences"), 514 | full.names=TRUE, pattern = "moreDnaSeqs.fasta") 515 | ## BANG! 516 | readFasta2(f) 517 | ``` 518 | 519 | ## Condition handling 520 | 521 | ### `try` and `tryCatch` 522 | 523 | The function `f` will never terminate. 524 | 525 | ```{r, eval=FALSE} 526 | f <- function() { 527 | x <- "1" 528 | log(x) 529 | message("x was the ", class(x), " ", x) 530 | } 531 | f() 532 | ``` 533 | 534 | Use `try` to proceed with the execution even when an error occurs. 535 | 536 | ```{r, eval=FALSE} 537 | f <- function() { 538 | x <- "1" 539 | try(log(x)) 540 | message("x was the ", class(x), " ", x) 541 | } 542 | f() 543 | ``` 544 | 545 | ```{r, eval=FALSE} 546 | try({ 547 | a <- 1 548 | b <- "2" 549 | a + b 550 | }) 551 | ``` 552 | 553 | In case of error, `try` returns a object of class `try-error`: 554 | 555 | ```{r, eval=FALSE} 556 | success <- try(1 + 2) 557 | failure <- try(1 + "2", silent = TRUE) 558 | class(success) 559 | class(failure) 560 | ``` 561 | 562 | ```{r, eval=FALSE} 563 | inherits(failure, "try-error") 564 | 565 | if (inherits(failure, "try-error")) 566 | message("There was an error here.") 567 | ``` 568 | Handling errors is particularly useful to iterate over all elements of 569 | an input, despite errors (and inspecting/handling/fixing the errors 570 | afterwards). 571 | 572 | ```{r, eval=FALSE} 573 | el <- list(1:10, c(-1, 1), TRUE, "1") 574 | res <- lapply(el, log) 575 | res 576 | res <- lapply(el, function(x) try(log(x))) 577 | res 578 | ``` 579 | 580 | #### Hadley's tip 581 | 582 | > Another useful `try()` idiom is using a default value if an 583 | > expression fails. Simply assign the default outside the `try` block, 584 | > and then run the risky code: 585 | 586 | ```{r, eval=FALSE} 587 | default <- NULL 588 | try(default <- read.csv("possibly-bad-input.csv"), silent = TRUE) 589 | ``` 590 | 591 | > There is also `plyr::failwith()`, which makes this strategy even 592 | > easier to implement. 593 | 594 | ```{r, eval=FALSE} 595 | f <- function(x) 596 | if (x == 1) stop("Error!") else 1 597 | 598 | f(1) 599 | f(2) 600 | 601 | 602 | safef <- failwith(NULL, f) 603 | safef(1) 604 | safef(2) 605 | ``` 606 | 607 | Use `tryCatch` to specify a behaviour (handler function) in case of 608 | error, warning or message. 609 | 610 | ```{r, eval=FALSE} 611 | f <- function() { 612 | x <- "1" 613 | tryCatch(log(x), 614 | error = function(e) cat("There was an error!\n")) 615 | message("x was the ", class(x), " ", x) 616 | } 617 | f() 618 | ``` 619 | 620 | More example from Hadleys' *Advanced R* book. 621 | 622 | ```{r, eval=FALSE} 623 | show_condition <- function(code) { 624 | tryCatch(code, 625 | error = function(c) "error", 626 | warning = function(c) "warning", 627 | message = function(c) "message" 628 | ) 629 | } 630 | show_condition(stop("!")) 631 | show_condition(warning("?!")) 632 | show_condition(message("?")) 633 | show_condition(0) 634 | ``` 635 | 636 | A more informative `read.csv` version: 637 | 638 | ```{r, eval=FALSE} 639 | read.csv2 <- function(file, ...) { 640 | tryCatch(read.csv(file, ...), error = function(c) { 641 | c$message <- paste0(c$message, " (in ", file, ")") 642 | stop(c) 643 | }) 644 | } 645 | read.csv("code/dummy.csv") 646 | read.csv2("code/dummy.csv") 647 | ``` 648 | 649 | `tryCatch` has a `finally` argument that specifies a code block to be 650 | executed regardless of whether the initial expression succeeds or 651 | fails. Usefull, for example, to clean up (deleting files, closing 652 | connections, ...). 653 | 654 | 655 | ### `withCallingHandlers` 656 | 657 | 658 | The `withCallingHandlers` function allows to defined special behaviour 659 | in case of *unusual conditions*, including warnings and errors. In the 660 | example below, we start a browser in case of (obscure) warnings. 661 | 662 | ```{r, eval=FALSE} 663 | f <- function(x = 10) { 664 | lapply(seq_len(x), function(i) { 665 | ## make an example 2x2 contingency table 666 | d <- matrix(sample(4:10, 4), nrow = 2, ncol = 2) 667 | ## will produce warning if there is a 5 or less 668 | ## in the contingency table 669 | chisq.test(d) 670 | }) 671 | } 672 | ``` 673 | 674 | ```{r, eval=FALSE} 675 | set.seed(1) 676 | f() 677 | set.seed(1) 678 | withCallingHandlers(f(), warning=function(e) recover()) 679 | ``` 680 | 681 | ### Difference between `tryCatch` and `withCallingHandlers` 682 | 683 | (From [*Advanced R*](http://adv-r.had.co.nz/Exceptions-Debugging.html#condition-handling)) 684 | 685 | The handlers in `withCallingHandlers()` are called in the context of 686 | the call that generated the condition whereas the handlers in 687 | `tryCatch()` are called in the context of `tryCatch()`. This is shown 688 | here with `sys.calls()`, which is the run-time equivalent of 689 | `traceback()` -- it lists all calls leading to the current function. 690 | 691 | ```{r, eval=FALSE} 692 | f <- function() g() 693 | g <- function() h() 694 | h <- function() stop("!") 695 | 696 | tryCatch(f(), error = function(e) print(sys.calls())) 697 | withCallingHandlers(f(), error = function(e) print(sys.calls())) 698 | ``` 699 | 700 | ### Debugging at the C level with `ddd` or `gdb` 701 | 702 | Demo 703 | 704 | ### Exercise 705 | 706 | ```{r, eval=TRUE, echo=FALSE} 707 | safelog <- function(x) { 708 | tryCatch(log(x), 709 | error = function(e) paste("an error with input", x), 710 | warning = function(e) paste("a warning with input", x)) 711 | } 712 | ``` 713 | 714 | 715 | Write a new `safelog` function that catches and handles errors and 716 | warnings to emulate the following behaviour. 717 | 718 | ```{r, echo=TRUE, eval=TRUE} 719 | log(1) 720 | safelog(1) 721 | log(-1) 722 | safelog(-1) 723 | log("a") 724 | safelog("a") 725 | ``` 726 | 727 | **Answer** 728 | 729 | ```{r, echo=TRUE} 730 | safelog <- function(x) { 731 | tryCatch(log(x), 732 | error = function(e) paste("an error with input", x), 733 | warning = function(e) paste("a warning with input", x)) 734 | } 735 | ``` 736 | 737 | ## Tracing code 738 | 739 | From `?trace`: 740 | 741 | > A call to `trace` allows you to insert debugging code (e.g., a call 742 | > to `browser` or `recover`) at chosen places in any function. A call 743 | > to `untrace` cancels the tracing. 744 | 745 | ```{r, eval=FALSE} 746 | ## Report whenever e invoked 747 | trace(sum) 748 | hist(rnorm(100)) 749 | untrace(sum) 750 | ``` 751 | 752 | ```{r, eval=FALSE} 753 | ## Evaluate arbitrary code whenever e invoked 754 | trace(e, quote(cat("i am", i, "\n"))) 755 | ## Another way to enter browser whenver e invoked 756 | trace(e, browser) 757 | ## stop tracing 758 | untrace(e) 759 | ``` 760 | 761 | ### Inserting code dynamically 762 | 763 | > The `trace` function operates by constructing a revised version of 764 | > the function (or of the method, if ‘signature’ is supplied), and 765 | > assigning the new object back where the original was found. 766 | 767 | ```{r, eval=FALSE} 768 | f <- function() { 769 | ## make an example 2x2 contingency table 770 | d <- matrix(sample(4:10, 4), nrow=2, ncol=2) 771 | chisq.test(d) 772 | } 773 | set.seed(1) 774 | f() ## no warning 775 | 776 | set.seed(11) 777 | f() ## warning 778 | ``` 779 | 780 | We want to conditionally enter brower mode, when an element of `d` is 781 | smaller than 5. 782 | 783 | ```{r, eval=FALSE} 784 | if (any(d < 5)) 785 | browser() 786 | ``` 787 | 788 | This expression must be executed at a specific location in our function `f`: 789 | 790 | ```{r, eval=FALSE} 791 | as.list(body(f)) 792 | ``` 793 | 794 | ```{r, eval=FALSE} 795 | trace("f", quote(if (any(d < 5)) browser()), at = 3) 796 | ``` 797 | 798 | We can now run our updated function `f` 799 | 800 | ```{r, eval=FALSE} 801 | f 802 | body(f) 803 | ``` 804 | 805 | ```{r, eval=FALSE} 806 | set.seed(1) 807 | f() ## normal execution 808 | 809 | set.seed(11) 810 | f() ## enters browser mode 811 | ``` 812 | 813 | ### Debugging S4 methods 814 | 815 | > The `trace` function operates by constructing a revised version of 816 | > the function (or of the method, if ‘signature’ is supplied), and 817 | > assigning the new object back where the original was found. 818 | 819 | ```{r, eval=FALSE} 820 | library("MSnbase") 821 | data(itraqdata) 822 | x <- itraqdata[[1]] 823 | plot(x, full=TRUE) 824 | ``` 825 | 826 | Not helpful: 827 | 828 | ```{r, eval=FALSE} 829 | debug(plot) 830 | plot(x, full=TRUE) 831 | ``` 832 | 833 | Try again: 834 | 835 | ```{r, eval=FALSE} 836 | trace("plot", browser, 837 | signature = c("Spectrum", "missing")) 838 | plot(x, full=TRUE) 839 | ``` 840 | 841 | 842 | ## Unit testing 843 | 844 | See [here](https://github.com/lgatto/2016-02-25-adv-programming-EMBL/blob/master/unittesting.md). 845 | 846 | -------------------------------------------------------------------------------- /03-debug.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Part III: Debugging" 3 | author: "Laurent Gatto" 4 | --- 5 | 6 | # Overview 7 | 8 | - Defensive programming 9 | - Debbugging: techniques and tools 10 | - Condition handling 11 | - Testing 12 | 13 | # Defensive programming 14 | 15 | Before we begin with debugging, let's look at ways to prevent bugs 16 | (more at the end of this part). 17 | 18 | *Defensive programming*: 19 | - making the code work in a predicable manner 20 | - writing code that fails in a well-defined manner 21 | - if something *weird* happens, either properly deal with it, of fail 22 | quickly and loudly 23 | 24 | The level of defensiveness will depend whether you write a function 25 | for interactive of programmatic usage. 26 | 27 | ## Talk to your users 28 | 29 | ### Diagnostic messages 30 | 31 | 32 | ```r 33 | message("This is a message for our dear users.") 34 | ``` 35 | 36 | 37 | ```r 38 | message("This is a message for our dear users. ", 39 | paste("Thank you for using our software", 40 | sw, "version", packageVersion(sw))) 41 | ``` 42 | 43 | Do not use `print` or `cat`: 44 | 45 | 46 | ```r 47 | f1 <- function() { 48 | cat("I AM LOUD AND YOU CAN'T HELP IT.\n") 49 | ## do stuff 50 | invisible(TRUE) 51 | } 52 | f1() 53 | ``` 54 | 55 | 56 | ```r 57 | f2 <- function() { 58 | message("Sorry to interup, but...") 59 | ## do stuff 60 | invisible(TRUE) 61 | } 62 | f2() 63 | suppressMessages(f2()) 64 | ``` 65 | 66 | Of course, it is also possible to manually define verbosity. This 67 | makes you write more code for a feature readily available. But still 68 | better to use `message`. 69 | 70 | 71 | ```r 72 | f3 <- function(verbose = TRUE) { 73 | if (verbose) 74 | message("I am being verbose because you let me.") 75 | ## do stuff 76 | invisible(TRUE) 77 | } 78 | f3() 79 | f3(verbose = FALSE) 80 | ``` 81 | 82 | ### Warning 83 | 84 | > There is a problem with warnings. No one reads them. Pat Burns, in 85 | > *R inferno*. 86 | 87 | 88 | ```r 89 | warning("Do not ignore me. Somthing bad might have happened.") 90 | warning("Do not ignore me. Somthing bad might be happening.", immediate. = TRUE) 91 | ``` 92 | 93 | 94 | ```r 95 | f <- function(...) 96 | warning("Attention, attention, ...!", ...) 97 | f() 98 | f(call. = FALSE) 99 | ``` 100 | Print warnings after they have been thrown. 101 | 102 | 103 | ```r 104 | warnings() 105 | last.warning 106 | ``` 107 | 108 | See also to `warn` option in `?options` . 109 | 110 | 111 | ```r 112 | option("warn") 113 | ``` 114 | 115 | ### Error 116 | 117 | 118 | ```r 119 | stop("This is the end, my friend.") 120 | ``` 121 | 122 | 123 | ```r 124 | log(c(2, 1, 0, -1, 2)); print('end') ## warning 125 | xor(c(TRUE, FALSE)); print ('end') ## error 126 | ``` 127 | 128 | Stop also has a `call.` parameter. 129 | 130 | 131 | ```r 132 | geterrmessage() 133 | ``` 134 | 135 | ### Logging 136 | 137 | See for example the [`log4r`](https://github.com/johnmyleswhite/log4r) 138 | package: 139 | 140 | 141 | ```r 142 | ## Import the log4r package. 143 | library('log4r') 144 | 145 | ## Create a new logger object with create.logger(). 146 | logger <- create.logger() 147 | 148 | ## Set the logger's file output: currently only allows flat files. 149 | logfile(logger) <- file.path('base.log') 150 | 151 | ## Set the current level of the logger. 152 | level(logger) <- "INFO" 153 | 154 | ## Try logging messages at different priority levels. 155 | debug(logger, 'A Debugging Message') ## Won't print anything 156 | info(logger, 'An Info Message') 157 | warn(logger, 'A Warning Message') 158 | error(logger, 'An Error Message') 159 | fatal(logger, 'A Fatal Error Message') 160 | ``` 161 | 162 | ### Progress bars 163 | 164 | - `utils::txtProgressBar` function 165 | 166 | 167 | ```r 168 | n <- 10 169 | pb <- txtProgressBar(min = 0, max = n, style = 3) 170 | for (i in 1:n) { 171 | setTxtProgressBar(pb, i) 172 | Sys.sleep(0.5) 173 | } 174 | close(pb) 175 | ``` 176 | 177 | - [`progress`](https://github.com/gaborcsardi/progress) package 178 | 179 | 180 | ```r 181 | library("progress") 182 | pb <- progress_bar$new(total = n) 183 | for (i in 1:n) { 184 | pb$tick() 185 | Sys.sleep(0.5) 186 | } 187 | ``` 188 | 189 | Tip: do not over use progress bars. Ideally, a user should be 190 | confident that everything is under control and progress is made while 191 | waiting for a function to return. In my experience, a progress bar is 192 | usefull when there is a specific and/or user-defined number of 193 | iterations, such a *iterating over n files*, or *running a simulation 194 | n times*. 195 | 196 | **Question**: What about mixing progress bars and verbosity. 197 | 198 | 199 | ## KISS 200 | 201 | Keep your functions simple and stupid (and short). 202 | 203 | ## Failing fast and well 204 | 205 | > Bounds errors are ugly, nasty things that should be stamped out 206 | > whenever possible. One solution to this problem is to use the 207 | > `assert` statement. The `assert` statement tells C++, "This can 208 | > never happen, but if it does, abort the program in a nice way." One 209 | > thing you find out as you gain programming experience is that things 210 | > that can "never happen" happen with alarming frequency. So just to 211 | > make sure that things work as they are supposed to, it’s a good idea 212 | > to put lots of self checks in your program. -- Practical C++ 213 | > Programming, Steve Oualline, O'Reilly. 214 | 215 | 216 | ```r 217 | if (!condition) stop(...) 218 | ``` 219 | 220 | 221 | ```r 222 | stopifnot(TRUE) 223 | stopifnot(TRUE, FALSE) 224 | ``` 225 | 226 | For example to test input classes, lengths, ... 227 | 228 | 229 | ```r 230 | f <- function(x) { 231 | stopifnot(is.numeric(x), length(x) == 1) 232 | invisible(TRUE) 233 | } 234 | 235 | f(1) 236 | f("1") 237 | f(1:2) 238 | f(letters) 239 | ``` 240 | 241 | The [`assertthat`](https://github.com/hadley/assertthat) package: 242 | 243 | 244 | ```r 245 | x <- "1" 246 | library("assertthat") 247 | stopifnot(is.numeric(x)) 248 | assert_that(is.numeric(x)) 249 | assert_that(length(x) == 2) 250 | ``` 251 | 252 | * `assert_that()` signal an error. 253 | * `see_if()` returns a logical value, with the error message as an attribute. 254 | * `validate_that()` returns `TRUE` on success, otherwise returns the error as 255 | a string. 256 | 257 | 258 | 259 | * `is.flag(x)`: is x `TRUE` or `FALSE`? (a boolean flag) 260 | * `is.string(x)`: is x a length 1 character vector? 261 | * `has_name(x, nm)`, `x %has_name% nm`: does `x` have component `nm`? 262 | * `has_attr(x, attr)`, `x %has_attr% attr`: does `x` have attribute `attr`? 263 | * `is.count(x)`: is x a single positive integer? 264 | * `are_equal(x, y)`: are `x` and `y` equal? 265 | * `not_empty(x)`: are all dimensions of `x` greater than 0? 266 | * `noNA(x)`: is `x` free from missing values? 267 | * `is.dir(path)`: is `path` a directory? 268 | * `is.writeable(path)`/`is.readable(path)`: is `path` writeable/readable? 269 | * `has_extension(path, extension)`: does `file` have given `extension`? 270 | 271 | 272 | ## Consistency and predictability 273 | 274 | Reminder of the interactive use vs programming examples: 275 | - `[` and `drop` 276 | - `sapply`, `lapply`, `vapply` 277 | 278 | Remember also the concept of *tidy data*. 279 | 280 | ## Comparisons 281 | 282 | ### Floating point issues to be aware of 283 | 284 | R FAQ [7.31](http://cran.r-project.org/doc/FAQ/R-FAQ.html#Why-doesn_0027t-R-think-these-numbers-are-equal_003f)? 285 | 286 | 287 | 288 | ```r 289 | a <- sqrt(2) 290 | a * a == 2 291 | a * a - 2 292 | ``` 293 | 294 | 295 | ```r 296 | 1L + 2L == 3L 297 | 1.0 + 2.0 == 3.0 298 | 0.1 + 0.2 == 0.3 299 | ``` 300 | 301 | ### Floating point: how to compare 302 | 303 | - `all.equal` compares R objects for *near equality*. Takes into 304 | account whether object attributes and names ought the taken into 305 | consideration (`check.attributes` and `check.names` parameters) and 306 | tolerance, which is machine dependent. 307 | 308 | 309 | ```r 310 | all.equal(0.1 + 0.2, 0.3) 311 | all.equal(0.1 + 0.2, 3.0) 312 | isTRUE(all.equal(0.1 + 0.2, 3)) ## when you just want TRUE/FALSE 313 | ``` 314 | 315 | ### Exact identity 316 | 317 | `identical`: test objects for exact equality 318 | 319 | 320 | ```r 321 | 1 == NULL 322 | all.equal(1, NULL) 323 | identical(1, NULL) 324 | identical(1, 1.) ## TRUE in R (both are stored as doubles) 325 | all.equal(1, 1L) 326 | identical(1, 1L) ## stored as different types 327 | ``` 328 | 329 | Appropriate within `if`, `while` condition statements. (not 330 | `all.equal`, unless wrapped in `isTRUE`). 331 | 332 | ## Exercise 333 | 334 | (From [adv-r](http://adv-r.had.co.nz/Exceptions-Debugging.html#defensive-programming).) 335 | 336 | The `col_means` function computes the means of all numeric columns in 337 | a data frame. 338 | 339 | 340 | ```r 341 | col_means <- function(df) { 342 | numeric <- sapply(df, is.numeric) 343 | numeric_cols <- df[, numeric] 344 | data.frame(lapply(numeric_cols, mean)) 345 | } 346 | ``` 347 | 348 | Is it a robust function? What happens if there are unusual inputs. 349 | 350 | 351 | ```r 352 | col_means(mtcars) 353 | col_means(mtcars[, 0]) 354 | col_means(mtcars[0, ]) 355 | col_means(mtcars[, "mpg", drop = FALSE]) 356 | col_means(1:10) 357 | col_means(as.matrix(mtcars)) 358 | col_means(as.list(mtcars)) 359 | 360 | mtcars2 <- mtcars 361 | mtcars2[-1] <- lapply(mtcars2[-1], as.character) 362 | col_means(mtcars2) 363 | ``` 364 | # Debugging: techniques and tools 365 | 366 | ### Shit happens 367 | 368 | > Funding your bug is a process of confirming the many things that you 369 | > believe are true - until you find one which is not true. -- Norm Matloff 370 | 371 | #### 1. Identify the bug (the difficult part) 372 | - Something went wrong! 373 | - Where in the code does it happen? 374 | - Does it happen every time? 375 | - What input triggered it? 376 | - Report it (even if it is in your code - use github issues, for 377 | example). 378 | 379 | **Tip**: Beware of your intuition. As a scientist, do what you are 380 | used to: generate a hypotheses, *design an experiment* to test them, 381 | and record the results. 382 | 383 | #### 2. Fix it (the less difficult part) 384 | - Correct the bug. 385 | - Make sure that bug will not repeat itself! 386 | - How can we be confident that we haven't introduced new bugs? 387 | 388 | ## Tools 389 | 390 | - `print`/`cat` 391 | - `traceback()` 392 | - `browser()` 393 | - `options(error = )`, `options(warn = )` 394 | - `trace` 395 | - IDE: RStudio, StatET, emacs' ess tracebug. 396 | 397 | 398 | ### Manually 399 | 400 | Inserting `print` and `cat` statements in the code. Works, but time 401 | consuming. 402 | 403 | ### Finding the bug 404 | 405 | > Many bugs are subtle and hard to find. -- Hadley Wickham 406 | 407 | Bugs are shy, and are generally hidden, deep down in your code, to 408 | make it as difficult as possible for you to find them. 409 | 410 | 411 | ```r 412 | e <- function(i) { 413 | x <- 1:4 414 | if (i < 5) x[1:2] 415 | else x[-1:2] 416 | } 417 | f <- function() sapply(1:10, e) 418 | g <- function() f() 419 | ``` 420 | 421 | `traceback`: lists the sequence of calls that lead to the error 422 | 423 | 424 | ```r 425 | g() 426 | traceback() 427 | ``` 428 | 429 | If the source code is available (for example for `source()`d code), 430 | then traceback will display the exact location in the function, in the 431 | form `filename.R#linenum`. 432 | 433 | ### Browsing the error 434 | 435 | - Register the function for debugging: `debug(g)`. This adds a call to 436 | the `browser()` function (see also below) and the very beginning of 437 | the function `g`. 438 | 439 | - Every call to `g()` will not be run interactively. 440 | 441 | - To finish debugging: `undebug(g)`. 442 | 443 | 444 | 445 | ```r 446 | debug(g) 447 | g() 448 | ``` 449 | 450 | How to debug: 451 | 452 | - `n` executes the next step of the function. Use `print(n)` or 453 | `get(n)` to print/access the variable `n`. 454 | - `s` to step into the next function. If it is not a function, same as 455 | `n`. 456 | - `f` to finish execution of the current loop of function. 457 | - `c` to leave interactive debugging and continue regular execution of 458 | the function. 459 | - `Q` to stop debugging, terminate the function and return to the 460 | global workspace. 461 | - `where` print a stack trace of all active function calls. 462 | - `Enter` same as `n` (or `s`, if it was used most recently), unless 463 | `options(browserNLdisabled = TRUE)` is set. 464 | 465 | To fix a function when the source code is not directly available, use 466 | `fix(fun)`. This will open the function's source code for editing and, 467 | after saving and closing, store the updated function in the global 468 | workspace. 469 | 470 | ### Breakpoints 471 | 472 | - Add a call to `browser()` anywhere in the source code to execute the 473 | rest of the code interactively. 474 | 475 | - To run breakpoints conditionally, wrap the call to `browser()` in a 476 | condition. 477 | 478 | ### Setting options 479 | 480 | - `options(error = recover)`: display the call stack and choose where 481 | to step in. 482 | 483 | - `options(error = stop)` or `options(error = NULL)`: reset to default 484 | behaviour. 485 | 486 | - `options(warn = 'numeric')`: sets the handling of warning 487 | messages. If `warn` is negative all warnings are ignored. If `warn` 488 | is zero (the default) warnings are stored until the top-level 489 | function returns. If 10 or fewer warnings were signalled they will 490 | be printed otherwise a message saying how many were signalled. An 491 | object called `last.warning` is created and can be printed through 492 | the function `warnings`. If `warn` is one, warnings are printed as 493 | they occur. If `warn` is two or larger all warnings are turned into 494 | errors. 495 | 496 | - `options(error = dump.frames)`: like `recover` but for 497 | non-interactive use. Will create a `last.dump.rda` file in the 498 | current working directory, which can then be reloaded in an 499 | interactive session to re-inter interactive debugging (using 500 | `debugger()`). 501 | 502 | ### Debugging with IDEs 503 | 504 | - RSudio: `Show Traceback`, `Rerun with Debug` and interactive debugging. 505 | 506 | ![RStudio debugging 1](./figs/debugRStudio1.png) 507 | ![RStudio debugging 2](./figs/debugRStudio2.png) 508 | 509 | - StatET (Eclipse plugin) 510 | 511 | - [emacs ESS and tracebug](http://ess.r-project.org/Manual/ess.html#Developing-with-ESS) 512 | 513 | ### Exercise 514 | 515 | 1. Your turn - play with `traceback`, `recover` and `debug`: 516 | 517 | (Example originally by Martin Morgan and Robert Gentleman.) 518 | 519 | 520 | ```r 521 | e <- function(i) { 522 | x <- 1:4 523 | if (i < 5) x[1:2] 524 | else x[-1:2] # oops! x[-(1:2)] 525 | } 526 | f <- function() sapply(1:10, e) 527 | g <- function() f() 528 | ``` 529 | 530 | 2. Fix `readFasta2`. 531 | 532 | 533 | ```r 534 | ## make sure you have the 'sequences' package. 535 | ## Get readFasta2, the function to debug 536 | library(devtools) 537 | install_github("lgatto/sequences") ## from github 538 | ## or 539 | install.packages("sequences") ## from CRAN 540 | library("sequences") 541 | sequences:::debugme() 542 | ## Get an example file 543 | f <- dir(system.file("extdata", package = "sequences"), 544 | full.names=TRUE, pattern = "moreDnaSeqs.fasta") 545 | ## BANG! 546 | readFasta2(f) 547 | ``` 548 | 549 | ## Condition handling 550 | 551 | ### `try` and `tryCatch` 552 | 553 | The function `f` will never terminate. 554 | 555 | 556 | ```r 557 | f <- function() { 558 | x <- "1" 559 | log(x) 560 | message("x was the ", class(x), " ", x) 561 | } 562 | f() 563 | ``` 564 | 565 | Use `try` to proceed with the execution even when an error occurs. 566 | 567 | 568 | ```r 569 | f <- function() { 570 | x <- "1" 571 | try(log(x)) 572 | message("x was the ", class(x), " ", x) 573 | } 574 | f() 575 | ``` 576 | 577 | 578 | ```r 579 | try({ 580 | a <- 1 581 | b <- "2" 582 | a + b 583 | }) 584 | ``` 585 | 586 | In case of error, `try` returns a object of class `try-error`: 587 | 588 | 589 | ```r 590 | success <- try(1 + 2) 591 | failure <- try(1 + "2", silent = TRUE) 592 | class(success) 593 | class(failure) 594 | ``` 595 | 596 | 597 | ```r 598 | inherits(failure, "try-error") 599 | 600 | if (inherits(failure, "try-error")) 601 | message("There was an error here.") 602 | ``` 603 | Handling errors is particularly useful to iterate over all elements of 604 | an input, despite errors (and inspecting/handling/fixing the errors 605 | afterwards). 606 | 607 | 608 | ```r 609 | el <- list(1:10, c(-1, 1), TRUE, "1") 610 | res <- lapply(el, log) 611 | res 612 | res <- lapply(el, function(x) try(log(x))) 613 | res 614 | ``` 615 | 616 | #### Hadley's tip 617 | 618 | > Another useful `try()` idiom is using a default value if an 619 | > expression fails. Simply assign the default outside the `try` block, 620 | > and then run the risky code: 621 | 622 | 623 | ```r 624 | default <- NULL 625 | try(default <- read.csv("possibly-bad-input.csv"), silent = TRUE) 626 | ``` 627 | 628 | > There is also `plyr::failwith()`, which makes this strategy even 629 | > easier to implement. 630 | 631 | 632 | ```r 633 | f <- function(x) 634 | if (x == 1) stop("Error!") else 1 635 | 636 | f(1) 637 | f(2) 638 | 639 | 640 | safef <- failwith(NULL, f) 641 | safef(1) 642 | safef(2) 643 | ``` 644 | 645 | Use `tryCatch` to specify a behaviour (handler function) in case of 646 | error, warning or message. 647 | 648 | 649 | ```r 650 | f <- function() { 651 | x <- "1" 652 | tryCatch(log(x), 653 | error = function(e) cat("There was an error!\n")) 654 | message("x was the ", class(x), " ", x) 655 | } 656 | f() 657 | ``` 658 | 659 | More example from Hadleys' *Advanced R* book. 660 | 661 | 662 | ```r 663 | show_condition <- function(code) { 664 | tryCatch(code, 665 | error = function(c) "error", 666 | warning = function(c) "warning", 667 | message = function(c) "message" 668 | ) 669 | } 670 | show_condition(stop("!")) 671 | show_condition(warning("?!")) 672 | show_condition(message("?")) 673 | show_condition(0) 674 | ``` 675 | 676 | A more informative `read.csv` version: 677 | 678 | 679 | ```r 680 | read.csv2 <- function(file, ...) { 681 | tryCatch(read.csv(file, ...), error = function(c) { 682 | c$message <- paste0(c$message, " (in ", file, ")") 683 | stop(c) 684 | }) 685 | } 686 | read.csv("code/dummy.csv") 687 | read.csv2("code/dummy.csv") 688 | ``` 689 | 690 | `tryCatch` has a `finally` argument that specifies a code block to be 691 | executed regardless of whether the initial expression succeeds or 692 | fails. Usefull, for example, to clean up (deleting files, closing 693 | connections, ...). 694 | 695 | 696 | ### `withCallingHandlers` 697 | 698 | 699 | The `withCallingHandlers` function allows to defined special behaviour 700 | in case of *unusual conditions*, including warnings and errors. In the 701 | example below, we start a browser in case of (obscure) warnings. 702 | 703 | 704 | ```r 705 | f <- function(x = 10) { 706 | lapply(seq_len(x), function(i) { 707 | ## make an example 2x2 contingency table 708 | d <- matrix(sample(4:10, 4), nrow = 2, ncol = 2) 709 | ## will produce warning if there is a 5 or less 710 | ## in the contingency table 711 | chisq.test(d) 712 | }) 713 | } 714 | ``` 715 | 716 | 717 | ```r 718 | set.seed(1) 719 | f() 720 | set.seed(1) 721 | withCallingHandlers(f(), warning=function(e) recover()) 722 | ``` 723 | 724 | ### Difference between `tryCatch` and `withCallingHandlers` 725 | 726 | (From [*Advanced R*](http://adv-r.had.co.nz/Exceptions-Debugging.html#condition-handling)) 727 | 728 | The handlers in `withCallingHandlers()` are called in the context of 729 | the call that generated the condition whereas the handlers in 730 | `tryCatch()` are called in the context of `tryCatch()`. This is shown 731 | here with `sys.calls()`, which is the run-time equivalent of 732 | `traceback()` -- it lists all calls leading to the current function. 733 | 734 | 735 | ```r 736 | f <- function() g() 737 | g <- function() h() 738 | h <- function() stop("!") 739 | 740 | tryCatch(f(), error = function(e) print(sys.calls())) 741 | withCallingHandlers(f(), error = function(e) print(sys.calls())) 742 | ``` 743 | 744 | ### Debugging at the C level with `ddd` or `gdb` 745 | 746 | Demo 747 | 748 | ### Exercise 749 | 750 | 751 | 752 | 753 | Write a new `safelog` function that catches and handles errors and 754 | warnings to emulate the following behaviour. 755 | 756 | 757 | ```r 758 | log(1) 759 | ``` 760 | 761 | ``` 762 | ## [1] 0 763 | ``` 764 | 765 | ```r 766 | safelog(1) 767 | ``` 768 | 769 | ``` 770 | ## [1] 0 771 | ``` 772 | 773 | ```r 774 | log(-1) 775 | ``` 776 | 777 | ``` 778 | ## Warning in log(-1): NaNs produced 779 | ``` 780 | 781 | ``` 782 | ## [1] NaN 783 | ``` 784 | 785 | ```r 786 | safelog(-1) 787 | ``` 788 | 789 | ``` 790 | ## [1] "a warning with input -1" 791 | ``` 792 | 793 | ```r 794 | log("a") 795 | ``` 796 | 797 | ``` 798 | ## Error in log("a"): non-numeric argument to mathematical function 799 | ``` 800 | 801 | ```r 802 | safelog("a") 803 | ``` 804 | 805 | ``` 806 | ## [1] "an error with input a" 807 | ``` 808 | 809 | **Answer** 810 | 811 | 812 | ```r 813 | safelog <- function(x) { 814 | tryCatch(log(x), 815 | error = function(e) paste("an error with input", x), 816 | warning = function(e) paste("a warning with input", x)) 817 | } 818 | ``` 819 | 820 | ## Tracing code 821 | 822 | From `?trace`: 823 | 824 | > A call to `trace` allows you to insert debugging code (e.g., a call 825 | > to `browser` or `recover`) at chosen places in any function. A call 826 | > to `untrace` cancels the tracing. 827 | 828 | 829 | ```r 830 | ## Report whenever e invoked 831 | trace(sum) 832 | hist(rnorm(100)) 833 | untrace(sum) 834 | ``` 835 | 836 | 837 | ```r 838 | ## Evaluate arbitrary code whenever e invoked 839 | trace(e, quote(cat("i am", i, "\n"))) 840 | ## Another way to enter browser whenver e invoked 841 | trace(e, browser) 842 | ## stop tracing 843 | untrace(e) 844 | ``` 845 | 846 | ### Inserting code dynamically 847 | 848 | > The `trace` function operates by constructing a revised version of 849 | > the function (or of the method, if ‘signature’ is supplied), and 850 | > assigning the new object back where the original was found. 851 | 852 | 853 | ```r 854 | f <- function() { 855 | ## make an example 2x2 contingency table 856 | d <- matrix(sample(4:10, 4), nrow=2, ncol=2) 857 | chisq.test(d) 858 | } 859 | set.seed(1) 860 | f() ## no warning 861 | 862 | set.seed(11) 863 | f() ## warning 864 | ``` 865 | 866 | We want to conditionally enter brower mode, when an element of `d` is 867 | smaller than 5. 868 | 869 | 870 | ```r 871 | if (any(d < 5)) 872 | browser() 873 | ``` 874 | 875 | This expression must be executed at a specific location in our function `f`: 876 | 877 | 878 | ```r 879 | as.list(body(f)) 880 | ``` 881 | 882 | 883 | ```r 884 | trace("f", quote(if (any(d < 5)) browser()), at = 3) 885 | ``` 886 | 887 | We can now run our updated function `f` 888 | 889 | 890 | ```r 891 | f 892 | body(f) 893 | ``` 894 | 895 | 896 | ```r 897 | set.seed(1) 898 | f() ## normal execution 899 | 900 | set.seed(11) 901 | f() ## enters browser mode 902 | ``` 903 | 904 | ### Debugging S4 methods 905 | 906 | > The `trace` function operates by constructing a revised version of 907 | > the function (or of the method, if ‘signature’ is supplied), and 908 | > assigning the new object back where the original was found. 909 | 910 | 911 | ```r 912 | library("MSnbase") 913 | data(itraqdata) 914 | x <- itraqdata[[1]] 915 | plot(x, full=TRUE) 916 | ``` 917 | 918 | Not helpful: 919 | 920 | 921 | ```r 922 | debug(plot) 923 | plot(x, full=TRUE) 924 | ``` 925 | 926 | Try again: 927 | 928 | 929 | ```r 930 | trace("plot", browser, 931 | signature = c("Spectrum", "missing")) 932 | plot(x, full=TRUE) 933 | ``` 934 | 935 | 936 | ## Unit testing 937 | 938 | See [here](https://github.com/lgatto/2016-02-25-adv-programming-EMBL/blob/master/unittesting.md). 939 | 940 | -------------------------------------------------------------------------------- /04-perf.R: -------------------------------------------------------------------------------- 1 | 2 | x <- runif(100) 3 | system.time(sqrt(x)) 4 | system.time(x^0.5) 5 | 6 | 7 | x <- runif(1e5) 8 | system.time(sqrt(x)) 9 | system.time(x^0.5) 10 | 11 | 12 | summary(replicate(10, system.time(x^0.5)[["elapsed"]])) 13 | 14 | 15 | x <- runif(100) 16 | library(microbenchmark) 17 | 18 | microbenchmark(sqrt(x), 19 | x ^ 0.5) 20 | 21 | 22 | f <- function() 1 23 | f() 24 | body(f) 25 | body(f) <- 2 26 | f() 27 | formals(f) <- pairlist(x = 1) 28 | body(f) <- quote(x + 1) 29 | f() 30 | f(10) 31 | 32 | 33 | setMethod(...) 34 | 35 | 36 | ctl <- c(4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14) 37 | trt <- c(4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69) 38 | group <- gl(2, 10, 20, labels = c("Ctl","Trt")) 39 | weight <- c(ctl, trt) 40 | mod <- lm(weight ~ group) 41 | mod 42 | mod$foo <- 1 43 | mod 44 | summary(mod) 45 | names(mod) 46 | 47 | 48 | class(mod) <- c("mod", "foo") 49 | 50 | 51 | ## function 52 | f <- function(x) NULL 53 | 54 | ## S3 method 55 | s3 <- function(x) UseMethod("s3") 56 | s3.integer <- f 57 | 58 | ## S4 method 59 | A <- setClass("A", slots = c(a = "list")) 60 | setGeneric("s4", function(x) standardGeneric("s4")) 61 | setMethod("s4", "A", f) 62 | a <- A() 63 | 64 | ## S4 Reference Class 65 | B <- setRefClass("B", methods = list(rc = f)) 66 | b <- B$new() 67 | 68 | 69 | microbenchmark( 70 | fun = f(), 71 | S3 = s3(1L), 72 | S4 = s4(a), 73 | RC = b$rc() 74 | ) 75 | 76 | 77 | a <- 1 78 | f <- function() { 79 | g <- function() { 80 | print(a) ## from global workspace 81 | assign("a", 2, envir = parent.frame()) 82 | print(a) ## f's environment 83 | a <- 3 84 | print(a) ## g's environment 85 | } 86 | g() 87 | } 88 | f() 89 | 90 | 91 | f <- function(x, y) { 92 | (x + y) ^ 2 93 | } 94 | 95 | 96 | microbenchmark( 97 | "[32, 11]" = mtcars[32, 11], 98 | "$carb[32]" = mtcars$carb[32], 99 | "[[c(11, 32)]]" = mtcars[[c(11, 32)]], 100 | "[[11]][32]" = mtcars[[11]][32], 101 | ".subset" = .subset(mtcars, 11)[32], 102 | ".subset2" = .subset2(mtcars, 11)[32] 103 | ) 104 | 105 | 106 | x <- runif(100) 107 | all.equal(sqrt(x), x ^ 0.5) 108 | 109 | 110 | library("sequences") 111 | gccount 112 | gccountr <- function(x) table(strsplit(x, "")[[1]]) 113 | gccountr2 <- function(x) tabulate(factor(strsplit(x, "")[[1]])) 114 | 115 | 116 | s <- paste(sample(c("A", "C", "G", "T"), 117 | 100, replace = TRUE), 118 | collapse = "") 119 | 120 | gccount(s) 121 | gccountr(s) 122 | gccountr2(s) 123 | 124 | 125 | library("microbenchmark") 126 | microbenchmark(gccount(s), 127 | gccountr(s), 128 | gccountr2(s), 129 | times = 1e4, 130 | unit = "eps") 131 | 132 | 133 | library("ggplot2") 134 | mb <- microbenchmark(gccount(s), 135 | gccountr(s), 136 | gccountr2(s)) 137 | print(mb) 138 | microbenchmark:::autoplot.microbenchmark(mb) 139 | 140 | 141 | m <- matrix(rnorm(1e6), ncol = 10) 142 | 143 | Rprof("rprof") 144 | res <- apply(m, 1, mean, trim=.3) 145 | Rprof(NULL) 146 | summaryRprof("rprof") 147 | 148 | 149 | f <- function() { 150 | pause(0.1) 151 | g() 152 | h() 153 | } 154 | 155 | g <- function() { 156 | pause(0.1) 157 | h() 158 | } 159 | 160 | h <- function() { 161 | pause(0.1) 162 | } 163 | 164 | 165 | 166 | library("profvis") 167 | source("lineprof-example.R") 168 | profvis(f()) 169 | 170 | 171 | profvis({ 172 | m <- matrix(rnorm(1e6), ncol = 10) 173 | res <- apply(m, 1, mean, trim=.3) 174 | sum(res) 175 | }) 176 | 177 | 178 | make_id2GO <- function(n = 1e3) { ## could be 1e4 - 1e5 179 | gn <- sprintf(paste0("ENSG%0", 10, "d"), sample(1e6, n)) 180 | goid <- function(n = 10) sprintf(paste0("GO:%0", 10, "d"), sample(1e6, n)) 181 | structure(replicate(n, goid(sample(50, 1))), 182 | names = gn) 183 | } 184 | id2GO <- make_id2GO() 185 | 186 | 187 | length(id2GO) 188 | str(head(id2GO)) 189 | str(unlist(id2GO)) 190 | 191 | 192 | library(microbenchmark) 193 | microbenchmark(unlist(l), 194 | unlist(l, use.names = FALSE), 195 | times = 10) 196 | 197 | 198 | f1 <- function(n) { 199 | a <- NULL 200 | for (i in 1:n) a <- c(a, sqrt(i)) 201 | a 202 | } 203 | 204 | f2 <- function(n) { 205 | a <- numeric(n) 206 | for (i in 1:n) a[i] <- sqrt(i) 207 | a 208 | } 209 | 210 | 211 | microbenchmark(f1(1e3), f2(1e3)) 212 | microbenchmark(f1(1e4), f2(1e4)) 213 | 214 | 215 | e <- new.env() 216 | e$x <- 1 217 | f <- function(myenv) myenv$x <- 2 218 | f(e) 219 | e$x 220 | 221 | 222 | f3 <- function(n) 223 | sapply(seq_len(n), sqrt) 224 | 225 | f4 <- function(n) sqrt(n) 226 | 227 | 228 | n <- 10^(2:5) 229 | t1 <- sapply(n, function(.n) system.time(f1(.n))[["elapsed"]]) 230 | t2 <- sapply(n, function(.n) system.time(f2(.n))[["elapsed"]]) 231 | t3 <- sapply(n, function(.n) system.time(f3(.n))[["elapsed"]]) 232 | t4 <- sapply(n, function(.n) system.time(f4(.n))[["elapsed"]]) 233 | 234 | elapsed <- data.frame(t1, t2, t3, t4) 235 | rownames(elapsed) <- n 236 | 237 | colnames(elapsed) <- 238 | c("for loop\nwithout init", 239 | "for loop\nwith init", 240 | "wrapped in\napply", 241 | "built-in sqrt\n(vectorised)") 242 | 243 | suppressPackageStartupMessages(library("grid")) 244 | suppressPackageStartupMessages(library("reshape2")) 245 | suppressPackageStartupMessages(library("scales")) 246 | suppressPackageStartupMessages(library("ggplot2")) 247 | 248 | mainvp <- viewport(width = 1, 249 | height = 1, 250 | x = 0.5, y = 0.5) 251 | subvp <- viewport(width = 6/9, 252 | height = 5/9, 253 | x = .1, 254 | y = .95, 255 | just = c("left","top")) 256 | df <- melt(elapsed) 257 | colnames(df) <- c("Implementation", "Elapsed") 258 | df$Iterations <- rep(n, 4) 259 | ymax <- max(elapsed[, -1]) 260 | p <- ggplot(data=df, aes(x=Iterations, y=Elapsed, col=Implementation)) + 261 | geom_line() + geom_point() + 262 | theme(legend.position="bottom") + 263 | scale_x_continuous(trans=log10_trans()) + 264 | coord_trans(x="log2") 265 | q <- p + coord_cartesian(ylim=c(0, (ymax+.05))) + 266 | theme_gray(8) + 267 | labs(x = NULL, y = NULL) + 268 | theme(plot.margin = unit(rep(0.3, 4), "lines")) + 269 | theme(legend.position="none") 270 | print(p, vp = mainvp) 271 | print(q, vp = subvp) 272 | 273 | 274 | lapply2 <- function(x, f, ...) { 275 | out <- vector("list", length(x)) 276 | for (i in seq_along(x)) { 277 | out[[i]] <- f(x[[i]], ...) 278 | } 279 | out 280 | } 281 | 282 | lapply2_c <- compiler::cmpfun(lapply2) 283 | 284 | x <- list(1:10, letters, c(FALSE, TRUE), NULL) 285 | 286 | 287 | microbenchmark( 288 | lapply2(x, is.null), 289 | lapply2_c(x, is.null), 290 | lapply(x, is.null)) 291 | 292 | 293 | library("pryr") 294 | library("profvis") 295 | 296 | 297 | x <- 1:1e5 298 | object.size(x) 299 | print(object.size(x), units = "Kb") 300 | object_size(x) 301 | 302 | 303 | ll <- list(x, x, x) 304 | print(object.size(ll), units = "Kb") 305 | object_size(ll) 306 | 307 | 308 | x <- 1:1e6 309 | y <- list(1:1e6, 1:1e6, 1:1e6) 310 | object_size(x) 311 | object_size(y) 312 | 313 | 314 | e <- new.env() 315 | object.size(e) 316 | object_size(e) 317 | e$a <- 1:1e6 318 | object.size(e) 319 | object_size(e) 320 | 321 | 322 | mem_used() 323 | 324 | 325 | mem_change(v <- 1:1e6) 326 | mem_change(rm(v)) 327 | 328 | 329 | rm(list = ls()) 330 | mem_change(x <- 1:1e6) 331 | mem_change(y <- x) 332 | mem_change(rm(x)) 333 | mem_change(rm(y)) 334 | 335 | 336 | x <- 1:10 337 | c(address(x), refs(x)) 338 | x[5] <- 0L 339 | c(address(x), refs(x)) 340 | 341 | 342 | y <- x 343 | c(address(x), refs(x)) 344 | c(address(y), refs(y)) 345 | 346 | x[5] <- 1L 347 | c(address(x), refs(x)) 348 | c(address(y), refs(y)) 349 | 350 | 351 | x <- 1:5 352 | y <- x 353 | c(address(x), refs(x)) 354 | rm(y) 355 | c(address(x), refs(x)) ## should be 1 356 | 357 | x <- 1:5 358 | y <- x 359 | z <- x 360 | c(address(x), refs(x)) ## should be 3 361 | 362 | 363 | x <- 1:10 364 | tracemem(x) 365 | x[5] <- 0L 366 | 367 | y <- x 368 | x[5] <- 10L 369 | 370 | address(x) 371 | address(y) 372 | 373 | 374 | f1 <- function(n) { 375 | a <- NULL 376 | for (i in 1:n) { 377 | a <- c(a, sqrt(i)) 378 | print(address(a)) 379 | } 380 | invisible(a) 381 | } 382 | 383 | f2 <- function(n) { 384 | a <- numeric(n) 385 | for (i in 1:n) { 386 | a[i] <- sqrt(i) 387 | print(address(a)) 388 | } 389 | invisible(a) 390 | } 391 | 392 | -------------------------------------------------------------------------------- /04-perf.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Part IV: Performance" 3 | author: "Laurent Gatto" 4 | --- 5 | 6 | # Overview 7 | 8 | - Benchmarking 9 | - Profiling 10 | - Optimisation 11 | - Memory 12 | - Rcpp 13 | 14 | # A word of caution 15 | 16 | Knuth, Donald. *Structured Programming with `go to` Statements*, ACM 17 | Journal Computing Surveys, Vol 6, No. 4, Dec. 1974. p.268. 18 | 19 | > We should forget about small efficiencies, say about 97% of the 20 | > time: premature optimization is the root of all evil. Yet we should 21 | > not pass up our opportunities in that critical 3%. A good programmer 22 | > will not be lulled into complacency by such reasoning, he will be 23 | > wise to look carefully at the critical code; but only after that 24 | > code has been identified 25 | 26 | 27 | Robert Gentleman, in R Programming for Bioinformatics, 2008, 28 | about R's built-in C interfaces: 29 | 30 | > Since R is not compiled, in some situations its performance can be 31 | > substantially improved by writing code in a compiled language. There 32 | > are also reasons not to write code in other languages, and in 33 | > particular we caution against premature optimization, prototyping in 34 | > R is often cost effective. And in our experience very few routines 35 | > need to be implemented in other languages for efficiency 36 | > reasons. Another substantial reason not to use an implementation in 37 | > some other language is increased complexity. The use of another 38 | > language almost always results in higher maintenance costs and less 39 | > stability. In addition, any extensions or enhancements of the code 40 | > will require someone that is proficient in both R and the other 41 | > language. 42 | 43 | (`Rcpp` does make some of the above caution statements slightly less 44 | critical.) 45 | 46 | # R performance 47 | 48 | R is not a fast language, but it is most of the time *fast enough* for 49 | what we want to do, in particular with respect to interactive data 50 | analysis. In such cases a slow but expressive and flexible language is 51 | way better than a fast but less expressive and flexible 52 | alternative. It is also relatively easy to avoid bad R programming 53 | idioms that make code too slow. 54 | 55 | ## Timing, benchmarking 56 | 57 | Let's compare two implementation of the square root calculation: 58 | `sqrt(x)` and `x ^ 0.5`. 59 | 60 | 61 | ```r 62 | x <- runif(100) 63 | system.time(sqrt(x)) 64 | system.time(x^0.5) 65 | ``` 66 | 67 | Does this work? Try 68 | 69 | 70 | ```r 71 | x <- runif(1e5) 72 | system.time(sqrt(x)) 73 | system.time(x^0.5) 74 | ``` 75 | 76 | We want to repeat timings multiple times: 77 | 78 | 79 | ```r 80 | summary(replicate(10, system.time(x^0.5)[["elapsed"]])) 81 | ``` 82 | 83 | A better approach for such cases is the 84 | [`microbenchmark` package](https://cran.rstudio.com/web/packages/microbenchmark/index.html), 85 | which is ideal to accurately benchmark small pieces of code, in 86 | particular sub-millisecond (nanoseconds) executions (see units below). 87 | 88 | Each expression is run 100 times (controlled by the `times` 89 | argument). In addition, the execution order is randomised and summary 90 | timings are reported. 91 | 92 | 93 | ```r 94 | x <- runif(100) 95 | library(microbenchmark) 96 | 97 | microbenchmark(sqrt(x), 98 | x ^ 0.5) 99 | ``` 100 | 101 | **Question**: where does this difference come from? 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | Below are some illustrations of R's language designs and 111 | implementations that account for some cuts in performance (taken from 112 | *Advanced R*). 113 | 114 | ## Extreme dynamism 115 | 116 | R is an extremely dynamics language where almost any symbol (there are 117 | a few reserved key words such as `TRUE`, `if`, `for`, ..., package 118 | namespaces, locked environment with locked binding) can be modified at 119 | any points; in particular 120 | 121 | 122 | - body, arguments, and environments of functions 123 | 124 | 125 | ```r 126 | f <- function() 1 127 | f() 128 | body(f) 129 | body(f) <- 2 130 | f() 131 | formals(f) <- pairlist(x = 1) 132 | body(f) <- quote(x + 1) 133 | f() 134 | f(10) 135 | ``` 136 | 137 | - Change the S4 methods for a generic 138 | 139 | 140 | ```r 141 | setMethod(...) 142 | ``` 143 | 144 | - Add new fields to an S3 object 145 | 146 | 147 | ```r 148 | ctl <- c(4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14) 149 | trt <- c(4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69) 150 | group <- gl(2, 10, 20, labels = c("Ctl","Trt")) 151 | weight <- c(ctl, trt) 152 | mod <- lm(weight ~ group) 153 | mod 154 | mod$foo <- 1 155 | mod 156 | summary(mod) 157 | names(mod) 158 | ``` 159 | 160 | - Change the class of an object 161 | 162 | 163 | ```r 164 | class(mod) <- c("mod", "foo") 165 | ``` 166 | - modify objects outside or the local environment with `<<-`, or 167 | anywhere with `assign(..., envir)`. 168 | 169 | While this dynamism gives a lot of flexibility, the interpreter can 170 | not make any assumptions or optimisations. For example, let's assess 171 | the cost of methods lookup: 172 | 173 | 174 | ```r 175 | ## function 176 | f <- function(x) NULL 177 | 178 | ## S3 method 179 | s3 <- function(x) UseMethod("s3") 180 | s3.integer <- f 181 | 182 | ## S4 method 183 | A <- setClass("A", slots = c(a = "list")) 184 | setGeneric("s4", function(x) standardGeneric("s4")) 185 | setMethod("s4", "A", f) 186 | a <- A() 187 | 188 | ## S4 Reference Class 189 | B <- setRefClass("B", methods = list(rc = f)) 190 | b <- B$new() 191 | ``` 192 | 193 | 194 | ```r 195 | microbenchmark( 196 | fun = f(), 197 | S3 = s3(1L), 198 | S4 = s4(a), 199 | RC = b$rc() 200 | ) 201 | ``` 202 | 203 | ## Name lookup 204 | 205 | To find the value of a symbol is difficult, given that **everything** 206 | can be modified (extreme dynamics, see above) and lexical scoping. 207 | 208 | 209 | ```r 210 | a <- 1 211 | f <- function() { 212 | g <- function() { 213 | print(a) ## from global workspace 214 | assign("a", 2, envir = parent.frame()) 215 | print(a) ## f's environment 216 | a <- 3 217 | print(a) ## g's environment 218 | } 219 | g() 220 | } 221 | f() 222 | ``` 223 | 224 | ``` 225 | ## [1] 1 226 | ## [1] 2 227 | ## [1] 3 228 | ``` 229 | 230 | And by **everything**, we mean really nearly **everything**: `+`, `^`, `(`, and `{` in 231 | 232 | 233 | ```r 234 | f <- function(x, y) { 235 | (x + y) ^ 2 236 | } 237 | ``` 238 | 239 | ## Extracting a single value from a data frame 240 | 241 | 242 | ```r 243 | microbenchmark( 244 | "[32, 11]" = mtcars[32, 11], 245 | "$carb[32]" = mtcars$carb[32], 246 | "[[c(11, 32)]]" = mtcars[[c(11, 32)]], 247 | "[[11]][32]" = mtcars[[11]][32], 248 | ".subset" = .subset(mtcars, 11)[32], 249 | ".subset2" = .subset2(mtcars, 11)[32] 250 | ) 251 | ``` 252 | 253 | 254 | ## Improving R's performance 255 | 256 | R implementation: 257 | 258 | - [GNU R](http://www.r-project.org/) 259 | - [pqR](http://www.pqr-project.org) 260 | - [Renjin](http://www.renjin.org) 261 | - [FastR](https://github.com/allr/fastr) 262 | - [Riposte](https://github.com/jtalbot/riposte) 263 | - [CXXR](http://www.cs.kent.ac.uk/project/cxxr/) 264 | 265 | Several of these projects implement *deferred evaluation*, 266 | i.e. evaluations are only executed if they really need to be. We will 267 | see (and implement) and example in the `Rcpp` section. 268 | 269 | # Profiling 270 | 271 | ### Reminder 272 | 273 | > We should forget about small efficiencies, say about 97% of the 274 | > time: premature optimization is the root of all evil. Yet we should 275 | > not pass up our opportunities in that critical 3%. A good programmer 276 | > will not be lulled into complacency by such reasoning, he will be 277 | > wise to look carefully at the critical code; but only after that 278 | > code has been identified 279 | 280 | Knuth, Donald. *Structured Programming with `go to` Statements*, ACM 281 | Journal Computing Surveys, Vol 6, No. 4, Dec. 1974. p.268. 282 | 283 | ### Optimisation 284 | 285 | Optimisations often have their own costs: 286 | 287 | - Trade-off fast vs dangerous, flexibility/functionality vs performance. 288 | - Use any assumptions about the data, at the cost of generalisation. 289 | 290 | Pat Burns reminds us that 291 | 292 | > Our first duty is to create clear, correct code. Only consider 293 | > optimising your code when: 294 | 295 | - The code is debugged and stable. 296 | - Optimisation is likely to make a significant impact. 297 | 298 | then 299 | 300 | 1. Find the major bottleneck: code *profiling*. 301 | 2. Try to eliminate it. 302 | 3. Repeat until *fast enough*: ideally, define fast enough in advance. 303 | 304 | #### Make sure the code remains correct 305 | 306 | 307 | ```r 308 | x <- runif(100) 309 | all.equal(sqrt(x), x ^ 0.5) 310 | ``` 311 | 312 | ``` 313 | ## [1] TRUE 314 | ``` 315 | 316 | and/or [unit 317 | tests](https://github.com/lgatto/2016-02-25-adv-programming-EMBL/blob/master/unittesting.md) 318 | to compare different implementations (and regression test). 319 | 320 | #### Are implementations really equivalent? 321 | 322 | 323 | ```r 324 | library("sequences") 325 | gccount 326 | gccountr <- function(x) table(strsplit(x, "")[[1]]) 327 | gccountr2 <- function(x) tabulate(factor(strsplit(x, "")[[1]])) 328 | ``` 329 | 330 | Checking that our different implementations give the same results: 331 | 332 | 333 | ```r 334 | s <- paste(sample(c("A", "C", "G", "T"), 335 | 100, replace = TRUE), 336 | collapse = "") 337 | 338 | gccount(s) 339 | gccountr(s) 340 | gccountr2(s) 341 | ``` 342 | 343 | But are they really the same? Are we really comparing the same 344 | functionalities? 345 | 346 | Is it worth it? 347 | 348 | 349 | ```r 350 | library("microbenchmark") 351 | microbenchmark(gccount(s), 352 | gccountr(s), 353 | gccountr2(s), 354 | times = 1e4, 355 | unit = "eps") 356 | ``` 357 | 358 | 359 | ```r 360 | library("ggplot2") 361 | mb <- microbenchmark(gccount(s), 362 | gccountr(s), 363 | gccountr2(s)) 364 | print(mb) 365 | microbenchmark:::autoplot.microbenchmark(mb) 366 | ``` 367 | 368 | ## Profiling tools 369 | 370 | Sampling or statistical profiling: at regular intervals, stop the 371 | execution and record which functions are being executing and the call 372 | stack. 373 | 374 | - `Rprof` (and `summaryRprof`): record and summarise timings at fixed 375 | intervals (default `interval` is 0.02 seconds). 376 | 377 | - [`proftools`](https://cran.rstudio.com/web/packages/proftools/index.html) 378 | package: Tools for examining `Rprof` profile output. Comes with an 379 | extensive 380 | [vignette](https://cran.rstudio.com/web/packages/proftools/vignettes/proftools.pdf). 381 | 382 | - [`profvis`](https://github.com/rstudio/profvis): interactive 383 | visualisation collected by `Rprof`, whith emphasis on *lines of 384 | code*. See also and 385 | [*Introduction to profvis*](http://rpubs.com/wch/123888). 386 | 387 | 388 | 389 | - ([`lineprof`](https://github.com/hadley/lineprof) [\*] package: each 390 | *line of code* is profiled. This is less precise (than `Rprof`) but 391 | easier to interprete. Code must be sourced with `source()`.) 392 | - ([`profr`](https://cran.rstudio.com/web/packages/profr/index.html) 393 | package [\*]: provides an alternative data structure and visual 394 | rendering for the profiling information generated by `Rprof). 395 | 396 | [\*] [`lineprof`](https://github.com/hadley/lineprof) and 397 | [`profr`](https://cran.rstudio.com/web/packages/profr/index.html) and 398 | are now deprecated in favour of 399 | [`profvis`](https://github.com/rstudio/profvis). 400 | 401 | 402 | ## `Rprof` 403 | 404 | 405 | ```r 406 | m <- matrix(rnorm(1e6), ncol = 10) 407 | 408 | Rprof("rprof") 409 | res <- apply(m, 1, mean, trim=.3) 410 | Rprof(NULL) 411 | summaryRprof("rprof") 412 | ``` 413 | 414 | ## `profvis` 415 | 416 | Needs to `source()` the code or directly input the code to have access 417 | to the individual lines. 418 | 419 | 420 | ```r 421 | f <- function() { 422 | pause(0.1) 423 | g() 424 | h() 425 | } 426 | 427 | g <- function() { 428 | pause(0.1) 429 | h() 430 | } 431 | 432 | h <- function() { 433 | pause(0.1) 434 | } 435 | ``` 436 | 437 | 438 | ```r 439 | library("profvis") 440 | source("lineprof-example.R") 441 | profvis(f()) 442 | ``` 443 | 444 | ![profviz inteface](./figs/profvis.png) 445 | 446 | ## Limitations 447 | 448 | - Not profiling of C/C++ code, or primitive functions, or byte 449 | compiled code. 450 | - Anonymous functions are labelled as *anonymous*; name them 451 | explicitly in such cases. 452 | 453 | ## Exercise 454 | 455 | Profile the code chunk that calculates the timmed means using 456 | `profvis` and interpret the results. 457 | 458 | 459 | ```r 460 | profvis({ 461 | m <- matrix(rnorm(1e6), ncol = 10) 462 | res <- apply(m, 1, mean, trim=.3) 463 | sum(res) 464 | }) 465 | ``` 466 | 467 | # Optimisation 468 | 469 | ## Look for existing solutions 470 | 471 | - `readr::read_csv` or `data.table::fread` instead of `read_csv` 472 | 473 | ## Do as little as possible 474 | 475 | - `gccountr` vs `gccountr2` example above 476 | - simpler data structures 477 | - set `colClasses` in `read.csv` 478 | 479 | - Usual suspects: names, growing objects: 480 | 481 | ## Names 482 | 483 | 484 | ```r 485 | make_id2GO <- function(n = 1e3) { ## could be 1e4 - 1e5 486 | gn <- sprintf(paste0("ENSG%0", 10, "d"), sample(1e6, n)) 487 | goid <- function(n = 10) sprintf(paste0("GO:%0", 10, "d"), sample(1e6, n)) 488 | structure(replicate(n, goid(sample(50, 1))), 489 | names = gn) 490 | } 491 | id2GO <- make_id2GO() 492 | ``` 493 | 494 | We have a list of 1000 genes, and each of these genes is 495 | characterised by a set of 1 to 50 GO terms. 496 | 497 | To obtain the go terms, we `unlist` the gene list. 498 | 499 | 500 | ```r 501 | length(id2GO) 502 | str(head(id2GO)) 503 | str(unlist(id2GO)) 504 | ``` 505 | 506 | This can be executed much faster if we ignore the names in the 507 | original list. 508 | 509 | 510 | ```r 511 | library(microbenchmark) 512 | microbenchmark(unlist(l), 513 | unlist(l, use.names = FALSE), 514 | times = 10) 515 | ``` 516 | 517 | ## Initialise, do not grow dynamically (to avoid copies) 518 | 519 | 520 | ```r 521 | f1 <- function(n) { 522 | a <- NULL 523 | for (i in 1:n) a <- c(a, sqrt(i)) 524 | a 525 | } 526 | 527 | f2 <- function(n) { 528 | a <- numeric(n) 529 | for (i in 1:n) a[i] <- sqrt(i) 530 | a 531 | } 532 | ``` 533 | 534 | 535 | ```r 536 | microbenchmark(f1(1e3), f2(1e3)) 537 | microbenchmark(f1(1e4), f2(1e4)) 538 | ``` 539 | 540 | ## Pass-by-reference with environments 541 | 542 | When passing an environment as function argument, it is **not** 543 | copied: all its values are accessible within the function and can be 544 | persistently modified. 545 | 546 | 547 | ```r 548 | e <- new.env() 549 | e$x <- 1 550 | f <- function(myenv) myenv$x <- 2 551 | f(e) 552 | e$x 553 | ``` 554 | 555 | ``` 556 | ## [1] 2 557 | ``` 558 | 559 | This is used in the `eSet` et al. microarray data structures to store 560 | the expression data. 561 | 562 | ## Vectorisation 563 | 564 | 565 | ```r 566 | f3 <- function(n) 567 | sapply(seq_len(n), sqrt) 568 | 569 | f4 <- function(n) sqrt(n) 570 | ``` 571 | 572 | ![Initialisation and vectorisation](./figs/vectimings.png) 573 | 574 | 575 | 576 | Code vectorisation is not only about avoiding loops at all cost, and 577 | replacing them with `*apply`. As we have seen, this does not make any 578 | real difference in terms of speed. 579 | 580 | Difference between vectorisation in high level code, to improve 581 | clarity (`apply`, `Vectorise`, ...) and, vectorise to improve 582 | performance, which involved re-writing for loops in C/C++ (see below). 583 | 584 | ## Re-implementing the code in C/C++ 585 | 586 | See below 587 | 588 | ## Parallelisation 589 | 590 | See the [R-parallel](https://github.com/lgatto/R-parallel/blob/f83667f3ef62ca3f82a7f2ef58b05db497da4b75/parallel.pdf) material. 591 | 592 | ## Byte-code compilation 593 | 594 | The `compile::cmpfun` function compiles the body of a closure and 595 | returns a new closure with the same formals and the body replaced by 596 | the compiled body expression. It does not always provide a speed 597 | improvement, but is very easy to implement. 598 | 599 | 600 | ```r 601 | lapply2 <- function(x, f, ...) { 602 | out <- vector("list", length(x)) 603 | for (i in seq_along(x)) { 604 | out[[i]] <- f(x[[i]], ...) 605 | } 606 | out 607 | } 608 | 609 | lapply2_c <- compiler::cmpfun(lapply2) 610 | 611 | x <- list(1:10, letters, c(FALSE, TRUE), NULL) 612 | 613 | 614 | microbenchmark( 615 | lapply2(x, is.null), 616 | lapply2_c(x, is.null), 617 | lapply(x, is.null)) 618 | ``` 619 | 620 | Note that all base R functions are aleady byte compiled. This can be 621 | observed with the `` attribute of a function. 622 | 623 | # Memory 624 | (See Chapter 18 in *Advanced R* for more details) 625 | 626 | Assessing memory needs is useful to save memory in general and limit 627 | memory access (read/write), which is one common bottleneck in R. 628 | 629 | Requirement: 630 | 631 | 632 | ```r 633 | library("pryr") 634 | library("profvis") 635 | ``` 636 | 637 | ## Object size 638 | 639 | 640 | ```r 641 | x <- 1:1e5 642 | object.size(x) 643 | print(object.size(x), units = "Kb") 644 | object_size(x) 645 | ``` 646 | 647 | But, `object.size` does not account for shared elements, nor for the 648 | size of environments. 649 | 650 | 651 | ```r 652 | ll <- list(x, x, x) 653 | print(object.size(ll), units = "Kb") 654 | object_size(ll) 655 | ``` 656 | 657 | But, this does not hold when there's no shared components: 658 | 659 | 660 | ```r 661 | x <- 1:1e6 662 | y <- list(1:1e6, 1:1e6, 1:1e6) 663 | object_size(x) 664 | object_size(y) 665 | ``` 666 | 667 | Environments: 668 | 669 | 670 | ```r 671 | e <- new.env() 672 | object.size(e) 673 | object_size(e) 674 | e$a <- 1:1e6 675 | object.size(e) 676 | object_size(e) 677 | ``` 678 | 679 | ### Exercises 680 | 681 | - What is the object size of an empty numeric vector? 682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | - How does the size of a numeric vector grow with it size (say from 696 | 0 to 50)? 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | ## Memory usage 708 | 709 | To get the total size of all object that were created by R that 710 | currently take space in memory: 711 | 712 | 713 | ```r 714 | mem_used() 715 | ``` 716 | 717 | To track memory change 718 | 719 | 720 | ```r 721 | mem_change(v <- 1:1e6) 722 | mem_change(rm(v)) 723 | ``` 724 | 725 | 726 | ```r 727 | rm(list = ls()) 728 | mem_change(x <- 1:1e6) 729 | mem_change(y <- x) 730 | mem_change(rm(x)) 731 | mem_change(rm(y)) 732 | ``` 733 | 734 | #### Garbage collection 735 | 736 | When objects in memory are not accessed from R anymore, there is no 737 | need to explicitly free up that memory chunk explicity. This is done 738 | automatically by the garbage collector, as illustrated in the examples 739 | above. There is no need to call it explicityly with `gc()`; the only 740 | effect of this is for R to explicitly return memory to the OS. 741 | 742 | ## Modifiation in place 743 | 744 | What happens in this cas? Is `x` copied (and hence more memory is used 745 | up), or is it modified in place? 746 | 747 | 748 | ```r 749 | x <- 1:10 750 | c(address(x), refs(x)) 751 | x[5] <- 0L 752 | c(address(x), refs(x)) 753 | 754 | 755 | y <- x 756 | c(address(x), refs(x)) 757 | c(address(y), refs(y)) 758 | 759 | x[5] <- 1L 760 | c(address(x), refs(x)) 761 | c(address(y), refs(y)) 762 | 763 | 764 | x <- 1:5 765 | y <- x 766 | c(address(x), refs(x)) 767 | rm(y) 768 | c(address(x), refs(x)) ## should be 1 769 | 770 | x <- 1:5 771 | y <- x 772 | z <- x 773 | c(address(x), refs(x)) ## should be 3 774 | ``` 775 | 776 | `tracemem` tracks memory location of objects: 777 | 778 | 779 | ```r 780 | x <- 1:10 781 | tracemem(x) 782 | x[5] <- 0L 783 | 784 | y <- x 785 | x[5] <- 10L 786 | 787 | address(x) 788 | address(y) 789 | ``` 790 | 791 | ## Exercise 792 | 793 | We have quantified the substantial cost in execution time when growing 794 | data dynamically. Trace the impact of dynamically growing a vector on 795 | the memory. You can use `f1()` and `f2()` above, and trace `a`. 796 | 797 | 798 | 799 | 800 | # Rcpp 801 | 802 | See [here](https://github.com/lgatto/rccpp/blob/b59a1ee23dd1ace7c45bb9e2239e853c93e9ca0c/rc.md). 803 | 804 | # Big data 805 | 806 | - [CRAN High-Performance and Parallel Computing task view](http://cran.r-project.org/web/views/HighPerformanceComputing.html). 807 | - Storing data in database or databases-like structures: `RMySQL`, 808 | `RdbiPgSQL`, \ldots, `RSQLite`, `qldf`, `data.table` (the 809 | `data.table::fread`, when `read.table` is slow, also `scan`), 810 | `dplyr`, ... packages 811 | - The `ff` package by Adler et al. offers file-based access to data 812 | sets that are too large to be loaded into memory, along with a 813 | number of higher-level functions 814 | - The `bigmemory` package by Kane and Emerson permits storing large 815 | objects such as matrices in memory (as well as via files) and uses 816 | `external pointer` objects to refer to them 817 | - `netCDF` data files: `ncdf` and `RNetCDF` packages 818 | - `hdf5` format: `rhdf5` package 819 | - `mmap` memory-mapped files/devices I/O 820 | - hadoop and R 821 | - See http://r-pbd.org/ and the 822 | [pbdDemo](http://cran.r-project.org/web/packages/pbdDEMO/) 823 | package/vignette. 824 | - [Bioconductor in the cloud](http://bioconductor.org/help/bioconductor-cloud-ami/) 825 | - [Bioconductor docker containers](http://bioconductor.org/help/docker/) 826 | - ... 827 | 828 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | %.md: %.Rmd 2 | Rscript -e "knitr::knit('$^')" 3 | 4 | %.html: %.Rmd 5 | Rscript -e "rmarkdown::render('$^', output_format=rmarkdown::ioslides_presentation())" 6 | # Rscript -e "rmarkdown::render('$^', output_format=rmarkdown::html_document())" 7 | 8 | %.R: %.Rmd 9 | Rscript -e "knitr::purl('$^')" 10 | perl -pi -e 's/^## (--.+)?//' $@ 11 | 12 | all: 13 | make 01-intro.md 02-funprog.md 03-debug.md 04-perf.md unittesting.md 14 | make 01-intro.R 02-funprog.R 03-debug.R 04-perf.R 15 | 16 | .PHONY: all 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # R debugging and robust programming 2 | 3 | **Short link to this page: http://bit.ly/advrEMBL2016** 4 | 5 | 25-26 February 2016 (Thursday-Friday), EMBL Heidelberg 6 | Instructors: Laurent Gatto, Robert Stojnic (University of Cambridge) 7 | Organiser: Wolfgang Huber (EMBL) 8 | 9 | This two-day course will teach participants debugging techniques and 10 | good practice in writing reliable, robust code. The material will 11 | provide the opportunity to gain experience and understanding of how to 12 | identify, resolve, and avoid bugs, in order to produce 13 | publication-quality code. The course will be taught using R and will 14 | be driven by many practical exercises. Course outline 15 | 16 | The material will focus on: 17 | 18 | - debugging, to fix problems with code 19 | - defensive programming - writing effective tests to detect bugs 20 | - profiling and optimisation of code 21 | 22 | ## Pre-requisites 23 | 24 | The course is aimed at those with experience of scripting, who want to 25 | learn more about writing robust and efficient code and who may want to 26 | develop and release packages in the future. 27 | 28 | ## Content 29 | 30 | #### [Part I:](https://github.com/lgatto/2016-02-25-adv-programming-EMBL/blob/master/01-intro.md) 31 | - Coding style(s) 32 | - Interactive use and programming 33 | - Environments 34 | - Tidy data 35 | - Computing on the language 36 | 37 | #### [Part II: Functional programming](https://github.com/lgatto/2016-02-25-adv-programming-EMBL/blob/master/02-funprog.md) 38 | - Functions 39 | - Robust programming with functions 40 | - Scoping 41 | - Closures 42 | - High-level functions 43 | - Vectorisation 44 | 45 | #### [Part III: Debugging](https://github.com/lgatto/2016-02-25-adv-programming-EMBL/blob/master/03-debug.md) 46 | - Defensive programming 47 | - Debbugging: techniques and tools 48 | - Condition handling: try/tryCatch 49 | - [Unit testing](https://github.com/lgatto/2016-02-25-adv-programming-EMBL/blob/master/unittesting.md) 50 | 51 | #### [Part IV: Performance](https://github.com/lgatto/2016-02-25-adv-programming-EMBL/blob/master/04-perf.md) 52 | - Benchmarking 53 | - Profiling 54 | - Optimisation 55 | - Memory 56 | - [Rcpp](https://github.com/lgatto/rccpp/blob/master/rc.md) 57 | 58 | #### Other topics 59 | - Packages and documentation 60 | - Reproducible research and vignettes (`Rmarkdown`) 61 | - Source code versioning with (for example) git and GitHub 62 | - Automation with (for example) `Make` 63 | 64 | See the [TeachingMaterial](http://lgatto.github.io/TeachingMaterial/) 65 | repository for content. 66 | 67 | ## References 68 | 69 | - [Previous courses](https://github.com/lgatto/teachingmaterial) and [here](https://github.com/DataProgrammers/2015-01-15-EMBLHeidelberg). 70 | - [Advanced R](http://adv-r.had.co.nz/), Hadley Wickham. 71 | - [The R Inferno](http://www.burns-stat.com/documents/books/the-r-inferno/), Patrick Burns. 72 | - [An Introduction to the Interactive Debugging Tools in R](http://www.biostat.jhsph.edu/~rpeng/docs/R-debug-tools.pdf), Roger D. Peng. 73 | - [R Programming for Bioinformatics](http://master.bioconductor.org/help/publications/books/r-programming-for-bioinformatics/), Robert Gentleman. 74 | 75 | 76 | ## License 77 | 78 | This work is licensed under a CC BY-SA 3.0 License. 79 | -------------------------------------------------------------------------------- /figs/covr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/covr.png -------------------------------------------------------------------------------- /figs/debugRStudio1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/debugRStudio1.png -------------------------------------------------------------------------------- /figs/debugRStudio2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/debugRStudio2.png -------------------------------------------------------------------------------- /figs/envex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/envex.png -------------------------------------------------------------------------------- /figs/envex.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 21 | 23 | 30 | 35 | 36 | 43 | 49 | 50 | 57 | 63 | 64 | 65 | 83 | 85 | 86 | 88 | image/svg+xml 89 | 91 | 92 | 93 | 94 | 95 | 99 | 106 | "1" 117 | 120 | 127 | 134 | 141 | 142 | 1 2 3 153 | 160 | 123 171 | TRUE 182 | 189 | 191 | 194 | 201 | 204 | 211 | a 222 | 223 | 226 | 233 | b 244 | 245 | 248 | 255 | c 266 | 267 | 277 | 278 | e 289 | 290 | 292 | 295 | 302 | 305 | 312 | d 323 | 324 | 327 | 334 | e 345 | 346 | 347 | e2 358 | 359 | 366 | 369 | 376 | x 387 | 388 | 398 | 403 | 409 | R_GlobalEnv 421 | 424 | "!" 435 | 442 | 443 | 448 | 453 | 458 | 463 | 468 | 473 | 474 | 475 | -------------------------------------------------------------------------------- /figs/funs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/funs.png -------------------------------------------------------------------------------- /figs/optim.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/optim.png -------------------------------------------------------------------------------- /figs/profvis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/profvis.png -------------------------------------------------------------------------------- /figs/sinfun.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/sinfun.png -------------------------------------------------------------------------------- /figs/style.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/style.png -------------------------------------------------------------------------------- /figs/vectimings.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/vectimings.png -------------------------------------------------------------------------------- /lineprof-example.R: -------------------------------------------------------------------------------- 1 | library("lineprof") 2 | 3 | f <- function() { 4 | pause(0.1) 5 | g() 6 | h() 7 | } 8 | 9 | g <- function() { 10 | pause(0.1) 11 | h() 12 | } 13 | 14 | h <- function() { 15 | pause(0.1) 16 | } 17 | -------------------------------------------------------------------------------- /unittesting.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Unit testing" 3 | author: "Laurent Gatto" 4 | --- 5 | 6 | These exercises were written by Martin Morgan and Laurent Gatto for a 7 | [Bioconductor Developer Day workshop](http://bioconductor.org/help/course-materials/2013/BioC2013/developer-day-debug/). 8 | 9 | # Introduction 10 | 11 | > Whenever you are templted to type something into a print statement 12 | > or a debugger expression, write it as a test insted -- Martin Fowler 13 | 14 | **Why unit testing?** 15 | 16 | - Writing code to test code; 17 | - anticipate bugs, in particular for edge cases; 18 | - anticipate disruptive updates; 19 | - document and test observed bugs using specific tests. 20 | 21 | Each section provides a function that supposedly works as expected, 22 | but quickly proves to misbehave. The exercise aims at first writing 23 | some dedicated testing functions that will identify the problems and 24 | then update the function so that it passes the specific tests. This 25 | practice is called unit testing and we use the RUnit package for 26 | this. 27 | 28 | See the 29 | [Unit Testing How-To](http://bioconductor.org/developers/how-to/unitTesting-guidelines/) 30 | guide for details on unit testing using the 31 | [`RUnit`](http://cran.r-project.org/web/packages/RUnit/index.html) 32 | package. The 33 | [`testthat`](http://cran.r-project.org/web/packages/testthat/) is 34 | another package that provides unit testing infrastructure. Both 35 | packages can conveniently be used to automate unit testing within 36 | package testing. 37 | 38 | # Example 39 | 40 | ## Subsetting 41 | 42 | ### Problem 43 | 44 | This function should return the elements of `x` that are in `y`. 45 | 46 | ```{r} 47 | ## Example 48 | isIn <- function(x, y) { 49 | sel <- match(x, y) 50 | y[sel] 51 | } 52 | 53 | ## Expected 54 | x <- sample(LETTERS, 5) 55 | isIn(x, LETTERS) 56 | ``` 57 | But 58 | 59 | ```{r} 60 | ## Bug! 61 | isIn(c(x, "a"), LETTERS) 62 | ``` 63 | 64 | ### Solution 65 | 66 | Write a unit test that demonstrates the issue 67 | 68 | ```{r} 69 | ## Unit test: 70 | library("RUnit") 71 | test_isIn <- function() { 72 | x <- c("A", "B", "Z") 73 | checkIdentical(x, isIn(x, LETTERS)) 74 | checkIdentical(x, isIn(c(x, "a"), LETTERS)) 75 | 76 | } 77 | 78 | test_isIn() 79 | ``` 80 | 81 | Update the buggy function until the unit test succeeds 82 | 83 | ```{r} 84 | ## updated function 85 | isIn <- function(x, y) { 86 | sel <- x %in% y 87 | x[sel] 88 | } 89 | 90 | test_isIn() ## the bug is fixed and monitored 91 | ``` 92 | 93 | ## The `testthat` syntax 94 | 95 | `expect_that(object_or_expression, condition)` with conditions 96 | - equals: `expect_that(1+2,equals(3))` or `expect_equal(1+2,3)` 97 | - gives warning: `expect_that(warning("a")`, `gives_warning())` 98 | - is a: `expect_that(1, is_a("numeric"))` or `expect_is(1,"numeric")` 99 | - is true: `expect_that(2 == 2, is_true())` or `expect_true(2==2)` 100 | - matches: `expect_that("Testing is fun", matches("fun"))` or `expect_match("Testing is fun", "f.n")` 101 | - takes less: `than expect_that(Sys.sleep(1), takes_less_than(3))` 102 | 103 | and 104 | 105 | ```r 106 | test_that("description", { 107 | a <- foo() 108 | b <- bar() 109 | expect_equal(a, b) 110 | }) 111 | ``` 112 | 113 | ## Batch unit testing 114 | 115 | ```r 116 | library("testthat") 117 | test_dir("./unittests/") 118 | test_file("./unittests/test_foo.R") 119 | ``` 120 | 121 | # Exercises 122 | 123 | ## Column means 124 | 125 | ## Problem 126 | 127 | The `col_means` function computes the means of all numeric columns in 128 | a data frame (example from *Advanced R*, to illustrate defensive 129 | programming). 130 | 131 | ```{r} 132 | col_means <- function(df) { 133 | numeric <- sapply(df, is.numeric) 134 | numeric_cols <- df[, numeric] 135 | data.frame(lapply(numeric_cols, mean)) 136 | } 137 | 138 | ## Expected 139 | col_means(mtcars) 140 | 141 | ## Bugs 142 | col_means(mtcars[, "mpg"]) 143 | col_means(mtcars[, "mpg", drop=FALSE]) 144 | col_means(mtcars[, 0]) 145 | col_means(mtcars[0, ]) 146 | col_means(as.list(mtcars)) 147 | ``` 148 | 149 | ## Character matching 150 | 151 | ### Problem 152 | 153 | What are the exact matches of `x` in `y`? 154 | 155 | ```{r} 156 | isExactIn <- function(x, y) 157 | y[grep(x, y)] 158 | 159 | ## Expected 160 | isExactIn("a", letters) 161 | 162 | ## Bugs 163 | isExactIn("a", c("abc", letters)) 164 | isExactIn(c("a", "z"), c("abc", letters)) 165 | ``` 166 | 167 | ### Solution 168 | 169 | ```{r} 170 | ## Unit test: 171 | library("RUnit") 172 | test_isExactIn <- function() { 173 | checkIdentical("a", isExactIn("a", letters)) 174 | checkIdentical("a", isExactIn("a", c("abc", letters))) 175 | checkIdentical(c("a", "z"), isExactIn(c("a", "z"), c("abc", letters))) 176 | } 177 | 178 | test_isExactIn() 179 | 180 | ## updated function: 181 | isExactIn <- function(x, y) 182 | x[x %in% y] 183 | 184 | test_isExactIn() 185 | ``` 186 | 187 | ## If conditions with length > 1 188 | 189 | ### Problem 190 | 191 | If `x` is greater than `y`, we want the difference of their 192 | squares. Otherwise, we want the sum. 193 | 194 | ```{r} 195 | ifcond <- function(x, y) { 196 | if (x > y) { 197 | ans <- x*x - y*y 198 | } else { 199 | ans <- x*x + y*y 200 | } 201 | ans 202 | } 203 | 204 | ## Expected 205 | ifcond(3, 2) 206 | ifcond(2, 2) 207 | ifcond(1, 2) 208 | 209 | ## Bug! 210 | ifcond(3:1, c(2, 2, 2)) 211 | ``` 212 | 213 | ### Solution 214 | 215 | ```{r} 216 | ## Unit test: 217 | library("RUnit") 218 | test_ifcond <- function() { 219 | checkIdentical(5, ifcond(3, 2)) 220 | checkIdentical(8, ifcond(2, 2)) 221 | checkIdentical(5, ifcond(1, 2)) 222 | checkIdentical(c(5, 8, 5), ifcond(3:1, c(2, 2, 2))) 223 | } 224 | 225 | test_ifcond() 226 | 227 | ## updated function: 228 | ifcond <- function(x, y) 229 | ifelse(x > y, x*x - y*y, x*x + y*y) 230 | 231 | test_ifcond() 232 | ``` 233 | 234 | ## Know your inputs 235 | 236 | ### Problem 237 | 238 | Calculate the euclidean distance between a single point and a set of 239 | other points. 240 | 241 | ```{r} 242 | ## Example 243 | distances <- function(point, pointVec) { 244 | x <- point[1] 245 | y <- point[2] 246 | xVec <- pointVec[,1] 247 | yVec <- pointVec[,2] 248 | sqrt((xVec - x)^2 + (yVec - y)^2) 249 | } 250 | 251 | ## Expected 252 | x <- rnorm(5) 253 | y <- rnorm(5) 254 | 255 | (m <- cbind(x, y)) 256 | (p <- m[1, ]) 257 | 258 | distances(p, m) 259 | 260 | ## Bug! 261 | (dd <- data.frame(x, y)) 262 | (q <- dd[1, ]) 263 | 264 | distances(q, dd) 265 | ``` 266 | 267 | ### Solution 268 | 269 | ```{r} 270 | ## Unit test: 271 | library("RUnit") 272 | test_distances <- function() { 273 | x <- y <- c(0, 1, 2) 274 | m <- cbind(x, y) 275 | p <- m[1, ] 276 | dd <- data.frame(x, y) 277 | q <- dd[1, ] 278 | expct <- c(0, sqrt(c(2, 8))) 279 | checkIdentical(expct, distances(p, m)) 280 | checkIdentical(expct, distances(q, dd)) 281 | } 282 | 283 | test_distances() 284 | 285 | ## updated function 286 | distances <- function(point, pointVec) { 287 | point <- as.numeric(point) 288 | x <- point[1] 289 | y <- point[2] 290 | xVec <- pointVec[,1] 291 | yVec <- pointVec[,2] 292 | dist <- sqrt((xVec - x)^2 + (yVec - y)^2) 293 | return(dist) 294 | } 295 | 296 | test_distances() 297 | ``` 298 | 299 | ## Iterate on 0 length 300 | 301 | ### Problem 302 | 303 | Calculate the square root of the absolute value of a set of numbers. 304 | 305 | ```{r} 306 | sqrtabs <- function(x) { 307 | v <- abs(x) 308 | sapply(1:length(v), function(i) sqrt(v[i])) 309 | } 310 | 311 | ## Expected 312 | all(sqrtabs(c(-4, 0, 4)) == c(2, 0, 2)) 313 | 314 | ## Bug! 315 | sqrtabs(numeric()) 316 | ``` 317 | 318 | ### Solution 319 | 320 | ```{r} 321 | ## Unit test: 322 | library(RUnit) 323 | test_sqrtabs <- function() { 324 | checkIdentical(c(2, 0, 2), sqrtabs(c(-4, 0, 4))) 325 | checkIdentical(numeric(), sqrtabs(numeric())) 326 | } 327 | test_sqrtabs() 328 | 329 | ## updated function: 330 | sqrtabs <- function(x) { 331 | v <- abs(x) 332 | sapply(seq_along(v), function(i) sqrt(v[i])) 333 | } 334 | test_sqrtabs() # nope! 335 | 336 | sqrtabs <- function(x) { 337 | v <- abs(x) 338 | vapply(seq_along(v), function(i) sqrt(v[i]), 0) 339 | } 340 | test_sqrtabs() # yes! 341 | ``` 342 | 343 | # Unit testing in a package 344 | 345 | 346 | ## In a package 347 | 348 | 1. Create a directory `./mypackage/tests`. 349 | 2. Create the `testthat.R` file 350 | 351 | ```r 352 | library("testthat") 353 | library("mypackage") 354 | test_check("sequences") 355 | ``` 356 | 357 | 3. Create a sub-directory `./mypackage/tests/testthat` and include as 358 | many unit test files as desired that are named with the `test_` 359 | prefix and contain unit tests. 360 | 361 | 4. Suggest the unit testing package in your `DESCRIPTION` file: 362 | 363 | ``` 364 | Suggests: testthat 365 | ``` 366 | 367 | ## Example from the `sequences` package 368 | 369 | From the `./sequences/tests/testthat/test_sequences.R` file: 370 | 371 | ### Object creation and validity 372 | 373 | We have a fasta file and the corresponding `DnaSeq` object. 374 | 375 | 1. Let's make sure that the `DnaSeq` instance is valid, as changes in 376 | the class definition might have altered its validity. 377 | 378 | 2. Let's verify that `readFasta` regenerates and identical `DnaSeq` 379 | object given the original fasta file. 380 | 381 | ```r 382 | test_that("dnaseq validity", { 383 | data(dnaseq) 384 | expect_true(validObject(dnaseq)) 385 | }) 386 | 387 | test_that("readFasta", { 388 | ## loading _valid_ dnaseq 389 | data(dnaseq) 390 | ## reading fasta sequence 391 | f <- dir(system.file("extdata",package="sequences"),pattern="fasta",full.names=TRUE) 392 | xx <- readFasta(f[1]) 393 | expect_true(all.equal(xx, dnaseq)) 394 | }) 395 | ``` 396 | 397 | ### Multiple implementations 398 | 399 | Let's check that the R, C and C++ (via `Rcpp`) give the same result 400 | 401 | ```r 402 | test_that("ccpp code", { 403 | gccountr <- 404 | function(x) tabulate(factor(strsplit(x, "")[[1]])) 405 | x <- "AACGACTACAGCATACTAC" 406 | expect_true(identical(gccount(x), gccountr(x))) 407 | expect_true(identical(gccount2(x), gccountr(x))) 408 | }) 409 | ``` 410 | 411 | ## Exercise 412 | 413 | Choose any data package of your choice and write a unit test that 414 | tests the validity of all the its data. 415 | 416 | Tips 417 | 418 | - To get all the data distributed with a package, use `data(package = "packageName")` 419 | 420 | ```{r datatest0, eval=FALSE} 421 | library("pRolocdata") 422 | data(package = "pRolocdata") 423 | ``` 424 | 425 | - To test the validity of an object, use `validObject` 426 | 427 | ```{r dataset1, echo=FALSE} 428 | suppressPackageStartupMessages(library("pRolocdata")) 429 | ``` 430 | 431 | ```{r datatest2, eval=FALSE, eval=TRUE} 432 | data(andy2011) 433 | validObject(andy2011) 434 | ``` 435 | 436 | - Using the `testthat` syntax, the actual test for that data set would be 437 | 438 | ```{r datatest3, eval=TRUE} 439 | library("testthat") 440 | expect_true(validObject(andy2011)) 441 | ``` 442 | 443 | 444 | ## Testing coverage in a package 445 | 446 | The [covr](https://github.com/jimhester/covr) package: 447 | 448 | ![package coverage](./figs/covr.png) 449 | 450 | We can use `type="all"` to examine the coverage in unit tests, examples and vignettes. This can 451 | also be done interactively with Shiny: 452 | 453 | ```{r, eval=FALSE} 454 | library(covr) 455 | coverage <- package_coverage("/path/to/package/source", type="all") 456 | shine(coverage) 457 | ``` 458 | 459 | [Coverage for all Bioconductor packages](https://codecov.io/github/Bioconductor-mirror). 460 | -------------------------------------------------------------------------------- /unittesting.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Unit testing" 3 | author: "Laurent Gatto" 4 | --- 5 | 6 | These exercises were written by Martin Morgan and Laurent Gatto for a 7 | [Bioconductor Developer Day workshop](http://bioconductor.org/help/course-materials/2013/BioC2013/developer-day-debug/). 8 | 9 | # Introduction 10 | 11 | > Whenever you are templted to type something into a print statement 12 | > or a debugger expression, write it as a test insted -- Martin Fowler 13 | 14 | **Why unit testing?** 15 | 16 | - Writing code to test code; 17 | - anticipate bugs, in particular for edge cases; 18 | - anticipate disruptive updates; 19 | - document and test observed bugs using specific tests. 20 | 21 | Each section provides a function that supposedly works as expected, 22 | but quickly proves to misbehave. The exercise aims at first writing 23 | some dedicated testing functions that will identify the problems and 24 | then update the function so that it passes the specific tests. This 25 | practice is called unit testing and we use the RUnit package for 26 | this. 27 | 28 | See the 29 | [Unit Testing How-To](http://bioconductor.org/developers/how-to/unitTesting-guidelines/) 30 | guide for details on unit testing using the 31 | [`RUnit`](http://cran.r-project.org/web/packages/RUnit/index.html) 32 | package. The 33 | [`testthat`](http://cran.r-project.org/web/packages/testthat/) is 34 | another package that provides unit testing infrastructure. Both 35 | packages can conveniently be used to automate unit testing within 36 | package testing. 37 | 38 | # Example 39 | 40 | ## Subsetting 41 | 42 | ### Problem 43 | 44 | This function should return the elements of `x` that are in `y`. 45 | 46 | 47 | ```r 48 | ## Example 49 | isIn <- function(x, y) { 50 | sel <- match(x, y) 51 | y[sel] 52 | } 53 | 54 | ## Expected 55 | x <- sample(LETTERS, 5) 56 | isIn(x, LETTERS) 57 | ``` 58 | 59 | ``` 60 | ## [1] "I" "K" "S" "Y" "W" 61 | ``` 62 | But 63 | 64 | 65 | ```r 66 | ## Bug! 67 | isIn(c(x, "a"), LETTERS) 68 | ``` 69 | 70 | ``` 71 | ## [1] "I" "K" "S" "Y" "W" NA 72 | ``` 73 | 74 | ### Solution 75 | 76 | Write a unit test that demonstrates the issue 77 | 78 | 79 | ```r 80 | ## Unit test: 81 | library("RUnit") 82 | ``` 83 | 84 | ``` 85 | ## Loading required package: methods 86 | ``` 87 | 88 | ```r 89 | test_isIn <- function() { 90 | x <- c("A", "B", "Z") 91 | checkIdentical(x, isIn(x, LETTERS)) 92 | checkIdentical(x, isIn(c(x, "a"), LETTERS)) 93 | 94 | } 95 | 96 | test_isIn() 97 | ``` 98 | 99 | ``` 100 | ## Error in checkIdentical(x, isIn(c(x, "a"), LETTERS)): FALSE 101 | ## 102 | ``` 103 | 104 | Update the buggy function until the unit test succeeds 105 | 106 | 107 | ```r 108 | ## updated function 109 | isIn <- function(x, y) { 110 | sel <- x %in% y 111 | x[sel] 112 | } 113 | 114 | test_isIn() ## the bug is fixed and monitored 115 | ``` 116 | 117 | ``` 118 | ## [1] TRUE 119 | ``` 120 | 121 | ## The `testthat` syntax 122 | 123 | `expect_that(object_or_expression, condition)` with conditions 124 | - equals: `expect_that(1+2,equals(3))` or `expect_equal(1+2,3)` 125 | - gives warning: `expect_that(warning("a")`, `gives_warning())` 126 | - is a: `expect_that(1, is_a("numeric"))` or `expect_is(1,"numeric")` 127 | - is true: `expect_that(2 == 2, is_true())` or `expect_true(2==2)` 128 | - matches: `expect_that("Testing is fun", matches("fun"))` or `expect_match("Testing is fun", "f.n")` 129 | - takes less: `than expect_that(Sys.sleep(1), takes_less_than(3))` 130 | 131 | and 132 | 133 | ```r 134 | test_that("description", { 135 | a <- foo() 136 | b <- bar() 137 | expect_equal(a, b) 138 | }) 139 | ``` 140 | 141 | ## Batch unit testing 142 | 143 | ```r 144 | library("testthat") 145 | test_dir("./unittests/") 146 | test_file("./unittests/test_foo.R") 147 | ``` 148 | 149 | # Exercises 150 | 151 | ## Column means 152 | 153 | ## Problem 154 | 155 | The `col_means` function computes the means of all numeric columns in 156 | a data frame (example from *Advanced R*, to illustrate defensive 157 | programming). 158 | 159 | 160 | ```r 161 | col_means <- function(df) { 162 | numeric <- sapply(df, is.numeric) 163 | numeric_cols <- df[, numeric] 164 | data.frame(lapply(numeric_cols, mean)) 165 | } 166 | 167 | ## Expected 168 | col_means(mtcars) 169 | ``` 170 | 171 | ``` 172 | ## mpg cyl disp hp drat wt qsec vs 173 | ## 1 20.09062 6.1875 230.7219 146.6875 3.596563 3.21725 17.84875 0.4375 174 | ## am gear carb 175 | ## 1 0.40625 3.6875 2.8125 176 | ``` 177 | 178 | ```r 179 | ## Bugs 180 | col_means(mtcars[, "mpg"]) 181 | ``` 182 | 183 | ``` 184 | ## Error in df[, numeric]: incorrect number of dimensions 185 | ``` 186 | 187 | ```r 188 | col_means(mtcars[, "mpg", drop=FALSE]) 189 | ``` 190 | 191 | ``` 192 | ## X21 X21.1 X22.8 X21.4 X18.7 X18.1 X14.3 X24.4 X22.8.1 X19.2 X17.8 X16.4 193 | ## 1 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 194 | ## X17.3 X15.2 X10.4 X10.4.1 X14.7 X32.4 X30.4 X33.9 X21.5 X15.5 X15.2.1 195 | ## 1 17.3 15.2 10.4 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 196 | ## X13.3 X19.2.1 X27.3 X26 X30.4.1 X15.8 X19.7 X15 X21.4.1 197 | ## 1 13.3 19.2 27.3 26 30.4 15.8 19.7 15 21.4 198 | ``` 199 | 200 | ```r 201 | col_means(mtcars[, 0]) 202 | ``` 203 | 204 | ``` 205 | ## Error in .subset(x, j): invalid subscript type 'list' 206 | ``` 207 | 208 | ```r 209 | col_means(mtcars[0, ]) 210 | ``` 211 | 212 | ``` 213 | ## mpg cyl disp hp drat wt qsec vs am gear carb 214 | ## 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 215 | ``` 216 | 217 | ```r 218 | col_means(as.list(mtcars)) 219 | ``` 220 | 221 | ``` 222 | ## Error in df[, numeric]: incorrect number of dimensions 223 | ``` 224 | 225 | ## Character matching 226 | 227 | ### Problem 228 | 229 | What are the exact matches of `x` in `y`? 230 | 231 | 232 | ```r 233 | isExactIn <- function(x, y) 234 | y[grep(x, y)] 235 | 236 | ## Expected 237 | isExactIn("a", letters) 238 | ``` 239 | 240 | ``` 241 | ## [1] "a" 242 | ``` 243 | 244 | ```r 245 | ## Bugs 246 | isExactIn("a", c("abc", letters)) 247 | ``` 248 | 249 | ``` 250 | ## [1] "abc" "a" 251 | ``` 252 | 253 | ```r 254 | isExactIn(c("a", "z"), c("abc", letters)) 255 | ``` 256 | 257 | ``` 258 | ## Warning in grep(x, y): argument 'pattern' has length > 1 and only the first 259 | ## element will be used 260 | ``` 261 | 262 | ``` 263 | ## [1] "abc" "a" 264 | ``` 265 | 266 | ### Solution 267 | 268 | 269 | ```r 270 | ## Unit test: 271 | library("RUnit") 272 | test_isExactIn <- function() { 273 | checkIdentical("a", isExactIn("a", letters)) 274 | checkIdentical("a", isExactIn("a", c("abc", letters))) 275 | checkIdentical(c("a", "z"), isExactIn(c("a", "z"), c("abc", letters))) 276 | } 277 | 278 | test_isExactIn() 279 | ``` 280 | 281 | ``` 282 | ## Error in checkIdentical("a", isExactIn("a", c("abc", letters))): FALSE 283 | ## 284 | ``` 285 | 286 | ```r 287 | ## updated function: 288 | isExactIn <- function(x, y) 289 | x[x %in% y] 290 | 291 | test_isExactIn() 292 | ``` 293 | 294 | ``` 295 | ## [1] TRUE 296 | ``` 297 | 298 | ## If conditions with length > 1 299 | 300 | ### Problem 301 | 302 | If `x` is greater than `y`, we want the difference of their 303 | squares. Otherwise, we want the sum. 304 | 305 | 306 | ```r 307 | ifcond <- function(x, y) { 308 | if (x > y) { 309 | ans <- x*x - y*y 310 | } else { 311 | ans <- x*x + y*y 312 | } 313 | ans 314 | } 315 | 316 | ## Expected 317 | ifcond(3, 2) 318 | ``` 319 | 320 | ``` 321 | ## [1] 5 322 | ``` 323 | 324 | ```r 325 | ifcond(2, 2) 326 | ``` 327 | 328 | ``` 329 | ## [1] 8 330 | ``` 331 | 332 | ```r 333 | ifcond(1, 2) 334 | ``` 335 | 336 | ``` 337 | ## [1] 5 338 | ``` 339 | 340 | ```r 341 | ## Bug! 342 | ifcond(3:1, c(2, 2, 2)) 343 | ``` 344 | 345 | ``` 346 | ## Warning in if (x > y) {: the condition has length > 1 and only the first 347 | ## element will be used 348 | ``` 349 | 350 | ``` 351 | ## [1] 5 0 -3 352 | ``` 353 | 354 | ### Solution 355 | 356 | 357 | ```r 358 | ## Unit test: 359 | library("RUnit") 360 | test_ifcond <- function() { 361 | checkIdentical(5, ifcond(3, 2)) 362 | checkIdentical(8, ifcond(2, 2)) 363 | checkIdentical(5, ifcond(1, 2)) 364 | checkIdentical(c(5, 8, 5), ifcond(3:1, c(2, 2, 2))) 365 | } 366 | 367 | test_ifcond() 368 | ``` 369 | 370 | ``` 371 | ## Warning in if (x > y) {: the condition has length > 1 and only the first 372 | ## element will be used 373 | ``` 374 | 375 | ``` 376 | ## Error in checkIdentical(c(5, 8, 5), ifcond(3:1, c(2, 2, 2))): FALSE 377 | ## 378 | ``` 379 | 380 | ```r 381 | ## updated function: 382 | ifcond <- function(x, y) 383 | ifelse(x > y, x*x - y*y, x*x + y*y) 384 | 385 | test_ifcond() 386 | ``` 387 | 388 | ``` 389 | ## [1] TRUE 390 | ``` 391 | 392 | ## Know your inputs 393 | 394 | ### Problem 395 | 396 | Calculate the euclidean distance between a single point and a set of 397 | other points. 398 | 399 | 400 | ```r 401 | ## Example 402 | distances <- function(point, pointVec) { 403 | x <- point[1] 404 | y <- point[2] 405 | xVec <- pointVec[,1] 406 | yVec <- pointVec[,2] 407 | sqrt((xVec - x)^2 + (yVec - y)^2) 408 | } 409 | 410 | ## Expected 411 | x <- rnorm(5) 412 | y <- rnorm(5) 413 | 414 | (m <- cbind(x, y)) 415 | ``` 416 | 417 | ``` 418 | ## x y 419 | ## [1,] -3.1164934 -1.81741540 420 | ## [2,] 1.5741209 -0.05377479 421 | ## [3,] -0.6298675 0.68829382 422 | ## [4,] -1.3595685 -1.09019047 423 | ## [5,] 0.9666774 0.76398426 424 | ``` 425 | 426 | ```r 427 | (p <- m[1, ]) 428 | ``` 429 | 430 | ``` 431 | ## x y 432 | ## -3.116493 -1.817415 433 | ``` 434 | 435 | ```r 436 | distances(p, m) 437 | ``` 438 | 439 | ``` 440 | ## [1] 0.000000 5.011217 3.530140 1.901484 4.830725 441 | ``` 442 | 443 | ```r 444 | ## Bug! 445 | (dd <- data.frame(x, y)) 446 | ``` 447 | 448 | ``` 449 | ## x y 450 | ## 1 -3.1164934 -1.81741540 451 | ## 2 1.5741209 -0.05377479 452 | ## 3 -0.6298675 0.68829382 453 | ## 4 -1.3595685 -1.09019047 454 | ## 5 0.9666774 0.76398426 455 | ``` 456 | 457 | ```r 458 | (q <- dd[1, ]) 459 | ``` 460 | 461 | ``` 462 | ## x y 463 | ## 1 -3.116493 -1.817415 464 | ``` 465 | 466 | ```r 467 | distances(q, dd) 468 | ``` 469 | 470 | ``` 471 | ## x 472 | ## 1 0 473 | ``` 474 | 475 | ### Solution 476 | 477 | 478 | ```r 479 | ## Unit test: 480 | library("RUnit") 481 | test_distances <- function() { 482 | x <- y <- c(0, 1, 2) 483 | m <- cbind(x, y) 484 | p <- m[1, ] 485 | dd <- data.frame(x, y) 486 | q <- dd[1, ] 487 | expct <- c(0, sqrt(c(2, 8))) 488 | checkIdentical(expct, distances(p, m)) 489 | checkIdentical(expct, distances(q, dd)) 490 | } 491 | 492 | test_distances() 493 | ``` 494 | 495 | ``` 496 | ## Error in checkIdentical(expct, distances(q, dd)): FALSE 497 | ## 498 | ``` 499 | 500 | ```r 501 | ## updated function 502 | distances <- function(point, pointVec) { 503 | point <- as.numeric(point) 504 | x <- point[1] 505 | y <- point[2] 506 | xVec <- pointVec[,1] 507 | yVec <- pointVec[,2] 508 | dist <- sqrt((xVec - x)^2 + (yVec - y)^2) 509 | return(dist) 510 | } 511 | 512 | test_distances() 513 | ``` 514 | 515 | ``` 516 | ## [1] TRUE 517 | ``` 518 | 519 | ## Iterate on 0 length 520 | 521 | ### Problem 522 | 523 | Calculate the square root of the absolute value of a set of numbers. 524 | 525 | 526 | ```r 527 | sqrtabs <- function(x) { 528 | v <- abs(x) 529 | sapply(1:length(v), function(i) sqrt(v[i])) 530 | } 531 | 532 | ## Expected 533 | all(sqrtabs(c(-4, 0, 4)) == c(2, 0, 2)) 534 | ``` 535 | 536 | ``` 537 | ## [1] TRUE 538 | ``` 539 | 540 | ```r 541 | ## Bug! 542 | sqrtabs(numeric()) 543 | ``` 544 | 545 | ``` 546 | ## [[1]] 547 | ## [1] NA 548 | ## 549 | ## [[2]] 550 | ## numeric(0) 551 | ``` 552 | 553 | ### Solution 554 | 555 | 556 | ```r 557 | ## Unit test: 558 | library(RUnit) 559 | test_sqrtabs <- function() { 560 | checkIdentical(c(2, 0, 2), sqrtabs(c(-4, 0, 4))) 561 | checkIdentical(numeric(), sqrtabs(numeric())) 562 | } 563 | test_sqrtabs() 564 | ``` 565 | 566 | ``` 567 | ## Error in checkIdentical(numeric(), sqrtabs(numeric())): FALSE 568 | ## 569 | ``` 570 | 571 | ```r 572 | ## updated function: 573 | sqrtabs <- function(x) { 574 | v <- abs(x) 575 | sapply(seq_along(v), function(i) sqrt(v[i])) 576 | } 577 | test_sqrtabs() # nope! 578 | ``` 579 | 580 | ``` 581 | ## Error in checkIdentical(numeric(), sqrtabs(numeric())): FALSE 582 | ## 583 | ``` 584 | 585 | ```r 586 | sqrtabs <- function(x) { 587 | v <- abs(x) 588 | vapply(seq_along(v), function(i) sqrt(v[i]), 0) 589 | } 590 | test_sqrtabs() # yes! 591 | ``` 592 | 593 | ``` 594 | ## [1] TRUE 595 | ``` 596 | 597 | # Unit testing in a package 598 | 599 | 600 | ## In a package 601 | 602 | 1. Create a directory `./mypackage/tests`. 603 | 2. Create the `testthat.R` file 604 | 605 | ```r 606 | library("testthat") 607 | library("mypackage") 608 | test_check("sequences") 609 | ``` 610 | 611 | 3. Create a sub-directory `./mypackage/tests/testthat` and include as 612 | many unit test files as desired that are named with the `test_` 613 | prefix and contain unit tests. 614 | 615 | 4. Suggest the unit testing package in your `DESCRIPTION` file: 616 | 617 | ``` 618 | Suggests: testthat 619 | ``` 620 | 621 | ## Example from the `sequences` package 622 | 623 | From the `./sequences/tests/testthat/test_sequences.R` file: 624 | 625 | ### Object creation and validity 626 | 627 | We have a fasta file and the corresponding `DnaSeq` object. 628 | 629 | 1. Let's make sure that the `DnaSeq` instance is valid, as changes in 630 | the class definition might have altered its validity. 631 | 632 | 2. Let's verify that `readFasta` regenerates and identical `DnaSeq` 633 | object given the original fasta file. 634 | 635 | ```r 636 | test_that("dnaseq validity", { 637 | data(dnaseq) 638 | expect_true(validObject(dnaseq)) 639 | }) 640 | 641 | test_that("readFasta", { 642 | ## loading _valid_ dnaseq 643 | data(dnaseq) 644 | ## reading fasta sequence 645 | f <- dir(system.file("extdata",package="sequences"),pattern="fasta",full.names=TRUE) 646 | xx <- readFasta(f[1]) 647 | expect_true(all.equal(xx, dnaseq)) 648 | }) 649 | ``` 650 | 651 | ### Multiple implementations 652 | 653 | Let's check that the R, C and C++ (via `Rcpp`) give the same result 654 | 655 | ```r 656 | test_that("ccpp code", { 657 | gccountr <- 658 | function(x) tabulate(factor(strsplit(x, "")[[1]])) 659 | x <- "AACGACTACAGCATACTAC" 660 | expect_true(identical(gccount(x), gccountr(x))) 661 | expect_true(identical(gccount2(x), gccountr(x))) 662 | }) 663 | ``` 664 | 665 | ## Exercise 666 | 667 | Choose any data package of your choice and write a unit test that 668 | tests the validity of all the its data. 669 | 670 | Tips 671 | 672 | - To get all the data distributed with a package, use `data(package = "packageName")` 673 | 674 | 675 | ```r 676 | library("pRolocdata") 677 | data(package = "pRolocdata") 678 | ``` 679 | 680 | - To test the validity of an object, use `validObject` 681 | 682 | 683 | 684 | 685 | ```r 686 | data(andy2011) 687 | validObject(andy2011) 688 | ``` 689 | 690 | ``` 691 | ## [1] TRUE 692 | ``` 693 | 694 | - Using the `testthat` syntax, the actual test for that data set would be 695 | 696 | 697 | ```r 698 | library("testthat") 699 | expect_true(validObject(andy2011)) 700 | ``` 701 | 702 | 703 | ## Testing coverage in a package 704 | 705 | The [covr](https://github.com/jimhester/covr) package: 706 | 707 | ![package coverage](./figs/covr.png) 708 | 709 | We can use `type="all"` to examine the coverage in unit tests, examples and vignettes. This can 710 | also be done interactively with Shiny: 711 | 712 | 713 | ```r 714 | library(covr) 715 | coverage <- package_coverage("/path/to/package/source", type="all") 716 | shine(coverage) 717 | ``` 718 | 719 | [Coverage for all Bioconductor packages](https://codecov.io/github/Bioconductor-mirror). 720 | --------------------------------------------------------------------------------