├── .gitmodules
├── 01-intro.R
├── 01-intro.Rmd
├── 01-intro.md
├── 02-funprog.R
├── 02-funprog.Rmd
├── 02-funprog.md
├── 03-debug.R
├── 03-debug.Rmd
├── 03-debug.md
├── 04-perf.R
├── 04-perf.Rmd
├── 04-perf.md
├── Makefile
├── README.md
├── figs
    ├── covr.png
    ├── debugRStudio1.png
    ├── debugRStudio2.png
    ├── envex.png
    ├── envex.svg
    ├── funs.png
    ├── optim.png
    ├── profvis.png
    ├── sinfun.png
    ├── style.png
    └── vectimings.png
├── lineprof-example.R
├── unittesting.Rmd
└── unittesting.md


/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "R-parallel"]
2 | 	path = R-parallel
3 | 	url = git@github.com:lgatto/R-parallel.git
4 | [submodule "rccpp"]
5 | 	path = rccpp
6 | 	url = git@github.com:lgatto/rccpp.git
7 | 


--------------------------------------------------------------------------------
/01-intro.R:
--------------------------------------------------------------------------------
  1 | 
  2 | library("formatR")
  3 | tidy_source(text = "a=1+1;a  # print the value
  4 |                     matrix ( rnorm(10),5)",
  5 |             arrow = TRUE)
  6 | 
  7 | 
  8 | head(cars)
  9 | head(cars[, 1])
 10 | head(cars[, 1, drop = FALSE])
 11 | 
 12 | 
 13 | df1 <- data.frame(x = 1:3, y = LETTERS[1:3])
 14 | sapply(df1, class)
 15 | df2 <- data.frame(x = 1:3, y = Sys.time() + 1:3)
 16 | sapply(df2, class)
 17 | 
 18 | 
 19 | lapply(df1, class)
 20 | lapply(df2, class)
 21 | 
 22 | 
 23 | vapply(df1, class, "1")
 24 | vapply(df2, class, "1")
 25 | 
 26 | 
 27 | x <- 1
 28 | f <- function(x) {
 29 |     x <- 2
 30 |     x
 31 | }
 32 | x
 33 | f(x)
 34 | x
 35 | 
 36 | 
 37 | e <- new.env()
 38 | e$a <- 1
 39 | e$b <- LETTERS[1:5]
 40 | e$c <- TRUE
 41 | e$d <- mean
 42 | 
 43 | 
 44 | e$a <- e$b
 45 | e$a <- LETTERS[1:5]
 46 | 
 47 | 
 48 | e <- new.env()
 49 | parent.env(e)
 50 | 
 51 | 
 52 | environment()
 53 | parent.env(globalenv())
 54 | parent.env(parent.env(globalenv()))
 55 | 
 56 | 
 57 | globalenv()
 58 | emptyenv()
 59 | baseenv()
 60 | 
 61 | 
 62 | search()
 63 | as.environment("package:stats")
 64 | 
 65 | 
 66 | ls() ## default is R_GlobalEnv
 67 | ls(envir = e)
 68 | ls(pos = 1)
 69 | 
 70 | 
 71 | search()
 72 | 
 73 | 
 74 | e1 <- new.env()
 75 | e2 <- new.env()
 76 | e1$a <- 1:10
 77 | e2$a <- e1$a
 78 | 
 79 | 
 80 | e3 <- e1
 81 | e3
 82 | e1
 83 | identical(e1, e3)
 84 | 
 85 | 
 86 | e <- new.env()
 87 | e$a <- 1
 88 | e$b <- 2  ## add
 89 | e$a <- 10 ## modify
 90 | 
 91 | 
 92 | lockEnvironment(e)
 93 | e$k <- 1
 94 | e$a <- 100
 95 | 
 96 | 
 97 | lockBinding("a", e)
 98 | e$a <- 10
 99 | e$b <- 10
100 | 
101 | lockEnvironment(e, bindings = TRUE)
102 | e$b <- 1
103 | 
104 | 
105 | e <- new.env()
106 | e$foo <- 1
107 | bar <- 2
108 | where("foo")
109 | where("bar")
110 | where("foo", env = e)
111 | where("bar", env = e)
112 | 
113 | 
114 | search()
115 | mean <- function(x) cat("The mean is", sum(x)/length(x), "\n")
116 | mean(1:10)
117 | base::mean(1:10)
118 | rm(mean)
119 | mean(1:10)
120 | 
121 | 
122 | library("fortunes")
123 | fortune(174)
124 | 
125 | 
126 | rm(list = ls())
127 | x
128 | f1 <- function() x <<- 1
129 | f1()
130 | x
131 | 
132 | 
133 | f2 <- function() x <<- 2
134 | f2()
135 | x
136 | 
137 | 
138 | f3 <- function() x <- 10
139 | f3()
140 | x
141 | 
142 | 
143 | f4 <- function(x) x <-10
144 | f4(x)
145 | x
146 | 
147 | 
148 | modify <- function(x) {
149 | 	x$a <- 2
150 | 	invisible(TRUE)
151 | }
152 | 
153 | 
154 | x_l <- list(a = 1)
155 | modify(x_l)
156 | x_l$a
157 | 
158 | 
159 | x_e <- new.env()
160 | x_e$a <- 1
161 | modify(x_e)
162 | x_e$a
163 | 
164 | 
165 | e <- new.env()
166 | e$a <- 1
167 | e
168 | parent.env(e)
169 | 
170 | parent.env(e) <- emptyenv()
171 | parent.env(e)
172 | e
173 | 
174 | 
175 | e <- new.env(parent.env = empty.env())
176 | 
177 | 
178 | x <- 1
179 | e1 <- new.env()
180 | get("x", envir = e1)
181 | 
182 | 
183 | get("x", envir = e1, inherits = FALSE)
184 | 
185 | 
186 | e2 <- new.env(parent = emptyenv())
187 | get("x", envir = e2)
188 | 
189 | 
190 | get("x", envir = e1, inherits = FALSE)
191 | 
192 | 
193 | e <- new.env()
194 | e$x <- 1
195 | f <- function(myenv) myenv$x <- 2
196 | f(e)
197 | e$x
198 | 
199 | 
200 | library("Biobase")
201 | getClass("eSet")
202 | getClass("AssayData")
203 | new("ExpressionSet")
204 | 
205 | 
206 | .pRolocEnv <- new.env(parent=emptyenv(), hash=TRUE)
207 | 
208 | stockcol <- c("#E41A1C", "#377EB8", "#238B45", "#FF7F00", "#FFD700", "#333333",
209 |               "#00CED1", "#A65628", "#F781BF", "#984EA3", "#9ACD32", "#B0C4DE",
210 |               "#00008A", "#8B795E", "#FDAE6B", "#66C2A5", "#276419", "#CD8C95",
211 |               "#6A51A3", "#EEAD0E", "#0000FF", "#9ACD32", "#CD6090", "#CD5B45",
212 |               "#8E0152", "#808000", "#67000D", "#3F007D", "#6BAED6", "#FC9272")
213 | 
214 | assign("stockcol", stockcol, envir = .pRolocEnv)
215 | 
216 | getStockcol <- function() get("stockcol", envir = .pRolocEnv)
217 | 
218 | setStockcol <- function(cols) {
219 |     if (is.null(cols)) {
220 |         assign("stockcol", stockcol, envir = .pRolocEnv)
221 |     } else {
222 | 		assign("stockcol", cols, envir = .pRolocEnv)
223 | 	}
224 | }
225 | 
226 | 
227 | ...
228 | if (missing(col))
229 |   col <- getStockcol()
230 | ...
231 | 
232 | 
233 | setStockcol <- function(cols) {
234 | 	prevcols <- getStockcol()
235 |     if (is.null(cols)) {
236 |         assign("stockcol", stockcol, envir = .pRolocEnv)
237 |     } else {
238 | 		assign("stockcol", cols, envir = .pRolocEnv)
239 | 	}
240 | 	invisible(prevcols)
241 | }
242 | 
243 | 
244 | library("dplyr")
245 | surveys <- read.csv("http://datacarpentry.github.io/dc_zurich/data/portal_data_joined.csv")
246 | head(surveys)
247 | 
248 | surveys %>%
249 |   filter(weight < 5) %>%
250 |   select(species_id, sex, weight)
251 | 
252 | surveys %>%
253 |   mutate(weight_kg = weight / 1000) %>%
254 |   filter(!is.na(weight)) %>%
255 |   head
256 | 
257 | surveys %>%
258 |   group_by(sex) %>%
259 |   tally()
260 | 
261 | surveys %>%
262 |   group_by(sex, species_id) %>%
263 |   summarize(mean_weight = mean(weight, na.rm = TRUE))
264 | 
265 | surveys %>%
266 |   group_by(sex, species_id) %>%
267 |   summarize(mean_weight = mean(weight, na.rm = TRUE),
268 |             min_weight = min(weight, na.rm = TRUE)) %>%
269 |   filter(!is.nan(mean_weight))
270 | 
271 | 
272 | quote(1:10)
273 | quote(paste(letters, LETTERS, sep = "-"))
274 | 
275 | 
276 | eval(quote(1 + 1))
277 | eval(quote(1:10))
278 | 
279 | x <- 10
280 | eval(quote(x + 1))
281 | 
282 | e <- new.env()
283 | e$x <- 1
284 | eval(quote(x + 1), env = e)
285 | 
286 | eval(quote(x), list(x = 30))
287 | 
288 | dfr <- data.frame(x = 1:10, y = LETTERS[1:10])
289 | eval(quote(sum(x)), dfr)
290 | 
291 | 
292 | x <- 10
293 | substitute(sqrt(x))
294 | 
295 | e <- new.env()
296 | e$x <- 1
297 | substitute(sqrt(x), env = e)
298 | 
299 | 
300 | parse(text = "1:10")
301 | parse(file = "lineprof-example.R")
302 | 
303 | 
304 | x <- 123
305 | deparse(substitute(x))
306 | 
307 | 
308 | foo <- "bar"
309 | as.name(foo)
310 | string <- "1:10"
311 | parse(text=string)
312 | eval(parse(text=string))
313 | 
314 | 
315 | varName1 <- "varName2"
316 | assign(varName1, "123")
317 | varName1
318 | get(varName1)
319 | varName2
320 | 
321 | 
322 | test <- function(x) {
323 |     y <- deparse(substitute(x))
324 |     print(y)
325 |     print(x)
326 | }
327 | var <- c("one","two","three")
328 | test(var)
329 | 
330 | 


--------------------------------------------------------------------------------
/01-intro.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Part I: Introduction"
  3 | author: "Laurent Gatto"
  4 | ---
  5 | 
  6 | ## Overview
  7 | 
  8 | - Coding style(s)
  9 | - Interactive use and programming
 10 | - Environments
 11 | - Tidy data
 12 | - Computing on the language
 13 | 
 14 | ## Introduction
 15 | 
 16 | > Computers are cheap, and thinking hurts. -- Uwe Ligges
 17 | 
 18 | Simplicity, readability and consistency are a long way towards
 19 | robust code.
 20 | 
 21 | ## Coding style(s)
 22 | 
 23 | Why?
 24 | 
 25 | > Good coding style is like using correct punctuation. You can manage
 26 | > without it, but it sure makes things easier to read.
 27 | -- Hadley Wickham
 28 | 
 29 | for **consistency** and **readability**.
 30 | 
 31 | ## Which one?
 32 | 
 33 | - [Bioconductor](http://master.bioconductor.org/developers/how-to/coding-style/)
 34 | - [Hadley Wickham](http://r-pkgs.had.co.nz/style.html)
 35 | - [Google](http://google.github.io/styleguide/Rguide.xml)
 36 | - ...
 37 | 
 38 | ## Examples
 39 | 
 40 | - Place spaces around all infix operators (`=`, `+`, `-`, `<-`, etc., but *not* `:`)
 41 |   and after a comma (`x[i, j]`).
 42 | - Spaces before `(` and after `)`; not for function.
 43 | - Use `<-` rather than `=`.
 44 | - Limit your code to 80 characters per line
 45 | - Indentation: do not use tabs, use 2 (HW)/4 (Bioc) spaces
 46 | - Function names: use verbs
 47 | - Variable names: camelCaps (Bioc)/ `_` (HW) (but not a `.`)
 48 | - Prefix non-exported functions with a ‘.’ (Bioc).
 49 | - Class names: start with a capital
 50 | - Comments: `# ` or `## ` (from emacs)
 51 | 
 52 | ## [`formatR`](https://cran.rstudio.com/web/packages/formatR/index.html)
 53 | 
 54 | ```{r, eval=TRUE}
 55 | library("formatR")
 56 | tidy_source(text = "a=1+1;a  # print the value
 57 |                     matrix ( rnorm(10),5)",
 58 |             arrow = TRUE)
 59 | ```
 60 | 
 61 | ## [`BiocCheck`](http://bioconductor.org/packages/devel/bioc/html/BiocCheck.html)
 62 | 
 63 | ```
 64 | $ R CMD BiocCheck package_1.0.0.tgz
 65 | ```
 66 | 
 67 | ```
 68 | * Checking function lengths................
 69 |   The longest function is 677 lines long
 70 |   The longest 5 functions are:
 71 | * Checking formatting of DESCRIPTION, NAMESPACE, man pages, R source,
 72 |   and vignette source...
 73 |     * CONSIDER: Shortening lines; 616 lines (11%) are > 80 characters
 74 |       long.
 75 |     * CONSIDER: Replacing tabs with 4 spaces; 3295 lines (60%) contain
 76 |       tabs.
 77 |     * CONSIDER: Indenting lines with a multiple of 4 spaces; 162 lines
 78 |       (2%) are not.
 79 | ```
 80 | 
 81 | ## Style changes over time
 82 | 
 83 | ![Style changes over time](./figs/style.png)
 84 | 
 85 | 
 86 | ## Ineractive use vs programming
 87 | 
 88 | Moving from using R to programming R is *abstraction*, *automation*,
 89 | *generalisation*.
 90 | 
 91 | ## Interactive use vs programming: `drop`
 92 | 
 93 | ```{r, eval=FALSE}
 94 | head(cars)
 95 | head(cars[, 1])
 96 | head(cars[, 1, drop = FALSE])
 97 | ```
 98 | 
 99 | ## Interactive use vs programming: `sapply/lapply`
100 | 
101 | ```{r, eval=FALSE}
102 | df1 <- data.frame(x = 1:3, y = LETTERS[1:3])
103 | sapply(df1, class)
104 | df2 <- data.frame(x = 1:3, y = Sys.time() + 1:3)
105 | sapply(df2, class)
106 | ```
107 | 
108 | Rather use a form where the return data structure is known...
109 | 
110 | ```{r, eval=FALSE}
111 | lapply(df1, class)
112 | lapply(df2, class)
113 | ```
114 | 
115 | or that will break if the result is not what is exected
116 | 
117 | ```{r, eval=FALSE}
118 | vapply(df1, class, "1")
119 | vapply(df2, class, "1")
120 | ```
121 | 
122 | ## Semantics
123 | 
124 | - *pass-by-value* copy-on-modify
125 | - *pass-by-reference*: environments, S4 Reference Classes
126 | 
127 | ```{r, eval=FALSE}
128 | x <- 1
129 | f <- function(x) {
130 |     x <- 2
131 |     x
132 | }
133 | x
134 | f(x)
135 | x
136 | ```
137 | 
138 | ## Environments
139 | 
140 | ### Motivation
141 | 
142 | - Data structure that enables *scoping* (see later).
143 | - Have reference semantics
144 | - Useful data structure on their own
145 | 
146 | ### Definition (1)
147 | 
148 | An environment associates, or *binds*, names to values in memory.
149 | Variables in an environment are hence called *bindings*.
150 | 
151 | ## Creating and populate environments
152 | 
153 | ```{r, eval=FALSE}
154 | e <- new.env()
155 | e$a <- 1
156 | e$b <- LETTERS[1:5]
157 | e$c <- TRUE
158 | e$d <- mean
159 | ```
160 | 
161 | ```{r, eval=FALSE}
162 | e$a <- e$b
163 | e$a <- LETTERS[1:5]
164 | ```
165 | 
166 | - Objects in environments have unique names
167 | - Objects in different environments can of course have identical names
168 | - Objects in an environment have no order
169 | - Environments have parents
170 | 
171 | ## Definition (2)
172 | 
173 | An environment is composed of a *frame* that contains the name-object
174 | bindings and a parent (enclosing) environment.
175 | 
176 | ## Relationship between environments
177 | 
178 | Every environment has a parent (enclosing) environment
179 | 
180 | ```{r, eval=FALSE}
181 | e <- new.env()
182 | parent.env(e)
183 | ```
184 | Current environment
185 | 
186 | ```{r, eval=FALSE}
187 | environment()
188 | parent.env(globalenv())
189 | parent.env(parent.env(globalenv()))
190 | ```
191 | 
192 | Noteworthy environments
193 | 
194 | ```{r, eval=FALSE}
195 | globalenv()
196 | emptyenv()
197 | baseenv()
198 | ```
199 | 
200 | All parent of `R_GlobalEnv`:
201 | 
202 | ```{r, eval=FALSE}
203 | search()
204 | as.environment("package:stats")
205 | ```
206 | 
207 | Listing objects in an environment
208 | 
209 | ```{r, eval=FALSE}
210 | ls() ## default is R_GlobalEnv
211 | ls(envir = e)
212 | ls(pos = 1)
213 | ```
214 | 
215 | ```{r, eval=FALSE}
216 | search()
217 | ```
218 | 
219 | Note: Every time a package is loaded with `library`, it is inserted in
220 | the search path after the `R_GlobalEnv`.
221 | 
222 | ## Accessors and setters
223 | 
224 | - In addition to `$`, one can also use `[[`, `get` and `assign`.
225 | - To check if a name exists in an environmet (or in any or its
226 |   parents), one can use `exists`.
227 | - Compare two environments with `identical` (not `==`).
228 | 
229 | **Question** Are `e1` and `e2` below identical?
230 | 
231 | ```{r, eval=FALSE}
232 | e1 <- new.env()
233 | e2 <- new.env()
234 | e1$a <- 1:10
235 | e2$a <- e1$a
236 | ```
237 | 
238 | What about `e1` and `e3`?
239 | 
240 | ```{r, eval=FALSE}
241 | e3 <- e1
242 | e3
243 | e1
244 | identical(e1, e3)
245 | ```
246 | 
247 | ## Locking environments and bindings
248 | 
249 | ```{r, eval=FALSE}
250 | e <- new.env()
251 | e$a <- 1
252 | e$b <- 2  ## add
253 | e$a <- 10 ## modify
254 | ```
255 | 
256 | Locking an environment stops from adding new bindings:
257 | 
258 | ```{r, eval=FALSE}
259 | lockEnvironment(e)
260 | e$k <- 1
261 | e$a <- 100
262 | ```
263 | 
264 | Locking bindings stops from modifying bindings with en envionment:
265 | 
266 | ```{r, eval=FALSE}
267 | lockBinding("a", e)
268 | e$a <- 10
269 | e$b <- 10
270 | 
271 | lockEnvironment(e, bindings = TRUE)
272 | e$b <- 1
273 | ```
274 | 
275 | ## Exercise
276 | 
277 | Reproduce the following environments and variables in R.
278 | 
279 | ![envionments and variables exercise](./figs/envex.png)
280 | 
281 | ## Where is a symbol defined?
282 | 
283 | `pryr::where()` implements the regular scoping rules to find in which
284 | environment a binding is defined.
285 | 
286 | ```{r, eval=FALSE}
287 | e <- new.env()
288 | e$foo <- 1
289 | bar <- 2
290 | where("foo")
291 | where("bar")
292 | where("foo", env = e)
293 | where("bar", env = e)
294 | ```
295 | 
296 | ## Lexical scoping
297 | 
298 | [Lexical comes from *lexical analysis* in computer science, which is
299 | the conversion of characters (code) into a sequence of meaningful (for
300 | the computer) tokens.]
301 | 
302 | **Definition**: Rules that define how R looks up values for a given name/symbol.
303 | 
304 | - Objects in environments have unique names
305 | - Objects in different environments can of course have identical names.
306 | - If a name is not found in the current environment, it is looked up
307 |   in the parent (enclosing) from. 
308 | - If it is not found in the parent (enclosing) frame, it is looked up
309 |   in the parent's parent frame, and so on...
310 | 
311 | ```{r, eval=FALSE}
312 | search()
313 | mean <- function(x) cat("The mean is", sum(x)/length(x), "\n")
314 | mean(1:10)
315 | base::mean(1:10)
316 | rm(mean)
317 | mean(1:10)
318 | ```
319 | 
320 | ## Assignments
321 | 
322 | - `<-` assigns/creates in the current environment
323 | 
324 | - `<<-` (deep assignment) never creates/updates a variable in the
325 |   current environment, but modifies an existing variable in the
326 |   current or first enclosing environment where that name is defined.
327 |   
328 | - If `<<-` does not find the name, it will create the variable in the
329 |   global environment.
330 | 
331 | ```{r, eval=TRUE}
332 | library("fortunes")
333 | fortune(174)
334 | ```
335 | 
336 | ```{r, eval=FALSE}
337 | rm(list = ls())
338 | x
339 | f1 <- function() x <<- 1
340 | f1()
341 | x
342 | ```
343 | 
344 | ```{r, eval=FALSE}
345 | f2 <- function() x <<- 2
346 | f2()
347 | x
348 | ```
349 | 
350 | ```{r, eval=FALSE}
351 | f3 <- function() x <- 10
352 | f3()
353 | x
354 | ```
355 | 
356 | ```{r, eval=FALSE}
357 | f4 <- function(x) x <-10
358 | f4(x)
359 | x
360 | ```
361 | 
362 | ## Using environments
363 | 
364 | Most environments are created when creating and calling
365 | functions. They are also used in packages as *package* and *namespace*
366 | environments.
367 | 
368 | There are several reasons to create then manually.
369 | 
370 | - Reference semantics
371 | - Avoiding copies
372 | - Package state
373 | - As a hashmap for fast name lookup
374 | 
375 | ## Reference semantics
376 | 
377 | ```{r, eval=TRUE}
378 | modify <- function(x) {
379 | 	x$a <- 2
380 | 	invisible(TRUE)
381 | }
382 | ```
383 | 
384 | ```{r, eval=FALSE}
385 | x_l <- list(a = 1)
386 | modify(x_l)
387 | x_l$a
388 | ```
389 | 
390 | ```{r, eval=FALSE}
391 | x_e <- new.env()
392 | x_e$a <- 1
393 | modify(x_e)
394 | x_e$a
395 | ```
396 | 
397 | Tip: when setting up environments, it is advised to set to parent
398 | (enclosing) environment to be `emptyenv()`, to avoid accidentally
399 | inheriting objects from somewhere else on the search path.
400 | 
401 | ```{r, eval=FALSE}
402 | e <- new.env()
403 | e$a <- 1
404 | e
405 | parent.env(e)
406 | 
407 | parent.env(e) <- emptyenv()
408 | parent.env(e)
409 | e
410 | ```
411 | 
412 | or directly
413 | 
414 | ```{r, eval=FALSE}
415 | e <- new.env(parent.env = empty.env())
416 | ```
417 | 
418 | ### Exercise
419 | 
420 | What is going to happen when we access `"x"` in the four cases below?
421 | 
422 | ```{r, eval=FALSE}
423 | x <- 1
424 | e1 <- new.env()
425 | get("x", envir = e1)
426 | ```
427 | 
428 | ```{r, eval=FALSE}
429 | get("x", envir = e1, inherits = FALSE)
430 | ```
431 | 
432 | ```{r, eval=FALSE}
433 | e2 <- new.env(parent = emptyenv())
434 | get("x", envir = e2)
435 | ```
436 | 
437 | ```{r, eval=FALSE}
438 | get("x", envir = e1, inherits = FALSE)
439 | ```
440 | 
441 | ## Avoiding copies
442 | 
443 | Since environments have reference semantics, they are not copied.
444 | When passing an environment as function argument (directly, or as part
445 | of a more complex data structure), it is **not** copied: all its
446 | values are accessible within the function and can be persistently
447 | modified.
448 | 
449 | ```{r, eval=FALSE}
450 | e <- new.env()
451 | e$x <- 1
452 | f <- function(myenv) myenv$x <- 2
453 | f(e)
454 | e$x
455 | ```
456 | 
457 | This is used in the `eSet` class family to store the expression data.
458 | 
459 | ```{r, eval=FALSE}
460 | library("Biobase")
461 | getClass("eSet")
462 | getClass("AssayData")
463 | new("ExpressionSet")
464 | ```
465 | 
466 | ## Preserving state in packages
467 | 
468 | Explicit envirionments are also useful to preserve state or define
469 | constants-like variables in a package. One can then set getters and
470 | setters for users to access the variables within that private
471 | envionment.
472 | 
473 | #### Use case
474 | 
475 | Colour management in [`pRoloc`](https://github.com/lgatto/pRoloc/blob/master/R/environment.R):
476 | 
477 | ```{r, eval=FALSE}
478 | .pRolocEnv <- new.env(parent=emptyenv(), hash=TRUE)
479 | 
480 | stockcol <- c("#E41A1C", "#377EB8", "#238B45", "#FF7F00", "#FFD700", "#333333",
481 |               "#00CED1", "#A65628", "#F781BF", "#984EA3", "#9ACD32", "#B0C4DE",
482 |               "#00008A", "#8B795E", "#FDAE6B", "#66C2A5", "#276419", "#CD8C95",
483 |               "#6A51A3", "#EEAD0E", "#0000FF", "#9ACD32", "#CD6090", "#CD5B45",
484 |               "#8E0152", "#808000", "#67000D", "#3F007D", "#6BAED6", "#FC9272")
485 | 
486 | assign("stockcol", stockcol, envir = .pRolocEnv)
487 | 
488 | getStockcol <- function() get("stockcol", envir = .pRolocEnv)
489 | 
490 | setStockcol <- function(cols) {
491 |     if (is.null(cols)) {
492 |         assign("stockcol", stockcol, envir = .pRolocEnv)
493 |     } else { 
494 | 		assign("stockcol", cols, envir = .pRolocEnv)
495 | 	}
496 | }
497 | ```
498 | 
499 | and in plotting functions (we will see the `missing` in more details later):
500 | 
501 | ```{r, eval=FALSE}
502 | ...
503 | if (missing(col))
504 |   col <- getStockcol()
505 | ...
506 | ```
507 | 
508 | Hadley's tip: Invisibly returning the old value from 
509 | 
510 | ```{r, eval=FALSE}
511 | setStockcol <- function(cols) {
512 | 	prevcols <- getStockcol()
513 |     if (is.null(cols)) {
514 |         assign("stockcol", stockcol, envir = .pRolocEnv)
515 |     } else { 
516 | 		assign("stockcol", cols, envir = .pRolocEnv)
517 | 	}
518 | 	invisible(prevcols)
519 | }
520 | ```
521 | 
522 | ## Tidy data
523 | 
524 | > Hadley Wickham, Tidy Data, Vol. 59, Issue 10, Sep 2014, Journal of
525 | > Statistical Software. http://www.jstatsoft.org/v59/i10.
526 | 
527 | Tidy datasets are easy to manipulate, model and visualize, and have a
528 | specific structure: each variable is a column, each observation is a
529 | row, and each type of observational unit is a table.
530 | 
531 | 
532 | ## Tidy tools
533 | 
534 | Tidy data also makes it easier to develop tidy tools for data
535 | analysis, tools that both input and output tidy datasets.
536 | 
537 | - `dply::select` select columns
538 | - `dlpy::filter` select rows
539 | - `dplyr:mutate` create new columns
540 | - `dpplyr:group_by` split-apply-combine
541 | - `dlpyr:summarise` collapse each group into a single-row summary of
542 |   that group
543 | - `magrittr::%>%` piping
544 | 
545 | 
546 | ## Examples
547 | 
548 | ```{r, eval=FALSE}
549 | library("dplyr")
550 | surveys <- read.csv("http://datacarpentry.github.io/dc_zurich/data/portal_data_joined.csv")
551 | head(surveys)
552 | 
553 | surveys %>%
554 |   filter(weight < 5) %>%
555 |   select(species_id, sex, weight)
556 |   
557 | surveys %>%
558 |   mutate(weight_kg = weight / 1000) %>%
559 |   filter(!is.na(weight)) %>%
560 |   head
561 | 
562 | surveys %>%
563 |   group_by(sex) %>%
564 |   tally()
565 |   
566 | surveys %>%
567 |   group_by(sex, species_id) %>%
568 |   summarize(mean_weight = mean(weight, na.rm = TRUE))
569 |   
570 | surveys %>%
571 |   group_by(sex, species_id) %>%
572 |   summarize(mean_weight = mean(weight, na.rm = TRUE),
573 |             min_weight = min(weight, na.rm = TRUE)) %>%
574 |   filter(!is.nan(mean_weight))
575 | ```
576 | 
577 | ## Application to other data structures
578 | 
579 | > Hadley Wickham (@hadleywickham) tweeted at 8:45 pm on Fri, Feb 12,
580 | > 2016: @mark_scheuerell @drob the importance of tidy data is not the
581 | > specific form, but the consistency
582 | > (https://twitter.com/hadleywickham/status/698246671629549568?s=09)
583 | 
584 | - Well-formatted and well-documented `S4` class 
585 | - `S4` as input -(function)-> `S4` as output
586 | 
587 | ![MSnSet schematics](https://raw.githubusercontent.com/lgatto/pRoloc/master/vignettes/Figures/msnset.png)
588 | 
589 | ## Computing on the language
590 | 
591 | 
592 | #### Quoting and evaluating expressions
593 | 
594 | Quote an expression, don't evaluate it:
595 | 
596 | ```{r, eval=FALSE}
597 | quote(1:10)
598 | quote(paste(letters, LETTERS, sep = "-"))
599 | ```
600 | Evaluate an expression in a specific environment:
601 | 
602 | ```{r, eval=FALSE}
603 | eval(quote(1 + 1))
604 | eval(quote(1:10))
605 | 
606 | x <- 10
607 | eval(quote(x + 1))
608 | 
609 | e <- new.env()
610 | e$x <- 1
611 | eval(quote(x + 1), env = e)
612 | 
613 | eval(quote(x), list(x = 30))
614 | 
615 | dfr <- data.frame(x = 1:10, y = LETTERS[1:10])
616 | eval(quote(sum(x)), dfr)
617 | ```
618 | 
619 | Substitute any variables bound in `env`, but don't evaluate the
620 | expression:
621 | 
622 | ```{r, eval=FALSE}
623 | x <- 10
624 | substitute(sqrt(x))
625 | 
626 | e <- new.env()
627 | e$x <- 1
628 | substitute(sqrt(x), env = e)
629 | ```
630 | 
631 | Parse, but don't evaluate an expression:
632 | 
633 | ```{r, eval=FALSE}
634 | parse(text = "1:10")
635 | parse(file = "lineprof-example.R")
636 | ```
637 | 
638 | Turn an unevaluated expressions into character strings:
639 | 
640 | ```{r, eval=FALSE}
641 | x <- 123
642 | deparse(substitute(x))
643 | ```
644 | 
645 | #### Characters as variables names 
646 | 
647 | 
648 | ```{r, eval=FALSE}
649 | foo <- "bar"
650 | as.name(foo)
651 | string <- "1:10"
652 | parse(text=string)
653 | eval(parse(text=string))
654 | ```
655 |       
656 | 
657 | And with `assign` and `get`
658 | 
659 | ```{r, eval=FALSE}
660 | varName1 <- "varName2"
661 | assign(varName1, "123")
662 | varName1
663 | get(varName1)
664 | varName2
665 | ```
666 | 
667 | Using `substitute` and `deparse`
668 | 
669 | ```{r, eval=FALSE}
670 | test <- function(x) {
671 |     y <- deparse(substitute(x))
672 |     print(y)
673 |     print(x)
674 | }
675 | var <- c("one","two","three")
676 | test(var)
677 | ```
678 | 
679 | 
680 | 


--------------------------------------------------------------------------------
/01-intro.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Part I: Introduction"
  3 | author: "Laurent Gatto"
  4 | ---
  5 | 
  6 | ## Overview
  7 | 
  8 | - Coding style(s)
  9 | - Interactive use and programming
 10 | - Environments
 11 | - Tidy data
 12 | - Computing on the language
 13 | 
 14 | ## Introduction
 15 | 
 16 | > Computers are cheap, and thinking hurts. -- Uwe Ligges
 17 | 
 18 | Simplicity, readability and consistency are a long way towards
 19 | robust code.
 20 | 
 21 | ## Coding style(s)
 22 | 
 23 | Why?
 24 | 
 25 | > Good coding style is like using correct punctuation. You can manage
 26 | > without it, but it sure makes things easier to read.
 27 | -- Hadley Wickham
 28 | 
 29 | for **consistency** and **readability**.
 30 | 
 31 | ## Which one?
 32 | 
 33 | - [Bioconductor](http://master.bioconductor.org/developers/how-to/coding-style/)
 34 | - [Hadley Wickham](http://r-pkgs.had.co.nz/style.html)
 35 | - [Google](http://google.github.io/styleguide/Rguide.xml)
 36 | - ...
 37 | 
 38 | ## Examples
 39 | 
 40 | - Place spaces around all infix operators (`=`, `+`, `-`, `<-`, etc., but *not* `:`)
 41 |   and after a comma (`x[i, j]`).
 42 | - Spaces before `(` and after `)`; not for function.
 43 | - Use `<-` rather than `=`.
 44 | - Limit your code to 80 characters per line
 45 | - Indentation: do not use tabs, use 2 (HW)/4 (Bioc) spaces
 46 | - Function names: use verbs
 47 | - Variable names: camelCaps (Bioc)/ `_` (HW) (but not a `.`)
 48 | - Prefix non-exported functions with a ‘.’ (Bioc).
 49 | - Class names: start with a capital
 50 | - Comments: `# ` or `## ` (from emacs)
 51 | 
 52 | ## [`formatR`](https://cran.rstudio.com/web/packages/formatR/index.html)
 53 | 
 54 | 
 55 | ```r
 56 | library("formatR")
 57 | tidy_source(text = "a=1+1;a  # print the value
 58 |                     matrix ( rnorm(10),5)",
 59 |             arrow = TRUE)
 60 | ```
 61 | 
 62 | ```
 63 | ## a <- 1 + 1
 64 | ## a  # print the value
 65 | ## matrix(rnorm(10), 5)
 66 | ```
 67 | 
 68 | ## [`BiocCheck`](http://bioconductor.org/packages/devel/bioc/html/BiocCheck.html)
 69 | 
 70 | ```
 71 | $ R CMD BiocCheck package_1.0.0.tgz
 72 | ```
 73 | 
 74 | ```
 75 | * Checking function lengths................
 76 |   The longest function is 677 lines long
 77 |   The longest 5 functions are:
 78 | * Checking formatting of DESCRIPTION, NAMESPACE, man pages, R source,
 79 |   and vignette source...
 80 |     * CONSIDER: Shortening lines; 616 lines (11%) are > 80 characters
 81 |       long.
 82 |     * CONSIDER: Replacing tabs with 4 spaces; 3295 lines (60%) contain
 83 |       tabs.
 84 |     * CONSIDER: Indenting lines with a multiple of 4 spaces; 162 lines
 85 |       (2%) are not.
 86 | ```
 87 | 
 88 | ## Style changes over time
 89 | 
 90 | ![Style changes over time](./figs/style.png)
 91 | 
 92 | 
 93 | ## Ineractive use vs programming
 94 | 
 95 | Moving from using R to programming R is *abstraction*, *automation*,
 96 | *generalisation*.
 97 | 
 98 | ## Interactive use vs programming: `drop`
 99 | 
100 | 
101 | ```r
102 | head(cars)
103 | head(cars[, 1])
104 | head(cars[, 1, drop = FALSE])
105 | ```
106 | 
107 | ## Interactive use vs programming: `sapply/lapply`
108 | 
109 | 
110 | ```r
111 | df1 <- data.frame(x = 1:3, y = LETTERS[1:3])
112 | sapply(df1, class)
113 | df2 <- data.frame(x = 1:3, y = Sys.time() + 1:3)
114 | sapply(df2, class)
115 | ```
116 | 
117 | Rather use a form where the return data structure is known...
118 | 
119 | 
120 | ```r
121 | lapply(df1, class)
122 | lapply(df2, class)
123 | ```
124 | 
125 | or that will break if the result is not what is exected
126 | 
127 | 
128 | ```r
129 | vapply(df1, class, "1")
130 | vapply(df2, class, "1")
131 | ```
132 | 
133 | ## Semantics
134 | 
135 | - *pass-by-value* copy-on-modify
136 | - *pass-by-reference*: environments, S4 Reference Classes
137 | 
138 | 
139 | ```r
140 | x <- 1
141 | f <- function(x) {
142 |     x <- 2
143 |     x
144 | }
145 | x
146 | f(x)
147 | x
148 | ```
149 | 
150 | ## Environments
151 | 
152 | ### Motivation
153 | 
154 | - Data structure that enables *scoping* (see later).
155 | - Have reference semantics
156 | - Useful data structure on their own
157 | 
158 | ### Definition (1)
159 | 
160 | An environment associates, or *binds*, names to values in memory.
161 | Variables in an environment are hence called *bindings*.
162 | 
163 | ## Creating and populate environments
164 | 
165 | 
166 | ```r
167 | e <- new.env()
168 | e$a <- 1
169 | e$b <- LETTERS[1:5]
170 | e$c <- TRUE
171 | e$d <- mean
172 | ```
173 | 
174 | 
175 | ```r
176 | e$a <- e$b
177 | e$a <- LETTERS[1:5]
178 | ```
179 | 
180 | - Objects in environments have unique names
181 | - Objects in different environments can of course have identical names
182 | - Objects in an environment have no order
183 | - Environments have parents
184 | 
185 | ## Definition (2)
186 | 
187 | An environment is composed of a *frame* that contains the name-object
188 | bindings and a parent (enclosing) environment.
189 | 
190 | ## Relationship between environments
191 | 
192 | Every environment has a parent (enclosing) environment
193 | 
194 | 
195 | ```r
196 | e <- new.env()
197 | parent.env(e)
198 | ```
199 | Current environment
200 | 
201 | 
202 | ```r
203 | environment()
204 | parent.env(globalenv())
205 | parent.env(parent.env(globalenv()))
206 | ```
207 | 
208 | Noteworthy environments
209 | 
210 | 
211 | ```r
212 | globalenv()
213 | emptyenv()
214 | baseenv()
215 | ```
216 | 
217 | All parent of `R_GlobalEnv`:
218 | 
219 | 
220 | ```r
221 | search()
222 | as.environment("package:stats")
223 | ```
224 | 
225 | Listing objects in an environment
226 | 
227 | 
228 | ```r
229 | ls() ## default is R_GlobalEnv
230 | ls(envir = e)
231 | ls(pos = 1)
232 | ```
233 | 
234 | 
235 | ```r
236 | search()
237 | ```
238 | 
239 | Note: Every time a package is loaded with `library`, it is inserted in
240 | the search path after the `R_GlobalEnv`.
241 | 
242 | ## Accessors and setters
243 | 
244 | - In addition to `$`, one can also use `[[`, `get` and `assign`.
245 | - To check if a name exists in an environmet (or in any or its
246 |   parents), one can use `exists`.
247 | - Compare two environments with `identical` (not `==`).
248 | 
249 | **Question** Are `e1` and `e2` below identical?
250 | 
251 | 
252 | ```r
253 | e1 <- new.env()
254 | e2 <- new.env()
255 | e1$a <- 1:10
256 | e2$a <- e1$a
257 | ```
258 | 
259 | What about `e1` and `e3`?
260 | 
261 | 
262 | ```r
263 | e3 <- e1
264 | e3
265 | e1
266 | identical(e1, e3)
267 | ```
268 | 
269 | ## Locking environments and bindings
270 | 
271 | 
272 | ```r
273 | e <- new.env()
274 | e$a <- 1
275 | e$b <- 2  ## add
276 | e$a <- 10 ## modify
277 | ```
278 | 
279 | Locking an environment stops from adding new bindings:
280 | 
281 | 
282 | ```r
283 | lockEnvironment(e)
284 | e$k <- 1
285 | e$a <- 100
286 | ```
287 | 
288 | Locking bindings stops from modifying bindings with en envionment:
289 | 
290 | 
291 | ```r
292 | lockBinding("a", e)
293 | e$a <- 10
294 | e$b <- 10
295 | 
296 | lockEnvironment(e, bindings = TRUE)
297 | e$b <- 1
298 | ```
299 | 
300 | ## Exercise
301 | 
302 | Reproduce the following environments and variables in R.
303 | 
304 | ![envionments and variables exercise](./figs/envex.png)
305 | 
306 | ## Where is a symbol defined?
307 | 
308 | `pryr::where()` implements the regular scoping rules to find in which
309 | environment a binding is defined.
310 | 
311 | 
312 | ```r
313 | e <- new.env()
314 | e$foo <- 1
315 | bar <- 2
316 | where("foo")
317 | where("bar")
318 | where("foo", env = e)
319 | where("bar", env = e)
320 | ```
321 | 
322 | ## Lexical scoping
323 | 
324 | [Lexical comes from *lexical analysis* in computer science, which is
325 | the conversion of characters (code) into a sequence of meaningful (for
326 | the computer) tokens.]
327 | 
328 | **Definition**: Rules that define how R looks up values for a given name/symbol.
329 | 
330 | - Objects in environments have unique names
331 | - Objects in different environments can of course have identical names.
332 | - If a name is not found in the current environment, it is looked up
333 |   in the parent (enclosing) from. 
334 | - If it is not found in the parent (enclosing) frame, it is looked up
335 |   in the parent's parent frame, and so on...
336 | 
337 | 
338 | ```r
339 | search()
340 | mean <- function(x) cat("The mean is", sum(x)/length(x), "\n")
341 | mean(1:10)
342 | base::mean(1:10)
343 | rm(mean)
344 | mean(1:10)
345 | ```
346 | 
347 | ## Assignments
348 | 
349 | - `<-` assigns/creates in the current environment
350 | 
351 | - `<<-` (deep assignment) never creates/updates a variable in the
352 |   current environment, but modifies an existing variable in the
353 |   current or first enclosing environment where that name is defined.
354 |   
355 | - If `<<-` does not find the name, it will create the variable in the
356 |   global environment.
357 | 
358 | 
359 | ```r
360 | library("fortunes")
361 | fortune(174)
362 | ```
363 | 
364 | ```
365 | ## 
366 | ## I wish <<- had never been invented, as it makes an esoteric and dangerous
367 | ## feature of the language *seem* normal and reasonable. If you want to dumb
368 | ## down R/S into a macro language, this is the operator for you.
369 | ##    -- Bill Venables
370 | ##       R-help (July 2001)
371 | ```
372 | 
373 | 
374 | ```r
375 | rm(list = ls())
376 | x
377 | f1 <- function() x <<- 1
378 | f1()
379 | x
380 | ```
381 | 
382 | 
383 | ```r
384 | f2 <- function() x <<- 2
385 | f2()
386 | x
387 | ```
388 | 
389 | 
390 | ```r
391 | f3 <- function() x <- 10
392 | f3()
393 | x
394 | ```
395 | 
396 | 
397 | ```r
398 | f4 <- function(x) x <-10
399 | f4(x)
400 | x
401 | ```
402 | 
403 | ## Using environments
404 | 
405 | Most environments are created when creating and calling
406 | functions. They are also used in packages as *package* and *namespace*
407 | environments.
408 | 
409 | There are several reasons to create then manually.
410 | 
411 | - Reference semantics
412 | - Avoiding copies
413 | - Package state
414 | - As a hashmap for fast name lookup
415 | 
416 | ## Reference semantics
417 | 
418 | 
419 | ```r
420 | modify <- function(x) {
421 | 	x$a <- 2
422 | 	invisible(TRUE)
423 | }
424 | ```
425 | 
426 | 
427 | ```r
428 | x_l <- list(a = 1)
429 | modify(x_l)
430 | x_l$a
431 | ```
432 | 
433 | 
434 | ```r
435 | x_e <- new.env()
436 | x_e$a <- 1
437 | modify(x_e)
438 | x_e$a
439 | ```
440 | 
441 | Tip: when setting up environments, it is advised to set to parent
442 | (enclosing) environment to be `emptyenv()`, to avoid accidentally
443 | inheriting objects from somewhere else on the search path.
444 | 
445 | 
446 | ```r
447 | e <- new.env()
448 | e$a <- 1
449 | e
450 | parent.env(e)
451 | 
452 | parent.env(e) <- emptyenv()
453 | parent.env(e)
454 | e
455 | ```
456 | 
457 | or directly
458 | 
459 | 
460 | ```r
461 | e <- new.env(parent.env = empty.env())
462 | ```
463 | 
464 | ### Exercise
465 | 
466 | What is going to happen when we access `"x"` in the four cases below?
467 | 
468 | 
469 | ```r
470 | x <- 1
471 | e1 <- new.env()
472 | get("x", envir = e1)
473 | ```
474 | 
475 | 
476 | ```r
477 | get("x", envir = e1, inherits = FALSE)
478 | ```
479 | 
480 | 
481 | ```r
482 | e2 <- new.env(parent = emptyenv())
483 | get("x", envir = e2)
484 | ```
485 | 
486 | 
487 | ```r
488 | get("x", envir = e1, inherits = FALSE)
489 | ```
490 | 
491 | ## Avoiding copies
492 | 
493 | Since environments have reference semantics, they are not copied.
494 | When passing an environment as function argument (directly, or as part
495 | of a more complex data structure), it is **not** copied: all its
496 | values are accessible within the function and can be persistently
497 | modified.
498 | 
499 | 
500 | ```r
501 | e <- new.env()
502 | e$x <- 1
503 | f <- function(myenv) myenv$x <- 2
504 | f(e)
505 | e$x
506 | ```
507 | 
508 | This is used in the `eSet` class family to store the expression data.
509 | 
510 | 
511 | ```r
512 | library("Biobase")
513 | getClass("eSet")
514 | getClass("AssayData")
515 | new("ExpressionSet")
516 | ```
517 | 
518 | ## Preserving state in packages
519 | 
520 | Explicit envirionments are also useful to preserve state or define
521 | constants-like variables in a package. One can then set getters and
522 | setters for users to access the variables within that private
523 | envionment.
524 | 
525 | #### Use case
526 | 
527 | Colour management in [`pRoloc`](https://github.com/lgatto/pRoloc/blob/master/R/environment.R):
528 | 
529 | 
530 | ```r
531 | .pRolocEnv <- new.env(parent=emptyenv(), hash=TRUE)
532 | 
533 | stockcol <- c("#E41A1C", "#377EB8", "#238B45", "#FF7F00", "#FFD700", "#333333",
534 |               "#00CED1", "#A65628", "#F781BF", "#984EA3", "#9ACD32", "#B0C4DE",
535 |               "#00008A", "#8B795E", "#FDAE6B", "#66C2A5", "#276419", "#CD8C95",
536 |               "#6A51A3", "#EEAD0E", "#0000FF", "#9ACD32", "#CD6090", "#CD5B45",
537 |               "#8E0152", "#808000", "#67000D", "#3F007D", "#6BAED6", "#FC9272")
538 | 
539 | assign("stockcol", stockcol, envir = .pRolocEnv)
540 | 
541 | getStockcol <- function() get("stockcol", envir = .pRolocEnv)
542 | 
543 | setStockcol <- function(cols) {
544 |     if (is.null(cols)) {
545 |         assign("stockcol", stockcol, envir = .pRolocEnv)
546 |     } else { 
547 | 		assign("stockcol", cols, envir = .pRolocEnv)
548 | 	}
549 | }
550 | ```
551 | 
552 | and in plotting functions (we will see the `missing` in more details later):
553 | 
554 | 
555 | ```r
556 | ...
557 | if (missing(col))
558 |   col <- getStockcol()
559 | ...
560 | ```
561 | 
562 | Hadley's tip: Invisibly returning the old value from 
563 | 
564 | 
565 | ```r
566 | setStockcol <- function(cols) {
567 | 	prevcols <- getStockcol()
568 |     if (is.null(cols)) {
569 |         assign("stockcol", stockcol, envir = .pRolocEnv)
570 |     } else { 
571 | 		assign("stockcol", cols, envir = .pRolocEnv)
572 | 	}
573 | 	invisible(prevcols)
574 | }
575 | ```
576 | 
577 | ## Tidy data
578 | 
579 | > Hadley Wickham, Tidy Data, Vol. 59, Issue 10, Sep 2014, Journal of
580 | > Statistical Software. http://www.jstatsoft.org/v59/i10.
581 | 
582 | Tidy datasets are easy to manipulate, model and visualize, and have a
583 | specific structure: each variable is a column, each observation is a
584 | row, and each type of observational unit is a table.
585 | 
586 | 
587 | ## Tidy tools
588 | 
589 | Tidy data also makes it easier to develop tidy tools for data
590 | analysis, tools that both input and output tidy datasets.
591 | 
592 | - `dply::select` select columns
593 | - `dlpy::filter` select rows
594 | - `dplyr:mutate` create new columns
595 | - `dpplyr:group_by` split-apply-combine
596 | - `dlpyr:summarise` collapse each group into a single-row summary of
597 |   that group
598 | - `magrittr::%>%` piping
599 | 
600 | 
601 | ## Examples
602 | 
603 | 
604 | ```r
605 | library("dplyr")
606 | surveys <- read.csv("http://datacarpentry.github.io/dc_zurich/data/portal_data_joined.csv")
607 | head(surveys)
608 | 
609 | surveys %>%
610 |   filter(weight < 5) %>%
611 |   select(species_id, sex, weight)
612 |   
613 | surveys %>%
614 |   mutate(weight_kg = weight / 1000) %>%
615 |   filter(!is.na(weight)) %>%
616 |   head
617 | 
618 | surveys %>%
619 |   group_by(sex) %>%
620 |   tally()
621 |   
622 | surveys %>%
623 |   group_by(sex, species_id) %>%
624 |   summarize(mean_weight = mean(weight, na.rm = TRUE))
625 |   
626 | surveys %>%
627 |   group_by(sex, species_id) %>%
628 |   summarize(mean_weight = mean(weight, na.rm = TRUE),
629 |             min_weight = min(weight, na.rm = TRUE)) %>%
630 |   filter(!is.nan(mean_weight))
631 | ```
632 | 
633 | ## Application to other data structures
634 | 
635 | > Hadley Wickham (@hadleywickham) tweeted at 8:45 pm on Fri, Feb 12,
636 | > 2016: @mark_scheuerell @drob the importance of tidy data is not the
637 | > specific form, but the consistency
638 | > (https://twitter.com/hadleywickham/status/698246671629549568?s=09)
639 | 
640 | - Well-formatted and well-documented `S4` class 
641 | - `S4` as input -(function)-> `S4` as output
642 | 
643 | ![MSnSet schematics](https://raw.githubusercontent.com/lgatto/pRoloc/master/vignettes/Figures/msnset.png)
644 | 
645 | ## Computing on the language
646 | 
647 | 
648 | #### Quoting and evaluating expressions
649 | 
650 | Quote an expression, don't evaluate it:
651 | 
652 | 
653 | ```r
654 | quote(1:10)
655 | quote(paste(letters, LETTERS, sep = "-"))
656 | ```
657 | Evaluate an expression in a specific environment:
658 | 
659 | 
660 | ```r
661 | eval(quote(1 + 1))
662 | eval(quote(1:10))
663 | 
664 | x <- 10
665 | eval(quote(x + 1))
666 | 
667 | e <- new.env()
668 | e$x <- 1
669 | eval(quote(x + 1), env = e)
670 | 
671 | eval(quote(x), list(x = 30))
672 | 
673 | dfr <- data.frame(x = 1:10, y = LETTERS[1:10])
674 | eval(quote(sum(x)), dfr)
675 | ```
676 | 
677 | Substitute any variables bound in `env`, but don't evaluate the
678 | expression:
679 | 
680 | 
681 | ```r
682 | x <- 10
683 | substitute(sqrt(x))
684 | 
685 | e <- new.env()
686 | e$x <- 1
687 | substitute(sqrt(x), env = e)
688 | ```
689 | 
690 | Parse, but don't evaluate an expression:
691 | 
692 | 
693 | ```r
694 | parse(text = "1:10")
695 | parse(file = "lineprof-example.R")
696 | ```
697 | 
698 | Turn an unevaluated expressions into character strings:
699 | 
700 | 
701 | ```r
702 | x <- 123
703 | deparse(substitute(x))
704 | ```
705 | 
706 | #### Characters as variables names 
707 | 
708 | 
709 | 
710 | ```r
711 | foo <- "bar"
712 | as.name(foo)
713 | string <- "1:10"
714 | parse(text=string)
715 | eval(parse(text=string))
716 | ```
717 |       
718 | 
719 | And with `assign` and `get`
720 | 
721 | 
722 | ```r
723 | varName1 <- "varName2"
724 | assign(varName1, "123")
725 | varName1
726 | get(varName1)
727 | varName2
728 | ```
729 | 
730 | Using `substitute` and `deparse`
731 | 
732 | 
733 | ```r
734 | test <- function(x) {
735 |     y <- deparse(substitute(x))
736 |     print(y)
737 |     print(x)
738 | }
739 | var <- c("one","two","three")
740 | test(var)
741 | ```
742 | 
743 | 
744 | 


--------------------------------------------------------------------------------
/02-funprog.R:
--------------------------------------------------------------------------------
  1 | 
  2 | f <- function(x) {
  3 |     y <- x + 1
  4 |     return(x * y)
  5 | }
  6 | 
  7 | 
  8 | body(f)
  9 | args(f)
 10 | environment(f)
 11 | 
 12 | body(f) <- quote({
 13 |     y <- x * y
 14 |     return(x + y)
 15 | })
 16 | 
 17 | 
 18 | f <- function(x) x + y
 19 | 
 20 | f(1)
 21 | 
 22 | environment(f)
 23 | y <- 2
 24 | f(1)
 25 | 
 26 | 
 27 | 
 28 | e <- new.env()
 29 | environment(f) <- e
 30 | 
 31 | f(1)
 32 | e$y <- 10
 33 | f(1)
 34 | 
 35 | 
 36 | codetools::findGlobals(f)
 37 | 
 38 | 
 39 | f <- function() {
 40 |     x <- 1
 41 |     y <- 2
 42 |     c(x, y)
 43 | }
 44 | f()
 45 | 
 46 | 
 47 | x <- 2
 48 | g <- function(){
 49 |     y <- 1
 50 |     c(x, y)
 51 | }
 52 | g()
 53 | 
 54 | 
 55 | x <- 1
 56 | h <- function() {
 57 |     y <- 2
 58 |     i <- function() {
 59 |         z <- 3
 60 |         c(x, y, z)
 61 |     }
 62 |     i()
 63 | }
 64 | h()
 65 | 
 66 | 
 67 | j <- function(x) {
 68 |     y <- 2
 69 |     function(){
 70 |         c(x, y)
 71 |     }
 72 | }
 73 | k <- j(1)
 74 | k()
 75 | 
 76 | 
 77 | j <- function() {
 78 |     if (!exists("a")) {
 79 |         a <- 1
 80 |     } else {
 81 |         a <- a + 1
 82 |     }
 83 |     print(a)
 84 | }
 85 | j() ## First call
 86 | j() ## Second call
 87 | 
 88 | 
 89 | f <- function(x) {
 90 |     f <- function(x) {
 91 |         f <- function(x) {
 92 |             x^2
 93 |         }
 94 |         f(x) + 1
 95 |     }
 96 |     f(x) * 2
 97 | }
 98 | f(10)
 99 | 
100 | 
101 | args <- list(x = 1:10, trim = 0.3)
102 | do.call(mean, args)
103 | 
104 | 
105 | f <- function(x = 1, y = 2) x * y
106 | f <- function(x = 1, y = x + 2) x * y
107 | 
108 | 
109 | f <- function(x = 1, y) {
110 | 	c(missing(x), missing(y))
111 | }
112 | f()
113 | f(x = 1)
114 | 
115 | 
116 | plot2 <- function(...) {
117 |     message("Verbose plotting...")
118 |     plot(...)
119 | }
120 | 
121 | f <- function(...) list(...)
122 | 
123 | 
124 | f1 <- function() 1
125 | f2 <- function() return(1)
126 | f3 <- function() return(invisible(1))
127 | 
128 | 
129 | f1 <- function(x) {
130 |     on.exit(print("!"))
131 |     x + 1
132 | }
133 | 
134 | f2 <- function(x) {
135 |     on.exit(print("!"))
136 |     stop("Error")
137 | }
138 | 
139 | 
140 | f3 <- function() {
141 |     on.exit(print("1"))
142 |     on.exit(print("2"))
143 |     invisible(TRUE)
144 | }
145 | 
146 | 
147 | f4 <- function() {
148 |     on.exit(print("1"))
149 |     on.exit(print("2"), add = TRUE)
150 |     invisible(TRUE)
151 | }
152 | 
153 | 
154 | function(x) x + y
155 | body(function(x) x + y)
156 | args(function(x) x + y)
157 | environment(function(x) x + y)
158 | 
159 | 
160 | make.power <- function(n)
161 |     function(x) x^n
162 | 
163 | 
164 | cube <- make.power(3)
165 | square <- make.power(2)
166 | cube(2)
167 | square(2)
168 | environment(cube)
169 | environment(square)
170 | 
171 | 
172 | new_counter <- function() {
173 |     i <- 0
174 |     function() {
175 |         i <<- i + 1
176 |         i
177 |     }
178 | }
179 | 
180 | count1 <- new_counter()
181 | count2 <- new_counter()
182 | 
183 | count1()
184 | count1()
185 | count2()
186 | 
187 | environment(count1)
188 | environment(count2)
189 | environment(count1)$i
190 | environment(count2)$i
191 | 
192 | 
193 | colramp <- colorRampPalette(c("blue", "yellow"))
194 | colramp(5)
195 | plot(1:10, col = colramp(10), pch = 19, cex = 2,
196 |      main = "colramp(10)")
197 | 
198 | 
199 | L <- replicate(3, matrix(rnorm(9), 3), simplify = FALSE)
200 | Reduce("+", L)
201 | try(sum(L))
202 | 
203 | 
204 | Reduce("+", list(1, 2, 3), init = 10)
205 | Reduce("+", list(1, 2, 3), accumulate = TRUE)
206 | Reduce("+", list(1, 2, 3), right = TRUE, accumulate = TRUE)
207 | 
208 | 
209 | even <- function(x) x %% 2 == 0
210 | (y <- sample(100, 10))
211 | Filter(even, y)
212 | Filter(Negate(even), y)
213 | 
214 | 
215 | Map(even, 1:3)
216 | 
217 | 
218 | Find(even, 10:15)
219 | Find(even, 10:15, right = TRUE)
220 | Position(Negate(even), 10:15)
221 | Position(Negate(even), 10:15, right = TRUE)
222 | 
223 | 
224 | (x <- 1:5)
225 | (y <- 5:1)
226 | x + y
227 | 
228 | 
229 | (x <- 1:6)
230 | (y <- 1:2)
231 | x+y
232 | 
233 | 
234 | diff1 <- function(e) {
235 |   n <- length(e)
236 |   interval <- rep(0, n - 1)
237 |   for (i in 1:(n - 1))
238 |       interval[i] <- e[i + 1] - e[i]
239 |   interval
240 | }
241 | e <- c(2, 5, 10.2, 12, 19)
242 | diff1(e)
243 | 
244 | 
245 | diff2 <- function(e) {
246 |   n <- length(e)
247 |   e[-1] - e[-n]
248 | }
249 | e <- c(2, 5, 10.2, 12, 19)
250 | diff2(e)
251 | 
252 | 
253 | v <- rnorm(1000) ## or a list
254 | res <- numeric(length(v))
255 | 
256 | for (i in 1:length(v))
257 |   res[i] <- f(v[i])
258 | 
259 | res <- sapply(v, f)
260 | 
261 | ## if f is vectorised
262 | f(v)
263 | 
264 | 
265 | M <- matrix(rnorm(100), 10)
266 | apply(M, 1, function(Mrow) 'do something with Mrow')
267 | apply(M, 2, function(Mcol) 'do something with Mcol')
268 | 
269 | 
270 | f <- function(x, a = 1) sin(x^2)/ (a + abs(x))
271 | x <- seq(-7, 7, 0.02 )
272 | x0 <- seq(-2, 2, 0.02)
273 | y0 <- f(x0)
274 | y0[y0 < 0] <- 0
275 | plot(x, f(x), type = "l", main = expression(f(x) ==  frac(sin(x^2),(a + abs(x)))))
276 | grid()
277 | abline(v = c(-2, 2), lty = "dotted")
278 | polygon(x0, y0, col = "#00000010")
279 | 
280 | 
281 | f <- function(x, a = 1) sin(x^2)/ (a + abs(x))
282 | integrate(f, lower = -2, upper = 2)
283 | 
284 | 
285 | lo <- c(-2, 0)
286 | hi <- c(0, 2)
287 | integrate(f, lower = lo, upper = hi)
288 | 
289 | 
290 | mapply(function(lo, hi) integrate(f, lo, hi)$value,
291 |        lo, hi)
292 | 
293 | 
294 | Integrate <- Vectorize(
295 |   function(fn, lower, upper)
296 |   integrate(fn, lower, upper)$value,
297 |   vectorize.args=c("lower", "upper")
298 |   )
299 | Integrate(f, lower=lo, upper=hi)
300 | 
301 | 


--------------------------------------------------------------------------------
/02-funprog.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Part II: Functional programming"
  3 | author: "Laurent Gatto"
  4 | ---
  5 | 
  6 | ## Content
  7 | 
  8 | - Functions 
  9 | - Robust programming with functions
 10 | - Scoping
 11 | - Closures
 12 | - High-level functions
 13 | - Vectorisation
 14 | 
 15 | # Functions
 16 | 
 17 | Among the R's strong points, Hadley Whickham cites:
 18 | 
 19 | > [R has] a strong foundation in functional programming. The ideas of
 20 | > functional programming are well suited to solving many of the
 21 | > challenges of data analysis. R provides a powerful and flexible
 22 | > toolkit which allows you to write concise yet descriptive code.
 23 | 
 24 | Also
 25 | > To understand computations in R, two slogans are helpful:
 26 | > - Everything that exists is an object.
 27 | > - Everything that happens is a function call.
 28 | > John Chambers
 29 | 
 30 | ![Messy code hides bugs](./figs/funs.png) 
 31 | 
 32 | - Functions are a means of **abstraction**. A concept/computation is
 33 |   encapsulated/isolated from the rest with a function.
 34 | - Functions should **do one thing**, and do it well (compute, or plot,
 35 |   or save, ... not all in one go).
 36 | - **Side effects**: your functions should not have any (unless, of
 37 |   course, that is the main point of that function - plotting, write to
 38 |   disk, ...). Functions shouldn't make any changes in any
 39 |   environment. The only return their output.
 40 | - **Do not use global variables**. Everything the function needs is
 41 |   being passed as an argument. Function must be **self-contained**.
 42 | - Function streamline code and process
 43 | 
 44 | From the `R Inferno`:
 45 | 
 46 | Make your functions as simple as possible. Simple has many advantages:
 47 | 
 48 | - Simple functions are likely to be human efficient: they will be easy
 49 |   to understand and to modify.
 50 | - Simple functions are likely to be computer efficient.
 51 | - Simple functions are less likely to be buggy, and bugs will be
 52 |   easier to fix.
 53 | - (Perhaps ironically) simple functions may be more general—thinking
 54 |   about the heart of the matter often broadens the application.
 55 | 
 56 | 
 57 | Functions can be
 58 | 
 59 | 1. Correct.
 60 | 2. An error occurs that is clearly identified.
 61 | 3. An obscure error occurs.
 62 | 4. An incorrect value is returned.
 63 | 
 64 | We like category 1. Category 2 is the right behavior if the inputs do
 65 | not make sense, but not if the inputs are sensible. Category 3 is an
 66 | unpleasant place for your users, and possibly for you if the users
 67 | have access to you. Category 4 is by far the worst place to be - the
 68 | user has no reason to believe that anything is wrong. Steer clear of
 69 | category 4.
 70 | 
 71 | 
 72 | Finally, functions are
 73 | 
 74 | - Easier to debug (part III)
 75 | - Easier to profile (part IV)
 76 | - Easier to parallelise (part IV)
 77 | 
 78 | Functions are an central part of robust R programming.
 79 | 
 80 | ## Function parts
 81 | 
 82 | A function is made of
 83 | - a name
 84 | - some inputs (formal parameters)
 85 | - a single output (return value)
 86 | - a body
 87 | - an environment, the map of the location of the functions variable
 88 | 
 89 | ```{r, eval=FALSE}
 90 | f <- function(x) {
 91 |     y <- x + 1
 92 |     return(x * y)
 93 | }
 94 | ```
 95 | 
 96 | And these can be accessed and modified indivdually
 97 | 
 98 | ```{r, eval=FALSE}
 99 | body(f)
100 | args(f)
101 | environment(f)
102 | 
103 | body(f) <- quote({
104 |     y <- x * y
105 |     return(x + y)
106 | })
107 | ```
108 | 
109 | 
110 | ## Lexical scoping
111 | 
112 | - If a name is not found in a functions environment, it is looked up
113 |   in the parent (enclosing) from.
114 | - If it is not found in the parent (enclosing) frame, it is looked up
115 |   in the parent's parent frame, and so on...
116 | 
117 | *Lexical scoping*: default behaviour, current environment, then
118 | traversing *enclosing/parent environments*.
119 | 
120 | ```{r, eval=FALSE}
121 | f <- function(x) x + y
122 | 
123 | f(1)
124 | 
125 | environment(f)
126 | y <- 2
127 | f(1)
128 | ```
129 | 
130 | ```{r, eval=FALSE}
131 | 
132 | e <- new.env()
133 | environment(f) <- e
134 | 
135 | f(1)
136 | e$y <- 10
137 | f(1)
138 | ```
139 | 
140 | This is of course bad practice, we don't want to rely on global variables.
141 | 
142 | ```{r, eval=FALSE}
143 | codetools::findGlobals(f)
144 | ```
145 | 
146 | ## Exercises
147 | 
148 | Start by mentally running the code chunks below - what do the functions return? 
149 | 
150 | After testing new code chunks, don't forget to clean up your
151 | workspace, to avoid unexpected results.
152 | 
153 | ```{r, eval=FALSE}
154 | f <- function() {
155 |     x <- 1
156 |     y <- 2
157 |     c(x, y)
158 | }
159 | f()
160 | ```
161 | 
162 | ```{r, eval=FALSE}
163 | x <- 2
164 | g <- function(){
165 |     y <- 1
166 |     c(x, y)
167 | }
168 | g()
169 | ```
170 | 
171 | ```{r, eval=FALSE}
172 | x <- 1
173 | h <- function() {
174 |     y <- 2
175 |     i <- function() {
176 |         z <- 3
177 |         c(x, y, z)
178 |     }
179 |     i()
180 | }
181 | h()
182 | ```
183 | 
184 | ```{r, eval=FALSE}
185 | j <- function(x) {
186 |     y <- 2
187 |     function(){
188 |         c(x, y)
189 |     }
190 | }
191 | k <- j(1)
192 | k()
193 | ```
194 | 
195 | ```{r, eval=FALSE}
196 | j <- function() {
197 |     if (!exists("a")) {
198 |         a <- 1
199 |     } else {
200 |         a <- a + 1
201 |     }
202 |     print(a)
203 | }
204 | j() ## First call
205 | j() ## Second call
206 | ```
207 | 
208 | <!-- ```{r, eval=FALSE} -->
209 | <!-- f <- function() x -->
210 | <!-- x <- 1 -->
211 | <!-- f()  -->
212 | <!-- x <- 2 -->
213 | <!-- f() -->
214 | <!-- ``` -->
215 | 
216 | ```{r, eval=FALSE}
217 | f <- function(x) {
218 |     f <- function(x) {
219 |         f <- function(x) {
220 |             x^2
221 |         }
222 |         f(x) + 1
223 |     }
224 |     f(x) * 2
225 | }
226 | f(10)
227 | ```
228 | 
229 | ## More about functions
230 | 
231 | - Argument matching by position or by names
232 | - Calling a function with a list of arguments
233 | 
234 | ```{r, eval=FALSE}
235 | args <- list(x = 1:10, trim = 0.3)
236 | do.call(mean, args)
237 | ```
238 | 
239 | - Default arguments
240 | 
241 | ```{r, eval=FALSE}
242 | f <- function(x = 1, y = 2) x * y
243 | f <- function(x = 1, y = x + 2) x * y
244 | ```
245 | 
246 | - Missing arguments
247 | 
248 | ```{r, eval=FALSE}
249 | f <- function(x = 1, y) {
250 | 	c(missing(x), missing(y))
251 | }
252 | f()
253 | f(x = 1)
254 | ```
255 | 
256 | - Passing non-matched parameters `...` to an inner function
257 | 
258 | ```{r, eval=FALSE}
259 | plot2 <- function(...) {
260 |     message("Verbose plotting...")
261 |     plot(...)
262 | }
263 | 
264 | f <- function(...) list(...)
265 | ```
266 | 
267 | - Return values: last statement, explicit `return`, make output
268 |   `invisible`
269 |   
270 | ```{r, eval=FALSE}
271 | f1 <- function() 1
272 | f2 <- function() return(1)
273 | f3 <- function() return(invisible(1))
274 | ```
275 | 
276 | - Explicit triggers before exiting. Useful to restore global state
277 |   (plotting parameters, cleaning temporary files, ...)
278 | 
279 | ```{r, eval=FALSE}
280 | f1 <- function(x) {
281 |     on.exit(print("!"))
282 |     x + 1
283 | }
284 | 
285 | f2 <- function(x) {
286 |     on.exit(print("!"))
287 |     stop("Error")
288 | }
289 | ```
290 | 
291 | ```{r, eval=FALSE}
292 | f3 <- function() {
293 |     on.exit(print("1"))
294 |     on.exit(print("2"))
295 |     invisible(TRUE)
296 | }
297 | 
298 | 
299 | f4 <- function() {
300 |     on.exit(print("1"))
301 |     on.exit(print("2"), add = TRUE)
302 |     invisible(TRUE)
303 | }
304 | ```
305 | 
306 | - Anonymous functions, created on-the-flight and passed to `lapply` or
307 |   other high-level functions.
308 | 
309 | ```{r, eval=FALSE}
310 | function(x) x + y
311 | body(function(x) x + y)
312 | args(function(x) x + y)
313 | environment(function(x) x + y)
314 | ```
315 | 
316 | ## More about scoping
317 | 
318 | *Lexical scoping*: default behaviour, current environment, then
319 | traversing *enclosing/parent environments*.
320 | 
321 | 
322 | *Dynamic scoping*: looking up variables in the *calling environment*,
323 | used in non-standard evaluation.
324 | 
325 | 
326 | # Functional programming
327 | 
328 | **First-class functions** - a function is a value just like any other
329 | variable. Functions can thus be used as arguments to other
330 | functions. Functions are considered *first-class citizens*.
331 | 
332 | **Higher-order functions** - refers to functions that take functions
333 | as parameters (input) or return functions (output).
334 | 
335 | ## Closures
336 | 
337 | > "An object is data with functions. A closure is a function with
338 | > data." - John D. Cool
339 | 
340 | Closures: functions written by functions. They enclose the envionment
341 | of the parent function and can access all its variables. 
342 | 
343 | ```{r, eval=FALSE}
344 | make.power <- function(n) 
345 |     function(x) x^n
346 | 
347 |   
348 | cube <- make.power(3)
349 | square <- make.power(2)
350 | cube(2)
351 | square(2)
352 | environment(cube)
353 | environment(square)
354 | ```
355 | 
356 | Mutable state: a counter function
357 | 
358 | ```{r, eval=FALSE}
359 | new_counter <- function() {
360 |     i <- 0
361 |     function() {
362 |         i <<- i + 1
363 |         i 
364 |     }
365 | }
366 | 
367 | count1 <- new_counter()
368 | count2 <- new_counter()
369 | 
370 | count1()
371 | count1()
372 | count2()
373 | 
374 | environment(count1)
375 | environment(count2)
376 | environment(count1)$i
377 | environment(count2)$i
378 | ```
379 | 
380 | Questions:
381 | 
382 | - What happens of we place the code `i <- 0` and the function
383 |   definition outside of a function, i.e in the global environment?
384 | 
385 | - What happens if we use `<-` instead of `<<-`?
386 | 
387 | The `colorRampPallette`
388 | 
389 | ```{r, eval=FALSE}
390 | colramp <- colorRampPalette(c("blue", "yellow"))
391 | colramp(5)
392 | plot(1:10, col = colramp(10), pch = 19, cex = 2,
393 |      main = "colramp(10)")
394 | ```
395 | 
396 | ## Functional
397 | 
398 | Take a function as input or return a function as output.
399 | 
400 | - `Reduce(f, x)` uses a binary function to successively combine the
401 |   elements of a given vector and a possibly given initial value.
402 | 
403 | ```{r, eval=FALSE}
404 | L <- replicate(3, matrix(rnorm(9), 3), simplify = FALSE)
405 | Reduce("+", L)
406 | try(sum(L))
407 | ```
408 | 
409 | ```{r, eval=FALSE}
410 | Reduce("+", list(1, 2, 3), init = 10)
411 | Reduce("+", list(1, 2, 3), accumulate = TRUE)
412 | Reduce("+", list(1, 2, 3), right = TRUE, accumulate = TRUE)
413 | ```
414 | 
415 | - `Filter(f, x)` extracts the elements of a vector for which a
416 |   predicate (logical) function gives true.
417 | 
418 | - `Negate(f)`  creates the negation of a given function.
419 | 
420 | ```{r, eval=FALSE}
421 | even <- function(x) x %% 2 == 0
422 | (y <- sample(100, 10))
423 | Filter(even, y)
424 | Filter(Negate(even), y)
425 | ```
426 | 
427 | - `Map(f, ...)` applies a function to the corresponding elements of
428 |   given vectors. Similar to `mapply` without any attempt to simplify.
429 | 
430 | ```{r, eval=FALSE}
431 | Map(even, 1:3)
432 | ```
433 | 
434 | - `Find(f, x)` and `Position(f, x)` give the first (or last elements)
435 |   and its position in the vector, for which a predicate (logical)
436 |   function gives true.
437 | 
438 | ```{r, eval=FALSE}
439 | Find(even, 10:15)
440 | Find(even, 10:15, right = TRUE)
441 | Position(Negate(even), 10:15)
442 | Position(Negate(even), 10:15, right = TRUE)
443 | ```
444 | 
445 | ## References
446 | 
447 | - R Gentleman, *R Programming for Bioinformatics*, CRC Press, 2008
448 | - `?Map`, or any other of the higher order functions
449 | - Blog post, *Higher Order Functions in R*, John Myles White
450 | 		http://www.johnmyleswhite.com/notebook/2010/09/23/higher-order-functions-in-r/
451 | 
452 | # Vectorisation
453 | 
454 | 
455 | > Many operations in R are vectorized, and understanding and using
456 | > vectorization is an essential component of becoming a proficient
457 | > programmer. -  R Gentleman in *R Programming for Bioinformatics*.
458 | 
459 | A *vectorised computation* is one that, when applied to a vector (of
460 | length greater than 1), automatically operates directly on all
461 | elements of the input vector.
462 | 
463 | 
464 | ```{r, eval=FALSE}
465 | (x <- 1:5)
466 | (y <- 5:1)
467 | x + y
468 | ```
469 | 
470 | ## Recycling rule
471 | 
472 | What is `x` and `y` are of different length: the shorter vector is
473 | replicate so that its length matches the longer ones.
474 | 
475 | ```{r, eval=FALSE}
476 | (x <- 1:6)
477 | (y <- 1:2)
478 | x+y
479 | ```
480 | 
481 | If the shorter vector is not an even multiple of the longer, a warning
482 | is issued.
483 | 
484 | ## Example
485 | 
486 | Compute difference between times of events, `e`. Given `n` events,
487 | there will be `n-1` inter-event times. `interval[i] <- e[i+1] - e[i]`
488 | 
489 | Procedural implementation:
490 | 
491 | ```{r, eval=FALSE}
492 | diff1 <- function(e) {
493 |   n <- length(e)
494 |   interval <- rep(0, n - 1) 
495 |   for (i in 1:(n - 1)) 
496 |       interval[i] <- e[i + 1] - e[i]
497 |   interval
498 | }
499 | e <- c(2, 5, 10.2, 12, 19)
500 | diff1(e)
501 | ```
502 | 
503 | Vectorised implementation:
504 | 
505 | ```{r, eval=FALSE}
506 | diff2 <- function(e) {
507 |   n <- length(e)
508 |   e[-1] - e[-n]
509 | }
510 | e <- c(2, 5, 10.2, 12, 19)
511 | diff2(e)
512 | ```
513 | 
514 | ## `*apply` functions
515 | 
516 | How to apply a function, iteratively, on a set of elements?
517 | 
518 | `apply(X, MARGIN, FUN, ...)`
519 | 
520 | 
521 | - `MARGIN` = 1 for row, 2 for cols.
522 | - `FUN` = function to apply
523 | - `...` = extra args to function.
524 | - `simplify` =  should the result be simplified if possible.
525 | 
526 | 
527 | `*apply` functions are (generally) **NOT** faster than loops, but more
528 | succint and thus clearer.
529 | 
530 | ```{r, eval=FALSE}
531 | v <- rnorm(1000) ## or a list
532 | res <- numeric(length(v))
533 | 
534 | for (i in 1:length(v)) 
535 |   res[i] <- f(v[i])
536 | 
537 | res <- sapply(v, f)
538 | 
539 | ## if f is vectorised
540 | f(v)
541 | ```
542 | 
543 | function | use case
544 | -------|---------------------------------------
545 | apply  | matrices, arrays, data.frames
546 | lapply | lists, vectors
547 | sapply | lists, vectors
548 | vapply | with a pre-specified type of return value
549 | tapply | atomic objects, typically vectors
550 | by     | similar to tapply
551 | eapply | environments
552 | mapply | multiple values
553 | rapply | recursive version of lapply
554 | esApply | `ExpressionSet`, defined in `Biobase`
555 | 
556 | See also the `BiocGenerics` package for `[l|m|s|t]apply` S4 generics,
557 | as well as parallel versions in the `parallel` package (see
558 | `Performance` section).
559 | 
560 | 
561 | See also the `plyr` package, that offers its own flavour of `apply`
562 | functions.
563 | 
564 |   in/out    |  list   | data frame | array
565 | ------------|---------|------------|---------
566 |  list       | llply() |	ldply()    | laply()
567 |  data frame | dlply() |	ddply()    | daply()
568 |  array      | alply() |	adply()    | aaply()
569 | 
570 | 
571 | ## Other functions
572 | 
573 | - `replicate` - repeated evaluation of an expression
574 | - `aggregate` - compute summary statistics of data subsets
575 | - `ave` - group averages over level combinations of factors
576 | - `sweep` - sweep out array summaries
577 | 
578 | ## Anonymous functions
579 | 
580 | A function defined/called without being assigned to an identifier and
581 | generally passed as argument to other functions.
582 | 
583 | 
584 | ```{r, eval=FALSE}
585 | M <- matrix(rnorm(100), 10)
586 | apply(M, 1, function(Mrow) 'do something with Mrow')
587 | apply(M, 2, function(Mcol) 'do something with Mcol')
588 | ```
589 | 
590 | ## Use case: integration
591 | 
592 | ```{r, echo=FALSE, eval=FALSE}
593 | f <- function(x, a = 1) sin(x^2)/ (a + abs(x))
594 | x <- seq(-7, 7, 0.02 )
595 | x0 <- seq(-2, 2, 0.02)
596 | y0 <- f(x0)
597 | y0[y0 < 0] <- 0
598 | plot(x, f(x), type = "l", main = expression(f(x) ==  frac(sin(x^2),(a + abs(x)))))
599 | grid()
600 | abline(v = c(-2, 2), lty = "dotted")
601 | polygon(x0, y0, col = "#00000010")
602 | ```
603 | 
604 | ![`sin(x^2)/ (a + abs(x))`](./figs/sinfun.png)
605 | 
606 | The `integrate` function approximates definite integrals by
607 | adaptive quadrature.
608 | 
609 | 
610 | ```{r, eval=FALSE}
611 | f <- function(x, a = 1) sin(x^2)/ (a + abs(x))
612 | integrate(f, lower = -2, upper = 2)
613 | ```
614 | 
615 | It is not vectorised.
616 | 
617 | ```{r, eval=FALSE}
618 | lo <- c(-2, 0)
619 | hi <- c(0, 2)
620 | integrate(f, lower = lo, upper = hi)
621 | ```
622 | 
623 | ## How to vectorise
624 | 
625 | - To vectorise a function, we can explicitly wrap it inside a helper
626 |   function that will take care of argument recycling (via `rep`), then
627 |   loop over the inputs and call the non-vectorised function.
628 | 
629 | - To vectorise a function, we can explicitate the vectorised
630 |   calculation using `mapply`.
631 | 
632 | ```{r, eval=FALSE}
633 | mapply(function(lo, hi) integrate(f, lo, hi)$value,
634 |        lo, hi)
635 | ```
636 | 
637 | - Create a vectorised form using `Vectorize`. It takes a function
638 |   (here, an anonymous function) as input and returns a function.
639 | 
640 | ```{r, eval=FALSE}
641 | Integrate <- Vectorize(
642 |   function(fn, lower, upper)
643 |   integrate(fn, lower, upper)$value,
644 |   vectorize.args=c("lower", "upper")
645 |   )
646 | Integrate(f, lower=lo, upper=hi)
647 | ```
648 | 
649 | ## **Efficient** apply-like functions
650 | 
651 | These functions combine high-level vectorised syntax for clarity
652 | **and** efficient C-level vectorised imputation (see *Performance*
653 | section).
654 | 
655 | - In `base`: rowSums, rowMeans, colSums, colMeans
656 | - In `Biobase`: rowQ, rowMax, rowMin, rowMedias, ...
657 | - In `genefilter`: rowttests, rowFtests, rowSds, rowVars, ...
658 | 
659 | Generalisable on other data structures, like `ExpressionSet`
660 | instances.
661 | 
662 | ## Parallelisation
663 | 
664 | Vectorised operations are natural candidats for parallel execution.
665 | See later, *Parallel computation* topic.
666 | 
667 | ## References
668 | 
669 | - R Gentleman, *R Programming for Bioinformatics*, CRC Press, 2008
670 | - Ligges and Fox, *R Help Desk, How Can I Avoid This Loop or Make It
671 |   Faster?* R News, Vol 8/1. May 2008.
672 | - Grouping functions: sapply vs. lapply vs. apply. vs. tapply
673 |   vs. by vs. aggregate ... http://stackoverflow.com/questions/3505701/
674 | 


--------------------------------------------------------------------------------
/02-funprog.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Part II: Functional programming"
  3 | author: "Laurent Gatto"
  4 | ---
  5 | 
  6 | ## Content
  7 | 
  8 | - Functions 
  9 | - Robust programming with functions
 10 | - Scoping
 11 | - Closures
 12 | - High-level functions
 13 | - Vectorisation
 14 | 
 15 | # Functions
 16 | 
 17 | Among the R's strong points, Hadley Whickham cites:
 18 | 
 19 | > [R has] a strong foundation in functional programming. The ideas of
 20 | > functional programming are well suited to solving many of the
 21 | > challenges of data analysis. R provides a powerful and flexible
 22 | > toolkit which allows you to write concise yet descriptive code.
 23 | 
 24 | Also
 25 | > To understand computations in R, two slogans are helpful:
 26 | > - Everything that exists is an object.
 27 | > - Everything that happens is a function call.
 28 | > John Chambers
 29 | 
 30 | ![Messy code hides bugs](./figs/funs.png) 
 31 | 
 32 | - Functions are a means of **abstraction**. A concept/computation is
 33 |   encapsulated/isolated from the rest with a function.
 34 | - Functions should **do one thing**, and do it well (compute, or plot,
 35 |   or save, ... not all in one go).
 36 | - **Side effects**: your functions should not have any (unless, of
 37 |   course, that is the main point of that function - plotting, write to
 38 |   disk, ...). Functions shouldn't make any changes in any
 39 |   environment. The only return their output.
 40 | - **Do not use global variables**. Everything the function needs is
 41 |   being passed as an argument. Function must be **self-contained**.
 42 | - Function streamline code and process
 43 | 
 44 | From the `R Inferno`:
 45 | 
 46 | Make your functions as simple as possible. Simple has many advantages:
 47 | 
 48 | - Simple functions are likely to be human efficient: they will be easy
 49 |   to understand and to modify.
 50 | - Simple functions are likely to be computer efficient.
 51 | - Simple functions are less likely to be buggy, and bugs will be
 52 |   easier to fix.
 53 | - (Perhaps ironically) simple functions may be more general—thinking
 54 |   about the heart of the matter often broadens the application.
 55 | 
 56 | 
 57 | Functions can be
 58 | 
 59 | 1. Correct.
 60 | 2. An error occurs that is clearly identified.
 61 | 3. An obscure error occurs.
 62 | 4. An incorrect value is returned.
 63 | 
 64 | We like category 1. Category 2 is the right behavior if the inputs do
 65 | not make sense, but not if the inputs are sensible. Category 3 is an
 66 | unpleasant place for your users, and possibly for you if the users
 67 | have access to you. Category 4 is by far the worst place to be - the
 68 | user has no reason to believe that anything is wrong. Steer clear of
 69 | category 4.
 70 | 
 71 | 
 72 | Finally, functions are
 73 | 
 74 | - Easier to debug (part III)
 75 | - Easier to profile (part IV)
 76 | - Easier to parallelise (part IV)
 77 | 
 78 | Functions are an central part of robust R programming.
 79 | 
 80 | ## Function parts
 81 | 
 82 | A function is made of
 83 | - a name
 84 | - some inputs (formal parameters)
 85 | - a single output (return value)
 86 | - a body
 87 | - an environment, the map of the location of the functions variable
 88 | 
 89 | 
 90 | ```r
 91 | f <- function(x) {
 92 |     y <- x + 1
 93 |     return(x * y)
 94 | }
 95 | ```
 96 | 
 97 | And these can be accessed and modified indivdually
 98 | 
 99 | 
100 | ```r
101 | body(f)
102 | args(f)
103 | environment(f)
104 | 
105 | body(f) <- quote({
106 |     y <- x * y
107 |     return(x + y)
108 | })
109 | ```
110 | 
111 | 
112 | ## Lexical scoping
113 | 
114 | - If a name is not found in a functions environment, it is looked up
115 |   in the parent (enclosing) from.
116 | - If it is not found in the parent (enclosing) frame, it is looked up
117 |   in the parent's parent frame, and so on...
118 | 
119 | *Lexical scoping*: default behaviour, current environment, then
120 | traversing *enclosing/parent environments*.
121 | 
122 | 
123 | ```r
124 | f <- function(x) x + y
125 | 
126 | f(1)
127 | 
128 | environment(f)
129 | y <- 2
130 | f(1)
131 | ```
132 | 
133 | 
134 | ```r
135 | e <- new.env()
136 | environment(f) <- e
137 | 
138 | f(1)
139 | e$y <- 10
140 | f(1)
141 | ```
142 | 
143 | This is of course bad practice, we don't want to rely on global variables.
144 | 
145 | 
146 | ```r
147 | codetools::findGlobals(f)
148 | ```
149 | 
150 | ## Exercises
151 | 
152 | Start by mentally running the code chunks below - what do the functions return? 
153 | 
154 | After testing new code chunks, don't forget to clean up your
155 | workspace, to avoid unexpected results.
156 | 
157 | 
158 | ```r
159 | f <- function() {
160 |     x <- 1
161 |     y <- 2
162 |     c(x, y)
163 | }
164 | f()
165 | ```
166 | 
167 | 
168 | ```r
169 | x <- 2
170 | g <- function(){
171 |     y <- 1
172 |     c(x, y)
173 | }
174 | g()
175 | ```
176 | 
177 | 
178 | ```r
179 | x <- 1
180 | h <- function() {
181 |     y <- 2
182 |     i <- function() {
183 |         z <- 3
184 |         c(x, y, z)
185 |     }
186 |     i()
187 | }
188 | h()
189 | ```
190 | 
191 | 
192 | ```r
193 | j <- function(x) {
194 |     y <- 2
195 |     function(){
196 |         c(x, y)
197 |     }
198 | }
199 | k <- j(1)
200 | k()
201 | ```
202 | 
203 | 
204 | ```r
205 | j <- function() {
206 |     if (!exists("a")) {
207 |         a <- 1
208 |     } else {
209 |         a <- a + 1
210 |     }
211 |     print(a)
212 | }
213 | j() ## First call
214 | j() ## Second call
215 | ```
216 | 
217 | <!-- ```{r, eval=FALSE} -->
218 | <!-- f <- function() x -->
219 | <!-- x <- 1 -->
220 | <!-- f()  -->
221 | <!-- x <- 2 -->
222 | <!-- f() -->
223 | <!-- ``` -->
224 | 
225 | 
226 | ```r
227 | f <- function(x) {
228 |     f <- function(x) {
229 |         f <- function(x) {
230 |             x^2
231 |         }
232 |         f(x) + 1
233 |     }
234 |     f(x) * 2
235 | }
236 | f(10)
237 | ```
238 | 
239 | ## More about functions
240 | 
241 | - Argument matching by position or by names
242 | - Calling a function with a list of arguments
243 | 
244 | 
245 | ```r
246 | args <- list(x = 1:10, trim = 0.3)
247 | do.call(mean, args)
248 | ```
249 | 
250 | - Default arguments
251 | 
252 | 
253 | ```r
254 | f <- function(x = 1, y = 2) x * y
255 | f <- function(x = 1, y = x + 2) x * y
256 | ```
257 | 
258 | - Missing arguments
259 | 
260 | 
261 | ```r
262 | f <- function(x = 1, y) {
263 | 	c(missing(x), missing(y))
264 | }
265 | f()
266 | f(x = 1)
267 | ```
268 | 
269 | - Passing non-matched parameters `...` to an inner function
270 | 
271 | 
272 | ```r
273 | plot2 <- function(...) {
274 |     message("Verbose plotting...")
275 |     plot(...)
276 | }
277 | 
278 | f <- function(...) list(...)
279 | ```
280 | 
281 | - Return values: last statement, explicit `return`, make output
282 |   `invisible`
283 |   
284 | 
285 | ```r
286 | f1 <- function() 1
287 | f2 <- function() return(1)
288 | f3 <- function() return(invisible(1))
289 | ```
290 | 
291 | - Explicit triggers before exiting. Useful to restore global state
292 |   (plotting parameters, cleaning temporary files, ...)
293 | 
294 | 
295 | ```r
296 | f1 <- function(x) {
297 |     on.exit(print("!"))
298 |     x + 1
299 | }
300 | 
301 | f2 <- function(x) {
302 |     on.exit(print("!"))
303 |     stop("Error")
304 | }
305 | ```
306 | 
307 | 
308 | ```r
309 | f3 <- function() {
310 |     on.exit(print("1"))
311 |     on.exit(print("2"))
312 |     invisible(TRUE)
313 | }
314 | 
315 | 
316 | f4 <- function() {
317 |     on.exit(print("1"))
318 |     on.exit(print("2"), add = TRUE)
319 |     invisible(TRUE)
320 | }
321 | ```
322 | 
323 | - Anonymous functions, created on-the-flight and passed to `lapply` or
324 |   other high-level functions.
325 | 
326 | 
327 | ```r
328 | function(x) x + y
329 | body(function(x) x + y)
330 | args(function(x) x + y)
331 | environment(function(x) x + y)
332 | ```
333 | 
334 | ## More about scoping
335 | 
336 | *Lexical scoping*: default behaviour, current environment, then
337 | traversing *enclosing/parent environments*.
338 | 
339 | 
340 | *Dynamic scoping*: looking up variables in the *calling environment*,
341 | used in non-standard evaluation.
342 | 
343 | 
344 | # Functional programming
345 | 
346 | **First-class functions** - a function is a value just like any other
347 | variable. Functions can thus be used as arguments to other
348 | functions. Functions are considered *first-class citizens*.
349 | 
350 | **Higher-order functions** - refers to functions that take functions
351 | as parameters (input) or return functions (output).
352 | 
353 | ## Closures
354 | 
355 | > "An object is data with functions. A closure is a function with
356 | > data." - John D. Cool
357 | 
358 | Closures: functions written by functions. They enclose the envionment
359 | of the parent function and can access all its variables. 
360 | 
361 | 
362 | ```r
363 | make.power <- function(n) 
364 |     function(x) x^n
365 | 
366 |   
367 | cube <- make.power(3)
368 | square <- make.power(2)
369 | cube(2)
370 | square(2)
371 | environment(cube)
372 | environment(square)
373 | ```
374 | 
375 | Mutable state: a counter function
376 | 
377 | 
378 | ```r
379 | new_counter <- function() {
380 |     i <- 0
381 |     function() {
382 |         i <<- i + 1
383 |         i 
384 |     }
385 | }
386 | 
387 | count1 <- new_counter()
388 | count2 <- new_counter()
389 | 
390 | count1()
391 | count1()
392 | count2()
393 | 
394 | environment(count1)
395 | environment(count2)
396 | environment(count1)$i
397 | environment(count2)$i
398 | ```
399 | 
400 | Questions:
401 | 
402 | - What happens of we place the code `i <- 0` and the function
403 |   definition outside of a function, i.e in the global environment?
404 | 
405 | - What happens if we use `<-` instead of `<<-`?
406 | 
407 | The `colorRampPallette`
408 | 
409 | 
410 | ```r
411 | colramp <- colorRampPalette(c("blue", "yellow"))
412 | colramp(5)
413 | plot(1:10, col = colramp(10), pch = 19, cex = 2,
414 |      main = "colramp(10)")
415 | ```
416 | 
417 | ## Functional
418 | 
419 | Take a function as input or return a function as output.
420 | 
421 | - `Reduce(f, x)` uses a binary function to successively combine the
422 |   elements of a given vector and a possibly given initial value.
423 | 
424 | 
425 | ```r
426 | L <- replicate(3, matrix(rnorm(9), 3), simplify = FALSE)
427 | Reduce("+", L)
428 | try(sum(L))
429 | ```
430 | 
431 | 
432 | ```r
433 | Reduce("+", list(1, 2, 3), init = 10)
434 | Reduce("+", list(1, 2, 3), accumulate = TRUE)
435 | Reduce("+", list(1, 2, 3), right = TRUE, accumulate = TRUE)
436 | ```
437 | 
438 | - `Filter(f, x)` extracts the elements of a vector for which a
439 |   predicate (logical) function gives true.
440 | 
441 | - `Negate(f)`  creates the negation of a given function.
442 | 
443 | 
444 | ```r
445 | even <- function(x) x %% 2 == 0
446 | (y <- sample(100, 10))
447 | Filter(even, y)
448 | Filter(Negate(even), y)
449 | ```
450 | 
451 | - `Map(f, ...)` applies a function to the corresponding elements of
452 |   given vectors. Similar to `mapply` without any attempt to simplify.
453 | 
454 | 
455 | ```r
456 | Map(even, 1:3)
457 | ```
458 | 
459 | - `Find(f, x)` and `Position(f, x)` give the first (or last elements)
460 |   and its position in the vector, for which a predicate (logical)
461 |   function gives true.
462 | 
463 | 
464 | ```r
465 | Find(even, 10:15)
466 | Find(even, 10:15, right = TRUE)
467 | Position(Negate(even), 10:15)
468 | Position(Negate(even), 10:15, right = TRUE)
469 | ```
470 | 
471 | ## References
472 | 
473 | - R Gentleman, *R Programming for Bioinformatics*, CRC Press, 2008
474 | - `?Map`, or any other of the higher order functions
475 | - Blog post, *Higher Order Functions in R*, John Myles White
476 | 		http://www.johnmyleswhite.com/notebook/2010/09/23/higher-order-functions-in-r/
477 | 
478 | # Vectorisation
479 | 
480 | 
481 | > Many operations in R are vectorized, and understanding and using
482 | > vectorization is an essential component of becoming a proficient
483 | > programmer. -  R Gentleman in *R Programming for Bioinformatics*.
484 | 
485 | A *vectorised computation* is one that, when applied to a vector (of
486 | length greater than 1), automatically operates directly on all
487 | elements of the input vector.
488 | 
489 | 
490 | 
491 | ```r
492 | (x <- 1:5)
493 | (y <- 5:1)
494 | x + y
495 | ```
496 | 
497 | ## Recycling rule
498 | 
499 | What is `x` and `y` are of different length: the shorter vector is
500 | replicate so that its length matches the longer ones.
501 | 
502 | 
503 | ```r
504 | (x <- 1:6)
505 | (y <- 1:2)
506 | x+y
507 | ```
508 | 
509 | If the shorter vector is not an even multiple of the longer, a warning
510 | is issued.
511 | 
512 | ## Example
513 | 
514 | Compute difference between times of events, `e`. Given `n` events,
515 | there will be `n-1` inter-event times. `interval[i] <- e[i+1] - e[i]`
516 | 
517 | Procedural implementation:
518 | 
519 | 
520 | ```r
521 | diff1 <- function(e) {
522 |   n <- length(e)
523 |   interval <- rep(0, n - 1) 
524 |   for (i in 1:(n - 1)) 
525 |       interval[i] <- e[i + 1] - e[i]
526 |   interval
527 | }
528 | e <- c(2, 5, 10.2, 12, 19)
529 | diff1(e)
530 | ```
531 | 
532 | Vectorised implementation:
533 | 
534 | 
535 | ```r
536 | diff2 <- function(e) {
537 |   n <- length(e)
538 |   e[-1] - e[-n]
539 | }
540 | e <- c(2, 5, 10.2, 12, 19)
541 | diff2(e)
542 | ```
543 | 
544 | ## `*apply` functions
545 | 
546 | How to apply a function, iteratively, on a set of elements?
547 | 
548 | `apply(X, MARGIN, FUN, ...)`
549 | 
550 | 
551 | - `MARGIN` = 1 for row, 2 for cols.
552 | - `FUN` = function to apply
553 | - `...` = extra args to function.
554 | - `simplify` =  should the result be simplified if possible.
555 | 
556 | 
557 | `*apply` functions are (generally) **NOT** faster than loops, but more
558 | succint and thus clearer.
559 | 
560 | 
561 | ```r
562 | v <- rnorm(1000) ## or a list
563 | res <- numeric(length(v))
564 | 
565 | for (i in 1:length(v)) 
566 |   res[i] <- f(v[i])
567 | 
568 | res <- sapply(v, f)
569 | 
570 | ## if f is vectorised
571 | f(v)
572 | ```
573 | 
574 | function | use case
575 | -------|---------------------------------------
576 | apply  | matrices, arrays, data.frames
577 | lapply | lists, vectors
578 | sapply | lists, vectors
579 | vapply | with a pre-specified type of return value
580 | tapply | atomic objects, typically vectors
581 | by     | similar to tapply
582 | eapply | environments
583 | mapply | multiple values
584 | rapply | recursive version of lapply
585 | esApply | `ExpressionSet`, defined in `Biobase`
586 | 
587 | See also the `BiocGenerics` package for `[l|m|s|t]apply` S4 generics,
588 | as well as parallel versions in the `parallel` package (see
589 | `Performance` section).
590 | 
591 | 
592 | See also the `plyr` package, that offers its own flavour of `apply`
593 | functions.
594 | 
595 |   in/out    |  list   | data frame | array
596 | ------------|---------|------------|---------
597 |  list       | llply() |	ldply()    | laply()
598 |  data frame | dlply() |	ddply()    | daply()
599 |  array      | alply() |	adply()    | aaply()
600 | 
601 | 
602 | ## Other functions
603 | 
604 | - `replicate` - repeated evaluation of an expression
605 | - `aggregate` - compute summary statistics of data subsets
606 | - `ave` - group averages over level combinations of factors
607 | - `sweep` - sweep out array summaries
608 | 
609 | ## Anonymous functions
610 | 
611 | A function defined/called without being assigned to an identifier and
612 | generally passed as argument to other functions.
613 | 
614 | 
615 | 
616 | ```r
617 | M <- matrix(rnorm(100), 10)
618 | apply(M, 1, function(Mrow) 'do something with Mrow')
619 | apply(M, 2, function(Mcol) 'do something with Mcol')
620 | ```
621 | 
622 | ## Use case: integration
623 | 
624 | 
625 | 
626 | ![`sin(x^2)/ (a + abs(x))`](./figs/sinfun.png)
627 | 
628 | The `integrate` function approximates definite integrals by
629 | adaptive quadrature.
630 | 
631 | 
632 | 
633 | ```r
634 | f <- function(x, a = 1) sin(x^2)/ (a + abs(x))
635 | integrate(f, lower = -2, upper = 2)
636 | ```
637 | 
638 | It is not vectorised.
639 | 
640 | 
641 | ```r
642 | lo <- c(-2, 0)
643 | hi <- c(0, 2)
644 | integrate(f, lower = lo, upper = hi)
645 | ```
646 | 
647 | ## How to vectorise
648 | 
649 | - To vectorise a function, we can explicitly wrap it inside a helper
650 |   function that will take care of argument recycling (via `rep`), then
651 |   loop over the inputs and call the non-vectorised function.
652 | 
653 | - To vectorise a function, we can explicitate the vectorised
654 |   calculation using `mapply`.
655 | 
656 | 
657 | ```r
658 | mapply(function(lo, hi) integrate(f, lo, hi)$value,
659 |        lo, hi)
660 | ```
661 | 
662 | - Create a vectorised form using `Vectorize`. It takes a function
663 |   (here, an anonymous function) as input and returns a function.
664 | 
665 | 
666 | ```r
667 | Integrate <- Vectorize(
668 |   function(fn, lower, upper)
669 |   integrate(fn, lower, upper)$value,
670 |   vectorize.args=c("lower", "upper")
671 |   )
672 | Integrate(f, lower=lo, upper=hi)
673 | ```
674 | 
675 | ## **Efficient** apply-like functions
676 | 
677 | These functions combine high-level vectorised syntax for clarity
678 | **and** efficient C-level vectorised imputation (see *Performance*
679 | section).
680 | 
681 | - In `base`: rowSums, rowMeans, colSums, colMeans
682 | - In `Biobase`: rowQ, rowMax, rowMin, rowMedias, ...
683 | - In `genefilter`: rowttests, rowFtests, rowSds, rowVars, ...
684 | 
685 | Generalisable on other data structures, like `ExpressionSet`
686 | instances.
687 | 
688 | ## Parallelisation
689 | 
690 | Vectorised operations are natural candidats for parallel execution.
691 | See later, *Parallel computation* topic.
692 | 
693 | ## References
694 | 
695 | - R Gentleman, *R Programming for Bioinformatics*, CRC Press, 2008
696 | - Ligges and Fox, *R Help Desk, How Can I Avoid This Loop or Make It
697 |   Faster?* R News, Vol 8/1. May 2008.
698 | - Grouping functions: sapply vs. lapply vs. apply. vs. tapply
699 |   vs. by vs. aggregate ... http://stackoverflow.com/questions/3505701/
700 | 


--------------------------------------------------------------------------------
/03-debug.R:
--------------------------------------------------------------------------------
  1 | 
  2 | message("This is a message for our dear users.")
  3 | 
  4 | 
  5 | message("This is a message for our dear users. ",
  6 | 	paste("Thank you for using our software",
  7 |               sw, "version", packageVersion(sw)))
  8 | 
  9 | 
 10 | f1 <- function() {
 11 |     cat("I AM LOUD AND YOU CAN'T HELP IT.\n")
 12 |     ## do stuff
 13 |     invisible(TRUE)
 14 | }
 15 | f1()
 16 | 
 17 | 
 18 | f2 <- function() {
 19 |     message("Sorry to interup, but...")
 20 |     ## do stuff
 21 |     invisible(TRUE)
 22 | }
 23 | f2()
 24 | suppressMessages(f2())
 25 | 
 26 | 
 27 | f3 <- function(verbose = TRUE) {
 28 |     if (verbose)
 29 |         message("I am being verbose because you let me.")
 30 |     ## do stuff
 31 |     invisible(TRUE)
 32 | }
 33 | f3()
 34 | f3(verbose = FALSE)
 35 | 
 36 | 
 37 | warning("Do not ignore me. Somthing bad might have happened.")
 38 | warning("Do not ignore me. Somthing bad might be happening.", immediate. = TRUE)
 39 | 
 40 | 
 41 | f <- function(...)
 42 |     warning("Attention, attention, ...!", ...)
 43 | f()
 44 | f(call. = FALSE)
 45 | 
 46 | 
 47 | warnings()
 48 | last.warning
 49 | 
 50 | 
 51 | option("warn")
 52 | 
 53 | 
 54 | stop("This is the end, my friend.")
 55 | 
 56 | 
 57 | log(c(2, 1, 0, -1, 2)); print('end') ## warning
 58 | xor(c(TRUE, FALSE));  print ('end')  ## error
 59 | 
 60 | 
 61 | geterrmessage()
 62 | 
 63 | 
 64 | ## Import the log4r package.
 65 | library('log4r')
 66 | 
 67 | ## Create a new logger object with create.logger().
 68 | logger <- create.logger()
 69 | 
 70 | ## Set the logger's file output: currently only allows flat files.
 71 | logfile(logger) <- file.path('base.log')
 72 | 
 73 | ## Set the current level of the logger.
 74 | level(logger) <- "INFO"
 75 | 
 76 | ## Try logging messages at different priority levels.
 77 | debug(logger, 'A Debugging Message') ## Won't print anything
 78 | info(logger, 'An Info Message')
 79 | warn(logger, 'A Warning Message')
 80 | error(logger, 'An Error Message')
 81 | fatal(logger, 'A Fatal Error Message')
 82 | 
 83 | 
 84 | n <- 10
 85 | pb <- txtProgressBar(min = 0, max = n, style = 3)
 86 | for (i in 1:n) {
 87 |     setTxtProgressBar(pb, i)
 88 |     Sys.sleep(0.5)
 89 | }
 90 | close(pb)
 91 | 
 92 | 
 93 | library("progress")
 94 | pb <- progress_bar$new(total = n)
 95 | for (i in 1:n) {
 96 |     pb$tick()
 97 |     Sys.sleep(0.5)
 98 | }
 99 | 
100 | 
101 | if (!condition) stop(...)
102 | 
103 | 
104 | stopifnot(TRUE)
105 | stopifnot(TRUE, FALSE)
106 | 
107 | 
108 | f <- function(x) {
109 |     stopifnot(is.numeric(x), length(x) == 1)
110 |     invisible(TRUE)
111 | }
112 | 
113 | f(1)
114 | f("1")
115 | f(1:2)
116 | f(letters)
117 | 
118 | 
119 | x <- "1"
120 | library("assertthat")
121 | stopifnot(is.numeric(x))
122 | assert_that(is.numeric(x))
123 | assert_that(length(x) == 2)
124 | 
125 | 
126 | a <- sqrt(2)
127 | a * a == 2
128 | a * a - 2
129 | 
130 | 
131 | 1L + 2L == 3L
132 | 1.0 + 2.0 == 3.0
133 | 0.1 + 0.2 == 0.3
134 | 
135 | 
136 | all.equal(0.1 + 0.2, 0.3)
137 | all.equal(0.1 + 0.2, 3.0)
138 | isTRUE(all.equal(0.1 + 0.2, 3)) ## when you just want TRUE/FALSE
139 | 
140 | 
141 | 1 == NULL
142 | all.equal(1, NULL)
143 | identical(1, NULL)
144 | identical(1, 1.)   ## TRUE in R (both are stored as doubles)
145 | all.equal(1, 1L)
146 | identical(1, 1L)   ## stored as different types
147 | 
148 | 
149 | col_means <- function(df) {
150 |   numeric <- sapply(df, is.numeric)
151 |   numeric_cols <- df[, numeric]
152 |   data.frame(lapply(numeric_cols, mean))
153 | }
154 | 
155 | 
156 | col_means(mtcars)
157 | col_means(mtcars[, 0])
158 | col_means(mtcars[0, ])
159 | col_means(mtcars[, "mpg", drop = FALSE])
160 | col_means(1:10)
161 | col_means(as.matrix(mtcars))
162 | col_means(as.list(mtcars))
163 | 
164 | mtcars2 <- mtcars
165 | mtcars2[-1] <- lapply(mtcars2[-1], as.character)
166 | col_means(mtcars2)
167 | 
168 | 
169 | e <- function(i) {
170 |   x <- 1:4
171 |   if (i < 5) x[1:2]
172 |   else x[-1:2]
173 | }
174 | f <- function() sapply(1:10, e)
175 | g <- function() f()
176 | 
177 | 
178 | g()
179 | traceback()
180 | 
181 | 
182 | debug(g)
183 | g()
184 | 
185 | 
186 | e <- function(i) {
187 |   x <- 1:4
188 |   if (i < 5) x[1:2]
189 |   else x[-1:2] # oops! x[-(1:2)]
190 | }
191 | f <- function() sapply(1:10, e)
192 | g <- function() f()
193 | 
194 | 
195 | ## make sure you have the 'sequences' package.
196 | ## Get readFasta2, the function to debug
197 | library(devtools)
198 | install_github("lgatto/sequences") ## from github
199 | ## or
200 | install.packages("sequences") ## from CRAN
201 | library("sequences")
202 | sequences:::debugme()
203 | ## Get an example file
204 | f <- dir(system.file("extdata", package = "sequences"),
205 |          full.names=TRUE, pattern = "moreDnaSeqs.fasta")
206 | ## BANG!
207 | readFasta2(f)
208 | 
209 | 
210 | f <- function() {
211 |     x <- "1"
212 |     log(x)
213 |     message("x was the ", class(x), " ", x)
214 | }
215 | f()
216 | 
217 | 
218 | f <- function() {
219 |     x <- "1"
220 |     try(log(x))
221 |     message("x was the ", class(x), " ", x)
222 | }
223 | f()
224 | 
225 | 
226 | try({
227 |     a <- 1
228 |     b <- "2"
229 |     a + b
230 | })
231 | 
232 | 
233 | success <- try(1 + 2)
234 | failure <- try(1 + "2", silent = TRUE)
235 | class(success)
236 | class(failure)
237 | 
238 | 
239 | inherits(failure, "try-error")
240 | 
241 | if (inherits(failure, "try-error"))
242 | 	message("There was an error here.")
243 | 
244 | 
245 | el <- list(1:10, c(-1, 1), TRUE, "1")
246 | res <- lapply(el, log)
247 | res
248 | res <- lapply(el, function(x) try(log(x)))
249 | res
250 | 
251 | 
252 | default <- NULL
253 | try(default <- read.csv("possibly-bad-input.csv"), silent = TRUE)
254 | 
255 | 
256 | f <- function(x)
257 |     if (x == 1) stop("Error!") else 1
258 | 
259 | f(1)
260 | f(2)
261 | 
262 | 
263 | safef <- failwith(NULL, f)
264 | safef(1)
265 | safef(2)
266 | 
267 | 
268 | f <- function() {
269 |     x <- "1"
270 |     tryCatch(log(x),
271 |              error = function(e) cat("There was an error!\n"))
272 |     message("x was the ", class(x), " ", x)
273 | }
274 | f()
275 | 
276 | 
277 | show_condition <- function(code) {
278 |   tryCatch(code,
279 |     error = function(c) "error",
280 |     warning = function(c) "warning",
281 |     message = function(c) "message"
282 |   )
283 | }
284 | show_condition(stop("!"))
285 | show_condition(warning("?!"))
286 | show_condition(message("?"))
287 | show_condition(0)
288 | 
289 | 
290 | read.csv2 <- function(file, ...) {
291 |   tryCatch(read.csv(file, ...), error = function(c) {
292 |     c$message <- paste0(c$message, " (in ", file, ")")
293 |     stop(c)
294 |   })
295 | }
296 | read.csv("code/dummy.csv")
297 | read.csv2("code/dummy.csv")
298 | 
299 | 
300 | f <- function(x = 10) {
301 |     lapply(seq_len(x), function(i) {
302 |         ## make an example 2x2 contingency table
303 |         d <- matrix(sample(4:10, 4), nrow = 2, ncol = 2)
304 |         ## will produce warning if there is a 5 or less
305 |         ## in the contingency table
306 |         chisq.test(d)
307 |     })
308 | }
309 | 
310 | 
311 | set.seed(1)
312 | f()
313 | set.seed(1)
314 | withCallingHandlers(f(), warning=function(e) recover())
315 | 
316 | 
317 | f <- function() g()
318 | g <- function() h()
319 | h <- function() stop("!")
320 | 
321 | tryCatch(f(), error = function(e) print(sys.calls()))
322 | withCallingHandlers(f(), error = function(e) print(sys.calls()))
323 | 
324 | 
325 | safelog <- function(x) {
326 |   tryCatch(log(x),
327 |            error = function(e) paste("an error with input", x),
328 |            warning = function(e) paste("a warning with input", x))
329 | }
330 | 
331 | 
332 | log(1)
333 | safelog(1)
334 | log(-1)
335 | safelog(-1)
336 | log("a")
337 | safelog("a")
338 | 
339 | 
340 | safelog <- function(x) {
341 |   tryCatch(log(x),
342 |            error = function(e) paste("an error with input", x),
343 |            warning = function(e) paste("a warning with input", x))
344 | }
345 | 
346 | 
347 | ## Report whenever e invoked
348 | trace(sum)
349 | hist(rnorm(100))
350 | untrace(sum)
351 | 
352 | 
353 | ## Evaluate arbitrary code whenever e invoked
354 | trace(e, quote(cat("i am", i, "\n")))
355 | ## Another way to enter browser whenver e invoked
356 | trace(e, browser)
357 | ## stop tracing
358 | untrace(e)
359 | 
360 | 
361 | f <- function() {
362 |     ## make an example 2x2 contingency table
363 |     d <- matrix(sample(4:10, 4), nrow=2, ncol=2)
364 |      chisq.test(d)
365 | }
366 | set.seed(1)
367 | f() ## no warning
368 | 
369 | set.seed(11)
370 | f() ## warning
371 | 
372 | 
373 | if (any(d < 5))
374 |   browser()
375 | 
376 | 
377 | as.list(body(f))
378 | 
379 | 
380 | trace("f", quote(if (any(d < 5)) browser()), at = 3)
381 | 
382 | 
383 | f
384 | body(f)
385 | 
386 | 
387 | set.seed(1)
388 | f() ## normal execution
389 | 
390 | set.seed(11)
391 | f() ## enters browser mode
392 | 
393 | 
394 | library("MSnbase")
395 | data(itraqdata)
396 | x <- itraqdata[[1]]
397 | plot(x, full=TRUE)
398 | 
399 | 
400 | debug(plot)
401 | plot(x, full=TRUE)
402 | 
403 | 
404 | trace("plot", browser,
405 |       signature = c("Spectrum", "missing"))
406 | plot(x, full=TRUE)
407 | 
408 | 


--------------------------------------------------------------------------------
/03-debug.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Part III: Debugging"
  3 | author: "Laurent Gatto"
  4 | ---
  5 | 
  6 | # Overview
  7 | 
  8 | - Defensive programming
  9 | - Debbugging: techniques and tools
 10 | - Condition handling
 11 | - Testing
 12 | 
 13 | # Defensive programming
 14 | 
 15 | Before we begin with debugging, let's look at ways to prevent bugs
 16 | (more at the end of this part). 
 17 | 
 18 | *Defensive programming*: 
 19 | - making the code work in a predicable manner
 20 | - writing code that fails in a well-defined manner
 21 | - if something *weird* happens, either properly deal with it, of fail
 22 |   quickly and loudly
 23 | 
 24 | The level of defensiveness will depend whether you write a function
 25 | for interactive of programmatic usage.
 26 | 
 27 | ## Talk to your users
 28 | 
 29 | ### Diagnostic messages
 30 | 
 31 | ```{r, eval=FALSE}
 32 | message("This is a message for our dear users.")
 33 | ```
 34 | 
 35 | ```{r, eval=FALSE}
 36 | message("This is a message for our dear users. ",
 37 | 	paste("Thank you for using our software",
 38 |               sw, "version", packageVersion(sw)))
 39 | ```
 40 | 
 41 | Do not use `print` or `cat`:
 42 | 
 43 | ```{r, eval=FALSE}
 44 | f1 <- function() {
 45 |     cat("I AM LOUD AND YOU CAN'T HELP IT.\n")
 46 |     ## do stuff
 47 |     invisible(TRUE)
 48 | }
 49 | f1()
 50 | ```
 51 | 
 52 | ```{r, eval=FALSE}
 53 | f2 <- function() {
 54 |     message("Sorry to interup, but...")
 55 |     ## do stuff
 56 |     invisible(TRUE)
 57 | }
 58 | f2()
 59 | suppressMessages(f2())
 60 | ```
 61 | 
 62 | Of course, it is also possible to manually define verbosity. This
 63 | makes you write more code for a feature readily available. But still
 64 | better to use `message`.
 65 | 
 66 | ```{r, eval=FALSE}
 67 | f3 <- function(verbose = TRUE) {
 68 |     if (verbose)
 69 |         message("I am being verbose because you let me.")
 70 |     ## do stuff
 71 |     invisible(TRUE)
 72 | }
 73 | f3()
 74 | f3(verbose = FALSE)
 75 | ```
 76 | 
 77 | ### Warning
 78 | 
 79 | > There is a problem with warnings. No one reads them. Pat Burns, in
 80 | > *R inferno*.
 81 | 
 82 | ```{r, eval=FALSE}
 83 | warning("Do not ignore me. Somthing bad might have happened.")
 84 | warning("Do not ignore me. Somthing bad might be happening.", immediate. = TRUE)
 85 | ```
 86 | 
 87 | ```{r, eval=FALSE}
 88 | f <- function(...)
 89 |     warning("Attention, attention, ...!", ...)
 90 | f()
 91 | f(call. = FALSE)
 92 | ```
 93 | Print warnings after they have been thrown.
 94 | 
 95 | ```{r, eval=FALSE}
 96 | warnings()
 97 | last.warning
 98 | ```
 99 | 
100 | See also to `warn` option in `?options` .
101 | 
102 | ```{r, eval=FALSE}
103 | option("warn")
104 | ```
105 | 
106 | ### Error
107 | 
108 | ```{r, eval=FALSE}
109 | stop("This is the end, my friend.")
110 | ```
111 | 
112 | ```{r, eval=FALSE}
113 | log(c(2, 1, 0, -1, 2)); print('end') ## warning 
114 | xor(c(TRUE, FALSE));  print ('end')  ## error
115 | ```
116 | 
117 | Stop also has a `call.` parameter.
118 | 
119 | ```{r, eval=FALSE}
120 | geterrmessage()
121 | ```
122 | 
123 | ### Logging
124 | 
125 | See for example the [`log4r`](https://github.com/johnmyleswhite/log4r)
126 | package:
127 | 
128 | ```{r, eval=FALSE}
129 | ## Import the log4r package.
130 | library('log4r')
131 | 
132 | ## Create a new logger object with create.logger().
133 | logger <- create.logger()
134 | 
135 | ## Set the logger's file output: currently only allows flat files.
136 | logfile(logger) <- file.path('base.log')
137 | 
138 | ## Set the current level of the logger.
139 | level(logger) <- "INFO"
140 | 
141 | ## Try logging messages at different priority levels.
142 | debug(logger, 'A Debugging Message') ## Won't print anything
143 | info(logger, 'An Info Message')
144 | warn(logger, 'A Warning Message')
145 | error(logger, 'An Error Message')
146 | fatal(logger, 'A Fatal Error Message')
147 | ```
148 | 
149 | ### Progress bars
150 | 
151 | - `utils::txtProgressBar` function
152 | 
153 | ```{r, eval=FALSE}
154 | n <- 10
155 | pb <- txtProgressBar(min = 0, max = n, style = 3)
156 | for (i in 1:n) {
157 |     setTxtProgressBar(pb, i)
158 |     Sys.sleep(0.5)
159 | }
160 | close(pb)
161 | ```
162 | 
163 | - [`progress`](https://github.com/gaborcsardi/progress) package
164 | 
165 | ```{r, eval=FALSE}
166 | library("progress")
167 | pb <- progress_bar$new(total = n)
168 | for (i in 1:n) {
169 |     pb$tick()
170 |     Sys.sleep(0.5)
171 | }
172 | ```
173 | 
174 | Tip: do not over use progress bars. Ideally, a user should be
175 | confident that everything is under control and progress is made while
176 | waiting for a function to return. In my experience, a progress bar is
177 | usefull when there is a specific and/or user-defined number of
178 | iterations, such a *iterating over n files*, or *running a simulation
179 | n times*.
180 | 
181 | **Question**: What about mixing progress bars and verbosity.
182 | 
183 | 
184 | ## KISS
185 | 
186 | Keep your functions simple and stupid (and short). 
187 | 
188 | ## Failing fast and well
189 | 
190 | > Bounds errors are ugly, nasty things that should be stamped out
191 | > whenever possible. One solution to this problem is to use the
192 | > `assert` statement. The `assert` statement tells C++, "This can
193 | > never happen, but if it does, abort the program in a nice way." One
194 | > thing you find out as you gain programming experience is that things
195 | > that can "never happen" happen with alarming frequency. So just to
196 | > make sure that things work as they are supposed to, it’s a good idea
197 | > to put lots of self checks in your program. -- Practical C++
198 | > Programming, Steve Oualline, O'Reilly.
199 | 
200 | ```{r, eval=FALSE}
201 | if (!condition) stop(...)
202 | ```
203 | 
204 | ```{r, eval=FALSE}
205 | stopifnot(TRUE)
206 | stopifnot(TRUE, FALSE)
207 | ```
208 | 
209 | For example to test input classes, lengths, ...
210 | 
211 | ```{r, eval=FALSE}
212 | f <- function(x) {
213 |     stopifnot(is.numeric(x), length(x) == 1)
214 |     invisible(TRUE)
215 | }
216 | 
217 | f(1)
218 | f("1")
219 | f(1:2)
220 | f(letters)
221 | ```
222 | 
223 | The [`assertthat`](https://github.com/hadley/assertthat) package:
224 | 
225 | ```{r, eval=FALSE}
226 | x <- "1"
227 | library("assertthat")
228 | stopifnot(is.numeric(x))
229 | assert_that(is.numeric(x))
230 | assert_that(length(x) == 2)
231 | ```
232 | 
233 | * `assert_that()` signal an error.
234 | * `see_if()` returns a logical value, with the error message as an attribute.
235 | * `validate_that()` returns `TRUE` on success, otherwise returns the error as
236 |   a string.
237 | 
238 |   
239 |   
240 | * `is.flag(x)`: is x `TRUE` or `FALSE`? (a boolean flag)
241 | * `is.string(x)`: is x a length 1 character vector?
242 | * `has_name(x, nm)`, `x %has_name% nm`: does `x` have component `nm`?
243 | * `has_attr(x, attr)`, `x %has_attr% attr`: does `x` have attribute `attr`?
244 | * `is.count(x)`: is x a single positive integer?
245 | * `are_equal(x, y)`: are `x` and `y` equal?
246 | * `not_empty(x)`: are all dimensions of `x` greater than 0?
247 | * `noNA(x)`: is `x` free from missing values?
248 | * `is.dir(path)`: is `path` a directory?
249 | * `is.writeable(path)`/`is.readable(path)`: is `path` writeable/readable?
250 | * `has_extension(path, extension)`: does `file` have given `extension`?
251 | 
252 | 
253 | ## Consistency and predictability
254 | 
255 | Reminder of the interactive use vs programming examples: 
256 | - `[` and `drop` 
257 | - `sapply`, `lapply`, `vapply`
258 | 
259 | Remember also the concept of *tidy data*.
260 | 
261 | ## Comparisons
262 | 
263 | ### Floating point issues to be aware of
264 | 
265 | R FAQ [7.31](http://cran.r-project.org/doc/FAQ/R-FAQ.html#Why-doesn_0027t-R-think-these-numbers-are-equal_003f)?
266 | 
267 | 
268 | ```{r, eval=FALSE}
269 | a <- sqrt(2)
270 | a * a == 2
271 | a * a - 2
272 | ```
273 | 
274 | ```{r, eval=FALSE}
275 | 1L + 2L == 3L
276 | 1.0 + 2.0 == 3.0
277 | 0.1 + 0.2 == 0.3
278 | ```
279 | 
280 | ### Floating point: how to compare
281 | 
282 | - `all.equal` compares R objects for *near equality*. Takes into
283 |   account whether object attributes and names ought the taken into
284 |   consideration (`check.attributes` and `check.names` parameters) and
285 |   tolerance, which is machine dependent.
286 | 
287 | ```{r, eval=FALSE}
288 | all.equal(0.1 + 0.2, 0.3)
289 | all.equal(0.1 + 0.2, 3.0)
290 | isTRUE(all.equal(0.1 + 0.2, 3)) ## when you just want TRUE/FALSE
291 | ```
292 | 
293 | ### Exact identity
294 | 
295 | `identical`: test objects for exact equality
296 | 
297 | ```{r, eval=FALSE}
298 | 1 == NULL
299 | all.equal(1, NULL)
300 | identical(1, NULL)
301 | identical(1, 1.)   ## TRUE in R (both are stored as doubles)
302 | all.equal(1, 1L)
303 | identical(1, 1L)   ## stored as different types
304 | ```
305 | 
306 | Appropriate within `if`, `while` condition statements. (not
307 | `all.equal`, unless wrapped in `isTRUE`).
308 | 
309 | ## Exercise
310 | 
311 | (From [adv-r](http://adv-r.had.co.nz/Exceptions-Debugging.html#defensive-programming).)
312 | 
313 | The `col_means` function computes the means of all numeric columns in
314 | a data frame.
315 | 
316 | ```{r, eval=FALSE}
317 | col_means <- function(df) {
318 |   numeric <- sapply(df, is.numeric)
319 |   numeric_cols <- df[, numeric]
320 |   data.frame(lapply(numeric_cols, mean))
321 | }
322 | ```
323 | 
324 | Is it a robust function? What happens if there are unusual inputs.
325 | 
326 | ```{r, eval=FALSE}
327 | col_means(mtcars)
328 | col_means(mtcars[, 0])
329 | col_means(mtcars[0, ])
330 | col_means(mtcars[, "mpg", drop = FALSE])
331 | col_means(1:10)
332 | col_means(as.matrix(mtcars))
333 | col_means(as.list(mtcars))
334 | 
335 | mtcars2 <- mtcars
336 | mtcars2[-1] <- lapply(mtcars2[-1], as.character)
337 | col_means(mtcars2)
338 | ```
339 | # Debugging: techniques and tools
340 | 
341 | ### Shit happens
342 | 
343 | > Funding your bug is a process of confirming the many things that you
344 | > believe are true - until you find one which is not true. -- Norm Matloff
345 | 
346 | #### 1. Identify the bug (the difficult part)
347 | - Something went wrong!
348 | - Where in the code does it happen?
349 | - Does it happen every time?
350 | - What input triggered it?
351 | - Report it (even if it is in your code - use github issues, for
352 |   example).
353 | 
354 | **Tip**: Beware of your intuition. As a scientist, do what you are
355 | used to: generate a hypotheses, *design an experiment* to test them,
356 | and record the results.
357 | 
358 | #### 2. Fix it (the less difficult part)
359 | - Correct the bug.
360 | - Make sure that bug will not repeat itself!
361 | - How can we be confident that we haven't introduced new bugs?
362 | 
363 | ## Tools
364 | 
365 | - `print`/`cat`
366 | - `traceback()`
367 | - `browser()`
368 | - `options(error = )`, `options(warn = )`
369 | - `trace`
370 | - IDE: RStudio, StatET, emacs' ess tracebug.
371 | 
372 | 
373 | ### Manually
374 | 
375 | Inserting `print` and `cat` statements in the code. Works, but time
376 | consuming. 
377 | 
378 | ### Finding the bug
379 | 
380 | > Many bugs are subtle and hard to find. -- Hadley Wickham
381 | 
382 | Bugs are shy, and are generally hidden, deep down in your code, to
383 | make it as difficult as possible for you to find them.
384 | 
385 | ```{r, echo=TRUE}
386 | e <- function(i) {
387 |   x <- 1:4
388 |   if (i < 5) x[1:2]
389 |   else x[-1:2]
390 | }
391 | f <- function() sapply(1:10, e)
392 | g <- function() f()
393 | ```
394 | 
395 | `traceback`: lists the sequence of calls that lead to the error
396 | 
397 | ```{r, eval=FALSE}
398 | g()
399 | traceback()
400 | ```
401 | 
402 | If the source code is available (for example for `source()`d code),
403 | then traceback will display the exact location in the function, in the
404 | form `filename.R#linenum`.
405 | 
406 | ### Browsing the error
407 | 
408 | - Register the function for debugging: `debug(g)`. This adds a call to
409 |   the `browser()` function (see also below) and the very beginning of
410 |   the function `g`.
411 |   
412 | - Every call to `g()` will not be run interactively.
413 | 
414 | - To finish debugging: `undebug(g)`.
415 | 
416 | 
417 | ```{r, eval=FALSE}
418 | debug(g)
419 | g()
420 | ```
421 | 
422 | How to debug:
423 | 
424 | - `n` executes the next step of the function. Use `print(n)` or
425 |   `get(n)` to print/access the variable `n`.
426 | - `s` to step into the next function. If it is not a function, same as
427 |   `n`.
428 | - `f` to finish execution of the current loop of function.
429 | - `c` to leave interactive debugging and continue regular execution of
430 |   the function. 
431 | - `Q` to stop debugging, terminate the function and return to the
432 |   global workspace.
433 | - `where` print a stack trace of all active function calls.
434 | - `Enter` same as `n` (or `s`, if it was used most recently), unless
435 |   `options(browserNLdisabled = TRUE)` is set.
436 | 
437 | To fix a function when the source code is not directly available, use
438 | `fix(fun)`. This will open the function's source code for editing and,
439 | after saving and closing, store the updated function in the global
440 | workspace.
441 | 
442 | ### Breakpoints
443 | 
444 | - Add a call to `browser()` anywhere in the source code to execute the
445 |   rest of the code interactively.
446 |   
447 | - To run breakpoints conditionally, wrap the call to `browser()` in a
448 |   condition.
449 | 
450 | ### Setting options
451 | 
452 | - `options(error = recover)`: display the call stack and choose where
453 |   to step in.
454 | 
455 | - `options(error = stop)` or `options(error = NULL)`: reset to default
456 |   behaviour.
457 | 
458 | - `options(warn = 'numeric')`: sets the handling of warning
459 |   messages. If `warn` is negative all warnings are ignored.  If `warn`
460 |   is zero (the default) warnings are stored until the top-level
461 |   function returns.  If 10 or fewer warnings were signalled they will
462 |   be printed otherwise a message saying how many were signalled. An
463 |   object called `last.warning` is created and can be printed through
464 |   the function `warnings`.  If `warn` is one, warnings are printed as
465 |   they occur.  If `warn` is two or larger all warnings are turned into
466 |   errors.
467 | 
468 | - `options(error = dump.frames)`: like `recover` but for
469 |   non-interactive use. Will create a `last.dump.rda` file in the
470 |   current working directory, which can then be reloaded in an
471 |   interactive session to re-inter interactive debugging (using
472 |   `debugger()`).
473 | 
474 | ### Debugging with IDEs
475 | 
476 | - RSudio: `Show Traceback`, `Rerun with Debug` and interactive debugging.
477 | 
478 | ![RStudio debugging 1](./figs/debugRStudio1.png)
479 | ![RStudio debugging 2](./figs/debugRStudio2.png)
480 | 
481 | - StatET (Eclipse plugin)
482 | 
483 | - [emacs ESS and tracebug](http://ess.r-project.org/Manual/ess.html#Developing-with-ESS)
484 | 
485 | ### Exercise
486 | 
487 | 1. Your turn - play with `traceback`, `recover` and `debug`:
488 | 
489 | (Example originally by Martin Morgan and Robert Gentleman.)
490 | 
491 | ```{r, echo=TRUE}
492 | e <- function(i) {
493 |   x <- 1:4
494 |   if (i < 5) x[1:2]
495 |   else x[-1:2] # oops! x[-(1:2)]
496 | }
497 | f <- function() sapply(1:10, e)
498 | g <- function() f()
499 | ```
500 | 
501 | 2. Fix `readFasta2`.
502 | 
503 | ```{r, eval=FALSE}
504 | ## make sure you have the 'sequences' package.
505 | ## Get readFasta2, the function to debug
506 | library(devtools)
507 | install_github("lgatto/sequences") ## from github
508 | ## or 
509 | install.packages("sequences") ## from CRAN
510 | library("sequences")
511 | sequences:::debugme()
512 | ## Get an example file
513 | f <- dir(system.file("extdata", package = "sequences"),
514 |          full.names=TRUE, pattern = "moreDnaSeqs.fasta")
515 | ## BANG!
516 | readFasta2(f)
517 | ```
518 | 
519 | ## Condition handling
520 | 
521 | ### `try` and `tryCatch`
522 | 
523 | The function `f` will never terminate.
524 | 
525 | ```{r, eval=FALSE}
526 | f <- function() {
527 |     x <- "1"
528 |     log(x)
529 |     message("x was the ", class(x), " ", x)
530 | }
531 | f()
532 | ```
533 | 
534 | Use `try` to proceed with the execution even when an error occurs.
535 | 
536 | ```{r, eval=FALSE}
537 | f <- function() {
538 |     x <- "1"
539 |     try(log(x))
540 |     message("x was the ", class(x), " ", x)
541 | }
542 | f()
543 | ```
544 | 
545 | ```{r, eval=FALSE}
546 | try({
547 |     a <- 1
548 |     b <- "2"
549 |     a + b
550 | })
551 | ```
552 | 
553 | In case of error, `try` returns a object of class `try-error`:
554 | 
555 | ```{r, eval=FALSE}
556 | success <- try(1 + 2)
557 | failure <- try(1 + "2", silent = TRUE)
558 | class(success)
559 | class(failure)
560 | ```
561 | 
562 | ```{r, eval=FALSE}
563 | inherits(failure, "try-error")
564 | 
565 | if (inherits(failure, "try-error"))
566 | 	message("There was an error here.")
567 | ```
568 | Handling errors is particularly useful to iterate over all elements of
569 | an input, despite errors (and inspecting/handling/fixing the errors
570 | afterwards).
571 | 
572 | ```{r, eval=FALSE}
573 | el <- list(1:10, c(-1, 1), TRUE, "1")
574 | res <- lapply(el, log)
575 | res
576 | res <- lapply(el, function(x) try(log(x)))
577 | res
578 | ```
579 | 
580 | #### Hadley's tip
581 | 
582 | > Another useful `try()` idiom is using a default value if an
583 | > expression fails. Simply assign the default outside the `try` block,
584 | > and then run the risky code:
585 | 
586 | ```{r, eval=FALSE}
587 | default <- NULL
588 | try(default <- read.csv("possibly-bad-input.csv"), silent = TRUE)
589 | ```
590 | 
591 | > There is also `plyr::failwith()`, which makes this strategy even
592 | > easier to implement.
593 | 
594 | ```{r, eval=FALSE}
595 | f <- function(x)
596 |     if (x == 1) stop("Error!") else 1
597 | 
598 | f(1)
599 | f(2)
600 |      
601 |      
602 | safef <- failwith(NULL, f)
603 | safef(1)
604 | safef(2)
605 | ```
606 | 
607 | Use `tryCatch` to specify a behaviour (handler function) in case of
608 | error, warning or message.
609 | 
610 | ```{r, eval=FALSE}
611 | f <- function() {
612 |     x <- "1"
613 |     tryCatch(log(x),
614 |              error = function(e) cat("There was an error!\n"))
615 |     message("x was the ", class(x), " ", x)
616 | }
617 | f()
618 | ```
619 | 
620 | More example from Hadleys' *Advanced R* book.
621 | 
622 | ```{r, eval=FALSE}
623 | show_condition <- function(code) {
624 |   tryCatch(code,
625 |     error = function(c) "error",
626 |     warning = function(c) "warning",
627 |     message = function(c) "message"
628 |   )
629 | }
630 | show_condition(stop("!"))
631 | show_condition(warning("?!"))
632 | show_condition(message("?"))
633 | show_condition(0)
634 | ```
635 | 
636 | A more informative `read.csv` version:
637 | 
638 | ```{r, eval=FALSE}
639 | read.csv2 <- function(file, ...) {
640 |   tryCatch(read.csv(file, ...), error = function(c) {
641 |     c$message <- paste0(c$message, " (in ", file, ")")
642 |     stop(c)
643 |   })
644 | }
645 | read.csv("code/dummy.csv")
646 | read.csv2("code/dummy.csv")
647 | ```
648 | 
649 | `tryCatch` has a `finally` argument that specifies a code block to be
650 | executed regardless of whether the initial expression succeeds or
651 | fails. Usefull, for example, to clean up (deleting files, closing
652 | connections, ...).
653 | 
654 | 
655 | ### `withCallingHandlers`
656 | 
657 | 
658 | The `withCallingHandlers` function allows to defined special behaviour
659 | in case of *unusual conditions*, including warnings and errors. In the
660 | example below, we start a browser in case of (obscure) warnings.
661 | 
662 | ```{r, eval=FALSE}
663 | f <- function(x = 10) {
664 |     lapply(seq_len(x), function(i) {
665 |         ## make an example 2x2 contingency table
666 |         d <- matrix(sample(4:10, 4), nrow = 2, ncol = 2)
667 |         ## will produce warning if there is a 5 or less 
668 |         ## in the contingency table
669 |         chisq.test(d)
670 |     })
671 | }
672 | ```
673 | 
674 | ```{r, eval=FALSE}
675 | set.seed(1)
676 | f()
677 | set.seed(1)
678 | withCallingHandlers(f(), warning=function(e) recover())
679 | ```
680 | 
681 | ### Difference between `tryCatch` and `withCallingHandlers`
682 | 
683 | (From [*Advanced R*](http://adv-r.had.co.nz/Exceptions-Debugging.html#condition-handling))
684 | 
685 | The handlers in `withCallingHandlers()` are called in the context of
686 | the call that generated the condition whereas the handlers in
687 | `tryCatch()` are called in the context of `tryCatch()`. This is shown
688 | here with `sys.calls()`, which is the run-time equivalent of
689 | `traceback()` -- it lists all calls leading to the current function.
690 | 
691 | ```{r, eval=FALSE}
692 | f <- function() g()
693 | g <- function() h()
694 | h <- function() stop("!")
695 | 
696 | tryCatch(f(), error = function(e) print(sys.calls()))
697 | withCallingHandlers(f(), error = function(e) print(sys.calls()))
698 | ```
699 | 
700 | ### Debugging at the C level with `ddd` or `gdb`
701 | 
702 | Demo
703 | 
704 | ### Exercise
705 | 
706 | ```{r, eval=TRUE, echo=FALSE}
707 | safelog <- function(x) {
708 |   tryCatch(log(x),
709 |            error = function(e) paste("an error with input", x),
710 |            warning = function(e) paste("a warning with input", x))
711 | }
712 | ```
713 | 
714 | 
715 | Write a new `safelog` function that catches and handles errors and
716 | warnings to emulate the following behaviour.
717 | 
718 | ```{r, echo=TRUE, eval=TRUE}
719 | log(1)
720 | safelog(1)
721 | log(-1)
722 | safelog(-1)
723 | log("a")
724 | safelog("a")
725 | ```
726 | 
727 | **Answer**
728 | 
729 | ```{r, echo=TRUE}
730 | safelog <- function(x) {
731 |   tryCatch(log(x),
732 |            error = function(e) paste("an error with input", x),
733 |            warning = function(e) paste("a warning with input", x))
734 | }
735 | ```
736 | 
737 | ## Tracing code
738 | 
739 | From `?trace`:
740 | 
741 | > A call to `trace` allows you to insert debugging code (e.g., a call
742 | > to `browser` or `recover`) at chosen places in any function. A call
743 | > to `untrace` cancels the tracing.
744 | 
745 | ```{r, eval=FALSE}
746 | ## Report whenever e invoked
747 | trace(sum)
748 | hist(rnorm(100))
749 | untrace(sum)
750 | ```
751 | 
752 | ```{r, eval=FALSE}
753 | ## Evaluate arbitrary code whenever e invoked
754 | trace(e, quote(cat("i am", i, "\n")))
755 | ## Another way to enter browser whenver e invoked
756 | trace(e, browser)
757 | ## stop tracing
758 | untrace(e)
759 | ```
760 | 
761 | ### Inserting code dynamically
762 | 
763 | > The `trace` function operates by constructing a revised version of
764 | > the function (or of the method, if ‘signature’ is supplied), and
765 | > assigning the new object back where the original was found.
766 | 
767 | ```{r, eval=FALSE}
768 | f <- function() {
769 |     ## make an example 2x2 contingency table
770 |     d <- matrix(sample(4:10, 4), nrow=2, ncol=2)
771 |      chisq.test(d)
772 | }
773 | set.seed(1)
774 | f() ## no warning
775 | 
776 | set.seed(11)
777 | f() ## warning
778 | ```
779 | 
780 | We want to conditionally enter brower mode, when an element of `d` is
781 | smaller than 5.
782 | 
783 | ```{r, eval=FALSE}
784 | if (any(d < 5))
785 |   browser()
786 | ```
787 | 
788 | This expression must be executed at a specific location in our function `f`:
789 | 
790 | ```{r, eval=FALSE}
791 | as.list(body(f))
792 | ```
793 | 
794 | ```{r, eval=FALSE}
795 | trace("f", quote(if (any(d < 5)) browser()), at = 3)
796 | ```
797 | 
798 | We can now run our updated function `f`
799 | 
800 | ```{r, eval=FALSE}
801 | f
802 | body(f)
803 | ```
804 | 
805 | ```{r, eval=FALSE}
806 | set.seed(1)
807 | f() ## normal execution
808 | 
809 | set.seed(11)
810 | f() ## enters browser mode
811 | ```
812 | 
813 | ### Debugging S4 methods
814 | 
815 | > The `trace` function operates by constructing a revised version of
816 | > the function (or of the method, if ‘signature’ is supplied), and
817 | > assigning the new object back where the original was found.
818 | 
819 | ```{r, eval=FALSE}
820 | library("MSnbase")
821 | data(itraqdata)
822 | x <- itraqdata[[1]]
823 | plot(x, full=TRUE)
824 | ```
825 | 
826 | Not helpful:
827 | 
828 | ```{r, eval=FALSE}
829 | debug(plot)
830 | plot(x, full=TRUE)
831 | ```
832 | 
833 | Try again:
834 | 
835 | ```{r, eval=FALSE}
836 | trace("plot", browser, 
837 |       signature = c("Spectrum", "missing"))
838 | plot(x, full=TRUE)
839 | ```
840 | 
841 | 
842 | ## Unit testing
843 | 
844 | See [here](https://github.com/lgatto/2016-02-25-adv-programming-EMBL/blob/master/unittesting.md).
845 | 
846 | 


--------------------------------------------------------------------------------
/03-debug.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Part III: Debugging"
  3 | author: "Laurent Gatto"
  4 | ---
  5 | 
  6 | # Overview
  7 | 
  8 | - Defensive programming
  9 | - Debbugging: techniques and tools
 10 | - Condition handling
 11 | - Testing
 12 | 
 13 | # Defensive programming
 14 | 
 15 | Before we begin with debugging, let's look at ways to prevent bugs
 16 | (more at the end of this part). 
 17 | 
 18 | *Defensive programming*: 
 19 | - making the code work in a predicable manner
 20 | - writing code that fails in a well-defined manner
 21 | - if something *weird* happens, either properly deal with it, of fail
 22 |   quickly and loudly
 23 | 
 24 | The level of defensiveness will depend whether you write a function
 25 | for interactive of programmatic usage.
 26 | 
 27 | ## Talk to your users
 28 | 
 29 | ### Diagnostic messages
 30 | 
 31 | 
 32 | ```r
 33 | message("This is a message for our dear users.")
 34 | ```
 35 | 
 36 | 
 37 | ```r
 38 | message("This is a message for our dear users. ",
 39 | 	paste("Thank you for using our software",
 40 |               sw, "version", packageVersion(sw)))
 41 | ```
 42 | 
 43 | Do not use `print` or `cat`:
 44 | 
 45 | 
 46 | ```r
 47 | f1 <- function() {
 48 |     cat("I AM LOUD AND YOU CAN'T HELP IT.\n")
 49 |     ## do stuff
 50 |     invisible(TRUE)
 51 | }
 52 | f1()
 53 | ```
 54 | 
 55 | 
 56 | ```r
 57 | f2 <- function() {
 58 |     message("Sorry to interup, but...")
 59 |     ## do stuff
 60 |     invisible(TRUE)
 61 | }
 62 | f2()
 63 | suppressMessages(f2())
 64 | ```
 65 | 
 66 | Of course, it is also possible to manually define verbosity. This
 67 | makes you write more code for a feature readily available. But still
 68 | better to use `message`.
 69 | 
 70 | 
 71 | ```r
 72 | f3 <- function(verbose = TRUE) {
 73 |     if (verbose)
 74 |         message("I am being verbose because you let me.")
 75 |     ## do stuff
 76 |     invisible(TRUE)
 77 | }
 78 | f3()
 79 | f3(verbose = FALSE)
 80 | ```
 81 | 
 82 | ### Warning
 83 | 
 84 | > There is a problem with warnings. No one reads them. Pat Burns, in
 85 | > *R inferno*.
 86 | 
 87 | 
 88 | ```r
 89 | warning("Do not ignore me. Somthing bad might have happened.")
 90 | warning("Do not ignore me. Somthing bad might be happening.", immediate. = TRUE)
 91 | ```
 92 | 
 93 | 
 94 | ```r
 95 | f <- function(...)
 96 |     warning("Attention, attention, ...!", ...)
 97 | f()
 98 | f(call. = FALSE)
 99 | ```
100 | Print warnings after they have been thrown.
101 | 
102 | 
103 | ```r
104 | warnings()
105 | last.warning
106 | ```
107 | 
108 | See also to `warn` option in `?options` .
109 | 
110 | 
111 | ```r
112 | option("warn")
113 | ```
114 | 
115 | ### Error
116 | 
117 | 
118 | ```r
119 | stop("This is the end, my friend.")
120 | ```
121 | 
122 | 
123 | ```r
124 | log(c(2, 1, 0, -1, 2)); print('end') ## warning 
125 | xor(c(TRUE, FALSE));  print ('end')  ## error
126 | ```
127 | 
128 | Stop also has a `call.` parameter.
129 | 
130 | 
131 | ```r
132 | geterrmessage()
133 | ```
134 | 
135 | ### Logging
136 | 
137 | See for example the [`log4r`](https://github.com/johnmyleswhite/log4r)
138 | package:
139 | 
140 | 
141 | ```r
142 | ## Import the log4r package.
143 | library('log4r')
144 | 
145 | ## Create a new logger object with create.logger().
146 | logger <- create.logger()
147 | 
148 | ## Set the logger's file output: currently only allows flat files.
149 | logfile(logger) <- file.path('base.log')
150 | 
151 | ## Set the current level of the logger.
152 | level(logger) <- "INFO"
153 | 
154 | ## Try logging messages at different priority levels.
155 | debug(logger, 'A Debugging Message') ## Won't print anything
156 | info(logger, 'An Info Message')
157 | warn(logger, 'A Warning Message')
158 | error(logger, 'An Error Message')
159 | fatal(logger, 'A Fatal Error Message')
160 | ```
161 | 
162 | ### Progress bars
163 | 
164 | - `utils::txtProgressBar` function
165 | 
166 | 
167 | ```r
168 | n <- 10
169 | pb <- txtProgressBar(min = 0, max = n, style = 3)
170 | for (i in 1:n) {
171 |     setTxtProgressBar(pb, i)
172 |     Sys.sleep(0.5)
173 | }
174 | close(pb)
175 | ```
176 | 
177 | - [`progress`](https://github.com/gaborcsardi/progress) package
178 | 
179 | 
180 | ```r
181 | library("progress")
182 | pb <- progress_bar$new(total = n)
183 | for (i in 1:n) {
184 |     pb$tick()
185 |     Sys.sleep(0.5)
186 | }
187 | ```
188 | 
189 | Tip: do not over use progress bars. Ideally, a user should be
190 | confident that everything is under control and progress is made while
191 | waiting for a function to return. In my experience, a progress bar is
192 | usefull when there is a specific and/or user-defined number of
193 | iterations, such a *iterating over n files*, or *running a simulation
194 | n times*.
195 | 
196 | **Question**: What about mixing progress bars and verbosity.
197 | 
198 | 
199 | ## KISS
200 | 
201 | Keep your functions simple and stupid (and short). 
202 | 
203 | ## Failing fast and well
204 | 
205 | > Bounds errors are ugly, nasty things that should be stamped out
206 | > whenever possible. One solution to this problem is to use the
207 | > `assert` statement. The `assert` statement tells C++, "This can
208 | > never happen, but if it does, abort the program in a nice way." One
209 | > thing you find out as you gain programming experience is that things
210 | > that can "never happen" happen with alarming frequency. So just to
211 | > make sure that things work as they are supposed to, it’s a good idea
212 | > to put lots of self checks in your program. -- Practical C++
213 | > Programming, Steve Oualline, O'Reilly.
214 | 
215 | 
216 | ```r
217 | if (!condition) stop(...)
218 | ```
219 | 
220 | 
221 | ```r
222 | stopifnot(TRUE)
223 | stopifnot(TRUE, FALSE)
224 | ```
225 | 
226 | For example to test input classes, lengths, ...
227 | 
228 | 
229 | ```r
230 | f <- function(x) {
231 |     stopifnot(is.numeric(x), length(x) == 1)
232 |     invisible(TRUE)
233 | }
234 | 
235 | f(1)
236 | f("1")
237 | f(1:2)
238 | f(letters)
239 | ```
240 | 
241 | The [`assertthat`](https://github.com/hadley/assertthat) package:
242 | 
243 | 
244 | ```r
245 | x <- "1"
246 | library("assertthat")
247 | stopifnot(is.numeric(x))
248 | assert_that(is.numeric(x))
249 | assert_that(length(x) == 2)
250 | ```
251 | 
252 | * `assert_that()` signal an error.
253 | * `see_if()` returns a logical value, with the error message as an attribute.
254 | * `validate_that()` returns `TRUE` on success, otherwise returns the error as
255 |   a string.
256 | 
257 |   
258 |   
259 | * `is.flag(x)`: is x `TRUE` or `FALSE`? (a boolean flag)
260 | * `is.string(x)`: is x a length 1 character vector?
261 | * `has_name(x, nm)`, `x %has_name% nm`: does `x` have component `nm`?
262 | * `has_attr(x, attr)`, `x %has_attr% attr`: does `x` have attribute `attr`?
263 | * `is.count(x)`: is x a single positive integer?
264 | * `are_equal(x, y)`: are `x` and `y` equal?
265 | * `not_empty(x)`: are all dimensions of `x` greater than 0?
266 | * `noNA(x)`: is `x` free from missing values?
267 | * `is.dir(path)`: is `path` a directory?
268 | * `is.writeable(path)`/`is.readable(path)`: is `path` writeable/readable?
269 | * `has_extension(path, extension)`: does `file` have given `extension`?
270 | 
271 | 
272 | ## Consistency and predictability
273 | 
274 | Reminder of the interactive use vs programming examples: 
275 | - `[` and `drop` 
276 | - `sapply`, `lapply`, `vapply`
277 | 
278 | Remember also the concept of *tidy data*.
279 | 
280 | ## Comparisons
281 | 
282 | ### Floating point issues to be aware of
283 | 
284 | R FAQ [7.31](http://cran.r-project.org/doc/FAQ/R-FAQ.html#Why-doesn_0027t-R-think-these-numbers-are-equal_003f)?
285 | 
286 | 
287 | 
288 | ```r
289 | a <- sqrt(2)
290 | a * a == 2
291 | a * a - 2
292 | ```
293 | 
294 | 
295 | ```r
296 | 1L + 2L == 3L
297 | 1.0 + 2.0 == 3.0
298 | 0.1 + 0.2 == 0.3
299 | ```
300 | 
301 | ### Floating point: how to compare
302 | 
303 | - `all.equal` compares R objects for *near equality*. Takes into
304 |   account whether object attributes and names ought the taken into
305 |   consideration (`check.attributes` and `check.names` parameters) and
306 |   tolerance, which is machine dependent.
307 | 
308 | 
309 | ```r
310 | all.equal(0.1 + 0.2, 0.3)
311 | all.equal(0.1 + 0.2, 3.0)
312 | isTRUE(all.equal(0.1 + 0.2, 3)) ## when you just want TRUE/FALSE
313 | ```
314 | 
315 | ### Exact identity
316 | 
317 | `identical`: test objects for exact equality
318 | 
319 | 
320 | ```r
321 | 1 == NULL
322 | all.equal(1, NULL)
323 | identical(1, NULL)
324 | identical(1, 1.)   ## TRUE in R (both are stored as doubles)
325 | all.equal(1, 1L)
326 | identical(1, 1L)   ## stored as different types
327 | ```
328 | 
329 | Appropriate within `if`, `while` condition statements. (not
330 | `all.equal`, unless wrapped in `isTRUE`).
331 | 
332 | ## Exercise
333 | 
334 | (From [adv-r](http://adv-r.had.co.nz/Exceptions-Debugging.html#defensive-programming).)
335 | 
336 | The `col_means` function computes the means of all numeric columns in
337 | a data frame.
338 | 
339 | 
340 | ```r
341 | col_means <- function(df) {
342 |   numeric <- sapply(df, is.numeric)
343 |   numeric_cols <- df[, numeric]
344 |   data.frame(lapply(numeric_cols, mean))
345 | }
346 | ```
347 | 
348 | Is it a robust function? What happens if there are unusual inputs.
349 | 
350 | 
351 | ```r
352 | col_means(mtcars)
353 | col_means(mtcars[, 0])
354 | col_means(mtcars[0, ])
355 | col_means(mtcars[, "mpg", drop = FALSE])
356 | col_means(1:10)
357 | col_means(as.matrix(mtcars))
358 | col_means(as.list(mtcars))
359 | 
360 | mtcars2 <- mtcars
361 | mtcars2[-1] <- lapply(mtcars2[-1], as.character)
362 | col_means(mtcars2)
363 | ```
364 | # Debugging: techniques and tools
365 | 
366 | ### Shit happens
367 | 
368 | > Funding your bug is a process of confirming the many things that you
369 | > believe are true - until you find one which is not true. -- Norm Matloff
370 | 
371 | #### 1. Identify the bug (the difficult part)
372 | - Something went wrong!
373 | - Where in the code does it happen?
374 | - Does it happen every time?
375 | - What input triggered it?
376 | - Report it (even if it is in your code - use github issues, for
377 |   example).
378 | 
379 | **Tip**: Beware of your intuition. As a scientist, do what you are
380 | used to: generate a hypotheses, *design an experiment* to test them,
381 | and record the results.
382 | 
383 | #### 2. Fix it (the less difficult part)
384 | - Correct the bug.
385 | - Make sure that bug will not repeat itself!
386 | - How can we be confident that we haven't introduced new bugs?
387 | 
388 | ## Tools
389 | 
390 | - `print`/`cat`
391 | - `traceback()`
392 | - `browser()`
393 | - `options(error = )`, `options(warn = )`
394 | - `trace`
395 | - IDE: RStudio, StatET, emacs' ess tracebug.
396 | 
397 | 
398 | ### Manually
399 | 
400 | Inserting `print` and `cat` statements in the code. Works, but time
401 | consuming. 
402 | 
403 | ### Finding the bug
404 | 
405 | > Many bugs are subtle and hard to find. -- Hadley Wickham
406 | 
407 | Bugs are shy, and are generally hidden, deep down in your code, to
408 | make it as difficult as possible for you to find them.
409 | 
410 | 
411 | ```r
412 | e <- function(i) {
413 |   x <- 1:4
414 |   if (i < 5) x[1:2]
415 |   else x[-1:2]
416 | }
417 | f <- function() sapply(1:10, e)
418 | g <- function() f()
419 | ```
420 | 
421 | `traceback`: lists the sequence of calls that lead to the error
422 | 
423 | 
424 | ```r
425 | g()
426 | traceback()
427 | ```
428 | 
429 | If the source code is available (for example for `source()`d code),
430 | then traceback will display the exact location in the function, in the
431 | form `filename.R#linenum`.
432 | 
433 | ### Browsing the error
434 | 
435 | - Register the function for debugging: `debug(g)`. This adds a call to
436 |   the `browser()` function (see also below) and the very beginning of
437 |   the function `g`.
438 |   
439 | - Every call to `g()` will not be run interactively.
440 | 
441 | - To finish debugging: `undebug(g)`.
442 | 
443 | 
444 | 
445 | ```r
446 | debug(g)
447 | g()
448 | ```
449 | 
450 | How to debug:
451 | 
452 | - `n` executes the next step of the function. Use `print(n)` or
453 |   `get(n)` to print/access the variable `n`.
454 | - `s` to step into the next function. If it is not a function, same as
455 |   `n`.
456 | - `f` to finish execution of the current loop of function.
457 | - `c` to leave interactive debugging and continue regular execution of
458 |   the function. 
459 | - `Q` to stop debugging, terminate the function and return to the
460 |   global workspace.
461 | - `where` print a stack trace of all active function calls.
462 | - `Enter` same as `n` (or `s`, if it was used most recently), unless
463 |   `options(browserNLdisabled = TRUE)` is set.
464 | 
465 | To fix a function when the source code is not directly available, use
466 | `fix(fun)`. This will open the function's source code for editing and,
467 | after saving and closing, store the updated function in the global
468 | workspace.
469 | 
470 | ### Breakpoints
471 | 
472 | - Add a call to `browser()` anywhere in the source code to execute the
473 |   rest of the code interactively.
474 |   
475 | - To run breakpoints conditionally, wrap the call to `browser()` in a
476 |   condition.
477 | 
478 | ### Setting options
479 | 
480 | - `options(error = recover)`: display the call stack and choose where
481 |   to step in.
482 | 
483 | - `options(error = stop)` or `options(error = NULL)`: reset to default
484 |   behaviour.
485 | 
486 | - `options(warn = 'numeric')`: sets the handling of warning
487 |   messages. If `warn` is negative all warnings are ignored.  If `warn`
488 |   is zero (the default) warnings are stored until the top-level
489 |   function returns.  If 10 or fewer warnings were signalled they will
490 |   be printed otherwise a message saying how many were signalled. An
491 |   object called `last.warning` is created and can be printed through
492 |   the function `warnings`.  If `warn` is one, warnings are printed as
493 |   they occur.  If `warn` is two or larger all warnings are turned into
494 |   errors.
495 | 
496 | - `options(error = dump.frames)`: like `recover` but for
497 |   non-interactive use. Will create a `last.dump.rda` file in the
498 |   current working directory, which can then be reloaded in an
499 |   interactive session to re-inter interactive debugging (using
500 |   `debugger()`).
501 | 
502 | ### Debugging with IDEs
503 | 
504 | - RSudio: `Show Traceback`, `Rerun with Debug` and interactive debugging.
505 | 
506 | ![RStudio debugging 1](./figs/debugRStudio1.png)
507 | ![RStudio debugging 2](./figs/debugRStudio2.png)
508 | 
509 | - StatET (Eclipse plugin)
510 | 
511 | - [emacs ESS and tracebug](http://ess.r-project.org/Manual/ess.html#Developing-with-ESS)
512 | 
513 | ### Exercise
514 | 
515 | 1. Your turn - play with `traceback`, `recover` and `debug`:
516 | 
517 | (Example originally by Martin Morgan and Robert Gentleman.)
518 | 
519 | 
520 | ```r
521 | e <- function(i) {
522 |   x <- 1:4
523 |   if (i < 5) x[1:2]
524 |   else x[-1:2] # oops! x[-(1:2)]
525 | }
526 | f <- function() sapply(1:10, e)
527 | g <- function() f()
528 | ```
529 | 
530 | 2. Fix `readFasta2`.
531 | 
532 | 
533 | ```r
534 | ## make sure you have the 'sequences' package.
535 | ## Get readFasta2, the function to debug
536 | library(devtools)
537 | install_github("lgatto/sequences") ## from github
538 | ## or 
539 | install.packages("sequences") ## from CRAN
540 | library("sequences")
541 | sequences:::debugme()
542 | ## Get an example file
543 | f <- dir(system.file("extdata", package = "sequences"),
544 |          full.names=TRUE, pattern = "moreDnaSeqs.fasta")
545 | ## BANG!
546 | readFasta2(f)
547 | ```
548 | 
549 | ## Condition handling
550 | 
551 | ### `try` and `tryCatch`
552 | 
553 | The function `f` will never terminate.
554 | 
555 | 
556 | ```r
557 | f <- function() {
558 |     x <- "1"
559 |     log(x)
560 |     message("x was the ", class(x), " ", x)
561 | }
562 | f()
563 | ```
564 | 
565 | Use `try` to proceed with the execution even when an error occurs.
566 | 
567 | 
568 | ```r
569 | f <- function() {
570 |     x <- "1"
571 |     try(log(x))
572 |     message("x was the ", class(x), " ", x)
573 | }
574 | f()
575 | ```
576 | 
577 | 
578 | ```r
579 | try({
580 |     a <- 1
581 |     b <- "2"
582 |     a + b
583 | })
584 | ```
585 | 
586 | In case of error, `try` returns a object of class `try-error`:
587 | 
588 | 
589 | ```r
590 | success <- try(1 + 2)
591 | failure <- try(1 + "2", silent = TRUE)
592 | class(success)
593 | class(failure)
594 | ```
595 | 
596 | 
597 | ```r
598 | inherits(failure, "try-error")
599 | 
600 | if (inherits(failure, "try-error"))
601 | 	message("There was an error here.")
602 | ```
603 | Handling errors is particularly useful to iterate over all elements of
604 | an input, despite errors (and inspecting/handling/fixing the errors
605 | afterwards).
606 | 
607 | 
608 | ```r
609 | el <- list(1:10, c(-1, 1), TRUE, "1")
610 | res <- lapply(el, log)
611 | res
612 | res <- lapply(el, function(x) try(log(x)))
613 | res
614 | ```
615 | 
616 | #### Hadley's tip
617 | 
618 | > Another useful `try()` idiom is using a default value if an
619 | > expression fails. Simply assign the default outside the `try` block,
620 | > and then run the risky code:
621 | 
622 | 
623 | ```r
624 | default <- NULL
625 | try(default <- read.csv("possibly-bad-input.csv"), silent = TRUE)
626 | ```
627 | 
628 | > There is also `plyr::failwith()`, which makes this strategy even
629 | > easier to implement.
630 | 
631 | 
632 | ```r
633 | f <- function(x)
634 |     if (x == 1) stop("Error!") else 1
635 | 
636 | f(1)
637 | f(2)
638 |      
639 |      
640 | safef <- failwith(NULL, f)
641 | safef(1)
642 | safef(2)
643 | ```
644 | 
645 | Use `tryCatch` to specify a behaviour (handler function) in case of
646 | error, warning or message.
647 | 
648 | 
649 | ```r
650 | f <- function() {
651 |     x <- "1"
652 |     tryCatch(log(x),
653 |              error = function(e) cat("There was an error!\n"))
654 |     message("x was the ", class(x), " ", x)
655 | }
656 | f()
657 | ```
658 | 
659 | More example from Hadleys' *Advanced R* book.
660 | 
661 | 
662 | ```r
663 | show_condition <- function(code) {
664 |   tryCatch(code,
665 |     error = function(c) "error",
666 |     warning = function(c) "warning",
667 |     message = function(c) "message"
668 |   )
669 | }
670 | show_condition(stop("!"))
671 | show_condition(warning("?!"))
672 | show_condition(message("?"))
673 | show_condition(0)
674 | ```
675 | 
676 | A more informative `read.csv` version:
677 | 
678 | 
679 | ```r
680 | read.csv2 <- function(file, ...) {
681 |   tryCatch(read.csv(file, ...), error = function(c) {
682 |     c$message <- paste0(c$message, " (in ", file, ")")
683 |     stop(c)
684 |   })
685 | }
686 | read.csv("code/dummy.csv")
687 | read.csv2("code/dummy.csv")
688 | ```
689 | 
690 | `tryCatch` has a `finally` argument that specifies a code block to be
691 | executed regardless of whether the initial expression succeeds or
692 | fails. Usefull, for example, to clean up (deleting files, closing
693 | connections, ...).
694 | 
695 | 
696 | ### `withCallingHandlers`
697 | 
698 | 
699 | The `withCallingHandlers` function allows to defined special behaviour
700 | in case of *unusual conditions*, including warnings and errors. In the
701 | example below, we start a browser in case of (obscure) warnings.
702 | 
703 | 
704 | ```r
705 | f <- function(x = 10) {
706 |     lapply(seq_len(x), function(i) {
707 |         ## make an example 2x2 contingency table
708 |         d <- matrix(sample(4:10, 4), nrow = 2, ncol = 2)
709 |         ## will produce warning if there is a 5 or less 
710 |         ## in the contingency table
711 |         chisq.test(d)
712 |     })
713 | }
714 | ```
715 | 
716 | 
717 | ```r
718 | set.seed(1)
719 | f()
720 | set.seed(1)
721 | withCallingHandlers(f(), warning=function(e) recover())
722 | ```
723 | 
724 | ### Difference between `tryCatch` and `withCallingHandlers`
725 | 
726 | (From [*Advanced R*](http://adv-r.had.co.nz/Exceptions-Debugging.html#condition-handling))
727 | 
728 | The handlers in `withCallingHandlers()` are called in the context of
729 | the call that generated the condition whereas the handlers in
730 | `tryCatch()` are called in the context of `tryCatch()`. This is shown
731 | here with `sys.calls()`, which is the run-time equivalent of
732 | `traceback()` -- it lists all calls leading to the current function.
733 | 
734 | 
735 | ```r
736 | f <- function() g()
737 | g <- function() h()
738 | h <- function() stop("!")
739 | 
740 | tryCatch(f(), error = function(e) print(sys.calls()))
741 | withCallingHandlers(f(), error = function(e) print(sys.calls()))
742 | ```
743 | 
744 | ### Debugging at the C level with `ddd` or `gdb`
745 | 
746 | Demo
747 | 
748 | ### Exercise
749 | 
750 | 
751 | 
752 | 
753 | Write a new `safelog` function that catches and handles errors and
754 | warnings to emulate the following behaviour.
755 | 
756 | 
757 | ```r
758 | log(1)
759 | ```
760 | 
761 | ```
762 | ## [1] 0
763 | ```
764 | 
765 | ```r
766 | safelog(1)
767 | ```
768 | 
769 | ```
770 | ## [1] 0
771 | ```
772 | 
773 | ```r
774 | log(-1)
775 | ```
776 | 
777 | ```
778 | ## Warning in log(-1): NaNs produced
779 | ```
780 | 
781 | ```
782 | ## [1] NaN
783 | ```
784 | 
785 | ```r
786 | safelog(-1)
787 | ```
788 | 
789 | ```
790 | ## [1] "a warning with input -1"
791 | ```
792 | 
793 | ```r
794 | log("a")
795 | ```
796 | 
797 | ```
798 | ## Error in log("a"): non-numeric argument to mathematical function
799 | ```
800 | 
801 | ```r
802 | safelog("a")
803 | ```
804 | 
805 | ```
806 | ## [1] "an error with input a"
807 | ```
808 | 
809 | **Answer**
810 | 
811 | 
812 | ```r
813 | safelog <- function(x) {
814 |   tryCatch(log(x),
815 |            error = function(e) paste("an error with input", x),
816 |            warning = function(e) paste("a warning with input", x))
817 | }
818 | ```
819 | 
820 | ## Tracing code
821 | 
822 | From `?trace`:
823 | 
824 | > A call to `trace` allows you to insert debugging code (e.g., a call
825 | > to `browser` or `recover`) at chosen places in any function. A call
826 | > to `untrace` cancels the tracing.
827 | 
828 | 
829 | ```r
830 | ## Report whenever e invoked
831 | trace(sum)
832 | hist(rnorm(100))
833 | untrace(sum)
834 | ```
835 | 
836 | 
837 | ```r
838 | ## Evaluate arbitrary code whenever e invoked
839 | trace(e, quote(cat("i am", i, "\n")))
840 | ## Another way to enter browser whenver e invoked
841 | trace(e, browser)
842 | ## stop tracing
843 | untrace(e)
844 | ```
845 | 
846 | ### Inserting code dynamically
847 | 
848 | > The `trace` function operates by constructing a revised version of
849 | > the function (or of the method, if ‘signature’ is supplied), and
850 | > assigning the new object back where the original was found.
851 | 
852 | 
853 | ```r
854 | f <- function() {
855 |     ## make an example 2x2 contingency table
856 |     d <- matrix(sample(4:10, 4), nrow=2, ncol=2)
857 |      chisq.test(d)
858 | }
859 | set.seed(1)
860 | f() ## no warning
861 | 
862 | set.seed(11)
863 | f() ## warning
864 | ```
865 | 
866 | We want to conditionally enter brower mode, when an element of `d` is
867 | smaller than 5.
868 | 
869 | 
870 | ```r
871 | if (any(d < 5))
872 |   browser()
873 | ```
874 | 
875 | This expression must be executed at a specific location in our function `f`:
876 | 
877 | 
878 | ```r
879 | as.list(body(f))
880 | ```
881 | 
882 | 
883 | ```r
884 | trace("f", quote(if (any(d < 5)) browser()), at = 3)
885 | ```
886 | 
887 | We can now run our updated function `f`
888 | 
889 | 
890 | ```r
891 | f
892 | body(f)
893 | ```
894 | 
895 | 
896 | ```r
897 | set.seed(1)
898 | f() ## normal execution
899 | 
900 | set.seed(11)
901 | f() ## enters browser mode
902 | ```
903 | 
904 | ### Debugging S4 methods
905 | 
906 | > The `trace` function operates by constructing a revised version of
907 | > the function (or of the method, if ‘signature’ is supplied), and
908 | > assigning the new object back where the original was found.
909 | 
910 | 
911 | ```r
912 | library("MSnbase")
913 | data(itraqdata)
914 | x <- itraqdata[[1]]
915 | plot(x, full=TRUE)
916 | ```
917 | 
918 | Not helpful:
919 | 
920 | 
921 | ```r
922 | debug(plot)
923 | plot(x, full=TRUE)
924 | ```
925 | 
926 | Try again:
927 | 
928 | 
929 | ```r
930 | trace("plot", browser, 
931 |       signature = c("Spectrum", "missing"))
932 | plot(x, full=TRUE)
933 | ```
934 | 
935 | 
936 | ## Unit testing
937 | 
938 | See [here](https://github.com/lgatto/2016-02-25-adv-programming-EMBL/blob/master/unittesting.md).
939 | 
940 | 


--------------------------------------------------------------------------------
/04-perf.R:
--------------------------------------------------------------------------------
  1 | 
  2 | x <- runif(100)
  3 | system.time(sqrt(x))
  4 | system.time(x^0.5)
  5 | 
  6 | 
  7 | x <- runif(1e5)
  8 | system.time(sqrt(x))
  9 | system.time(x^0.5)
 10 | 
 11 | 
 12 | summary(replicate(10, system.time(x^0.5)[["elapsed"]]))
 13 | 
 14 | 
 15 | x <- runif(100)
 16 | library(microbenchmark)
 17 | 
 18 | microbenchmark(sqrt(x),
 19 |                x ^ 0.5)
 20 | 
 21 | 
 22 | f <- function() 1
 23 | f()
 24 | body(f)
 25 | body(f) <- 2
 26 | f()
 27 | formals(f) <- pairlist(x = 1)
 28 | body(f) <- quote(x + 1)
 29 | f()
 30 | f(10)
 31 | 
 32 | 
 33 | setMethod(...)
 34 | 
 35 | 
 36 | ctl <- c(4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14)
 37 | trt <- c(4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69)
 38 | group <- gl(2, 10, 20, labels = c("Ctl","Trt"))
 39 | weight <- c(ctl, trt)
 40 | mod <- lm(weight ~ group)
 41 | mod
 42 | mod$foo <- 1
 43 | mod
 44 | summary(mod)
 45 | names(mod)
 46 | 
 47 | 
 48 | class(mod) <- c("mod", "foo")
 49 | 
 50 | 
 51 | ## function
 52 | f <- function(x) NULL
 53 | 
 54 | ## S3 method
 55 | s3 <- function(x) UseMethod("s3")
 56 | s3.integer <- f
 57 | 
 58 | ## S4 method
 59 | A <- setClass("A", slots = c(a = "list"))
 60 | setGeneric("s4", function(x) standardGeneric("s4"))
 61 | setMethod("s4", "A", f)
 62 | a <- A()
 63 | 
 64 | ## S4 Reference Class
 65 | B <- setRefClass("B", methods = list(rc = f))
 66 | b <- B$new()
 67 | 
 68 | 
 69 | microbenchmark(
 70 |      fun = f(),
 71 |      S3 = s3(1L),
 72 |      S4 = s4(a),
 73 |      RC = b$rc()
 74 | )
 75 | 
 76 | 
 77 | a <- 1
 78 | f <- function() {
 79 |     g <- function() {
 80 |         print(a) ## from global workspace
 81 |         assign("a", 2, envir = parent.frame())
 82 |         print(a) ## f's environment
 83 |         a <- 3
 84 |         print(a) ## g's environment
 85 |     }
 86 |     g()
 87 | }
 88 | f()
 89 | 
 90 | 
 91 | f <- function(x, y) {
 92 |     (x + y) ^ 2
 93 | }
 94 | 
 95 | 
 96 | microbenchmark(
 97 |     "[32, 11]"      = mtcars[32, 11],
 98 |     "$carb[32]"     = mtcars$carb[32],
 99 |     "[[c(11, 32)]]" = mtcars[[c(11, 32)]],
100 |     "[[11]][32]"    = mtcars[[11]][32],
101 |     ".subset"       = .subset(mtcars, 11)[32],
102 |     ".subset2"      = .subset2(mtcars, 11)[32]
103 | )
104 | 
105 | 
106 | x <- runif(100)
107 | all.equal(sqrt(x), x ^ 0.5)
108 | 
109 | 
110 | library("sequences")
111 | gccount
112 | gccountr <- function(x) table(strsplit(x, "")[[1]])
113 | gccountr2 <- function(x) tabulate(factor(strsplit(x, "")[[1]]))
114 | 
115 | 
116 | s <- paste(sample(c("A", "C", "G", "T"),
117 |                   100, replace = TRUE),
118 |            collapse = "")
119 | 
120 | gccount(s)
121 | gccountr(s)
122 | gccountr2(s)
123 | 
124 | 
125 | library("microbenchmark")
126 | microbenchmark(gccount(s),
127 |                      gccountr(s),
128 |                      gccountr2(s),
129 |                      times = 1e4,
130 | 					 unit = "eps")
131 | 
132 | 
133 | library("ggplot2")
134 | mb <- microbenchmark(gccount(s),
135 |                      gccountr(s),
136 |                      gccountr2(s))
137 | print(mb)
138 | microbenchmark:::autoplot.microbenchmark(mb)
139 | 
140 | 
141 | m <- matrix(rnorm(1e6), ncol = 10)
142 | 
143 | Rprof("rprof")
144 | res <- apply(m, 1, mean, trim=.3)
145 | Rprof(NULL)
146 | summaryRprof("rprof")
147 | 
148 | 
149 | f <- function() {
150 |   pause(0.1)
151 |   g()
152 |   h()
153 | }
154 | 
155 | g <- function() {
156 |   pause(0.1)
157 |   h()
158 | }
159 | 
160 | h <- function() {
161 |   pause(0.1)
162 | }
163 | 
164 | 
165 | 
166 | library("profvis")
167 | source("lineprof-example.R")
168 | profvis(f())
169 | 
170 | 
171 | profvis({
172 |     m <- matrix(rnorm(1e6), ncol = 10)
173 |     res <- apply(m, 1, mean, trim=.3)
174 |     sum(res)
175 | })
176 | 
177 | 
178 | make_id2GO <- function(n = 1e3) { ## could be 1e4 - 1e5
179 |     gn <- sprintf(paste0("ENSG%0", 10, "d"), sample(1e6, n))
180 |     goid <- function(n = 10) sprintf(paste0("GO:%0", 10, "d"), sample(1e6, n))
181 |     structure(replicate(n, goid(sample(50, 1))),
182 |               names = gn)
183 | }
184 | id2GO <- make_id2GO()
185 | 
186 | 
187 | length(id2GO)
188 | str(head(id2GO))
189 | str(unlist(id2GO))
190 | 
191 | 
192 | library(microbenchmark)
193 | microbenchmark(unlist(l),
194 |                unlist(l, use.names = FALSE),
195 |                times = 10)
196 | 
197 | 
198 | f1 <- function(n) {
199 |   a <- NULL 
200 |   for (i in 1:n) a <- c(a, sqrt(i))
201 |   a
202 | }
203 | 
204 | f2 <- function(n) {
205 |   a <- numeric(n)
206 |   for (i in 1:n) a[i] <- sqrt(i)
207 |   a
208 | }
209 | 
210 | 
211 | microbenchmark(f1(1e3), f2(1e3))
212 | microbenchmark(f1(1e4), f2(1e4))
213 | 
214 | 
215 | e <- new.env()
216 | e$x <- 1
217 | f <- function(myenv) myenv$x <- 2
218 | f(e)
219 | e$x
220 | 
221 | 
222 | f3 <- function(n)
223 |   sapply(seq_len(n), sqrt)
224 | 
225 | f4 <- function(n) sqrt(n)
226 | 
227 | 
228 | n <- 10^(2:5)
229 | t1 <- sapply(n, function(.n) system.time(f1(.n))[["elapsed"]])
230 | t2 <- sapply(n, function(.n) system.time(f2(.n))[["elapsed"]])
231 | t3 <- sapply(n, function(.n) system.time(f3(.n))[["elapsed"]])
232 | t4 <- sapply(n, function(.n) system.time(f4(.n))[["elapsed"]])
233 | 
234 | elapsed <- data.frame(t1, t2, t3, t4)
235 | rownames(elapsed) <- n
236 | 
237 | colnames(elapsed) <-
238 |     c("for loop\nwithout init",
239 |       "for loop\nwith init",
240 |       "wrapped in\napply",
241 |       "built-in sqrt\n(vectorised)")
242 | 	
243 | suppressPackageStartupMessages(library("grid"))
244 | suppressPackageStartupMessages(library("reshape2"))
245 | suppressPackageStartupMessages(library("scales"))
246 | suppressPackageStartupMessages(library("ggplot2"))
247 | 
248 | mainvp <- viewport(width = 1,
249 |                    height = 1,
250 |                    x = 0.5, y = 0.5)
251 | subvp <- viewport(width = 6/9,
252 |                   height = 5/9,
253 |                   x = .1,
254 |                   y = .95,
255 |                   just = c("left","top"))
256 | df <- melt(elapsed)
257 | colnames(df) <- c("Implementation", "Elapsed")
258 | df$Iterations <- rep(n, 4)
259 | ymax <- max(elapsed[, -1])
260 | p <- ggplot(data=df, aes(x=Iterations, y=Elapsed, col=Implementation)) +
261 |     geom_line() + geom_point() +
262 |         theme(legend.position="bottom") +
263 |             scale_x_continuous(trans=log10_trans()) +
264 |                 coord_trans(x="log2")
265 | q <- p + coord_cartesian(ylim=c(0, (ymax+.05))) +
266 |     theme_gray(8) +
267 |         labs(x = NULL, y = NULL) +
268 |             theme(plot.margin = unit(rep(0.3, 4), "lines")) +
269 |                 theme(legend.position="none")
270 | print(p, vp = mainvp)
271 | print(q, vp = subvp)
272 | 
273 | 
274 | lapply2 <- function(x, f, ...) {
275 |   out <- vector("list", length(x))
276 |   for (i in seq_along(x)) {
277 |     out[[i]] <- f(x[[i]], ...)
278 |   }
279 |   out
280 | }
281 | 
282 | lapply2_c <- compiler::cmpfun(lapply2)
283 | 
284 | x <- list(1:10, letters, c(FALSE, TRUE), NULL)
285 | 
286 | 
287 | microbenchmark(
288 |   lapply2(x, is.null),
289 |   lapply2_c(x, is.null),
290 |   lapply(x, is.null))
291 | 
292 | 
293 | library("pryr")
294 | library("profvis")
295 | 
296 | 
297 | x <- 1:1e5
298 | object.size(x)
299 | print(object.size(x), units = "Kb")
300 | object_size(x)
301 | 
302 | 
303 | ll <- list(x, x, x)
304 | print(object.size(ll), units = "Kb")
305 | object_size(ll)
306 | 
307 | 
308 | x <- 1:1e6
309 | y <- list(1:1e6, 1:1e6, 1:1e6)
310 | object_size(x)
311 | object_size(y)
312 | 
313 | 
314 | e <- new.env()
315 | object.size(e)
316 | object_size(e)
317 | e$a <- 1:1e6
318 | object.size(e)
319 | object_size(e)
320 | 
321 | 
322 | mem_used()
323 | 
324 | 
325 | mem_change(v <- 1:1e6)
326 | mem_change(rm(v))
327 | 
328 | 
329 | rm(list = ls())
330 | mem_change(x <- 1:1e6)
331 | mem_change(y <- x)
332 | mem_change(rm(x))
333 | mem_change(rm(y))
334 | 
335 | 
336 | x <- 1:10
337 | c(address(x), refs(x))
338 | x[5] <- 0L
339 | c(address(x), refs(x))
340 | 
341 | 
342 | y <- x
343 | c(address(x), refs(x))
344 | c(address(y), refs(y))
345 | 
346 | x[5] <- 1L
347 | c(address(x), refs(x))
348 | c(address(y), refs(y))
349 | 
350 | 
351 | x <- 1:5
352 | y <- x
353 | c(address(x), refs(x))
354 | rm(y)
355 | c(address(x), refs(x)) ## should be 1
356 | 
357 | x <- 1:5
358 | y <- x
359 | z <- x
360 | c(address(x), refs(x)) ## should be 3
361 | 
362 | 
363 | x <- 1:10
364 | tracemem(x)
365 | x[5] <- 0L
366 | 
367 | y <- x
368 | x[5] <- 10L
369 | 
370 | address(x)
371 | address(y)
372 | 
373 | 
374 | f1 <- function(n) {
375 |   a <- NULL
376 |   for (i in 1:n) {
377 |       a <- c(a, sqrt(i))
378 |       print(address(a))
379 |   }
380 |   invisible(a)
381 | }
382 | 
383 | f2 <- function(n) {
384 |   a <- numeric(n)
385 |   for (i in 1:n) {
386 |       a[i] <- sqrt(i)
387 |       print(address(a))
388 |   }
389 |   invisible(a)
390 | }
391 | 
392 | 


--------------------------------------------------------------------------------
/04-perf.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Part IV: Performance"
  3 | author: "Laurent Gatto"
  4 | ---
  5 | 
  6 | # Overview
  7 | 
  8 | - Benchmarking
  9 | - Profiling
 10 | - Optimisation
 11 | - Memory
 12 | - Rcpp
 13 | 
 14 | # A word of caution
 15 | 
 16 | Knuth, Donald. *Structured Programming with `go to` Statements*, ACM
 17 | Journal Computing Surveys, Vol 6, No. 4, Dec. 1974. p.268.
 18 | 
 19 | > We should forget about small efficiencies, say about 97% of the
 20 | > time: premature optimization is the root of all evil. Yet we should
 21 | > not pass up our opportunities in that critical 3%. A good programmer
 22 | > will not be lulled into complacency by such reasoning, he will be
 23 | > wise to look carefully at the critical code; but only after that
 24 | > code has been identified
 25 | 
 26 | 
 27 | Robert Gentleman, in R Programming for Bioinformatics, 2008,
 28 | about R's built-in C interfaces:
 29 | 
 30 | > Since R is not compiled, in some situations its performance can be
 31 | > substantially improved by writing code in a compiled language. There
 32 | > are also reasons not to write code in other languages, and in
 33 | > particular we caution against premature optimization, prototyping in
 34 | > R is often cost effective. And in our experience very few routines
 35 | > need to be implemented in other languages for efficiency
 36 | > reasons. Another substantial reason not to use an implementation in
 37 | > some other language is increased complexity. The use of another
 38 | > language almost always results in higher maintenance costs and less
 39 | > stability. In addition, any extensions or enhancements of the code
 40 | > will require someone that is proficient in both R and the other
 41 | > language.
 42 | 
 43 | (`Rcpp` does make some of the above caution statements slightly less
 44 | critical.)
 45 | 
 46 | # R performance
 47 | 
 48 | R is not a fast language, but it is most of the time *fast enough* for
 49 | what we want to do, in particular with respect to interactive data
 50 | analysis. In such cases a slow but expressive and flexible language is
 51 | way better than a fast but less expressive and flexible
 52 | alternative. It is also relatively easy to avoid bad R programming
 53 | idioms that make code too slow.
 54 | 
 55 | ## Timing, benchmarking
 56 | 
 57 | Let's compare two implementation of the square root calculation:
 58 | `sqrt(x)` and `x ^ 0.5`.
 59 | 
 60 | 
 61 | ```r
 62 | x <- runif(100)
 63 | system.time(sqrt(x))
 64 | system.time(x^0.5)
 65 | ```
 66 | 
 67 | Does this work? Try
 68 | 
 69 | 
 70 | ```r
 71 | x <- runif(1e5)
 72 | system.time(sqrt(x))
 73 | system.time(x^0.5)
 74 | ```
 75 | 
 76 | We want to repeat timings multiple times:
 77 | 
 78 | 
 79 | ```r
 80 | summary(replicate(10, system.time(x^0.5)[["elapsed"]]))
 81 | ```
 82 | 
 83 | A better approach for such cases is the
 84 | [`microbenchmark` package](https://cran.rstudio.com/web/packages/microbenchmark/index.html),
 85 | which is ideal to accurately benchmark small pieces of code, in
 86 | particular sub-millisecond (nanoseconds) executions (see units below). 
 87 | 
 88 | Each expression is run 100 times (controlled by the `times`
 89 | argument). In addition, the execution order is randomised and summary
 90 | timings are reported.
 91 | 
 92 | 
 93 | ```r
 94 | x <- runif(100)
 95 | library(microbenchmark)
 96 | 
 97 | microbenchmark(sqrt(x),
 98 |                x ^ 0.5)
 99 | ```
100 | 
101 | **Question**: where does this difference come from?
102 | 
103 | <!-- ```{r, eval=FALSE} -->
104 | <!-- sqrt ## 1 argument -->
105 | <!-- `^`  ## 2 arguments -->
106 | <!-- as.list(body(function(x) x^0.5))   ## 2 symbols -->
107 | <!-- as.list(body(function(x) sqrt(x))) ## 3 symbols -->
108 | <!-- ``` -->
109 | 
110 | Below are some illustrations of R's language designs and
111 | implementations that account for some cuts in performance (taken from
112 | *Advanced R*).
113 | 
114 | ## Extreme dynamism 
115 | 
116 | R is an extremely dynamics language where almost any symbol (there are
117 | a few reserved key words such as `TRUE`, `if`, `for`, ..., package
118 | namespaces, locked environment with locked binding) can be modified at
119 | any points; in particular
120 | 
121 | 
122 | - body, arguments, and environments of functions
123 | 
124 | 
125 | ```r
126 | f <- function() 1
127 | f()
128 | body(f)
129 | body(f) <- 2
130 | f()
131 | formals(f) <- pairlist(x = 1)
132 | body(f) <- quote(x + 1)
133 | f()
134 | f(10)
135 | ```
136 | 
137 | - Change the S4 methods for a generic
138 | 
139 | 
140 | ```r
141 | setMethod(...)
142 | ```
143 | 
144 | - Add new fields to an S3 object
145 | 
146 | 
147 | ```r
148 | ctl <- c(4.17,5.58,5.18,6.11,4.50,4.61,5.17,4.53,5.33,5.14)
149 | trt <- c(4.81,4.17,4.41,3.59,5.87,3.83,6.03,4.89,4.32,4.69)
150 | group <- gl(2, 10, 20, labels = c("Ctl","Trt"))
151 | weight <- c(ctl, trt)
152 | mod <- lm(weight ~ group)
153 | mod
154 | mod$foo <- 1
155 | mod
156 | summary(mod)
157 | names(mod)
158 | ```
159 | 
160 | - Change the class of an object
161 | 
162 | 
163 | ```r
164 | class(mod) <- c("mod", "foo")
165 | ```
166 | - modify objects outside or the local environment with `<<-`, or
167 |   anywhere with `assign(..., envir)`.
168 | 
169 | While this dynamism gives a lot of flexibility, the interpreter can
170 | not make any assumptions or optimisations. For example, let's assess
171 | the cost of methods lookup:
172 | 
173 | 
174 | ```r
175 | ## function
176 | f <- function(x) NULL
177 | 
178 | ## S3 method
179 | s3 <- function(x) UseMethod("s3")
180 | s3.integer <- f
181 | 
182 | ## S4 method
183 | A <- setClass("A", slots = c(a = "list"))
184 | setGeneric("s4", function(x) standardGeneric("s4"))
185 | setMethod("s4", "A", f)
186 | a <- A()
187 | 
188 | ## S4 Reference Class
189 | B <- setRefClass("B", methods = list(rc = f))
190 | b <- B$new()
191 | ```
192 | 
193 | 
194 | ```r
195 | microbenchmark(
196 |      fun = f(),
197 |      S3 = s3(1L),
198 |      S4 = s4(a),
199 |      RC = b$rc()
200 | )
201 | ```
202 | 
203 | ## Name lookup
204 | 
205 | To find the value of a symbol is difficult, given that **everything**
206 | can be modified (extreme dynamics, see above) and lexical scoping.
207 | 
208 | 
209 | ```r
210 | a <- 1
211 | f <- function() {
212 |     g <- function() {
213 |         print(a) ## from global workspace
214 |         assign("a", 2, envir = parent.frame())
215 |         print(a) ## f's environment
216 |         a <- 3
217 |         print(a) ## g's environment
218 |     }
219 |     g()
220 | }
221 | f()
222 | ```
223 | 
224 | ```
225 | ## [1] 1
226 | ## [1] 2
227 | ## [1] 3
228 | ```
229 | 
230 | And by **everything**, we mean really nearly **everything**: `+`, `^`, `(`, and `{` in
231 | 
232 | 
233 | ```r
234 | f <- function(x, y) {
235 |     (x + y) ^ 2
236 | }
237 | ```
238 | 
239 | ## Extracting a single value from a data frame
240 | 
241 | 
242 | ```r
243 | microbenchmark(
244 |     "[32, 11]"      = mtcars[32, 11],
245 |     "$carb[32]"     = mtcars$carb[32],
246 |     "[[c(11, 32)]]" = mtcars[[c(11, 32)]],
247 |     "[[11]][32]"    = mtcars[[11]][32],
248 |     ".subset"       = .subset(mtcars, 11)[32],
249 |     ".subset2"      = .subset2(mtcars, 11)[32]
250 | )
251 | ```
252 | 
253 | 
254 | ## Improving R's performance
255 | 
256 | R implementation:
257 | 
258 | - [GNU R](http://www.r-project.org/)
259 | - [pqR](http://www.pqr-project.org)
260 | - [Renjin](http://www.renjin.org)
261 | - [FastR](https://github.com/allr/fastr)
262 | - [Riposte](https://github.com/jtalbot/riposte)
263 | - [CXXR](http://www.cs.kent.ac.uk/project/cxxr/)
264 | 
265 | Several of these projects implement *deferred evaluation*,
266 | i.e. evaluations are only executed if they really need to be. We will
267 | see (and implement) and example in the `Rcpp` section.
268 | 
269 | # Profiling
270 | 
271 | ### Reminder
272 | 
273 | > We should forget about small efficiencies, say about 97% of the
274 | > time: premature optimization is the root of all evil. Yet we should
275 | > not pass up our opportunities in that critical 3%. A good programmer
276 | > will not be lulled into complacency by such reasoning, he will be
277 | > wise to look carefully at the critical code; but only after that
278 | > code has been identified
279 | 
280 | Knuth, Donald. *Structured Programming with `go to` Statements*, ACM
281 | Journal Computing Surveys, Vol 6, No. 4, Dec. 1974. p.268.
282 | 
283 | ### Optimisation
284 | 
285 | Optimisations often have their own costs:
286 | 
287 | - Trade-off fast vs dangerous, flexibility/functionality vs performance.
288 | - Use any assumptions about the data, at the cost of generalisation. 
289 | 
290 | Pat Burns reminds us that 
291 | 
292 | > Our first duty is to create clear, correct code. Only consider
293 | > optimising your code when:
294 | 
295 | - The code is debugged and stable.
296 | - Optimisation is likely to make a significant impact. 
297 | 
298 | then
299 | 
300 | 1. Find the major bottleneck: code *profiling*.
301 | 2. Try to eliminate it.
302 | 3. Repeat until *fast enough*: ideally, define fast enough in advance.
303 | 
304 | #### Make sure the code remains correct
305 | 
306 | 
307 | ```r
308 | x <- runif(100)
309 | all.equal(sqrt(x), x ^ 0.5)
310 | ```
311 | 
312 | ```
313 | ## [1] TRUE
314 | ```
315 | 
316 | and/or [unit
317 | tests](https://github.com/lgatto/2016-02-25-adv-programming-EMBL/blob/master/unittesting.md)
318 | to compare different implementations (and regression test).
319 | 
320 | #### Are implementations really equivalent?
321 | 
322 | 
323 | ```r
324 | library("sequences")
325 | gccount
326 | gccountr <- function(x) table(strsplit(x, "")[[1]])
327 | gccountr2 <- function(x) tabulate(factor(strsplit(x, "")[[1]]))
328 | ```
329 | 
330 | Checking that our different implementations give the same results:
331 | 
332 | 
333 | ```r
334 | s <- paste(sample(c("A", "C", "G", "T"),
335 |                   100, replace = TRUE),
336 |            collapse = "")
337 | 
338 | gccount(s)
339 | gccountr(s)
340 | gccountr2(s)
341 | ```
342 | 
343 | But are they really the same? Are we really comparing the same
344 | functionalities?
345 | 
346 | Is it worth it?
347 | 
348 | 
349 | ```r
350 | library("microbenchmark")
351 | microbenchmark(gccount(s),
352 |                      gccountr(s),
353 |                      gccountr2(s),
354 |                      times = 1e4, 
355 | 					 unit = "eps")
356 | ```
357 | 
358 | 
359 | ```r
360 | library("ggplot2")
361 | mb <- microbenchmark(gccount(s),
362 |                      gccountr(s),
363 |                      gccountr2(s))
364 | print(mb)
365 | microbenchmark:::autoplot.microbenchmark(mb)
366 | ```
367 | 
368 | ## Profiling tools
369 | 
370 | Sampling or statistical profiling: at regular intervals, stop the
371 | execution and record which functions are being executing and the call
372 | stack. 
373 | 
374 | - `Rprof` (and `summaryRprof`): record and summarise timings at fixed
375 |   intervals (default `interval` is 0.02 seconds).
376 |   
377 | - [`proftools`](https://cran.rstudio.com/web/packages/proftools/index.html)
378 |   package: Tools for examining `Rprof` profile output. Comes with an
379 |   extensive
380 |   [vignette](https://cran.rstudio.com/web/packages/proftools/vignettes/proftools.pdf).
381 |     
382 | - [`profvis`](https://github.com/rstudio/profvis): interactive
383 |   visualisation collected by `Rprof`, whith emphasis on *lines of
384 |   code*. See also and
385 |   [*Introduction to profvis*](http://rpubs.com/wch/123888).
386 |   
387 |     
388 |   
389 | - ([`lineprof`](https://github.com/hadley/lineprof) [\*] package: each
390 |   *line of code* is profiled. This is less precise (than `Rprof`) but
391 |   easier to interprete. Code must be sourced with `source()`.)
392 | - ([`profr`](https://cran.rstudio.com/web/packages/profr/index.html)
393 |   package [\*]: provides an alternative data structure and visual
394 |   rendering for the profiling information generated by `Rprof).
395 | 
396 | [\*] [`lineprof`](https://github.com/hadley/lineprof) and
397 | [`profr`](https://cran.rstudio.com/web/packages/profr/index.html) and
398 | are now deprecated in favour of
399 | [`profvis`](https://github.com/rstudio/profvis).
400 | 
401 | 
402 | ## `Rprof`
403 | 
404 | 
405 | ```r
406 | m <- matrix(rnorm(1e6), ncol = 10)
407 | 
408 | Rprof("rprof")
409 | res <- apply(m, 1, mean, trim=.3)
410 | Rprof(NULL)
411 | summaryRprof("rprof")
412 | ```
413 | 
414 | ## `profvis`
415 | 
416 | Needs to `source()` the code or directly input the code to have access
417 | to the individual lines.
418 | 
419 | 
420 | ```r
421 | f <- function() {
422 |   pause(0.1)
423 |   g()
424 |   h()
425 | }
426 | 
427 | g <- function() {
428 |   pause(0.1)
429 |   h()
430 | }
431 | 
432 | h <- function() {
433 |   pause(0.1)
434 | }
435 | ```
436 | 
437 | 
438 | ```r
439 | library("profvis")
440 | source("lineprof-example.R")
441 | profvis(f())
442 | ```
443 | 
444 | ![profviz inteface](./figs/profvis.png)
445 | 
446 | ## Limitations
447 | 
448 | - Not profiling of C/C++ code, or primitive functions, or byte
449 |   compiled code.
450 | - Anonymous functions are labelled as *anonymous*; name them
451 |   explicitly in such cases.
452 | 
453 | ## Exercise
454 | 
455 | Profile the code chunk that calculates the timmed means using
456 | `profvis` and interpret the results.
457 | 
458 | 
459 | ```r
460 | profvis({
461 |     m <- matrix(rnorm(1e6), ncol = 10)
462 |     res <- apply(m, 1, mean, trim=.3)
463 |     sum(res)
464 | })
465 | ```
466 | 
467 | # Optimisation
468 | 
469 | ## Look for existing solutions
470 | 
471 | - `readr::read_csv` or  `data.table::fread` instead of `read_csv`
472 | 
473 | ## Do as little as possible
474 | 
475 | - `gccountr` vs `gccountr2` example above
476 | - simpler data structures
477 | - set `colClasses` in `read.csv`
478 | 
479 | - Usual suspects: names, growing objects:
480 | 
481 | ## Names
482 | 
483 | 
484 | ```r
485 | make_id2GO <- function(n = 1e3) { ## could be 1e4 - 1e5
486 |     gn <- sprintf(paste0("ENSG%0", 10, "d"), sample(1e6, n))
487 |     goid <- function(n = 10) sprintf(paste0("GO:%0", 10, "d"), sample(1e6, n))
488 |     structure(replicate(n, goid(sample(50, 1))),
489 |               names = gn)
490 | }
491 | id2GO <- make_id2GO()
492 | ```
493 | 
494 | We have a list of 1000 genes, and each of these genes is
495 | characterised by a set of 1 to 50 GO terms. 
496 | 
497 | To obtain the go terms, we `unlist` the gene list.
498 | 
499 | 
500 | ```r
501 | length(id2GO)
502 | str(head(id2GO))
503 | str(unlist(id2GO))
504 | ```
505 | 
506 | This can be executed much faster if we ignore the names in the
507 | original list.
508 | 
509 | 
510 | ```r
511 | library(microbenchmark)
512 | microbenchmark(unlist(l),
513 |                unlist(l, use.names = FALSE),
514 |                times = 10)
515 | ```
516 | 
517 | ## Initialise, do not grow dynamically (to avoid copies)
518 | 
519 | 
520 | ```r
521 | f1 <- function(n) {
522 |   a <- NULL 
523 |   for (i in 1:n) a <- c(a, sqrt(i))
524 |   a
525 | }
526 | 
527 | f2 <- function(n) {
528 |   a <- numeric(n)
529 |   for (i in 1:n) a[i] <- sqrt(i)
530 |   a
531 | }
532 | ```
533 | 
534 | 
535 | ```r
536 | microbenchmark(f1(1e3), f2(1e3))
537 | microbenchmark(f1(1e4), f2(1e4))
538 | ```
539 | 
540 | ## Pass-by-reference with environments
541 | 
542 | When passing an environment as function argument, it is **not**
543 | copied: all its values are accessible within the function and can be
544 | persistently modified.
545 | 
546 | 
547 | ```r
548 | e <- new.env()
549 | e$x <- 1
550 | f <- function(myenv) myenv$x <- 2
551 | f(e)
552 | e$x
553 | ```
554 | 
555 | ```
556 | ## [1] 2
557 | ```
558 | 
559 | This is used in the `eSet` et al. microarray data structures to store
560 | the expression data.
561 | 
562 | ## Vectorisation
563 | 
564 | 
565 | ```r
566 | f3 <- function(n)
567 |   sapply(seq_len(n), sqrt)
568 | 
569 | f4 <- function(n) sqrt(n)
570 | ```
571 | 
572 | ![Initialisation and vectorisation](./figs/vectimings.png)
573 | 
574 | 
575 | 
576 | Code vectorisation is not only about avoiding loops at all cost, and
577 | replacing them with `*apply`. As we have seen, this does not make any
578 | real difference in terms of speed.
579 | 
580 | Difference between vectorisation in high level code, to improve
581 | clarity (`apply`, `Vectorise`, ...) and, vectorise to improve
582 | performance, which involved re-writing for loops in C/C++ (see below).
583 | 
584 | ## Re-implementing the code in C/C++ 
585 | 
586 | See below
587 | 
588 | ## Parallelisation
589 | 
590 | See the [R-parallel](https://github.com/lgatto/R-parallel/blob/f83667f3ef62ca3f82a7f2ef58b05db497da4b75/parallel.pdf) material.
591 | 
592 | ## Byte-code compilation
593 | 
594 | The `compile::cmpfun` function compiles the body of a closure and
595 | returns a new closure with the same formals and the body replaced by
596 | the compiled body expression. It does not always provide a speed
597 | improvement, but is very easy to implement.
598 | 
599 | 
600 | ```r
601 | lapply2 <- function(x, f, ...) {
602 |   out <- vector("list", length(x))
603 |   for (i in seq_along(x)) {
604 |     out[[i]] <- f(x[[i]], ...)
605 |   }
606 |   out
607 | }
608 | 
609 | lapply2_c <- compiler::cmpfun(lapply2)
610 | 
611 | x <- list(1:10, letters, c(FALSE, TRUE), NULL)
612 | 
613 | 
614 | microbenchmark(
615 |   lapply2(x, is.null),
616 |   lapply2_c(x, is.null),
617 |   lapply(x, is.null))
618 | ```
619 | 
620 | Note that all base R functions are aleady byte compiled. This can be
621 | observed with the `<bytecode: 0x3858bb8>` attribute of a function.
622 | 
623 | # Memory
624 | (See Chapter 18 in *Advanced R* for more details)
625 | 
626 | Assessing memory needs is useful to save memory in general and limit
627 | memory access (read/write), which is one common bottleneck in R.
628 | 
629 | Requirement: 
630 | 
631 | 
632 | ```r
633 | library("pryr")
634 | library("profvis")
635 | ```
636 | 
637 | ## Object size
638 | 
639 | 
640 | ```r
641 | x <- 1:1e5
642 | object.size(x)
643 | print(object.size(x), units = "Kb")
644 | object_size(x)
645 | ```
646 | 
647 | But, `object.size` does not account for shared elements, nor for the
648 | size of environments.
649 | 
650 | 
651 | ```r
652 | ll <- list(x, x, x)
653 | print(object.size(ll), units = "Kb")
654 | object_size(ll)
655 | ```
656 | 
657 | But, this does not hold when there's no shared components:
658 | 
659 | 
660 | ```r
661 | x <- 1:1e6
662 | y <- list(1:1e6, 1:1e6, 1:1e6)
663 | object_size(x)
664 | object_size(y)
665 | ```
666 | 
667 | Environments:
668 | 
669 | 
670 | ```r
671 | e <- new.env()
672 | object.size(e)
673 | object_size(e)
674 | e$a <- 1:1e6
675 | object.size(e)
676 | object_size(e)
677 | ```
678 | 
679 | ### Exercises
680 | 
681 | - What is the object size of an empty numeric vector?
682 | 
683 | <!-- ```{r, eval=FALSE} -->
684 | <!-- object_size(numeric()) -->
685 | <!-- ``` -->
686 | 
687 | <!-- The object components that make up 40 bytes are  -->
688 | <!-- - metadata (type, debugging and mempry management information: 4 bytes);  -->
689 | <!-- - pointers to the previous and next elements (doubly-linked list, 2 * 8 bytes);  -->
690 | <!-- - a pointer to attributes (8 bytes);  -->
691 | <!-- - the vector length (4 bytes); -->
692 | <!-- - "true" length, actual allocated space (4 bytes) -->
693 | <!-- - the data (0 bytes in this case) -->
694 | 
695 | - How does the size of a numeric vector grow with it size (say from
696 |   0 to 50)?
697 | 
698 | <!-- ```{r, eval=FALSE} -->
699 | <!-- sz <- sapply(0:50, function(i) object_size(seq_len(i))) -->
700 | <!-- plot(0:50, sz, type = "s", xlab = "Bytes", ylab = "Length") -->
701 | <!-- plot(0:50, sz - 40, type = "s", xlab = "Bytes", ylab = "Length") -->
702 | <!-- abline(h = c(0, 8, 16, 32, 48, 64, 128), col = "grey") -->
703 | <!-- abline(0, 4) -->
704 | <!-- ``` -->
705 | 
706 | 
707 | ## Memory usage
708 | 
709 | To get the total size of all object that were created by R that
710 | currently take space in memory:
711 | 
712 | 
713 | ```r
714 | mem_used()
715 | ```
716 | 
717 | To track memory change
718 | 
719 | 
720 | ```r
721 | mem_change(v <- 1:1e6)
722 | mem_change(rm(v))
723 | ```
724 | 
725 | 
726 | ```r
727 | rm(list = ls())
728 | mem_change(x <- 1:1e6)
729 | mem_change(y <- x)
730 | mem_change(rm(x))
731 | mem_change(rm(y))
732 | ```
733 | 
734 | #### Garbage collection
735 | 
736 | When objects in memory are not accessed from R anymore, there is no
737 | need to explicitly free up that memory chunk explicity. This is done
738 | automatically by the garbage collector, as illustrated in the examples
739 | above. There is no need to call it explicityly with `gc()`; the only
740 | effect of this is for R to explicitly return memory to the OS. 
741 | 
742 | ## Modifiation in place
743 | 
744 | What happens in this cas? Is `x` copied (and hence more memory is used
745 | up), or is it modified in place?
746 | 
747 | 
748 | ```r
749 | x <- 1:10
750 | c(address(x), refs(x))
751 | x[5] <- 0L
752 | c(address(x), refs(x))
753 | 
754 | 
755 | y <- x
756 | c(address(x), refs(x))
757 | c(address(y), refs(y))
758 | 
759 | x[5] <- 1L
760 | c(address(x), refs(x))
761 | c(address(y), refs(y))
762 | 
763 | 
764 | x <- 1:5
765 | y <- x
766 | c(address(x), refs(x))
767 | rm(y)
768 | c(address(x), refs(x)) ## should be 1
769 | 
770 | x <- 1:5
771 | y <- x
772 | z <- x
773 | c(address(x), refs(x)) ## should be 3
774 | ```
775 | 
776 | `tracemem` tracks memory location of objects:
777 | 
778 | 
779 | ```r
780 | x <- 1:10
781 | tracemem(x)
782 | x[5] <- 0L
783 | 
784 | y <- x
785 | x[5] <- 10L
786 | 
787 | address(x)
788 | address(y)
789 | ```
790 | 
791 | ## Exercise
792 | 
793 | We have quantified the substantial cost in execution time when growing
794 | data dynamically. Trace the impact of dynamically growing a vector on
795 | the memory. You can use `f1()` and `f2()` above, and trace `a`.
796 |   
797 | 
798 | 
799 | 
800 | # Rcpp
801 | 
802 | See [here](https://github.com/lgatto/rccpp/blob/b59a1ee23dd1ace7c45bb9e2239e853c93e9ca0c/rc.md).
803 | 
804 | # Big data
805 | 
806 | - [CRAN High-Performance and Parallel Computing task view](http://cran.r-project.org/web/views/HighPerformanceComputing.html).
807 | - Storing data in database or databases-like structures: `RMySQL`,
808 |       `RdbiPgSQL`, \ldots, `RSQLite`, `qldf`, `data.table` (the
809 |       `data.table::fread`, when `read.table` is slow, also `scan`),
810 |       `dplyr`, ... packages
811 | - The `ff` package by Adler et al. offers file-based access to data
812 |   sets that are too large to be loaded into memory, along with a
813 |   number of higher-level functions
814 | - The `bigmemory` package by Kane and Emerson permits storing large
815 |   objects such as matrices in memory (as well as via files) and uses
816 |   `external pointer` objects to refer to them
817 | - `netCDF` data files: `ncdf` and `RNetCDF` packages
818 | - `hdf5` format: `rhdf5` package
819 | - `mmap` memory-mapped files/devices I/O
820 | - hadoop and R
821 | - See http://r-pbd.org/ and the
822 |   [pbdDemo](http://cran.r-project.org/web/packages/pbdDEMO/)
823 |   package/vignette.
824 | - [Bioconductor in the cloud](http://bioconductor.org/help/bioconductor-cloud-ami/)
825 | - [Bioconductor docker containers](http://bioconductor.org/help/docker/)
826 | - ...
827 | 
828 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | %.md: %.Rmd
 2 | 	Rscript -e "knitr::knit('$^')"
 3 | 
 4 | %.html: %.Rmd
 5 | 	Rscript -e "rmarkdown::render('$^', output_format=rmarkdown::ioslides_presentation())"
 6 | 	# Rscript -e "rmarkdown::render('$^', output_format=rmarkdown::html_document())"
 7 | 
 8 | %.R: %.Rmd
 9 | 	Rscript -e "knitr::purl('$^')"
10 | 	perl -pi -e 's/^## (--.+)?//' $@
11 | 
12 | all: 
13 | 	make 01-intro.md 02-funprog.md 03-debug.md 04-perf.md unittesting.md
14 | 	make 01-intro.R 02-funprog.R 03-debug.R 04-perf.R 
15 | 
16 | .PHONY: all
17 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # R debugging and robust programming
 2 | 
 3 | **Short link to this page: http://bit.ly/advrEMBL2016**  
 4 | 
 5 | 25-26 February 2016 (Thursday-Friday), EMBL Heidelberg  
 6 | Instructors: Laurent Gatto, Robert Stojnic (University of Cambridge)  
 7 | Organiser: Wolfgang Huber (EMBL)  
 8 | 
 9 | This two-day course will teach participants debugging techniques and
10 | good practice in writing reliable, robust code. The material will
11 | provide the opportunity to gain experience and understanding of how to
12 | identify, resolve, and avoid bugs, in order to produce
13 | publication-quality code. The course will be taught using R and will
14 | be driven by many practical exercises.  Course outline
15 | 
16 | The material will focus on:
17 | 
18 | - debugging, to fix problems with code
19 | - defensive programming - writing effective tests to detect bugs
20 | - profiling and optimisation of code
21 | 
22 | ## Pre-requisites
23 | 
24 | The course is aimed at those with experience of scripting, who want to
25 | learn more about writing robust and efficient code and who may want to
26 | develop and release packages in the future.
27 | 
28 | ## Content
29 | 
30 | #### [Part I:](https://github.com/lgatto/2016-02-25-adv-programming-EMBL/blob/master/01-intro.md)
31 | - Coding style(s)
32 | - Interactive use and programming
33 | - Environments
34 | - Tidy data
35 | - Computing on the language
36 | 
37 | #### [Part II: Functional programming](https://github.com/lgatto/2016-02-25-adv-programming-EMBL/blob/master/02-funprog.md)
38 | - Functions 
39 | - Robust programming with functions
40 | - Scoping
41 | - Closures
42 | - High-level functions
43 | - Vectorisation
44 | 
45 | #### [Part III: Debugging](https://github.com/lgatto/2016-02-25-adv-programming-EMBL/blob/master/03-debug.md)
46 | - Defensive programming
47 | - Debbugging: techniques and tools
48 | - Condition handling: try/tryCatch
49 | - [Unit testing](https://github.com/lgatto/2016-02-25-adv-programming-EMBL/blob/master/unittesting.md)
50 | 
51 | #### [Part IV: Performance](https://github.com/lgatto/2016-02-25-adv-programming-EMBL/blob/master/04-perf.md)
52 | - Benchmarking
53 | - Profiling
54 | - Optimisation
55 | - Memory
56 | - [Rcpp](https://github.com/lgatto/rccpp/blob/master/rc.md)
57 | 
58 | #### Other topics
59 | - Packages and documentation
60 | - Reproducible research and vignettes (`Rmarkdown`)
61 | - Source code versioning with (for example) git and GitHub
62 | - Automation with (for example) `Make`
63 | 
64 | See the [TeachingMaterial](http://lgatto.github.io/TeachingMaterial/)
65 | repository for content.
66 | 
67 | ## References
68 | 
69 | - [Previous courses](https://github.com/lgatto/teachingmaterial) and [here](https://github.com/DataProgrammers/2015-01-15-EMBLHeidelberg).
70 | - [Advanced R](http://adv-r.had.co.nz/), Hadley Wickham.
71 | - [The R Inferno](http://www.burns-stat.com/documents/books/the-r-inferno/), Patrick Burns.
72 | - [An Introduction to the Interactive Debugging Tools in R](http://www.biostat.jhsph.edu/~rpeng/docs/R-debug-tools.pdf), Roger D. Peng.
73 | - [R Programming for Bioinformatics](http://master.bioconductor.org/help/publications/books/r-programming-for-bioinformatics/), Robert Gentleman.
74 | 
75 | 
76 | ## License
77 | 
78 | This work is licensed under a CC BY-SA 3.0 License.
79 | 


--------------------------------------------------------------------------------
/figs/covr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/covr.png


--------------------------------------------------------------------------------
/figs/debugRStudio1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/debugRStudio1.png


--------------------------------------------------------------------------------
/figs/debugRStudio2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/debugRStudio2.png


--------------------------------------------------------------------------------
/figs/envex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/envex.png


--------------------------------------------------------------------------------
/figs/envex.svg:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
  2 | <!-- Created with Inkscape (http://www.inkscape.org/) -->
  3 | 
  4 | <svg
  5 |    xmlns:dc="http://purl.org/dc/elements/1.1/"
  6 |    xmlns:cc="http://creativecommons.org/ns#"
  7 |    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  8 |    xmlns:svg="http://www.w3.org/2000/svg"
  9 |    xmlns="http://www.w3.org/2000/svg"
 10 |    xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
 11 |    xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
 12 |    width="744.09448819"
 13 |    height="1052.3622047"
 14 |    id="svg2"
 15 |    version="1.1"
 16 |    inkscape:version="0.48.4 r9939"
 17 |    sodipodi:docname="envex.svg"
 18 |    inkscape:export-filename="/home/lg390/Documents/Teaching/2016-02-25-adv-programming-EMBL/figs/envex.png"
 19 |    inkscape:export-xdpi="73.470001"
 20 |    inkscape:export-ydpi="73.470001">
 21 |   <defs
 22 |      id="defs4">
 23 |     <marker
 24 |        inkscape:stockid="Arrow2Lend"
 25 |        orient="auto"
 26 |        refY="0.0"
 27 |        refX="0.0"
 28 |        id="Arrow2Lend"
 29 |        style="overflow:visible;">
 30 |       <path
 31 |          id="path3958"
 32 |          style="fill-rule:evenodd;stroke-width:0.62500000;stroke-linejoin:round;"
 33 |          d="M 8.7185878,4.0337352 L -2.2072895,0.016013256 L 8.7185884,-4.0017078 C 6.9730900,-1.6296469 6.9831476,1.6157441 8.7185878,4.0337352 z "
 34 |          transform="scale(1.1) rotate(180) translate(1,0)" />
 35 |     </marker>
 36 |     <marker
 37 |        inkscape:stockid="Arrow2Lend"
 38 |        orient="auto"
 39 |        refY="0"
 40 |        refX="0"
 41 |        id="Arrow2Lend-6"
 42 |        style="overflow:visible">
 43 |       <path
 44 |          inkscape:connector-curvature="0"
 45 |          id="path3958-5"
 46 |          style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
 47 |          d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
 48 |          transform="matrix(-1.1,0,0,-1.1,-1.1,0)" />
 49 |     </marker>
 50 |     <marker
 51 |        inkscape:stockid="Arrow2Lend"
 52 |        orient="auto"
 53 |        refY="0"
 54 |        refX="0"
 55 |        id="Arrow2Lend-0"
 56 |        style="overflow:visible">
 57 |       <path
 58 |          inkscape:connector-curvature="0"
 59 |          id="path3958-2"
 60 |          style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
 61 |          d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
 62 |          transform="matrix(-1.1,0,0,-1.1,-1.1,0)" />
 63 |     </marker>
 64 |   </defs>
 65 |   <sodipodi:namedview
 66 |      id="base"
 67 |      pagecolor="#ffffff"
 68 |      bordercolor="#666666"
 69 |      borderopacity="1.0"
 70 |      inkscape:pageopacity="0.0"
 71 |      inkscape:pageshadow="2"
 72 |      inkscape:zoom="0.89430344"
 73 |      inkscape:cx="293.20943"
 74 |      inkscape:cy="485.60607"
 75 |      inkscape:document-units="px"
 76 |      inkscape:current-layer="layer1"
 77 |      showgrid="false"
 78 |      inkscape:window-width="1596"
 79 |      inkscape:window-height="880"
 80 |      inkscape:window-x="0"
 81 |      inkscape:window-y="18"
 82 |      inkscape:window-maximized="0" />
 83 |   <metadata
 84 |      id="metadata7">
 85 |     <rdf:RDF>
 86 |       <cc:Work
 87 |          rdf:about="">
 88 |         <dc:format>image/svg+xml</dc:format>
 89 |         <dc:type
 90 |            rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
 91 |         <dc:title></dc:title>
 92 |       </cc:Work>
 93 |     </rdf:RDF>
 94 |   </metadata>
 95 |   <g
 96 |      inkscape:label="Layer 1"
 97 |      inkscape:groupmode="layer"
 98 |      id="layer1">
 99 |     <rect
100 |        style="fill:none;stroke:#000000;stroke-width:2.44097662;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
101 |        id="rect4405"
102 |        width="74.310318"
103 |        height="56.127567"
104 |        x="93.622093"
105 |        y="302.18555" />
106 |     <text
107 |        xml:space="preserve"
108 |        style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
109 |        x="99.624901"
110 |        y="344.82941"
111 |        id="text4407"
112 |        sodipodi:linespacing="125%"><tspan
113 |          sodipodi:role="line"
114 |          id="tspan4409"
115 |          x="99.624901"
116 |          y="344.82941">&quot;1&quot;</tspan></text>
117 |     <g
118 |        id="g4483"
119 |        transform="translate(142.38434,15.041281)">
120 |       <rect
121 |          y="280.71402"
122 |          x="365.65491"
123 |          height="52.527931"
124 |          width="48.487324"
125 |          id="rect4443"
126 |          style="fill:none;stroke:#000000;stroke-width:2;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
127 |       <rect
128 |          y="280.71402"
129 |          x="414.12192"
130 |          height="52.527931"
131 |          width="48.487324"
132 |          id="rect4443-0"
133 |          style="fill:none;stroke:#000000;stroke-width:2;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
134 |       <rect
135 |          y="280.71402"
136 |          x="462.6532"
137 |          height="52.527931"
138 |          width="48.487324"
139 |          id="rect4443-0-7"
140 |          style="fill:none;stroke:#000000;stroke-width:2;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
141 |     </g>
142 |     <text
143 |        xml:space="preserve"
144 |        style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
145 |        x="519.61029"
146 |        y="334.89651"
147 |        id="text4488"
148 |        sodipodi:linespacing="125%"><tspan
149 |          sodipodi:role="line"
150 |          id="tspan4490"
151 |          x="519.61029"
152 |          y="334.89651">1  2  3</tspan></text>
153 |     <rect
154 |        style="fill:none;stroke:#000000;stroke-width:2.62357759;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
155 |        id="rect4500"
156 |        width="109.60358"
157 |        height="71.077583"
158 |        x="184.93633"
159 |        y="551.40381" />
160 |     <text
161 |        xml:space="preserve"
162 |        style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
163 |        x="200.94905"
164 |        y="601.50317"
165 |        id="text4502"
166 |        sodipodi:linespacing="125%"><tspan
167 |          sodipodi:role="line"
168 |          id="tspan4504"
169 |          x="200.94905"
170 |          y="601.50317">123</tspan></text>
171 |     <text
172 |        xml:space="preserve"
173 |        style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
174 |        x="310.72717"
175 |        y="759.08905"
176 |        id="text4521"
177 |        sodipodi:linespacing="125%"><tspan
178 |          sodipodi:role="line"
179 |          id="tspan4523"
180 |          x="310.72717"
181 |          y="759.08905">TRUE</tspan></text>
182 |     <rect
183 |        style="fill:none;stroke:#000000;stroke-width:2;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
184 |        id="rect4525"
185 |        width="120.92883"
186 |        height="44.946995"
187 |        x="302.34283"
188 |        y="722.31866" />
189 |     <g
190 |        id="g5100">
191 |       <g
192 |          transform="translate(137.53721,-10.063698)"
193 |          id="g3913">
194 |         <rect
195 |            style="fill:#e2e2e2;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:2;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none"
196 |            id="rect2985"
197 |            width="168.69548"
198 |            height="231.32494"
199 |            x="77.781746"
200 |            y="287.6767" />
201 |         <g
202 |            id="g3875"
203 |            transform="translate(-337.39095,68.690373)">
204 |           <rect
205 |              y="248.89423"
206 |              x="434.6767"
207 |              height="51.51778"
208 |              width="52.527931"
209 |              id="rect3773"
210 |              style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:2;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
211 |           <text
212 |              sodipodi:linespacing="125%"
213 |              id="text3775"
214 |              y="285.57108"
215 |              x="449.30005"
216 |              style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
217 |              xml:space="preserve"><tspan
218 |                y="285.57108"
219 |                x="449.30005"
220 |                id="tspan3777"
221 |                sodipodi:role="line">a</tspan></text>
222 |         </g>
223 |         <g
224 |            id="g3880"
225 |            transform="translate(-341.43156,64.649763)">
226 |           <rect
227 |              y="246.76553"
228 |              x="519.21838"
229 |              height="51.51778"
230 |              width="52.527931"
231 |              id="rect3773-7"
232 |              style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:2;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
233 |           <text
234 |              sodipodi:linespacing="125%"
235 |              id="text3775-8"
236 |              y="283.44235"
237 |              x="533.84174"
238 |              style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
239 |              xml:space="preserve"><tspan
240 |                y="283.44235"
241 |                x="533.84174"
242 |                id="tspan3777-0"
243 |                sodipodi:role="line">b</tspan></text>
244 |         </g>
245 |         <g
246 |            id="g3885"
247 |            transform="translate(-430.32498,86.873119)">
248 |           <rect
249 |              y="354.85187"
250 |              x="547.50269"
251 |              height="51.51778"
252 |              width="52.527931"
253 |              id="rect3773-7-3"
254 |              style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:2;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
255 |           <text
256 |              sodipodi:linespacing="125%"
257 |              id="text3775-8-5"
258 |              y="391.52869"
259 |              x="562.12604"
260 |              style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
261 |              xml:space="preserve"><tspan
262 |                y="391.52869"
263 |                x="562.12604"
264 |                id="tspan3777-0-4"
265 |                sodipodi:role="line">c</tspan></text>
266 |         </g>
267 |         <path
268 |            sodipodi:type="arc"
269 |            style="fill:#050505;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:2;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
270 |            id="path3900"
271 |            sodipodi:cx="353.55341"
272 |            sodipodi:cy="263.93811"
273 |            sodipodi:rx="9.0913725"
274 |            sodipodi:ry="8.586297"
275 |            d="m 362.64478,263.93811 c 0,4.74208 -4.07035,8.5863 -9.09137,8.5863 -5.02103,0 -9.09138,-3.84422 -9.09138,-8.5863 0,-4.74208 4.07035,-8.5863 9.09138,-8.5863 5.02102,0 9.09137,3.84422 9.09137,8.5863 z"
276 |            transform="translate(-132.32998,222.23356)" />
277 |       </g>
278 |       <text
279 |          sodipodi:linespacing="125%"
280 |          id="text4863"
281 |          y="265.15732"
282 |          x="229.22868"
283 |          style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
284 |          xml:space="preserve"><tspan
285 |            y="265.15732"
286 |            x="229.22868"
287 |            id="tspan4865"
288 |            sodipodi:role="line">e</tspan></text>
289 |     </g>
290 |     <g
291 |        id="g5119">
292 |       <g
293 |          transform="translate(140.56767,27.311946)"
294 |          id="g3902">
295 |         <rect
296 |            style="fill:#e2e2e2;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:2;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none"
297 |            id="rect2985-3"
298 |            width="168.69548"
299 |            height="231.32494"
300 |            x="339.91632"
301 |            y="448.79602" />
302 |         <g
303 |            id="g3890"
304 |            transform="translate(-167.68532,15.152288)">
305 |           <rect
306 |              y="468.99908"
307 |              x="586.89868"
308 |              height="51.51778"
309 |              width="52.527931"
310 |              id="rect3773-7-3-7"
311 |              style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:2;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
312 |           <text
313 |              sodipodi:linespacing="125%"
314 |              id="text3775-8-5-6"
315 |              y="505.6759"
316 |              x="601.52203"
317 |              style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
318 |              xml:space="preserve"><tspan
319 |                y="505.6759"
320 |                x="601.52203"
321 |                id="tspan3777-0-4-9"
322 |                sodipodi:role="line">d</tspan></text>
323 |         </g>
324 |         <g
325 |            id="g3895"
326 |            transform="translate(-220.21325,35.355339)">
327 |           <rect
328 |              y="559.91278"
329 |              x="593.96967"
330 |              height="51.51778"
331 |              width="52.527931"
332 |              id="rect3773-7-3-7-1"
333 |              style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:2;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
334 |           <text
335 |              sodipodi:linespacing="125%"
336 |              id="text3775-8-5-6-6"
337 |              y="596.5896"
338 |              x="608.59302"
339 |              style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
340 |              xml:space="preserve"><tspan
341 |                y="596.5896"
342 |                x="608.59302"
343 |                id="tspan3777-0-4-9-8"
344 |                sodipodi:role="line">e</tspan></text>
345 |         </g>
346 |       </g>
347 |       <text
348 |          sodipodi:linespacing="125%"
349 |          id="text4867"
350 |          y="466.43127"
351 |          x="604.94006"
352 |          style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
353 |          xml:space="preserve"><tspan
354 |            y="466.43127"
355 |            x="604.94006"
356 |            id="tspan4869"
357 |            sodipodi:role="line">e2</tspan></text>
358 |     </g>
359 |     <rect
360 |        style="fill:#e2e2e2;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:2;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none"
361 |        id="rect2985-2"
362 |        width="168.69548"
363 |        height="231.32494"
364 |        x="-125.72073"
365 |        y="416.74197" />
366 |     <g
367 |        id="g5325"
368 |        transform="translate(-3.3545661,13.418264)">
369 |       <rect
370 |          y="555.13562"
371 |          x="-88.561142"
372 |          height="51.51778"
373 |          width="52.527931"
374 |          id="rect3773-7-3-5"
375 |          style="fill:#ffffff;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:2;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
376 |       <text
377 |          sodipodi:linespacing="125%"
378 |          id="text3775-8-5-4"
379 |          y="591.81244"
380 |          x="-73.93779"
381 |          style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
382 |          xml:space="preserve"><tspan
383 |            y="591.81244"
384 |            x="-73.93779"
385 |            id="tspan3777-0-4-0"
386 |            sodipodi:role="line">x</tspan></text>
387 |     </g>
388 |     <path
389 |        sodipodi:type="arc"
390 |        style="fill:#050505;fill-opacity:1;fill-rule:nonzero;stroke:#000000;stroke-width:2;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
391 |        id="path3900-5"
392 |        sodipodi:cx="353.55341"
393 |        sodipodi:cy="263.93811"
394 |        sodipodi:rx="9.0913725"
395 |        sodipodi:ry="8.586297"
396 |        d="m 362.64478,263.93811 a 9.0913725,8.586297 0 1 1 -18.18275,0 9.0913725,8.586297 0 1 1 18.18275,0 z"
397 |        transform="translate(-335.83245,351.29884)" />
398 |     <path
399 |        style="fill:none;stroke:#000000;stroke-width:2;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
400 |        d="m 358.76062,471.56229 c 0,0 17.17259,60.60916 41.41625,69.70053 24.24366,9.09137 78.7919,8.08122 78.7919,8.08122"
401 |        id="path3931"
402 |        inkscape:connector-curvature="0" />
403 |     <path
404 |        style="fill:none;stroke:#000000;stroke-width:2;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
405 |        d="m 20.262415,610.49205 c 0,0 66.35838,-102.88481 104.789135,-119.15617 38.43074,-16.27135 85.76334,-20.05437 85.76334,-20.05437"
406 |        id="path3931-2"
407 |        inkscape:connector-curvature="0"
408 |        sodipodi:nodetypes="csc" />
409 |     <text
410 |        xml:space="preserve"
411 |        style="font-size:32px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
412 |        x="-140.89177"
413 |        y="398.2218"
414 |        id="text5263"
415 |        sodipodi:linespacing="125%"><tspan
416 |          sodipodi:role="line"
417 |          id="tspan5265"
418 |          x="-140.89177"
419 |          y="398.2218"
420 |          style="font-size:32px;line-height:119.00000572%">R_GlobalEnv</tspan></text>
421 |     <g
422 |        id="g5320"
423 |        transform="translate(-20.127397,-33.545661)">
424 |       <text
425 |          sodipodi:linespacing="125%"
426 |          id="text5290"
427 |          y="761.11768"
428 |          x="-111.96722"
429 |          style="font-size:40px;font-style:normal;font-weight:normal;line-height:125%;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;font-family:Sans"
430 |          xml:space="preserve"><tspan
431 |            y="761.11768"
432 |            x="-111.96722"
433 |            id="tspan5292"
434 |            sodipodi:role="line">&quot;!&quot;</tspan></text>
435 |       <rect
436 |          y="722.49652"
437 |          x="-117.40981"
438 |          height="48.082115"
439 |          width="63.736755"
440 |          id="rect5294"
441 |          style="fill:none;stroke:#000000;stroke-width:2;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0" />
442 |     </g>
443 |     <path
444 |        style="fill:none;stroke:#000000;stroke-width:1.20000005;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
445 |        d="m 234.92237,327.42721 -66.35033,-6.421"
446 |        id="path4511"
447 |        inkscape:connector-curvature="0" />
448 |     <path
449 |        style="fill:none;stroke:#000000;stroke-width:1.20000005;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
450 |        d="m 368.69319,328.49737 136.98132,-4.28066"
451 |        id="path4513"
452 |        inkscape:connector-curvature="0" />
453 |     <path
454 |        style="fill:none;stroke:#000000;stroke-width:1.20000005;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
455 |        d="m 582.72651,511.49586 2.14033,-160.52499"
456 |        id="path4515"
457 |        inkscape:connector-curvature="0" />
458 |     <path
459 |        style="fill:none;stroke:#000000;stroke-width:1.20000005;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
460 |        d="m 534.56901,673.09101 c 0,0 -21.40333,80.2625 -111.29732,70.631"
461 |        id="path4534"
462 |        inkscape:connector-curvature="0" />
463 |     <path
464 |        style="fill:none;stroke:#000000;stroke-width:1.20000005;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
465 |        d="m 272.3782,482.60136 -20.33317,67.42049"
466 |        id="path4517"
467 |        inkscape:connector-curvature="0" />
468 |     <path
469 |        style="fill:none;stroke:#000000;stroke-width:1.20000005;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;marker-end:url(#Arrow2Lend)"
470 |        d="m -77.793755,620.11083 -20.33317,67.42049"
471 |        id="path4517-8"
472 |        inkscape:connector-curvature="0" />
473 |   </g>
474 | </svg>
475 | 


--------------------------------------------------------------------------------
/figs/funs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/funs.png


--------------------------------------------------------------------------------
/figs/optim.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/optim.png


--------------------------------------------------------------------------------
/figs/profvis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/profvis.png


--------------------------------------------------------------------------------
/figs/sinfun.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/sinfun.png


--------------------------------------------------------------------------------
/figs/style.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/style.png


--------------------------------------------------------------------------------
/figs/vectimings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lgatto/2016-02-25-adv-programming-EMBL/df9f289ec712debc8fd74cc66d09acd12ede838a/figs/vectimings.png


--------------------------------------------------------------------------------
/lineprof-example.R:
--------------------------------------------------------------------------------
 1 | library("lineprof")
 2 | 
 3 | f <- function() {
 4 |   pause(0.1)
 5 |   g()
 6 |   h()
 7 | }
 8 | 
 9 | g <- function() {
10 |   pause(0.1)
11 |   h()
12 | }
13 | 
14 | h <- function() {
15 |   pause(0.1)
16 | }
17 | 


--------------------------------------------------------------------------------
/unittesting.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Unit testing"
  3 | author: "Laurent Gatto"
  4 | ---
  5 | 
  6 | These exercises were written by Martin Morgan and Laurent Gatto for a
  7 | [Bioconductor Developer Day workshop](http://bioconductor.org/help/course-materials/2013/BioC2013/developer-day-debug/).
  8 | 
  9 | # Introduction
 10 | 
 11 | > Whenever you are templted to type something into a print statement
 12 | > or a debugger expression, write it as a test insted -- Martin Fowler
 13 | 
 14 | **Why unit testing?**
 15 | 
 16 | - Writing code to test code;
 17 | - anticipate bugs, in particular for edge cases;
 18 | - anticipate disruptive updates;
 19 | - document and test observed bugs using specific tests.
 20 | 
 21 | Each section provides a function that supposedly works as expected,
 22 | but quickly proves to misbehave. The exercise aims at first writing
 23 | some dedicated testing functions that will identify the problems and
 24 | then update the function so that it passes the specific tests. This
 25 | practice is called unit testing and we use the RUnit package for
 26 | this.
 27 | 
 28 | See the
 29 | [Unit Testing How-To](http://bioconductor.org/developers/how-to/unitTesting-guidelines/)
 30 | guide for details on unit testing using the
 31 | [`RUnit`](http://cran.r-project.org/web/packages/RUnit/index.html)
 32 | package. The
 33 | [`testthat`](http://cran.r-project.org/web/packages/testthat/) is
 34 | another package that provides unit testing infrastructure. Both
 35 | packages can conveniently be used to automate unit testing within
 36 | package testing. 
 37 | 
 38 | # Example
 39 | 
 40 | ## Subsetting
 41 | 
 42 | ### Problem
 43 | 
 44 | This function should return the elements of `x` that are in `y`.
 45 | 
 46 | ```{r}
 47 | ## Example
 48 | isIn <- function(x, y) {
 49 |     sel <- match(x, y)
 50 |     y[sel]
 51 | }
 52 | 
 53 | ## Expected
 54 | x <- sample(LETTERS, 5)
 55 | isIn(x, LETTERS)
 56 | ```
 57 | But
 58 | 
 59 | ```{r}
 60 | ## Bug!
 61 | isIn(c(x, "a"), LETTERS)
 62 | ```
 63 | 
 64 | ### Solution
 65 | 
 66 | Write a unit test that demonstrates the issue
 67 | 
 68 | ```{r}
 69 | ## Unit test:
 70 | library("RUnit")
 71 | test_isIn <- function() {
 72 |     x <- c("A", "B", "Z")
 73 |     checkIdentical(x, isIn(x, LETTERS))
 74 |     checkIdentical(x, isIn(c(x, "a"), LETTERS))
 75 | 
 76 | }
 77 | 
 78 | test_isIn()
 79 | ```
 80 | 
 81 | Update the buggy function until the unit test succeeds
 82 | 
 83 | ```{r}
 84 | ## updated function
 85 | isIn <- function(x, y) {
 86 |     sel <- x %in% y
 87 |     x[sel]
 88 | }
 89 | 
 90 | test_isIn() ## the bug is fixed and monitored
 91 | ```
 92 | 
 93 | ## The `testthat` syntax
 94 | 
 95 | `expect_that(object_or_expression, condition)` with conditions
 96 | - equals: `expect_that(1+2,equals(3))` or `expect_equal(1+2,3)`
 97 | - gives warning: `expect_that(warning("a")`, `gives_warning())`
 98 | - is a: `expect_that(1, is_a("numeric"))` or `expect_is(1,"numeric")`
 99 | - is true: `expect_that(2 == 2, is_true())` or `expect_true(2==2)`
100 | - matches: `expect_that("Testing is fun", matches("fun"))` or `expect_match("Testing is fun", "f.n")`
101 | - takes less: `than expect_that(Sys.sleep(1), takes_less_than(3))`
102 | 
103 | and
104 | 
105 | ```r
106 | test_that("description", {
107 |     a <- foo()
108 |     b <- bar()
109 |     expect_equal(a, b)
110 | })
111 | ```
112 | 
113 | ## Batch unit testing
114 | 
115 | ```r
116 | library("testthat")
117 | test_dir("./unittests/")
118 | test_file("./unittests/test_foo.R")
119 | ```
120 | 
121 | # Exercises
122 | 
123 | ## Column means
124 | 
125 | ## Problem
126 | 
127 | The `col_means` function computes the means of all numeric columns in
128 | a data frame (example from *Advanced R*, to illustrate defensive
129 | programming).
130 | 
131 | ```{r}
132 | col_means <- function(df) {
133 |   numeric <- sapply(df, is.numeric)
134 |   numeric_cols <- df[, numeric]
135 |   data.frame(lapply(numeric_cols, mean))
136 | }
137 | 
138 | ## Expected
139 | col_means(mtcars)
140 | 
141 | ## Bugs
142 | col_means(mtcars[, "mpg"])
143 | col_means(mtcars[, "mpg", drop=FALSE])
144 | col_means(mtcars[, 0])
145 | col_means(mtcars[0, ])
146 | col_means(as.list(mtcars))
147 | ```
148 | 
149 | ## Character matching
150 | 
151 | ### Problem
152 | 
153 | What are the exact matches of `x` in `y`?
154 | 
155 | ```{r}
156 | isExactIn <- function(x, y)
157 |     y[grep(x, y)]
158 | 
159 | ## Expected
160 | isExactIn("a", letters)
161 | 
162 | ## Bugs
163 | isExactIn("a", c("abc", letters))
164 | isExactIn(c("a", "z"), c("abc", letters))
165 | ```
166 | 
167 | ### Solution
168 | 
169 | ```{r}
170 | ## Unit test:
171 | library("RUnit")
172 | test_isExactIn <- function() {
173 |     checkIdentical("a", isExactIn("a", letters))
174 |     checkIdentical("a", isExactIn("a", c("abc", letters)))
175 |     checkIdentical(c("a", "z"), isExactIn(c("a", "z"), c("abc", letters)))
176 | }
177 | 
178 | test_isExactIn()
179 | 
180 | ## updated function:
181 | isExactIn <- function(x, y)
182 |     x[x %in% y]
183 | 
184 | test_isExactIn()
185 | ```
186 | 
187 | ## If conditions with length > 1
188 | 
189 | ### Problem
190 | 
191 | If `x` is greater than `y`, we want the difference of their
192 | squares. Otherwise, we want the sum.
193 | 
194 | ```{r}
195 | ifcond <- function(x, y) {
196 |     if (x > y) {
197 |         ans <- x*x - y*y
198 |     } else {
199 |         ans <- x*x + y*y
200 |     } 
201 |     ans
202 | }
203 | 
204 | ## Expected
205 | ifcond(3, 2)
206 | ifcond(2, 2)
207 | ifcond(1, 2)
208 | 
209 | ## Bug!
210 | ifcond(3:1, c(2, 2, 2))
211 | ```
212 | 
213 | ### Solution
214 | 
215 | ```{r}
216 | ## Unit test:
217 | library("RUnit")
218 | test_ifcond <- function() {
219 |     checkIdentical(5, ifcond(3, 2))
220 |     checkIdentical(8, ifcond(2, 2))
221 |     checkIdentical(5, ifcond(1, 2))
222 |     checkIdentical(c(5, 8, 5), ifcond(3:1, c(2, 2, 2)))
223 | }
224 | 
225 | test_ifcond()
226 | 
227 | ## updated function:
228 | ifcond <- function(x, y)
229 |     ifelse(x > y, x*x - y*y, x*x + y*y)
230 | 
231 | test_ifcond()
232 | ```
233 | 
234 | ## Know your inputs
235 | 
236 | ### Problem
237 | 
238 | Calculate the euclidean distance between a single point and a set of
239 | other points.
240 | 
241 | ```{r}
242 | ## Example
243 | distances <- function(point, pointVec) {
244 |     x <- point[1]
245 |     y <- point[2]
246 |     xVec <- pointVec[,1]
247 |     yVec <- pointVec[,2]
248 |     sqrt((xVec - x)^2 + (yVec - y)^2)
249 | }
250 | 
251 | ## Expected
252 | x <- rnorm(5)
253 | y <- rnorm(5)
254 | 
255 | (m <- cbind(x, y))
256 | (p <- m[1, ])
257 | 
258 | distances(p, m)
259 | 
260 | ## Bug!
261 | (dd <- data.frame(x, y))
262 | (q <- dd[1, ])
263 | 
264 | distances(q, dd)
265 | ```
266 | 
267 | ### Solution
268 | 
269 | ```{r}
270 | ## Unit test:
271 | library("RUnit")
272 | test_distances <- function() {
273 |     x <- y <- c(0, 1, 2)
274 |     m <- cbind(x, y)
275 |     p <- m[1, ]
276 |     dd <- data.frame(x, y)
277 |     q <- dd[1, ]
278 |     expct <- c(0, sqrt(c(2, 8)))
279 |     checkIdentical(expct, distances(p, m))
280 |     checkIdentical(expct, distances(q, dd))
281 | }
282 | 
283 | test_distances()
284 | 
285 | ## updated function
286 | distances <- function(point, pointVec) {
287 |     point <- as.numeric(point)
288 |     x <- point[1]
289 |     y <- point[2]
290 |     xVec <- pointVec[,1]
291 |     yVec <- pointVec[,2]
292 |     dist <- sqrt((xVec - x)^2 + (yVec - y)^2)
293 |     return(dist)
294 | }
295 | 
296 | test_distances()
297 | ```
298 | 
299 | ## Iterate on 0 length
300 | 
301 | ### Problem
302 | 
303 | Calculate the square root of the absolute value of a set of numbers.
304 | 
305 | ```{r}
306 | sqrtabs <- function(x) {
307 |     v <- abs(x)
308 |     sapply(1:length(v), function(i) sqrt(v[i]))
309 | }
310 | 
311 | ## Expected
312 | all(sqrtabs(c(-4, 0, 4)) == c(2, 0, 2))
313 | 
314 | ## Bug!
315 | sqrtabs(numeric())
316 | ```
317 | 
318 | ### Solution
319 | 
320 | ```{r}
321 | ## Unit test:
322 | library(RUnit)
323 | test_sqrtabs <- function() {
324 |     checkIdentical(c(2, 0, 2), sqrtabs(c(-4, 0, 4)))
325 |     checkIdentical(numeric(), sqrtabs(numeric()))
326 | }
327 | test_sqrtabs()
328 | 
329 | ## updated function:
330 | sqrtabs <- function(x) {
331 |   v <- abs(x)
332 |   sapply(seq_along(v), function(i) sqrt(v[i]))
333 | }
334 | test_sqrtabs()                          # nope!
335 | 
336 | sqrtabs <- function(x) {
337 |   v <- abs(x)
338 |   vapply(seq_along(v), function(i) sqrt(v[i]), 0)
339 | }
340 | test_sqrtabs()                          # yes!
341 | ```
342 | 
343 | # Unit testing in a package 
344 | 
345 | 
346 | ## In a package
347 | 
348 | 1. Create a directory `./mypackage/tests`.
349 | 2. Create the `testthat.R` file
350 | 
351 | ```r
352 | library("testthat")
353 | library("mypackage")
354 | test_check("sequences")
355 | ```
356 | 
357 | 3. Create a sub-directory `./mypackage/tests/testthat` and include as
358 |    many unit test files as desired that are named with the `test_`
359 |    prefix and contain unit tests.
360 | 
361 | 4. Suggest the unit testing package in your `DESCRIPTION` file:
362 | 
363 | ```
364 | Suggests: testthat
365 | ```
366 | 
367 | ## Example from the `sequences` package
368 | 
369 | From the `./sequences/tests/testthat/test_sequences.R` file:
370 | 
371 | ### Object creation and validity
372 | 
373 | We have a fasta file and the corresponding `DnaSeq` object.
374 | 
375 | 1. Let's make sure that the `DnaSeq` instance is valid, as changes in
376 |    the class definition might have altered its validity.
377 | 
378 | 2. Let's verify that `readFasta` regenerates and identical `DnaSeq`
379 |    object given the original fasta file.
380 | 
381 | ```r
382 | test_that("dnaseq validity", {
383 |   data(dnaseq)
384 |   expect_true(validObject(dnaseq))
385 | })
386 | 
387 | test_that("readFasta", {
388 |   ## loading _valid_ dnaseq
389 |   data(dnaseq)
390 |   ## reading fasta sequence
391 |   f <- dir(system.file("extdata",package="sequences"),pattern="fasta",full.names=TRUE)
392 |   xx <- readFasta(f[1])
393 |   expect_true(all.equal(xx, dnaseq))
394 | })
395 | ```
396 | 
397 | ### Multiple implementations
398 | 
399 | Let's check that the R, C and C++ (via `Rcpp`) give the same result
400 | 
401 | ```r
402 | test_that("ccpp code", {
403 |   gccountr <-
404 |     function(x) tabulate(factor(strsplit(x, "")[[1]]))
405 |   x <- "AACGACTACAGCATACTAC"
406 |   expect_true(identical(gccount(x), gccountr(x)))
407 |   expect_true(identical(gccount2(x), gccountr(x)))
408 | })
409 | ```
410 | 
411 | ## Exercise
412 | 
413 | Choose any data package of your choice and write a unit test that
414 | tests the validity of all the its data. 
415 | 
416 | Tips 
417 | 
418 | - To get all the data distributed with a package, use `data(package = "packageName")`
419 | 
420 | ```{r datatest0, eval=FALSE}
421 | library("pRolocdata")
422 | data(package = "pRolocdata")
423 | ```
424 | 
425 | - To test the validity of an object, use `validObject`
426 | 
427 | ```{r dataset1, echo=FALSE}
428 | suppressPackageStartupMessages(library("pRolocdata"))
429 | ```
430 | 
431 | ```{r datatest2, eval=FALSE, eval=TRUE}
432 | data(andy2011)
433 | validObject(andy2011)
434 | ```
435 | 
436 | - Using the `testthat` syntax, the actual test for that data set would be 
437 | 
438 | ```{r datatest3, eval=TRUE}
439 | library("testthat")
440 | expect_true(validObject(andy2011))
441 | ```
442 | 
443 | 
444 | ## Testing coverage in a package
445 | 
446 | The [covr](https://github.com/jimhester/covr) package:
447 | 
448 | ![package coverage](./figs/covr.png)
449 | 
450 | We can use `type="all"` to examine the coverage in unit tests, examples and vignettes. This can
451 | also be done interactively with Shiny:
452 | 
453 | ```{r, eval=FALSE}
454 | library(covr)
455 | coverage <- package_coverage("/path/to/package/source", type="all")
456 | shine(coverage)
457 | ```
458 | 
459 | [Coverage for all Bioconductor packages](https://codecov.io/github/Bioconductor-mirror).
460 | 


--------------------------------------------------------------------------------
/unittesting.md:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Unit testing"
  3 | author: "Laurent Gatto"
  4 | ---
  5 | 
  6 | These exercises were written by Martin Morgan and Laurent Gatto for a
  7 | [Bioconductor Developer Day workshop](http://bioconductor.org/help/course-materials/2013/BioC2013/developer-day-debug/).
  8 | 
  9 | # Introduction
 10 | 
 11 | > Whenever you are templted to type something into a print statement
 12 | > or a debugger expression, write it as a test insted -- Martin Fowler
 13 | 
 14 | **Why unit testing?**
 15 | 
 16 | - Writing code to test code;
 17 | - anticipate bugs, in particular for edge cases;
 18 | - anticipate disruptive updates;
 19 | - document and test observed bugs using specific tests.
 20 | 
 21 | Each section provides a function that supposedly works as expected,
 22 | but quickly proves to misbehave. The exercise aims at first writing
 23 | some dedicated testing functions that will identify the problems and
 24 | then update the function so that it passes the specific tests. This
 25 | practice is called unit testing and we use the RUnit package for
 26 | this.
 27 | 
 28 | See the
 29 | [Unit Testing How-To](http://bioconductor.org/developers/how-to/unitTesting-guidelines/)
 30 | guide for details on unit testing using the
 31 | [`RUnit`](http://cran.r-project.org/web/packages/RUnit/index.html)
 32 | package. The
 33 | [`testthat`](http://cran.r-project.org/web/packages/testthat/) is
 34 | another package that provides unit testing infrastructure. Both
 35 | packages can conveniently be used to automate unit testing within
 36 | package testing. 
 37 | 
 38 | # Example
 39 | 
 40 | ## Subsetting
 41 | 
 42 | ### Problem
 43 | 
 44 | This function should return the elements of `x` that are in `y`.
 45 | 
 46 | 
 47 | ```r
 48 | ## Example
 49 | isIn <- function(x, y) {
 50 |     sel <- match(x, y)
 51 |     y[sel]
 52 | }
 53 | 
 54 | ## Expected
 55 | x <- sample(LETTERS, 5)
 56 | isIn(x, LETTERS)
 57 | ```
 58 | 
 59 | ```
 60 | ## [1] "I" "K" "S" "Y" "W"
 61 | ```
 62 | But
 63 | 
 64 | 
 65 | ```r
 66 | ## Bug!
 67 | isIn(c(x, "a"), LETTERS)
 68 | ```
 69 | 
 70 | ```
 71 | ## [1] "I" "K" "S" "Y" "W" NA
 72 | ```
 73 | 
 74 | ### Solution
 75 | 
 76 | Write a unit test that demonstrates the issue
 77 | 
 78 | 
 79 | ```r
 80 | ## Unit test:
 81 | library("RUnit")
 82 | ```
 83 | 
 84 | ```
 85 | ## Loading required package: methods
 86 | ```
 87 | 
 88 | ```r
 89 | test_isIn <- function() {
 90 |     x <- c("A", "B", "Z")
 91 |     checkIdentical(x, isIn(x, LETTERS))
 92 |     checkIdentical(x, isIn(c(x, "a"), LETTERS))
 93 | 
 94 | }
 95 | 
 96 | test_isIn()
 97 | ```
 98 | 
 99 | ```
100 | ## Error in checkIdentical(x, isIn(c(x, "a"), LETTERS)): FALSE 
101 | ## 
102 | ```
103 | 
104 | Update the buggy function until the unit test succeeds
105 | 
106 | 
107 | ```r
108 | ## updated function
109 | isIn <- function(x, y) {
110 |     sel <- x %in% y
111 |     x[sel]
112 | }
113 | 
114 | test_isIn() ## the bug is fixed and monitored
115 | ```
116 | 
117 | ```
118 | ## [1] TRUE
119 | ```
120 | 
121 | ## The `testthat` syntax
122 | 
123 | `expect_that(object_or_expression, condition)` with conditions
124 | - equals: `expect_that(1+2,equals(3))` or `expect_equal(1+2,3)`
125 | - gives warning: `expect_that(warning("a")`, `gives_warning())`
126 | - is a: `expect_that(1, is_a("numeric"))` or `expect_is(1,"numeric")`
127 | - is true: `expect_that(2 == 2, is_true())` or `expect_true(2==2)`
128 | - matches: `expect_that("Testing is fun", matches("fun"))` or `expect_match("Testing is fun", "f.n")`
129 | - takes less: `than expect_that(Sys.sleep(1), takes_less_than(3))`
130 | 
131 | and
132 | 
133 | ```r
134 | test_that("description", {
135 |     a <- foo()
136 |     b <- bar()
137 |     expect_equal(a, b)
138 | })
139 | ```
140 | 
141 | ## Batch unit testing
142 | 
143 | ```r
144 | library("testthat")
145 | test_dir("./unittests/")
146 | test_file("./unittests/test_foo.R")
147 | ```
148 | 
149 | # Exercises
150 | 
151 | ## Column means
152 | 
153 | ## Problem
154 | 
155 | The `col_means` function computes the means of all numeric columns in
156 | a data frame (example from *Advanced R*, to illustrate defensive
157 | programming).
158 | 
159 | 
160 | ```r
161 | col_means <- function(df) {
162 |   numeric <- sapply(df, is.numeric)
163 |   numeric_cols <- df[, numeric]
164 |   data.frame(lapply(numeric_cols, mean))
165 | }
166 | 
167 | ## Expected
168 | col_means(mtcars)
169 | ```
170 | 
171 | ```
172 | ##        mpg    cyl     disp       hp     drat      wt     qsec     vs
173 | ## 1 20.09062 6.1875 230.7219 146.6875 3.596563 3.21725 17.84875 0.4375
174 | ##        am   gear   carb
175 | ## 1 0.40625 3.6875 2.8125
176 | ```
177 | 
178 | ```r
179 | ## Bugs
180 | col_means(mtcars[, "mpg"])
181 | ```
182 | 
183 | ```
184 | ## Error in df[, numeric]: incorrect number of dimensions
185 | ```
186 | 
187 | ```r
188 | col_means(mtcars[, "mpg", drop=FALSE])
189 | ```
190 | 
191 | ```
192 | ##   X21 X21.1 X22.8 X21.4 X18.7 X18.1 X14.3 X24.4 X22.8.1 X19.2 X17.8 X16.4
193 | ## 1  21    21  22.8  21.4  18.7  18.1  14.3  24.4    22.8  19.2  17.8  16.4
194 | ##   X17.3 X15.2 X10.4 X10.4.1 X14.7 X32.4 X30.4 X33.9 X21.5 X15.5 X15.2.1
195 | ## 1  17.3  15.2  10.4    10.4  14.7  32.4  30.4  33.9  21.5  15.5    15.2
196 | ##   X13.3 X19.2.1 X27.3 X26 X30.4.1 X15.8 X19.7 X15 X21.4.1
197 | ## 1  13.3    19.2  27.3  26    30.4  15.8  19.7  15    21.4
198 | ```
199 | 
200 | ```r
201 | col_means(mtcars[, 0])
202 | ```
203 | 
204 | ```
205 | ## Error in .subset(x, j): invalid subscript type 'list'
206 | ```
207 | 
208 | ```r
209 | col_means(mtcars[0, ])
210 | ```
211 | 
212 | ```
213 | ##   mpg cyl disp  hp drat  wt qsec  vs  am gear carb
214 | ## 1 NaN NaN  NaN NaN  NaN NaN  NaN NaN NaN  NaN  NaN
215 | ```
216 | 
217 | ```r
218 | col_means(as.list(mtcars))
219 | ```
220 | 
221 | ```
222 | ## Error in df[, numeric]: incorrect number of dimensions
223 | ```
224 | 
225 | ## Character matching
226 | 
227 | ### Problem
228 | 
229 | What are the exact matches of `x` in `y`?
230 | 
231 | 
232 | ```r
233 | isExactIn <- function(x, y)
234 |     y[grep(x, y)]
235 | 
236 | ## Expected
237 | isExactIn("a", letters)
238 | ```
239 | 
240 | ```
241 | ## [1] "a"
242 | ```
243 | 
244 | ```r
245 | ## Bugs
246 | isExactIn("a", c("abc", letters))
247 | ```
248 | 
249 | ```
250 | ## [1] "abc" "a"
251 | ```
252 | 
253 | ```r
254 | isExactIn(c("a", "z"), c("abc", letters))
255 | ```
256 | 
257 | ```
258 | ## Warning in grep(x, y): argument 'pattern' has length > 1 and only the first
259 | ## element will be used
260 | ```
261 | 
262 | ```
263 | ## [1] "abc" "a"
264 | ```
265 | 
266 | ### Solution
267 | 
268 | 
269 | ```r
270 | ## Unit test:
271 | library("RUnit")
272 | test_isExactIn <- function() {
273 |     checkIdentical("a", isExactIn("a", letters))
274 |     checkIdentical("a", isExactIn("a", c("abc", letters)))
275 |     checkIdentical(c("a", "z"), isExactIn(c("a", "z"), c("abc", letters)))
276 | }
277 | 
278 | test_isExactIn()
279 | ```
280 | 
281 | ```
282 | ## Error in checkIdentical("a", isExactIn("a", c("abc", letters))): FALSE 
283 | ## 
284 | ```
285 | 
286 | ```r
287 | ## updated function:
288 | isExactIn <- function(x, y)
289 |     x[x %in% y]
290 | 
291 | test_isExactIn()
292 | ```
293 | 
294 | ```
295 | ## [1] TRUE
296 | ```
297 | 
298 | ## If conditions with length > 1
299 | 
300 | ### Problem
301 | 
302 | If `x` is greater than `y`, we want the difference of their
303 | squares. Otherwise, we want the sum.
304 | 
305 | 
306 | ```r
307 | ifcond <- function(x, y) {
308 |     if (x > y) {
309 |         ans <- x*x - y*y
310 |     } else {
311 |         ans <- x*x + y*y
312 |     } 
313 |     ans
314 | }
315 | 
316 | ## Expected
317 | ifcond(3, 2)
318 | ```
319 | 
320 | ```
321 | ## [1] 5
322 | ```
323 | 
324 | ```r
325 | ifcond(2, 2)
326 | ```
327 | 
328 | ```
329 | ## [1] 8
330 | ```
331 | 
332 | ```r
333 | ifcond(1, 2)
334 | ```
335 | 
336 | ```
337 | ## [1] 5
338 | ```
339 | 
340 | ```r
341 | ## Bug!
342 | ifcond(3:1, c(2, 2, 2))
343 | ```
344 | 
345 | ```
346 | ## Warning in if (x > y) {: the condition has length > 1 and only the first
347 | ## element will be used
348 | ```
349 | 
350 | ```
351 | ## [1]  5  0 -3
352 | ```
353 | 
354 | ### Solution
355 | 
356 | 
357 | ```r
358 | ## Unit test:
359 | library("RUnit")
360 | test_ifcond <- function() {
361 |     checkIdentical(5, ifcond(3, 2))
362 |     checkIdentical(8, ifcond(2, 2))
363 |     checkIdentical(5, ifcond(1, 2))
364 |     checkIdentical(c(5, 8, 5), ifcond(3:1, c(2, 2, 2)))
365 | }
366 | 
367 | test_ifcond()
368 | ```
369 | 
370 | ```
371 | ## Warning in if (x > y) {: the condition has length > 1 and only the first
372 | ## element will be used
373 | ```
374 | 
375 | ```
376 | ## Error in checkIdentical(c(5, 8, 5), ifcond(3:1, c(2, 2, 2))): FALSE 
377 | ## 
378 | ```
379 | 
380 | ```r
381 | ## updated function:
382 | ifcond <- function(x, y)
383 |     ifelse(x > y, x*x - y*y, x*x + y*y)
384 | 
385 | test_ifcond()
386 | ```
387 | 
388 | ```
389 | ## [1] TRUE
390 | ```
391 | 
392 | ## Know your inputs
393 | 
394 | ### Problem
395 | 
396 | Calculate the euclidean distance between a single point and a set of
397 | other points.
398 | 
399 | 
400 | ```r
401 | ## Example
402 | distances <- function(point, pointVec) {
403 |     x <- point[1]
404 |     y <- point[2]
405 |     xVec <- pointVec[,1]
406 |     yVec <- pointVec[,2]
407 |     sqrt((xVec - x)^2 + (yVec - y)^2)
408 | }
409 | 
410 | ## Expected
411 | x <- rnorm(5)
412 | y <- rnorm(5)
413 | 
414 | (m <- cbind(x, y))
415 | ```
416 | 
417 | ```
418 | ##               x           y
419 | ## [1,] -3.1164934 -1.81741540
420 | ## [2,]  1.5741209 -0.05377479
421 | ## [3,] -0.6298675  0.68829382
422 | ## [4,] -1.3595685 -1.09019047
423 | ## [5,]  0.9666774  0.76398426
424 | ```
425 | 
426 | ```r
427 | (p <- m[1, ])
428 | ```
429 | 
430 | ```
431 | ##         x         y 
432 | ## -3.116493 -1.817415
433 | ```
434 | 
435 | ```r
436 | distances(p, m)
437 | ```
438 | 
439 | ```
440 | ## [1] 0.000000 5.011217 3.530140 1.901484 4.830725
441 | ```
442 | 
443 | ```r
444 | ## Bug!
445 | (dd <- data.frame(x, y))
446 | ```
447 | 
448 | ```
449 | ##            x           y
450 | ## 1 -3.1164934 -1.81741540
451 | ## 2  1.5741209 -0.05377479
452 | ## 3 -0.6298675  0.68829382
453 | ## 4 -1.3595685 -1.09019047
454 | ## 5  0.9666774  0.76398426
455 | ```
456 | 
457 | ```r
458 | (q <- dd[1, ])
459 | ```
460 | 
461 | ```
462 | ##           x         y
463 | ## 1 -3.116493 -1.817415
464 | ```
465 | 
466 | ```r
467 | distances(q, dd)
468 | ```
469 | 
470 | ```
471 | ##   x
472 | ## 1 0
473 | ```
474 | 
475 | ### Solution
476 | 
477 | 
478 | ```r
479 | ## Unit test:
480 | library("RUnit")
481 | test_distances <- function() {
482 |     x <- y <- c(0, 1, 2)
483 |     m <- cbind(x, y)
484 |     p <- m[1, ]
485 |     dd <- data.frame(x, y)
486 |     q <- dd[1, ]
487 |     expct <- c(0, sqrt(c(2, 8)))
488 |     checkIdentical(expct, distances(p, m))
489 |     checkIdentical(expct, distances(q, dd))
490 | }
491 | 
492 | test_distances()
493 | ```
494 | 
495 | ```
496 | ## Error in checkIdentical(expct, distances(q, dd)): FALSE 
497 | ## 
498 | ```
499 | 
500 | ```r
501 | ## updated function
502 | distances <- function(point, pointVec) {
503 |     point <- as.numeric(point)
504 |     x <- point[1]
505 |     y <- point[2]
506 |     xVec <- pointVec[,1]
507 |     yVec <- pointVec[,2]
508 |     dist <- sqrt((xVec - x)^2 + (yVec - y)^2)
509 |     return(dist)
510 | }
511 | 
512 | test_distances()
513 | ```
514 | 
515 | ```
516 | ## [1] TRUE
517 | ```
518 | 
519 | ## Iterate on 0 length
520 | 
521 | ### Problem
522 | 
523 | Calculate the square root of the absolute value of a set of numbers.
524 | 
525 | 
526 | ```r
527 | sqrtabs <- function(x) {
528 |     v <- abs(x)
529 |     sapply(1:length(v), function(i) sqrt(v[i]))
530 | }
531 | 
532 | ## Expected
533 | all(sqrtabs(c(-4, 0, 4)) == c(2, 0, 2))
534 | ```
535 | 
536 | ```
537 | ## [1] TRUE
538 | ```
539 | 
540 | ```r
541 | ## Bug!
542 | sqrtabs(numeric())
543 | ```
544 | 
545 | ```
546 | ## [[1]]
547 | ## [1] NA
548 | ## 
549 | ## [[2]]
550 | ## numeric(0)
551 | ```
552 | 
553 | ### Solution
554 | 
555 | 
556 | ```r
557 | ## Unit test:
558 | library(RUnit)
559 | test_sqrtabs <- function() {
560 |     checkIdentical(c(2, 0, 2), sqrtabs(c(-4, 0, 4)))
561 |     checkIdentical(numeric(), sqrtabs(numeric()))
562 | }
563 | test_sqrtabs()
564 | ```
565 | 
566 | ```
567 | ## Error in checkIdentical(numeric(), sqrtabs(numeric())): FALSE 
568 | ## 
569 | ```
570 | 
571 | ```r
572 | ## updated function:
573 | sqrtabs <- function(x) {
574 |   v <- abs(x)
575 |   sapply(seq_along(v), function(i) sqrt(v[i]))
576 | }
577 | test_sqrtabs()                          # nope!
578 | ```
579 | 
580 | ```
581 | ## Error in checkIdentical(numeric(), sqrtabs(numeric())): FALSE 
582 | ## 
583 | ```
584 | 
585 | ```r
586 | sqrtabs <- function(x) {
587 |   v <- abs(x)
588 |   vapply(seq_along(v), function(i) sqrt(v[i]), 0)
589 | }
590 | test_sqrtabs()                          # yes!
591 | ```
592 | 
593 | ```
594 | ## [1] TRUE
595 | ```
596 | 
597 | # Unit testing in a package 
598 | 
599 | 
600 | ## In a package
601 | 
602 | 1. Create a directory `./mypackage/tests`.
603 | 2. Create the `testthat.R` file
604 | 
605 | ```r
606 | library("testthat")
607 | library("mypackage")
608 | test_check("sequences")
609 | ```
610 | 
611 | 3. Create a sub-directory `./mypackage/tests/testthat` and include as
612 |    many unit test files as desired that are named with the `test_`
613 |    prefix and contain unit tests.
614 | 
615 | 4. Suggest the unit testing package in your `DESCRIPTION` file:
616 | 
617 | ```
618 | Suggests: testthat
619 | ```
620 | 
621 | ## Example from the `sequences` package
622 | 
623 | From the `./sequences/tests/testthat/test_sequences.R` file:
624 | 
625 | ### Object creation and validity
626 | 
627 | We have a fasta file and the corresponding `DnaSeq` object.
628 | 
629 | 1. Let's make sure that the `DnaSeq` instance is valid, as changes in
630 |    the class definition might have altered its validity.
631 | 
632 | 2. Let's verify that `readFasta` regenerates and identical `DnaSeq`
633 |    object given the original fasta file.
634 | 
635 | ```r
636 | test_that("dnaseq validity", {
637 |   data(dnaseq)
638 |   expect_true(validObject(dnaseq))
639 | })
640 | 
641 | test_that("readFasta", {
642 |   ## loading _valid_ dnaseq
643 |   data(dnaseq)
644 |   ## reading fasta sequence
645 |   f <- dir(system.file("extdata",package="sequences"),pattern="fasta",full.names=TRUE)
646 |   xx <- readFasta(f[1])
647 |   expect_true(all.equal(xx, dnaseq))
648 | })
649 | ```
650 | 
651 | ### Multiple implementations
652 | 
653 | Let's check that the R, C and C++ (via `Rcpp`) give the same result
654 | 
655 | ```r
656 | test_that("ccpp code", {
657 |   gccountr <-
658 |     function(x) tabulate(factor(strsplit(x, "")[[1]]))
659 |   x <- "AACGACTACAGCATACTAC"
660 |   expect_true(identical(gccount(x), gccountr(x)))
661 |   expect_true(identical(gccount2(x), gccountr(x)))
662 | })
663 | ```
664 | 
665 | ## Exercise
666 | 
667 | Choose any data package of your choice and write a unit test that
668 | tests the validity of all the its data. 
669 | 
670 | Tips 
671 | 
672 | - To get all the data distributed with a package, use `data(package = "packageName")`
673 | 
674 | 
675 | ```r
676 | library("pRolocdata")
677 | data(package = "pRolocdata")
678 | ```
679 | 
680 | - To test the validity of an object, use `validObject`
681 | 
682 | 
683 | 
684 | 
685 | ```r
686 | data(andy2011)
687 | validObject(andy2011)
688 | ```
689 | 
690 | ```
691 | ## [1] TRUE
692 | ```
693 | 
694 | - Using the `testthat` syntax, the actual test for that data set would be 
695 | 
696 | 
697 | ```r
698 | library("testthat")
699 | expect_true(validObject(andy2011))
700 | ```
701 | 
702 | 
703 | ## Testing coverage in a package
704 | 
705 | The [covr](https://github.com/jimhester/covr) package:
706 | 
707 | ![package coverage](./figs/covr.png)
708 | 
709 | We can use `type="all"` to examine the coverage in unit tests, examples and vignettes. This can
710 | also be done interactively with Shiny:
711 | 
712 | 
713 | ```r
714 | library(covr)
715 | coverage <- package_coverage("/path/to/package/source", type="all")
716 | shine(coverage)
717 | ```
718 | 
719 | [Coverage for all Bioconductor packages](https://codecov.io/github/Bioconductor-mirror).
720 | 


--------------------------------------------------------------------------------