├── .gitignore
├── ENCFF231UNV.bed.gz
├── GenomicRanges.png
├── README.md
├── boot_and_match_examples.R
├── boot_and_match_script.R
├── code_examples.R
├── czi.png
├── data.tsv
├── dplyr.png
├── narrowpeak.png
├── non-tidy.numbers
├── non-tidy.png
├── nullranges.png
├── plyranges.png
├── si.rda
├── tidy-enrichment.pdf
├── tidy-genomics-talk.Rmd
├── tidy-genomics-talk.pdf
├── tidyomics1.png
├── tidyomics2.png
├── tt_roadmap.png
└── woodjoin.png


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pdf
2 | *.key
3 | tidy-genomics-talk_cache/
4 | .DS_store
5 | 


--------------------------------------------------------------------------------
/ENCFF231UNV.bed.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/ENCFF231UNV.bed.gz


--------------------------------------------------------------------------------
/GenomicRanges.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/GenomicRanges.png


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Tidy genomics talk
2 | 
3 | Two presentations in this repo:
4 | 
5 | 1. `tidy-genomics-talk.pdf` with Rmd
6 | 2. `tidy-enrichment.pdf` with code in `boot_and_match_examples.R`
7 | 
8 | 


--------------------------------------------------------------------------------
/boot_and_match_examples.R:
--------------------------------------------------------------------------------
  1 | ##########################################
  2 | ## visualizing genes and other features ##
  3 | ##########################################
  4 | 
  5 | library(TxDb.Hsapiens.UCSC.hg38.knownGene)
  6 | library(org.Hs.eg.db)
  7 | txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
  8 | g <- genes(txdb)
  9 | 
 10 | # load plyranges (after the annotation packages), then add symbols to genes
 11 | suppressPackageStartupMessages(library(plyranges))
 12 | 
 13 | g <- g %>%
 14 |   mutate(symbol = mapIds(org.Hs.eg.db, gene_id,
 15 |                          "SYMBOL", "ENTREZID")) # look up SYMBOL using gene_id as the ENTREZID key
 16 | 
 17 | # for visualizing, restrict to a range of chr4
 18 | chrom <- "chr4"
 19 | rng <- c(98.8e6, 99.8e6) # where we will zoom into, 1 Mb
 20 | rng_big <- c(90e6, 110e6) # where features live, 20 Mb
 21 | 
 22 | # a single range covering the zoom window (start is rng[1]+1, end is rng[2])
 23 | r <- data.frame(seqnames=chrom, start=rng[1]+1, end=rng[2]) %>%
 24 |   as_granges()
 25 | 
 26 | # just look at the genes in this small range
 27 | g <- g %>%
 28 |   filter_by_overlaps(r) %>%
 29 |   sort() %>%
 30 |   arrange(strand)
 31 | 
 32 | source("boot_and_match_script.R") # defines plotSomeGenes, makeClusterRanges, shuffle, makeSegmentation, ...
 33 | 
 34 | suppressPackageStartupMessages(library(plotgardener))
 35 | 
 36 | plotSomeGenes(chrom, rng, showGuides=FALSE)
 37 | 
 38 | # make n features in clumps of ~lambda
 39 | seqlens <- seqlengths(g)[chrom] # length of chr4, named by chromosome
 40 | set.seed(5) # makeClusterRanges draws random numbers
 41 | p <- makeClusterRanges(chrom, rng_big, n=300, lambda=5, seqlens)
 42 | 
 43 | # define some plotting parameters for plotgardener,
 44 | # e.g. a palette for feature 'score':
 45 | pal <- colorRampPalette(c("dodgerblue2", "firebrick2"))
 46 | 
 47 | # shared genomic location, width & height, x position, fill, etc.
 48 | params <- pgParams(
 49 |   chrom=chrom, chromstart=rng[1], chromend=rng[2],
 50 |   width=5.5, height=1, x=.25,
 51 |   fill=colorby("score", palette=pal),
 52 |   order="random", baseline=TRUE, 
 53 | )
 54 | 
 55 | # shared parameters for text labels
 56 | textparams <- pgParams(x=.1, rot=90, just="left")
 57 | 
 58 | # plot the original GRanges, e.g. suppose ATAC-seq peaks
 59 | plotRanges(p, params=params, y=2)
 60 | plotText("original", params=textparams, y=3)
 61 | 
 62 | # uniform shuffling
 63 | shuf <- shuffle(p, rng_big) # new random start for every range of p, within rng_big
 64 | 
 65 | # plot shuffled ranges
 66 | plotRanges(shuf, params=params, y=1)
 67 | plotText("shuffled", params=textparams, y=2)
 68 | 
 69 | # segmented block bootstrapping
 70 | # blocks of 100 kb, not proportional to segment length
 71 | library(nullranges)
 72 | seg <- makeSegmentation(chrom, rng_big, seqlens) # 3 segments: state 2 inside rng_big, state 1 on either side
 73 | set.seed(1)
 74 | boot <- bootRanges(p, blockLength=1e5, R=1,
 75 |                    seg=seg, proportionLength=FALSE)
 76 | 
 77 | # plot bootstrapped ranges
 78 | plotRanges(boot, params=params, y=0)
 79 | plotText("boot", params=textparams, y=1)
 80 | 
 81 | # for genome-wide analysis, consider excluding gaps, repeats, etc.
 82 | # see https://dozmorovlab.github.io/excluderanges for details
 83 | 
 84 | #library(AnnotationHub)
 85 | #ah <- AnnotationHub()
 86 | #query(ah, "excluderanges")
 87 | 
 88 | ###########################
 89 | ## bootstrapping example ##
 90 | ###########################
 91 | 
 92 | # first, just use overlap counts as the statistic (two equivalent ways)
 93 | 
 94 | g %>%
 95 |   mutate(n_overlaps = count_overlaps(., p))
 96 | 
 97 | g %>% 
 98 |   join_overlap_left(p) %>%
 99 |   group_by(symbol) %>%
100 |   summarize(n_overlaps = sum(!is.na(id))) # count only rows that carry a feature id
101 | 
102 | # working with metadata
103 | 
104 | g %>% 
105 |   join_overlap_left(p) %>%
106 |   group_by(symbol) %>% # per gene symbol
107 |   summarize(sum_score = sum(score))
108 | 
109 | # simple violin plot
110 | 
111 | library(tibble)
112 | library(ggplot2)
113 | # inner instead of left: leaves out no-overlap genes
114 | g %>% 
115 |   join_overlap_inner(p) %>%
116 |   mutate(type = "original") %>%
117 |   group_by(symbol, type) %>%
118 |   summarize(sum_score = sum(score)) %>%
119 |   as_tibble() %>%
120 |   ggplot(aes(type, sum_score)) +
121 |   geom_violin() +
122 |   geom_point()
123 | 
124 | # adding more draws from the distribution for simulated features
125 | 
126 | niter <- 50
127 | sim_list <- replicate(niter, {
128 |   makeClusterRanges(chrom, rng_big, n=300, lambda=5, seqlens)
129 | })
130 | sim_long <- bind_ranges(sim_list, .id="iter") # one long GRanges, `iter` records the draw
131 | 
132 | g %>% 
133 |   join_overlap_inner(sim_long) %>%
134 |   mutate(type = "original") %>%
135 |   group_by(symbol, iter, type) %>%
136 |   summarize(sum_score = sum(score)) %>%
137 |   as_tibble() %>%
138 |   ggplot(aes(type, sum_score)) +
139 |   geom_violin() +
140 |   geom_jitter()
141 | 
142 | # shuffling and bootstrapping multiple times
143 | 
144 | shuf_list <- replicate(niter, shuffle(p, rng_big))
145 | shuf_long <- bind_ranges(shuf_list, .id="iter")
146 | 
147 | boot_long <- bootRanges(p, blockLength=1e5, R=niter,
148 |                    seg=seg, proportionLength=FALSE)
149 | 
150 | # bind together
151 | 
152 | lvls <- c("sim","shuffle","boot") # fixes the order of `type` on the plot axis
153 | all <- bind_ranges(sim=sim_long, shuffle=shuf_long,
154 |                    boot=boot_long, .id="type") %>%
155 |   mutate(type = factor(type, levels=lvls))
156 | 
157 | # show table of features per iteration
158 | head(table(all$iter, all$type))
159 | 
160 | # final plot of distributions:
161 | # multiple draws, shuffling one instance, bootstrapping one instance
162 | 
163 | g %>% 
164 |   join_overlap_inner(all) %>%
165 |   group_by(symbol, iter, type) %>%
166 |   summarize(sum_score = sum(score)) %>%
167 |   as_tibble() %>%
168 |   ggplot(aes(type, sum_score)) +
169 |   geom_violin() +
170 |   geom_jitter(width=.25, alpha=.15)
172 | ######################
173 | ## matching example ##
174 | ######################
175 | 
176 | # start with gene plot again
177 | plotSomeGenes(chrom, rng, showGuides=FALSE)
178 | 
179 | # make some features with particular distribution
180 | # 1) near gene TSS, 2) tend to have large 'score' values
181 | set.seed(1) # makeFocalFeatures draws random shifts and scores
182 | focal <- makeFocalFeatures(g, chrom, rng)
183 | 
184 | # palette for 'score' (blue, green, yellow, red), mapped over scores 1 to 5 below
185 | pal <- colorRampPalette(c("blue","green","yellow","red"))
186 | 
187 | # new plot parameters
188 | params <- pgParams(
189 |   chrom=chrom, chromstart=rng[1], chromend=rng[2],
190 |   width=5.5, height=1, x=.25,
191 |   fill=colorby("score", palette=pal, range=c(1,5)),
192 |   order="random", baseline=TRUE, 
193 | )
194 | 
195 | # plot the original 'focal' GRanges
196 | plotRanges(focal, params=params, y=2)
197 | plotText("focal", params=textparams, y=3)
198 | 
199 | # make a 'pool' of features to select from
200 | pool <- makePool(5000, chrom, rng, seqlens)
201 | 
202 | # plot the pool (subset)
203 | plotRanges(pool[1:200], params=params, y=1)
204 | plotText("pool", params=textparams, y=2)
205 | 
206 | # add another feature: distance to nearest TSS
207 | tss <- g %>% anchor_5p() %>% mutate(width=1)
208 | 
209 | both <- bind_ranges(focal = focal, pool = pool, .id="type") %>%
210 |   add_nearest_distance(tss) %>%
211 |   mutate(log10dist = log10(distance + 1000)) # the offset keeps a distance of 0 finite on the log scale
212 | 
213 | hist(both$log10dist)
214 | 
215 | m <- both %>% { # braces let `.` be used in both filter() calls
216 |   matchRanges(filter(., type=="focal"),
217 |               filter(., type=="pool"),
218 |               covar=~score + log10dist,
219 |               method="nearest", replace=TRUE)
220 | }
221 | 
222 | library(patchwork)
223 | plotCovariate(m, covar="score") +
224 |   plotCovariate(m, covar="log10dist")
225 | 
226 | # plot the matched set (need to replot the others)
227 | set.seed(1) # presumably so order="random" in params lays ranges out reproducibly - TODO confirm
228 | plotSomeGenes(chrom, rng, showGuides=FALSE)
229 | plotRanges(focal, params=params, y=2)
230 | plotText("focal", params=textparams, y=3)
231 | plotRanges(pool[1:200], params=params, y=1)
232 | plotText("pool", params=textparams, y=2)
233 | plotRanges(matched(m), params=params, y=0)
234 | plotText("matched", params=textparams, y=1)
235 | 


--------------------------------------------------------------------------------
/boot_and_match_script.R:
--------------------------------------------------------------------------------
 1 | plotSomeGenes <- function(chrom, rng, showGuides) { # fresh 6 x 4 page with a gene track for chrom, rng[1] to rng[2]
 2 |   pageCreate(width = 6, height = 4, showGuides = showGuides)
 3 |   gene_cols <- c("dodgerblue", "navy") # shared by the gene fill and the gene labels
 4 |   region <- pgParams(chrom = chrom, chromstart = rng[1], chromend = rng[2], width = 5.5)
 5 |   gene_track <- plotGenes(params = region, x = 0.25, y = 3, height = 0.75, fill = gene_cols, fontcolor = gene_cols)
 6 |   annoGenomeLabel(plot = gene_track, x = 0.25, y = 3.75, scale = "Mb") # Mb axis placed at the bottom edge of the gene track
 7 | }
 8 | 
 9 | makeClusterRanges <- function(chrom, rng, n, lambda, seqlens) { # simulate clustered, scored ranges on `chrom`
10 |   niter <- n/lambda # number of clusters, so about n ranges in total
11 |   out <- lapply(seq_len(niter), function(i) { # one data.frame per cluster
12 |     nranges <- max(rpois(1, lambda), 1) # Poisson cluster size, at least 1
13 |     pos <- round(runif(1, rng[1], rng[2])) # cluster center, uniform within rng
14 |     mu <- rnorm(1, 0, 2) # cluster-level mean score
15 |     start <- pos + round(runif(nranges, -2e4, 2e4)) # starts within +/- 20 kb of the center
16 |     score <- rnorm(nranges, mu, .5) # per-range scores scattered around mu
17 |     data.frame(seqnames=chrom, start, width=1e4, score) # every range is 10 kb wide
18 |   })
19 |   gr <- do.call(rbind, out) %>%
20 |     as_granges() %>%
21 |     sort() %>%
22 |     mutate(id = seq_along(.)) # id is the position after sorting
23 |   seqlengths(gr) <- seqlens
24 |   gr
25 | }
26 | 
27 | shuffle <- function(gr, rng, width=1e4) { # give every range of `gr` a uniform random start within rng
28 |   new_pos <- round(runif(length(gr), rng[1], rng[2])) # one draw per range
29 |   # pass `width` itself: end = start + width would produce ranges of width + 1
30 |   data.frame(seqnames=seqnames(gr), start=new_pos, width=width,
31 |              score=gr$score, id=gr$id) %>% as_granges() # only score and id are carried over
32 | }
33 | 
34 | 
35 | makeSegmentation <- function(chrom, rng, seqlens) { # 3 segments on chrom: before, inside and after rng
36 |   # the pipeline is the last expression, so the GRanges is returned visibly (no unused local)
37 |   data.frame(seqnames=chrom, start=c(1, rng[1]+1, rng[2]+1),
38 |              end=c(rng[1], rng[2], seqlens), # seqlens closes the last segment
39 |              state=c(1, 2, 1)) %>% as_granges() # state 2 is the rng window, state 1 the two flanks
40 | }
41 | 
42 | makeFocalFeatures <- function(g, chrom, rng) { # NOTE(review): chrom and rng are not used in the body
43 |   tss <- g %>%
44 |     anchor_5p() %>%
45 |     mutate(width = 1e4) %>% # 10 kb range anchored at each gene's 5' end
46 |     select(-c(gene_id, symbol)) # drop the gene metadata columns
47 |   bind_ranges(replicate(3, tss)) %>% # three copies of every range
48 |     shift(round(runif(length(.), -1e4, 1e4))) %>% # move each copy by up to +/- 10 kb
49 |     mutate(score = runif(length(.), 3, 5)) %>% # scores uniform between 3 and 5
50 |     unname()
51 | }
52 | 
53 | makePool <- function(n, chrom, rng, seqlens) { # n candidate 10 kb ranges with uniform starts in rng
54 |   starts <- round(runif(n, rng[1], rng[2])) # starts are drawn before the scores
55 |   scores <- runif(n, 1, 5)
56 |   pool <- as_granges(data.frame(seqnames = chrom, start = starts, width = 1e4, score = scores))
57 |   seqlengths(pool) <- seqlens
58 |   pool
59 | }
60 | 


--------------------------------------------------------------------------------
/code_examples.R:
--------------------------------------------------------------------------------
  1 | # code examples in tidy format (plyranges) and base Bioconductor
  2 | # Michael Love
  3 | # July 12 2023
  4 | 
  5 | ###############
  6 | ## example 1 ##
  7 | ###############
  8 | 
  9 | # first example is from the plyranges paper Figure 3
 10 | # https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1597-8
 11 | # "an overlap and aggregate operation that returns the same result"
 12 | 
 13 | library(plyranges)
 14 | 
 15 | # create data in R rather than reading in BED files
 16 | # (no seed is set, so the simulated ranges differ between runs)
 17 | gwas <- data.frame(seqnames=1,
 18 |                    start=round(runif(100,0,100)),
 19 |                    width=1, rsID=paste0("rs",1:100)) %>%
 20 |   as_granges()
 21 | 
 22 | exons <- data.frame(seqnames=1,
 23 |                     start=round(runif(100,0,100)),
 24 |                     width=5, exonID=paste0("e",1:100)) %>%
 25 |   as_granges()
 26 | 
 27 | # tidy: number of distinct exons overlapping each GWAS SNP
 28 | 
 29 | res1 <- exons %>%
 30 |   join_overlap_inner(gwas) %>%
 31 |   group_by(rsID) %>%
 32 |   summarise(n = n_distinct(exonID))
 33 | 
 34 | # base bioc
 35 | 
 36 | hits <- findOverlaps(exons, gwas, ignore.strand = FALSE)
 37 | olap <- splitAsList(exons$exonID[queryHits(hits)], gwas$rsID[subjectHits(hits)]) # exon IDs grouped by rsID
 38 | n <- lengths(unique(olap)) # distinct exon IDs per rsID
 39 | res2 <- DataFrame(rsID = names(n), n = as.integer(n))
 40 | 
 41 | # compare the two results
 42 | all.equal(res1, res2)
 43 | 
 44 | ###############
 45 | ## example 2 ##
 46 | ###############
 47 | 
 48 | # distance from one set of features (5p) to nearest other set (center)
 49 | # group by the type of features and plot histogram
 50 | 
 51 | x <- data.frame(seqnames=1,
 52 |                 start=round(runif(100,0,1e4)),
 53 |                 width=round(runif(100,5,15))) %>%
 54 |   as_granges() %>%
 55 |   sort()
 56 | x <- x %>% # IDs and groups are assigned after sorting, so they follow genomic order
 57 |   mutate(xID = paste0("x",1:100),
 58 |          group = paste0("g",rep(1:2,each=50))) # g1 = first 50 sorted ranges, g2 = last 50
 59 | 
 60 | y <- data.frame(seqnames=1,
 61 |                 start=round(runif(100,0,1e4)),
 62 |                 width=round(runif(100,5,15))) %>%
 63 |   as_granges() %>%
 64 |   sort()
 65 | y <- y %>%
 66 |   mutate(yID=paste0("y",1:100))
 67 | 
 68 | library(tibble)
 69 | library(ggplot2)
 70 | 
 71 | # tidy
 72 | 
 73 | x %>%
 74 |   anchor_5p() %>%
 75 |   mutate(width=1) %>% # 1 bp range at the 5' end of x
 76 |   add_nearest_distance(y %>% anchor_center %>% mutate(width=1)) %>% # 1 bp range at the center of y
 77 |   as_tibble() %>%
 78 |   ggplot(aes(distance, group=group, fill=group)) +
 79 |   geom_histogram(position="dodge")
 80 | 
 81 | # base bioc
 82 | 
 83 | x_5p <- resize(x, width=1)
 84 | y_mid <- y - ifelse(width(y) %% 2 == 0, width(y)/2-.5, floor(width(y)/2)) # NOTE(review): intended to shrink y to its midpoint - confirm for even widths
 85 | hits <- distanceToNearest(x_5p, y_mid)
 86 | x$distance[queryHits(hits)] <- mcols(hits)$distance # store each distance on the x range it belongs to
 87 | df <- as.data.frame(mcols(x)[,c("group","distance")])
 88 | ggplot(df, aes(distance, group=group, fill=group)) +
 89 |   geom_histogram(position="dodge")
 90 | 
 91 | 
 92 | ###############
 93 | ## example 3 ##
 94 | ###############
 95 | 
 96 | # find disjoint regions within groups of features, filter to the overlapping pieces
 97 | 
 98 | # tidy: label each range with its 1000 bp tile, disjoin within tile, keep pieces with total > 1
 99 | 
100 | x %>%
101 |   join_overlap_inner(range(x) %>%
102 |                      tile_ranges(width=1000) %>%
103 |                      mutate(tile=seq_along(.))) %>%
104 |   group_by(tile) %>%
105 |   disjoin_ranges(total = n()) %>%
106 |   filter(total > 1)
107 | 
108 | # base bioc: the same steps, written out tile by tile
109 | 
110 | tiles <- tile(range(x), width=1000)[[1]]
111 | tiles$tile <- seq_along(tiles)
112 | hits <- findOverlaps(x, tiles)
113 | res <- lapply(seq_along(tiles), function(i) { # seq_along() is empty for zero tiles, 1:length() is not
114 |   x_sub <- x[queryHits(hits)[subjectHits(hits) == i]] # ranges of x overlapping tile i
115 |   d <- disjoin(x_sub)
116 |   cvg <- as(coverage(x_sub), "GRanges") # coverage as ranges; `cvg` avoids masking stats::cov
117 |   d[d %over% cvg[cvg$score > 1]] # keep disjoint pieces where coverage exceeds 1
118 | })
119 | do.call(c, res)
120 | 


--------------------------------------------------------------------------------
/czi.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/czi.png


--------------------------------------------------------------------------------
/data.tsv:
--------------------------------------------------------------------------------
 1 | drug	genotype	rep	outlier
 2 | 1	a	1	F
 3 | 1	a	2	F
 4 | 1	a	3	F
 5 | 2	a	1	F
 6 | 2	a	2	F
 7 | 2	a	3	F
 8 | 3	a	1	F
 9 | 3	a	2	F
10 | 3	a	3	T
11 | 1	b	1	F
12 | 1	b	2	T
13 | 1	b	3	F
14 | 2	b	1	F
15 | 2	b	2	F
16 | 2	b	3	F
17 | 3	b	1	F
18 | 3	b	2	F
19 | 3	b	3	F
20 | 


--------------------------------------------------------------------------------
/dplyr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/dplyr.png


--------------------------------------------------------------------------------
/narrowpeak.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/narrowpeak.png


--------------------------------------------------------------------------------
/non-tidy.numbers:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/non-tidy.numbers


--------------------------------------------------------------------------------
/non-tidy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/non-tidy.png


--------------------------------------------------------------------------------
/nullranges.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/nullranges.png


--------------------------------------------------------------------------------
/plyranges.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/plyranges.png


--------------------------------------------------------------------------------
/si.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/si.rda


--------------------------------------------------------------------------------
/tidy-enrichment.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/tidy-enrichment.pdf


--------------------------------------------------------------------------------
/tidy-genomics-talk.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Tidy Analysis of Genomic Data"
  3 | author: |
  4 |   | Michael Love
  5 |   | Dept of Genetics &
  6 |   | Dept of Biostatistics
  7 |   | UNC-Chapel Hill
  8 | date: "UVA ~ October 2023"
  9 | output: beamer_presentation
 10 | urlcolor: blue
 11 | ---
 12 | 
 13 | ```{r setup, echo=FALSE}
 14 | suppressPackageStartupMessages(library(tidyverse))
 15 | knitr::opts_chunk$set(cache = TRUE)
 16 | ```
 17 | 
 18 | # Data organization depends on purpose
 19 | 
 20 | ![](non-tidy.png)
 21 | 
 22 | # "Tidy data" is organized for programming
 23 | 
 24 | One row per observation, one column per variable
 25 | 
 26 | ```{r include=FALSE}
 27 | dat <- read_delim("data.tsv")
 28 | dat$value <- runif(nrow(dat))
 29 | dat$drug <- factor(dat$drug)
 30 | ```
 31 | 
 32 | ```{r echo=FALSE}
 33 | head(dat)
 34 | ```
 35 | 
 36 | # The pipe
 37 | 
 38 | ```
 39 | command | command | command > output.txt
 40 | ```
 41 | 
 42 | \vspace{2em}
 43 | 
 44 | > "Pipes rank alongside the hierarchical file system and regular expressions as one of the most powerful yet elegant features of Unix-like operating systems."
 45 | 
 46 | <http://www.linfo.org/pipe.html>
 47 | 
 48 | \vspace{2em}
 49 | 
 50 | In R we use `%>%` or `|>` instead of `|` to chain operations.
 51 | 
 52 | # Verb-based operations
 53 | 
 54 | In the R package *dplyr*:
 55 | 
 56 | \small
 57 | * `mutate()` adds new variables that are functions of existing variables.
 58 | * `select()` picks variables based on their names.
 59 | * `filter()` picks cases based on their values.
 60 | * `slice()` picks cases based on their position.
 61 | * `summarize()` reduces multiple values down to a single summary.
 62 | * `arrange()` changes the ordering of the rows.
 63 | * `group_by()` lets you perform any operation by group.
 64 | 
 65 | <https://dplyr.tidyverse.org/>
 66 | \normalsize
 67 | 
 68 | # Summarize after grouping
 69 | 
 70 | A useful paradigm is to *group* data and then *summarize*:
 71 | 
 72 | ```{r eval=FALSE}
 73 | dat %>%
 74 |   filter(!outlier) %>%
 75 |   group_by(drug, genotype) %>%
 76 |   summarize(mu_hat = mean(value))
 77 | ```
 78 | 
 79 | # Summarized output
 80 | 
 81 | ```{r echo=FALSE, message=FALSE}
 82 | dat %>% # hidden chunk that prints the result of the code shown on the previous slide
 83 |   filter(!outlier) %>%
 84 |   group_by(drug, genotype) %>%
 85 |   summarize(mu_hat = mean(value)) # same column name as the displayed code, so the output matches it
 86 | ```
 87 | 
 88 | # Piping directly into plots facilitates data exploration
 89 | 
 90 | ```{r fig.dim=c(5,2)}
 91 | dat %>%
 92 |   mutate(newvalue = value^2) %>%
 93 |   ggplot(aes(genotype, newvalue)) + 
 94 |   geom_boxplot() + 
 95 |   facet_wrap(~drug)
 96 | ```
 97 | 
 98 | # Summary I
 99 | 
100 | * I teach both base R and "tidy"
101 | * Both are wrappers, choose based on 1) efficiency 2) flow
102 | * I use the former for writing software, latter for scripting
103 | * Students know dplyr/ggplot2 already
104 | * Next:
105 |   - tidy for genomic ranges
106 |   - tidy for matrix data (scRNA-seq)
107 | 
108 | # Genomic range data is already tidy
109 | 
110 | ![](narrowpeak.png)
111 | 
112 | # Great packages in Bioconductor to work with ranges
113 | 
114 | * [LOLA](https://code.databio.org/LOLA/) - facilitates testing overlaps, fast, useful databases
115 | * [COCOA](https://code.databio.org/COCOA/) - explore sample variation along genome
116 | * [GenomicDistributions](http://code.databio.org/GenomicDistributions/) - annotate, visualize distribution with respect to other features (genes)
117 | * [regioneR]( https://bioconductor.org/packages/regioneR/) - permutation testing
118 | * [ChIPpeakAnno](https://bioconductor.org/packages/ChIPpeakAnno/) - facilitates downstream analysis
119 | 
120 | Going to talk now about data exploration
121 | 
122 | # Exploring data with tidy syntax
123 | 
124 | \large
125 | Helps avoid intermediate variables, and tucks away control code
126 | 
127 | \vspace{1em}
128 | 
129 | ```{r eval=FALSE}
130 | dat3 <- dat2[dat2$signal > 5]
131 | 
132 | # vs.
133 | 
134 | dat %>%
135 |   filter(signal > 5)
136 | ```
137 | 
138 | \normalsize
139 | 
140 | ```{r echo=FALSE, fig.align="center", out.width="25%"}
141 | knitr::include_graphics("plyranges.png")
142 | ```
143 | 
144 | This is *plyranges* from Stuart Lee, Michael Lawrence and Di Cook
145 | 
146 | # Bringing range data into R
147 | 
148 | ENCODE mouse embryonic fibroblast, H3K4me1:
149 | 
150 | \vspace{1em}
151 | 
152 | ```{r echo=FALSE}
153 | suppressPackageStartupMessages(library(plyranges))
154 | ```
155 | 
156 | ```{r}
157 | library(plyranges)
158 | pks <- read_narrowpeaks("ENCFF231UNV.bed.gz")
159 | ```
160 | 
161 | or equivalently:
162 | 
163 | ```{r eval=FALSE}
164 | pks <- read.csv("file.csv") %>% 
165 |   rename(seqnames = chr) %>%
166 |   as_granges()
167 | ```
168 | 
169 | ```{r echo=FALSE}
170 | #library(GenomeInfoDb)
171 | #si <- Seqinfo(genome="mm10")
172 | #si <- keepStandardChromosomes(si)
173 | #save(si, file="si.rda")
174 | load("si.rda")
175 | seqlevels(pks) <- seqlevels(si)
176 | seqinfo(pks) <- si
177 | ```
178 | 
179 | # Another common paradigm, separating single column
180 | 
181 | ```{r eval=FALSE}
182 | pks <- read.delim("file.tsv") %>%
183 |   tidyr::separate_wider_delim(
184 |     location, 
185 |     delim=stringr::regex(":|-"), # e.g. chr1:123-456; regex() because delim is a fixed string by default
186 |     names=c("seqnames","start","end") # the new column names are given with `names`
187 |   ) %>%
188 |   as_granges()
189 | ```
190 | 
191 | # Ranges are rows, metadata are columns
192 | 
193 | \footnotesize
194 | ```{r}
195 | pks %>% 
196 |   slice(1:3) %>% # first 3 ranges
197 |   select(signalValue) # just one metadata column
198 | ```
199 | \normalsize
200 | 
201 | # Example use of *plyranges*
202 | 
203 | \Large
204 | 
205 | * Suppose query ranges, `tiles` (e.g. ~1 Mb)
206 | * Find all overlaps between `pks` and `tiles`
207 | * Perform computation on the overlaps
208 | * Many other choices in Bioc for enrichment (e.g. LOLA)
209 | 
210 | \normalsize
211 | 
212 | # Example use of *plyranges*
213 | 
214 | ```{r echo=FALSE}
215 | tile0 <- data.frame(seqnames="chr1", 
216 |                     start=51e6 + 1, 
217 |                     width=3e6) %>%
218 |   as_granges()
219 | tiles <- tile0 %>%
220 |   tile_ranges(1e6) %>%
221 |   select(-partition) %>%
222 |   mutate(tile_id = 1:3)
223 | seqinfo(tiles) <- si
224 | ```
225 | 
226 | Created with `tile_ranges` (see also `tileGenome`):
227 | 
228 | \vspace{1em}
229 | 
230 | \footnotesize
231 | ```{r}
232 | tiles
233 | ```
234 | \normalsize
235 | 
236 | # Consider genomic overlaps as a `join`
237 | 
238 | ```{r echo=FALSE, fig.align="center", out.width="50%"}
239 | # https://www.flickr.com/photos/hellothomas/5073821890
240 | knitr::include_graphics("woodjoin.png")
241 | ```
242 | 
243 | * We are joining two sources of information by match
244 | * How would you then pick top scoring peak (`pks`) per `tile`?
245 | * What verbs would be involved?
246 | 
247 | # Consider overlaps as a `join`
248 | 
249 | \footnotesize
250 | ```{r}
251 | pks %>%
252 |   select(score) %>% # just `score` column
253 |   join_overlap_inner(tiles) %>% # overlap -> add cols from tiles
254 |   group_by(tile_id) %>% # group matches by which tile
255 |   slice(which.max(score)) # take the top scoring peak
256 | ```
257 | \normalsize
258 | 
259 | # Counting overlaps
260 | 
261 | * Use "`.`" to specify self within a command
262 | * Add number of overlaps to each entry in `tiles`:
263 | * Can specify `maxgap` and/or `minoverlap`
264 | 
265 | \vspace{1em}
266 | 
267 | \footnotesize
268 | ```{r}
269 | tiles %>% 
270 |   mutate(n_overlaps = count_overlaps(., pks))
271 | ```
272 | \normalsize
273 | 
274 | # More complex cases
275 | 
276 | * For peaks near genes, compute correlation of cell-type-specific accessibility and expression (Wancen Mu) → similar to COCOA
277 | * For regulatory variants falling in open chromatin peaks, visualize their distribution stratified by SNP and peak categories (Jon Rosen)
278 | * For looped and un-looped enhancer-promoter pairs, compare average ATAC and RNA time series, while controlling for genomic distance and contact frequency (Eric Davis)
279 | 
280 | # Nest $\rightarrow$ map $\rightarrow$ unnest
281 | 
282 | ```{r eval=FALSE}
283 | library(purrr)
284 | library(broom)
285 | pks %>%
286 |   join_overlap_inner(tiles) %>%
287 |   as_tibble() %>%
288 |   select(tile_id, signalValue, qValue) %>%
289 |   nest(data = -tile_id) %>%
290 |   mutate(fit = map(data, 
291 |                    ~lm(signalValue ~ qValue, data=.)
292 |                    ),
293 |          stats = map(fit, glance)) %>%
294 |   unnest(stats)
295 | ```
296 | 
297 | # Nest $\rightarrow$ map $\rightarrow$ unnest
298 | 
299 | ```{r echo=FALSE}
300 | library(purrr)
301 | library(broom)
302 | pks %>%
303 |   join_overlap_inner(tiles) %>%
304 |   as_tibble() %>%
305 |   select(tile_id, signalValue, qValue) %>%
306 |   nest(data = -tile_id) %>%
307 |   mutate(fit = map(data, ~lm(signalValue ~ qValue, data=.)),
308 |          stats = map(fit, glance)) %>%
309 |   unnest(stats) %>%
310 |   select(tile_id, data, fit, r.squared)
311 | ```
312 | 
313 | # More *plyranges*-based tutorials online
314 | 
315 | * *plyranges* vignettes (on Bioc and GitHub)
316 | * Enrichment of peaks and genes: "Fluent Genomics" workflow
317 | * Null regions: *nullranges* vignettes (on Bioc and GitHub)
318 | * Other examples, incl. bootstrap: "Tidy Ranges Tutorial"
319 | * `#tidiness_in_bioc` and `#nullranges` Slack channels
320 | 
321 | # Summary: tidy analysis for genomic range data
322 | 
323 | ```{r echo=FALSE, fig.show="hold", fig.align="center", out.width="25%"}
324 | knitr::include_graphics(c("dplyr.png","GenomicRanges.png"))
325 | ```
326 | 
327 | ```{r echo=FALSE, fig.show="hold", fig.align="center", out.width="25%"}
328 | knitr::include_graphics(c("plyranges.png","nullranges.png"))
329 | ```
330 | 
331 | \small
332 | *nullranges* development sponsored by CZI EOSS ![](czi.png){width=50px}
333 | \normalsize
334 | 
335 | # Tidy analysis of matrix data
336 | 
337 | ```{r echo=FALSE, fig.align="center", out.width="50%"}
338 | knitr::include_graphics("tt_roadmap.png")
339 | ```
340 | 
341 | tidy-* from Stefano Mangiola (WEHI) *et al.*
342 | 
343 | # Example use of tidySingleCellExperiment
344 | 
345 | ```{r message=FALSE, echo=FALSE}
346 | library(tidySingleCellExperiment)
347 | sce <- tidySingleCellExperiment::pbmc_small
348 | library(scran)
349 | var_genes <- sce %>%
350 |     modelGeneVar() %>%
351 |     getTopHVGs(prop=0.1)
352 | library(scater) # for next chunk
353 | library(ggplot2) # for next chunk
354 | ```
355 | 
356 | ```{r fig.dim=c(4,3), fig.align="center", out.width="50%"}
357 | sce %>%
358 |   scater::runPCA(ncomp=2, subset_row=var_genes) %>%
359 |   ggplot(aes(PC1, PC2, color=groups)) + 
360 |   geom_point()
361 | ```
362 | 
363 | # Example use of tidySingleCellExperiment
364 | 
365 | ```{r echo=FALSE, message=FALSE, warning=FALSE}
366 | library(ggforce)
367 | colLabels(sce) <- sce %>%
368 |     buildSNNGraph(use.dimred="PCA") %>%
369 |     igraph::cluster_walktrap() %$%
370 |     membership %>%
371 |     as.factor()
372 | ```
373 | 
374 | ```{r fig.dim=c(4,3), fig.align="center", out.width="50%", message=FALSE}
375 | sce %>%
376 |   join_features(c("CCL5","CST3")) %>%
377 |   ggplot(aes(label, .abundance_logcounts)) + 
378 |   geom_violin() +
379 |   geom_sina() +
380 |   facet_wrap(~.feature)
381 | ```
382 | 
383 | # More complex cases
384 | 
385 | * Join extra cell-level data
386 | * Perform nested analyses per cell population
387 | * Create a custom expression signature from subset of genes
388 | * Find genes near ChIP-seq peaks, convert to pseudobulk, plot
389 | 
390 | See [our Bioc2023 workshop](https://tidyomics.github.io/tidyomicsWorkshopBioc2023/articles/tidyGenomicsTranscriptomics.html)
391 | and [tidyseurat](https://stemangiola.github.io/tidyseurat/) / [tidySCE](https://stemangiola.github.io/tidySingleCellExperiment/)
392 | 
393 | # Altogether, "tidyomics"
394 | 
395 | <https://github.com/tidyomics>
396 | 
397 | ```{r echo=FALSE, fig.show="hold", fig.align="center", out.width="30%"}
398 | knitr::include_graphics(c("tidyomics1.png", "tidyomics2.png"))
399 | ```
400 | 
401 | # Reading
402 | 
403 | \small
404 | * Hutchison, WJ, Keyes, TJ, *et al.* The tidyomics ecosystem: Enhancing omic data analyses *bioRxiv* (2023) [10.1101/2023.09.10.557072](https://doi.org/10.1101/2023.09.10.557072)
405 | * Lee, S, Cook, D, Lawrence, M. plyranges: a grammar of genomic data transformation. *Genome Biology* (2019) [10.1186/s13059-018-1597-8](https://doi.org/10.1186/s13059-018-1597-8)
406 | * Lee S, Lawrence M, Love MI. Fluent genomics with plyranges and tximeta. *F1000Research* (2020) [10.12688/f1000research.22259.1](https://doi.org/10.12688/f1000research.22259.1)
407 | 
408 | Tidy analysis for matrix data:
409 | 
410 | * Mangiola, S, Molania, R, Dong, R et al. tidybulk: an R tidy framework for modular transcriptomic data analysis. *Genome Biology* (2021) [10.1186/s13059-020-02233-7](https://doi.org/10.1186/s13059-020-02233-7)
411 | * tidySE, tidySCE, tidyseurat
412 |   [stemangiola.github.io/tidytranscriptomics](https://stemangiola.github.io/tidytranscriptomics)
413 | 
414 | # Extra slides
415 | 
416 | # plyranges pointers
417 | 
418 | * TSS: `anchor_5p() %>% mutate(width=1)`
419 | * Overlaps can specify `*_directed` or `*_within`
420 | * Flatten/break up ranges: `reduce_ranges`, `disjoin_ranges`
421 | * Concatenating ranges: `bind_ranges` with `.id` argument
422 | * Overlaps are handled often with "joins": `join_overlap_*`, 
423 |   `join_nearest`, `join_nearest_downstream`, etc.
424 | * Also `add_nearest_distance`
425 | * Load *plyranges* last to avoid name masking with *AnnotationDbi*
426 |   and *dplyr*
427 | 


--------------------------------------------------------------------------------
/tidy-genomics-talk.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/tidy-genomics-talk.pdf


--------------------------------------------------------------------------------
/tidyomics1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/tidyomics1.png


--------------------------------------------------------------------------------
/tidyomics2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/tidyomics2.png


--------------------------------------------------------------------------------
/tt_roadmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/tt_roadmap.png


--------------------------------------------------------------------------------
/woodjoin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/woodjoin.png


--------------------------------------------------------------------------------