├── .gitignore ├── ENCFF231UNV.bed.gz ├── GenomicRanges.png ├── README.md ├── boot_and_match_examples.R ├── boot_and_match_script.R ├── code_examples.R ├── czi.png ├── data.tsv ├── dplyr.png ├── narrowpeak.png ├── non-tidy.numbers ├── non-tidy.png ├── nullranges.png ├── plyranges.png ├── si.rda ├── tidy-enrichment.pdf ├── tidy-genomics-talk.Rmd ├── tidy-genomics-talk.pdf ├── tidyomics1.png ├── tidyomics2.png ├── tt_roadmap.png └── woodjoin.png /.gitignore: -------------------------------------------------------------------------------- 1 | *.pdf 2 | *.key 3 | tidy-genomics-talk_cache/ 4 | .DS_store 5 | -------------------------------------------------------------------------------- /ENCFF231UNV.bed.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/ENCFF231UNV.bed.gz -------------------------------------------------------------------------------- /GenomicRanges.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/GenomicRanges.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tidy genomics talk 2 | 3 | Two presentations in this repo: 4 | 5 | 1. `tidy-genomics-talk.pdf` with Rmd 6 | 2. 
`tidy-enrichment.pdf` with code in `boot_and_match_examples.R` 7 | 8 | -------------------------------------------------------------------------------- /boot_and_match_examples.R: --------------------------------------------------------------------------------
# Walk-through script for the enrichment talk: plots genes in a chr4 window,
# simulates clustered features, compares shuffling vs. block bootstrapping,
# and finishes with a covariate-matching example.

##########################################
## visualizing genes and other features ##
##########################################

library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(org.Hs.eg.db)
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
g <- genes(txdb)

# add symbols to genes
suppressPackageStartupMessages(library(plyranges))

g <- g %>%
  mutate(symbol = mapIds(org.Hs.eg.db, gene_id,
                         "SYMBOL", "ENTREZID"))

# for visualizing, restrict to a range of chr4
chrom <- "chr4"
rng <- c(98.8e6, 99.8e6) # where we will zoom into, 1 Mb
rng_big <- c(90e6, 110e6) # where features live, 20 Mb

# filtering the genes to this range
r <- data.frame(seqnames=chrom, start=rng[1]+1, end=rng[2]) %>%
  as_granges()

# just look at the genes in this small range
g <- g %>%
  filter_by_overlaps(r) %>%
  sort() %>%
  arrange(strand)

# helper functions used below (plotSomeGenes, makeClusterRanges, shuffle,
# makeSegmentation, makeFocalFeatures, makePool)
source("boot_and_match_script.R")

suppressPackageStartupMessages(library(plotgardener))

plotSomeGenes(chrom, rng, showGuides=FALSE)

# make n features in clumps of ~lambda
seqlens <- seqlengths(g)[chrom]
set.seed(5)
p <- makeClusterRanges(chrom, rng_big, n=300, lambda=5, seqlens)

# define some plotting parameters for plotgardener,
# e.g. a palette for feature 'score':
pal <- colorRampPalette(c("dodgerblue2", "firebrick2"))

# shared genomic location, width & height, x position, fill, etc.
params <- pgParams(
  chrom=chrom, chromstart=rng[1], chromend=rng[2],
  width=5.5, height=1, x=.25,
  fill=colorby("score", palette=pal),
  order="random", baseline=TRUE,
)

# shared parameters for text labels
textparams <- pgParams(x=.1, rot=90, just="left")

# plot the original GRanges, e.g. suppose ATAC-seq peaks
plotRanges(p, params=params, y=2)
plotText("original", params=textparams, y=3)

# uniform shuffling
shuf <- shuffle(p, rng_big)

# plot shuffled ranges
plotRanges(shuf, params=params, y=1)
plotText("shuffled", params=textparams, y=2)

# segmented block bootstrapping
# blocks of 100 kb, not proportional to segment length
library(nullranges)
seg <- makeSegmentation(chrom, rng_big, seqlens)
set.seed(1)
boot <- bootRanges(p, blockLength=1e5, R=1,
                   seg=seg, proportionLength=FALSE)

# plot bootstrapped ranges
plotRanges(boot, params=params, y=0)
plotText("boot", params=textparams, y=1)

# for genome-wide analysis, consider excluding gaps, repeats, etc.
# see https://dozmorovlab.github.io/excluderanges for details

#library(AnnotationHub)
#ah <- AnnotationHub()
#query(ah, "excluderanges")

###########################
## bootstrapping example ##
###########################

# first just counts as statistic

# two ways to count features per gene: count_overlaps() directly ...
g %>%
  mutate(n_overlaps = count_overlaps(., p))

# ... or a left join followed by counting the non-missing feature ids
g %>%
  join_overlap_left(p) %>%
  group_by(symbol) %>%
  summarize(n_overlaps = sum(!is.na(id)))

# working with metadata

g %>%
  join_overlap_left(p) %>%
  group_by(symbol) %>% # per gene symbol
  summarize(sum_score = sum(score))

# simple violin plot

library(tibble)
library(ggplot2)
# inner instead of left: leaves out no-overlap genes
g %>%
  join_overlap_inner(p) %>%
  mutate(type = "original") %>%
  group_by(symbol, type) %>%
  summarize(sum_score = sum(score)) %>%
  as_tibble() %>%
  ggplot(aes(type, sum_score)) +
  geom_violin() +
  geom_point()

# adding more draws from the distribution for simulated features

niter <- 50
sim_list <- replicate(niter, {
  makeClusterRanges(chrom, rng_big, n=300, lambda=5, seqlens)
})
sim_long <- bind_ranges(sim_list, .id="iter")

# NOTE(review): these are fresh simulated draws but are still labelled
# "original" on the x-axis -- confirm this label is intended
g %>%
  join_overlap_inner(sim_long) %>%
  mutate(type = "original") %>%
  group_by(symbol, iter, type) %>%
  summarize(sum_score = sum(score)) %>%
  as_tibble() %>%
  ggplot(aes(type, sum_score)) +
  geom_violin() +
  geom_jitter()

# shuffling and bootstrapping multiple times

shuf_list <- replicate(niter, shuffle(p, rng_big))
shuf_long <- bind_ranges(shuf_list, .id="iter")

boot_long <- bootRanges(p, blockLength=1e5, R=niter,
                        seg=seg, proportionLength=FALSE)

# bind together

lvls <- c("sim","shuffle","boot")
all <- bind_ranges(sim=sim_long, shuffle=shuf_long,
                   boot=boot_long, .id="type") %>%
  mutate(type = factor(type, levels=lvls))

# show table of features per iteration
head(table(all$iter, all$type))

# final plot of distributions:
# multiple draws, shuffling one instance, bootstrapping one instance

g %>%
  join_overlap_inner(all) %>%
  group_by(symbol, iter, type) %>%
  summarize(sum_score = sum(score)) %>%
  as_tibble() %>%
  ggplot(aes(type, sum_score)) +
  geom_violin() +
  geom_jitter(width=.25, alpha=.15)

######################
## matching example ##
######################

# start with gene plot again
plotSomeGenes(chrom, rng, showGuides=FALSE)

# make some features with particular distribution
# 1) near gene TSS, 2) tend to have large 'score' values
set.seed(1)
focal <- makeFocalFeatures(g, chrom, rng)

# 4-color palette for 'score' (blue, green, yellow, red)
pal <- colorRampPalette(c("blue","green","yellow","red"))

# new plot parameters
params <- pgParams(
  chrom=chrom, chromstart=rng[1], chromend=rng[2],
  width=5.5, height=1, x=.25,
  fill=colorby("score", palette=pal, range=c(1,5)),
  order="random", baseline=TRUE,
)

# plot the original 'focal' GRanges
plotRanges(focal, params=params, y=2)
plotText("focal", params=textparams, y=3)

# make a 'pool' of features to select from
pool <- makePool(5000, chrom, rng, seqlens)

# plot the pool (subset)
plotRanges(pool[1:200], params=params, y=1)
plotText("pool", params=textparams, y=2)

# add another feature: distance to nearest TSS
tss <- g %>% anchor_5p() %>% mutate(width=1)

# stack focal and pool with a 'type' label, then add the second covariate:
# log10 of (distance to nearest TSS + 1000)
both <- bind_ranges(focal = focal, pool = pool, .id="type") %>%
  add_nearest_distance(tss) %>%
  mutate(log10dist = log10(distance + 1000))

hist(both$log10dist)

m <-
both %>% { 216 | matchRanges(filter(., type=="focal"), 217 | filter(., type=="pool"), 218 | covar=~score + log10dist, 219 | method="nearest", replace=TRUE) 220 | } 221 | 222 | library(patchwork) 223 | plotCovariate(m, covar="score") + 224 | plotCovariate(m, covar="log10dist") 225 | 226 | # plot the matched set (need to replot the others) 227 | set.seed(1) 228 | plotSomeGenes(chrom, rng, showGuides=FALSE) 229 | plotRanges(focal, params=params, y=2) 230 | plotText("focal", params=textparams, y=3) 231 | plotRanges(pool[1:200], params=params, y=1) 232 | plotText("pool", params=textparams, y=2) 233 | plotRanges(matched(m), params=params, y=0) 234 | plotText("matched", params=textparams, y=1) 235 | -------------------------------------------------------------------------------- /boot_and_match_script.R: --------------------------------------------------------------------------------
# Helper functions for boot_and_match_examples.R.
# They rely on plyranges (%>%, as_granges, mutate, ...) and plotgardener
# being attached by the calling script.

# Start a new 6 x 4 plotgardener page and draw the gene track for
# chrom:rng[1]-rng[2], with a genome coordinate label (in Mb) underneath.
#   chrom      chromosome name, e.g. "chr4"
#   rng        numeric c(start, end) of the window to draw
#   showGuides passed to pageCreate()
plotSomeGenes <- function(chrom, rng, showGuides) {
  pageCreate(width=6, height=4, showGuides=showGuides)
  p <- pgParams(chrom=chrom, chromstart=rng[1], chromend=rng[2], width=5.5)
  cols <- c("dodgerblue","navy")
  gplt <- plotGenes(params=p, x=.25, y=3, height=.75, fill=cols, fontcolor=cols)
  annoGenomeLabel(plot=gplt, x=.25, y=3.75, scale="Mb")
}

# Simulate clustered features on one chromosome.
# n/lambda clusters are drawn; each cluster gets a uniform center in rng,
# max(Poisson(lambda), 1) ranges of width 1e4 whose starts are jittered by
# up to +/- 2e4 around the center, and scores drawn around a per-cluster mean.
#   chrom    chromosome name
#   rng      numeric c(start, end) in which cluster centers are placed
#   n        target number of features (actual number is random)
#   lambda   expected number of features per cluster
#   seqlens  seqlengths to set on the result
# Returns a sorted GRanges with metadata columns 'score' and 'id'.
makeClusterRanges <- function(chrom, rng, n, lambda, seqlens) {
  niter <- n/lambda
  out <- lapply(seq_len(niter), function(i) {
    # at least one range per cluster, even if the Poisson draw is 0
    nranges <- max(rpois(1, lambda), 1)
    pos <- round(runif(1, rng[1], rng[2]))
    # cluster-level mean score; ranges in a cluster have similar scores
    mu <- rnorm(1, 0, 2)
    start <- pos + round(runif(nranges, -2e4, 2e4))
    score <- rnorm(nranges, mu, .5)
    data.frame(seqnames=chrom, start, width=1e4, score)
  })
  gr <- do.call(rbind, out) %>%
    as_granges() %>%
    sort() %>%
    mutate(id = seq_along(.))
  seqlengths(gr) <- seqlens
  gr
}

# Place every range of 'gr' at a new uniform random start within rng,
# keeping its 'score' and 'id'.
#   gr     GRanges with metadata columns 'score' and 'id'
#   rng    numeric c(start, end) in which new starts are drawn
#   width  width of the shuffled ranges
# Returns a GRanges of the same length as 'gr'.
shuffle <- function(gr, rng, width=1e4) {
  new_pos <- round(runif(length(gr), rng[1], rng[2]))
  # specify 'width' rather than end = start + width: the latter yields ranges
  # of width + 1, one bp wider than the features made by makeClusterRanges()
  data.frame(seqnames=seqnames(gr), start=new_pos, width=width,
             score=gr$score, id=gr$id) %>%
    as_granges()
}


# Build a three-segment segmentation of one chromosome:
# [1, rng[1]] and [rng[2]+1, seqlens] get state 1, (rng[1], rng[2]] gets state 2.
#   chrom    chromosome name
#   rng      numeric c(start, end) of the middle (state 2) segment
#   seqlens  length of the chromosome
# Returns a GRanges with a 'state' metadata column.
makeSegmentation <- function(chrom, rng, seqlens) {
  # returned directly (no trailing assignment) so the value prints when the
  # function is called at top level
  data.frame(seqnames=chrom, start=c(1,rng[1]+1,rng[2]+1),
             end=c(rng[1],rng[2],seqlens),
             state=c(1,2,1)) %>%
    as_granges()
}

# Make 'focal' features: three copies of a 1e4-wide range anchored at the
# 5' end of each gene in 'g', each shifted by a uniform offset in
# [-1e4, 1e4] and given a score drawn uniformly from [3, 5].
#   g      GRanges of genes with 'gene_id' and 'symbol' columns
#   chrom  unused; kept so existing calls keep working
#   rng    unused; kept so existing calls keep working
# Returns an unnamed GRanges with a 'score' column.
makeFocalFeatures <- function(g, chrom, rng) {
  tss <- g %>%
    anchor_5p() %>%
    mutate(width = 1e4) %>%
    select(-c(gene_id, symbol))
  bind_ranges(replicate(3, tss)) %>%
    shift(round(runif(length(.), -1e4, 1e4))) %>%
    mutate(score = runif(length(.), 3, 5)) %>%
    unname()
}

# Make a pool of n background features of width 1e4 with uniform random
# starts in rng and scores drawn uniformly from [1, 5].
#   n        number of features
#   chrom    chromosome name
#   rng      numeric c(start, end) in which starts are drawn
#   seqlens  seqlengths to set on the result
# Returns a GRanges with a 'score' column.
makePool <- function(n, chrom, rng, seqlens) {
  gr <- data.frame(seqnames=chrom, start=round(runif(n, rng[1], rng[2])),
                   width=1e4, score = runif(n, 1, 5)) %>%
    as_granges()
  seqlengths(gr) <- seqlens
  gr
}
-------------------------------------------------------------------------------- /code_examples.R: -------------------------------------------------------------------------------- 1 | # code examples in tidy format (plyranges) and base Bioconductor 2 | # Michael Love 3 | # July 12 2023 4 | 5 | ############### 6 | ## example 1 ## 7 | ############### 8 | 9 | # first example is from the plyranges paper Figure 3 10 | # https://genomebiology.biomedcentral.com/articles/10.1186/s13059-018-1597-8 11 | # "an overlap and aggregate operation that returns the same result" 12 | 13 | library(plyranges) 14 | 15 | # create data in R rather than reading in BED files 16 | 17 | gwas <- data.frame(seqnames=1, 18 | start=round(runif(100,0,100)), 19 | width=1, rsID=paste0("rs",1:100)) %>% 20 | as_granges() 21 | 22 | exons <- data.frame(seqnames=1, 23 | start=round(runif(100,0,100)), 24 | width=5, exonID=paste0("e",1:100)) %>% 25 | as_granges() 26 | 27 | # tidy 28 | 29 | res1 <- exons %>% 30 | join_overlap_inner(gwas) %>% 31 | group_by(rsID) %>% 32 | summarise(n = n_distinct(exonID)) 33 | 34 | # base bioc 35 | 36 | hits <- findOverlaps(exons, gwas,
ignore.strand = FALSE)
# for each GWAS SNP, collect the IDs of the exons it overlaps,
# then count the distinct exons per SNP
olap <- splitAsList(exons$exonID[queryHits(hits)], gwas$rsID[subjectHits(hits)])
n <- lengths(unique(olap))
res2 <- DataFrame(rsID = names(n), n = as.integer(n))


# compare the tidy and base Bioconductor results
all.equal(res1, res2)

###############
## example 2 ##
###############

# distance from one set of features (5p) to nearest other set (center)
# group by the type of features and plot histogram

x <- data.frame(seqnames=1,
                start=round(runif(100,0,1e4)),
                width=round(runif(100,5,15))) %>%
  as_granges() %>%
  sort()
x <- x %>%
  mutate(xID = paste0("x",1:100),
         group = paste0("g",rep(1:2,each=50)))

y <- data.frame(seqnames=1,
                start=round(runif(100,0,1e4)),
                width=round(runif(100,5,15))) %>%
  as_granges() %>%
  sort()
y <- y %>%
  mutate(yID=paste0("y",1:100))

library(tibble)
library(ggplot2)

# tidy

x %>%
  anchor_5p() %>%
  mutate(width=1) %>%
  add_nearest_distance(y %>% anchor_center %>% mutate(width=1)) %>%
  as_tibble() %>%
  ggplot(aes(distance, group=group, fill=group)) +
  geom_histogram(position="dodge")

# base bioc

x_5p <- resize(x, width=1)
# shrink each y range from both sides towards its midpoint
# NOTE(review): the even-width branch subtracts a non-integer amount;
# confirm this gives the same center as anchor_center() above
y_mid <- y - ifelse(width(y) %% 2 == 0, width(y)/2-.5, floor(width(y)/2))
hits <- distanceToNearest(x_5p, y_mid)
x$distance[queryHits(hits)] <- mcols(hits)$distance
df <- as.data.frame(mcols(x)[,c("group","distance")])
ggplot(df, aes(distance, group=group, fill=group)) +
  geom_histogram(position="dodge")


###############
## example 3 ##
###############

# find disjoint regions within groups of features, filter to the overlapping pieces

# tidy

x %>%
  join_overlap_inner(range(x) %>%
                       tile_ranges(width=1000) %>%
                       mutate(tile=seq_along(.))) %>%
  group_by(tile) %>%
  disjoin_ranges(total = n()) %>%
  filter(total > 1)

# base bioc

tiles <- tile(range(x), width=1000)[[1]]
tiles$tile <- seq_along(tiles)
hits <- findOverlaps(x, tiles)
# seq_along() rather than 1:length(): safe if there are no tiles, and the
# index is named 'i' so it does not shadow base::t
res <- lapply(seq_along(tiles), function(i) {
  # ranges of x that overlap tile i
  x_sub <- x[queryHits(hits)[subjectHits(hits) == i]]
  d <- disjoin(x_sub)
  # keep only the disjoint pieces covered by more than one range
  cov <- as(coverage(x_sub), "GRanges")
  d[d %over% cov[cov$score > 1]]
})
do.call(c, res)
-------------------------------------------------------------------------------- /czi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/czi.png -------------------------------------------------------------------------------- /data.tsv: -------------------------------------------------------------------------------- 1 | drug genotype rep outlier 2 | 1 a 1 F 3 | 1 a 2 F 4 | 1 a 3 F 5 | 2 a 1 F 6 | 2 a 2 F 7 | 2 a 3 F 8 | 3 a 1 F 9 | 3 a 2 F 10 | 3 a 3 T 11 | 1 b 1 F 12 | 1 b 2 T 13 | 1 b 3 F 14 | 2 b 1 F 15 | 2 b 2 F 16 | 2 b 3 F 17 | 3 b 1 F 18 | 3 b 2 F 19 | 3 b 3 F 20 | -------------------------------------------------------------------------------- /dplyr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/dplyr.png -------------------------------------------------------------------------------- /narrowpeak.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/narrowpeak.png -------------------------------------------------------------------------------- /non-tidy.numbers: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/non-tidy.numbers
-------------------------------------------------------------------------------- /non-tidy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/non-tidy.png -------------------------------------------------------------------------------- /nullranges.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/nullranges.png -------------------------------------------------------------------------------- /plyranges.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/plyranges.png -------------------------------------------------------------------------------- /si.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/si.rda -------------------------------------------------------------------------------- /tidy-enrichment.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/tidy-enrichment.pdf -------------------------------------------------------------------------------- /tidy-genomics-talk.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Tidy Analysis of Genomic Data" 3 | author: | 4 | | Michael Love 5 | | Dept of Genetics & 6 | | Dept of Biostatistics 7 | | UNC-Chapel Hill 8 | date: "UVA ~ October 2023" 9 | output: beamer_presentation 10 | urlcolor: blue 11 | --- 12 | 13 | ```{r setup, echo=FALSE} 14 | 
suppressPackageStartupMessages(library(tidyverse)) 15 | knitr::opts_chunk$set(cache = TRUE) 16 | ``` 17 | 18 | # Data organization depends on purpose 19 | 20 | ![](non-tidy.png) 21 | 22 | # "Tidy data" is organized for programming 23 | 24 | One row per observation, one column per variable 25 | 26 | ```{r include=FALSE} 27 | dat <- read_delim("data.tsv") 28 | dat$value <- runif(nrow(dat)) 29 | dat$drug <- factor(dat$drug) 30 | ``` 31 | 32 | ```{r echo=FALSE} 33 | head(dat) 34 | ``` 35 | 36 | # The pipe 37 | 38 | ``` 39 | command | command | command > output.txt 40 | ``` 41 | 42 | \vspace{2em} 43 | 44 | > "Pipes rank alongside the hierarchical file system and regular expressions as one of the most powerful yet elegant features of Unix-like operating systems." 45 | 46 | 47 | 48 | \vspace{2em} 49 | 50 | In R we use `%>%` or `|>` instead of `|` to chain operations. 51 | 52 | # Verb-based operations 53 | 54 | In the R package *dplyr*: 55 | 56 | \small 57 | * `mutate()` adds new variables that are functions of existing variables. 58 | * `select()` picks variables based on their names. 59 | * `filter()` picks cases based on their values. 60 | * `slice()` picks cases based on their position. 61 | * `summarize()` reduces multiple values down to a single summary. 62 | * `arrange()` changes the ordering of the rows. 63 | * `group_by()` perform any operation by group. 



\normalsize

# Summarize after grouping

A useful paradigm is to *group* data and then *summarize*:

```{r eval=FALSE}
dat %>%
  filter(!outlier) %>%
  group_by(drug, genotype) %>%
  summarize(mu_hat = mean(value))
```

# Summarized output

```{r echo=FALSE, message=FALSE}
# hidden chunk producing the output for the code shown on the previous
# slide; the summary column is named mu_hat so the output matches that code
dat %>%
  filter(!outlier) %>%
  group_by(drug, genotype) %>%
  summarize(mu_hat = mean(value))
```

# Piping directly into plots facilitates data exploration

```{r fig.dim=c(5,2)}
dat %>%
  mutate(newvalue = value^2) %>%
  ggplot(aes(genotype, newvalue)) +
  geom_boxplot() +
  facet_wrap(~drug)
```

# Summary I

* I teach both base R and "tidy"
* Both are wrappers, choose based on 1) efficiency 2) flow
* I use the former for writing software, latter for scripting
* Students know dplyr/ggplot2 already
* Next:
  - tidy for genomic ranges
  - tidy for matrix data (scRNA-seq)

# Genomic range data is already tidy

![](narrowpeak.png)

# Great packages in Bioconductor to work with ranges

* [LOLA](https://code.databio.org/LOLA/) - facilitates testing overlaps, fast, useful databases
* [COCOA](https://code.databio.org/COCOA/) - explore sample variation along genome
* [GenomicDistributions](http://code.databio.org/GenomicDistributions/) - annotate, visualize distribution with respect to other features (genes)
* [regioneR]( https://bioconductor.org/packages/regioneR/) - permutation testing
* [ChIPpeakAnno](https://bioconductor.org/packages/ChIPpeakAnno/) - facilitates downstream analysis

Going to talk now about data exploration

# Exploring data with tidy syntax

\large
Helps avoid intermediate variables, and tucks away control code

\vspace{1em}

```{r eval=FALSE}
dat3 <- dat2[dat2$signal > 5]

# vs.

dat %>%
  filter(signal > 5)
```

\normalsize

```{r echo=FALSE, fig.align="center", out.width="25%"}
knitr::include_graphics("plyranges.png")
```

This is *plyranges* from Stuart Lee, Michael Lawrence and Di Cook

# Bringing range data into R

ENCODE mouse embryonic fibroblast, H3K4me1:

\vspace{1em}

```{r echo=FALSE}
suppressPackageStartupMessages(library(plyranges))
```

```{r}
library(plyranges)
pks <- read_narrowpeaks("ENCFF231UNV.bed.gz")
```

or equivalently:

```{r eval=FALSE}
pks <- read.csv("file.csv") %>%
  rename(seqnames = chr) %>%
  as_granges()
```

```{r echo=FALSE}
#library(GenomeInfoDb)
#si <- Seqinfo(genome="mm10")
#si <- keepStandardChromosomes(si)
#save(si, file="si.rda")
load("si.rda")
seqlevels(pks) <- seqlevels(si)
seqinfo(pks) <- si
```

# Another common paradigm, separating single column

```{r eval=FALSE}
pks <- read.delim("file.tsv") %>%
  tidyr::separate_wider_delim(
    location,
    delim=stringr::regex(":|-"), # e.g. chr1:123-456
    names=c("seqnames","start","end")
  ) %>%
  mutate(start=as.integer(start), end=as.integer(end)) %>% # pieces are text
  as_granges()
```

# Ranges are rows, metadata are columns

\footnotesize
```{r}
pks %>%
  slice(1:3) %>% # first 3 ranges
  select(signalValue) # just one metadata column
```
\normalsize

# Example use of *plyranges*

\Large

* Suppose query ranges, `tiles` (e.g. ~1 Mb)
* Find all overlaps between `pks` and `tiles`
* Perform computation on the overlaps
* Many other choices in Bioc for enrichment (e.g.
LOLA) 209 | 210 | \normalsize 211 | 212 | # Example use of *plyranges* 213 | 214 | ```{r echo=FALSE} 215 | tile0 <- data.frame(seqnames="chr1", 216 | start=51e6 + 1, 217 | width=3e6) %>% 218 | as_granges() 219 | tiles <- tile0 %>% 220 | tile_ranges(1e6) %>% 221 | select(-partition) %>% 222 | mutate(tile_id = 1:3) 223 | seqinfo(tiles) <- si 224 | ``` 225 | 226 | Created with `tile_ranges` (see also `tileGenome`): 227 | 228 | \vspace{1em} 229 | 230 | \footnotesize 231 | ```{r} 232 | tiles 233 | ``` 234 | \normalsize 235 | 236 | # Consider genomic overlaps as a `join` 237 | 238 | ```{r echo=FALSE, fig.align="center", out.width="50%"} 239 | # https://www.flickr.com/photos/hellothomas/5073821890 240 | knitr::include_graphics("woodjoin.png") 241 | ``` 242 | 243 | * We are joining two sources of information by match 244 | * How would you then pick top scoring peak (`pks`) per `tile`? 245 | * What verbs would be involved? 246 | 247 | # Consider overlaps as a `join` 248 | 249 | \footnotesize 250 | ```{r} 251 | pks %>% 252 | select(score) %>% # just `score` column 253 | join_overlap_inner(tiles) %>% # overlap -> add cols from tiles 254 | group_by(tile_id) %>% # group matches by which tile 255 | slice(which.max(score)) # take the top scoring peak 256 | ``` 257 | \normalsize 258 | 259 | # Counting overlaps 260 | 261 | * Use "`.`" to specify self within a command 262 | * Add number of overlaps to each entry in `tiles`: 263 | * Can specify `maxgap` and/or `minoverlap` 264 | 265 | \vspace{1em} 266 | 267 | \footnotesize 268 | ```{r} 269 | tiles %>% 270 | mutate(n_overlaps = count_overlaps(., pks)) 271 | ``` 272 | \normalsize 273 | 274 | # More complex cases 275 | 276 | * For peaks near genes, compute correlation of cell-type-specific accessibility and expression (Wancen Mu) → similar to COCOA 277 | * For regulatory variants falling in open chromatin peaks, visualize their distribution stratified by SNP and peak categories (Jon Rosen) 278 | * For looped and un-looped 
enhancer-promoter pairs, compare average ATAC and RNA time series, while controlling for genomic distance and contact frequency (Eric Davis) 279 | 280 | # Nest $\rightarrow$ map $\rightarrow$ unnest 281 | 282 | ```{r eval=FALSE} 283 | library(purrr) 284 | library(broom) 285 | pks %>% 286 | join_overlap_inner(tiles) %>% 287 | as_tibble() %>% 288 | select(tile_id, signalValue, qValue) %>% 289 | nest(data = -tile_id) %>% 290 | mutate(fit = map(data, 291 | ~lm(signalValue ~ qValue, data=.) 292 | ), 293 | stats = map(fit, glance)) %>% 294 | unnest(stats) 295 | ``` 296 | 297 | # Nest $\rightarrow$ map $\rightarrow$ unnest 298 | 299 | ```{r echo=FALSE} 300 | library(purrr) 301 | library(broom) 302 | pks %>% 303 | join_overlap_inner(tiles) %>% 304 | as_tibble() %>% 305 | select(tile_id, signalValue, qValue) %>% 306 | nest(data = -tile_id) %>% 307 | mutate(fit = map(data, ~lm(signalValue ~ qValue, data=.)), 308 | stats = map(fit, glance)) %>% 309 | unnest(stats) %>% 310 | select(tile_id, data, fit, r.squared) 311 | ``` 312 | 313 | # More *plyranges*-based tutorials online 314 | 315 | * *plyranges* vignettes (on Bioc and GitHub) 316 | * Enrichment of peaks and genes: "Fluent Genomics" workflow 317 | * Null regions: *nullranges* vignettes (on Bioc and GitHub) 318 | * Other examples, incl. 
bootstrap: "Tidy Ranges Tutorial" 319 | * `#tidiness_in_bioc` and `#nullranges` Slack channels 320 | 321 | # Summary: tidy analysis for genomic range data 322 | 323 | ```{r echo=FALSE, fig.show="hold", fig.align="center", out.width="25%"} 324 | knitr::include_graphics(c("dplyr.png","GenomicRanges.png")) 325 | ``` 326 | 327 | ```{r echo=FALSE, fig.show="hold", fig.align="center", out.width="25%"} 328 | knitr::include_graphics(c("plyranges.png","nullranges.png")) 329 | ``` 330 | 331 | \small 332 | *nullranges* development sponsored by CZI EOSS ![](czi.png){width=50px} 333 | \normalsize 334 | 335 | # Tidy analysis of matrix data 336 | 337 | ```{r echo=FALSE, fig.align="center", out.width="50%"} 338 | knitr::include_graphics("tt_roadmap.png") 339 | ``` 340 | 341 | tidy-* from Stefano Mangiola (WEHI) *et al.* 342 | 343 | # Example use of tidySingleCellExperiment 344 | 345 | ```{r message=FALSE, echo=FALSE} 346 | library(tidySingleCellExperiment) 347 | sce <- tidySingleCellExperiment::pbmc_small 348 | library(scran) 349 | var_genes <- sce %>% 350 | modelGeneVar() %>% 351 | getTopHVGs(prop=0.1) 352 | library(scater) # for next chunk 353 | library(ggplot2) # for next chunk 354 | ``` 355 | 356 | ```{r fig.dim=c(4,3), fig.align="center", out.width="50%"} 357 | sce %>% 358 | scater::runPCA(ncomp=2, subset_row=var_genes) %>% 359 | ggplot(aes(PC1, PC2, color=groups)) + 360 | geom_point() 361 | ``` 362 | 363 | # Example use of tidySingleCellExperiment 364 | 365 | ```{r echo=FALSE, message=FALSE, warning=FALSE} 366 | library(ggforce) 367 | colLabels(sce) <- sce %>% 368 | buildSNNGraph(use.dimred="PCA") %>% 369 | igraph::cluster_walktrap() %$% 370 | membership %>% 371 | as.factor() 372 | ``` 373 | 374 | ```{r fig.dim=c(4,3), fig.align="center", out.width="50%", message=FALSE} 375 | sce %>% 376 | join_features(c("CCL5","CST3")) %>% 377 | ggplot(aes(label, .abundance_logcounts)) + 378 | geom_violin() + 379 | geom_sina() + 380 | facet_wrap(~.feature) 381 | ``` 382 | 383 | # More 
complex cases 384 | 385 | * Join extra cell-level data 386 | * Perform nested analyses per cell population 387 | * Create a custom expression signature from subset of genes 388 | * Find genes near ChIP-seq peaks, convert to pseudobulk, plot 389 | 390 | See [our Bioc2023 workshop](https://tidyomics.github.io/tidyomicsWorkshopBioc2023/articles/tidyGenomicsTranscriptomics.html) 391 | and [tidyseurat](https://stemangiola.github.io/tidyseurat/) / [tidySCE](https://stemangiola.github.io/tidySingleCellExperiment/) 392 | 393 | # Altogether, "tidyomics" 394 | 395 | 396 | 397 | ```{r echo=FALSE, fig.show="hold", fig.align="center", out.width="30%"} 398 | knitr::include_graphics(c("tidyomics1.png", "tidyomics2.png")) 399 | ``` 400 | 401 | # Reading 402 | 403 | \small 404 | * Hutchison, WJ, Keyes, TJ, *et al.* The tidyomics ecosystem: Enhancing omic data analyses *bioRxiv* (2023) [10.1101/2023.09.10.557072](https://doi.org/10.1101/2023.09.10.557072) 405 | * Lee, S, Cook, D, Lawrence, M. plyranges: a grammar of genomic data transformation. *Genome Biology* (2019) [10.1186/s13059-018-1597-8](https://doi.org/10.1186/s13059-018-1597-8) 406 | * Lee S, Lawrence M, Love MI. Fluent genomics with plyranges and tximeta. *F1000Research* (2020) [10.12688/f1000research.22259.1](https://doi.org/10.12688/f1000research.22259.1) 407 | 408 | Tidy analysis for matrix data: 409 | 410 | * Mangiola, S, Molania, R, Dong, R et al. tidybulk: an R tidy framework for modular transcriptomic data analysis. 
*Genome Biology* (2021) [10.1186/s13059-020-02233-7](https://doi.org/10.1186/s13059-020-02233-7) 411 | * tidySE, tidySCE, tidyseurat 412 | [stemangiola.github.io/tidytranscriptomics](https://stemangiola.github.io/tidytranscriptomics) 413 | 414 | # Extra slides 415 | 416 | # plyranges pointers 417 | 418 | * TSS: `anchor_5p() %>% mutate(width=1)` 419 | * Overlaps can specify `*_directed` or `*_within` 420 | * Flatten/break up ranges: `reduce_ranges`, `disjoin_ranges` 421 | * Concatenating ranges: `bind_ranges` with `.id` argument 422 | * Overlaps are handled often with "joins": `join_overlap_*`, 423 | `join_nearest`, `join_nearest_downstream`, etc. 424 | * Also `add_nearest_distance` 425 | * Load *plyranges* last to avoid name masking with *AnnotationDbi* 426 | and *dplyr* 427 | -------------------------------------------------------------------------------- /tidy-genomics-talk.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/tidy-genomics-talk.pdf -------------------------------------------------------------------------------- /tidyomics1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/tidyomics1.png -------------------------------------------------------------------------------- /tidyomics2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/tidyomics2.png -------------------------------------------------------------------------------- /tt_roadmap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/tt_roadmap.png
-------------------------------------------------------------------------------- /woodjoin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tidyomics/tidy-genomics-talk/9b44787ad5a8baa9d96f1ce3612438399a8f4efe/woodjoin.png --------------------------------------------------------------------------------