├── .gitignore ├── README.md ├── cca ├── index.Rmd └── index.html ├── clust ├── index.Rmd ├── index.html ├── m15P2.Rmd └── m15P2.html ├── cor_bulk ├── expr │ ├── index.Rmd │ └── index.html └── met │ ├── index.Rmd │ ├── index.html │ └── src.R ├── cov ├── m14 │ ├── index.Rmd │ └── index.html ├── m14_m15P3 │ ├── index.Rmd │ └── index.html └── m15P3 │ ├── index.Rmd │ └── index.html ├── data ├── expr │ └── data_proc │ │ ├── index.Rmd │ │ └── index.html ├── join │ ├── README.html │ ├── README.md │ ├── README.rmd │ └── data.rds └── met │ ├── join │ ├── index.Rmd │ └── index.html │ └── prepro │ ├── H3K27ac │ ├── index.Rmd │ └── index.html │ ├── H3K27me3 │ ├── index.Rmd │ └── index.html │ ├── H3K4me1 │ ├── index.Rmd │ └── index.html │ ├── H3K4me1_Tet1 │ ├── index.Rmd │ └── index.html │ ├── IAP │ ├── index.Rmd │ └── index.html │ ├── LMR │ ├── index.Rmd │ └── index.html │ ├── Makefile │ ├── Tet2 │ ├── index.Rmd │ └── index.html │ ├── Wu_Tet1 │ ├── index.Rmd │ └── index.html │ ├── active_enhancer │ ├── index.Rmd │ └── index.html │ ├── cgi │ ├── index.Rmd │ └── index.html │ ├── exon │ ├── index.Rmd │ └── index.html │ ├── gene_body │ ├── index.Rmd │ └── index.html │ ├── intergenic │ ├── index.Rmd │ └── index.html │ ├── intron │ ├── index.Rmd │ └── index.html │ ├── p300 │ ├── index.Rmd │ └── index.html │ ├── prom │ ├── index.Rmd │ └── index.html │ ├── prom_cgi │ ├── index.Rmd │ └── index.html │ ├── prom_non_cgi │ ├── index.Rmd │ └── index.html │ └── src │ └── index.Rmd ├── gene ├── index.Rmd └── index.html ├── gene_mean ├── index.Rmd └── index.html ├── gene_robust ├── index.Rmd └── index.html ├── heat ├── index.Rmd ├── index.html ├── index2.Rmd ├── index2.html ├── met.html └── viz.R ├── index.Rmd ├── index.html ├── lib └── utils.R ├── qc ├── data.xlsx ├── index.Rmd └── index.html ├── sample ├── index.Rmd └── index.html ├── var ├── m14_m15P3 │ ├── index.Rmd │ └── index.html ├── m15P3 │ ├── index.Rmd │ └── index.html └── met14 │ ├── index.Rmd │ └── index.html └── zoom ├── index.Rmd ├── index.html └── viz.R /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | .*.swp 3 | /tmp/ 4 | *.DS_Store 5 | 6 | # R 7 | .Rhistory 8 | .RData 9 | .Rapp.history 10 | .Rproj.user 11 | *.Rproj 12 | *.rds 13 | 14 | # Python 15 | *.py[cod] 16 | *.so 17 | *.cfg 18 | *.orig 19 | *.log 20 | *.pot 21 | __pycache__/* 22 | .cache/* 23 | *_cache/ 24 | 25 | index_* 26 | *.csv 27 | *.png 28 | *.pptx 29 | *.ipynb 30 | *.jpg 31 | 32 | /zoom/w3*/ 33 | rdata 34 | fig_* 35 | *.zip 36 | *.pdf 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | scM&T-Seq 2 | ========= 3 | 4 | Source code of the manuscript ***Parallel single-cell sequencing links 5 | transcriptional and epigenetic heterogeneity*** ([Nature Methods](http://www.nature.com/nmeth/journal/v13/n3/full/nmeth.3728.html)). 6 | 7 | Abstract 8 | -------- 9 | We report scM&T–seq, a method for parallel single–cell genome–wide methylome and 10 | transcriptome sequencing, allowing discovery of associations between 11 | transcriptional and epigenetic variation. Profiling of 61 mouse embryonic stem 12 | cells confirmed known links between DNA methylation and transcription. Notably, 13 | the method reveals novel associations between heterogeneous methylation of 14 | distal regulatory elements and transcriptional heterogeneity of key pluripotency 15 | genes. 16 | 17 | Content 18 | ------- 19 | * `/cca/`: Canonical Correlation Analysis 20 | * `/clust/`: Clustering scM&T-Seq and scBS-Seq cells 21 | * `/cov/`: Coverage analysis 22 | * `/cor_bulk/`: Correlation scM&T-Seq methylation rates with bulk methylation rate 23 | * `/data/`: Data directory 24 | * `index.Rmd`: Table of content 25 | * `/gene/`: Gene-specific correlation analysis 26 | * `/gene_mean/`: Correlating mean methylation with gene expression 27 | * `/gene_robust/`: Robustness analysis gene-specific correlation 28 | * `/heat/`: Visualizing methylation and expression heatmap 29 | * `/lib/`: Library functions 30 | * `/sample/`: Sample-specific correlation analysis 31 | * `/qc/`: Quality control DNA methylation 32 | * `/var/`: Comparison methylation variability in context 33 | * `/zoom/`: Visualizing Esrrb gene 34 | 35 | `data/join/data.rds` contains the pre-processed and joined methylation and 36 | expression data, which were used for the correlation analysis reported in the 37 | manuscript. The raw data and intermediate output files can be downloaded 38 | from [GEO](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE74535). 39 | 40 | Contact 41 | ------- 42 | * Christof Angermueller 43 | * cangermueller@ebi.ac.uk 44 | * https://cangermueller.com 45 | -------------------------------------------------------------------------------- /cca/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Canonical Correlation Analysis 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(ggplot2) 15 | library(dplyr) 16 | library(tidyr) 17 | library(RColorBrewer) 18 | library(gridExtra) 19 | library(corrplot) 20 | library(gplots) 21 | library(stringr) 22 | library(xtable) 23 | source('../lib/utils.R') 24 | ``` 25 | 26 | ```{r} 27 | options(xtable.type='html') 28 | ``` 29 | 30 | ```{r} 31 | opts <- list() 32 | opts$expr$file <- '../data/expr/data_proc/data_expr.rds' 33 | opts$met$file <- '../data/met/join/data_met.rds' 34 | opts$pca_on <- 'gene_body' 35 | 36 | opts$clust <- list() 37 | opts$clust$c1 <- c('H03', 'C05', 'C07', 'B09', 'F07', 'C02', 'D05', 'E09', 'F01', 'B05', 'H10', 'E01', 'B10') 38 | opts$clust$c2 <- c('F06', 'D07', 'G09', 'G03') 39 | opts$clust_colors <- c('default'='#fdc086', 'c1'='#7fc97f', 'c2'='brown1') 40 | 41 | samples_colors <- function(x) { 42 | h <- rep(opts$clust_colors['default'], length(x)) 43 | for (n in names(opts$clust)) { 44 | h[x %in% opts$clust[[n]]] <- opts$clust_colors[n] 45 | } 46 | names(h) <- x 47 | return (h) 48 | } 49 | ``` 50 | 51 | ```{r} 52 | read_samples <- function(filename) { 53 | h <- read.table(filename, sep='\t', head=T) %>% tbl_df 54 | return (h) 55 | } 56 | 57 | format_sample <- function(s) { 58 | l <- str_split(s, '_') 59 | l <- sapply(l, function(x) x[length(x)]) 60 | return (l) 61 | } 62 | 63 | dat <- list() 64 | dat$expr <- readRDS(opts$expr$file) %>% mutate(sample=factor(sample, label=format_sample(levels(sample)))) 65 | dat$met <- readRDS(opts$met$file) %>% 66 | mutate(sample=factor(sample, label=format_sample(levels(sample)))) %>% 67 | filter(name == opts$pca_on) %>% droplevels 68 | ``` 69 | 70 | ```{r} 71 | plot_pca_vec <- function(pc_vec, x=1, y=2) { 72 | t <- data.frame(sample=factor(rownames(pc_vec)), 73 | pcx=pc_vec[,x], pcy=pc_vec[,y]) 74 | cols <- samples_colors(as.vector(t$sample)) 75 | p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes(color=sample), size=2) + 76 | scale_color_manual(values=cols) + 77 | geom_text(aes(label=sample), vjust=-.4, hjust= .3, size=3) + 78 | xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) + 79 | guides(color=F) + theme_pub() 80 | return (p) 81 | } 82 | 83 | plot_pca_val <- function(pc_val) { 84 | t <- data.frame(pc=1:length(pc_val), val=pc_val) 85 | p <- ggplot(t, aes(x=pc, y=val)) + 86 | geom_bar(stat='identity', fill='salmon', color='black') + 87 | xlab('principle component') + 88 | ylab('% variance explained') + theme_pub() 89 | return (p) 90 | } 91 | 92 | plot_pca_heat <- function(d, title='PC') { 93 | colors <- rev(brewer.pal(9, 'Spectral')) 94 | colors <- colorRampPalette(colors)(50) 95 | 96 | p <- heatmap.2(d, density.info='none', trace='none', 97 | col=colors, Rowv=F, Colv=T, keysize=1.0, dendro='column', 98 | lwid=c(2, 5), key.title='', srtCol=45, key.xlab=title) 99 | return (p) 100 | } 101 | ``` 102 | 103 | ## PCA expression 104 | 105 | ```{r} 106 | e <- dat$expr %>% select(id_, sample, expr) %>% spread(sample, expr) 107 | e <- e %>% select(-id_) %>% as.matrix 108 | pe <- pca(e) 109 | ``` 110 | 111 | ```{r fig.width=6, fig.height=6} 112 | p <- plot_pca_vec(pe$vec) 113 | p 114 | ``` 115 | 116 | ```{r eval=F} 117 | ggsave(p, file='fig_pca_expr.pdf', width=7, height=6) 118 | ``` 119 | 120 | ```{r fig.width=6, fig.height=4} 121 | p <- plot_pca_val(pe$val) 122 | p 123 | ``` 124 | 125 | ```{r eval=F} 126 | ggsave(p, file='fig_pca_expr_var.pdf', width=7, height=6) 127 | ``` 128 | 129 | ## PCA methylation 130 | 131 | ```{r} 132 | m <- dat$met %>% select(id_, sample, met) %>% 133 | spread(sample, met) %>% select(-id_) %>% as.matrix 134 | mi <- m %>% impute 135 | pm <- pca(mi) 136 | ``` 137 | 138 | ```{r fig.width=6, fig.height=6} 139 | p <- plot_pca_vec(pm$vec) 140 | p 141 | ``` 142 | 143 | ```{r eval=F} 144 | ggsave(p, file='fig_pca_expr.pdf', width=7, height=6) 145 | ``` 146 | 147 | ```{r fig.width=6, fig.height=4} 148 | p <- plot_pca_val(pm$val) 149 | p 150 | ``` 151 | 152 | ```{r eval=F} 153 | ggsave(p, file='fig_pca_met_var.pdf', width=7, height=6) 154 | ``` 155 | 156 | ## Comparison PCAs 157 | 158 | ```{r fig.width=10, fig.height=5} 159 | grid.arrange(plot_pca_vec(pe$vec), plot_pca_vec(pm$vec), ncol=2) 160 | ``` 161 | 162 | ```{r fig.width=10, fig.height=5} 163 | grid.arrange(plot_pca_val(pe$val), plot_pca_val(pm$val), ncol=2) 164 | ``` 165 | 166 | ```{r eval=F} 167 | write.csv(pe$vec, 'S8a.csv') 168 | write.csv(pm$vec, 'S8b.csv') 169 | ``` 170 | 171 | ## Principle components 172 | 173 | ```{r fig.width=10, fig.height=7} 174 | pce <- t(pe$vec) 175 | pcm <- t(pm$vec) 176 | rownames(pce) <- paste('epc', 1:ncol(pce)) 177 | rownames(pcm) <- paste('mpc', 1:ncol(pcm)) 178 | h <- plot_pca_heat(pce, title='Expression PC') 179 | h <- plot_pca_heat(pcm, title='Methylation PC') 180 | ``` 181 | 182 | #### Correlation Principle Components 183 | 184 | ```{r fig.width=10, fig.height=10} 185 | h <- cor(t(pce), t(pcm)) 186 | corrplot(h, method='color') 187 | ``` 188 | 189 | ```{r eval=F} 190 | pdf(file='cor_pc.pdf', width=10, height=10) 191 | corrplot(h, method='color') 192 | dev.off() 193 | ``` 194 | 195 | ```{r} 196 | de <- pce %>% as.data.frame %>% mutate(pc=1:n()) %>% 197 | gather(sample, value, -pc) %>% tbl_df 198 | dm <- pcm %>% as.data.frame %>% mutate(pc=1:n()) %>% 199 | gather(sample, value, -pc) %>% tbl_df 200 | pcs <- de %>% inner_join(dm, by='sample') %>% 201 | rename(pce=pc.x, expr=value.x, pcm=pc.y, met=value.y) 202 | ``` 203 | 204 | ```{r fig.height=10} 205 | n <- 5 206 | d <- pcs %>% filter(pce <= n, pcm <= 5, pce > 1, pcm > 1) 207 | cols <- samples_colors(as.vector(d$sample)) 208 | p <- ggplot(d, aes(x=met, y=expr)) + 209 | stat_smooth(method=lm, color='black') + 210 | geom_point(aes(color=sample), size=1.5) + 211 | scale_color_manual(values=cols) + 212 | xlab('Methylation PC') + ylab('Expression PC') + 213 | facet_grid(pce~pcm) + 214 | guides(color=F) 215 | print(p) 216 | ``` 217 | 218 | ```{r eval=F} 219 | ggsave('cor_pc_scatter.pdf', p, width=10, height=10) 220 | ``` 221 | 222 | ```{r eval=F} 223 | write.csv(p$data, 'S9.csv') 224 | ``` 225 | 226 | ```{r} 227 | plot_cor_pc <- function(pcm_, pce_) { 228 | d <- pcs %>% filter(pcm == pcm_, pce == pce_) 229 | cols <- samples_colors(as.vector(d$sample)) 230 | p <- ggplot(d, aes(x=met, y=expr)) + 231 | stat_smooth(method=lm, color='black') + 232 | geom_point(aes(color=sample), size=2) + 233 | scale_color_manual(values=cols) + 234 | xlab(sprintf('Methylation PC %d', pcm_)) + 235 | ylab(sprintf('Expression PC %d', pce_)) + 236 | guides(color=F) + theme_pub() 237 | return (p) 238 | } 239 | ``` 240 | 241 | ```{r} 242 | r <- pcs %>% group_by(pcm, pce) %>% summarise(r=cor(expr, met)) %>% ungroup %>% 243 | arrange(desc(abs(r))) %>% head(10) 244 | ``` 245 | 246 | ```{r results='asis'} 247 | xtable(r, digits=c(0, 0, 0, 3)) 248 | ``` 249 | 250 | ```{r fig.width=6, fig.height=6} 251 | for (i in 1:nrow(r)) { 252 | print(plot_cor_pc(r[i,]$pcm, r[i,]$pce)) 253 | } 254 | ``` 255 | 256 | 257 | 258 | 259 | 260 | 261 | ```{r} 262 | opts_chunk$set(eval=F) 263 | ``` 264 | 265 | 266 | ```{r cache=T} 267 | X <- t(ev) 268 | Y <- t(mc) 269 | rc <- rcc(X, Y, 0.1, 0.5) 270 | ``` 271 | 272 | ```{r} 273 | xs <- rc$scores$xscores 274 | ys <- rc$scores$yscores 275 | d <- data.frame(sample=rownames(xs), x=xs[,1], y=ys[,2]) 276 | d <- d %>% inner_join(dat$samples, by='sample') 277 | d$sample_short <- sub('^CSC\\d+_', '', d$sample) 278 | 279 | p1 <- ggplot(d, aes(x=x, y=y)) + geom_point(aes(color=method)) + 280 | geom_text(aes(label=sample_short), size=2.5, just=0.5, vjust=-0.2) + 281 | xlab('Dimension 1') + ylab('Dimension 2') + 282 | theme(legend.position='bottom') 283 | 284 | d <- data.frame(x=1:22, y=rc$cor[1:22]) 285 | p2 <- ggplot(d, aes(x=x, y=y)) + geom_bar(stat='identity', fill='salmon') + 286 | xlab('CC 2') + ylab('Correlation') 287 | 288 | grid.arrange(p1, p2, ncol=2) 289 | ``` 290 | 291 | ```{r} 292 | plt.cc(rc) 293 | ``` 294 | -------------------------------------------------------------------------------- /cor_bulk/expr/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Correlation expression scM&T and Ficz 3 | date: 151022 4 | output: 5 | html_document: 6 | toc: yes 7 | --- 8 | 9 | ```{r, include=F} 10 | library(knitr) 11 | opts_chunk$set(echo=F, warning=F, message=F) 12 | ``` 13 | 14 | ```{r, include=F} 15 | library(ggplot2) 16 | library(dplyr) 17 | library(tidyr) 18 | ``` 19 | 20 | ```{r} 21 | opts <- list() 22 | opts$file1 <- 'data/ficz/counts.csv' 23 | opts$file2 <- 'data/m15P3/bsr3.csv' 24 | ``` 25 | 26 | ```{r} 27 | dat <- list() 28 | ``` 29 | 30 | ```{r} 31 | d <- read.table(opts$file1, head=T) %>% tbl_df 32 | names(d) <- c('ens_id', 'bulk') 33 | dat$e1 <- d 34 | ``` 35 | 36 | ```{r} 37 | d <- read.table(opts$file2, head=T) %>% tbl_df 38 | d <- d %>% gather(sample, expr, -ens_id) 39 | d <- d %>% group_by(ens_id) %>% summarise(bulk=mean(expr)) 40 | dat$e2 <- d 41 | ``` 42 | 43 | ```{r} 44 | dat$e <- dat$e1 %>% inner_join(dat$e2, by='ens_id') 45 | ``` 46 | 47 | ```{r} 48 | ggplot(dat$e, aes(x=bulk.x, y=bulk.y)) + geom_point(size=0.5) + 49 | xlim(0, 1000) + ylim(0, 1000) 50 | ``` 51 | 52 | ```{r} 53 | cor(dat$e$bulk.x, dat$e$bulk.y) 54 | ``` 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /cor_bulk/met/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Correlation methylation scM&T and Ficz 3 | date: 151022 4 | output: 5 | html_document: 6 | toc: yes 7 | --- 8 | 9 | ```{r, include=F} 10 | library(knitr) 11 | opts_chunk$set(echo=F, warning=F, message=F) 12 | ``` 13 | 14 | ```{r, include=F} 15 | library(ggplot2) 16 | library(dplyr) 17 | library(tidyr) 18 | source('./src.R') 19 | source('./../../lib/utils.R') 20 | ``` 21 | 22 | ```{r} 23 | opts <- list() 24 | opts$met1_file <- 'data/ficz/rates.txt' 25 | opts$met2_file <- 'data/m15P3/rates.txt' 26 | opts$met3_file <- 'data/m14/rates.txt' 27 | opts$met2_samples_file <- './data/m15P3/samples/samples.csv' 28 | opts$nrow <- 10000000 29 | opts$cache <- T 30 | ``` 31 | 32 | ```{r} 33 | dat <- list() 34 | ``` 35 | 36 | ```{r read_met1, cache=opts$cache} 37 | dat$met1 <- read_values(opts$met1_file, n=opts$nrow) %>% unlist 38 | names(dat$met1) <- NULL 39 | ``` 40 | 41 | ```{r read_met2, cache=opts$cache} 42 | read_samples <- function(filename) { 43 | h <- read.table(filename, sep='\t', head=F) %>% tbl_df 44 | colnames(h) <- c('id') 45 | h <- as.vector(h$id) 46 | return (h) 47 | } 48 | 49 | samples <- read_samples(opts$met2_samples_file) 50 | dat$met2 <- read_values(opts$met2_file, samples=samples, n=opts$nrow) %>% 51 | rowMeans(na.rm=T) 52 | ``` 53 | 54 | ```{r cache=opts$cache} 55 | dat$met3 <- read_values(opts$met3_file, n=opts$nrow) %>% 56 | rowMeans(na.rm=T) 57 | ``` 58 | 59 | ```{r} 60 | stopifnot(length(dat$met1) == length(dat$met2)) 61 | stopifnot(length(dat$met1) == length(dat$met3)) 62 | d <- data.frame(ficz=dat$met1, m15=dat$met2, m14=dat$met3) %>% tbl_df 63 | d <- d[!(rowSums(is.na(d)) == ncol(d)),] 64 | dat$met <- d 65 | ``` 66 | 67 | ```{r} 68 | r <- cor(dat$met, use='complete.obs') 69 | r 70 | ``` 71 | 72 | ```{r} 73 | plot_cor <- function(name, nmax=50000) { 74 | d <- data.frame(x=dat$met$ficz, y=dat$met[[name]]) 75 | d <- d[complete.cases(d),] 76 | if (nrow(d) > nmax) { 77 | d <- d[sample(nrow(d), nmax),] 78 | } 79 | p <- ggplot(d, aes(x=x, y=y)) + 80 | geom_abline(slope=1, linetype='dashed') + 81 | geom_point(color='blue', size=0.3) + 82 | xlab('Ficz') + ylab(name) + theme_pub() 83 | return (p) 84 | } 85 | ``` 86 | 87 | ## scM&T-Seq 88 | 89 | ```{r fig.width=8, fig.height=8} 90 | plot_cor('m15') 91 | ``` 92 | 93 | ## scBS-Seq 94 | 95 | ```{r fig.width=8, fig.height=8} 96 | plot_cor('m14') 97 | ``` 98 | 99 | ## Both 100 | 101 | ```{r} 102 | nmax <- 50000 103 | d <- dat$met %>% gather(set, value, -ficz) 104 | d <- d[complete.cases(d),] 105 | d <- d %>% group_by(set) %>% sample_n(nmax) %>% ungroup 106 | d <- d %>% mutate(set=factor(set, levels=c('m15', 'm14'), labels=c('scM&T', 'scBS14'))) 107 | ``` 108 | 109 | ```{r fig.width=8, fig.height=8} 110 | h <- d %>% mutate(set=as.vector(set)) %>% arrange(set) 111 | colors <- c('scBS14'='green4', 'scM&T'='royalblue') 112 | p <- ggplot(h, aes(x=ficz, y=value)) + 113 | geom_abline(slope=1, linetype='dashed') + 114 | geom_point(aes(color=set), size=0.8) + 115 | scale_color_manual(values=colors) + 116 | xlab('Methylation bulk') + ylab('Methylation merged scM&T') + theme_pub() 117 | print(p) 118 | ``` 119 | 120 | ```{r} 121 | ggsave(p, file='fig_bulk.pdf', width=8, height=8) 122 | ``` 123 | 124 | ```{r eval=F} 125 | write.csv(p$data, 'S7.csv') 126 | ``` 127 | -------------------------------------------------------------------------------- /cor_bulk/met/src.R: -------------------------------------------------------------------------------- 1 | read_meta <- function(filename, n=NULL) { 2 | h <- 'cut -f 2-5,7,8,12' 3 | if (!is.null(n)) { 4 | h <- sprintf('head -n %d %s | %s', n, filename, h) 5 | } else { 6 | h <- sprintf('%s %s', h, filename) 7 | } 8 | h <- 'cut -f 2-5,7,8,12' 9 | if (!is.null(n)) { 10 | h <- sprintf('head -n %d | %s', n, h) 11 | } 12 | h <- read.table(pipe(h), head=T, sep='\t') 13 | names(h) <- tolower(names(h)) 14 | h <- h %>% rename(chromo=chromosome) 15 | h <- h %>% tbl_df 16 | return (h) 17 | } 18 | 19 | read_all_meta <- function(filenames) { 20 | m <- read_meta(filenames[1]) 21 | if (length(filenames) > 1) { 22 | for (i in 2:length(filenames)) { 23 | s <- read_meta_quick(filenames[i]) 24 | stopifnot(all(m$start == s$start)) 25 | } 26 | } 27 | return (m) 28 | } 29 | 30 | read_values <- function(filename, samples=NULL, n=NULL) { 31 | h <- 'cut -f 13-' 32 | if (!is.null(n)) { 33 | h <- sprintf('head -n %d %s | %s', n, filename, h) 34 | } else { 35 | h <- sprintf('%s %s', h, filename) 36 | } 37 | 38 | h <- read.table(pipe(h), head=T, sep='\t') 39 | if (!is.null(samples)) { 40 | h <- subset(h, select=intersect(colnames(h), samples)) 41 | } 42 | h <- h %>% tbl_df 43 | return (h) 44 | } 45 | 46 | read_all_values <- function(filenames, samples=NULL, n=NULL) { 47 | d <- lapply(filenames, function(x) read_values(x, samples=samples, n=n)) 48 | e <- list() 49 | for (dd in d) { 50 | if (ncol(dd) > 0) { 51 | e[[length(e) + 1]] <- dd 52 | } 53 | } 54 | d <- e 55 | stopifnot(length(d) > 0) 56 | h <- d[[1]] 57 | if (length(d) > 1) { 58 | for (i in 2:length(d)) { 59 | if (ncol(d[[i]]) > 0) { 60 | h <- cbind.data.frame(h, d[[i]]) 61 | } 62 | } 63 | } 64 | h <- h %>% tbl_df 65 | return (h) 66 | } 67 | -------------------------------------------------------------------------------- /cov/m14/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Coverage analysis scBS14 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(ggplot2) 15 | library(dplyr) 16 | library(tidyr) 17 | source('../../lib/utils.R') 18 | ``` 19 | 20 | ```{r} 21 | opts <- list() 22 | opts$reports <- '../data/m14' 23 | opts$names_ <- list.dirs(opts$reports, full=F, rec=F) 24 | ``` 25 | 26 | ```{r} 27 | dat <- list() 28 | cmp <- list() 29 | ``` 30 | 31 | ```{r, cache=T} 32 | get_cov <- function(f) { 33 | d <- read_report_values(f) 34 | d <- d[,grep('Ser', colnames(d))] 35 | stopifnot(ncol(d) == 20) 36 | cov <- rowSums(d > 0) 37 | return (cov) 38 | } 39 | 40 | d <- list() 41 | for (name in opts$names_) { 42 | f <- file.path(opts$reports, name, 'weights.txt') 43 | if (file.exists(f)) { 44 | message(f) 45 | cov <- get_cov(f) 46 | di <- data.frame(name=name, id_=1:length(cov), cov=cov) 47 | d[[length(d) + 1]] <- di 48 | } 49 | } 50 | d <- do.call(rbind.data.frame, d) %>% tbl_df 51 | d <- d %>% mutate(name=factor(name)) %>% 52 | mutate(name=factor(name, levels=rev(sort(levels(name))))) 53 | cov <- d 54 | ``` 55 | 56 | ```{r} 57 | cmp$cov <- cov 58 | ``` 59 | 60 | ```{r} 61 | cov_min <- function(d, n) { 62 | h <- sapply(0:n, function(x) sum(d$cov >= x) / length(d$cov)) 63 | h <- data.frame(name=d$name[[1]], n=1:length(h), per=h) 64 | return (h) 65 | } 66 | 67 | n <- max(cmp$cov$cov) 68 | cmp$cov_min <- cmp$cov %>% group_by(name) %>% do(cov_min(., n)) %>% ungroup %>% tbl_df 69 | ``` 70 | 71 | ```{r} 72 | saveRDS(cmp, 'cov.rds') 73 | ``` 74 | 75 | ```{r fig.width=8, fig.height=6} 76 | d <- cmp$cov_min %>% mutate(name=factor(name, levels=rev(levels(name)))) 77 | ggplot(d, aes(x=n, y=per, color=name)) + 78 | geom_line(lwd=0.6) + 79 | xlab('Minimum number of cell') + ylab('Coverage') + 80 | theme_pub() + 81 | guides(color=guide_legend(title='Context', ncol=4)) 82 | ``` 83 | 84 | ```{r fig.width=12, fig.height=10} 85 | d <- cmp$cov_min %>% mutate(name=factor(name, levels=rev(levels(name)))) 86 | ggplot(d, aes(x=n, y=per, color=name)) + 87 | geom_line(lwd=0.6) + 88 | facet_wrap(~name) + 89 | xlab('Minimum number of cell') + ylab('Coverage') + 90 | theme_pub() + 91 | guides(color=guide_legend(title='Context', ncol=4)) 92 | ``` 93 | -------------------------------------------------------------------------------- /cov/m14_m15P3/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Coverage analysis 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(ggplot2) 15 | library(dplyr) 16 | library(tidyr) 17 | source('../../lib/utils.R') 18 | ``` 19 | 20 | ```{r} 21 | opts <- list() 22 | opts$colors <- c('scBS-Seq'='forestgreen', 'scM&T-Seq'='royalblue') 23 | opts$global <- c('10kb'='w10k_s1k', '5kb'='w5k_s1k', '1kb'='w1k_s1k') 24 | ``` 25 | 26 | ```{r} 27 | dat <- list() 28 | dat$m14 <- readRDS('../m14/cov.rds') 29 | dat$m15 <- readRDS('../m15P3/cov.rds') 30 | ``` 31 | 32 | ```{r} 33 | d <- list() 34 | for (n in c('m14', 'm15')) { 35 | di <- dat[[n]]$cov_min 36 | di$np <- di$n / max(di$n) 37 | di$study <- n 38 | d[[length(d) + 1]] <- di 39 | } 40 | d <- do.call(rbind.data.frame, d) %>% 41 | mutate( 42 | study=factor(study, levels=c('m14', 'm15'), labels=c('scBS-Seq', 'scM&T-Seq')), 43 | name=factor(name)) %>% 44 | mutate(name=factor(name, levels=sort(levels(name)))) 45 | dat$cov_min <- d 46 | ``` 47 | 48 | ## Genome-wide 49 | 50 | ```{r} 51 | d <- dat$cov_min %>% filter(name %in% opts$global) %>% 52 | mutate(name=factor(name, levels=opts$global, 53 | labels=names(opts$global))) %>% droplevels 54 | ``` 55 | 56 | ```{r eval=F} 57 | write.csv(d, 'S4.csv') 58 | ``` 59 | 60 | ```{r fig.width=8, fig.height=6} 61 | p <- ggplot(d, aes(x=n, y=per * 100, color=study)) + 62 | geom_line(aes(linetype=name), lwd=0.8) + 63 | scale_color_manual(values=opts$colors) + 64 | guides(color=guide_legend(title='Study'), 65 | linetype=guide_legend(title='Window size')) + 66 | xlab('Minimum number of cells') + 67 | ylab('Percentage of genomic contexts covered') + 68 | theme_pub() + 69 | theme( 70 | axis.title.x=element_text(vjust=-0.8), 71 | axis.title.y=element_text(vjust=1.2) 72 | ) 73 | print(p) 74 | ``` 75 | 76 | ```{r eval=F} 77 | ggsave(p, file='fig_genome.pdf', width=8, height=6) 78 | ``` 79 | 80 | ## Context specific 81 | 82 | ```{r} 83 | d <- dat$cov_min %>% filter(!(name %in% opts$global)) %>% droplevels 84 | ``` 85 | 86 | ```{r eval=F} 87 | write.csv(d, 'S5.csv') 88 | ``` 89 | 90 | ```{r fig.width=12, fig.height=12} 91 | p <- ggplot(d, aes(x=n, y=per * 100, color=study)) + 92 | geom_line(lwd=0.8) + 93 | scale_color_manual(values=opts$colors) + 94 | guides(color=guide_legend(title='Study')) + 95 | facet_wrap(~name, ncol=3) + 96 | xlab('Minimum number of cells') + 97 | ylab('Percentage of sites covered') + 98 | theme_pub() + 99 | theme( 100 | axis.title.x=element_text(vjust=-0.8), 101 | axis.title.y=element_text(vjust=1.2) 102 | ) 103 | print(p) 104 | ``` 105 | 106 | ```{r eval=F} 107 | ggsave(p, file='fig_context.pdf', width=12, height=13) 108 | ``` 109 | -------------------------------------------------------------------------------- /cov/m15P3/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Coverage analysis CSCP3 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(ggplot2) 15 | library(dplyr) 16 | library(tidyr) 17 | source('../../lib/utils.R') 18 | ``` 19 | 20 | ```{r} 21 | opts <- list() 22 | opts$reports <- '../data/m15P3' 23 | opts$samples <- '../../data/samples/samples.csv' 24 | opts$names_ <- list.dirs(opts$reports, full=F, rec=F) 25 | ``` 26 | 27 | ```{r} 28 | dat <- list() 29 | dat$samples <- read_samples_list(opts$samples) 30 | cmp <- list() 31 | ``` 32 | 33 | ```{r, cache=T} 34 | get_cov <- function(f) { 35 | d <- read_report_values(f, samples=dat$samples) 36 | cov <- rowSums(d > 0) 37 | return (cov) 38 | } 39 | 40 | d <- list() 41 | for (name in opts$names_) { 42 | f <- file.path(opts$reports, name, 'weights.txt') 43 | if (file.exists(f)) { 44 | message(f) 45 | cov <- get_cov(f) 46 | di <- data.frame(name=name, id_=1:length(cov), cov=cov) 47 | d[[length(d) + 1]] <- di 48 | } 49 | } 50 | d <- do.call(rbind.data.frame, d) %>% tbl_df 51 | d <- d %>% mutate(name=factor(name)) %>% 52 | mutate(name=factor(name, levels=rev(sort(levels(name))))) 53 | cov <- d 54 | ``` 55 | 56 | ```{r} 57 | cmp$cov <- cov 58 | ``` 59 | 60 | ```{r} 61 | cov_min <- function(d, n) { 62 | h <- sapply(0:n, function(x) sum(d$cov >= x) / length(d$cov)) 63 | h <- data.frame(name=d$name[[1]], n=1:length(h), per=h) 64 | return (h) 65 | } 66 | 67 | n <- max(cmp$cov$cov) 68 | cmp$cov_min <- cmp$cov %>% group_by(name) %>% do(cov_min(., n)) %>% ungroup %>% tbl_df 69 | ``` 70 | 71 | ```{r} 72 | saveRDS(cmp, 'cov.rds') 73 | ``` 74 | 75 | ```{r fig.width=8, fig.height=6} 76 | d <- cmp$cov_min %>% mutate(name=factor(name, levels=rev(levels(name)))) 77 | ggplot(d, aes(x=n, y=per, color=name)) + 78 | geom_line(lwd=0.6) + 79 | xlab('Minimum number of cell') + ylab('Coverage') + 80 | theme_pub() + 81 | guides(color=guide_legend(title='Context', ncol=4)) 82 | ``` 83 | 84 | ```{r fig.width=12, fig.height=10} 85 | d <- cmp$cov_min %>% mutate(name=factor(name, levels=rev(levels(name)))) 86 | ggplot(d, aes(x=n, y=per, color=name)) + 87 | geom_line(lwd=0.6) + 88 | facet_wrap(~name) + 89 | xlab('Minimum number of cell') + ylab('Coverage') + 90 | theme_pub() + 91 | guides(color=guide_legend(title='Context', ncol=4)) 92 | ``` 93 | -------------------------------------------------------------------------------- /data/expr/data_proc/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Preprocessing expression data 3 | date: 151014 4 | output: 5 | html_document: 6 | toc: yes 7 | --- 8 | 9 | ```{r, include=F} 10 | library(knitr) 11 | opts_chunk$set(echo=F, warning=F, message=F) 12 | options(dplyr.width=200) 13 | ``` 14 | 15 | ```{r, include=F} 16 | library(dplyr) 17 | library(tidyr) 18 | library(ggplot2) 19 | library(RColorBrewer) 20 | library(gplots) 21 | library(gridExtra) 22 | source('../../../lib/utils.R') 23 | ``` 24 | 25 | ```{r} 26 | opts <- list() 27 | opts$expr_files <- c('../data_raw/counts.txt') 28 | opts$genes_file <- '../data_raw/genes.txt' 29 | opts$out_base <- './data' 30 | opts$samples_file <- '../../samples/samples_stats.csv' 31 | opts$samples_select_file <- '../../samples/samples.csv' 32 | opts$plot_heat <- T 33 | ``` 34 | 35 | ```{r opts_fil_expr, echo=T} 36 | opts$fil$expr$cov <- 10 # Minimum number of cells with minimum number of counts 37 | opts$fil$expr$counts <- 10 # Minimum number of raw counts 38 | opts$fil$expr$top_var <- 7500 # Number of most variable genes to be selected 39 | opts$fil$expr$include <- NULL 40 | opts$samples_methods <- NULL 41 | opts$samples_batch2 <- NULL 42 | opts$adjust <- F 43 | ``` 44 | 45 | ```{r} 46 | read_expr <- function(filename, log=T, gather=T) { 47 | d <- read.csv(filename, head=T, sep='\t') 48 | stopifnot(sum(duplicated(d$ens_id)) == 0) 49 | h <- names(d) != 'ens_id' 50 | if (log) { 51 | d[,h] <- log_counts(d[,h]) 52 | } 53 | d$id_ <- 1:nrow(d) 54 | if (gather) { 55 | d <- gather(d, sample, value, -c(id_, ens_id)) 56 | d <- d %>% tbl_df %>% rename(expr=value) 57 | } 58 | return (d) 59 | } 60 | 61 | read_all_expr <- function(filenames, log=T, gather=T, samples=NULL) { 62 | d <- NULL 63 | for (filename in filenames) { 64 | h <- read_expr(filename, log=log, gather=F) %>% select(-id_) 65 | if (!is.null(samples)) { 66 | h <- subset(h, select=intersect(colnames(h), c('ens_id', samples))) 67 | } 68 | if (is.null(d)) { 69 | d <- h 70 | } else { 71 | d <- d %>% inner_join(h, by='ens_id') 72 | } 73 | } 74 | d$id_ <- 1:nrow(d) 75 | 76 | if (gather) { 77 | d <- gather(d, sample, value, -c(id_, ens_id)) 78 | d <- d %>% rename(expr=value) 79 | } 80 | d <- d %>% mutate(ens_id=factor(ens_id)) %>% tbl_df 81 | return (d) 82 | } 83 | 84 | read_samples <- function(filename) { 85 | h <- read.table(filename, sep='\t', head=T) %>% tbl_df 86 | h$sample <- h$id 87 | return (h) 88 | } 89 | 90 | read_samples_select <- function(filename) { 91 | h <- read.table(filename, sep='\t', head=F) %>% tbl_df 92 | colnames(h) <- c('id') 93 | h <- as.vector(h$id) 94 | return (h) 95 | } 96 | 97 | filter_expr <- function(d, cov, counts, include=NULL) { 98 | lcounts <- log_counts(counts) 99 | if (cov < 1) { 100 | n <- length(levels(d$sample)) 101 | ncov <- n * cov 102 | } else { 103 | ncov <- cov 104 | } 105 | f <- d %>% group_by(id_, ens_id) %>% 106 | dplyr::summarize(cov=sum(expr >= lcounts)) %>% 107 | filter(cov >= ncov) %>% ungroup %>% select(ens_id) %>% unlist %>% as.vector 108 | if (!is.null(include)) { 109 | f <- union(f, include) 110 | } 111 | df <- d %>% filter(ens_id %in% f) 112 | return (df) 113 | } 114 | 115 | filter_expr_var <- function(d, n, include=NULL) { 116 | f <- d %>% group_by(id_, ens_id) %>% 117 | dplyr::summarize(var=var(expr)) %>% as.data.frame %>% 118 | dplyr::arrange(desc(var)) %>% head(n) %>% 119 | ungroup %>% select(ens_id) %>% unlist %>% as.vector 120 | if (!is.null(include)) { 121 | f <- union(f, include) 122 | } 123 | df <- d %>% filter(ens_id %in% f) 124 | return (df) 125 | } 126 | 127 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL, 128 | sel=NULL) { 129 | f <- excl 130 | if (!is.null(f)) { 131 | d <- d %>% filter(!(sample %in% f)) 132 | } 133 | f <- methods 134 | if (!is.null(f) && 'method' %in% colnames(d)) { 135 | d <- d %>% filter(method %in% f) 136 | } 137 | f <- batch2 138 | if (!is.null(f) && 'batch2' %in% colnames(d)) { 139 | d <- d %>% filter(batch2 %in% f) 140 | } 141 | if (!is.null(sel)) { 142 | d <- d %>% filter(sample %in% sel) 143 | } 144 | d <- droplevels(d) 145 | 146 | return (d) 147 | } 148 | ``` 149 | 150 | ```{r read} 151 | dat <- list() 152 | dat$samples <- read_samples(opts$samples_file) 153 | dat$samples_select <- read_samples_select(opts$samples_select_file) 154 | dat$samples <- filter_samples(dat$samples, 155 | excl=opts$samples_excl, 156 | methods=opts$samples_methods, 157 | sel=dat$samples_select) 158 | 159 | dat$expr <- read_all_expr(opts$expr_files, samples=levels(dat$samples$sample)) 160 | stopifnot(length(levels(dat$expr$sample)) == length(levels(dat$samples$sample))) 161 | ``` 162 | 163 | `r nrow(dat$samples)` samples 164 | 165 | 166 | ```{r filter} 167 | dat$fil$expr <- filter_expr(dat$expr, opts$fil$expr$cov, opts$fil$expr$counts, opts$fil$expr$include) 168 | ``` 169 | 170 | ```{r} 171 | n_unique <- function(d) { 172 | return (length(unique(d))) 173 | } 174 | ``` 175 | 176 | `r n_unique(dat$fil$expr$id_)` out of `r n_unique(dat$expr$id_)` genes 177 | passed expression filter. 178 | 179 | 180 | ```{r adjust, eval=opts$adjust} 181 | adjust_batch_expr <- function(d, what='batch2') { 182 | d <- d %>% rename(y=expr) 183 | stopifnot(all(!is.na(d$y))) 184 | d <- d %>% inner_join(select_(dat$samples, 'sample', x=what), by='sample') 185 | h <- d %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup 186 | d <- h %>% select(-c(y, x)) %>% rename(expr=yr) 187 | } 188 | 189 | dat$fil$expr <- adjust_batch_expr(dat$fil$expr) 190 | ``` 191 | 192 | Taking the `r opts$fil$expr$top_var` most variable genes. 193 | 194 | ```{r filter_var} 195 | dat$fil$expr_var <- filter_expr_var(dat$fil$expr, opts$fil$expr$top_var, opts$fil$expr$include) 196 | ``` 197 | 198 | ```{r} 199 | read_genes <- function(filename) { 200 | h <- read.table(pipe(paste('cut -f 2-5,7,8', filename)), head=T, sep='\t') 201 | names(h) <- tolower(names(h)) 202 | h <- h %>% rename(chromo=chromosome, ens_id=id, gene_id=feature) 203 | return (h) 204 | } 205 | 206 | dat$genes <- read_genes(opts$genes_file) 207 | ``` 208 | 209 | ```{r join_genes} 210 | dat$out <- dat$fil$expr_var %>% 211 | inner_join(dat$genes, by=c('ens_id')) 212 | dat$out$ens_id <- factor(dat$out$ens_id) 213 | dat$out <- dat$out %>% arrange(ens_id) 214 | dat$outm <- dat$out %>% spread(sample, expr) 215 | dat$fil$genes <- dat$genes %>% inner_join(select(dat$outm, ens_id, id_), by='ens_id') 216 | dat$fil$genes$ens_id <- as.factor(dat$fil$genes$ens_id) 217 | stopifnot(nrow(dat$fil$genes) == nrow(dat$outm)) 218 | ``` 219 | 220 | `r length(unique(dat$out$id_))` final records. 221 | 222 | ```{r write} 223 | save_rds <- function(d, name) { 224 | f <- sprintf('%s_%s.rds', opts$out_base, name) 225 | saveRDS(d, f) 226 | } 227 | 228 | dat$out <- dat$out %>% tbl_df 229 | dat$fil$genes <- dat$fil$genes %>% tbl_df 230 | dat$genes <- dat$genes %>% tbl_df 231 | 232 | save_rds(dat$out, 'expr') 233 | save_rds(dat$fil$genes, 'meta') 234 | save_rds(dat$genes, 'meta_all') 235 | ``` 236 | 237 | 238 | ```{r} 239 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') { 240 | t <- data.frame(sample=factor(rownames(pc_vec)), 241 | pcx=pc_vec[,x], pcy=pc_vec[,y]) 242 | t <- t %>% left_join(dat$samples, by='sample') 243 | t$sample_short <- t$sample 244 | t$sample_short <- sub('^CSCP3_SERUM_', '', t$sample) 245 | p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) + 246 | geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) + 247 | xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) + 248 | theme(legend.position='right') 249 | return (p) 250 | } 251 | 252 | plot_pca_val <- function(pc_val) { 253 | t <- data.frame(pc=1:length(pc_val), val=pc_val) 254 | p <- ggplot(t, aes(x=pc, y=val)) + 255 | geom_bar(stat='identity', fill='salmon', color='black') + 256 | xlab('principle component') + 257 | ylab('% variance explained') 258 | return (p) 259 | } 260 | 261 | plot_heat <- function(d, Rowv=T, xlab='value') { 262 | colors <- rev(brewer.pal(9, 'Spectral')) 263 | colors <- colorRampPalette(colors)(50) 264 | labRow <- NA 265 | if (nrow(d) > 500) { 266 | dendro='column' 267 | } else { 268 | dendro = 'both' 269 | } 270 | p <- heatmap.2(d, density.info='none', trace='none', col=colors, 271 | Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA, 272 | lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab) 273 | return (p) 274 | } 275 | ``` 276 | 277 | ```{r} 278 | d <- dat$out %>% select(id_, sample, expr) %>% spread(sample, expr) %>% 279 | select(-id_) 280 | pc <- pca(as.matrix(d)) 281 | ``` 282 | 283 | ```{r fig.width=8, fig.height=7} 284 | p1 <- plot_pca_vec(pc$vec) 285 | p1 + theme(legend.position='top') 286 | ``` 287 | 288 | ```{r fig.width=10, fig.height=5} 289 | p2 <- plot_pca_val(pc$val) 290 | p2 291 | ``` 292 | 293 | ```{r message=F, echo=F, eval=opts$plot_heat, fig.width=10, fig.height=10} 294 | max_rows <- 1000 295 | if (nrow(d) > max_rows) { 296 | d <- d %>% sample_n(max_rows) 297 | } 298 | h <- plot_heat(as.matrix(d)) 299 | ``` 300 | -------------------------------------------------------------------------------- /data/join/README.md: -------------------------------------------------------------------------------- 1 | Preprocessed data of scM&T 2 | ========================== 3 | 4 | 5 | ```r 6 | scMT <- readRDS('data.rds') 7 | cols <- c('name', 'id_.x', 'id_.y', 'sample', 'met', 'weight', 'expr', 8 | 'chromo.x', 'start.x', 'end.x', 'chromo.y', 'start.y', 'end.y', 'strand', 9 | 'ens_id', 'gene_id') 10 | scMT <- scMT[, c(cols)] 11 | glimpse(scMT) 12 | ``` 13 | 14 | ``` 15 | ## Observations: 7,709,302 16 | ## Variables: 16 17 | ## $ name (fctr) H3K27ac, H3K27ac, H3K27ac, H3K27ac, H3K27ac, H3K27ac... 18 | ## $ id_.x (int) 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 1... 19 | ## $ id_.y (int) 9848, 9848, 9848, 9848, 9848, 9848, 9848, 9848, 9848,... 20 | ## $ sample (fctr) CSCP3_SERUM_A02, CSCP3_SERUM_A03, CSCP3_SERUM_A04, C... 21 | ## $ met (dbl) 70.588234, 100.000000, NaN, 8.333333, 16.666666, 0.00... 22 | ## $ weight (dbl) 17, 12, 0, 12, 6, 6, 0, 5, 0, 40, 17, 0, 5, 15, 13, 3... 23 | ## $ expr (dbl) 0.4506199, 1.2592708, 0.4329859, 0.4880455, 0.3933367... 24 | ## $ chromo.x (fctr) 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, ... 25 | ## $ start.x (int) 108920349, 108920349, 108920349, 108920349, 108920349... 26 | ## $ end.x (int) 108950783, 108950783, 108950783, 108950783, 108950783... 27 | ## $ chromo.y (fctr) 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, ... 28 | ## $ start.y (int) 108909110, 108909110, 108909110, 108909110, 108909110... 29 | ## $ end.y (int) 108912460, 108912460, 108912460, 108912460, 108912460... 30 | ## $ strand (fctr) +, +, +, +, +, +, +, +, +, +, +, +, +, +, +, +, +, +... 31 | ## $ ens_id (fctr) ENSMUSG00000000142, ENSMUSG00000000142, ENSMUSG00000... 32 | ## $ gene_id (fctr) Axin2, Axin2, Axin2, Axin2, Axin2, Axin2, Axin2, Axi... 33 | ``` 34 | 35 | * `name`: Name of annotation. 36 | * `id_.x`: Identifier of methylated region. 37 | * `id_.y`: Identifier of gene matched to methylated region. 38 | * `sample`: Cell identifier. 39 | * `met`: Methylation rate of methylated region. 40 | * `weight`: Weight of methylated region proportional to the number of covered 41 | CpG sites in that region. 42 | * `expr: Expression rate of gene. 43 | * `ens_id`: ENSEMBL gene identifier. 44 | * `gene_id`: Gene name. 45 | * `chromo.x`: Chromosome of methylated region. 46 | * `start.x`: Start position of methylated region. 47 | * `end.x`: End position of methylated region. 48 | * `chromo.y`: Chromosome of gene. 49 | * `start.y`: Start position of gene. 50 | * `end.y`: End position of gene. 51 | 52 | -------------------------------------------------------------------------------- /data/join/README.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Preprocessed data of scM&T" 3 | author: "Christof Angermueller" 4 | date: "`r format(Sys.time(), '%Y-%m-%d')`" 5 | output: 6 | html_document: 7 | toc: no 8 | --- 9 | 10 | ```{r, include=F} 11 | library(knitr) 12 | opts_chunk$set(echo=T, warning=F, message=F) 13 | ``` 14 | 15 | ```{r, include=F} 16 | library(ggplot2) 17 | library(dplyr) 18 | library(tidyr) 19 | ``` 20 | 21 | ```{r} 22 | scMT <- readRDS('data.rds') 23 | cols <- c('name', 'id_.x', 'id_.y', 'sample', 'met', 'weight', 'expr', 24 | 'chromo.x', 'start.x', 'end.x', 'chromo.y', 'start.y', 'end.y', 'strand', 25 | 'ens_id', 'gene_id') 26 | scMT <- scMT[, c(cols)] 27 | glimpse(scMT) 28 | ``` 29 | 30 | * `name`: Name of annotation. 31 | * `id_.x`: Identifier of methylated region. 32 | * `id_.y`: Identifier of gene matched to methylated region. 33 | * `sample`: Cell identifier. 34 | * `met`: Methylation rate of methylated region. 35 | * `weight`: Weight of methylated region proportional to the number of covered 36 | CpG sites in that region. 37 | * `expr: Expression rate of gene. 38 | * `ens_id`: ENSEMBL gene identifier. 39 | * `gene_id`: Gene name. 40 | * `chromo.x`: Chromosome of methylated region. 41 | * `start.x`: Start position of methylated region. 42 | * `end.x`: End position of methylated region. 43 | * `chromo.y`: Chromosome of gene. 44 | * `start.y`: Start position of gene. 45 | * `end.y`: End position of gene. 46 | 47 | -------------------------------------------------------------------------------- /data/join/data.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PMBio/scMT-seq/cabc3b66bc8e1d9f61bb2b6d903018c0c88d8093/data/join/data.rds -------------------------------------------------------------------------------- /data/met/join/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Join methlation data 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(dplyr) 15 | ``` 16 | 17 | ```{r} 18 | opts <- list() 19 | opts$files <- Sys.glob('../prepro/*/data_met.rds') 20 | ``` 21 | 22 | ```{r join} 23 | d <- list() 24 | dm <- list() 25 | for (f in opts$files) { 26 | name <- basename(dirname(f)) 27 | d[[name]] <- readRDS(f) 28 | d[[name]]$name <- name 29 | fm <- file.path(dirname(f), 'data_meta.rds') 30 | dm[[name]] <- readRDS(fm) 31 | dm[[name]]$name <- name 32 | } 33 | h <- sort(names(dm)) 34 | d <- do.call(rbind.data.frame, d) %>% mutate(name=factor(name, levels=h)) 35 | dm <- do.call(rbind.data.frame, dm) %>% mutate(name=factor(name, levels=h)) 36 | d <- d %>% droplevels %>% tbl_df 37 | dm <- dm %>% droplevels %>% tbl_df 38 | saveRDS(d, 'data_met.rds') 39 | saveRDS(dm, 'data_meta.rds') 40 | ``` 41 | -------------------------------------------------------------------------------- /data/met/prepro/H3K27ac/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Preprocessing Seqmonk methylation files 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(dplyr) 15 | library(tidyr) 16 | library(ggplot2) 17 | library(RColorBrewer) 18 | library(gplots) 19 | library(gridExtra) 20 | source('../../../../lib/utils.R') 21 | ``` 22 | 23 | ```{r opts, echo=T} 24 | opts <- list() 25 | opts$fil$cov <- 30 26 | opts$fil$min_var <- 10 27 | opts$fil$top_var <- 0.75 28 | opts$adjust <- F 29 | opts$scale_adjust <- T 30 | opts$samples_excl <- NULL 31 | opts$samples_methods <- NULL 32 | opts$samples_batch2 <- NULL 33 | ``` 34 | 35 | ```{r} 36 | opts$name <- basename(getwd()) 37 | opts$seqmonk_dirs <- c( 38 | file.path('../../reports', opts$name)) 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt')) 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt')) 41 | opts$out_base <- './data' 42 | opts$samples_file <- '../../../samples/samples_stats.csv' 43 | opts$samples_select_file <- '../../../samples/samples.csv' 44 | opts$cache <- F 45 | opts$plot_heat <- F 46 | ``` 47 | 48 | **Name**: `r opts$name` 49 | 50 | 51 | 52 | ```{r read_src} 53 | read_meta <- function(filename) { 54 | h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t') 55 | names(h) <- tolower(names(h)) 56 | h <- h %>% rename(chromo=chromosome) 57 | h <- h %>% tbl_df 58 | return (h) 59 | } 60 | 61 | read_meta_quick <- function(filename) { 62 | h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t') 63 | names(h) <- tolower(names(h)) 64 | h <- h %>% tbl_df 65 | return (h) 66 | } 67 | 68 | read_all_meta <- function(filenames) { 69 | m <- read_meta(filenames[1]) 70 | if (length(filenames) > 1) { 71 | for (i in 2:length(filenames)) { 72 | s <- read_meta_quick(filenames[i]) 73 | stopifnot(all(m$start == s$start)) 74 | } 75 | } 76 | return (m) 77 | } 78 | 79 | read_values <- function(filename, samples=NULL) { 80 | h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t') 81 | if (!is.null(samples)) { 82 | h <- subset(h, select=intersect(colnames(h), samples)) 83 | } 84 | h <- h %>% tbl_df 85 | return (h) 86 | } 87 | 88 | read_all_values <- function(filenames, samples=NULL) { 89 | d <- lapply(filenames, function(x) read_values(x, samples=samples)) 90 | e <- list() 91 | for (dd in d) { 92 | if (ncol(dd) > 0) { 93 | e[[length(e) + 1]] <- dd 94 | } 95 | } 96 | d <- e 97 | stopifnot(length(d) > 0) 98 | h <- d[[1]] 99 | if (length(d) > 1) { 100 | for (i in 2:length(d)) { 101 | if (ncol(d[[i]]) > 0) { 102 | h <- cbind.data.frame(h, d[[i]]) 103 | } 104 | } 105 | } 106 | h <- h %>% tbl_df 107 | return (h) 108 | } 109 | 110 | read_samples <- function(filename) { 111 | h <- read.table(filename, sep='\t', head=T) %>% tbl_df 112 | h$sample <- h$id 113 | return (h) 114 | } 115 | 116 | read_samples_select <- function(filename) { 117 | h <- read.table(filename, sep='\t', head=F) %>% tbl_df 118 | colnames(h) <- c('id') 119 | h <- as.vector(h$id) 120 | return (h) 121 | } 122 | 123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL, 124 | sel=NULL) { 125 | f <- excl 126 | if (!is.null(f)) { 127 | d <- d %>% filter(!(sample %in% f)) 128 | } 129 | f <- methods 130 | if (!is.null(f) && 'method' %in% colnames(d)) { 131 | d <- d %>% filter(method %in% f) 132 | } 133 | f <- batch2 134 | if (!is.null(f) && 'batch2' %in% colnames(d)) { 135 | d <- d %>% filter(batch2 %in% f) 136 | } 137 | if (!is.null(sel)) { 138 | d <- d %>% filter(sample %in% sel) 139 | } 140 | d <- droplevels(d) 141 | 142 | return (d) 143 | } 144 | ``` 145 | 146 | ```{r} 147 | dat <- list() 148 | dat$samples <- read_samples(opts$samples_file) 149 | dat$samples_select <- read_samples_select(opts$samples_select_file) 150 | dat$samples <- filter_samples(dat$samples, 151 | excl=opts$samples_excl, 152 | methods=opts$samples_methods, 153 | batch2=opts$samples_batch2, 154 | sel=dat$samples_select) 155 | 156 | dat$meta <- read_all_meta(opts$rates_files) 157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample)) 158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample)) 159 | 160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100))) 161 | stopifnot(all(round(dat$weights) == dat$weights)) 162 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 163 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 164 | ``` 165 | 166 | 167 | 168 | ## Preprocessing 169 | 170 | ```{r} 171 | get_cov <- function(d, perc=TRUE) { 172 | cov <- rowSums(!is.na(d)) 173 | if (perc) { 174 | cov <- cov / ncol(d) 175 | } 176 | return (cov) 177 | } 178 | 179 | get_var <- function(d) { 180 | return (apply(d, 1, var, na.rm=T)) 181 | } 182 | ``` 183 | 184 | ```{r} 185 | h <- duplicated(dat$meta) 186 | print(sprintf('Remove %d duplicated records.', sum(h))) 187 | dat$meta <- dat$meta[!h, ] 188 | dat$meta$id_ <- 1:nrow(dat$meta) 189 | for (n in c('rates', 'weights')) { 190 | dat[[n]] <- dat[[n]][!h,] 191 | dat[[n]]$id_ <- dat$meta$id_ 192 | } 193 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 194 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 195 | ``` 196 | 197 | ```{r filter_cov} 198 | d <- dat$rates 199 | print(sprintf('%d records in total.', nrow(d))) 200 | if (!is.na(opts$fil$cov)) { 201 | d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,] 202 | } 203 | print(sprintf('%d records passed coverage filter.', nrow(d))) 204 | dat$rates <- d 205 | ``` 206 | 207 | ```{r} 208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') { 209 | t <- data.frame(sample=factor(rownames(pc_vec)), 210 | pcx=pc_vec[,x], pcy=pc_vec[,y]) 211 | t <- t %>% left_join(dat$samples, by='sample') 212 | t$sample_short <- sub('^CSCP3_SERUM', '', t$sample) 213 | p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) + 214 | geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) + 215 | xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) + 216 | theme(legend.position='bottom') 217 | return (p) 218 | } 219 | 220 | plot_pca_val <- function(pc_val) { 221 | t <- data.frame(pc=1:length(pc_val), val=pc_val) 222 | p <- ggplot(t, aes(x=pc, y=val)) + 223 | geom_bar(stat='identity', fill='salmon', color='black') + 224 | xlab('principle component') + 225 | ylab('% variance explained') 226 | return (p) 227 | } 228 | ``` 229 | 230 | 231 | ## PCA unadjusted data 232 | 233 | ```{r} 234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 235 | ``` 236 | 237 | ```{r fig.width=8, fig.height=7} 238 | p1 <- plot_pca_vec(pc$vec) 239 | p1 + theme(legend.position='top') 240 | ``` 241 | 242 | ```{r fig.width=10, fig.height=5} 243 | p2 <- plot_pca_val(pc$val) 244 | p2 245 | ``` 246 | 247 | ```{r} 248 | d <- dat$rates %>% select(-id_) 249 | cov <- rowSums(!is.na(d)) / ncol(d) 250 | d[h == ncol(d)] 251 | ``` 252 | 253 | ```{r adjust, eval=opts$adjust} 254 | scale_rates <- function(d) { 255 | h <- d %>% select(-id_) 256 | a <- min(h, na.rm=T) 257 | b <- max(h, na.rm=T) 258 | h <- (h - a) / (b - a) * 100 259 | h$id_ <- d$id_ 260 | return (h) 261 | } 262 | 263 | adjust_batch_met <- function(d, what='batch2', scale=F) { 264 | is_na <- is.na(d) 265 | h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_) 266 | h <- h %>% gather(sample, y, -id_) %>% 267 | inner_join(select_(dat$samples, 'sample', x=what), by='sample') 268 | h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup 269 | h <- h %>% select(id_, sample, y=yr) 270 | h <- h %>% spread(sample, y) 271 | stopifnot(all(dim(is_na) == dim(h))) 272 | stopifnot(all(colnames(is_na) %in% colnames(h))) 273 | h <- h[,colnames(is_na)] 274 | h[is_na] <- NA 275 | return (h) 276 | } 277 | 278 | h <- dat$rates %>% adjust_batch_met 279 | if (opts$scale_adjust) { 280 | h <- scale_rates(h) 281 | } 282 | dat$rates <- h 283 | ``` 284 | 285 | ## PCA adjusted data 286 | 287 | ```{r} 288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 289 | ``` 290 | 291 | ```{r fig.width=8, fig.height=7} 292 | p1 <- plot_pca_vec(pc$vec) 293 | p1 + theme(legend.position='top') 294 | ``` 295 | 296 | ```{r fig.width=10, fig.height=5} 297 | p2 <- plot_pca_val(pc$val) 298 | p2 299 | ``` 300 | 301 | ## Filtering 302 | 303 | ```{r filter_var} 304 | d <- dat$rates 305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples 306 | d <- d[!is.na(h),] 307 | if (!is.na(opts$fil$min_var)) { 308 | stopifnot(sum(is.na(h)) == 0) 309 | d <- d[h >= opts$fil$min_var,] 310 | } 311 | print(sprintf('%d records passed min var filter.', nrow(d))) 312 | 313 | if (!is.na(opts$fil$top_var)) { 314 | t <- opts$fil$top_var 315 | if (t <= 1) { 316 | t <- as.integer(t * nrow(d)) 317 | } 318 | d <- d[order(get_var(select(d, -id_)), decreasing=T),] 319 | d <- d[1:t,] 320 | } 321 | print(sprintf('%d final record.', nrow(d))) 322 | dat$rates <- d 323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_)) 324 | ``` 325 | 326 | ```{r join} 327 | d <- dat$meta %>% select(chromo, start, end, id_) %>% 328 | inner_join(dat$rates, by='id_') 329 | stopifnot(nrow(d) == nrow(dat$rates)) 330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_)) 331 | 332 | d <- dat$weights %>% gather(sample, weight, -id_) 333 | h <- nrow(dat$df) 334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample')) 335 | 336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup 337 | 338 | stopifnot(h == nrow(dat$df)) 339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1)) 340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0)) 341 | ``` 342 | 343 | ```{r} 344 | dat$df <- dat$df %>% ungroup %>% tbl_df 345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df 346 | ``` 347 | 348 | ```{r write} 349 | save_rds <- function(d, name) { 350 | f <- sprintf('%s_%s.rds', opts$out_base, name) 351 | saveRDS(d, f) 352 | } 353 | 354 | save_rds(dat$df, 'met') 355 | save_rds(dat$meta, 'meta') 356 | ``` 357 | 358 | 359 | 360 | ```{r eval=opts$plot_heat, fig.height=12} 361 | plot_heat <- function(d, Rowv=T, xlab='value') { 362 | colors <- rev(brewer.pal(9, 'Spectral')) 363 | colors <- colorRampPalette(colors)(50) 364 | 365 | labRow <- NA 366 | if (nrow(d) > 500) { 367 | dendro='column' 368 | } else { 369 | dendro = 'both' 370 | } 371 | p <- heatmap.2(d, density.info='none', trace='none', col=colors, 372 | Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA, 373 | lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab) 374 | return (p) 375 | } 376 | 377 | d <- dat$rates 378 | max_rows <- 1000 379 | if (nrow(d) > max_rows) { 380 | d <- d %>% sample_n(max_rows) 381 | } 382 | d <- d %>% select(-id_) %>% as.matrix 383 | if (nrow(d) > 100) { 384 | h <- plot_heat(d) 385 | } 386 | ``` 387 | -------------------------------------------------------------------------------- /data/met/prepro/H3K27me3/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Preprocessing Seqmonk methylation files 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(dplyr) 15 | library(tidyr) 16 | library(ggplot2) 17 | library(RColorBrewer) 18 | library(gplots) 19 | library(gridExtra) 20 | source('../../../../lib/utils.R') 21 | ``` 22 | 23 | ```{r opts, echo=T} 24 | opts <- list() 25 | opts$fil$cov <- 30 26 | opts$fil$min_var <- 10 27 | opts$fil$top_var <- 0.75 28 | opts$adjust <- F 29 | opts$scale_adjust <- T 30 | opts$samples_excl <- NULL 31 | opts$samples_methods <- NULL 32 | opts$samples_batch2 <- NULL 33 | ``` 34 | 35 | ```{r} 36 | opts$name <- basename(getwd()) 37 | opts$seqmonk_dirs <- c( 38 | file.path('../../reports', opts$name)) 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt')) 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt')) 41 | opts$out_base <- './data' 42 | opts$samples_file <- '../../../samples/samples_stats.csv' 43 | opts$samples_select_file <- '../../../samples/samples.csv' 44 | opts$cache <- F 45 | opts$plot_heat <- F 46 | ``` 47 | 48 | **Name**: `r opts$name` 49 | 50 | 51 | 52 | ```{r read_src} 53 | read_meta <- function(filename) { 54 | h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t') 55 | names(h) <- tolower(names(h)) 56 | h <- h %>% rename(chromo=chromosome) 57 | h <- h %>% tbl_df 58 | return (h) 59 | } 60 | 61 | read_meta_quick <- function(filename) { 62 | h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t') 63 | names(h) <- tolower(names(h)) 64 | h <- h %>% tbl_df 65 | return (h) 66 | } 67 | 68 | read_all_meta <- function(filenames) { 69 | m <- read_meta(filenames[1]) 70 | if (length(filenames) > 1) { 71 | for (i in 2:length(filenames)) { 72 | s <- read_meta_quick(filenames[i]) 73 | stopifnot(all(m$start == s$start)) 74 | } 75 | } 76 | return (m) 77 | } 78 | 79 | read_values <- function(filename, samples=NULL) { 80 | h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t') 81 | if (!is.null(samples)) { 82 | h <- subset(h, select=intersect(colnames(h), samples)) 83 | } 84 | h <- h %>% tbl_df 85 | return (h) 86 | } 87 | 88 | read_all_values <- function(filenames, samples=NULL) { 89 | d <- lapply(filenames, function(x) read_values(x, samples=samples)) 90 | e <- list() 91 | for (dd in d) { 92 | if (ncol(dd) > 0) { 93 | e[[length(e) + 1]] <- dd 94 | } 95 | } 96 | d <- e 97 | stopifnot(length(d) > 0) 98 | h <- d[[1]] 99 | if (length(d) > 1) { 100 | for (i in 2:length(d)) { 101 | if (ncol(d[[i]]) > 0) { 102 | h <- cbind.data.frame(h, d[[i]]) 103 | } 104 | } 105 | } 106 | h <- h %>% tbl_df 107 | return (h) 108 | } 109 | 110 | read_samples <- function(filename) { 111 | h <- read.table(filename, sep='\t', head=T) %>% tbl_df 112 | h$sample <- h$id 113 | return (h) 114 | } 115 | 116 | read_samples_select <- function(filename) { 117 | h <- read.table(filename, sep='\t', head=F) %>% tbl_df 118 | colnames(h) <- c('id') 119 | h <- as.vector(h$id) 120 | return (h) 121 | } 122 | 123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL, 124 | sel=NULL) { 125 | f <- excl 126 | if (!is.null(f)) { 127 | d <- d %>% filter(!(sample %in% f)) 128 | } 129 | f <- methods 130 | if (!is.null(f) && 'method' %in% colnames(d)) { 131 | d <- d %>% filter(method %in% f) 132 | } 133 | f <- batch2 134 | if (!is.null(f) && 'batch2' %in% colnames(d)) { 135 | d <- d %>% filter(batch2 %in% f) 136 | } 137 | if (!is.null(sel)) { 138 | d <- d %>% filter(sample %in% sel) 139 | } 140 | d <- droplevels(d) 141 | 142 | return (d) 143 | } 144 | ``` 145 | 146 | ```{r} 147 | dat <- list() 148 | dat$samples <- read_samples(opts$samples_file) 149 | dat$samples_select <- read_samples_select(opts$samples_select_file) 150 | dat$samples <- filter_samples(dat$samples, 151 | excl=opts$samples_excl, 152 | methods=opts$samples_methods, 153 | batch2=opts$samples_batch2, 154 | sel=dat$samples_select) 155 | 156 | dat$meta <- read_all_meta(opts$rates_files) 157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample)) 158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample)) 159 | 160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100))) 161 | stopifnot(all(round(dat$weights) == dat$weights)) 162 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 163 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 164 | ``` 165 | 166 | 167 | 168 | ## Preprocessing 169 | 170 | ```{r} 171 | get_cov <- function(d, perc=TRUE) { 172 | cov <- rowSums(!is.na(d)) 173 | if (perc) { 174 | cov <- cov / ncol(d) 175 | } 176 | return (cov) 177 | } 178 | 179 | get_var <- function(d) { 180 | return (apply(d, 1, var, na.rm=T)) 181 | } 182 | ``` 183 | 184 | ```{r} 185 | h <- duplicated(dat$meta) 186 | print(sprintf('Remove %d duplicated records.', sum(h))) 187 | dat$meta <- dat$meta[!h, ] 188 | dat$meta$id_ <- 1:nrow(dat$meta) 189 | for (n in c('rates', 'weights')) { 190 | dat[[n]] <- dat[[n]][!h,] 191 | dat[[n]]$id_ <- dat$meta$id_ 192 | } 193 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 194 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 195 | ``` 196 | 197 | ```{r filter_cov} 198 | d <- dat$rates 199 | print(sprintf('%d records in total.', nrow(d))) 200 | if (!is.na(opts$fil$cov)) { 201 | d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,] 202 | } 203 | print(sprintf('%d records passed coverage filter.', nrow(d))) 204 | dat$rates <- d 205 | ``` 206 | 207 | ```{r} 208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') { 209 | t <- data.frame(sample=factor(rownames(pc_vec)), 210 | pcx=pc_vec[,x], pcy=pc_vec[,y]) 211 | t <- t %>% left_join(dat$samples, by='sample') 212 | t$sample_short <- sub('^CSCP3_SERUM', '', t$sample) 213 | p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) + 214 | geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) + 215 | xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) + 216 | theme(legend.position='bottom') 217 | return (p) 218 | } 219 | 220 | plot_pca_val <- function(pc_val) { 221 | t <- data.frame(pc=1:length(pc_val), val=pc_val) 222 | p <- ggplot(t, aes(x=pc, y=val)) + 223 | geom_bar(stat='identity', fill='salmon', color='black') + 224 | xlab('principle component') + 225 | ylab('% variance explained') 226 | return (p) 227 | } 228 | ``` 229 | 230 | 231 | ## PCA unadjusted data 232 | 233 | ```{r} 234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 235 | ``` 236 | 237 | ```{r fig.width=8, fig.height=7} 238 | p1 <- plot_pca_vec(pc$vec) 239 | p1 + theme(legend.position='top') 240 | ``` 241 | 242 | ```{r fig.width=10, fig.height=5} 243 | p2 <- plot_pca_val(pc$val) 244 | p2 245 | ``` 246 | 247 | ```{r} 248 | d <- dat$rates %>% select(-id_) 249 | cov <- rowSums(!is.na(d)) / ncol(d) 250 | d[h == ncol(d)] 251 | ``` 252 | 253 | ```{r adjust, eval=opts$adjust} 254 | scale_rates <- function(d) { 255 | h <- d %>% select(-id_) 256 | a <- min(h, na.rm=T) 257 | b <- max(h, na.rm=T) 258 | h <- (h - a) / (b - a) * 100 259 | h$id_ <- d$id_ 260 | return (h) 261 | } 262 | 263 | adjust_batch_met <- function(d, what='batch2', scale=F) { 264 | is_na <- is.na(d) 265 | h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_) 266 | h <- h %>% gather(sample, y, -id_) %>% 267 | inner_join(select_(dat$samples, 'sample', x=what), by='sample') 268 | h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup 269 | h <- h %>% select(id_, sample, y=yr) 270 | h <- h %>% spread(sample, y) 271 | stopifnot(all(dim(is_na) == dim(h))) 272 | stopifnot(all(colnames(is_na) %in% colnames(h))) 273 | h <- h[,colnames(is_na)] 274 | h[is_na] <- NA 275 | return (h) 276 | } 277 | 278 | h <- dat$rates %>% adjust_batch_met 279 | if (opts$scale_adjust) { 280 | h <- scale_rates(h) 281 | } 282 | dat$rates <- h 283 | ``` 284 | 285 | ## PCA adjusted data 286 | 287 | ```{r} 288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 289 | ``` 290 | 291 | ```{r fig.width=8, fig.height=7} 292 | p1 <- plot_pca_vec(pc$vec) 293 | p1 + theme(legend.position='top') 294 | ``` 295 | 296 | ```{r fig.width=10, fig.height=5} 297 | p2 <- plot_pca_val(pc$val) 298 | p2 299 | ``` 300 | 301 | ## Filtering 302 | 303 | ```{r filter_var} 304 | d <- dat$rates 305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples 306 | d <- d[!is.na(h),] 307 | if (!is.na(opts$fil$min_var)) { 308 | stopifnot(sum(is.na(h)) == 0) 309 | d <- d[h >= opts$fil$min_var,] 310 | } 311 | print(sprintf('%d records passed min var filter.', nrow(d))) 312 | 313 | if (!is.na(opts$fil$top_var)) { 314 | t <- opts$fil$top_var 315 | if (t <= 1) { 316 | t <- as.integer(t * nrow(d)) 317 | } 318 | d <- d[order(get_var(select(d, -id_)), decreasing=T),] 319 | d <- d[1:t,] 320 | } 321 | print(sprintf('%d final record.', nrow(d))) 322 | dat$rates <- d 323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_)) 324 | ``` 325 | 326 | ```{r join} 327 | d <- dat$meta %>% select(chromo, start, end, id_) %>% 328 | inner_join(dat$rates, by='id_') 329 | stopifnot(nrow(d) == nrow(dat$rates)) 330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_)) 331 | 332 | d <- dat$weights %>% gather(sample, weight, -id_) 333 | h <- nrow(dat$df) 334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample')) 335 | 336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup 337 | 338 | stopifnot(h == nrow(dat$df)) 339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1)) 340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0)) 341 | ``` 342 | 343 | ```{r} 344 | dat$df <- dat$df %>% ungroup %>% tbl_df 345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df 346 | ``` 347 | 348 | ```{r write} 349 | save_rds <- function(d, name) { 350 | f <- sprintf('%s_%s.rds', opts$out_base, name) 351 | saveRDS(d, f) 352 | } 353 | 354 | save_rds(dat$df, 'met') 355 | save_rds(dat$meta, 'meta') 356 | ``` 357 | 358 | 359 | 360 | ```{r eval=opts$plot_heat, fig.height=12} 361 | plot_heat <- function(d, Rowv=T, xlab='value') { 362 | colors <- rev(brewer.pal(9, 'Spectral')) 363 | colors <- colorRampPalette(colors)(50) 364 | 365 | labRow <- NA 366 | if (nrow(d) > 500) { 367 | dendro='column' 368 | } else { 369 | dendro = 'both' 370 | } 371 | p <- heatmap.2(d, density.info='none', trace='none', col=colors, 372 | Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA, 373 | lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab) 374 | return (p) 375 | } 376 | 377 | d <- dat$rates 378 | max_rows <- 1000 379 | if (nrow(d) > max_rows) { 380 | d <- d %>% sample_n(max_rows) 381 | } 382 | d <- d %>% select(-id_) %>% as.matrix 383 | if (nrow(d) > 100) { 384 | h <- plot_heat(d) 385 | } 386 | ``` 387 | -------------------------------------------------------------------------------- /data/met/prepro/H3K4me1/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Preprocessing Seqmonk methylation files 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(dplyr) 15 | library(tidyr) 16 | library(ggplot2) 17 | library(RColorBrewer) 18 | library(gplots) 19 | library(gridExtra) 20 | source('../../../../lib/utils.R') 21 | ``` 22 | 23 | ```{r opts, echo=T} 24 | opts <- list() 25 | opts$fil$cov <- 30 26 | opts$fil$min_var <- 10 27 | opts$fil$top_var <- 0.75 28 | opts$adjust <- F 29 | opts$scale_adjust <- T 30 | opts$samples_excl <- NULL 31 | opts$samples_methods <- NULL 32 | opts$samples_batch2 <- NULL 33 | ``` 34 | 35 | ```{r} 36 | opts$name <- basename(getwd()) 37 | opts$seqmonk_dirs <- c( 38 | file.path('../../reports', opts$name)) 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt')) 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt')) 41 | opts$out_base <- './data' 42 | opts$samples_file <- '../../../samples/samples_stats.csv' 43 | opts$samples_select_file <- '../../../samples/samples.csv' 44 | opts$cache <- F 45 | opts$plot_heat <- F 46 | ``` 47 | 48 | **Name**: `r opts$name` 49 | 50 | 51 | 52 | ```{r read_src} 53 | read_meta <- function(filename) { 54 | h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t') 55 | names(h) <- tolower(names(h)) 56 | h <- h %>% rename(chromo=chromosome) 57 | h <- h %>% tbl_df 58 | return (h) 59 | } 60 | 61 | read_meta_quick <- function(filename) { 62 | h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t') 63 | names(h) <- tolower(names(h)) 64 | h <- h %>% tbl_df 65 | return (h) 66 | } 67 | 68 | read_all_meta <- function(filenames) { 69 | m <- read_meta(filenames[1]) 70 | if (length(filenames) > 1) { 71 | for (i in 2:length(filenames)) { 72 | s <- read_meta_quick(filenames[i]) 73 | stopifnot(all(m$start == s$start)) 74 | } 75 | } 76 | return (m) 77 | } 78 | 79 | read_values <- function(filename, samples=NULL) { 80 | h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t') 81 | if (!is.null(samples)) { 82 | h <- subset(h, select=intersect(colnames(h), samples)) 83 | } 84 | h <- h %>% tbl_df 85 | return (h) 86 | } 87 | 88 | read_all_values <- function(filenames, samples=NULL) { 89 | d <- lapply(filenames, function(x) read_values(x, samples=samples)) 90 | e <- list() 91 | for (dd in d) { 92 | if (ncol(dd) > 0) { 93 | e[[length(e) + 1]] <- dd 94 | } 95 | } 96 | d <- e 97 | stopifnot(length(d) > 0) 98 | h <- d[[1]] 99 | if (length(d) > 1) { 100 | for (i in 2:length(d)) { 101 | if (ncol(d[[i]]) > 0) { 102 | h <- cbind.data.frame(h, d[[i]]) 103 | } 104 | } 105 | } 106 | h <- h %>% tbl_df 107 | return (h) 108 | } 109 | 110 | read_samples <- function(filename) { 111 | h <- read.table(filename, sep='\t', head=T) %>% tbl_df 112 | h$sample <- h$id 113 | return (h) 114 | } 115 | 116 | read_samples_select <- function(filename) { 117 | h <- read.table(filename, sep='\t', head=F) %>% tbl_df 118 | colnames(h) <- c('id') 119 | h <- as.vector(h$id) 120 | return (h) 121 | } 122 | 123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL, 124 | sel=NULL) { 125 | f <- excl 126 | if (!is.null(f)) { 127 | d <- d %>% filter(!(sample %in% f)) 128 | } 129 | f <- methods 130 | if (!is.null(f) && 'method' %in% colnames(d)) { 131 | d <- d %>% filter(method %in% f) 132 | } 133 | f <- batch2 134 | if (!is.null(f) && 'batch2' %in% colnames(d)) { 135 | d <- d %>% filter(batch2 %in% f) 136 | } 137 | if (!is.null(sel)) { 138 | d <- d %>% filter(sample %in% sel) 139 | } 140 | d <- droplevels(d) 141 | 142 | return (d) 143 | } 144 | ``` 145 | 146 | ```{r} 147 | dat <- list() 148 | dat$samples <- read_samples(opts$samples_file) 149 | dat$samples_select <- read_samples_select(opts$samples_select_file) 150 | dat$samples <- filter_samples(dat$samples, 151 | excl=opts$samples_excl, 152 | methods=opts$samples_methods, 153 | batch2=opts$samples_batch2, 154 | sel=dat$samples_select) 155 | 156 | dat$meta <- read_all_meta(opts$rates_files) 157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample)) 158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample)) 159 | 160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100))) 161 | stopifnot(all(round(dat$weights) == dat$weights)) 162 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 163 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 164 | ``` 165 | 166 | 167 | 168 | ## Preprocessing 169 | 170 | ```{r} 171 | get_cov <- function(d, perc=TRUE) { 172 | cov <- rowSums(!is.na(d)) 173 | if (perc) { 174 | cov <- cov / ncol(d) 175 | } 176 | return (cov) 177 | } 178 | 179 | get_var <- function(d) { 180 | return (apply(d, 1, var, na.rm=T)) 181 | } 182 | ``` 183 | 184 | ```{r} 185 | h <- duplicated(dat$meta) 186 | print(sprintf('Remove %d duplicated records.', sum(h))) 187 | dat$meta <- dat$meta[!h, ] 188 | dat$meta$id_ <- 1:nrow(dat$meta) 189 | for (n in c('rates', 'weights')) { 190 | dat[[n]] <- dat[[n]][!h,] 191 | dat[[n]]$id_ <- dat$meta$id_ 192 | } 193 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 194 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 195 | ``` 196 | 197 | ```{r filter_cov} 198 | d <- dat$rates 199 | print(sprintf('%d records in total.', nrow(d))) 200 | if (!is.na(opts$fil$cov)) { 201 | d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,] 202 | } 203 | print(sprintf('%d records passed coverage filter.', nrow(d))) 204 | dat$rates <- d 205 | ``` 206 | 207 | ```{r} 208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') { 209 | t <- data.frame(sample=factor(rownames(pc_vec)), 210 | pcx=pc_vec[,x], pcy=pc_vec[,y]) 211 | t <- t %>% left_join(dat$samples, by='sample') 212 | t$sample_short <- sub('^CSCP3_SERUM', '', t$sample) 213 | p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) + 214 | geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) + 215 | xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) + 216 | theme(legend.position='bottom') 217 | return (p) 218 | } 219 | 220 | plot_pca_val <- function(pc_val) { 221 | t <- data.frame(pc=1:length(pc_val), val=pc_val) 222 | p <- ggplot(t, aes(x=pc, y=val)) + 223 | geom_bar(stat='identity', fill='salmon', color='black') + 224 | xlab('principle component') + 225 | ylab('% variance explained') 226 | return (p) 227 | } 228 | ``` 229 | 230 | 231 | ## PCA unadjusted data 232 | 233 | ```{r} 234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 235 | ``` 236 | 237 | ```{r fig.width=8, fig.height=7} 238 | p1 <- plot_pca_vec(pc$vec) 239 | p1 + theme(legend.position='top') 240 | ``` 241 | 242 | ```{r fig.width=10, fig.height=5} 243 | p2 <- plot_pca_val(pc$val) 244 | p2 245 | ``` 246 | 247 | ```{r} 248 | d <- dat$rates %>% select(-id_) 249 | cov <- rowSums(!is.na(d)) / ncol(d) 250 | d[h == ncol(d)] 251 | ``` 252 | 253 | ```{r adjust, eval=opts$adjust} 254 | scale_rates <- function(d) { 255 | h <- d %>% select(-id_) 256 | a <- min(h, na.rm=T) 257 | b <- max(h, na.rm=T) 258 | h <- (h - a) / (b - a) * 100 259 | h$id_ <- d$id_ 260 | return (h) 261 | } 262 | 263 | adjust_batch_met <- function(d, what='batch2', scale=F) { 264 | is_na <- is.na(d) 265 | h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_) 266 | h <- h %>% gather(sample, y, -id_) %>% 267 | inner_join(select_(dat$samples, 'sample', x=what), by='sample') 268 | h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup 269 | h <- h %>% select(id_, sample, y=yr) 270 | h <- h %>% spread(sample, y) 271 | stopifnot(all(dim(is_na) == dim(h))) 272 | stopifnot(all(colnames(is_na) %in% colnames(h))) 273 | h <- h[,colnames(is_na)] 274 | h[is_na] <- NA 275 | return (h) 276 | } 277 | 278 | h <- dat$rates %>% adjust_batch_met 279 | if (opts$scale_adjust) { 280 | h <- scale_rates(h) 281 | } 282 | dat$rates <- h 283 | ``` 284 | 285 | ## PCA adjusted data 286 | 287 | ```{r} 288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 289 | ``` 290 | 291 | ```{r fig.width=8, fig.height=7} 292 | p1 <- plot_pca_vec(pc$vec) 293 | p1 + theme(legend.position='top') 294 | ``` 295 | 296 | ```{r fig.width=10, fig.height=5} 297 | p2 <- plot_pca_val(pc$val) 298 | p2 299 | ``` 300 | 301 | ## Filtering 302 | 303 | ```{r filter_var} 304 | d <- dat$rates 305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples 306 | d <- d[!is.na(h),] 307 | if (!is.na(opts$fil$min_var)) { 308 | stopifnot(sum(is.na(h)) == 0) 309 | d <- d[h >= opts$fil$min_var,] 310 | } 311 | print(sprintf('%d records passed min var filter.', nrow(d))) 312 | 313 | if (!is.na(opts$fil$top_var)) { 314 | t <- opts$fil$top_var 315 | if (t <= 1) { 316 | t <- as.integer(t * nrow(d)) 317 | } 318 | d <- d[order(get_var(select(d, -id_)), decreasing=T),] 319 | d <- d[1:t,] 320 | } 321 | print(sprintf('%d final record.', nrow(d))) 322 | dat$rates <- d 323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_)) 324 | ``` 325 | 326 | ```{r join} 327 | d <- dat$meta %>% select(chromo, start, end, id_) %>% 328 | inner_join(dat$rates, by='id_') 329 | stopifnot(nrow(d) == nrow(dat$rates)) 330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_)) 331 | 332 | d <- dat$weights %>% gather(sample, weight, -id_) 333 | h <- nrow(dat$df) 334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample')) 335 | 336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup 337 | 338 | stopifnot(h == nrow(dat$df)) 339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1)) 340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0)) 341 | ``` 342 | 343 | ```{r} 344 | dat$df <- dat$df %>% ungroup %>% tbl_df 345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df 346 | ``` 347 | 348 | ```{r write} 349 | save_rds <- function(d, name) { 350 | f <- sprintf('%s_%s.rds', opts$out_base, name) 351 | saveRDS(d, f) 352 | } 353 | 354 | save_rds(dat$df, 'met') 355 | save_rds(dat$meta, 'meta') 356 | ``` 357 | 358 | 359 | 360 | ```{r eval=opts$plot_heat, fig.height=12} 361 | plot_heat <- function(d, Rowv=T, xlab='value') { 362 | colors <- rev(brewer.pal(9, 'Spectral')) 363 | colors <- colorRampPalette(colors)(50) 364 | 365 | labRow <- NA 366 | if (nrow(d) > 500) { 367 | dendro='column' 368 | } else { 369 | dendro = 'both' 370 | } 371 | p <- heatmap.2(d, density.info='none', trace='none', col=colors, 372 | Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA, 373 | lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab) 374 | return (p) 375 | } 376 | 377 | d <- dat$rates 378 | max_rows <- 1000 379 | if (nrow(d) > max_rows) { 380 | d <- d %>% sample_n(max_rows) 381 | } 382 | d <- d %>% select(-id_) %>% as.matrix 383 | if (nrow(d) > 100) { 384 | h <- plot_heat(d) 385 | } 386 | ``` 387 | -------------------------------------------------------------------------------- /data/met/prepro/IAP/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Preprocessing Seqmonk methylation files 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(dplyr) 15 | library(tidyr) 16 | library(ggplot2) 17 | library(RColorBrewer) 18 | library(gplots) 19 | library(gridExtra) 20 | source('../../../../lib/utils.R') 21 | ``` 22 | 23 | ```{r opts, echo=T} 24 | opts <- list() 25 | opts$fil$cov <- 30 26 | opts$fil$min_var <- 10 27 | opts$fil$top_var <- 0.75 28 | opts$adjust <- F 29 | opts$scale_adjust <- T 30 | opts$samples_excl <- NULL 31 | opts$samples_methods <- NULL 32 | opts$samples_batch2 <- NULL 33 | ``` 34 | 35 | ```{r} 36 | opts$name <- basename(getwd()) 37 | opts$seqmonk_dirs <- c( 38 | file.path('../../reports', opts$name)) 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt')) 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt')) 41 | opts$out_base <- './data' 42 | opts$samples_file <- '../../../samples/samples_stats.csv' 43 | opts$samples_select_file <- '../../../samples/samples.csv' 44 | opts$cache <- F 45 | opts$plot_heat <- F 46 | ``` 47 | 48 | **Name**: `r opts$name` 49 | 50 | 51 | 52 | ```{r read_src} 53 | read_meta <- function(filename) { 54 | h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t') 55 | names(h) <- tolower(names(h)) 56 | h <- h %>% rename(chromo=chromosome) 57 | h <- h %>% tbl_df 58 | return (h) 59 | } 60 | 61 | read_meta_quick <- function(filename) { 62 | h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t') 63 | names(h) <- tolower(names(h)) 64 | h <- h %>% tbl_df 65 | return (h) 66 | } 67 | 68 | read_all_meta <- function(filenames) { 69 | m <- read_meta(filenames[1]) 70 | if (length(filenames) > 1) { 71 | for (i in 2:length(filenames)) { 72 | s <- read_meta_quick(filenames[i]) 73 | stopifnot(all(m$start == s$start)) 74 | } 75 | } 76 | return (m) 77 | } 78 | 79 | read_values <- function(filename, samples=NULL) { 80 | h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t') 81 | if (!is.null(samples)) { 82 | h <- subset(h, select=intersect(colnames(h), samples)) 83 | } 84 | h <- h %>% tbl_df 85 | return (h) 86 | } 87 | 88 | read_all_values <- function(filenames, samples=NULL) { 89 | d <- lapply(filenames, function(x) read_values(x, samples=samples)) 90 | e <- list() 91 | for (dd in d) { 92 | if (ncol(dd) > 0) { 93 | e[[length(e) + 1]] <- dd 94 | } 95 | } 96 | d <- e 97 | stopifnot(length(d) > 0) 98 | h <- d[[1]] 99 | if (length(d) > 1) { 100 | for (i in 2:length(d)) { 101 | if (ncol(d[[i]]) > 0) { 102 | h <- cbind.data.frame(h, d[[i]]) 103 | } 104 | } 105 | } 106 | h <- h %>% tbl_df 107 | return (h) 108 | } 109 | 110 | read_samples <- function(filename) { 111 | h <- read.table(filename, sep='\t', head=T) %>% tbl_df 112 | h$sample <- h$id 113 | return (h) 114 | } 115 | 116 | read_samples_select <- function(filename) { 117 | h <- read.table(filename, sep='\t', head=F) %>% tbl_df 118 | colnames(h) <- c('id') 119 | h <- as.vector(h$id) 120 | return (h) 121 | } 122 | 123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL, 124 | sel=NULL) { 125 | f <- excl 126 | if (!is.null(f)) { 127 | d <- d %>% filter(!(sample %in% f)) 128 | } 129 | f <- methods 130 | if (!is.null(f) && 'method' %in% colnames(d)) { 131 | d <- d %>% filter(method %in% f) 132 | } 133 | f <- batch2 134 | if (!is.null(f) && 'batch2' %in% colnames(d)) { 135 | d <- d %>% filter(batch2 %in% f) 136 | } 137 | if (!is.null(sel)) { 138 | d <- d %>% filter(sample %in% sel) 139 | } 140 | d <- droplevels(d) 141 | 142 | return (d) 143 | } 144 | ``` 145 | 146 | ```{r} 147 | dat <- list() 148 | dat$samples <- read_samples(opts$samples_file) 149 | dat$samples_select <- read_samples_select(opts$samples_select_file) 150 | dat$samples <- filter_samples(dat$samples, 151 | excl=opts$samples_excl, 152 | methods=opts$samples_methods, 153 | batch2=opts$samples_batch2, 154 | sel=dat$samples_select) 155 | 156 | dat$meta <- read_all_meta(opts$rates_files) 157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample)) 158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample)) 159 | 160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100))) 161 | stopifnot(all(round(dat$weights) == dat$weights)) 162 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 163 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 164 | ``` 165 | 166 | 167 | 168 | ## Preprocessing 169 | 170 | ```{r} 171 | get_cov <- function(d, perc=TRUE) { 172 | cov <- rowSums(!is.na(d)) 173 | if (perc) { 174 | cov <- cov / ncol(d) 175 | } 176 | return (cov) 177 | } 178 | 179 | get_var <- function(d) { 180 | return (apply(d, 1, var, na.rm=T)) 181 | } 182 | ``` 183 | 184 | ```{r} 185 | h <- duplicated(dat$meta) 186 | print(sprintf('Remove %d duplicated records.', sum(h))) 187 | dat$meta <- dat$meta[!h, ] 188 | dat$meta$id_ <- 1:nrow(dat$meta) 189 | for (n in c('rates', 'weights')) { 190 | dat[[n]] <- dat[[n]][!h,] 191 | dat[[n]]$id_ <- dat$meta$id_ 192 | } 193 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 194 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 195 | ``` 196 | 197 | ```{r filter_cov} 198 | d <- dat$rates 199 | print(sprintf('%d records in total.', nrow(d))) 200 | if (!is.na(opts$fil$cov)) { 201 | d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,] 202 | } 203 | print(sprintf('%d records passed coverage filter.', nrow(d))) 204 | dat$rates <- d 205 | ``` 206 | 207 | ```{r} 208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') { 209 | t <- data.frame(sample=factor(rownames(pc_vec)), 210 | pcx=pc_vec[,x], pcy=pc_vec[,y]) 211 | t <- t %>% left_join(dat$samples, by='sample') 212 | t$sample_short <- sub('^CSCP3_SERUM', '', t$sample) 213 | p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) + 214 | geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) + 215 | xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) + 216 | theme(legend.position='bottom') 217 | return (p) 218 | } 219 | 220 | plot_pca_val <- function(pc_val) { 221 | t <- data.frame(pc=1:length(pc_val), val=pc_val) 222 | p <- ggplot(t, aes(x=pc, y=val)) + 223 | geom_bar(stat='identity', fill='salmon', color='black') + 224 | xlab('principle component') + 225 | ylab('% variance explained') 226 | return (p) 227 | } 228 | ``` 229 | 230 | 231 | ## PCA unadjusted data 232 | 233 | ```{r} 234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 235 | ``` 236 | 237 | ```{r fig.width=8, fig.height=7} 238 | p1 <- plot_pca_vec(pc$vec) 239 | p1 + theme(legend.position='top') 240 | ``` 241 | 242 | ```{r fig.width=10, fig.height=5} 243 | p2 <- plot_pca_val(pc$val) 244 | p2 245 | ``` 246 | 247 | ```{r} 248 | d <- dat$rates %>% select(-id_) 249 | cov <- rowSums(!is.na(d)) / ncol(d) 250 | d[h == ncol(d)] 251 | ``` 252 | 253 | ```{r adjust, eval=opts$adjust} 254 | scale_rates <- function(d) { 255 | h <- d %>% select(-id_) 256 | a <- min(h, na.rm=T) 257 | b <- max(h, na.rm=T) 258 | h <- (h - a) / (b - a) * 100 259 | h$id_ <- d$id_ 260 | return (h) 261 | } 262 | 263 | adjust_batch_met <- function(d, what='batch2', scale=F) { 264 | is_na <- is.na(d) 265 | h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_) 266 | h <- h %>% gather(sample, y, -id_) %>% 267 | inner_join(select_(dat$samples, 'sample', x=what), by='sample') 268 | h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup 269 | h <- h %>% select(id_, sample, y=yr) 270 | h <- h %>% spread(sample, y) 271 | stopifnot(all(dim(is_na) == dim(h))) 272 | stopifnot(all(colnames(is_na) %in% colnames(h))) 273 | h <- h[,colnames(is_na)] 274 | h[is_na] <- NA 275 | return (h) 276 | } 277 | 278 | h <- dat$rates %>% adjust_batch_met 279 | if (opts$scale_adjust) { 280 | h <- scale_rates(h) 281 | } 282 | dat$rates <- h 283 | ``` 284 | 285 | ## PCA adjusted data 286 | 287 | ```{r} 288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 289 | ``` 290 | 291 | ```{r fig.width=8, fig.height=7} 292 | p1 <- plot_pca_vec(pc$vec) 293 | p1 + theme(legend.position='top') 294 | ``` 295 | 296 | ```{r fig.width=10, fig.height=5} 297 | p2 <- plot_pca_val(pc$val) 298 | p2 299 | ``` 300 | 301 | ## Filtering 302 | 303 | ```{r filter_var} 304 | d <- dat$rates 305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples 306 | d <- d[!is.na(h),] 307 | if (!is.na(opts$fil$min_var)) { 308 | stopifnot(sum(is.na(h)) == 0) 309 | d <- d[h >= opts$fil$min_var,] 310 | } 311 | print(sprintf('%d records passed min var filter.', nrow(d))) 312 | 313 | if (!is.na(opts$fil$top_var)) { 314 | t <- opts$fil$top_var 315 | if (t <= 1) { 316 | t <- as.integer(t * nrow(d)) 317 | } 318 | d <- d[order(get_var(select(d, -id_)), decreasing=T),] 319 | d <- d[1:t,] 320 | } 321 | print(sprintf('%d final record.', nrow(d))) 322 | dat$rates <- d 323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_)) 324 | ``` 325 | 326 | ```{r join} 327 | d <- dat$meta %>% select(chromo, start, end, id_) %>% 328 | inner_join(dat$rates, by='id_') 329 | stopifnot(nrow(d) == nrow(dat$rates)) 330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_)) 331 | 332 | d <- dat$weights %>% gather(sample, weight, -id_) 333 | h <- nrow(dat$df) 334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample')) 335 | 336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup 337 | 338 | stopifnot(h == nrow(dat$df)) 339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1)) 340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0)) 341 | ``` 342 | 343 | ```{r} 344 | dat$df <- dat$df %>% ungroup %>% tbl_df 345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df 346 | ``` 347 | 348 | ```{r write} 349 | save_rds <- function(d, name) { 350 | f <- sprintf('%s_%s.rds', opts$out_base, name) 351 | saveRDS(d, f) 352 | } 353 | 354 | save_rds(dat$df, 'met') 355 | save_rds(dat$meta, 'meta') 356 | ``` 357 | 358 | 359 | 360 | ```{r eval=opts$plot_heat, fig.height=12} 361 | plot_heat <- function(d, Rowv=T, xlab='value') { 362 | colors <- rev(brewer.pal(9, 'Spectral')) 363 | colors <- colorRampPalette(colors)(50) 364 | 365 | labRow <- NA 366 | if (nrow(d) > 500) { 367 | dendro='column' 368 | } else { 369 | dendro = 'both' 370 | } 371 | p <- heatmap.2(d, density.info='none', trace='none', col=colors, 372 | Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA, 373 | lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab) 374 | return (p) 375 | } 376 | 377 | d <- dat$rates 378 | max_rows <- 1000 379 | if (nrow(d) > max_rows) { 380 | d <- d %>% sample_n(max_rows) 381 | } 382 | d <- d %>% select(-id_) %>% as.matrix 383 | if (nrow(d) > 100) { 384 | h <- plot_heat(d) 385 | } 386 | ``` 387 | -------------------------------------------------------------------------------- /data/met/prepro/LMR/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Preprocessing Seqmonk methylation files 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(dplyr) 15 | library(tidyr) 16 | library(ggplot2) 17 | library(RColorBrewer) 18 | library(gplots) 19 | library(gridExtra) 20 | source('../../../../lib/utils.R') 21 | ``` 22 | 23 | ```{r opts, echo=T} 24 | opts <- list() 25 | opts$fil$cov <- 30 26 | opts$fil$min_var <- 10 27 | opts$fil$top_var <- 0.75 28 | opts$adjust <- F 29 | opts$scale_adjust <- T 30 | opts$samples_excl <- NULL 31 | opts$samples_methods <- NULL 32 | opts$samples_batch2 <- NULL 33 | ``` 34 | 35 | ```{r} 36 | opts$name <- basename(getwd()) 37 | opts$seqmonk_dirs <- c( 38 | file.path('../../reports', opts$name)) 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt')) 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt')) 41 | opts$out_base <- './data' 42 | opts$samples_file <- '../../../samples/samples_stats.csv' 43 | opts$samples_select_file <- '../../../samples/samples.csv' 44 | opts$cache <- F 45 | opts$plot_heat <- F 46 | ``` 47 | 48 | **Name**: `r opts$name` 49 | 50 | 51 | 52 | ```{r read_src} 53 | read_meta <- function(filename) { 54 | h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t') 55 | names(h) <- tolower(names(h)) 56 | h <- h %>% rename(chromo=chromosome) 57 | h <- h %>% tbl_df 58 | return (h) 59 | } 60 | 61 | read_meta_quick <- function(filename) { 62 | h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t') 63 | names(h) <- tolower(names(h)) 64 | h <- h %>% tbl_df 65 | return (h) 66 | } 67 | 68 | read_all_meta <- function(filenames) { 69 | m <- read_meta(filenames[1]) 70 | if (length(filenames) > 1) { 71 | for (i in 2:length(filenames)) { 72 | s <- read_meta_quick(filenames[i]) 73 | stopifnot(all(m$start == s$start)) 74 | } 75 | } 76 | return (m) 77 | } 78 | 79 | read_values <- function(filename, samples=NULL) { 80 | h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t') 81 | if (!is.null(samples)) { 82 | h <- subset(h, select=intersect(colnames(h), samples)) 83 | } 84 | h <- h %>% tbl_df 85 | return (h) 86 | } 87 | 88 | read_all_values <- function(filenames, samples=NULL) { 89 | d <- lapply(filenames, function(x) read_values(x, samples=samples)) 90 | e <- list() 91 | for (dd in d) { 92 | if (ncol(dd) > 0) { 93 | e[[length(e) + 1]] <- dd 94 | } 95 | } 96 | d <- e 97 | stopifnot(length(d) > 0) 98 | h <- d[[1]] 99 | if (length(d) > 1) { 100 | for (i in 2:length(d)) { 101 | if (ncol(d[[i]]) > 0) { 102 | h <- cbind.data.frame(h, d[[i]]) 103 | } 104 | } 105 | } 106 | h <- h %>% tbl_df 107 | return (h) 108 | } 109 | 110 | read_samples <- function(filename) { 111 | h <- read.table(filename, sep='\t', head=T) %>% tbl_df 112 | h$sample <- h$id 113 | return (h) 114 | } 115 | 116 | read_samples_select <- function(filename) { 117 | h <- read.table(filename, sep='\t', head=F) %>% tbl_df 118 | colnames(h) <- c('id') 119 | h <- as.vector(h$id) 120 | return (h) 121 | } 122 | 123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL, 124 | sel=NULL) { 125 | f <- excl 126 | if (!is.null(f)) { 127 | d <- d %>% filter(!(sample %in% f)) 128 | } 129 | f <- methods 130 | if (!is.null(f) && 'method' %in% colnames(d)) { 131 | d <- d %>% filter(method %in% f) 132 | } 133 | f <- batch2 134 | if (!is.null(f) && 'batch2' %in% colnames(d)) { 135 | d <- d %>% filter(batch2 %in% f) 136 | } 137 | if (!is.null(sel)) { 138 | d <- d %>% filter(sample %in% sel) 139 | } 140 | d <- droplevels(d) 141 | 142 | return (d) 143 | } 144 | ``` 145 | 146 | ```{r} 147 | dat <- list() 148 | dat$samples <- read_samples(opts$samples_file) 149 | dat$samples_select <- read_samples_select(opts$samples_select_file) 150 | dat$samples <- filter_samples(dat$samples, 151 | excl=opts$samples_excl, 152 | methods=opts$samples_methods, 153 | batch2=opts$samples_batch2, 154 | sel=dat$samples_select) 155 | 156 | dat$meta <- read_all_meta(opts$rates_files) 157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample)) 158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample)) 159 | 160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100))) 161 | stopifnot(all(round(dat$weights) == dat$weights)) 162 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 163 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 164 | ``` 165 | 166 | 167 | 168 | ## Preprocessing 169 | 170 | ```{r} 171 | get_cov <- function(d, perc=TRUE) { 172 | cov <- rowSums(!is.na(d)) 173 | if (perc) { 174 | cov <- cov / ncol(d) 175 | } 176 | return (cov) 177 | } 178 | 179 | get_var <- function(d) { 180 | return (apply(d, 1, var, na.rm=T)) 181 | } 182 | ``` 183 | 184 | ```{r} 185 | h <- duplicated(dat$meta) 186 | print(sprintf('Remove %d duplicated records.', sum(h))) 187 | dat$meta <- dat$meta[!h, ] 188 | dat$meta$id_ <- 1:nrow(dat$meta) 189 | for (n in c('rates', 'weights')) { 190 | dat[[n]] <- dat[[n]][!h,] 191 | dat[[n]]$id_ <- dat$meta$id_ 192 | } 193 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 194 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 195 | ``` 196 | 197 | ```{r filter_cov} 198 | d <- dat$rates 199 | print(sprintf('%d records in total.', nrow(d))) 200 | if (!is.na(opts$fil$cov)) { 201 | d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,] 202 | } 203 | print(sprintf('%d records passed coverage filter.', nrow(d))) 204 | dat$rates <- d 205 | ``` 206 | 207 | ```{r} 208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') { 209 | t <- data.frame(sample=factor(rownames(pc_vec)), 210 | pcx=pc_vec[,x], pcy=pc_vec[,y]) 211 | t <- t %>% left_join(dat$samples, by='sample') 212 | t$sample_short <- sub('^CSCP3_SERUM', '', t$sample) 213 | p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) + 214 | geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) + 215 | xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) + 216 | theme(legend.position='bottom') 217 | return (p) 218 | } 219 | 220 | plot_pca_val <- function(pc_val) { 221 | t <- data.frame(pc=1:length(pc_val), val=pc_val) 222 | p <- ggplot(t, aes(x=pc, y=val)) + 223 | geom_bar(stat='identity', fill='salmon', color='black') + 224 | xlab('principle component') + 225 | ylab('% variance explained') 226 | return (p) 227 | } 228 | ``` 229 | 230 | 231 | ## PCA unadjusted data 232 | 233 | ```{r} 234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 235 | ``` 236 | 237 | ```{r fig.width=8, fig.height=7} 238 | p1 <- plot_pca_vec(pc$vec) 239 | p1 + theme(legend.position='top') 240 | ``` 241 | 242 | ```{r fig.width=10, fig.height=5} 243 | p2 <- plot_pca_val(pc$val) 244 | p2 245 | ``` 246 | 247 | ```{r} 248 | d <- dat$rates %>% select(-id_) 249 | cov <- rowSums(!is.na(d)) / ncol(d) 250 | d[h == ncol(d)] 251 | ``` 252 | 253 | ```{r adjust, eval=opts$adjust} 254 | scale_rates <- function(d) { 255 | h <- d %>% select(-id_) 256 | a <- min(h, na.rm=T) 257 | b <- max(h, na.rm=T) 258 | h <- (h - a) / (b - a) * 100 259 | h$id_ <- d$id_ 260 | return (h) 261 | } 262 | 263 | adjust_batch_met <- function(d, what='batch2', scale=F) { 264 | is_na <- is.na(d) 265 | h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_) 266 | h <- h %>% gather(sample, y, -id_) %>% 267 | inner_join(select_(dat$samples, 'sample', x=what), by='sample') 268 | h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup 269 | h <- h %>% select(id_, sample, y=yr) 270 | h <- h %>% spread(sample, y) 271 | stopifnot(all(dim(is_na) == dim(h))) 272 | stopifnot(all(colnames(is_na) %in% colnames(h))) 273 | h <- h[,colnames(is_na)] 274 | h[is_na] <- NA 275 | return (h) 276 | } 277 | 278 | h <- dat$rates %>% adjust_batch_met 279 | if (opts$scale_adjust) { 280 | h <- scale_rates(h) 281 | } 282 | dat$rates <- h 283 | ``` 284 | 285 | ## PCA adjusted data 286 | 287 | ```{r} 288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 289 | ``` 290 | 291 | ```{r fig.width=8, fig.height=7} 292 | p1 <- plot_pca_vec(pc$vec) 293 | p1 + theme(legend.position='top') 294 | ``` 295 | 296 | ```{r fig.width=10, fig.height=5} 297 | p2 <- plot_pca_val(pc$val) 298 | p2 299 | ``` 300 | 301 | ## Filtering 302 | 303 | ```{r filter_var} 304 | d <- dat$rates 305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples 306 | d <- d[!is.na(h),] 307 | if (!is.na(opts$fil$min_var)) { 308 | stopifnot(sum(is.na(h)) == 0) 309 | d <- d[h >= opts$fil$min_var,] 310 | } 311 | print(sprintf('%d records passed min var filter.', nrow(d))) 312 | 313 | if (!is.na(opts$fil$top_var)) { 314 | t <- opts$fil$top_var 315 | if (t <= 1) { 316 | t <- as.integer(t * nrow(d)) 317 | } 318 | d <- d[order(get_var(select(d, -id_)), decreasing=T),] 319 | d <- d[1:t,] 320 | } 321 | print(sprintf('%d final record.', nrow(d))) 322 | dat$rates <- d 323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_)) 324 | ``` 325 | 326 | ```{r join} 327 | d <- dat$meta %>% select(chromo, start, end, id_) %>% 328 | inner_join(dat$rates, by='id_') 329 | stopifnot(nrow(d) == nrow(dat$rates)) 330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_)) 331 | 332 | d <- dat$weights %>% gather(sample, weight, -id_) 333 | h <- nrow(dat$df) 334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample')) 335 | 336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup 337 | 338 | stopifnot(h == nrow(dat$df)) 339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1)) 340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0)) 341 | ``` 342 | 343 | ```{r} 344 | dat$df <- dat$df %>% ungroup %>% tbl_df 345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df 346 | ``` 347 | 348 | ```{r write} 349 | save_rds <- function(d, name) { 350 | f <- sprintf('%s_%s.rds', opts$out_base, name) 351 | saveRDS(d, f) 352 | } 353 | 354 | save_rds(dat$df, 'met') 355 | save_rds(dat$meta, 'meta') 356 | ``` 357 | 358 | 359 | 360 | ```{r eval=opts$plot_heat, fig.height=12} 361 | plot_heat <- function(d, Rowv=T, xlab='value') { 362 | colors <- rev(brewer.pal(9, 'Spectral')) 363 | colors <- colorRampPalette(colors)(50) 364 | 365 | labRow <- NA 366 | if (nrow(d) > 500) { 367 | dendro='column' 368 | } else { 369 | dendro = 'both' 370 | } 371 | p <- heatmap.2(d, density.info='none', trace='none', col=colors, 372 | Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA, 373 | lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab) 374 | return (p) 375 | } 376 | 377 | d <- dat$rates 378 | max_rows <- 1000 379 | if (nrow(d) > max_rows) { 380 | d <- d %>% sample_n(max_rows) 381 | } 382 | d <- d %>% select(-id_) %>% as.matrix 383 | if (nrow(d) > 100) { 384 | h <- plot_heat(d) 385 | } 386 | ``` 387 | -------------------------------------------------------------------------------- /data/met/prepro/Makefile: -------------------------------------------------------------------------------- 1 | seqmonk_names := $(notdir $(wildcard ../reports/*)) 2 | seqmonk_names := $(filter-out README.txt,$(seqmonk_names)) 3 | seqmonk_names := $(filter-out w%,$(seqmonk_names)) 4 | 5 | 6 | %.html: %.Rmd 7 | Rscript -e "library(rmarkdown); render('$<', output_format='html_document')" 8 | 9 | dirs: 10 | for n in $(seqmonk_names); do \ 11 | if [ ! -e $$n ]; then \ 12 | mkdir $$n && cp src/index.Rmd $$n; \ 13 | fi \ 14 | done 15 | 16 | clean: 17 | rm -rf $(seqmonk_names) 18 | 19 | rmd_files = $(filter-out src/%,$(wildcard */index.Rmd)) 20 | html_files = $(patsubst %.Rmd,%.html, $(rmd_files)) 21 | 22 | run: $(html_files) 23 | -------------------------------------------------------------------------------- /data/met/prepro/Tet2/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Preprocessing Seqmonk methylation files 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(dplyr) 15 | library(tidyr) 16 | library(ggplot2) 17 | library(RColorBrewer) 18 | library(gplots) 19 | library(gridExtra) 20 | source('../../../../lib/utils.R') 21 | ``` 22 | 23 | ```{r opts, echo=T} 24 | opts <- list() 25 | opts$fil$cov <- 30 26 | opts$fil$min_var <- 10 27 | opts$fil$top_var <- 0.75 28 | opts$adjust <- F 29 | opts$scale_adjust <- T 30 | opts$samples_excl <- NULL 31 | opts$samples_methods <- NULL 32 | opts$samples_batch2 <- NULL 33 | ``` 34 | 35 | ```{r} 36 | opts$name <- basename(getwd()) 37 | opts$seqmonk_dirs <- c( 38 | file.path('../../reports', opts$name)) 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt')) 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt')) 41 | opts$out_base <- './data' 42 | opts$samples_file <- '../../../samples/samples_stats.csv' 43 | opts$samples_select_file <- '../../../samples/samples.csv' 44 | opts$cache <- F 45 | opts$plot_heat <- F 46 | ``` 47 | 48 | **Name**: `r opts$name` 49 | 50 | 51 | 52 | ```{r read_src} 53 | read_meta <- function(filename) { 54 | h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t') 55 | names(h) <- tolower(names(h)) 56 | h <- h %>% rename(chromo=chromosome) 57 | h <- h %>% tbl_df 58 | return (h) 59 | } 60 | 61 | read_meta_quick <- function(filename) { 62 | h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t') 63 | names(h) <- tolower(names(h)) 64 | h <- h %>% tbl_df 65 | return (h) 66 | } 67 | 68 | read_all_meta <- function(filenames) { 69 | m <- read_meta(filenames[1]) 70 | if (length(filenames) > 1) { 71 | for (i in 2:length(filenames)) { 72 | s <- read_meta_quick(filenames[i]) 73 | stopifnot(all(m$start == s$start)) 74 | } 75 | } 76 | return (m) 77 | } 78 | 79 | read_values <- function(filename, samples=NULL) { 80 | h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t') 81 | if (!is.null(samples)) { 82 | h <- subset(h, select=intersect(colnames(h), samples)) 83 | } 84 | h <- h %>% tbl_df 85 | return (h) 86 | } 87 | 88 | read_all_values <- function(filenames, samples=NULL) { 89 | d <- lapply(filenames, function(x) read_values(x, samples=samples)) 90 | e <- list() 91 | for (dd in d) { 92 | if (ncol(dd) > 0) { 93 | e[[length(e) + 1]] <- dd 94 | } 95 | } 96 | d <- e 97 | stopifnot(length(d) > 0) 98 | h <- d[[1]] 99 | if (length(d) > 1) { 100 | for (i in 2:length(d)) { 101 | if (ncol(d[[i]]) > 0) { 102 | h <- cbind.data.frame(h, d[[i]]) 103 | } 104 | } 105 | } 106 | h <- h %>% tbl_df 107 | return (h) 108 | } 109 | 110 | read_samples <- function(filename) { 111 | h <- read.table(filename, sep='\t', head=T) %>% tbl_df 112 | h$sample <- h$id 113 | return (h) 114 | } 115 | 116 | read_samples_select <- function(filename) { 117 | h <- read.table(filename, sep='\t', head=F) %>% tbl_df 118 | colnames(h) <- c('id') 119 | h <- as.vector(h$id) 120 | return (h) 121 | } 122 | 123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL, 124 | sel=NULL) { 125 | f <- excl 126 | if (!is.null(f)) { 127 | d <- d %>% filter(!(sample %in% f)) 128 | } 129 | f <- methods 130 | if (!is.null(f) && 'method' %in% colnames(d)) { 131 | d <- d %>% filter(method %in% f) 132 | } 133 | f <- batch2 134 | if (!is.null(f) && 'batch2' %in% colnames(d)) { 135 | d <- d %>% filter(batch2 %in% f) 136 | } 137 | if (!is.null(sel)) { 138 | d <- d %>% filter(sample %in% sel) 139 | } 140 | d <- droplevels(d) 141 | 142 | return (d) 143 | } 144 | ``` 145 | 146 | ```{r} 147 | dat <- list() 148 | dat$samples <- read_samples(opts$samples_file) 149 | dat$samples_select <- read_samples_select(opts$samples_select_file) 150 | dat$samples <- filter_samples(dat$samples, 151 | excl=opts$samples_excl, 152 | methods=opts$samples_methods, 153 | batch2=opts$samples_batch2, 154 | sel=dat$samples_select) 155 | 156 | dat$meta <- read_all_meta(opts$rates_files) 157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample)) 158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample)) 159 | 160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100))) 161 | stopifnot(all(round(dat$weights) == dat$weights)) 162 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 163 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 164 | ``` 165 | 166 | 167 | 168 | ## Preprocessing 169 | 170 | ```{r} 171 | get_cov <- function(d, perc=TRUE) { 172 | cov <- rowSums(!is.na(d)) 173 | if (perc) { 174 | cov <- cov / ncol(d) 175 | } 176 | return (cov) 177 | } 178 | 179 | get_var <- function(d) { 180 | return (apply(d, 1, var, na.rm=T)) 181 | } 182 | ``` 183 | 184 | ```{r} 185 | h <- duplicated(dat$meta) 186 | print(sprintf('Remove %d duplicated records.', sum(h))) 187 | dat$meta <- dat$meta[!h, ] 188 | dat$meta$id_ <- 1:nrow(dat$meta) 189 | for (n in c('rates', 'weights')) { 190 | dat[[n]] <- dat[[n]][!h,] 191 | dat[[n]]$id_ <- dat$meta$id_ 192 | } 193 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 194 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 195 | ``` 196 | 197 | ```{r filter_cov} 198 | d <- dat$rates 199 | print(sprintf('%d records in total.', nrow(d))) 200 | if (!is.na(opts$fil$cov)) { 201 | d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,] 202 | } 203 | print(sprintf('%d records passed coverage filter.', nrow(d))) 204 | dat$rates <- d 205 | ``` 206 | 207 | ```{r} 208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') { 209 | t <- data.frame(sample=factor(rownames(pc_vec)), 210 | pcx=pc_vec[,x], pcy=pc_vec[,y]) 211 | t <- t %>% left_join(dat$samples, by='sample') 212 | t$sample_short <- sub('^CSCP3_SERUM', '', t$sample) 213 | p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) + 214 | geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) + 215 | xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) + 216 | theme(legend.position='bottom') 217 | return (p) 218 | } 219 | 220 | plot_pca_val <- function(pc_val) { 221 | t <- data.frame(pc=1:length(pc_val), val=pc_val) 222 | p <- ggplot(t, aes(x=pc, y=val)) + 223 | geom_bar(stat='identity', fill='salmon', color='black') + 224 | xlab('principle component') + 225 | ylab('% variance explained') 226 | return (p) 227 | } 228 | ``` 229 | 230 | 231 | ## PCA unadjusted data 232 | 233 | ```{r} 234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 235 | ``` 236 | 237 | ```{r fig.width=8, fig.height=7} 238 | p1 <- plot_pca_vec(pc$vec) 239 | p1 + theme(legend.position='top') 240 | ``` 241 | 242 | ```{r fig.width=10, fig.height=5} 243 | p2 <- plot_pca_val(pc$val) 244 | p2 245 | ``` 246 | 247 | ```{r} 248 | d <- dat$rates %>% select(-id_) 249 | cov <- rowSums(!is.na(d)) / ncol(d) 250 | d[h == ncol(d)] 251 | ``` 252 | 253 | ```{r adjust, eval=opts$adjust} 254 | scale_rates <- function(d) { 255 | h <- d %>% select(-id_) 256 | a <- min(h, na.rm=T) 257 | b <- max(h, na.rm=T) 258 | h <- (h - a) / (b - a) * 100 259 | h$id_ <- d$id_ 260 | return (h) 261 | } 262 | 263 | adjust_batch_met <- function(d, what='batch2', scale=F) { 264 | is_na <- is.na(d) 265 | h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_) 266 | h <- h %>% gather(sample, y, -id_) %>% 267 | inner_join(select_(dat$samples, 'sample', x=what), by='sample') 268 | h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup 269 | h <- h %>% select(id_, sample, y=yr) 270 | h <- h %>% spread(sample, y) 271 | stopifnot(all(dim(is_na) == dim(h))) 272 | stopifnot(all(colnames(is_na) %in% colnames(h))) 273 | h <- h[,colnames(is_na)] 274 | h[is_na] <- NA 275 | return (h) 276 | } 277 | 278 | h <- dat$rates %>% adjust_batch_met 279 | if (opts$scale_adjust) { 280 | h <- scale_rates(h) 281 | } 282 | dat$rates <- h 283 | ``` 284 | 285 | ## PCA adjusted data 286 | 287 | ```{r} 288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 289 | ``` 290 | 291 | ```{r fig.width=8, fig.height=7} 292 | p1 <- plot_pca_vec(pc$vec) 293 | p1 + theme(legend.position='top') 294 | ``` 295 | 296 | ```{r fig.width=10, fig.height=5} 297 | p2 <- plot_pca_val(pc$val) 298 | p2 299 | ``` 300 | 301 | ## Filtering 302 | 303 | ```{r filter_var} 304 | d <- dat$rates 305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples 306 | d <- d[!is.na(h),] 307 | if (!is.na(opts$fil$min_var)) { 308 | stopifnot(sum(is.na(h)) == 0) 309 | d <- d[h >= opts$fil$min_var,] 310 | } 311 | print(sprintf('%d records passed min var filter.', nrow(d))) 312 | 313 | if (!is.na(opts$fil$top_var)) { 314 | t <- opts$fil$top_var 315 | if (t <= 1) { 316 | t <- as.integer(t * nrow(d)) 317 | } 318 | d <- d[order(get_var(select(d, -id_)), decreasing=T),] 319 | d <- d[1:t,] 320 | } 321 | print(sprintf('%d final record.', nrow(d))) 322 | dat$rates <- d 323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_)) 324 | ``` 325 | 326 | ```{r join} 327 | d <- dat$meta %>% select(chromo, start, end, id_) %>% 328 | inner_join(dat$rates, by='id_') 329 | stopifnot(nrow(d) == nrow(dat$rates)) 330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_)) 331 | 332 | d <- dat$weights %>% gather(sample, weight, -id_) 333 | h <- nrow(dat$df) 334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample')) 335 | 336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup 337 | 338 | stopifnot(h == nrow(dat$df)) 339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1)) 340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0)) 341 | ``` 342 | 343 | ```{r} 344 | dat$df <- dat$df %>% ungroup %>% tbl_df 345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df 346 | ``` 347 | 348 | ```{r write} 349 | save_rds <- function(d, name) { 350 | f <- sprintf('%s_%s.rds', opts$out_base, name) 351 | saveRDS(d, f) 352 | } 353 | 354 | save_rds(dat$df, 'met') 355 | save_rds(dat$meta, 'meta') 356 | ``` 357 | 358 | 359 | 360 | ```{r eval=opts$plot_heat, fig.height=12} 361 | plot_heat <- function(d, Rowv=T, xlab='value') { 362 | colors <- rev(brewer.pal(9, 'Spectral')) 363 | colors <- colorRampPalette(colors)(50) 364 | 365 | labRow <- NA 366 | if (nrow(d) > 500) { 367 | dendro='column' 368 | } else { 369 | dendro = 'both' 370 | } 371 | p <- heatmap.2(d, density.info='none', trace='none', col=colors, 372 | Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA, 373 | lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab) 374 | return (p) 375 | } 376 | 377 | d <- dat$rates 378 | max_rows <- 1000 379 | if (nrow(d) > max_rows) { 380 | d <- d %>% sample_n(max_rows) 381 | } 382 | d <- d %>% select(-id_) %>% as.matrix 383 | if (nrow(d) > 100) { 384 | h <- plot_heat(d) 385 | } 386 | ``` 387 | -------------------------------------------------------------------------------- /data/met/prepro/Wu_Tet1/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Preprocessing Seqmonk methylation files 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(dplyr) 15 | library(tidyr) 16 | library(ggplot2) 17 | library(RColorBrewer) 18 | library(gplots) 19 | library(gridExtra) 20 | source('../../../../lib/utils.R') 21 | ``` 22 | 23 | ```{r opts, echo=T} 24 | opts <- list() 25 | opts$fil$cov <- 30 26 | opts$fil$min_var <- 10 27 | opts$fil$top_var <- 0.75 28 | opts$adjust <- F 29 | opts$scale_adjust <- T 30 | opts$samples_excl <- NULL 31 | opts$samples_methods <- NULL 32 | opts$samples_batch2 <- NULL 33 | ``` 34 | 35 | ```{r} 36 | opts$name <- basename(getwd()) 37 | opts$seqmonk_dirs <- c( 38 | file.path('../../reports', opts$name)) 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt')) 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt')) 41 | opts$out_base <- './data' 42 | opts$samples_file <- '../../../samples/samples_stats.csv' 43 | opts$samples_select_file <- '../../../samples/samples.csv' 44 | opts$cache <- F 45 | opts$plot_heat <- F 46 | ``` 47 | 48 | **Name**: `r opts$name` 49 | 50 | 51 | 52 | ```{r read_src} 53 | read_meta <- function(filename) { 54 | h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t') 55 | names(h) <- tolower(names(h)) 56 | h <- h %>% rename(chromo=chromosome) 57 | h <- h %>% tbl_df 58 | return (h) 59 | } 60 | 61 | read_meta_quick <- function(filename) { 62 | h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t') 63 | names(h) <- tolower(names(h)) 64 | h <- h %>% tbl_df 65 | return (h) 66 | } 67 | 68 | read_all_meta <- function(filenames) { 69 | m <- read_meta(filenames[1]) 70 | if (length(filenames) > 1) { 71 | for (i in 2:length(filenames)) { 72 | s <- read_meta_quick(filenames[i]) 73 | stopifnot(all(m$start == s$start)) 74 | } 75 | } 76 | return (m) 77 | } 78 | 79 | read_values <- function(filename, samples=NULL) { 80 | h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t') 81 | if (!is.null(samples)) { 82 | h <- subset(h, select=intersect(colnames(h), samples)) 83 | } 84 | h <- h %>% tbl_df 85 | return (h) 86 | } 87 | 88 | read_all_values <- function(filenames, samples=NULL) { 89 | d <- lapply(filenames, function(x) read_values(x, samples=samples)) 90 | e <- list() 91 | for (dd in d) { 92 | if (ncol(dd) > 0) { 93 | e[[length(e) + 1]] <- dd 94 | } 95 | } 96 | d <- e 97 | stopifnot(length(d) > 0) 98 | h <- d[[1]] 99 | if (length(d) > 1) { 100 | for (i in 2:length(d)) { 101 | if (ncol(d[[i]]) > 0) { 102 | h <- cbind.data.frame(h, d[[i]]) 103 | } 104 | } 105 | } 106 | h <- h %>% tbl_df 107 | return (h) 108 | } 109 | 110 | read_samples <- function(filename) { 111 | h <- read.table(filename, sep='\t', head=T) %>% tbl_df 112 | h$sample <- h$id 113 | return (h) 114 | } 115 | 116 | read_samples_select <- function(filename) { 117 | h <- read.table(filename, sep='\t', head=F) %>% tbl_df 118 | colnames(h) <- c('id') 119 | h <- as.vector(h$id) 120 | return (h) 121 | } 122 | 123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL, 124 | sel=NULL) { 125 | f <- excl 126 | if (!is.null(f)) { 127 | d <- d %>% filter(!(sample %in% f)) 128 | } 129 | f <- methods 130 | if (!is.null(f) && 'method' %in% colnames(d)) { 131 | d <- d %>% filter(method %in% f) 132 | } 133 | f <- batch2 134 | if (!is.null(f) && 'batch2' %in% colnames(d)) { 135 | d <- d %>% filter(batch2 %in% f) 136 | } 137 | if (!is.null(sel)) { 138 | d <- d %>% filter(sample %in% sel) 139 | } 140 | d <- droplevels(d) 141 | 142 | return (d) 143 | } 144 | ``` 145 | 146 | ```{r} 147 | dat <- list() 148 | dat$samples <- read_samples(opts$samples_file) 149 | dat$samples_select <- read_samples_select(opts$samples_select_file) 150 | dat$samples <- filter_samples(dat$samples, 151 | excl=opts$samples_excl, 152 | methods=opts$samples_methods, 153 | batch2=opts$samples_batch2, 154 | sel=dat$samples_select) 155 | 156 | dat$meta <- read_all_meta(opts$rates_files) 157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample)) 158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample)) 159 | 160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100))) 161 | stopifnot(all(round(dat$weights) == dat$weights)) 162 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 163 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 164 | ``` 165 | 166 | 167 | 168 | ## Preprocessing 169 | 170 | ```{r} 171 | get_cov <- function(d, perc=TRUE) { 172 | cov <- rowSums(!is.na(d)) 173 | if (perc) { 174 | cov <- cov / ncol(d) 175 | } 176 | return (cov) 177 | } 178 | 179 | get_var <- function(d) { 180 | return (apply(d, 1, var, na.rm=T)) 181 | } 182 | ``` 183 | 184 | ```{r} 185 | h <- duplicated(dat$meta) 186 | print(sprintf('Remove %d duplicated records.', sum(h))) 187 | dat$meta <- dat$meta[!h, ] 188 | dat$meta$id_ <- 1:nrow(dat$meta) 189 | for (n in c('rates', 'weights')) { 190 | dat[[n]] <- dat[[n]][!h,] 191 | dat[[n]]$id_ <- dat$meta$id_ 192 | } 193 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 194 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 195 | ``` 196 | 197 | ```{r filter_cov} 198 | d <- dat$rates 199 | print(sprintf('%d records in total.', nrow(d))) 200 | if (!is.na(opts$fil$cov)) { 201 | d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,] 202 | } 203 | print(sprintf('%d records passed coverage filter.', nrow(d))) 204 | dat$rates <- d 205 | ``` 206 | 207 | ```{r} 208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') { 209 | t <- data.frame(sample=factor(rownames(pc_vec)), 210 | pcx=pc_vec[,x], pcy=pc_vec[,y]) 211 | t <- t %>% left_join(dat$samples, by='sample') 212 | t$sample_short <- sub('^CSCP3_SERUM', '', t$sample) 213 | p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) + 214 | geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) + 215 | xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) + 216 | theme(legend.position='bottom') 217 | return (p) 218 | } 219 | 220 | plot_pca_val <- function(pc_val) { 221 | t <- data.frame(pc=1:length(pc_val), val=pc_val) 222 | p <- ggplot(t, aes(x=pc, y=val)) + 223 | geom_bar(stat='identity', fill='salmon', color='black') + 224 | xlab('principle component') + 225 | ylab('% variance explained') 226 | return (p) 227 | } 228 | ``` 229 | 230 | 231 | ## PCA unadjusted data 232 | 233 | ```{r} 234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 235 | ``` 236 | 237 | ```{r fig.width=8, fig.height=7} 238 | p1 <- plot_pca_vec(pc$vec) 239 | p1 + theme(legend.position='top') 240 | ``` 241 | 242 | ```{r fig.width=10, fig.height=5} 243 | p2 <- plot_pca_val(pc$val) 244 | p2 245 | ``` 246 | 247 | ```{r} 248 | d <- dat$rates %>% select(-id_) 249 | cov <- rowSums(!is.na(d)) / ncol(d) 250 | d[h == ncol(d)] 251 | ``` 252 | 253 | ```{r adjust, eval=opts$adjust} 254 | scale_rates <- function(d) { 255 | h <- d %>% select(-id_) 256 | a <- min(h, na.rm=T) 257 | b <- max(h, na.rm=T) 258 | h <- (h - a) / (b - a) * 100 259 | h$id_ <- d$id_ 260 | return (h) 261 | } 262 | 263 | adjust_batch_met <- function(d, what='batch2', scale=F) { 264 | is_na <- is.na(d) 265 | h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_) 266 | h <- h %>% gather(sample, y, -id_) %>% 267 | inner_join(select_(dat$samples, 'sample', x=what), by='sample') 268 | h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup 269 | h <- h %>% select(id_, sample, y=yr) 270 | h <- h %>% spread(sample, y) 271 | stopifnot(all(dim(is_na) == dim(h))) 272 | stopifnot(all(colnames(is_na) %in% colnames(h))) 273 | h <- h[,colnames(is_na)] 274 | h[is_na] <- NA 275 | return (h) 276 | } 277 | 278 | h <- dat$rates %>% adjust_batch_met 279 | if (opts$scale_adjust) { 280 | h <- scale_rates(h) 281 | } 282 | dat$rates <- h 283 | ``` 284 | 285 | ## PCA adjusted data 286 | 287 | ```{r} 288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 289 | ``` 290 | 291 | ```{r fig.width=8, fig.height=7} 292 | p1 <- plot_pca_vec(pc$vec) 293 | p1 + theme(legend.position='top') 294 | ``` 295 | 296 | ```{r fig.width=10, fig.height=5} 297 | p2 <- plot_pca_val(pc$val) 298 | p2 299 | ``` 300 | 301 | ## Filtering 302 | 303 | ```{r filter_var} 304 | d <- dat$rates 305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples 306 | d <- d[!is.na(h),] 307 | if (!is.na(opts$fil$min_var)) { 308 | stopifnot(sum(is.na(h)) == 0) 309 | d <- d[h >= opts$fil$min_var,] 310 | } 311 | print(sprintf('%d records passed min var filter.', nrow(d))) 312 | 313 | if (!is.na(opts$fil$top_var)) { 314 | t <- opts$fil$top_var 315 | if (t <= 1) { 316 | t <- as.integer(t * nrow(d)) 317 | } 318 | d <- d[order(get_var(select(d, -id_)), decreasing=T),] 319 | d <- d[1:t,] 320 | } 321 | print(sprintf('%d final record.', nrow(d))) 322 | dat$rates <- d 323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_)) 324 | ``` 325 | 326 | ```{r join} 327 | d <- dat$meta %>% select(chromo, start, end, id_) %>% 328 | inner_join(dat$rates, by='id_') 329 | stopifnot(nrow(d) == nrow(dat$rates)) 330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_)) 331 | 332 | d <- dat$weights %>% gather(sample, weight, -id_) 333 | h <- nrow(dat$df) 334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample')) 335 | 336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup 337 | 338 | stopifnot(h == nrow(dat$df)) 339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1)) 340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0)) 341 | ``` 342 | 343 | ```{r} 344 | dat$df <- dat$df %>% ungroup %>% tbl_df 345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df 346 | ``` 347 | 348 | ```{r write} 349 | save_rds <- function(d, name) { 350 | f <- sprintf('%s_%s.rds', opts$out_base, name) 351 | saveRDS(d, f) 352 | } 353 | 354 | save_rds(dat$df, 'met') 355 | save_rds(dat$meta, 'meta') 356 | ``` 357 | 358 | 359 | 360 | ```{r eval=opts$plot_heat, fig.height=12} 361 | plot_heat <- function(d, Rowv=T, xlab='value') { 362 | colors <- rev(brewer.pal(9, 'Spectral')) 363 | colors <- colorRampPalette(colors)(50) 364 | 365 | labRow <- NA 366 | if (nrow(d) > 500) { 367 | dendro='column' 368 | } else { 369 | dendro = 'both' 370 | } 371 | p <- heatmap.2(d, density.info='none', trace='none', col=colors, 372 | Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA, 373 | lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab) 374 | return (p) 375 | } 376 | 377 | d <- dat$rates 378 | max_rows <- 1000 379 | if (nrow(d) > max_rows) { 380 | d <- d %>% sample_n(max_rows) 381 | } 382 | d <- d %>% select(-id_) %>% as.matrix 383 | if (nrow(d) > 100) { 384 | h <- plot_heat(d) 385 | } 386 | ``` 387 | -------------------------------------------------------------------------------- /data/met/prepro/cgi/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Preprocessing Seqmonk methylation files 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(dplyr) 15 | library(tidyr) 16 | library(ggplot2) 17 | library(RColorBrewer) 18 | library(gplots) 19 | library(gridExtra) 20 | source('../../../../lib/utils.R') 21 | ``` 22 | 23 | ```{r opts, echo=T} 24 | opts <- list() 25 | opts$fil$cov <- 30 26 | opts$fil$min_var <- 10 27 | opts$fil$top_var <- 0.75 28 | opts$adjust <- F 29 | opts$scale_adjust <- T 30 | opts$samples_excl <- NULL 31 | opts$samples_methods <- NULL 32 | opts$samples_batch2 <- NULL 33 | ``` 34 | 35 | ```{r} 36 | opts$name <- basename(getwd()) 37 | opts$seqmonk_dirs <- c( 38 | file.path('../../reports', opts$name)) 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt')) 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt')) 41 | opts$out_base <- './data' 42 | opts$samples_file <- '../../../samples/samples_stats.csv' 43 | opts$samples_select_file <- '../../../samples/samples.csv' 44 | opts$cache <- F 45 | opts$plot_heat <- F 46 | ``` 47 | 48 | **Name**: `r opts$name` 49 | 50 | 51 | 52 | ```{r read_src} 53 | read_meta <- function(filename) { 54 | h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t') 55 | names(h) <- tolower(names(h)) 56 | h <- h %>% rename(chromo=chromosome) 57 | h <- h %>% tbl_df 58 | return (h) 59 | } 60 | 61 | read_meta_quick <- function(filename) { 62 | h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t') 63 | names(h) <- tolower(names(h)) 64 | h <- h %>% tbl_df 65 | return (h) 66 | } 67 | 68 | read_all_meta <- function(filenames) { 69 | m <- read_meta(filenames[1]) 70 | if (length(filenames) > 1) { 71 | for (i in 2:length(filenames)) { 72 | s <- read_meta_quick(filenames[i]) 73 | stopifnot(all(m$start == s$start)) 74 | } 75 | } 76 | return (m) 77 | } 78 | 79 | read_values <- function(filename, samples=NULL) { 80 | h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t') 81 | if (!is.null(samples)) { 82 | h <- subset(h, select=intersect(colnames(h), samples)) 83 | } 84 | h <- h %>% tbl_df 85 | return (h) 86 | } 87 | 88 | read_all_values <- function(filenames, samples=NULL) { 89 | d <- lapply(filenames, function(x) read_values(x, samples=samples)) 90 | e <- list() 91 | for (dd in d) { 92 | if (ncol(dd) > 0) { 93 | e[[length(e) + 1]] <- dd 94 | } 95 | } 96 | d <- e 97 | stopifnot(length(d) > 0) 98 | h <- d[[1]] 99 | if (length(d) > 1) { 100 | for (i in 2:length(d)) { 101 | if (ncol(d[[i]]) > 0) { 102 | h <- cbind.data.frame(h, d[[i]]) 103 | } 104 | } 105 | } 106 | h <- h %>% tbl_df 107 | return (h) 108 | } 109 | 110 | read_samples <- function(filename) { 111 | h <- read.table(filename, sep='\t', head=T) %>% tbl_df 112 | h$sample <- h$id 113 | return (h) 114 | } 115 | 116 | read_samples_select <- function(filename) { 117 | h <- read.table(filename, sep='\t', head=F) %>% tbl_df 118 | colnames(h) <- c('id') 119 | h <- as.vector(h$id) 120 | return (h) 121 | } 122 | 123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL, 124 | sel=NULL) { 125 | f <- excl 126 | if (!is.null(f)) { 127 | d <- d %>% filter(!(sample %in% f)) 128 | } 129 | f <- methods 130 | if (!is.null(f) && 'method' %in% colnames(d)) { 131 | d <- d %>% filter(method %in% f) 132 | } 133 | f <- batch2 134 | if (!is.null(f) && 'batch2' %in% colnames(d)) { 135 | d <- d %>% filter(batch2 %in% f) 136 | } 137 | if (!is.null(sel)) { 138 | d <- d %>% filter(sample %in% sel) 139 | } 140 | d <- droplevels(d) 141 | 142 | return (d) 143 | } 144 | ``` 145 | 146 | ```{r} 147 | dat <- list() 148 | dat$samples <- read_samples(opts$samples_file) 149 | dat$samples_select <- read_samples_select(opts$samples_select_file) 150 | dat$samples <- filter_samples(dat$samples, 151 | excl=opts$samples_excl, 152 | methods=opts$samples_methods, 153 | batch2=opts$samples_batch2, 154 | sel=dat$samples_select) 155 | 156 | dat$meta <- read_all_meta(opts$rates_files) 157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample)) 158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample)) 159 | 160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100))) 161 | stopifnot(all(round(dat$weights) == dat$weights)) 162 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 163 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 164 | ``` 165 | 166 | 167 | 168 | ## Preprocessing 169 | 170 | ```{r} 171 | get_cov <- function(d, perc=TRUE) { 172 | cov <- rowSums(!is.na(d)) 173 | if (perc) { 174 | cov <- cov / ncol(d) 175 | } 176 | return (cov) 177 | } 178 | 179 | get_var <- function(d) { 180 | return (apply(d, 1, var, na.rm=T)) 181 | } 182 | ``` 183 | 184 | ```{r} 185 | h <- duplicated(dat$meta) 186 | print(sprintf('Remove %d duplicated records.', sum(h))) 187 | dat$meta <- dat$meta[!h, ] 188 | dat$meta$id_ <- 1:nrow(dat$meta) 189 | for (n in c('rates', 'weights')) { 190 | dat[[n]] <- dat[[n]][!h,] 191 | dat[[n]]$id_ <- dat$meta$id_ 192 | } 193 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 194 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 195 | ``` 196 | 197 | ```{r filter_cov} 198 | d <- dat$rates 199 | print(sprintf('%d records in total.', nrow(d))) 200 | if (!is.na(opts$fil$cov)) { 201 | d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,] 202 | } 203 | print(sprintf('%d records passed coverage filter.', nrow(d))) 204 | dat$rates <- d 205 | ``` 206 | 207 | ```{r} 208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') { 209 | t <- data.frame(sample=factor(rownames(pc_vec)), 210 | pcx=pc_vec[,x], pcy=pc_vec[,y]) 211 | t <- t %>% left_join(dat$samples, by='sample') 212 | t$sample_short <- sub('^CSCP3_SERUM', '', t$sample) 213 | p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) + 214 | geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) + 215 | xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) + 216 | theme(legend.position='bottom') 217 | return (p) 218 | } 219 | 220 | plot_pca_val <- function(pc_val) { 221 | t <- data.frame(pc=1:length(pc_val), val=pc_val) 222 | p <- ggplot(t, aes(x=pc, y=val)) + 223 | geom_bar(stat='identity', fill='salmon', color='black') + 224 | xlab('principle component') + 225 | ylab('% variance explained') 226 | return (p) 227 | } 228 | ``` 229 | 230 | 231 | ## PCA unadjusted data 232 | 233 | ```{r} 234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 235 | ``` 236 | 237 | ```{r fig.width=8, fig.height=7} 238 | p1 <- plot_pca_vec(pc$vec) 239 | p1 + theme(legend.position='top') 240 | ``` 241 | 242 | ```{r fig.width=10, fig.height=5} 243 | p2 <- plot_pca_val(pc$val) 244 | p2 245 | ``` 246 | 247 | ```{r} 248 | d <- dat$rates %>% select(-id_) 249 | cov <- rowSums(!is.na(d)) / ncol(d) 250 | d[h == ncol(d)] 251 | ``` 252 | 253 | ```{r adjust, eval=opts$adjust} 254 | scale_rates <- function(d) { 255 | h <- d %>% select(-id_) 256 | a <- min(h, na.rm=T) 257 | b <- max(h, na.rm=T) 258 | h <- (h - a) / (b - a) * 100 259 | h$id_ <- d$id_ 260 | return (h) 261 | } 262 | 263 | adjust_batch_met <- function(d, what='batch2', scale=F) { 264 | is_na <- is.na(d) 265 | h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_) 266 | h <- h %>% gather(sample, y, -id_) %>% 267 | inner_join(select_(dat$samples, 'sample', x=what), by='sample') 268 | h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup 269 | h <- h %>% select(id_, sample, y=yr) 270 | h <- h %>% spread(sample, y) 271 | stopifnot(all(dim(is_na) == dim(h))) 272 | stopifnot(all(colnames(is_na) %in% colnames(h))) 273 | h <- h[,colnames(is_na)] 274 | h[is_na] <- NA 275 | return (h) 276 | } 277 | 278 | h <- dat$rates %>% adjust_batch_met 279 | if (opts$scale_adjust) { 280 | h <- scale_rates(h) 281 | } 282 | dat$rates <- h 283 | ``` 284 | 285 | ## PCA adjusted data 286 | 287 | ```{r} 288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 289 | ``` 290 | 291 | ```{r fig.width=8, fig.height=7} 292 | p1 <- plot_pca_vec(pc$vec) 293 | p1 + theme(legend.position='top') 294 | ``` 295 | 296 | ```{r fig.width=10, fig.height=5} 297 | p2 <- plot_pca_val(pc$val) 298 | p2 299 | ``` 300 | 301 | ## Filtering 302 | 303 | ```{r filter_var} 304 | d <- dat$rates 305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples 306 | d <- d[!is.na(h),] 307 | if (!is.na(opts$fil$min_var)) { 308 | stopifnot(sum(is.na(h)) == 0) 309 | d <- d[h >= opts$fil$min_var,] 310 | } 311 | print(sprintf('%d records passed min var filter.', nrow(d))) 312 | 313 | if (!is.na(opts$fil$top_var)) { 314 | t <- opts$fil$top_var 315 | if (t <= 1) { 316 | t <- as.integer(t * nrow(d)) 317 | } 318 | d <- d[order(get_var(select(d, -id_)), decreasing=T),] 319 | d <- d[1:t,] 320 | } 321 | print(sprintf('%d final record.', nrow(d))) 322 | dat$rates <- d 323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_)) 324 | ``` 325 | 326 | ```{r join} 327 | d <- dat$meta %>% select(chromo, start, end, id_) %>% 328 | inner_join(dat$rates, by='id_') 329 | stopifnot(nrow(d) == nrow(dat$rates)) 330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_)) 331 | 332 | d <- dat$weights %>% gather(sample, weight, -id_) 333 | h <- nrow(dat$df) 334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample')) 335 | 336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup 337 | 338 | stopifnot(h == nrow(dat$df)) 339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1)) 340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0)) 341 | ``` 342 | 343 | ```{r} 344 | dat$df <- dat$df %>% ungroup %>% tbl_df 345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df 346 | ``` 347 | 348 | ```{r write} 349 | save_rds <- function(d, name) { 350 | f <- sprintf('%s_%s.rds', opts$out_base, name) 351 | saveRDS(d, f) 352 | } 353 | 354 | save_rds(dat$df, 'met') 355 | save_rds(dat$meta, 'meta') 356 | ``` 357 | 358 | 359 | 360 | ```{r eval=opts$plot_heat, fig.height=12} 361 | plot_heat <- function(d, Rowv=T, xlab='value') { 362 | colors <- rev(brewer.pal(9, 'Spectral')) 363 | colors <- colorRampPalette(colors)(50) 364 | 365 | labRow <- NA 366 | if (nrow(d) > 500) { 367 | dendro='column' 368 | } else { 369 | dendro = 'both' 370 | } 371 | p <- heatmap.2(d, density.info='none', trace='none', col=colors, 372 | Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA, 373 | lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab) 374 | return (p) 375 | } 376 | 377 | d <- dat$rates 378 | max_rows <- 1000 379 | if (nrow(d) > max_rows) { 380 | d <- d %>% sample_n(max_rows) 381 | } 382 | d <- d %>% select(-id_) %>% as.matrix 383 | if (nrow(d) > 100) { 384 | h <- plot_heat(d) 385 | } 386 | ``` 387 | -------------------------------------------------------------------------------- /data/met/prepro/exon/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Preprocessing Seqmonk methylation files 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(dplyr) 15 | library(tidyr) 16 | library(ggplot2) 17 | library(RColorBrewer) 18 | library(gplots) 19 | library(gridExtra) 20 | source('../../../../lib/utils.R') 21 | ``` 22 | 23 | ```{r opts, echo=T} 24 | opts <- list() 25 | opts$fil$cov <- 30 26 | opts$fil$min_var <- 10 27 | opts$fil$top_var <- 0.75 28 | opts$adjust <- F 29 | opts$scale_adjust <- T 30 | opts$samples_excl <- NULL 31 | opts$samples_methods <- NULL 32 | opts$samples_batch2 <- NULL 33 | ``` 34 | 35 | ```{r} 36 | opts$name <- basename(getwd()) 37 | opts$seqmonk_dirs <- c( 38 | file.path('../../reports', opts$name)) 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt')) 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt')) 41 | opts$out_base <- './data' 42 | opts$samples_file <- '../../../samples/samples_stats.csv' 43 | opts$samples_select_file <- '../../../samples/samples.csv' 44 | opts$cache <- F 45 | opts$plot_heat <- F 46 | ``` 47 | 48 | **Name**: `r opts$name` 49 | 50 | 51 | 52 | ```{r read_src} 53 | read_meta <- function(filename) { 54 | h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t') 55 | names(h) <- tolower(names(h)) 56 | h <- h %>% rename(chromo=chromosome) 57 | h <- h %>% tbl_df 58 | return (h) 59 | } 60 | 61 | read_meta_quick <- function(filename) { 62 | h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t') 63 | names(h) <- tolower(names(h)) 64 | h <- h %>% tbl_df 65 | return (h) 66 | } 67 | 68 | read_all_meta <- function(filenames) { 69 | m <- read_meta(filenames[1]) 70 | if (length(filenames) > 1) { 71 | for (i in 2:length(filenames)) { 72 | s <- read_meta_quick(filenames[i]) 73 | stopifnot(all(m$start == s$start)) 74 | } 75 | } 76 | return (m) 77 | } 78 | 79 | read_values <- function(filename, samples=NULL) { 80 | h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t') 81 | if (!is.null(samples)) { 82 | h <- subset(h, select=intersect(colnames(h), samples)) 83 | } 84 | h <- h %>% tbl_df 85 | return (h) 86 | } 87 | 88 | read_all_values <- function(filenames, samples=NULL) { 89 | d <- lapply(filenames, function(x) read_values(x, samples=samples)) 90 | e <- list() 91 | for (dd in d) { 92 | if (ncol(dd) > 0) { 93 | e[[length(e) + 1]] <- dd 94 | } 95 | } 96 | d <- e 97 | stopifnot(length(d) > 0) 98 | h <- d[[1]] 99 | if (length(d) > 1) { 100 | for (i in 2:length(d)) { 101 | if (ncol(d[[i]]) > 0) { 102 | h <- cbind.data.frame(h, d[[i]]) 103 | } 104 | } 105 | } 106 | h <- h %>% tbl_df 107 | return (h) 108 | } 109 | 110 | read_samples <- function(filename) { 111 | h <- read.table(filename, sep='\t', head=T) %>% tbl_df 112 | h$sample <- h$id 113 | return (h) 114 | } 115 | 116 | read_samples_select <- function(filename) { 117 | h <- read.table(filename, sep='\t', head=F) %>% tbl_df 118 | colnames(h) <- c('id') 119 | h <- as.vector(h$id) 120 | return (h) 121 | } 122 | 123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL, 124 | sel=NULL) { 125 | f <- excl 126 | if (!is.null(f)) { 127 | d <- d %>% filter(!(sample %in% f)) 128 | } 129 | f <- methods 130 | if (!is.null(f) && 'method' %in% colnames(d)) { 131 | d <- d %>% filter(method %in% f) 132 | } 133 | f <- batch2 134 | if (!is.null(f) && 'batch2' %in% colnames(d)) { 135 | d <- d %>% filter(batch2 %in% f) 136 | } 137 | if (!is.null(sel)) { 138 | d <- d %>% filter(sample %in% sel) 139 | } 140 | d <- droplevels(d) 141 | 142 | return (d) 143 | } 144 | ``` 145 | 146 | ```{r} 147 | dat <- list() 148 | dat$samples <- read_samples(opts$samples_file) 149 | dat$samples_select <- read_samples_select(opts$samples_select_file) 150 | dat$samples <- filter_samples(dat$samples, 151 | excl=opts$samples_excl, 152 | methods=opts$samples_methods, 153 | batch2=opts$samples_batch2, 154 | sel=dat$samples_select) 155 | 156 | dat$meta <- read_all_meta(opts$rates_files) 157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample)) 158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample)) 159 | 160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100))) 161 | stopifnot(all(round(dat$weights) == dat$weights)) 162 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 163 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 164 | ``` 165 | 166 | 167 | 168 | ## Preprocessing 169 | 170 | ```{r} 171 | get_cov <- function(d, perc=TRUE) { 172 | cov <- rowSums(!is.na(d)) 173 | if (perc) { 174 | cov <- cov / ncol(d) 175 | } 176 | return (cov) 177 | } 178 | 179 | get_var <- function(d) { 180 | return (apply(d, 1, var, na.rm=T)) 181 | } 182 | ``` 183 | 184 | ```{r} 185 | h <- duplicated(dat$meta) 186 | print(sprintf('Remove %d duplicated records.', sum(h))) 187 | dat$meta <- dat$meta[!h, ] 188 | dat$meta$id_ <- 1:nrow(dat$meta) 189 | for (n in c('rates', 'weights')) { 190 | dat[[n]] <- dat[[n]][!h,] 191 | dat[[n]]$id_ <- dat$meta$id_ 192 | } 193 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 194 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 195 | ``` 196 | 197 | ```{r filter_cov} 198 | d <- dat$rates 199 | print(sprintf('%d records in total.', nrow(d))) 200 | if (!is.na(opts$fil$cov)) { 201 | d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,] 202 | } 203 | print(sprintf('%d records passed coverage filter.', nrow(d))) 204 | dat$rates <- d 205 | ``` 206 | 207 | ```{r} 208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') { 209 | t <- data.frame(sample=factor(rownames(pc_vec)), 210 | pcx=pc_vec[,x], pcy=pc_vec[,y]) 211 | t <- t %>% left_join(dat$samples, by='sample') 212 | t$sample_short <- sub('^CSCP3_SERUM', '', t$sample) 213 | p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) + 214 | geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) + 215 | xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) + 216 | theme(legend.position='bottom') 217 | return (p) 218 | } 219 | 220 | plot_pca_val <- function(pc_val) { 221 | t <- data.frame(pc=1:length(pc_val), val=pc_val) 222 | p <- ggplot(t, aes(x=pc, y=val)) + 223 | geom_bar(stat='identity', fill='salmon', color='black') + 224 | xlab('principle component') + 225 | ylab('% variance explained') 226 | return (p) 227 | } 228 | ``` 229 | 230 | 231 | ## PCA unadjusted data 232 | 233 | ```{r} 234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 235 | ``` 236 | 237 | ```{r fig.width=8, fig.height=7} 238 | p1 <- plot_pca_vec(pc$vec) 239 | p1 + theme(legend.position='top') 240 | ``` 241 | 242 | ```{r fig.width=10, fig.height=5} 243 | p2 <- plot_pca_val(pc$val) 244 | p2 245 | ``` 246 | 247 | ```{r} 248 | d <- dat$rates %>% select(-id_) 249 | cov <- rowSums(!is.na(d)) / ncol(d) 250 | d[h == ncol(d)] 251 | ``` 252 | 253 | ```{r adjust, eval=opts$adjust} 254 | scale_rates <- function(d) { 255 | h <- d %>% select(-id_) 256 | a <- min(h, na.rm=T) 257 | b <- max(h, na.rm=T) 258 | h <- (h - a) / (b - a) * 100 259 | h$id_ <- d$id_ 260 | return (h) 261 | } 262 | 263 | adjust_batch_met <- function(d, what='batch2', scale=F) { 264 | is_na <- is.na(d) 265 | h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_) 266 | h <- h %>% gather(sample, y, -id_) %>% 267 | inner_join(select_(dat$samples, 'sample', x=what), by='sample') 268 | h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup 269 | h <- h %>% select(id_, sample, y=yr) 270 | h <- h %>% spread(sample, y) 271 | stopifnot(all(dim(is_na) == dim(h))) 272 | stopifnot(all(colnames(is_na) %in% colnames(h))) 273 | h <- h[,colnames(is_na)] 274 | h[is_na] <- NA 275 | return (h) 276 | } 277 | 278 | h <- dat$rates %>% adjust_batch_met 279 | if (opts$scale_adjust) { 280 | h <- scale_rates(h) 281 | } 282 | dat$rates <- h 283 | ``` 284 | 285 | ## PCA adjusted data 286 | 287 | ```{r} 288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 289 | ``` 290 | 291 | ```{r fig.width=8, fig.height=7} 292 | p1 <- plot_pca_vec(pc$vec) 293 | p1 + theme(legend.position='top') 294 | ``` 295 | 296 | ```{r fig.width=10, fig.height=5} 297 | p2 <- plot_pca_val(pc$val) 298 | p2 299 | ``` 300 | 301 | ## Filtering 302 | 303 | ```{r filter_var} 304 | d <- dat$rates 305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples 306 | d <- d[!is.na(h),] 307 | if (!is.na(opts$fil$min_var)) { 308 | stopifnot(sum(is.na(h)) == 0) 309 | d <- d[h >= opts$fil$min_var,] 310 | } 311 | print(sprintf('%d records passed min var filter.', nrow(d))) 312 | 313 | if (!is.na(opts$fil$top_var)) { 314 | t <- opts$fil$top_var 315 | if (t <= 1) { 316 | t <- as.integer(t * nrow(d)) 317 | } 318 | d <- d[order(get_var(select(d, -id_)), decreasing=T),] 319 | d <- d[1:t,] 320 | } 321 | print(sprintf('%d final record.', nrow(d))) 322 | dat$rates <- d 323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_)) 324 | ``` 325 | 326 | ```{r join} 327 | d <- dat$meta %>% select(chromo, start, end, id_) %>% 328 | inner_join(dat$rates, by='id_') 329 | stopifnot(nrow(d) == nrow(dat$rates)) 330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_)) 331 | 332 | d <- dat$weights %>% gather(sample, weight, -id_) 333 | h <- nrow(dat$df) 334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample')) 335 | 336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup 337 | 338 | stopifnot(h == nrow(dat$df)) 339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1)) 340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0)) 341 | ``` 342 | 343 | ```{r} 344 | dat$df <- dat$df %>% ungroup %>% tbl_df 345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df 346 | ``` 347 | 348 | ```{r write} 349 | save_rds <- function(d, name) { 350 | f <- sprintf('%s_%s.rds', opts$out_base, name) 351 | saveRDS(d, f) 352 | } 353 | 354 | save_rds(dat$df, 'met') 355 | save_rds(dat$meta, 'meta') 356 | ``` 357 | 358 | 359 | 360 | ```{r eval=opts$plot_heat, fig.height=12} 361 | plot_heat <- function(d, Rowv=T, xlab='value') { 362 | colors <- rev(brewer.pal(9, 'Spectral')) 363 | colors <- colorRampPalette(colors)(50) 364 | 365 | labRow <- NA 366 | if (nrow(d) > 500) { 367 | dendro='column' 368 | } else { 369 | dendro = 'both' 370 | } 371 | p <- heatmap.2(d, density.info='none', trace='none', col=colors, 372 | Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA, 373 | lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab) 374 | return (p) 375 | } 376 | 377 | d <- dat$rates 378 | max_rows <- 1000 379 | if (nrow(d) > max_rows) { 380 | d <- d %>% sample_n(max_rows) 381 | } 382 | d <- d %>% select(-id_) %>% as.matrix 383 | if (nrow(d) > 100) { 384 | h <- plot_heat(d) 385 | } 386 | ``` 387 | -------------------------------------------------------------------------------- /data/met/prepro/gene_body/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Preprocessing Seqmonk methylation files 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(dplyr) 15 | library(tidyr) 16 | library(ggplot2) 17 | library(RColorBrewer) 18 | library(gplots) 19 | library(gridExtra) 20 | source('../../../../lib/utils.R') 21 | ``` 22 | 23 | ```{r opts, echo=T} 24 | opts <- list() 25 | opts$fil$cov <- 30 26 | opts$fil$min_var <- 10 27 | opts$fil$top_var <- 0.75 28 | opts$adjust <- F 29 | opts$scale_adjust <- T 30 | opts$samples_excl <- NULL 31 | opts$samples_methods <- NULL 32 | opts$samples_batch2 <- NULL 33 | ``` 34 | 35 | ```{r} 36 | opts$name <- basename(getwd()) 37 | opts$seqmonk_dirs <- c( 38 | file.path('../../reports', opts$name)) 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt')) 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt')) 41 | opts$out_base <- './data' 42 | opts$samples_file <- '../../../samples/samples_stats.csv' 43 | opts$samples_select_file <- '../../../samples/samples.csv' 44 | opts$cache <- F 45 | opts$plot_heat <- F 46 | ``` 47 | 48 | **Name**: `r opts$name` 49 | 50 | 51 | 52 | ```{r read_src} 53 | read_meta <- function(filename) { 54 | h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t') 55 | names(h) <- tolower(names(h)) 56 | h <- h %>% rename(chromo=chromosome) 57 | h <- h %>% tbl_df 58 | return (h) 59 | } 60 | 61 | read_meta_quick <- function(filename) { 62 | h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t') 63 | names(h) <- tolower(names(h)) 64 | h <- h %>% tbl_df 65 | return (h) 66 | } 67 | 68 | read_all_meta <- function(filenames) { 69 | m <- read_meta(filenames[1]) 70 | if (length(filenames) > 1) { 71 | for (i in 2:length(filenames)) { 72 | s <- read_meta_quick(filenames[i]) 73 | stopifnot(all(m$start == s$start)) 74 | } 75 | } 76 | return (m) 77 | } 78 | 79 | read_values <- function(filename, samples=NULL) { 80 | h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t') 81 | if (!is.null(samples)) { 82 | h <- subset(h, select=intersect(colnames(h), samples)) 83 | } 84 | h <- h %>% tbl_df 85 | return (h) 86 | } 87 | 88 | read_all_values <- function(filenames, samples=NULL) { 89 | d <- lapply(filenames, function(x) read_values(x, samples=samples)) 90 | e <- list() 91 | for (dd in d) { 92 | if (ncol(dd) > 0) { 93 | e[[length(e) + 1]] <- dd 94 | } 95 | } 96 | d <- e 97 | stopifnot(length(d) > 0) 98 | h <- d[[1]] 99 | if (length(d) > 1) { 100 | for (i in 2:length(d)) { 101 | if (ncol(d[[i]]) > 0) { 102 | h <- cbind.data.frame(h, d[[i]]) 103 | } 104 | } 105 | } 106 | h <- h %>% tbl_df 107 | return (h) 108 | } 109 | 110 | read_samples <- function(filename) { 111 | h <- read.table(filename, sep='\t', head=T) %>% tbl_df 112 | h$sample <- h$id 113 | return (h) 114 | } 115 | 116 | read_samples_select <- function(filename) { 117 | h <- read.table(filename, sep='\t', head=F) %>% tbl_df 118 | colnames(h) <- c('id') 119 | h <- as.vector(h$id) 120 | return (h) 121 | } 122 | 123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL, 124 | sel=NULL) { 125 | f <- excl 126 | if (!is.null(f)) { 127 | d <- d %>% filter(!(sample %in% f)) 128 | } 129 | f <- methods 130 | if (!is.null(f) && 'method' %in% colnames(d)) { 131 | d <- d %>% filter(method %in% f) 132 | } 133 | f <- batch2 134 | if (!is.null(f) && 'batch2' %in% colnames(d)) { 135 | d <- d %>% filter(batch2 %in% f) 136 | } 137 | if (!is.null(sel)) { 138 | d <- d %>% filter(sample %in% sel) 139 | } 140 | d <- droplevels(d) 141 | 142 | return (d) 143 | } 144 | ``` 145 | 146 | ```{r} 147 | dat <- list() 148 | dat$samples <- read_samples(opts$samples_file) 149 | dat$samples_select <- read_samples_select(opts$samples_select_file) 150 | dat$samples <- filter_samples(dat$samples, 151 | excl=opts$samples_excl, 152 | methods=opts$samples_methods, 153 | batch2=opts$samples_batch2, 154 | sel=dat$samples_select) 155 | 156 | dat$meta <- read_all_meta(opts$rates_files) 157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample)) 158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample)) 159 | 160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100))) 161 | stopifnot(all(round(dat$weights) == dat$weights)) 162 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 163 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 164 | ``` 165 | 166 | 167 | 168 | ## Preprocessing 169 | 170 | ```{r} 171 | get_cov <- function(d, perc=TRUE) { 172 | cov <- rowSums(!is.na(d)) 173 | if (perc) { 174 | cov <- cov / ncol(d) 175 | } 176 | return (cov) 177 | } 178 | 179 | get_var <- function(d) { 180 | return (apply(d, 1, var, na.rm=T)) 181 | } 182 | ``` 183 | 184 | ```{r} 185 | h <- duplicated(dat$meta) 186 | print(sprintf('Remove %d duplicated records.', sum(h))) 187 | dat$meta <- dat$meta[!h, ] 188 | dat$meta$id_ <- 1:nrow(dat$meta) 189 | for (n in c('rates', 'weights')) { 190 | dat[[n]] <- dat[[n]][!h,] 191 | dat[[n]]$id_ <- dat$meta$id_ 192 | } 193 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 194 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 195 | ``` 196 | 197 | ```{r filter_cov} 198 | d <- dat$rates 199 | print(sprintf('%d records in total.', nrow(d))) 200 | if (!is.na(opts$fil$cov)) { 201 | d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,] 202 | } 203 | print(sprintf('%d records passed coverage filter.', nrow(d))) 204 | dat$rates <- d 205 | ``` 206 | 207 | ```{r} 208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') { 209 | t <- data.frame(sample=factor(rownames(pc_vec)), 210 | pcx=pc_vec[,x], pcy=pc_vec[,y]) 211 | t <- t %>% left_join(dat$samples, by='sample') 212 | t$sample_short <- sub('^CSCP3_SERUM', '', t$sample) 213 | p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) + 214 | geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) + 215 | xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) + 216 | theme(legend.position='bottom') 217 | return (p) 218 | } 219 | 220 | plot_pca_val <- function(pc_val) { 221 | t <- data.frame(pc=1:length(pc_val), val=pc_val) 222 | p <- ggplot(t, aes(x=pc, y=val)) + 223 | geom_bar(stat='identity', fill='salmon', color='black') + 224 | xlab('principle component') + 225 | ylab('% variance explained') 226 | return (p) 227 | } 228 | ``` 229 | 230 | 231 | ## PCA unadjusted data 232 | 233 | ```{r} 234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 235 | ``` 236 | 237 | ```{r fig.width=8, fig.height=7} 238 | p1 <- plot_pca_vec(pc$vec) 239 | p1 + theme(legend.position='top') 240 | ``` 241 | 242 | ```{r fig.width=10, fig.height=5} 243 | p2 <- plot_pca_val(pc$val) 244 | p2 245 | ``` 246 | 247 | ```{r} 248 | d <- dat$rates %>% select(-id_) 249 | cov <- rowSums(!is.na(d)) / ncol(d) 250 | d[h == ncol(d)] 251 | ``` 252 | 253 | ```{r adjust, eval=opts$adjust} 254 | scale_rates <- function(d) { 255 | h <- d %>% select(-id_) 256 | a <- min(h, na.rm=T) 257 | b <- max(h, na.rm=T) 258 | h <- (h - a) / (b - a) * 100 259 | h$id_ <- d$id_ 260 | return (h) 261 | } 262 | 263 | adjust_batch_met <- function(d, what='batch2', scale=F) { 264 | is_na <- is.na(d) 265 | h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_) 266 | h <- h %>% gather(sample, y, -id_) %>% 267 | inner_join(select_(dat$samples, 'sample', x=what), by='sample') 268 | h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup 269 | h <- h %>% select(id_, sample, y=yr) 270 | h <- h %>% spread(sample, y) 271 | stopifnot(all(dim(is_na) == dim(h))) 272 | stopifnot(all(colnames(is_na) %in% colnames(h))) 273 | h <- h[,colnames(is_na)] 274 | h[is_na] <- NA 275 | return (h) 276 | } 277 | 278 | h <- dat$rates %>% adjust_batch_met 279 | if (opts$scale_adjust) { 280 | h <- scale_rates(h) 281 | } 282 | dat$rates <- h 283 | ``` 284 | 285 | ## PCA adjusted data 286 | 287 | ```{r} 288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 289 | ``` 290 | 291 | ```{r fig.width=8, fig.height=7} 292 | p1 <- plot_pca_vec(pc$vec) 293 | p1 + theme(legend.position='top') 294 | ``` 295 | 296 | ```{r fig.width=10, fig.height=5} 297 | p2 <- plot_pca_val(pc$val) 298 | p2 299 | ``` 300 | 301 | ## Filtering 302 | 303 | ```{r filter_var} 304 | d <- dat$rates 305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples 306 | d <- d[!is.na(h),] 307 | if (!is.na(opts$fil$min_var)) { 308 | stopifnot(sum(is.na(h)) == 0) 309 | d <- d[h >= opts$fil$min_var,] 310 | } 311 | print(sprintf('%d records passed min var filter.', nrow(d))) 312 | 313 | if (!is.na(opts$fil$top_var)) { 314 | t <- opts$fil$top_var 315 | if (t <= 1) { 316 | t <- as.integer(t * nrow(d)) 317 | } 318 | d <- d[order(get_var(select(d, -id_)), decreasing=T),] 319 | d <- d[1:t,] 320 | } 321 | print(sprintf('%d final record.', nrow(d))) 322 | dat$rates <- d 323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_)) 324 | ``` 325 | 326 | ```{r join} 327 | d <- dat$meta %>% select(chromo, start, end, id_) %>% 328 | inner_join(dat$rates, by='id_') 329 | stopifnot(nrow(d) == nrow(dat$rates)) 330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_)) 331 | 332 | d <- dat$weights %>% gather(sample, weight, -id_) 333 | h <- nrow(dat$df) 334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample')) 335 | 336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup 337 | 338 | stopifnot(h == nrow(dat$df)) 339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1)) 340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0)) 341 | ``` 342 | 343 | ```{r} 344 | dat$df <- dat$df %>% ungroup %>% tbl_df 345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df 346 | ``` 347 | 348 | ```{r write} 349 | save_rds <- function(d, name) { 350 | f <- sprintf('%s_%s.rds', opts$out_base, name) 351 | saveRDS(d, f) 352 | } 353 | 354 | save_rds(dat$df, 'met') 355 | save_rds(dat$meta, 'meta') 356 | ``` 357 | 358 | 359 | 360 | ```{r eval=opts$plot_heat, fig.height=12} 361 | plot_heat <- function(d, Rowv=T, xlab='value') { 362 | colors <- rev(brewer.pal(9, 'Spectral')) 363 | colors <- colorRampPalette(colors)(50) 364 | 365 | labRow <- NA 366 | if (nrow(d) > 500) { 367 | dendro='column' 368 | } else { 369 | dendro = 'both' 370 | } 371 | p <- heatmap.2(d, density.info='none', trace='none', col=colors, 372 | Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA, 373 | lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab) 374 | return (p) 375 | } 376 | 377 | d <- dat$rates 378 | max_rows <- 1000 379 | if (nrow(d) > max_rows) { 380 | d <- d %>% sample_n(max_rows) 381 | } 382 | d <- d %>% select(-id_) %>% as.matrix 383 | if (nrow(d) > 100) { 384 | h <- plot_heat(d) 385 | } 386 | ``` 387 | -------------------------------------------------------------------------------- /data/met/prepro/p300/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Preprocessing Seqmonk methylation files 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(dplyr) 15 | library(tidyr) 16 | library(ggplot2) 17 | library(RColorBrewer) 18 | library(gplots) 19 | library(gridExtra) 20 | source('../../../../lib/utils.R') 21 | ``` 22 | 23 | ```{r opts, echo=T} 24 | opts <- list() 25 | opts$fil$cov <- 30 26 | opts$fil$min_var <- 10 27 | opts$fil$top_var <- 0.75 28 | opts$adjust <- F 29 | opts$scale_adjust <- T 30 | opts$samples_excl <- NULL 31 | opts$samples_methods <- NULL 32 | opts$samples_batch2 <- NULL 33 | ``` 34 | 35 | ```{r} 36 | opts$name <- basename(getwd()) 37 | opts$seqmonk_dirs <- c( 38 | file.path('../../reports', opts$name)) 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt')) 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt')) 41 | opts$out_base <- './data' 42 | opts$samples_file <- '../../../samples/samples_stats.csv' 43 | opts$samples_select_file <- '../../../samples/samples.csv' 44 | opts$cache <- F 45 | opts$plot_heat <- F 46 | ``` 47 | 48 | **Name**: `r opts$name` 49 | 50 | 51 | 52 | ```{r read_src} 53 | read_meta <- function(filename) { 54 | h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t') 55 | names(h) <- tolower(names(h)) 56 | h <- h %>% rename(chromo=chromosome) 57 | h <- h %>% tbl_df 58 | return (h) 59 | } 60 | 61 | read_meta_quick <- function(filename) { 62 | h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t') 63 | names(h) <- tolower(names(h)) 64 | h <- h %>% tbl_df 65 | return (h) 66 | } 67 | 68 | read_all_meta <- function(filenames) { 69 | m <- read_meta(filenames[1]) 70 | if (length(filenames) > 1) { 71 | for (i in 2:length(filenames)) { 72 | s <- read_meta_quick(filenames[i]) 73 | stopifnot(all(m$start == s$start)) 74 | } 75 | } 76 | return (m) 77 | } 78 | 79 | read_values <- function(filename, samples=NULL) { 80 | h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t') 81 | if (!is.null(samples)) { 82 | h <- subset(h, select=intersect(colnames(h), samples)) 83 | } 84 | h <- h %>% tbl_df 85 | return (h) 86 | } 87 | 88 | read_all_values <- function(filenames, samples=NULL) { 89 | d <- lapply(filenames, function(x) read_values(x, samples=samples)) 90 | e <- list() 91 | for (dd in d) { 92 | if (ncol(dd) > 0) { 93 | e[[length(e) + 1]] <- dd 94 | } 95 | } 96 | d <- e 97 | stopifnot(length(d) > 0) 98 | h <- d[[1]] 99 | if (length(d) > 1) { 100 | for (i in 2:length(d)) { 101 | if (ncol(d[[i]]) > 0) { 102 | h <- cbind.data.frame(h, d[[i]]) 103 | } 104 | } 105 | } 106 | h <- h %>% tbl_df 107 | return (h) 108 | } 109 | 110 | read_samples <- function(filename) { 111 | h <- read.table(filename, sep='\t', head=T) %>% tbl_df 112 | h$sample <- h$id 113 | return (h) 114 | } 115 | 116 | read_samples_select <- function(filename) { 117 | h <- read.table(filename, sep='\t', head=F) %>% tbl_df 118 | colnames(h) <- c('id') 119 | h <- as.vector(h$id) 120 | return (h) 121 | } 122 | 123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL, 124 | sel=NULL) { 125 | f <- excl 126 | if (!is.null(f)) { 127 | d <- d %>% filter(!(sample %in% f)) 128 | } 129 | f <- methods 130 | if (!is.null(f) && 'method' %in% colnames(d)) { 131 | d <- d %>% filter(method %in% f) 132 | } 133 | f <- batch2 134 | if (!is.null(f) && 'batch2' %in% colnames(d)) { 135 | d <- d %>% filter(batch2 %in% f) 136 | } 137 | if (!is.null(sel)) { 138 | d <- d %>% filter(sample %in% sel) 139 | } 140 | d <- droplevels(d) 141 | 142 | return (d) 143 | } 144 | ``` 145 | 146 | ```{r} 147 | dat <- list() 148 | dat$samples <- read_samples(opts$samples_file) 149 | dat$samples_select <- read_samples_select(opts$samples_select_file) 150 | dat$samples <- filter_samples(dat$samples, 151 | excl=opts$samples_excl, 152 | methods=opts$samples_methods, 153 | batch2=opts$samples_batch2, 154 | sel=dat$samples_select) 155 | 156 | dat$meta <- read_all_meta(opts$rates_files) 157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample)) 158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample)) 159 | 160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100))) 161 | stopifnot(all(round(dat$weights) == dat$weights)) 162 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 163 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 164 | ``` 165 | 166 | 167 | 168 | ## Preprocessing 169 | 170 | ```{r} 171 | get_cov <- function(d, perc=TRUE) { 172 | cov <- rowSums(!is.na(d)) 173 | if (perc) { 174 | cov <- cov / ncol(d) 175 | } 176 | return (cov) 177 | } 178 | 179 | get_var <- function(d) { 180 | return (apply(d, 1, var, na.rm=T)) 181 | } 182 | ``` 183 | 184 | ```{r} 185 | h <- duplicated(dat$meta) 186 | print(sprintf('Remove %d duplicated records.', sum(h))) 187 | dat$meta <- dat$meta[!h, ] 188 | dat$meta$id_ <- 1:nrow(dat$meta) 189 | for (n in c('rates', 'weights')) { 190 | dat[[n]] <- dat[[n]][!h,] 191 | dat[[n]]$id_ <- dat$meta$id_ 192 | } 193 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 194 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 195 | ``` 196 | 197 | ```{r filter_cov} 198 | d <- dat$rates 199 | print(sprintf('%d records in total.', nrow(d))) 200 | if (!is.na(opts$fil$cov)) { 201 | d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,] 202 | } 203 | print(sprintf('%d records passed coverage filter.', nrow(d))) 204 | dat$rates <- d 205 | ``` 206 | 207 | ```{r} 208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') { 209 | t <- data.frame(sample=factor(rownames(pc_vec)), 210 | pcx=pc_vec[,x], pcy=pc_vec[,y]) 211 | t <- t %>% left_join(dat$samples, by='sample') 212 | t$sample_short <- sub('^CSCP3_SERUM', '', t$sample) 213 | p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) + 214 | geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) + 215 | xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) + 216 | theme(legend.position='bottom') 217 | return (p) 218 | } 219 | 220 | plot_pca_val <- function(pc_val) { 221 | t <- data.frame(pc=1:length(pc_val), val=pc_val) 222 | p <- ggplot(t, aes(x=pc, y=val)) + 223 | geom_bar(stat='identity', fill='salmon', color='black') + 224 | xlab('principle component') + 225 | ylab('% variance explained') 226 | return (p) 227 | } 228 | ``` 229 | 230 | 231 | ## PCA unadjusted data 232 | 233 | ```{r} 234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 235 | ``` 236 | 237 | ```{r fig.width=8, fig.height=7} 238 | p1 <- plot_pca_vec(pc$vec) 239 | p1 + theme(legend.position='top') 240 | ``` 241 | 242 | ```{r fig.width=10, fig.height=5} 243 | p2 <- plot_pca_val(pc$val) 244 | p2 245 | ``` 246 | 247 | ```{r} 248 | d <- dat$rates %>% select(-id_) 249 | cov <- rowSums(!is.na(d)) / ncol(d) 250 | d[h == ncol(d)] 251 | ``` 252 | 253 | ```{r adjust, eval=opts$adjust} 254 | scale_rates <- function(d) { 255 | h <- d %>% select(-id_) 256 | a <- min(h, na.rm=T) 257 | b <- max(h, na.rm=T) 258 | h <- (h - a) / (b - a) * 100 259 | h$id_ <- d$id_ 260 | return (h) 261 | } 262 | 263 | adjust_batch_met <- function(d, what='batch2', scale=F) { 264 | is_na <- is.na(d) 265 | h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_) 266 | h <- h %>% gather(sample, y, -id_) %>% 267 | inner_join(select_(dat$samples, 'sample', x=what), by='sample') 268 | h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup 269 | h <- h %>% select(id_, sample, y=yr) 270 | h <- h %>% spread(sample, y) 271 | stopifnot(all(dim(is_na) == dim(h))) 272 | stopifnot(all(colnames(is_na) %in% colnames(h))) 273 | h <- h[,colnames(is_na)] 274 | h[is_na] <- NA 275 | return (h) 276 | } 277 | 278 | h <- dat$rates %>% adjust_batch_met 279 | if (opts$scale_adjust) { 280 | h <- scale_rates(h) 281 | } 282 | dat$rates <- h 283 | ``` 284 | 285 | ## PCA adjusted data 286 | 287 | ```{r} 288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 289 | ``` 290 | 291 | ```{r fig.width=8, fig.height=7} 292 | p1 <- plot_pca_vec(pc$vec) 293 | p1 + theme(legend.position='top') 294 | ``` 295 | 296 | ```{r fig.width=10, fig.height=5} 297 | p2 <- plot_pca_val(pc$val) 298 | p2 299 | ``` 300 | 301 | ## Filtering 302 | 303 | ```{r filter_var} 304 | d <- dat$rates 305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples 306 | d <- d[!is.na(h),] 307 | if (!is.na(opts$fil$min_var)) { 308 | stopifnot(sum(is.na(h)) == 0) 309 | d <- d[h >= opts$fil$min_var,] 310 | } 311 | print(sprintf('%d records passed min var filter.', nrow(d))) 312 | 313 | if (!is.na(opts$fil$top_var)) { 314 | t <- opts$fil$top_var 315 | if (t <= 1) { 316 | t <- as.integer(t * nrow(d)) 317 | } 318 | d <- d[order(get_var(select(d, -id_)), decreasing=T),] 319 | d <- d[1:t,] 320 | } 321 | print(sprintf('%d final record.', nrow(d))) 322 | dat$rates <- d 323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_)) 324 | ``` 325 | 326 | ```{r join} 327 | d <- dat$meta %>% select(chromo, start, end, id_) %>% 328 | inner_join(dat$rates, by='id_') 329 | stopifnot(nrow(d) == nrow(dat$rates)) 330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_)) 331 | 332 | d <- dat$weights %>% gather(sample, weight, -id_) 333 | h <- nrow(dat$df) 334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample')) 335 | 336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup 337 | 338 | stopifnot(h == nrow(dat$df)) 339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1)) 340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0)) 341 | ``` 342 | 343 | ```{r} 344 | dat$df <- dat$df %>% ungroup %>% tbl_df 345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df 346 | ``` 347 | 348 | ```{r write} 349 | save_rds <- function(d, name) { 350 | f <- sprintf('%s_%s.rds', opts$out_base, name) 351 | saveRDS(d, f) 352 | } 353 | 354 | save_rds(dat$df, 'met') 355 | save_rds(dat$meta, 'meta') 356 | ``` 357 | 358 | 359 | 360 | ```{r eval=opts$plot_heat, fig.height=12} 361 | plot_heat <- function(d, Rowv=T, xlab='value') { 362 | colors <- rev(brewer.pal(9, 'Spectral')) 363 | colors <- colorRampPalette(colors)(50) 364 | 365 | labRow <- NA 366 | if (nrow(d) > 500) { 367 | dendro='column' 368 | } else { 369 | dendro = 'both' 370 | } 371 | p <- heatmap.2(d, density.info='none', trace='none', col=colors, 372 | Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA, 373 | lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab) 374 | return (p) 375 | } 376 | 377 | d <- dat$rates 378 | max_rows <- 1000 379 | if (nrow(d) > max_rows) { 380 | d <- d %>% sample_n(max_rows) 381 | } 382 | d <- d %>% select(-id_) %>% as.matrix 383 | if (nrow(d) > 100) { 384 | h <- plot_heat(d) 385 | } 386 | ``` 387 | -------------------------------------------------------------------------------- /data/met/prepro/prom/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Preprocessing Seqmonk methylation files 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(dplyr) 15 | library(tidyr) 16 | library(ggplot2) 17 | library(RColorBrewer) 18 | library(gplots) 19 | library(gridExtra) 20 | source('../../../../lib/utils.R') 21 | ``` 22 | 23 | ```{r opts, echo=T} 24 | opts <- list() 25 | opts$fil$cov <- 30 26 | opts$fil$min_var <- 10 27 | opts$fil$top_var <- 0.75 28 | opts$adjust <- F 29 | opts$scale_adjust <- T 30 | opts$samples_excl <- NULL 31 | opts$samples_methods <- NULL 32 | opts$samples_batch2 <- NULL 33 | ``` 34 | 35 | ```{r} 36 | opts$name <- basename(getwd()) 37 | opts$seqmonk_dirs <- c( 38 | file.path('../../reports', opts$name)) 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt')) 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt')) 41 | opts$out_base <- './data' 42 | opts$samples_file <- '../../../samples/samples_stats.csv' 43 | opts$samples_select_file <- '../../../samples/samples.csv' 44 | opts$cache <- F 45 | opts$plot_heat <- F 46 | ``` 47 | 48 | **Name**: `r opts$name` 49 | 50 | 51 | 52 | ```{r read_src} 53 | read_meta <- function(filename) { 54 | h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t') 55 | names(h) <- tolower(names(h)) 56 | h <- h %>% rename(chromo=chromosome) 57 | h <- h %>% tbl_df 58 | return (h) 59 | } 60 | 61 | read_meta_quick <- function(filename) { 62 | h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t') 63 | names(h) <- tolower(names(h)) 64 | h <- h %>% tbl_df 65 | return (h) 66 | } 67 | 68 | read_all_meta <- function(filenames) { 69 | m <- read_meta(filenames[1]) 70 | if (length(filenames) > 1) { 71 | for (i in 2:length(filenames)) { 72 | s <- read_meta_quick(filenames[i]) 73 | stopifnot(all(m$start == s$start)) 74 | } 75 | } 76 | return (m) 77 | } 78 | 79 | read_values <- function(filename, samples=NULL) { 80 | h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t') 81 | if (!is.null(samples)) { 82 | h <- subset(h, select=intersect(colnames(h), samples)) 83 | } 84 | h <- h %>% tbl_df 85 | return (h) 86 | } 87 | 88 | read_all_values <- function(filenames, samples=NULL) { 89 | d <- lapply(filenames, function(x) read_values(x, samples=samples)) 90 | e <- list() 91 | for (dd in d) { 92 | if (ncol(dd) > 0) { 93 | e[[length(e) + 1]] <- dd 94 | } 95 | } 96 | d <- e 97 | stopifnot(length(d) > 0) 98 | h <- d[[1]] 99 | if (length(d) > 1) { 100 | for (i in 2:length(d)) { 101 | if (ncol(d[[i]]) > 0) { 102 | h <- cbind.data.frame(h, d[[i]]) 103 | } 104 | } 105 | } 106 | h <- h %>% tbl_df 107 | return (h) 108 | } 109 | 110 | read_samples <- function(filename) { 111 | h <- read.table(filename, sep='\t', head=T) %>% tbl_df 112 | h$sample <- h$id 113 | return (h) 114 | } 115 | 116 | read_samples_select <- function(filename) { 117 | h <- read.table(filename, sep='\t', head=F) %>% tbl_df 118 | colnames(h) <- c('id') 119 | h <- as.vector(h$id) 120 | return (h) 121 | } 122 | 123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL, 124 | sel=NULL) { 125 | f <- excl 126 | if (!is.null(f)) { 127 | d <- d %>% filter(!(sample %in% f)) 128 | } 129 | f <- methods 130 | if (!is.null(f) && 'method' %in% colnames(d)) { 131 | d <- d %>% filter(method %in% f) 132 | } 133 | f <- batch2 134 | if (!is.null(f) && 'batch2' %in% colnames(d)) { 135 | d <- d %>% filter(batch2 %in% f) 136 | } 137 | if (!is.null(sel)) { 138 | d <- d %>% filter(sample %in% sel) 139 | } 140 | d <- droplevels(d) 141 | 142 | return (d) 143 | } 144 | ``` 145 | 146 | ```{r} 147 | dat <- list() 148 | dat$samples <- read_samples(opts$samples_file) 149 | dat$samples_select <- read_samples_select(opts$samples_select_file) 150 | dat$samples <- filter_samples(dat$samples, 151 | excl=opts$samples_excl, 152 | methods=opts$samples_methods, 153 | batch2=opts$samples_batch2, 154 | sel=dat$samples_select) 155 | 156 | dat$meta <- read_all_meta(opts$rates_files) 157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample)) 158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample)) 159 | 160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100))) 161 | stopifnot(all(round(dat$weights) == dat$weights)) 162 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 163 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 164 | ``` 165 | 166 | 167 | 168 | ## Preprocessing 169 | 170 | ```{r} 171 | get_cov <- function(d, perc=TRUE) { 172 | cov <- rowSums(!is.na(d)) 173 | if (perc) { 174 | cov <- cov / ncol(d) 175 | } 176 | return (cov) 177 | } 178 | 179 | get_var <- function(d) { 180 | return (apply(d, 1, var, na.rm=T)) 181 | } 182 | ``` 183 | 184 | ```{r} 185 | h <- duplicated(dat$meta) 186 | print(sprintf('Remove %d duplicated records.', sum(h))) 187 | dat$meta <- dat$meta[!h, ] 188 | dat$meta$id_ <- 1:nrow(dat$meta) 189 | for (n in c('rates', 'weights')) { 190 | dat[[n]] <- dat[[n]][!h,] 191 | dat[[n]]$id_ <- dat$meta$id_ 192 | } 193 | stopifnot(nrow(dat$meta) == nrow(dat$rates)) 194 | stopifnot(nrow(dat$rates) == nrow(dat$weights)) 195 | ``` 196 | 197 | ```{r filter_cov} 198 | d <- dat$rates 199 | print(sprintf('%d records in total.', nrow(d))) 200 | if (!is.na(opts$fil$cov)) { 201 | d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,] 202 | } 203 | print(sprintf('%d records passed coverage filter.', nrow(d))) 204 | dat$rates <- d 205 | ``` 206 | 207 | ```{r} 208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') { 209 | t <- data.frame(sample=factor(rownames(pc_vec)), 210 | pcx=pc_vec[,x], pcy=pc_vec[,y]) 211 | t <- t %>% left_join(dat$samples, by='sample') 212 | t$sample_short <- sub('^CSCP3_SERUM', '', t$sample) 213 | p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) + 214 | geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) + 215 | xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) + 216 | theme(legend.position='bottom') 217 | return (p) 218 | } 219 | 220 | plot_pca_val <- function(pc_val) { 221 | t <- data.frame(pc=1:length(pc_val), val=pc_val) 222 | p <- ggplot(t, aes(x=pc, y=val)) + 223 | geom_bar(stat='identity', fill='salmon', color='black') + 224 | xlab('principle component') + 225 | ylab('% variance explained') 226 | return (p) 227 | } 228 | ``` 229 | 230 | 231 | ## PCA unadjusted data 232 | 233 | ```{r} 234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 235 | ``` 236 | 237 | ```{r fig.width=8, fig.height=7} 238 | p1 <- plot_pca_vec(pc$vec) 239 | p1 + theme(legend.position='top') 240 | ``` 241 | 242 | ```{r fig.width=10, fig.height=5} 243 | p2 <- plot_pca_val(pc$val) 244 | p2 245 | ``` 246 | 247 | ```{r} 248 | d <- dat$rates %>% select(-id_) 249 | cov <- rowSums(!is.na(d)) / ncol(d) 250 | d[h == ncol(d)] 251 | ``` 252 | 253 | ```{r adjust, eval=opts$adjust} 254 | scale_rates <- function(d) { 255 | h <- d %>% select(-id_) 256 | a <- min(h, na.rm=T) 257 | b <- max(h, na.rm=T) 258 | h <- (h - a) / (b - a) * 100 259 | h$id_ <- d$id_ 260 | return (h) 261 | } 262 | 263 | adjust_batch_met <- function(d, what='batch2', scale=F) { 264 | is_na <- is.na(d) 265 | h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_) 266 | h <- h %>% gather(sample, y, -id_) %>% 267 | inner_join(select_(dat$samples, 'sample', x=what), by='sample') 268 | h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup 269 | h <- h %>% select(id_, sample, y=yr) 270 | h <- h %>% spread(sample, y) 271 | stopifnot(all(dim(is_na) == dim(h))) 272 | stopifnot(all(colnames(is_na) %in% colnames(h))) 273 | h <- h[,colnames(is_na)] 274 | h[is_na] <- NA 275 | return (h) 276 | } 277 | 278 | h <- dat$rates %>% adjust_batch_met 279 | if (opts$scale_adjust) { 280 | h <- scale_rates(h) 281 | } 282 | dat$rates <- h 283 | ``` 284 | 285 | ## PCA adjusted data 286 | 287 | ```{r} 288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca 289 | ``` 290 | 291 | ```{r fig.width=8, fig.height=7} 292 | p1 <- plot_pca_vec(pc$vec) 293 | p1 + theme(legend.position='top') 294 | ``` 295 | 296 | ```{r fig.width=10, fig.height=5} 297 | p2 <- plot_pca_val(pc$val) 298 | p2 299 | ``` 300 | 301 | ## Filtering 302 | 303 | ```{r filter_var} 304 | d <- dat$rates 305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples 306 | d <- d[!is.na(h),] 307 | if (!is.na(opts$fil$min_var)) { 308 | stopifnot(sum(is.na(h)) == 0) 309 | d <- d[h >= opts$fil$min_var,] 310 | } 311 | print(sprintf('%d records passed min var filter.', nrow(d))) 312 | 313 | if (!is.na(opts$fil$top_var)) { 314 | t <- opts$fil$top_var 315 | if (t <= 1) { 316 | t <- as.integer(t * nrow(d)) 317 | } 318 | d <- d[order(get_var(select(d, -id_)), decreasing=T),] 319 | d <- d[1:t,] 320 | } 321 | print(sprintf('%d final record.', nrow(d))) 322 | dat$rates <- d 323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_)) 324 | ``` 325 | 326 | ```{r join} 327 | d <- dat$meta %>% select(chromo, start, end, id_) %>% 328 | inner_join(dat$rates, by='id_') 329 | stopifnot(nrow(d) == nrow(dat$rates)) 330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_)) 331 | 332 | d <- dat$weights %>% gather(sample, weight, -id_) 333 | h <- nrow(dat$df) 334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample')) 335 | 336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup 337 | 338 | stopifnot(h == nrow(dat$df)) 339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1)) 340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0)) 341 | ``` 342 | 343 | ```{r} 344 | dat$df <- dat$df %>% ungroup %>% tbl_df 345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df 346 | ``` 347 | 348 | ```{r write} 349 | save_rds <- function(d, name) { 350 | f <- sprintf('%s_%s.rds', opts$out_base, name) 351 | saveRDS(d, f) 352 | } 353 | 354 | save_rds(dat$df, 'met') 355 | save_rds(dat$meta, 'meta') 356 | ``` 357 | 358 | 359 | 360 | ```{r eval=opts$plot_heat, fig.height=12} 361 | plot_heat <- function(d, Rowv=T, xlab='value') { 362 | colors <- rev(brewer.pal(9, 'Spectral')) 363 | colors <- colorRampPalette(colors)(50) 364 | 365 | labRow <- NA 366 | if (nrow(d) > 500) { 367 | dendro='column' 368 | } else { 369 | dendro = 'both' 370 | } 371 | p <- heatmap.2(d, density.info='none', trace='none', col=colors, 372 | Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA, 373 | lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab) 374 | return (p) 375 | } 376 | 377 | d <- dat$rates 378 | max_rows <- 1000 379 | if (nrow(d) > max_rows) { 380 | d <- d %>% sample_n(max_rows) 381 | } 382 | d <- d %>% select(-id_) %>% as.matrix 383 | if (nrow(d) > 100) { 384 | h <- plot_heat(d) 385 | } 386 | ``` 387 | -------------------------------------------------------------------------------- /gene_robust/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Robustness gene-specific analysis 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(ggplot2) 15 | library(dplyr) 16 | library(tidyr) 17 | library(xtable) 18 | options(xtable.type='html') 19 | source('../lib/utils.R') 20 | ``` 21 | 22 | ```{r} 23 | opts <- list() 24 | opts$data_file <- '../data/join/data.rds' 25 | opts$expr <- '../data/expr/data_proc/data_expr.rds' 26 | opts$expr_meta <- '../data/expr/data_proc/data_meta.rds' 27 | opts$genes_file <- '../data/expr/data_raw/subpopulation_lif_genes9.csv' 28 | opts$alpha <- 0.1 29 | opts$keep <- c(0.6, 0.7, 0.8) 30 | opts$trials <- 5 31 | 32 | opts$rnd_trials <- 0 33 | opts$filter_genes <- F 34 | opts$cache <- T 35 | opts$mean <- F 36 | opts$permute <- F 37 | opts$num_genes <- 0 38 | opts$annos <- c() 39 | # opts$annos <- c('active_enhancer', 'H3K4me1', 'H3K4me1_Tet1', 'H3K27ac', 40 | # 'H3K27me3', 'cgi', 'prom', 'prom_cgi', 'prom_non_cgi', 'LMR', 'p300', 41 | # 'gene_body', 'Wu_Tet1') 42 | set.seed(0) 43 | ``` 44 | 45 | ```{r} 46 | theme_pub <- function() { 47 | p <- theme( 48 | axis.text=element_text(size=rel(1.2), color='black'), 49 | axis.title=element_text(size=rel(1.5)), 50 | legend.position='top', 51 | legend.text=element_text(size=rel(1.2)), 52 | legend.title=element_text(size=rel(1.2)), 53 | legend.key=element_rect(fill='transparent'), 54 | panel.border=element_blank(), 55 | panel.grid.major = element_blank(), 56 | panel.grid.minor = element_blank(), 57 | panel.background = element_blank(), 58 | axis.line = element_line(colour="black", size=1), 59 | axis.ticks.length = unit(.3, 'cm'), 60 | axis.ticks.margin = unit(.3, 'cm') 61 | ) 62 | return (p) 63 | } 64 | ``` 65 | 66 | Significance threshold: `r opts$alpha` 67 | 68 | ```{r} 69 | cmp <- list() 70 | dat <- list() 71 | ``` 72 | 73 | ```{r dat} 74 | read_genes <- function(f) { 75 | d <- read.csv(f, sep=',', head=T) %>% tbl_df 76 | names(d) <- c('ens_id', 'gene_id') 77 | return (d) 78 | } 79 | 80 | dat$em <- readRDS(opts$data_file) 81 | # dat$em <- dat$em %>% filter(chromo.x %in% c(1)) %>% droplevels 82 | dat$expr_meta <- readRDS(opts$expr_meta) 83 | dat$genes <- read_genes(opts$genes_file) 84 | if (opts$filter_genes) { 85 | h <- as.vector(dat$genes$ens_id) 86 | dat$em <- dat$em %>% filter(ens_id %in% h) 87 | } 88 | if (!is.null(opts$num_genes) && opts$num_genes > 0) { 89 | h <- levels(dat$em$ens_id) 90 | h <- h[sample(length(h))][1:opts$num_genes] 91 | dat$em <- dat$em %>% filter(ens_id %in% h) 92 | } 93 | dat$em <- dat$em %>% droplevels 94 | 95 | h <- dat$em %>% group_by(name, id_.x, id_.y) %>% top_n(1, expr) %>% ungroup %>% 96 | select(id_.x, id_.y, name, ens_id, gene_id, chromo=chromo.x, start.x, end.x, start.y, end.y, strand) 97 | a <- dat$genes %>% mutate(pluri=T) %>% select(-gene_id) 98 | h <- h %>% left_join(a, by='ens_id') 99 | h[is.na(h$pluri),]$pluri <- F 100 | dat$em_meta <- h 101 | dat$em_meta <- dat$em_meta %>% droplevels 102 | ``` 103 | 104 | ```{r} 105 | correlate <- function(d) { 106 | r <- d %>% group_by(name, id_.x, id_.y) %>% 107 | do(wtd_cor(.$expr, .$met, .$weight)) %>% ungroup 108 | r <- r %>% group_by(name) %>% mutate(p_adj=p.adjust(p, method='fdr')) %>% ungroup 109 | return (r) 110 | } 111 | ``` 112 | 113 | ```{r cor, cache=opts$cache} 114 | r <- correlate(dat$em) 115 | saveRDS(r, 'r.rds') 116 | ``` 117 | 118 | ```{r} 119 | cmp$r <- r 120 | ``` 121 | 122 | ```{r cor_robust, cache=opts$cache} 123 | samples <- as.vector(levels(dat$em$sample)) 124 | r <- list() 125 | for (k in opts$keep) { 126 | for (i in 1:opts$trials) { 127 | s <- sample(samples, round(length(samples) * k)) 128 | d <- dat$em %>% filter(sample %in% s) %>% droplevels 129 | rb <- correlate(d) 130 | rb$trial <- i 131 | rb$keep <- k 132 | r[[length(r) + 1]] <- rb 133 | } 134 | } 135 | r <- do.call(rbind.data.frame, r) 136 | r <- r[complete.cases(r),] 137 | saveRDS(r, 'rb.rds') 138 | ``` 139 | 140 | ```{r} 141 | cmp$rb <- r 142 | ``` 143 | 144 | ```{r} 145 | d <- cmp$r %>% inner_join(cmp$rb, by=c('name', 'id_.x', 'id_.y')) 146 | cmp$st <- d %>% group_by(name, keep, trial) %>% 147 | summarise( 148 | mse=mean(sqrt((r.x - r.y)**2), na.rm=T), 149 | mad=mean(abs(r.x - r.y), na.rm=T), 150 | sig0=sum(p_adj.x <= opts$alpha), 151 | sig0_pos=sum(p_adj.x <= opts$alpha & r.x > 0), 152 | sig0_neg=sum(p_adj.x <= opts$alpha & r.x < 0), 153 | sig=sum(p_adj.y <= opts$alpha), 154 | sig_pos=sum(p_adj.y <= opts$alpha & r.y > 0), 155 | sig_neg=sum(p_adj.y <= opts$alpha & r.y < 0), 156 | dsig=sig - sig0, 157 | dsig_pos=sig_pos - sig0_pos, 158 | dsig_neg=sig_neg - sig0_neg, 159 | psig=abs(dsig) / sig0, 160 | psig_pos=abs(dsig_pos) / sig0_pos, 161 | psig_neg=abs(dsig_neg) / sig0_neg 162 | ) %>% ungroup %>% 163 | mutate(name=factor(name, levels=sort(unique(name)))) %>% 164 | arrange(name, keep, trial) 165 | ``` 166 | 167 | ```{r} 168 | cmp$s <- cmp$st %>% group_by(name, keep) %>% select(-trial) %>% 169 | summarise( 170 | mse_mean=mean(mse), 171 | mse_min=min(mse), 172 | mse_max=max(mse), 173 | mse_sd=sd(mse), 174 | sig0=max(sig0), 175 | sig0_pos=max(sig0_pos), 176 | sig0_neg=max(sig0_neg), 177 | sig=mean(sig), 178 | sig_pos=mean(sig_pos), 179 | sig_neg=mean(sig_neg), 180 | dsig=mean(dsig), 181 | dsig_pos=mean(dsig_pos), 182 | dsig_neg=mean(dsig_neg), 183 | psig=mean(psig), 184 | psig_pos=mean(psig_pos), 185 | psig_neg=mean(psig_neg) 186 | ) %>% ungroup %>% arrange(name, keep) 187 | cmp$s <- cmp$s %>% 188 | mutate(keep=factor(keep), name=factor(name, levels=rev(levels(name)))) 189 | ``` 190 | 191 | ```{r} 192 | # Average over trials 193 | cmp$rba <- cmp$rb %>% group_by(name, id_.x, id_.y, keep) %>% 194 | summarise_each(funs(mean)) %>% ungroup 195 | ``` 196 | 197 | ```{r eval=T, fig.width=10, fig.height=10} 198 | h <- cmp$r %>% inner_join(cmp$rba, by=c('name', 'id_.x', 'id_.y')) 199 | d <- h %>% mutate(keep=factor(keep)) 200 | ggplot(d, aes(x=r.x, y=r.y, color=keep)) + 201 | geom_abline(slope=1, color='grey') + 202 | stat_density2d(aes(fill=keep)) + 203 | # geom_point(size=0.5) + 204 | facet_wrap(~name) + 205 | xlim(-1, 1) + ylim(-1, 1) + 206 | xlab('All samples') + ylab('Bootstrapped samples') + 207 | theme_pub() 208 | ``` 209 | 210 | ```{r fig.width=8, fig.height=7} 211 | p <- ggplot(cmp$s, aes(x=name, y=mse_mean)) + 212 | geom_bar(aes(fill=keep), stat='identity', position='dodge') + 213 | xlab('') + ylab('RMSE from all cells') + 214 | guides(fill=guide_legend(title='Fraction of cells')) + 215 | coord_flip() + 216 | theme_pub() 217 | print(p) 218 | ``` 219 | 220 | ```{r eval=F} 221 | ggsave(p, file='fig_rmse.pdf', width=8, height=9) 222 | ``` 223 | 224 | ```{r fig.width=8, fig.height=7} 225 | p <- ggplot(cmp$s, aes(x=name, y=dsig)) + 226 | geom_bar(aes(fill=keep), stat='identity', position='dodge') + 227 | xlab('') + ylab('Absolute loss # significant correlations') + 228 | guides(fill=guide_legend(title='Fraction of cells')) + 229 | coord_flip() + 230 | theme_pub() 231 | print(p) 232 | ``` 233 | 234 | ```{r eval=F} 235 | ggsave(p, file='fig_loss_abs.pdf', width=8, height=9) 236 | ``` 237 | 238 | ```{r fig.width=8, fig.height=7} 239 | p <- ggplot(cmp$s, aes(x=name, y=psig)) + 240 | geom_bar(aes(fill=keep), stat='identity', position='dodge') + 241 | xlab('') + ylab('Relative loss # significant correlations') + 242 | guides(fill=guide_legend(title='Fraction of cells')) + 243 | coord_flip() + 244 | theme_pub() 245 | print(p) 246 | ``` 247 | 248 | ```{r eval=F} 249 | ggsave(p, file='fig_loss_rel.pdf', width=8, height=9) 250 | ``` 251 | 252 | ```{r results='asis'} 253 | xtable(cmp$s, digits=2) 254 | ``` 255 | 256 | ```{r eval=F} 257 | write.csv(cmp$s, 'S11.csv') 258 | ``` 259 | -------------------------------------------------------------------------------- /heat/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Joint clustering on expression and methylation 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(ggplot2) 15 | library(dplyr) 16 | library(tidyr) 17 | library(RColorBrewer) 18 | library(gridExtra) 19 | library(gplots) 20 | library(corrplot) 21 | library(ggdendro) 22 | library(weights) 23 | library(stringr) 24 | library(weights) 25 | source('viz.R') 26 | source('../lib/utils.R') 27 | ``` 28 | 29 | ```{r} 30 | opts <- list() 31 | opts$data_file <- '../data/join/data.rds' 32 | opts$r_file <- '../gene/r.rds' 33 | opts$genes_file <- '../data/expr/data_raw/subpopulation_lif_genes9.csv' 34 | opts$name <- 'gene_body' 35 | opts$select_by <- 'met_var' 36 | opts$n <- 300 37 | 38 | opts$clust <- list() 39 | opts$clust$c1 <- c('C05', 'B05', 'D05', 'E01', 'H03', 'E09', 'C07', 'F01', 'B10', 'F07', 'C02', 'B07') 40 | opts$clust$c2 <- c('G02', 'E07', 'F03', 'C03', 'A07', 'C01', 'A06', 'A04', 'A02', 'A03', 'G07', 'C10', 'H01', 'D01', 'G09', 'E05', 'G01') 41 | opts$clust_colors <- c('default'='#fdc086', 'c1'='#7fc97f', 'c2'='#beaed4') 42 | ``` 43 | 44 | ```{r read} 45 | read_genes <- function(f) { 46 | d <- read.csv(f, sep=',', head=T) %>% tbl_df 47 | names(d) <- c('ens_id', 'gene_id') 48 | return (d) 49 | } 50 | 51 | dat <- list() 52 | dat$em <- readRDS(opts$data_file) %>% 53 | filter(name == opts$name) %>% 54 | mutate(sample=factor(sample, labels=format_sample(levels(sample)))) %>% 55 | droplevels 56 | dat$r <- readRDS(opts$r_file) %>% 57 | filter(name == opts$name) %>% droplevels 58 | dat$genes <- read_genes(opts$genes_file) 59 | ``` 60 | 61 | ```{r prepro} 62 | dat$s <- dat$em %>% group_by(name, id_.x, id_.y) %>% 63 | summarise( 64 | expr_mean = mean(expr), 65 | expr_var = var(expr), 66 | met_mean = weighted.mean(met, weight, na.rm=T), 67 | met_var = wtd.var(met, weight, na.rm=T) 68 | ) %>% ungroup 69 | dat$s <- dat$s %>% inner_join(dat$r, by=c('name', 'id_.x', 'id_.y')) 70 | ``` 71 | 72 | 73 | 74 | ```{r} 75 | col_names <- function(d, heat) { 76 | m <- colnames(d) 77 | m <- m[3:length(m)] 78 | m <- m[heat$colInd] 79 | m <- paste(sapply(m, function(x) sprintf('\'%s\'', x)), collapse=', ') 80 | return (m) 81 | } 82 | ``` 83 | 84 | ```{r} 85 | dclust <- data_clust(data_select(opts$n, opts$select_by)) 86 | ``` 87 | 88 | ```{r eval=F} 89 | write.csv(dclust$e, 'fig1e_expr.csv') 90 | write.csv(dclust$m, 'fig1e_met.csv') 91 | write.csv(dclust$r, 'fig1e_cor.csv') 92 | ``` 93 | 94 | ```{r fig.width=10, fig.height=10} 95 | pm <- plot_heat_met(dclust) 96 | ``` 97 | 98 | ```{r eval=F} 99 | pdf(file='fig_heat_met.pdf', width=10, height=10) 100 | pm <- plot_heat_met(dclust) 101 | dev.off() 102 | ``` 103 | 104 | ```{r eval=F} 105 | col_names(dclust$m, pm) 106 | ``` 107 | 108 | ```{r fig.width=10, fig.height=10} 109 | pe <- plot_heat_expr(dclust, Rowv=pm$rowDendrogram) 110 | ``` 111 | 112 | ```{r eval=F} 113 | pdf(file='fig_heat_expr.pdf', width=10, height=10) 114 | pe <- plot_heat_expr(dclust, Rowv=pm$rowDendrogram) 115 | dev.off() 116 | ``` 117 | 118 | ```{r fig.height=10, fig.width=1.5} 119 | plot_tracks(data_track(dclust, pm)) 120 | ``` 121 | 122 | ```{r eval=F} 123 | pdf(file='fig_heat_tracks.pdf', width=1.5, height=10) 124 | plot_tracks(data_track(dclust, pm)) 125 | dev.off() 126 | ``` 127 | -------------------------------------------------------------------------------- /heat/index2.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Joint clustering on expression and methylation 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(ggplot2) 15 | library(dplyr) 16 | library(tidyr) 17 | library(RColorBrewer) 18 | library(gridExtra) 19 | library(gplots) 20 | library(corrplot) 21 | library(ggdendro) 22 | library(weights) 23 | library(stringr) 24 | library(weights) 25 | source('viz.R') 26 | source('../lib/utils.R') 27 | ``` 28 | 29 | ```{r} 30 | opts <- list() 31 | opts$data_file <- '../data/join/data.rds' 32 | opts$r_file <- '../gene/r.rds' 33 | opts$genes_file <- '../data/expr/data_raw/subpopulation_lif_genes9.csv' 34 | opts$name <- 'gene_body' 35 | opts$select_by <- 'expr_var' 36 | opts$n <- 300 37 | 38 | opts$clust <- list() 39 | opts$clust$c1 <- c('B10', 'H10', 'F01', 'F07', 'E09', 'E01', 'H03', 'C02', 'C05', 'B05', 'B09', 'D05', 'C07') 40 | opts$clust$c2 <- c('H02', 'D10', 'F09', 'B06', 'F06', 'D06', 'E05', 'B07', 'B04', 'B01', 'E03', 'A06', 'C04') 41 | opts$clust_colors <- c('default'='#fdc086', 'c1'='#7fc97f', 'c2'='#beaed4') 42 | ``` 43 | 44 | ```{r read} 45 | read_genes <- function(f) { 46 | d <- read.csv(f, sep=',', head=T) %>% tbl_df 47 | names(d) <- c('ens_id', 'gene_id') 48 | return (d) 49 | } 50 | 51 | dat <- list() 52 | dat$em <- readRDS(opts$data_file) %>% 53 | filter(name == opts$name) %>% 54 | mutate(sample=factor(sample, labels=format_sample(levels(sample)))) %>% 55 | droplevels 56 | dat$r <- readRDS(opts$r_file) %>% 57 | filter(name == opts$name) %>% droplevels 58 | dat$genes <- read_genes(opts$genes_file) 59 | ``` 60 | 61 | ```{r prepro} 62 | dat$s <- dat$em %>% group_by(name, id_.x, id_.y) %>% 63 | summarise( 64 | expr_mean = mean(expr), 65 | expr_var = var(expr), 66 | met_mean = weighted.mean(met, weight, na.rm=T), 67 | met_var = wtd.var(met, weight, na.rm=T) 68 | ) %>% ungroup 69 | dat$s <- dat$s %>% inner_join(dat$r, by=c('name', 'id_.x', 'id_.y')) 70 | ``` 71 | 72 | 73 | 74 | ```{r} 75 | col_names <- function(d, heat) { 76 | m <- colnames(d) 77 | m <- m[3:length(m)] 78 | m <- m[heat$colInd] 79 | m <- paste(sapply(m, function(x) sprintf('\'%s\'', x)), collapse=', ') 80 | return (m) 81 | } 82 | ``` 83 | 84 | ```{r} 85 | dclust <- data_clust(data_select(opts$n, opts$select_by)) 86 | ``` 87 | 88 | ```{r fig.width=10, fig.height=10} 89 | pm <- plot_heat_met(dclust) 90 | ``` 91 | 92 | ```{r eval=F} 93 | pdf(file='fig_heat2_met.pdf', width=10, height=10) 94 | pm <- plot_heat_met(dclust) 95 | dev.off() 96 | ``` 97 | 98 | ```{r eval=F} 99 | col_names(dclust$m, pm) 100 | ``` 101 | 102 | ```{r fig.width=10, fig.height=10} 103 | pe <- plot_heat_expr(dclust, Rowv=pm$rowDendrogram) 104 | ``` 105 | 106 | ```{r eval=F} 107 | pdf(file='fig_heat2_expr.pdf', width=10, height=10) 108 | pe <- plot_heat_expr(dclust, Rowv=pm$rowDendrogram) 109 | dev.off() 110 | ``` 111 | 112 | ```{r fig.height=10, fig.width=1.5} 113 | plot_tracks(data_track(dclust, pm)) 114 | ``` 115 | 116 | ```{r eval=F} 117 | pdf(file='fig_heat2_tracks.pdf', width=1.5, height=10) 118 | plot_tracks(data_track(dclust, pm)) 119 | dev.off() 120 | ``` 121 | 122 | ```{r eval=F} 123 | write.csv(dclust$m, 'S10_met.csv') 124 | write.csv(dclust$e, 'S10_expr.csv') 125 | write.csv(dclust$r, 'S10_cor.csv') 126 | ``` 127 | 128 | -------------------------------------------------------------------------------- /heat/viz.R: -------------------------------------------------------------------------------- 1 | data_select <- function(n, by='met_var') { 2 | h <- dat$s %>% select_('name', 'id_.x', 'id_.y', sel=by) 3 | h <- h %>% group_by(name, id_.x) %>% top_n(1, sel) %>% ungroup 4 | h <- h %>% group_by(name) %>% top_n(n, sel) %>% ungroup 5 | h <- h %>% select(name, id_.x, id_.y, sel) 6 | return (h) 7 | } 8 | 9 | data_clust <- function(s) { 10 | em <- dat$em %>% semi_join(s, by=c('name', 'id_.x', 'id_.y')) %>% 11 | select(id_.x, id_.y, gene_id, sample, expr, met) 12 | e <- em %>% select(id_.y, gene_id, sample, expr) %>% spread(sample, expr) 13 | m <- em %>% select(id_.y, gene_id, sample, met) %>% spread(sample, met) 14 | rs <- dat$s %>% semi_join(s, by=c('name', 'id_.x', 'id_.y')) 15 | return (list(r=rs, e=e, m=m, em=em)) 16 | } 17 | 18 | samples_colors <- function(x) { 19 | h <- rep(opts$clust_colors['default'], length(x)) 20 | for (n in names(opts$clust)) { 21 | h[x %in% opts$clust[[n]]] <- opts$clust_colors[n] 22 | } 23 | names(h) <- x 24 | return (h) 25 | } 26 | 27 | plot_heat <- function(d, xlab='value', col=NULL, col_colors=NULL, 28 | labRow=NA, ...) { 29 | d <- as.matrix(d) 30 | if (is.null(col)) { 31 | col <- rev(brewer.pal(9, 'Spectral')) 32 | col <- colorRampPalette(col)(50) 33 | } 34 | 35 | if (nrow(d) > 500) { 36 | dendro='column' 37 | } else { 38 | dendro = 'both' 39 | } 40 | 41 | col_colors <- samples_colors(colnames(d)) 42 | 43 | p <- heatmap.2(d, density.info='none', trace='none', col=col, 44 | keysize=0.5, dendro=dendro, labRow=labRow, 45 | ColSideColors=col_colors, 46 | lhei=c(2,9), 47 | lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab, ...) 48 | return (p) 49 | } 50 | 51 | plot_heat_expr <- function(dclust, col=NULL, ...) { 52 | d <- dclust$e %>% select(-id_.y) %>% to_matrix(rowcol='gene_id') 53 | if (is.null(col)) { 54 | col <- brewer_cols('OrRd') 55 | } 56 | pe <- plot_heat(d, col=col, xlab='Expression', ...) 57 | return (pe) 58 | } 59 | 60 | plot_heat_met <- function(dclust, col=NULL, ...) { 61 | d <- dclust$m %>% select(-id_.y) %>% to_matrix(rowcol='gene_id') 62 | if (is.null(col)) { 63 | col <- brewer_cols('PuBuGn', rev=F) 64 | } 65 | pm <- plot_heat(d, col=col, xlab='Methylation', ...) 66 | return (pm) 67 | } 68 | 69 | plot_tracks <- function(d, r=F) { 70 | te <- plot_track(filter(d, param=='expr_var')) + 71 | scale_fill_gradient(low='white', 'high'='red', name='var(expr)') 72 | tm <- plot_track(filter(d, param=='met_var')) + 73 | scale_fill_gradient(low='white', 'high'='royalblue2', name='var(met)') 74 | if (r) { 75 | tr <- plot_track(filter(d, param=='r')) + 76 | scale_fill_gradient2(low='royalblue', mid='white', 'high'='red2', name='r') 77 | grid.arrange(tm, tr, te, ncol=3) 78 | } else { 79 | grid.arrange(tm, te, ncol=2) 80 | } 81 | } 82 | 83 | brewer_cols <- function(pal='Spectral', rev=F) { 84 | ncol <- brewer.pal.info[pal, 'maxcolors'] 85 | col <- colorRampPalette(brewer.pal(ncol, pal))(50) 86 | if (rev) { 87 | col <- rev(col) 88 | } 89 | return (col) 90 | } 91 | 92 | to_matrix <- function(d, rowcol=NULL) { 93 | d <- as.data.frame(d) 94 | if (!is.null(rowcol)) { 95 | rownames(d) <- d[[rowcol]] 96 | d <- d[,setdiff(names(d), rowcol)] 97 | } 98 | d <- as.matrix(d) 99 | return (d) 100 | } 101 | 102 | data_track <- function(dclust, p=NA) { 103 | h <- dclust$e 104 | if (!is.na(p)) { 105 | h <- h[p$rowInd,] 106 | } 107 | h <- h %>% select(id_.y) 108 | rc <- h %>% inner_join(dclust$r, by='id_.y') 109 | d <- rc %>% select(r, expr_var, met_var) %>% mutate(i=1:n()) %>% 110 | gather(param, value, -i) 111 | return (d) 112 | } 113 | 114 | plot_track <- function(d, low='white', high='red') { 115 | p <- ggplot(d, aes(x=param, y=i, fill=value)) + geom_tile() + 116 | theme(line=element_blank(), axis.text=element_blank(), 117 | axis.title.x=element_blank(), 118 | axis.title.y=element_blank(), 119 | panel.background=element_blank(), 120 | plot.margin=unit(rep(0, 4), 'mm'), 121 | legend.position='top', legend.direction='vertical') 122 | return (p) 123 | } 124 | -------------------------------------------------------------------------------- /index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: scM&T-Seq 3 | output: 4 | html_document: 5 | toc: no 6 | --- 7 | 8 | # Gene-specific 9 | * [Gene-specific correlation](./gene/index.html) 10 | * [Robustness gene-specific correlation](./gene_robust/index.html) 11 | * [Correlation with mean methylation](./gene_mean/index.html) 12 | * [Heatmap expression vs. methylation](./heat/met.html) 13 | * [Esrrb](./zoom/index.html) 14 | 15 | # Sample-specific 16 | * [Sample-specific correlation](./sample/index.html) 17 | 18 | # QC 19 | * [Statistics](./qc/index.html) 20 | * [Clustering](./clust/index.html) 21 | * [Comparison variability](./var/m14_m15P3/index.html) 22 | * [Correlation with bulk methylation](./cor_bulk/met/index.html) 23 | * [Coverage analysis](./cov/m14_m15P3/index.html) 24 | * [CCA](./cca/index.html) 25 | 26 | -------------------------------------------------------------------------------- /lib/utils.R: -------------------------------------------------------------------------------- 1 | library(weights) 2 | library(dplyr) 3 | library(tidyr) 4 | library(stringr) 5 | 6 | theme_pub <- function() { 7 | p <- theme( 8 | axis.text=element_text(size=rel(1.2), color='black'), 9 | axis.title=element_text(size=rel(1.5)), 10 | # axis.title.y=element_text(vjust=1.5), 11 | # axis.title.x=element_text(vjust=-0.2), 12 | legend.position='top', 13 | legend.text=element_text(size=rel(1.2)), 14 | legend.title=element_text(size=rel(1.2)), 15 | legend.key=element_rect(fill='transparent'), 16 | panel.border=element_blank(), 17 | panel.grid.major = element_blank(), 18 | panel.grid.minor = element_blank(), 19 | panel.background = element_blank(), 20 | axis.line = element_line(colour="black", size=1), 21 | axis.ticks.length = unit(.3, 'cm'), 22 | axis.ticks.margin = unit(.3, 'cm') 23 | ) 24 | return (p) 25 | } 26 | 27 | format_sample <- function(s) { 28 | l <- str_split(s, '_') 29 | l <- sapply(l, function(x) x[length(x)]) 30 | return (l) 31 | } 32 | 33 | log_counts <- function(x) { 34 | return (log10(x + 1)) 35 | } 36 | 37 | 38 | # Tests for (weighted) association between two random variables. 39 | 40 | # For weighted Pearson's correlation calls wtd.cor(). For weighted Spearman's 41 | # correlation, calls cor.test() with weighted ranks. For unweighted correction, 42 | # calls cor.test(). 43 | 44 | # Arguments: 45 | # x, y: variables to be tested. 46 | # weights: optional sample weights. 47 | # method: a character string indicating which correlation coefficient is to be 48 | # used for the test. Either 'pearson' or 'spearman'. 49 | # alternative: alternative hypothesis. Either 'one.sided' or 'two.sided'. 50 | wtd_cor <- function(x, y, weights=NULL, method='pearson', 51 | alternative='two.sided', n_min=3) { 52 | r <- data.frame(r=NA, r_lo=NA, r_up=NA, p=NA, n=NA, n_wtd=NA) 53 | o <- !is.na(x) & !is.na(y) 54 | x <- x[o] 55 | y <- y[o] 56 | weights <- weights[o] 57 | r$n <- sum(o) 58 | if (is.null(weights)) { 59 | r$n_wtd <- r$n 60 | } else { 61 | r$n_wtd <- sum(weights) 62 | } 63 | if (r$n >= n_min & var(x) > 0 & var(y) > 0) { 64 | ct <- NULL 65 | ct_wtd <- NULL 66 | if (is.null(weights)) { 67 | ct <- cor.test(x, y, alternative=alternative, method=method) 68 | } else { 69 | if (method == 'spearman') { 70 | rx <- wtd.rank(x, weights=weights) 71 | ry <- wtd.rank(y, weights=weights) 72 | ct <- cor.test(rx, ry, alternative=alternative, method=method) 73 | } else { 74 | ct_wtd <- wtd.cor(x, y, weight=weights) 75 | } 76 | } 77 | if (!is.null(ct)) { 78 | r$r <- ct$estimate[1] 79 | r$p <- ct$p.value 80 | if (!is.null(ct$conf.int)) { 81 | r$r_lo <- ct$conf.int[1] 82 | r$r_up <- ct$conf.int[2] 83 | } 84 | } else { 85 | r$r <- ct_wtd[1] 86 | r$p <- ct_wtd[4] 87 | # Computes bounded 95% confidence interval 88 | r$r_lo <- max(-1, ct_wtd[1] - 1.96 * ct_wtd[2]) 89 | r$r_up <- min(1, ct_wtd[1] + 1.96 * ct_wtd[2]) 90 | } 91 | } 92 | return (r) 93 | } 94 | 95 | # mean imputation by columns 96 | impute <- function(d) { 97 | means <- colMeans(d, na.rm=T) 98 | if (any(is.na(means))) { 99 | stop('Insufficient data for mean imputation!') 100 | } 101 | for (i in 1:length(means)) { 102 | d[is.na(d[,i]), i] <- means[i] 103 | 104 | return (d) 105 | } 106 | 107 | # adjust batch effect 108 | adjust_lm <- function(y, x) { 109 | m <- lm(y ~ x) 110 | return (residuals(m)) 111 | } 112 | 113 | adjust_df <- function(d) { 114 | d$yr <- adjust_lm(d$y, d$x) 115 | return (d) 116 | } 117 | 118 | pca <- function(d, center=T, scale=F) { 119 | # columns are samples 120 | d <- scale(d, center=center, scale=scale) 121 | d <- t(d) 122 | s <- svd(d) 123 | vec <- s$u 124 | rownames(vec) <- rownames(d) 125 | val <- s$d**2 126 | val <- val / sum(val) 127 | return (list(vec=vec, val=val)) 128 | } 129 | 130 | 131 | # rbinds data frames by common columns 132 | rbind_frames <- function(d) { 133 | cols <- colnames(d[[1]]) 134 | for (dd in d) { 135 | cols <- intersect(cols, colnames(dd)) 136 | } 137 | dcols <- lapply(d, function(x) subset(x, select=cols)) 138 | d <- do.call(rbind.data.frame, dcols) 139 | return (d) 140 | } 141 | 142 | read_report_meta <- function(filename, n=NULL) { 143 | if (!is.null(n)) { 144 | h <- 'cut -f 2-5,7,8,12' 145 | h <- sprintf('head -n %d %s | %s', n, filename, h) 146 | } else { 147 | h <- sprintf('%s %s', h, filename) 148 | } 149 | h <- 'cut -f 2-5,7,8,12' 150 | if (!is.null(n)) { 151 | h <- sprintf('head -n %d | %s', n, h) 152 | } 153 | h <- read.table(pipe(h), head=T, sep='\t') 154 | names(h) <- tolower(names(h)) 155 | h <- h %>% rename(chromo=chromosome) 156 | h <- h %>% tbl_df 157 | return (h) 158 | } 159 | 160 | read_report_values <- function(filename, samples=NULL, n=NULL) { 161 | h <- 'cut -f 13-' 162 | if (!is.null(n)) { 163 | h <- sprintf('head -n %d %s | %s', n, filename, h) 164 | } else { 165 | h <- sprintf('%s %s', h, filename) 166 | } 167 | 168 | h <- read.table(pipe(h), head=T, sep='\t') 169 | if (!is.null(samples)) { 170 | h <- subset(h, select=intersect(colnames(h), samples)) 171 | } 172 | h <- h %>% tbl_df 173 | return (h) 174 | } 175 | 176 | read_samples_list <- function(sample_file) { 177 | d <- read.table(sample_file, head=F) %>% unlist %>% as.vector 178 | return (d) 179 | } 180 | 181 | read_samples_stats <- function(stats_file, samples=NULL) { 182 | d <- read.table(opts$samples_stats, sep='\t', head=T) %>% 183 | rename(sample=id, cpg_rate=CpG.rate, chh_rate=CHH.rate, cph_rate=CpH.rate) %>% 184 | droplevels %>% tbl_df 185 | if (!is.null(samples)) { 186 | d <- d[d$sample %in% samples,] %>% droplevels 187 | } 188 | return (d) 189 | } 190 | 191 | plot_pca_vec <- function(pc_vec, x=1, y=2) { 192 | t <- data.frame(sample=factor(rownames(pc_vec)), 193 | pcx=pc_vec[,x], pcy=pc_vec[,y]) 194 | p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point() + 195 | geom_text(aes(label=sample), vjust=-.4, hjust= .3, size=3) + 196 | xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) + 197 | guides(color=F) + theme_pub() 198 | return (p) 199 | } 200 | 201 | plot_pca_val <- function(pc_val) { 202 | t <- data.frame(pc=1:length(pc_val), val=pc_val) 203 | p <- ggplot(t, aes(x=pc, y=val)) + 204 | geom_bar(stat='identity', fill='salmon', color='black') + 205 | xlab('principle component') + 206 | ylab('% variance explained') + theme_pub() 207 | return (p) 208 | } 209 | -------------------------------------------------------------------------------- /qc/data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PMBio/scMT-seq/cabc3b66bc8e1d9f61bb2b6d903018c0c88d8093/qc/data.xlsx -------------------------------------------------------------------------------- /qc/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Samples QC 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(ggplot2) 15 | library(dplyr) 16 | library(tidyr) 17 | library(gridExtra) 18 | source('../lib/utils.R') 19 | ``` 20 | 21 | ```{r} 22 | opts <- list() 23 | opts$data <- './data.csv' 24 | opts$use <- c('m14', 'm15P3') 25 | opts$colors <- c( 26 | 'm14'='forestgreen', 27 | 'm15'='brown2', 28 | 'm15P3'='royalblue' 29 | ) 30 | ``` 31 | 32 | ```{r} 33 | d <- read.csv(opts$data, head=T) %>% tbl_df 34 | names(d) <- c('sample', 'met', 'batch', 'val_trimmed', 'unique_mapped', 'num_cpg', 'per_cpg') 35 | d$set <- 'm14' 36 | d[d$batch %in% c('off bead', 'original'),]$set <- 'm15' 37 | d[d$batch == 'CSCP2',]$set <- 'm15P2' 38 | d[d$batch == 'optimised',]$set <- 'm15P3' 39 | d <- d %>% filter(set %in% opts$use) %>% 40 | mutate(per_cpg=as.numeric(gsub('%', '', per_cpg)) / 100) 41 | dat <- d 42 | ``` 43 | 44 | ```{r} 45 | write.csv(dat, 'fig1b.csv') 46 | ``` 47 | 48 | ```{r fig.width=7, fig.height=6} 49 | p <- ggplot(dat, aes(x=unique_mapped, y=per_cpg, color=set)) + 50 | geom_point() + 51 | geom_smooth(method='loess', degree=1, alpha=0.2, size=1) + 52 | scale_color_manual(values=opts$colors) + 53 | xlab('Number of mapped reads') + ylab('CpG recovery') + 54 | theme_pub() 55 | p 56 | ``` 57 | 58 | ```{r eval=F} 59 | ggsave(p, file='fig_mapping.pdf', width=7, height=6) 60 | ``` 61 | 62 | ```{r fig.width=7, fig.height=6} 63 | p <- ggplot(dat, aes(x=val_trimmed, y=per_cpg, color=set)) + 64 | geom_point() + 65 | geom_smooth(method='loess', degree=1, alpha=0.2, size=1) + 66 | scale_color_manual(values=opts$colors) + 67 | xlab('Number of trimmed validated reads') + ylab('CpG recovery') + 68 | theme_pub() 69 | p 70 | ``` 71 | 72 | ```{r} 73 | d <- read.csv('./sup_fig_3_data.csv') %>% tbl_df 74 | d <- d %>% rename(type=Cell, map_eff=Mapping, dup_rate=Duplication, 75 | cpg_rate=CpG.Methylation, chh_rate=CH.methylation) %>% 76 | mutate(Study=factor(Study, levels=c('scBS2014', 'M&T'), labels=c('scBS-Seq', 'scM&T-Seq'))) %>% 77 | filter(type=='Serum') 78 | levels(d$set) 79 | ``` 80 | 81 | ```{r} 82 | opts$colors <- c( 83 | 'scM&T-Seq'='royalblue', 84 | 'scBS-Seq'='forestgreen' 85 | ) 86 | ``` 87 | 88 | ```{r fig.width=5, fig.height=6} 89 | p1 <- ggplot(d, aes(x=Study, y=map_eff, fill=Study)) + 90 | geom_boxplot() + 91 | scale_fill_manual(values=opts$colors) + 92 | theme_pub() + 93 | xlab('') + ylab('Mapping efficiency') + 94 | theme(axis.title.y=element_text(vjust=1.5)) 95 | print(p1) 96 | ``` 97 | 98 | ```{r fig.width=5, fig.height=6} 99 | p2 <- ggplot(d, aes(x=Study, y=dup_rate, fill=Study)) + 100 | geom_boxplot() + 101 | scale_fill_manual(values=opts$colors) + 102 | theme_pub() + 103 | xlab('') + ylab('Duplication rate') + 104 | theme(axis.title.y=element_text(vjust=1.5)) 105 | print(p2) 106 | ``` 107 | 108 | ```{r fig.width=5, fig.height=6} 109 | h <- d %>% select(Study, type, cpg_rate, chh_rate) %>% gather(rate, value, -c(Study, type)) 110 | p3 <- ggplot(d, aes(x=Study, y=cpg_rate, fill=Study)) + 111 | geom_boxplot() + 112 | scale_fill_manual(values=opts$colors) + 113 | geom_point(data=h, aes(x=Study, y=value, shape=rate), position=position_jitter(w=0.1, h=0)) + 114 | theme_pub() + 115 | xlab('') + ylab('Methylation rate') + 116 | theme(axis.title.y=element_text(vjust=1.5)) 117 | print(p3) 118 | ``` 119 | 120 | ```{r fig.width=12, fig.height=5} 121 | grid.arrange(p1, p2, p3, ncol=3) 122 | ``` 123 | 124 | ```{r eval=F} 125 | pdf('fig_stats.pdf', width=12, height=5) 126 | grid.arrange(p1, p2, p3, ncol=3) 127 | dev.off() 128 | ``` 129 | -------------------------------------------------------------------------------- /sample/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Sample-specific correlation for all genes 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, fig.width=10, warning=F, message=F) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(ggplot2) 15 | library(dplyr) 16 | library(tidyr) 17 | library(xtable) 18 | options(xtable.type='html') 19 | source('../lib/utils.R') 20 | ``` 21 | 22 | ```{r} 23 | opts <- list() 24 | opts$data_file <- '../data/join/data.rds' 25 | opts$expr <- '../data/expr/data_proc/data_expr.rds' 26 | opts$genes_file <- NA 27 | # opts$genes_file <- '../data/expr/data_raw/subpopulation_lif_genes9.csv' 28 | opts$samples_file <- '../data/samples/samples_stats.csv' 29 | opts$bulk_file <- '../../151019_ficz_sample/sample/r.rds' 30 | opts$cache <- T 31 | opts$permute <- F 32 | opts$alpha <- 0.1 33 | ``` 34 | 35 | ```{r} 36 | theme_pub <- function() { 37 | p <- theme( 38 | axis.text=element_text(size=rel(1.2), color='black'), 39 | axis.title=element_text(size=rel(1.5)), 40 | legend.position='top', 41 | legend.text=element_text(size=rel(1.2)), 42 | legend.title=element_text(size=rel(1.2)), 43 | legend.key=element_rect(fill='transparent'), 44 | panel.border=element_blank(), 45 | panel.grid.major = element_blank(), 46 | panel.grid.minor = element_blank(), 47 | panel.background = element_blank(), 48 | axis.line = element_line(colour="black", size=1), 49 | axis.ticks.length = unit(.3, 'cm'), 50 | axis.ticks.margin = unit(.3, 'cm') 51 | ) 52 | return (p) 53 | } 54 | ``` 55 | 56 | ```{r} 57 | cmp <- list() 58 | dat <- list() 59 | ``` 60 | 61 | ```{r data} 62 | dat$em <- readRDS(opts$data_file) 63 | if (!is.na(opts$genes_file)) { 64 | dat$genes <- read.csv(opts$genes_file, sep=',', head=T) %>% tbl_df 65 | names(dat$genes) <- c('ens_id', 'gene_id') 66 | h <- as.vector(dat$genes$ens_id) 67 | dat$em <- dat$em %>% filter(ens_id %in% h) 68 | } 69 | ``` 70 | 71 | ```{r} 72 | dat$samples <- read.csv(opts$samples_file, sep='\t', head=T) %>% 73 | mutate(sample=id, sample_short=sub('^[^_]+_', '', sample)) %>% tbl_df 74 | ``` 75 | 76 | ## Bulk correlation 77 | 78 | ```{r} 79 | dat$emb <- dat$em %>% group_by(name, id_.x, id_.y) %>% 80 | summarise(expr=mean(expr, na.rm=T), met=weighted.mean(met, weight, na.rm=T)) %>% 81 | ungroup 82 | cmp$rb <- dat$emb %>% group_by(name) %>% do(wtd_cor(.$met, .$expr)) %>% 83 | ungroup %>% arrange(desc(abs(r))) %>% 84 | mutate(name=factor(name, levels=rev(name)), sig=(p <= opts$alpha)) 85 | ``` 86 | 87 | 88 | ```{r results='asis'} 89 | h <- cmp$rb %>% select(name, n, r, p, sig) %>% as.data.frame %>% arrange(as.vector(name)) 90 | print(xtable(h, digits=2)) 91 | ``` 92 | 93 | ```{r fig.width=10, fig.height=8} 94 | ggplot(cmp$rb, aes(x=name, y=r, fill=-log10(p))) + geom_bar(stat='identity') + 95 | xlab('') + ylab('r') + coord_flip() + theme_pub() 96 | ``` 97 | 98 | ```{r fig.height=8} 99 | d <- dat$emb 100 | d$name <- factor(d$name, levels=rev(levels(cmp$rb$name))) 101 | ggplot(d, aes(x=met, y=expr)) + 102 | geom_point(size=0.3) + 103 | stat_density2d(color='darkgrey') + 104 | stat_smooth(method=lm, color='blue') + 105 | facet_wrap(~name) + theme_pub() 106 | ``` 107 | 108 | 109 | ## Sample-specific correlation 110 | 111 | ```{r cor, cache=opts$cache} 112 | r <- dat$em %>% group_by(name, sample) 113 | if (opts$permute) { 114 | r <- r %>% do(wtd_cor(.$expr[sample(length(.$expr))], .$met, .$weight)) 115 | } else { 116 | r <- r %>% do(wtd_cor(.$expr, .$met, .$weight)) 117 | } 118 | r <- r %>% ungroup %>% group_by(name) %>% 119 | mutate(p_adj=p.adjust(p, method='fdr')) %>% ungroup 120 | cmp$r <- r 121 | ``` 122 | 123 | ```{r} 124 | cmp$r <- cmp$r %>% mutate(name=factor(name, levels=rev(levels(name)))) %>% 125 | inner_join(dat$samples, by='sample') 126 | ``` 127 | 128 | ```{r} 129 | saveRDS(cmp$r, file='r.rds') 130 | ``` 131 | 132 | ```{r eval=T} 133 | d <- cmp$r %>% select(anno=name, sample, r, p, p_adj, cpg_cov=cov, mean_met=CpG.rate) %>% 134 | arrange(as.vector(anno)) 135 | write.table(d, file='correlations.csv', quote=F, row=F, sep=',') 136 | ``` 137 | 138 | ```{r} 139 | cmp$rs <- cmp$r %>% group_by(name) %>% 140 | summarise( 141 | n=mean(n), 142 | r=mean(r), 143 | sig_pos=sum(p_adj <= opts$alpha & r > 0), 144 | sig_neg=sum(p_adj <= opts$alpha & r < 0) 145 | ) %>% arrange(as.vector(name)) 146 | ``` 147 | 148 | ```{r results='asis'} 149 | xtable(as.data.frame(cmp$rs), digits=2) 150 | ``` 151 | 152 | ```{r eval=F} 153 | write.table(cmp$rs, 'tab_sample_r.csv', sep='\t', row.names=F) 154 | ``` 155 | 156 | ```{r eval=!is.na(opts$bulk_file)} 157 | cmp$rb <- readRDS(opts$bulk_file) 158 | ``` 159 | 160 | ```{r fig.height=10} 161 | d <- cmp$r 162 | h <- d %>% group_by(name) %>% summarise(r_mean=mean(r, na.rm=T)) 163 | d <- d %>% inner_join(h, by='name') 164 | 165 | db <- cmp$rb 166 | p <- ggplot(d, aes(x=name, y=r)) + 167 | geom_hline(yintercept=0, linetype='dashed', color='darkgrey') + 168 | geom_boxplot(aes(fill=r_mean), alpha=0.4, outlier.size=0) + 169 | geom_point(position=position_jitter(w=0.1), size=1.5) + 170 | geom_point(data=db, color='black', fill='orange1', shape=21, size=2.5) + 171 | scale_fill_gradient2(low='darkblue', mid='white', high='darkred', midpoint=0.0) + 172 | xlab('') + ylab('Pearson correlation') + 173 | coord_flip() + 174 | theme_pub() + 175 | theme(legend.position='right') 176 | print(p) 177 | ``` 178 | 179 | ```{r eval=F} 180 | write.csv(d, 'fig2c.csv') 181 | ``` 182 | 183 | ```{r eval=F} 184 | ggsave(p, file='fig_sample.pdf', width=8, height=8) 185 | ``` 186 | 187 | ```{r eval=F} 188 | s <- c('gene_body', 'cgi', 'intron', 'exon', 'intergenic', 'prom_cgi', 'prom_non_cgi', 'H3K27ac', 'H3K27me3', 'H3K4me1', 'LMR', 'p300') 189 | d <- cmp$r %>% filter(name %in% s) %>% 190 | mutate(name=factor(name, levels=s)) 191 | h <- d %>% group_by(name) %>% summarise(r_mean=mean(r, na.rm=T)) 192 | d <- d %>% inner_join(h, by='name') 193 | 194 | db <- cmp$rb %>% filter(name %in% s) 195 | p <- ggplot(d, aes(x=name, y=r)) + 196 | geom_hline(yintercept=0, linetype='dashed', color='darkgrey') + 197 | geom_boxplot(aes(fill=r_mean), alpha=0.4, outlier.size=0) + 198 | geom_point(position=position_jitter(w=0.1), size=1.5) + 199 | geom_point(data=db, color='black', fill='orange1', shape=21, size=2.5) + 200 | scale_fill_gradient2(low='darkblue', mid='white', high='darkred', midpoint=0.0) + 201 | xlab('') + ylab('Pearson correlation') + 202 | theme_pub() + 203 | theme( 204 | legend.position='right', 205 | axis.text.x=element_text(angle=30, hjust=1) 206 | ) 207 | print(p) 208 | ``` 209 | 210 | ```{r eval=F} 211 | ggsave(p, file='fig_sample_hor.pdf', width=12, height=6) 212 | ``` 213 | 214 | 215 | ## R versus coverage 216 | 217 | ```{r fig.height=12} 218 | d <- cmp$r %>% mutate(name=factor(name, levels=rev(levels(name)))) 219 | p <- ggplot(d, aes(x=cov, y=r)) + 220 | geom_hline(yintercept=0, color='darkgrey') + 221 | geom_point(size=2) + 222 | guides(color=guide_legend(title='Method ')) + 223 | facet_wrap(~name, ncol=3) + theme(legend.position='bottom') + 224 | xlab('\nCpG coverage') + ylab('Pearson correlation\n') + 225 | theme_pub() 226 | print(p) 227 | ``` 228 | 229 | ```{r eval=F} 230 | ggsave('fig_r_cov.pdf', p, width=10, height=13) 231 | ``` 232 | 233 | ```{r eval=F} 234 | write.csv(d, 'S16.csv') 235 | ``` 236 | 237 | ## R versus mean methylation 238 | 239 | 240 | ```{r fig.height=12} 241 | d <- cmp$r %>% mutate(name=factor(name, levels=rev(levels(name)))) 242 | p <- ggplot(d, aes(x=CpG.rate, y=r)) + 243 | geom_hline(yintercept=0, color='darkgrey') + 244 | geom_point(size=2) + 245 | guides(color=guide_legend(title='Method ')) + 246 | facet_wrap(~name, ncol=3) + theme(legend.position='bottom') + 247 | xlab('\nMean CpG methylation rate') + ylab('Pearson correlation\n') + 248 | theme_pub() 249 | print(p) 250 | ``` 251 | 252 | ```{r eval=F} 253 | ggsave('fig_r_mean.pdf', p, width=10, height=13) 254 | ``` 255 | 256 | ```{r eval=F} 257 | write.csv(d, 'S15.csv') 258 | ``` 259 | -------------------------------------------------------------------------------- /var/m14_m15P3/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Comparison methylation variability scBS14 scBS15 3 | date: 151020 4 | output: 5 | html_document: 6 | toc: yes 7 | --- 8 | 9 | ```{r, include=F} 10 | library(knitr) 11 | opts_chunk$set(echo=F, warning=F, message=F) 12 | ``` 13 | 14 | ```{r, include=F} 15 | library(ggplot2) 16 | library(dplyr) 17 | library(tidyr) 18 | library(grid) 19 | source('../../lib/utils.R') 20 | ``` 21 | 22 | ```{r} 23 | opts <- list() 24 | opts$s14_file <- '../met14/stats.rds' 25 | opts$s15_file <- '../m15P3/stats.rds' 26 | opts$colors <- c('scBS14'='forestgreen', 'scBS15'='royalblue') 27 | opts$annos <- c('all', 'gene_body', 'intron', 'exon', 'intergenic', 'prom_cgi', 28 | 'prom_non_cgi', 'cgi', 'p300', 'H3K4me1', 'H3K27ac', 'H3K4me3', 'LMR') 29 | ``` 30 | 31 | ```{r} 32 | dat <- list() 33 | dat$s14 <- readRDS(opts$s14_file) %>% filter(name %in% opts$annos) %>% tbl_df 34 | dat$s15 <- readRDS(opts$s15_file) %>% filter(name %in% opts$annos) %>% tbl_df 35 | d <- list() 36 | for (n in c('s14', 's15')) { 37 | d[[n]] <- dat[[n]] 38 | d[[n]]$data <- n 39 | } 40 | dat$var <- do.call(rbind.data.frame, d) %>% 41 | mutate(data=factor(data, levels=c('s14', 's15'), labels=c('scBS14', 'scBS15'))) %>% 42 | tbl_df 43 | # Scaled variance 44 | dat$var <- dat$var %>% group_by(data, name) %>% 45 | mutate(wtd_var_s=wtd_var/sd(wtd_var, na.rm=T)) %>% ungroup 46 | ``` 47 | 48 | ## Weighted variance 49 | 50 | ```{r eval=F} 51 | 52 | ``` 53 | 54 | 55 | ```{r} 56 | d <- dat$var %>% filter(name %in% opts$annos) %>% 57 | mutate(name=factor(name, levels=rev(opts$annos))) 58 | ``` 59 | 60 | ```{r eval=F} 61 | h <- d %>% select(data, name, id_) %>% 62 | group_by(data, name) %>% sample_frac(0.3) %>% ungroup 63 | write.csv(h, 'fig1d.csv') 64 | ``` 65 | 66 | ```{r fig.width=8, fig.height=8} 67 | p <- ggplot(d, aes(x=name, fill=data, y=wtd_var)) + geom_boxplot() + 68 | scale_fill_manual(values=opts$colors) + 69 | xlab('') + ylab('\nVariance') + coord_flip() + 70 | theme_pub() + ylim(0, 1300) + 71 | theme(legend.position='right') 72 | print(p) 73 | ``` 74 | 75 | ```{r eval=F} 76 | ggsave('fig_var.pdf', p, width=8, height=9) 77 | ``` 78 | 79 | 80 | ## Unweighted variance 81 | 82 | ```{r} 83 | ggplot(dat$var, aes(x=name, fill=data, y=var)) + geom_boxplot() + 84 | xlab('') + ylab('Unweighted variance') + coord_flip() + 85 | ylim(0, 2000) + theme_pub() 86 | ``` 87 | 88 | ```{r} 89 | ggplot(dat$var, aes(x=name, fill=name, y=wtd_var)) + geom_boxplot() + 90 | xlab('') + ylab('Unweighted variance') + coord_flip() + 91 | facet_wrap(~data) + 92 | guides(fill=F) + 93 | ylim(0, 2000) 94 | ``` 95 | -------------------------------------------------------------------------------- /var/m15P3/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Quantification methylation variability 3 | date: 151019 4 | output: 5 | html_document: 6 | toc: yes 7 | --- 8 | 9 | ```{r, include=F} 10 | library(knitr) 11 | opts_chunk$set(echo=F) 12 | ``` 13 | 14 | ```{r, include=F} 15 | library(ggplot2) 16 | library(dplyr) 17 | library(tidyr) 18 | library(weights) 19 | library(xtable) 20 | options(xtable.type='html') 21 | ``` 22 | 23 | ```{r} 24 | opts <- list() 25 | opts$met_file <- '../data/m15P3/join/data_met.rds' 26 | opts$met_meta_file <- '../data/m15P3/join/data_meta.rds' 27 | opts$cache <- F 28 | ``` 29 | 30 | ```{r read, cache=opts$cache} 31 | dat <- list() 32 | dat$met <- readRDS(opts$met_file) 33 | ``` 34 | 35 | ```{r compute, cache=opts$cache} 36 | stats <- function(d) { 37 | s <- summarise(d, 38 | mean=mean(met, na.rm=T), 39 | var=var(met, na.rm=T), 40 | wtd_mean=weighted.mean(met, weight, na.rm=T), 41 | wtd_var=wtd.var(met, weight, na.rm=T) 42 | ) 43 | return (s) 44 | } 45 | 46 | cmp <- list() 47 | cmp$var <- dat$met %>% group_by(name, id_) %>% stats 48 | d <- dat$met %>% group_by(id_) %>% stats 49 | d$name <- 'all' 50 | cmp$var <- rbind.data.frame(cmp$var, d) %>% ungroup 51 | cmp$var <- cmp$var %>% mutate(name=factor(name)) %>% 52 | mutate(name=factor(name, levels=sort(levels(name)))) %>% 53 | mutate(name=relevel(name, 'all')) %>% 54 | mutate(name=factor(name, levels=rev(levels(name)))) 55 | 56 | cmp$stats <- cmp$var %>% group_by(name) %>% summarise( 57 | n=n(), 58 | mean=mean(mean, na.rm=T), 59 | wtd_mean=mean(wtd_mean, na.rm=T), 60 | var=mean(var, na.rm=T), 61 | wtd_var=mean(wtd_var, na.rm=T) 62 | ) %>% arrange(name) 63 | ``` 64 | 65 | ```{r} 66 | saveRDS(cmp$var, file='stats.rds') 67 | ``` 68 | 69 | 70 | ## Statistics 71 | 72 | ```{r results='asis'} 73 | d <- as.data.frame(cmp$stats %>% arrange(desc(name))) 74 | print(xtable(d, digits=2)) 75 | ``` 76 | 77 | ## Weighted variance 78 | 79 | ```{r} 80 | ggplot(cmp$var, aes(x=name, fill=name, y=wtd_var)) + geom_boxplot() + 81 | xlab('') + ylab('Weighted variance') + coord_flip() + guides(fill=F) 82 | ``` 83 | 84 | ## Unweighted variance 85 | 86 | ```{r} 87 | ggplot(cmp$var, aes(x=name, fill=name, y=var)) + geom_boxplot() + 88 | xlab('') + ylab('Unweighted variance') + coord_flip() + guides(fill=F) 89 | ``` 90 | 91 | ## Weighted mean 92 | 93 | ```{r} 94 | ggplot(cmp$var, aes(x=name, fill=name, y=wtd_mean)) + geom_boxplot() + 95 | xlab('') + ylab('Weighted mean') + coord_flip() + guides(fill=F) 96 | ``` 97 | 98 | ## Unweighted mean 99 | 100 | ```{r} 101 | ggplot(cmp$var, aes(x=name, fill=name, y=mean)) + geom_boxplot() + 102 | xlab('') + ylab('Unweighted mean') + coord_flip() + guides(fill=F) 103 | ``` 104 | 105 | ## Number of sites 106 | 107 | ```{r} 108 | d <- cmp$stats %>% filter(name != 'all') 109 | ggplot(d, aes(x=name, y=n, fill=name)) + 110 | geom_bar(stat='identity') + xlab('') + ylab('Number of sites') + 111 | coord_flip() + guides(fill=F) 112 | ``` 113 | -------------------------------------------------------------------------------- /var/met14/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Quantification methylation variability 3 | date: 150418 4 | output: 5 | html_document: 6 | toc: yes 7 | --- 8 | 9 | ```{r, include=F} 10 | library(knitr) 11 | opts_chunk$set(echo=F) 12 | ``` 13 | 14 | ```{r, include=F} 15 | library(ggplot2) 16 | library(dplyr) 17 | library(tidyr) 18 | library(weights) 19 | library(xtable) 20 | options(xtable.type='html') 21 | ``` 22 | 23 | ```{r} 24 | opts <- list() 25 | opts$data_dir <- '../data/met14' 26 | opts$samples_file <- '../data/met14/samples_meta.csv' 27 | opts$cache <- T 28 | ``` 29 | 30 | ```{r read, cache=opts$cache} 31 | read_samples <- function(filename) { 32 | h <- read.table(filename, sep='\t', head=T) %>% tbl_df 33 | return (h) 34 | } 35 | 36 | dat <- list() 37 | d <- list.dirs(opts$data_dir, full.names=F, recursive=F) %>% basename 38 | d <- setdiff(d, c('tmp', 'met')) 39 | dat$annos <- d 40 | # dat$annos <- c('active_enhancer', 'IAP', 'cgi') 41 | d <- list() 42 | for (a in dat$annos) { 43 | d[[a]] <- readRDS(file.path(opts$data_dir, a, 'data_met.rds')) 44 | d[[a]]$name <- a 45 | } 46 | d <- do.call(rbind.data.frame, d) 47 | dat$met <- d %>% tbl_df 48 | dat$samples <- read_samples(opts$samples_file) 49 | h <- dat$samples %>% filter(cond == 'serum', id != 'ESC_Ser3_RSC27_4', 50 | id != 'ESC_Ser6_RSC27_7') %>% select(id) %>% unlist %>% 51 | as.vector 52 | dat$met <- dat$met %>% filter(sample %in% h) %>% mutate(sample=factor(sample)) 53 | rownames(dat$met) <- NULL 54 | ``` 55 | 56 | ```{r compute, cache=opts$cache} 57 | stats <- function(d) { 58 | s <- summarise(d, 59 | mean=mean(met, na.rm=T), 60 | var=var(met, na.rm=T), 61 | wtd_mean=weighted.mean(met, weight, na.rm=T), 62 | wtd_var=wtd.var(met, weight, na.rm=T) 63 | ) 64 | return (s) 65 | } 66 | 67 | cmp <- list() 68 | cmp$var <- dat$met %>% group_by(name, id_) %>% stats 69 | d <- dat$met %>% group_by(id_) %>% stats 70 | d$name <- 'all' 71 | cmp$var <- rbind.data.frame(cmp$var, d) %>% ungroup 72 | cmp$var <- cmp$var %>% mutate(name=factor(name)) %>% 73 | mutate(name=factor(name, levels=sort(levels(name)))) %>% 74 | mutate(name=relevel(name, 'all')) %>% 75 | mutate(name=factor(name, levels=rev(levels(name)))) 76 | 77 | cmp$stats <- cmp$var %>% group_by(name) %>% summarise( 78 | n=n(), 79 | mean=mean(mean, na.rm=T), 80 | wtd_mean=mean(wtd_mean, na.rm=T), 81 | var=mean(var, na.rm=T), 82 | wtd_var=mean(wtd_var, na.rm=T) 83 | ) %>% arrange(name) 84 | ``` 85 | 86 | ```{r} 87 | saveRDS(cmp$var, file='stats.rds') 88 | ``` 89 | 90 | ## Statistics 91 | 92 | ```{r results='asis'} 93 | d <- as.data.frame(cmp$stats %>% arrange(desc(name))) 94 | print(xtable(d, digits=2)) 95 | ``` 96 | 97 | ## Weighted variance 98 | 99 | ```{r} 100 | ggplot(cmp$var, aes(x=name, fill=name, y=wtd_var)) + geom_boxplot() + 101 | xlab('') + ylab('Weighted variance') + coord_flip() + guides(fill=F) 102 | ``` 103 | 104 | ## Unweighted variance 105 | 106 | ```{r} 107 | ggplot(cmp$var, aes(x=name, fill=name, y=var)) + geom_boxplot() + 108 | xlab('') + ylab('Unweighted variance') + coord_flip() + guides(fill=F) 109 | ``` 110 | 111 | ## Weighted mean 112 | 113 | ```{r} 114 | ggplot(cmp$var, aes(x=name, fill=name, y=wtd_mean)) + geom_boxplot() + 115 | xlab('') + ylab('Weighted mean') + coord_flip() + guides(fill=F) 116 | ``` 117 | 118 | ## Unweighted mean 119 | 120 | ```{r} 121 | ggplot(cmp$var, aes(x=name, fill=name, y=mean)) + geom_boxplot() + 122 | xlab('') + ylab('Unweighted mean') + coord_flip() + guides(fill=F) 123 | ``` 124 | 125 | ## Number of sites 126 | 127 | ```{r} 128 | d <- cmp$stats %>% filter(name != 'all') 129 | ggplot(d, aes(x=name, y=n, fill=name)) + 130 | geom_bar(stat='identity') + xlab('') + ylab('Number of sites') + 131 | coord_flip() + guides(fill=F) 132 | ``` 133 | -------------------------------------------------------------------------------- /zoom/index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: Esrrb 3 | output: 4 | html_document: 5 | toc: yes 6 | --- 7 | 8 | ```{r, include=F} 9 | library(knitr) 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10) 11 | ``` 12 | 13 | ```{r, include=F} 14 | library(ggplot2) 15 | library(dplyr) 16 | library(tidyr) 17 | library(gridExtra) 18 | library(scales) 19 | library(xtable) 20 | source('../lib/utils.R') 21 | source('./viz.R') 22 | ``` 23 | 24 | ```{r} 25 | options(xtable.type='html') 26 | ``` 27 | 28 | ```{r} 29 | opts <- list() 30 | opts$max_dist <- 10000 31 | opts$win$met_file <- 'w3k_s1k_d20k/data_met.rds' 32 | opts$win$meta_file <- 'w3k_s1k_d20k/data_meta.rds' 33 | opts$em_file <- '../data/join/data.rds' 34 | opts$expr$rate_file <- '../data/expr/data_proc/data_expr.rds' 35 | opts$expr$meta_file <- '../data/expr/data_proc/data_meta.rds' 36 | opts$sample_filter <- c() 37 | opts$init <- T 38 | ``` 39 | 40 | ```{r read, eval=opts$init} 41 | filter_samples <- function(d) { 42 | f <- opts$samples_filter 43 | if (!is.na(t) & length(f) > 0) { 44 | d <- d %>% filter(sample %in% f) 45 | } 46 | return (d) 47 | } 48 | 49 | dat <- list() 50 | dat$win$met <- readRDS(opts$win$met_file) %>% tbl_df %>% filter_samples 51 | dat$win$meta <- readRDS(opts$win$meta_file) %>% tbl_df 52 | dat$expr$rate <- readRDS(opts$expr$rate_file) %>% tbl_df %>% filter_samples 53 | dat$expr$meta <- readRDS(opts$expr$meta_file) %>% tbl_df 54 | dat$em$all <- readRDS(opts$em_file) %>% tbl_df %>% filter_samples 55 | dat$em$meta <- dat$em$all %>% select(-c(sample, expr, met, weight)) %>% distinct 56 | ``` 57 | 58 | ```{r eval=opts$init} 59 | cmp <- list() 60 | cmp$r <- readRDS('../gene/r.rds') 61 | ``` 62 | 63 | ```{r eval=opts$init} 64 | viz <- list() 65 | viz$sel <- cmp$r %>% 66 | filter(name %in% c('LMR', 'p300'), gene_id == 'Esrrb', p_adj < 0.1) %>% 67 | arrange(r) %>% head(2) %>% 68 | select(ens_id, gene_id, name, id_.x, id_.y, r, p, p_adj, chromo, start.x, end.x, start.y, end.y) 69 | idx <- viz$sel[1,]$id_.x 70 | ``` 71 | 72 | ```{r eval=opts$init} 73 | viz$d <- make_data(idx) 74 | ``` 75 | 76 | ```{r} 77 | a <- list() 78 | a$gene <- c(86361117, 86521628) 79 | a$TSS <- c(86360117, 86361217) 80 | for (i in 1:nrow(viz$sel)) { 81 | v <- viz$sel[i,] 82 | a[[sprintf('%s_%d', v$name, i)]] <- c(v$start.y, v$end.y) 83 | } 84 | 85 | viz$da <- data_anno(a) 86 | 87 | viz$a <- plot_anno(viz$da) 88 | viz$v <- plot_var(viz$d$v, pa=viz$a, xlab=T) 89 | viz$r <- plot_cor(viz$d$r, pa=viz$a, xlab=T) 90 | viz$m <- plot_met(viz$d$em, pa=viz$a) 91 | ``` 92 | 93 | ```{r tracks, fig.width=14, fig.height=10} 94 | grid.arrange(viz$v, viz$r, viz$m, nrow=3, heights=c(0.25, 0.25, 0.5)) 95 | ``` 96 | 97 | ```{r eval=F} 98 | write.csv(viz$v$data, 'fig2b_var.csv') 99 | write.csv(viz$r$data, 'fig2b_cor.csv') 100 | write.csv(viz$m$data, 'fig2b_cpg.csv') 101 | ``` 102 | 103 | ```{r results='asis'} 104 | xtable(viz$sel, digits=4) 105 | ``` 106 | 107 | ```{r scatter, fig.width=6, fig.height=6} 108 | ps <- list() 109 | for (i in 1:nrow(viz$sel)) { 110 | v <- viz$sel[i,] 111 | p <- plot_scatter(as.character(v$name), v$id_.x, v$id_.y) 112 | ps[[length(ps) + 1]] <- p 113 | print(p) 114 | } 115 | ``` 116 | 117 | ```{r eval=F} 118 | write.csv(ps[[1]]$data, 'fig2b_p300.csv') 119 | write.csv(ps[[2]]$data, 'fig2b_LMR.csv') 120 | ``` 121 | 122 | 123 | ```{r} 124 | opts_chunk$set(eval=F) 125 | ``` 126 | 127 | ```{r} 128 | pdf(file='fig_esrrb.pdf', width=18, height=10) 129 | grid.arrange(viz$v, viz$r, viz$m, nrow=3, heights=c(0.25, 0.25, 0.5)) 130 | dev.off() 131 | ``` 132 | 133 | ```{r eval=F} 134 | ggsave(viz$m, file='met_annos.jpg', width=18, height=6, dpi=600) 135 | 136 | h <- list() 137 | h$v <- plot_var(viz$d$v, xlab=T) 138 | h$r <- plot_cor(viz$d$r, xlab=T) 139 | h$m <- plot_met(viz$d$em) 140 | h$m 141 | ggsave(h$m, file='met.jpg', width=18, height=6, dpi=600) 142 | # grid.arrange(h$v, h$r, h$m, nrow=3, heights=c(0.25, 0.25, 0.5)) 143 | ``` 144 | 145 | 146 | 147 | ```{r} 148 | s <- viz$sel %>% select(ens_id, gene_id, name, chromo, start=start.y, end=end.y, r, p_value=p_adj) 149 | write.table(s, file='fig_esrrb.csv', sep='\t') 150 | ``` 151 | 152 | ```{r} 153 | for (i in 1:nrow(viz$sel)) { 154 | v <- viz$sel[i,] 155 | p <- plot_scatter(v$name, v$id_.x, v$id_.y) 156 | ggsave(p, file=sprintf('fig_esrrb_%d.pdf', i), width=6.5, height=6) 157 | } 158 | ``` 159 | -------------------------------------------------------------------------------- /zoom/viz.R: -------------------------------------------------------------------------------- 1 | theme_pub <- function() { 2 | p <- theme( 3 | axis.text=element_text(size=rel(1.2), color='black'), 4 | axis.title=element_text(size=rel(1.5)), 5 | legend.position='top', 6 | legend.text=element_text(size=rel(1.2)), 7 | legend.title=element_text(size=rel(1.2)), 8 | legend.key=element_rect(fill='transparent'), 9 | panel.border=element_blank(), 10 | panel.grid.major = element_blank(), 11 | panel.grid.minor = element_blank(), 12 | panel.background = element_blank(), 13 | axis.line = element_line(colour="black", size=1), 14 | axis.ticks.length = unit(.3, 'cm'), 15 | axis.ticks.margin = unit(.3, 'cm') 16 | ) 17 | return (p) 18 | } 19 | 20 | plot_scatter <- function(name_, id_.x_, id_.y_, legend=F) { 21 | name_ <- as.character(name_) 22 | d <- dat$em$all %>% filter(name == name_, id_.x == id_.x_, id_.y == id_.y_) 23 | dm <- d[1,] 24 | rc <- cmp$r %>% filter(id_.x==id_.x_, id_.y==id_.y_) 25 | r <- wtd_cor(d$expr, d$met, d$weight) 26 | pos <- d %>% select(start.y, end.y) %>% distinct 27 | title <- sprintf('%s (%d-%d) r=%.2f p=%5g', 28 | name_, rc$start.y, rc$end.y, 29 | rc$r, rc$p_adj) 30 | p <- ggplot(d, aes(x=met, y=expr)) + 31 | stat_smooth(method=lm, color='grey34', aes(weight=weight), size=1) + 32 | geom_point(aes(color=sample, size=weight), alpha=0.7) + 33 | scale_size(range=c(4, 8)) + 34 | xlab('\nMethylation rate') + ylab('Expression rate\n') + 35 | ggtitle(title) 36 | if (legend) { 37 | p <- p + 38 | guides( 39 | color=guide_legend(title='Sample ', ncol=7), 40 | size=F) + 41 | theme(legend.direction='horizontal') 42 | } else { 43 | p <- p + guides(color=F, size=F) 44 | } 45 | p <- p + theme_pub() 46 | return (p) 47 | } 48 | 49 | 50 | plot_var <- function(d, pa=NULL, xlab=F) { 51 | p <- ggplot(d, aes(x=0.5*(start.y+end.y), y=wtd_var)) 52 | if (!is.null(pa)) { 53 | p <- p + pa 54 | } 55 | p <- p + 56 | geom_line(color='blue', lwd=1) + 57 | # geom_point(color='blue') + 58 | xlab('') + ylab('Variance\n') + 59 | scale_x_continuous(labels=comma) + 60 | theme_pub() + 61 | theme(axis.title.x=element_blank()) 62 | if (xlab == F) { 63 | p <- p + theme(axis.text.x=element_blank()) 64 | } 65 | return (p) 66 | } 67 | 68 | plot_cor <- function(d, pa=NULL, legend=F, xlab=F) { 69 | p <- ggplot(d, aes(x=0.5*(start.y+end.y), y=r)) + 70 | ylim(-1, 1) + geom_hline(yintercept=0, color='darkgrey') 71 | if (!is.null(pa)) { 72 | p <- p + pa 73 | } 74 | p <- p + 75 | geom_ribbon(aes(ymin=r_lo, ymax=r_up), alpha=0.2) + 76 | geom_line(aes(color=-log10(p)), lwd=1) + 77 | # geom_point(aes(color=-log10(p)), size=3) + 78 | scale_color_gradient(low='black', high='red') + 79 | scale_x_continuous(labels=comma) + 80 | xlab('') + ylab('Correlation\n') + 81 | theme_pub() + 82 | theme(legend.position='top', axis.title.x=element_blank()) 83 | if (xlab == F) { 84 | p <- p + theme(axis.text.x=element_blank()) 85 | } 86 | if (!legend) { 87 | p <- p + guides(color=F) 88 | } 89 | return (p) 90 | } 91 | 92 | plot_met <- function(d, pa=NULL, legend=F, mean=T) { 93 | p <- ggplot(d, aes(x=0.5*(start.y+end.y), y=met, color=sample)) 94 | if (!is.null(pa)) { 95 | p <- p + pa 96 | } 97 | if (mean) { 98 | m <- d %>% group_by(id_.x, id_.y) %>% summarise( 99 | start.y=unique(start.y), end.y=unique(end.y), 100 | mean=weighted.mean(met, weight, na.rm=T) 101 | ) %>% ungroup 102 | p <- p + 103 | geom_line(data=m, aes(x=0.5*(start.y+end.y), y=mean), 104 | color='grey27', size=0.8) 105 | } 106 | 107 | p <- p + 108 | geom_point(aes(color=sample, size=log2(weight), alpha=weight)) + 109 | theme_pub() + 110 | guides(color=F, alpha=F, size=F) + 111 | xlab('') + ylab('Methylation rate\n') + 112 | scale_x_continuous(labels=comma) + 113 | scale_size(range=c(1, 4)) + 114 | scale_alpha(range=c(0.5, 1)) 115 | if (legend) { 116 | p <- p + guides(size=guide_legend(title='# CpGs in window')) 117 | } 118 | return (p) 119 | } 120 | # id__ <- 2487 121 | # d <- make_data(id__) 122 | # p <- plot_met(d$em) 123 | # print(p) 124 | 125 | make_data <- function(id__, max_dist=opts$max_dist) { 126 | e <- dat$expr$rate %>% filter(id_ == id__) 127 | e_meta <- e[1,] %>% select(-c(sample, expr)) 128 | 129 | m <- dat$win$met %>% filter( 130 | chromo == as.character(e_meta$chromo), 131 | start >= e_meta$start - max_dist, 132 | end <= e_meta$end + max_dist) 133 | em <- e %>% inner_join(m, by='sample') 134 | 135 | r <- em %>% group_by(id_.x, id_.y) %>% 136 | do(wtd_cor(.$met, .$expr, weights=.$weight)) %>% ungroup 137 | r <- r %>% 138 | inner_join(dat$expr$meta, by=c('id_.x'='id_')) %>% 139 | inner_join(dat$win$meta, by=c('id_.y'='id_')) 140 | 141 | v <- em %>% group_by(id_.x, id_.y) %>% 142 | summarise( 143 | wtd_mean=weighted.mean(met, weight, na.rm=T), 144 | wtd_var=wtd.var(met, weight, na.rm=T) 145 | ) %>% ungroup 146 | v <- v %>% 147 | inner_join(dat$expr$meta, by=c('id_.x'='id_')) %>% 148 | inner_join(dat$win$meta, by=c('id_.y'='id_')) 149 | 150 | stopifnot(nrow(v) == nrow(r)) 151 | stopifnot(all(r$start.x == v$start.x)) 152 | stopifnot(all(r$end.x == v$end.x)) 153 | 154 | return (list(em=em, r=r, v=v)) 155 | } 156 | 157 | data_anno <- function(a) { 158 | d <- list() 159 | for (n in names(a)) { 160 | an <- a[[n]] 161 | d[[n]]$name <- n 162 | d[[n]]$start <- an[1] 163 | d[[n]]$end <- an[2] 164 | } 165 | d <- do.call(rbind.data.frame, d) %>% mutate(name=factor(name)) %>% 166 | gather(pos, x, -name) %>% tbl_df 167 | return (d) 168 | } 169 | 170 | plot_anno <- function(d) { 171 | p <- geom_vline(data=d, aes(xintercept=x, linetype=name)) 172 | return (p) 173 | } 174 | 175 | make_plots_scatter <- function(name_, id__) { 176 | em <- dat$em$all %>% filter(name == name_, id_.x == id__) 177 | em_meta <- em %>% select(name, id_.x, id_.y) %>% distinct 178 | p <- list() 179 | for (i in 1:nrow(em_meta)) { 180 | emi <- em_meta[i,] 181 | p[[i]] <- plot_scatter(em %>% filter(id_.y == emi$id_.y)) 182 | } 183 | return (p) 184 | } 185 | # name_ <- 'active_enhancer' 186 | # id__ <- 4373 187 | # p <- make_plots_scatter(name_, id__) 188 | 189 | make_plots_track <- function(name_, id__) { 190 | em_meta <- dat$em$meta %>% filter(name == name_, id_.x == id__) %>% 191 | arrange(start.y) 192 | 193 | p <- list() 194 | d <- data_anno(em_meta[1,]$start.x, em_meta[1,]$end.x, 195 | em_meta$start.y, em_meta$end.y) 196 | p$anno <- plot_anno(d) 197 | 198 | pd <- make_data(id__) 199 | p$var <- plot_var(pd$v, p$anno) 200 | p$cor <- plot_cor(pd$r, p$anno) 201 | p$met <- plot_met(pd$em, p$anno) 202 | 203 | return (p) 204 | } 205 | # name_ <- 'p300' 206 | # id__ <- 2647 207 | # p <- make_plots_track(name_, id__) 208 | 209 | make_plots <- function(name_, id__) { 210 | p <- make_plots_track(name_, id__) 211 | p$scatter <- make_plots_scatter(name_, id__) 212 | return (p) 213 | } 214 | 215 | plot_to_file <- function(name, id__, out_dir='plots') { 216 | m <- dat$expr$meta %>% filter(id_ == id__) 217 | fb <- sprintf('%s/%s_id%d_%s_%s', out_dir, name, id__, m$ens_id, m$gene_id) 218 | p <- make_plots(name, id__) 219 | 220 | fn <- sprintf('%s_01.pdf', fb) 221 | nrow <- ceil(length(p$scatter) / 2) 222 | pdf(file=fn, width=10, height=5 * nrow) 223 | h <- p$scatter 224 | h[['ncol']] <- 2 225 | do.call(grid.arrange, h) 226 | dev.off() 227 | 228 | fn <- sprintf('%s_02.pdf', fb) 229 | pdf(file=fn, width=10, height=8) 230 | h <- list(p$var, p$cor, p$met) 231 | h[['heights']] <- c(0.2, 0.2, 0.6) 232 | do.call(grid.arrange, h) 233 | dev.off() 234 | } 235 | # name_ <- 'active_enhancer' 236 | # id__ <- 4373 237 | # plot_to_file(name_, id__) 238 | 239 | plot_tracks <- function(name, id__) { 240 | p <- make_plots(name, id__) 241 | grid.arrange(p$var, p$cor, p$met, heights=c(0.2, 0.2, 0.6)) 242 | } 243 | # name_ <- 'active_enhancer' 244 | # id__ <- 55 245 | # plot_tracks(name_, id__) 246 | 247 | --------------------------------------------------------------------------------