├── .gitignore
├── README.md
├── cca
    ├── index.Rmd
    └── index.html
├── clust
    ├── index.Rmd
    ├── index.html
    ├── m15P2.Rmd
    └── m15P2.html
├── cor_bulk
    ├── expr
    │   ├── index.Rmd
    │   └── index.html
    └── met
    │   ├── index.Rmd
    │   ├── index.html
    │   └── src.R
├── cov
    ├── m14
    │   ├── index.Rmd
    │   └── index.html
    ├── m14_m15P3
    │   ├── index.Rmd
    │   └── index.html
    └── m15P3
    │   ├── index.Rmd
    │   └── index.html
├── data
    ├── expr
    │   └── data_proc
    │   │   ├── index.Rmd
    │   │   └── index.html
    ├── join
    │   ├── README.html
    │   ├── README.md
    │   ├── README.rmd
    │   └── data.rds
    └── met
    │   ├── join
    │       ├── index.Rmd
    │       └── index.html
    │   └── prepro
    │       ├── H3K27ac
    │           ├── index.Rmd
    │           └── index.html
    │       ├── H3K27me3
    │           ├── index.Rmd
    │           └── index.html
    │       ├── H3K4me1
    │           ├── index.Rmd
    │           └── index.html
    │       ├── H3K4me1_Tet1
    │           ├── index.Rmd
    │           └── index.html
    │       ├── IAP
    │           ├── index.Rmd
    │           └── index.html
    │       ├── LMR
    │           ├── index.Rmd
    │           └── index.html
    │       ├── Makefile
    │       ├── Tet2
    │           ├── index.Rmd
    │           └── index.html
    │       ├── Wu_Tet1
    │           ├── index.Rmd
    │           └── index.html
    │       ├── active_enhancer
    │           ├── index.Rmd
    │           └── index.html
    │       ├── cgi
    │           ├── index.Rmd
    │           └── index.html
    │       ├── exon
    │           ├── index.Rmd
    │           └── index.html
    │       ├── gene_body
    │           ├── index.Rmd
    │           └── index.html
    │       ├── intergenic
    │           ├── index.Rmd
    │           └── index.html
    │       ├── intron
    │           ├── index.Rmd
    │           └── index.html
    │       ├── p300
    │           ├── index.Rmd
    │           └── index.html
    │       ├── prom
    │           ├── index.Rmd
    │           └── index.html
    │       ├── prom_cgi
    │           ├── index.Rmd
    │           └── index.html
    │       ├── prom_non_cgi
    │           ├── index.Rmd
    │           └── index.html
    │       └── src
    │           └── index.Rmd
├── gene
    ├── index.Rmd
    └── index.html
├── gene_mean
    ├── index.Rmd
    └── index.html
├── gene_robust
    ├── index.Rmd
    └── index.html
├── heat
    ├── index.Rmd
    ├── index.html
    ├── index2.Rmd
    ├── index2.html
    ├── met.html
    └── viz.R
├── index.Rmd
├── index.html
├── lib
    └── utils.R
├── qc
    ├── data.xlsx
    ├── index.Rmd
    └── index.html
├── sample
    ├── index.Rmd
    └── index.html
├── var
    ├── m14_m15P3
    │   ├── index.Rmd
    │   └── index.html
    ├── m15P3
    │   ├── index.Rmd
    │   └── index.html
    └── met14
    │   ├── index.Rmd
    │   └── index.html
└── zoom
    ├── index.Rmd
    ├── index.html
    └── viz.R


/.gitignore:
--------------------------------------------------------------------------------
 1 | *~
 2 | .*.swp
 3 | /tmp/
 4 | *.DS_Store
 5 | 
 6 | # R
 7 | .Rhistory
 8 | .RData
 9 | .Rapp.history
10 | .Rproj.user
11 | *.Rproj
12 | *.rds
13 | 
14 | # Python
15 | *.py[cod]
16 | *.so
17 | *.cfg
18 | *.orig
19 | *.log
20 | *.pot
21 | __pycache__/*
22 | .cache/*
23 | *_cache/
24 | 
25 | index_*
26 | *.csv
27 | *.png
28 | *.pptx
29 | *.ipynb
30 | *.jpg
31 | 
32 | /zoom/w3*/
33 | rdata
34 | fig_*
35 | *.zip
36 | *.pdf
37 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | scM&T-Seq
 2 | =========
 3 | 
 4 | Source code of the manuscript ***Parallel single-cell sequencing links
 5 | transcriptional and epigenetic heterogeneity*** ([Nature Methods](http://www.nature.com/nmeth/journal/v13/n3/full/nmeth.3728.html)).
 6 | 
 7 | Abstract
 8 | --------
 9 | We report scM&T–seq, a method for parallel single–cell genome–wide methylome and
10 | transcriptome sequencing, allowing discovery of associations between
11 | transcriptional and epigenetic variation. Profiling of 61 mouse embryonic stem
12 | cells confirmed known links between DNA methylation and transcription. Notably,
13 | the method reveals novel associations between heterogeneous methylation of
14 | distal regulatory elements and transcriptional heterogeneity of key pluripotency
15 | genes.
16 | 
17 | Content
18 | -------
19 | * `/cca/`: Canonical Correlation Analysis
20 | * `/clust/`: Clustering scM&T-Seq and scBS-Seq cells
21 | * `/cov/`: Coverage analysis
22 | * `/cor_bulk/`: Correlation scM&T-Seq methylation rates with bulk methylation rate
23 | * `/data/`: Data directory
24 | * `index.Rmd`: Table of content
25 | * `/gene/`: Gene-specific correlation analysis
26 | * `/gene_mean/`: Correlating mean methylation with gene expression
27 | * `/gene_robust/`: Robustness analysis gene-specific correlation
28 | * `/heat/`: Visualizing methylation and expression heatmap
29 | * `/lib/`: Library functions
30 | * `/sample/`: Sample-specific correlation analysis
31 | * `/qc/`: Quality control DNA methylation
32 | * `/var/`: Comparison methylation variability in context
33 | * `/zoom/`: Visualizing Esrrb gene
34 | 
35 | `data/join/data.rds` contains the pre-processed and joined methylation and
36 | expression data, which were used for the correlation analysis reported in the
37 | manuscript. The raw data and intermediate output files can be downloaded
38 | from [GEO](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE74535).
39 | 
40 | Contact
41 | -------
42 | * Christof Angermueller
43 | * cangermueller@ebi.ac.uk
44 | * https://cangermueller.com
45 | 


--------------------------------------------------------------------------------
/cca/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Canonical Correlation Analysis
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(ggplot2)
 15 | library(dplyr)
 16 | library(tidyr)
 17 | library(RColorBrewer)
 18 | library(gridExtra)
 19 | library(corrplot)
 20 | library(gplots)
 21 | library(stringr)
 22 | library(xtable)
 23 | source('../lib/utils.R')
 24 | ```
 25 | 
 26 | ```{r}
 27 | options(xtable.type='html')
 28 | ```
 29 | 
 30 | ```{r}
 31 | opts <- list()
 32 | opts$expr$file <- '../data/expr/data_proc/data_expr.rds'
 33 | opts$met$file <- '../data/met/join/data_met.rds'
 34 | opts$pca_on <- 'gene_body'
 35 | 
 36 | opts$clust <- list()
 37 | opts$clust$c1 <- c('H03', 'C05', 'C07', 'B09', 'F07', 'C02', 'D05', 'E09', 'F01', 'B05', 'H10', 'E01', 'B10')
 38 | opts$clust$c2 <- c('F06', 'D07', 'G09', 'G03')
 39 | opts$clust_colors <- c('default'='#fdc086', 'c1'='#7fc97f', 'c2'='brown1')
 40 | 
 41 | samples_colors <- function(x) {
 42 |   h <- rep(opts$clust_colors['default'], length(x))
 43 |   for (n in names(opts$clust)) {
 44 |     h[x %in% opts$clust[[n]]] <- opts$clust_colors[n]
 45 |   }
 46 |   names(h) <- x
 47 |   return (h)
 48 | }
 49 | ```
 50 | 
 51 | ```{r}
 52 | read_samples <- function(filename) {
 53 |   h <- read.table(filename, sep='\t', head=T) %>% tbl_df
 54 |   return (h)
 55 | }
 56 | 
 57 | format_sample <- function(s) {
 58 |   l <- str_split(s, '_')
 59 |   l <- sapply(l, function(x) x[length(x)])
 60 |   return (l)
 61 | }
 62 | 
 63 | dat <- list()
 64 | dat$expr <- readRDS(opts$expr$file) %>% mutate(sample=factor(sample, label=format_sample(levels(sample))))
 65 | dat$met <- readRDS(opts$met$file) %>%
 66 |   mutate(sample=factor(sample, label=format_sample(levels(sample)))) %>%
 67 |   filter(name == opts$pca_on) %>% droplevels
 68 | ```
 69 | 
 70 | ```{r}
 71 | plot_pca_vec <- function(pc_vec, x=1, y=2) {
 72 |   t <- data.frame(sample=factor(rownames(pc_vec)),
 73 |     pcx=pc_vec[,x], pcy=pc_vec[,y])
 74 |   cols <- samples_colors(as.vector(t$sample))
 75 |   p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes(color=sample), size=2) +
 76 |     scale_color_manual(values=cols) +
 77 |     geom_text(aes(label=sample), vjust=-.4, hjust= .3, size=3) +
 78 |     xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) +
 79 |     guides(color=F) + theme_pub()
 80 |   return (p)
 81 | }
 82 | 
 83 | plot_pca_val <- function(pc_val) {
 84 |   t <- data.frame(pc=1:length(pc_val), val=pc_val)
 85 |   p <- ggplot(t, aes(x=pc, y=val)) +
 86 |     geom_bar(stat='identity', fill='salmon', color='black') +
 87 |     xlab('principle component') +
 88 |     ylab('% variance explained') + theme_pub()
 89 |   return (p)
 90 | }
 91 | 
 92 | plot_pca_heat <- function(d, title='PC') {
 93 |   colors <- rev(brewer.pal(9, 'Spectral'))
 94 |   colors <- colorRampPalette(colors)(50)
 95 | 
 96 |   p <- heatmap.2(d, density.info='none', trace='none',
 97 |     col=colors, Rowv=F, Colv=T, keysize=1.0, dendro='column',
 98 |     lwid=c(2, 5), key.title='', srtCol=45, key.xlab=title)
 99 |   return (p)
100 | }
101 | ```
102 | 
103 | ## PCA expression
104 | 
105 | ```{r}
106 | e <- dat$expr %>% select(id_, sample, expr) %>% spread(sample, expr)
107 | e <- e %>% select(-id_) %>% as.matrix
108 | pe <- pca(e)
109 | ```
110 | 
111 | ```{r fig.width=6, fig.height=6}
112 | p <- plot_pca_vec(pe$vec)
113 | p
114 | ```
115 | 
116 | ```{r eval=F}
117 | ggsave(p, file='fig_pca_expr.pdf', width=7, height=6)
118 | ```
119 | 
120 | ```{r fig.width=6, fig.height=4}
121 | p <- plot_pca_val(pe$val)
122 | p
123 | ```
124 | 
125 | ```{r eval=F}
126 | ggsave(p, file='fig_pca_expr_var.pdf', width=7, height=6)
127 | ```
128 | 
129 | ## PCA methylation
130 | 
131 | ```{r}
132 | m <- dat$met %>% select(id_, sample, met) %>%
133 |   spread(sample, met) %>% select(-id_) %>% as.matrix
134 | mi <- m %>% impute
135 | pm <- pca(mi)
136 | ```
137 | 
138 | ```{r fig.width=6, fig.height=6}
139 | p <- plot_pca_vec(pm$vec)
140 | p
141 | ```
142 | 
143 | ```{r eval=F}
144 | ggsave(p, file='fig_pca_expr.pdf', width=7, height=6)
145 | ```
146 | 
147 | ```{r fig.width=6, fig.height=4}
148 | p <- plot_pca_val(pm$val)
149 | p
150 | ```
151 | 
152 | ```{r eval=F}
153 | ggsave(p, file='fig_pca_met_var.pdf', width=7, height=6)
154 | ```
155 | 
156 | ## Comparison PCAs
157 | 
158 | ```{r fig.width=10, fig.height=5}
159 | grid.arrange(plot_pca_vec(pe$vec), plot_pca_vec(pm$vec), ncol=2)
160 | ```
161 | 
162 | ```{r fig.width=10, fig.height=5}
163 | grid.arrange(plot_pca_val(pe$val), plot_pca_val(pm$val), ncol=2)
164 | ```
165 | 
166 | ```{r eval=F}
167 | write.csv(pe$vec, 'S8a.csv')
168 | write.csv(pm$vec, 'S8b.csv')
169 | ```
170 | 
171 | ## Principle components
172 | 
173 | ```{r fig.width=10, fig.height=7}
174 | pce <- t(pe$vec)
175 | pcm <- t(pm$vec)
176 | rownames(pce) <- paste('epc', 1:ncol(pce))
177 | rownames(pcm) <- paste('mpc', 1:ncol(pcm))
178 | h <- plot_pca_heat(pce, title='Expression PC')
179 | h <- plot_pca_heat(pcm, title='Methylation PC')
180 | ```
181 | 
182 | #### Correlation Principle Components
183 | 
184 | ```{r fig.width=10, fig.height=10}
185 | h <- cor(t(pce), t(pcm))
186 | corrplot(h, method='color')
187 | ```
188 | 
189 | ```{r eval=F}
190 | pdf(file='cor_pc.pdf', width=10, height=10)
191 | corrplot(h, method='color')
192 | dev.off()
193 | ```
194 | 
195 | ```{r}
196 | de <- pce %>% as.data.frame %>% mutate(pc=1:n()) %>%
197 |   gather(sample, value, -pc) %>% tbl_df
198 | dm <- pcm %>% as.data.frame %>% mutate(pc=1:n()) %>%
199 |   gather(sample, value, -pc) %>% tbl_df
200 | pcs <- de %>% inner_join(dm, by='sample') %>%
201 |   rename(pce=pc.x, expr=value.x, pcm=pc.y, met=value.y)
202 | ```
203 | 
204 | ```{r fig.height=10}
205 | n <- 5
206 | d <- pcs %>% filter(pce <= n, pcm <= 5, pce > 1, pcm > 1)
207 | cols <- samples_colors(as.vector(d$sample))
208 | p <- ggplot(d, aes(x=met, y=expr)) +
209 |   stat_smooth(method=lm, color='black') +
210 |   geom_point(aes(color=sample), size=1.5) +
211 |   scale_color_manual(values=cols) +
212 |   xlab('Methylation PC') + ylab('Expression PC') +
213 |   facet_grid(pce~pcm) +
214 |   guides(color=F)
215 | print(p)
216 | ```
217 | 
218 | ```{r eval=F}
219 | ggsave('cor_pc_scatter.pdf', p, width=10, height=10)
220 | ```
221 | 
222 | ```{r eval=F}
223 | write.csv(p$data, 'S9.csv')
224 | ```
225 | 
226 | ```{r}
227 | plot_cor_pc <- function(pcm_, pce_) {
228 |   d <- pcs %>% filter(pcm == pcm_, pce == pce_)
229 |   cols <- samples_colors(as.vector(d$sample))
230 |   p <- ggplot(d, aes(x=met, y=expr)) +
231 |     stat_smooth(method=lm, color='black') +
232 |     geom_point(aes(color=sample), size=2) +
233 |     scale_color_manual(values=cols) +
234 |     xlab(sprintf('Methylation PC %d', pcm_)) +
235 |     ylab(sprintf('Expression PC %d', pce_)) +
236 |     guides(color=F) + theme_pub()
237 |   return (p)
238 | }
239 | ```
240 | 
241 | ```{r}
242 | r <- pcs %>% group_by(pcm, pce) %>% summarise(r=cor(expr, met)) %>% ungroup %>%
243 |   arrange(desc(abs(r))) %>% head(10)
244 | ```
245 | 
246 | ```{r results='asis'}
247 | xtable(r, digits=c(0, 0, 0, 3))
248 | ```
249 | 
250 | ```{r fig.width=6, fig.height=6}
251 | for (i in 1:nrow(r)) {
252 |   print(plot_cor_pc(r[i,]$pcm, r[i,]$pce))
253 | }
254 | ```
255 | 
256 | 
257 | 
258 | 
259 | 
260 | 
261 | ```{r}
262 | opts_chunk$set(eval=F)
263 | ```
264 | 
265 | 
266 | ```{r cache=T}
267 | X <- t(ev)
268 | Y <- t(mc)
269 | rc <- rcc(X, Y, 0.1, 0.5)
270 | ```
271 | 
272 | ```{r}
273 | xs <- rc$scores$xscores
274 | ys <- rc$scores$yscores
275 | d <- data.frame(sample=rownames(xs), x=xs[,1], y=ys[,2])
276 | d <- d %>% inner_join(dat$samples, by='sample')
277 | d$sample_short <- sub('^CSC\\d+_', '', d$sample)
278 | 
279 | p1 <- ggplot(d, aes(x=x, y=y)) + geom_point(aes(color=method)) +
280 |   geom_text(aes(label=sample_short), size=2.5, just=0.5, vjust=-0.2) +
281 |   xlab('Dimension 1') + ylab('Dimension 2') +
282 |   theme(legend.position='bottom')
283 | 
284 | d <- data.frame(x=1:22, y=rc$cor[1:22])
285 | p2 <- ggplot(d, aes(x=x, y=y)) + geom_bar(stat='identity', fill='salmon') +
286 |   xlab('CC 2') + ylab('Correlation')
287 | 
288 | grid.arrange(p1, p2, ncol=2)
289 | ```
290 | 
291 | ```{r}
292 | plt.cc(rc)
293 | ```
294 | 


--------------------------------------------------------------------------------
/cor_bulk/expr/index.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Correlation expression scM&T and Ficz
 3 | date: 151022
 4 | output:
 5 |   html_document:
 6 |     toc: yes
 7 | ---
 8 | 
 9 | ```{r, include=F}
10 | library(knitr)
11 | opts_chunk$set(echo=F, warning=F, message=F)
12 | ```
13 | 
14 | ```{r, include=F}
15 | library(ggplot2)
16 | library(dplyr)
17 | library(tidyr)
18 | ```
19 | 
20 | ```{r}
21 | opts <- list()
22 | opts$file1 <- 'data/ficz/counts.csv'
23 | opts$file2 <- 'data/m15P3/bsr3.csv'
24 | ```
25 | 
26 | ```{r}
27 | dat <- list()
28 | ```
29 | 
30 | ```{r}
31 | d <- read.table(opts$file1, head=T) %>% tbl_df
32 | names(d) <- c('ens_id', 'bulk')
33 | dat$e1 <- d
34 | ```
35 | 
36 | ```{r}
37 | d <- read.table(opts$file2, head=T) %>% tbl_df
38 | d <- d %>% gather(sample, expr, -ens_id)
39 | d <- d %>% group_by(ens_id) %>% summarise(bulk=mean(expr))
40 | dat$e2 <- d
41 | ```
42 | 
43 | ```{r}
44 | dat$e <- dat$e1 %>% inner_join(dat$e2, by='ens_id')
45 | ```
46 | 
47 | ```{r}
48 | ggplot(dat$e, aes(x=bulk.x, y=bulk.y)) + geom_point(size=0.5) +
49 |   xlim(0, 1000) + ylim(0, 1000)
50 | ```
51 | 
52 | ```{r}
53 | cor(dat$e$bulk.x, dat$e$bulk.y)
54 | ```
55 | 
56 | 
57 | 
58 | 


--------------------------------------------------------------------------------
/cor_bulk/met/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Correlation methylation scM&T and Ficz
  3 | date: 151022
  4 | output:
  5 |   html_document:
  6 |     toc: yes
  7 | ---
  8 | 
  9 | ```{r, include=F}
 10 | library(knitr)
 11 | opts_chunk$set(echo=F, warning=F, message=F)
 12 | ```
 13 | 
 14 | ```{r, include=F}
 15 | library(ggplot2)
 16 | library(dplyr)
 17 | library(tidyr)
 18 | source('./src.R')
 19 | source('./../../lib/utils.R')
 20 | ```
 21 | 
 22 | ```{r}
 23 | opts <- list()
 24 | opts$met1_file <- 'data/ficz/rates.txt'
 25 | opts$met2_file <- 'data/m15P3/rates.txt'
 26 | opts$met3_file <- 'data/m14/rates.txt'
 27 | opts$met2_samples_file <- './data/m15P3/samples/samples.csv'
 28 | opts$nrow <- 10000000
 29 | opts$cache <- T
 30 | ```
 31 | 
 32 | ```{r}
 33 | dat <- list()
 34 | ```
 35 | 
 36 | ```{r read_met1, cache=opts$cache}
 37 | dat$met1 <- read_values(opts$met1_file, n=opts$nrow) %>% unlist
 38 | names(dat$met1) <- NULL
 39 | ```
 40 | 
 41 | ```{r read_met2, cache=opts$cache}
 42 | read_samples <- function(filename) {
 43 |   h <- read.table(filename, sep='\t', head=F) %>% tbl_df
 44 |   colnames(h) <- c('id')
 45 |   h <- as.vector(h$id)
 46 |   return (h)
 47 | }
 48 | 
 49 | samples <- read_samples(opts$met2_samples_file)
 50 | dat$met2 <- read_values(opts$met2_file, samples=samples, n=opts$nrow) %>%
 51 |   rowMeans(na.rm=T)
 52 | ```
 53 | 
 54 | ```{r cache=opts$cache}
 55 | dat$met3 <- read_values(opts$met3_file, n=opts$nrow) %>%
 56 |   rowMeans(na.rm=T)
 57 | ```
 58 | 
 59 | ```{r}
 60 | stopifnot(length(dat$met1) == length(dat$met2))
 61 | stopifnot(length(dat$met1) == length(dat$met3))
 62 | d <- data.frame(ficz=dat$met1, m15=dat$met2, m14=dat$met3) %>% tbl_df
 63 | d <- d[!(rowSums(is.na(d)) == ncol(d)),]
 64 | dat$met <- d
 65 | ```
 66 | 
 67 | ```{r}
 68 | r <- cor(dat$met, use='complete.obs')
 69 | r
 70 | ```
 71 | 
 72 | ```{r}
 73 | plot_cor <- function(name, nmax=50000) {
 74 |   d <- data.frame(x=dat$met$ficz, y=dat$met[[name]])
 75 |   d <- d[complete.cases(d),]
 76 |   if (nrow(d) > nmax) {
 77 |     d <- d[sample(nrow(d), nmax),]
 78 |   }
 79 |   p <- ggplot(d, aes(x=x, y=y)) +
 80 |     geom_abline(slope=1, linetype='dashed') +
 81 |     geom_point(color='blue', size=0.3) +
 82 |     xlab('Ficz') + ylab(name) + theme_pub()
 83 |   return (p)
 84 | }
 85 | ```
 86 | 
 87 | ## scM&T-Seq
 88 | 
 89 | ```{r fig.width=8, fig.height=8}
 90 | plot_cor('m15')
 91 | ```
 92 | 
 93 | ## scBS-Seq
 94 | 
 95 | ```{r fig.width=8, fig.height=8}
 96 | plot_cor('m14')
 97 | ```
 98 | 
 99 | ## Both
100 | 
101 | ```{r}
102 | nmax <- 50000
103 | d <- dat$met %>% gather(set, value, -ficz)
104 | d <- d[complete.cases(d),]
105 | d <- d %>% group_by(set) %>% sample_n(nmax) %>% ungroup
106 | d <- d %>% mutate(set=factor(set, levels=c('m15', 'm14'), labels=c('scM&T', 'scBS14')))
107 | ```
108 | 
109 | ```{r fig.width=8, fig.height=8}
110 | h <- d %>% mutate(set=as.vector(set)) %>% arrange(set)
111 | colors <- c('scBS14'='green4', 'scM&T'='royalblue')
112 | p <- ggplot(h, aes(x=ficz, y=value)) +
113 |   geom_abline(slope=1, linetype='dashed') +
114 |   geom_point(aes(color=set), size=0.8) +
115 |   scale_color_manual(values=colors) +
116 |   xlab('Methylation bulk') + ylab('Methylation merged scM&T') + theme_pub()
117 | print(p)
118 | ```
119 | 
120 | ```{r}
121 | ggsave(p, file='fig_bulk.pdf', width=8, height=8)
122 | ```
123 | 
124 | ```{r eval=F}
125 | write.csv(p$data, 'S7.csv')
126 | ```
127 | 


--------------------------------------------------------------------------------
/cor_bulk/met/src.R:
--------------------------------------------------------------------------------
 1 | read_meta <- function(filename, n=NULL) {
 2 |   h <- 'cut -f 2-5,7,8,12'
 3 |   if (!is.null(n)) {
 4 |     h <- sprintf('head -n %d %s | %s', n, filename, h)
 5 |   } else {
 6 |     h <- sprintf('%s %s', h, filename)
 7 |   }
 8 |   h <- 'cut -f 2-5,7,8,12'
 9 |   if (!is.null(n)) {
10 |     h <- sprintf('head -n %d | %s', n, h)
11 |   }
12 |   h <- read.table(pipe(h), head=T, sep='\t')
13 |   names(h) <- tolower(names(h))
14 |   h <- h %>% rename(chromo=chromosome)
15 |   h <- h %>% tbl_df
16 |   return (h)
17 | }
18 | 
19 | read_all_meta <- function(filenames) {
20 |   m <- read_meta(filenames[1])
21 |   if (length(filenames) > 1) {
22 |     for (i in 2:length(filenames)) {
23 |       s <- read_meta_quick(filenames[i])
24 |       stopifnot(all(m$start == s$start))
25 |     }
26 |   }
27 |   return (m)
28 | }
29 | 
30 | read_values <- function(filename, samples=NULL, n=NULL) {
31 |   h <- 'cut -f 13-'
32 |   if (!is.null(n)) {
33 |     h <- sprintf('head -n %d %s | %s', n, filename, h)
34 |   } else {
35 |     h <- sprintf('%s %s', h, filename)
36 |   }
37 | 
38 |   h <- read.table(pipe(h), head=T, sep='\t')
39 |   if (!is.null(samples)) {
40 |     h <- subset(h, select=intersect(colnames(h), samples))
41 |   }
42 |   h <- h %>% tbl_df
43 |   return (h)
44 | }
45 | 
46 | read_all_values <- function(filenames, samples=NULL, n=NULL) {
47 |   d <- lapply(filenames, function(x) read_values(x, samples=samples, n=n))
48 |   e <- list()
49 |   for (dd in d) {
50 |     if (ncol(dd) > 0) {
51 |       e[[length(e) + 1]] <- dd
52 |     }
53 |   }
54 |   d <- e
55 |   stopifnot(length(d) > 0)
56 |   h <- d[[1]]
57 |   if (length(d) > 1) {
58 |     for (i in 2:length(d)) {
59 |       if (ncol(d[[i]]) > 0) {
60 |         h <- cbind.data.frame(h, d[[i]])
61 |       }
62 |     }
63 |   }
64 |   h <- h %>% tbl_df
65 |   return (h)
66 | }
67 | 


--------------------------------------------------------------------------------
/cov/m14/index.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Coverage analysis scBS14
 3 | output:
 4 |   html_document:
 5 |     toc: yes
 6 | ---
 7 | 
 8 | ```{r, include=F}
 9 | library(knitr)
10 | opts_chunk$set(echo=F, warning=F, message=F)
11 | ```
12 | 
13 | ```{r, include=F}
14 | library(ggplot2)
15 | library(dplyr)
16 | library(tidyr)
17 | source('../../lib/utils.R')
18 | ```
19 | 
20 | ```{r}
21 | opts <- list()
22 | opts$reports <- '../data/m14'
23 | opts$names_ <- list.dirs(opts$reports, full=F, rec=F)
24 | ```
25 | 
26 | ```{r}
27 | dat <- list()
28 | cmp <- list()
29 | ```
30 | 
31 | ```{r, cache=T}
32 | get_cov <- function(f) {
33 |   d <- read_report_values(f)
34 |   d <- d[,grep('Ser', colnames(d))]
35 |   stopifnot(ncol(d) == 20)
36 |   cov <- rowSums(d > 0)
37 |   return (cov)
38 | }
39 | 
40 | d <- list()
41 | for (name in opts$names_) {
42 |   f <- file.path(opts$reports, name, 'weights.txt')
43 |   if (file.exists(f)) {
44 |     message(f)
45 |     cov <- get_cov(f)
46 |     di <- data.frame(name=name, id_=1:length(cov), cov=cov)
47 |     d[[length(d) + 1]] <- di
48 |   }
49 | }
50 | d <- do.call(rbind.data.frame, d) %>% tbl_df
51 | d <- d %>% mutate(name=factor(name)) %>%
52 |   mutate(name=factor(name, levels=rev(sort(levels(name)))))
53 | cov <- d
54 | ```
55 | 
56 | ```{r}
57 | cmp$cov <- cov
58 | ```
59 | 
60 | ```{r}
61 | cov_min <- function(d, n) {
62 |   h <- sapply(0:n, function(x) sum(d$cov >= x) / length(d$cov))
63 |   h <- data.frame(name=d$name[[1]], n=1:length(h), per=h)
64 |   return (h)
65 | }
66 | 
67 | n <- max(cmp$cov$cov)
68 | cmp$cov_min <- cmp$cov %>% group_by(name) %>% do(cov_min(., n)) %>% ungroup %>% tbl_df
69 | ```
70 | 
71 | ```{r}
72 | saveRDS(cmp, 'cov.rds')
73 | ```
74 | 
75 | ```{r fig.width=8, fig.height=6}
76 | d <- cmp$cov_min %>% mutate(name=factor(name, levels=rev(levels(name))))
77 | ggplot(d, aes(x=n, y=per, color=name)) +
78 |   geom_line(lwd=0.6) +
79 |   xlab('Minimum number of cell') + ylab('Coverage') +
80 |   theme_pub() +
81 |   guides(color=guide_legend(title='Context', ncol=4))
82 | ```
83 | 
84 | ```{r fig.width=12, fig.height=10}
85 | d <- cmp$cov_min %>% mutate(name=factor(name, levels=rev(levels(name))))
86 | ggplot(d, aes(x=n, y=per, color=name)) +
87 |   geom_line(lwd=0.6) +
88 |   facet_wrap(~name) +
89 |   xlab('Minimum number of cell') + ylab('Coverage') +
90 |   theme_pub() +
91 |   guides(color=guide_legend(title='Context', ncol=4))
92 | ```
93 | 


--------------------------------------------------------------------------------
/cov/m14_m15P3/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Coverage analysis
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(ggplot2)
 15 | library(dplyr)
 16 | library(tidyr)
 17 | source('../../lib/utils.R')
 18 | ```
 19 | 
 20 | ```{r}
 21 | opts <- list()
 22 | opts$colors <- c('scBS-Seq'='forestgreen', 'scM&T-Seq'='royalblue')
 23 | opts$global <- c('10kb'='w10k_s1k', '5kb'='w5k_s1k', '1kb'='w1k_s1k')
 24 | ```
 25 | 
 26 | ```{r}
 27 | dat <- list()
 28 | dat$m14 <- readRDS('../m14/cov.rds')
 29 | dat$m15 <- readRDS('../m15P3/cov.rds')
 30 | ```
 31 | 
 32 | ```{r}
 33 | d <- list()
 34 | for (n in c('m14', 'm15')) {
 35 |   di <- dat[[n]]$cov_min
 36 |   di$np <- di$n / max(di$n)
 37 |   di$study <- n
 38 |   d[[length(d) + 1]] <- di
 39 | }
 40 | d <- do.call(rbind.data.frame, d) %>%
 41 |   mutate(
 42 |     study=factor(study, levels=c('m14', 'm15'), labels=c('scBS-Seq', 'scM&T-Seq')),
 43 |     name=factor(name)) %>%
 44 |   mutate(name=factor(name, levels=sort(levels(name))))
 45 | dat$cov_min <- d
 46 | ```
 47 | 
 48 | ## Genome-wide
 49 | 
 50 | ```{r}
 51 | d <- dat$cov_min %>% filter(name %in% opts$global) %>%
 52 |   mutate(name=factor(name, levels=opts$global,
 53 |   labels=names(opts$global))) %>% droplevels
 54 | ```
 55 | 
 56 | ```{r eval=F}
 57 | write.csv(d, 'S4.csv')
 58 | ```
 59 | 
 60 | ```{r fig.width=8, fig.height=6}
 61 | p <- ggplot(d, aes(x=n, y=per * 100, color=study)) +
 62 |   geom_line(aes(linetype=name), lwd=0.8) +
 63 |   scale_color_manual(values=opts$colors) +
 64 |   guides(color=guide_legend(title='Study'),
 65 |         linetype=guide_legend(title='Window size')) +
 66 |   xlab('Minimum number of cells') +
 67 |   ylab('Percentage of genomic contexts covered') +
 68 |   theme_pub() +
 69 |   theme(
 70 |     axis.title.x=element_text(vjust=-0.8),
 71 |     axis.title.y=element_text(vjust=1.2)
 72 |   )
 73 | print(p)
 74 | ```
 75 | 
 76 | ```{r eval=F}
 77 | ggsave(p, file='fig_genome.pdf', width=8, height=6)
 78 | ```
 79 | 
 80 | ## Context specific
 81 | 
 82 | ```{r}
 83 | d <- dat$cov_min %>% filter(!(name %in% opts$global)) %>% droplevels
 84 | ```
 85 | 
 86 | ```{r eval=F}
 87 | write.csv(d, 'S5.csv')
 88 | ```
 89 | 
 90 | ```{r fig.width=12, fig.height=12}
 91 | p <- ggplot(d, aes(x=n, y=per * 100, color=study)) +
 92 |   geom_line(lwd=0.8) +
 93 |   scale_color_manual(values=opts$colors) +
 94 |   guides(color=guide_legend(title='Study')) +
 95 |   facet_wrap(~name, ncol=3) +
 96 |   xlab('Minimum number of cells') +
 97 |   ylab('Percentage of sites covered') +
 98 |   theme_pub() +
 99 |   theme(
100 |     axis.title.x=element_text(vjust=-0.8),
101 |     axis.title.y=element_text(vjust=1.2)
102 |   )
103 | print(p)
104 | ```
105 | 
106 | ```{r eval=F}
107 | ggsave(p, file='fig_context.pdf', width=12, height=13)
108 | ```
109 | 


--------------------------------------------------------------------------------
/cov/m15P3/index.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Coverage analysis CSCP3
 3 | output:
 4 |   html_document:
 5 |     toc: yes
 6 | ---
 7 | 
 8 | ```{r, include=F}
 9 | library(knitr)
10 | opts_chunk$set(echo=F, warning=F, message=F)
11 | ```
12 | 
13 | ```{r, include=F}
14 | library(ggplot2)
15 | library(dplyr)
16 | library(tidyr)
17 | source('../../lib/utils.R')
18 | ```
19 | 
20 | ```{r}
21 | opts <- list()
22 | opts$reports <- '../data/m15P3'
23 | opts$samples <- '../../data/samples/samples.csv'
24 | opts$names_ <- list.dirs(opts$reports, full=F, rec=F)
25 | ```
26 | 
27 | ```{r}
28 | dat <- list()
29 | dat$samples <- read_samples_list(opts$samples)
30 | cmp <- list()
31 | ```
32 | 
33 | ```{r, cache=T}
34 | get_cov <- function(f) {
35 |   d <- read_report_values(f, samples=dat$samples)
36 |   cov <- rowSums(d > 0)
37 |   return (cov)
38 | }
39 | 
40 | d <- list()
41 | for (name in opts$names_) {
42 |   f <- file.path(opts$reports, name, 'weights.txt')
43 |   if (file.exists(f)) {
44 |     message(f)
45 |     cov <- get_cov(f)
46 |     di <- data.frame(name=name, id_=1:length(cov), cov=cov)
47 |     d[[length(d) + 1]] <- di
48 |   }
49 | }
50 | d <- do.call(rbind.data.frame, d) %>% tbl_df
51 | d <- d %>% mutate(name=factor(name)) %>%
52 |   mutate(name=factor(name, levels=rev(sort(levels(name)))))
53 | cov <- d
54 | ```
55 | 
56 | ```{r}
57 | cmp$cov <- cov
58 | ```
59 | 
60 | ```{r}
61 | cov_min <- function(d, n) {
62 |   h <- sapply(0:n, function(x) sum(d$cov >= x) / length(d$cov))
63 |   h <- data.frame(name=d$name[[1]], n=1:length(h), per=h)
64 |   return (h)
65 | }
66 | 
67 | n <- max(cmp$cov$cov)
68 | cmp$cov_min <- cmp$cov %>% group_by(name) %>% do(cov_min(., n)) %>% ungroup %>% tbl_df
69 | ```
70 | 
71 | ```{r}
72 | saveRDS(cmp, 'cov.rds')
73 | ```
74 | 
75 | ```{r fig.width=8, fig.height=6}
76 | d <- cmp$cov_min %>% mutate(name=factor(name, levels=rev(levels(name))))
77 | ggplot(d, aes(x=n, y=per, color=name)) +
78 |   geom_line(lwd=0.6) +
79 |   xlab('Minimum number of cell') + ylab('Coverage') +
80 |   theme_pub() +
81 |   guides(color=guide_legend(title='Context', ncol=4))
82 | ```
83 | 
84 | ```{r fig.width=12, fig.height=10}
85 | d <- cmp$cov_min %>% mutate(name=factor(name, levels=rev(levels(name))))
86 | ggplot(d, aes(x=n, y=per, color=name)) +
87 |   geom_line(lwd=0.6) +
88 |   facet_wrap(~name) +
89 |   xlab('Minimum number of cell') + ylab('Coverage') +
90 |   theme_pub() +
91 |   guides(color=guide_legend(title='Context', ncol=4))
92 | ```
93 | 


--------------------------------------------------------------------------------
/data/expr/data_proc/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Preprocessing expression data
  3 | date: 151014
  4 | output:
  5 |   html_document:
  6 |     toc: yes
  7 | ---
  8 | 
  9 | ```{r, include=F}
 10 | library(knitr)
 11 | opts_chunk$set(echo=F, warning=F, message=F)
 12 | options(dplyr.width=200)
 13 | ```
 14 | 
 15 | ```{r, include=F}
 16 | library(dplyr)
 17 | library(tidyr)
 18 | library(ggplot2)
 19 | library(RColorBrewer)
 20 | library(gplots)
 21 | library(gridExtra)
 22 | source('../../../lib/utils.R')
 23 | ```
 24 | 
 25 | ```{r}
 26 | opts <- list()
 27 | opts$expr_files <- c('../data_raw/counts.txt')
 28 | opts$genes_file <- '../data_raw/genes.txt'
 29 | opts$out_base <- './data'
 30 | opts$samples_file <- '../../samples/samples_stats.csv'
 31 | opts$samples_select_file <- '../../samples/samples.csv'
 32 | opts$plot_heat <- T
 33 | ```
 34 | 
 35 | ```{r opts_fil_expr, echo=T}
 36 | opts$fil$expr$cov <- 10 # Minimum number of cells with minimum number of counts
 37 | opts$fil$expr$counts <- 10 # Minimum number of raw counts
 38 | opts$fil$expr$top_var <- 7500 # Number of most variable genes to be selected
 39 | opts$fil$expr$include <- NULL
 40 | opts$samples_methods <- NULL
 41 | opts$samples_batch2 <- NULL
 42 | opts$adjust <- F
 43 | ```
 44 | 
 45 | ```{r}
 46 | read_expr <- function(filename, log=T, gather=T) {
 47 |   d <- read.csv(filename, head=T, sep='\t')
 48 |   stopifnot(sum(duplicated(d$ens_id)) == 0)
 49 |   h <- names(d) != 'ens_id'
 50 |   if (log) {
 51 |     d[,h] <- log_counts(d[,h])
 52 |   }
 53 |   d$id_ <- 1:nrow(d)
 54 |   if (gather) {
 55 |     d <- gather(d, sample, value, -c(id_, ens_id))
 56 |     d <- d %>% tbl_df %>% rename(expr=value)
 57 |   }
 58 |   return (d)
 59 | }
 60 | 
 61 | read_all_expr <- function(filenames, log=T, gather=T, samples=NULL) {
 62 |   d <- NULL
 63 |   for (filename in filenames) {
 64 |     h <- read_expr(filename, log=log, gather=F) %>% select(-id_)
 65 |     if (!is.null(samples)) {
 66 |       h <- subset(h, select=intersect(colnames(h), c('ens_id', samples)))
 67 |     }
 68 |     if (is.null(d)) {
 69 |       d <- h
 70 |     } else {
 71 |       d <- d %>% inner_join(h, by='ens_id')
 72 |     }
 73 |   }
 74 |   d$id_ <- 1:nrow(d)
 75 | 
 76 |   if (gather) {
 77 |     d <- gather(d, sample, value, -c(id_, ens_id))
 78 |     d <- d %>% rename(expr=value)
 79 |   }
 80 |   d <- d %>% mutate(ens_id=factor(ens_id)) %>% tbl_df
 81 |   return (d)
 82 | }
 83 | 
 84 | read_samples <- function(filename) {
 85 |   h <- read.table(filename, sep='\t', head=T) %>% tbl_df
 86 |   h$sample <- h$id
 87 |   return (h)
 88 | }
 89 | 
 90 | read_samples_select <- function(filename) {
 91 |   h <- read.table(filename, sep='\t', head=F) %>% tbl_df
 92 |   colnames(h) <- c('id')
 93 |   h <- as.vector(h$id)
 94 |   return (h)
 95 | }
 96 | 
 97 | filter_expr <- function(d, cov, counts, include=NULL) {
 98 |   lcounts <- log_counts(counts)
 99 |   if (cov < 1) {
100 |     n <- length(levels(d$sample))
101 |     ncov <- n * cov
102 |   } else {
103 |     ncov <- cov
104 |   }
105 |   f <- d %>% group_by(id_, ens_id) %>%
106 |     dplyr::summarize(cov=sum(expr >= lcounts)) %>%
107 |     filter(cov >= ncov) %>% ungroup %>% select(ens_id) %>% unlist %>% as.vector
108 |   if (!is.null(include)) {
109 |     f <- union(f, include)
110 |   }
111 |   df <- d %>% filter(ens_id %in% f)
112 |   return (df)
113 | }
114 | 
115 | filter_expr_var <- function(d, n, include=NULL) {
116 |   f <- d %>% group_by(id_, ens_id) %>%
117 |     dplyr::summarize(var=var(expr)) %>% as.data.frame %>%
118 |     dplyr::arrange(desc(var)) %>% head(n) %>%
119 |     ungroup %>% select(ens_id) %>% unlist %>% as.vector
120 |   if (!is.null(include)) {
121 |     f <- union(f, include)
122 |   }
123 |   df <- d %>% filter(ens_id %in% f)
124 |   return (df)
125 | }
126 | 
127 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL,
128 |   sel=NULL) {
129 |   f <- excl
130 |   if (!is.null(f)) {
131 |     d <- d %>% filter(!(sample %in% f))
132 |   }
133 |   f <- methods
134 |   if (!is.null(f) && 'method' %in% colnames(d)) {
135 |     d <- d %>% filter(method %in% f)
136 |   }
137 |   f <- batch2
138 |   if (!is.null(f) && 'batch2' %in% colnames(d)) {
139 |     d <- d %>% filter(batch2 %in% f)
140 |   }
141 |   if (!is.null(sel)) {
142 |     d <- d %>% filter(sample %in% sel)
143 |   }
144 |   d <- droplevels(d)
145 | 
146 |   return (d)
147 | }
148 | ```
149 | 
150 | ```{r read}
151 | dat <- list()
152 | dat$samples <- read_samples(opts$samples_file)
153 | dat$samples_select <- read_samples_select(opts$samples_select_file)
154 | dat$samples <- filter_samples(dat$samples,
155 |   excl=opts$samples_excl,
156 |   methods=opts$samples_methods,
157 |   sel=dat$samples_select)
158 | 
159 | dat$expr <- read_all_expr(opts$expr_files, samples=levels(dat$samples$sample))
160 | stopifnot(length(levels(dat$expr$sample)) == length(levels(dat$samples$sample)))
161 | ```
162 | 
163 | `r nrow(dat$samples)` samples
164 | 
165 | 
166 | ```{r filter}
167 | dat$fil$expr <- filter_expr(dat$expr, opts$fil$expr$cov, opts$fil$expr$counts, opts$fil$expr$include)
168 | ```
169 | 
170 | ```{r}
171 | n_unique <- function(d) {
172 |   return (length(unique(d)))
173 | }
174 | ```
175 | 
176 | `r n_unique(dat$fil$expr$id_)` out of `r n_unique(dat$expr$id_)` genes
177 | passed expression filter.
178 | 
179 | 
180 | ```{r adjust, eval=opts$adjust}
181 | adjust_batch_expr <- function(d, what='batch2') {
182 |   d <- d %>% rename(y=expr)
183 |   stopifnot(all(!is.na(d$y)))
184 |   d <- d %>% inner_join(select_(dat$samples, 'sample', x=what), by='sample')
185 |   h <- d %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup
186 |   d <- h %>% select(-c(y, x)) %>% rename(expr=yr)
187 | }
188 | 
189 | dat$fil$expr <- adjust_batch_expr(dat$fil$expr)
190 | ```
191 | 
192 | Taking the `r opts$fil$expr$top_var` most variable genes.
193 | 
194 | ```{r filter_var}
195 | dat$fil$expr_var <- filter_expr_var(dat$fil$expr, opts$fil$expr$top_var, opts$fil$expr$include)
196 | ```
197 | 
198 | ```{r}
199 | read_genes <- function(filename) {
200 |   h <- read.table(pipe(paste('cut -f 2-5,7,8', filename)), head=T, sep='\t')
201 |   names(h) <- tolower(names(h))
202 |   h <- h %>% rename(chromo=chromosome, ens_id=id, gene_id=feature)
203 |   return (h)
204 | }
205 | 
206 | dat$genes <- read_genes(opts$genes_file)
207 | ```
208 | 
209 | ```{r join_genes}
210 | dat$out <- dat$fil$expr_var %>%
211 |   inner_join(dat$genes, by=c('ens_id'))
212 | dat$out$ens_id <- factor(dat$out$ens_id)
213 | dat$out <- dat$out %>% arrange(ens_id)
214 | dat$outm <- dat$out %>% spread(sample, expr)
215 | dat$fil$genes <- dat$genes %>% inner_join(select(dat$outm, ens_id, id_), by='ens_id')
216 | dat$fil$genes$ens_id <- as.factor(dat$fil$genes$ens_id)
217 | stopifnot(nrow(dat$fil$genes) == nrow(dat$outm))
218 | ```
219 | 
220 | `r length(unique(dat$out$id_))` final records.
221 | 
222 | ```{r write}
223 | save_rds <- function(d, name) {
224 |   f <- sprintf('%s_%s.rds', opts$out_base, name)
225 |   saveRDS(d, f)
226 | }
227 | 
228 | dat$out <- dat$out %>% tbl_df
229 | dat$fil$genes <- dat$fil$genes %>% tbl_df
230 | dat$genes <- dat$genes %>% tbl_df
231 | 
232 | save_rds(dat$out, 'expr')
233 | save_rds(dat$fil$genes, 'meta')
234 | save_rds(dat$genes, 'meta_all')
235 | ```
236 | 
237 | 
238 | ```{r}
239 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') {
240 |   t <- data.frame(sample=factor(rownames(pc_vec)),
241 |     pcx=pc_vec[,x], pcy=pc_vec[,y])
242 |   t <- t %>% left_join(dat$samples, by='sample')
243 |   t$sample_short <- t$sample
244 |   t$sample_short <- sub('^CSCP3_SERUM_', '', t$sample)
245 |   p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) +
246 |     geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) +
247 |     xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) +
248 |     theme(legend.position='right')
249 |   return (p)
250 | }
251 | 
252 | plot_pca_val <- function(pc_val) {
253 |   t <- data.frame(pc=1:length(pc_val), val=pc_val)
254 |   p <- ggplot(t, aes(x=pc, y=val)) +
255 |     geom_bar(stat='identity', fill='salmon', color='black') +
256 |     xlab('principle component') +
257 |     ylab('% variance explained')
258 |   return (p)
259 | }
260 | 
261 | plot_heat <- function(d, Rowv=T, xlab='value') {
262 |   colors <- rev(brewer.pal(9, 'Spectral'))
263 |   colors <- colorRampPalette(colors)(50)
264 |   labRow <- NA
265 |   if (nrow(d) > 500) {
266 |     dendro='column'
267 |   } else {
268 |     dendro = 'both'
269 |   }
270 |   p <- heatmap.2(d, density.info='none', trace='none', col=colors,
271 |     Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA,
272 |     lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab)
273 |   return (p)
274 | }
275 | ```
276 | 
277 | ```{r}
278 | d <- dat$out %>% select(id_, sample, expr) %>% spread(sample, expr) %>%
279 |   select(-id_)
280 | pc <- pca(as.matrix(d))
281 | ```
282 | 
283 | ```{r fig.width=8, fig.height=7}
284 | p1 <- plot_pca_vec(pc$vec)
285 | p1 + theme(legend.position='top')
286 | ```
287 | 
288 | ```{r fig.width=10, fig.height=5}
289 | p2 <- plot_pca_val(pc$val)
290 | p2
291 | ```
292 | 
293 | ```{r message=F, echo=F, eval=opts$plot_heat, fig.width=10, fig.height=10}
294 | max_rows <- 1000
295 | if (nrow(d) > max_rows) {
296 |   d <- d %>% sample_n(max_rows)
297 | }
298 | h <- plot_heat(as.matrix(d))
299 | ```
300 | 


--------------------------------------------------------------------------------
/data/join/README.md:
--------------------------------------------------------------------------------
 1 | Preprocessed data of scM&T
 2 | ==========================
 3 | 
 4 | 
 5 | ```r
 6 | scMT <- readRDS('data.rds')
 7 | cols <- c('name', 'id_.x', 'id_.y', 'sample', 'met', 'weight', 'expr',
 8 |   'chromo.x', 'start.x', 'end.x', 'chromo.y', 'start.y', 'end.y', 'strand',
 9 |   'ens_id', 'gene_id')
10 | scMT <- scMT[, c(cols)]
11 | glimpse(scMT)
12 | ```
13 | 
14 | ```
15 | ## Observations: 7,709,302
16 | ## Variables: 16
17 | ## $ name     (fctr) H3K27ac, H3K27ac, H3K27ac, H3K27ac, H3K27ac, H3K27ac...
18 | ## $ id_.x    (int) 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 1...
19 | ## $ id_.y    (int) 9848, 9848, 9848, 9848, 9848, 9848, 9848, 9848, 9848,...
20 | ## $ sample   (fctr) CSCP3_SERUM_A02, CSCP3_SERUM_A03, CSCP3_SERUM_A04, C...
21 | ## $ met      (dbl) 70.588234, 100.000000, NaN, 8.333333, 16.666666, 0.00...
22 | ## $ weight   (dbl) 17, 12, 0, 12, 6, 6, 0, 5, 0, 40, 17, 0, 5, 15, 13, 3...
23 | ## $ expr     (dbl) 0.4506199, 1.2592708, 0.4329859, 0.4880455, 0.3933367...
24 | ## $ chromo.x (fctr) 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, ...
25 | ## $ start.x  (int) 108920349, 108920349, 108920349, 108920349, 108920349...
26 | ## $ end.x    (int) 108950783, 108950783, 108950783, 108950783, 108950783...
27 | ## $ chromo.y (fctr) 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, ...
28 | ## $ start.y  (int) 108909110, 108909110, 108909110, 108909110, 108909110...
29 | ## $ end.y    (int) 108912460, 108912460, 108912460, 108912460, 108912460...
30 | ## $ strand   (fctr) +, +, +, +, +, +, +, +, +, +, +, +, +, +, +, +, +, +...
31 | ## $ ens_id   (fctr) ENSMUSG00000000142, ENSMUSG00000000142, ENSMUSG00000...
32 | ## $ gene_id  (fctr) Axin2, Axin2, Axin2, Axin2, Axin2, Axin2, Axin2, Axi...
33 | ```
34 | 
35 | * `name`: Name of annotation.
36 | * `id_.x`: Identifier of methylated region.
37 | * `id_.y`: Identifier of gene matched to methylated region.
38 | * `sample`: Cell identifier.
39 | * `met`: Methylation rate of methylated region.
40 | * `weight`: Weight of methylated region proportional to the number of covered
41 |   CpG sites in that region.
42 | * `expr: Expression rate of gene.
43 | * `ens_id`: ENSEMBL gene identifier.
44 | * `gene_id`: Gene name.
45 | * `chromo.x`: Chromosome of methylated region.
46 | * `start.x`: Start position of methylated region.
47 | * `end.x`: End position of methylated region.
48 | * `chromo.y`: Chromosome of gene.
49 | * `start.y`: Start position of gene.
50 | * `end.y`: End position of gene.
51 | 
52 | 


--------------------------------------------------------------------------------
/data/join/README.rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Preprocessed data of scM&T"
 3 | author: "Christof Angermueller"
 4 | date: "`r format(Sys.time(), '%Y-%m-%d')`"
 5 | output:
 6 |   html_document:
 7 |     toc: no
 8 | ---
 9 | 
10 | ```{r, include=F}
11 | library(knitr)
12 | opts_chunk$set(echo=T, warning=F, message=F)
13 | ```
14 | 
15 | ```{r, include=F}
16 | library(ggplot2)
17 | library(dplyr)
18 | library(tidyr)
19 | ```
20 | 
21 | ```{r}
22 | scMT <- readRDS('data.rds')
23 | cols <- c('name', 'id_.x', 'id_.y', 'sample', 'met', 'weight', 'expr',
24 |   'chromo.x', 'start.x', 'end.x', 'chromo.y', 'start.y', 'end.y', 'strand',
25 |   'ens_id', 'gene_id')
26 | scMT <- scMT[, c(cols)]
27 | glimpse(scMT)
28 | ```
29 | 
30 | * `name`: Name of annotation.
31 | * `id_.x`: Identifier of methylated region.
32 | * `id_.y`: Identifier of gene matched to methylated region.
33 | * `sample`: Cell identifier.
34 | * `met`: Methylation rate of methylated region.
35 | * `weight`: Weight of methylated region proportional to the number of covered
36 |   CpG sites in that region.
37 | * `expr: Expression rate of gene.
38 | * `ens_id`: ENSEMBL gene identifier.
39 | * `gene_id`: Gene name.
40 | * `chromo.x`: Chromosome of methylated region.
41 | * `start.x`: Start position of methylated region.
42 | * `end.x`: End position of methylated region.
43 | * `chromo.y`: Chromosome of gene.
44 | * `start.y`: Start position of gene.
45 | * `end.y`: End position of gene.
46 | 
47 | 


--------------------------------------------------------------------------------
/data/join/data.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PMBio/scMT-seq/cabc3b66bc8e1d9f61bb2b6d903018c0c88d8093/data/join/data.rds


--------------------------------------------------------------------------------
/data/met/join/index.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Join methlation data
 3 | output:
 4 |   html_document:
 5 |     toc: yes
 6 | ---
 7 | 
 8 | ```{r, include=F}
 9 | library(knitr)
10 | opts_chunk$set(echo=F)
11 | ```
12 | 
13 | ```{r, include=F}
14 | library(dplyr)
15 | ```
16 | 
17 | ```{r}
18 | opts <- list()
19 | opts$files <- Sys.glob('../prepro/*/data_met.rds')
20 | ```
21 | 
22 | ```{r join}
23 | d <- list()
24 | dm <- list()
25 | for (f in opts$files) {
26 |   name <- basename(dirname(f))
27 |   d[[name]] <- readRDS(f)
28 |   d[[name]]$name <- name
29 |   fm <- file.path(dirname(f), 'data_meta.rds')
30 |   dm[[name]] <- readRDS(fm)
31 |   dm[[name]]$name <- name
32 | }
33 | h <- sort(names(dm))
34 | d <- do.call(rbind.data.frame, d) %>% mutate(name=factor(name, levels=h))
35 | dm <- do.call(rbind.data.frame, dm) %>% mutate(name=factor(name, levels=h))
36 | d <- d %>% droplevels %>% tbl_df
37 | dm <- dm %>% droplevels %>% tbl_df
38 | saveRDS(d, 'data_met.rds')
39 | saveRDS(dm, 'data_meta.rds')
40 | ```
41 | 


--------------------------------------------------------------------------------
/data/met/prepro/H3K27ac/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Preprocessing Seqmonk methylation files
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(dplyr)
 15 | library(tidyr)
 16 | library(ggplot2)
 17 | library(RColorBrewer)
 18 | library(gplots)
 19 | library(gridExtra)
 20 | source('../../../../lib/utils.R')
 21 | ```
 22 | 
 23 | ```{r opts, echo=T}
 24 | opts <- list()
 25 | opts$fil$cov <- 30
 26 | opts$fil$min_var <- 10
 27 | opts$fil$top_var <- 0.75
 28 | opts$adjust <- F
 29 | opts$scale_adjust <- T
 30 | opts$samples_excl <- NULL
 31 | opts$samples_methods <- NULL
 32 | opts$samples_batch2 <- NULL
 33 | ```
 34 | 
 35 | ```{r}
 36 | opts$name <- basename(getwd())
 37 | opts$seqmonk_dirs <- c(
 38 |   file.path('../../reports', opts$name))
 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt'))
 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt'))
 41 | opts$out_base <- './data'
 42 | opts$samples_file <- '../../../samples/samples_stats.csv'
 43 | opts$samples_select_file <- '../../../samples/samples.csv'
 44 | opts$cache <- F
 45 | opts$plot_heat <- F
 46 | ```
 47 | 
 48 | **Name**: `r opts$name`
 49 | 
 50 | 
 51 | 
 52 | ```{r read_src}
 53 | read_meta <- function(filename) {
 54 |   h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t')
 55 |   names(h) <- tolower(names(h))
 56 |   h <- h %>% rename(chromo=chromosome)
 57 |   h <- h %>% tbl_df
 58 |   return (h)
 59 | }
 60 | 
 61 | read_meta_quick <- function(filename) {
 62 |   h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t')
 63 |   names(h) <- tolower(names(h))
 64 |   h <- h %>% tbl_df
 65 |   return (h)
 66 | }
 67 | 
 68 | read_all_meta <- function(filenames) {
 69 |   m <- read_meta(filenames[1])
 70 |   if (length(filenames) > 1) {
 71 |     for (i in 2:length(filenames)) {
 72 |       s <- read_meta_quick(filenames[i])
 73 |       stopifnot(all(m$start == s$start))
 74 |     }
 75 |   }
 76 |   return (m)
 77 | }
 78 | 
 79 | read_values <- function(filename, samples=NULL) {
 80 |   h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t')
 81 |   if (!is.null(samples)) {
 82 |     h <- subset(h, select=intersect(colnames(h), samples))
 83 |   }
 84 |   h <- h %>% tbl_df
 85 |   return (h)
 86 | }
 87 | 
 88 | read_all_values <- function(filenames, samples=NULL) {
 89 |   d <- lapply(filenames, function(x) read_values(x, samples=samples))
 90 |   e <- list()
 91 |   for (dd in d) {
 92 |     if (ncol(dd) > 0) {
 93 |       e[[length(e) + 1]] <- dd
 94 |     }
 95 |   }
 96 |   d <- e
 97 |   stopifnot(length(d) > 0)
 98 |   h <- d[[1]]
 99 |   if (length(d) > 1) {
100 |     for (i in 2:length(d)) {
101 |       if (ncol(d[[i]]) > 0) {
102 |         h <- cbind.data.frame(h, d[[i]])
103 |       }
104 |     }
105 |   }
106 |   h <- h %>% tbl_df
107 |   return (h)
108 | }
109 | 
110 | read_samples <- function(filename) {
111 |   h <- read.table(filename, sep='\t', head=T) %>% tbl_df
112 |   h$sample <- h$id
113 |   return (h)
114 | }
115 | 
116 | read_samples_select <- function(filename) {
117 |   h <- read.table(filename, sep='\t', head=F) %>% tbl_df
118 |   colnames(h) <- c('id')
119 |   h <- as.vector(h$id)
120 |   return (h)
121 | }
122 | 
123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL,
124 |   sel=NULL) {
125 |   f <- excl
126 |   if (!is.null(f)) {
127 |     d <- d %>% filter(!(sample %in% f))
128 |   }
129 |   f <- methods
130 |   if (!is.null(f) && 'method' %in% colnames(d)) {
131 |     d <- d %>% filter(method %in% f)
132 |   }
133 |   f <- batch2
134 |   if (!is.null(f) && 'batch2' %in% colnames(d)) {
135 |     d <- d %>% filter(batch2 %in% f)
136 |   }
137 |   if (!is.null(sel)) {
138 |     d <- d %>% filter(sample %in% sel)
139 |   }
140 |   d <- droplevels(d)
141 | 
142 |   return (d)
143 | }
144 | ```
145 | 
146 | ```{r}
147 | dat <- list()
148 | dat$samples <- read_samples(opts$samples_file)
149 | dat$samples_select <- read_samples_select(opts$samples_select_file)
150 | dat$samples <- filter_samples(dat$samples,
151 |   excl=opts$samples_excl,
152 |   methods=opts$samples_methods,
153 |   batch2=opts$samples_batch2,
154 |   sel=dat$samples_select)
155 | 
156 | dat$meta <- read_all_meta(opts$rates_files)
157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample))
158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample))
159 | 
160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100)))
161 | stopifnot(all(round(dat$weights) == dat$weights))
162 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
163 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
164 | ```
165 | 
166 | 
167 | 
168 | ## Preprocessing
169 | 
170 | ```{r}
171 | get_cov <- function(d, perc=TRUE) {
172 |   cov <- rowSums(!is.na(d))
173 |   if (perc) {
174 |     cov <- cov / ncol(d)
175 |   }
176 |   return (cov)
177 | }
178 | 
179 | get_var <- function(d) {
180 |   return (apply(d, 1, var, na.rm=T))
181 | }
182 | ```
183 | 
184 | ```{r}
185 | h <- duplicated(dat$meta)
186 | print(sprintf('Remove %d duplicated records.', sum(h)))
187 | dat$meta <- dat$meta[!h, ]
188 | dat$meta$id_ <- 1:nrow(dat$meta)
189 | for (n in c('rates', 'weights')) {
190 |   dat[[n]] <- dat[[n]][!h,]
191 |   dat[[n]]$id_ <- dat$meta$id_
192 | }
193 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
194 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
195 | ```
196 | 
197 | ```{r filter_cov}
198 | d <- dat$rates
199 | print(sprintf('%d records in total.', nrow(d)))
200 | if (!is.na(opts$fil$cov)) {
201 |   d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,]
202 | }
203 | print(sprintf('%d records passed coverage filter.', nrow(d)))
204 | dat$rates <- d
205 | ```
206 | 
207 | ```{r}
208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') {
209 |   t <- data.frame(sample=factor(rownames(pc_vec)),
210 |     pcx=pc_vec[,x], pcy=pc_vec[,y])
211 |   t <- t %>% left_join(dat$samples, by='sample')
212 |   t$sample_short <- sub('^CSCP3_SERUM', '', t$sample)
213 |   p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) +
214 |     geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) +
215 |     xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) +
216 |     theme(legend.position='bottom')
217 |   return (p)
218 | }
219 | 
220 | plot_pca_val <- function(pc_val) {
221 |   t <- data.frame(pc=1:length(pc_val), val=pc_val)
222 |   p <- ggplot(t, aes(x=pc, y=val)) +
223 |     geom_bar(stat='identity', fill='salmon', color='black') +
224 |     xlab('principle component') +
225 |     ylab('% variance explained')
226 |   return (p)
227 | }
228 | ```
229 | 
230 | 
231 | ## PCA unadjusted data
232 | 
233 | ```{r}
234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
235 | ```
236 | 
237 | ```{r fig.width=8, fig.height=7}
238 | p1 <- plot_pca_vec(pc$vec)
239 | p1 + theme(legend.position='top')
240 | ```
241 | 
242 | ```{r fig.width=10, fig.height=5}
243 | p2 <- plot_pca_val(pc$val)
244 | p2
245 | ```
246 | 
247 | ```{r}
248 | d <- dat$rates %>% select(-id_)
249 | cov <- rowSums(!is.na(d)) / ncol(d)
250 | d[h == ncol(d)]
251 | ```
252 | 
253 | ```{r adjust, eval=opts$adjust}
254 | scale_rates <- function(d) {
255 |   h <- d %>% select(-id_)
256 |   a <- min(h, na.rm=T)
257 |   b <- max(h, na.rm=T)
258 |   h <- (h - a) / (b - a) * 100
259 |   h$id_ <- d$id_
260 |   return (h)
261 | }
262 | 
263 | adjust_batch_met <- function(d, what='batch2', scale=F) {
264 |   is_na <- is.na(d)
265 |   h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_)
266 |   h <- h %>% gather(sample, y, -id_) %>%
267 |     inner_join(select_(dat$samples, 'sample', x=what), by='sample')
268 |   h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup
269 |   h <- h %>% select(id_, sample, y=yr)
270 |   h <- h %>% spread(sample, y)
271 |   stopifnot(all(dim(is_na) == dim(h)))
272 |   stopifnot(all(colnames(is_na) %in% colnames(h)))
273 |   h <- h[,colnames(is_na)]
274 |   h[is_na] <- NA
275 |   return (h)
276 | }
277 | 
278 | h <- dat$rates %>% adjust_batch_met
279 | if (opts$scale_adjust) {
280 |   h <- scale_rates(h)
281 | }
282 | dat$rates <- h
283 | ```
284 | 
285 | ## PCA adjusted data
286 | 
287 | ```{r}
288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
289 | ```
290 | 
291 | ```{r fig.width=8, fig.height=7}
292 | p1 <- plot_pca_vec(pc$vec)
293 | p1 + theme(legend.position='top')
294 | ```
295 | 
296 | ```{r fig.width=10, fig.height=5}
297 | p2 <- plot_pca_val(pc$val)
298 | p2
299 | ```
300 | 
301 | ## Filtering
302 | 
303 | ```{r filter_var}
304 | d <- dat$rates
305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples
306 | d <- d[!is.na(h),]
307 | if (!is.na(opts$fil$min_var)) {
308 |   stopifnot(sum(is.na(h)) == 0)
309 |   d <- d[h >= opts$fil$min_var,]
310 | }
311 | print(sprintf('%d records passed min var filter.', nrow(d)))
312 | 
313 | if (!is.na(opts$fil$top_var)) {
314 |   t <- opts$fil$top_var
315 |   if (t <= 1) {
316 |     t <- as.integer(t * nrow(d))
317 |   }
318 |   d <- d[order(get_var(select(d, -id_)), decreasing=T),]
319 |   d <- d[1:t,]
320 | }
321 | print(sprintf('%d final record.', nrow(d)))
322 | dat$rates <- d
323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_))
324 | ```
325 | 
326 | ```{r join}
327 | d <- dat$meta %>% select(chromo, start, end, id_) %>%
328 |   inner_join(dat$rates, by='id_')
329 | stopifnot(nrow(d) == nrow(dat$rates))
330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_))
331 | 
332 | d <- dat$weights %>% gather(sample, weight, -id_)
333 | h <- nrow(dat$df)
334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample'))
335 | 
336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup
337 | 
338 | stopifnot(h == nrow(dat$df))
339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1))
340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0))
341 | ```
342 | 
343 | ```{r}
344 | dat$df <- dat$df %>% ungroup %>% tbl_df
345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df
346 | ```
347 | 
348 | ```{r write}
349 | save_rds <- function(d, name) {
350 |   f <- sprintf('%s_%s.rds', opts$out_base, name)
351 |   saveRDS(d, f)
352 | }
353 | 
354 | save_rds(dat$df, 'met')
355 | save_rds(dat$meta, 'meta')
356 | ```
357 | 
358 | 
359 | 
360 | ```{r eval=opts$plot_heat, fig.height=12}
361 | plot_heat <- function(d, Rowv=T, xlab='value') {
362 |   colors <- rev(brewer.pal(9, 'Spectral'))
363 |   colors <- colorRampPalette(colors)(50)
364 | 
365 |   labRow <- NA
366 |   if (nrow(d) > 500) {
367 |     dendro='column'
368 |   } else {
369 |     dendro = 'both'
370 |   }
371 |   p <- heatmap.2(d, density.info='none', trace='none', col=colors,
372 |     Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA,
373 |     lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab)
374 |   return (p)
375 | }
376 | 
377 | d <- dat$rates
378 | max_rows <- 1000
379 | if (nrow(d) > max_rows) {
380 |   d <- d %>% sample_n(max_rows)
381 | }
382 | d <- d %>% select(-id_) %>% as.matrix
383 | if (nrow(d) > 100) {
384 |   h <- plot_heat(d)
385 | }
386 | ```
387 | 


--------------------------------------------------------------------------------
/data/met/prepro/H3K27me3/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Preprocessing Seqmonk methylation files
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(dplyr)
 15 | library(tidyr)
 16 | library(ggplot2)
 17 | library(RColorBrewer)
 18 | library(gplots)
 19 | library(gridExtra)
 20 | source('../../../../lib/utils.R')
 21 | ```
 22 | 
 23 | ```{r opts, echo=T}
 24 | opts <- list()
 25 | opts$fil$cov <- 30
 26 | opts$fil$min_var <- 10
 27 | opts$fil$top_var <- 0.75
 28 | opts$adjust <- F
 29 | opts$scale_adjust <- T
 30 | opts$samples_excl <- NULL
 31 | opts$samples_methods <- NULL
 32 | opts$samples_batch2 <- NULL
 33 | ```
 34 | 
 35 | ```{r}
 36 | opts$name <- basename(getwd())
 37 | opts$seqmonk_dirs <- c(
 38 |   file.path('../../reports', opts$name))
 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt'))
 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt'))
 41 | opts$out_base <- './data'
 42 | opts$samples_file <- '../../../samples/samples_stats.csv'
 43 | opts$samples_select_file <- '../../../samples/samples.csv'
 44 | opts$cache <- F
 45 | opts$plot_heat <- F
 46 | ```
 47 | 
 48 | **Name**: `r opts$name`
 49 | 
 50 | 
 51 | 
 52 | ```{r read_src}
 53 | read_meta <- function(filename) {
 54 |   h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t')
 55 |   names(h) <- tolower(names(h))
 56 |   h <- h %>% rename(chromo=chromosome)
 57 |   h <- h %>% tbl_df
 58 |   return (h)
 59 | }
 60 | 
 61 | read_meta_quick <- function(filename) {
 62 |   h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t')
 63 |   names(h) <- tolower(names(h))
 64 |   h <- h %>% tbl_df
 65 |   return (h)
 66 | }
 67 | 
 68 | read_all_meta <- function(filenames) {
 69 |   m <- read_meta(filenames[1])
 70 |   if (length(filenames) > 1) {
 71 |     for (i in 2:length(filenames)) {
 72 |       s <- read_meta_quick(filenames[i])
 73 |       stopifnot(all(m$start == s$start))
 74 |     }
 75 |   }
 76 |   return (m)
 77 | }
 78 | 
 79 | read_values <- function(filename, samples=NULL) {
 80 |   h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t')
 81 |   if (!is.null(samples)) {
 82 |     h <- subset(h, select=intersect(colnames(h), samples))
 83 |   }
 84 |   h <- h %>% tbl_df
 85 |   return (h)
 86 | }
 87 | 
 88 | read_all_values <- function(filenames, samples=NULL) {
 89 |   d <- lapply(filenames, function(x) read_values(x, samples=samples))
 90 |   e <- list()
 91 |   for (dd in d) {
 92 |     if (ncol(dd) > 0) {
 93 |       e[[length(e) + 1]] <- dd
 94 |     }
 95 |   }
 96 |   d <- e
 97 |   stopifnot(length(d) > 0)
 98 |   h <- d[[1]]
 99 |   if (length(d) > 1) {
100 |     for (i in 2:length(d)) {
101 |       if (ncol(d[[i]]) > 0) {
102 |         h <- cbind.data.frame(h, d[[i]])
103 |       }
104 |     }
105 |   }
106 |   h <- h %>% tbl_df
107 |   return (h)
108 | }
109 | 
110 | read_samples <- function(filename) {
111 |   h <- read.table(filename, sep='\t', head=T) %>% tbl_df
112 |   h$sample <- h$id
113 |   return (h)
114 | }
115 | 
116 | read_samples_select <- function(filename) {
117 |   h <- read.table(filename, sep='\t', head=F) %>% tbl_df
118 |   colnames(h) <- c('id')
119 |   h <- as.vector(h$id)
120 |   return (h)
121 | }
122 | 
123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL,
124 |   sel=NULL) {
125 |   f <- excl
126 |   if (!is.null(f)) {
127 |     d <- d %>% filter(!(sample %in% f))
128 |   }
129 |   f <- methods
130 |   if (!is.null(f) && 'method' %in% colnames(d)) {
131 |     d <- d %>% filter(method %in% f)
132 |   }
133 |   f <- batch2
134 |   if (!is.null(f) && 'batch2' %in% colnames(d)) {
135 |     d <- d %>% filter(batch2 %in% f)
136 |   }
137 |   if (!is.null(sel)) {
138 |     d <- d %>% filter(sample %in% sel)
139 |   }
140 |   d <- droplevels(d)
141 | 
142 |   return (d)
143 | }
144 | ```
145 | 
146 | ```{r}
147 | dat <- list()
148 | dat$samples <- read_samples(opts$samples_file)
149 | dat$samples_select <- read_samples_select(opts$samples_select_file)
150 | dat$samples <- filter_samples(dat$samples,
151 |   excl=opts$samples_excl,
152 |   methods=opts$samples_methods,
153 |   batch2=opts$samples_batch2,
154 |   sel=dat$samples_select)
155 | 
156 | dat$meta <- read_all_meta(opts$rates_files)
157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample))
158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample))
159 | 
160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100)))
161 | stopifnot(all(round(dat$weights) == dat$weights))
162 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
163 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
164 | ```
165 | 
166 | 
167 | 
168 | ## Preprocessing
169 | 
170 | ```{r}
171 | get_cov <- function(d, perc=TRUE) {
172 |   cov <- rowSums(!is.na(d))
173 |   if (perc) {
174 |     cov <- cov / ncol(d)
175 |   }
176 |   return (cov)
177 | }
178 | 
179 | get_var <- function(d) {
180 |   return (apply(d, 1, var, na.rm=T))
181 | }
182 | ```
183 | 
184 | ```{r}
185 | h <- duplicated(dat$meta)
186 | print(sprintf('Remove %d duplicated records.', sum(h)))
187 | dat$meta <- dat$meta[!h, ]
188 | dat$meta$id_ <- 1:nrow(dat$meta)
189 | for (n in c('rates', 'weights')) {
190 |   dat[[n]] <- dat[[n]][!h,]
191 |   dat[[n]]$id_ <- dat$meta$id_
192 | }
193 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
194 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
195 | ```
196 | 
197 | ```{r filter_cov}
198 | d <- dat$rates
199 | print(sprintf('%d records in total.', nrow(d)))
200 | if (!is.na(opts$fil$cov)) {
201 |   d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,]
202 | }
203 | print(sprintf('%d records passed coverage filter.', nrow(d)))
204 | dat$rates <- d
205 | ```
206 | 
207 | ```{r}
208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') {
209 |   t <- data.frame(sample=factor(rownames(pc_vec)),
210 |     pcx=pc_vec[,x], pcy=pc_vec[,y])
211 |   t <- t %>% left_join(dat$samples, by='sample')
212 |   t$sample_short <- sub('^CSCP3_SERUM', '', t$sample)
213 |   p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) +
214 |     geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) +
215 |     xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) +
216 |     theme(legend.position='bottom')
217 |   return (p)
218 | }
219 | 
220 | plot_pca_val <- function(pc_val) {
221 |   t <- data.frame(pc=1:length(pc_val), val=pc_val)
222 |   p <- ggplot(t, aes(x=pc, y=val)) +
223 |     geom_bar(stat='identity', fill='salmon', color='black') +
224 |     xlab('principle component') +
225 |     ylab('% variance explained')
226 |   return (p)
227 | }
228 | ```
229 | 
230 | 
231 | ## PCA unadjusted data
232 | 
233 | ```{r}
234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
235 | ```
236 | 
237 | ```{r fig.width=8, fig.height=7}
238 | p1 <- plot_pca_vec(pc$vec)
239 | p1 + theme(legend.position='top')
240 | ```
241 | 
242 | ```{r fig.width=10, fig.height=5}
243 | p2 <- plot_pca_val(pc$val)
244 | p2
245 | ```
246 | 
247 | ```{r}
248 | d <- dat$rates %>% select(-id_)
249 | cov <- rowSums(!is.na(d)) / ncol(d)
250 | d[h == ncol(d)]
251 | ```
252 | 
253 | ```{r adjust, eval=opts$adjust}
254 | scale_rates <- function(d) {
255 |   h <- d %>% select(-id_)
256 |   a <- min(h, na.rm=T)
257 |   b <- max(h, na.rm=T)
258 |   h <- (h - a) / (b - a) * 100
259 |   h$id_ <- d$id_
260 |   return (h)
261 | }
262 | 
263 | adjust_batch_met <- function(d, what='batch2', scale=F) {
264 |   is_na <- is.na(d)
265 |   h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_)
266 |   h <- h %>% gather(sample, y, -id_) %>%
267 |     inner_join(select_(dat$samples, 'sample', x=what), by='sample')
268 |   h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup
269 |   h <- h %>% select(id_, sample, y=yr)
270 |   h <- h %>% spread(sample, y)
271 |   stopifnot(all(dim(is_na) == dim(h)))
272 |   stopifnot(all(colnames(is_na) %in% colnames(h)))
273 |   h <- h[,colnames(is_na)]
274 |   h[is_na] <- NA
275 |   return (h)
276 | }
277 | 
278 | h <- dat$rates %>% adjust_batch_met
279 | if (opts$scale_adjust) {
280 |   h <- scale_rates(h)
281 | }
282 | dat$rates <- h
283 | ```
284 | 
285 | ## PCA adjusted data
286 | 
287 | ```{r}
288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
289 | ```
290 | 
291 | ```{r fig.width=8, fig.height=7}
292 | p1 <- plot_pca_vec(pc$vec)
293 | p1 + theme(legend.position='top')
294 | ```
295 | 
296 | ```{r fig.width=10, fig.height=5}
297 | p2 <- plot_pca_val(pc$val)
298 | p2
299 | ```
300 | 
301 | ## Filtering
302 | 
303 | ```{r filter_var}
304 | d <- dat$rates
305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples
306 | d <- d[!is.na(h),]
307 | if (!is.na(opts$fil$min_var)) {
308 |   stopifnot(sum(is.na(h)) == 0)
309 |   d <- d[h >= opts$fil$min_var,]
310 | }
311 | print(sprintf('%d records passed min var filter.', nrow(d)))
312 | 
313 | if (!is.na(opts$fil$top_var)) {
314 |   t <- opts$fil$top_var
315 |   if (t <= 1) {
316 |     t <- as.integer(t * nrow(d))
317 |   }
318 |   d <- d[order(get_var(select(d, -id_)), decreasing=T),]
319 |   d <- d[1:t,]
320 | }
321 | print(sprintf('%d final record.', nrow(d)))
322 | dat$rates <- d
323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_))
324 | ```
325 | 
326 | ```{r join}
327 | d <- dat$meta %>% select(chromo, start, end, id_) %>%
328 |   inner_join(dat$rates, by='id_')
329 | stopifnot(nrow(d) == nrow(dat$rates))
330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_))
331 | 
332 | d <- dat$weights %>% gather(sample, weight, -id_)
333 | h <- nrow(dat$df)
334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample'))
335 | 
336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup
337 | 
338 | stopifnot(h == nrow(dat$df))
339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1))
340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0))
341 | ```
342 | 
343 | ```{r}
344 | dat$df <- dat$df %>% ungroup %>% tbl_df
345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df
346 | ```
347 | 
348 | ```{r write}
349 | save_rds <- function(d, name) {
350 |   f <- sprintf('%s_%s.rds', opts$out_base, name)
351 |   saveRDS(d, f)
352 | }
353 | 
354 | save_rds(dat$df, 'met')
355 | save_rds(dat$meta, 'meta')
356 | ```
357 | 
358 | 
359 | 
360 | ```{r eval=opts$plot_heat, fig.height=12}
361 | plot_heat <- function(d, Rowv=T, xlab='value') {
362 |   colors <- rev(brewer.pal(9, 'Spectral'))
363 |   colors <- colorRampPalette(colors)(50)
364 | 
365 |   labRow <- NA
366 |   if (nrow(d) > 500) {
367 |     dendro='column'
368 |   } else {
369 |     dendro = 'both'
370 |   }
371 |   p <- heatmap.2(d, density.info='none', trace='none', col=colors,
372 |     Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA,
373 |     lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab)
374 |   return (p)
375 | }
376 | 
377 | d <- dat$rates
378 | max_rows <- 1000
379 | if (nrow(d) > max_rows) {
380 |   d <- d %>% sample_n(max_rows)
381 | }
382 | d <- d %>% select(-id_) %>% as.matrix
383 | if (nrow(d) > 100) {
384 |   h <- plot_heat(d)
385 | }
386 | ```
387 | 


--------------------------------------------------------------------------------
/data/met/prepro/H3K4me1/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Preprocessing Seqmonk methylation files
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(dplyr)
 15 | library(tidyr)
 16 | library(ggplot2)
 17 | library(RColorBrewer)
 18 | library(gplots)
 19 | library(gridExtra)
 20 | source('../../../../lib/utils.R')
 21 | ```
 22 | 
 23 | ```{r opts, echo=T}
 24 | opts <- list()
 25 | opts$fil$cov <- 30
 26 | opts$fil$min_var <- 10
 27 | opts$fil$top_var <- 0.75
 28 | opts$adjust <- F
 29 | opts$scale_adjust <- T
 30 | opts$samples_excl <- NULL
 31 | opts$samples_methods <- NULL
 32 | opts$samples_batch2 <- NULL
 33 | ```
 34 | 
 35 | ```{r}
 36 | opts$name <- basename(getwd())
 37 | opts$seqmonk_dirs <- c(
 38 |   file.path('../../reports', opts$name))
 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt'))
 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt'))
 41 | opts$out_base <- './data'
 42 | opts$samples_file <- '../../../samples/samples_stats.csv'
 43 | opts$samples_select_file <- '../../../samples/samples.csv'
 44 | opts$cache <- F
 45 | opts$plot_heat <- F
 46 | ```
 47 | 
 48 | **Name**: `r opts$name`
 49 | 
 50 | 
 51 | 
 52 | ```{r read_src}
 53 | read_meta <- function(filename) {
 54 |   h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t')
 55 |   names(h) <- tolower(names(h))
 56 |   h <- h %>% rename(chromo=chromosome)
 57 |   h <- h %>% tbl_df
 58 |   return (h)
 59 | }
 60 | 
 61 | read_meta_quick <- function(filename) {
 62 |   h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t')
 63 |   names(h) <- tolower(names(h))
 64 |   h <- h %>% tbl_df
 65 |   return (h)
 66 | }
 67 | 
 68 | read_all_meta <- function(filenames) {
 69 |   m <- read_meta(filenames[1])
 70 |   if (length(filenames) > 1) {
 71 |     for (i in 2:length(filenames)) {
 72 |       s <- read_meta_quick(filenames[i])
 73 |       stopifnot(all(m$start == s$start))
 74 |     }
 75 |   }
 76 |   return (m)
 77 | }
 78 | 
 79 | read_values <- function(filename, samples=NULL) {
 80 |   h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t')
 81 |   if (!is.null(samples)) {
 82 |     h <- subset(h, select=intersect(colnames(h), samples))
 83 |   }
 84 |   h <- h %>% tbl_df
 85 |   return (h)
 86 | }
 87 | 
 88 | read_all_values <- function(filenames, samples=NULL) {
 89 |   d <- lapply(filenames, function(x) read_values(x, samples=samples))
 90 |   e <- list()
 91 |   for (dd in d) {
 92 |     if (ncol(dd) > 0) {
 93 |       e[[length(e) + 1]] <- dd
 94 |     }
 95 |   }
 96 |   d <- e
 97 |   stopifnot(length(d) > 0)
 98 |   h <- d[[1]]
 99 |   if (length(d) > 1) {
100 |     for (i in 2:length(d)) {
101 |       if (ncol(d[[i]]) > 0) {
102 |         h <- cbind.data.frame(h, d[[i]])
103 |       }
104 |     }
105 |   }
106 |   h <- h %>% tbl_df
107 |   return (h)
108 | }
109 | 
110 | read_samples <- function(filename) {
111 |   h <- read.table(filename, sep='\t', head=T) %>% tbl_df
112 |   h$sample <- h$id
113 |   return (h)
114 | }
115 | 
116 | read_samples_select <- function(filename) {
117 |   h <- read.table(filename, sep='\t', head=F) %>% tbl_df
118 |   colnames(h) <- c('id')
119 |   h <- as.vector(h$id)
120 |   return (h)
121 | }
122 | 
123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL,
124 |   sel=NULL) {
125 |   f <- excl
126 |   if (!is.null(f)) {
127 |     d <- d %>% filter(!(sample %in% f))
128 |   }
129 |   f <- methods
130 |   if (!is.null(f) && 'method' %in% colnames(d)) {
131 |     d <- d %>% filter(method %in% f)
132 |   }
133 |   f <- batch2
134 |   if (!is.null(f) && 'batch2' %in% colnames(d)) {
135 |     d <- d %>% filter(batch2 %in% f)
136 |   }
137 |   if (!is.null(sel)) {
138 |     d <- d %>% filter(sample %in% sel)
139 |   }
140 |   d <- droplevels(d)
141 | 
142 |   return (d)
143 | }
144 | ```
145 | 
146 | ```{r}
147 | dat <- list()
148 | dat$samples <- read_samples(opts$samples_file)
149 | dat$samples_select <- read_samples_select(opts$samples_select_file)
150 | dat$samples <- filter_samples(dat$samples,
151 |   excl=opts$samples_excl,
152 |   methods=opts$samples_methods,
153 |   batch2=opts$samples_batch2,
154 |   sel=dat$samples_select)
155 | 
156 | dat$meta <- read_all_meta(opts$rates_files)
157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample))
158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample))
159 | 
160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100)))
161 | stopifnot(all(round(dat$weights) == dat$weights))
162 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
163 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
164 | ```
165 | 
166 | 
167 | 
168 | ## Preprocessing
169 | 
170 | ```{r}
171 | get_cov <- function(d, perc=TRUE) {
172 |   cov <- rowSums(!is.na(d))
173 |   if (perc) {
174 |     cov <- cov / ncol(d)
175 |   }
176 |   return (cov)
177 | }
178 | 
179 | get_var <- function(d) {
180 |   return (apply(d, 1, var, na.rm=T))
181 | }
182 | ```
183 | 
184 | ```{r}
185 | h <- duplicated(dat$meta)
186 | print(sprintf('Remove %d duplicated records.', sum(h)))
187 | dat$meta <- dat$meta[!h, ]
188 | dat$meta$id_ <- 1:nrow(dat$meta)
189 | for (n in c('rates', 'weights')) {
190 |   dat[[n]] <- dat[[n]][!h,]
191 |   dat[[n]]$id_ <- dat$meta$id_
192 | }
193 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
194 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
195 | ```
196 | 
197 | ```{r filter_cov}
198 | d <- dat$rates
199 | print(sprintf('%d records in total.', nrow(d)))
200 | if (!is.na(opts$fil$cov)) {
201 |   d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,]
202 | }
203 | print(sprintf('%d records passed coverage filter.', nrow(d)))
204 | dat$rates <- d
205 | ```
206 | 
207 | ```{r}
208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') {
209 |   t <- data.frame(sample=factor(rownames(pc_vec)),
210 |     pcx=pc_vec[,x], pcy=pc_vec[,y])
211 |   t <- t %>% left_join(dat$samples, by='sample')
212 |   t$sample_short <- sub('^CSCP3_SERUM', '', t$sample)
213 |   p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) +
214 |     geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) +
215 |     xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) +
216 |     theme(legend.position='bottom')
217 |   return (p)
218 | }
219 | 
220 | plot_pca_val <- function(pc_val) {
221 |   t <- data.frame(pc=1:length(pc_val), val=pc_val)
222 |   p <- ggplot(t, aes(x=pc, y=val)) +
223 |     geom_bar(stat='identity', fill='salmon', color='black') +
224 |     xlab('principle component') +
225 |     ylab('% variance explained')
226 |   return (p)
227 | }
228 | ```
229 | 
230 | 
231 | ## PCA unadjusted data
232 | 
233 | ```{r}
234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
235 | ```
236 | 
237 | ```{r fig.width=8, fig.height=7}
238 | p1 <- plot_pca_vec(pc$vec)
239 | p1 + theme(legend.position='top')
240 | ```
241 | 
242 | ```{r fig.width=10, fig.height=5}
243 | p2 <- plot_pca_val(pc$val)
244 | p2
245 | ```
246 | 
247 | ```{r}
248 | d <- dat$rates %>% select(-id_)
249 | cov <- rowSums(!is.na(d)) / ncol(d)
250 | d[h == ncol(d)]
251 | ```
252 | 
253 | ```{r adjust, eval=opts$adjust}
254 | scale_rates <- function(d) {
255 |   h <- d %>% select(-id_)
256 |   a <- min(h, na.rm=T)
257 |   b <- max(h, na.rm=T)
258 |   h <- (h - a) / (b - a) * 100
259 |   h$id_ <- d$id_
260 |   return (h)
261 | }
262 | 
263 | adjust_batch_met <- function(d, what='batch2', scale=F) {
264 |   is_na <- is.na(d)
265 |   h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_)
266 |   h <- h %>% gather(sample, y, -id_) %>%
267 |     inner_join(select_(dat$samples, 'sample', x=what), by='sample')
268 |   h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup
269 |   h <- h %>% select(id_, sample, y=yr)
270 |   h <- h %>% spread(sample, y)
271 |   stopifnot(all(dim(is_na) == dim(h)))
272 |   stopifnot(all(colnames(is_na) %in% colnames(h)))
273 |   h <- h[,colnames(is_na)]
274 |   h[is_na] <- NA
275 |   return (h)
276 | }
277 | 
278 | h <- dat$rates %>% adjust_batch_met
279 | if (opts$scale_adjust) {
280 |   h <- scale_rates(h)
281 | }
282 | dat$rates <- h
283 | ```
284 | 
285 | ## PCA adjusted data
286 | 
287 | ```{r}
288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
289 | ```
290 | 
291 | ```{r fig.width=8, fig.height=7}
292 | p1 <- plot_pca_vec(pc$vec)
293 | p1 + theme(legend.position='top')
294 | ```
295 | 
296 | ```{r fig.width=10, fig.height=5}
297 | p2 <- plot_pca_val(pc$val)
298 | p2
299 | ```
300 | 
301 | ## Filtering
302 | 
303 | ```{r filter_var}
304 | d <- dat$rates
305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples
306 | d <- d[!is.na(h),]
307 | if (!is.na(opts$fil$min_var)) {
308 |   stopifnot(sum(is.na(h)) == 0)
309 |   d <- d[h >= opts$fil$min_var,]
310 | }
311 | print(sprintf('%d records passed min var filter.', nrow(d)))
312 | 
313 | if (!is.na(opts$fil$top_var)) {
314 |   t <- opts$fil$top_var
315 |   if (t <= 1) {
316 |     t <- as.integer(t * nrow(d))
317 |   }
318 |   d <- d[order(get_var(select(d, -id_)), decreasing=T),]
319 |   d <- d[1:t,]
320 | }
321 | print(sprintf('%d final record.', nrow(d)))
322 | dat$rates <- d
323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_))
324 | ```
325 | 
326 | ```{r join}
327 | d <- dat$meta %>% select(chromo, start, end, id_) %>%
328 |   inner_join(dat$rates, by='id_')
329 | stopifnot(nrow(d) == nrow(dat$rates))
330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_))
331 | 
332 | d <- dat$weights %>% gather(sample, weight, -id_)
333 | h <- nrow(dat$df)
334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample'))
335 | 
336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup
337 | 
338 | stopifnot(h == nrow(dat$df))
339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1))
340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0))
341 | ```
342 | 
343 | ```{r}
344 | dat$df <- dat$df %>% ungroup %>% tbl_df
345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df
346 | ```
347 | 
348 | ```{r write}
349 | save_rds <- function(d, name) {
350 |   f <- sprintf('%s_%s.rds', opts$out_base, name)
351 |   saveRDS(d, f)
352 | }
353 | 
354 | save_rds(dat$df, 'met')
355 | save_rds(dat$meta, 'meta')
356 | ```
357 | 
358 | 
359 | 
360 | ```{r eval=opts$plot_heat, fig.height=12}
361 | plot_heat <- function(d, Rowv=T, xlab='value') {
362 |   colors <- rev(brewer.pal(9, 'Spectral'))
363 |   colors <- colorRampPalette(colors)(50)
364 | 
365 |   labRow <- NA
366 |   if (nrow(d) > 500) {
367 |     dendro='column'
368 |   } else {
369 |     dendro = 'both'
370 |   }
371 |   p <- heatmap.2(d, density.info='none', trace='none', col=colors,
372 |     Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA,
373 |     lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab)
374 |   return (p)
375 | }
376 | 
377 | d <- dat$rates
378 | max_rows <- 1000
379 | if (nrow(d) > max_rows) {
380 |   d <- d %>% sample_n(max_rows)
381 | }
382 | d <- d %>% select(-id_) %>% as.matrix
383 | if (nrow(d) > 100) {
384 |   h <- plot_heat(d)
385 | }
386 | ```
387 | 


--------------------------------------------------------------------------------
/data/met/prepro/IAP/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Preprocessing Seqmonk methylation files
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(dplyr)
 15 | library(tidyr)
 16 | library(ggplot2)
 17 | library(RColorBrewer)
 18 | library(gplots)
 19 | library(gridExtra)
 20 | source('../../../../lib/utils.R')
 21 | ```
 22 | 
 23 | ```{r opts, echo=T}
 24 | opts <- list()
 25 | opts$fil$cov <- 30
 26 | opts$fil$min_var <- 10
 27 | opts$fil$top_var <- 0.75
 28 | opts$adjust <- F
 29 | opts$scale_adjust <- T
 30 | opts$samples_excl <- NULL
 31 | opts$samples_methods <- NULL
 32 | opts$samples_batch2 <- NULL
 33 | ```
 34 | 
 35 | ```{r}
 36 | opts$name <- basename(getwd())
 37 | opts$seqmonk_dirs <- c(
 38 |   file.path('../../reports', opts$name))
 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt'))
 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt'))
 41 | opts$out_base <- './data'
 42 | opts$samples_file <- '../../../samples/samples_stats.csv'
 43 | opts$samples_select_file <- '../../../samples/samples.csv'
 44 | opts$cache <- F
 45 | opts$plot_heat <- F
 46 | ```
 47 | 
 48 | **Name**: `r opts$name`
 49 | 
 50 | 
 51 | 
 52 | ```{r read_src}
 53 | read_meta <- function(filename) {
 54 |   h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t')
 55 |   names(h) <- tolower(names(h))
 56 |   h <- h %>% rename(chromo=chromosome)
 57 |   h <- h %>% tbl_df
 58 |   return (h)
 59 | }
 60 | 
 61 | read_meta_quick <- function(filename) {
 62 |   h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t')
 63 |   names(h) <- tolower(names(h))
 64 |   h <- h %>% tbl_df
 65 |   return (h)
 66 | }
 67 | 
 68 | read_all_meta <- function(filenames) {
 69 |   m <- read_meta(filenames[1])
 70 |   if (length(filenames) > 1) {
 71 |     for (i in 2:length(filenames)) {
 72 |       s <- read_meta_quick(filenames[i])
 73 |       stopifnot(all(m$start == s$start))
 74 |     }
 75 |   }
 76 |   return (m)
 77 | }
 78 | 
 79 | read_values <- function(filename, samples=NULL) {
 80 |   h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t')
 81 |   if (!is.null(samples)) {
 82 |     h <- subset(h, select=intersect(colnames(h), samples))
 83 |   }
 84 |   h <- h %>% tbl_df
 85 |   return (h)
 86 | }
 87 | 
 88 | read_all_values <- function(filenames, samples=NULL) {
 89 |   d <- lapply(filenames, function(x) read_values(x, samples=samples))
 90 |   e <- list()
 91 |   for (dd in d) {
 92 |     if (ncol(dd) > 0) {
 93 |       e[[length(e) + 1]] <- dd
 94 |     }
 95 |   }
 96 |   d <- e
 97 |   stopifnot(length(d) > 0)
 98 |   h <- d[[1]]
 99 |   if (length(d) > 1) {
100 |     for (i in 2:length(d)) {
101 |       if (ncol(d[[i]]) > 0) {
102 |         h <- cbind.data.frame(h, d[[i]])
103 |       }
104 |     }
105 |   }
106 |   h <- h %>% tbl_df
107 |   return (h)
108 | }
109 | 
110 | read_samples <- function(filename) {
111 |   h <- read.table(filename, sep='\t', head=T) %>% tbl_df
112 |   h$sample <- h$id
113 |   return (h)
114 | }
115 | 
116 | read_samples_select <- function(filename) {
117 |   h <- read.table(filename, sep='\t', head=F) %>% tbl_df
118 |   colnames(h) <- c('id')
119 |   h <- as.vector(h$id)
120 |   return (h)
121 | }
122 | 
123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL,
124 |   sel=NULL) {
125 |   f <- excl
126 |   if (!is.null(f)) {
127 |     d <- d %>% filter(!(sample %in% f))
128 |   }
129 |   f <- methods
130 |   if (!is.null(f) && 'method' %in% colnames(d)) {
131 |     d <- d %>% filter(method %in% f)
132 |   }
133 |   f <- batch2
134 |   if (!is.null(f) && 'batch2' %in% colnames(d)) {
135 |     d <- d %>% filter(batch2 %in% f)
136 |   }
137 |   if (!is.null(sel)) {
138 |     d <- d %>% filter(sample %in% sel)
139 |   }
140 |   d <- droplevels(d)
141 | 
142 |   return (d)
143 | }
144 | ```
145 | 
146 | ```{r}
147 | dat <- list()
148 | dat$samples <- read_samples(opts$samples_file)
149 | dat$samples_select <- read_samples_select(opts$samples_select_file)
150 | dat$samples <- filter_samples(dat$samples,
151 |   excl=opts$samples_excl,
152 |   methods=opts$samples_methods,
153 |   batch2=opts$samples_batch2,
154 |   sel=dat$samples_select)
155 | 
156 | dat$meta <- read_all_meta(opts$rates_files)
157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample))
158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample))
159 | 
160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100)))
161 | stopifnot(all(round(dat$weights) == dat$weights))
162 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
163 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
164 | ```
165 | 
166 | 
167 | 
168 | ## Preprocessing
169 | 
170 | ```{r}
171 | get_cov <- function(d, perc=TRUE) {
172 |   cov <- rowSums(!is.na(d))
173 |   if (perc) {
174 |     cov <- cov / ncol(d)
175 |   }
176 |   return (cov)
177 | }
178 | 
179 | get_var <- function(d) {
180 |   return (apply(d, 1, var, na.rm=T))
181 | }
182 | ```
183 | 
184 | ```{r}
185 | h <- duplicated(dat$meta)
186 | print(sprintf('Remove %d duplicated records.', sum(h)))
187 | dat$meta <- dat$meta[!h, ]
188 | dat$meta$id_ <- 1:nrow(dat$meta)
189 | for (n in c('rates', 'weights')) {
190 |   dat[[n]] <- dat[[n]][!h,]
191 |   dat[[n]]$id_ <- dat$meta$id_
192 | }
193 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
194 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
195 | ```
196 | 
197 | ```{r filter_cov}
198 | d <- dat$rates
199 | print(sprintf('%d records in total.', nrow(d)))
200 | if (!is.na(opts$fil$cov)) {
201 |   d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,]
202 | }
203 | print(sprintf('%d records passed coverage filter.', nrow(d)))
204 | dat$rates <- d
205 | ```
206 | 
207 | ```{r}
208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') {
209 |   t <- data.frame(sample=factor(rownames(pc_vec)),
210 |     pcx=pc_vec[,x], pcy=pc_vec[,y])
211 |   t <- t %>% left_join(dat$samples, by='sample')
212 |   t$sample_short <- sub('^CSCP3_SERUM', '', t$sample)
213 |   p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) +
214 |     geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) +
215 |     xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) +
216 |     theme(legend.position='bottom')
217 |   return (p)
218 | }
219 | 
220 | plot_pca_val <- function(pc_val) {
221 |   t <- data.frame(pc=1:length(pc_val), val=pc_val)
222 |   p <- ggplot(t, aes(x=pc, y=val)) +
223 |     geom_bar(stat='identity', fill='salmon', color='black') +
224 |     xlab('principle component') +
225 |     ylab('% variance explained')
226 |   return (p)
227 | }
228 | ```
229 | 
230 | 
231 | ## PCA unadjusted data
232 | 
233 | ```{r}
234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
235 | ```
236 | 
237 | ```{r fig.width=8, fig.height=7}
238 | p1 <- plot_pca_vec(pc$vec)
239 | p1 + theme(legend.position='top')
240 | ```
241 | 
242 | ```{r fig.width=10, fig.height=5}
243 | p2 <- plot_pca_val(pc$val)
244 | p2
245 | ```
246 | 
247 | ```{r}
248 | d <- dat$rates %>% select(-id_)
249 | cov <- rowSums(!is.na(d)) / ncol(d)
250 | d[h == ncol(d)]
251 | ```
252 | 
253 | ```{r adjust, eval=opts$adjust}
254 | scale_rates <- function(d) {
255 |   h <- d %>% select(-id_)
256 |   a <- min(h, na.rm=T)
257 |   b <- max(h, na.rm=T)
258 |   h <- (h - a) / (b - a) * 100
259 |   h$id_ <- d$id_
260 |   return (h)
261 | }
262 | 
263 | adjust_batch_met <- function(d, what='batch2', scale=F) {
264 |   is_na <- is.na(d)
265 |   h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_)
266 |   h <- h %>% gather(sample, y, -id_) %>%
267 |     inner_join(select_(dat$samples, 'sample', x=what), by='sample')
268 |   h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup
269 |   h <- h %>% select(id_, sample, y=yr)
270 |   h <- h %>% spread(sample, y)
271 |   stopifnot(all(dim(is_na) == dim(h)))
272 |   stopifnot(all(colnames(is_na) %in% colnames(h)))
273 |   h <- h[,colnames(is_na)]
274 |   h[is_na] <- NA
275 |   return (h)
276 | }
277 | 
278 | h <- dat$rates %>% adjust_batch_met
279 | if (opts$scale_adjust) {
280 |   h <- scale_rates(h)
281 | }
282 | dat$rates <- h
283 | ```
284 | 
285 | ## PCA adjusted data
286 | 
287 | ```{r}
288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
289 | ```
290 | 
291 | ```{r fig.width=8, fig.height=7}
292 | p1 <- plot_pca_vec(pc$vec)
293 | p1 + theme(legend.position='top')
294 | ```
295 | 
296 | ```{r fig.width=10, fig.height=5}
297 | p2 <- plot_pca_val(pc$val)
298 | p2
299 | ```
300 | 
301 | ## Filtering
302 | 
303 | ```{r filter_var}
304 | d <- dat$rates
305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples
306 | d <- d[!is.na(h),]
307 | if (!is.na(opts$fil$min_var)) {
308 |   stopifnot(sum(is.na(h)) == 0)
309 |   d <- d[h >= opts$fil$min_var,]
310 | }
311 | print(sprintf('%d records passed min var filter.', nrow(d)))
312 | 
313 | if (!is.na(opts$fil$top_var)) {
314 |   t <- opts$fil$top_var
315 |   if (t <= 1) {
316 |     t <- as.integer(t * nrow(d))
317 |   }
318 |   d <- d[order(get_var(select(d, -id_)), decreasing=T),]
319 |   d <- d[1:t,]
320 | }
321 | print(sprintf('%d final record.', nrow(d)))
322 | dat$rates <- d
323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_))
324 | ```
325 | 
326 | ```{r join}
327 | d <- dat$meta %>% select(chromo, start, end, id_) %>%
328 |   inner_join(dat$rates, by='id_')
329 | stopifnot(nrow(d) == nrow(dat$rates))
330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_))
331 | 
332 | d <- dat$weights %>% gather(sample, weight, -id_)
333 | h <- nrow(dat$df)
334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample'))
335 | 
336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup
337 | 
338 | stopifnot(h == nrow(dat$df))
339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1))
340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0))
341 | ```
342 | 
343 | ```{r}
344 | dat$df <- dat$df %>% ungroup %>% tbl_df
345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df
346 | ```
347 | 
348 | ```{r write}
349 | save_rds <- function(d, name) {
350 |   f <- sprintf('%s_%s.rds', opts$out_base, name)
351 |   saveRDS(d, f)
352 | }
353 | 
354 | save_rds(dat$df, 'met')
355 | save_rds(dat$meta, 'meta')
356 | ```
357 | 
358 | 
359 | 
360 | ```{r eval=opts$plot_heat, fig.height=12}
361 | plot_heat <- function(d, Rowv=T, xlab='value') {
362 |   colors <- rev(brewer.pal(9, 'Spectral'))
363 |   colors <- colorRampPalette(colors)(50)
364 | 
365 |   labRow <- NA
366 |   if (nrow(d) > 500) {
367 |     dendro='column'
368 |   } else {
369 |     dendro = 'both'
370 |   }
371 |   p <- heatmap.2(d, density.info='none', trace='none', col=colors,
372 |     Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA,
373 |     lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab)
374 |   return (p)
375 | }
376 | 
377 | d <- dat$rates
378 | max_rows <- 1000
379 | if (nrow(d) > max_rows) {
380 |   d <- d %>% sample_n(max_rows)
381 | }
382 | d <- d %>% select(-id_) %>% as.matrix
383 | if (nrow(d) > 100) {
384 |   h <- plot_heat(d)
385 | }
386 | ```
387 | 


--------------------------------------------------------------------------------
/data/met/prepro/LMR/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Preprocessing Seqmonk methylation files
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(dplyr)
 15 | library(tidyr)
 16 | library(ggplot2)
 17 | library(RColorBrewer)
 18 | library(gplots)
 19 | library(gridExtra)
 20 | source('../../../../lib/utils.R')
 21 | ```
 22 | 
 23 | ```{r opts, echo=T}
 24 | opts <- list()
 25 | opts$fil$cov <- 30
 26 | opts$fil$min_var <- 10
 27 | opts$fil$top_var <- 0.75
 28 | opts$adjust <- F
 29 | opts$scale_adjust <- T
 30 | opts$samples_excl <- NULL
 31 | opts$samples_methods <- NULL
 32 | opts$samples_batch2 <- NULL
 33 | ```
 34 | 
 35 | ```{r}
 36 | opts$name <- basename(getwd())
 37 | opts$seqmonk_dirs <- c(
 38 |   file.path('../../reports', opts$name))
 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt'))
 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt'))
 41 | opts$out_base <- './data'
 42 | opts$samples_file <- '../../../samples/samples_stats.csv'
 43 | opts$samples_select_file <- '../../../samples/samples.csv'
 44 | opts$cache <- F
 45 | opts$plot_heat <- F
 46 | ```
 47 | 
 48 | **Name**: `r opts$name`
 49 | 
 50 | 
 51 | 
 52 | ```{r read_src}
 53 | read_meta <- function(filename) {
 54 |   h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t')
 55 |   names(h) <- tolower(names(h))
 56 |   h <- h %>% rename(chromo=chromosome)
 57 |   h <- h %>% tbl_df
 58 |   return (h)
 59 | }
 60 | 
 61 | read_meta_quick <- function(filename) {
 62 |   h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t')
 63 |   names(h) <- tolower(names(h))
 64 |   h <- h %>% tbl_df
 65 |   return (h)
 66 | }
 67 | 
 68 | read_all_meta <- function(filenames) {
 69 |   m <- read_meta(filenames[1])
 70 |   if (length(filenames) > 1) {
 71 |     for (i in 2:length(filenames)) {
 72 |       s <- read_meta_quick(filenames[i])
 73 |       stopifnot(all(m$start == s$start))
 74 |     }
 75 |   }
 76 |   return (m)
 77 | }
 78 | 
 79 | read_values <- function(filename, samples=NULL) {
 80 |   h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t')
 81 |   if (!is.null(samples)) {
 82 |     h <- subset(h, select=intersect(colnames(h), samples))
 83 |   }
 84 |   h <- h %>% tbl_df
 85 |   return (h)
 86 | }
 87 | 
 88 | read_all_values <- function(filenames, samples=NULL) {
 89 |   d <- lapply(filenames, function(x) read_values(x, samples=samples))
 90 |   e <- list()
 91 |   for (dd in d) {
 92 |     if (ncol(dd) > 0) {
 93 |       e[[length(e) + 1]] <- dd
 94 |     }
 95 |   }
 96 |   d <- e
 97 |   stopifnot(length(d) > 0)
 98 |   h <- d[[1]]
 99 |   if (length(d) > 1) {
100 |     for (i in 2:length(d)) {
101 |       if (ncol(d[[i]]) > 0) {
102 |         h <- cbind.data.frame(h, d[[i]])
103 |       }
104 |     }
105 |   }
106 |   h <- h %>% tbl_df
107 |   return (h)
108 | }
109 | 
110 | read_samples <- function(filename) {
111 |   h <- read.table(filename, sep='\t', head=T) %>% tbl_df
112 |   h$sample <- h$id
113 |   return (h)
114 | }
115 | 
116 | read_samples_select <- function(filename) {
117 |   h <- read.table(filename, sep='\t', head=F) %>% tbl_df
118 |   colnames(h) <- c('id')
119 |   h <- as.vector(h$id)
120 |   return (h)
121 | }
122 | 
123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL,
124 |   sel=NULL) {
125 |   f <- excl
126 |   if (!is.null(f)) {
127 |     d <- d %>% filter(!(sample %in% f))
128 |   }
129 |   f <- methods
130 |   if (!is.null(f) && 'method' %in% colnames(d)) {
131 |     d <- d %>% filter(method %in% f)
132 |   }
133 |   f <- batch2
134 |   if (!is.null(f) && 'batch2' %in% colnames(d)) {
135 |     d <- d %>% filter(batch2 %in% f)
136 |   }
137 |   if (!is.null(sel)) {
138 |     d <- d %>% filter(sample %in% sel)
139 |   }
140 |   d <- droplevels(d)
141 | 
142 |   return (d)
143 | }
144 | ```
145 | 
146 | ```{r}
147 | dat <- list()
148 | dat$samples <- read_samples(opts$samples_file)
149 | dat$samples_select <- read_samples_select(opts$samples_select_file)
150 | dat$samples <- filter_samples(dat$samples,
151 |   excl=opts$samples_excl,
152 |   methods=opts$samples_methods,
153 |   batch2=opts$samples_batch2,
154 |   sel=dat$samples_select)
155 | 
156 | dat$meta <- read_all_meta(opts$rates_files)
157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample))
158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample))
159 | 
160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100)))
161 | stopifnot(all(round(dat$weights) == dat$weights))
162 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
163 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
164 | ```
165 | 
166 | 
167 | 
168 | ## Preprocessing
169 | 
170 | ```{r}
171 | get_cov <- function(d, perc=TRUE) {
172 |   cov <- rowSums(!is.na(d))
173 |   if (perc) {
174 |     cov <- cov / ncol(d)
175 |   }
176 |   return (cov)
177 | }
178 | 
179 | get_var <- function(d) {
180 |   return (apply(d, 1, var, na.rm=T))
181 | }
182 | ```
183 | 
184 | ```{r}
185 | h <- duplicated(dat$meta)
186 | print(sprintf('Remove %d duplicated records.', sum(h)))
187 | dat$meta <- dat$meta[!h, ]
188 | dat$meta$id_ <- 1:nrow(dat$meta)
189 | for (n in c('rates', 'weights')) {
190 |   dat[[n]] <- dat[[n]][!h,]
191 |   dat[[n]]$id_ <- dat$meta$id_
192 | }
193 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
194 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
195 | ```
196 | 
197 | ```{r filter_cov}
198 | d <- dat$rates
199 | print(sprintf('%d records in total.', nrow(d)))
200 | if (!is.na(opts$fil$cov)) {
201 |   d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,]
202 | }
203 | print(sprintf('%d records passed coverage filter.', nrow(d)))
204 | dat$rates <- d
205 | ```
206 | 
207 | ```{r}
208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') {
209 |   t <- data.frame(sample=factor(rownames(pc_vec)),
210 |     pcx=pc_vec[,x], pcy=pc_vec[,y])
211 |   t <- t %>% left_join(dat$samples, by='sample')
212 |   t$sample_short <- sub('^CSCP3_SERUM', '', t$sample)
213 |   p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) +
214 |     geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) +
215 |     xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) +
216 |     theme(legend.position='bottom')
217 |   return (p)
218 | }
219 | 
220 | plot_pca_val <- function(pc_val) {
221 |   t <- data.frame(pc=1:length(pc_val), val=pc_val)
222 |   p <- ggplot(t, aes(x=pc, y=val)) +
223 |     geom_bar(stat='identity', fill='salmon', color='black') +
224 |     xlab('principle component') +
225 |     ylab('% variance explained')
226 |   return (p)
227 | }
228 | ```
229 | 
230 | 
231 | ## PCA unadjusted data
232 | 
233 | ```{r}
234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
235 | ```
236 | 
237 | ```{r fig.width=8, fig.height=7}
238 | p1 <- plot_pca_vec(pc$vec)
239 | p1 + theme(legend.position='top')
240 | ```
241 | 
242 | ```{r fig.width=10, fig.height=5}
243 | p2 <- plot_pca_val(pc$val)
244 | p2
245 | ```
246 | 
247 | ```{r}
248 | d <- dat$rates %>% select(-id_)
249 | cov <- rowSums(!is.na(d)) / ncol(d)
250 | d[h == ncol(d)]
251 | ```
252 | 
253 | ```{r adjust, eval=opts$adjust}
254 | scale_rates <- function(d) {
255 |   h <- d %>% select(-id_)
256 |   a <- min(h, na.rm=T)
257 |   b <- max(h, na.rm=T)
258 |   h <- (h - a) / (b - a) * 100
259 |   h$id_ <- d$id_
260 |   return (h)
261 | }
262 | 
263 | adjust_batch_met <- function(d, what='batch2', scale=F) {
264 |   is_na <- is.na(d)
265 |   h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_)
266 |   h <- h %>% gather(sample, y, -id_) %>%
267 |     inner_join(select_(dat$samples, 'sample', x=what), by='sample')
268 |   h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup
269 |   h <- h %>% select(id_, sample, y=yr)
270 |   h <- h %>% spread(sample, y)
271 |   stopifnot(all(dim(is_na) == dim(h)))
272 |   stopifnot(all(colnames(is_na) %in% colnames(h)))
273 |   h <- h[,colnames(is_na)]
274 |   h[is_na] <- NA
275 |   return (h)
276 | }
277 | 
278 | h <- dat$rates %>% adjust_batch_met
279 | if (opts$scale_adjust) {
280 |   h <- scale_rates(h)
281 | }
282 | dat$rates <- h
283 | ```
284 | 
285 | ## PCA adjusted data
286 | 
287 | ```{r}
288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
289 | ```
290 | 
291 | ```{r fig.width=8, fig.height=7}
292 | p1 <- plot_pca_vec(pc$vec)
293 | p1 + theme(legend.position='top')
294 | ```
295 | 
296 | ```{r fig.width=10, fig.height=5}
297 | p2 <- plot_pca_val(pc$val)
298 | p2
299 | ```
300 | 
301 | ## Filtering
302 | 
303 | ```{r filter_var}
304 | d <- dat$rates
305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples
306 | d <- d[!is.na(h),]
307 | if (!is.na(opts$fil$min_var)) {
308 |   stopifnot(sum(is.na(h)) == 0)
309 |   d <- d[h >= opts$fil$min_var,]
310 | }
311 | print(sprintf('%d records passed min var filter.', nrow(d)))
312 | 
313 | if (!is.na(opts$fil$top_var)) {
314 |   t <- opts$fil$top_var
315 |   if (t <= 1) {
316 |     t <- as.integer(t * nrow(d))
317 |   }
318 |   d <- d[order(get_var(select(d, -id_)), decreasing=T),]
319 |   d <- d[1:t,]
320 | }
321 | print(sprintf('%d final record.', nrow(d)))
322 | dat$rates <- d
323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_))
324 | ```
325 | 
326 | ```{r join}
327 | d <- dat$meta %>% select(chromo, start, end, id_) %>%
328 |   inner_join(dat$rates, by='id_')
329 | stopifnot(nrow(d) == nrow(dat$rates))
330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_))
331 | 
332 | d <- dat$weights %>% gather(sample, weight, -id_)
333 | h <- nrow(dat$df)
334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample'))
335 | 
336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup
337 | 
338 | stopifnot(h == nrow(dat$df))
339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1))
340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0))
341 | ```
342 | 
343 | ```{r}
344 | dat$df <- dat$df %>% ungroup %>% tbl_df
345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df
346 | ```
347 | 
348 | ```{r write}
349 | save_rds <- function(d, name) {
350 |   f <- sprintf('%s_%s.rds', opts$out_base, name)
351 |   saveRDS(d, f)
352 | }
353 | 
354 | save_rds(dat$df, 'met')
355 | save_rds(dat$meta, 'meta')
356 | ```
357 | 
358 | 
359 | 
360 | ```{r eval=opts$plot_heat, fig.height=12}
361 | plot_heat <- function(d, Rowv=T, xlab='value') {
362 |   colors <- rev(brewer.pal(9, 'Spectral'))
363 |   colors <- colorRampPalette(colors)(50)
364 | 
365 |   labRow <- NA
366 |   if (nrow(d) > 500) {
367 |     dendro='column'
368 |   } else {
369 |     dendro = 'both'
370 |   }
371 |   p <- heatmap.2(d, density.info='none', trace='none', col=colors,
372 |     Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA,
373 |     lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab)
374 |   return (p)
375 | }
376 | 
377 | d <- dat$rates
378 | max_rows <- 1000
379 | if (nrow(d) > max_rows) {
380 |   d <- d %>% sample_n(max_rows)
381 | }
382 | d <- d %>% select(-id_) %>% as.matrix
383 | if (nrow(d) > 100) {
384 |   h <- plot_heat(d)
385 | }
386 | ```
387 | 


--------------------------------------------------------------------------------
/data/met/prepro/Makefile:
--------------------------------------------------------------------------------
 1 | seqmonk_names := $(notdir $(wildcard ../reports/*))
 2 | seqmonk_names := $(filter-out README.txt,$(seqmonk_names))
 3 | seqmonk_names := $(filter-out w%,$(seqmonk_names))
 4 | 
 5 | 
 6 | %.html: %.Rmd
 7 | 	Rscript -e "library(rmarkdown); render('$<', output_format='html_document')"
 8 | 
 9 | dirs:
10 | 	for n in $(seqmonk_names); do \
11 | 		if [ ! -e $$n ]; then \
12 | 			mkdir $$n && cp src/index.Rmd $$n; \
13 | 		fi \
14 | 	done
15 | 
16 | clean:
17 | 	rm -rf $(seqmonk_names)
18 | 
19 | rmd_files = $(filter-out src/%,$(wildcard */index.Rmd))
20 | html_files = $(patsubst %.Rmd,%.html, $(rmd_files))
21 | 
22 | run: $(html_files)
23 | 


--------------------------------------------------------------------------------
/data/met/prepro/Tet2/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Preprocessing Seqmonk methylation files
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(dplyr)
 15 | library(tidyr)
 16 | library(ggplot2)
 17 | library(RColorBrewer)
 18 | library(gplots)
 19 | library(gridExtra)
 20 | source('../../../../lib/utils.R')
 21 | ```
 22 | 
 23 | ```{r opts, echo=T}
 24 | opts <- list()
 25 | opts$fil$cov <- 30
 26 | opts$fil$min_var <- 10
 27 | opts$fil$top_var <- 0.75
 28 | opts$adjust <- F
 29 | opts$scale_adjust <- T
 30 | opts$samples_excl <- NULL
 31 | opts$samples_methods <- NULL
 32 | opts$samples_batch2 <- NULL
 33 | ```
 34 | 
 35 | ```{r}
 36 | opts$name <- basename(getwd())
 37 | opts$seqmonk_dirs <- c(
 38 |   file.path('../../reports', opts$name))
 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt'))
 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt'))
 41 | opts$out_base <- './data'
 42 | opts$samples_file <- '../../../samples/samples_stats.csv'
 43 | opts$samples_select_file <- '../../../samples/samples.csv'
 44 | opts$cache <- F
 45 | opts$plot_heat <- F
 46 | ```
 47 | 
 48 | **Name**: `r opts$name`
 49 | 
 50 | 
 51 | 
 52 | ```{r read_src}
 53 | read_meta <- function(filename) {
 54 |   h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t')
 55 |   names(h) <- tolower(names(h))
 56 |   h <- h %>% rename(chromo=chromosome)
 57 |   h <- h %>% tbl_df
 58 |   return (h)
 59 | }
 60 | 
 61 | read_meta_quick <- function(filename) {
 62 |   h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t')
 63 |   names(h) <- tolower(names(h))
 64 |   h <- h %>% tbl_df
 65 |   return (h)
 66 | }
 67 | 
 68 | read_all_meta <- function(filenames) {
 69 |   m <- read_meta(filenames[1])
 70 |   if (length(filenames) > 1) {
 71 |     for (i in 2:length(filenames)) {
 72 |       s <- read_meta_quick(filenames[i])
 73 |       stopifnot(all(m$start == s$start))
 74 |     }
 75 |   }
 76 |   return (m)
 77 | }
 78 | 
 79 | read_values <- function(filename, samples=NULL) {
 80 |   h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t')
 81 |   if (!is.null(samples)) {
 82 |     h <- subset(h, select=intersect(colnames(h), samples))
 83 |   }
 84 |   h <- h %>% tbl_df
 85 |   return (h)
 86 | }
 87 | 
 88 | read_all_values <- function(filenames, samples=NULL) {
 89 |   d <- lapply(filenames, function(x) read_values(x, samples=samples))
 90 |   e <- list()
 91 |   for (dd in d) {
 92 |     if (ncol(dd) > 0) {
 93 |       e[[length(e) + 1]] <- dd
 94 |     }
 95 |   }
 96 |   d <- e
 97 |   stopifnot(length(d) > 0)
 98 |   h <- d[[1]]
 99 |   if (length(d) > 1) {
100 |     for (i in 2:length(d)) {
101 |       if (ncol(d[[i]]) > 0) {
102 |         h <- cbind.data.frame(h, d[[i]])
103 |       }
104 |     }
105 |   }
106 |   h <- h %>% tbl_df
107 |   return (h)
108 | }
109 | 
110 | read_samples <- function(filename) {
111 |   h <- read.table(filename, sep='\t', head=T) %>% tbl_df
112 |   h$sample <- h$id
113 |   return (h)
114 | }
115 | 
116 | read_samples_select <- function(filename) {
117 |   h <- read.table(filename, sep='\t', head=F) %>% tbl_df
118 |   colnames(h) <- c('id')
119 |   h <- as.vector(h$id)
120 |   return (h)
121 | }
122 | 
123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL,
124 |   sel=NULL) {
125 |   f <- excl
126 |   if (!is.null(f)) {
127 |     d <- d %>% filter(!(sample %in% f))
128 |   }
129 |   f <- methods
130 |   if (!is.null(f) && 'method' %in% colnames(d)) {
131 |     d <- d %>% filter(method %in% f)
132 |   }
133 |   f <- batch2
134 |   if (!is.null(f) && 'batch2' %in% colnames(d)) {
135 |     d <- d %>% filter(batch2 %in% f)
136 |   }
137 |   if (!is.null(sel)) {
138 |     d <- d %>% filter(sample %in% sel)
139 |   }
140 |   d <- droplevels(d)
141 | 
142 |   return (d)
143 | }
144 | ```
145 | 
146 | ```{r}
147 | dat <- list()
148 | dat$samples <- read_samples(opts$samples_file)
149 | dat$samples_select <- read_samples_select(opts$samples_select_file)
150 | dat$samples <- filter_samples(dat$samples,
151 |   excl=opts$samples_excl,
152 |   methods=opts$samples_methods,
153 |   batch2=opts$samples_batch2,
154 |   sel=dat$samples_select)
155 | 
156 | dat$meta <- read_all_meta(opts$rates_files)
157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample))
158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample))
159 | 
160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100)))
161 | stopifnot(all(round(dat$weights) == dat$weights))
162 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
163 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
164 | ```
165 | 
166 | 
167 | 
168 | ## Preprocessing
169 | 
170 | ```{r}
171 | get_cov <- function(d, perc=TRUE) {
172 |   cov <- rowSums(!is.na(d))
173 |   if (perc) {
174 |     cov <- cov / ncol(d)
175 |   }
176 |   return (cov)
177 | }
178 | 
179 | get_var <- function(d) {
180 |   return (apply(d, 1, var, na.rm=T))
181 | }
182 | ```
183 | 
184 | ```{r}
185 | h <- duplicated(dat$meta)
186 | print(sprintf('Remove %d duplicated records.', sum(h)))
187 | dat$meta <- dat$meta[!h, ]
188 | dat$meta$id_ <- 1:nrow(dat$meta)
189 | for (n in c('rates', 'weights')) {
190 |   dat[[n]] <- dat[[n]][!h,]
191 |   dat[[n]]$id_ <- dat$meta$id_
192 | }
193 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
194 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
195 | ```
196 | 
197 | ```{r filter_cov}
198 | d <- dat$rates
199 | print(sprintf('%d records in total.', nrow(d)))
200 | if (!is.na(opts$fil$cov)) {
201 |   d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,]
202 | }
203 | print(sprintf('%d records passed coverage filter.', nrow(d)))
204 | dat$rates <- d
205 | ```
206 | 
207 | ```{r}
208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') {
209 |   t <- data.frame(sample=factor(rownames(pc_vec)),
210 |     pcx=pc_vec[,x], pcy=pc_vec[,y])
211 |   t <- t %>% left_join(dat$samples, by='sample')
212 |   t$sample_short <- sub('^CSCP3_SERUM', '', t$sample)
213 |   p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) +
214 |     geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) +
215 |     xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) +
216 |     theme(legend.position='bottom')
217 |   return (p)
218 | }
219 | 
220 | plot_pca_val <- function(pc_val) {
221 |   t <- data.frame(pc=1:length(pc_val), val=pc_val)
222 |   p <- ggplot(t, aes(x=pc, y=val)) +
223 |     geom_bar(stat='identity', fill='salmon', color='black') +
224 |     xlab('principle component') +
225 |     ylab('% variance explained')
226 |   return (p)
227 | }
228 | ```
229 | 
230 | 
231 | ## PCA unadjusted data
232 | 
233 | ```{r}
234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
235 | ```
236 | 
237 | ```{r fig.width=8, fig.height=7}
238 | p1 <- plot_pca_vec(pc$vec)
239 | p1 + theme(legend.position='top')
240 | ```
241 | 
242 | ```{r fig.width=10, fig.height=5}
243 | p2 <- plot_pca_val(pc$val)
244 | p2
245 | ```
246 | 
247 | ```{r}
248 | d <- dat$rates %>% select(-id_)
249 | cov <- rowSums(!is.na(d)) / ncol(d)
250 | d[h == ncol(d)]
251 | ```
252 | 
253 | ```{r adjust, eval=opts$adjust}
254 | scale_rates <- function(d) {
255 |   h <- d %>% select(-id_)
256 |   a <- min(h, na.rm=T)
257 |   b <- max(h, na.rm=T)
258 |   h <- (h - a) / (b - a) * 100
259 |   h$id_ <- d$id_
260 |   return (h)
261 | }
262 | 
263 | adjust_batch_met <- function(d, what='batch2', scale=F) {
264 |   is_na <- is.na(d)
265 |   h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_)
266 |   h <- h %>% gather(sample, y, -id_) %>%
267 |     inner_join(select_(dat$samples, 'sample', x=what), by='sample')
268 |   h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup
269 |   h <- h %>% select(id_, sample, y=yr)
270 |   h <- h %>% spread(sample, y)
271 |   stopifnot(all(dim(is_na) == dim(h)))
272 |   stopifnot(all(colnames(is_na) %in% colnames(h)))
273 |   h <- h[,colnames(is_na)]
274 |   h[is_na] <- NA
275 |   return (h)
276 | }
277 | 
278 | h <- dat$rates %>% adjust_batch_met
279 | if (opts$scale_adjust) {
280 |   h <- scale_rates(h)
281 | }
282 | dat$rates <- h
283 | ```
284 | 
285 | ## PCA adjusted data
286 | 
287 | ```{r}
288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
289 | ```
290 | 
291 | ```{r fig.width=8, fig.height=7}
292 | p1 <- plot_pca_vec(pc$vec)
293 | p1 + theme(legend.position='top')
294 | ```
295 | 
296 | ```{r fig.width=10, fig.height=5}
297 | p2 <- plot_pca_val(pc$val)
298 | p2
299 | ```
300 | 
301 | ## Filtering
302 | 
303 | ```{r filter_var}
304 | d <- dat$rates
305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples
306 | d <- d[!is.na(h),]
307 | if (!is.na(opts$fil$min_var)) {
308 |   stopifnot(sum(is.na(h)) == 0)
309 |   d <- d[h >= opts$fil$min_var,]
310 | }
311 | print(sprintf('%d records passed min var filter.', nrow(d)))
312 | 
313 | if (!is.na(opts$fil$top_var)) {
314 |   t <- opts$fil$top_var
315 |   if (t <= 1) {
316 |     t <- as.integer(t * nrow(d))
317 |   }
318 |   d <- d[order(get_var(select(d, -id_)), decreasing=T),]
319 |   d <- d[1:t,]
320 | }
321 | print(sprintf('%d final record.', nrow(d)))
322 | dat$rates <- d
323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_))
324 | ```
325 | 
326 | ```{r join}
327 | d <- dat$meta %>% select(chromo, start, end, id_) %>%
328 |   inner_join(dat$rates, by='id_')
329 | stopifnot(nrow(d) == nrow(dat$rates))
330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_))
331 | 
332 | d <- dat$weights %>% gather(sample, weight, -id_)
333 | h <- nrow(dat$df)
334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample'))
335 | 
336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup
337 | 
338 | stopifnot(h == nrow(dat$df))
339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1))
340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0))
341 | ```
342 | 
343 | ```{r}
344 | dat$df <- dat$df %>% ungroup %>% tbl_df
345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df
346 | ```
347 | 
348 | ```{r write}
349 | save_rds <- function(d, name) {
350 |   f <- sprintf('%s_%s.rds', opts$out_base, name)
351 |   saveRDS(d, f)
352 | }
353 | 
354 | save_rds(dat$df, 'met')
355 | save_rds(dat$meta, 'meta')
356 | ```
357 | 
358 | 
359 | 
360 | ```{r eval=opts$plot_heat, fig.height=12}
361 | plot_heat <- function(d, Rowv=T, xlab='value') {
362 |   colors <- rev(brewer.pal(9, 'Spectral'))
363 |   colors <- colorRampPalette(colors)(50)
364 | 
365 |   labRow <- NA
366 |   if (nrow(d) > 500) {
367 |     dendro='column'
368 |   } else {
369 |     dendro = 'both'
370 |   }
371 |   p <- heatmap.2(d, density.info='none', trace='none', col=colors,
372 |     Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA,
373 |     lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab)
374 |   return (p)
375 | }
376 | 
377 | d <- dat$rates
378 | max_rows <- 1000
379 | if (nrow(d) > max_rows) {
380 |   d <- d %>% sample_n(max_rows)
381 | }
382 | d <- d %>% select(-id_) %>% as.matrix
383 | if (nrow(d) > 100) {
384 |   h <- plot_heat(d)
385 | }
386 | ```
387 | 


--------------------------------------------------------------------------------
/data/met/prepro/Wu_Tet1/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Preprocessing Seqmonk methylation files
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(dplyr)
 15 | library(tidyr)
 16 | library(ggplot2)
 17 | library(RColorBrewer)
 18 | library(gplots)
 19 | library(gridExtra)
 20 | source('../../../../lib/utils.R')
 21 | ```
 22 | 
 23 | ```{r opts, echo=T}
 24 | opts <- list()
 25 | opts$fil$cov <- 30
 26 | opts$fil$min_var <- 10
 27 | opts$fil$top_var <- 0.75
 28 | opts$adjust <- F
 29 | opts$scale_adjust <- T
 30 | opts$samples_excl <- NULL
 31 | opts$samples_methods <- NULL
 32 | opts$samples_batch2 <- NULL
 33 | ```
 34 | 
 35 | ```{r}
 36 | opts$name <- basename(getwd())
 37 | opts$seqmonk_dirs <- c(
 38 |   file.path('../../reports', opts$name))
 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt'))
 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt'))
 41 | opts$out_base <- './data'
 42 | opts$samples_file <- '../../../samples/samples_stats.csv'
 43 | opts$samples_select_file <- '../../../samples/samples.csv'
 44 | opts$cache <- F
 45 | opts$plot_heat <- F
 46 | ```
 47 | 
 48 | **Name**: `r opts$name`
 49 | 
 50 | 
 51 | 
 52 | ```{r read_src}
 53 | read_meta <- function(filename) {
 54 |   h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t')
 55 |   names(h) <- tolower(names(h))
 56 |   h <- h %>% rename(chromo=chromosome)
 57 |   h <- h %>% tbl_df
 58 |   return (h)
 59 | }
 60 | 
 61 | read_meta_quick <- function(filename) {
 62 |   h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t')
 63 |   names(h) <- tolower(names(h))
 64 |   h <- h %>% tbl_df
 65 |   return (h)
 66 | }
 67 | 
 68 | read_all_meta <- function(filenames) {
 69 |   m <- read_meta(filenames[1])
 70 |   if (length(filenames) > 1) {
 71 |     for (i in 2:length(filenames)) {
 72 |       s <- read_meta_quick(filenames[i])
 73 |       stopifnot(all(m$start == s$start))
 74 |     }
 75 |   }
 76 |   return (m)
 77 | }
 78 | 
 79 | read_values <- function(filename, samples=NULL) {
 80 |   h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t')
 81 |   if (!is.null(samples)) {
 82 |     h <- subset(h, select=intersect(colnames(h), samples))
 83 |   }
 84 |   h <- h %>% tbl_df
 85 |   return (h)
 86 | }
 87 | 
 88 | read_all_values <- function(filenames, samples=NULL) {
 89 |   d <- lapply(filenames, function(x) read_values(x, samples=samples))
 90 |   e <- list()
 91 |   for (dd in d) {
 92 |     if (ncol(dd) > 0) {
 93 |       e[[length(e) + 1]] <- dd
 94 |     }
 95 |   }
 96 |   d <- e
 97 |   stopifnot(length(d) > 0)
 98 |   h <- d[[1]]
 99 |   if (length(d) > 1) {
100 |     for (i in 2:length(d)) {
101 |       if (ncol(d[[i]]) > 0) {
102 |         h <- cbind.data.frame(h, d[[i]])
103 |       }
104 |     }
105 |   }
106 |   h <- h %>% tbl_df
107 |   return (h)
108 | }
109 | 
110 | read_samples <- function(filename) {
111 |   h <- read.table(filename, sep='\t', head=T) %>% tbl_df
112 |   h$sample <- h$id
113 |   return (h)
114 | }
115 | 
116 | read_samples_select <- function(filename) {
117 |   h <- read.table(filename, sep='\t', head=F) %>% tbl_df
118 |   colnames(h) <- c('id')
119 |   h <- as.vector(h$id)
120 |   return (h)
121 | }
122 | 
123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL,
124 |   sel=NULL) {
125 |   f <- excl
126 |   if (!is.null(f)) {
127 |     d <- d %>% filter(!(sample %in% f))
128 |   }
129 |   f <- methods
130 |   if (!is.null(f) && 'method' %in% colnames(d)) {
131 |     d <- d %>% filter(method %in% f)
132 |   }
133 |   f <- batch2
134 |   if (!is.null(f) && 'batch2' %in% colnames(d)) {
135 |     d <- d %>% filter(batch2 %in% f)
136 |   }
137 |   if (!is.null(sel)) {
138 |     d <- d %>% filter(sample %in% sel)
139 |   }
140 |   d <- droplevels(d)
141 | 
142 |   return (d)
143 | }
144 | ```
145 | 
146 | ```{r}
147 | dat <- list()
148 | dat$samples <- read_samples(opts$samples_file)
149 | dat$samples_select <- read_samples_select(opts$samples_select_file)
150 | dat$samples <- filter_samples(dat$samples,
151 |   excl=opts$samples_excl,
152 |   methods=opts$samples_methods,
153 |   batch2=opts$samples_batch2,
154 |   sel=dat$samples_select)
155 | 
156 | dat$meta <- read_all_meta(opts$rates_files)
157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample))
158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample))
159 | 
160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100)))
161 | stopifnot(all(round(dat$weights) == dat$weights))
162 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
163 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
164 | ```
165 | 
166 | 
167 | 
168 | ## Preprocessing
169 | 
170 | ```{r}
171 | get_cov <- function(d, perc=TRUE) {
172 |   cov <- rowSums(!is.na(d))
173 |   if (perc) {
174 |     cov <- cov / ncol(d)
175 |   }
176 |   return (cov)
177 | }
178 | 
179 | get_var <- function(d) {
180 |   return (apply(d, 1, var, na.rm=T))
181 | }
182 | ```
183 | 
184 | ```{r}
185 | h <- duplicated(dat$meta)
186 | print(sprintf('Remove %d duplicated records.', sum(h)))
187 | dat$meta <- dat$meta[!h, ]
188 | dat$meta$id_ <- 1:nrow(dat$meta)
189 | for (n in c('rates', 'weights')) {
190 |   dat[[n]] <- dat[[n]][!h,]
191 |   dat[[n]]$id_ <- dat$meta$id_
192 | }
193 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
194 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
195 | ```
196 | 
197 | ```{r filter_cov}
198 | d <- dat$rates
199 | print(sprintf('%d records in total.', nrow(d)))
200 | if (!is.na(opts$fil$cov)) {
201 |   d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,]
202 | }
203 | print(sprintf('%d records passed coverage filter.', nrow(d)))
204 | dat$rates <- d
205 | ```
206 | 
207 | ```{r}
208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') {
209 |   t <- data.frame(sample=factor(rownames(pc_vec)),
210 |     pcx=pc_vec[,x], pcy=pc_vec[,y])
211 |   t <- t %>% left_join(dat$samples, by='sample')
212 |   t$sample_short <- sub('^CSCP3_SERUM', '', t$sample)
213 |   p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) +
214 |     geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) +
215 |     xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) +
216 |     theme(legend.position='bottom')
217 |   return (p)
218 | }
219 | 
220 | plot_pca_val <- function(pc_val) {
221 |   t <- data.frame(pc=1:length(pc_val), val=pc_val)
222 |   p <- ggplot(t, aes(x=pc, y=val)) +
223 |     geom_bar(stat='identity', fill='salmon', color='black') +
224 |     xlab('principle component') +
225 |     ylab('% variance explained')
226 |   return (p)
227 | }
228 | ```
229 | 
230 | 
231 | ## PCA unadjusted data
232 | 
233 | ```{r}
234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
235 | ```
236 | 
237 | ```{r fig.width=8, fig.height=7}
238 | p1 <- plot_pca_vec(pc$vec)
239 | p1 + theme(legend.position='top')
240 | ```
241 | 
242 | ```{r fig.width=10, fig.height=5}
243 | p2 <- plot_pca_val(pc$val)
244 | p2
245 | ```
246 | 
247 | ```{r}
248 | d <- dat$rates %>% select(-id_)
249 | cov <- rowSums(!is.na(d)) / ncol(d)
250 | d[h == ncol(d)]
251 | ```
252 | 
253 | ```{r adjust, eval=opts$adjust}
254 | scale_rates <- function(d) {
255 |   h <- d %>% select(-id_)
256 |   a <- min(h, na.rm=T)
257 |   b <- max(h, na.rm=T)
258 |   h <- (h - a) / (b - a) * 100
259 |   h$id_ <- d$id_
260 |   return (h)
261 | }
262 | 
263 | adjust_batch_met <- function(d, what='batch2', scale=F) {
264 |   is_na <- is.na(d)
265 |   h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_)
266 |   h <- h %>% gather(sample, y, -id_) %>%
267 |     inner_join(select_(dat$samples, 'sample', x=what), by='sample')
268 |   h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup
269 |   h <- h %>% select(id_, sample, y=yr)
270 |   h <- h %>% spread(sample, y)
271 |   stopifnot(all(dim(is_na) == dim(h)))
272 |   stopifnot(all(colnames(is_na) %in% colnames(h)))
273 |   h <- h[,colnames(is_na)]
274 |   h[is_na] <- NA
275 |   return (h)
276 | }
277 | 
278 | h <- dat$rates %>% adjust_batch_met
279 | if (opts$scale_adjust) {
280 |   h <- scale_rates(h)
281 | }
282 | dat$rates <- h
283 | ```
284 | 
285 | ## PCA adjusted data
286 | 
287 | ```{r}
288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
289 | ```
290 | 
291 | ```{r fig.width=8, fig.height=7}
292 | p1 <- plot_pca_vec(pc$vec)
293 | p1 + theme(legend.position='top')
294 | ```
295 | 
296 | ```{r fig.width=10, fig.height=5}
297 | p2 <- plot_pca_val(pc$val)
298 | p2
299 | ```
300 | 
301 | ## Filtering
302 | 
303 | ```{r filter_var}
304 | d <- dat$rates
305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples
306 | d <- d[!is.na(h),]
307 | if (!is.na(opts$fil$min_var)) {
308 |   stopifnot(sum(is.na(h)) == 0)
309 |   d <- d[h >= opts$fil$min_var,]
310 | }
311 | print(sprintf('%d records passed min var filter.', nrow(d)))
312 | 
313 | if (!is.na(opts$fil$top_var)) {
314 |   t <- opts$fil$top_var
315 |   if (t <= 1) {
316 |     t <- as.integer(t * nrow(d))
317 |   }
318 |   d <- d[order(get_var(select(d, -id_)), decreasing=T),]
319 |   d <- d[1:t,]
320 | }
321 | print(sprintf('%d final record.', nrow(d)))
322 | dat$rates <- d
323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_))
324 | ```
325 | 
326 | ```{r join}
327 | d <- dat$meta %>% select(chromo, start, end, id_) %>%
328 |   inner_join(dat$rates, by='id_')
329 | stopifnot(nrow(d) == nrow(dat$rates))
330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_))
331 | 
332 | d <- dat$weights %>% gather(sample, weight, -id_)
333 | h <- nrow(dat$df)
334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample'))
335 | 
336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup
337 | 
338 | stopifnot(h == nrow(dat$df))
339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1))
340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0))
341 | ```
342 | 
343 | ```{r}
344 | dat$df <- dat$df %>% ungroup %>% tbl_df
345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df
346 | ```
347 | 
348 | ```{r write}
349 | save_rds <- function(d, name) {
350 |   f <- sprintf('%s_%s.rds', opts$out_base, name)
351 |   saveRDS(d, f)
352 | }
353 | 
354 | save_rds(dat$df, 'met')
355 | save_rds(dat$meta, 'meta')
356 | ```
357 | 
358 | 
359 | 
360 | ```{r eval=opts$plot_heat, fig.height=12}
361 | plot_heat <- function(d, Rowv=T, xlab='value') {
362 |   colors <- rev(brewer.pal(9, 'Spectral'))
363 |   colors <- colorRampPalette(colors)(50)
364 | 
365 |   labRow <- NA
366 |   if (nrow(d) > 500) {
367 |     dendro='column'
368 |   } else {
369 |     dendro = 'both'
370 |   }
371 |   p <- heatmap.2(d, density.info='none', trace='none', col=colors,
372 |     Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA,
373 |     lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab)
374 |   return (p)
375 | }
376 | 
377 | d <- dat$rates
378 | max_rows <- 1000
379 | if (nrow(d) > max_rows) {
380 |   d <- d %>% sample_n(max_rows)
381 | }
382 | d <- d %>% select(-id_) %>% as.matrix
383 | if (nrow(d) > 100) {
384 |   h <- plot_heat(d)
385 | }
386 | ```
387 | 


--------------------------------------------------------------------------------
/data/met/prepro/cgi/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Preprocessing Seqmonk methylation files
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(dplyr)
 15 | library(tidyr)
 16 | library(ggplot2)
 17 | library(RColorBrewer)
 18 | library(gplots)
 19 | library(gridExtra)
 20 | source('../../../../lib/utils.R')
 21 | ```
 22 | 
 23 | ```{r opts, echo=T}
 24 | opts <- list()
 25 | opts$fil$cov <- 30
 26 | opts$fil$min_var <- 10
 27 | opts$fil$top_var <- 0.75
 28 | opts$adjust <- F
 29 | opts$scale_adjust <- T
 30 | opts$samples_excl <- NULL
 31 | opts$samples_methods <- NULL
 32 | opts$samples_batch2 <- NULL
 33 | ```
 34 | 
 35 | ```{r}
 36 | opts$name <- basename(getwd())
 37 | opts$seqmonk_dirs <- c(
 38 |   file.path('../../reports', opts$name))
 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt'))
 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt'))
 41 | opts$out_base <- './data'
 42 | opts$samples_file <- '../../../samples/samples_stats.csv'
 43 | opts$samples_select_file <- '../../../samples/samples.csv'
 44 | opts$cache <- F
 45 | opts$plot_heat <- F
 46 | ```
 47 | 
 48 | **Name**: `r opts$name`
 49 | 
 50 | 
 51 | 
 52 | ```{r read_src}
 53 | read_meta <- function(filename) {
 54 |   h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t')
 55 |   names(h) <- tolower(names(h))
 56 |   h <- h %>% rename(chromo=chromosome)
 57 |   h <- h %>% tbl_df
 58 |   return (h)
 59 | }
 60 | 
 61 | read_meta_quick <- function(filename) {
 62 |   h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t')
 63 |   names(h) <- tolower(names(h))
 64 |   h <- h %>% tbl_df
 65 |   return (h)
 66 | }
 67 | 
 68 | read_all_meta <- function(filenames) {
 69 |   m <- read_meta(filenames[1])
 70 |   if (length(filenames) > 1) {
 71 |     for (i in 2:length(filenames)) {
 72 |       s <- read_meta_quick(filenames[i])
 73 |       stopifnot(all(m$start == s$start))
 74 |     }
 75 |   }
 76 |   return (m)
 77 | }
 78 | 
 79 | read_values <- function(filename, samples=NULL) {
 80 |   h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t')
 81 |   if (!is.null(samples)) {
 82 |     h <- subset(h, select=intersect(colnames(h), samples))
 83 |   }
 84 |   h <- h %>% tbl_df
 85 |   return (h)
 86 | }
 87 | 
 88 | read_all_values <- function(filenames, samples=NULL) {
 89 |   d <- lapply(filenames, function(x) read_values(x, samples=samples))
 90 |   e <- list()
 91 |   for (dd in d) {
 92 |     if (ncol(dd) > 0) {
 93 |       e[[length(e) + 1]] <- dd
 94 |     }
 95 |   }
 96 |   d <- e
 97 |   stopifnot(length(d) > 0)
 98 |   h <- d[[1]]
 99 |   if (length(d) > 1) {
100 |     for (i in 2:length(d)) {
101 |       if (ncol(d[[i]]) > 0) {
102 |         h <- cbind.data.frame(h, d[[i]])
103 |       }
104 |     }
105 |   }
106 |   h <- h %>% tbl_df
107 |   return (h)
108 | }
109 | 
110 | read_samples <- function(filename) {
111 |   h <- read.table(filename, sep='\t', head=T) %>% tbl_df
112 |   h$sample <- h$id
113 |   return (h)
114 | }
115 | 
116 | read_samples_select <- function(filename) {
117 |   h <- read.table(filename, sep='\t', head=F) %>% tbl_df
118 |   colnames(h) <- c('id')
119 |   h <- as.vector(h$id)
120 |   return (h)
121 | }
122 | 
123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL,
124 |   sel=NULL) {
125 |   f <- excl
126 |   if (!is.null(f)) {
127 |     d <- d %>% filter(!(sample %in% f))
128 |   }
129 |   f <- methods
130 |   if (!is.null(f) && 'method' %in% colnames(d)) {
131 |     d <- d %>% filter(method %in% f)
132 |   }
133 |   f <- batch2
134 |   if (!is.null(f) && 'batch2' %in% colnames(d)) {
135 |     d <- d %>% filter(batch2 %in% f)
136 |   }
137 |   if (!is.null(sel)) {
138 |     d <- d %>% filter(sample %in% sel)
139 |   }
140 |   d <- droplevels(d)
141 | 
142 |   return (d)
143 | }
144 | ```
145 | 
146 | ```{r}
147 | dat <- list()
148 | dat$samples <- read_samples(opts$samples_file)
149 | dat$samples_select <- read_samples_select(opts$samples_select_file)
150 | dat$samples <- filter_samples(dat$samples,
151 |   excl=opts$samples_excl,
152 |   methods=opts$samples_methods,
153 |   batch2=opts$samples_batch2,
154 |   sel=dat$samples_select)
155 | 
156 | dat$meta <- read_all_meta(opts$rates_files)
157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample))
158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample))
159 | 
160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100)))
161 | stopifnot(all(round(dat$weights) == dat$weights))
162 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
163 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
164 | ```
165 | 
166 | 
167 | 
168 | ## Preprocessing
169 | 
170 | ```{r}
171 | get_cov <- function(d, perc=TRUE) {
172 |   cov <- rowSums(!is.na(d))
173 |   if (perc) {
174 |     cov <- cov / ncol(d)
175 |   }
176 |   return (cov)
177 | }
178 | 
179 | get_var <- function(d) {
180 |   return (apply(d, 1, var, na.rm=T))
181 | }
182 | ```
183 | 
184 | ```{r}
185 | h <- duplicated(dat$meta)
186 | print(sprintf('Remove %d duplicated records.', sum(h)))
187 | dat$meta <- dat$meta[!h, ]
188 | dat$meta$id_ <- 1:nrow(dat$meta)
189 | for (n in c('rates', 'weights')) {
190 |   dat[[n]] <- dat[[n]][!h,]
191 |   dat[[n]]$id_ <- dat$meta$id_
192 | }
193 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
194 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
195 | ```
196 | 
197 | ```{r filter_cov}
198 | d <- dat$rates
199 | print(sprintf('%d records in total.', nrow(d)))
200 | if (!is.na(opts$fil$cov)) {
201 |   d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,]
202 | }
203 | print(sprintf('%d records passed coverage filter.', nrow(d)))
204 | dat$rates <- d
205 | ```
206 | 
207 | ```{r}
208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') {
209 |   t <- data.frame(sample=factor(rownames(pc_vec)),
210 |     pcx=pc_vec[,x], pcy=pc_vec[,y])
211 |   t <- t %>% left_join(dat$samples, by='sample')
212 |   t$sample_short <- sub('^CSCP3_SERUM', '', t$sample)
213 |   p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) +
214 |     geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) +
215 |     xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) +
216 |     theme(legend.position='bottom')
217 |   return (p)
218 | }
219 | 
220 | plot_pca_val <- function(pc_val) {
221 |   t <- data.frame(pc=1:length(pc_val), val=pc_val)
222 |   p <- ggplot(t, aes(x=pc, y=val)) +
223 |     geom_bar(stat='identity', fill='salmon', color='black') +
224 |     xlab('principle component') +
225 |     ylab('% variance explained')
226 |   return (p)
227 | }
228 | ```
229 | 
230 | 
231 | ## PCA unadjusted data
232 | 
233 | ```{r}
234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
235 | ```
236 | 
237 | ```{r fig.width=8, fig.height=7}
238 | p1 <- plot_pca_vec(pc$vec)
239 | p1 + theme(legend.position='top')
240 | ```
241 | 
242 | ```{r fig.width=10, fig.height=5}
243 | p2 <- plot_pca_val(pc$val)
244 | p2
245 | ```
246 | 
247 | ```{r}
248 | d <- dat$rates %>% select(-id_)
249 | cov <- rowSums(!is.na(d)) / ncol(d)
250 | d[h == ncol(d)]
251 | ```
252 | 
253 | ```{r adjust, eval=opts$adjust}
254 | scale_rates <- function(d) {
255 |   h <- d %>% select(-id_)
256 |   a <- min(h, na.rm=T)
257 |   b <- max(h, na.rm=T)
258 |   h <- (h - a) / (b - a) * 100
259 |   h$id_ <- d$id_
260 |   return (h)
261 | }
262 | 
263 | adjust_batch_met <- function(d, what='batch2', scale=F) {
264 |   is_na <- is.na(d)
265 |   h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_)
266 |   h <- h %>% gather(sample, y, -id_) %>%
267 |     inner_join(select_(dat$samples, 'sample', x=what), by='sample')
268 |   h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup
269 |   h <- h %>% select(id_, sample, y=yr)
270 |   h <- h %>% spread(sample, y)
271 |   stopifnot(all(dim(is_na) == dim(h)))
272 |   stopifnot(all(colnames(is_na) %in% colnames(h)))
273 |   h <- h[,colnames(is_na)]
274 |   h[is_na] <- NA
275 |   return (h)
276 | }
277 | 
278 | h <- dat$rates %>% adjust_batch_met
279 | if (opts$scale_adjust) {
280 |   h <- scale_rates(h)
281 | }
282 | dat$rates <- h
283 | ```
284 | 
285 | ## PCA adjusted data
286 | 
287 | ```{r}
288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
289 | ```
290 | 
291 | ```{r fig.width=8, fig.height=7}
292 | p1 <- plot_pca_vec(pc$vec)
293 | p1 + theme(legend.position='top')
294 | ```
295 | 
296 | ```{r fig.width=10, fig.height=5}
297 | p2 <- plot_pca_val(pc$val)
298 | p2
299 | ```
300 | 
301 | ## Filtering
302 | 
303 | ```{r filter_var}
304 | d <- dat$rates
305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples
306 | d <- d[!is.na(h),]
307 | if (!is.na(opts$fil$min_var)) {
308 |   stopifnot(sum(is.na(h)) == 0)
309 |   d <- d[h >= opts$fil$min_var,]
310 | }
311 | print(sprintf('%d records passed min var filter.', nrow(d)))
312 | 
313 | if (!is.na(opts$fil$top_var)) {
314 |   t <- opts$fil$top_var
315 |   if (t <= 1) {
316 |     t <- as.integer(t * nrow(d))
317 |   }
318 |   d <- d[order(get_var(select(d, -id_)), decreasing=T),]
319 |   d <- d[1:t,]
320 | }
321 | print(sprintf('%d final record.', nrow(d)))
322 | dat$rates <- d
323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_))
324 | ```
325 | 
326 | ```{r join}
327 | d <- dat$meta %>% select(chromo, start, end, id_) %>%
328 |   inner_join(dat$rates, by='id_')
329 | stopifnot(nrow(d) == nrow(dat$rates))
330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_))
331 | 
332 | d <- dat$weights %>% gather(sample, weight, -id_)
333 | h <- nrow(dat$df)
334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample'))
335 | 
336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup
337 | 
338 | stopifnot(h == nrow(dat$df))
339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1))
340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0))
341 | ```
342 | 
343 | ```{r}
344 | dat$df <- dat$df %>% ungroup %>% tbl_df
345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df
346 | ```
347 | 
348 | ```{r write}
349 | save_rds <- function(d, name) {
350 |   f <- sprintf('%s_%s.rds', opts$out_base, name)
351 |   saveRDS(d, f)
352 | }
353 | 
354 | save_rds(dat$df, 'met')
355 | save_rds(dat$meta, 'meta')
356 | ```
357 | 
358 | 
359 | 
360 | ```{r eval=opts$plot_heat, fig.height=12}
361 | plot_heat <- function(d, Rowv=T, xlab='value') {
362 |   colors <- rev(brewer.pal(9, 'Spectral'))
363 |   colors <- colorRampPalette(colors)(50)
364 | 
365 |   labRow <- NA
366 |   if (nrow(d) > 500) {
367 |     dendro='column'
368 |   } else {
369 |     dendro = 'both'
370 |   }
371 |   p <- heatmap.2(d, density.info='none', trace='none', col=colors,
372 |     Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA,
373 |     lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab)
374 |   return (p)
375 | }
376 | 
377 | d <- dat$rates
378 | max_rows <- 1000
379 | if (nrow(d) > max_rows) {
380 |   d <- d %>% sample_n(max_rows)
381 | }
382 | d <- d %>% select(-id_) %>% as.matrix
383 | if (nrow(d) > 100) {
384 |   h <- plot_heat(d)
385 | }
386 | ```
387 | 


--------------------------------------------------------------------------------
/data/met/prepro/exon/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Preprocessing Seqmonk methylation files
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(dplyr)
 15 | library(tidyr)
 16 | library(ggplot2)
 17 | library(RColorBrewer)
 18 | library(gplots)
 19 | library(gridExtra)
 20 | source('../../../../lib/utils.R')
 21 | ```
 22 | 
 23 | ```{r opts, echo=T}
 24 | opts <- list()
 25 | opts$fil$cov <- 30
 26 | opts$fil$min_var <- 10
 27 | opts$fil$top_var <- 0.75
 28 | opts$adjust <- F
 29 | opts$scale_adjust <- T
 30 | opts$samples_excl <- NULL
 31 | opts$samples_methods <- NULL
 32 | opts$samples_batch2 <- NULL
 33 | ```
 34 | 
 35 | ```{r}
 36 | opts$name <- basename(getwd())
 37 | opts$seqmonk_dirs <- c(
 38 |   file.path('../../reports', opts$name))
 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt'))
 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt'))
 41 | opts$out_base <- './data'
 42 | opts$samples_file <- '../../../samples/samples_stats.csv'
 43 | opts$samples_select_file <- '../../../samples/samples.csv'
 44 | opts$cache <- F
 45 | opts$plot_heat <- F
 46 | ```
 47 | 
 48 | **Name**: `r opts$name`
 49 | 
 50 | 
 51 | 
 52 | ```{r read_src}
 53 | read_meta <- function(filename) {
 54 |   h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t')
 55 |   names(h) <- tolower(names(h))
 56 |   h <- h %>% rename(chromo=chromosome)
 57 |   h <- h %>% tbl_df
 58 |   return (h)
 59 | }
 60 | 
 61 | read_meta_quick <- function(filename) {
 62 |   h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t')
 63 |   names(h) <- tolower(names(h))
 64 |   h <- h %>% tbl_df
 65 |   return (h)
 66 | }
 67 | 
 68 | read_all_meta <- function(filenames) {
 69 |   m <- read_meta(filenames[1])
 70 |   if (length(filenames) > 1) {
 71 |     for (i in 2:length(filenames)) {
 72 |       s <- read_meta_quick(filenames[i])
 73 |       stopifnot(all(m$start == s$start))
 74 |     }
 75 |   }
 76 |   return (m)
 77 | }
 78 | 
 79 | read_values <- function(filename, samples=NULL) {
 80 |   h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t')
 81 |   if (!is.null(samples)) {
 82 |     h <- subset(h, select=intersect(colnames(h), samples))
 83 |   }
 84 |   h <- h %>% tbl_df
 85 |   return (h)
 86 | }
 87 | 
 88 | read_all_values <- function(filenames, samples=NULL) {
 89 |   d <- lapply(filenames, function(x) read_values(x, samples=samples))
 90 |   e <- list()
 91 |   for (dd in d) {
 92 |     if (ncol(dd) > 0) {
 93 |       e[[length(e) + 1]] <- dd
 94 |     }
 95 |   }
 96 |   d <- e
 97 |   stopifnot(length(d) > 0)
 98 |   h <- d[[1]]
 99 |   if (length(d) > 1) {
100 |     for (i in 2:length(d)) {
101 |       if (ncol(d[[i]]) > 0) {
102 |         h <- cbind.data.frame(h, d[[i]])
103 |       }
104 |     }
105 |   }
106 |   h <- h %>% tbl_df
107 |   return (h)
108 | }
109 | 
110 | read_samples <- function(filename) {
111 |   h <- read.table(filename, sep='\t', head=T) %>% tbl_df
112 |   h$sample <- h$id
113 |   return (h)
114 | }
115 | 
116 | read_samples_select <- function(filename) {
117 |   h <- read.table(filename, sep='\t', head=F) %>% tbl_df
118 |   colnames(h) <- c('id')
119 |   h <- as.vector(h$id)
120 |   return (h)
121 | }
122 | 
123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL,
124 |   sel=NULL) {
125 |   f <- excl
126 |   if (!is.null(f)) {
127 |     d <- d %>% filter(!(sample %in% f))
128 |   }
129 |   f <- methods
130 |   if (!is.null(f) && 'method' %in% colnames(d)) {
131 |     d <- d %>% filter(method %in% f)
132 |   }
133 |   f <- batch2
134 |   if (!is.null(f) && 'batch2' %in% colnames(d)) {
135 |     d <- d %>% filter(batch2 %in% f)
136 |   }
137 |   if (!is.null(sel)) {
138 |     d <- d %>% filter(sample %in% sel)
139 |   }
140 |   d <- droplevels(d)
141 | 
142 |   return (d)
143 | }
144 | ```
145 | 
146 | ```{r}
147 | dat <- list()
148 | dat$samples <- read_samples(opts$samples_file)
149 | dat$samples_select <- read_samples_select(opts$samples_select_file)
150 | dat$samples <- filter_samples(dat$samples,
151 |   excl=opts$samples_excl,
152 |   methods=opts$samples_methods,
153 |   batch2=opts$samples_batch2,
154 |   sel=dat$samples_select)
155 | 
156 | dat$meta <- read_all_meta(opts$rates_files)
157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample))
158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample))
159 | 
160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100)))
161 | stopifnot(all(round(dat$weights) == dat$weights))
162 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
163 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
164 | ```
165 | 
166 | 
167 | 
168 | ## Preprocessing
169 | 
170 | ```{r}
171 | get_cov <- function(d, perc=TRUE) {
172 |   cov <- rowSums(!is.na(d))
173 |   if (perc) {
174 |     cov <- cov / ncol(d)
175 |   }
176 |   return (cov)
177 | }
178 | 
179 | get_var <- function(d) {
180 |   return (apply(d, 1, var, na.rm=T))
181 | }
182 | ```
183 | 
184 | ```{r}
185 | h <- duplicated(dat$meta)
186 | print(sprintf('Remove %d duplicated records.', sum(h)))
187 | dat$meta <- dat$meta[!h, ]
188 | dat$meta$id_ <- 1:nrow(dat$meta)
189 | for (n in c('rates', 'weights')) {
190 |   dat[[n]] <- dat[[n]][!h,]
191 |   dat[[n]]$id_ <- dat$meta$id_
192 | }
193 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
194 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
195 | ```
196 | 
197 | ```{r filter_cov}
198 | d <- dat$rates
199 | print(sprintf('%d records in total.', nrow(d)))
200 | if (!is.na(opts$fil$cov)) {
201 |   d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,]
202 | }
203 | print(sprintf('%d records passed coverage filter.', nrow(d)))
204 | dat$rates <- d
205 | ```
206 | 
207 | ```{r}
208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') {
209 |   t <- data.frame(sample=factor(rownames(pc_vec)),
210 |     pcx=pc_vec[,x], pcy=pc_vec[,y])
211 |   t <- t %>% left_join(dat$samples, by='sample')
212 |   t$sample_short <- sub('^CSCP3_SERUM', '', t$sample)
213 |   p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) +
214 |     geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) +
215 |     xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) +
216 |     theme(legend.position='bottom')
217 |   return (p)
218 | }
219 | 
220 | plot_pca_val <- function(pc_val) {
221 |   t <- data.frame(pc=1:length(pc_val), val=pc_val)
222 |   p <- ggplot(t, aes(x=pc, y=val)) +
223 |     geom_bar(stat='identity', fill='salmon', color='black') +
224 |     xlab('principle component') +
225 |     ylab('% variance explained')
226 |   return (p)
227 | }
228 | ```
229 | 
230 | 
231 | ## PCA unadjusted data
232 | 
233 | ```{r}
234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
235 | ```
236 | 
237 | ```{r fig.width=8, fig.height=7}
238 | p1 <- plot_pca_vec(pc$vec)
239 | p1 + theme(legend.position='top')
240 | ```
241 | 
242 | ```{r fig.width=10, fig.height=5}
243 | p2 <- plot_pca_val(pc$val)
244 | p2
245 | ```
246 | 
247 | ```{r}
248 | d <- dat$rates %>% select(-id_)
249 | cov <- rowSums(!is.na(d)) / ncol(d)
250 | d[h == ncol(d)]
251 | ```
252 | 
253 | ```{r adjust, eval=opts$adjust}
254 | scale_rates <- function(d) {
255 |   h <- d %>% select(-id_)
256 |   a <- min(h, na.rm=T)
257 |   b <- max(h, na.rm=T)
258 |   h <- (h - a) / (b - a) * 100
259 |   h$id_ <- d$id_
260 |   return (h)
261 | }
262 | 
263 | adjust_batch_met <- function(d, what='batch2', scale=F) {
264 |   is_na <- is.na(d)
265 |   h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_)
266 |   h <- h %>% gather(sample, y, -id_) %>%
267 |     inner_join(select_(dat$samples, 'sample', x=what), by='sample')
268 |   h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup
269 |   h <- h %>% select(id_, sample, y=yr)
270 |   h <- h %>% spread(sample, y)
271 |   stopifnot(all(dim(is_na) == dim(h)))
272 |   stopifnot(all(colnames(is_na) %in% colnames(h)))
273 |   h <- h[,colnames(is_na)]
274 |   h[is_na] <- NA
275 |   return (h)
276 | }
277 | 
278 | h <- dat$rates %>% adjust_batch_met
279 | if (opts$scale_adjust) {
280 |   h <- scale_rates(h)
281 | }
282 | dat$rates <- h
283 | ```
284 | 
285 | ## PCA adjusted data
286 | 
287 | ```{r}
288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
289 | ```
290 | 
291 | ```{r fig.width=8, fig.height=7}
292 | p1 <- plot_pca_vec(pc$vec)
293 | p1 + theme(legend.position='top')
294 | ```
295 | 
296 | ```{r fig.width=10, fig.height=5}
297 | p2 <- plot_pca_val(pc$val)
298 | p2
299 | ```
300 | 
301 | ## Filtering
302 | 
303 | ```{r filter_var}
304 | d <- dat$rates
305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples
306 | d <- d[!is.na(h),]
307 | if (!is.na(opts$fil$min_var)) {
308 |   stopifnot(sum(is.na(h)) == 0)
309 |   d <- d[h >= opts$fil$min_var,]
310 | }
311 | print(sprintf('%d records passed min var filter.', nrow(d)))
312 | 
313 | if (!is.na(opts$fil$top_var)) {
314 |   t <- opts$fil$top_var
315 |   if (t <= 1) {
316 |     t <- as.integer(t * nrow(d))
317 |   }
318 |   d <- d[order(get_var(select(d, -id_)), decreasing=T),]
319 |   d <- d[1:t,]
320 | }
321 | print(sprintf('%d final record.', nrow(d)))
322 | dat$rates <- d
323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_))
324 | ```
325 | 
326 | ```{r join}
327 | d <- dat$meta %>% select(chromo, start, end, id_) %>%
328 |   inner_join(dat$rates, by='id_')
329 | stopifnot(nrow(d) == nrow(dat$rates))
330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_))
331 | 
332 | d <- dat$weights %>% gather(sample, weight, -id_)
333 | h <- nrow(dat$df)
334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample'))
335 | 
336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup
337 | 
338 | stopifnot(h == nrow(dat$df))
339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1))
340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0))
341 | ```
342 | 
343 | ```{r}
344 | dat$df <- dat$df %>% ungroup %>% tbl_df
345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df
346 | ```
347 | 
348 | ```{r write}
349 | save_rds <- function(d, name) {
350 |   f <- sprintf('%s_%s.rds', opts$out_base, name)
351 |   saveRDS(d, f)
352 | }
353 | 
354 | save_rds(dat$df, 'met')
355 | save_rds(dat$meta, 'meta')
356 | ```
357 | 
358 | 
359 | 
360 | ```{r eval=opts$plot_heat, fig.height=12}
361 | plot_heat <- function(d, Rowv=T, xlab='value') {
362 |   colors <- rev(brewer.pal(9, 'Spectral'))
363 |   colors <- colorRampPalette(colors)(50)
364 | 
365 |   labRow <- NA
366 |   if (nrow(d) > 500) {
367 |     dendro='column'
368 |   } else {
369 |     dendro = 'both'
370 |   }
371 |   p <- heatmap.2(d, density.info='none', trace='none', col=colors,
372 |     Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA,
373 |     lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab)
374 |   return (p)
375 | }
376 | 
377 | d <- dat$rates
378 | max_rows <- 1000
379 | if (nrow(d) > max_rows) {
380 |   d <- d %>% sample_n(max_rows)
381 | }
382 | d <- d %>% select(-id_) %>% as.matrix
383 | if (nrow(d) > 100) {
384 |   h <- plot_heat(d)
385 | }
386 | ```
387 | 


--------------------------------------------------------------------------------
/data/met/prepro/gene_body/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Preprocessing Seqmonk methylation files
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(dplyr)
 15 | library(tidyr)
 16 | library(ggplot2)
 17 | library(RColorBrewer)
 18 | library(gplots)
 19 | library(gridExtra)
 20 | source('../../../../lib/utils.R')
 21 | ```
 22 | 
 23 | ```{r opts, echo=T}
 24 | opts <- list()
 25 | opts$fil$cov <- 30
 26 | opts$fil$min_var <- 10
 27 | opts$fil$top_var <- 0.75
 28 | opts$adjust <- F
 29 | opts$scale_adjust <- T
 30 | opts$samples_excl <- NULL
 31 | opts$samples_methods <- NULL
 32 | opts$samples_batch2 <- NULL
 33 | ```
 34 | 
 35 | ```{r}
 36 | opts$name <- basename(getwd())
 37 | opts$seqmonk_dirs <- c(
 38 |   file.path('../../reports', opts$name))
 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt'))
 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt'))
 41 | opts$out_base <- './data'
 42 | opts$samples_file <- '../../../samples/samples_stats.csv'
 43 | opts$samples_select_file <- '../../../samples/samples.csv'
 44 | opts$cache <- F
 45 | opts$plot_heat <- F
 46 | ```
 47 | 
 48 | **Name**: `r opts$name`
 49 | 
 50 | 
 51 | 
 52 | ```{r read_src}
 53 | read_meta <- function(filename) {
 54 |   h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t')
 55 |   names(h) <- tolower(names(h))
 56 |   h <- h %>% rename(chromo=chromosome)
 57 |   h <- h %>% tbl_df
 58 |   return (h)
 59 | }
 60 | 
 61 | read_meta_quick <- function(filename) {
 62 |   h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t')
 63 |   names(h) <- tolower(names(h))
 64 |   h <- h %>% tbl_df
 65 |   return (h)
 66 | }
 67 | 
 68 | read_all_meta <- function(filenames) {
 69 |   m <- read_meta(filenames[1])
 70 |   if (length(filenames) > 1) {
 71 |     for (i in 2:length(filenames)) {
 72 |       s <- read_meta_quick(filenames[i])
 73 |       stopifnot(all(m$start == s$start))
 74 |     }
 75 |   }
 76 |   return (m)
 77 | }
 78 | 
 79 | read_values <- function(filename, samples=NULL) {
 80 |   h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t')
 81 |   if (!is.null(samples)) {
 82 |     h <- subset(h, select=intersect(colnames(h), samples))
 83 |   }
 84 |   h <- h %>% tbl_df
 85 |   return (h)
 86 | }
 87 | 
 88 | read_all_values <- function(filenames, samples=NULL) {
 89 |   d <- lapply(filenames, function(x) read_values(x, samples=samples))
 90 |   e <- list()
 91 |   for (dd in d) {
 92 |     if (ncol(dd) > 0) {
 93 |       e[[length(e) + 1]] <- dd
 94 |     }
 95 |   }
 96 |   d <- e
 97 |   stopifnot(length(d) > 0)
 98 |   h <- d[[1]]
 99 |   if (length(d) > 1) {
100 |     for (i in 2:length(d)) {
101 |       if (ncol(d[[i]]) > 0) {
102 |         h <- cbind.data.frame(h, d[[i]])
103 |       }
104 |     }
105 |   }
106 |   h <- h %>% tbl_df
107 |   return (h)
108 | }
109 | 
110 | read_samples <- function(filename) {
111 |   h <- read.table(filename, sep='\t', head=T) %>% tbl_df
112 |   h$sample <- h$id
113 |   return (h)
114 | }
115 | 
116 | read_samples_select <- function(filename) {
117 |   h <- read.table(filename, sep='\t', head=F) %>% tbl_df
118 |   colnames(h) <- c('id')
119 |   h <- as.vector(h$id)
120 |   return (h)
121 | }
122 | 
123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL,
124 |   sel=NULL) {
125 |   f <- excl
126 |   if (!is.null(f)) {
127 |     d <- d %>% filter(!(sample %in% f))
128 |   }
129 |   f <- methods
130 |   if (!is.null(f) && 'method' %in% colnames(d)) {
131 |     d <- d %>% filter(method %in% f)
132 |   }
133 |   f <- batch2
134 |   if (!is.null(f) && 'batch2' %in% colnames(d)) {
135 |     d <- d %>% filter(batch2 %in% f)
136 |   }
137 |   if (!is.null(sel)) {
138 |     d <- d %>% filter(sample %in% sel)
139 |   }
140 |   d <- droplevels(d)
141 | 
142 |   return (d)
143 | }
144 | ```
145 | 
146 | ```{r}
147 | dat <- list()
148 | dat$samples <- read_samples(opts$samples_file)
149 | dat$samples_select <- read_samples_select(opts$samples_select_file)
150 | dat$samples <- filter_samples(dat$samples,
151 |   excl=opts$samples_excl,
152 |   methods=opts$samples_methods,
153 |   batch2=opts$samples_batch2,
154 |   sel=dat$samples_select)
155 | 
156 | dat$meta <- read_all_meta(opts$rates_files)
157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample))
158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample))
159 | 
160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100)))
161 | stopifnot(all(round(dat$weights) == dat$weights))
162 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
163 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
164 | ```
165 | 
166 | 
167 | 
168 | ## Preprocessing
169 | 
170 | ```{r}
171 | get_cov <- function(d, perc=TRUE) {
172 |   cov <- rowSums(!is.na(d))
173 |   if (perc) {
174 |     cov <- cov / ncol(d)
175 |   }
176 |   return (cov)
177 | }
178 | 
179 | get_var <- function(d) {
180 |   return (apply(d, 1, var, na.rm=T))
181 | }
182 | ```
183 | 
184 | ```{r}
185 | h <- duplicated(dat$meta)
186 | print(sprintf('Remove %d duplicated records.', sum(h)))
187 | dat$meta <- dat$meta[!h, ]
188 | dat$meta$id_ <- 1:nrow(dat$meta)
189 | for (n in c('rates', 'weights')) {
190 |   dat[[n]] <- dat[[n]][!h,]
191 |   dat[[n]]$id_ <- dat$meta$id_
192 | }
193 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
194 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
195 | ```
196 | 
197 | ```{r filter_cov}
198 | d <- dat$rates
199 | print(sprintf('%d records in total.', nrow(d)))
200 | if (!is.na(opts$fil$cov)) {
201 |   d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,]
202 | }
203 | print(sprintf('%d records passed coverage filter.', nrow(d)))
204 | dat$rates <- d
205 | ```
206 | 
207 | ```{r}
208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') {
209 |   t <- data.frame(sample=factor(rownames(pc_vec)),
210 |     pcx=pc_vec[,x], pcy=pc_vec[,y])
211 |   t <- t %>% left_join(dat$samples, by='sample')
212 |   t$sample_short <- sub('^CSCP3_SERUM', '', t$sample)
213 |   p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) +
214 |     geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) +
215 |     xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) +
216 |     theme(legend.position='bottom')
217 |   return (p)
218 | }
219 | 
220 | plot_pca_val <- function(pc_val) {
221 |   t <- data.frame(pc=1:length(pc_val), val=pc_val)
222 |   p <- ggplot(t, aes(x=pc, y=val)) +
223 |     geom_bar(stat='identity', fill='salmon', color='black') +
224 |     xlab('principle component') +
225 |     ylab('% variance explained')
226 |   return (p)
227 | }
228 | ```
229 | 
230 | 
231 | ## PCA unadjusted data
232 | 
233 | ```{r}
234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
235 | ```
236 | 
237 | ```{r fig.width=8, fig.height=7}
238 | p1 <- plot_pca_vec(pc$vec)
239 | p1 + theme(legend.position='top')
240 | ```
241 | 
242 | ```{r fig.width=10, fig.height=5}
243 | p2 <- plot_pca_val(pc$val)
244 | p2
245 | ```
246 | 
247 | ```{r}
248 | d <- dat$rates %>% select(-id_)
249 | cov <- rowSums(!is.na(d)) / ncol(d)
250 | d[h == ncol(d)]
251 | ```
252 | 
253 | ```{r adjust, eval=opts$adjust}
254 | scale_rates <- function(d) {
255 |   h <- d %>% select(-id_)
256 |   a <- min(h, na.rm=T)
257 |   b <- max(h, na.rm=T)
258 |   h <- (h - a) / (b - a) * 100
259 |   h$id_ <- d$id_
260 |   return (h)
261 | }
262 | 
263 | adjust_batch_met <- function(d, what='batch2', scale=F) {
264 |   is_na <- is.na(d)
265 |   h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_)
266 |   h <- h %>% gather(sample, y, -id_) %>%
267 |     inner_join(select_(dat$samples, 'sample', x=what), by='sample')
268 |   h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup
269 |   h <- h %>% select(id_, sample, y=yr)
270 |   h <- h %>% spread(sample, y)
271 |   stopifnot(all(dim(is_na) == dim(h)))
272 |   stopifnot(all(colnames(is_na) %in% colnames(h)))
273 |   h <- h[,colnames(is_na)]
274 |   h[is_na] <- NA
275 |   return (h)
276 | }
277 | 
278 | h <- dat$rates %>% adjust_batch_met
279 | if (opts$scale_adjust) {
280 |   h <- scale_rates(h)
281 | }
282 | dat$rates <- h
283 | ```
284 | 
285 | ## PCA adjusted data
286 | 
287 | ```{r}
288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
289 | ```
290 | 
291 | ```{r fig.width=8, fig.height=7}
292 | p1 <- plot_pca_vec(pc$vec)
293 | p1 + theme(legend.position='top')
294 | ```
295 | 
296 | ```{r fig.width=10, fig.height=5}
297 | p2 <- plot_pca_val(pc$val)
298 | p2
299 | ```
300 | 
301 | ## Filtering
302 | 
303 | ```{r filter_var}
304 | d <- dat$rates
305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples
306 | d <- d[!is.na(h),]
307 | if (!is.na(opts$fil$min_var)) {
308 |   stopifnot(sum(is.na(h)) == 0)
309 |   d <- d[h >= opts$fil$min_var,]
310 | }
311 | print(sprintf('%d records passed min var filter.', nrow(d)))
312 | 
313 | if (!is.na(opts$fil$top_var)) {
314 |   t <- opts$fil$top_var
315 |   if (t <= 1) {
316 |     t <- as.integer(t * nrow(d))
317 |   }
318 |   d <- d[order(get_var(select(d, -id_)), decreasing=T),]
319 |   d <- d[1:t,]
320 | }
321 | print(sprintf('%d final record.', nrow(d)))
322 | dat$rates <- d
323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_))
324 | ```
325 | 
326 | ```{r join}
327 | d <- dat$meta %>% select(chromo, start, end, id_) %>%
328 |   inner_join(dat$rates, by='id_')
329 | stopifnot(nrow(d) == nrow(dat$rates))
330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_))
331 | 
332 | d <- dat$weights %>% gather(sample, weight, -id_)
333 | h <- nrow(dat$df)
334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample'))
335 | 
336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup
337 | 
338 | stopifnot(h == nrow(dat$df))
339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1))
340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0))
341 | ```
342 | 
343 | ```{r}
344 | dat$df <- dat$df %>% ungroup %>% tbl_df
345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df
346 | ```
347 | 
348 | ```{r write}
349 | save_rds <- function(d, name) {
350 |   f <- sprintf('%s_%s.rds', opts$out_base, name)
351 |   saveRDS(d, f)
352 | }
353 | 
354 | save_rds(dat$df, 'met')
355 | save_rds(dat$meta, 'meta')
356 | ```
357 | 
358 | 
359 | 
360 | ```{r eval=opts$plot_heat, fig.height=12}
361 | plot_heat <- function(d, Rowv=T, xlab='value') {
362 |   colors <- rev(brewer.pal(9, 'Spectral'))
363 |   colors <- colorRampPalette(colors)(50)
364 | 
365 |   labRow <- NA
366 |   if (nrow(d) > 500) {
367 |     dendro='column'
368 |   } else {
369 |     dendro = 'both'
370 |   }
371 |   p <- heatmap.2(d, density.info='none', trace='none', col=colors,
372 |     Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA,
373 |     lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab)
374 |   return (p)
375 | }
376 | 
377 | d <- dat$rates
378 | max_rows <- 1000
379 | if (nrow(d) > max_rows) {
380 |   d <- d %>% sample_n(max_rows)
381 | }
382 | d <- d %>% select(-id_) %>% as.matrix
383 | if (nrow(d) > 100) {
384 |   h <- plot_heat(d)
385 | }
386 | ```
387 | 


--------------------------------------------------------------------------------
/data/met/prepro/p300/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Preprocessing Seqmonk methylation files
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(dplyr)
 15 | library(tidyr)
 16 | library(ggplot2)
 17 | library(RColorBrewer)
 18 | library(gplots)
 19 | library(gridExtra)
 20 | source('../../../../lib/utils.R')
 21 | ```
 22 | 
 23 | ```{r opts, echo=T}
 24 | opts <- list()
 25 | opts$fil$cov <- 30
 26 | opts$fil$min_var <- 10
 27 | opts$fil$top_var <- 0.75
 28 | opts$adjust <- F
 29 | opts$scale_adjust <- T
 30 | opts$samples_excl <- NULL
 31 | opts$samples_methods <- NULL
 32 | opts$samples_batch2 <- NULL
 33 | ```
 34 | 
 35 | ```{r}
 36 | opts$name <- basename(getwd())
 37 | opts$seqmonk_dirs <- c(
 38 |   file.path('../../reports', opts$name))
 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt'))
 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt'))
 41 | opts$out_base <- './data'
 42 | opts$samples_file <- '../../../samples/samples_stats.csv'
 43 | opts$samples_select_file <- '../../../samples/samples.csv'
 44 | opts$cache <- F
 45 | opts$plot_heat <- F
 46 | ```
 47 | 
 48 | **Name**: `r opts$name`
 49 | 
 50 | 
 51 | 
 52 | ```{r read_src}
 53 | read_meta <- function(filename) {
 54 |   h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t')
 55 |   names(h) <- tolower(names(h))
 56 |   h <- h %>% rename(chromo=chromosome)
 57 |   h <- h %>% tbl_df
 58 |   return (h)
 59 | }
 60 | 
 61 | read_meta_quick <- function(filename) {
 62 |   h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t')
 63 |   names(h) <- tolower(names(h))
 64 |   h <- h %>% tbl_df
 65 |   return (h)
 66 | }
 67 | 
 68 | read_all_meta <- function(filenames) {
 69 |   m <- read_meta(filenames[1])
 70 |   if (length(filenames) > 1) {
 71 |     for (i in 2:length(filenames)) {
 72 |       s <- read_meta_quick(filenames[i])
 73 |       stopifnot(all(m$start == s$start))
 74 |     }
 75 |   }
 76 |   return (m)
 77 | }
 78 | 
 79 | read_values <- function(filename, samples=NULL) {
 80 |   h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t')
 81 |   if (!is.null(samples)) {
 82 |     h <- subset(h, select=intersect(colnames(h), samples))
 83 |   }
 84 |   h <- h %>% tbl_df
 85 |   return (h)
 86 | }
 87 | 
 88 | read_all_values <- function(filenames, samples=NULL) {
 89 |   d <- lapply(filenames, function(x) read_values(x, samples=samples))
 90 |   e <- list()
 91 |   for (dd in d) {
 92 |     if (ncol(dd) > 0) {
 93 |       e[[length(e) + 1]] <- dd
 94 |     }
 95 |   }
 96 |   d <- e
 97 |   stopifnot(length(d) > 0)
 98 |   h <- d[[1]]
 99 |   if (length(d) > 1) {
100 |     for (i in 2:length(d)) {
101 |       if (ncol(d[[i]]) > 0) {
102 |         h <- cbind.data.frame(h, d[[i]])
103 |       }
104 |     }
105 |   }
106 |   h <- h %>% tbl_df
107 |   return (h)
108 | }
109 | 
110 | read_samples <- function(filename) {
111 |   h <- read.table(filename, sep='\t', head=T) %>% tbl_df
112 |   h$sample <- h$id
113 |   return (h)
114 | }
115 | 
116 | read_samples_select <- function(filename) {
117 |   h <- read.table(filename, sep='\t', head=F) %>% tbl_df
118 |   colnames(h) <- c('id')
119 |   h <- as.vector(h$id)
120 |   return (h)
121 | }
122 | 
123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL,
124 |   sel=NULL) {
125 |   f <- excl
126 |   if (!is.null(f)) {
127 |     d <- d %>% filter(!(sample %in% f))
128 |   }
129 |   f <- methods
130 |   if (!is.null(f) && 'method' %in% colnames(d)) {
131 |     d <- d %>% filter(method %in% f)
132 |   }
133 |   f <- batch2
134 |   if (!is.null(f) && 'batch2' %in% colnames(d)) {
135 |     d <- d %>% filter(batch2 %in% f)
136 |   }
137 |   if (!is.null(sel)) {
138 |     d <- d %>% filter(sample %in% sel)
139 |   }
140 |   d <- droplevels(d)
141 | 
142 |   return (d)
143 | }
144 | ```
145 | 
146 | ```{r}
147 | dat <- list()
148 | dat$samples <- read_samples(opts$samples_file)
149 | dat$samples_select <- read_samples_select(opts$samples_select_file)
150 | dat$samples <- filter_samples(dat$samples,
151 |   excl=opts$samples_excl,
152 |   methods=opts$samples_methods,
153 |   batch2=opts$samples_batch2,
154 |   sel=dat$samples_select)
155 | 
156 | dat$meta <- read_all_meta(opts$rates_files)
157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample))
158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample))
159 | 
160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100)))
161 | stopifnot(all(round(dat$weights) == dat$weights))
162 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
163 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
164 | ```
165 | 
166 | 
167 | 
168 | ## Preprocessing
169 | 
170 | ```{r}
171 | get_cov <- function(d, perc=TRUE) {
172 |   cov <- rowSums(!is.na(d))
173 |   if (perc) {
174 |     cov <- cov / ncol(d)
175 |   }
176 |   return (cov)
177 | }
178 | 
179 | get_var <- function(d) {
180 |   return (apply(d, 1, var, na.rm=T))
181 | }
182 | ```
183 | 
184 | ```{r}
185 | h <- duplicated(dat$meta)
186 | print(sprintf('Remove %d duplicated records.', sum(h)))
187 | dat$meta <- dat$meta[!h, ]
188 | dat$meta$id_ <- 1:nrow(dat$meta)
189 | for (n in c('rates', 'weights')) {
190 |   dat[[n]] <- dat[[n]][!h,]
191 |   dat[[n]]$id_ <- dat$meta$id_
192 | }
193 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
194 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
195 | ```
196 | 
197 | ```{r filter_cov}
198 | d <- dat$rates
199 | print(sprintf('%d records in total.', nrow(d)))
200 | if (!is.na(opts$fil$cov)) {
201 |   d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,]
202 | }
203 | print(sprintf('%d records passed coverage filter.', nrow(d)))
204 | dat$rates <- d
205 | ```
206 | 
207 | ```{r}
208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') {
209 |   t <- data.frame(sample=factor(rownames(pc_vec)),
210 |     pcx=pc_vec[,x], pcy=pc_vec[,y])
211 |   t <- t %>% left_join(dat$samples, by='sample')
212 |   t$sample_short <- sub('^CSCP3_SERUM', '', t$sample)
213 |   p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) +
214 |     geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) +
215 |     xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) +
216 |     theme(legend.position='bottom')
217 |   return (p)
218 | }
219 | 
220 | plot_pca_val <- function(pc_val) {
221 |   t <- data.frame(pc=1:length(pc_val), val=pc_val)
222 |   p <- ggplot(t, aes(x=pc, y=val)) +
223 |     geom_bar(stat='identity', fill='salmon', color='black') +
224 |     xlab('principle component') +
225 |     ylab('% variance explained')
226 |   return (p)
227 | }
228 | ```
229 | 
230 | 
231 | ## PCA unadjusted data
232 | 
233 | ```{r}
234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
235 | ```
236 | 
237 | ```{r fig.width=8, fig.height=7}
238 | p1 <- plot_pca_vec(pc$vec)
239 | p1 + theme(legend.position='top')
240 | ```
241 | 
242 | ```{r fig.width=10, fig.height=5}
243 | p2 <- plot_pca_val(pc$val)
244 | p2
245 | ```
246 | 
247 | ```{r}
248 | d <- dat$rates %>% select(-id_)
249 | cov <- rowSums(!is.na(d)) / ncol(d)
250 | d[h == ncol(d)]
251 | ```
252 | 
253 | ```{r adjust, eval=opts$adjust}
254 | scale_rates <- function(d) {
255 |   h <- d %>% select(-id_)
256 |   a <- min(h, na.rm=T)
257 |   b <- max(h, na.rm=T)
258 |   h <- (h - a) / (b - a) * 100
259 |   h$id_ <- d$id_
260 |   return (h)
261 | }
262 | 
263 | adjust_batch_met <- function(d, what='batch2', scale=F) {
264 |   is_na <- is.na(d)
265 |   h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_)
266 |   h <- h %>% gather(sample, y, -id_) %>%
267 |     inner_join(select_(dat$samples, 'sample', x=what), by='sample')
268 |   h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup
269 |   h <- h %>% select(id_, sample, y=yr)
270 |   h <- h %>% spread(sample, y)
271 |   stopifnot(all(dim(is_na) == dim(h)))
272 |   stopifnot(all(colnames(is_na) %in% colnames(h)))
273 |   h <- h[,colnames(is_na)]
274 |   h[is_na] <- NA
275 |   return (h)
276 | }
277 | 
278 | h <- dat$rates %>% adjust_batch_met
279 | if (opts$scale_adjust) {
280 |   h <- scale_rates(h)
281 | }
282 | dat$rates <- h
283 | ```
284 | 
285 | ## PCA adjusted data
286 | 
287 | ```{r}
288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
289 | ```
290 | 
291 | ```{r fig.width=8, fig.height=7}
292 | p1 <- plot_pca_vec(pc$vec)
293 | p1 + theme(legend.position='top')
294 | ```
295 | 
296 | ```{r fig.width=10, fig.height=5}
297 | p2 <- plot_pca_val(pc$val)
298 | p2
299 | ```
300 | 
301 | ## Filtering
302 | 
303 | ```{r filter_var}
304 | d <- dat$rates
305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples
306 | d <- d[!is.na(h),]
307 | if (!is.na(opts$fil$min_var)) {
308 |   stopifnot(sum(is.na(h)) == 0)
309 |   d <- d[h >= opts$fil$min_var,]
310 | }
311 | print(sprintf('%d records passed min var filter.', nrow(d)))
312 | 
313 | if (!is.na(opts$fil$top_var)) {
314 |   t <- opts$fil$top_var
315 |   if (t <= 1) {
316 |     t <- as.integer(t * nrow(d))
317 |   }
318 |   d <- d[order(get_var(select(d, -id_)), decreasing=T),]
319 |   d <- d[1:t,]
320 | }
321 | print(sprintf('%d final record.', nrow(d)))
322 | dat$rates <- d
323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_))
324 | ```
325 | 
326 | ```{r join}
327 | d <- dat$meta %>% select(chromo, start, end, id_) %>%
328 |   inner_join(dat$rates, by='id_')
329 | stopifnot(nrow(d) == nrow(dat$rates))
330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_))
331 | 
332 | d <- dat$weights %>% gather(sample, weight, -id_)
333 | h <- nrow(dat$df)
334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample'))
335 | 
336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup
337 | 
338 | stopifnot(h == nrow(dat$df))
339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1))
340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0))
341 | ```
342 | 
343 | ```{r}
344 | dat$df <- dat$df %>% ungroup %>% tbl_df
345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df
346 | ```
347 | 
348 | ```{r write}
349 | save_rds <- function(d, name) {
350 |   f <- sprintf('%s_%s.rds', opts$out_base, name)
351 |   saveRDS(d, f)
352 | }
353 | 
354 | save_rds(dat$df, 'met')
355 | save_rds(dat$meta, 'meta')
356 | ```
357 | 
358 | 
359 | 
360 | ```{r eval=opts$plot_heat, fig.height=12}
361 | plot_heat <- function(d, Rowv=T, xlab='value') {
362 |   colors <- rev(brewer.pal(9, 'Spectral'))
363 |   colors <- colorRampPalette(colors)(50)
364 | 
365 |   labRow <- NA
366 |   if (nrow(d) > 500) {
367 |     dendro='column'
368 |   } else {
369 |     dendro = 'both'
370 |   }
371 |   p <- heatmap.2(d, density.info='none', trace='none', col=colors,
372 |     Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA,
373 |     lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab)
374 |   return (p)
375 | }
376 | 
377 | d <- dat$rates
378 | max_rows <- 1000
379 | if (nrow(d) > max_rows) {
380 |   d <- d %>% sample_n(max_rows)
381 | }
382 | d <- d %>% select(-id_) %>% as.matrix
383 | if (nrow(d) > 100) {
384 |   h <- plot_heat(d)
385 | }
386 | ```
387 | 


--------------------------------------------------------------------------------
/data/met/prepro/prom/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Preprocessing Seqmonk methylation files
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(dplyr)
 15 | library(tidyr)
 16 | library(ggplot2)
 17 | library(RColorBrewer)
 18 | library(gplots)
 19 | library(gridExtra)
 20 | source('../../../../lib/utils.R')
 21 | ```
 22 | 
 23 | ```{r opts, echo=T}
 24 | opts <- list()
 25 | opts$fil$cov <- 30
 26 | opts$fil$min_var <- 10
 27 | opts$fil$top_var <- 0.75
 28 | opts$adjust <- F
 29 | opts$scale_adjust <- T
 30 | opts$samples_excl <- NULL
 31 | opts$samples_methods <- NULL
 32 | opts$samples_batch2 <- NULL
 33 | ```
 34 | 
 35 | ```{r}
 36 | opts$name <- basename(getwd())
 37 | opts$seqmonk_dirs <- c(
 38 |   file.path('../../reports', opts$name))
 39 | opts$rates_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'rates.txt'))
 40 | opts$weights_files <- sapply(opts$seqmonk_dirs, function(x) file.path(x, 'weights.txt'))
 41 | opts$out_base <- './data'
 42 | opts$samples_file <- '../../../samples/samples_stats.csv'
 43 | opts$samples_select_file <- '../../../samples/samples.csv'
 44 | opts$cache <- F
 45 | opts$plot_heat <- F
 46 | ```
 47 | 
 48 | **Name**: `r opts$name`
 49 | 
 50 | 
 51 | 
 52 | ```{r read_src}
 53 | read_meta <- function(filename) {
 54 |   h <- read.table(pipe(paste('cut -f 2-5,7,8,12', filename)), head=T, sep='\t')
 55 |   names(h) <- tolower(names(h))
 56 |   h <- h %>% rename(chromo=chromosome)
 57 |   h <- h %>% tbl_df
 58 |   return (h)
 59 | }
 60 | 
 61 | read_meta_quick <- function(filename) {
 62 |   h <- read.table(pipe(paste('cut -f 3', filename)), head=T, sep='\t')
 63 |   names(h) <- tolower(names(h))
 64 |   h <- h %>% tbl_df
 65 |   return (h)
 66 | }
 67 | 
 68 | read_all_meta <- function(filenames) {
 69 |   m <- read_meta(filenames[1])
 70 |   if (length(filenames) > 1) {
 71 |     for (i in 2:length(filenames)) {
 72 |       s <- read_meta_quick(filenames[i])
 73 |       stopifnot(all(m$start == s$start))
 74 |     }
 75 |   }
 76 |   return (m)
 77 | }
 78 | 
 79 | read_values <- function(filename, samples=NULL) {
 80 |   h <- read.table(pipe(paste('cut -f 13-', filename)), head=T, sep='\t')
 81 |   if (!is.null(samples)) {
 82 |     h <- subset(h, select=intersect(colnames(h), samples))
 83 |   }
 84 |   h <- h %>% tbl_df
 85 |   return (h)
 86 | }
 87 | 
 88 | read_all_values <- function(filenames, samples=NULL) {
 89 |   d <- lapply(filenames, function(x) read_values(x, samples=samples))
 90 |   e <- list()
 91 |   for (dd in d) {
 92 |     if (ncol(dd) > 0) {
 93 |       e[[length(e) + 1]] <- dd
 94 |     }
 95 |   }
 96 |   d <- e
 97 |   stopifnot(length(d) > 0)
 98 |   h <- d[[1]]
 99 |   if (length(d) > 1) {
100 |     for (i in 2:length(d)) {
101 |       if (ncol(d[[i]]) > 0) {
102 |         h <- cbind.data.frame(h, d[[i]])
103 |       }
104 |     }
105 |   }
106 |   h <- h %>% tbl_df
107 |   return (h)
108 | }
109 | 
110 | read_samples <- function(filename) {
111 |   h <- read.table(filename, sep='\t', head=T) %>% tbl_df
112 |   h$sample <- h$id
113 |   return (h)
114 | }
115 | 
116 | read_samples_select <- function(filename) {
117 |   h <- read.table(filename, sep='\t', head=F) %>% tbl_df
118 |   colnames(h) <- c('id')
119 |   h <- as.vector(h$id)
120 |   return (h)
121 | }
122 | 
123 | filter_samples <- function(d, excl=NULL, methods=NULL, batch2=NULL,
124 |   sel=NULL) {
125 |   f <- excl
126 |   if (!is.null(f)) {
127 |     d <- d %>% filter(!(sample %in% f))
128 |   }
129 |   f <- methods
130 |   if (!is.null(f) && 'method' %in% colnames(d)) {
131 |     d <- d %>% filter(method %in% f)
132 |   }
133 |   f <- batch2
134 |   if (!is.null(f) && 'batch2' %in% colnames(d)) {
135 |     d <- d %>% filter(batch2 %in% f)
136 |   }
137 |   if (!is.null(sel)) {
138 |     d <- d %>% filter(sample %in% sel)
139 |   }
140 |   d <- droplevels(d)
141 | 
142 |   return (d)
143 | }
144 | ```
145 | 
146 | ```{r}
147 | dat <- list()
148 | dat$samples <- read_samples(opts$samples_file)
149 | dat$samples_select <- read_samples_select(opts$samples_select_file)
150 | dat$samples <- filter_samples(dat$samples,
151 |   excl=opts$samples_excl,
152 |   methods=opts$samples_methods,
153 |   batch2=opts$samples_batch2,
154 |   sel=dat$samples_select)
155 | 
156 | dat$meta <- read_all_meta(opts$rates_files)
157 | dat$rates <- read_all_values(opts$rates_file, levels(dat$samples$sample))
158 | dat$weights <- read_all_values(opts$weights_file, levels(dat$samples$sample))
159 | 
160 | stopifnot(all(range(dat$rates, na.rm=T) == c(0, 100)))
161 | stopifnot(all(round(dat$weights) == dat$weights))
162 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
163 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
164 | ```
165 | 
166 | 
167 | 
168 | ## Preprocessing
169 | 
170 | ```{r}
171 | get_cov <- function(d, perc=TRUE) {
172 |   cov <- rowSums(!is.na(d))
173 |   if (perc) {
174 |     cov <- cov / ncol(d)
175 |   }
176 |   return (cov)
177 | }
178 | 
179 | get_var <- function(d) {
180 |   return (apply(d, 1, var, na.rm=T))
181 | }
182 | ```
183 | 
184 | ```{r}
185 | h <- duplicated(dat$meta)
186 | print(sprintf('Remove %d duplicated records.', sum(h)))
187 | dat$meta <- dat$meta[!h, ]
188 | dat$meta$id_ <- 1:nrow(dat$meta)
189 | for (n in c('rates', 'weights')) {
190 |   dat[[n]] <- dat[[n]][!h,]
191 |   dat[[n]]$id_ <- dat$meta$id_
192 | }
193 | stopifnot(nrow(dat$meta) == nrow(dat$rates))
194 | stopifnot(nrow(dat$rates) == nrow(dat$weights))
195 | ```
196 | 
197 | ```{r filter_cov}
198 | d <- dat$rates
199 | print(sprintf('%d records in total.', nrow(d)))
200 | if (!is.na(opts$fil$cov)) {
201 |   d <- d[get_cov(select(d, -id_), opts$fil$cov <= 1) >= opts$fil$cov,]
202 | }
203 | print(sprintf('%d records passed coverage filter.', nrow(d)))
204 | dat$rates <- d
205 | ```
206 | 
207 | ```{r}
208 | plot_pca_vec <- function(pc_vec, x=1, y=2, color='cov') {
209 |   t <- data.frame(sample=factor(rownames(pc_vec)),
210 |     pcx=pc_vec[,x], pcy=pc_vec[,y])
211 |   t <- t %>% left_join(dat$samples, by='sample')
212 |   t$sample_short <- sub('^CSCP3_SERUM', '', t$sample)
213 |   p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point(aes_string(color=color)) +
214 |     geom_text(aes(label=sample_short), vjust=-.4, hjust= .3, size=2.5) +
215 |     xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) +
216 |     theme(legend.position='bottom')
217 |   return (p)
218 | }
219 | 
220 | plot_pca_val <- function(pc_val) {
221 |   t <- data.frame(pc=1:length(pc_val), val=pc_val)
222 |   p <- ggplot(t, aes(x=pc, y=val)) +
223 |     geom_bar(stat='identity', fill='salmon', color='black') +
224 |     xlab('principle component') +
225 |     ylab('% variance explained')
226 |   return (p)
227 | }
228 | ```
229 | 
230 | 
231 | ## PCA unadjusted data
232 | 
233 | ```{r}
234 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
235 | ```
236 | 
237 | ```{r fig.width=8, fig.height=7}
238 | p1 <- plot_pca_vec(pc$vec)
239 | p1 + theme(legend.position='top')
240 | ```
241 | 
242 | ```{r fig.width=10, fig.height=5}
243 | p2 <- plot_pca_val(pc$val)
244 | p2
245 | ```
246 | 
247 | ```{r}
248 | d <- dat$rates %>% select(-id_)
249 | cov <- rowSums(!is.na(d)) / ncol(d)
250 | d[h == ncol(d)]
251 | ```
252 | 
253 | ```{r adjust, eval=opts$adjust}
254 | scale_rates <- function(d) {
255 |   h <- d %>% select(-id_)
256 |   a <- min(h, na.rm=T)
257 |   b <- max(h, na.rm=T)
258 |   h <- (h - a) / (b - a) * 100
259 |   h$id_ <- d$id_
260 |   return (h)
261 | }
262 | 
263 | adjust_batch_met <- function(d, what='batch2', scale=F) {
264 |   is_na <- is.na(d)
265 |   h <- d %>% select(-id_) %>% impute %>% mutate(id_=d$id_)
266 |   h <- h %>% gather(sample, y, -id_) %>%
267 |     inner_join(select_(dat$samples, 'sample', x=what), by='sample')
268 |   h <- h %>% group_by(id_) %>% do(adjust_df(.)) %>% ungroup
269 |   h <- h %>% select(id_, sample, y=yr)
270 |   h <- h %>% spread(sample, y)
271 |   stopifnot(all(dim(is_na) == dim(h)))
272 |   stopifnot(all(colnames(is_na) %in% colnames(h)))
273 |   h <- h[,colnames(is_na)]
274 |   h[is_na] <- NA
275 |   return (h)
276 | }
277 | 
278 | h <- dat$rates %>% adjust_batch_met
279 | if (opts$scale_adjust) {
280 |   h <- scale_rates(h)
281 | }
282 | dat$rates <- h
283 | ```
284 | 
285 | ## PCA adjusted data
286 | 
287 | ```{r}
288 | pc <- dat$rates %>% select(-id_) %>% impute %>% pca
289 | ```
290 | 
291 | ```{r fig.width=8, fig.height=7}
292 | p1 <- plot_pca_vec(pc$vec)
293 | p1 + theme(legend.position='top')
294 | ```
295 | 
296 | ```{r fig.width=10, fig.height=5}
297 | p2 <- plot_pca_val(pc$val)
298 | p2
299 | ```
300 | 
301 | ## Filtering
302 | 
303 | ```{r filter_var}
304 | d <- dat$rates
305 | h <- get_var(select(d, -id_)) # remove sites covered by < 2 samples
306 | d <- d[!is.na(h),]
307 | if (!is.na(opts$fil$min_var)) {
308 |   stopifnot(sum(is.na(h)) == 0)
309 |   d <- d[h >= opts$fil$min_var,]
310 | }
311 | print(sprintf('%d records passed min var filter.', nrow(d)))
312 | 
313 | if (!is.na(opts$fil$top_var)) {
314 |   t <- opts$fil$top_var
315 |   if (t <= 1) {
316 |     t <- as.integer(t * nrow(d))
317 |   }
318 |   d <- d[order(get_var(select(d, -id_)), decreasing=T),]
319 |   d <- d[1:t,]
320 | }
321 | print(sprintf('%d final record.', nrow(d)))
322 | dat$rates <- d
323 | stopifnot(all(dat$rates$id_ %in% dat$meta$id_))
324 | ```
325 | 
326 | ```{r join}
327 | d <- dat$meta %>% select(chromo, start, end, id_) %>%
328 |   inner_join(dat$rates, by='id_')
329 | stopifnot(nrow(d) == nrow(dat$rates))
330 | dat$df <- d %>% gather(sample, rate, -c(chromo, start, end, id_))
331 | 
332 | d <- dat$weights %>% gather(sample, weight, -id_)
333 | h <- nrow(dat$df)
334 | dat$df <- dat$df %>% inner_join(d, by=c('id_', 'sample'))
335 | 
336 | dat$df <- dat$df %>% rename(met=rate) %>% ungroup
337 | 
338 | stopifnot(h == nrow(dat$df))
339 | stopifnot(nrow(dat$df) == nrow(dat$rates) * (ncol(dat$rates) - 1))
340 | stopifnot(all(dat$df[is.na(dat$df$met),]$weight == 0))
341 | ```
342 | 
343 | ```{r}
344 | dat$df <- dat$df %>% ungroup %>% tbl_df
345 | dat$meta <- dat$meta %>% ungroup %>% tbl_df
346 | ```
347 | 
348 | ```{r write}
349 | save_rds <- function(d, name) {
350 |   f <- sprintf('%s_%s.rds', opts$out_base, name)
351 |   saveRDS(d, f)
352 | }
353 | 
354 | save_rds(dat$df, 'met')
355 | save_rds(dat$meta, 'meta')
356 | ```
357 | 
358 | 
359 | 
360 | ```{r eval=opts$plot_heat, fig.height=12}
361 | plot_heat <- function(d, Rowv=T, xlab='value') {
362 |   colors <- rev(brewer.pal(9, 'Spectral'))
363 |   colors <- colorRampPalette(colors)(50)
364 | 
365 |   labRow <- NA
366 |   if (nrow(d) > 500) {
367 |     dendro='column'
368 |   } else {
369 |     dendro = 'both'
370 |   }
371 |   p <- heatmap.2(d, density.info='none', trace='none', col=colors,
372 |     Rowv=Rowv, Colv=T, keysize=1.0, dendro=dendro, labRow=NA,
373 |     lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab)
374 |   return (p)
375 | }
376 | 
377 | d <- dat$rates
378 | max_rows <- 1000
379 | if (nrow(d) > max_rows) {
380 |   d <- d %>% sample_n(max_rows)
381 | }
382 | d <- d %>% select(-id_) %>% as.matrix
383 | if (nrow(d) > 100) {
384 |   h <- plot_heat(d)
385 | }
386 | ```
387 | 


--------------------------------------------------------------------------------
/gene_robust/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Robustness gene-specific analysis
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(ggplot2)
 15 | library(dplyr)
 16 | library(tidyr)
 17 | library(xtable)
 18 | options(xtable.type='html')
 19 | source('../lib/utils.R')
 20 | ```
 21 | 
 22 | ```{r}
 23 | opts <- list()
 24 | opts$data_file <- '../data/join/data.rds'
 25 | opts$expr <- '../data/expr/data_proc/data_expr.rds'
 26 | opts$expr_meta <- '../data/expr/data_proc/data_meta.rds'
 27 | opts$genes_file <- '../data/expr/data_raw/subpopulation_lif_genes9.csv'
 28 | opts$alpha <- 0.1
 29 | opts$keep <- c(0.6, 0.7, 0.8)
 30 | opts$trials <- 5
 31 | 
 32 | opts$rnd_trials <- 0
 33 | opts$filter_genes <- F
 34 | opts$cache <- T
 35 | opts$mean <- F
 36 | opts$permute <- F
 37 | opts$num_genes <- 0
 38 | opts$annos <- c()
 39 | # opts$annos <- c('active_enhancer', 'H3K4me1', 'H3K4me1_Tet1', 'H3K27ac',
 40 | #   'H3K27me3', 'cgi', 'prom', 'prom_cgi', 'prom_non_cgi', 'LMR', 'p300',
 41 | #   'gene_body', 'Wu_Tet1')
 42 | set.seed(0)
 43 | ```
 44 | 
 45 | ```{r}
 46 | theme_pub <- function() {
 47 |   p <- theme(
 48 |     axis.text=element_text(size=rel(1.2), color='black'),
 49 |     axis.title=element_text(size=rel(1.5)),
 50 |     legend.position='top',
 51 |     legend.text=element_text(size=rel(1.2)),
 52 |     legend.title=element_text(size=rel(1.2)),
 53 |     legend.key=element_rect(fill='transparent'),
 54 |     panel.border=element_blank(),
 55 |     panel.grid.major = element_blank(),
 56 |     panel.grid.minor = element_blank(),
 57 |     panel.background = element_blank(),
 58 |     axis.line = element_line(colour="black", size=1),
 59 |     axis.ticks.length = unit(.3, 'cm'),
 60 |     axis.ticks.margin = unit(.3, 'cm')
 61 |     )
 62 |   return (p)
 63 | }
 64 | ```
 65 | 
 66 | Significance threshold: `r opts$alpha`
 67 | 
 68 | ```{r}
 69 | cmp <- list()
 70 | dat <- list()
 71 | ```
 72 | 
 73 | ```{r dat}
 74 | read_genes <- function(f) {
 75 |   d <- read.csv(f, sep=',', head=T) %>% tbl_df
 76 |   names(d) <- c('ens_id', 'gene_id')
 77 |   return (d)
 78 | }
 79 | 
 80 | dat$em <- readRDS(opts$data_file)
 81 | # dat$em <- dat$em %>% filter(chromo.x %in% c(1)) %>% droplevels
 82 | dat$expr_meta <- readRDS(opts$expr_meta)
 83 | dat$genes <- read_genes(opts$genes_file)
 84 | if (opts$filter_genes) {
 85 |   h <- as.vector(dat$genes$ens_id)
 86 |   dat$em <- dat$em %>% filter(ens_id %in% h)
 87 | }
 88 | if (!is.null(opts$num_genes) && opts$num_genes > 0) {
 89 |   h <- levels(dat$em$ens_id)
 90 |   h <- h[sample(length(h))][1:opts$num_genes]
 91 |   dat$em <- dat$em %>% filter(ens_id %in% h)
 92 | }
 93 | dat$em <- dat$em %>% droplevels
 94 | 
 95 | h <- dat$em %>% group_by(name, id_.x, id_.y) %>% top_n(1, expr) %>% ungroup %>%
 96 |   select(id_.x, id_.y, name, ens_id, gene_id, chromo=chromo.x, start.x, end.x, start.y, end.y, strand)
 97 | a <- dat$genes %>% mutate(pluri=T) %>% select(-gene_id)
 98 | h <- h %>% left_join(a, by='ens_id')
 99 | h[is.na(h$pluri),]$pluri <- F
100 | dat$em_meta <- h
101 | dat$em_meta <- dat$em_meta %>% droplevels
102 | ```
103 | 
104 | ```{r}
105 | correlate <- function(d) {
106 |   r <- d %>% group_by(name, id_.x, id_.y) %>%
107 |     do(wtd_cor(.$expr, .$met, .$weight)) %>% ungroup
108 |   r <- r %>% group_by(name) %>% mutate(p_adj=p.adjust(p, method='fdr')) %>% ungroup
109 |   return (r)
110 | }
111 | ```
112 | 
113 | ```{r cor, cache=opts$cache}
114 | r <- correlate(dat$em)
115 | saveRDS(r, 'r.rds')
116 | ```
117 | 
118 | ```{r}
119 | cmp$r <- r
120 | ```
121 | 
122 | ```{r cor_robust, cache=opts$cache}
123 | samples <- as.vector(levels(dat$em$sample))
124 | r <- list()
125 | for (k in opts$keep) {
126 |   for (i in 1:opts$trials) {
127 |     s <- sample(samples, round(length(samples) * k))
128 |     d <- dat$em %>% filter(sample %in% s) %>% droplevels
129 |     rb <- correlate(d)
130 |     rb$trial <- i
131 |     rb$keep <- k
132 |     r[[length(r) + 1]] <- rb
133 |   }
134 | }
135 | r <- do.call(rbind.data.frame, r)
136 | r <- r[complete.cases(r),]
137 | saveRDS(r, 'rb.rds')
138 | ```
139 | 
140 | ```{r}
141 | cmp$rb <- r
142 | ```
143 | 
144 | ```{r}
145 | d <- cmp$r %>% inner_join(cmp$rb, by=c('name', 'id_.x', 'id_.y'))
146 | cmp$st <- d %>% group_by(name, keep, trial) %>%
147 |   summarise(
148 |     mse=mean(sqrt((r.x - r.y)**2), na.rm=T),
149 |     mad=mean(abs(r.x - r.y), na.rm=T),
150 |     sig0=sum(p_adj.x <= opts$alpha),
151 |     sig0_pos=sum(p_adj.x <= opts$alpha & r.x > 0),
152 |     sig0_neg=sum(p_adj.x <= opts$alpha & r.x < 0),
153 |     sig=sum(p_adj.y <= opts$alpha),
154 |     sig_pos=sum(p_adj.y <= opts$alpha & r.y > 0),
155 |     sig_neg=sum(p_adj.y <= opts$alpha & r.y < 0),
156 |     dsig=sig - sig0,
157 |     dsig_pos=sig_pos - sig0_pos,
158 |     dsig_neg=sig_neg - sig0_neg,
159 |     psig=abs(dsig) / sig0,
160 |     psig_pos=abs(dsig_pos) / sig0_pos,
161 |     psig_neg=abs(dsig_neg) / sig0_neg
162 |     ) %>% ungroup %>%
163 |   mutate(name=factor(name, levels=sort(unique(name)))) %>%
164 |   arrange(name, keep, trial)
165 | ```
166 | 
167 | ```{r}
168 | cmp$s <- cmp$st %>% group_by(name, keep) %>% select(-trial) %>%
169 |   summarise(
170 |     mse_mean=mean(mse),
171 |     mse_min=min(mse),
172 |     mse_max=max(mse),
173 |     mse_sd=sd(mse),
174 |     sig0=max(sig0),
175 |     sig0_pos=max(sig0_pos),
176 |     sig0_neg=max(sig0_neg),
177 |     sig=mean(sig),
178 |     sig_pos=mean(sig_pos),
179 |     sig_neg=mean(sig_neg),
180 |     dsig=mean(dsig),
181 |     dsig_pos=mean(dsig_pos),
182 |     dsig_neg=mean(dsig_neg),
183 |     psig=mean(psig),
184 |     psig_pos=mean(psig_pos),
185 |     psig_neg=mean(psig_neg)
186 |     ) %>% ungroup %>% arrange(name, keep)
187 | cmp$s <- cmp$s %>%
188 |   mutate(keep=factor(keep), name=factor(name, levels=rev(levels(name))))
189 | ```
190 | 
191 | ```{r}
192 | # Average over trials
193 | cmp$rba <- cmp$rb %>% group_by(name, id_.x, id_.y, keep) %>%
194 |   summarise_each(funs(mean)) %>% ungroup
195 | ```
196 | 
197 | ```{r eval=T, fig.width=10, fig.height=10}
198 | h <- cmp$r %>% inner_join(cmp$rba, by=c('name', 'id_.x', 'id_.y'))
199 | d <- h %>% mutate(keep=factor(keep))
200 | ggplot(d, aes(x=r.x, y=r.y, color=keep)) +
201 |   geom_abline(slope=1, color='grey') +
202 |   stat_density2d(aes(fill=keep)) +
203 |   # geom_point(size=0.5) +
204 |   facet_wrap(~name) +
205 |   xlim(-1, 1) + ylim(-1, 1) +
206 |   xlab('All samples') + ylab('Bootstrapped samples') +
207 |   theme_pub()
208 | ```
209 | 
210 | ```{r fig.width=8, fig.height=7}
211 | p <- ggplot(cmp$s, aes(x=name, y=mse_mean)) +
212 |   geom_bar(aes(fill=keep), stat='identity', position='dodge') +
213 |   xlab('') + ylab('RMSE from all cells') +
214 |   guides(fill=guide_legend(title='Fraction of cells')) +
215 |   coord_flip() +
216 |   theme_pub()
217 | print(p)
218 | ```
219 | 
220 | ```{r eval=F}
221 | ggsave(p, file='fig_rmse.pdf', width=8, height=9)
222 | ```
223 | 
224 | ```{r fig.width=8, fig.height=7}
225 | p <- ggplot(cmp$s, aes(x=name, y=dsig)) +
226 |   geom_bar(aes(fill=keep), stat='identity', position='dodge') +
227 |   xlab('') + ylab('Absolute loss # significant correlations') +
228 |   guides(fill=guide_legend(title='Fraction of cells')) +
229 |   coord_flip() +
230 |   theme_pub()
231 | print(p)
232 | ```
233 | 
234 | ```{r eval=F}
235 | ggsave(p, file='fig_loss_abs.pdf', width=8, height=9)
236 | ```
237 | 
238 | ```{r fig.width=8, fig.height=7}
239 | p <- ggplot(cmp$s, aes(x=name, y=psig)) +
240 |   geom_bar(aes(fill=keep), stat='identity', position='dodge') +
241 |   xlab('') + ylab('Relative loss # significant correlations') +
242 |   guides(fill=guide_legend(title='Fraction of cells')) +
243 |   coord_flip() +
244 |   theme_pub()
245 | print(p)
246 | ```
247 | 
248 | ```{r eval=F}
249 | ggsave(p, file='fig_loss_rel.pdf', width=8, height=9)
250 | ```
251 | 
252 | ```{r results='asis'}
253 | xtable(cmp$s, digits=2)
254 | ```
255 | 
256 | ```{r eval=F}
257 | write.csv(cmp$s, 'S11.csv')
258 | ```
259 | 


--------------------------------------------------------------------------------
/heat/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Joint clustering on expression and methylation
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(ggplot2)
 15 | library(dplyr)
 16 | library(tidyr)
 17 | library(RColorBrewer)
 18 | library(gridExtra)
 19 | library(gplots)
 20 | library(corrplot)
 21 | library(ggdendro)
 22 | library(weights)
 23 | library(stringr)
 24 | library(weights)
 25 | source('viz.R')
 26 | source('../lib/utils.R')
 27 | ```
 28 | 
 29 | ```{r}
 30 | opts <- list()
 31 | opts$data_file <- '../data/join/data.rds'
 32 | opts$r_file <- '../gene/r.rds'
 33 | opts$genes_file <- '../data/expr/data_raw/subpopulation_lif_genes9.csv'
 34 | opts$name <- 'gene_body'
 35 | opts$select_by <- 'met_var'
 36 | opts$n <- 300
 37 | 
 38 | opts$clust <- list()
 39 | opts$clust$c1 <- c('C05', 'B05', 'D05', 'E01', 'H03', 'E09', 'C07', 'F01', 'B10', 'F07', 'C02', 'B07')
 40 | opts$clust$c2 <- c('G02', 'E07', 'F03', 'C03', 'A07', 'C01', 'A06', 'A04', 'A02', 'A03', 'G07', 'C10', 'H01', 'D01', 'G09', 'E05', 'G01')
 41 | opts$clust_colors <- c('default'='#fdc086', 'c1'='#7fc97f', 'c2'='#beaed4')
 42 | ```
 43 | 
 44 | ```{r read}
 45 | read_genes <- function(f) {
 46 |   d <- read.csv(f, sep=',', head=T) %>% tbl_df
 47 |   names(d) <- c('ens_id', 'gene_id')
 48 |   return (d)
 49 | }
 50 | 
 51 | dat <- list()
 52 | dat$em <- readRDS(opts$data_file) %>%
 53 |   filter(name == opts$name) %>%
 54 |   mutate(sample=factor(sample, labels=format_sample(levels(sample)))) %>%
 55 |   droplevels
 56 | dat$r <- readRDS(opts$r_file) %>%
 57 |   filter(name == opts$name) %>% droplevels
 58 | dat$genes <- read_genes(opts$genes_file)
 59 | ```
 60 | 
 61 | ```{r prepro}
 62 | dat$s <- dat$em %>% group_by(name, id_.x, id_.y) %>%
 63 |   summarise(
 64 |     expr_mean = mean(expr),
 65 |     expr_var = var(expr),
 66 |     met_mean = weighted.mean(met, weight, na.rm=T),
 67 |     met_var = wtd.var(met, weight, na.rm=T)
 68 |     ) %>% ungroup
 69 | dat$s <- dat$s %>% inner_join(dat$r, by=c('name', 'id_.x', 'id_.y'))
 70 | ```
 71 | 
 72 | 
 73 | 
 74 | ```{r}
 75 | col_names <- function(d, heat) {
 76 |   m <- colnames(d)
 77 |   m <- m[3:length(m)]
 78 |   m <- m[heat$colInd]
 79 |   m <- paste(sapply(m, function(x) sprintf('\'%s\'', x)), collapse=', ')
 80 |   return (m)
 81 | }
 82 | ```
 83 | 
 84 | ```{r}
 85 | dclust <- data_clust(data_select(opts$n, opts$select_by))
 86 | ```
 87 | 
 88 | ```{r eval=F}
 89 | write.csv(dclust$e, 'fig1e_expr.csv')
 90 | write.csv(dclust$m, 'fig1e_met.csv')
 91 | write.csv(dclust$r, 'fig1e_cor.csv')
 92 | ```
 93 | 
 94 | ```{r fig.width=10, fig.height=10}
 95 | pm <- plot_heat_met(dclust)
 96 | ```
 97 | 
 98 | ```{r eval=F}
 99 | pdf(file='fig_heat_met.pdf', width=10, height=10)
100 | pm <- plot_heat_met(dclust)
101 | dev.off()
102 | ```
103 | 
104 | ```{r eval=F}
105 | col_names(dclust$m, pm)
106 | ```
107 | 
108 | ```{r fig.width=10, fig.height=10}
109 | pe <- plot_heat_expr(dclust, Rowv=pm$rowDendrogram)
110 | ```
111 | 
112 | ```{r eval=F}
113 | pdf(file='fig_heat_expr.pdf', width=10, height=10)
114 | pe <- plot_heat_expr(dclust, Rowv=pm$rowDendrogram)
115 | dev.off()
116 | ```
117 | 
118 | ```{r fig.height=10, fig.width=1.5}
119 | plot_tracks(data_track(dclust, pm))
120 | ```
121 | 
122 | ```{r eval=F}
123 | pdf(file='fig_heat_tracks.pdf', width=1.5, height=10)
124 | plot_tracks(data_track(dclust, pm))
125 | dev.off()
126 | ```
127 | 


--------------------------------------------------------------------------------
/heat/index2.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Joint clustering on expression and methylation
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(ggplot2)
 15 | library(dplyr)
 16 | library(tidyr)
 17 | library(RColorBrewer)
 18 | library(gridExtra)
 19 | library(gplots)
 20 | library(corrplot)
 21 | library(ggdendro)
 22 | library(weights)
 23 | library(stringr)
 24 | library(weights)
 25 | source('viz.R')
 26 | source('../lib/utils.R')
 27 | ```
 28 | 
 29 | ```{r}
 30 | opts <- list()
 31 | opts$data_file <- '../data/join/data.rds'
 32 | opts$r_file <- '../gene/r.rds'
 33 | opts$genes_file <- '../data/expr/data_raw/subpopulation_lif_genes9.csv'
 34 | opts$name <- 'gene_body'
 35 | opts$select_by <- 'expr_var'
 36 | opts$n <- 300
 37 | 
 38 | opts$clust <- list()
 39 | opts$clust$c1 <- c('B10', 'H10', 'F01', 'F07', 'E09', 'E01', 'H03', 'C02', 'C05', 'B05', 'B09', 'D05', 'C07')
 40 | opts$clust$c2 <- c('H02', 'D10', 'F09', 'B06', 'F06', 'D06', 'E05', 'B07', 'B04', 'B01', 'E03', 'A06', 'C04')
 41 | opts$clust_colors <- c('default'='#fdc086', 'c1'='#7fc97f', 'c2'='#beaed4')
 42 | ```
 43 | 
 44 | ```{r read}
 45 | read_genes <- function(f) {
 46 |   d <- read.csv(f, sep=',', head=T) %>% tbl_df
 47 |   names(d) <- c('ens_id', 'gene_id')
 48 |   return (d)
 49 | }
 50 | 
 51 | dat <- list()
 52 | dat$em <- readRDS(opts$data_file) %>%
 53 |   filter(name == opts$name) %>%
 54 |   mutate(sample=factor(sample, labels=format_sample(levels(sample)))) %>%
 55 |   droplevels
 56 | dat$r <- readRDS(opts$r_file) %>%
 57 |   filter(name == opts$name) %>% droplevels
 58 | dat$genes <- read_genes(opts$genes_file)
 59 | ```
 60 | 
 61 | ```{r prepro}
 62 | dat$s <- dat$em %>% group_by(name, id_.x, id_.y) %>%
 63 |   summarise(
 64 |     expr_mean = mean(expr),
 65 |     expr_var = var(expr),
 66 |     met_mean = weighted.mean(met, weight, na.rm=T),
 67 |     met_var = wtd.var(met, weight, na.rm=T)
 68 |     ) %>% ungroup
 69 | dat$s <- dat$s %>% inner_join(dat$r, by=c('name', 'id_.x', 'id_.y'))
 70 | ```
 71 | 
 72 | 
 73 | 
 74 | ```{r}
 75 | col_names <- function(d, heat) {
 76 |   m <- colnames(d)
 77 |   m <- m[3:length(m)]
 78 |   m <- m[heat$colInd]
 79 |   m <- paste(sapply(m, function(x) sprintf('\'%s\'', x)), collapse=', ')
 80 |   return (m)
 81 | }
 82 | ```
 83 | 
 84 | ```{r}
 85 | dclust <- data_clust(data_select(opts$n, opts$select_by))
 86 | ```
 87 | 
 88 | ```{r fig.width=10, fig.height=10}
 89 | pm <- plot_heat_met(dclust)
 90 | ```
 91 | 
 92 | ```{r eval=F}
 93 | pdf(file='fig_heat2_met.pdf', width=10, height=10)
 94 | pm <- plot_heat_met(dclust)
 95 | dev.off()
 96 | ```
 97 | 
 98 | ```{r eval=F}
 99 | col_names(dclust$m, pm)
100 | ```
101 | 
102 | ```{r fig.width=10, fig.height=10}
103 | pe <- plot_heat_expr(dclust, Rowv=pm$rowDendrogram)
104 | ```
105 | 
106 | ```{r eval=F}
107 | pdf(file='fig_heat2_expr.pdf', width=10, height=10)
108 | pe <- plot_heat_expr(dclust, Rowv=pm$rowDendrogram)
109 | dev.off()
110 | ```
111 | 
112 | ```{r fig.height=10, fig.width=1.5}
113 | plot_tracks(data_track(dclust, pm))
114 | ```
115 | 
116 | ```{r eval=F}
117 | pdf(file='fig_heat2_tracks.pdf', width=1.5, height=10)
118 | plot_tracks(data_track(dclust, pm))
119 | dev.off()
120 | ```
121 | 
122 | ```{r eval=F}
123 | write.csv(dclust$m, 'S10_met.csv')
124 | write.csv(dclust$e, 'S10_expr.csv')
125 | write.csv(dclust$r, 'S10_cor.csv')
126 | ```
127 | 
128 | 


--------------------------------------------------------------------------------
/heat/viz.R:
--------------------------------------------------------------------------------
  1 | data_select <- function(n, by='met_var') {
  2 |   h <- dat$s %>% select_('name', 'id_.x', 'id_.y', sel=by)
  3 |   h <- h %>% group_by(name, id_.x) %>% top_n(1, sel) %>% ungroup
  4 |   h <- h %>% group_by(name) %>% top_n(n, sel) %>% ungroup
  5 |   h <- h %>% select(name, id_.x, id_.y, sel)
  6 |   return (h)
  7 | }
  8 | 
  9 | data_clust <- function(s) {
 10 |   em <- dat$em %>% semi_join(s, by=c('name', 'id_.x', 'id_.y')) %>%
 11 |     select(id_.x, id_.y, gene_id, sample, expr, met)
 12 |   e <- em %>% select(id_.y, gene_id, sample, expr) %>% spread(sample, expr)
 13 |   m <- em %>% select(id_.y, gene_id, sample, met) %>% spread(sample, met)
 14 |   rs <- dat$s %>% semi_join(s, by=c('name', 'id_.x', 'id_.y'))
 15 |   return (list(r=rs, e=e, m=m, em=em))
 16 | }
 17 | 
 18 | samples_colors <- function(x) {
 19 |   h <- rep(opts$clust_colors['default'], length(x))
 20 |   for (n in names(opts$clust)) {
 21 |     h[x %in% opts$clust[[n]]] <- opts$clust_colors[n]
 22 |   }
 23 |   names(h) <- x
 24 |   return (h)
 25 | }
 26 | 
 27 | plot_heat <- function(d, xlab='value', col=NULL, col_colors=NULL,
 28 |   labRow=NA, ...) {
 29 |   d <- as.matrix(d)
 30 |   if (is.null(col)) {
 31 |     col <- rev(brewer.pal(9, 'Spectral'))
 32 |     col <- colorRampPalette(col)(50)
 33 |   }
 34 | 
 35 |   if (nrow(d) > 500) {
 36 |     dendro='column'
 37 |   } else {
 38 |     dendro = 'both'
 39 |   }
 40 | 
 41 |   col_colors <- samples_colors(colnames(d))
 42 | 
 43 |   p <- heatmap.2(d, density.info='none', trace='none', col=col,
 44 |     keysize=0.5, dendro=dendro, labRow=labRow,
 45 |     ColSideColors=col_colors,
 46 |     lhei=c(2,9),
 47 |     lwid=c(2, 5), key.title='', srtCol=45, key.xlab=xlab, ...)
 48 |   return (p)
 49 | }
 50 | 
 51 | plot_heat_expr <- function(dclust, col=NULL, ...) {
 52 |   d <- dclust$e %>% select(-id_.y) %>% to_matrix(rowcol='gene_id')
 53 |   if (is.null(col)) {
 54 |     col <- brewer_cols('OrRd')
 55 |   }
 56 |   pe <- plot_heat(d, col=col, xlab='Expression', ...)
 57 |   return (pe)
 58 | }
 59 | 
 60 | plot_heat_met <- function(dclust, col=NULL, ...) {
 61 |   d <- dclust$m %>% select(-id_.y) %>% to_matrix(rowcol='gene_id')
 62 |   if (is.null(col)) {
 63 |     col <- brewer_cols('PuBuGn', rev=F)
 64 |   }
 65 |   pm <- plot_heat(d, col=col, xlab='Methylation', ...)
 66 |   return (pm)
 67 | }
 68 | 
 69 | plot_tracks <- function(d, r=F) {
 70 |   te <- plot_track(filter(d, param=='expr_var')) +
 71 |     scale_fill_gradient(low='white', 'high'='red', name='var(expr)')
 72 |   tm <- plot_track(filter(d, param=='met_var')) +
 73 |     scale_fill_gradient(low='white', 'high'='royalblue2', name='var(met)')
 74 |   if (r) {
 75 |     tr <- plot_track(filter(d, param=='r')) +
 76 |       scale_fill_gradient2(low='royalblue', mid='white', 'high'='red2', name='r')
 77 |     grid.arrange(tm, tr, te, ncol=3)
 78 |   } else {
 79 |     grid.arrange(tm, te, ncol=2)
 80 |   }
 81 | }
 82 | 
 83 | brewer_cols <- function(pal='Spectral', rev=F) {
 84 |   ncol <- brewer.pal.info[pal, 'maxcolors']
 85 |   col <- colorRampPalette(brewer.pal(ncol, pal))(50)
 86 |   if (rev) {
 87 |     col <- rev(col)
 88 |   }
 89 |   return (col)
 90 | }
 91 | 
 92 | to_matrix <- function(d, rowcol=NULL) {
 93 |   d <- as.data.frame(d)
 94 |   if (!is.null(rowcol)) {
 95 |     rownames(d) <- d[[rowcol]]
 96 |     d <- d[,setdiff(names(d), rowcol)]
 97 |   }
 98 |   d <- as.matrix(d)
 99 |   return (d)
100 | }
101 | 
102 | data_track <- function(dclust, p=NA) {
103 |   h <- dclust$e
104 |   if (!is.na(p)) {
105 |     h <- h[p$rowInd,]
106 |   }
107 |   h <- h %>% select(id_.y)
108 |   rc <- h %>% inner_join(dclust$r, by='id_.y')
109 |   d <- rc %>% select(r, expr_var, met_var) %>% mutate(i=1:n()) %>%
110 |     gather(param, value, -i)
111 |   return (d)
112 | }
113 | 
114 | plot_track <- function(d, low='white', high='red') {
115 |   p <- ggplot(d, aes(x=param, y=i, fill=value)) + geom_tile() +
116 |     theme(line=element_blank(), axis.text=element_blank(),
117 |       axis.title.x=element_blank(),
118 |       axis.title.y=element_blank(),
119 |       panel.background=element_blank(),
120 |       plot.margin=unit(rep(0, 4), 'mm'),
121 |       legend.position='top', legend.direction='vertical')
122 |   return (p)
123 | }
124 | 


--------------------------------------------------------------------------------
/index.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: scM&T-Seq
 3 | output:
 4 |   html_document:
 5 |     toc: no
 6 | ---
 7 | 
 8 | # Gene-specific
 9 | * [Gene-specific correlation](./gene/index.html)
10 | * [Robustness gene-specific correlation](./gene_robust/index.html)
11 | * [Correlation with mean methylation](./gene_mean/index.html)
12 | * [Heatmap expression vs. methylation](./heat/met.html)
13 | * [Esrrb](./zoom/index.html)
14 | 
15 | # Sample-specific
16 | * [Sample-specific correlation](./sample/index.html)
17 | 
18 | # QC
19 | * [Statistics](./qc/index.html)
20 | * [Clustering](./clust/index.html)
21 | * [Comparison variability](./var/m14_m15P3/index.html)
22 | * [Correlation with bulk methylation](./cor_bulk/met/index.html)
23 | * [Coverage analysis](./cov/m14_m15P3/index.html)
24 | * [CCA](./cca/index.html)
25 | 
26 | 


--------------------------------------------------------------------------------
/lib/utils.R:
--------------------------------------------------------------------------------
  1 | library(weights)
  2 | library(dplyr)
  3 | library(tidyr)
  4 | library(stringr)
  5 | 
  6 | theme_pub <- function() {
  7 |   p <- theme(
  8 |     axis.text=element_text(size=rel(1.2), color='black'),
  9 |     axis.title=element_text(size=rel(1.5)),
 10 |     # axis.title.y=element_text(vjust=1.5),
 11 |     # axis.title.x=element_text(vjust=-0.2),
 12 |     legend.position='top',
 13 |     legend.text=element_text(size=rel(1.2)),
 14 |     legend.title=element_text(size=rel(1.2)),
 15 |     legend.key=element_rect(fill='transparent'),
 16 |     panel.border=element_blank(),
 17 |     panel.grid.major = element_blank(),
 18 |     panel.grid.minor = element_blank(),
 19 |     panel.background = element_blank(),
 20 |     axis.line = element_line(colour="black", size=1),
 21 |     axis.ticks.length = unit(.3, 'cm'),
 22 |     axis.ticks.margin = unit(.3, 'cm')
 23 |     )
 24 |   return (p)
 25 | }
 26 | 
 27 | format_sample <- function(s) {
 28 |   l <- str_split(s, '_')
 29 |   l <- sapply(l, function(x) x[length(x)])
 30 |   return (l)
 31 | }
 32 | 
 33 | log_counts <- function(x) {
 34 |   return (log10(x + 1))
 35 | }
 36 | 
 37 | 
 38 | # Tests for (weighted) association between two random variables.
 39 | 
 40 | # For weighted Pearson's correlation calls wtd.cor(). For weighted Spearman's
 41 | # correlation, calls cor.test() with weighted ranks. For unweighted correction,
 42 | # calls cor.test().
 43 | 
 44 | # Arguments:
 45 | #   x, y: variables to be tested.
 46 | #   weights: optional sample weights.
 47 | #   method: a character string indicating which correlation coefficient is to be
 48 | #           used for the test. Either 'pearson' or 'spearman'.
 49 | #   alternative: alternative hypothesis. Either 'one.sided' or 'two.sided'.
 50 | wtd_cor <- function(x, y, weights=NULL, method='pearson',
 51 |   alternative='two.sided', n_min=3) {
 52 |   r <- data.frame(r=NA, r_lo=NA, r_up=NA, p=NA, n=NA, n_wtd=NA)
 53 |   o <- !is.na(x) & !is.na(y)
 54 |   x <- x[o]
 55 |   y <- y[o]
 56 |   weights <- weights[o]
 57 |   r$n <- sum(o)
 58 |   if (is.null(weights)) {
 59 |     r$n_wtd <- r$n
 60 |   } else {
 61 |     r$n_wtd <- sum(weights)
 62 |   }
 63 |   if (r$n >= n_min & var(x) > 0 & var(y) > 0) {
 64 |     ct <- NULL
 65 |     ct_wtd <- NULL
 66 |     if (is.null(weights)) {
 67 |       ct <- cor.test(x, y, alternative=alternative, method=method)
 68 |     } else {
 69 |       if (method == 'spearman') {
 70 |         rx <- wtd.rank(x, weights=weights)
 71 |         ry <- wtd.rank(y, weights=weights)
 72 |         ct <- cor.test(rx, ry, alternative=alternative, method=method)
 73 |       } else {
 74 |         ct_wtd <- wtd.cor(x, y, weight=weights)
 75 |       }
 76 |     }
 77 |     if (!is.null(ct)) {
 78 |       r$r <- ct$estimate[1]
 79 |       r$p <- ct$p.value
 80 |       if (!is.null(ct$conf.int)) {
 81 |         r$r_lo <- ct$conf.int[1]
 82 |         r$r_up <- ct$conf.int[2]
 83 |       }
 84 |     } else {
 85 |       r$r <- ct_wtd[1]
 86 |       r$p <- ct_wtd[4]
 87 |       # Computes bounded 95% confidence interval
 88 |       r$r_lo <- max(-1, ct_wtd[1] - 1.96 * ct_wtd[2])
 89 |       r$r_up <- min(1, ct_wtd[1] + 1.96 * ct_wtd[2])
 90 |     }
 91 |   }
 92 |   return (r)
 93 | }
 94 | 
 95 | # mean imputation by columns
 96 | impute <- function(d) {
 97 |   means <- colMeans(d, na.rm=T)
 98 |   if (any(is.na(means))) {
 99 |     stop('Insufficient data for mean imputation!')
100 |   }
101 |   for (i in 1:length(means)) {
102 |     d[is.na(d[,i]), i] <- means[i]
103 | 
104 |   return (d)
105 | }
106 | 
107 | # adjust batch effect
108 | adjust_lm <- function(y, x) {
109 |   m <- lm(y ~ x)
110 |   return (residuals(m))
111 | }
112 | 
113 | adjust_df <- function(d) {
114 |   d$yr <- adjust_lm(d$y, d$x)
115 |   return (d)
116 | }
117 | 
118 | pca <- function(d, center=T, scale=F) {
119 |   # columns are samples
120 |   d <- scale(d, center=center, scale=scale)
121 |   d <- t(d)
122 |   s <- svd(d)
123 |   vec <- s$u
124 |   rownames(vec) <- rownames(d)
125 |   val <- s$d**2
126 |   val <- val / sum(val)
127 |   return (list(vec=vec, val=val))
128 | }
129 | 
130 | 
131 | # rbinds data frames by common columns
132 | rbind_frames <- function(d) {
133 |   cols <- colnames(d[[1]])
134 |   for (dd in d) {
135 |     cols <- intersect(cols, colnames(dd))
136 |   }
137 |   dcols <- lapply(d, function(x) subset(x, select=cols))
138 |   d <- do.call(rbind.data.frame, dcols)
139 |   return (d)
140 | }
141 | 
142 | read_report_meta <- function(filename, n=NULL) {
143 |   if (!is.null(n)) {
144 |   h <- 'cut -f 2-5,7,8,12'
145 |     h <- sprintf('head -n %d %s | %s', n, filename, h)
146 |   } else {
147 |     h <- sprintf('%s %s', h, filename)
148 |   }
149 |   h <- 'cut -f 2-5,7,8,12'
150 |   if (!is.null(n)) {
151 |     h <- sprintf('head -n %d | %s', n, h)
152 |   }
153 |   h <- read.table(pipe(h), head=T, sep='\t')
154 |   names(h) <- tolower(names(h))
155 |   h <- h %>% rename(chromo=chromosome)
156 |   h <- h %>% tbl_df
157 |   return (h)
158 | }
159 | 
160 | read_report_values <- function(filename, samples=NULL, n=NULL) {
161 |   h <- 'cut -f 13-'
162 |   if (!is.null(n)) {
163 |     h <- sprintf('head -n %d %s | %s', n, filename, h)
164 |   } else {
165 |     h <- sprintf('%s %s', h, filename)
166 |   }
167 | 
168 |   h <- read.table(pipe(h), head=T, sep='\t')
169 |   if (!is.null(samples)) {
170 |     h <- subset(h, select=intersect(colnames(h), samples))
171 |   }
172 |   h <- h %>% tbl_df
173 |   return (h)
174 | }
175 | 
176 | read_samples_list <- function(sample_file) {
177 |   d <- read.table(sample_file, head=F) %>% unlist %>% as.vector
178 |   return (d)
179 | }
180 | 
181 | read_samples_stats <- function(stats_file, samples=NULL) {
182 |   d <- read.table(opts$samples_stats, sep='\t', head=T) %>%
183 |     rename(sample=id, cpg_rate=CpG.rate, chh_rate=CHH.rate, cph_rate=CpH.rate) %>%
184 |     droplevels %>% tbl_df
185 |   if (!is.null(samples)) {
186 |     d <- d[d$sample %in% samples,] %>% droplevels
187 |   }
188 |   return (d)
189 | }
190 | 
191 | plot_pca_vec <- function(pc_vec, x=1, y=2) {
192 |   t <- data.frame(sample=factor(rownames(pc_vec)),
193 |     pcx=pc_vec[,x], pcy=pc_vec[,y])
194 |   p <- ggplot(t, aes(x=pcx, y=pcy)) + geom_point() +
195 |     geom_text(aes(label=sample), vjust=-.4, hjust= .3, size=3) +
196 |     xlab(sprintf('pc%d', x)) + ylab(sprintf('pc%d', y)) +
197 |     guides(color=F) + theme_pub()
198 |   return (p)
199 | }
200 | 
201 | plot_pca_val <- function(pc_val) {
202 |   t <- data.frame(pc=1:length(pc_val), val=pc_val)
203 |   p <- ggplot(t, aes(x=pc, y=val)) +
204 |     geom_bar(stat='identity', fill='salmon', color='black') +
205 |     xlab('principle component') +
206 |     ylab('% variance explained') + theme_pub()
207 |   return (p)
208 | }
209 | 


--------------------------------------------------------------------------------
/qc/data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PMBio/scMT-seq/cabc3b66bc8e1d9f61bb2b6d903018c0c88d8093/qc/data.xlsx


--------------------------------------------------------------------------------
/qc/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Samples QC
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(ggplot2)
 15 | library(dplyr)
 16 | library(tidyr)
 17 | library(gridExtra)
 18 | source('../lib/utils.R')
 19 | ```
 20 | 
 21 | ```{r}
 22 | opts <- list()
 23 | opts$data <- './data.csv'
 24 | opts$use <- c('m14', 'm15P3')
 25 | opts$colors <- c(
 26 |   'm14'='forestgreen',
 27 |   'm15'='brown2',
 28 |   'm15P3'='royalblue'
 29 |   )
 30 | ```
 31 | 
 32 | ```{r}
 33 | d <- read.csv(opts$data, head=T) %>% tbl_df
 34 | names(d) <- c('sample', 'met', 'batch', 'val_trimmed', 'unique_mapped', 'num_cpg', 'per_cpg')
 35 | d$set <- 'm14'
 36 | d[d$batch %in% c('off bead', 'original'),]$set <- 'm15'
 37 | d[d$batch == 'CSCP2',]$set <- 'm15P2'
 38 | d[d$batch == 'optimised',]$set <- 'm15P3'
 39 | d <- d %>% filter(set %in% opts$use) %>%
 40 |   mutate(per_cpg=as.numeric(gsub('%', '', per_cpg)) / 100)
 41 | dat <- d
 42 | ```
 43 | 
 44 | ```{r}
 45 | write.csv(dat, 'fig1b.csv')
 46 | ```
 47 | 
 48 | ```{r fig.width=7, fig.height=6}
 49 | p <- ggplot(dat, aes(x=unique_mapped, y=per_cpg, color=set)) +
 50 |   geom_point() +
 51 |   geom_smooth(method='loess', degree=1, alpha=0.2, size=1) +
 52 |   scale_color_manual(values=opts$colors) +
 53 |   xlab('Number of mapped reads') + ylab('CpG recovery') +
 54 |   theme_pub()
 55 | p
 56 | ```
 57 | 
 58 | ```{r eval=F}
 59 | ggsave(p, file='fig_mapping.pdf', width=7, height=6)
 60 | ```
 61 | 
 62 | ```{r fig.width=7, fig.height=6}
 63 | p <- ggplot(dat, aes(x=val_trimmed, y=per_cpg, color=set)) +
 64 |   geom_point() +
 65 |   geom_smooth(method='loess', degree=1, alpha=0.2, size=1) +
 66 |   scale_color_manual(values=opts$colors) +
 67 |   xlab('Number of trimmed validated reads') + ylab('CpG recovery') +
 68 |   theme_pub()
 69 | p
 70 | ```
 71 | 
 72 | ```{r}
 73 | d <- read.csv('./sup_fig_3_data.csv') %>% tbl_df
 74 | d <- d %>% rename(type=Cell, map_eff=Mapping, dup_rate=Duplication,
 75 |   cpg_rate=CpG.Methylation, chh_rate=CH.methylation) %>%
 76 |   mutate(Study=factor(Study, levels=c('scBS2014', 'M&T'), labels=c('scBS-Seq', 'scM&T-Seq'))) %>%
 77 |   filter(type=='Serum')
 78 | levels(d$set)
 79 | ```
 80 | 
 81 | ```{r}
 82 | opts$colors <- c(
 83 |   'scM&T-Seq'='royalblue',
 84 |   'scBS-Seq'='forestgreen'
 85 |   )
 86 | ```
 87 | 
 88 | ```{r fig.width=5, fig.height=6}
 89 | p1 <- ggplot(d, aes(x=Study, y=map_eff, fill=Study)) +
 90 |   geom_boxplot() +
 91 |   scale_fill_manual(values=opts$colors) +
 92 |   theme_pub() +
 93 |   xlab('') + ylab('Mapping efficiency') +
 94 |   theme(axis.title.y=element_text(vjust=1.5))
 95 | print(p1)
 96 | ```
 97 | 
 98 | ```{r fig.width=5, fig.height=6}
 99 | p2 <- ggplot(d, aes(x=Study, y=dup_rate, fill=Study)) +
100 |   geom_boxplot() +
101 |   scale_fill_manual(values=opts$colors) +
102 |   theme_pub() +
103 |   xlab('') + ylab('Duplication rate') +
104 |   theme(axis.title.y=element_text(vjust=1.5))
105 | print(p2)
106 | ```
107 | 
108 | ```{r fig.width=5, fig.height=6}
109 | h <- d %>% select(Study, type, cpg_rate, chh_rate) %>% gather(rate, value, -c(Study, type))
110 | p3 <- ggplot(d, aes(x=Study, y=cpg_rate, fill=Study)) +
111 |   geom_boxplot() +
112 |   scale_fill_manual(values=opts$colors) +
113 |   geom_point(data=h, aes(x=Study, y=value, shape=rate), position=position_jitter(w=0.1, h=0)) +
114 |   theme_pub() +
115 |   xlab('') + ylab('Methylation rate') +
116 |   theme(axis.title.y=element_text(vjust=1.5))
117 | print(p3)
118 | ```
119 | 
120 | ```{r fig.width=12, fig.height=5}
121 | grid.arrange(p1, p2, p3, ncol=3)
122 | ```
123 | 
124 | ```{r eval=F}
125 | pdf('fig_stats.pdf', width=12, height=5)
126 | grid.arrange(p1, p2, p3, ncol=3)
127 | dev.off()
128 | ```
129 | 


--------------------------------------------------------------------------------
/sample/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Sample-specific correlation for all genes
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, fig.width=10, warning=F, message=F)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(ggplot2)
 15 | library(dplyr)
 16 | library(tidyr)
 17 | library(xtable)
 18 | options(xtable.type='html')
 19 | source('../lib/utils.R')
 20 | ```
 21 | 
 22 | ```{r}
 23 | opts <- list()
 24 | opts$data_file <- '../data/join/data.rds'
 25 | opts$expr <- '../data/expr/data_proc/data_expr.rds'
 26 | opts$genes_file <- NA
 27 | # opts$genes_file <- '../data/expr/data_raw/subpopulation_lif_genes9.csv'
 28 | opts$samples_file <- '../data/samples/samples_stats.csv'
 29 | opts$bulk_file <- '../../151019_ficz_sample/sample/r.rds'
 30 | opts$cache <- T
 31 | opts$permute <- F
 32 | opts$alpha <- 0.1
 33 | ```
 34 | 
 35 | ```{r}
 36 | theme_pub <- function() {
 37 |   p <- theme(
 38 |     axis.text=element_text(size=rel(1.2), color='black'),
 39 |     axis.title=element_text(size=rel(1.5)),
 40 |     legend.position='top',
 41 |     legend.text=element_text(size=rel(1.2)),
 42 |     legend.title=element_text(size=rel(1.2)),
 43 |     legend.key=element_rect(fill='transparent'),
 44 |     panel.border=element_blank(),
 45 |     panel.grid.major = element_blank(),
 46 |     panel.grid.minor = element_blank(),
 47 |     panel.background = element_blank(),
 48 |     axis.line = element_line(colour="black", size=1),
 49 |     axis.ticks.length = unit(.3, 'cm'),
 50 |     axis.ticks.margin = unit(.3, 'cm')
 51 |     )
 52 |   return (p)
 53 | }
 54 | ```
 55 | 
 56 | ```{r}
 57 | cmp <- list()
 58 | dat <- list()
 59 | ```
 60 | 
 61 | ```{r data}
 62 | dat$em <- readRDS(opts$data_file)
 63 | if (!is.na(opts$genes_file)) {
 64 |   dat$genes <- read.csv(opts$genes_file, sep=',', head=T) %>% tbl_df
 65 |   names(dat$genes) <- c('ens_id', 'gene_id')
 66 |   h <- as.vector(dat$genes$ens_id)
 67 |   dat$em <- dat$em %>% filter(ens_id %in% h)
 68 | }
 69 | ```
 70 | 
 71 | ```{r}
 72 | dat$samples <- read.csv(opts$samples_file, sep='\t', head=T) %>%
 73 |   mutate(sample=id, sample_short=sub('^[^_]+_', '', sample)) %>% tbl_df
 74 | ```
 75 | 
 76 | ## Bulk correlation
 77 | 
 78 | ```{r}
 79 | dat$emb <- dat$em %>% group_by(name, id_.x, id_.y) %>%
 80 |   summarise(expr=mean(expr, na.rm=T), met=weighted.mean(met, weight, na.rm=T)) %>%
 81 |   ungroup
 82 | cmp$rb <- dat$emb %>% group_by(name) %>% do(wtd_cor(.$met, .$expr)) %>%
 83 |   ungroup %>% arrange(desc(abs(r))) %>%
 84 |   mutate(name=factor(name, levels=rev(name)), sig=(p <= opts$alpha))
 85 | ```
 86 | 
 87 | 
 88 | ```{r results='asis'}
 89 | h <- cmp$rb %>% select(name, n, r, p, sig) %>% as.data.frame %>% arrange(as.vector(name))
 90 | print(xtable(h, digits=2))
 91 | ```
 92 | 
 93 | ```{r fig.width=10, fig.height=8}
 94 | ggplot(cmp$rb, aes(x=name, y=r, fill=-log10(p))) + geom_bar(stat='identity') +
 95 |   xlab('') + ylab('r') + coord_flip() + theme_pub()
 96 | ```
 97 | 
 98 | ```{r fig.height=8}
 99 | d <- dat$emb
100 | d$name <- factor(d$name, levels=rev(levels(cmp$rb$name)))
101 | ggplot(d, aes(x=met, y=expr)) +
102 |   geom_point(size=0.3) +
103 |   stat_density2d(color='darkgrey') +
104 |   stat_smooth(method=lm, color='blue') +
105 |   facet_wrap(~name) + theme_pub()
106 | ```
107 | 
108 | 
109 | ## Sample-specific correlation
110 | 
111 | ```{r cor, cache=opts$cache}
112 | r <- dat$em %>% group_by(name, sample)
113 | if (opts$permute) {
114 |   r <- r %>% do(wtd_cor(.$expr[sample(length(.$expr))], .$met, .$weight))
115 | } else {
116 |   r <- r %>% do(wtd_cor(.$expr, .$met, .$weight))
117 | }
118 | r <- r %>% ungroup %>% group_by(name) %>%
119 |   mutate(p_adj=p.adjust(p, method='fdr')) %>% ungroup
120 | cmp$r <- r
121 | ```
122 | 
123 | ```{r}
124 | cmp$r <- cmp$r %>% mutate(name=factor(name, levels=rev(levels(name)))) %>%
125 |   inner_join(dat$samples, by='sample')
126 | ```
127 | 
128 | ```{r}
129 | saveRDS(cmp$r, file='r.rds')
130 | ```
131 | 
132 | ```{r eval=T}
133 | d <- cmp$r %>% select(anno=name, sample, r, p, p_adj, cpg_cov=cov, mean_met=CpG.rate) %>%
134 |   arrange(as.vector(anno))
135 | write.table(d, file='correlations.csv', quote=F, row=F, sep=',')
136 | ```
137 | 
138 | ```{r}
139 | cmp$rs <- cmp$r %>% group_by(name) %>%
140 |   summarise(
141 |     n=mean(n),
142 |     r=mean(r),
143 |     sig_pos=sum(p_adj <= opts$alpha & r > 0),
144 |     sig_neg=sum(p_adj <= opts$alpha & r < 0)
145 |   ) %>% arrange(as.vector(name))
146 | ```
147 | 
148 | ```{r results='asis'}
149 | xtable(as.data.frame(cmp$rs), digits=2)
150 | ```
151 | 
152 | ```{r eval=F}
153 | write.table(cmp$rs, 'tab_sample_r.csv', sep='\t', row.names=F)
154 | ```
155 | 
156 | ```{r eval=!is.na(opts$bulk_file)}
157 | cmp$rb <- readRDS(opts$bulk_file)
158 | ```
159 | 
160 | ```{r fig.height=10}
161 | d <- cmp$r
162 | h <- d %>% group_by(name) %>% summarise(r_mean=mean(r, na.rm=T))
163 | d <- d %>% inner_join(h, by='name')
164 | 
165 | db <- cmp$rb
166 | p <- ggplot(d, aes(x=name, y=r)) +
167 |   geom_hline(yintercept=0, linetype='dashed', color='darkgrey') +
168 |   geom_boxplot(aes(fill=r_mean), alpha=0.4, outlier.size=0) +
169 |   geom_point(position=position_jitter(w=0.1), size=1.5) +
170 |   geom_point(data=db, color='black', fill='orange1', shape=21, size=2.5) +
171 |   scale_fill_gradient2(low='darkblue', mid='white', high='darkred', midpoint=0.0) +
172 |   xlab('') + ylab('Pearson correlation') +
173 |   coord_flip() +
174 |   theme_pub() +
175 |   theme(legend.position='right')
176 | print(p)
177 | ```
178 | 
179 | ```{r eval=F}
180 | write.csv(d, 'fig2c.csv')
181 | ```
182 | 
183 | ```{r eval=F}
184 | ggsave(p, file='fig_sample.pdf', width=8, height=8)
185 | ```
186 | 
187 | ```{r eval=F}
188 | s <- c('gene_body', 'cgi', 'intron', 'exon', 'intergenic', 'prom_cgi', 'prom_non_cgi', 'H3K27ac', 'H3K27me3', 'H3K4me1', 'LMR', 'p300')
189 | d <- cmp$r %>% filter(name %in% s) %>%
190 |   mutate(name=factor(name, levels=s))
191 | h <- d %>% group_by(name) %>% summarise(r_mean=mean(r, na.rm=T))
192 | d <- d %>% inner_join(h, by='name')
193 | 
194 | db <- cmp$rb %>% filter(name %in% s)
195 | p <- ggplot(d, aes(x=name, y=r)) +
196 |   geom_hline(yintercept=0, linetype='dashed', color='darkgrey') +
197 |   geom_boxplot(aes(fill=r_mean), alpha=0.4, outlier.size=0) +
198 |   geom_point(position=position_jitter(w=0.1), size=1.5) +
199 |   geom_point(data=db, color='black', fill='orange1', shape=21, size=2.5) +
200 |   scale_fill_gradient2(low='darkblue', mid='white', high='darkred', midpoint=0.0) +
201 |   xlab('') + ylab('Pearson correlation') +
202 |   theme_pub() +
203 |   theme(
204 |     legend.position='right',
205 |     axis.text.x=element_text(angle=30, hjust=1)
206 |     )
207 | print(p)
208 | ```
209 | 
210 | ```{r eval=F}
211 | ggsave(p, file='fig_sample_hor.pdf', width=12, height=6)
212 | ```
213 | 
214 | 
215 | ## R versus coverage
216 | 
217 | ```{r fig.height=12}
218 | d <- cmp$r %>% mutate(name=factor(name, levels=rev(levels(name))))
219 | p <- ggplot(d, aes(x=cov, y=r)) +
220 |   geom_hline(yintercept=0, color='darkgrey') +
221 |   geom_point(size=2) +
222 |   guides(color=guide_legend(title='Method ')) +
223 |   facet_wrap(~name, ncol=3) + theme(legend.position='bottom') +
224 |   xlab('\nCpG coverage') + ylab('Pearson correlation\n') +
225 |   theme_pub()
226 | print(p)
227 | ```
228 | 
229 | ```{r eval=F}
230 | ggsave('fig_r_cov.pdf', p, width=10, height=13)
231 | ```
232 | 
233 | ```{r eval=F}
234 | write.csv(d, 'S16.csv')
235 | ```
236 | 
237 | ## R versus mean methylation
238 | 
239 | 
240 | ```{r fig.height=12}
241 | d <- cmp$r %>% mutate(name=factor(name, levels=rev(levels(name))))
242 | p <- ggplot(d, aes(x=CpG.rate, y=r)) +
243 |   geom_hline(yintercept=0, color='darkgrey') +
244 |   geom_point(size=2) +
245 |   guides(color=guide_legend(title='Method ')) +
246 |   facet_wrap(~name, ncol=3) + theme(legend.position='bottom') +
247 |   xlab('\nMean CpG methylation rate') + ylab('Pearson correlation\n') +
248 |   theme_pub()
249 | print(p)
250 | ```
251 | 
252 | ```{r eval=F}
253 | ggsave('fig_r_mean.pdf', p, width=10, height=13)
254 | ```
255 | 
256 | ```{r eval=F}
257 | write.csv(d, 'S15.csv')
258 | ```
259 | 


--------------------------------------------------------------------------------
/var/m14_m15P3/index.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: Comparison methylation variability scBS14 scBS15
 3 | date: 151020
 4 | output:
 5 |   html_document:
 6 |     toc: yes
 7 | ---
 8 | 
 9 | ```{r, include=F}
10 | library(knitr)
11 | opts_chunk$set(echo=F, warning=F, message=F)
12 | ```
13 | 
14 | ```{r, include=F}
15 | library(ggplot2)
16 | library(dplyr)
17 | library(tidyr)
18 | library(grid)
19 | source('../../lib/utils.R')
20 | ```
21 | 
22 | ```{r}
23 | opts <- list()
24 | opts$s14_file <- '../met14/stats.rds'
25 | opts$s15_file <- '../m15P3/stats.rds'
26 | opts$colors <- c('scBS14'='forestgreen', 'scBS15'='royalblue')
27 | opts$annos <- c('all', 'gene_body', 'intron', 'exon', 'intergenic', 'prom_cgi',
28 |   'prom_non_cgi', 'cgi', 'p300', 'H3K4me1', 'H3K27ac', 'H3K4me3', 'LMR')
29 | ```
30 | 
31 | ```{r}
32 | dat <- list()
33 | dat$s14 <- readRDS(opts$s14_file) %>% filter(name %in% opts$annos) %>% tbl_df
34 | dat$s15 <- readRDS(opts$s15_file) %>% filter(name %in% opts$annos) %>% tbl_df
35 | d <- list()
36 | for (n in c('s14', 's15')) {
37 |   d[[n]] <- dat[[n]]
38 |   d[[n]]$data <- n
39 | }
40 | dat$var <- do.call(rbind.data.frame, d) %>%
41 |   mutate(data=factor(data, levels=c('s14', 's15'), labels=c('scBS14', 'scBS15'))) %>%
42 |   tbl_df
43 | # Scaled variance
44 | dat$var <- dat$var %>% group_by(data, name) %>%
45 |   mutate(wtd_var_s=wtd_var/sd(wtd_var, na.rm=T)) %>% ungroup
46 | ```
47 | 
48 | ## Weighted variance
49 | 
50 | ```{r eval=F}
51 | 
52 | ```
53 | 
54 | 
55 | ```{r}
56 | d <- dat$var %>% filter(name %in% opts$annos) %>%
57 |   mutate(name=factor(name, levels=rev(opts$annos)))
58 | ```
59 | 
60 | ```{r eval=F}
61 | h <- d %>% select(data, name, id_) %>%
62 |   group_by(data, name) %>% sample_frac(0.3) %>% ungroup
63 | write.csv(h, 'fig1d.csv')
64 | ```
65 | 
66 | ```{r fig.width=8, fig.height=8}
67 | p <- ggplot(d, aes(x=name, fill=data, y=wtd_var)) + geom_boxplot() +
68 |   scale_fill_manual(values=opts$colors) +
69 |   xlab('') + ylab('\nVariance') + coord_flip() +
70 |   theme_pub() + ylim(0, 1300) +
71 |   theme(legend.position='right')
72 | print(p)
73 | ```
74 | 
75 | ```{r eval=F}
76 | ggsave('fig_var.pdf', p, width=8, height=9)
77 | ```
78 | 
79 | 
80 | ## Unweighted variance
81 | 
82 | ```{r}
83 | ggplot(dat$var, aes(x=name, fill=data, y=var)) + geom_boxplot() +
84 |   xlab('') + ylab('Unweighted variance') + coord_flip() +
85 |   ylim(0, 2000) + theme_pub()
86 | ```
87 | 
88 | ```{r}
89 | ggplot(dat$var, aes(x=name, fill=name, y=wtd_var)) + geom_boxplot() +
90 |   xlab('') + ylab('Unweighted variance') + coord_flip() +
91 |   facet_wrap(~data) +
92 |   guides(fill=F) +
93 |   ylim(0, 2000)
94 | ```
95 | 


--------------------------------------------------------------------------------
/var/m15P3/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Quantification methylation variability
  3 | date: 151019
  4 | output:
  5 |   html_document:
  6 |     toc: yes
  7 | ---
  8 | 
  9 | ```{r, include=F}
 10 | library(knitr)
 11 | opts_chunk$set(echo=F)
 12 | ```
 13 | 
 14 | ```{r, include=F}
 15 | library(ggplot2)
 16 | library(dplyr)
 17 | library(tidyr)
 18 | library(weights)
 19 | library(xtable)
 20 | options(xtable.type='html')
 21 | ```
 22 | 
 23 | ```{r}
 24 | opts <- list()
 25 | opts$met_file <- '../data/m15P3/join/data_met.rds'
 26 | opts$met_meta_file <- '../data/m15P3/join/data_meta.rds'
 27 | opts$cache <- F
 28 | ```
 29 | 
 30 | ```{r read, cache=opts$cache}
 31 | dat <- list()
 32 | dat$met <- readRDS(opts$met_file)
 33 | ```
 34 | 
 35 | ```{r compute, cache=opts$cache}
 36 | stats <- function(d) {
 37 |   s <- summarise(d,
 38 |     mean=mean(met, na.rm=T),
 39 |     var=var(met, na.rm=T),
 40 |     wtd_mean=weighted.mean(met, weight, na.rm=T),
 41 |     wtd_var=wtd.var(met, weight, na.rm=T)
 42 |     )
 43 |   return (s)
 44 | }
 45 | 
 46 | cmp <- list()
 47 | cmp$var <- dat$met %>% group_by(name, id_) %>% stats
 48 | d <- dat$met %>% group_by(id_) %>% stats
 49 | d$name <- 'all'
 50 | cmp$var <- rbind.data.frame(cmp$var, d) %>% ungroup
 51 | cmp$var <- cmp$var %>% mutate(name=factor(name)) %>%
 52 |   mutate(name=factor(name, levels=sort(levels(name)))) %>%
 53 |   mutate(name=relevel(name, 'all')) %>%
 54 |   mutate(name=factor(name, levels=rev(levels(name))))
 55 | 
 56 | cmp$stats <- cmp$var %>% group_by(name) %>% summarise(
 57 |   n=n(),
 58 |   mean=mean(mean, na.rm=T),
 59 |   wtd_mean=mean(wtd_mean, na.rm=T),
 60 |   var=mean(var, na.rm=T),
 61 |   wtd_var=mean(wtd_var, na.rm=T)
 62 |   ) %>% arrange(name)
 63 | ```
 64 | 
 65 | ```{r}
 66 | saveRDS(cmp$var, file='stats.rds')
 67 | ```
 68 | 
 69 | 
 70 | ## Statistics
 71 | 
 72 | ```{r results='asis'}
 73 | d <- as.data.frame(cmp$stats %>% arrange(desc(name)))
 74 | print(xtable(d, digits=2))
 75 | ```
 76 | 
 77 | ## Weighted variance
 78 | 
 79 | ```{r}
 80 | ggplot(cmp$var, aes(x=name, fill=name, y=wtd_var)) + geom_boxplot() +
 81 |   xlab('') + ylab('Weighted variance') + coord_flip() + guides(fill=F)
 82 | ```
 83 | 
 84 | ## Unweighted variance
 85 | 
 86 | ```{r}
 87 | ggplot(cmp$var, aes(x=name, fill=name, y=var)) + geom_boxplot() +
 88 |   xlab('') + ylab('Unweighted variance') + coord_flip() + guides(fill=F)
 89 | ```
 90 | 
 91 | ## Weighted mean
 92 | 
 93 | ```{r}
 94 | ggplot(cmp$var, aes(x=name, fill=name, y=wtd_mean)) + geom_boxplot() +
 95 |   xlab('') + ylab('Weighted mean') + coord_flip() + guides(fill=F)
 96 | ```
 97 | 
 98 | ## Unweighted mean
 99 | 
100 | ```{r}
101 | ggplot(cmp$var, aes(x=name, fill=name, y=mean)) + geom_boxplot() +
102 |   xlab('') + ylab('Unweighted mean') + coord_flip() + guides(fill=F)
103 | ```
104 | 
105 | ## Number of sites
106 | 
107 | ```{r}
108 | d <- cmp$stats %>% filter(name != 'all')
109 | ggplot(d, aes(x=name, y=n, fill=name)) +
110 |   geom_bar(stat='identity') + xlab('') + ylab('Number of sites') +
111 |   coord_flip() + guides(fill=F)
112 | ```
113 | 


--------------------------------------------------------------------------------
/var/met14/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Quantification methylation variability
  3 | date: 150418
  4 | output:
  5 |   html_document:
  6 |     toc: yes
  7 | ---
  8 | 
  9 | ```{r, include=F}
 10 | library(knitr)
 11 | opts_chunk$set(echo=F)
 12 | ```
 13 | 
 14 | ```{r, include=F}
 15 | library(ggplot2)
 16 | library(dplyr)
 17 | library(tidyr)
 18 | library(weights)
 19 | library(xtable)
 20 | options(xtable.type='html')
 21 | ```
 22 | 
 23 | ```{r}
 24 | opts <- list()
 25 | opts$data_dir <- '../data/met14'
 26 | opts$samples_file <- '../data/met14/samples_meta.csv'
 27 | opts$cache <- T
 28 | ```
 29 | 
 30 | ```{r read, cache=opts$cache}
 31 | read_samples <- function(filename) {
 32 |   h <- read.table(filename, sep='\t', head=T) %>% tbl_df
 33 |   return (h)
 34 | }
 35 | 
 36 | dat <- list()
 37 | d <- list.dirs(opts$data_dir, full.names=F, recursive=F) %>% basename
 38 | d <- setdiff(d, c('tmp', 'met'))
 39 | dat$annos <- d
 40 | # dat$annos <- c('active_enhancer', 'IAP', 'cgi')
 41 | d <- list()
 42 | for (a in dat$annos) {
 43 |   d[[a]] <- readRDS(file.path(opts$data_dir, a, 'data_met.rds'))
 44 |   d[[a]]$name <- a
 45 | }
 46 | d <- do.call(rbind.data.frame, d)
 47 | dat$met <- d %>% tbl_df
 48 | dat$samples <- read_samples(opts$samples_file)
 49 | h <- dat$samples %>% filter(cond == 'serum', id != 'ESC_Ser3_RSC27_4',
 50 |   id != 'ESC_Ser6_RSC27_7') %>% select(id) %>% unlist %>%
 51 |   as.vector
 52 | dat$met <- dat$met %>% filter(sample %in% h) %>% mutate(sample=factor(sample))
 53 | rownames(dat$met) <- NULL
 54 | ```
 55 | 
 56 | ```{r compute, cache=opts$cache}
 57 | stats <- function(d) {
 58 |   s <- summarise(d,
 59 |     mean=mean(met, na.rm=T),
 60 |     var=var(met, na.rm=T),
 61 |     wtd_mean=weighted.mean(met, weight, na.rm=T),
 62 |     wtd_var=wtd.var(met, weight, na.rm=T)
 63 |     )
 64 |   return (s)
 65 | }
 66 | 
 67 | cmp <- list()
 68 | cmp$var <- dat$met %>% group_by(name, id_) %>% stats
 69 | d <- dat$met %>% group_by(id_) %>% stats
 70 | d$name <- 'all'
 71 | cmp$var <- rbind.data.frame(cmp$var, d) %>% ungroup
 72 | cmp$var <- cmp$var %>% mutate(name=factor(name)) %>%
 73 |   mutate(name=factor(name, levels=sort(levels(name)))) %>%
 74 |   mutate(name=relevel(name, 'all')) %>%
 75 |   mutate(name=factor(name, levels=rev(levels(name))))
 76 | 
 77 | cmp$stats <- cmp$var %>% group_by(name) %>% summarise(
 78 |   n=n(),
 79 |   mean=mean(mean, na.rm=T),
 80 |   wtd_mean=mean(wtd_mean, na.rm=T),
 81 |   var=mean(var, na.rm=T),
 82 |   wtd_var=mean(wtd_var, na.rm=T)
 83 |   ) %>% arrange(name)
 84 | ```
 85 | 
 86 | ```{r}
 87 | saveRDS(cmp$var, file='stats.rds')
 88 | ```
 89 | 
 90 | ## Statistics
 91 | 
 92 | ```{r results='asis'}
 93 | d <- as.data.frame(cmp$stats %>% arrange(desc(name)))
 94 | print(xtable(d, digits=2))
 95 | ```
 96 | 
 97 | ## Weighted variance
 98 | 
 99 | ```{r}
100 | ggplot(cmp$var, aes(x=name, fill=name, y=wtd_var)) + geom_boxplot() +
101 |   xlab('') + ylab('Weighted variance') + coord_flip() + guides(fill=F)
102 | ```
103 | 
104 | ## Unweighted variance
105 | 
106 | ```{r}
107 | ggplot(cmp$var, aes(x=name, fill=name, y=var)) + geom_boxplot() +
108 |   xlab('') + ylab('Unweighted variance') + coord_flip() + guides(fill=F)
109 | ```
110 | 
111 | ## Weighted mean
112 | 
113 | ```{r}
114 | ggplot(cmp$var, aes(x=name, fill=name, y=wtd_mean)) + geom_boxplot() +
115 |   xlab('') + ylab('Weighted mean') + coord_flip() + guides(fill=F)
116 | ```
117 | 
118 | ## Unweighted mean
119 | 
120 | ```{r}
121 | ggplot(cmp$var, aes(x=name, fill=name, y=mean)) + geom_boxplot() +
122 |   xlab('') + ylab('Unweighted mean') + coord_flip() + guides(fill=F)
123 | ```
124 | 
125 | ## Number of sites
126 | 
127 | ```{r}
128 | d <- cmp$stats %>% filter(name != 'all')
129 | ggplot(d, aes(x=name, y=n, fill=name)) +
130 |   geom_bar(stat='identity') + xlab('') + ylab('Number of sites') +
131 |   coord_flip() + guides(fill=F)
132 | ```
133 | 


--------------------------------------------------------------------------------
/zoom/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: Esrrb
  3 | output:
  4 |   html_document:
  5 |     toc: yes
  6 | ---
  7 | 
  8 | ```{r, include=F}
  9 | library(knitr)
 10 | opts_chunk$set(echo=F, warning=F, message=F, fig.width=10)
 11 | ```
 12 | 
 13 | ```{r, include=F}
 14 | library(ggplot2)
 15 | library(dplyr)
 16 | library(tidyr)
 17 | library(gridExtra)
 18 | library(scales)
 19 | library(xtable)
 20 | source('../lib/utils.R')
 21 | source('./viz.R')
 22 | ```
 23 | 
 24 | ```{r}
 25 | options(xtable.type='html')
 26 | ```
 27 | 
 28 | ```{r}
 29 | opts <- list()
 30 | opts$max_dist <- 10000
 31 | opts$win$met_file <- 'w3k_s1k_d20k/data_met.rds'
 32 | opts$win$meta_file <- 'w3k_s1k_d20k/data_meta.rds'
 33 | opts$em_file <- '../data/join/data.rds'
 34 | opts$expr$rate_file <- '../data/expr/data_proc/data_expr.rds'
 35 | opts$expr$meta_file <- '../data/expr/data_proc/data_meta.rds'
 36 | opts$sample_filter <- c()
 37 | opts$init <- T
 38 | ```
 39 | 
 40 | ```{r read, eval=opts$init}
 41 | filter_samples <- function(d) {
 42 |   f <- opts$samples_filter
 43 |   if (!is.na(t) & length(f) > 0) {
 44 |     d <- d %>% filter(sample %in% f)
 45 |   }
 46 |   return (d)
 47 | }
 48 | 
 49 | dat <- list()
 50 | dat$win$met <- readRDS(opts$win$met_file) %>% tbl_df %>% filter_samples
 51 | dat$win$meta <- readRDS(opts$win$meta_file) %>% tbl_df
 52 | dat$expr$rate <- readRDS(opts$expr$rate_file) %>% tbl_df %>% filter_samples
 53 | dat$expr$meta <- readRDS(opts$expr$meta_file) %>% tbl_df
 54 | dat$em$all <- readRDS(opts$em_file) %>% tbl_df %>% filter_samples
 55 | dat$em$meta <- dat$em$all %>% select(-c(sample, expr, met, weight)) %>% distinct
 56 | ```
 57 | 
 58 | ```{r eval=opts$init}
 59 | cmp <- list()
 60 | cmp$r <- readRDS('../gene/r.rds')
 61 | ```
 62 | 
 63 | ```{r eval=opts$init}
 64 | viz <- list()
 65 | viz$sel <- cmp$r %>%
 66 |   filter(name %in% c('LMR', 'p300'), gene_id == 'Esrrb', p_adj < 0.1) %>%
 67 |   arrange(r) %>% head(2) %>%
 68 |   select(ens_id, gene_id, name, id_.x, id_.y, r, p, p_adj, chromo, start.x, end.x, start.y, end.y)
 69 | idx <- viz$sel[1,]$id_.x
 70 | ```
 71 | 
 72 | ```{r eval=opts$init}
 73 | viz$d <- make_data(idx)
 74 | ```
 75 | 
 76 | ```{r}
 77 | a <- list()
 78 | a$gene <- c(86361117, 86521628)
 79 | a$TSS <- c(86360117, 86361217)
 80 | for (i in 1:nrow(viz$sel)) {
 81 |   v <- viz$sel[i,]
 82 |   a[[sprintf('%s_%d', v$name, i)]] <- c(v$start.y, v$end.y)
 83 | }
 84 | 
 85 | viz$da <- data_anno(a)
 86 | 
 87 | viz$a <- plot_anno(viz$da)
 88 | viz$v <- plot_var(viz$d$v, pa=viz$a, xlab=T)
 89 | viz$r <- plot_cor(viz$d$r, pa=viz$a, xlab=T)
 90 | viz$m <- plot_met(viz$d$em, pa=viz$a)
 91 | ```
 92 | 
 93 | ```{r tracks, fig.width=14, fig.height=10}
 94 | grid.arrange(viz$v, viz$r, viz$m, nrow=3, heights=c(0.25, 0.25, 0.5))
 95 | ```
 96 | 
 97 | ```{r eval=F}
 98 | write.csv(viz$v$data, 'fig2b_var.csv')
 99 | write.csv(viz$r$data, 'fig2b_cor.csv')
100 | write.csv(viz$m$data, 'fig2b_cpg.csv')
101 | ```
102 | 
103 | ```{r results='asis'}
104 | xtable(viz$sel, digits=4)
105 | ```
106 | 
107 | ```{r scatter, fig.width=6, fig.height=6}
108 | ps <- list()
109 | for (i in 1:nrow(viz$sel)) {
110 |   v <- viz$sel[i,]
111 |   p <- plot_scatter(as.character(v$name), v$id_.x, v$id_.y)
112 |   ps[[length(ps) + 1]] <- p
113 |   print(p)
114 | }
115 | ```
116 | 
117 | ```{r eval=F}
118 | write.csv(ps[[1]]$data, 'fig2b_p300.csv')
119 | write.csv(ps[[2]]$data, 'fig2b_LMR.csv')
120 | ```
121 | 
122 | 
123 | ```{r}
124 | opts_chunk$set(eval=F)
125 | ```
126 | 
127 | ```{r}
128 | pdf(file='fig_esrrb.pdf', width=18, height=10)
129 | grid.arrange(viz$v, viz$r, viz$m, nrow=3, heights=c(0.25, 0.25, 0.5))
130 | dev.off()
131 | ```
132 | 
133 | ```{r eval=F}
134 | ggsave(viz$m, file='met_annos.jpg', width=18, height=6, dpi=600)
135 | 
136 | h <- list()
137 | h$v <- plot_var(viz$d$v, xlab=T)
138 | h$r <- plot_cor(viz$d$r, xlab=T)
139 | h$m <- plot_met(viz$d$em)
140 | h$m
141 | ggsave(h$m, file='met.jpg', width=18, height=6, dpi=600)
142 | # grid.arrange(h$v, h$r, h$m, nrow=3, heights=c(0.25, 0.25, 0.5))
143 | ```
144 | 
145 | 
146 | 
147 | ```{r}
148 | s <- viz$sel %>% select(ens_id, gene_id, name, chromo, start=start.y, end=end.y, r, p_value=p_adj)
149 | write.table(s, file='fig_esrrb.csv', sep='\t')
150 | ```
151 | 
152 | ```{r}
153 | for (i in 1:nrow(viz$sel)) {
154 |   v <- viz$sel[i,]
155 |   p <- plot_scatter(v$name, v$id_.x, v$id_.y)
156 |   ggsave(p, file=sprintf('fig_esrrb_%d.pdf', i), width=6.5, height=6)
157 | }
158 | ```
159 | 


--------------------------------------------------------------------------------
/zoom/viz.R:
--------------------------------------------------------------------------------
  1 | theme_pub <- function() {
  2 |   p <- theme(
  3 |     axis.text=element_text(size=rel(1.2), color='black'),
  4 |     axis.title=element_text(size=rel(1.5)),
  5 |     legend.position='top',
  6 |     legend.text=element_text(size=rel(1.2)),
  7 |     legend.title=element_text(size=rel(1.2)),
  8 |     legend.key=element_rect(fill='transparent'),
  9 |     panel.border=element_blank(),
 10 |     panel.grid.major = element_blank(),
 11 |     panel.grid.minor = element_blank(),
 12 |     panel.background = element_blank(),
 13 |     axis.line = element_line(colour="black", size=1),
 14 |     axis.ticks.length = unit(.3, 'cm'),
 15 |     axis.ticks.margin = unit(.3, 'cm')
 16 |     )
 17 |   return (p)
 18 | }
 19 | 
 20 | plot_scatter <- function(name_, id_.x_, id_.y_, legend=F) {
 21 |   name_ <- as.character(name_)
 22 |   d <- dat$em$all %>% filter(name == name_, id_.x == id_.x_, id_.y == id_.y_)
 23 |   dm <- d[1,]
 24 |   rc <- cmp$r %>% filter(id_.x==id_.x_, id_.y==id_.y_)
 25 |   r <- wtd_cor(d$expr, d$met, d$weight)
 26 |   pos <- d %>% select(start.y, end.y) %>% distinct
 27 |   title <- sprintf('%s (%d-%d) r=%.2f p=%5g',
 28 |     name_, rc$start.y, rc$end.y,
 29 |     rc$r, rc$p_adj)
 30 |   p <- ggplot(d, aes(x=met, y=expr)) +
 31 |     stat_smooth(method=lm, color='grey34', aes(weight=weight), size=1) +
 32 |     geom_point(aes(color=sample, size=weight), alpha=0.7) +
 33 |     scale_size(range=c(4, 8)) +
 34 |     xlab('\nMethylation rate') + ylab('Expression rate\n') +
 35 |     ggtitle(title)
 36 |  if (legend) {
 37 |     p <- p +
 38 |       guides(
 39 |         color=guide_legend(title='Sample ', ncol=7),
 40 |         size=F) +
 41 |       theme(legend.direction='horizontal')
 42 |    } else {
 43 |      p <- p + guides(color=F, size=F)
 44 |    }
 45 |   p <- p + theme_pub()
 46 |   return (p)
 47 | }
 48 | 
 49 | 
 50 | plot_var <- function(d, pa=NULL, xlab=F) {
 51 |   p <- ggplot(d, aes(x=0.5*(start.y+end.y), y=wtd_var))
 52 |   if (!is.null(pa)) {
 53 |     p <- p + pa
 54 |   }
 55 |   p <- p +
 56 |     geom_line(color='blue', lwd=1) +
 57 |   # geom_point(color='blue') +
 58 |     xlab('') + ylab('Variance\n') +
 59 |     scale_x_continuous(labels=comma) +
 60 |     theme_pub() +
 61 |     theme(axis.title.x=element_blank())
 62 |   if (xlab == F) {
 63 |     p <- p + theme(axis.text.x=element_blank())
 64 |   }
 65 |   return (p)
 66 | }
 67 | 
 68 | plot_cor <- function(d, pa=NULL, legend=F, xlab=F) {
 69 |   p <- ggplot(d, aes(x=0.5*(start.y+end.y), y=r)) +
 70 |     ylim(-1, 1) + geom_hline(yintercept=0, color='darkgrey')
 71 |   if (!is.null(pa)) {
 72 |     p <- p + pa
 73 |   }
 74 |   p <- p +
 75 |     geom_ribbon(aes(ymin=r_lo, ymax=r_up), alpha=0.2) +
 76 |     geom_line(aes(color=-log10(p)), lwd=1) +
 77 |     # geom_point(aes(color=-log10(p)), size=3) +
 78 |     scale_color_gradient(low='black', high='red') +
 79 |     scale_x_continuous(labels=comma) +
 80 |     xlab('') + ylab('Correlation\n') +
 81 |     theme_pub() +
 82 |     theme(legend.position='top', axis.title.x=element_blank())
 83 |   if (xlab == F) {
 84 |     p <- p + theme(axis.text.x=element_blank())
 85 |   }
 86 |   if (!legend) {
 87 |     p <- p + guides(color=F)
 88 |   }
 89 |   return (p)
 90 | }
 91 | 
 92 | plot_met <- function(d, pa=NULL, legend=F, mean=T) {
 93 |   p <- ggplot(d, aes(x=0.5*(start.y+end.y), y=met, color=sample))
 94 |   if (!is.null(pa)) {
 95 |     p <- p + pa
 96 |   }
 97 |   if (mean) {
 98 |     m <- d %>% group_by(id_.x, id_.y) %>% summarise(
 99 |       start.y=unique(start.y), end.y=unique(end.y),
100 |       mean=weighted.mean(met, weight, na.rm=T)
101 |       ) %>% ungroup
102 |     p <- p +
103 |       geom_line(data=m, aes(x=0.5*(start.y+end.y), y=mean),
104 |         color='grey27', size=0.8)
105 |   }
106 | 
107 |   p <- p +
108 |     geom_point(aes(color=sample, size=log2(weight), alpha=weight)) +
109 |     theme_pub() +
110 |     guides(color=F, alpha=F, size=F) +
111 |     xlab('') + ylab('Methylation rate\n') +
112 |     scale_x_continuous(labels=comma) +
113 |     scale_size(range=c(1, 4)) +
114 |     scale_alpha(range=c(0.5, 1))
115 |   if (legend) {
116 |     p <- p + guides(size=guide_legend(title='# CpGs in window'))
117 |   }
118 |   return (p)
119 | }
120 | # id__ <- 2487
121 | # d <- make_data(id__)
122 | # p <- plot_met(d$em)
123 | # print(p)
124 | 
125 | make_data <- function(id__, max_dist=opts$max_dist) {
126 |   e <- dat$expr$rate %>% filter(id_ == id__)
127 |   e_meta <- e[1,] %>% select(-c(sample, expr))
128 | 
129 |   m <- dat$win$met %>% filter(
130 |     chromo == as.character(e_meta$chromo),
131 |     start >= e_meta$start - max_dist,
132 |     end <= e_meta$end + max_dist)
133 |   em <- e %>% inner_join(m, by='sample')
134 | 
135 |   r <- em %>% group_by(id_.x, id_.y) %>%
136 |     do(wtd_cor(.$met, .$expr, weights=.$weight)) %>% ungroup
137 |   r <- r %>%
138 |     inner_join(dat$expr$meta, by=c('id_.x'='id_')) %>%
139 |     inner_join(dat$win$meta, by=c('id_.y'='id_'))
140 | 
141 |   v <- em %>% group_by(id_.x, id_.y) %>%
142 |     summarise(
143 |       wtd_mean=weighted.mean(met, weight, na.rm=T),
144 |       wtd_var=wtd.var(met, weight, na.rm=T)
145 |       ) %>% ungroup
146 |   v <- v %>%
147 |     inner_join(dat$expr$meta, by=c('id_.x'='id_')) %>%
148 |     inner_join(dat$win$meta, by=c('id_.y'='id_'))
149 | 
150 |   stopifnot(nrow(v) == nrow(r))
151 |   stopifnot(all(r$start.x == v$start.x))
152 |   stopifnot(all(r$end.x == v$end.x))
153 | 
154 |   return (list(em=em, r=r, v=v))
155 | }
156 | 
157 | data_anno <- function(a) {
158 |   d <- list()
159 |   for (n in names(a)) {
160 |     an <- a[[n]]
161 |     d[[n]]$name <- n
162 |     d[[n]]$start <- an[1]
163 |     d[[n]]$end <- an[2]
164 |   }
165 |   d <- do.call(rbind.data.frame, d) %>% mutate(name=factor(name)) %>%
166 |     gather(pos, x, -name) %>% tbl_df
167 |   return (d)
168 | }
169 | 
170 | plot_anno <- function(d) {
171 |   p <- geom_vline(data=d, aes(xintercept=x, linetype=name))
172 |   return (p)
173 | }
174 | 
175 | make_plots_scatter <- function(name_, id__) {
176 |   em <- dat$em$all %>% filter(name == name_, id_.x == id__)
177 |   em_meta <- em %>% select(name, id_.x, id_.y) %>% distinct
178 |   p <- list()
179 |   for (i in 1:nrow(em_meta)) {
180 |     emi <- em_meta[i,]
181 |     p[[i]] <- plot_scatter(em %>% filter(id_.y == emi$id_.y))
182 |   }
183 |   return (p)
184 | }
185 | # name_ <- 'active_enhancer'
186 | # id__ <- 4373
187 | # p <- make_plots_scatter(name_, id__)
188 | 
189 | make_plots_track <- function(name_, id__) {
190 |   em_meta <- dat$em$meta %>% filter(name == name_, id_.x == id__) %>%
191 |     arrange(start.y)
192 | 
193 |   p <- list()
194 |   d <- data_anno(em_meta[1,]$start.x, em_meta[1,]$end.x,
195 |     em_meta$start.y, em_meta$end.y)
196 |   p$anno <- plot_anno(d)
197 | 
198 |   pd <- make_data(id__)
199 |   p$var <- plot_var(pd$v, p$anno)
200 |   p$cor <- plot_cor(pd$r, p$anno)
201 |   p$met <- plot_met(pd$em, p$anno)
202 | 
203 |   return (p)
204 | }
205 | # name_ <- 'p300'
206 | # id__ <- 2647
207 | # p <- make_plots_track(name_, id__)
208 | 
209 | make_plots <- function(name_, id__) {
210 |   p <- make_plots_track(name_, id__)
211 |   p$scatter <- make_plots_scatter(name_, id__)
212 |   return (p)
213 | }
214 | 
215 | plot_to_file <- function(name, id__, out_dir='plots') {
216 |   m <- dat$expr$meta %>% filter(id_ == id__)
217 |   fb <- sprintf('%s/%s_id%d_%s_%s', out_dir, name, id__, m$ens_id, m$gene_id)
218 |   p <- make_plots(name, id__)
219 | 
220 |   fn <- sprintf('%s_01.pdf', fb)
221 |   nrow <- ceil(length(p$scatter) / 2)
222 |   pdf(file=fn, width=10, height=5 * nrow)
223 |   h <- p$scatter
224 |   h[['ncol']] <- 2
225 |   do.call(grid.arrange, h)
226 |   dev.off()
227 | 
228 |   fn <- sprintf('%s_02.pdf', fb)
229 |   pdf(file=fn, width=10, height=8)
230 |   h <- list(p$var, p$cor, p$met)
231 |   h[['heights']] <- c(0.2, 0.2, 0.6)
232 |   do.call(grid.arrange, h)
233 |   dev.off()
234 | }
235 | # name_ <- 'active_enhancer'
236 | # id__ <- 4373
237 | # plot_to_file(name_, id__)
238 | 
239 | plot_tracks <- function(name, id__) {
240 |   p <- make_plots(name, id__)
241 |   grid.arrange(p$var, p$cor, p$met, heights=c(0.2, 0.2, 0.6))
242 | }
243 | # name_ <- 'active_enhancer'
244 | # id__ <- 55
245 | # plot_tracks(name_, id__)
246 | 
247 | 


--------------------------------------------------------------------------------