├── NAMESPACE
├── .Rbuildignore
├── .DS_Store
├── .gitignore
├── man
    ├── logo.png
    ├── plot_single_cond.Rd
    ├── dryseq_single.Rd
    ├── plot_models_rhythm.Rd
    ├── drylm.Rd
    └── dryseq.Rd
├── data
    ├── simulatedData.RData
    └── example_vector.RData
├── dryR.Rproj
├── DESCRIPTION
├── R
    ├── dryseq_single.R
    ├── function_plot_models_rhythm.R
    ├── plot_single_cond.R
    ├── f_24_function.R
    ├── dryseq_function_lm.R
    ├── dryseq_function.R
    └── misc_function.R
└── Readme.md


/NAMESPACE:
--------------------------------------------------------------------------------
1 | exportPattern("^[[:alpha:]]+")
2 | 


--------------------------------------------------------------------------------
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^.*\.Rproj$
2 | ^\.Rproj\.user$
3 | 


--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naef-lab/dryR/HEAD/.DS_Store


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | 


--------------------------------------------------------------------------------
/man/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naef-lab/dryR/HEAD/man/logo.png


--------------------------------------------------------------------------------
/data/simulatedData.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naef-lab/dryR/HEAD/data/simulatedData.RData


--------------------------------------------------------------------------------
/data/example_vector.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/naef-lab/dryR/HEAD/data/example_vector.RData


--------------------------------------------------------------------------------
/man/plot_single_cond.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot_single_cond.R
 3 | \name{plot_single_cond}
 4 | \alias{plot_single_cond}
 5 | \title{This function plots data and fit for one condition}
 6 | \usage{
 7 | plot_single_cond(out, gene_name)
 8 | }
 9 | \description{
10 | This function plots data and fit for one condition
11 | }
12 | 


--------------------------------------------------------------------------------
/dryR.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | 


--------------------------------------------------------------------------------
/man/dryseq_single.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/dryseq_single.R
 3 | \name{dryseq_single}
 4 | \alias{dryseq_single}
 5 | \title{This function performs rhythmicity analysis for one condition}
 6 | \usage{
 7 | dryseq_single(
 8 |   countData,
 9 |   sample_name = names(countData),
10 |   group,
11 |   single,
12 |   time,
13 |   period = 24
14 | )
15 | }
16 | \description{
17 | This function performs rhythmicity analysis for one condition
18 | }
19 | 


--------------------------------------------------------------------------------
/man/plot_models_rhythm.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/function_plot_models_rhythm.R
 3 | \name{plot_models_rhythm}
 4 | \alias{plot_models_rhythm}
 5 | \title{Visualization of dryseq results}
 6 | \usage{
 7 | plot_models_rhythm(dryList, file_path_name, period = 24)
 8 | }
 9 | \arguments{
10 | \item{dryList}{from dryseq output}
11 | 
12 | \item{file_path_name}{folder to store output}
13 | }
14 | \description{
15 | This function allows to plot the results of dryseq.
16 | }
17 | \examples{
18 | XYZn
19 | }
20 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: dryR
 2 | Type: Package
 3 | Title: Differential RhythmicitY analysis 
 4 | Version: 1.0.0
 5 | Author: Cedric Gobet and Benjamin Weger
 6 | Maintainer: The package maintainer <cedric.gobet@epfl.ch>
 7 | Description: DryR ( Differential RhythmicitY analysis in R) is a statistical
 8 |      framework based on model selection that is designed to detect and estimate changes 
 9 |      in rhythmic parameters (amplitude and phase) and mean expression in multiple conditions.
10 | biocViews: Software, BiologicalQuestion
11 | Imports:
12 |     combinat,
13 |     parallel,
14 |     doParallel,
15 |     gplots,
16 |     RColorBrewer,
17 |     circular,
18 |     reshape2,
19 |     stringr,
20 |     Rmisc,
21 |     gridExtra
22 | Depends:
23 |     R (>= 3.5.0),
24 |     DESeq2,
25 |     ggplot2,
26 |     S4Vectors,
27 |     foreach,
28 |     pheatmap,
29 |     plotrix
30 | Encoding: UTF-8
31 | LazyData: true
32 | RoxygenNote: 7.1.1
33 | 


--------------------------------------------------------------------------------
/R/dryseq_single.R:
--------------------------------------------------------------------------------
 1 | #' This function performs rhythmicity analysis for one condition
 2 | #' @export
 3 | dryseq_single=function(countData,
 4 |                 group,
 5 |                 time,
 6 |                 single = group[1],
 7 |                 sample_name=names(countData),
 8 |                 period=24){
 9 | 
10 |   sel =  group %in% single
11 | 
12 |   if(!any(sel)){warning("Your single condition is not included in 'group'")}
13 | 
14 |   time      = time[sel]
15 |   group     = group[sel]
16 |   countData = countData[,sel]
17 |   sample_name = sample_name[sel]
18 | 
19 |   countData = countData[rowSums(countData)!=0,]
20 | 
21 |   s1 <- sin(2*pi*time/period)
22 |   c1 <- cos(2*pi*time/period)
23 | 
24 |   conds  = cbind(s1,c1)
25 |   colnames(conds) = c("s1","c1")
26 | 
27 |   colData <- data.frame(row.names=colnames(countData), conds)
28 |   N=length(unique(group))
29 | 
30 |   ############################
31 |   # FIT RHYTHMS
32 |   ######################
33 | 
34 |   dds = DESeq2::DESeqDataSetFromMatrix(countData = countData, colData = colData, design = ~ s1 + c1)
35 |   dds = DESeq(dds, test="LRT", reduced=~1)
36 | 
37 |   ################
38 |   # coefficients / mean, amplitude, phase, p-value
39 |   ###############
40 | 
41 |   res = as.data.frame(cbind(results(dds),coefficients(dds)))
42 |   res = res[,c('pvalue','padj','Intercept','s1','c1')]
43 | 
44 |   phase=period/(2*pi)*atan2(res$s1,res$c1)
45 |   phase=phase%%period
46 |   amp =2*sqrt(res$s1^2+res$c1^2)
47 | 
48 |   res=data.frame(res,phase,amp)
49 | 
50 | 
51 |   #normalized counts
52 |   ncounts       = counts(dds, normalized = TRUE)
53 |   global_df = as.data.frame(cbind(ncounts,res))
54 | 
55 | 
56 |   out = list()
57 | 
58 |   out[["time"]]  = time
59 |   out[["period"]] = period
60 |   out[["results"]] = global_df
61 |   out[["single"]] = single
62 |   message("finished!")
63 |   return(out)
64 | 
65 | }
66 | 


--------------------------------------------------------------------------------
/R/function_plot_models_rhythm.R:
--------------------------------------------------------------------------------
 1 | #' Visualization of dryseq results
 2 | #'
 3 | #' This function allows to plot the results of dryseq.
 4 | #' @param dryList from dryseq output
 5 | #' @param file_path_name folder to store output
 6 | #' @export pdf that summarize the results of dryseq
 7 | #' @examples
 8 | #' XYZn
 9 | plot_models_rhythm = function(dryList,file_path_name, period=24){
10 |   if (length(dev.list()!=0)) {dev.off()}
11 |   t=dryList$time
12 |   group=dryList$group
13 |   x=dryList$results
14 |   
15 |   if("vsd" %in% names(dryList)){
16 |     x[,1:ncol(dryList$vsd)]=dryList$vsd
17 |     
18 |   }
19 |   
20 |   pdf(file =paste0(file_path_name,'summary_heatmap_models.pdf'), paper = "a4")
21 |   nb = table(x[,'chosen_model'])
22 |   nb = nb[order(-nb)]
23 |   mo = as.numeric(names(nb))
24 |   
25 |   for(i in mo){
26 |     
27 |     x_s = subset(x, chosen_model==i)
28 |     
29 |     if(nrow(x_s)>1){
30 |       
31 |       pos_phase = grep('phase',names(x_s))
32 |       
33 |       sum_phase = apply(x_s[,pos_phase],2,sum,na.rm=T)
34 |       if(sum(sum_phase)!=0){
35 |         x_s = x_s[order(x_s[,pos_phase[min(which(sum_phase !=0))]]),]
36 |       }
37 |       
38 |       x_s=as.matrix(x_s[,1:length(t)])
39 |       
40 |       ## Average replicates
41 |       com= paste(t,group)
42 |       com.u=unique(com)
43 |       t.u=sapply(strsplit(com.u," "),"[[",1)
44 |       cond.u=sapply(strsplit(com.u," "),"[[",2)
45 |       
46 |       ss=split(1:length(com),com)
47 |       
48 |       x_s.m=sapply(ss,function(x) if(length(x) > 1){rowMeans(x_s[,x])}else{x_s[,x]})
49 |       
50 |       #remove mean per condition
51 |       condi=sapply(strsplit(colnames(x_s.m)," "),"[[",2)
52 |       ss=split(1:length(condi),condi)
53 |       
54 |       for(k in ss){ x_s.m[,k]= sweep(x_s.m[,k],1,rowMeans(x_s.m[,k]),FUN="-") }
55 |       x_s.m=x_s.m[,match(com.u,colnames(x_s.m))]
56 |       
57 |       
58 |       pheatmap(as.matrix(x_s.m),
59 |                cluster_cols = FALSE,
60 |                cluster_rows = FALSE,
61 |                show_rownames = F ,
62 |                scale = "row",
63 |                gaps_col=which(diff(as.numeric(as.factor(cond.u)))!=0),
64 |                main = paste("model",i," #Genes",nrow(x_s),sep =" "),
65 |                col = colorRampPalette(c('blue','yellow'))(1000))
66 |       
67 |       x_s = subset(x, chosen_model==i)
68 |       ba = 1
69 |       gg = list()
70 |       
71 |       for(kk in pos_phase){
72 |         
73 |         if(sum(x_s[,kk],na.rm=T) !=0){
74 |           gg[[ba]] = circular_phase24H_histogram(x_s[,kk], unique(condi)[which(kk==pos_phase)], period)
75 |           ba = ba + 1
76 |         }
77 |         
78 |       }
79 |       
80 |       if(length(gg)>0){gridExtra::grid.arrange(grobs = gg, ncol = 4, nrow = 1+round(length(pos_phase)/4))}
81 |       
82 |       bas = unique(sum_phase)
83 |       bas=bas[bas!=0]
84 |       la = match(bas, sum_phase)
85 |       if (length(la) > 1) {
86 |         pairs(x_s[, pos_phase[la]], cex = 0.5)
87 |       }
88 |       
89 |     }
90 |     
91 |   }
92 |   dev.off()
93 |   
94 | }
95 | 


--------------------------------------------------------------------------------
/R/plot_single_cond.R:
--------------------------------------------------------------------------------
 1 | #' This function plots data and fit for one condition for glms
 2 | #' @export
 3 | plot_single_cond=function(out,gene_name){
 4 |   out.ncount = out$results[,1:(ncol(out$results)-7)]
 5 |   x=out.ncount[gene_name,]
 6 |   period=out$period
 7 |   t=out$time
 8 |   t.2=seq(min(t),max(t),0.1)
 9 |   c1=out$results[gene_name,'c1']
10 |   s1=out$results[gene_name,'s1']
11 | 
12 |   u=out$results[gene_name,'Intercept']
13 |   c1.f=c1*cos(2*pi*t.2/period)
14 |   s1.f=s1*sin(2*pi*t.2/period)
15 | 
16 |   fit=u+c1.f+s1.f
17 |   df=data.frame(t=t,dat=log2(1+as.numeric(x)))
18 |   df.2=data.frame(t.2=t.2,fit=fit)
19 |   g1=ggplot(data=df,aes(x=t,y=dat)) +
20 |     geom_point(shape=21,color='white',fill='black',size=2) +
21 |     geom_line(data=df.2,aes(x=t.2,y=fit),alpha=.8) +
22 |     theme_bw() + theme(aspect.ratio=1,axis.text=element_text(size = 12),plot.title = element_text(size=10)) +
23 |     xlab("Time") + ylab("Log2 normalized counts") +
24 |     ggtitle(paste(gene_name,"\n","adj_pval:",format(out$results[gene_name,'padj'],scientific=T,digits = 3),", phase:",round(out$results[gene_name,'phase'],2),
25 |                   ", amp:",round(out$results[gene_name,'amp'],2)))
26 |   print(g1)
27 | }
28 | 
29 | #' This function plots data and fit for one condition for linear models
30 | #' @export
31 | plot_single_cond_lm <- function(out, gene_name) {
32 |   # extract normalized counts (all columns except last 9 stats)
33 |   counts <- out$results[, seq_len(ncol(out$results) - 9)]
34 |   expr   <- counts[gene_name, ]
35 |   period <- out$period
36 |   times  <- out$time
37 | 
38 |   # fine grid for smooth fit
39 |   times_grid <- seq(min(times), max(times), by = 0.1)
40 | 
41 |   # coefficients from cycler
42 |   intercepts <- out$results[gene_name, grep("Intercept|mean", colnames(out$results))]
43 |   coef_cos   <- out$results[gene_name, "c1"]
44 |   coef_sin   <- out$results[gene_name, "s1"]
45 | 
46 |   # fitted cos/sin curve
47 |   cos_part <- coef_cos * cos(2 * pi * times_grid / period)
48 |   sin_part <- coef_sin * sin(2 * pi * times_grid / period)
49 |   fit_vals <- intercepts + cos_part + sin_part
50 | 
51 |   # data frames for plotting
52 |   df_points <- data.frame(time = times, expression = as.numeric(expr))
53 |   df_curve  <- data.frame(time = times_grid, fit = fit_vals)
54 | 
55 |   # build plot
56 |   p <- ggplot(df_points, aes(x = time, y = expression)) +
57 |     geom_point(shape = 21, color = "white", fill = "black", size = 2) +
58 |     geom_line(data = df_curve, aes(x = time, y = fit), alpha = 0.8) +
59 |     theme_bw() +
60 |     theme(
61 |       aspect.ratio = 1,
62 |       axis.text    = element_text(size = 12),
63 |       plot.title   = element_text(size = 10)
64 |     ) +
65 |     labs(
66 |       x = "Time",
67 |       y = "Log2 normalized counts",
68 |       title = sprintf(
69 |         "%s\nadj_pval: %.3g, phase: %.2f, amp: %.2f",
70 |         gene_name,
71 |         out$results[gene_name, "padj"],
72 |         round(out$results[gene_name, "phase"], 2),
73 |         round(out$results[gene_name, "amp"], 2)
74 |       )
75 |     )
76 | 
77 |   print(p)
78 | }
79 | 
80 | 
81 | 


--------------------------------------------------------------------------------
/R/f_24_function.R:
--------------------------------------------------------------------------------
 1 | # Test a time series for rhythmicity using a harmonic regression method
 2 | # This function has been used in PMID:24344304. Please cite this paper if used.
 3 | # 
 4 | # @param x Numeric vector to test for rhythmicity
 5 | # @param t Numeric vector of time (in hours)
 6 | # @param period Period of oscillation to test
 7 | # @param offset Phase offset (if needed)
 8 | # @return A vector with number of timepoints, mean, amplitude, relative amplitude, phase, and p-value of rhythmicity
 9 | # 
10 | f24_R2_cycling <- function(x, t = 2 * (0:(length(x) - 1)), period = 24, offset = 0) {
11 |   # remove NAs
12 |   valid <- !is.na(x)
13 |   x <- x[valid]
14 |   t <- t[valid]
15 |   n <- length(x)
16 |   nb.timepoints <- length(valid)
17 | 
18 |   # handle insufficient data
19 |   if (n < 4) {
20 |     stats <- c(
21 |       nb.timepoints = nb.timepoints,
22 |       mean          = if (n > 0) mean(x) else NA,
23 |       amp           = NA,
24 |       relamp        = NA,
25 |       phase         = NA,
26 |       pval          = NA,
27 |       c1            = NA,
28 |       s1            = NA
29 |     )
30 |     return(stats)
31 |   }
32 | 
33 |   # compute sine/cosine terms
34 |   c <- cos(2 * pi * t / period)
35 |   s <- sin(2 * pi * t / period)
36 | 
37 |   # regression coefficients
38 |   A  <- cov(x, c)
39 |   B  <- cov(x, s)
40 |   C1 <- var(c)
41 |   C2 <- cov(c, s)
42 |   C3 <- var(s)
43 | 
44 |   b  <- (A * C2 - B * C1) / (C2^2 - C1 * C3)
45 |   a  <- (A - b * C2) / C1
46 |   mu <- mean(x) - a * mean(c) - b * mean(s)
47 | 
48 |   x_hat <- mu + a * c + b * s
49 |   R2    <- if (var(x) > 0) 1 - var(x - x_hat) / var(x) else NA
50 | 
51 |   # amplitude and phase
52 |   amp    <- 2 * sqrt(a^2 + b^2)
53 |   phi    <- (period / (2 * pi)) * atan2(b, a)
54 |   phase  <- (phi %% period + offset) %% period
55 | 
56 |   # p-value from beta distribution
57 |   pval <- pbeta(R2, (3 - 1) / 2, (n - 3) / 2, lower.tail = FALSE)
58 | 
59 |   c(
60 |     nb.timepoints = nb.timepoints,
61 |     mean          = mean(x),
62 |     amp           = amp,
63 |     relamp        = amp / mu,
64 |     phase         = phase,
65 |     pval          = pval,
66 |     c1            = a,
67 |     s1            = b
68 |   )
69 | }
70 | 
71 | 
72 | 
73 | # This function performs rhythmicity analysis for one condition for a dataframe that 
74 | #' @param data	matrix or vector containing linear data; if a matrix is provided each column represents a sample, each row represents a feature.
75 | #' @param time	vector containing numeric values of the zeitgeber/circadian time for each sample.
76 | #' @param period	numeric value to indicate period length of the oscillation. Default: period = 24 h.
77 | #' @param sample_name	vector containing sample names. Default: colnames are sample names.
78 | #' @return a list that contains the following data.frames/vectors: results (summary of results), parameters (rhythmic parameters), time ((timepoints), period (period length)
79 | #
80 | f_24 = function (data, time, period = 24, sample_name = names(data))
81 | {
82 |   if (is.vector(data)) {
83 |     data = rbind(data, data)
84 |     rownames(data) = c("X1", "X2")
85 |   }
86 |   res_tmp = apply(data, 1, function(x) f24_R2_cycling(x, t = time,
87 |                                                       period = period))
88 |   res = t(res_tmp)
89 |   padj = p.adjust(res[, "pval"], method = "BH")
90 |   res_complete = cbind(res, padj)
91 |   colnames(res_complete)[ncol(res_complete)] = "padj"
92 |   global_df = as.data.frame(cbind(data, res_complete))
93 |   out = list(time = time, period = period, results = global_df,
94 |              parameters = res_complete)
95 |   message("finished!")
96 |   return(out)
97 | }
98 | 


--------------------------------------------------------------------------------
/man/drylm.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/dryseq_function_lm.R
 3 | \name{drylm}
 4 | \alias{drylm}
 5 | \title{Differential rhythmicity analysis for RNA-Seq datasets}
 6 | \usage{
 7 | drylm(
 8 |   data,
 9 |   group,
10 |   time,
11 |   period = 24,
12 |   sample_name = colnames(data),
13 |   batch = rep("A", length(sample_name)),
14 |   n.cores = 1
15 | )
16 | }
17 | \arguments{
18 | \item{data}{matrix or vector containing data; if a matrix is provided each column represents a sample, each row represents a feature.}
19 | 
20 | \item{group}{vector containing the name of each group (e.g. wildtype, knock-out).}
21 | 
22 | \item{time}{vector containing numeric values of the zeiteber/circadian time for each sample.}
23 | 
24 | \item{period}{numeric value to indicate period length of the oscillation. Default: period = 24 h.}
25 | 
26 | \item{sample_name}{vector containing sample names. Default: colnames are sample names.}
27 | 
28 | \item{batch}{vector containing potential batch effects between samples. Default: no batch effect.}
29 | 
30 | \item{nthreads}{vector numeric value to indicate the threads for parallel computing .Default: 60 \% of detected cores.}
31 | }
32 | \value{
33 | a list that contains the following data.frames: results (summary of results), parameters (rhythmic parameters), ncounts (normalized counts), counts (raw counts), cook (cook's distance)
34 | }
35 | \description{
36 | This function performs a rhythmicity analysis based on linear models with a subsequent models selection. The function accepts a time series assuming normally distributed noise of two or more groups. The function outputs the parameters mean, phase and amplitude are for each group.
37 | }
38 | \details{
39 | DryR assesses rhythmicity and mean differences of gene expression in normal data.
40 |      When necessary, a batch specific mean (m) can be given to the drylm function to account for technical batch effects.
41 |      A technical batch effect is not allowed to be confounding so the resulting model matrix is fully ranked.
42 |      To select an optimal gene-specific model, drylm first assesses rhythmicity across the different conditions. To this end, dryR defines different models across all groups.
43 |      Models refined to have either zero (non-rhythmic pattern) or non-zero (rhythmic pattern) α and β coefficients for each analyzed group. Moreover, for some models the values of α and β can be also shared within any combination of all groups
44 |      The coefficients α and β were used to calculate the phase (arctan(α/β)) and amplitude (log2-fold change peak-to-trough; 2sqrt(α^2+β^2) ) of a gene.
45 |      Bayesian information criterion (BIC) based model selection was employed to account for model complexity using the following formula:
46 |      \cr \cr BIC_j = n ln(RSS_j/n)+ k ln(n) \cr \cr
47 |      with RSS the sum of residuals square of the multilinear regression, n the number of time points, and k the number of parameters.
48 |      To assess the confidence of the selected model j we calculated the Schwarz weight (BICW):
49 |      \cr \cr   BICW_j = e^(0.5ΔBIC_j)\ sum(e^0.5 ΔBIC_m), with ΔBIC_j - BIC_j - BIC_m*\cr \cr
50 |      m* is the minimum BIC value in the entire model set. drylm consideres the BICW_j as the confidence level for model j. The model with the highest BICW is selected as the optimal model within the set of all defined models.
51 |      In a second iteration step, drylm set the coefficient α and β to the values of the selected model in the first regression.
52 |      drylm then defined different models for the mean coefficient with differing or shared means between groups. Each model is solved using linear regression and each gene was assigned to a preferred model based on the BICW as described above for the first iteration.
53 | }
54 | \examples{
55 | data = log(simData[["countData"]]+1)
56 | group = simData[["group"]]
57 | time  = simData[["time"]]
58 | dryList = drylm(data,group,time)
59 | head(dryList[["results"]])    # data frame summarizing results
60 | head(dryList[["parameters"]]) # coefficients: phase, amplitude and mean for each group
61 | head(dryList[["ncounts"]])    # normalized counts
62 | }
63 | 


--------------------------------------------------------------------------------
/man/dryseq.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/dryseq_function.R
 3 | \name{dryseq}
 4 | \alias{dryseq}
 5 | \title{Differential rhythmicity analysis for RNA-Seq datasets}
 6 | \usage{
 7 | dryseq(
 8 |   countData,
 9 |   group,
10 |   time,
11 |   period = 24,
12 |   sample_name = colnames(countData),
13 |   batch = rep("A", length(sample_name)),
14 |   n.cores = round(parallel::detectCores() * 0.6, 0)
15 | )
16 | }
17 | \arguments{
18 | \item{countData}{matrix containing non-negative integers; each column represents a sample, each row represents a gene/transcript.}
19 | 
20 | \item{group}{vector containing the name of each sample.}
21 | 
22 | \item{time}{vector containing numeric values of the time for each sample.}
23 | 
24 | \item{period}{numeric value to indicate period length of the oscillation. Default: circadian data period of 24 h.}
25 | 
26 | \item{sample_name}{vector containing sample names. Default: colnames are sample names.}
27 | 
28 | \item{batch}{vector containing potential batch effects between samples. Default: no batch effect.}
29 | 
30 | \item{nthreads}{vector numeric value to indicate the threads for parallel computing .Default: 60 \% of detected cores.}
31 | }
32 | \value{
33 | a list that contains the following data.frames: results (summary of results), parameters (rhythmic parameters), ncounts (normalized counts), counts (raw counts), cook (cook's distance)
34 | }
35 | \description{
36 | This function performs a rhythmicity analysis based on generalized linear models with a subsequent models selection. The function accepts raw count data from a temporal RNA-Seq dataset of two or more groups. The function outputs parameters mean, phase and amplitude are for each group.
37 | }
38 | \details{
39 | DryR assesses rhythmicity and mean differences of gene expression in RNA-Seq count data.
40 |     As proposed (Love et al. 2014), a count Y of a gene in a sample s can be modeled as a negative binomial with a fitted mean μ_gs and a gene-specific dispersion parameter θ_g.
41 |     \cr \cr  \emph{Y_gs}~NB(\emph{μ_gs},\emph{θ_g})\cr\cr
42 |     The fitted mean is proportional to the quantity q of fragments that correspond to a gene in a sample scaled by a sample-specific scaling factor s_s (Love et al. 2014).
43 |     This scaling factor depends on the sampling depth of each library and can be estimated using the median-of-ratios method of DESeq2 (Anders and Huber 2010).
44 |     \cr \cr   \emph{μ_gs} = \emph{s_s q_gs}\cr \cr
45 |     DryR estimates gene-specific distribution θg using empirical Bayes shrinkage described by Love et al. (Love et al. 2014).
46 |     and variance is computed from the following relationship to the dispersion parameter θ:
47 |     \cr \cr   Var(\emph{Y_gs}) = E[(\emph{Y_gs} + \emph{θ_g} \emph{μ^2_gs})]\cr \cr
48 |     The fit uses a generalized linear model with a logarithmic link function. Sample specific size factor (λ_s) is defined as an offset. The full GLM is defined as follows:
49 |     \cr \cr log(\emph{μ_gbcs}) = \emph{m_gb} + \emph{m_gc} + \emph{α_gc} cos(\emph{ω t(s)}) + \emph{β_gc} sin(\emph{ω t(s)}) + log(\emph{λ_t(s)})\cr \cr
50 |      μ is the raw count for gene g, condition/group c and Zeitgeber/circadian time t. α and β are coefficients of the cosine and sine functions, respectively. m is a coefficient to describe a mean expression level.
51 |      When necessary, a batch specific mean (m) can be given to the dryseq function to account for technical batch effects.
52 |      A technical batch effect is not allowed to be confounding so the resulting model matrix is fully ranked.
53 |      To select an optimal gene-specific model, dryseq first assesses rhythmicity across the different conditions. To this end, dryR defines different models across all groups.
54 |      Models refined to have either zero (non-rhythmic pattern) or non-zero (rhythmic pattern) α and β coefficients for each analyzed group. Moreover, for some models the values of α and β can be also shared within any combination of all groups
55 |      The coefficients α and β were used to calculate the phase (arctan(α/β)) and amplitude (log2-fold change peak-to-trough; 2sqrt(α^2+β^2) ) of a gene.
56 |      Bayesian information criterion (BIC) based model selection was employed to account for model complexity using the following formula:
57 |      \cr \cr   BIC_j = ln(n)k - 2ln(L̂_ĵ)\cr \cr
58 |      L̂ is defined as the log-likelihood of the model j from the regression, n is the number of data points and k is the number of parameters.
59 |      To assess the confidence of the selected model j we calculated the Schwarz weight (BICW):
60 |      \cr \cr   BICW_j = e^(0.5ΔBIC_j)\ sum(e^0.5 ΔBIC_m), with ΔBIC_j - BIC_j - BIC_m*\cr \cr
61 |      m* is the minimum BIC value in the entire model set. Dryseq consideres the BICW_j as the confidence level for model j. The model with the highest BICW is selected as the optimal model within the set of all defined models.
62 |      In a second iteration step, dryR set the coefficient α and β to the values of the selected model in the first regression.
63 |      dryseq then defined different models for the mean coefficient with differing or shared means between groups. Each model is solved using generalized linear regression and each gene was assigned to a preferred model based on the BICW as described above for the first iteration.
64 |      The model selection is sensitive to outliers: dryseq provide a cook's distance for each gene. A fit for a gene with a maximum cook's distance of higher than 1 should be considered with care.
65 | }
66 | \examples{
67 | countData = simData[["countData"]]
68 | group = simData[["group"]]
69 | time  = simData[["time"]]
70 | dryList = dryseq(countData,group,time)
71 | head(dryList[["results"]])    # data frame summarizing results
72 | head(dryList[["parameters"]]) # coefficients: phase, amplitude and mean for each group
73 | head(dryList[["ncounts"]])    # normalized counts
74 | head(dryList[["counts"]])     # raw counts
75 | head(dryList[["cook"]])       # cook's distance
76 | }
77 | \references{
78 | Love, M.I., Huber, W., Anders, S. (2014) Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology
79 | 
80 | Anders, S. and Huber, W. (2014) Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology
81 | }
82 | 


--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
  1 | # dryR <img src="man/logo.png" align="right" width=120 height=139 alt="" />
  2 | 
  3 | <!-- badges: start -->
  4 | [![](https://img.shields.io/badge/doi-10.1073/pnas.2015803118-green.svg)](https://doi.org/10.1073/pnas.2015803118)
  5 | <!-- badges: end -->
  6 | 
  7 | `dryR` (Differential RhythmicitY analysis in R) is an R package that provides the statistical framework to assess differential rhythmicity of a time series of RNA-Seq data with two and more conditions.
  8 | 
  9 | ## Getting Started
 10 | 
 11 | These instructions will allow you to get `dryR` running on your machine. 
 12 | 
 13 | ### Prerequisites
 14 | You need to install R (see https://www.r-project.org/).
 15 | 
 16 | `dryR` accepts count data typically produced by RNA-Seq with a time dimension and several conditions/groups. The input count data should contain only integer values and be organized in a matrix with rows indicating a specific gene and the column refering to a sample. 
 17 | 
 18 | The pipelines that can be used to produce these count data from RNA-Seq data (FASTQ files) are described in more detail here:
 19 | http://master.bioconductor.org/packages/release/workflows/vignettes/rnaseqGene/inst/doc/rnaseqGene.html
 20 | 
 21 | A user-friendly web application to generate a count table from FASTQ files is provided here: https://amp.pharm.mssm.edu/biojupies/
 22 | 
 23 | ### Installing
 24 | 
 25 | To install `dryR` run the following code in R.
 26 | ```
 27 | install.packages("devtools")
 28 | devtools::install_github("naef-lab/dryR")
 29 | ```
 30 | ## Quick start
 31 | ### Example dataset 
 32 | `dryR` comes with example data in form of a list called simData. The list contains count data simData[["countData"]], a vector with the different conditions/groups simData[["group"]], and a vector indicating Zeitgeber Time simData[["time"]]. The data was generated using simphony https://github.com/hugheylab/simphony.
 33 | 
 34 | ### Running an example
 35 | ```
 36 | library("dryR")
 37 | 
 38 | # prepare arguments
 39 | countData = simData[["countData"]]
 40 | group     = simData[["group"]]
 41 | time      = simData[["time"]]
 42 | 
 43 | # prefilter
 44 | keep <- rowSums(countData) > 0
 45 | countData <- countData[keep,]
 46 | 
 47 | # run the analysis for count data (e.g. RNA-Seq data)
 48 | dryList   = dryseq(countData,group,time)
 49 | 
 50 | # explore the results
 51 | dryList[["results"]]     # data frame summarizing results
 52 | dryList[["parameters"]]  # coefficients: phase, amplitude and mean for each group
 53 | dryList[["ncounts"]]     # normalized counts
 54 | dryList[["counts"]]      # raw counts
 55 | dryList[["cook"]]        # cook's distance for outlier detection
 56 | dryList[["BICW_rhythm"]] # BICW for each rhythmic model
 57 | dryList[["BICW_mean"]]   # BICW for each mean model
 58 | 
 59 | # generate a pdf with a global summary of all models
 60 | plot_models_rhythm(dryList, "./")
 61 | 
 62 | # plot a feature of interest
 63 | dry_plot(dryList, "feature_113")
 64 | ```
 65 | 
 66 | ## Decision Tree for Various Scenarios
 67 | The dryR package is equipped to handle a variety of dataset types and experimental designs extending beyond RNA-Seq data with multiple conditions/groups (see example above). The decision tree provided below serves as a guide for selecting the appropriate function to use. For example code, refer to the chapter "Non-standard Scenarios."
 68 | ![image](https://github.com/naef-lab/dryR/assets/64013365/a365dce5-16ea-4059-a028-7f60004fda99)
 69 | 
 70 | ## Non-standard Scenarios
 71 | 
 72 | ### Rhythmicity detection in RNA-Seq datasets with one condition
 73 | To detect rhythmic gene expression in RNA-Seq data with only one condition, we implemented the function `dryseq_single`. 
 74 | 
 75 | ```
 76 | library("dryR")
 77 | 
 78 | # prepare arguments for a one condition scenario
 79 | sel = grep("cond_1", simData[["group"]])
 80 | countData_single = simData[["countData"]][,sel]
 81 | group_single     = simData[["group"]][sel]
 82 | time_single      = simData[["time"]][sel]
 83 | 
 84 | # run the analysis for count data.
 85 | dryList   = dryseq_single(countData_single,group_single,time_single)
 86 | 
 87 | # explore the results
 88 | dryList[["results"]]    # data frame summarizing results
 89 | 
 90 | # plot a feature of interest
 91 | plot_single_cond(dryList, "feature_004")
 92 | ```
 93 | 
 94 | ### Normally distributed data
 95 | To asses temporal variation of normally distributed measurements, we implemented the function `drylm` that can deal with gaussian noise using linear models. 
 96 | 
 97 | ```
 98 | library("dryR")
 99 | 
100 | # prepare arguments
101 | data      = log(simData[["countData"]]+1)
102 | group     = simData[["group"]]
103 | time      = simData[["time"]]
104 | 
105 | # run the analysis with normally distributed data
106 | dryList = drylm(data,group,time)
107 | 
108 | # explore the results
109 | dryList[["results"]]    # data frame summarizing results
110 | dryList[["parameters"]] # coefficients: phase, amplitude and mean for each group
111 | 
112 | # generate a pdf with a global summary of all models
113 | plot_models_rhythm(dryList, "./")
114 | 
115 | # plot a feature of interest
116 | dry_plot(dryList, "feature_013")
117 | ```
118 | 
119 | 
120 | ### Normally distributed data with one condition
121 | To detect rhythmicity in normally disributed data with only one condition, we implemented the function `f_24`. 
122 | 
123 | ```
124 | library("dryR")
125 | 
126 | # prepare arguments for a one condition scenario
127 | simData_norm = simData
128 | simData[["normData"]] = log(simData[["countData"]]+1)
129 | sel = grep("cond_1", simData[["group"]])
130 | normData_single       = simData[["normData"]][,sel]
131 | time_single           = simData[["time"]][sel]
132 | 
133 | # run the analysis for count data.
134 | dryList   = f_24(normData_single,time_single)
135 | 
136 | # explore the results
137 | dryList[["results"]]    # data frame summarizing results
138 | 
139 | # plot a feature of interest
140 | plot_single_cond_lm(dryList, "feature_004")
141 | ```
142 | 
143 | 
144 | ### DryR with a simple vector as input
145 | You can run `drylm` with a simple vector that contains data from a time series of multiple groups. 
146 | 
147 | ```
148 | library("dryR")
149 | 
150 | # define time and group for each sample
151 | time        = c(1:48,1:48)   # Zeitgeber time or Circadian time in h for each sample
152 | group       = c(rep("KO",48), rep("WT",48))
153 | 
154 | # run the analysis with normally distributed data
155 | dryList = drylm(vector_example,group,time)
156 | 
157 | # explore the results
158 | dryList[["results"]]    # data frame summarizing results. Row 2 is a copy of row 1.
159 | dryList[["parameters"]] # coefficients: phase, amplitude and mean for each group. Row 2 is a copy of row 1.
160 | 
161 | #plot the result of the selected model
162 | dry_plot(dryList, "X1")
163 | ```
164 | 
165 | ## Help
166 | A documentation using `?dryseq`, `?drylm` or `?dryseq_single` is available. 
167 | 


--------------------------------------------------------------------------------
/R/dryseq_function_lm.R:
--------------------------------------------------------------------------------
  1 | #' Differential rhythmicity analysis for RNA-Seq datasets
  2 | #'
  3 | #' This function performs a rhythmicity analysis based on linear models with a subsequent models selection. The function accepts a time series assuming normally distributed noise of two or more groups. The function outputs the parameters mean, phase and amplitude are for each group.
  4 | #' @param data	matrix or vector containing data; if a matrix is provided each column represents a sample, each row represents a feature.
  5 | #' @param group	vector containing the name of each group (e.g. wildtype, knock-out).
  6 | #' @param time	vector containing numeric values of the zeiteber/circadian time for each sample.
  7 | #' @param period	numeric value to indicate period length of the oscillation. Default: period = 24 h.
  8 | #' @param sample_name	vector containing sample names. Default: colnames are sample names.
  9 | #' @param batch	vector containing potential batch effects between samples. Default: no batch effect.
 10 | #' @param nthreads vector numeric value to indicate the threads for parallel computing .Default: 60 \% of detected cores.
 11 | #' @return a list that contains the following data.frames: results (summary of results), parameters (rhythmic parameters), ncounts (normalized counts), counts (raw counts), cook (cook's distance)
 12 | #' @examples data = log(simData[["countData"]]+1)
 13 | #' group = simData[["group"]]
 14 | #' time  = simData[["time"]]
 15 | #' dryList = drylm(data,group,time)
 16 | #' head(dryList[["results"]])    # data frame summarizing results
 17 | #' head(dryList[["parameters"]]) # coefficients: phase, amplitude and mean for each group
 18 | #' head(dryList[["ncounts"]])    # normalized counts
 19 | #' @details DryR assesses rhythmicity and mean differences of gene expression in normal data.
 20 | #'      When necessary, a batch specific mean (m) can be given to the drylm function to account for technical batch effects.
 21 | #'      A technical batch effect is not allowed to be confounding so the resulting model matrix is fully ranked.
 22 | #'      To select an optimal gene-specific model, drylm first assesses rhythmicity across the different conditions. To this end, dryR defines different models across all groups.
 23 | #'      Models refined to have either zero (non-rhythmic pattern) or non-zero (rhythmic pattern) α and β coefficients for each analyzed group. Moreover, for some models the values of α and β can be also shared within any combination of all groups
 24 | #'      The coefficients α and β were used to calculate the phase (arctan(α/β)) and amplitude (log2-fold change peak-to-trough; 2sqrt(α^2+β^2) ) of a gene.
 25 | #'      Bayesian information criterion (BIC) based model selection was employed to account for model complexity using the following formula:
 26 | #'      \cr \cr BIC_j = n ln(RSS_j/n)+ k ln(n) \cr \cr
 27 | #'      with RSS the sum of residuals square of the multilinear regression, n the number of time points, and k the number of parameters.
 28 | #'      To assess the confidence of the selected model j we calculated the Schwarz weight (BICW):
 29 | #'      \cr \cr   BICW_j = e^(0.5ΔBIC_j)\ sum(e^0.5 ΔBIC_m), with ΔBIC_j - BIC_j - BIC_m*\cr \cr
 30 | #'      m* is the minimum BIC value in the entire model set. drylm consideres the BICW_j as the confidence level for model j. The model with the highest BICW is selected as the optimal model within the set of all defined models.
 31 | #'      In a second iteration step, drylm set the coefficient α and β to the values of the selected model in the first regression.
 32 | #'      drylm then defined different models for the mean coefficient with differing or shared means between groups. Each model is solved using linear regression and each gene was assigned to a preferred model based on the BICW as described above for the first iteration.
 33 | 
 34 | drylm=function(data,group,time,period=24,sample_name=colnames(data),batch=rep("A",length(sample_name)),n.cores=1 ){
 35 | 
 36 |   doParallel::registerDoParallel(cores=n.cores)
 37 |   #update
 38 |   vec = F
 39 |   if(is.vector(data)){data = rbind(data,data)
 40 |   rownames(data) = c("X1","X2")
 41 |   vec = T}
 42 | 
 43 |   sel         = order(group,time)
 44 |   time        = time[sel]
 45 |   group       = group[sel]
 46 |   data        = data[,sel]
 47 |   batch       = batch[sel]
 48 |   sample_name = as.character(sample_name[sel])
 49 | 
 50 |   s1 <- sin(2*pi*time/period)
 51 |   c1 <- cos(2*pi*time/period)
 52 | 
 53 |   conds  = cbind(group,s1,c1,batch)
 54 |   colnames(conds) = c("group","s1","c1","batch")
 55 | 
 56 |   colData <- data.frame(row.names=colnames(data), conds)
 57 |   N=length(unique(group))
 58 | 
 59 |   ############################
 60 |   # FIT RHYTHMS
 61 | 
 62 |   message("fitting rhythmic models")
 63 | 
 64 |   models = create_matrix_list(time, group, N,period)
 65 |   #Reorder u, a, b
 66 |   models = lapply(models, function(l) l[,c(grep("u",colnames(l)),grep("a|b",colnames(l)))])
 67 | 
 68 |   for (i in 1:length(models)){
 69 |     rownames(models[[i]]) = rownames(colData)}
 70 | 
 71 |   if (length(unique(batch))>1) {
 72 |     # add the batch effect
 73 |     model_b = as.matrix(model.matrix(~  batch),contrasts.arg=NULL)[,2:length(unique(batch)),drop=F]
 74 |     colnames(model_b)=paste0("BATCH_",unique(batch)[-1])
 75 |     models = lapply(models, function(l) cbind(model_b,l))
 76 |     models = lapply(models, function(l) l[,c(grep("^u",colnames(l)),grep("^BATCH",colnames(l)),grep("^a|^b",colnames(l)))]   )
 77 |   }
 78 | 
 79 |   fit = parallel::mclapply(split(data, rownames(data)),
 80 |                            do_all_lm,
 81 |                            my_mat= models,
 82 |                            mc.cores=n.cores)
 83 | 
 84 |   # calculate the BIC
 85 |   BIC = unlist(fit)[grep('BIC$',names(unlist(fit)))]
 86 |   BIC= matrix(BIC,nrow=nrow(data),byrow=T)
 87 |   rownames(BIC)=names(fit)
 88 |   BIC=BIC[rownames(data),]
 89 | 
 90 |   #calculate the BICW
 91 |   BICW               = t(apply(BIC,1,compute_BICW))
 92 |   chosen_model      = apply(BIC,1,which.min)
 93 |   chosen_model_BICW = apply(BICW,1,max)
 94 | 
 95 |   ############################
 96 |   # FIT BASELINE
 97 | 
 98 |   message("fitting mean models")
 99 | 
100 |   model_mean_cond=create_matrix_list_mean(N,group)
101 |   model_mean_cond=lapply(model_mean_cond,annotate_matrix,group)
102 | 
103 |   for (i in 1:length(model_mean_cond)){
104 |     rownames(model_mean_cond[[i]]) = rownames(colData)}
105 | 
106 |   gene.list=as.list(rownames(data))
107 |   fit = parallel::mclapply(gene.list,
108 |                            FUN=do_all_lm_mr,
109 |                            data,
110 |                            my_mat_r = models,
111 |                            my_mat_m = model_mean_cond,
112 |                            chosen_model = chosen_model,
113 |                            mc.cores=n.cores)
114 | 
115 |   #extract BIC
116 |   BIC_mean = unlist(fit)[grep('BIC$',names(unlist(fit)))]
117 |   BIC_mean = matrix(BIC_mean,nrow=nrow(data),byrow=T)
118 | 
119 |   #calculate the BICW
120 |   BICW_mean = t(apply(BIC_mean,1,compute_BICW))
121 | 
122 |   chosen_model_mean = apply(BIC_mean,1,which.min)
123 |   chosen_model_mean_BICW = apply(BICW_mean,1,max)
124 | 
125 |   ################
126 |   # coefficients / mean, amplitude and phase
127 |   ###############
128 | 
129 |   message("extracting rhythmic parameters")
130 |   parameters=NULL
131 | 
132 |   parameters =  foreach (i = 1:nrow(data)) %dopar% {
133 |     gene = rownames(data)[i]
134 | 
135 |     dds= fit[[i]][[chosen_model_mean[i]]]$param
136 |     out = compute_param_l(dds,period, N)
137 |     return(out)
138 |   }
139 |   parameters            = data.frame(t(do.call(cbind, parameters)))
140 |   colnames(parameters)  = c(paste(c('mean','a','b','amp','relamp','phase'),rep(unique(group),each =6), sep = "_"))
141 |   rownames(parameters)  = rownames(data)
142 | 
143 | 
144 |   #normalized counts
145 |   ncounts_RF = data
146 | 
147 |   # generate a table summarizing the analysis
148 |   complete_parameters = cbind(parameters,chosen_model,chosen_model_BICW, chosen_model_mean, chosen_model_mean_BICW)
149 |   global_table_df = merge(ncounts_RF,complete_parameters, by="row.names")
150 | 
151 |   rownames(global_table_df) = global_table_df[,1]
152 |   global_table_df           = global_table_df[,-1]
153 | 
154 |   out = list()
155 | 
156 |   out[["time"]]        = time
157 |   out[["group"]]       = group
158 |   out[["results"]]     = global_table_df
159 |   out[["BICW_rhythm"]] = BICW
160 |   out[["BICW_mean"]]   = BICW_mean
161 |   out[["values"]]      = ncounts_RF
162 |   out[["parameters"]]  = complete_parameters
163 | 
164 |   #if(vec == TRUE){
165 |   #  out[["results"]]     = global_table_df[1,]
166 |   #  out[["BICW_rhythm"]] = BICW[1,]
167 |   #  out[["BICW_mean"]]   = BICW_mean[1,]
168 |   #  out[["values"]]      = ncounts_RF[1,]
169 |   #  out[["parameters"]]  = complete_parameters[1,]
170 |   #}
171 | 
172 | 
173 |   message("finished!")
174 |   return(out)
175 | 
176 | 
177 | }
178 | 


--------------------------------------------------------------------------------
/R/dryseq_function.R:
--------------------------------------------------------------------------------
  1 | #' Differential rhythmicity analysis for RNA-Seq datasets
  2 | #'
  3 | #' This function performs a rhythmicity analysis based on generalized linear models with a subsequent models selection. The function accepts raw count data from a temporal RNA-Seq dataset of two or more groups. The function outputs parameters mean, phase and amplitude are for each group.
  4 | #' @param countData	matrix containing non-negative integers; each column represents a sample, each row represents a gene/transcript.
  5 | #' @param group	vector containing the name of each sample.
  6 | #' @param time	vector containing numeric values of the time for each sample.
  7 | #' @param period	numeric value to indicate period length of the oscillation. Default: circadian data period of 24 h.
  8 | #' @param sample_name	vector containing sample names. Default: colnames are sample names.
  9 | #' @param batch	vector containing potential batch effects between samples. Default: no batch effect.
 10 | #' @param nthreads vector numeric value to indicate the threads for parallel computing .Default: 60 \% of detected cores.
 11 | #' @return a list that contains the following data.frames: results (summary of results), parameters (rhythmic parameters), ncounts (normalized counts), counts (raw counts), cook (cook's distance)
 12 | #' @examples countData = simData[["countData"]]
 13 | #' group = simData[["group"]]
 14 | #' time  = simData[["time"]]
 15 | #' dryList = dryseq(countData,group,time)
 16 | #' head(dryList[["results"]])    # data frame summarizing results
 17 | #' head(dryList[["parameters"]]) # coefficients: phase, amplitude and mean for each group
 18 | #' head(dryList[["ncounts"]])    # normalized counts
 19 | #' head(dryList[["counts"]])     # raw counts
 20 | #' head(dryList[["cook"]])       # cook's distance
 21 | #' @details DryR assesses rhythmicity and mean differences of gene expression in RNA-Seq count data.
 22 | #'     As proposed (Love et al. 2014), a count Y of a gene in a sample s can be modeled as a negative binomial with a fitted mean μ_gs and a gene-specific dispersion parameter θ_g.
 23 | #'     \cr \cr  \emph{Y_gs}~NB(\emph{μ_gs},\emph{θ_g})\cr\cr
 24 | #'     The fitted mean is proportional to the quantity q of fragments that correspond to a gene in a sample scaled by a sample-specific scaling factor s_s (Love et al. 2014).
 25 | #'     This scaling factor depends on the sampling depth of each library and can be estimated using the median-of-ratios method of DESeq2 (Anders and Huber 2010).
 26 | #'     \cr \cr   \emph{μ_gs} = \emph{s_s q_gs}\cr \cr
 27 | #'     DryR estimates gene-specific distribution θg using empirical Bayes shrinkage described by Love et al. (Love et al. 2014).
 28 | #'     and variance is computed from the following relationship to the dispersion parameter θ:
 29 | #'     \cr \cr   Var(\emph{Y_gs}) = E[(\emph{Y_gs} + \emph{θ_g} \emph{μ^2_gs})]\cr \cr
 30 | #'     The fit uses a generalized linear model with a logarithmic link function. Sample specific size factor (λ_s) is defined as an offset. The full GLM is defined as follows:
 31 | #'     \cr \cr log(\emph{μ_gbcs}) = \emph{m_gb} + \emph{m_gc} + \emph{α_gc} cos(\emph{ω t(s)}) + \emph{β_gc} sin(\emph{ω t(s)}) + log(\emph{λ_t(s)})\cr \cr
 32 | #'      μ is the raw count for gene g, condition/group c and Zeitgeber/circadian time t. α and β are coefficients of the cosine and sine functions, respectively. m is a coefficient to describe a mean expression level.
 33 | #'      When necessary, a batch specific mean (m) can be given to the dryseq function to account for technical batch effects.
 34 | #'      A technical batch effect is not allowed to be confounding so the resulting model matrix is fully ranked.
 35 | #'      To select an optimal gene-specific model, dryseq first assesses rhythmicity across the different conditions. To this end, dryR defines different models across all groups.
 36 | #'      Models refined to have either zero (non-rhythmic pattern) or non-zero (rhythmic pattern) α and β coefficients for each analyzed group. Moreover, for some models the values of α and β can be also shared within any combination of all groups
 37 | #'      The coefficients α and β were used to calculate the phase (arctan(α/β)) and amplitude (log2-fold change peak-to-trough; 2sqrt(α^2+β^2) ) of a gene.
 38 | #'      Bayesian information criterion (BIC) based model selection was employed to account for model complexity using the following formula:
 39 | #'      \cr \cr   BIC_j = ln(n)k - 2ln(L̂_ĵ)\cr \cr
 40 | #'      L̂ is defined as the log-likelihood of the model j from the regression, n is the number of data points and k is the number of parameters.
 41 | #'      To assess the confidence of the selected model j we calculated the Schwarz weight (BICW):
 42 | #'      \cr \cr   BICW_j = e^(0.5ΔBIC_j)\ sum(e^0.5 ΔBIC_m), with ΔBIC_j - BIC_j - BIC_m*\cr \cr
 43 | #'      m* is the minimum BIC value in the entire model set. Dryseq consideres the BICW_j as the confidence level for model j. The model with the highest BICW is selected as the optimal model within the set of all defined models.
 44 | #'      In a second iteration step, dryR set the coefficient α and β to the values of the selected model in the first regression.
 45 | #'      dryseq then defined different models for the mean coefficient with differing or shared means between groups. Each model is solved using generalized linear regression and each gene was assigned to a preferred model based on the BICW as described above for the first iteration.
 46 | #'      The model selection is sensitive to outliers: dryseq provide a cook's distance for each gene. A fit for a gene with a maximum cook's distance of higher than 1 should be considered with care.
 47 | #' @references Love, M.I., Huber, W., Anders, S. (2014) Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology
 48 | #' @references Anders, S. and Huber, W. (2014) Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology
 49 | dryseq=function(countData,group,time,period=24,sample_name=colnames(countData),batch=rep("A",length(sample_name)),n.cores=round(parallel::detectCores()*.6,0) ){
 50 | #update
 51 |   doParallel::registerDoParallel(cores=n.cores)
 52 |   sel       = order(group,time)
 53 |   time      = time[sel]
 54 |   group     = group[sel]
 55 |   countData = countData[,sel]
 56 |   batch = batch[sel]
 57 |   sample_name = sample_name[sel]
 58 | 
 59 |   countData = countData[rowSums(countData)!=0,]
 60 | 
 61 |   s1 <- sin(2*pi*time/period)
 62 |   c1 <- cos(2*pi*time/period)
 63 | 
 64 |   conds  = cbind(group,s1,c1,batch)
 65 |   colnames(conds) = c("group","s1","c1","batch")
 66 | 
 67 |   colData <- data.frame(row.names=colnames(countData), conds)
 68 |   N=length(unique(group))
 69 | 
 70 |   ############################
 71 |   # FIT RHYTHMS
 72 | 
 73 |   message("fitting rhythmic models")
 74 | 
 75 |   models = create_matrix_list(time,group, N,period)
 76 |   #Reorder u, a, b
 77 |   models = lapply(models, function(l) l[,c(grep("u",colnames(l)),grep("a|b",colnames(l)))]   )
 78 | 
 79 |   for (i in 1:length(models)){
 80 |     rownames(models[[i]]) = rownames(colData)}
 81 | 
 82 |   if (length(unique(batch))>1) {
 83 |     # add the batch effect
 84 |     model_b = as.matrix(model.matrix(~  batch),contrasts.arg=NULL)[,2:length(unique(batch)),drop=F]
 85 |     colnames(model_b)=paste0("BATCH_",unique(batch)[-1])
 86 |     models = lapply(models, function(l) cbind(model_b,l))
 87 |     models = lapply(models, function(l) l[,c(grep("u",colnames(l)),grep("BATCH",colnames(l)),grep("a|b",colnames(l)))]   )
 88 |   }
 89 | 
 90 |   dds = DESeq2::DESeqDataSetFromMatrix(countData = countData, colData = colData,  design=~1)
 91 |   dds.full = DESeq2::DESeq(dds, full=models[[length(models)]], betaPrior = F, fitType = "parametric", test = "Wald", parallel =T, quiet = T)
 92 | 
 93 |   deviances = sapply(models[-length(models)], function(m){
 94 |     dds.x = DESeq2::nbinomWaldTest(dds.full, modelMatrix= m, betaPrior = F, quiet = T)
 95 |     return(mcols(dds.x)$deviance)
 96 |   })
 97 | 
 98 |   deviances = cbind(deviances,mcols(dds.full)$deviance)
 99 | 
100 |   message("computing BICW (rhythm)")
101 | 
102 |   # calculate the BIC
103 |   BIC = as.data.frame(sapply(1:ncol(deviances), function(i) { deviances[,i] + log(ncol(countData)) * ncol(models[[i]] )}   ))
104 | 
105 |   #calculate the BICW
106 |   BICW               = t(apply(BIC,1,compute_BICW))
107 |   chosen_model      = apply(BIC,1,which.min)
108 |   chosen_model_BICW = apply(BICW,1,max)
109 | 
110 |   ############################
111 |   # FIT BASELINE
112 | 
113 |   message("fitting mean models")
114 | 
115 |   model_mean_cond=create_matrix_list_mean(N,group)
116 |   model_mean_cond=lapply(model_mean_cond,annotate_matrix,group)
117 | 
118 |   for (i in 1:length(model_mean_cond)){
119 |     rownames(model_mean_cond[[i]]) = rownames(colData)}
120 | 
121 |   # choose the best model for rhthmicity and then run the mean on the samples
122 | 
123 |   DDS_dev =  foreach (i = 1:length(models)) %dopar% {
124 |     sel = which(chosen_model==i)
125 |     gene = rownames(dds.full)[sel]
126 | 
127 |     if(length(gene)>0){
128 |       M=models[[i]]
129 |       #build the gene specific model from the rhythmic point of view
130 |       gene_specific_mean_models = lapply(model_mean_cond,
131 |                                          function(x) cbind(x,M[,-grep("u",colnames(M))]))
132 | 
133 |       dev <- lapply(gene_specific_mean_models,function(m){
134 |         dds.m <- dds.full # Copying the full model
135 |         dds.m <- DESeq2::nbinomWaldTest(dds.m[gene], modelMatrix= as.matrix(m), betaPrior = F) # Re-run wald test
136 |         return(list(dds.m, mcols(dds.m)$deviance)) # Returning deviances (-2 * log likelihood) // https://support.bioconductor.org/p/107472/
137 | 
138 |       })
139 |     }
140 | 
141 |     if(length(gene)==0){dev = list (NA, NA)}
142 | 
143 |     return(dev)
144 |   }
145 | 
146 |   deviance_mean = NULL
147 |   for (cm_r in 1:length(models)){
148 | 
149 |     if(!is.na(DDS_dev[[cm_r]][1])){
150 |       deviance_mean.x            = rbind(sapply(1:length(model_mean_cond),function(x) {DDS_dev[[cm_r]][[x]][[2]]}))
151 |       rownames(deviance_mean.x)  = rownames(DDS_dev[[cm_r]][[1]][[1]])
152 |       deviance_mean              = rbind(deviance_mean, deviance_mean.x)}
153 |   }
154 | 
155 |   deviance_mean = deviance_mean[rownames(countData),]
156 | 
157 |   message("computing BICW (mean)")
158 | 
159 |   # calculate the BIC
160 |   BIC_mean = as.data.frame(sapply(1:ncol(deviance_mean), function(i) { deviance_mean[,i] + log(ncol(countData)) * ncol(model_mean_cond[[i]] )}   ))
161 | 
162 |   #calculate the BICW
163 |   BICW_mean = t(apply(BIC_mean,1,compute_BICW))
164 | 
165 |   chosen_model_mean = apply(BIC_mean,1,which.min)
166 |   chosen_model_mean_BICW = apply(BICW_mean,1,max)
167 | 
168 |   ################
169 |   # coefficients / mean, amplitude and phase
170 |   ###############
171 | 
172 |   message("extracting rhythmic parameters")
173 | 
174 |   parameters=NULL
175 | 
176 |   parameters =  foreach (i = 1:nrow(deviance_mean)) %dopar% {
177 |     gene = rownames(deviance_mean)[i]
178 |     cm_r = chosen_model[i]
179 |     cm_m = chosen_model_mean[i]
180 |     dds= DDS_dev[[cm_r]][[cm_m]][[1]]
181 |     out = compute_param(dds, gene ,period,N)
182 |     return(data.frame(row.names= gene, t(matrix(out)))           )
183 |   }
184 | 
185 |   parameters            = data.frame(do.call(rbind.data.frame, parameters))
186 |   colnames(parameters)  = c(paste(c('mean','a','b','amp','relamp','phase'),rep(unique(group),each =6), sep = "_"))
187 |   parameters            = parameters[rownames(countData),]
188 | 
189 |   # Generate all the count and expression data
190 |   # raw counts
191 |   counts_RF        =  counts(dds.full, normalized = FALSE)
192 | 
193 |   #vst stabilized counts
194 |   vsd <- DESeq2::varianceStabilizingTransformation(dds.full)
195 |   vsd <- assay(vsd)
196 | 
197 |   #normalized counts
198 |   ncounts_RF       = counts(dds.full, normalized = TRUE)
199 | 
200 |   # generate a table summarizing the analysis
201 |   complete_parameters = cbind(parameters,chosen_model,chosen_model_BICW, chosen_model_mean, chosen_model_mean_BICW)
202 |   global_table = merge(ncounts_RF,complete_parameters, by="row.names")
203 |   rownames(global_table) = global_table$Row.names
204 |   global_table_df  = global_table[,-grep("Row.names",colnames(global_table))]
205 | 
206 |   global_table_df = global_table_df[rownames(countData),]
207 | 
208 |   out = list()
209 | 
210 |   out[["time"]]        = time
211 |   out[["group"]]       = group
212 |   out[["results"]]     = global_table_df
213 |   out[["BICW_rhythm"]] = BICW
214 |   out[["BICW_mean"]]   = BICW_mean
215 |   out[["vsd"]]         = vsd
216 |   out[["ncounts"]]     = ncounts_RF
217 |   out[["counts"]]      = counts_RF
218 |   out[["parameters"]]  = complete_parameters
219 |   out[["cook"]]        = assays(dds.full)[["cooks"]]
220 |   out[["dds.full"]]    = dds.full
221 | 
222 |   message("finished!")
223 |   return(out)
224 | 
225 |   # to add flags for low expression (counts), high cook's distance
226 |   # to be added error messages when only one group is given etc.
227 | }
228 | 


--------------------------------------------------------------------------------
/R/misc_function.R:
--------------------------------------------------------------------------------
  1 | #####################################
  2 | comb = function(n,k){
  3 |   factorial(n)/(factorial(k)*factorial(n-k))
  4 | }
  5 | #####################################
  6 | nbt = function(x){
  7 |   l=length(which(x) == TRUE)
  8 |   l
  9 | }
 10 | #####################################
 11 | simply_it = function(x){
 12 |   a = 0
 13 |   for(i in x) {
 14 |     a= paste(a,paste(which(x == as.numeric(i)), collapse = "",sep = ""), collapse = "", sep = "")
 15 |   }
 16 |   a
 17 | }
 18 | #####################################
 19 | simply_it.2 = function(x){
 20 | 
 21 |   a = match(x,x)
 22 | }
 23 | #####################################
 24 | make_circ_coord = function(t,x,ttot) {
 25 |   dt=(t[2]-t[1])*.45
 26 |   a=(rep(t,rep(4,length(t)))+rep(c(-dt,-dt,dt,dt),length(t)))*2*pi/ttot
 27 |   h=rep(x,rep(4,length(x)))*rep(c(0,1,1,0),length(t))
 28 |   list(angles=a,heights=h)
 29 | }
 30 | #####################################
 31 | circular_phase24H_histogram = function(x,name,ttot){
 32 |   br=0:ttot
 33 |   h=hist(x, br=br,plot=F)
 34 |   df = data.frame(x= as.numeric(h$breaks[-1]%%ttot), y= h$counts)
 35 |   
 36 |   ggplot(df, aes(x=x, y=y)) + 
 37 |     geom_bar(stat='identity') + 
 38 |     coord_polar(start = -0.261799/2, direction=1) +
 39 |     scale_x_continuous(breaks = seq(0, ttot, round(ttot/24,0))) +
 40 |     ylab("") +
 41 |     xlab("") +
 42 |     theme_bw() +
 43 |     ggtitle(paste("", name)) +
 44 |     theme(aspect.ratio = 1, 
 45 |           axis.text=element_text(size=8), 
 46 |           panel.grid.minor = element_blank(), 
 47 |           panel.border = element_blank(), 
 48 |           axis.text.y = element_blank(), 
 49 |           axis.ticks = element_blank(), 
 50 |           plot.title = element_text(hjust = 0.5))
 51 | }
 52 | 
 53 | 
 54 | #####################################
 55 | compute_BICW = function(x){
 56 |   x = as.numeric(x)
 57 |   BIC_min = min(x)
 58 |   test = exp(-0.5*(x-BIC_min))/sum(exp(-0.5*(x-BIC_min)))
 59 |   return(test)
 60 | }
 61 | #################################3
 62 | compute_RSS = function(x, matX){
 63 | 
 64 |   xx = solve(t(matX)%*%matX)
 65 |   y = xx %*% t(matX) %*% as.numeric(x)
 66 |   y = as.matrix(y)
 67 |   rownames(y)  = colnames(matX)
 68 |   RSS = t(x) %*% x -t(x) %*% matX %*% xx %*% t(matX) %*% x
 69 |   list(param=y,RSS = RSS)
 70 | }
 71 | 
 72 | #############################################
 73 | compute_BIC = function(A,n){
 74 | 
 75 |   p = length(A$param)
 76 |   #AIC = n * log(A$RSS/n, base = exp(1)) + 2* p + 2*p*(p +1) /(n-p-1)
 77 |   BIC=  n * log(A$RSS/n, base = exp(1))  + log(n, base = exp(1)) * p
 78 |   list(BIC = BIC, param = A$param)
 79 | 
 80 | }
 81 | 
 82 | ##############################
 83 | do_all_lm = function(x,my_mat){
 84 |   x = as.numeric(x)
 85 |   n = length(x)
 86 | 
 87 |   my_fit = lapply(my_mat,compute_RSS, x = x)
 88 |   my_BIC =lapply(my_fit,compute_BIC,n=n)
 89 |   my_BIC
 90 | }
 91 | 
 92 | ##########################
 93 | 
 94 | do_all_lm_mr = function(x,countData,my_mat_r, my_mat_m, chosen_model){
 95 |   i=match(x,rownames(countData))
 96 |   x=countData[x,]
 97 |   M=my_mat_r[[chosen_model[i]]]
 98 |   #build the gene specific model from the rhythmic point of view
 99 |   gene_specific_mean_models = lapply(my_mat_m,
100 |                                      function(x) cbind(x,M[,-grep("u",colnames(M))]))
101 |   x = as.numeric(x)
102 |   n = length(x)
103 | 
104 |   my_fit = lapply(gene_specific_mean_models,compute_RSS, x = x)
105 |   my_BIC =lapply(my_fit,compute_BIC,n=n)
106 |   my_BIC
107 | }
108 | 
109 | ####################################
110 | compute_param = function(dds, gene, period=T_,N){
111 | 
112 |   dds = dds[gene,]
113 |   param = c(paste(rep(c('u','a','b'),each=N),rep(1:N,3), sep = "."))
114 | 
115 |   paramout = rep(NA,N*6)
116 | 
117 |   for(i in 1:N){
118 | 
119 |     u=coef(dds)[grep(paste(param[i],"Intercept",sep="|"), colnames(coef(dds)))]
120 |     a=coef(dds)[grep(param[i+N], colnames(coef(dds)))]
121 |     b=coef(dds)[grep(param[i+N*2], colnames(coef(dds)))]
122 | 
123 |     if(length(u) ==0) u=NA
124 |     if(length(a) ==0) a=NA
125 |     if(length(b) ==0) b=NA
126 | 
127 |     phase=period/(2*pi)*atan2(b,a)
128 |     amp =2*sqrt(a^2+b^2)
129 |     relamp=0.5*amp/u
130 |     if(!is.na(phase)){
131 |       #if(phase<0) phase=phase+period
132 |       #if(phase>period) phase=phase-period
133 |       phase=phase%%period
134 |     }
135 |     paramout[(1:6 + 6*(i-1))] = c(u,a,b,amp,relamp,phase)
136 |   }
137 | 
138 |   #names(paramout) = c(paste(c('mean','a','b','amp','relamp','phase'),rep(1:N,each =6), sep = "_"))
139 |   paramout
140 | }
141 | ####################################
142 | compute_param_l = function(dds, period=T_, N){
143 | 
144 | 
145 |   param = c(paste(rep(c('u','a','b'),each=N),rep(1:N,3), sep = "."))
146 | 
147 |   paramout = rep(NA,N*6)
148 | 
149 |   for(i in 1:N){
150 | 
151 |     u=dds[grep(paste(param[i],"Intercept",sep="|"), rownames(dds)),1]
152 |     a=dds[grep(param[i+N], rownames(dds)),1]
153 |     b=dds[grep(param[i+N*2], rownames(dds)),1]
154 | 
155 |     if(length(u) ==0) u=NA
156 |     if(length(a) ==0) a=NA
157 |     if(length(b) ==0) b=NA
158 | 
159 |     phase=period/(2*pi)*atan2(b,a)
160 |     amp =2*sqrt(a^2+b^2)
161 |     relamp=0.5*amp/u
162 |     if(!is.na(phase)){
163 |       #if(phase<0) phase=phase+period
164 |       #if(phase>period) phase=phase-period
165 |       phase=phase%%period
166 |     }
167 |     paramout[(1:6 + 6*(i-1))] = c(u,a,b,amp,relamp,phase)
168 |   }
169 | 
170 |   #names(paramout) = c(paste(c('mean','a','b','amp','relamp','phase'),rep(1:N,each =6), sep = "_"))
171 |   paramout
172 | }
173 | 
174 | #####################################
175 | create_matrix_list = function(t, conds, n.co, period){
176 |   require(combinat)
177 | 
178 |   my_matrix = list()
179 | 
180 |   c <- cos(2*pi*t/period)
181 |   s <- sin(2*pi*t/period)
182 | 
183 |   MAT <- cbind(rep(1,length(t)),c[1:length(t)],s[1:length(t)])
184 |   GMAT <- matrix(NA,ncol=3*n.co, nrow =length(t))
185 |   rownames(GMAT) <- conds
186 |   colnames(GMAT) <- c(paste(c('u','a','b'),rep(1:n.co,each =3), sep = "."))
187 | 
188 |   it <- 1
189 |   for(i in unique(rownames(GMAT))){
190 |     GMAT[rownames(GMAT)==i,grep(paste0('.',it,'$'),colnames(GMAT))] = MAT[rownames(GMAT)==i,]
191 |     it=it+1
192 |   }
193 | 
194 |   vn = rep(F,n.co)
195 |   for(i in 1:n.co){
196 |     g = rep(F,n.co)
197 |     g[1:i] = TRUE
198 |     p = unique(combinat::permn(g))
199 |     v = matrix(unlist(p),ncol = n.co,byrow = TRUE)
200 |     vn = rbind(vn,v)
201 | 
202 |   }
203 | 
204 | 
205 |   vn = vn[,rep(1:n.co,each=3)]
206 |   vn[,seq(1,3*n.co,3)] = TRUE
207 |   vn = data.frame(vn,row.names= NULL)
208 |   vn[,dim(vn)[2] + 1]=(apply(vn,1,nbt)-n.co)/2
209 |   colnames(vn) = c(paste(c('u','a','b'),rep(1:n.co,each =3), sep = "."),'nb_cycl')
210 | 
211 |   model = 1
212 |   for(g in 0:n.co){
213 | 
214 | 
215 |     nb_cycl =g
216 |     com = expand.grid(rep(list(1:nb_cycl),nb_cycl))
217 |     simply = apply(com,1,simply_it)
218 |     poss =match(unique(simply),simply)
219 |     com_l = com[poss,]
220 |     pos = which(vn$nb_cycl == g)
221 | 
222 |     for(k in pos){
223 |       if(g > 1){
224 |         for(v in 1:nrow(com_l)){
225 |           gmat = GMAT[,unlist(vn[k,-dim(vn)[2]])]
226 |           ve = as.numeric(com_l[v,])
227 |           id =1
228 |           sa = ve
229 |           while(length(ve) !=0){
230 | 
231 |             poc = which(sa == ve[1])
232 |             po = which(ve ==ve[1])
233 |             if(length(poc) !=1){
234 |               poch =c(2*poc-1,2*poc)
235 |               poch =poch[order(poch)]
236 |               he = grep("[ab]",colnames(gmat))
237 |               he = he[poch]
238 |               pp=0
239 |               for(z in 1:((length(he)-2)/2)){
240 |                 repl1 = which(gmat[,he[2*z+1]]!='NA')
241 |                 repl2 = which(gmat[,he[2*z+2]]!='NA')
242 |                 gmat[repl1,he[1]] = gmat[repl1,he[2*z+1]]
243 |                 gmat[repl2,he[2]] = gmat[repl2,he[2*z+2]]
244 |                 colnames(gmat)[he[1]]= paste(colnames(gmat)[he[1]],colnames(gmat)[he[2*z+1]],sep=',')
245 |                 colnames(gmat)[he[2]]= paste(colnames(gmat)[he[2]],colnames(gmat)[he[2*z+2]],sep=',')
246 |                 gmat[repl1,he[2*z+1]] =NA
247 |                 gmat[repl2,he[2*z+2]]=NA
248 |                 pp = pp+2
249 |               }
250 |               id = id+1
251 |               ve = ve[-po]
252 |             }else{
253 |               ve = ve[-1]
254 |             }
255 | 
256 |           }
257 |           gmat[is.na(gmat)] =0
258 |           del=which(apply(gmat,2,function(x) length(which(x == 0))) == length(t))
259 |           if(length(del)!=0){
260 |             gmat = gmat[,-del]
261 |           }
262 |           my_matrix[[model]] = gmat
263 |           model = model + 1
264 |         }
265 |       }else{
266 |         gmat = GMAT[,unlist(vn[k,-dim(vn)[2]])]
267 |         gmat[is.na(gmat)] =0
268 |         del =which(apply(gmat,2,function(x) length(which(x == 0))) == length(t))
269 |         if(length(del)!=0){
270 |           gmat = gmat[,-del]
271 |         }
272 |         my_matrix[[model]] = gmat
273 |         model = model +1
274 |       }
275 |     }
276 | 
277 |   }
278 | 
279 | 
280 | 
281 |   return(my_matrix)
282 | }
283 | #####################################
284 | create_matrix_list_mean = function(N,group){
285 |   com = expand.grid(rep(list(1:N),N))
286 |   simply = as.data.frame(t(apply(com,1,simply_it.2)))
287 |   simply = do.call("paste",simply)
288 |   poss =match(unique(simply),simply)
289 |   com_l = com[poss,]
290 |   names(com_l)=unique(group)
291 |   com_l=com_l[order(apply(com_l,1,function(x) length(unique(x))),apply(com_l,1,function(x) length(which(x==max(x))))),]
292 |   rownames(com_l)=1:nrow(com_l)
293 | 
294 |   com_l=com_l[,match(group,names(com_l))]
295 |   p=list()
296 |   for(j in 1:nrow(com_l)){
297 |     if(j==1){
298 |       p[[j]] = as.matrix(rep(1,ncol(com_l)))
299 | 
300 |     }else{
301 |       p[[j]]= model.matrix(~0+ factor(as.numeric(com_l[j,])))
302 |     }
303 |   }
304 |   p
305 | }
306 | #####################################
307 | annotate_matrix = function(m,group){
308 |   if(ncol(m)==1){
309 |     colnames(m)=paste("u",1:length(unique(group)),sep=".",collapse=".")
310 |   }else{
311 |     pos_ind= match(unique(group),group)
312 |     m=as.matrix(m)
313 |     l=list()
314 |     for(k in 1:ncol(m)){
315 |       l[[k]]=as.numeric(which(m[pos_ind,k]==1))
316 |     }
317 |     colnames(m)=sapply(l,function(x) paste("u",x,sep=".",collapse="."))
318 |   }
319 |   m
320 | }
321 | 
322 | #####################################
323 | dry_plot = function (dryList, gene, period=24)
324 | {
325 |   normal = FALSE
326 |   if("ncounts" %in% names(dryList)){vsd        = log2(dryList[["ncounts"]]+1)}
327 |   if("values" %in% names(dryList)){vsd         = dryList[["values"]]
328 |   normal = T}
329 |   
330 |   parameters = dryList[["parameters"]][,grep("^mean|^a_|^b_|^amp|^phase|^relamp",colnames(dryList[["parameters"]]))]
331 |   
332 |   ID = rownames(dryList[["results"]] )[grep(paste0('^',gene,'$'),rownames(dryList[["results"]] ))]
333 |   
334 |   #print(ID)
335 |   
336 |   d = vsd[ID, ]
337 |   d = reshape2::melt(d)
338 |   
339 |   d$group            = dryList[["group"]]
340 |   
341 |   d$time            = as.numeric(dryList[["time"]])
342 |   d$time            = d$time%%period
343 |   
344 |   suppressWarnings({ d <- Rmisc::summarySE(d, measurevar="value", groupvars=c("time","group")) })
345 |   
346 |   v = seq(0,period,round(period/24,0))
347 |   fit_d_0 = parameters[which(rownames(parameters)==ID),grep("mean",colnames(parameters))] # intercept
348 |   fit_d_1 = parameters[which(rownames(parameters)==ID),grep("a_",colnames(parameters))] # coefficient a
349 |   fit_d_2 = parameters[which(rownames(parameters)==ID),grep("^b_",colnames(parameters))] # coefficient b
350 |   
351 |   fit_d_0[is.na(fit_d_0)] = 0
352 |   fit_d_1[is.na(fit_d_1)] = 0
353 |   fit_d_2[is.na(fit_d_2)] = 0
354 |   
355 |   m = data.frame(v)
356 |   
357 |   dd = data.frame(v)
358 |   dd$v = v
359 |   
360 |   fit_values = function (x,n)
361 |   { as.numeric((fit_d_0[n] + fit_d_1[n]*cos(2*pi*x/period)  + fit_d_2[n]*sin(2*pi*x/period)))  }
362 |   
363 |   for (u in 1:length(unique(d$group))){
364 |     m[,u+1]  = NA
365 |     m[,u+1]  = apply(dd,1, fit_values,u)
366 |   }
367 |   
368 |   m = m[,-1]
369 |   
370 |   colnames(m) =  unique(dryList[["group"]])
371 |   
372 |   m =  reshape2::melt(m, , id.vars = NULL)
373 |   m$time = rep(v, length(unique(d$group)))
374 |   
375 |   colnames(m)       = c("group","value","time")
376 |   
377 |   if(normal==FALSE) {m$value[which(m$value<0)] = 0}
378 |   
379 |   gg1 = ggplot(d, aes(x=time, y=value, group=group, color=group)) +
380 |     geom_errorbar(aes(ymin=value-se, ymax=value+se), width=.4) +
381 |     geom_point(size=2, shape=19) +
382 |     xlab("Time (h)") +
383 |     ylab("Log2 normalized counts") +
384 |     ggtitle(ID) +
385 |     scale_x_continuous(breaks=seq(0,period+6,6)) +
386 |     theme_bw(base_size = 10) +
387 |     theme(aspect.ratio = 1, panel.grid.minor=element_blank(), legend.position = "right") +
388 |     geom_line(aes(x=time, y=(value), group=group), data = m, position=position_dodge(width=0.5)) +
389 |     facet_wrap(~group)
390 |   
391 |   gg1
392 | }
393 | 
394 | #####################################
395 | #' Test a time series for rhythmicity using a harmonic regression method
396 | #' This function has been used in PMID:24344304, please cite this paper if this function is used.
397 | #' 
398 | #' @param x Vector of numerics to test rhythmicity
399 | #' @param t Vector of time (in hours)
400 | #' @param period Period of oscillations to test
401 | #' @param offset Phase offset (if needed)
402 | #' @return Number of timepoints, mean, amplitude (max-min), relative amplitude, phase, and p-value of rhythmicity
403 | #' 
404 | f24_R2_cycling=function(x, t=2*(0:(length(x)-1)), period=24, offset=0)
405 | {
406 | 	
407 | 	kk = which(!is.na(x)==TRUE)
408 | 	x = x[kk]
409 | 	t = t[kk]
410 | 	n=length(x)
411 | 	#mu=mean(x)
412 | 	nb.timepoints=length(x)
413 | 	if(n<4)
414 | 	{ 
415 | 		if(n==0) c(nb.timepoints=nb.timepoints, mean=NA, amp=NA, relamp=NA,phase=NA,pval=NA) 
416 | 		else 
417 | 		{
418 | 			c(nb.timepoints=nb.timepoints, mean=mean(x), amp=NA, relamp=NA,phase=NA,pval=NA)
419 | 		}
420 | 	}
421 | 	else
422 | 	{
423 | 		sig2=var(x)
424 | 		c=cos(2*pi*t/period)
425 | 		s=sin(2*pi*t/period)
426 | 		A = mean(x*c)-mean(x)*mean(c)
427 | 		B = mean(x*s)-mean(x)*mean(s)
428 | 		c1 = mean(c^2)-mean(c)^2
429 | 		c2 = mean(c*s)-mean(c)*mean(s)
430 | 		c3 = mean(s^2)-mean(s)^2
431 | 		b = (A*c2-B*c1)/(c2^2-c1*c3)
432 | 		a = (A-b*c2)/c1
433 | 		mu = mean(x)-a*mean(c)-b*mean(s)
434 | 		#	b=2*mean(x*s)
435 | 		x.hat=mu+a*c+b*s
436 | 		sig2.1=var(x-x.hat)
437 | 		if(is.na(a)||is.na(b)) {c(nb.timepoints=nb.timepoints, mean=mean(x), amp=NA, relamp=NA,phase=NA,pval=NA)}
438 | 		else
439 | 		{
440 | 			p=3
441 | 			R2=0
442 | 			if(sig2>0) R2=1-sig2.1/sig2
443 | 			# http://www.combustion-modeling.com/downloads/beta-distribution-for-testing-r-squared.pdf
444 | 			# I checked that it works
445 | 			amp=max(x)-min(x)
446 | 			phase=period/(2*pi)*atan2(b, a)
447 | 			if(phase<0) phase=phase+period
448 | 			if(phase>period) phase=phase-period
449 | 			phase=(phase+offset)%%period
450 | 			pval = pbeta(R2, (p-1)/2, (n-p)/2, lower.tail = FALSE, log.p = FALSE)
451 | 			
452 | 			c(nb.timepoints=nb.timepoints, mean =mean(x), amp=2*sqrt(a^2+b^2),relamp=sqrt(a^2+b^2)/(mu),phase=phase, pval=pval)
453 | 		}
454 | 	}
455 | }
456 | 
457 | 


--------------------------------------------------------------------------------