├── NAMESPACE ├── .Rbuildignore ├── .DS_Store ├── .gitignore ├── man ├── logo.png ├── plot_single_cond.Rd ├── dryseq_single.Rd ├── plot_models_rhythm.Rd ├── drylm.Rd └── dryseq.Rd ├── data ├── simulatedData.RData └── example_vector.RData ├── dryR.Rproj ├── DESCRIPTION ├── R ├── dryseq_single.R ├── function_plot_models_rhythm.R ├── plot_single_cond.R ├── f_24_function.R ├── dryseq_function_lm.R ├── dryseq_function.R └── misc_function.R └── Readme.md /NAMESPACE: -------------------------------------------------------------------------------- 1 | exportPattern("^[[:alpha:]]+") 2 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naef-lab/dryR/HEAD/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /man/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naef-lab/dryR/HEAD/man/logo.png -------------------------------------------------------------------------------- /data/simulatedData.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naef-lab/dryR/HEAD/data/simulatedData.RData -------------------------------------------------------------------------------- /data/example_vector.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naef-lab/dryR/HEAD/data/example_vector.RData -------------------------------------------------------------------------------- /man/plot_single_cond.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/plot_single_cond.R 3 | \name{plot_single_cond} 4 | \alias{plot_single_cond} 5 | \title{This function plots data and fit for one condition} 6 | \usage{ 7 | plot_single_cond(out, gene_name) 8 | } 9 | \description{ 10 | This function plots data and fit for one condition 11 | } 12 | -------------------------------------------------------------------------------- /dryR.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | -------------------------------------------------------------------------------- /man/dryseq_single.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dryseq_single.R 3 | \name{dryseq_single} 4 | \alias{dryseq_single} 5 | \title{This function performs rhythmicity analysis for one condition} 6 | \usage{ 7 | dryseq_single( 8 | countData, 9 | sample_name = names(countData), 10 | group, 11 | single, 12 | time, 13 | period = 24 14 | ) 15 | } 16 | \description{ 17 | This function performs rhythmicity analysis for one condition 18 | } 19 | -------------------------------------------------------------------------------- /man/plot_models_rhythm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/function_plot_models_rhythm.R 3 | \name{plot_models_rhythm} 4 | \alias{plot_models_rhythm} 5 | \title{Visualization of dryseq results} 6 | \usage{ 7 | plot_models_rhythm(dryList, file_path_name, period = 24) 8 | } 9 | \arguments{ 10 | \item{dryList}{from dryseq output} 11 | 12 | \item{file_path_name}{folder to store output} 13 | } 14 | \description{ 15 | This function allows to plot the results of dryseq. 16 | } 17 | \examples{ 18 | XYZn 19 | } 20 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: dryR 2 | Type: Package 3 | Title: Differential RhythmicitY analysis 4 | Version: 1.0.0 5 | Author: Cedric Gobet and Benjamin Weger 6 | Maintainer: The package maintainer 7 | Description: DryR ( Differential RhythmicitY analysis in R) is a statistical 8 | framework based on model selection that is designed to detect and estimate changes 9 | in rhythmic parameters (amplitude and phase) and mean expression in multiple conditions. 10 | biocViews: Software, BiologicalQuestion 11 | Imports: 12 | combinat, 13 | parallel, 14 | doParallel, 15 | gplots, 16 | RColorBrewer, 17 | circular, 18 | reshape2, 19 | stringr, 20 | Rmisc, 21 | gridExtra 22 | Depends: 23 | R (>= 3.5.0), 24 | DESeq2, 25 | ggplot2, 26 | S4Vectors, 27 | foreach, 28 | pheatmap, 29 | plotrix 30 | Encoding: UTF-8 31 | LazyData: true 32 | RoxygenNote: 7.1.1 33 | -------------------------------------------------------------------------------- /R/dryseq_single.R: -------------------------------------------------------------------------------- 1 | #' This function performs rhythmicity analysis for one condition 2 | #' @export 3 | dryseq_single=function(countData, 4 | group, 5 | time, 6 | single = group[1], 7 | sample_name=names(countData), 8 | period=24){ 9 | 10 | sel = group %in% single 11 | 12 | if(!any(sel)){warning("Your single condition is not included in 'group'")} 13 | 14 | time = time[sel] 15 | group = group[sel] 16 | countData = countData[,sel] 17 | sample_name = sample_name[sel] 18 | 19 | countData = countData[rowSums(countData)!=0,] 20 | 21 | s1 <- sin(2*pi*time/period) 22 | c1 <- cos(2*pi*time/period) 23 | 24 | conds = cbind(s1,c1) 25 | colnames(conds) = c("s1","c1") 26 | 27 | colData <- data.frame(row.names=colnames(countData), conds) 28 | N=length(unique(group)) 29 | 30 | ############################ 31 | # FIT RHYTHMS 32 | ###################### 33 | 34 | dds = DESeq2::DESeqDataSetFromMatrix(countData = countData, colData = colData, design = ~ s1 + c1) 35 | dds = DESeq(dds, test="LRT", reduced=~1) 36 | 37 | ################ 38 | # coefficients / mean, amplitude, phase, p-value 39 | ############### 40 | 41 | res = as.data.frame(cbind(results(dds),coefficients(dds))) 42 | res = res[,c('pvalue','padj','Intercept','s1','c1')] 43 | 44 | phase=period/(2*pi)*atan2(res$s1,res$c1) 45 | phase=phase%%period 46 | amp =2*sqrt(res$s1^2+res$c1^2) 47 | 48 | res=data.frame(res,phase,amp) 49 | 50 | 51 | #normalized counts 52 | ncounts = counts(dds, normalized = TRUE) 53 | global_df = as.data.frame(cbind(ncounts,res)) 54 | 55 | 56 | out = list() 57 | 58 | out[["time"]] = time 59 | out[["period"]] = period 60 | out[["results"]] = global_df 61 | out[["single"]] = single 62 | message("finished!") 63 | return(out) 64 | 65 | } 66 | -------------------------------------------------------------------------------- /R/function_plot_models_rhythm.R: -------------------------------------------------------------------------------- 1 | #' Visualization of dryseq results 2 | #' 3 | #' This function allows to plot the results of dryseq. 4 | #' @param dryList from dryseq output 5 | #' @param file_path_name folder to store output 6 | #' @export pdf that summarize the results of dryseq 7 | #' @examples 8 | #' XYZn 9 | plot_models_rhythm = function(dryList,file_path_name, period=24){ 10 | if (length(dev.list()!=0)) {dev.off()} 11 | t=dryList$time 12 | group=dryList$group 13 | x=dryList$results 14 | 15 | if("vsd" %in% names(dryList)){ 16 | x[,1:ncol(dryList$vsd)]=dryList$vsd 17 | 18 | } 19 | 20 | pdf(file =paste0(file_path_name,'summary_heatmap_models.pdf'), paper = "a4") 21 | nb = table(x[,'chosen_model']) 22 | nb = nb[order(-nb)] 23 | mo = as.numeric(names(nb)) 24 | 25 | for(i in mo){ 26 | 27 | x_s = subset(x, chosen_model==i) 28 | 29 | if(nrow(x_s)>1){ 30 | 31 | pos_phase = grep('phase',names(x_s)) 32 | 33 | sum_phase = apply(x_s[,pos_phase],2,sum,na.rm=T) 34 | if(sum(sum_phase)!=0){ 35 | x_s = x_s[order(x_s[,pos_phase[min(which(sum_phase !=0))]]),] 36 | } 37 | 38 | x_s=as.matrix(x_s[,1:length(t)]) 39 | 40 | ## Average replicates 41 | com= paste(t,group) 42 | com.u=unique(com) 43 | t.u=sapply(strsplit(com.u," "),"[[",1) 44 | cond.u=sapply(strsplit(com.u," "),"[[",2) 45 | 46 | ss=split(1:length(com),com) 47 | 48 | x_s.m=sapply(ss,function(x) if(length(x) > 1){rowMeans(x_s[,x])}else{x_s[,x]}) 49 | 50 | #remove mean per condition 51 | condi=sapply(strsplit(colnames(x_s.m)," "),"[[",2) 52 | ss=split(1:length(condi),condi) 53 | 54 | for(k in ss){ x_s.m[,k]= sweep(x_s.m[,k],1,rowMeans(x_s.m[,k]),FUN="-") } 55 | x_s.m=x_s.m[,match(com.u,colnames(x_s.m))] 56 | 57 | 58 | pheatmap(as.matrix(x_s.m), 59 | cluster_cols = FALSE, 60 | cluster_rows = FALSE, 61 | show_rownames = F , 62 | scale = "row", 63 | gaps_col=which(diff(as.numeric(as.factor(cond.u)))!=0), 64 | main = paste("model",i," #Genes",nrow(x_s),sep =" "), 65 | col = colorRampPalette(c('blue','yellow'))(1000)) 66 | 67 | x_s = subset(x, chosen_model==i) 68 | ba = 1 69 | gg = list() 70 | 71 | for(kk in pos_phase){ 72 | 73 | if(sum(x_s[,kk],na.rm=T) !=0){ 74 | gg[[ba]] = circular_phase24H_histogram(x_s[,kk], unique(condi)[which(kk==pos_phase)], period) 75 | ba = ba + 1 76 | } 77 | 78 | } 79 | 80 | if(length(gg)>0){gridExtra::grid.arrange(grobs = gg, ncol = 4, nrow = 1+round(length(pos_phase)/4))} 81 | 82 | bas = unique(sum_phase) 83 | bas=bas[bas!=0] 84 | la = match(bas, sum_phase) 85 | if (length(la) > 1) { 86 | pairs(x_s[, pos_phase[la]], cex = 0.5) 87 | } 88 | 89 | } 90 | 91 | } 92 | dev.off() 93 | 94 | } 95 | -------------------------------------------------------------------------------- /R/plot_single_cond.R: -------------------------------------------------------------------------------- 1 | #' This function plots data and fit for one condition for glms 2 | #' @export 3 | plot_single_cond=function(out,gene_name){ 4 | out.ncount = out$results[,1:(ncol(out$results)-7)] 5 | x=out.ncount[gene_name,] 6 | period=out$period 7 | t=out$time 8 | t.2=seq(min(t),max(t),0.1) 9 | c1=out$results[gene_name,'c1'] 10 | s1=out$results[gene_name,'s1'] 11 | 12 | u=out$results[gene_name,'Intercept'] 13 | c1.f=c1*cos(2*pi*t.2/period) 14 | s1.f=s1*sin(2*pi*t.2/period) 15 | 16 | fit=u+c1.f+s1.f 17 | df=data.frame(t=t,dat=log2(1+as.numeric(x))) 18 | df.2=data.frame(t.2=t.2,fit=fit) 19 | g1=ggplot(data=df,aes(x=t,y=dat)) + 20 | geom_point(shape=21,color='white',fill='black',size=2) + 21 | geom_line(data=df.2,aes(x=t.2,y=fit),alpha=.8) + 22 | theme_bw() + theme(aspect.ratio=1,axis.text=element_text(size = 12),plot.title = element_text(size=10)) + 23 | xlab("Time") + ylab("Log2 normalized counts") + 24 | ggtitle(paste(gene_name,"\n","adj_pval:",format(out$results[gene_name,'padj'],scientific=T,digits = 3),", phase:",round(out$results[gene_name,'phase'],2), 25 | ", amp:",round(out$results[gene_name,'amp'],2))) 26 | print(g1) 27 | } 28 | 29 | #' This function plots data and fit for one condition for linear models 30 | #' @export 31 | plot_single_cond_lm <- function(out, gene_name) { 32 | # extract normalized counts (all columns except last 9 stats) 33 | counts <- out$results[, seq_len(ncol(out$results) - 9)] 34 | expr <- counts[gene_name, ] 35 | period <- out$period 36 | times <- out$time 37 | 38 | # fine grid for smooth fit 39 | times_grid <- seq(min(times), max(times), by = 0.1) 40 | 41 | # coefficients from cycler 42 | intercepts <- out$results[gene_name, grep("Intercept|mean", colnames(out$results))] 43 | coef_cos <- out$results[gene_name, "c1"] 44 | coef_sin <- out$results[gene_name, "s1"] 45 | 46 | # fitted cos/sin curve 47 | cos_part <- coef_cos * cos(2 * pi * times_grid / period) 48 | sin_part <- coef_sin * sin(2 * pi * times_grid / period) 49 | fit_vals <- intercepts + cos_part + sin_part 50 | 51 | # data frames for plotting 52 | df_points <- data.frame(time = times, expression = as.numeric(expr)) 53 | df_curve <- data.frame(time = times_grid, fit = fit_vals) 54 | 55 | # build plot 56 | p <- ggplot(df_points, aes(x = time, y = expression)) + 57 | geom_point(shape = 21, color = "white", fill = "black", size = 2) + 58 | geom_line(data = df_curve, aes(x = time, y = fit), alpha = 0.8) + 59 | theme_bw() + 60 | theme( 61 | aspect.ratio = 1, 62 | axis.text = element_text(size = 12), 63 | plot.title = element_text(size = 10) 64 | ) + 65 | labs( 66 | x = "Time", 67 | y = "Log2 normalized counts", 68 | title = sprintf( 69 | "%s\nadj_pval: %.3g, phase: %.2f, amp: %.2f", 70 | gene_name, 71 | out$results[gene_name, "padj"], 72 | round(out$results[gene_name, "phase"], 2), 73 | round(out$results[gene_name, "amp"], 2) 74 | ) 75 | ) 76 | 77 | print(p) 78 | } 79 | 80 | 81 | -------------------------------------------------------------------------------- /R/f_24_function.R: -------------------------------------------------------------------------------- 1 | # Test a time series for rhythmicity using a harmonic regression method 2 | # This function has been used in PMID:24344304. Please cite this paper if used. 3 | # 4 | # @param x Numeric vector to test for rhythmicity 5 | # @param t Numeric vector of time (in hours) 6 | # @param period Period of oscillation to test 7 | # @param offset Phase offset (if needed) 8 | # @return A vector with number of timepoints, mean, amplitude, relative amplitude, phase, and p-value of rhythmicity 9 | # 10 | f24_R2_cycling <- function(x, t = 2 * (0:(length(x) - 1)), period = 24, offset = 0) { 11 | # remove NAs 12 | valid <- !is.na(x) 13 | x <- x[valid] 14 | t <- t[valid] 15 | n <- length(x) 16 | nb.timepoints <- length(valid) 17 | 18 | # handle insufficient data 19 | if (n < 4) { 20 | stats <- c( 21 | nb.timepoints = nb.timepoints, 22 | mean = if (n > 0) mean(x) else NA, 23 | amp = NA, 24 | relamp = NA, 25 | phase = NA, 26 | pval = NA, 27 | c1 = NA, 28 | s1 = NA 29 | ) 30 | return(stats) 31 | } 32 | 33 | # compute sine/cosine terms 34 | c <- cos(2 * pi * t / period) 35 | s <- sin(2 * pi * t / period) 36 | 37 | # regression coefficients 38 | A <- cov(x, c) 39 | B <- cov(x, s) 40 | C1 <- var(c) 41 | C2 <- cov(c, s) 42 | C3 <- var(s) 43 | 44 | b <- (A * C2 - B * C1) / (C2^2 - C1 * C3) 45 | a <- (A - b * C2) / C1 46 | mu <- mean(x) - a * mean(c) - b * mean(s) 47 | 48 | x_hat <- mu + a * c + b * s 49 | R2 <- if (var(x) > 0) 1 - var(x - x_hat) / var(x) else NA 50 | 51 | # amplitude and phase 52 | amp <- 2 * sqrt(a^2 + b^2) 53 | phi <- (period / (2 * pi)) * atan2(b, a) 54 | phase <- (phi %% period + offset) %% period 55 | 56 | # p-value from beta distribution 57 | pval <- pbeta(R2, (3 - 1) / 2, (n - 3) / 2, lower.tail = FALSE) 58 | 59 | c( 60 | nb.timepoints = nb.timepoints, 61 | mean = mean(x), 62 | amp = amp, 63 | relamp = amp / mu, 64 | phase = phase, 65 | pval = pval, 66 | c1 = a, 67 | s1 = b 68 | ) 69 | } 70 | 71 | 72 | 73 | # This function performs rhythmicity analysis for one condition for a dataframe that 74 | #' @param data matrix or vector containing linear data; if a matrix is provided each column represents a sample, each row represents a feature. 75 | #' @param time vector containing numeric values of the zeitgeber/circadian time for each sample. 76 | #' @param period numeric value to indicate period length of the oscillation. Default: period = 24 h. 77 | #' @param sample_name vector containing sample names. Default: colnames are sample names. 78 | #' @return a list that contains the following data.frames/vectors: results (summary of results), parameters (rhythmic parameters), time ((timepoints), period (period length) 79 | # 80 | f_24 = function (data, time, period = 24, sample_name = names(data)) 81 | { 82 | if (is.vector(data)) { 83 | data = rbind(data, data) 84 | rownames(data) = c("X1", "X2") 85 | } 86 | res_tmp = apply(data, 1, function(x) f24_R2_cycling(x, t = time, 87 | period = period)) 88 | res = t(res_tmp) 89 | padj = p.adjust(res[, "pval"], method = "BH") 90 | res_complete = cbind(res, padj) 91 | colnames(res_complete)[ncol(res_complete)] = "padj" 92 | global_df = as.data.frame(cbind(data, res_complete)) 93 | out = list(time = time, period = period, results = global_df, 94 | parameters = res_complete) 95 | message("finished!") 96 | return(out) 97 | } 98 | -------------------------------------------------------------------------------- /man/drylm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dryseq_function_lm.R 3 | \name{drylm} 4 | \alias{drylm} 5 | \title{Differential rhythmicity analysis for RNA-Seq datasets} 6 | \usage{ 7 | drylm( 8 | data, 9 | group, 10 | time, 11 | period = 24, 12 | sample_name = colnames(data), 13 | batch = rep("A", length(sample_name)), 14 | n.cores = 1 15 | ) 16 | } 17 | \arguments{ 18 | \item{data}{matrix or vector containing data; if a matrix is provided each column represents a sample, each row represents a feature.} 19 | 20 | \item{group}{vector containing the name of each group (e.g. wildtype, knock-out).} 21 | 22 | \item{time}{vector containing numeric values of the zeiteber/circadian time for each sample.} 23 | 24 | \item{period}{numeric value to indicate period length of the oscillation. Default: period = 24 h.} 25 | 26 | \item{sample_name}{vector containing sample names. Default: colnames are sample names.} 27 | 28 | \item{batch}{vector containing potential batch effects between samples. Default: no batch effect.} 29 | 30 | \item{nthreads}{vector numeric value to indicate the threads for parallel computing .Default: 60 \% of detected cores.} 31 | } 32 | \value{ 33 | a list that contains the following data.frames: results (summary of results), parameters (rhythmic parameters), ncounts (normalized counts), counts (raw counts), cook (cook's distance) 34 | } 35 | \description{ 36 | This function performs a rhythmicity analysis based on linear models with a subsequent models selection. The function accepts a time series assuming normally distributed noise of two or more groups. The function outputs the parameters mean, phase and amplitude are for each group. 37 | } 38 | \details{ 39 | DryR assesses rhythmicity and mean differences of gene expression in normal data. 40 | When necessary, a batch specific mean (m) can be given to the drylm function to account for technical batch effects. 41 | A technical batch effect is not allowed to be confounding so the resulting model matrix is fully ranked. 42 | To select an optimal gene-specific model, drylm first assesses rhythmicity across the different conditions. To this end, dryR defines different models across all groups. 43 | Models refined to have either zero (non-rhythmic pattern) or non-zero (rhythmic pattern) α and β coefficients for each analyzed group. Moreover, for some models the values of α and β can be also shared within any combination of all groups 44 | The coefficients α and β were used to calculate the phase (arctan(α/β)) and amplitude (log2-fold change peak-to-trough; 2sqrt(α^2+β^2) ) of a gene. 45 | Bayesian information criterion (BIC) based model selection was employed to account for model complexity using the following formula: 46 | \cr \cr BIC_j = n ln(RSS_j/n)+ k ln(n) \cr \cr 47 | with RSS the sum of residuals square of the multilinear regression, n the number of time points, and k the number of parameters. 48 | To assess the confidence of the selected model j we calculated the Schwarz weight (BICW): 49 | \cr \cr BICW_j = e^(0.5ΔBIC_j)\ sum(e^0.5 ΔBIC_m), with ΔBIC_j - BIC_j - BIC_m*\cr \cr 50 | m* is the minimum BIC value in the entire model set. drylm consideres the BICW_j as the confidence level for model j. The model with the highest BICW is selected as the optimal model within the set of all defined models. 51 | In a second iteration step, drylm set the coefficient α and β to the values of the selected model in the first regression. 52 | drylm then defined different models for the mean coefficient with differing or shared means between groups. Each model is solved using linear regression and each gene was assigned to a preferred model based on the BICW as described above for the first iteration. 53 | } 54 | \examples{ 55 | data = log(simData[["countData"]]+1) 56 | group = simData[["group"]] 57 | time = simData[["time"]] 58 | dryList = drylm(data,group,time) 59 | head(dryList[["results"]]) # data frame summarizing results 60 | head(dryList[["parameters"]]) # coefficients: phase, amplitude and mean for each group 61 | head(dryList[["ncounts"]]) # normalized counts 62 | } 63 | -------------------------------------------------------------------------------- /man/dryseq.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/dryseq_function.R 3 | \name{dryseq} 4 | \alias{dryseq} 5 | \title{Differential rhythmicity analysis for RNA-Seq datasets} 6 | \usage{ 7 | dryseq( 8 | countData, 9 | group, 10 | time, 11 | period = 24, 12 | sample_name = colnames(countData), 13 | batch = rep("A", length(sample_name)), 14 | n.cores = round(parallel::detectCores() * 0.6, 0) 15 | ) 16 | } 17 | \arguments{ 18 | \item{countData}{matrix containing non-negative integers; each column represents a sample, each row represents a gene/transcript.} 19 | 20 | \item{group}{vector containing the name of each sample.} 21 | 22 | \item{time}{vector containing numeric values of the time for each sample.} 23 | 24 | \item{period}{numeric value to indicate period length of the oscillation. Default: circadian data period of 24 h.} 25 | 26 | \item{sample_name}{vector containing sample names. Default: colnames are sample names.} 27 | 28 | \item{batch}{vector containing potential batch effects between samples. Default: no batch effect.} 29 | 30 | \item{nthreads}{vector numeric value to indicate the threads for parallel computing .Default: 60 \% of detected cores.} 31 | } 32 | \value{ 33 | a list that contains the following data.frames: results (summary of results), parameters (rhythmic parameters), ncounts (normalized counts), counts (raw counts), cook (cook's distance) 34 | } 35 | \description{ 36 | This function performs a rhythmicity analysis based on generalized linear models with a subsequent models selection. The function accepts raw count data from a temporal RNA-Seq dataset of two or more groups. The function outputs parameters mean, phase and amplitude are for each group. 37 | } 38 | \details{ 39 | DryR assesses rhythmicity and mean differences of gene expression in RNA-Seq count data. 40 | As proposed (Love et al. 2014), a count Y of a gene in a sample s can be modeled as a negative binomial with a fitted mean μ_gs and a gene-specific dispersion parameter θ_g. 41 | \cr \cr \emph{Y_gs}~NB(\emph{μ_gs},\emph{θ_g})\cr\cr 42 | The fitted mean is proportional to the quantity q of fragments that correspond to a gene in a sample scaled by a sample-specific scaling factor s_s (Love et al. 2014). 43 | This scaling factor depends on the sampling depth of each library and can be estimated using the median-of-ratios method of DESeq2 (Anders and Huber 2010). 44 | \cr \cr \emph{μ_gs} = \emph{s_s q_gs}\cr \cr 45 | DryR estimates gene-specific distribution θg using empirical Bayes shrinkage described by Love et al. (Love et al. 2014). 46 | and variance is computed from the following relationship to the dispersion parameter θ: 47 | \cr \cr Var(\emph{Y_gs}) = E[(\emph{Y_gs} + \emph{θ_g} \emph{μ^2_gs})]\cr \cr 48 | The fit uses a generalized linear model with a logarithmic link function. Sample specific size factor (λ_s) is defined as an offset. The full GLM is defined as follows: 49 | \cr \cr log(\emph{μ_gbcs}) = \emph{m_gb} + \emph{m_gc} + \emph{α_gc} cos(\emph{ω t(s)}) + \emph{β_gc} sin(\emph{ω t(s)}) + log(\emph{λ_t(s)})\cr \cr 50 | μ is the raw count for gene g, condition/group c and Zeitgeber/circadian time t. α and β are coefficients of the cosine and sine functions, respectively. m is a coefficient to describe a mean expression level. 51 | When necessary, a batch specific mean (m) can be given to the dryseq function to account for technical batch effects. 52 | A technical batch effect is not allowed to be confounding so the resulting model matrix is fully ranked. 53 | To select an optimal gene-specific model, dryseq first assesses rhythmicity across the different conditions. To this end, dryR defines different models across all groups. 54 | Models refined to have either zero (non-rhythmic pattern) or non-zero (rhythmic pattern) α and β coefficients for each analyzed group. Moreover, for some models the values of α and β can be also shared within any combination of all groups 55 | The coefficients α and β were used to calculate the phase (arctan(α/β)) and amplitude (log2-fold change peak-to-trough; 2sqrt(α^2+β^2) ) of a gene. 56 | Bayesian information criterion (BIC) based model selection was employed to account for model complexity using the following formula: 57 | \cr \cr BIC_j = ln(n)k - 2ln(L̂_ĵ)\cr \cr 58 | L̂ is defined as the log-likelihood of the model j from the regression, n is the number of data points and k is the number of parameters. 59 | To assess the confidence of the selected model j we calculated the Schwarz weight (BICW): 60 | \cr \cr BICW_j = e^(0.5ΔBIC_j)\ sum(e^0.5 ΔBIC_m), with ΔBIC_j - BIC_j - BIC_m*\cr \cr 61 | m* is the minimum BIC value in the entire model set. Dryseq consideres the BICW_j as the confidence level for model j. The model with the highest BICW is selected as the optimal model within the set of all defined models. 62 | In a second iteration step, dryR set the coefficient α and β to the values of the selected model in the first regression. 63 | dryseq then defined different models for the mean coefficient with differing or shared means between groups. Each model is solved using generalized linear regression and each gene was assigned to a preferred model based on the BICW as described above for the first iteration. 64 | The model selection is sensitive to outliers: dryseq provide a cook's distance for each gene. A fit for a gene with a maximum cook's distance of higher than 1 should be considered with care. 65 | } 66 | \examples{ 67 | countData = simData[["countData"]] 68 | group = simData[["group"]] 69 | time = simData[["time"]] 70 | dryList = dryseq(countData,group,time) 71 | head(dryList[["results"]]) # data frame summarizing results 72 | head(dryList[["parameters"]]) # coefficients: phase, amplitude and mean for each group 73 | head(dryList[["ncounts"]]) # normalized counts 74 | head(dryList[["counts"]]) # raw counts 75 | head(dryList[["cook"]]) # cook's distance 76 | } 77 | \references{ 78 | Love, M.I., Huber, W., Anders, S. (2014) Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology 79 | 80 | Anders, S. and Huber, W. (2014) Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology 81 | } 82 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # dryR 2 | 3 | 4 | [![](https://img.shields.io/badge/doi-10.1073/pnas.2015803118-green.svg)](https://doi.org/10.1073/pnas.2015803118) 5 | 6 | 7 | `dryR` (Differential RhythmicitY analysis in R) is an R package that provides the statistical framework to assess differential rhythmicity of a time series of RNA-Seq data with two and more conditions. 8 | 9 | ## Getting Started 10 | 11 | These instructions will allow you to get `dryR` running on your machine. 12 | 13 | ### Prerequisites 14 | You need to install R (see https://www.r-project.org/). 15 | 16 | `dryR` accepts count data typically produced by RNA-Seq with a time dimension and several conditions/groups. The input count data should contain only integer values and be organized in a matrix with rows indicating a specific gene and the column refering to a sample. 17 | 18 | The pipelines that can be used to produce these count data from RNA-Seq data (FASTQ files) are described in more detail here: 19 | http://master.bioconductor.org/packages/release/workflows/vignettes/rnaseqGene/inst/doc/rnaseqGene.html 20 | 21 | A user-friendly web application to generate a count table from FASTQ files is provided here: https://amp.pharm.mssm.edu/biojupies/ 22 | 23 | ### Installing 24 | 25 | To install `dryR` run the following code in R. 26 | ``` 27 | install.packages("devtools") 28 | devtools::install_github("naef-lab/dryR") 29 | ``` 30 | ## Quick start 31 | ### Example dataset 32 | `dryR` comes with example data in form of a list called simData. The list contains count data simData[["countData"]], a vector with the different conditions/groups simData[["group"]], and a vector indicating Zeitgeber Time simData[["time"]]. The data was generated using simphony https://github.com/hugheylab/simphony. 33 | 34 | ### Running an example 35 | ``` 36 | library("dryR") 37 | 38 | # prepare arguments 39 | countData = simData[["countData"]] 40 | group = simData[["group"]] 41 | time = simData[["time"]] 42 | 43 | # prefilter 44 | keep <- rowSums(countData) > 0 45 | countData <- countData[keep,] 46 | 47 | # run the analysis for count data (e.g. RNA-Seq data) 48 | dryList = dryseq(countData,group,time) 49 | 50 | # explore the results 51 | dryList[["results"]] # data frame summarizing results 52 | dryList[["parameters"]] # coefficients: phase, amplitude and mean for each group 53 | dryList[["ncounts"]] # normalized counts 54 | dryList[["counts"]] # raw counts 55 | dryList[["cook"]] # cook's distance for outlier detection 56 | dryList[["BICW_rhythm"]] # BICW for each rhythmic model 57 | dryList[["BICW_mean"]] # BICW for each mean model 58 | 59 | # generate a pdf with a global summary of all models 60 | plot_models_rhythm(dryList, "./") 61 | 62 | # plot a feature of interest 63 | dry_plot(dryList, "feature_113") 64 | ``` 65 | 66 | ## Decision Tree for Various Scenarios 67 | The dryR package is equipped to handle a variety of dataset types and experimental designs extending beyond RNA-Seq data with multiple conditions/groups (see example above). The decision tree provided below serves as a guide for selecting the appropriate function to use. For example code, refer to the chapter "Non-standard Scenarios." 68 | ![image](https://github.com/naef-lab/dryR/assets/64013365/a365dce5-16ea-4059-a028-7f60004fda99) 69 | 70 | ## Non-standard Scenarios 71 | 72 | ### Rhythmicity detection in RNA-Seq datasets with one condition 73 | To detect rhythmic gene expression in RNA-Seq data with only one condition, we implemented the function `dryseq_single`. 74 | 75 | ``` 76 | library("dryR") 77 | 78 | # prepare arguments for a one condition scenario 79 | sel = grep("cond_1", simData[["group"]]) 80 | countData_single = simData[["countData"]][,sel] 81 | group_single = simData[["group"]][sel] 82 | time_single = simData[["time"]][sel] 83 | 84 | # run the analysis for count data. 85 | dryList = dryseq_single(countData_single,group_single,time_single) 86 | 87 | # explore the results 88 | dryList[["results"]] # data frame summarizing results 89 | 90 | # plot a feature of interest 91 | plot_single_cond(dryList, "feature_004") 92 | ``` 93 | 94 | ### Normally distributed data 95 | To asses temporal variation of normally distributed measurements, we implemented the function `drylm` that can deal with gaussian noise using linear models. 96 | 97 | ``` 98 | library("dryR") 99 | 100 | # prepare arguments 101 | data = log(simData[["countData"]]+1) 102 | group = simData[["group"]] 103 | time = simData[["time"]] 104 | 105 | # run the analysis with normally distributed data 106 | dryList = drylm(data,group,time) 107 | 108 | # explore the results 109 | dryList[["results"]] # data frame summarizing results 110 | dryList[["parameters"]] # coefficients: phase, amplitude and mean for each group 111 | 112 | # generate a pdf with a global summary of all models 113 | plot_models_rhythm(dryList, "./") 114 | 115 | # plot a feature of interest 116 | dry_plot(dryList, "feature_013") 117 | ``` 118 | 119 | 120 | ### Normally distributed data with one condition 121 | To detect rhythmicity in normally disributed data with only one condition, we implemented the function `f_24`. 122 | 123 | ``` 124 | library("dryR") 125 | 126 | # prepare arguments for a one condition scenario 127 | simData_norm = simData 128 | simData[["normData"]] = log(simData[["countData"]]+1) 129 | sel = grep("cond_1", simData[["group"]]) 130 | normData_single = simData[["normData"]][,sel] 131 | time_single = simData[["time"]][sel] 132 | 133 | # run the analysis for count data. 134 | dryList = f_24(normData_single,time_single) 135 | 136 | # explore the results 137 | dryList[["results"]] # data frame summarizing results 138 | 139 | # plot a feature of interest 140 | plot_single_cond_lm(dryList, "feature_004") 141 | ``` 142 | 143 | 144 | ### DryR with a simple vector as input 145 | You can run `drylm` with a simple vector that contains data from a time series of multiple groups. 146 | 147 | ``` 148 | library("dryR") 149 | 150 | # define time and group for each sample 151 | time = c(1:48,1:48) # Zeitgeber time or Circadian time in h for each sample 152 | group = c(rep("KO",48), rep("WT",48)) 153 | 154 | # run the analysis with normally distributed data 155 | dryList = drylm(vector_example,group,time) 156 | 157 | # explore the results 158 | dryList[["results"]] # data frame summarizing results. Row 2 is a copy of row 1. 159 | dryList[["parameters"]] # coefficients: phase, amplitude and mean for each group. Row 2 is a copy of row 1. 160 | 161 | #plot the result of the selected model 162 | dry_plot(dryList, "X1") 163 | ``` 164 | 165 | ## Help 166 | A documentation using `?dryseq`, `?drylm` or `?dryseq_single` is available. 167 | -------------------------------------------------------------------------------- /R/dryseq_function_lm.R: -------------------------------------------------------------------------------- 1 | #' Differential rhythmicity analysis for RNA-Seq datasets 2 | #' 3 | #' This function performs a rhythmicity analysis based on linear models with a subsequent models selection. The function accepts a time series assuming normally distributed noise of two or more groups. The function outputs the parameters mean, phase and amplitude are for each group. 4 | #' @param data matrix or vector containing data; if a matrix is provided each column represents a sample, each row represents a feature. 5 | #' @param group vector containing the name of each group (e.g. wildtype, knock-out). 6 | #' @param time vector containing numeric values of the zeiteber/circadian time for each sample. 7 | #' @param period numeric value to indicate period length of the oscillation. Default: period = 24 h. 8 | #' @param sample_name vector containing sample names. Default: colnames are sample names. 9 | #' @param batch vector containing potential batch effects between samples. Default: no batch effect. 10 | #' @param nthreads vector numeric value to indicate the threads for parallel computing .Default: 60 \% of detected cores. 11 | #' @return a list that contains the following data.frames: results (summary of results), parameters (rhythmic parameters), ncounts (normalized counts), counts (raw counts), cook (cook's distance) 12 | #' @examples data = log(simData[["countData"]]+1) 13 | #' group = simData[["group"]] 14 | #' time = simData[["time"]] 15 | #' dryList = drylm(data,group,time) 16 | #' head(dryList[["results"]]) # data frame summarizing results 17 | #' head(dryList[["parameters"]]) # coefficients: phase, amplitude and mean for each group 18 | #' head(dryList[["ncounts"]]) # normalized counts 19 | #' @details DryR assesses rhythmicity and mean differences of gene expression in normal data. 20 | #' When necessary, a batch specific mean (m) can be given to the drylm function to account for technical batch effects. 21 | #' A technical batch effect is not allowed to be confounding so the resulting model matrix is fully ranked. 22 | #' To select an optimal gene-specific model, drylm first assesses rhythmicity across the different conditions. To this end, dryR defines different models across all groups. 23 | #' Models refined to have either zero (non-rhythmic pattern) or non-zero (rhythmic pattern) α and β coefficients for each analyzed group. Moreover, for some models the values of α and β can be also shared within any combination of all groups 24 | #' The coefficients α and β were used to calculate the phase (arctan(α/β)) and amplitude (log2-fold change peak-to-trough; 2sqrt(α^2+β^2) ) of a gene. 25 | #' Bayesian information criterion (BIC) based model selection was employed to account for model complexity using the following formula: 26 | #' \cr \cr BIC_j = n ln(RSS_j/n)+ k ln(n) \cr \cr 27 | #' with RSS the sum of residuals square of the multilinear regression, n the number of time points, and k the number of parameters. 28 | #' To assess the confidence of the selected model j we calculated the Schwarz weight (BICW): 29 | #' \cr \cr BICW_j = e^(0.5ΔBIC_j)\ sum(e^0.5 ΔBIC_m), with ΔBIC_j - BIC_j - BIC_m*\cr \cr 30 | #' m* is the minimum BIC value in the entire model set. drylm consideres the BICW_j as the confidence level for model j. The model with the highest BICW is selected as the optimal model within the set of all defined models. 31 | #' In a second iteration step, drylm set the coefficient α and β to the values of the selected model in the first regression. 32 | #' drylm then defined different models for the mean coefficient with differing or shared means between groups. Each model is solved using linear regression and each gene was assigned to a preferred model based on the BICW as described above for the first iteration. 33 | 34 | drylm=function(data,group,time,period=24,sample_name=colnames(data),batch=rep("A",length(sample_name)),n.cores=1 ){ 35 | 36 | doParallel::registerDoParallel(cores=n.cores) 37 | #update 38 | vec = F 39 | if(is.vector(data)){data = rbind(data,data) 40 | rownames(data) = c("X1","X2") 41 | vec = T} 42 | 43 | sel = order(group,time) 44 | time = time[sel] 45 | group = group[sel] 46 | data = data[,sel] 47 | batch = batch[sel] 48 | sample_name = as.character(sample_name[sel]) 49 | 50 | s1 <- sin(2*pi*time/period) 51 | c1 <- cos(2*pi*time/period) 52 | 53 | conds = cbind(group,s1,c1,batch) 54 | colnames(conds) = c("group","s1","c1","batch") 55 | 56 | colData <- data.frame(row.names=colnames(data), conds) 57 | N=length(unique(group)) 58 | 59 | ############################ 60 | # FIT RHYTHMS 61 | 62 | message("fitting rhythmic models") 63 | 64 | models = create_matrix_list(time, group, N,period) 65 | #Reorder u, a, b 66 | models = lapply(models, function(l) l[,c(grep("u",colnames(l)),grep("a|b",colnames(l)))]) 67 | 68 | for (i in 1:length(models)){ 69 | rownames(models[[i]]) = rownames(colData)} 70 | 71 | if (length(unique(batch))>1) { 72 | # add the batch effect 73 | model_b = as.matrix(model.matrix(~ batch),contrasts.arg=NULL)[,2:length(unique(batch)),drop=F] 74 | colnames(model_b)=paste0("BATCH_",unique(batch)[-1]) 75 | models = lapply(models, function(l) cbind(model_b,l)) 76 | models = lapply(models, function(l) l[,c(grep("^u",colnames(l)),grep("^BATCH",colnames(l)),grep("^a|^b",colnames(l)))] ) 77 | } 78 | 79 | fit = parallel::mclapply(split(data, rownames(data)), 80 | do_all_lm, 81 | my_mat= models, 82 | mc.cores=n.cores) 83 | 84 | # calculate the BIC 85 | BIC = unlist(fit)[grep('BIC$',names(unlist(fit)))] 86 | BIC= matrix(BIC,nrow=nrow(data),byrow=T) 87 | rownames(BIC)=names(fit) 88 | BIC=BIC[rownames(data),] 89 | 90 | #calculate the BICW 91 | BICW = t(apply(BIC,1,compute_BICW)) 92 | chosen_model = apply(BIC,1,which.min) 93 | chosen_model_BICW = apply(BICW,1,max) 94 | 95 | ############################ 96 | # FIT BASELINE 97 | 98 | message("fitting mean models") 99 | 100 | model_mean_cond=create_matrix_list_mean(N,group) 101 | model_mean_cond=lapply(model_mean_cond,annotate_matrix,group) 102 | 103 | for (i in 1:length(model_mean_cond)){ 104 | rownames(model_mean_cond[[i]]) = rownames(colData)} 105 | 106 | gene.list=as.list(rownames(data)) 107 | fit = parallel::mclapply(gene.list, 108 | FUN=do_all_lm_mr, 109 | data, 110 | my_mat_r = models, 111 | my_mat_m = model_mean_cond, 112 | chosen_model = chosen_model, 113 | mc.cores=n.cores) 114 | 115 | #extract BIC 116 | BIC_mean = unlist(fit)[grep('BIC$',names(unlist(fit)))] 117 | BIC_mean = matrix(BIC_mean,nrow=nrow(data),byrow=T) 118 | 119 | #calculate the BICW 120 | BICW_mean = t(apply(BIC_mean,1,compute_BICW)) 121 | 122 | chosen_model_mean = apply(BIC_mean,1,which.min) 123 | chosen_model_mean_BICW = apply(BICW_mean,1,max) 124 | 125 | ################ 126 | # coefficients / mean, amplitude and phase 127 | ############### 128 | 129 | message("extracting rhythmic parameters") 130 | parameters=NULL 131 | 132 | parameters = foreach (i = 1:nrow(data)) %dopar% { 133 | gene = rownames(data)[i] 134 | 135 | dds= fit[[i]][[chosen_model_mean[i]]]$param 136 | out = compute_param_l(dds,period, N) 137 | return(out) 138 | } 139 | parameters = data.frame(t(do.call(cbind, parameters))) 140 | colnames(parameters) = c(paste(c('mean','a','b','amp','relamp','phase'),rep(unique(group),each =6), sep = "_")) 141 | rownames(parameters) = rownames(data) 142 | 143 | 144 | #normalized counts 145 | ncounts_RF = data 146 | 147 | # generate a table summarizing the analysis 148 | complete_parameters = cbind(parameters,chosen_model,chosen_model_BICW, chosen_model_mean, chosen_model_mean_BICW) 149 | global_table_df = merge(ncounts_RF,complete_parameters, by="row.names") 150 | 151 | rownames(global_table_df) = global_table_df[,1] 152 | global_table_df = global_table_df[,-1] 153 | 154 | out = list() 155 | 156 | out[["time"]] = time 157 | out[["group"]] = group 158 | out[["results"]] = global_table_df 159 | out[["BICW_rhythm"]] = BICW 160 | out[["BICW_mean"]] = BICW_mean 161 | out[["values"]] = ncounts_RF 162 | out[["parameters"]] = complete_parameters 163 | 164 | #if(vec == TRUE){ 165 | # out[["results"]] = global_table_df[1,] 166 | # out[["BICW_rhythm"]] = BICW[1,] 167 | # out[["BICW_mean"]] = BICW_mean[1,] 168 | # out[["values"]] = ncounts_RF[1,] 169 | # out[["parameters"]] = complete_parameters[1,] 170 | #} 171 | 172 | 173 | message("finished!") 174 | return(out) 175 | 176 | 177 | } 178 | -------------------------------------------------------------------------------- /R/dryseq_function.R: -------------------------------------------------------------------------------- 1 | #' Differential rhythmicity analysis for RNA-Seq datasets 2 | #' 3 | #' This function performs a rhythmicity analysis based on generalized linear models with a subsequent models selection. The function accepts raw count data from a temporal RNA-Seq dataset of two or more groups. The function outputs parameters mean, phase and amplitude are for each group. 4 | #' @param countData matrix containing non-negative integers; each column represents a sample, each row represents a gene/transcript. 5 | #' @param group vector containing the name of each sample. 6 | #' @param time vector containing numeric values of the time for each sample. 7 | #' @param period numeric value to indicate period length of the oscillation. Default: circadian data period of 24 h. 8 | #' @param sample_name vector containing sample names. Default: colnames are sample names. 9 | #' @param batch vector containing potential batch effects between samples. Default: no batch effect. 10 | #' @param nthreads vector numeric value to indicate the threads for parallel computing .Default: 60 \% of detected cores. 11 | #' @return a list that contains the following data.frames: results (summary of results), parameters (rhythmic parameters), ncounts (normalized counts), counts (raw counts), cook (cook's distance) 12 | #' @examples countData = simData[["countData"]] 13 | #' group = simData[["group"]] 14 | #' time = simData[["time"]] 15 | #' dryList = dryseq(countData,group,time) 16 | #' head(dryList[["results"]]) # data frame summarizing results 17 | #' head(dryList[["parameters"]]) # coefficients: phase, amplitude and mean for each group 18 | #' head(dryList[["ncounts"]]) # normalized counts 19 | #' head(dryList[["counts"]]) # raw counts 20 | #' head(dryList[["cook"]]) # cook's distance 21 | #' @details DryR assesses rhythmicity and mean differences of gene expression in RNA-Seq count data. 22 | #' As proposed (Love et al. 2014), a count Y of a gene in a sample s can be modeled as a negative binomial with a fitted mean μ_gs and a gene-specific dispersion parameter θ_g. 23 | #' \cr \cr \emph{Y_gs}~NB(\emph{μ_gs},\emph{θ_g})\cr\cr 24 | #' The fitted mean is proportional to the quantity q of fragments that correspond to a gene in a sample scaled by a sample-specific scaling factor s_s (Love et al. 2014). 25 | #' This scaling factor depends on the sampling depth of each library and can be estimated using the median-of-ratios method of DESeq2 (Anders and Huber 2010). 26 | #' \cr \cr \emph{μ_gs} = \emph{s_s q_gs}\cr \cr 27 | #' DryR estimates gene-specific distribution θg using empirical Bayes shrinkage described by Love et al. (Love et al. 2014). 28 | #' and variance is computed from the following relationship to the dispersion parameter θ: 29 | #' \cr \cr Var(\emph{Y_gs}) = E[(\emph{Y_gs} + \emph{θ_g} \emph{μ^2_gs})]\cr \cr 30 | #' The fit uses a generalized linear model with a logarithmic link function. Sample specific size factor (λ_s) is defined as an offset. The full GLM is defined as follows: 31 | #' \cr \cr log(\emph{μ_gbcs}) = \emph{m_gb} + \emph{m_gc} + \emph{α_gc} cos(\emph{ω t(s)}) + \emph{β_gc} sin(\emph{ω t(s)}) + log(\emph{λ_t(s)})\cr \cr 32 | #' μ is the raw count for gene g, condition/group c and Zeitgeber/circadian time t. α and β are coefficients of the cosine and sine functions, respectively. m is a coefficient to describe a mean expression level. 33 | #' When necessary, a batch specific mean (m) can be given to the dryseq function to account for technical batch effects. 34 | #' A technical batch effect is not allowed to be confounding so the resulting model matrix is fully ranked. 35 | #' To select an optimal gene-specific model, dryseq first assesses rhythmicity across the different conditions. To this end, dryR defines different models across all groups. 36 | #' Models refined to have either zero (non-rhythmic pattern) or non-zero (rhythmic pattern) α and β coefficients for each analyzed group. Moreover, for some models the values of α and β can be also shared within any combination of all groups 37 | #' The coefficients α and β were used to calculate the phase (arctan(α/β)) and amplitude (log2-fold change peak-to-trough; 2sqrt(α^2+β^2) ) of a gene. 38 | #' Bayesian information criterion (BIC) based model selection was employed to account for model complexity using the following formula: 39 | #' \cr \cr BIC_j = ln(n)k - 2ln(L̂_ĵ)\cr \cr 40 | #' L̂ is defined as the log-likelihood of the model j from the regression, n is the number of data points and k is the number of parameters. 41 | #' To assess the confidence of the selected model j we calculated the Schwarz weight (BICW): 42 | #' \cr \cr BICW_j = e^(0.5ΔBIC_j)\ sum(e^0.5 ΔBIC_m), with ΔBIC_j - BIC_j - BIC_m*\cr \cr 43 | #' m* is the minimum BIC value in the entire model set. Dryseq consideres the BICW_j as the confidence level for model j. The model with the highest BICW is selected as the optimal model within the set of all defined models. 44 | #' In a second iteration step, dryR set the coefficient α and β to the values of the selected model in the first regression. 45 | #' dryseq then defined different models for the mean coefficient with differing or shared means between groups. Each model is solved using generalized linear regression and each gene was assigned to a preferred model based on the BICW as described above for the first iteration. 46 | #' The model selection is sensitive to outliers: dryseq provide a cook's distance for each gene. A fit for a gene with a maximum cook's distance of higher than 1 should be considered with care. 47 | #' @references Love, M.I., Huber, W., Anders, S. (2014) Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology 48 | #' @references Anders, S. and Huber, W. (2014) Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biology 49 | dryseq=function(countData,group,time,period=24,sample_name=colnames(countData),batch=rep("A",length(sample_name)),n.cores=round(parallel::detectCores()*.6,0) ){ 50 | #update 51 | doParallel::registerDoParallel(cores=n.cores) 52 | sel = order(group,time) 53 | time = time[sel] 54 | group = group[sel] 55 | countData = countData[,sel] 56 | batch = batch[sel] 57 | sample_name = sample_name[sel] 58 | 59 | countData = countData[rowSums(countData)!=0,] 60 | 61 | s1 <- sin(2*pi*time/period) 62 | c1 <- cos(2*pi*time/period) 63 | 64 | conds = cbind(group,s1,c1,batch) 65 | colnames(conds) = c("group","s1","c1","batch") 66 | 67 | colData <- data.frame(row.names=colnames(countData), conds) 68 | N=length(unique(group)) 69 | 70 | ############################ 71 | # FIT RHYTHMS 72 | 73 | message("fitting rhythmic models") 74 | 75 | models = create_matrix_list(time,group, N,period) 76 | #Reorder u, a, b 77 | models = lapply(models, function(l) l[,c(grep("u",colnames(l)),grep("a|b",colnames(l)))] ) 78 | 79 | for (i in 1:length(models)){ 80 | rownames(models[[i]]) = rownames(colData)} 81 | 82 | if (length(unique(batch))>1) { 83 | # add the batch effect 84 | model_b = as.matrix(model.matrix(~ batch),contrasts.arg=NULL)[,2:length(unique(batch)),drop=F] 85 | colnames(model_b)=paste0("BATCH_",unique(batch)[-1]) 86 | models = lapply(models, function(l) cbind(model_b,l)) 87 | models = lapply(models, function(l) l[,c(grep("u",colnames(l)),grep("BATCH",colnames(l)),grep("a|b",colnames(l)))] ) 88 | } 89 | 90 | dds = DESeq2::DESeqDataSetFromMatrix(countData = countData, colData = colData, design=~1) 91 | dds.full = DESeq2::DESeq(dds, full=models[[length(models)]], betaPrior = F, fitType = "parametric", test = "Wald", parallel =T, quiet = T) 92 | 93 | deviances = sapply(models[-length(models)], function(m){ 94 | dds.x = DESeq2::nbinomWaldTest(dds.full, modelMatrix= m, betaPrior = F, quiet = T) 95 | return(mcols(dds.x)$deviance) 96 | }) 97 | 98 | deviances = cbind(deviances,mcols(dds.full)$deviance) 99 | 100 | message("computing BICW (rhythm)") 101 | 102 | # calculate the BIC 103 | BIC = as.data.frame(sapply(1:ncol(deviances), function(i) { deviances[,i] + log(ncol(countData)) * ncol(models[[i]] )} )) 104 | 105 | #calculate the BICW 106 | BICW = t(apply(BIC,1,compute_BICW)) 107 | chosen_model = apply(BIC,1,which.min) 108 | chosen_model_BICW = apply(BICW,1,max) 109 | 110 | ############################ 111 | # FIT BASELINE 112 | 113 | message("fitting mean models") 114 | 115 | model_mean_cond=create_matrix_list_mean(N,group) 116 | model_mean_cond=lapply(model_mean_cond,annotate_matrix,group) 117 | 118 | for (i in 1:length(model_mean_cond)){ 119 | rownames(model_mean_cond[[i]]) = rownames(colData)} 120 | 121 | # choose the best model for rhthmicity and then run the mean on the samples 122 | 123 | DDS_dev = foreach (i = 1:length(models)) %dopar% { 124 | sel = which(chosen_model==i) 125 | gene = rownames(dds.full)[sel] 126 | 127 | if(length(gene)>0){ 128 | M=models[[i]] 129 | #build the gene specific model from the rhythmic point of view 130 | gene_specific_mean_models = lapply(model_mean_cond, 131 | function(x) cbind(x,M[,-grep("u",colnames(M))])) 132 | 133 | dev <- lapply(gene_specific_mean_models,function(m){ 134 | dds.m <- dds.full # Copying the full model 135 | dds.m <- DESeq2::nbinomWaldTest(dds.m[gene], modelMatrix= as.matrix(m), betaPrior = F) # Re-run wald test 136 | return(list(dds.m, mcols(dds.m)$deviance)) # Returning deviances (-2 * log likelihood) // https://support.bioconductor.org/p/107472/ 137 | 138 | }) 139 | } 140 | 141 | if(length(gene)==0){dev = list (NA, NA)} 142 | 143 | return(dev) 144 | } 145 | 146 | deviance_mean = NULL 147 | for (cm_r in 1:length(models)){ 148 | 149 | if(!is.na(DDS_dev[[cm_r]][1])){ 150 | deviance_mean.x = rbind(sapply(1:length(model_mean_cond),function(x) {DDS_dev[[cm_r]][[x]][[2]]})) 151 | rownames(deviance_mean.x) = rownames(DDS_dev[[cm_r]][[1]][[1]]) 152 | deviance_mean = rbind(deviance_mean, deviance_mean.x)} 153 | } 154 | 155 | deviance_mean = deviance_mean[rownames(countData),] 156 | 157 | message("computing BICW (mean)") 158 | 159 | # calculate the BIC 160 | BIC_mean = as.data.frame(sapply(1:ncol(deviance_mean), function(i) { deviance_mean[,i] + log(ncol(countData)) * ncol(model_mean_cond[[i]] )} )) 161 | 162 | #calculate the BICW 163 | BICW_mean = t(apply(BIC_mean,1,compute_BICW)) 164 | 165 | chosen_model_mean = apply(BIC_mean,1,which.min) 166 | chosen_model_mean_BICW = apply(BICW_mean,1,max) 167 | 168 | ################ 169 | # coefficients / mean, amplitude and phase 170 | ############### 171 | 172 | message("extracting rhythmic parameters") 173 | 174 | parameters=NULL 175 | 176 | parameters = foreach (i = 1:nrow(deviance_mean)) %dopar% { 177 | gene = rownames(deviance_mean)[i] 178 | cm_r = chosen_model[i] 179 | cm_m = chosen_model_mean[i] 180 | dds= DDS_dev[[cm_r]][[cm_m]][[1]] 181 | out = compute_param(dds, gene ,period,N) 182 | return(data.frame(row.names= gene, t(matrix(out))) ) 183 | } 184 | 185 | parameters = data.frame(do.call(rbind.data.frame, parameters)) 186 | colnames(parameters) = c(paste(c('mean','a','b','amp','relamp','phase'),rep(unique(group),each =6), sep = "_")) 187 | parameters = parameters[rownames(countData),] 188 | 189 | # Generate all the count and expression data 190 | # raw counts 191 | counts_RF = counts(dds.full, normalized = FALSE) 192 | 193 | #vst stabilized counts 194 | vsd <- DESeq2::varianceStabilizingTransformation(dds.full) 195 | vsd <- assay(vsd) 196 | 197 | #normalized counts 198 | ncounts_RF = counts(dds.full, normalized = TRUE) 199 | 200 | # generate a table summarizing the analysis 201 | complete_parameters = cbind(parameters,chosen_model,chosen_model_BICW, chosen_model_mean, chosen_model_mean_BICW) 202 | global_table = merge(ncounts_RF,complete_parameters, by="row.names") 203 | rownames(global_table) = global_table$Row.names 204 | global_table_df = global_table[,-grep("Row.names",colnames(global_table))] 205 | 206 | global_table_df = global_table_df[rownames(countData),] 207 | 208 | out = list() 209 | 210 | out[["time"]] = time 211 | out[["group"]] = group 212 | out[["results"]] = global_table_df 213 | out[["BICW_rhythm"]] = BICW 214 | out[["BICW_mean"]] = BICW_mean 215 | out[["vsd"]] = vsd 216 | out[["ncounts"]] = ncounts_RF 217 | out[["counts"]] = counts_RF 218 | out[["parameters"]] = complete_parameters 219 | out[["cook"]] = assays(dds.full)[["cooks"]] 220 | out[["dds.full"]] = dds.full 221 | 222 | message("finished!") 223 | return(out) 224 | 225 | # to add flags for low expression (counts), high cook's distance 226 | # to be added error messages when only one group is given etc. 227 | } 228 | -------------------------------------------------------------------------------- /R/misc_function.R: -------------------------------------------------------------------------------- 1 | ##################################### 2 | comb = function(n,k){ 3 | factorial(n)/(factorial(k)*factorial(n-k)) 4 | } 5 | ##################################### 6 | nbt = function(x){ 7 | l=length(which(x) == TRUE) 8 | l 9 | } 10 | ##################################### 11 | simply_it = function(x){ 12 | a = 0 13 | for(i in x) { 14 | a= paste(a,paste(which(x == as.numeric(i)), collapse = "",sep = ""), collapse = "", sep = "") 15 | } 16 | a 17 | } 18 | ##################################### 19 | simply_it.2 = function(x){ 20 | 21 | a = match(x,x) 22 | } 23 | ##################################### 24 | make_circ_coord = function(t,x,ttot) { 25 | dt=(t[2]-t[1])*.45 26 | a=(rep(t,rep(4,length(t)))+rep(c(-dt,-dt,dt,dt),length(t)))*2*pi/ttot 27 | h=rep(x,rep(4,length(x)))*rep(c(0,1,1,0),length(t)) 28 | list(angles=a,heights=h) 29 | } 30 | ##################################### 31 | circular_phase24H_histogram = function(x,name,ttot){ 32 | br=0:ttot 33 | h=hist(x, br=br,plot=F) 34 | df = data.frame(x= as.numeric(h$breaks[-1]%%ttot), y= h$counts) 35 | 36 | ggplot(df, aes(x=x, y=y)) + 37 | geom_bar(stat='identity') + 38 | coord_polar(start = -0.261799/2, direction=1) + 39 | scale_x_continuous(breaks = seq(0, ttot, round(ttot/24,0))) + 40 | ylab("") + 41 | xlab("") + 42 | theme_bw() + 43 | ggtitle(paste("", name)) + 44 | theme(aspect.ratio = 1, 45 | axis.text=element_text(size=8), 46 | panel.grid.minor = element_blank(), 47 | panel.border = element_blank(), 48 | axis.text.y = element_blank(), 49 | axis.ticks = element_blank(), 50 | plot.title = element_text(hjust = 0.5)) 51 | } 52 | 53 | 54 | ##################################### 55 | compute_BICW = function(x){ 56 | x = as.numeric(x) 57 | BIC_min = min(x) 58 | test = exp(-0.5*(x-BIC_min))/sum(exp(-0.5*(x-BIC_min))) 59 | return(test) 60 | } 61 | #################################3 62 | compute_RSS = function(x, matX){ 63 | 64 | xx = solve(t(matX)%*%matX) 65 | y = xx %*% t(matX) %*% as.numeric(x) 66 | y = as.matrix(y) 67 | rownames(y) = colnames(matX) 68 | RSS = t(x) %*% x -t(x) %*% matX %*% xx %*% t(matX) %*% x 69 | list(param=y,RSS = RSS) 70 | } 71 | 72 | ############################################# 73 | compute_BIC = function(A,n){ 74 | 75 | p = length(A$param) 76 | #AIC = n * log(A$RSS/n, base = exp(1)) + 2* p + 2*p*(p +1) /(n-p-1) 77 | BIC= n * log(A$RSS/n, base = exp(1)) + log(n, base = exp(1)) * p 78 | list(BIC = BIC, param = A$param) 79 | 80 | } 81 | 82 | ############################## 83 | do_all_lm = function(x,my_mat){ 84 | x = as.numeric(x) 85 | n = length(x) 86 | 87 | my_fit = lapply(my_mat,compute_RSS, x = x) 88 | my_BIC =lapply(my_fit,compute_BIC,n=n) 89 | my_BIC 90 | } 91 | 92 | ########################## 93 | 94 | do_all_lm_mr = function(x,countData,my_mat_r, my_mat_m, chosen_model){ 95 | i=match(x,rownames(countData)) 96 | x=countData[x,] 97 | M=my_mat_r[[chosen_model[i]]] 98 | #build the gene specific model from the rhythmic point of view 99 | gene_specific_mean_models = lapply(my_mat_m, 100 | function(x) cbind(x,M[,-grep("u",colnames(M))])) 101 | x = as.numeric(x) 102 | n = length(x) 103 | 104 | my_fit = lapply(gene_specific_mean_models,compute_RSS, x = x) 105 | my_BIC =lapply(my_fit,compute_BIC,n=n) 106 | my_BIC 107 | } 108 | 109 | #################################### 110 | compute_param = function(dds, gene, period=T_,N){ 111 | 112 | dds = dds[gene,] 113 | param = c(paste(rep(c('u','a','b'),each=N),rep(1:N,3), sep = ".")) 114 | 115 | paramout = rep(NA,N*6) 116 | 117 | for(i in 1:N){ 118 | 119 | u=coef(dds)[grep(paste(param[i],"Intercept",sep="|"), colnames(coef(dds)))] 120 | a=coef(dds)[grep(param[i+N], colnames(coef(dds)))] 121 | b=coef(dds)[grep(param[i+N*2], colnames(coef(dds)))] 122 | 123 | if(length(u) ==0) u=NA 124 | if(length(a) ==0) a=NA 125 | if(length(b) ==0) b=NA 126 | 127 | phase=period/(2*pi)*atan2(b,a) 128 | amp =2*sqrt(a^2+b^2) 129 | relamp=0.5*amp/u 130 | if(!is.na(phase)){ 131 | #if(phase<0) phase=phase+period 132 | #if(phase>period) phase=phase-period 133 | phase=phase%%period 134 | } 135 | paramout[(1:6 + 6*(i-1))] = c(u,a,b,amp,relamp,phase) 136 | } 137 | 138 | #names(paramout) = c(paste(c('mean','a','b','amp','relamp','phase'),rep(1:N,each =6), sep = "_")) 139 | paramout 140 | } 141 | #################################### 142 | compute_param_l = function(dds, period=T_, N){ 143 | 144 | 145 | param = c(paste(rep(c('u','a','b'),each=N),rep(1:N,3), sep = ".")) 146 | 147 | paramout = rep(NA,N*6) 148 | 149 | for(i in 1:N){ 150 | 151 | u=dds[grep(paste(param[i],"Intercept",sep="|"), rownames(dds)),1] 152 | a=dds[grep(param[i+N], rownames(dds)),1] 153 | b=dds[grep(param[i+N*2], rownames(dds)),1] 154 | 155 | if(length(u) ==0) u=NA 156 | if(length(a) ==0) a=NA 157 | if(length(b) ==0) b=NA 158 | 159 | phase=period/(2*pi)*atan2(b,a) 160 | amp =2*sqrt(a^2+b^2) 161 | relamp=0.5*amp/u 162 | if(!is.na(phase)){ 163 | #if(phase<0) phase=phase+period 164 | #if(phase>period) phase=phase-period 165 | phase=phase%%period 166 | } 167 | paramout[(1:6 + 6*(i-1))] = c(u,a,b,amp,relamp,phase) 168 | } 169 | 170 | #names(paramout) = c(paste(c('mean','a','b','amp','relamp','phase'),rep(1:N,each =6), sep = "_")) 171 | paramout 172 | } 173 | 174 | ##################################### 175 | create_matrix_list = function(t, conds, n.co, period){ 176 | require(combinat) 177 | 178 | my_matrix = list() 179 | 180 | c <- cos(2*pi*t/period) 181 | s <- sin(2*pi*t/period) 182 | 183 | MAT <- cbind(rep(1,length(t)),c[1:length(t)],s[1:length(t)]) 184 | GMAT <- matrix(NA,ncol=3*n.co, nrow =length(t)) 185 | rownames(GMAT) <- conds 186 | colnames(GMAT) <- c(paste(c('u','a','b'),rep(1:n.co,each =3), sep = ".")) 187 | 188 | it <- 1 189 | for(i in unique(rownames(GMAT))){ 190 | GMAT[rownames(GMAT)==i,grep(paste0('.',it,'$'),colnames(GMAT))] = MAT[rownames(GMAT)==i,] 191 | it=it+1 192 | } 193 | 194 | vn = rep(F,n.co) 195 | for(i in 1:n.co){ 196 | g = rep(F,n.co) 197 | g[1:i] = TRUE 198 | p = unique(combinat::permn(g)) 199 | v = matrix(unlist(p),ncol = n.co,byrow = TRUE) 200 | vn = rbind(vn,v) 201 | 202 | } 203 | 204 | 205 | vn = vn[,rep(1:n.co,each=3)] 206 | vn[,seq(1,3*n.co,3)] = TRUE 207 | vn = data.frame(vn,row.names= NULL) 208 | vn[,dim(vn)[2] + 1]=(apply(vn,1,nbt)-n.co)/2 209 | colnames(vn) = c(paste(c('u','a','b'),rep(1:n.co,each =3), sep = "."),'nb_cycl') 210 | 211 | model = 1 212 | for(g in 0:n.co){ 213 | 214 | 215 | nb_cycl =g 216 | com = expand.grid(rep(list(1:nb_cycl),nb_cycl)) 217 | simply = apply(com,1,simply_it) 218 | poss =match(unique(simply),simply) 219 | com_l = com[poss,] 220 | pos = which(vn$nb_cycl == g) 221 | 222 | for(k in pos){ 223 | if(g > 1){ 224 | for(v in 1:nrow(com_l)){ 225 | gmat = GMAT[,unlist(vn[k,-dim(vn)[2]])] 226 | ve = as.numeric(com_l[v,]) 227 | id =1 228 | sa = ve 229 | while(length(ve) !=0){ 230 | 231 | poc = which(sa == ve[1]) 232 | po = which(ve ==ve[1]) 233 | if(length(poc) !=1){ 234 | poch =c(2*poc-1,2*poc) 235 | poch =poch[order(poch)] 236 | he = grep("[ab]",colnames(gmat)) 237 | he = he[poch] 238 | pp=0 239 | for(z in 1:((length(he)-2)/2)){ 240 | repl1 = which(gmat[,he[2*z+1]]!='NA') 241 | repl2 = which(gmat[,he[2*z+2]]!='NA') 242 | gmat[repl1,he[1]] = gmat[repl1,he[2*z+1]] 243 | gmat[repl2,he[2]] = gmat[repl2,he[2*z+2]] 244 | colnames(gmat)[he[1]]= paste(colnames(gmat)[he[1]],colnames(gmat)[he[2*z+1]],sep=',') 245 | colnames(gmat)[he[2]]= paste(colnames(gmat)[he[2]],colnames(gmat)[he[2*z+2]],sep=',') 246 | gmat[repl1,he[2*z+1]] =NA 247 | gmat[repl2,he[2*z+2]]=NA 248 | pp = pp+2 249 | } 250 | id = id+1 251 | ve = ve[-po] 252 | }else{ 253 | ve = ve[-1] 254 | } 255 | 256 | } 257 | gmat[is.na(gmat)] =0 258 | del=which(apply(gmat,2,function(x) length(which(x == 0))) == length(t)) 259 | if(length(del)!=0){ 260 | gmat = gmat[,-del] 261 | } 262 | my_matrix[[model]] = gmat 263 | model = model + 1 264 | } 265 | }else{ 266 | gmat = GMAT[,unlist(vn[k,-dim(vn)[2]])] 267 | gmat[is.na(gmat)] =0 268 | del =which(apply(gmat,2,function(x) length(which(x == 0))) == length(t)) 269 | if(length(del)!=0){ 270 | gmat = gmat[,-del] 271 | } 272 | my_matrix[[model]] = gmat 273 | model = model +1 274 | } 275 | } 276 | 277 | } 278 | 279 | 280 | 281 | return(my_matrix) 282 | } 283 | ##################################### 284 | create_matrix_list_mean = function(N,group){ 285 | com = expand.grid(rep(list(1:N),N)) 286 | simply = as.data.frame(t(apply(com,1,simply_it.2))) 287 | simply = do.call("paste",simply) 288 | poss =match(unique(simply),simply) 289 | com_l = com[poss,] 290 | names(com_l)=unique(group) 291 | com_l=com_l[order(apply(com_l,1,function(x) length(unique(x))),apply(com_l,1,function(x) length(which(x==max(x))))),] 292 | rownames(com_l)=1:nrow(com_l) 293 | 294 | com_l=com_l[,match(group,names(com_l))] 295 | p=list() 296 | for(j in 1:nrow(com_l)){ 297 | if(j==1){ 298 | p[[j]] = as.matrix(rep(1,ncol(com_l))) 299 | 300 | }else{ 301 | p[[j]]= model.matrix(~0+ factor(as.numeric(com_l[j,]))) 302 | } 303 | } 304 | p 305 | } 306 | ##################################### 307 | annotate_matrix = function(m,group){ 308 | if(ncol(m)==1){ 309 | colnames(m)=paste("u",1:length(unique(group)),sep=".",collapse=".") 310 | }else{ 311 | pos_ind= match(unique(group),group) 312 | m=as.matrix(m) 313 | l=list() 314 | for(k in 1:ncol(m)){ 315 | l[[k]]=as.numeric(which(m[pos_ind,k]==1)) 316 | } 317 | colnames(m)=sapply(l,function(x) paste("u",x,sep=".",collapse=".")) 318 | } 319 | m 320 | } 321 | 322 | ##################################### 323 | dry_plot = function (dryList, gene, period=24) 324 | { 325 | normal = FALSE 326 | if("ncounts" %in% names(dryList)){vsd = log2(dryList[["ncounts"]]+1)} 327 | if("values" %in% names(dryList)){vsd = dryList[["values"]] 328 | normal = T} 329 | 330 | parameters = dryList[["parameters"]][,grep("^mean|^a_|^b_|^amp|^phase|^relamp",colnames(dryList[["parameters"]]))] 331 | 332 | ID = rownames(dryList[["results"]] )[grep(paste0('^',gene,'$'),rownames(dryList[["results"]] ))] 333 | 334 | #print(ID) 335 | 336 | d = vsd[ID, ] 337 | d = reshape2::melt(d) 338 | 339 | d$group = dryList[["group"]] 340 | 341 | d$time = as.numeric(dryList[["time"]]) 342 | d$time = d$time%%period 343 | 344 | suppressWarnings({ d <- Rmisc::summarySE(d, measurevar="value", groupvars=c("time","group")) }) 345 | 346 | v = seq(0,period,round(period/24,0)) 347 | fit_d_0 = parameters[which(rownames(parameters)==ID),grep("mean",colnames(parameters))] # intercept 348 | fit_d_1 = parameters[which(rownames(parameters)==ID),grep("a_",colnames(parameters))] # coefficient a 349 | fit_d_2 = parameters[which(rownames(parameters)==ID),grep("^b_",colnames(parameters))] # coefficient b 350 | 351 | fit_d_0[is.na(fit_d_0)] = 0 352 | fit_d_1[is.na(fit_d_1)] = 0 353 | fit_d_2[is.na(fit_d_2)] = 0 354 | 355 | m = data.frame(v) 356 | 357 | dd = data.frame(v) 358 | dd$v = v 359 | 360 | fit_values = function (x,n) 361 | { as.numeric((fit_d_0[n] + fit_d_1[n]*cos(2*pi*x/period) + fit_d_2[n]*sin(2*pi*x/period))) } 362 | 363 | for (u in 1:length(unique(d$group))){ 364 | m[,u+1] = NA 365 | m[,u+1] = apply(dd,1, fit_values,u) 366 | } 367 | 368 | m = m[,-1] 369 | 370 | colnames(m) = unique(dryList[["group"]]) 371 | 372 | m = reshape2::melt(m, , id.vars = NULL) 373 | m$time = rep(v, length(unique(d$group))) 374 | 375 | colnames(m) = c("group","value","time") 376 | 377 | if(normal==FALSE) {m$value[which(m$value<0)] = 0} 378 | 379 | gg1 = ggplot(d, aes(x=time, y=value, group=group, color=group)) + 380 | geom_errorbar(aes(ymin=value-se, ymax=value+se), width=.4) + 381 | geom_point(size=2, shape=19) + 382 | xlab("Time (h)") + 383 | ylab("Log2 normalized counts") + 384 | ggtitle(ID) + 385 | scale_x_continuous(breaks=seq(0,period+6,6)) + 386 | theme_bw(base_size = 10) + 387 | theme(aspect.ratio = 1, panel.grid.minor=element_blank(), legend.position = "right") + 388 | geom_line(aes(x=time, y=(value), group=group), data = m, position=position_dodge(width=0.5)) + 389 | facet_wrap(~group) 390 | 391 | gg1 392 | } 393 | 394 | ##################################### 395 | #' Test a time series for rhythmicity using a harmonic regression method 396 | #' This function has been used in PMID:24344304, please cite this paper if this function is used. 397 | #' 398 | #' @param x Vector of numerics to test rhythmicity 399 | #' @param t Vector of time (in hours) 400 | #' @param period Period of oscillations to test 401 | #' @param offset Phase offset (if needed) 402 | #' @return Number of timepoints, mean, amplitude (max-min), relative amplitude, phase, and p-value of rhythmicity 403 | #' 404 | f24_R2_cycling=function(x, t=2*(0:(length(x)-1)), period=24, offset=0) 405 | { 406 | 407 | kk = which(!is.na(x)==TRUE) 408 | x = x[kk] 409 | t = t[kk] 410 | n=length(x) 411 | #mu=mean(x) 412 | nb.timepoints=length(x) 413 | if(n<4) 414 | { 415 | if(n==0) c(nb.timepoints=nb.timepoints, mean=NA, amp=NA, relamp=NA,phase=NA,pval=NA) 416 | else 417 | { 418 | c(nb.timepoints=nb.timepoints, mean=mean(x), amp=NA, relamp=NA,phase=NA,pval=NA) 419 | } 420 | } 421 | else 422 | { 423 | sig2=var(x) 424 | c=cos(2*pi*t/period) 425 | s=sin(2*pi*t/period) 426 | A = mean(x*c)-mean(x)*mean(c) 427 | B = mean(x*s)-mean(x)*mean(s) 428 | c1 = mean(c^2)-mean(c)^2 429 | c2 = mean(c*s)-mean(c)*mean(s) 430 | c3 = mean(s^2)-mean(s)^2 431 | b = (A*c2-B*c1)/(c2^2-c1*c3) 432 | a = (A-b*c2)/c1 433 | mu = mean(x)-a*mean(c)-b*mean(s) 434 | # b=2*mean(x*s) 435 | x.hat=mu+a*c+b*s 436 | sig2.1=var(x-x.hat) 437 | if(is.na(a)||is.na(b)) {c(nb.timepoints=nb.timepoints, mean=mean(x), amp=NA, relamp=NA,phase=NA,pval=NA)} 438 | else 439 | { 440 | p=3 441 | R2=0 442 | if(sig2>0) R2=1-sig2.1/sig2 443 | # http://www.combustion-modeling.com/downloads/beta-distribution-for-testing-r-squared.pdf 444 | # I checked that it works 445 | amp=max(x)-min(x) 446 | phase=period/(2*pi)*atan2(b, a) 447 | if(phase<0) phase=phase+period 448 | if(phase>period) phase=phase-period 449 | phase=(phase+offset)%%period 450 | pval = pbeta(R2, (p-1)/2, (n-p)/2, lower.tail = FALSE, log.p = FALSE) 451 | 452 | c(nb.timepoints=nb.timepoints, mean =mean(x), amp=2*sqrt(a^2+b^2),relamp=sqrt(a^2+b^2)/(mu),phase=phase, pval=pval) 453 | } 454 | } 455 | } 456 | 457 | --------------------------------------------------------------------------------