├── README.md ├── matrix_matrix_operation.R ├── k-means.R ├── flux.profile.R ├── PFAM_enrichment.R ├── mann-whitney.R ├── gene.list.enrichment.R ├── missing.values.R ├── functions.R ├── plot.isoform.levels.R ├── matrix_to_dist.R ├── element.detected.R ├── gene.pair.correlation.R ├── plot.bigWig.profile.R ├── modularity.R ├── sort.rpkm.lines.R ├── ggpie.R ├── VennDiagram.R ├── KEGG_enrichment.R ├── wordcloud.R ├── read.genomic.coverage.R ├── normalization.DESeq.R ├── cutree.R ├── DESeq.analysis.R ├── anova.R ├── DEXSeq.analysis.R ├── quantile_normalization.R ├── differential_coSI.R ├── barplot.GO.R ├── ggSOM.R ├── matrix_wilcox.R ├── projection.score.R ├── matrix_matrix_correlation.R ├── plot.mean.sd.R ├── GO_enrichment.R ├── SOM.R ├── Rtsne.R ├── major.isoform.across.samples.R ├── add_quantile.R ├── nt.coverage.R ├── dimensRed.R ├── gglines.R ├── matrix_funct.R ├── rpkm_fraction.R ├── scatterplot.R ├── plot.network.R ├── boxplot_expressed_isoforms.R ├── scale_matrix.R └── removeBatchEffect.R /README.md: -------------------------------------------------------------------------------- 1 | # Rscripts 2 | Several R scripts for exploratory data analysis 3 | -------------------------------------------------------------------------------- /matrix_matrix_operation.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ##------------ 4 | ## LIBRARIES 5 | ##------------ 6 | 7 | suppressPackageStartupMessages(library("optparse")) 8 | 9 | options(stringsAsFactors=F) 10 | 11 | ################## 12 | # OPTION PARSING 13 | ################## 14 | 15 | option_list <- list( 16 | 17 | make_option(c("-A", "--matrix_A"), 18 | help="the matrix you want to subtract from, WITH header (A-B)"), 19 | 20 | make_option(c("-B", "--matrix_B"), 21 | help="the matrix you want to subtract, WITH header (A-B)"), 22 | 23 | make_option(c("-r", "--replace_NA_with"), type="numeric", 24 | help="value you want to replace NA with, if null NAs are not replaced and difference will be NA"), 25 | 26 | make_option(c("-o", "--output"), default="out.tsv", 27 | help="additional prefix for otuput [default=%default]"), 28 | 29 | make_option(c("-e", "--expression"), 30 | help="expression you want to corresponding cells of the matrices, e.g. A-B"), 31 | 32 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 33 | help="verbose output [default=%default]") 34 | 35 | ) 36 | 37 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 38 | arguments <- parse_args(parser, positional_arguments = TRUE) 39 | opt <- arguments$options 40 | 41 | if(opt$verbose) {print(opt)} 42 | 43 | ################### 44 | # BEGIN # 45 | ################### 46 | 47 | A = read.table(opt$matrix_A, h=T) 48 | B = read.table(opt$matrix_B, h=T) 49 | 50 | if (!is.null(opt$replace_NA_with)) { 51 | A <- replace(A, is.na(A), opt$replace_NA_with) 52 | B <- replace(B, is.na(B), opt$replace_NA_with) 53 | } 54 | 55 | M = eval(parse(text=opt$expression)) 56 | 57 | write.table(M, opt$output, quote=F, row.names=T, sep="\t") 58 | 59 | q(save='no') 60 | -------------------------------------------------------------------------------- /k-means.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors=FALSE) 4 | 5 | ################## 6 | # OPTION PARSING # 7 | ################## 8 | 9 | suppressPackageStartupMessages(library("optparse")) 10 | 11 | option_list <- list( 12 | 13 | make_option(c("-i", "--input"), default="stdin", 14 | help="File or stdin [default=%default]"), 15 | 16 | make_option(c("-o", "--output"), default="Kmeans.tsv", 17 | help="Output file name. Can be stdout [default=%default]"), 18 | 19 | make_option(c("--header"), action="store_true", default=FALSE, 20 | help="The input matrix has a header [default=%default]"), 21 | 22 | make_option(c("-l", "--log10"), action="store_true", default=FALSE, 23 | help="Apply the log10 to the whole matrix as pre-processing step [default=%default]"), 24 | 25 | make_option(c("-p", "--pseudocount"), default=0.001, 26 | help="Pseudocount to add when applying the log [default=%default]"), 27 | 28 | make_option(c("-k", "--nb_clusters"), default=3, 29 | help="Number of desired clusters [default=%default]"), 30 | 31 | make_option(c("-B", "--iterations"), default=50, 32 | help="Number of initializations to determine the best clustering [default=%default]"), 33 | 34 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 35 | help="if you want more output [default=%default]") 36 | ) 37 | 38 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 39 | arguments <- parse_args(parser, positional_arguments = TRUE) 40 | opt <- arguments$options 41 | if (opt$verbose) {print(opt)} 42 | 43 | 44 | suppressPackageStartupMessages(library("ggplot2")) 45 | 46 | 47 | ############## 48 | # BEGIN 49 | ############## 50 | 51 | 52 | if (opt$input == "stdin") { 53 | m = read.table(file("stdin"), h=T) 54 | } else { 55 | m = read.table(opt$input, h=T) 56 | } 57 | 58 | if (opt$log10) { 59 | m = log10(m + opt$pseudocount) 60 | } 61 | 62 | 63 | set.seed(123) 64 | 65 | # Find the clusters from multiple random initializations 66 | Klist = replicate(opt$iterations, kmeans(m, opt$nb_clusters), simplify=F) 67 | 68 | # Choose the best 69 | K = Klist[[which.max(sapply(1:length(Klist), function(i) {Klist[[i]]$betweenss/Klist[[i]]$totss}))]]$cluster 70 | 71 | m$Kmeans = K 72 | 73 | 74 | # OUTPUT 75 | 76 | if (opt$output == "stdout") { 77 | output = "" 78 | } else { 79 | output = opt$output 80 | } 81 | 82 | write.table(m, output, quote=FALSE, col.names=TRUE, row.names=TRUE, sep='\t') 83 | 84 | q(save='no') 85 | -------------------------------------------------------------------------------- /flux.profile.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors=FALSE) 4 | 5 | ################## 6 | # OPTION PARSING 7 | ################## 8 | 9 | suppressPackageStartupMessages(library("optparse")) 10 | 11 | option_list <- list( 12 | 13 | make_option(c("-i", "--input"), 14 | help="Flux profile, .json"), 15 | 16 | make_option(c("-o", "--output"), default="flux.profile.out.pdf", 17 | help="Output file name [default=%default]"), 18 | 19 | make_option(c("-t", "--title"), default="", 20 | help="Plot title [default=%default]"), 21 | 22 | make_option(c("--palette"), type="character", default="/users/rg/abreschi/R/palettes/rainbow.2.txt", 23 | help="file with the palette [default=%default]"), 24 | 25 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 26 | help="if you want more output [default=%default]") 27 | 28 | ) 29 | 30 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 31 | arguments <- parse_args(parser, positional_arguments = TRUE) 32 | opt <- arguments$options 33 | if (opt$verbose) {print(opt)} 34 | 35 | ################# 36 | # LIBRARIES 37 | ################# 38 | 39 | suppressPackageStartupMessages(library("RJSONIO")) 40 | suppressPackageStartupMessages(library("ggplot2")) 41 | 42 | 43 | 44 | # BEGIN 45 | 46 | palette = read.table(opt$palette, h=F, comment.char="%")$V1 47 | 48 | 49 | m = fromJSON(opt$input) 50 | 51 | 52 | dfSense = (do.call( 53 | rbind, sapply( 54 | 1:5, function(i) 55 | data.frame( 56 | x=seq_along(m$masters[[i]]$sense), 57 | y=m$masters[[i]]$sense, 58 | strand="sense", 59 | bin=i 60 | ), 61 | simplify=F 62 | ) 63 | ) 64 | ) 65 | 66 | dfAntisense = (do.call( 67 | rbind, sapply( 68 | 1:5, function(i) 69 | data.frame( 70 | x=seq_along(m$masters[[i]]$asense), 71 | y=m$masters[[i]]$asense, 72 | strand="antisense", 73 | bin=i 74 | ), 75 | simplify=F 76 | ) 77 | ) 78 | ) 79 | 80 | 81 | df = rbind(dfSense, dfAntisense) 82 | 83 | 84 | # PLOT 85 | theme_set(theme_bw(base_size=16)) 86 | 87 | gp = ggplot(df, aes(x=x, y=y)) 88 | gp = gp + geom_line(aes(group=bin, color=strand)) 89 | gp = gp + facet_wrap(~bin, scale="free_x", nrow=1) 90 | gp = gp + scale_y_log10() 91 | gp = gp + labs(y="log10(signal)", x="position", title=opt$title) 92 | gp = gp + scale_color_manual(values=palette) 93 | gp = gp + theme(axis.text.x=element_text(angle=45, hjust=1)) 94 | 95 | 96 | ggsave(opt$output, w=10, h=3) 97 | 98 | q(save="no") 99 | 100 | -------------------------------------------------------------------------------- /PFAM_enrichment.R: -------------------------------------------------------------------------------- 1 | ##------------ 2 | ## LIBRARIES 3 | ##------------ 4 | cat("Loading libraries... ") 5 | suppressPackageStartupMessages(library("PFAM.db")) 6 | suppressPackageStartupMessages(library("org.Hs.eg.db")) 7 | suppressPackageStartupMessages(library("GOstats")) 8 | suppressPackageStartupMessages(library("optparse")) 9 | suppressPackageStartupMessages(library("plyr")) 10 | cat("DONE\n\n") 11 | 12 | options(stringsAsFactors=F) 13 | pseudocount = 1e-04 14 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 15 | 16 | ################## 17 | # OPTION PARSING 18 | ################## 19 | 20 | 21 | option_list <- list( 22 | make_option(c("-u", "--universe"), help="a list of human gene identifiers (ensEMBL ids), NO header"), 23 | make_option(c("-G", "--genes"), help="a list of human gene identifiers for the foreground (ensEMBL ids), WITH header"), 24 | make_option(c("-o", "--output"), help="additional tags for otuput [default=out]") 25 | ) 26 | 27 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 28 | arguments <- parse_args(parser, positional_arguments = TRUE) 29 | opt <- arguments$options 30 | print(opt) 31 | 32 | na2null = function(x) if(is.na(x)) {return(NULL)}else{return(x)} 33 | 34 | 35 | ############################ 36 | # BEGIN 37 | ############################ 38 | 39 | U = read.table(opt$universe, h=F, col.names='hs') 40 | G = read.table(opt$genes, h=T, col.names='hs') 41 | 42 | 43 | # take the entrez gene ids for all the orthologous genes which will be my universe (the same for all the sets) 44 | universe = unlist(mget(U$hs, org.Hs.egENSEMBL2EG, ifnotfound=NA)) 45 | 46 | sprintf("%s background genes; %s with a corresponding entrez id", nrow(U), length(unique(universe))) 47 | # how many genes am I able to map? 48 | # First thing notice that also ensembl gene ids longer than 15 characters are included 49 | # if I remove these genes I end up with: 50 | # length(unique(as.character(universe[which(nchar(names(universe)) == 15)]))) ----> 15593 51 | 52 | 53 | createParams = function(x) { 54 | geneset = unlist(mget(x, org.Hs.egENSEMBL2EG, ifnotfound=NA)) 55 | sprintf("%s foreground genes; %s with a corresponding entrez id", length(x), length(unique(geneset))) 56 | pv = 1-(1-0.05)**(1/length(x)) 57 | params = new("PFAMHyperGParams", 58 | geneIds = geneset, 59 | universeGeneIds = universe, 60 | annotation = 'org.Hs.eg.db', 61 | pvalueCutoff = pv, 62 | testDirection='over') 63 | return(params)} 64 | 65 | res = hyperGTest(createParams(G$hs)) 66 | write.table(summary(res), file=sprintf("%s.tsv", opt$output), quote=F, sep="\t", row.names=F) 67 | htmlReport(res, file=sprintf("%s.html", opt$output)) 68 | 69 | q(save='no') 70 | -------------------------------------------------------------------------------- /mann-whitney.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors=F) 4 | 5 | ################## 6 | # OPTION PARSING 7 | ################## 8 | 9 | suppressPackageStartupMessages(library("optparse")) 10 | 11 | option_list <- list( 12 | 13 | make_option(c("-i", "--input"), type="character", default='stdin', 14 | help="tab-separated file. Can be stdin [default=%default]"), 15 | 16 | make_option(c("-o", "--output"), default="stdout", 17 | help="output file name with extension [default=%default]"), 18 | 19 | make_option(c("--header"), action="store_true", default=FALSE, 20 | help="The file has header [default=%default]"), 21 | 22 | make_option(c("-V", "--values"), type='numeric', default=1, 23 | help="Column index with the values [default=%default]"), 24 | 25 | make_option(c("-f", "--factor"), type='numeric', default=1, 26 | help="Column index with the factor [default=%default]"), 27 | 28 | make_option(c("-v", "--verbose"), action='store_true', default=FALSE, 29 | help="Verbose output [default=%default]") 30 | 31 | #make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"), 32 | ) 33 | 34 | parser <- OptionParser( 35 | usage = "%prog [options] file", 36 | option_list = option_list, 37 | description = "Compute wilcox.test for each pair of group in the column factor" 38 | ) 39 | 40 | 41 | arguments <- parse_args(parser, positional_arguments = TRUE) 42 | opt <- arguments$options 43 | if (opt$verbose) {print(opt)} 44 | 45 | 46 | ##------------ 47 | ## LIBRARIES 48 | ##------------ 49 | 50 | #if (opt$verbose) {cat("Loading libraries... ")} 51 | #suppressPackageStartupMessages(library(reshape2)) 52 | #suppressPackageStartupMessages(library(ggplot2)) 53 | ##suppressPackageStartupMessages(library(plyr)) 54 | #if (opt$verbose) {cat("DONE\n\n")} 55 | 56 | 57 | # ======== 58 | # BEGIN 59 | # ======== 60 | 61 | 62 | # Read data 63 | if (opt$input == "stdin") {input=file('stdin')} else {input=opt$input} 64 | m = read.table(input, h=opt$header, sep="\t") 65 | if(opt$verbose) {print(head(m))} 66 | 67 | # Read the axes 68 | values = colnames(m)[opt$values] 69 | fact = colnames(m)[opt$factor] 70 | # Build the formula 71 | form = as.formula(sprintf("%s~%s", values, fact)) 72 | 73 | if(opt$verbose) { 74 | cat("Formula: ") 75 | print(form) 76 | } 77 | 78 | # Vector of groups 79 | groups = unique(as.character(m[,fact])) 80 | 81 | # Perform mann-whitney on all pairs of groups 82 | res = apply(combn(groups, 2), 2, 83 | function(x) { 84 | df = m[m[,fact] %in% x,]; 85 | res.test = wilcox.test(form, data=df); 86 | data.frame( 87 | group1 = x[1], 88 | group2 = x[2], 89 | W = res.test$statistic, 90 | p.value = format(res.test$p.value, digits=4) 91 | ) 92 | } 93 | ) 94 | 95 | # Concatenate the data.frame 96 | res = do.call(rbind, res) 97 | 98 | # OUTPUT 99 | output = ifelse(opt$output == "stdout", "", opt$output) 100 | write.table(res, file=output, row.names=FALSE, quote=FALSE, sep="\t") 101 | 102 | # EXIT 103 | quit(save='no') 104 | -------------------------------------------------------------------------------- /gene.list.enrichment.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | options(stringsAsFactors=FALSE) 5 | 6 | ################## 7 | # OPTION PARSING 8 | ################## 9 | 10 | suppressPackageStartupMessages(library("optparse")) 11 | 12 | option_list <- list( 13 | 14 | make_option(c("-i", "--input"), default="stdin", 15 | help="File or stdin with a list of genes to test for enrichment [default=%default]"), 16 | 17 | make_option(c("-d", "--db"), 18 | help="A tab-separated database with the annotation. It has a header. First two columns are . \"%\" is the comment char."), 19 | 20 | make_option(c("--header"), action="store_true", default=FALSE, 21 | help="Use this if the input has a header [default=%default]"), 22 | 23 | #make_option(c("-U", "--all_genes_as_universe"), action="store_true", default=FALSE, 24 | # help="Use all genes in the list as universe, otherwise it uses only the genes annotated in db [default=%default]"), 25 | # 26 | make_option(c("-o", "--output"), default="gene.list.enrich.out.tsv", 27 | help="Output file name, stdout for printing on stdout [default=%default]"), 28 | 29 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 30 | help="if you want more output [default=%default]") 31 | 32 | ) 33 | 34 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 35 | arguments <- parse_args(parser, positional_arguments = TRUE) 36 | opt <- arguments$options 37 | if (opt$verbose) {print(opt)} 38 | 39 | #------------ 40 | # LIBRARIES 41 | #------------ 42 | 43 | if (opt$verbose) {cat("Loading libraries... ")} 44 | suppressPackageStartupMessages(library(reshape2)) 45 | suppressPackageStartupMessages(library(ggplot2)) 46 | if (opt$verbose) {cat("DONE\n\n")} 47 | 48 | 49 | # Wrapper for hypergeometric probaility 50 | 51 | hyper.test = function(x) { 52 | x = as.numeric(x) 53 | q = x[1]; 54 | m = x[2]; 55 | n = x[4]-m; 56 | k = x[3]; 57 | phyper(q, m, n, k, lower.tail=FALSE) 58 | } 59 | 60 | 61 | 62 | # ======== # 63 | # BEGIN # 64 | # ======== # 65 | 66 | 67 | # Read data 68 | 69 | if (opt$input == "stdin") {inF = file("stdin")} else {inF = opt$input} 70 | m = read.table(inF, h=opt$header) 71 | 72 | 73 | db = read.table(opt$db, h=T, sep="\t", quote="", comment.char="%") 74 | 75 | formula = as.formula(paste(colnames(db)[2],"~",colnames(db)[1])) 76 | db_counts = setNames(aggregate(formula, db, length), c("feature", "total")) 77 | list_db = db[db[,2] %in% m[,1], ] 78 | list_counts = setNames(aggregate(formula, list_db, length), c("feature", "counts")) 79 | 80 | df = merge(list_counts, db_counts, by="feature") 81 | df$gene_list = length(unique(list_db[,2])) 82 | df$universe = length(unique(db[,2])) 83 | df$p.value = apply(df[-1], 1, hyper.test) 84 | df$FDR = p.adjust(df$p.value, method="BH") 85 | 86 | df = df[order(df$FDR),] 87 | 88 | # Format and write output 89 | 90 | df$p.value <- format(df$p.value, digits=2) 91 | df$FDR <- format(df$FDR, digits=2) 92 | 93 | output = ifelse(opt$output == "stdout", "", opt$output) 94 | write.table(df, output, quote=FALSE, row.names=FALSE, sep="\t") 95 | 96 | 97 | q(save='no') 98 | -------------------------------------------------------------------------------- /missing.values.R: -------------------------------------------------------------------------------- 1 | 2 | ##------------ 3 | ## LIBRARIES 4 | ##------------ 5 | suppressPackageStartupMessages(library(reshape2)) 6 | suppressPackageStartupMessages(library(ggplot2)) 7 | suppressPackageStartupMessages(library("optparse")) 8 | suppressPackageStartupMessages(library(plyr)) 9 | 10 | 11 | options(stringsAsFactors=F) 12 | pseudocount = 1e-04 13 | cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#000000", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 14 | 15 | ################## 16 | # OPTION PARSING 17 | ################## 18 | 19 | 20 | option_list <- list( 21 | make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze"), 22 | make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"), 23 | make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount), 24 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"), 25 | make_option(c("-o", "--output"), help="additional tags for otuput [default=%default]", default='out'), 26 | make_option(c("-t", "--tags"), help="comma-separated field names you want to display in the labels", default="cell,sex,age") 27 | ) 28 | 29 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 30 | arguments <- parse_args(parser, positional_arguments = TRUE) 31 | opt <- arguments$options 32 | print(opt) 33 | 34 | 35 | 36 | ##--------------------## 37 | ## CLUSTERING SAMPLES ## 38 | ##--------------------## 39 | 40 | # read the matrix from the command line 41 | m = read.table(opt$input_matrix, h=T) 42 | 43 | # remove potential gene id columns 44 | char_cols <- which(sapply(m, class) == 'character') 45 | sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols) 46 | if (length(char_cols) == 0) {genes = rownames(m)} 47 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]} 48 | 49 | # substitute the matrix with its log if required by the user 50 | if (opt$log) {m = log2(replace(m, is.na(m), 0) + opt$pseudocount)} 51 | 52 | # read the metadata from the metadata file 53 | mdata = read.table(opt$metadata, h=T, sep='\t') 54 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x)) 55 | 56 | # plot the number of missing values and zeros 57 | stat = data.frame(NAs = apply(m,2,function(x) sum(is.na(x))), 58 | zeros = apply(m,2,function(x) sum(x==0,na.rm=T)), pos_values = apply(m,2,function(x) sum(!is.na(x)&x!=0))) 59 | labels = strsplit(opt$tags, ',')[[1]] 60 | df = merge(unique(mdata[c('labExpId', labels)]), stat, by.y='row.names', by.x='labExpId') 61 | df = melt(df, ids = c('labExpId',labels), variable = 'values', value.name="Number_of_genes") 62 | df$labels = apply(df[labels], 1, paste, collapse='_' ) 63 | 64 | 65 | output = sprintf("log_%s.psd_%s.%s.missing.values", opt$log, opt$pseudocount, opt$output) 66 | pdf(sprintf('%s.pdf', output), h=log2(ncol(m)), w=7) 67 | gp = ggplot(df, aes(x=labExpId, y=Number_of_genes)) 68 | gp = gp + geom_bar(aes(fill=values), stat='identity') 69 | gp = gp + coord_flip() 70 | gp = gp + scale_x_discrete(labels=df$labels) 71 | gp = gp + labs(x='') 72 | gp = gp + theme(axis.text=element_text(size=10/log10(ncol(m)))) 73 | gp 74 | dev.off() 75 | 76 | q(save='no') 77 | -------------------------------------------------------------------------------- /functions.R: -------------------------------------------------------------------------------- 1 | 2 | # 3 | entropy = function(x, na.rm=T) {x = x[!is.na(x)]; x =x[x!=0]; p=x/sum(x); if (length(x) != 0) {return(-sum(p*log(p)))} else {NA}} 4 | 5 | # 6 | nentropy = function(x, na.rm=T) {y = x[!is.na(x)]; y =y[y!=0]; p=y/sum(y); if (length(y) != 0) {return(-sum(p*log(p))/log(length(x)))} else {NA}} 7 | 8 | # 9 | tau = function(x, na.rm=T) {return(sum(1-x/max(x))/(length(x)-1))} 10 | 11 | # 12 | cv = function(x, na.rm=T) {sd(x, na.rm=na.rm)/mean(x, na.rm=na.rm)} 13 | 14 | # 15 | logit = function(x, base=exp(1)) {return(log(x/(1-x), base=base))} 16 | 17 | # range 18 | range = function(x, na.rm=T) {return(max(x, na.rm=na.rm) - min(x, na.rm=na.rm))} 19 | 20 | # Dynamic range 21 | dynRange = function(x, na.rm=T) { 22 | #if (is.character(x) || is.factor(x)) {cat('char\n');return(NA)} 23 | x = as.numeric(x); 24 | x = replace(x, x==0, NA); 25 | if (sum(!is.na(x)) < 2) { 26 | return(NA) 27 | } else{ 28 | return(log10(max(x, na.rm=T)) - log10(min(x, na.rm=T))) 29 | } 30 | } 31 | 32 | 33 | # Shuffle data.frame 34 | shuffleData = function(df) { 35 | s = matrix(sample(unlist(m), ncol(m)*nrow(m)), ncol=ncol(m), nrow=nrow(m)) 36 | rownames(s) <- rownames(df) 37 | colnames(s) <- colnames(df) 38 | return(as.data.frame(s)) 39 | } 40 | 41 | # Extract the legend from ggplot 42 | g_legend<-function(a.gplot){ 43 | tmp <- ggplot_gtable(ggplot_build(a.gplot)) 44 | leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box") 45 | legend <- tmp$grobs[[leg]] 46 | return(legend) 47 | } 48 | 49 | 50 | 51 | #~~~~~~~~~~~~~~~~~~~~~~ 52 | # ggplot2 custom theme 53 | #~~~~~~~~~~~~~~~~~~~~~~ 54 | 55 | # Modified from theme_bw() from this site http://sape.inf.usi.ch/quick-reference/ggplot2/themes 56 | 57 | #theme_minimal <- function(base_size = 12) { 58 | # library(grid) 59 | # structure(list( 60 | # axis.line = element_blank(), 61 | # axis.text.x = element_text(size = base_size * 0.8 , lineheight = 0.9, vjust = 1), 62 | # axis.text.y = element_text(size = base_size * 0.8, lineheight = 0.9, hjust = 1), 63 | # axis.ticks = element_line(colour = "black", size = 0.2), 64 | # axis.title.x = element_text(size = base_size, vjust = 1), 65 | # axis.title.y = element_text(size = base_size, angle = 90, vjust = 0.5), 66 | # axis.ticks.length = unit(0.3, "lines"), 67 | # axis.ticks.margin = unit(0.5, "lines"), 68 | # 69 | # legend.background = element_rect(colour=NA), 70 | # legend.key = element_rect(colour = "grey80"), 71 | # legend.key.size = unit(1.2, "lines"), 72 | # legend.text = element_text(size = base_size * 0.8), 73 | # legend.title = element_text(size = base_size * 0.8, face = "bold", hjust = 0), 74 | # legend.position = "right", 75 | # 76 | # panel.background = element_rect(fill = "white", colour = NA), 77 | # panel.border = element_rect(fill = NA, colour="grey50"), 78 | # panel.grid.major = element_line(colour = "grey90", size = 0.2), 79 | # panel.grid.minor = element_line(colour = "grey98", size = 0.5), 80 | # panel.margin = unit(0.25, "lines"), 81 | # 82 | # strip.background = element_rect(fill = "grey80", colour = "grey50"), 83 | # strip.text.x = element_text(size = base_size * 0.8), 84 | # strip.text.y = element_text(size = base_size * 0.8, angle = -90), 85 | # 86 | # plot.background = element_rect(colour = NA), 87 | # plot.title = element_text(size = base_size * 1.2), 88 | # plot.margin = unit(c(1, 1, 0.5, 0.5), "lines") 89 | # ), class = "options") 90 | #} 91 | 92 | 93 | -------------------------------------------------------------------------------- /plot.isoform.levels.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library("reshape2")) 4 | suppressPackageStartupMessages(library("ggplot2")) 5 | suppressPackageStartupMessages(library("optparse")) 6 | suppressPackageStartupMessages(library("plyr")) 7 | 8 | options(stringsAsFactors=FALSE) 9 | 10 | 11 | ################## 12 | # OPTION PARSING 13 | ################## 14 | 15 | 16 | option_list <- list( 17 | make_option(c("-i", "--input_matrix"), help="the matrix with the transcript rpkms and gene id, columns names are trid, gnid, labExpId1, labExpId2, ..."), 18 | make_option(c("-l", "--log"), help="apply the log"), 19 | make_option(c("-G", "--gene"), help="the gene of interest"), 20 | make_option(c("-o", "--output"), default="isoforms.pdf", help="choose the name for the output file (with extension) [default=%default]"), 21 | make_option(c("-m", "--metadata"), help="metadata file"), 22 | make_option(c("--merge_mdata_on"), default="labExpId", help="metadata column with headers of the input matrix [default=%default]"), 23 | make_option(c("-f", "--field"), default="cell", help="column name of label [default=%default]"), 24 | make_option(c("-c", "--colors"), default="~/R/palettes/rainbow.6.txt", help="color file, the format is RGB [default=%default]"), 25 | make_option(c("-W", "--width"), default=7, help="width of the plot, in inches [default=%default]"), 26 | make_option(c("-r", "--representation"), default="stack", help="visualization method: stack | [defuault=%default]") 27 | 28 | #make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"), 29 | #make_option(c("-d", "--de"), help='output of edgeR for differentially expressed genes') 30 | ) 31 | 32 | 33 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 34 | arguments <- parse_args(parser, positional_arguments = TRUE) 35 | opt <- arguments$options 36 | 37 | #print(opt) 38 | 39 | #-------------------- 40 | # FUNCTIONS 41 | #-------------------- 42 | 43 | palette = as.character(read.table(opt$colors, h=F, comment.char="%")$V1) 44 | 45 | m = read.table(opt$input_matrix, h=T) 46 | colnames(m)[1:2] <- c("trid", "gnid") 47 | m = subset(m, gnid == opt$gene) 48 | 49 | if (nrow(m) == 1) { 50 | cat("The gene has only one transcript! \n") 51 | cat("EXIT \n") 52 | q(save="no") 53 | } 54 | 55 | m = melt(m) 56 | 57 | m = ddply(m, .(gnid, variable), transform, sum=sum(value, na.rm=T)) 58 | m$ratio = with(m, value/sum*100) 59 | m$sum = signif(m[,"sum"], 3) 60 | 61 | print(head(m)) 62 | 63 | # Read metadata 64 | if (!is.null(opt$metadata)) { 65 | field = opt$field 66 | # opt$merge_mdata_on = "labExpId" 67 | mdata = read.table(opt$metadata, h=T, sep="\t") 68 | mdata[,opt$merge_mdata_on] <- gsub(",",".", mdata[,opt$merge_mdata_on]) 69 | mdata = unique(mdata[,unique(c(field, opt$merge_mdata_on))]) 70 | m = merge(m, mdata, by.x="variable", by.y=opt$merge_mdata_on) 71 | } 72 | 73 | print(head(m)) 74 | 75 | if (opt$representation == "stack") { 76 | gp = ggplot(m, aes_string(x=field, y="ratio")) 77 | # gp = gp + geom_bar(position=position_stack(width=1), stat="identity", aes(fill=trid), color="black") 78 | gp = gp + geom_bar(position="stack", stat="identity", aes(fill=trid), color="black") 79 | gp = gp + geom_text(aes_string(x=field, y=102, label="sum"), angle=45, hjust=0) 80 | gp = gp + scale_fill_manual(values=palette, opt$gene) 81 | gp = gp + theme(axis.text.x = element_text(angle=45, hjust=1)) 82 | gp = gp + labs(x="") 83 | gp = gp + scale_y_continuous(limits=c(0,105)) 84 | } 85 | 86 | 87 | ggsave(opt$output, w=opt$width, h=6) 88 | 89 | q(save="no") 90 | -------------------------------------------------------------------------------- /matrix_to_dist.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors=F) 4 | 5 | # DEBUGGING OPTIONS 6 | 7 | opt = list() 8 | opt$input_matrix = "~abreschi/Documents/db/human/gencode18/Flux/encode.promo.human.gene.RPKM.idr_01.thr_0.names_False.tsv" 9 | opt$verbose = TRUE 10 | opt$log10 = FALSE 11 | opt$pseudocount = 1e-03 12 | opt$cor = "pearson" 13 | 14 | ################## 15 | # OPTION PARSING 16 | ################## 17 | 18 | suppressPackageStartupMessages(library("optparse")) 19 | 20 | option_list <- list( 21 | make_option(c("-i", "--input_matrix"), default="stdin", 22 | help="the matrix you want to analyze. \"stdin\" for reading from standard input [default=%default]"), 23 | make_option(c("-v", "--verbose"),action="store_true", default=FALSE, 24 | help="if you want detailed output"), 25 | make_option(c("-l", "--log10"), action="store_true", default=FALSE, 26 | help="apply the log10"), 27 | make_option(c("-p", "--pseudocount"), type="double", default=0, 28 | help="specify a pseudocount for the log [default=%default]. NAs are replaced by 0s"), 29 | make_option(c("-c", "--cor"), 30 | help="choose the correlation method"), 31 | make_option(c("-d", "--dist"), 32 | help="choose the distance method"), 33 | make_option(c("-k", "--keep_na"), action="store_true", default=FALSE, 34 | help="use this if you want to keep the NAs. By default they are converted to 0. [default=%default]"), 35 | make_option(c("-o", "--output"), default="stdout", 36 | help="a name for the output. \"stdout\" to print on standard output [default=%default]") 37 | ) 38 | 39 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 40 | arguments <- parse_args(parser, positional_arguments = TRUE) 41 | opt <- arguments$options 42 | 43 | # Stop if both the correlation method and the distance are different from NULL 44 | if (!is.null(opt$cor) & !is.null(opt$dist)) { 45 | cat("ERROR: cannot compute both the correlation and the distance\n") 46 | q(save="no") 47 | } 48 | 49 | if (opt$verbose) {print(opt)} 50 | 51 | ############## 52 | # BEGIN 53 | ############## 54 | 55 | # read input table 56 | if (opt$input_matrix == "stdin") {input=file("stdin")} else {input=opt$input_matrix} 57 | m = read.table(input, h=T) 58 | 59 | 60 | # remove potential gene id columns 61 | char_cols <- which(sapply(m, class) == 'character') 62 | if (opt$verbose) {sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols)} 63 | if (length(char_cols) == 0) {genes = rownames(m)} 64 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]} 65 | 66 | # Replace NAs with 0s if not asked otherwise 67 | if (!(opt$keep_na)) {m = replace(m, is.na(m), 0)} 68 | 69 | # apply the log if required 70 | if (opt$log10) {m = log10(m + opt$pseudocount)} 71 | 72 | 73 | # compute the correlation 74 | if (!is.null(opt$cor)) { 75 | df = cor(m, use='p', method=opt$cor) 76 | } 77 | 78 | # compute the distance 79 | if (!is.null(opt$dist)) { 80 | if (opt$dist == "p") {opt$dist <- "pearson"} 81 | if (opt$dist == "s") {opt$dist <- "spearman"} 82 | if (opt$dist != "spearman" & opt$dist != "pearson") { 83 | df = as.matrix(dist(t(m), method=opt$dist)) 84 | } else { 85 | df = 1 - abs(cor(m, use="p", method=opt$dist)) 86 | } 87 | } 88 | 89 | dec = 3 90 | 91 | df = round(df, dec) 92 | 93 | # print the results 94 | if (opt$output == "stdout") { 95 | write.table(df, file="", quote=FALSE, sep="\t") 96 | } else { 97 | write.table(df, file=opt$output, quote=FALSE, sep="\t") 98 | } 99 | 100 | q(save="no") 101 | -------------------------------------------------------------------------------- /element.detected.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ##------------ 4 | ## LIBRARIES 5 | ##------------ 6 | suppressPackageStartupMessages(library(reshape2)) 7 | suppressPackageStartupMessages(library(ggplot2)) 8 | suppressPackageStartupMessages(library("optparse")) 9 | suppressPackageStartupMessages(library(plyr)) 10 | 11 | 12 | #options(stringsAsFactors=F) 13 | pseudocount = 1e-04 14 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 15 | 16 | ################## 17 | # OPTION PARSING 18 | ################## 19 | 20 | 21 | option_list <- list( 22 | make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze"), 23 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"), 24 | make_option(c("-o", "--output"), help="additional flags for otuput", default="out"), 25 | make_option(c("-W", "--width"), default=5, help="width in inches [default=%default]"), 26 | make_option(c("-H", "--height"), default=5, help="height in inches [default=%default]"), 27 | #make_option(c("-c", "--color_by"), help="choose the color you want to color by [default=NA]", type='character', default=NA), 28 | make_option(c("-f", "--field"), help="dashboard field by which the individuals are grouped") 29 | #make_option(c("-t", "--tags"), help="comma-separated field names you want to display in the labels", default="cell,sex,age") 30 | ) 31 | 32 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 33 | arguments <- parse_args(parser, positional_arguments = TRUE) 34 | opt <- arguments$options 35 | print(opt) 36 | 37 | na2null = function(x) if(is.na(x)) {return(NULL)}else{return(x)} 38 | 39 | 40 | ##--------------------## 41 | ## CLUSTERING SAMPLES ## 42 | ##--------------------## 43 | output = sprintf("%s.%s", basename(opt$input_matrix), opt$output) 44 | 45 | # read the matrix from the command line 46 | m = read.table(opt$input_matrix, h=F, col.names=c("element","labExpId","n_det_el","prop","tag")) 47 | 48 | # read the metadata from the metadata file 49 | mdata = read.table(opt$metadata, h=T, sep='\t') 50 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x)) 51 | 52 | # prepare data.frame for ggplot 53 | df = merge(subset(m, tag=="individual"), unique(mdata[c("labExpId", opt$field)]), by='labExpId') 54 | 55 | 56 | # duplicate the data.frame to plot all if an opt$field is specified 57 | if (!is.null(opt$field)) { 58 | df_copy = df; 59 | df_copy[opt$field]= "all"; 60 | x_axis = opt$field 61 | # attach the two data frames 62 | new_df = rbind(df, df_copy) 63 | }else{ 64 | x_axis='x_lab'; 65 | df$x_lab = 'all' 66 | new_df = df} 67 | 68 | 69 | # order the elements with the hierarchy: junction, exon, transcript, gene 70 | new_df$element <- factor(new_df$element, levels=c('junction','exon','transcript','gene')) 71 | 72 | n_colors = length(unique(subset(m, tag=='cumulative')$labExpId)) 73 | gp = ggplot(new_df, aes_string(x=x_axis, y="prop")) 74 | gp = gp + geom_boxplot() 75 | gp = gp + facet_grid(element~.) 76 | gp = gp + geom_point(data=subset(m, tag=='cumulative'), aes_string(x="labExpId", y="prop"), size = 4, alpha = 0.7, col=cbbPalette[2:2+n_colors]) 77 | gp = gp + labs(y='Proportion of detected elements (%)', x="") 78 | gp = gp + theme(axis.text = element_text(size=13)) 79 | gp = gp + theme(axis.text.x = element_text(angle=45, hjust=1)) 80 | #gp 81 | 82 | w=opt$width 83 | h=opt$height 84 | 85 | ggsave(filename=sprintf("%s.pdf", output), h=h, w=w) 86 | ggsave(filename=sprintf("%s.png", output), h=h, w=w) 87 | ggsave(filename=sprintf("%s.eps", output), h=h, w=w) 88 | 89 | 90 | q(save='no') 91 | -------------------------------------------------------------------------------- /gene.pair.correlation.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | options(stringsAsFactors=FALSE) 5 | cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#000000", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 6 | 7 | ################## 8 | # OPTION PARSING 9 | ################## 10 | 11 | suppressPackageStartupMessages(library("optparse")) 12 | 13 | option_list <- list( 14 | 15 | make_option(c("-i", "--input"), 16 | help="Matrix with expression values"), 17 | 18 | make_option(c("-G", "--gene_pairs"), default="stdin", 19 | help="File or stdin. Two-column files with pairs of elements [default=%default]"), 20 | 21 | make_option(c("-o", "--output"), default="cor.out.tsv", 22 | help="Output file name or stdout [default=%default]"), 23 | 24 | make_option(c("--header"), action="store_true", default=FALSE, 25 | help="Use this if the input has a header [default=%default]"), 26 | 27 | make_option(c("-l", "--log"), action="store_true", default=FALSE, 28 | help="log10 of expression values [default=%default]"), 29 | 30 | make_option(c("-p", "--pseudocount"), default=1e-04, 31 | help="pseudocount"), 32 | 33 | make_option(c("-k", "--keep_na"), action="store_true", default=FALSE, 34 | help="do not replace NAs with 0 [default=%default]"), 35 | 36 | make_option(c("-m", "--method"), default="pearson", 37 | help="method for correlation: pearson | spearman [default=%default]"), 38 | 39 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 40 | help="if you want more output [default=%default]") 41 | 42 | ) 43 | 44 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 45 | arguments <- parse_args(parser, positional_arguments = TRUE) 46 | opt <- arguments$options 47 | if (opt$verbose) {print(opt)} 48 | 49 | #------------ 50 | # LIBRARIES 51 | #------------ 52 | 53 | if (opt$verbose) {cat("Loading libraries... ")} 54 | suppressPackageStartupMessages(library(reshape2)) 55 | suppressPackageStartupMessages(library(ggplot2)) 56 | #suppressPackageStartupMessages(library(plyr)) 57 | if (opt$verbose) {cat("DONE\n\n")} 58 | 59 | 60 | # ======== # 61 | # BEGIN # 62 | # ======== # 63 | 64 | #input_files = strsplit(opt$input_files, ",")[[1]] 65 | #labels = strsplit(opt$labels, ",")[[1]] 66 | #vertical_lines = as.numeric(strsplit(opt$vertical_lines, ",")[[1]]) 67 | 68 | 69 | # Read data 70 | 71 | if (opt$gene_pairs == "stdin") { 72 | pairs = read.table(file("stdin"), h=opt$header) 73 | } else { 74 | pairs = read.table(opt$gene_pairs, h=opt$header) 75 | } 76 | 77 | m = read.table(opt$input, h=T) 78 | 79 | if (!(opt$keep_na)) { 80 | m = replace(m, is.na(m), 0) 81 | } 82 | 83 | if (opt$log) { 84 | m = log10(m + opt$pseudocount) 85 | } 86 | 87 | 88 | df = melt(as.matrix(m), value.name="value1", varnames=c("Var1", "variable")) 89 | df = merge(df, pairs, by.x="Var1", by.y=colnames(pairs)[1]) 90 | df = merge(df, melt(as.matrix(m), value.name="value2", varnames=c("Var2", "variable")), 91 | by.x=c(colnames(pairs)[2], "variable"), by.y=c("Var2","variable")) 92 | df$split_by = apply(df[c(colnames(pairs)[2],"Var1")], 1, paste, collapse="_") 93 | c = sapply(split(df, df$split_by), function(x) cor(x$value1, x$value2, u="p", m=opt$method)) 94 | 95 | df = merge(data.frame(c=c, split_by=names(c)), df) 96 | df = unique(df[c("Var1", colnames(pairs)[2], "c")]) 97 | 98 | #print(head(df)) 99 | output = ifelse(opt$output == "stdout", "", opt$output) 100 | write.table(df, output, row.names=FALSE, col.names=FALSE, sep="\t", quote=FALSE) 101 | 102 | #warnings() 103 | 104 | 105 | q(save='no') 106 | 107 | -------------------------------------------------------------------------------- /plot.bigWig.profile.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ##------------ 4 | ## LIBRARIES 5 | ##------------ 6 | cat("Loading libraries... ") 7 | suppressPackageStartupMessages(library(reshape2)) 8 | suppressPackageStartupMessages(library(ggplot2)) 9 | suppressPackageStartupMessages(library("optparse")) 10 | #suppressPackageStartupMessages(library(plyr)) 11 | cat("DONE\n\n") 12 | 13 | options(stringsAsFactors=F) 14 | pseudocount = 1e-04 15 | cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#000000", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 16 | 17 | ################## 18 | # OPTION PARSING 19 | ################## 20 | 21 | option_list <- list( 22 | 23 | make_option(c("-i", "--input_files"), type="character", 24 | help="list of files with the bigWig profiles, comma-separated. If only one file is given, can be stdin"), 25 | make_option(c("--header"), action="store_true", default=FALSE, 26 | help="The files have header [default=%default]"), 27 | make_option(c("-L", "--labels"), help="list of labels with the labels of each file, commma-separated.\n 28 | They must be in the same order as the file list", type="character"), 29 | make_option(c("-v", "--vertical_lines"), help="specify where you want the vertical lines [default=%default]", type="character", default="0"), 30 | #make_option(c("-f", "--facet"), type="integer", help="column index to facet"), 31 | make_option(c("-t", "--title"), help="Main title for the plot [default=%default]", default=""), 32 | make_option(c("-o", "--output"), help="output file name with extension [default=%default]", default="profile.pdf"), 33 | make_option(c("-y", "--y_title"), help="title for the y-axis [default=%default]", default="norm_read_density") 34 | #make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"), 35 | ) 36 | 37 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 38 | arguments <- parse_args(parser, positional_arguments = TRUE) 39 | opt <- arguments$options 40 | #print(opt) 41 | 42 | # ======== 43 | # BEGIN 44 | # ======== 45 | if (!is.null(opt$labels)) {labels = strsplit(opt$labels, ",")[[1]]} 46 | 47 | vertical_lines = as.numeric(strsplit(opt$vertical_lines, ",")[[1]]) 48 | 49 | 50 | # Read data 51 | 52 | if (opt$input_files != "stdin") { 53 | input_files = strsplit(opt$input_files, ",")[[1]] 54 | i = 1 55 | for (f in input_files) { 56 | if (i == 1 ) { 57 | m = read.table(f, h=opt$header) 58 | colnames(m)[1] <- "position" 59 | colnames(m)[2] <- labels[i] 60 | } 61 | if (i != 1) { 62 | tmp = read.table(f, h=F) 63 | colnames(tmp)[1] <- "position" 64 | colnames(tmp)[2] <- labels[i] 65 | m = merge(m, tmp, by = "position") 66 | } 67 | i = i+1 68 | } 69 | } else { 70 | m = read.table(file('stdin'), h=opt$header) 71 | colnames(m)[1] <- "position" 72 | if (is.null(opt$labels)) { 73 | colnames(m)[2] <- 'label' 74 | } else { 75 | colnames(m)[2] <- labels[1] 76 | } 77 | } 78 | 79 | 80 | # Melt data 81 | 82 | df = melt(m, id.vars="position") 83 | 84 | 85 | # GGPLOT 86 | 87 | theme_set(theme_bw(base_size=20)) 88 | 89 | gp = ggplot(df, aes(x=position, y=value)) 90 | gp = gp + geom_line(aes(group=variable, color=variable), alpha=0.5, size=1) 91 | gp = gp + geom_vline(xintercept=vertical_lines, color="grey", linetype="longdash") 92 | gp = gp + labs(title=opt$title, y=opt$y_title) 93 | gp = gp + scale_color_manual(values=cbbPalette) 94 | if (is.null(opt$label)) {gp = gp + theme(legend.position='none')} 95 | 96 | ggsave(opt$output, h=5, w=7) 97 | 98 | # EXIT 99 | quit(save='no') 100 | -------------------------------------------------------------------------------- /modularity.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors=F) 4 | 5 | ################## 6 | # OPTION PARSING 7 | ################## 8 | 9 | suppressPackageStartupMessages(library("optparse")) 10 | 11 | option_list <- list( 12 | make_option(c("-i", "--input_matrix"), default="stdin", 13 | help="Adjacency matrix in R format: the header is N-1 columns [default=%default] 14 | 15 | Example: 16 | 17 | sample1\tsample2\tsample3 18 | sample1\t1\t0\t1 19 | sample2\t1\t1\t0 20 | sample3\t0\t0\t1 21 | "), 22 | 23 | make_option(c("--diag"), default=FALSE, action="store_true", 24 | help="Consider self-edges. [default=%default]"), 25 | 26 | #make_option(c("-l", "--log"), action="store_true", default=FALSE, help="Log-transform the data [default=%default]"), 27 | 28 | #make_option(c("-p", "--pseudocount"), type="double", help="Pseudocount for the log [default=%default]"), 29 | 30 | #make_option(c("--d1"), help="Design for voom"), 31 | #make_option(c("-d", "--design"), help="Design for removing the batch effect (not including the batch effect)"), 32 | make_option(c("-C", "--community"), help="Column in the metadata with the information of the community"), 33 | 34 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"), 35 | make_option(c("-G", "--merge_mdata_on"), default="labExpId", 36 | help="Column in the metadata with the header of the input matrix [default=%default]"), 37 | 38 | make_option(c("-o", "--output"), default="stdout", help="output file name [default=%default]"), 39 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, help="verbose output [default=%default]") 40 | ) 41 | 42 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list, description="\nNormalize a matrix") 43 | arguments <- parse_args(parser, positional_arguments = TRUE) 44 | opt <- arguments$options 45 | if (opt$verbose) {print(opt)} 46 | 47 | 48 | 49 | # LIBRARIES 50 | 51 | suppressPackageStartupMessages(library(igraph)) 52 | #suppressPackageStartupMessages(library(limma)) 53 | 54 | 55 | ##--------------------## 56 | ## BEGIN ## 57 | ##--------------------## 58 | 59 | 60 | # read the matrix from the command line 61 | if(opt$input_matrix == "stdin"){inF=file("stdin")}else{inF=opt$input_matrix} 62 | m = read.table(inF, h=T, sep="\t", quote=NULL) 63 | 64 | # Replace missing values with 0 65 | m = replace(m, is.na(m), 0) 66 | 67 | #if (opt$log) {m = log10(m + opt$pseudocount)} 68 | 69 | g = graph.adjacency(as.matrix(m), mode='directed', diag=opt$diag, add.colnames=TRUE) 70 | V = vertex.attributes(g)[[1]] 71 | 72 | # =========================== Metadata ======================= 73 | 74 | merge_mdata_on = opt$merge_mdata_on 75 | # read the metadata 76 | mdata = read.table(opt$metadata, h=T, sep="\t", quote=NULL, comment.char="") 77 | # Get the fields from the formula 78 | if (is.null(opt$community)) { 79 | cat("ERROR: please specify the batch variable\n") 80 | q(save='no') 81 | } 82 | fields = opt$community 83 | mdata[opt$merge_mdata_on] <- gsub("[-,+]", ".", mdata[,opt$merge_mdata_on]) 84 | # Format the metadata 85 | mdata = unique(mdata[unique(c(merge_mdata_on, fields))]) 86 | rownames(mdata) <- mdata[,merge_mdata_on] 87 | mdata <- mdata[match(colnames(m), mdata[,merge_mdata_on]),, drop=FALSE] 88 | 89 | community = mdata[match(colnames(m), mdata[, opt$merge_mdata_on]), opt$community] 90 | 91 | 92 | modul = modularity(g, as.numeric(factor(community))) 93 | 94 | # =================== OUTPUT ====================== 95 | 96 | out = paste(ecount(g), modul, sep="\t") 97 | cat(out,"\n") 98 | #cat(modul, "\n") 99 | 100 | q(save='no') 101 | -------------------------------------------------------------------------------- /sort.rpkm.lines.R: -------------------------------------------------------------------------------- 1 | 2 | ##------------ 3 | ## LIBRARIES 4 | ##------------ 5 | suppressPackageStartupMessages(library(reshape2)) 6 | suppressPackageStartupMessages(library(ggplot2)) 7 | suppressPackageStartupMessages(library("optparse")) 8 | suppressPackageStartupMessages(library(plyr)) 9 | 10 | 11 | options(stringsAsFactors=F) 12 | pseudocount = 1e-04 13 | cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#000000", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 14 | 15 | ################## 16 | # OPTION PARSING 17 | ################## 18 | 19 | 20 | option_list <- list( 21 | make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze"), 22 | make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"), 23 | make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount), 24 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"), 25 | make_option(c("-o", "--output"), help="additional tags for otuput", default="out"), 26 | make_option(c("-c", "--color_by"), help="choose the factor you want to color by", type='character'), 27 | make_option(c("-L", "--linetype_by"), help="choose the factor you want to change the linetype by. Leave empty for none."), 28 | make_option(c("-v", "--value"), help="the value the you are plotting [default=%default]", default="RPKM"), 29 | make_option(c("-t", "--tags"), help="comma-separated field names you want to display in the labels", default="cell,sex,age") 30 | ) 31 | 32 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 33 | arguments <- parse_args(parser, positional_arguments = TRUE) 34 | opt <- arguments$options 35 | print(opt) 36 | 37 | na2null = function(x) if(is.na(x)) {return(NULL)}else{return(x)} 38 | 39 | 40 | ##--------------------## 41 | ## CLUSTERING SAMPLES ## 42 | ##--------------------## 43 | output = sprintf("log_%s.pseudo_%s.colby_%s.%s", opt$log, opt$pseudocount, opt$color_by, opt$output) 44 | 45 | # read the matrix from the command line 46 | m = read.table(opt$input_matrix, h=T) 47 | ylab = 'rpkm' 48 | 49 | # remove potential gene id columns 50 | char_cols <- which(sapply(m, class) == 'character') 51 | sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols) 52 | if (length(char_cols) == 0) {genes = rownames(m)} 53 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]} 54 | 55 | #substitute the matrix with its log if required by the user 56 | if (opt$log) { 57 | m = log10(replace(m, is.na(m), 0) + opt$pseudocount); 58 | ylab = sprintf('log10(%s+%s)', opt$value, opt$pseudocount)} 59 | 60 | # read the metadata from the metadata file 61 | mdata = read.table(opt$metadata, h=T, sep='\t') 62 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x)) 63 | 64 | # prepare data.frame for ggplot 65 | df = melt(m, variable.name = "labExpId", value.name="rpkm") 66 | df = merge(unique(mdata[unique(c("labExpId", strsplit(opt$tags, ",")[[1]], opt$color_by, opt$linetype_by))]), df, by="labExpId") 67 | df$labels = apply(df[strsplit(opt$tags, ",")[[1]]], 1, paste, collapse="_") 68 | # add a column with the x index 69 | df = ddply(df, .(labels), transform, x=seq_along(labels), y=sort(rpkm, na.last=T,d=T)) 70 | 71 | 72 | ############### 73 | # OUTPUT 74 | ############### 75 | 76 | # plotting... 77 | pdf(sprintf("%s.pdf",output), h=5, w=5) 78 | 79 | theme_set(theme_bw()) 80 | 81 | gp = ggplot(df, aes(x=x, y=y, group=labels)) 82 | gp = gp + geom_line(aes_string(color=opt$color_by, linetype=opt$linetype_by)) 83 | gp = gp + labs(y=ylab, x=sprintf('rank(%s)', opt$value)) 84 | gp = gp + scale_color_manual(values = cbbPalette) 85 | gp = gp + scale_x_log10(expand=c(0,0)) 86 | gp = gp + annotation_logticks(sides="b") 87 | gp = gp + theme(axis.text = element_text(size=15)) 88 | gp 89 | 90 | dev.off() 91 | 92 | q(save='no') 93 | -------------------------------------------------------------------------------- /ggpie.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | # Seed for jittering 5 | set.seed(123) 6 | 7 | 8 | #~~~~~~~~~~~~ 9 | # Libraries 10 | #~~~~~~~~~~~~ 11 | 12 | suppressPackageStartupMessages(library("ggplot2")) 13 | suppressPackageStartupMessages(library("optparse")) 14 | suppressPackageStartupMessages(library("grid")) 15 | 16 | 17 | # OPTION PARSING 18 | 19 | option_list <- list( 20 | 21 | make_option(c("-i", "--input"), default="stdin", 22 | help="input file. [default=%default]"), 23 | 24 | make_option(c("--header"), default=FALSE, 25 | help="the file has header [deafult=%default]"), 26 | 27 | make_option(c("-o", "--output"), default="ggpie.out.pdf", 28 | help="output file name. Must have a proper image extension (e.g. .pdf, .png) [deafult=%default]"), 29 | 30 | make_option(c("-p", "--percentages"), type='integer', 31 | help="column with the proportions as percentages"), 32 | 33 | make_option(c("-n", "--counts"), type='integer', 34 | help="column with the counts. If you provide counts instead of percentages"), 35 | 36 | make_option(c("-f", "--fill_by"), default=2, type="integer", 37 | help="column with the levels for filling [default=%default]"), 38 | 39 | make_option(c("-P", "--palette"), default="/users/rg/abreschi/R/palettes/Set2.8.txt", 40 | help="palette file [default=%default]"), 41 | 42 | make_option(c("-T", "--ggtitle"), 43 | help="Main title. Leave empty for no title"), 44 | 45 | make_option(c("-t", "--fill_title"), 46 | help="title for the fill legend") 47 | 48 | ) 49 | 50 | 51 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 52 | arguments <- parse_args(parser, positional_arguments = TRUE) 53 | opt <- arguments$options 54 | 55 | #~~~~~~~~~~~~ 56 | # BEGIN 57 | #~~~~~~~~~~~~ 58 | 59 | # Check that counts and percentages are not provided simultaneously 60 | if (!is.null(opt$percentages) & !is.null(opt$counts)) { 61 | cat("ERROR: cannot provide both percentages and counts\nAborted\n") 62 | q(save='no') 63 | } 64 | 65 | if (is.null(opt$percentages) & is.null(opt$counts)) { 66 | cat("MISSING INPUT: provide either percentages or counts\nAborted\n") 67 | q(save='no') 68 | } 69 | 70 | 71 | 72 | # read input 73 | if (opt$input == "stdin") { 74 | m = read.table(file("stdin"), h=opt$header, sep="\t") 75 | } else { 76 | m = read.table(opt$input, h=opt$header, sep="\t") 77 | } 78 | 79 | # read palette file 80 | palette = as.character(read.table(opt$palette, h=FALSE, comment.char="%")$V1) 81 | 82 | # Compute proportions if counts are given 83 | if (!is.null(opt$counts)) { 84 | counts = colnames(m)[opt$counts] 85 | m$perc = m[,counts]/sum(m[,counts])*100 86 | opt$percentages = which(colnames(m) == "perc") 87 | } 88 | 89 | 90 | #~~~~~~~~~~~ 91 | # Plot 92 | #~~~~~~~~~~~ 93 | 94 | theme_set(theme_bw(base_size=20)) 95 | 96 | theme_update( 97 | panel.grid = element_blank(), 98 | panel.border = element_blank(), 99 | axis.ticks = element_blank(), 100 | axis.ticks.margin = unit(0,'in'), 101 | axis.title = element_blank(), 102 | plot.margin = unit(c(0.01, 0.01, 0.01, 0.01),'in'), 103 | legend.key = element_blank() 104 | ) 105 | 106 | 107 | 108 | l = cumsum(m[,opt$percentages]) - c(m[1,opt$percentages], diff(cumsum(m[,opt$percentages])))/2 109 | x_labels = paste(round(m[,opt$percentages], 1), "%", sep="") 110 | 111 | 112 | gp = ggplot(m, aes_string(x=1, y=colnames(m)[opt$percentages])) 113 | gp = gp + geom_bar(stat="identity", position="stack", aes_string(fill=colnames(m)[opt$fill_by])) 114 | gp = gp + coord_polar(theta="y") 115 | gp = gp + scale_fill_manual(values=palette, name=opt$fill_title) 116 | gp = gp + scale_x_continuous(labels=NULL) 117 | #gp = gp + scale_y_continuous(labels=x_labels, breaks=l) 118 | gp = gp + scale_y_continuous(labels=NULL, breaks=NULL) 119 | gp = gp + geom_text(aes(x=1.5, y=l, label=x_labels), position=position_jitter(w=0.0, h=2.0)) 120 | gp = gp + labs(title=opt$ggtitle) 121 | 122 | 123 | ggsave(opt$output, w=6, h=3.5) 124 | 125 | q(save='no') 126 | -------------------------------------------------------------------------------- /VennDiagram.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | # -- Variables -- 5 | 6 | options(stringsAsFactors=F) 7 | 8 | 9 | ################## 10 | # OPTION PARSING 11 | ################## 12 | 13 | suppressPackageStartupMessages(library("optparse")) 14 | 15 | option_list <- list( 16 | make_option(c("-l", "--lcol"), help="Comma-separeted colors for the lines of the sets. Only names accepted for the moment! [default: black]"), 17 | make_option(c("-f", "--fcol"), help="Comma-separated colors for the surfaces of the sets. Only names accepted for the moment! [default:palette]"), 18 | make_option(c("-L", "--Lcol"), help="Comma-separated colors for the labels of the sets. Only names accepted for the moment! [default:black]"), 19 | 20 | make_option(c("-W", "--width"), default=3, type='integer', 21 | help="width of the plot in inches [default=%default]"), 22 | 23 | make_option(c("-H", "--height"), default=3, type='integer', 24 | help="height of the plot in inches [default=%default]"), 25 | 26 | make_option(c("-o", "--output"), help="output file name WITHOUT extension [default=Venn.out]", default="venn.out") 27 | ) 28 | 29 | cat("\nNOTE: NAs are treated as strings and not as missing values\n\n", file=stderr()) 30 | 31 | parser <- OptionParser(usage = "%prog [options] file(s)", option_list=option_list) 32 | arguments <- parse_args(parser, positional_arguments = TRUE) 33 | opt <- arguments$options 34 | arg <- arguments$args 35 | #print(arguments) 36 | 37 | 38 | 39 | ##------------ 40 | ## LIBRARIES 41 | ##------------ 42 | 43 | cat("Loading libraries... ") 44 | suppressPackageStartupMessages(library('VennDiagram')) 45 | cat("DONE\n\n") 46 | 47 | 48 | ################ 49 | # BEGIN 50 | ################ 51 | 52 | # read the lists of elements from args 53 | venn_list=list() 54 | for (f in arg) { 55 | l = as.list(read.table(f, h=T)) 56 | venn_list = modifyList(venn_list, l) 57 | } 58 | 59 | for (i in seq_along(arg)) { 60 | l = as.list(read.table(arg[i], h=T, na.strings=NULL)) 61 | if (i==1) { 62 | venn_list = l 63 | merged = l[[1]] 64 | } 65 | if (i!=1) { 66 | venn_list = c(venn_list, l) 67 | merged = intersect(merged, l[[1]]) 68 | } 69 | } 70 | 71 | 72 | 73 | # graphical parameters 74 | #----------------------- 75 | 76 | # change the line colors 77 | if (is.null(opt$lcol)) { 78 | col <- rep('black', length(venn_list)) 79 | 80 | }else{ 81 | col <- strsplit(opt$lcol, ',')[[1]] 82 | } 83 | 84 | 85 | # change the surface colors 86 | if (is.null(opt$fcol)) { 87 | face_col <- rainbow(length(venn_list)) 88 | }else{ 89 | face_col = strsplit(opt$fcol, ',')[[1]] 90 | } 91 | 92 | 93 | # change the label colors 94 | if (is.null(opt$Lcol)) { 95 | label_col = rep('black', length(venn_list)) 96 | }else{ 97 | label_col <- strsplit(opt$Lcol, ',')[[1]] 98 | } 99 | 100 | 101 | # =============== 102 | # ** plotting... 103 | # =============== 104 | 105 | image_type = 'png' 106 | w = opt$width 107 | h = opt$height 108 | 109 | 110 | # Set the label distances 111 | cat.dist = (w*c(0.01,0.01,0,0,0))[1:length(venn_list)] 112 | 113 | if (length(venn_list) == 4) { 114 | cat.dist = (w*c(0.08, 0.08, 0.02, 0.02)) 115 | } 116 | 117 | # Set the label position 118 | if (length(venn_list) == 5) { 119 | cat.pos = c(20, 40, 40, 30, 10) 120 | } 121 | 122 | if (length(venn_list) == 4) { 123 | cat.pos = c(180, 180, 0, 0) 124 | } 125 | 126 | if (length(venn_list) == 3) { 127 | cat.pos = c(-20, 20, 20) 128 | } 129 | 130 | if (length(venn_list) == 2) { 131 | cat.pos = c(20, 30) 132 | } 133 | 134 | 135 | 136 | venn.diagram(venn_list, 137 | filename=sprintf("%s.%s", opt$output, image_type), 138 | imagetype=image_type, 139 | units='in', 140 | width=w, 141 | height=h, 142 | col=col, 143 | cat.col = label_col, 144 | cat.pos = cat.pos, 145 | cat.dist = cat.dist, 146 | cat.cex=c(0.8,0.8,0.8,0.8,0.8)[1:length(venn_list)], 147 | fill=face_col 148 | ) 149 | 150 | # writing the intersection 151 | write.table(data.frame(merged), file=sprintf("%s.tsv", opt$output), quote=F, row.names=F) 152 | 153 | q(save='no') 154 | 155 | 156 | -------------------------------------------------------------------------------- /KEGG_enrichment.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | suppressPackageStartupMessages(library("optparse")) 5 | options(stringsAsFactors=F) 6 | 7 | ################## 8 | # OPTION PARSING 9 | ################## 10 | 11 | 12 | option_list <- list( 13 | make_option(c("-u", "--universe"), help="a list of human gene identifiers (ensEMBL ids), NO header"), 14 | make_option(c("-G", "--genes"), help="a list of human gene identifiers for the foreground (ensEMBL ids), WITH header"), 15 | #make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"), 16 | #make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount), 17 | #make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"), 18 | make_option(c("-o", "--output"), help="additional tags for otuput [default=out]") 19 | #make_option(c("-f", "--fill_by"), help="choose the color you want to fill by [default=NA]", type='character', default=NA) 20 | ) 21 | 22 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 23 | arguments <- parse_args(parser, positional_arguments = TRUE) 24 | opt <- arguments$options 25 | print(opt) 26 | 27 | ##------------ 28 | ## LIBRARIES 29 | ##------------ 30 | suppressPackageStartupMessages(library("KEGG.db")) 31 | suppressPackageStartupMessages(library("org.Hs.eg.db")) 32 | suppressPackageStartupMessages(library("GOstats")) 33 | suppressPackageStartupMessages(library("plyr")) 34 | 35 | na2null = function(x) if(is.na(x)) {return(NULL)}else{return(x)} 36 | 37 | 38 | ############################ 39 | # BEGIN 40 | ############################ 41 | 42 | U = read.table(opt$universe, h=F, col.names='hs') 43 | G = read.table(opt$genes, h=T, col.names='hs') 44 | 45 | 46 | #orth = read.table('~/Documents/db/human-mouse/orthologs/merged_orthologs_HumanMouse_1_1_genes.tsv', h=F, col.names=c('hs','mm')) 47 | #genes = read.table('~/Documents/human-mouse/tissues/maxdiff_tissuespecific.tsv', h=F, col.names=c('hs','mm','sample')) 48 | 49 | # I want to create a list of parameters to perform GO enrichment on different gene sets 50 | 51 | # take the entrez gene ids for all the orthologous genes which will be my universe (the same for all the sets) 52 | universe = unlist(mget(U$hs, org.Hs.egENSEMBL2EG, ifnotfound=NA)) 53 | 54 | sprintf("%s background genes; %s with a corresponding entrez id", nrow(U), length(unique(universe))) 55 | # how many genes am I able to map? 56 | # First thing notice that also ensembl gene ids longer than 15 characters are included 57 | # if I remove these genes I end up with: 58 | # length(unique(as.character(universe[which(nchar(names(universe)) == 15)]))) ----> 15593 59 | 60 | 61 | createParams = function(x) { 62 | geneset = unlist(mget(x, org.Hs.egENSEMBL2EG, ifnotfound=NA)) 63 | sprintf("%s foreground genes; %s with a corresponding entrez id", length(x), length(unique(geneset))) 64 | pv = 1-(1-0.05)**(1/length(x)) 65 | params = new("KEGGHyperGParams", 66 | geneIds = geneset, 67 | universeGeneIds = universe, 68 | annotation = 'org.Hs.eg.db', 69 | pvalueCutoff = pv, 70 | testDirection='over') 71 | return(params)} 72 | 73 | res = hyperGTest(createParams(G$hs)) 74 | write.table(summary(res), file=sprintf("%s.tsv", opt$output), quote=F, sep="\t", row.names=F) 75 | htmlReport(res, file=sprintf("%s.html", opt$output)) 76 | 77 | q(save='no') 78 | 79 | 80 | #listOfParamObjs = aggregate(G$hs, list(G$sample), createParams)$x 81 | 82 | # run it (and hope it is ok! ) 83 | #resultList = lapply(listOfParamObjs, hyperGTest) 84 | 85 | # give the name to each GO enrichment 86 | #names(resultList) <- aggregate(genes$hs, list(genes$sample), length)[,1] 87 | 88 | # put the results in a data.frame 89 | #resultDf = rbind.fill(lapply(as.list(names(resultList)[-which(lapply(resultList, function(x) nrow(summary(x))) == 0)]), 90 | # function(n)data.frame(summary(resultList[[n]]), n_genes=length(geneIds(resultList[[n]])), sample=n))) 91 | 92 | # save the data.frame in a file 93 | #write.table(resultDf, file='maxdiff_tissuespecific_GO.tsv', quote=F, row.names=F, col.names=T, sep='\t') 94 | 95 | -------------------------------------------------------------------------------- /wordcloud.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | options(stringsAsFactors=FALSE) 5 | 6 | ################## 7 | # OPTION PARSING 8 | ################## 9 | 10 | suppressPackageStartupMessages(library("optparse")) 11 | 12 | option_list <- list( 13 | 14 | make_option(c("-i", "--input"), default="stdin", 15 | help="File or stdin [default=%default]"), 16 | 17 | make_option(c("--header"), action="store_true", default=FALSE, 18 | help="Use this if the input has a header [default=%default]"), 19 | 20 | make_option(c("-o", "--output"), default="wordcloud.pdf", 21 | help="Output file name [default=%default]"), 22 | 23 | make_option(c("-w", "--words"), default=1, 24 | help="Index of the column with the words [default=%default]"), 25 | 26 | make_option(c("-e", "--weights"), default=2, type="integer", 27 | help="Index of the column with weights [default=%default]"), 28 | 29 | make_option(c("-C", "--color_by"), type="integer", 30 | help="Index of the factor for coloring"), 31 | 32 | make_option(c("-P", "--palette"), default="/users/rg/abreschi/R/palettes/Spectral.11.txt", 33 | help='File with colors [default=%default]'), 34 | 35 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 36 | help="if you want more output [default=%default]") 37 | 38 | #make_option(c("--position"), default='dodge', 39 | # help="Position for histogram [default=%default]"), 40 | # 41 | #make_option(c("--scale_x_log10"), action="store_true", default=FALSE, 42 | # help="log10-transform x scale [default=%default]"), 43 | # 44 | #make_option(c("--scale_y_log10"), action="store_true", default=FALSE, 45 | # help="log10-transform y scale [default=%default]"), 46 | # 47 | #make_option(c("--y_title"), type="character", default="count", 48 | # help="Title for the y axis [default=%default]"), 49 | # 50 | #make_option(c("--x_title"), type="character", default="", 51 | # help="Title for the x axis [default=%default]"), 52 | # 53 | #make_option(c("-f", "--fill"), default="aquamarine", 54 | # help="choose the color which you want to fill the histogram with"), 55 | # 56 | #make_option(c("-c", "--color"), default="white", 57 | # help="choose the color which you want to contour the histogram with"), 58 | # 59 | #make_option(c("-F", "--fill_by"), type='numeric', 60 | # help="the column index with the factor to fill by. Leave empty for no factor."), 61 | # 62 | #make_option(c("--facet_by"), type='numeric', 63 | # help="the column index with the factor to facet by. Leave empty for no factor."), 64 | # 65 | #make_option(c("-W", "--width"), default=7, 66 | # help="width of the plot in inches. [default=%default]"), 67 | # 68 | #make_option(c("-b", "--binwidth"), type="double", 69 | # help="Specify binwidth. Leave empty for default") 70 | 71 | ) 72 | 73 | parser <- OptionParser( 74 | usage = "%prog [options] file", 75 | option_list=option_list, 76 | description = "Reads the values on the first column and outputs a histogram" 77 | ) 78 | arguments <- parse_args(parser, positional_arguments = TRUE) 79 | opt <- arguments$options 80 | if (opt$verbose) {print(opt)} 81 | 82 | #------------ 83 | # LIBRARIES 84 | #------------ 85 | 86 | if (opt$verbose) {cat("Loading libraries... ")} 87 | #suppressPackageStartupMessages(library(reshape2)) 88 | #suppressPackageStartupMessages(library(ggplot2)) 89 | suppressPackageStartupMessages(library(wordcloud)) 90 | if (opt$verbose) {cat("DONE\n\n")} 91 | 92 | 93 | # ======== # 94 | # BEGIN # 95 | # ======== # 96 | 97 | set.seed(123) 98 | 99 | # Read data 100 | if (opt$input == "stdin") {input=file("stdin")} else {input=opt$input} 101 | m = read.table(input, h=opt$header, sep="\t") 102 | 103 | df = m 104 | 105 | words = df[,opt$words] 106 | freq = df[,opt$weights] 107 | 108 | if (!is.null(opt$color_by)) { 109 | palette = as.character(read.table(opt$palette, comment.char="%", h=F)$V1) 110 | color_by = df[,opt$color_by] 111 | colors = palette[as.factor(color_by)] 112 | } 113 | 114 | 115 | pdf(opt$output, width=7, height=7) 116 | wordcloud( 117 | words = words, 118 | freq = freq, 119 | min.freq = -1, 120 | colors = colors, 121 | ordered.colors = TRUE 122 | ) 123 | dev.off() 124 | 125 | q(save='no') 126 | -------------------------------------------------------------------------------- /read.genomic.coverage.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | #options(stringsAsFactors=F) 4 | pseudocount = 1e-04 5 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 6 | 7 | 8 | 9 | ################## 10 | # OPTION PARSING 11 | ################## 12 | 13 | suppressPackageStartupMessages(library("optparse")) 14 | 15 | option_list <- list( 16 | 17 | make_option(c("-i", "--input_matrix"), 18 | help="the matrix you want to analyze. It must have not a header. It is the output of read.genome.coverage.py"), 19 | 20 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"), 21 | make_option(c("-M", "--merge_mdata_on"), help="which field corresponds to the ids in the summary file [default=%default]", default="labExpId"), 22 | make_option(c("-o", "--output"), help="output file name (without extension) [default=%default]", default="summary.out"), 23 | 24 | make_option(c("-H", "--height"), default=6, 25 | help="height of the plot in inches [default=%default]"), 26 | 27 | make_option(c("-W", "--width"), default=8, 28 | help="width of the plot in inches [default=%default]"), 29 | 30 | make_option(c("--facet_nrow"), type="integer", 31 | help="number of rows when faceting"), 32 | 33 | make_option(c("-f", "--facet"), help="dashboard field by which the individuals are faceted"), 34 | 35 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 36 | help="verbose output [default=%default]") 37 | ) 38 | 39 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 40 | arguments <- parse_args(parser, positional_arguments = TRUE) 41 | opt <- arguments$options 42 | if( opt$verbose) {print(opt)} 43 | 44 | 45 | ##------------ 46 | ## LIBRARIES 47 | ##------------ 48 | 49 | cat("Loading libraries... ") 50 | suppressPackageStartupMessages(library(reshape2)) 51 | suppressPackageStartupMessages(library(ggplot2)) 52 | suppressPackageStartupMessages(library(plyr)) 53 | cat("DONE\n\n") 54 | 55 | 56 | 57 | ##--------------------## 58 | ## BEGIN ## 59 | ##--------------------## 60 | 61 | # read the matrix from the command line 62 | m = read.table(opt$input_matrix, h=F, col.names=c("labExpId", "type", "region", "nb_reads")) 63 | 64 | # read the metadata from the metadata file 65 | if (!is.null(opt$metadata)) { 66 | mdata = read.table(opt$metadata, h=T, sep='\t') 67 | mdata[opt$merge_mdata_on] <- sapply(mdata[opt$merge_mdata_on], function(x) gsub(",", ".", x)) 68 | } 69 | 70 | if (opt$verbose) {print(head(m))} 71 | 72 | # separate total from the rest 73 | df = merge(m, setNames(subset(m, type=="total")[c(1,4)], c("labExpId", "total")), by="labExpId") 74 | 75 | # merge split and continuous 76 | all = merge(aggregate(nb_reads~labExpId+region, subset(df, region!="total"), sum), subset(m, type=="total")[c(1,2,4)], by="labExpId") 77 | colnames(all)[c(3,5)] <- c("nb_reads", "total") 78 | all$type <- "all" 79 | df = rbind(df, all) 80 | 81 | if (opt$verbose) {print(head(df))} 82 | 83 | # attach the metadata 84 | if (!is.null(opt$metadata)) { 85 | mdata_header = unique(c(opt$facet, opt$merge_mdata_on)) 86 | df = merge(df, unique(mdata[mdata_header]), by.x='labExpId', by.y=opt$merge_mdata_on) 87 | } 88 | 89 | 90 | if (opt$verbose) {print(head(df))} 91 | 92 | # ----------------- ggplot options ------------------------------ 93 | 94 | theme_set(theme_bw(base_size=18)) 95 | 96 | gp = ggplot(subset(df, type!="total"), aes(y=nb_reads/total*100, x=region)) 97 | gp = gp + geom_boxplot(aes(color=type, fill=type), alpha=0.5) 98 | if (!is.null(opt$metadata)) { 99 | gp = gp + facet_wrap(as.formula(sprintf("~%s", opt$facet)), nrow=opt$facet_nrow) 100 | # gp = gp + facet_grid(as.formula(sprintf("~%s", opt$facet))) 101 | } 102 | gp = gp + labs(y='Proportion of mapped reads (%)', x="") 103 | gp = gp + theme(axis.text = element_text(size=13, angle=45, h=1)) 104 | gp = gp + scale_color_brewer(palette="Set1") 105 | 106 | w = opt$width 107 | h = opt$height 108 | 109 | ggsave(filename=sprintf("%s.pdf", opt$output), h=h, w=w) 110 | ggsave(filename=sprintf("%s.png", opt$output), h=h, w=w) 111 | ggsave(filename=sprintf("%s.eps", opt$output), h=h, w=w) 112 | 113 | 114 | q(save='no') 115 | -------------------------------------------------------------------------------- /normalization.DESeq.R: -------------------------------------------------------------------------------- 1 | 2 | # This script is useful for: 3 | # normalizing samples by row scaling 4 | 5 | 6 | ##------------ 7 | ## LIBRARIES 8 | ##------------ 9 | suppressPackageStartupMessages(library(reshape2)) 10 | suppressPackageStartupMessages(library(ggplot2)) 11 | suppressPackageStartupMessages(library(DESeq)) 12 | suppressPackageStartupMessages(library("optparse")) 13 | 14 | options(stringsAsFactors=F) 15 | pseudocount = 1e-05 16 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 17 | 18 | ################## 19 | # OPTION PARSING 20 | ################## 21 | 22 | option_list <- list( 23 | make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze"), 24 | make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"), 25 | make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount), 26 | make_option(c("-m", "--metadata"), help="tsv file with the metadata"), 27 | make_option(c("-s", "--scale_by"), help="choose one or multiple attributes you want to scale by. Null for normalize the whole matrix") 28 | #make_option(c("-r", "--row_first"), action="store_true", help="scale first by rows then by columns", default=FALSE), 29 | #make_option(c("-n", "--n_iter"), type='integer', help="how many times to iterate [default=%default]", default=20) 30 | ) 31 | 32 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 33 | arguments <- parse_args(parser, positional_arguments = TRUE) 34 | opt <- arguments$options 35 | print(opt) 36 | 37 | 38 | ############### 39 | # BEGIN 40 | ############### 41 | 42 | # read options 43 | m <- read.table(opt$input_matrix, h=T) 44 | 45 | mdata <- read.table(opt$metadata, h=T, row.names=NULL, sep="\t") 46 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x)) 47 | 48 | char_cols <- which(sapply(m, class) == 'character') 49 | sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols) 50 | 51 | if (length(char_cols) == 0) {genes = rownames(m)} 52 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]} 53 | 54 | m = (apply(m, 2, as.integer)) 55 | 56 | # apply the log if required 57 | if (opt$log) { m = log2(replace(m, is.na(m), 0) + pseudocount) } 58 | 59 | mdata = subset(mdata, labExpId %in% colnames(m)) 60 | 61 | 62 | 63 | # store the plot of the distribution with input matrix 64 | gp1 = ggplot(melt(m), aes(x=value)) + geom_density() + facet_wrap(~variable) 65 | 66 | 67 | 68 | ########## 69 | # EQUILIBRATION 70 | ########## 71 | 72 | 73 | DESeq_norm = function(matr) { 74 | condition = rep('group',ncol(matr)) 75 | cds = newCountDataSet(matr, condition) 76 | cds = estimateSizeFactors(cds) 77 | return(counts(cds, normalized=T)) 78 | } 79 | 80 | new_m = m 81 | 82 | # scale the whole matrix if no value is provided 83 | if (is.null(opt$scale_by)) { 84 | new_m = DESeq_norm(new_m) 85 | } 86 | 87 | # scale the sub-matrices defined the scale_by option 88 | if (!is.null(opt$scale_by)) { 89 | scale_by <- strsplit(opt$scale_by, ",")[[1]] 90 | if (length(scale_by) != 1){ 91 | ids = apply(data.frame(unique(mdata[, scale_by])), 1, function(x) unique(merge(t(as.data.frame(x)), mdata, by=scale_by)$labExpId ))} 92 | if (length(scale_by) == 1){ 93 | ids = sapply(unique(mdata[, scale_by]), function(x) unique(mdata[ mdata[,scale_by] == x,]$labExpId))} 94 | # apply normalization 95 | if (length(scale_by) != 1){for (i in 1:length(ids)) { new_m[, ids[[i]]] <- DESeq_norm(new_m[,ids[[i]]])} } 96 | if (length(scale_by) == 1){for (i in 1:ncol(ids)) { new_m[, ids[[i]]] <- DESeq_norm(new_m[,ids[[i]]])} } 97 | } 98 | 99 | 100 | if (is.null(opt$scale_by)) {gp2 = ggplot(melt(new_m), aes(x=value)) + geom_density() + facet_wrap(~Var2)}else{ 101 | gp2 = ggplot(melt(new_m), aes(x=value)) + geom_density() + facet_wrap(~variable)} 102 | if (length(char_cols) != 0) {new_m <- cbind(genes, new_m)} 103 | 104 | 105 | # print output 106 | #-------------- 107 | output = sprintf('DESeq_norm.log_%s.pscnt_%s', opt$log, opt$pseudocount) 108 | write.table(new_m, sprintf('%s.tsv',output), quote=F, sep='\t', row.names=F) 109 | pdf(sprintf("%s.pdf", output)); gp1; gp2; dev.off() 110 | q(save='no') 111 | -------------------------------------------------------------------------------- /cutree.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors=FALSE) 4 | 5 | ################## 6 | # OPTION PARSING # 7 | ################## 8 | 9 | suppressPackageStartupMessages(library("optparse")) 10 | 11 | option_list <- list( 12 | 13 | make_option(c("-i", "--input"), default="stdin", 14 | help="File or stdin [default=%default]"), 15 | 16 | make_option(c("-o", "--output"), default="cutree.tsv", 17 | help="Output file name. Can be stdout [default=%default]"), 18 | 19 | make_option(c("-r", "--replace_na"), default=FALSE, action="store_true", 20 | help="Replace NAs with 0, before adding the pseudocount and applying the log if asked [default=%default]"), 21 | 22 | make_option(c("-l", "--log10"), action="store_true", default=FALSE, 23 | help="Apply the log10 to the whole matrix as pre-processing step [default=%default]"), 24 | 25 | make_option(c("-p", "--pseudocount"), default=0.001, 26 | help="Pseudocount to add when applying the log [default=%default]"), 27 | 28 | make_option(c("-k", "--nb_clusters"), 29 | help="Number of desired clusters [default=%default]"), 30 | 31 | make_option(c("--order"), default=FALSE, action="store_true", 32 | help="Give only the order of elements without cutting the tree [default=%defult]"), 33 | 34 | make_option(c("--index"), action="store_true", default=FALSE, 35 | help="Output only the row names and the final index [default=%default]"), 36 | 37 | make_option(c("-H", "--cluster_height"), 38 | help="Height for cutting the tree [default=%default]"), 39 | 40 | make_option(c("-d", "--dist"), default="euclidean", 41 | help="Distance measure for clustering. Supports spearman (s) and pearson (p) as distance metrics [default=%default]"), 42 | 43 | make_option(c("-c", "--hclust"), default="complete", 44 | help="Algorithm for the hierarchical clustering [default=%default]"), 45 | 46 | make_option(c("--margin"), default=1, 47 | help="Cluster the rows (1) or the columns (2). Only rows can be clustered [default=%default]"), 48 | 49 | #make_option(c("-B", "--iterations"), default=50, 50 | # help="Number of initializations to determine the best clustering [default=%default]"), 51 | # 52 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 53 | help="if you want more output [default=%default]") 54 | ) 55 | 56 | parser <- OptionParser( 57 | usage = "%prog [options] file", 58 | option_list = option_list, 59 | description = "Given a matrix cluster the element and cut the tree" 60 | ) 61 | 62 | arguments <- parse_args(parser, positional_arguments = TRUE) 63 | opt <- arguments$options 64 | if (opt$verbose) {print(opt)} 65 | 66 | 67 | # TODO: cluster the columns 68 | 69 | 70 | 71 | ############## 72 | # BEGIN 73 | ############## 74 | 75 | inF = opt$input; if(opt$input == "stdin") {inF=file("stdin")} 76 | m = read.table(inF, h=T) 77 | 78 | 79 | if (opt$replace_na) { 80 | m = replace(m, is.na(m), 0) 81 | } 82 | 83 | 84 | if (opt$log10) { 85 | m = log10(m + opt$pseudocount) 86 | } 87 | 88 | 89 | 90 | # -------------------------- Distance ------------------------------- 91 | 92 | # Compute the distance between columns 93 | if (opt$margin == 2) { 94 | if (opt$dist == "p" || opt$dist =="s") { 95 | Dist = as.dist(1-cor(m, method=opt$dist, use="p")) 96 | } else { 97 | Dist = dist(t(m), method=opt$dist) 98 | } 99 | } 100 | 101 | # Compute the distance between rows 102 | if (opt$margin == 1) { 103 | if (opt$dist == "p" || opt$dist =="s") { 104 | Dist = as.dist(1-cor(t(m), method=opt$dist, use="p")) 105 | } else { 106 | Dist = dist(m, method=opt$dist) 107 | } 108 | } 109 | 110 | 111 | # -------------------------- Clustering ------------------------------ 112 | 113 | klust = hclust(Dist, method=opt$hclust) 114 | if (opt$order) { 115 | K = order(klust$order) 116 | #print(klust$labels) 117 | #print(klust$order) 118 | #print (K) 119 | m$K = K 120 | } else { 121 | K = cutree(klust, k=opt$nb_clusters, h=opt$cluster_height) 122 | m$K = K 123 | } 124 | 125 | 126 | 127 | # OUTPUT 128 | 129 | output = ifelse(opt$output == "stdout", "", opt$output) 130 | 131 | if (opt$index) { 132 | write.table(m["K"], output, quote=FALSE, col.names=FALSE, row.names=TRUE, sep='\t') 133 | q(save='no') 134 | } 135 | 136 | write.table(m, output, quote=FALSE, col.names=TRUE, row.names=TRUE, sep='\t') 137 | 138 | q(save='no') 139 | -------------------------------------------------------------------------------- /DESeq.analysis.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | ##------------ 5 | ## LIBRARIES 6 | ##------------ 7 | 8 | cat('Loading libraries... ') 9 | 10 | #suppressPackageStartupMessages(library(reshape2)) 11 | #suppressPackageStartupMessages(library(ggplot2)) 12 | suppressPackageStartupMessages(library("optparse")) 13 | #suppressPackageStartupMessages(library(plyr)) 14 | suppressPackageStartupMessages(library('DESeq')) 15 | 16 | cat('DONE\n\n') 17 | 18 | options(stringsAsFactors=F) 19 | pseudocount = 1e-04 20 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 21 | 22 | ################## 23 | # OPTION PARSING 24 | ################## 25 | 26 | 27 | option_list <- list( 28 | make_option(c("-i", "--input_matrix"), help="the matrix with READ COUNTS you want to analyze"), 29 | make_option(c("-r", "--replace_NAs"), action="store_true", default=FALSE, help="replace NAs with 0 [default=%default]"), 30 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"), 31 | make_option(c("-f", "--fields"), help="choose the fields you want to use in the differential expression, comma-separated"), 32 | make_option(c("-o", "--output"), help="output file name"), 33 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE) 34 | ) 35 | 36 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 37 | arguments <- parse_args(parser, positional_arguments = TRUE) 38 | opt <- arguments$options 39 | if(opt$verbose) {print(opt)} 40 | 41 | 42 | 43 | ##--------------------## 44 | ## CLUSTERING SAMPLES ## 45 | ##--------------------## 46 | 47 | # read the matrix from the command line 48 | m = read.table(opt$input_matrix, h=T) 49 | genes = rownames(m) 50 | m = (apply(m, 2, as.integer)) 51 | rownames(m) <- genes 52 | 53 | if (opt$replace_NAs) {m = replace(m, is.na(m), 0)} 54 | 55 | # read the metadata from the metadata file 56 | mdata = read.table(opt$metadata, h=T, sep='\t') 57 | mdata["labExpId"] <- gsub("[,-]", ".", mdata[,"labExpId"]) 58 | 59 | 60 | # specify the design to the program 61 | fields = strsplit(opt$fields, ",")[[1]] 62 | #mdata = unique(mdata[c("labExpId", opt$fields)]) 63 | #print(match(colnames(m), mdata[,"labExpId"])) 64 | if (length(fields) == 1) { 65 | condition = factor(sapply(colnames(m), function(x) unique(subset(mdata, labExpId == x)[,opt$fields]))) 66 | }else{print('cannot handle multiple fields yet');q(save='no')} 67 | 68 | 69 | #colData = mdata[mdata[,"labExpId"] %in% colnames(m) & mdata[,"view"]=="Alignments",] 70 | #rownames(colData) <- colData[,"labExpId"] 71 | #colData = colData[match(colnames(m), rownames(colData)),] 72 | #dds = DESeqDataSetFromMatrix(countData = m, colData= colData, design=~cell) 73 | 74 | 75 | 76 | # create count object for DESeq 77 | cds = newCountDataSet(m, condition) 78 | 79 | # normalization 80 | if(opt$verbose) {cat("Estimating size factors... ")} 81 | cds = estimateSizeFactors(cds) 82 | if(opt$verbose) {cat("DONE\n")} 83 | 84 | # variance estimation 85 | if(opt$verbose) {cat("Estimating dispersions... ")} 86 | cds = estimateDispersions(cds) 87 | if(opt$verbose) {cat("DONE\n")} 88 | 89 | # plot dispersion estimates 90 | plotDispEsts = function(x) { 91 | norm_means = rowMeans(counts(x, normalized=T)) 92 | plot(sort(norm_means), fitInfo(x)$perGeneDispEsts[order(norm_means)], log='xy') 93 | lines(sort(norm_means), fData(x)$disp_pooled[order(norm_means)], col=2) 94 | } 95 | 96 | # calling differential expression 97 | if(opt$verbose) {cat("Binomial test... ")} 98 | res = nbinomTest(cds, levels(condition)[1], levels(condition)[2]) 99 | if(opt$verbose) {cat("DONE\n")} 100 | 101 | # plot MA 102 | plotMA = function(x) { 103 | plot(sort(log10(x$baseMean)), x$log2FoldChange[order(x$baseMean)]) 104 | points(log10(subset(x, padj<=0.01)[,'baseMean']), subset(x, padj<=0.01)[,'log2FoldChange'], col='red') 105 | } 106 | 107 | 108 | 109 | # output on file 110 | toTable = data.frame("id"=res[,1], "log2FC"=res[,6], "log2CPM"=log2(res[,2]), "PValue"=res[,7], "FDR"=res[,8]) 111 | write.table(toTable, file=opt$output, quote=F, sep='\t', row.names=F) 112 | 113 | q(save='no') 114 | 115 | 116 | # for debugging 117 | opt = list() 118 | opt$input_matrix = '~/Documents/blueprint/pilot/Flux/bp.human.gene.reads.idr_01.thr_0.names_False.tsv' 119 | opt$metadata = '~/Documents/blueprint/pilot/bp_rna_dashboard_temp.columns' 120 | opt$fields = 'cell' 121 | opt$genes = '~/Documents/blueprint/pilot/gen15.gene.pc.txt' 122 | 123 | -------------------------------------------------------------------------------- /anova.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors=FALSE) 4 | 5 | ################## 6 | # OPTION PARSING # 7 | ################## 8 | 9 | suppressPackageStartupMessages(library("optparse")) 10 | 11 | option_list <- list( 12 | 13 | make_option(c("-i", "--input"), default="stdin", 14 | help="Input matrix (R-friendly, the header has n-1 columns [default=%default]"), 15 | 16 | make_option(c("-o", "--output"), default="stdout", 17 | help="Output file name. [default=%default]"), 18 | 19 | make_option(c("-l", "--log10"), action="store_true", default=FALSE, 20 | help="Apply the log10 to the whole matrix as pre-processing step [default=%default]"), 21 | 22 | make_option(c("-p", "--pseudocount"), default=0.01, 23 | help="Pseudocount to add when applying the log [default=%default]"), 24 | 25 | make_option(c("-r", "--replace_NA"), default=FALSE, action="store_true", 26 | help="Replace NA with 0 [default=%default]"), 27 | 28 | make_option(c("-m", "--metadata"), 29 | help="Matrix with the metadata. It contains the information about the columns of the input matrix"), 30 | 31 | make_option(c("--merge_mdata_on"), default="labExpId", 32 | help="Metadata field which contains the column names of the input matrix [default=%default]"), 33 | 34 | make_option(c("-F", "--factors"), 35 | help="Factors for anova (right part of the formula after \"~\"), can also be interactions, e.g. value~cell+organism"), 36 | 37 | make_option(c("--p_adj"), default="BH", 38 | help="Method for correcting the pvalue for multiple testing [default=%default]"), 39 | 40 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 41 | help="if you want more output [default=%default]") 42 | ) 43 | 44 | parser <- OptionParser( 45 | usage = "%prog [options] file", 46 | option_list=option_list, 47 | description = " 48 | Computes anova for each row of the input matrix given a design model. 49 | The SS for each factor of the design can be used to compute the 50 | proportion of variance explained by that factor 51 | 52 | Requires package \"reshape2\" 53 | " 54 | ) 55 | arguments <- parse_args(parser, positional_arguments = TRUE) 56 | opt <- arguments$options 57 | if (opt$verbose) {print(opt)} 58 | 59 | 60 | suppressPackageStartupMessages(library("reshape2")) 61 | 62 | 63 | ############## 64 | # BEGIN 65 | ############## 66 | 67 | # Read input 68 | #input = ifelse(opt$input == "stdin", file("stdin"), opt$input) # why is this line never working? 69 | if (opt$input == "stdin") { 70 | input = file("stdin") 71 | } else { 72 | input = opt$input 73 | } 74 | m = read.table(input, h=T) 75 | if (opt$verbose) { 76 | cat("Data sample:\n") 77 | print(head(m)) 78 | } 79 | 80 | # Process the data according to the user 81 | if (opt$replace_NA) {m = replace(m, is.na(m), 0)} 82 | if (opt$log10) {m = log10(m + opt$pseudocount)} 83 | 84 | 85 | # Read the metadata 86 | mdata = read.table(opt$metadata, h=T, sep="\t", quote=NULL, check.names=F) 87 | mdata[,opt$merge_mdata_on] = gsub("[,:-]", ".", mdata[,opt$merge_mdata_on]) 88 | mdata_col = unique(c(opt$merge_mdata_on, strsplit(opt$factors, "[+*:]")[[1]])) 89 | mdata = unique(mdata[,mdata_col]) 90 | if (opt$verbose) { 91 | cat("Metadata sample:\n") 92 | print(head(mdata)) 93 | } 94 | 95 | # Read the formula 96 | F = as.formula(sprintf("value~`%s`", opt$factors)) 97 | 98 | #m = m[1:100,] 99 | 100 | # Apply anova on each gene 101 | res = t(sapply(1:nrow(m), 102 | function(i) { 103 | mm = suppressMessages(melt(m[i,])) 104 | tmp = merge(mm, mdata, by.x="variable", by.y=opt$merge_mdata_on); 105 | # print(tmp) 106 | aov_res = anova(lm(F, tmp)); 107 | return(unlist(aov_res[,-1])) 108 | } 109 | )) 110 | 111 | tmp = merge(melt(m[1,]), mdata, by.x="variable", by.y=opt$merge_mdata_on); 112 | aov_res = anova(lm(F, tmp)); 113 | labels = apply(expand.grid(rownames(aov_res), c("SS", "MeanSq", "F", "pvalue")), 1, paste, collapse="_") 114 | 115 | res = data.frame(res) 116 | colnames(res) <- labels 117 | 118 | 119 | # Adjust pvalue 120 | for (i in grep("pvalue", colnames(res))) { 121 | adj_header = paste(colnames(res)[i], "adj", sep=".") 122 | res[,adj_header] = p.adjust(res[,i], method=opt$p_adj) 123 | } 124 | 125 | res = sapply(res, round, 4) 126 | rownames(res) <- rownames(m) 127 | 128 | # If the variance is zero set the results to NA 129 | res[apply(m, 1, var)==0,] <- NA 130 | 131 | 132 | # OUTPUT 133 | 134 | output = ifelse(opt$output == "stdout", "", opt$output) 135 | write.table(res, output, quote=FALSE, col.names=TRUE, row.names=TRUE, sep='\t') 136 | 137 | q(save='no') 138 | -------------------------------------------------------------------------------- /DEXSeq.analysis.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # == NOTES == 4 | # 1) the gtf should NOT have the trailing ";" 5 | # 2) the list of files with the exon read counts should have full paths 6 | # 3) the order of the id should be the same as the files 7 | 8 | 9 | ##------------ 10 | ## LIBRARIES 11 | ##------------ 12 | 13 | cat('Loading libraries... ') 14 | 15 | #suppressPackageStartupMessages(library(reshape2)) 16 | #suppressPackageStartupMessages(library(ggplot2)) 17 | suppressPackageStartupMessages(library("parallel")) 18 | suppressPackageStartupMessages(library("optparse")) 19 | suppressPackageStartupMessages(library("plyr")) 20 | suppressPackageStartupMessages(library('DEXSeq')) 21 | 22 | cat('DONE\n\n') 23 | 24 | options(stringsAsFactors=F) 25 | pseudocount = 1e-04 26 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 27 | 28 | # ===================== 29 | # == DEBUG OPTIONS == 30 | # ==================== 31 | 32 | opt = list() 33 | opt$metadata = "~/Documents/blueprint/pilot/bp_rna_dashboard_mmaps.crg.tsv" 34 | opt$countfiles = "ERR180942.exon.count.txt,ERR180943.exon.count.txt,ERR180944.exon.count.txt,ERR180945.exon.count.txt,ERR180948.exon.count.txt,ERR180950.exon.count.txt,ERR180951.exon.count.txt,ERR186015.exon.count.txt,ERR230581.exon.count.txt,ERR232403.exon.count.txt,ERR232404.exon.count.txt,ERR244135.exon.count.txt" 35 | opt$labExpId = "ERR180942,ERR180943,ERR180944,ERR180945,ERR180948,ERR180950,ERR180951,ERR186015,ERR230581,ERR232403,ERR232404,ERR244135" 36 | opt$DE_factor = "cell" 37 | opt$annotationfile = "~/Documents/db/human/gencode15/Long/DEXSeq/gen15.long.DEXSeq.gtf" 38 | 39 | 40 | ################## 41 | # OPTION PARSING 42 | ################## 43 | 44 | 45 | option_list <- list( 46 | make_option(c("-i", "--countfiles"), help="list of files with the counts for exons"), 47 | #make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"), 48 | #make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount), 49 | make_option(c("-I", "--labExpId"), help="list of ids in the same order as the files"), 50 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"), 51 | make_option(c("-f", "--DE_factor"), help="choose the field you want to use for the DE"), 52 | make_option(c("-a", "--annotationfile"), help="gtf file"), 53 | make_option(c("-o", "--output"), help="output file name"), 54 | #make_option(c("-f", "--fill_by"), help="choose the color you want to fill by [default=NA]", type='character', default=NA) 55 | make_option(c("-g", "--genes"), help='a file with a list of genes to filter', type='character', default=NA) 56 | ) 57 | 58 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 59 | arguments <- parse_args(parser, positional_arguments = TRUE) 60 | opt <- arguments$options 61 | print(opt) 62 | 63 | 64 | # == BEGIN == 65 | 66 | minCount = 24 67 | maxExon = 20 68 | nCores = 6 69 | progressFile = "DEXSeq.progress" 70 | 71 | mdata = read.table(opt$metadata, h=T, stringsAsFactors=T) 72 | countfiles = strsplit(opt$countfiles, ",")[[1]] 73 | labExpId = strsplit(opt$labExpId, ",")[[1]] 74 | 75 | # Prepare the design matrix from metadata 76 | df = unique(subset(mdata, labExpId %in% labExpId, select=c("labExpId", opt$DE_factor))) 77 | df = df[match(labExpId, df$labExpId),] 78 | rownames(df) <- df$labExpId 79 | colnames(df)[colnames(df) == opt$DE_factor] <- "condition" 80 | 81 | ecs = read.HTSeqCounts(countfiles = countfiles, design = df, flattenedfile=opt$annotationfile) 82 | sampleNames(ecs) <- labExpId 83 | 84 | # Run the test on a subset of genes 85 | test = subsetByGenes(ecs, genes=sample(geneIDs(ecs), 500)) 86 | 87 | # Estimate the scaling factors 88 | test <- estimateSizeFactors(test) 89 | 90 | # Estimate dispersions 91 | test <- estimateDispersions(test, minCount=minCount, maxExon=maxExon, nCores=nCores, file=progressFile) 92 | 93 | # Fit dispersions 94 | test <- fitDispersionFunction(test) 95 | 96 | # Make sure the regression line is ok 97 | plot(log10(rowMeans(counts(test, normalized=T))), log10(fData(test)$dispBeforeSharing)) 98 | points(log10(rowMeans(counts(test, normalized=T))), log10(fData(test)$dispFitted), col='red') 99 | 100 | # Test the DE 101 | test <- testForDEU(test, nCores=nCores) 102 | 103 | # Store the results in a data.frame 104 | res = DEUresultTable(test) 105 | plotDEXSeq(test, "ENSG00000139505.10", displayTranscripts=T, cex.axis=1.2, cex=1.3, lwd=2, legend=T, norCounts=T) 106 | 107 | 108 | -------------------------------------------------------------------------------- /quantile_normalization.R: -------------------------------------------------------------------------------- 1 | # This script is useful for: 2 | # normalizing samples by quantile 3 | 4 | 5 | ##------------ 6 | ## LIBRARIES 7 | ##------------ 8 | 9 | suppressPackageStartupMessages(library(reshape)) 10 | suppressPackageStartupMessages(library(reshape2)) 11 | suppressPackageStartupMessages(library(ggplot2)) 12 | suppressPackageStartupMessages(library("optparse")) 13 | 14 | options(stringsAsFactors=F) 15 | pseudocount = 1e-05 16 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 17 | 18 | ################## 19 | # OPTION PARSING 20 | ################## 21 | 22 | option_list <- list( 23 | make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze"), 24 | make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"), 25 | make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount), 26 | make_option(c("-m", "--metadata"), help="tsv file with the metadata"), 27 | make_option(c("-b", "--bottom"), type='double', help="fraction to remove from the bottom sorted distribution column-wise"), 28 | make_option(c("-t", "--top"), type='double', help="fraction to remove from the top sorted distribution column-wise"), 29 | make_option(c("-o", "--output"), help="output file name WIHTOUT extension [default=%default]", default="qnorm") 30 | #make_option(c("-s", "--scale_by"), help="choose one or multiple attributes you want to scale by"), 31 | #make_option(c("-r", "--row_first"), action="store_true", help="scale first by rows then by columns", default=FALSE), 32 | #make_option(c("-n", "--n_iter"), type='integer', help="how many times to iterate [default=%default]", default=20) 33 | ) 34 | 35 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 36 | arguments <- parse_args(parser, positional_arguments = TRUE) 37 | opt <- arguments$options 38 | print(opt) 39 | 40 | ############### 41 | # BEGIN 42 | ############### 43 | 44 | # read options 45 | m <- read.table(opt$input_matrix, h=T) 46 | mdata <- read.table(opt$metadata, h=T, row.names=NULL, sep="\t") 47 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x)) 48 | char_cols <- which(sapply(m, class) == 'character') 49 | sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols) 50 | 51 | if (length(char_cols) == 0) {genes = rownames(m)} 52 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]} 53 | 54 | 55 | # apply the log if required 56 | #if (opt$log) { m = log2(replace(m, is.na(m), 0) + pseudocount) } 57 | 58 | gp1 = ggplot(melt(m), aes(x=value)) + geom_density() + facet_wrap(~variable) 59 | 60 | # sort the columns of m, be careful to keep the labels 61 | # I will create two matrices, one with the sorted values, the other with the sorted labels 62 | 63 | sort_m = apply(m, 2, sort, na.last=T, d=F) 64 | sort_genes_m = apply(m, 2, function(x) as.numeric(rownames(sort_df(as.data.frame(x))))) 65 | 66 | 67 | shift1 = opt$bottom 68 | shift2 = opt$top 69 | 70 | # from this extract a range 71 | sort_m_range = sort_m[floor(nrow(sort_m)*(shift1)) : floor(nrow(sort_m)*(shift2)),] 72 | sort_genes_m_range = sort_genes_m[floor(nrow(sort_genes_m)*(shift1)) : floor(nrow(sort_genes_m)*(shift2)),] 73 | 74 | gp2 = ggplot(melt(as.data.frame(sort_m_range)), aes(x=value)) + geom_density() + facet_wrap(~variable) 75 | 76 | # calculate mean and standard deviation for the gated matrix 77 | #scl_mean = apply(sort_m_range, 2, mean) 78 | #scl_sd = apply(sort_m_range, 2, sd) 79 | 80 | # with mean and standard deviation scale the whole sorted and plot 81 | #scl_sort_m = sapply(1:ncol(sort_m), function(i) (sort_m[,i]-scl_mean[i])/scl_sd[i]) 82 | #colnames(scl_sort_m) <- colnames(sort_m) 83 | 84 | scl_sort_m = sort_m_range 85 | 86 | #gp3 = ggplot(melt(as.data.frame(scl_sort_m)), aes(x=value)) + geom_density() + facet_wrap(~variable) 87 | 88 | # do the average by rows and reassign the gene labels 89 | av = apply(scl_sort_m, 1, mean) 90 | qnorm_m = apply(sort_genes_m_range, 2, function(col) av[order(col)]) 91 | gp4 = ggplot(melt(as.data.frame(qnorm_m)), aes(x=value)) + geom_density() + facet_wrap(~variable) 92 | 93 | # write the output matrix to a file 94 | #if (length(char_cols) != 0) {qnorm_m <- cbind(genes, qnorm_m)} 95 | write.table(qnorm_m, sprintf("%s.tsv", opt$output), quote=F, sep='\t', row.names=F) 96 | 97 | # plot all the distributions after each of the steps 98 | pdf(sprintf("%s.pdf", opt$output)) 99 | gp1;gp2;#gp3; 100 | gp4 101 | dev.off() 102 | 103 | q(save='no') 104 | -------------------------------------------------------------------------------- /differential_coSI.R: -------------------------------------------------------------------------------- 1 | 2 | ##------------ 3 | ## LIBRARIES 4 | ##------------ 5 | 6 | cat("Loading libraries...") 7 | 8 | suppressPackageStartupMessages(library(reshape2)) 9 | suppressPackageStartupMessages(library(ggplot2)) 10 | suppressPackageStartupMessages(library("optparse")) 11 | suppressPackageStartupMessages(library(plyr)) 12 | 13 | cat("DONE\n\n") 14 | 15 | options(stringsAsFactors=F) 16 | pseudocount = 1e-04 17 | cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#000000", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 18 | 19 | ################## 20 | # OPTION PARSING 21 | ################## 22 | 23 | 24 | option_list <- list( 25 | make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze"), 26 | #make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"), 27 | #make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount), 28 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"), 29 | make_option(c("-o", "--output"), help="additional tags for otuput", default="out"), 30 | make_option(c("-c", "--diff_by"), help="choose the factor you want to differ by. Only one factor", type='character') 31 | ) 32 | 33 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 34 | arguments <- parse_args(parser, positional_arguments = TRUE) 35 | opt <- arguments$options 36 | print(opt) 37 | 38 | 39 | 40 | ##--------------------## 41 | ## CLUSTERING SAMPLES ## 42 | ##--------------------## 43 | output = sprintf("coSI_DE.%s", opt$output) 44 | 45 | # 1. read the matrix from the command line 46 | m = read.table(opt$input_matrix, h=T) 47 | 48 | # remove potential gene id columns 49 | char_cols <- which(sapply(m, class) == 'character') 50 | sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols) 51 | if (length(char_cols) == 0) {genes = rownames(m)} 52 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]} 53 | 54 | # 2. remove missing values 55 | m <- na.omit(m) 56 | 57 | # 3. read the metadata from the metadata file 58 | mdata = read.table(opt$metadata, h=T, sep='\t') 59 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x)) 60 | if (!is.null(opt$diff_by)) {opt$diff_by = strsplit(opt$diff_by, ",")[[1]]} 61 | 62 | # 4. prepare data.frame for ggplot 63 | df = melt(as.matrix(m), varnames=c("variable", "labExpId"), value.name="value") 64 | df = merge(unique(mdata[unique(c("labExpId", opt$diff_by))]), df, by="labExpId") 65 | 66 | # 5. do the average according to factor 67 | new_df = aggregate( as.formula(sprintf("value~%s+variable", opt$diff_by)), df, FUN=mean, na.action=na.omit) 68 | new_df = dcast(new_df, as.formula(sprintf("variable~%s", opt$diff_by))) 69 | 70 | # compute M and A 71 | new_df$A = 0.5*(new_df[,2]+new_df[,3]) 72 | new_df$M = (new_df[,3]) - (new_df[,2]) 73 | 74 | 75 | # Randomization 76 | set.seed(123) 77 | 78 | dfr = df 79 | dfr$value = sample(df$value) 80 | new_dfr = aggregate( as.formula(sprintf("value~%s+variable", opt$diff_by)), dfr, FUN=mean, na.action=na.omit) 81 | new_dfr = dcast(new_dfr, as.formula(sprintf("variable~%s", opt$diff_by))) 82 | 83 | new_dfr$A = 0.5*(new_dfr[,2]+new_dfr[,3]) 84 | new_dfr$M = (new_dfr[,3])-(new_dfr[,2]) 85 | 86 | 87 | # Add statistical significance after randomizing 88 | new_df$Z = (new_df$M - mean(new_dfr$M))/sd(new_dfr$M) 89 | new_df$pv = 1 - pnorm(abs(new_df$Z)) 90 | 91 | 92 | 93 | ############### 94 | # OUTPUT 95 | ############### 96 | 97 | # WRITE TABLE 98 | 99 | write.table(new_df, sprintf("%s.tsv", output), row.names=F, quote=F, sep="\t") 100 | 101 | # plotting... 102 | 103 | pdf(sprintf("%s.pdf",output), h=5, w=6) 104 | 105 | theme_set(theme_bw()) 106 | 107 | gp = ggplot(new_df, aes(x=A, y=M)) 108 | gp = gp + geom_point(aes(color=cut(abs(Z), breaks =c(0, 1, 1.5, 2, 2.5 ,3 , Inf)))) 109 | gp = gp + labs(y="cosi1-cosi2", x='1/2(cosi1+cosi2)') 110 | gp = gp + scale_color_manual(name = "Z-score", values = cbbPalette) 111 | gp = gp + scale_y_continuous(expand=c(0.01,0)) 112 | gp = gp + theme(axis.text = element_text(size=15)) 113 | gp 114 | 115 | gp = ggplot(new_df, aes(x=A, y=M)) 116 | gp = gp + geom_point(aes(color=cut(abs(pv), breaks =c(-Inf, 0, 0.001, 0.01, 0.05 , 1)))) 117 | gp = gp + labs(y="cosi1-cosi2", x='1/2(cosi1+cosi2)') 118 | gp = gp + scale_color_manual(name = "p-value", values = cbbPalette) 119 | gp = gp + scale_y_continuous(expand=c(0.01,0)) 120 | gp = gp + theme(axis.text = element_text(size=15)) 121 | gp 122 | 123 | dev.off() 124 | 125 | q(save='no') 126 | -------------------------------------------------------------------------------- /barplot.GO.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | options(stringsAsFactors=FALSE) 5 | 6 | ################## 7 | # OPTION PARSING 8 | ################## 9 | 10 | suppressPackageStartupMessages(library("optparse")) 11 | 12 | option_list <- list( 13 | 14 | make_option(c("-i", "--input"), default="stdin", 15 | help="File or stdin [default=%default]"), 16 | 17 | make_option(c("-o", "--output"), default="barplot.GO.pdf", 18 | help="Output file name [default=%default]"), 19 | 20 | make_option(c("--header"), action="store_true", default=FALSE, 21 | help="Use this if the input has a header [default=%default]"), 22 | 23 | make_option(c("-f", "--fill"), default="dodgerblue", 24 | help="choose the color which you want to fill the histogram with [default=%default]"), 25 | 26 | make_option(c("-F", "--fill_by"), type='integer', default=NULL, 27 | help="the column with the factor for the color [default=%default]"), 28 | 29 | make_option(c("-P", "--palette"), default=NULL, 30 | help="palette file [default=%default]"), 31 | 32 | make_option(c("-H", "--height"), default=7, 33 | help="Height of the plot in inches [default=%default]"), 34 | 35 | make_option(c("-W", "--width"), default=7, 36 | help="Width of the plot in inches [default=%default]"), 37 | 38 | make_option(c("-B", "--base_size"), default=20, 39 | help="Base size [default=%default]"), 40 | 41 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 42 | help="if you want more output [default=%default]") 43 | 44 | ) 45 | 46 | parser <- OptionParser( 47 | usage = "%prog [options] file", 48 | option_list=option_list, 49 | description = "Reads the values on the first column and outputs a histogram" 50 | ) 51 | arguments <- parse_args(parser, positional_arguments = TRUE) 52 | opt <- arguments$options 53 | if (opt$verbose) {print(opt)} 54 | 55 | #------------ 56 | # LIBRARIES 57 | #------------ 58 | 59 | if (opt$verbose) {cat("Loading libraries... ")} 60 | suppressPackageStartupMessages(library(reshape2)) 61 | suppressPackageStartupMessages(library(ggplot2)) 62 | suppressPackageStartupMessages(library(stringr)) 63 | #suppressPackageStartupMessages(library(plyr)) 64 | if (opt$verbose) {cat("DONE\n\n")} 65 | 66 | 67 | # ======== # 68 | # BEGIN # 69 | # ======== # 70 | 71 | 72 | # Read data 73 | if (opt$input == "stdin") {input=file("stdin")} else {input=opt$input} 74 | m = read.table(input, h=opt$header, sep="\t") 75 | 76 | df = m 77 | 78 | # Read palette file 79 | if (!is.null(opt$palette)) { 80 | palette = read.table(opt$palette, h=F, sep="\t", comment.char="")$V1 81 | } 82 | 83 | 84 | # Read columns 85 | y_col = 2 86 | y_col = colnames(df)[y_col] 87 | df[y_col] <- -log10(df[y_col]) 88 | opt$x_axis <- 7 89 | x_col = colnames(df)[opt$x_axis] 90 | 91 | # Insert newlines in labels if they are too long 92 | max_nchar = 25 93 | toNL = which(sapply(as.character(df[,x_col]), nchar) > max_nchar) 94 | nbSpaces = str_count(as.character(df[,x_col]), " ") 95 | toNL = intersect(which(nbSpaces > 1), toNL) 96 | #print(df) 97 | matches = gregexpr(" ", as.character(df[, x_col]), fixed=TRUE) 98 | pos = as.numeric(sapply(matches, function(x) {i=floor((length(as.numeric(x))+1)/2); x[i]})) 99 | subst = sapply(1:length(df[,x_col]), function(i) {x=as.character(df[i,x_col]); substring(x, pos[i])<- "\n"; x }) 100 | df[toNL, x_col] <- subst[toNL] 101 | 102 | # Sort the labels by p-value 103 | levs = df[order(df[,y_col]),x_col] 104 | df[,x_col] <- factor(df[,x_col], levels=levs) 105 | 106 | #================ 107 | # GGPLOT 108 | #================ 109 | 110 | theme_set(theme_bw(base_size=opt$base_size)) 111 | theme_update( 112 | axis.text.x=element_text(angle=45, hjust=1, vjust=1), 113 | legend.key = element_rect(color='white'), 114 | panel.grid.minor = element_blank(), 115 | panel.grid.major = element_blank() 116 | ) 117 | 118 | #opt$fill = "dodgerblue" 119 | opt$color = "black" 120 | 121 | geom_params = list() 122 | 123 | if (is.null(opt$fill_by)) { 124 | geom_params$fill = opt$fill 125 | geom_params$color = opt$color 126 | } 127 | 128 | # specify fill column 129 | if (!is.null(opt$fill_by)) { 130 | F_col = colnames(df)[opt$fill_by] 131 | mapping <- aes_string(fill=F_col) 132 | } else { 133 | mapping = NULL 134 | } 135 | 136 | # define histogram layer 137 | histLayer <- layer( 138 | geom = "bar", 139 | # geom_params = geom_params, 140 | params = geom_params, 141 | position = "identity", 142 | mapping = mapping, 143 | stat = "identity" 144 | ) 145 | 146 | 147 | # start the plot 148 | gp = ggplot(df, aes_string(x = x_col, y = y_col)) 149 | gp = gp + histLayer 150 | gp = gp + coord_flip() 151 | gp = gp + labs(y="-log10(pvalue)", x=NULL) 152 | if (!is.null(opt$palette)) { 153 | gp = gp + scale_fill_manual(values=palette) 154 | } 155 | ggsave(opt$output, h=opt$height, w=opt$width, title=opt$output) 156 | 157 | q(save='no') 158 | -------------------------------------------------------------------------------- /ggSOM.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors=F) 4 | 5 | ################## 6 | # OPTION PARSING 7 | ################## 8 | 9 | suppressPackageStartupMessages(library("optparse")) 10 | 11 | option_list <- list( 12 | make_option(c("-i", "--input"), default="stdin", 13 | help="Tab-delimited file with unit id and attribute, no header. Can be stdin [default=%default]"), 14 | 15 | #make_option(c("-r", "--replace_NAs"), default=FALSE, action='store_true', 16 | # help="Replace NAs by 0s. If FALSE, rows containing NAs are omitted. [default=%default]"), 17 | # 18 | make_option(c("-G", "--grid"), default="5x4", 19 | help="Grid structure. Format: gridRowsxgridCols. [default=%default]"), 20 | 21 | make_option(c("-k", "--cluster"), type='integer', 22 | help="Index of the column with the cluster id"), 23 | 24 | make_option(c("-f", "--factor"), type='integer', 25 | help="Index of the column with the factor"), 26 | 27 | make_option(c("--func"), default="median", 28 | help="Function to aggregate in each cluster [default=%default]"), 29 | 30 | make_option(c("-d", "--is_discrete"), default=FALSE, action="store_true", 31 | help="The factor is a discrete class. [default=%default]"), 32 | 33 | make_option(c("-p", "--palette"), 34 | help="File with custom palette. If no palette is given, the ggplot default is used."), 35 | 36 | make_option(c("-T", "--title"), 37 | help="Main title"), 38 | 39 | #make_option(c("-t", "--topology"), default="hexagonal", 40 | # help="Grid topology. | [default=%default]"), 41 | # 42 | #make_option(c("-m", "--metadata"), 43 | # help="tsv file with metadata on matrix experiment"), 44 | # 45 | #make_option(c("-f", "--fields"), 46 | # help="choose the fields you want to use for super-organized maps, comma-separated. Needs a metadata file"), 47 | # 48 | make_option(c("-o", "--output"), default="ggSOM.pdf", 49 | help="Output file name [default=%default]"), 50 | 51 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 52 | help="verbose output") 53 | ) 54 | 55 | parser <- OptionParser( 56 | usage = "%prog [options] file", 57 | description = " 58 | Plot the output of SOM as hexagonal grid. 59 | ", 60 | option_list=option_list) 61 | arguments <- parse_args(parser, positional_arguments = TRUE) 62 | opt <- arguments$options 63 | if (opt$verbose) {print(opt)} 64 | 65 | 66 | ################## 67 | # LIBRARIES # 68 | ################## 69 | 70 | suppressPackageStartupMessages(library('ggplot2')) 71 | 72 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 73 | # Functions to compute the hexagon borders 74 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 75 | 76 | hex.x = function(lx) { 77 | c( 78 | lx + 0, 79 | lx + 0.5, 80 | lx + 0.5, 81 | lx + 0, 82 | lx - 0.5, 83 | lx - 0.5 84 | ) 85 | } 86 | 87 | hex.y = function(ly) { 88 | c( 89 | ly - 1/(sqrt(3)), 90 | ly - 1/(2*sqrt(3)), 91 | ly + 1/(2*sqrt(3)), 92 | ly + 1/(sqrt(3)), 93 | ly + 1/(2*sqrt(3)), 94 | ly - 1/(2*sqrt(3)) 95 | ) 96 | } 97 | 98 | 99 | ################## 100 | # BEGIN 101 | ################## 102 | 103 | 104 | # read the SOM grid 105 | gridStruct = as.numeric(strsplit(opt$grid, "x")[[1]]) 106 | gridRows = gridStruct[1] 107 | gridCols = gridStruct[2] 108 | if(opt$verbose) {cat("SOM grid:", gridStruct, "\n")} 109 | 110 | # Read the id attributes 111 | if (opt$input == "stdin") {input = file('stdin')} else {input = opt$input} 112 | m = read.table(input, h=F) 113 | 114 | # Read the palette if provided 115 | if (!is.null(opt$palette)) { 116 | palette = read.table(opt$palette, h=F, comment.char="%")[,1] 117 | } 118 | 119 | # Name the cluster column 120 | colnames(m)[opt$cluster] <- 'id' 121 | 122 | # Build the grid 123 | hex = data.frame() 124 | k=0 125 | 126 | for (i in seq(gridRows)) { 127 | for (j in seq(gridCols)) { 128 | if (i%%2 != 0) {j = j+0.5} 129 | k = k+1 130 | hex = rbind(hex, data.frame(x = hex.x(j), y = hex.y(i), id=k)) 131 | } 132 | } 133 | 134 | # Aggregate the attribute by cluster 135 | factor_col = colnames(m)[opt$factor] 136 | 137 | 138 | if (!opt$is_discrete) { 139 | formula_agg = as.formula(sprintf("%s~id", factor_col)) 140 | m_agg = aggregate(formula_agg, m, eval(opt$func), na.rm=TRUE) 141 | # Merge the attribute with the hex grid 142 | df = merge(hex, m_agg) 143 | } else { 144 | df = merge(hex, m) 145 | } 146 | 147 | 148 | 149 | ######## 150 | # PLOT 151 | ######## 152 | 153 | #gp = ggplot(hex, aes(x, y)) + geom_polygon(aes(group=id, fill=id)) 154 | gp = ggplot(df, aes(x, y)) + geom_polygon(aes_string(group="id", fill=factor_col), color='black') 155 | gp = gp + labs(title=opt$title, x=NULL, y=NULL) 156 | if (!is.null(opt$palette)) { 157 | if (opt$is_discrete) { 158 | gp = gp + scale_fill_manual(values=palette) 159 | } else { 160 | gp = gp + scale_fill_gradientn(colours=palette) 161 | } 162 | } 163 | 164 | ggsave(opt$output, h=5, w=8) 165 | 166 | 167 | -------------------------------------------------------------------------------- /matrix_wilcox.R: -------------------------------------------------------------------------------- 1 | 2 | # This script is useful for: 3 | # normalizing samples by row scaling 4 | 5 | 6 | ##------------ 7 | ## LIBRARIES 8 | ##------------ 9 | cat("Loading libraries... ") 10 | suppressPackageStartupMessages(library(reshape2)) 11 | suppressPackageStartupMessages(library(ggplot2)) 12 | suppressPackageStartupMessages(library("optparse")) 13 | cat("DONE\n\n") 14 | 15 | 16 | options(stringsAsFactors=F) 17 | pseudocount = 1e-05 18 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 19 | 20 | ################## 21 | # OPTION PARSING 22 | ################## 23 | 24 | option_list <- list( 25 | make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze"), 26 | #make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"), 27 | #make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount), 28 | make_option(c("-m", "--metadata"), help="tsv file with the metadata"), 29 | make_option(c("-b", "--by"), help="choose ONE factor you want to do the test by"), 30 | make_option(c("-o", "--output_suffix"), help="additional output suffix [default=%default]", default='out'), 31 | #make_option(c("-f", "--func"), help="choose the function , [default=%default]", default='mean'), 32 | make_option(c("-n", "--not_na"), help="fraction of not NA values in the vector for the mean [default=%default]", default=1, type='double') 33 | #make_option(c("-r", "--row_first"), action="store_true", help="scale first by rows then by columns", default=FALSE), 34 | #make_option(c("-n", "--n_iter"), type='integer', help="how many times to iterate [default=%default]", default=20) 35 | ) 36 | 37 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 38 | arguments <- parse_args(parser, positional_arguments = TRUE) 39 | opt <- arguments$options 40 | print(opt) 41 | 42 | 43 | ############### 44 | # BEGIN 45 | ############### 46 | 47 | # read options 48 | m <- read.table(opt$input_matrix, h=T) 49 | mdata <- read.table(opt$metadata, h=T, row.names=NULL, sep="\t") 50 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x)) 51 | char_cols <- which(sapply(m, class) == 'character') 52 | sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols) 53 | 54 | if (length(char_cols) == 0) {genes = rownames(m)} 55 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]} 56 | 57 | # apply the log if required 58 | #if (opt$log) { m = log2(replace(m, is.na(m), 0) + pseudocount) } 59 | 60 | mdata = subset(mdata, labExpId %in% colnames(m)) 61 | 62 | # ---------------- 63 | # Functions 64 | # ---------------- 65 | 66 | #my_mean = function(x) { 67 | #ifelse((sum(!is.na(x)) < (opt$not_na*length(x))), NA, mean(x,na.rm=T) ) 68 | #} 69 | # 70 | #my_sd = function(x) { 71 | #ifelse((sum(!is.na(x)) < (opt$not_na*length(x))), NA, sd(x,na.rm=T) ) 72 | #} 73 | # 74 | #if (opt$func == "mean") {func = my_mean} 75 | #if (opt$func == "sd") {func = my_sd} 76 | 77 | 78 | # apply the function to the whole matrix if no value is provided 79 | #if (is.null(opt$mean_by)) { 80 | #new_m = rowMeans(m, na.rm=T); colnames(new_m) <- c("mean") 81 | #} 82 | 83 | # scale the sub-matrices defined the scale_by option 84 | if (!is.null(opt$by)) { 85 | by <- strsplit(opt$by, ",")[[1]] 86 | if (length(by) != 1){ 87 | #ids = apply(data.frame(unique(mdata[, mean_by])), 1, function(x) unique(merge(t(as.data.frame(x)), mdata, by=mean_by)$labExpId )) 88 | cat("ERROR: Only ONE factor must be provided!\n"); q(save='no')} 89 | if (length(by) == 1){ 90 | ids = sapply(unique(mdata[, by]), function(x) unique(mdata[ mdata[,by] == x,]$labExpId))} 91 | # apply normalization 92 | #if (length(mean_by) != 1){cat('to be implemented\n\n');for (i in 1:length(ids)) { new_m[, ids[[i]]] <- equil(new_m[,ids[[i]]])} } 93 | #if (length(by) == 1){new_m = matrix(ncol=ncol(ids), nrow=nrow(m)); for (i in 1:ncol(ids)) { 94 | #colnames(new_m) <- colnames(ids); new_m[,i] <- apply(m[,ids[,i]], 1, mean, na.rm=T)} } 95 | } 96 | 97 | wilc_pv = apply(m, 1, function(x) tryCatch({wt = wilcox.test(x[ids[,1]], x[ids[,2]]);wt$p.value}, error = function(e) NA_real_)) 98 | ttes_pv = apply(m, 1, function(x) tryCatch({wt = t.test(x[ids[,1]], x[ids[,2]]);wt$p.value}, error = function(e) NA_real_)) 99 | 100 | 101 | #if (is.null(opt$scale_by)) {gp2 = ggplot(melt(new_m), aes(x=value)) + geom_density() + facet_wrap(~Var2)}else{ 102 | #gp2 = ggplot(melt(new_m), aes(x=value)) + geom_density() + facet_wrap(~variable)} 103 | 104 | #if (length(char_cols) != 0) {new_m <- cbind(genes, new_m)} 105 | df = data.frame(cbind(genes, wilc_pv)) 106 | 107 | 108 | #-------------- 109 | # print output 110 | #-------------- 111 | 112 | output = sprintf('%s.by_%s.n_%s.%s', opt$func, opt$by, opt$not_na, opt$output_suffix) 113 | write.table(new_m, sprintf('%s.tsv',output), quote=F, sep='\t', row.names=F) 114 | #pdf(sprintf("%s.pdf", output)); gp1; gp2; dev.off() 115 | q(save='no') 116 | -------------------------------------------------------------------------------- /projection.score.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors=FALSE) 4 | 5 | ################## 6 | # OPTION PARSING # 7 | ################## 8 | 9 | suppressPackageStartupMessages(library("optparse")) 10 | 11 | option_list <- list( 12 | 13 | make_option(c("-i", "--input"), default="stdin", 14 | help="File or stdin [default=%default]"), 15 | 16 | make_option(c("-o", "--output"), default="proj.score", 17 | help="Output file name WITHOUT extension [default=%default]"), 18 | 19 | make_option(c("-s", "--statistics"), 20 | help="Two-column file with header, with the row names of the input and an associated statistics by which to filter"), 21 | 22 | make_option(c("-l", "--log10"), action="store_true", default=FALSE, 23 | help="Apply the log10 to the whole matrix as pre-processing step [default=%default]"), 24 | 25 | make_option(c("-p", "--pseudocount"), default=0.001, 26 | help="Pseudocount to add when applying the log [default=%default]"), 27 | 28 | make_option(c("-t", "--thresholds"), default="0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8", 29 | help="A comma-separated list of different thresholds (percentages of the max value) [default=%default]"), 30 | 31 | make_option(c("-B", "--iterations"), default=10, 32 | help="Number of row permutations [default=%default]"), 33 | 34 | make_option(c("-S", "--nb_components"), default=3, 35 | help="Number of principal components you want to compute the projection score [default=%default]"), 36 | 37 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 38 | help="if you want more output [default=%default]") 39 | ) 40 | 41 | parser <- OptionParser( 42 | usage = "%prog [options] file", 43 | option_list=option_list, 44 | description = "Compute the projection score on a matrix given a vector of values to use as thresholds. 45 | NOTE: The matrix is used as it comes with no normalization" 46 | ) 47 | 48 | arguments <- parse_args(parser, positional_arguments = TRUE) 49 | opt <- arguments$options 50 | if (opt$verbose) {print(opt)} 51 | 52 | 53 | suppressPackageStartupMessages(library("ggplot2")) 54 | 55 | 56 | ############## 57 | # BEGIN 58 | ############## 59 | 60 | # Read options 61 | var_thresholds = strsplit(opt$thresholds, ",")[[1]] 62 | B = opt$iterations 63 | S = opt$nb_components 64 | 65 | 66 | if (opt$input == "stdin") { 67 | m = read.table(file("stdin"), h=T) 68 | } else { 69 | m = read.table(opt$input, h=T) 70 | } 71 | 72 | if (opt$log10) { 73 | m = log10(m + opt$pseudocount) 74 | } 75 | 76 | # Read the statistics 77 | stats = read.table(opt$statistics, h=T) 78 | # Order them to be in the same order as the input matrix row names 79 | stats = stats[match(stats[,1], rownames(m)),] 80 | 81 | # Normalize the variance by the maximum variance 82 | variancen = stats[,2]/max(stats[,2], na.rm=T) 83 | #print(head(variancen)) 84 | names(variancen) <- stats[,1] 85 | 86 | # Function to compute the alpha_2 measure 87 | alpha_2 = function(lambda, S) { 88 | sqrt(sum(lambda[1:S]^2)/sum(lambda^2)) 89 | } 90 | 91 | # Initialize the vector of projection scores 92 | proj_scores = array(numeric(0)) 93 | 94 | # Iterate over different variance thresholds 95 | for (var_t in var_thresholds) { 96 | 97 | if (opt$verbose) {cat("Variance threshold: ", var_t, "\n")} 98 | 99 | # Get the actual submatrix for this variance threshold 100 | m_t = m[which(variancen>var_t),] 101 | 102 | # Compute the pca on this submatrix 103 | pca1 = prcomp(t(m_t), center=FALSE, scale.=FALSE) 104 | 105 | # Obtain a matrix of lambdas (sdev) for each permutation 106 | # Rows are the components and columns are the iterations 107 | set.seed(123) 108 | lambda = replicate(B, prcomp(apply(m_t, 1, sample), center=FALSE, scale.=FALSE)$sdev) 109 | # Count how many times the stdev for each component in the permutation is lower than the observed one 110 | lambda_counts = rowSums(apply(lambda, 2, function(x) pca1$sdev>=x)) 111 | 112 | # If at least one observed stdev is not higher than 95% of the permutated lambdas 113 | # the submatrix does not support S, and the projection score is assigned to NA 114 | if (sum(lambda_counts[1:S] < 0.05*B) != 0) { 115 | proj_score = NA 116 | } else { 117 | exp_l = mean(apply(lambda, 2, alpha_2, S=S)) 118 | obs_l = alpha_2(pca1$sdev, S) 119 | proj_score = obs_l - exp_l 120 | } 121 | proj_scores = c(proj_scores, proj_score) 122 | } 123 | 124 | 125 | ############### 126 | # OUTPUT 127 | ############### 128 | 129 | selected = names(which(variancen >= var_thresholds[which.max(proj_scores)])) 130 | write.table(selected, sprintf("%s.txt", opt$output), quote=FALSE, col.names=FALSE, row.names=FALSE, sep='\t') 131 | 132 | # Plot the projection score as a function of the variance 133 | df = data.frame(var_thresholds, proj_scores) 134 | if (opt$verbose) {print(df)} 135 | 136 | theme_set(theme_bw(base_size=18)) 137 | 138 | gp = ggplot(df, aes(x=as.numeric(var_thresholds), y=proj_scores)) + geom_point(size=3) + geom_line() 139 | gp = gp + labs(x="Thresholds (fraction of max)") 140 | 141 | ggsave(sprintf("%s.pdf", opt$output), h=5, w=7) 142 | 143 | 144 | 145 | 146 | -------------------------------------------------------------------------------- /matrix_matrix_correlation.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | ##------------ 4 | ## LIBRARIES 5 | ##------------ 6 | cat("Loading libraries... ") 7 | suppressPackageStartupMessages(library(reshape2)) 8 | suppressPackageStartupMessages(library(ggplot2)) 9 | suppressPackageStartupMessages(library("optparse")) 10 | suppressPackageStartupMessages(library(plyr)) 11 | cat("DONE\n\n") 12 | 13 | options(stringsAsFactors=F) 14 | pseudocount = 1e-04 15 | cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#000000", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 16 | 17 | ################## 18 | # OPTION PARSING 19 | ################## 20 | 21 | 22 | option_list <- list( 23 | make_option(c("-A", "--input1"), help="the first matrix of samples"), 24 | make_option(c("-B", "--input2"), help="the second matrix of samples, columns must have same order as in input1"), 25 | make_option(c("--tag1"), help="short tag describing the first input"), 26 | make_option(c("--tag2"), help="short tag describing the second input"), 27 | make_option(c("-L", "--labels"), help="select the labels from the dashboard fields, comma-separated"), 28 | make_option(c("-l", "--log"), help="apply the log to , , , [default=%default]", default="none"), 29 | make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount), 30 | make_option(c("-m", "--metadata"), default=NULL, help="tsv file with metadata on matrix experiment"), 31 | make_option(c("-r", "--representation"), help="choose , [default=%default]", default="hex"), 32 | make_option(c("-n", "--replace_na_with_0"), action="store_true", help="replace NAs with 0 [default=%default]", default=FALSE), 33 | make_option(c("--facet_nrow"), default=NULL, type="integer", help="Number of rows for faceting [default=%default]"), 34 | make_option(c("-o", "--output"), help="output file name without extension", default='out'), 35 | make_option(c("-v", "--verbose"), default=FALSE, action="store_true", help="verbose") 36 | ) 37 | 38 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 39 | arguments <- parse_args(parser, positional_arguments = TRUE) 40 | opt <- arguments$options 41 | if (opt$verbose) { 42 | print(opt) 43 | } 44 | 45 | 46 | ##--------------------## 47 | ## BEGIN ## 48 | ##--------------------## 49 | 50 | #output = sprintf("log_%s.pseudo_%s.fillby_%s.%s", opt$log, opt$pseudocount, opt$fill_by, opt$output) 51 | 52 | m1 = read.table(opt$input1, h=T) 53 | m2 = read.table(opt$input2, h=T) 54 | 55 | if (opt$replace_na_with_0) { 56 | m1 = replace(m1, is.na(m1), 0) 57 | m2 = replace(m2, is.na(m2), 0) 58 | } 59 | 60 | # check order of headers 61 | if (!all(colnames(m1) == colnames(m2))) {print('Headers are not the same!'); q(save='no') } 62 | 63 | 64 | mm1 = melt(cbind(m1, gene=rownames(m1)), value.name="value1", variable.name='labExpId') 65 | mm2 = melt(cbind(m2, gene=rownames(m2)), value.name="value2", variable.name='labExpId') 66 | 67 | df = merge(mm1, mm2, by=c('gene', 'labExpId')) 68 | 69 | if (opt$verbose) {print(head(df))} 70 | 71 | 72 | # Log-transformation 73 | if (opt$log=="both") {df$value1 <- log10(df$value1+opt$pseudocount); df$value2 <- log10(df$value2+opt$pseudocount)} 74 | if (opt$log=="A") {df$value1 <- log10(df$value1+opt$pseudocount)} 75 | if (opt$log=="B") {df$value2 <- log10(df$value2+opt$pseudocount)} 76 | 77 | 78 | # read the metadata 79 | if (!is.null(opt$metadata)) { 80 | mdata = read.table(opt$metadata, h=T, sep='\t', row.names=NULL) 81 | # select only the requested ones 82 | labels = strsplit(opt$labels, ',')[[1]] 83 | mdata = unique(mdata[c('labExpId', labels)]) 84 | mdata$labels = apply(mdata[labels], 1, function(x) paste(x, collapse='\n')) 85 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub("[,-]", ".", x)) 86 | # add this info to the data.frame 87 | df = merge(df, mdata[c('labExpId',"labels")], by='labExpId') 88 | 89 | # calculate correlation coefficient 90 | df = ddply(df, .(labExpId,labels), transform, pearson = cor(value1, value2, m='p', use='p')) 91 | } else { 92 | # calculate correlation coefficient 93 | df = ddply(df, .(labExpId), transform, pearson = cor(value1, value2, m='p', use='p')) 94 | df$labels = df$labExpId 95 | } 96 | 97 | 98 | ########## 99 | # PLOTTING 100 | ########## 101 | 102 | gp = ggplot(df, aes(x=value1, y=value2)) 103 | if (opt$representation == "hex") { 104 | gp = gp + geom_hex(aes(fill=cut(..count..,c(0,1,2,5,10,25,50,75,100,500,Inf))),binwidth=c(.2,.2)) 105 | gp = gp + scale_fill_manual('counts',values=terrain.colors(10))} 106 | if (opt$representation == "scatter") { 107 | gp = gp + geom_point(shape=".")} 108 | gp = gp + facet_wrap(~labels, nrow=opt$facet_nrow) 109 | gp = gp + geom_abline(slope=1, intercept=0, col='blue') 110 | gp = gp + geom_text(aes(x=min(value1,na.rm=T), y=max(value2,na.rm=T)-0.5, label=sprintf("r=%0.3f",pearson)), color='blue', hjust=0) 111 | gp = gp + labs(x=opt$tag1, y=opt$tag2) 112 | #gp 113 | ggsave(filename=sprintf('%s.pdf', opt$output), h=7, w=7) 114 | ggsave(filename=sprintf('%s.jpg', opt$output), h=7, w=7) 115 | 116 | 117 | 118 | q(save='no') 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /plot.mean.sd.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors=F) 4 | 5 | # ================== 6 | # DEBUG OPTIONS 7 | # ================== 8 | 9 | opt = list() 10 | opt$output = "detected.features.pdf" 11 | opt$merge_mdata_on = "labExpId" 12 | opt$base_size = 16 13 | 14 | ################## 15 | # OPTION PARSING 16 | ################## 17 | 18 | suppressPackageStartupMessages(library("optparse")) 19 | 20 | option_list <- list( 21 | 22 | make_option(c("-i", "--input_matrix"), default="stdin", 23 | help="the matrix you want to analyze [default=%default]"), 24 | 25 | make_option(c("-o", "--output"), default="detected.features.pdf", 26 | help="output file name. [default=%default]"), 27 | 28 | make_option(c("--header"), action="store_true", default=FALSE, 29 | help="input file has header [default=%default]"), 30 | 31 | make_option(c('--log10'), action="store_true", default=FALSE, 32 | help="Apply the log to the data [default=%default]"), 33 | 34 | make_option(c("-x", "--x_index"), default=1, type="character", 35 | help="column index (or indeces) for the x axis. [default=%default]"), 36 | 37 | make_option(c("-y", "--y_index"), default=2, type="integer", 38 | help="column index for the y axis. [default=%default]"), 39 | 40 | make_option(c("-F", "--fun"), type="character", default="mean_sdl", 41 | help="function to aggregate [default=%default]"), 42 | 43 | make_option(c("-G", "--geom"), type="character", default="pointrange", 44 | help="function to aggregate: pointrange | bar | crossbar [default=%default]"), 45 | 46 | make_option(c("-c", "--color"), type="character", default="orange", 47 | help="color of the point or bar [default=%default]"), 48 | 49 | make_option(c("--fill"), type="character", default="orange", 50 | help="fill of the point or bar. [default=%default]"), 51 | # help="fill of the point or bar. If an integer is provided, is the index of the column for the fill factor [default=%default]"), 52 | 53 | make_option(c("-f", "--facet_by"), 54 | help="column index to facet by"), 55 | 56 | make_option(c("--facet_scale"), default="fixed", 57 | help="facet scale: fixed | free | free_x | free_y"), 58 | 59 | make_option(c("--y_title"), default="Percentage of detected features", 60 | help="title for the y axis [default=%default]"), 61 | 62 | make_option(c("-B", "--base_size"), default=16, 63 | help="font base size [default=%default]"), 64 | 65 | make_option(c("-W", "--width"), default=7, 66 | help="width of the plot in inches [default=%default]"), 67 | 68 | make_option(c("-H", "--height"), default=5, 69 | help="height of the plot in inches [default=%default]") 70 | 71 | ) 72 | 73 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 74 | arguments <- parse_args(parser, positional_arguments = TRUE) 75 | opt <- arguments$options 76 | 77 | ##------------ 78 | ## LIBRARIES 79 | ##------------ 80 | 81 | cat("Loading libraries... ") 82 | #suppressPackageStartupMessages(library(reshape2)) 83 | suppressPackageStartupMessages(library(ggplot2)) 84 | source("~/R/functions.R") 85 | cat("DONE\n\n") 86 | 87 | 88 | ######### 89 | # BEGIN # 90 | ######### 91 | 92 | # read input matrix (decide if including the header or not) 93 | if (opt$input_matrix == "stdin") { 94 | m = read.table(file("stdin"), h=opt$header, sep="\t") 95 | } else { 96 | m = read.table(opt$input_matrix, h=opt$header, sep="\t") 97 | } 98 | 99 | df = m 100 | 101 | 102 | # Define x and y according to user indeces 103 | #x = colnames(df)[opt$x_index] 104 | x_indeces = as.numeric(strsplit(opt$x_index, split=",")[[1]]) 105 | df$x_labels = apply(df[x_indeces], 1, paste, collapse=',') 106 | x = 'x_labels' 107 | y = colnames(df)[opt$y_index] 108 | 109 | # Apply the log if needed 110 | if (opt$log10) {df[y] <- log10(df[y])} 111 | 112 | # Apply the function outside ggplot 113 | #tmp = as.data.frame(as.list(aggregate(formula, df, smedian.hilow))) 114 | #tmp = rename(tmp, replace=c("V4.Median" = "y", "V4.Lower"="ymin", "V4.Upper"="ymax")) 115 | 116 | 117 | # sort by mean 118 | formula = as.formula(sprintf("%s~%s", y, x)) 119 | lev = aggregate(formula, df, mean, na.rm=T) 120 | lev = lev[order(lev[,y], decreasing=T),x] 121 | df[x] = factor(df[,x], levels=lev) 122 | 123 | # ~~~~~~~~~ 124 | # PLOT 125 | # ~~~~~~~~~ 126 | 127 | theme_set(theme_bw(base_size=opt$base_size)) 128 | 129 | gp = ggplot(df, aes_string(x=x, y=y)) 130 | if (opt$geom == 'crossbar') { 131 | gp = gp + stat_summary(fun.data=opt$fun, mult=1, shape=15, size=1, width=0.4, color=opt$color, fill=opt$fill, geom=opt$geom) 132 | } else { 133 | gp = gp + stat_summary(fun.data=opt$fun, mult=1, shape=15, size=1, width=0.4, color=opt$color, geom="errorbar") 134 | gp = gp + stat_summary(fun.data=opt$fun, mult=1, shape=15, size=1, color=opt$color, fill=opt$fill, geom=opt$geom) 135 | } 136 | gp = gp + theme(axis.text.x=element_text(angle=45, hjust=1)) 137 | gp = gp + labs(y=opt$y_title, x="") 138 | 139 | 140 | if (!is.null(opt$facet_by)) { 141 | facet = as.formula(sprintf("~%s", colnames(df)[as.numeric(opt$facet_by)])) 142 | gp = gp + facet_wrap(facet, nrow=1, scale=opt$facet_scale) 143 | } 144 | 145 | 146 | ggsave(opt$output, h=opt$height, w=opt$width) 147 | 148 | q(save="no") 149 | 150 | 151 | -------------------------------------------------------------------------------- /GO_enrichment.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # -- Variables -- 4 | 5 | options(stringsAsFactors=F) 6 | pseudocount = 1e-04 7 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 8 | 9 | 10 | ################## 11 | # OPTION PARSING 12 | ################## 13 | 14 | suppressPackageStartupMessages(library("optparse")) 15 | 16 | option_list <- list( 17 | make_option(c("-u", "--universe"), help="a list of gene identifiers (ensEMBL ids), NO header"), 18 | make_option(c("-G", "--genes"), default="stdin", 19 | help="a list of gene identifiers for the foreground (ensEMBL ids), WITHOUT header [default=%default]"), 20 | make_option(c("-c", "--categ"), help="choose the GO category < BP | MF | CC > [default=%default]", default="BP"), 21 | make_option(c("-s", "--species"), help="choose the species < human | mouse | dmel > [default=%default]", default="human"), 22 | make_option(c("-o", "--output"), help="additional tags for otuput [default=%default]", default="out"), 23 | make_option(c("-d", "--output_dir"), default="./", help="directory for the output [default=%default]"), 24 | make_option(c("--output_genes"), action="store_true", default=FALSE, help="Output the list of genes from enriched GO terms in a separate file [default=%default]"), 25 | make_option(c("-v", "--verbose"), default=FALSE, action="store_true", help="verbose") 26 | ) 27 | 28 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 29 | arguments <- parse_args(parser, positional_arguments = TRUE) 30 | opt <- arguments$options 31 | if (opt$verbose) {print(opt)} 32 | 33 | 34 | 35 | ##------------ 36 | ## LIBRARIES 37 | ##------------ 38 | 39 | if (opt$verbose) {cat("Loading libraries... ")} 40 | 41 | 42 | suppressPackageStartupMessages(library("GO.db")) 43 | if (opt$species == "human") {ann = "org.Hs.eg.db"; suppressPackageStartupMessages(library("org.Hs.eg.db"))} 44 | if (opt$species == "mouse") {ann = "org.Mm.eg.db"; suppressPackageStartupMessages(library("org.Mm.eg.db"))} 45 | if (opt$species == "dmel") {ann = "org.Dm.eg.db"; suppressPackageStartupMessages(library("org.Dm.eg.db"))} 46 | suppressPackageStartupMessages(library("GOstats")) 47 | suppressPackageStartupMessages(library("plyr")) 48 | 49 | if (opt$verbose) {cat("DONE\n\n"); sessionInfo()} 50 | 51 | ############################ 52 | # BEGIN 53 | ############################ 54 | 55 | U = read.table(opt$universe, h=F, col.names='hs') 56 | 57 | U$hs = unique(U$hs) 58 | 59 | #if (opt$genes == "stdin") { 60 | # G = read.table(file("stdin"), h=T, col.names='hs') 61 | #} else { 62 | # G = read.table(opt$genes, h=T, col.names='hs') 63 | #} 64 | 65 | if (opt$genes == "stdin") { 66 | G = read.table(file("stdin"), h=F, col.names='hs') 67 | } else { 68 | G = read.table(opt$genes, h=F, col.names='hs') 69 | } 70 | 71 | 72 | # I want to create a list of parameters to perform GO enrichment on different gene sets 73 | 74 | # take the entrez gene ids for all the orthologous genes which will be my universe (the same for all the sets) 75 | if (opt$species == "human") { 76 | universe = unlist(mget(U$hs, org.Hs.egENSEMBL2EG, ifnotfound=NA))} 77 | 78 | if (opt$species == "mouse") { 79 | universe = unlist(mget(U$hs, org.Mm.egENSEMBL2EG, ifnotfound=NA))} 80 | 81 | if (opt$species == "dmel") { 82 | universe = unlist(mget(U$hs, org.Dm.egENSEMBL2EG, ifnotfound=NA))} 83 | 84 | 85 | if (opt$verbose) {sprintf("%s background genes; %s with a corresponding entrez id", nrow(U), length(unique(universe)))} 86 | # how many genes am I able to map? 87 | # First thing notice that also ensembl gene ids longer than 15 characters are included 88 | # if I remove these genes I end up with: 89 | # length(unique(as.character(universe[which(nchar(names(universe)) == 15)]))) ----> 15593 90 | 91 | 92 | createParams = function(x, species="human") { 93 | if (species == "human") { 94 | geneset = unlist(mget(x, org.Hs.egENSEMBL2EG, ifnotfound=NA)) 95 | } 96 | if (species == "mouse") { 97 | geneset = unlist(mget(x, org.Mm.egENSEMBL2EG, ifnotfound=NA)) 98 | } 99 | if (species == "dmel") { 100 | geneset = unlist(mget(x, org.Dm.egENSEMBL2EG, ifnotfound=NA)) 101 | } 102 | sprintf("%s foreground genes; %s with a corresponding entrez id", length(x), length(unique(geneset))) 103 | pv = 1-(1-0.05)**(1/length(x)) 104 | params = new("GOHyperGParams", 105 | geneIds = geneset, 106 | universeGeneIds = universe, 107 | annotation = ann, 108 | ontology = opt$categ, 109 | pvalueCutoff = pv, 110 | conditional = TRUE, 111 | testDirection='over') 112 | return(params)} 113 | 114 | res = hyperGTest(createParams(unique(G$hs), opt$species)) 115 | 116 | if (opt$verbose) { 117 | cat ("Finished Hypergeometric test\n") 118 | } 119 | 120 | # Reformat the output table 121 | df = summary(res) 122 | df$Pvalue = format(df$Pvalue, digits=1) 123 | df$OddsRatio <- round(df$OddsRatio, 2) 124 | df$ExpCount <- round(df$ExpCount, 2) 125 | 126 | 127 | 128 | # Get the genes for the enriched GO terms 129 | if (opt$output_genes) { 130 | enrichGenes <- ldply(geneIdsByCategory(res, catids=sigCategories(res, pvalueCutoff(res))), data.frame) 131 | enrichGenes[[2]] <- mapIds(eval(parse(text=ann)), keys=enrichGenes[[2]], keytype="ENTREZID", column=c("ENSEMBL")) 132 | colnames(enrichGenes) <- c("GO", "gene_id") 133 | output = sprintf("%s/%s.%s.genes", opt$output_dir, opt$output, opt$categ) 134 | write.table(enrichGenes, file=output, quote=F, sep="\t", row.names=F) 135 | } 136 | 137 | 138 | # Print output 139 | output = sprintf("%s/%s.%s", opt$output_dir, opt$output, opt$categ) 140 | write.table(df, file=sprintf("%s.tsv", output), quote=F, sep="\t", row.names=F) 141 | htmlReport(res, file=sprintf("%s.html", output)) 142 | 143 | 144 | 145 | 146 | q(save='no') 147 | 148 | 149 | -------------------------------------------------------------------------------- /SOM.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors=F) 4 | set.seed(1) 5 | 6 | ################## 7 | # OPTION PARSING 8 | ################## 9 | 10 | suppressPackageStartupMessages(library("optparse")) 11 | 12 | option_list <- list( 13 | make_option(c("-i", "--input_matrix"), default="stdin", 14 | help="Columns are samples and rows are dimensions. Can be stdin [default=%default]"), 15 | 16 | make_option(c("-r", "--replace_NAs"), default=FALSE, action='store_true', 17 | help="Replace NAs by 0s. If FALSE, rows containing NAs are omitted. [default=%default]"), 18 | 19 | make_option(c("-G", "--grid"), default="5x4", 20 | help="Grid structure. Format: gridRowsxgridCols. [default=%default]"), 21 | 22 | make_option(c("-t", "--topology"), default="hexagonal", 23 | help="Grid topology. | [default=%default]"), 24 | 25 | make_option(c("-T", "--toroidal"), default=FALSE, action="store_true", 26 | help="Toroidal strucure [default=%default]"), 27 | 28 | make_option(c("-n", "--iterations"), default=100, 29 | help="Number of iteration [default=%default]"), 30 | 31 | make_option(c("-m", "--metadata"), 32 | help="tsv file with metadata on matrix experiment"), 33 | 34 | make_option(c("-f", "--fields"), 35 | help="choose the fields you want to use for super-organized maps, comma-separated. Needs a metadata file"), 36 | 37 | make_option(c("-o", "--output"), default="SOM.out.tsv", 38 | help="Output file name, with extension .tsv [default=%default]"), 39 | 40 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, help="verbose output") 41 | ) 42 | 43 | parser <- OptionParser( 44 | usage = "%prog [options] file", 45 | description = " 46 | Apply SOM to a matrix of values. 47 | The resulting cluster unit is appended as last column to the original matrix.", 48 | option_list=option_list) 49 | arguments <- parse_args(parser, positional_arguments = TRUE) 50 | opt <- arguments$options 51 | if (opt$verbose) {print(opt)} 52 | 53 | 54 | ##------------ 55 | ## LIBRARIES 56 | ##------------ 57 | 58 | if(opt$verbose) {cat('Libraries loading... ')} 59 | 60 | suppressPackageStartupMessages(library('reshape2')) 61 | suppressPackageStartupMessages(library('ggplot2')) 62 | suppressPackageStartupMessages(library("kohonen")) 63 | 64 | if (opt$verbose) {cat('DONE\n\n')} 65 | 66 | ##-------------## 67 | ## BEGIN ## 68 | ##-------------## 69 | 70 | # read the matrix from the command line 71 | if (opt$input_matrix == "stdin") { 72 | input = "stdin" 73 | } else { 74 | input = opt$input_matrix 75 | } 76 | 77 | #input = ifelse(opt$input_matrix == "stdin", file("stdin"), opt$input_matrix) 78 | 79 | data = read.table(input, h=T) 80 | init_nrow = nrow(data) 81 | if (opt$verbose) {print(head(data))} 82 | 83 | # Deal with NAs 84 | if (opt$replace_NAs) { 85 | m = replace(data, is.na(data), 0) 86 | } else { 87 | keep_index = which(rowSums(is.na(data)) == 0) 88 | m = na.omit(data) 89 | if(opt$verbose) {cat("Dimensions retained after omittig NAs:", nrow(m), "of", init_nrow, "\n")} 90 | } 91 | 92 | # Initialize the column with the clusters 93 | data$K <- NA 94 | 95 | # read the SOM grid 96 | gridStruct = as.numeric(strsplit(opt$grid, "x")[[1]]) 97 | gridRows = gridStruct[1] 98 | gridCols = gridStruct[2] 99 | if(opt$verbose) {cat("SOM grid:", gridStruct, "\n")} 100 | 101 | 102 | # read the metadata from the metadata file if provided 103 | if (!is.null(opt$metadata)) { 104 | merge_mdata_on = 'labExpId' 105 | toSuperSOM = list() 106 | mdata = read.table(opt$metadata, h=T, sep='\t') 107 | mdata[merge_mdata_on] <- gsub(",", ".", mdata[,merge_mdata_on]) 108 | # Take only mdata rows which are in the columns of the input matrix 109 | mdata = mdata[which(mdata[,merge_mdata_on] %in% colnames(data)),] 110 | fields = strsplit(opt$fields, ",")[[1]] 111 | mdata = unique(mdata[,c(merge_mdata_on, fields)]) 112 | levs = levels(as.factor(mdata[,fields])) 113 | for (lev in levs) { 114 | ids = mdata[which(mdata[, fields] == lev), merge_mdata_on] 115 | toSuperSOM[[lev]] <- as.matrix(m[ids]) 116 | } 117 | SOM = supersom( 118 | toSuperSOM, 119 | grid = somgrid(gridCols, gridRows, opt$topology), 120 | toroidal=opt$toroidal, 121 | rlen = opt$iterations 122 | ) 123 | df_changes = melt(as.matrix(SOM$changes), varnames=c("x", "f"), value.name="y") 124 | data[keep_index, "K"] <- SOM$unit.classif 125 | } else { 126 | SOM = som( 127 | as.matrix(m), 128 | grid = somgrid(gridCols, gridRows, opt$topology), 129 | toroidal = opt$toroidal, 130 | rlen = opt$iterations 131 | ) 132 | df_changes = data.frame(x = seq_along(SOM$changes), y = SOM$changes) 133 | data[keep_index, "K"] <- SOM$unit.classif 134 | } 135 | 136 | 137 | 138 | # ~~~~~~~~~~~~ # 139 | # OUTPUT # 140 | # ~~~~~~~~~~~~ # 141 | 142 | write.table(data, file=opt$output, quote=FALSE, sep="\t") 143 | 144 | # ~~~~~~~~~~ # 145 | # PLOT # 146 | # ~~~~~~~~~~ # 147 | 148 | out.prefix = gsub(".tsv", "", opt$output) 149 | 150 | theme_set(theme_bw(base_size=16)) 151 | 152 | # convergence 153 | gp = ggplot(df_changes, aes(x, y)) 154 | if (!is.null(opt$metadata)) { 155 | gp = gp + geom_line(aes(color=f, group=f)) 156 | } else { 157 | gp = gp + geom_line() 158 | } 159 | gp = gp + labs(x='Iteration', y='Mean distance to unit') 160 | ggsave(sprintf('%s.convergence.pdf', out.prefix), h=5, w=5) 161 | 162 | # Number of elements per unit 163 | df = setNames(as.data.frame(table(SOM$unit.classif)), c('x', 'y')) 164 | gp = ggplot(df, aes(x, y)) 165 | gp = gp + stat_identity(geom='bar', color='blue', fill='white') 166 | gp = gp + labs(x='Unit', y='Number of elements') 167 | gp = gp + theme(axis.text.x = element_text(angle=45, hjust=1)) 168 | ggsave(sprintf('%s.units.pdf', out.prefix), h=5, w=8) 169 | 170 | 171 | q(save='no') 172 | -------------------------------------------------------------------------------- /Rtsne.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | set.seed(1555) 4 | 5 | # DEFAULT OPTIONS 6 | 7 | opt = list() 8 | opt$log10 = FALSE 9 | opt$pseudocount = 1e-04 10 | opt$row_as_variables = FALSE 11 | 12 | ##------------ 13 | ## LIBRARIES 14 | ##------------ 15 | suppressPackageStartupMessages(library("optparse")) 16 | 17 | options(stringsAsFactors=F) 18 | 19 | ################## 20 | # OPTION PARSING 21 | ################## 22 | 23 | option_list <- list( 24 | make_option(c("-i", "--input_matrix"), default="stdin", help="Matrix of observations (rows are the obs) or distances [default=%default]"), 25 | make_option(c("-l", "--log10"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"), 26 | make_option(c("-p", "--pseudocount"), type="double", help="specify a pseudocount for the log [default=%default]", default=0), 27 | 28 | make_option(c("-T", "--input_type"), default="dist", help="Choose between dist and obs. [default=%default]"), 29 | 30 | make_option(c("-n", "--iter"), default=1000, help="Number of iterations [default=%default]"), 31 | 32 | make_option(c("-m", "--metadata"), help="Tab-separated file with the metadata. Can be omitted"), 33 | 34 | make_option(c("--merge_mdata_on"), default="labExpId", 35 | help="[default=%default]"), 36 | 37 | make_option(c("--perplexity"), default=30, help="[default=%default]"), 38 | 39 | #make_option(c("-o", "--output"), help="additional info you want to put in the output file name", default="out"), 40 | make_option(c("-c", "--color_by"), help="choose the fields in the metadata you want to color by", type='character'), 41 | 42 | make_option(c("-s", "--shape_by"), help="choose the fields in the metadata you want to shape by", type='character'), 43 | 44 | make_option(c("-P", "--palette"), help="palette file [default=%default]"), 45 | 46 | make_option(c("-W", "--width"), default=7, 47 | help="Width of the plot in inches [default=%default]"), 48 | 49 | make_option(c("-H", "--height"), default=7, 50 | help="Height of the plot in inches [default=%default]"), 51 | 52 | make_option(c("-o", "--output"), default="Rtsne", 53 | help="Prefix for the file names. [default=%default]"), 54 | 55 | make_option(c("-v", "--verbose"), action='store_true', default=FALSE, 56 | help="verbose output [default=%default]") 57 | ) 58 | 59 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list, 60 | description="\nPerform Barnes Hut t-SNE. Return a plot with the points in the new 2D and a plot of the cost") 61 | arguments <- parse_args(parser, positional_arguments = TRUE) 62 | opt <- arguments$options 63 | 64 | if (opt$verbose) {print(opt)} 65 | 66 | suppressPackageStartupMessages(library("reshape")) 67 | suppressPackageStartupMessages(library("ggplot2")) 68 | suppressPackageStartupMessages(library("Rtsne")) 69 | suppressPackageStartupMessages(library("plyr")) 70 | 71 | ############### 72 | # BEGIN 73 | ############## 74 | 75 | # read input matrix 76 | if (opt$input_matrix == "stdin") {input=file("stdin")} else {input=opt$input_matrix} 77 | m = read.table(input, h=T, sep="\t") 78 | id = colnames(m) 79 | 80 | # --- dist --- 81 | if (opt$input_type == "dist") { 82 | m = as.dist(m) 83 | } 84 | 85 | # --- obs --- 86 | if (opt$input_type == "obs") { 87 | # apply the log if required 88 | if (opt$log10) {m = log10(replace(m, is.na(m), 0) + opt$pseudocount)} 89 | m = t(m) 90 | } 91 | 92 | 93 | ## Read the color palette 94 | if (!is.null(opt$palette)) {my_palette = read.table(opt$palette, h=F, comment.char="%", sep="\t")$V1} 95 | 96 | # 97 | ## Read the shapes 98 | #if (!is.null(opt$shapes)) { 99 | # my_shapes = read.table(opt$shapes, h=F, comment.char="%")$V1 100 | #} 101 | # 102 | tsne = Rtsne(m, max_iter=opt$iter, perplexity=opt$perplexity, pca=FALSE, is.distance=(opt$input_type=="dist")) 103 | 104 | # Plot the cost at each iteration 105 | cost_df = data.frame(iter=seq_along(tsne$itercosts), cost=tsne$itercosts) 106 | gp = ggplot(cost_df, aes(x=iter, y=cost)) + geom_line() 107 | ggsave(sprintf("%s.cost.pdf", opt$output), h=5, w=7) 108 | 109 | 110 | # *************** 111 | # Plot 112 | # *************** 113 | 114 | map_df = data.frame(x=tsne$Y[,1], y=tsne$Y[,2], id=id) 115 | 116 | 117 | # HANDLING METADATA 118 | 119 | if (!is.null(opt$metadata)){ 120 | mdata = read.table(opt$metadata, h=T, sep="\t", row.names=NULL, quote=""); 121 | mdata[,opt$merge_mdata_on] <- gsub("[,-]", ".", mdata[,opt$merge_mdata_on]) 122 | mdata <- unique(mdata[,c(opt$merge_mdata_on, opt$color_by, opt$shape_by)]) 123 | map_df <- merge(map_df, mdata, by.x="id", by.y=opt$merge_mdata_on) 124 | } 125 | 126 | 127 | # PLOTTING 128 | 129 | theme_set(theme_bw(base_size=18)) 130 | theme_update( 131 | legend.key = element_blank() 132 | ) 133 | 134 | 135 | geom_params = list() 136 | mapping = list() 137 | 138 | geom_params$size = 4 139 | 140 | if (is.null(opt$color_by)) { 141 | geom_params$color = "black" 142 | } else { 143 | mapping = modifyList(mapping, aes_string(color=opt$color_by)) 144 | } 145 | 146 | if (is.null(opt$shape_by)) { 147 | geom_params$shape = 19 148 | } else { 149 | mapping = modifyList(mapping, aes_string(shape=opt$shape_by)) 150 | } 151 | 152 | class(mapping) <- "uneval" 153 | 154 | pointLayer <- layer( 155 | geom = "point", 156 | # geom_params = geom_params, 157 | params = geom_params, 158 | stat = "identity", 159 | position = "identity", 160 | mapping = mapping 161 | ) 162 | 163 | 164 | gp = ggplot(map_df, aes(x=x, y=y)) + pointLayer 165 | gp = gp + labs(x=NULL, y=NULL) 166 | if (!is.null(opt$palette)) { 167 | if (is.discrete(map_df[,opt$color_by])) { 168 | gp = gp + scale_color_manual(values=my_palette) 169 | } else { 170 | gp = gp + scale_color_gradientn(colours=my_palette) 171 | } 172 | } 173 | ggsave(sprintf("%s.map.pdf", opt$output), w=opt$width, h=opt$height) 174 | 175 | q(save='no') 176 | -------------------------------------------------------------------------------- /major.isoform.across.samples.R: -------------------------------------------------------------------------------- 1 | 2 | ############################# 3 | # LIBRARIES 4 | ##################### 5 | 6 | suppressPackageStartupMessages(library(reshape2)) 7 | suppressPackageStartupMessages(library(ggplot2)) 8 | suppressPackageStartupMessages(library("optparse")) 9 | 10 | options(stringsAsFactors=F) 11 | pseudocount = 1e-04 12 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 13 | 14 | ################## 15 | # OPTION PARSING 16 | ################## 17 | 18 | 19 | option_list <- list( 20 | make_option(c("-i", "--input_matrix"), help="the matrix with transcript expression you want to analyze"), 21 | make_option(c("-a", "--annotation"), help="two-column file with gene and tx ids"), 22 | make_option(c("-o", "--output"), help="choose the name for the output file WITHOUT extension"), 23 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"), 24 | make_option(c("-c", "--compare_by"), help="field of the metadata by which you want to compare") 25 | ) 26 | 27 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 28 | arguments <- parse_args(parser, positional_arguments = TRUE) 29 | opt <- arguments$options 30 | 31 | #q(save='no') 32 | 33 | 34 | entropy = function(x) {x = x[!is.na(x)]; x =x[x!=0]; p=x/sum(x); if (length(x) != 0) {return(-sum(p*log(p)))} else {NA}} 35 | 36 | which_is_maj_iso = function(x) if (sum(x==0)==length(x)) {return(0)} else {return(which.max(x))} 37 | gn_fun1 = function(x) max(table(as.numeric(x))) # to be revised!!! 38 | gn_fun2 = function(x) length(unique(x[x!=0])) 39 | 40 | 41 | ########### 42 | ## HUMAN ## 43 | ########### 44 | 45 | # using only protein coding genes AND transcripts 46 | human_gn_tx = read.table(opt$annotation, h=F, col.names=c('gene','tx')) 47 | # transcript matrix 48 | human_expr = read.table(opt$input_matrix, h=T) 49 | human_expr[is.na(human_expr)] <- 0 50 | 51 | # transcript matrix only for protein coding transcripts with gene id appended 52 | human_tx_gn_expr = merge(human_gn_tx,human_expr,by.x='tx',by.y='row.names') 53 | human_expr_iso = aggregate(human_tx_gn_expr[,-(1:2)], list(human_tx_gn_expr$gene), which_is_maj_iso) 54 | rownames(human_expr_iso) = human_expr_iso[,1] 55 | human_expr_iso[,1] = NULL 56 | human_expr_iso_fun = data.frame(n_samples=apply(human_expr_iso,1,gn_fun1),n_maj_iso=apply(human_expr_iso,1,gn_fun2), organism='Human') 57 | 58 | # add the number of annotated for human 59 | hs_ann_iso = data.frame(table(human_gn_tx$gene)) 60 | names(hs_ann_iso) = c('gene','ann_iso') 61 | human_expr_iso_fun_ann = merge(hs_ann_iso, human_expr_iso_fun, by.x='gene', by.y='row.names') 62 | 63 | hs_mm_expr_iso_fun_ann = human_expr_iso_fun_ann 64 | 65 | 66 | # read the metadata 67 | mdata = read.table(opt$metadata, h=T, sep='\t') 68 | 69 | variables = unique(mdata[,opt$compare_by]) 70 | 71 | shift = sapply(variables, function(x) {ids = unique(mdata[which(mdata[opt$compare_by]==x), 'labExpId']); 72 | apply(human_expr_iso[ids], 1, function(x) if (length(unique(x))==1) {return(unique(x))}else{return(NA)} ) 73 | }) 74 | 75 | for (el in variables){ 76 | ids = unique(mdata[which(mdata[opt$compare_by]==el), 'labExpId']) 77 | human_expr_iso[el] = apply(human_expr_iso[ids], 1, function(x) if (length(unique(x))==1) {return(unique(x))}else{return(NA)} ) 78 | } 79 | 80 | shift2ggplot = data.frame( 81 | # Total genes in the annotation 82 | Total_genes = nrow(shift), 83 | # Genes with the same major isoform within the same group 84 | same_maj_iso_within = nrow(shift[which(rowSums(shift==0) ==0),]), 85 | # Genes with the same major isoform consistent within the group, but different between 86 | diff_maj_iso_between = sum(apply(shift[which(rowSums(shift==0) ==0),], 1, function(x) length(unique(x)))>1) 87 | ) 88 | 89 | write.table(shift2ggplot, sprintf("%s.tsv", opt$output), sep='\t', quote=F, row.names=F) 90 | 91 | ## PLOT ## 92 | 93 | library(ggplot2) 94 | pdf(sprintf("%s.pdf", opt$output),width=12) 95 | 96 | #ggplot(subset(hs_mm_expr_iso_fun_ann,ann_iso <= 25),aes(x=as.factor(ann_iso), y= n_samples)) + geom_boxplot() + facet_wrap(~organism,scale='free') + xlab('Number of annotated isoforms') + ylab('Max number of samples where the same major isoform is expressed') + opts(title='Most diffused major isoform') 97 | 98 | ggplot(subset(hs_mm_expr_iso_fun_ann, ann_iso >= 2 & n_maj_iso>0),aes(x=as.factor(n_maj_iso),fill=cut(ann_iso, c(1,2,3,4,5,10,15,25,Inf)))) + geom_histogram(position='stack') + guides(fill = guide_legend(reverse = TRUE)) + scale_fill_hue(name='Total isoforms per gene',labels=c(2,3,4,5,'6-10','11-15','16-25','>25')) + labs(x='Major isoforms per gene', y='Number of protein coding genes') 99 | 100 | ggplot(subset(hs_mm_expr_iso_fun_ann,ann_iso<=35 & ann_iso>=2 & n_maj_iso>=1 ),aes(x=as.numeric(ann_iso))) + geom_histogram(aes(fill=cut(n_maj_iso, c(0,1,2,3,4,5,6,7,Inf))),position='stack',binwidth=1) + guides(fill=guide_legend(reverse=TRUE)) + stat_bin(aes(y=..count../x),breaks=seq(2,34,by=1),geom='line') + labs(x='Annotated isoforms per gene', y='Number of protein coding genes') + scale_fill_hue(name='Major isoforms per gene',labels=c(1,2,3,4,5,6,7,'>8')) 101 | 102 | gp = ggplot(subset(hs_mm_expr_iso_fun_ann, ann_iso >= 2 & n_maj_iso>0), aes(x=ann_iso, y=n_maj_iso)) 103 | gp = gp + stat_bin2d(binwidth=c(1,1),aes(fill=cut(..count.., c(0,1,5,10,50,100,500,1000,2000,3000,Inf))),colour="black",size=.2) 104 | gp = gp + scale_fill_brewer(name='Number\nof genes', palette='Greens') + xlim(c(2,20)) 105 | gp = gp + ylim(c(0,20)) 106 | gp = gp + labs(x='Annotated isoforms per gene', y='Major isoforms per gene') 107 | gp 108 | 109 | #ggplot(subset(hs_mm_expr_iso_median_ann,hs_ann_iso!=1 & mm_ann_iso!=1 ),aes(x=floor(mm_entropy*10),y=floor(hs_entropy*10))) + stat_bin2d(binwidth=c(1,1),aes(fill=cut(..count.., c(0,1,5,10,100,500,1000,2000,3000,Inf))),colour ="black",size=.2) + scale_fill_brewer("count") + geom_abline(linetype=4) + opts(title="Median of entropy of isoform expression") + xlab('Mouse (entropy*10)') + ylab('Human (entropy*10)') 110 | dev.off() 111 | -------------------------------------------------------------------------------- /add_quantile.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors=FALSE) 4 | 5 | ################## 6 | # OPTION PARSING 7 | ################## 8 | 9 | suppressPackageStartupMessages(library("optparse")) 10 | 11 | option_list <- list( 12 | 13 | make_option(c("-i", "--input"), default="stdin", 14 | help="File or stdin [default=%default]"), 15 | 16 | make_option(c("-o", "--output"), default="quantile.out.tsv", 17 | help="Output file name. for printing on stdout [default=%default]"), 18 | 19 | make_option(c("--header"), action="store_true", default=FALSE, 20 | help="Use this if the input has a header [default=%default]"), 21 | 22 | make_option(c("-m", "--method"), default="quantiles", 23 | help="Choose the way to bin the data: < quantiles | breaks > [default=%default]"), 24 | 25 | make_option(c("-c", "--column"), default=1, 26 | help="The column with the values to divide in quantiles [default=%default]"), 27 | 28 | make_option(c("-q", "--quantiles"), default=4, type="integer", 29 | help="Number of quantiles [default=%default]"), 30 | 31 | make_option(c("-s", "--resolve_breaks"), default=FALSE, action="store_true", 32 | help="When breaks are not unique, don't crash but create fewer quantiles [default=%default]"), 33 | 34 | make_option(c("-b", "--breaks"), default="c(-Inf,1,2,3,Inf)", 35 | help="Breaks for the intervals, comma-separated, with c notation [default=%default]"), 36 | 37 | make_option(c("--paste"), action="store_true", default=FALSE, 38 | help="Paste index and interval in a unique column [default=%default]"), 39 | 40 | make_option(c("--rowNames"), action="store_true", default=FALSE, 41 | help="Print row names in the output [default=%default]"), 42 | 43 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 44 | help="if you want more output [default=%default]") 45 | 46 | ) 47 | 48 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 49 | arguments <- parse_args(parser, positional_arguments = TRUE) 50 | opt <- arguments$options 51 | if (opt$verbose) {print(opt)} 52 | 53 | #------------ 54 | # LIBRARIES 55 | #------------ 56 | 57 | if (opt$verbose) {cat("Loading libraries... ")} 58 | #suppressPackageStartupMessages(library(reshape2)) 59 | suppressPackageStartupMessages(library(ggplot2)) 60 | #suppressPackageStartupMessages(library(plyr)) 61 | if (opt$verbose) {cat("DONE\n\n")} 62 | 63 | 64 | # ============== # 65 | # Functions # 66 | # ============== # 67 | 68 | formatInterval = function(interval) { 69 | 70 | interval = as.character(interval) 71 | if (length(grep("-Inf", interval) != 0)) { 72 | string = strsplit(interval, split=",")[[1]][2] 73 | outInterval = sprintf("<=%s", gsub("[\\)\\]]", "", string, perl=TRUE)) 74 | #outInterval = sprintf("<=%s", gsub("\\)", "", strsplit(interval, split=",")[[1]][2])) 75 | return(outInterval) 76 | } 77 | if (length(grep("Inf", interval) != 0)) { 78 | string = strsplit(interval, split=",")[[1]][1] 79 | outInterval = sprintf(">%s", gsub("[\\(\\[]", "", string, perl=TRUE)) 80 | return(outInterval) 81 | } 82 | 83 | # left = gsub("[\\(\\]]", "", strsplit(interval, split=",")[[1]][1], perl=TRUE) 84 | # right = gsub("\\)", "", strsplit(interval, split=",")[[1]][2]) 85 | # outInterval = sprintf("%s <= x < %s", left, right) 86 | outInterval = interval 87 | return(outInterval) 88 | } 89 | 90 | 91 | # ======== # 92 | # BEGIN # 93 | # ======== # 94 | 95 | 96 | # Read data 97 | 98 | if (opt$input == "stdin") {inF = file("stdin")} else {inF = opt$input} 99 | m = read.table(inF, h=opt$header, sep="\t") 100 | 101 | pasteSEP= ":" 102 | 103 | # Quantiles 104 | 105 | if (opt$method == "quantiles") { 106 | 107 | quantile_header = sprintf("quantile_%s", colnames(m)[opt$column]) 108 | quant_index_header = sprintf("quant_index_%s", colnames(m)[opt$column]) 109 | quantile_paste_header = sprintf("quant_paste_%s", colnames(m)[opt$column]) 110 | 111 | if (opt$resolve_breaks) { 112 | breaks <- c(unique(quantile(m[,opt$column], probs=seq(0,1,1/opt$quantiles), na.rm=T))) 113 | m[,quantile_header] = cut(m[,opt$column], breaks, include.lowest=TRUE) 114 | m[,quant_index_header] = match(m[,quantile_header], levels(m[,quantile_header])) 115 | } else { 116 | m[,quantile_header] = cut_number(m[,opt$column], opt$quantiles) 117 | m[,quant_index_header] = as.numeric(cut_number(m[,opt$column], opt$quantile)) 118 | } 119 | 120 | 121 | if (opt$paste) { 122 | m[, quantile_paste_header] <- with(m, paste(quant_index_header, quantile_header, sep=pasteSEP)) 123 | # if (max(m[,quant_index_header]) > 9 & max(m[,quant_index_header]) < 100) { 124 | # m[m$quant_index_header<9, quantile_paste_header] <- with(m{}, paste0("0", quantile_paste_header)) 125 | # } 126 | m[, quantile_header] <- NULL; m[, quant_index_header] <- NULL 127 | } 128 | 129 | 130 | } 131 | 132 | # Intervals 133 | 134 | if (opt$method == "breaks") { 135 | 136 | interval_header = sprintf("interval_%s", colnames(m)[opt$column]) 137 | interval_index_header = sprintf("interval_index_%s", colnames(m)[opt$column]) 138 | interval_paste_header = sprintf("interval_paste_%s", colnames(m)[opt$column]) 139 | 140 | breaks = eval(parse(text=opt$breaks)) 141 | m[,interval_header] = cut(m[,opt$column], breaks=breaks, include.lowest=TRUE, right=TRUE, dig.lab=4) 142 | m[,interval_index_header] = cut(m[,opt$column], breaks=breaks, include.lowest=TRUE, right=TRUE, labels=FALSE) 143 | 144 | # Format the interval 145 | m[,interval_header] = sapply(m[,interval_header], formatInterval) 146 | 147 | if (opt$paste) { 148 | m[, interval_paste_header] <- paste( as.character( m[, interval_index_header] ), m[, interval_header], sep=pasteSEP) 149 | if (max(m[,interval_index_header]) > 9 & max(m[,interval_index_header]) < 100) { 150 | m[m[,interval_index_header]<10, interval_paste_header] <- paste0("0", m[m[,interval_index_header]<10, interval_paste_header]) 151 | } 152 | m[, interval_header] <- NULL; m[, interval_index_header] <- NULL 153 | } 154 | 155 | } 156 | 157 | 158 | # Print output 159 | 160 | output = ifelse(opt$output == "stdout", "", opt$output) 161 | write.table(m, file=output, row.names=opt$rowNames, quote=FALSE, col.names=opt$header, sep="\t") 162 | 163 | 164 | # EXIT 165 | quit(save='no') 166 | -------------------------------------------------------------------------------- /nt.coverage.R: -------------------------------------------------------------------------------- 1 | 2 | ##------------ 3 | ## LIBRARIES 4 | ##------------ 5 | suppressPackageStartupMessages(library(reshape2)) 6 | suppressPackageStartupMessages(library(ggplot2)) 7 | suppressPackageStartupMessages(library("optparse")) 8 | suppressPackageStartupMessages(library(plyr)) 9 | 10 | 11 | #options(stringsAsFactors=F) 12 | pseudocount = 1e-04 13 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 14 | 15 | ################## 16 | # OPTION PARSING 17 | ################## 18 | 19 | 20 | option_list <- list( 21 | make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze"), 22 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"), 23 | make_option(c("-o", "--output"), help="additional flags for otuput", default="out"), 24 | make_option(c("-d", "--outdir"), help="directory for the output", default="./"), 25 | #make_option(c("-c", "--color_by"), help="choose the color you want to color by [default=NA]", type='character', default=NA), 26 | make_option(c("-f", "--field"), help="dashboard field by which the individuals are grouped"), 27 | make_option(c("-H", "--height"), default=5, help="output height in inches [default=%default]"), 28 | make_option(c("-W", "--width"), default=9, help="output width in inches [default=%default]"), 29 | make_option(c("--facet_nrow"), default=1, help="number of rows when faceting [default=%default]") 30 | 31 | #make_option(c("-t", "--tags"), help="comma-separated field names you want to display in the labels", default="cell,sex,age") 32 | ) 33 | 34 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 35 | arguments <- parse_args(parser, positional_arguments = TRUE) 36 | opt <- arguments$options 37 | print(opt) 38 | 39 | na2null = function(x) if(is.na(x)) {return(NULL)}else{return(x)} 40 | 41 | 42 | ##--------------------## 43 | ## CLUSTERING SAMPLES ## 44 | ##--------------------## 45 | output = sprintf("%s.%s", basename(opt$input_matrix), opt$output) 46 | 47 | # read the matrix from the command line 48 | m = read.table(opt$input_matrix, h=F, col.names=c("element","labExpId","n_det_el","prop","tag","det")) 49 | 50 | # read the metadata from the metadata file 51 | mdata = read.table(opt$metadata, h=T, sep='\t') 52 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x)) 53 | 54 | # prepare data.frame for ggplot 55 | df = merge(subset(m, tag=="individual"), unique(mdata[c("labExpId", opt$field)]), by='labExpId') 56 | 57 | # duplicate the data.frame to plot all 58 | df_copy = df 59 | df_copy[opt$field]= "all" 60 | 61 | # attach the two data frames 62 | new_df = rbind(df, df_copy) 63 | 64 | 65 | # order the elements with the hierarchy: exonic, intronic, intergenic, total 66 | new_df$element <- factor(new_df$element, levels=c('exonic','intronic','intergenic','total')) 67 | #new_df$labels <- factor(new_df$det, labels = c("Genome covered", "Covered by RNA-seq", "RNA-seq distribution")) 68 | new_df$facet2 = interaction(new_df$element, new_df$det) 69 | 70 | # create a separate data.frame for cumulative coverage 71 | cumul_df = subset(m, tag=='cumulative') 72 | cumul_df$facet2 = interaction(cumul_df$element, cumul_df$det) 73 | cumul_df[opt$field] = cumul_df$labExpId 74 | #cumul_df$cell = cumul_df$labExpId 75 | 76 | 77 | #gp = ggplot(new_df, aes(x=element, y=prop)) 78 | #gp = gp + geom_boxplot() 79 | #gp = gp + facet_grid(cell~det) 80 | ##gp = gp + facet_grid(sprintf(".~%s", opt$field), scales = 'free_y') 81 | ##gp = gp + facet_grid(new_df$cell~facet, scales = 'free_y') 82 | ##gp = gp + geom_point(data=subset(m, tag=='cumulative'), aes_string(x="element", y="prop"), size = 4, alpha = 0.7) 83 | #gp = gp + labs(y='Proportion of detected nucleotides', x="") 84 | #gp = gp + theme(axis.text = element_text(size=15), axis.text.x=element_text(angle=45)) 85 | #gp 86 | # 87 | #gp = ggplot(new_df, aes(x=element, y=prop)) 88 | #gp = gp + geom_boxplot(aes(col=cell)) 89 | #gp = gp + facet_grid(.~labels, scales='free_x') 90 | ##gp = gp + facet_grid(sprintf(".~%s", opt$field), scales = 'free_y') 91 | ##gp = gp + facet_grid(new_df$cell~facet, scales = 'free_y') 92 | ##gp = gp + geom_point(data=subset(m, tag=='cumulative'), aes_string(x="element", y="prop"), size = 4, alpha = 0.7) 93 | #gp = gp + labs(y='Proportion of detected nucleotides', x="Genomic domain (D)") 94 | #gp = gp + theme(axis.text = element_text(size=15), axis.text.x=element_text(angle=45, hjust=1)) 95 | #gp 96 | # 97 | ## almost final without cumulative 98 | #gp = ggplot(new_df, aes(x=facet2, y=prop)) 99 | #gp = gp + geom_boxplot(aes(col=labels)) 100 | #gp = gp + facet_grid(.~cell, scales='free_x') 101 | ##gp = gp + facet_grid(sprintf(".~%s", opt$field), scales = 'free_y') 102 | ##gp = gp + facet_grid(new_df$cell~facet, scales = 'free_y') 103 | ##gp = gp + geom_point(data=subset(m, tag=='cumulative'), aes_string(x="element", y="prop"), size = 4, alpha = 0.7) 104 | #gp = gp + labs(y='Proportion of nucleotides (%)', x="Genomic domain (D)") 105 | #gp = gp + theme(axis.text = element_text(size=15), axis.text.x=element_text(angle=45, hjust=1)) 106 | #gp = gp + scale_x_discrete(labels=c('total','exonic','intronic','intergenic','exonic','intronic','intergenic')) 107 | #gp 108 | 109 | # 110 | 111 | gp = ggplot(new_df, aes(x=facet2, y=prop)) 112 | gp = gp + geom_boxplot(aes(fill=as.character(det)) ) 113 | #gp = gp + facet_grid(.~cell, scales='free_x') 114 | if (!is.null(opt$field)) { 115 | #gp = gp + facet_grid(sprintf(".~%s", opt$field), scales='free_x')} 116 | gp = gp + facet_wrap(as.formula(sprintf("~%s", opt$field)), nrow=opt$nrow, scales='free_x')} 117 | gp = gp + geom_point(data=cumul_df, aes(x=facet2, y=prop, col=as.character(det)), size = 4, alpha = 0.7) 118 | gp = gp + geom_point(data=cumul_df, aes(x=facet2, y=prop), col='black', size = 4, alpha = 0.7, shape=1) 119 | gp = gp + labs(y='Proportion of nucleotides (%)', x="Genomic domain (D)") 120 | gp = gp + theme(axis.text = element_text(size=15), axis.text.x=element_text(angle=45, hjust=1)) 121 | gp = gp + scale_x_discrete(labels=c('total','exonic','intronic','intergenic','exonic','intronic','intergenic')) 122 | gp = gp + scale_fill_hue(name='',labels = c("Genome covered", "Covered by RNA-seq", "RNA-seq distribution")) 123 | gp = gp + scale_color_hue(name='Cumulative',labels = c("Genome covered", "Covered by RNA-seq", "RNA-seq distribution")) 124 | 125 | 126 | w = opt$width 127 | h = opt$height 128 | 129 | ggsave(filename=sprintf("%s/%s.pdf", opt$outdir, output), w=w, h=h) 130 | ggsave(filename=sprintf("%s/%s.png", opt$outdir, output), w=w, h=h) 131 | ggsave(filename=sprintf("%s/%s.eps", opt$outdir, output), w=w, h=h) 132 | 133 | 134 | q(save='no') 135 | -------------------------------------------------------------------------------- /dimensRed.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | set.seed(1555) 4 | 5 | # DEFAULT OPTIONS 6 | 7 | opt = list() 8 | opt$log10 = FALSE 9 | opt$pseudocount = 1e-04 10 | opt$row_as_variables = FALSE 11 | 12 | ##------------ 13 | ## LIBRARIES 14 | ##------------ 15 | suppressPackageStartupMessages(library("optparse")) 16 | 17 | options(stringsAsFactors=F) 18 | 19 | ################## 20 | # OPTION PARSING 21 | ################## 22 | 23 | option_list <- list( 24 | make_option(c("-i", "--input_matrix"), default="stdin", help="Matrix of observations (rows are the obs) or distances [default=%default]"), 25 | make_option(c("-l", "--log10"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"), 26 | make_option(c("-p", "--pseudocount"), type="double", help="specify a pseudocount for the log [default=%default]", default=0), 27 | 28 | make_option(c("-T", "--input_type"), default="dist", help="Choose between dist and obs. [default=%default]"), 29 | 30 | make_option(c("-n", "--iter"), default=1000, help="Number of iterations [default=%default]"), 31 | 32 | make_option(c("-m", "--metadata"), help="Tab-separated file with the metadata. Can be omitted"), 33 | 34 | make_option(c("-M", "--method"), help="Choose one of: "), 35 | 36 | make_option(c("--merge_mdata_on"), default="labExpId", 37 | help="[default=%default]"), 38 | 39 | make_option(c("--perplexity"), default=30, help="[default=%default]"), 40 | 41 | #make_option(c("-o", "--output"), help="additional info you want to put in the output file name", default="out"), 42 | make_option(c("-c", "--color_by"), help="choose the fields in the metadata you want to color by", type='character'), 43 | 44 | make_option(c("-s", "--shape_by"), help="choose the fields in the metadata you want to shape by", type='character'), 45 | 46 | make_option(c("-P", "--palette"), help="palette file [default=%default]"), 47 | 48 | make_option(c("--shapes"), 49 | help="File with the shapes [default=%default]"), 50 | 51 | make_option(c("-b", "--base-size"), dest="base_size", default=18, 52 | help="Text size [default=%default]"), 53 | 54 | make_option(c("-W", "--width"), default=7, 55 | help="Width of the plot in inches [default=%default]"), 56 | 57 | make_option(c("-H", "--height"), default=7, 58 | help="Height of the plot in inches [default=%default]"), 59 | 60 | make_option(c("-o", "--output"), default="Rtsne", 61 | help="Prefix for the file names. [default=%default]"), 62 | 63 | make_option(c("-v", "--verbose"), action='store_true', default=FALSE, 64 | help="verbose output [default=%default]") 65 | ) 66 | 67 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list, 68 | description="\nPerform Barnes Hut t-SNE. Return a plot with the points in the new 2D and a plot of the cost") 69 | arguments <- parse_args(parser, positional_arguments = TRUE) 70 | opt <- arguments$options 71 | 72 | if (opt$verbose) {print(opt)} 73 | 74 | suppressPackageStartupMessages(library("reshape")) 75 | suppressPackageStartupMessages(library("ggplot2")) 76 | suppressPackageStartupMessages(library("Rtsne")) 77 | suppressPackageStartupMessages(library("plyr")) 78 | 79 | ############### 80 | # BEGIN 81 | ############## 82 | 83 | # read input matrix 84 | if (opt$input_matrix == "stdin") {input=file("stdin")} else {input=opt$input_matrix} 85 | m = read.table(input, h=T, sep="\t") 86 | id = colnames(m) 87 | 88 | # --- dist --- 89 | if (opt$input_type == "dist") { 90 | m = as.dist(m) 91 | } 92 | 93 | # --- obs --- 94 | if (opt$input_type == "obs") { 95 | # apply the log if required 96 | if (opt$log10) {m = log10(replace(m, is.na(m), 0) + opt$pseudocount)} 97 | m = t(m) 98 | } 99 | 100 | 101 | ## Read the color palette 102 | if (!is.null(opt$palette)) {my_palette = read.table(opt$palette, h=F, comment.char="%", sep="\t")$V1} 103 | 104 | 105 | # Read the shapes 106 | if (!is.null(opt$shapes)) { 107 | my_shapes = read.table(opt$shapes, h=F, comment.char="%")$V1 108 | } 109 | 110 | if (opt$method == "tSNE") { 111 | tsne = Rtsne(m, max_iter=opt$iter, perplexity=opt$perplexity, pca=FALSE, is.distance=(opt$input_type=="dist")) 112 | x = tsne$Y[,1] 113 | y = tsne$Y[,2] 114 | 115 | # Plot the cost at each iteration 116 | cost_df = data.frame(iter=seq_along(tsne$itercosts), cost=tsne$itercosts) 117 | gp = ggplot(cost_df, aes(x=iter, y=cost)) + geom_line() 118 | ggsave(sprintf("%s.cost.pdf", opt$output), h=5, w=7) 119 | 120 | } 121 | 122 | if (opt$method == "MDS") { 123 | fit <- cmdscale(m, eig=TRUE, k=2) 124 | x <- fit$points[,1] 125 | y <- fit$points[,2] 126 | } 127 | 128 | 129 | # *************** 130 | # Plot 131 | # *************** 132 | 133 | map_df = data.frame(x=x, y=y, id=id) 134 | 135 | 136 | # HANDLING METADATA 137 | 138 | if (!is.null(opt$metadata)){ 139 | mdata = read.table(opt$metadata, h=T, sep="\t", row.names=NULL, quote="", check.names=F); 140 | mdata[,opt$merge_mdata_on] <- gsub("[,-]", ".", mdata[,opt$merge_mdata_on]) 141 | mdata <- unique(mdata[,c(opt$merge_mdata_on, opt$color_by, opt$shape_by)]) 142 | map_df <- merge(map_df, mdata, by.x="id", by.y=opt$merge_mdata_on) 143 | } 144 | 145 | 146 | # PLOTTING 147 | 148 | theme_set(theme_bw(base_size=opt$base_size)) 149 | theme_update( 150 | legend.title = element_text(size=opt$base_size), 151 | legend.key = element_blank(), 152 | panel.grid = element_blank() 153 | ) 154 | 155 | 156 | geom_params = list() 157 | mapping = list() 158 | 159 | geom_params$size = 4 160 | 161 | if (is.null(opt$color_by)) { 162 | geom_params$color = "black" 163 | } else { 164 | mapping = modifyList(mapping, aes_string(color=paste("`", opt$color_by, "`", sep=""))) 165 | } 166 | 167 | if (is.null(opt$shape_by)) { 168 | geom_params$shape = 19 169 | } else { 170 | mapping = modifyList(mapping, aes_string(shape=paste("`", opt$shape_by, "`", sep=""))) 171 | } 172 | 173 | class(mapping) <- "uneval" 174 | 175 | pointLayer <- layer( 176 | geom = "point", 177 | # geom_params = geom_params, 178 | params = geom_params, 179 | stat = "identity", 180 | position = "identity", 181 | mapping = mapping 182 | ) 183 | 184 | 185 | gp = ggplot(map_df, aes(x=x, y=y)) + pointLayer 186 | gp = gp + labs(x=paste(opt$method, "1", sep=" "), y=paste(opt$method, "2", sep=" "), title="") 187 | if (!is.null(opt$palette)) { 188 | if (is.discrete(map_df[,opt$color_by])) { 189 | gp = gp + scale_color_manual(values=my_palette) 190 | } else { 191 | gp = gp + scale_color_gradientn(colours=my_palette) 192 | } 193 | } 194 | if (!is.null(opt$shapes)) { 195 | gp = gp + scale_shape_manual(name=opt$shape_by, values=my_shapes); 196 | } 197 | 198 | 199 | ggsave(sprintf("%s.map.pdf", opt$output), w=opt$width, h=opt$height) 200 | 201 | q(save='no') 202 | -------------------------------------------------------------------------------- /gglines.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors=F) 4 | 5 | ################## 6 | # OPTION PARSING 7 | ################## 8 | 9 | suppressPackageStartupMessages(library("optparse")) 10 | 11 | option_list <- list( 12 | 13 | make_option(c("-i", "--input"), type="character", default='stdin', 14 | help="tab-separated file. Can be stdin [default=%default]"), 15 | 16 | make_option(c("-o", "--output"), default="profile.pdf", 17 | help="output file name with extension [default=%default]"), 18 | 19 | make_option(c("--header"), action="store_true", default=FALSE, 20 | help="The file has header [default=%default]"), 21 | 22 | make_option(c("--group"), type="numeric", 23 | help="Column index with the group factor [default=%default]"), 24 | 25 | make_option(c("-c", "--color_by"), type='numeric', 26 | help="Column index with the color factor. Leave empty for no color"), 27 | 28 | make_option(c("-L", "--linetype_by"), type='numeric', 29 | help="Column index of the linetype factor, Leave empty for no linetype"), 30 | 31 | make_option(c("-x", "--x_col"), type='numeric', default=1, 32 | help="Column index of the x axis [default=%default]"), 33 | 34 | make_option(c("-y", "--y_col"), type='numeric', default=2, 35 | help="Column index of the y axis [default=%default]"), 36 | 37 | make_option(c("-V", "--vertical_lines"), type='character', 38 | help="specify where you want the vertical lines [default=%default]"), 39 | 40 | make_option(c("-a", "--alpha"), type="numeric", default=1, 41 | help="Set transparency [default=%default]"), 42 | 43 | #make_option(c("-f", "--facet"), type="integer", help="column index to facet"), 44 | 45 | make_option(c("-t", "--title"), type="character", 46 | help="Main title for the plot. Leave emtpy for no title"), 47 | 48 | make_option(c("--y_title"), default="norm_read_density", 49 | help="title for the y-axis [default=%default]"), 50 | 51 | make_option(c("--x_title"), default="position", 52 | help="title for the x-axis [default=%default]"), 53 | 54 | make_option(c("--scale_x_log10"), action="store_true", default=FALSE, 55 | help="Change the x axis to log10 [default=%default]"), 56 | 57 | make_option(c("--x_limits"), default=NULL, 58 | help="Limits for the x axis [default=%default]"), 59 | 60 | make_option(c("-P", "--palette"), 61 | help='File with colors for the lines. Leave empty to use even color spacing'), 62 | 63 | make_option(c("-H", "--height"), default=5, 64 | help="Height of the plot in inches [default=%default]"), 65 | 66 | make_option(c("-W", "--width"), default=7, 67 | help="Width of the plot in inches [default=%default]"), 68 | 69 | make_option(c("-v", "--verbose"), action='store_true', default=FALSE, 70 | help="Verbose output [default=%default]") 71 | 72 | #make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"), 73 | ) 74 | 75 | parser <- OptionParser( 76 | usage = "%prog [options] file", 77 | option_list = option_list, 78 | description = "From a column file, plot a column vs another as lines" 79 | ) 80 | arguments <- parse_args(parser, positional_arguments = TRUE) 81 | opt <- arguments$options 82 | if (opt$verbose) {print(opt)} 83 | 84 | 85 | ##------------ 86 | ## LIBRARIES 87 | ##------------ 88 | 89 | if (opt$verbose) {cat("Loading libraries... ")} 90 | suppressPackageStartupMessages(library(reshape2)) 91 | suppressPackageStartupMessages(library(ggplot2)) 92 | #suppressPackageStartupMessages(library(plyr)) 93 | if (opt$verbose) {cat("DONE\n\n")} 94 | 95 | 96 | # ======== 97 | # BEGIN 98 | # ======== 99 | 100 | 101 | 102 | # Read data 103 | if (opt$input == "stdin") {input=file('stdin')} else {input=opt$input} 104 | m = read.table(input, h=opt$header, sep="\t") 105 | if(opt$verbose) {print(head(m))} 106 | 107 | # Read palette 108 | if (!is.null(opt$palette)) { 109 | palette = read.table(opt$palette, h=FALSE, comment.char='%')$V1 110 | } 111 | 112 | # Read coloring factor 113 | if (!is.null(opt$color_by)) { 114 | if (ncol(m)> color << 148 | 149 | if (is.null(opt$color_by)) { 150 | geom_params$color = "black" 151 | } else { 152 | mapping = modifyList(mapping, aes_string(color=colnames(m)[opt$color_by])) 153 | } 154 | 155 | 156 | # >> linetype << 157 | 158 | if (is.null(opt$linetype_by)) { 159 | geom_params$linetype = 1 160 | } else { 161 | mapping = modifyList(mapping, aes_string(linetype=colnames(m)[opt$linetype_by])) 162 | } 163 | 164 | 165 | # >> group << 166 | 167 | if (!is.null(opt$group)) { 168 | group = colnames(m)[opt$group] 169 | mapping = modifyList(mapping, aes_string(group=colnames(m)[opt$group])) 170 | m[,group] <- gsub("\\\\n", "\n", m[,group]) 171 | } 172 | 173 | geom_params$alpha = alpha 174 | geom_params$size = 1 175 | 176 | class(mapping) <- "uneval" 177 | 178 | # ----------- 179 | # lineLayer 180 | # ----------- 181 | 182 | lineLayer <- layer( 183 | geom = "line", 184 | params = geom_params, 185 | # geom_params = geom_params, 186 | mapping = mapping, 187 | stat = "identity", 188 | position = "identity" 189 | ) 190 | 191 | 192 | # plot 193 | gp = ggplot(m, aes_string(x=x, y=y)) 194 | 195 | # Add line layer 196 | gp = gp + lineLayer 197 | 198 | # Add the labels 199 | gp = gp + labs(title=opt$title, y=opt$y_title, x=opt$x_title) 200 | 201 | # Color scale 202 | if (!is.null(opt$palette)) { 203 | gp = gp + scale_color_manual(values=palette) 204 | } else { 205 | gp = gp + scale_color_hue() 206 | } 207 | 208 | # Vertical lines 209 | if (!is.null(opt$vertical_lines)) { 210 | vertical_lines = as.numeric(strsplit(opt$vertical_lines, ",")[[1]]) 211 | gp = gp + geom_vline(xintercept=vertical_lines, color="grey", linetype="longdash") 212 | } 213 | 214 | # Scale x log10 215 | if (opt$scale_x_log10) { 216 | gp = gp + scale_x_log10() 217 | } 218 | 219 | # Read x limits 220 | if (!is.null(opt$x_limits)) { 221 | x_lim = as.numeric(strsplit(opt$x_limits, ",")[[1]]) 222 | gp = gp + xlim(x_lim) 223 | } 224 | 225 | ggsave(opt$output, h=opt$height, w=opt$width) 226 | 227 | # EXIT 228 | quit(save='no') 229 | -------------------------------------------------------------------------------- /matrix_funct.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | cat("NOTE: Treat Inf and -Inf as NAs\n\n", file=stderr()) 4 | 5 | suppressPackageStartupMessages(library("optparse")) 6 | 7 | options(stringsAsFactors=F) 8 | 9 | 10 | # ================== 11 | # DEBUG OPTIONS 12 | # ================== 13 | 14 | opt = list() 15 | opt$input_matrix = "/users/rg/abreschi/Documents/human-mouse/antisense-transcription/s_as_read_counts/corr-mean/log2_FALSE.pdcn_1e-04.NAs_TRUE.human.mouse.S_AS_ratio.tsv" 16 | opt$metadata = '/users/rg/abreschi/Documents/human-mouse/paper-sample-clustering/merged_RNA_dashboard_files.crg.tsv' 17 | opt$mean_by="organism" 18 | opt$output = "s_as_ratio" 19 | opt$func = "mean" 20 | opt$not_na = 0.7 21 | opt$log=FALSE 22 | opt$replace_na = FALSE 23 | 24 | ################## 25 | # OPTION PARSING 26 | ################## 27 | 28 | option_list <- list( 29 | 30 | make_option(c("-i", "--input_matrix"), default="stdin", 31 | help="the matrix you want to analyze. Stdin to read from stdin"), 32 | 33 | make_option(c("--dt"), action="store_true", default=F, 34 | help="read the matrix as data.table"), 35 | 36 | make_option(c("-l", "--log"), action="store_true", default=FALSE, 37 | help="apply the log10, before applying the function [default=%default]"), 38 | 39 | make_option(c("-k", "--replace_na"), action="store_true", default=FALSE, 40 | help="use this if you want NAs to be replaced by 0 [default=%default]"), 41 | 42 | make_option(c("-p", "--pseudocount"), type="double", default=1e-04, 43 | help="specify a pseudocount for the log [default=%default]"), 44 | 45 | make_option(c("-m", "--metadata"), 46 | help="tsv file with the metadata"), 47 | 48 | make_option(c("-s", "--mean_by"), 49 | help="choose one or multiple attributes you want to average by"), 50 | 51 | make_option(c("-o", "--output"), default="out.tsv", 52 | help="Output file name. stdout for printing on standard output [default=%default]"), 53 | 54 | make_option(c("-f", "--func"), default="mean", 55 | help="choose the function , , , , , , , [default=%default]"), 56 | 57 | make_option(c("-C", "--byColumns"), action="store_true", default=FALSE, 58 | help="apply the function to the columns, instead of rows [default=%default]"), 59 | 60 | make_option(c("-n", "--not_na"), type="double", default=1, 61 | help="fraction of not NA values in the vector for the mean. If NAs are replaced they are not counted [default=%default]"), 62 | 63 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 64 | help="verbose output") 65 | ) 66 | 67 | parser <- OptionParser( 68 | usage = "%prog [options] file", 69 | option_list=option_list, 70 | description = "\nExecute a function on matrix rows or columns" 71 | ) 72 | arguments <- parse_args(parser, positional_arguments = TRUE) 73 | opt <- arguments$options 74 | if (opt$verbose) {print(opt)} 75 | 76 | 77 | ##------------ 78 | ## LIBRARIES 79 | ##------------ 80 | 81 | if (opt$verbose) {cat("Loading libraries... ", file=stderr())} 82 | suppressPackageStartupMessages(library(reshape2)) 83 | suppressPackageStartupMessages(library(data.table)) 84 | source("./functions.R") 85 | if (opt$verbose) {cat("DONE\n\n", file=stderr())} 86 | 87 | 88 | # ========================================== 89 | # Function for loading Rdata 90 | # ========================================== 91 | 92 | load_obj <- function(f) 93 | { 94 | env <- new.env() 95 | nm <- load(f, env)[1] 96 | env[[nm]] 97 | } 98 | 99 | ############### 100 | # BEGIN 101 | ############### 102 | 103 | # read table 104 | if (opt$verbose) {cat(sprintf("%s: ", Sys.time()), "Reading matrix... ")} 105 | if (opt$dt) { 106 | inF = ifelse(opt$input_matrix == "stdin", "file:///dev/stdin", opt$input_matrix) 107 | m = fread(inF) 108 | row.names = m[,1][[1]] 109 | m = m[,-1] 110 | } else { 111 | if (opt$input_matrix == "stdin") { 112 | m = read.table(file("stdin"), h=T) 113 | } else { 114 | m <- try(load_obj(opt$input_matrix), silent=T) 115 | if (class(m) == "try-error") {m <- read.table(opt$input_matrix)} 116 | } 117 | row.names = rownames(m) 118 | } 119 | if (opt$verbose) {cat("DONE\n")} 120 | 121 | # apply the log if required 122 | if (opt$replace_na) {m <- replace(m, is.na(m), 0)} 123 | if (opt$log) {m = log10(m + opt$pseudocount)} 124 | 125 | # Set the result to NA when too many missing values are present 126 | func = function(x) { 127 | ifelse((sum(!is.na(x)) < (opt$not_na*length(x))), NA, format(eval(parse(text=opt$func))(x,na.rm=T), digits=5)) 128 | } 129 | 130 | 131 | # Apply the function by columns 132 | if (opt$byColumns) { 133 | if (!is.null(opt$metadata)) { 134 | mdata <- read.table(opt$metadata, h=T, sep="\t", quote="", check.names=FALSE) 135 | } 136 | if (!is.null(opt$mean_by)) { 137 | mean_by = strsplit(opt$mean_by, ",")[[1]] 138 | m[,ncol(m)+1] <- mdata[,mean_by][match(rownames(m), mdata[,"gene"])] 139 | colnames(m)[ncol(m)] <- mean_by 140 | form = as.formula(sprintf(".~%s", mean_by)) 141 | new_m <- aggregate(form, m, func) 142 | } else { 143 | new_m = data.frame(id=colnames(m), fun=apply(m, 2, func)) 144 | 145 | } 146 | 147 | } else { 148 | 149 | # apply the function to the whole matrix if no value is provided 150 | if (is.null(opt$mean_by)) { 151 | new_m = setNames(data.table("id"=row.names, "func"=apply(m, 1, func)), c("id", opt$func)) 152 | } else { 153 | 154 | # apply the function to the levels of the specified factors 155 | mean_by = strsplit(opt$mean_by, ",")[[1]] 156 | df = melt(as.matrix(m), varnames = c("gene_index", "labExpId")) 157 | 158 | # read metadata and merge with data.frame if needed 159 | if (!is.null(opt$metadata)) { 160 | mdata <- read.table(opt$metadata, h=T, row.names=NULL, sep="\t", quote="", check.names=FALSE) 161 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub("[-,]", ".", x)) 162 | mdata = subset(mdata, labExpId %in% colnames(m)) 163 | df = merge(df, unique(mdata[c("labExpId", mean_by)]), by = "labExpId") 164 | } 165 | 166 | df$value[abs(df$value)==Inf] <- NA 167 | aggr = aggregate(as.formula(sprintf("value~gene_index+`%s`", paste(mean_by,collapse="+"))), df, func, na.action="na.pass") 168 | aggr = dcast(aggr, as.formula(sprintf("gene_index~`%s`", paste(mean_by,collapse="+")))) 169 | new_m = aggr 170 | 171 | # if (length(char_cols)==0) {new_m = cbind(gene=genes, new_m)} 172 | # if (length(char_cols)!=0) {new_m = merge(genes, new_m, by.y="gene_index", by.x="row.names")[,-1]} 173 | } 174 | } 175 | 176 | 177 | #-------------- 178 | # print output 179 | #-------------- 180 | 181 | output = ifelse(opt$output == "stdout", "", opt$output) 182 | write.table(new_m, file = output, quote=F, sep='\t', row.names=F) 183 | 184 | q(save='no') 185 | 186 | -------------------------------------------------------------------------------- /rpkm_fraction.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | ##------------ 5 | ## LIBRARIES 6 | ##------------ 7 | suppressPackageStartupMessages(library(reshape2)) 8 | suppressPackageStartupMessages(library(ggplot2)) 9 | suppressPackageStartupMessages(library("optparse")) 10 | suppressPackageStartupMessages(library(plyr)) 11 | 12 | 13 | options(stringsAsFactors=F) 14 | 15 | ################## 16 | # OPTION PARSING 17 | ################## 18 | 19 | 20 | option_list <- list( 21 | make_option(c("-i", "--input_matrix"), default="stdin", help="the matrix you want to analyze [default=%default]"), 22 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"), 23 | make_option(c("-o", "--output"), help="additional tags for otuput", default="out"), 24 | make_option(c("-c", "--color_by"), help="choose the color you want to color by. Leave empty for no color", type='character'), 25 | make_option(c("-y", "--linetype_by"), help="choose the factor you want the linetype by. Leave empty for no linetype", type="character"), 26 | make_option(c("-f", "--file_sel"), help="list of elements of which computing the proportion at each point"), 27 | make_option(c("--out_file"), help="store the coordinates in a file [default=%default]"), 28 | make_option(c("-P", "--palette"), help="file with the colors"), 29 | make_option(c("-t", "--tags"), help="choose the factor by which grouping the lines [default=%default]", default="labExpId") 30 | ) 31 | 32 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 33 | arguments <- parse_args(parser, positional_arguments = TRUE) 34 | opt <- arguments$options 35 | #print(opt) 36 | 37 | 38 | 39 | ##--------------------## 40 | ## CLUSTERING SAMPLES ## 41 | ##--------------------## 42 | output = sprintf("rpkm_fraction.%s", opt$output) 43 | 44 | # 1. read the matrix from the command line 45 | if (opt$input_matrix == "stdin") {inF = file("stdin")} else {inF = opt$input_matrix} 46 | m = read.table(inF, h=T, sep="\t") 47 | 48 | 49 | # Read color palette if present 50 | if (!is.null(opt$palette)) {palette = read.table(opt$palette, h=F, comment.char="%", sep="\t")$V1} 51 | 52 | # remove potential gene id columns 53 | char_cols <- which(sapply(m, class) == 'character') 54 | sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols) 55 | if (length(char_cols) == 0) {genes = rownames(m)} 56 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]} 57 | 58 | 59 | # 2. sort all the values for each column 60 | sortm <- apply(m, 2, sort, na.last=T, d=T) 61 | #sortm <- sortm[1:min(which(sortm==0))-1,] 62 | 63 | # 3. calculate cumulative sum 64 | cumm <- as.data.frame(apply(sortm, 2, function(x) cumsum(x/sum(x,na.rm=T)))) 65 | 66 | # 4. read the metadata from the metadata file 67 | mdata = read.table(opt$metadata, h=T, sep='\t') 68 | mdata[,"labExpId"] <- sapply(mdata[,"labExpId"], function(x) gsub(",", ".", x)) 69 | if (!is.null(opt$color_by)) {opt$color_by <- strsplit(opt$color_by, ",")[[1]]} 70 | 71 | # prepare data.frame for ggplot 72 | df = melt(cumm, variable.name = "labExpId", value.name="rpkm_fraction") 73 | fields = unique(c("labExpId", strsplit(opt$tags, ",")[[1]], opt$color_by, opt$linetype_by)) 74 | df = merge(unique(mdata[fields]), df, by="labExpId") 75 | df$labels = apply(df[strsplit(opt$tags, ",")[[1]]], 1, paste, collapse="_") 76 | # add a column with the x index 77 | df = ddply(df, .(labels), transform, x=seq_along(labels), y=sort(rpkm_fraction, na.last=T, d=F)) 78 | 79 | # ===== Add an extra dataframe with the percentage of genes after the union ===== 80 | 81 | if (!is.null(opt$file_sel)) { 82 | 83 | prop_df = data.frame() 84 | sel = read.table(opt$file_sel, h=F)[,1] 85 | # Order the data 86 | orderm = apply(-m, 2, rank, na.last=T, ties.method='first') 87 | # NB: orderm has lost the rownames 88 | thresholds = c(1:10 %o% 10^(0:4)) 89 | thresholds = thresholds[which(thresholds < nrow(m))] 90 | 91 | props = sapply(thresholds, function(thr) { 92 | u = rownames(m)[which(rowSums(orderm <= thr) > 0)]; 93 | length(intersect(u, sel))/length(u) 94 | } 95 | ) 96 | prop_df = data.frame(x=thresholds, y=props) 97 | } 98 | 99 | 100 | 101 | 102 | 103 | ############### 104 | # OUTPUT 105 | ############### 106 | 107 | # coordinate file 108 | 109 | if (!is.null(opt$out_file)) { 110 | write.table(df, file=opt$out_file, row.names=FALSE, quote=FALSE, sep="\t") 111 | } 112 | 113 | # plotting... 114 | base_size=16 115 | legend_nrow = 18 116 | #legend_nrow = 4 117 | theme_set(theme_bw(base_size=base_size)) 118 | legend_text_inch = theme_get()$legend.text$size * base_size / 72.72 119 | add_w = legend_text_inch * max(nchar(df$labels)) * ceiling(length(levels(as.factor(df$labels)))/legend_nrow) 120 | 121 | geom_params = list() 122 | geom_params$size = opt$size 123 | geom_params$alpha = opt$alpha 124 | 125 | 126 | mapping = list() 127 | mapping <- modifyList(mapping, aes(x=x, y=y, group=labels)) 128 | 129 | if (!is.null(opt$color_by)) { 130 | gp_color_by = interaction(df[opt$color_by]) 131 | mapping = modifyList(mapping, aes(color=gp_color_by)) 132 | } 133 | 134 | if (!is.null(opt$linetype_by)) { 135 | gp_linetype_by = interaction(df[opt$linetype_by]) 136 | mapping = modifyList(mapping, aes(linetype=gp_linetype_by)) 137 | } 138 | 139 | 140 | class(mapping) <- "uneval" 141 | 142 | lineLayer <- layer( 143 | geom = "line", 144 | # geom_params = geom_params, 145 | params = geom_params, 146 | mapping = mapping, 147 | stat = "identity", 148 | position = "identity" 149 | ) 150 | 151 | 152 | 153 | # GGPLOT 154 | 155 | gp = ggplot(df) + lineLayer 156 | 157 | #if (!is.null(opt$color_by)) {gp_color_by=interaction(df[opt$color_by])} else {gp_color_by=NULL} 158 | #if (!is.null(opt$linetype_by)) {gp_linetype_by=interaction(df[opt$linetype_by])} else {gp_linetype_by=NULL} 159 | #gp = gp + geom_line(aes(color=gp_color_by, linetype=gp_linetype_by, group=labels)) 160 | 161 | #gp = gp + scale_color_hue(name=paste(opt$color_by, collapse=".")) 162 | if (!is.null(opt$color_by)) { 163 | if (!is.null(opt$palette)) { 164 | gp = gp + scale_color_manual(values=palette) 165 | } else { 166 | gp = gp + scale_color_hue() 167 | } 168 | } 169 | 170 | gp = gp + labs(y="Fraction of gene rpkm", x='Number of genes') 171 | gp = gp + scale_linetype_manual(values=c(2,1)) 172 | gp = gp + guides(col = guide_legend(nrow = legend_nrow, title=opt$color_by)) 173 | gp = gp + scale_x_log10(expand=c(0,0)) 174 | gp = gp + scale_y_continuous(expand=c(0.01,0), limits=c(0,1)) 175 | gp = gp + annotation_logticks(sides="b") 176 | 177 | if (!is.null(opt$file_sel)) { 178 | gp = gp + geom_point(data=prop_df, aes(x,y), shape=18, size=2) 179 | gp = gp + geom_point(data=prop_df, aes(x,y), shape=18, size=1.7, color='yellow') 180 | } 181 | 182 | ggsave(sprintf("%s.pdf",output), h=5, w=6+add_w, title=output) 183 | 184 | q(save='no') 185 | -------------------------------------------------------------------------------- /scatterplot.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors=F) 4 | x_psd = 1e-03 5 | y_psd = 1e-03 6 | 7 | 8 | 9 | 10 | ################## 11 | # OPTION PARSING 12 | ################## 13 | suppressPackageStartupMessages(library("optparse")) 14 | 15 | 16 | option_list <- list( 17 | make_option(c("-i", "--input_matrix"), default="stdin", 18 | help="the matrix you want to analyze. \"stdin\" for stdin [default=%default]"), 19 | 20 | make_option(c("--header"), action="store_true", default=FALSE, help="The file has header [default=%default]"), 21 | 22 | make_option(c("-r", "--replace_NAs"), action="store_true", default=FALSE, 23 | help="Replace NAs with 0 [default=%default]"), 24 | 25 | make_option(c("-x", "--x_axis"), type='integer', default=1, 26 | help="the index (1-based) of the column you want on the x axis [default=%default]"), 27 | 28 | make_option(c("-y", "--y_axis"), type='integer', default=2, 29 | help="the index (1-based) of the column you want on the y axis [default=%default]"), 30 | 31 | make_option(c("-C", "--color_by"), type="integer", 32 | help="Index of the column by which to color the dots [default=%default]"), 33 | 34 | make_option(c("--color_as_factor"), action="store_true", default=FALSE, 35 | help="Convert the color_by column to factor [default=%default]"), 36 | 37 | make_option(c("-o", "--output_suffix"), help="output filename [default=%default]", default='scatterplot.out.pdf'), 38 | make_option(c("-t", "--type"), help=", , [default=%default]", default="tile"), 39 | make_option(c("-b", "--binwidth"), help="comma-separated values for binwidth x,y [default=%default]", default="1,1"), 40 | make_option(c("--x_log"), action="store_true", help="x values log10 transformed [default=%default]", default=FALSE), 41 | make_option(c("--y_log"), action="store_true", help="y values log10 transformed [default=%default]", default=FALSE), 42 | make_option(c("--x_psd"), help="pseudocount for x values [default=%default]", default=x_psd, type='double'), 43 | make_option(c("--y_psd"), help="pseudocount for y values [default=%default]", default=y_psd, type='double'), 44 | make_option("--x_title", help="write a title for x axis"), 45 | make_option("--y_title", help="write a title for y axis"), 46 | make_option("--legend_title", help="write a title for the legend [default=%default]", default="count"), 47 | 48 | make_option(c("--highlight"), 49 | help="a list of element you want to overlay as extra dots"), 50 | 51 | make_option(c("--id_col"), default=1, 52 | help="column with ids"), 53 | 54 | make_option("--title", default="", 55 | help="write a title for the plot [default=%default]"), 56 | 57 | make_option("--diagonal", action="store_true", default=FALSE, 58 | help="plot the diagonal [default=%default]"), 59 | 60 | make_option(c("-R", "--linear_regression"), action="store_true", default=FALSE, 61 | help="plot the regression line [default=%default]"), 62 | 63 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 64 | help="verbose output [default=%default]") 65 | ) 66 | 67 | 68 | parser <- OptionParser( 69 | usage = "%prog [options] file", 70 | option_list=option_list, 71 | description = "\n\nPlot a density scatterplot" 72 | ) 73 | 74 | arguments <- parse_args(parser, positional_arguments = TRUE) 75 | opt <- arguments$options 76 | if (opt$verbose) {print(opt)} 77 | 78 | ##------------ 79 | ## LIBRARIES 80 | ##------------ 81 | 82 | if (opt$verbose) {cat("Loading libraries... ")} 83 | suppressPackageStartupMessages(library(reshape2)) 84 | suppressPackageStartupMessages(library(ggplot2)) 85 | suppressPackageStartupMessages(library(plyr)) 86 | if (opt$verbose) {cat("DONE\n\n")} 87 | 88 | 89 | 90 | 91 | ################### 92 | # BEGIN # 93 | ################### 94 | 95 | if (opt$input_matrix == "stdin") { 96 | m = read.table(file("stdin"), h=opt$header, sep="\t") 97 | } else { 98 | m = read.table(opt$input_matrix, h=opt$header, sep="\t") 99 | } 100 | 101 | # Replace NAs with 0 if needed 102 | if (opt$replace_NAs) {m <- replace(m, is.na(m), 0)} 103 | 104 | if (opt$x_log) {m[,opt$x_axis] <- m[,opt$x_axis] + opt$x_psd} 105 | if (opt$y_log) {m[,opt$y_axis] <- m[,opt$y_axis] + opt$y_psd} 106 | 107 | df = m 108 | 109 | # Pearson correlation coefficient 110 | pearson = round(cor(sapply(df[,opt$x_axis], function(x) ifelse(opt$x_log, log10(x), x)), 111 | sapply(df[,opt$y_axis], function(x) ifelse(opt$y_log, log10(x), x)), method='p', use='p'), 2) 112 | spearman = round(cor(sapply(df[,opt$x_axis], function(x) ifelse(opt$x_log, log10(x), x)), 113 | sapply(df[,opt$y_axis], function(x) ifelse(opt$y_log, log10(x), x)), method='s', use='p'), 2) 114 | 115 | if (!is.null(opt$color_by) && opt$color_as_factor) { 116 | df[, opt$color_by] <- as.factor(df[, opt$color_by]) 117 | } 118 | 119 | 120 | # PLOTTING ... 121 | 122 | theme_set(theme_bw(base_size=16)) 123 | 124 | bwidth = as.numeric(strsplit(opt$binwidth, ",")[[1]]) 125 | plot_title = sprintf("%s (p_r=%s; s_r=%s)", opt$title, pearson, spearman) 126 | x_col = colnames(df[opt$x_axis]) 127 | y_col = colnames(df[opt$y_axis]) 128 | 129 | # Read the subset of elements you want to highlight 130 | if (!is.null(opt$highlight)) { 131 | highlight = read.table(opt$highlight, h=F)$V1 132 | df_h = df[ df[,opt$id_col] %in% highlight, ] 133 | if (opt$verbose) { 134 | print(head(highlight)) 135 | print(head(df_h)) 136 | } 137 | } 138 | 139 | 140 | # Read the axis titles 141 | if (is.null(opt$x_title)) {x_title = x_col} else {x_title = opt$x_title} 142 | if (is.null(opt$y_title)) {y_title = y_col} else {y_title = opt$y_title} 143 | 144 | 145 | 146 | countBins <- c(0,1,2,5,10,25,50,75,100,500,Inf) 147 | 148 | gp = ggplot(df, aes_string(x=x_col, y=y_col)) 149 | 150 | if (opt$type == 'tile') { 151 | gp = gp + stat_bin2d(bins=100) 152 | gp = gp + scale_fill_gradientn(colours=terrain.colors(20), name=opt$legend_title) 153 | if (!is.null(opt$highlight)) { 154 | gp = gp + geom_point(data=df_h, aes_string(x=x_col, y=y_col)) 155 | } 156 | } 157 | 158 | if (opt$type == 'hex') { 159 | gp = gp + geom_hex(aes(fill=cut(..count.., c(0,1,2,5,10,25,50,75,100,500,Inf))), binwidth=bwidth) 160 | gp = gp + scale_fill_manual('counts', values=terrain.colors(length(countBins))) } 161 | 162 | if (opt$type == "scatter") { 163 | gp = gp + geom_point(aes_string(colour=colnames(df)[opt$color_by]), size=1) 164 | } 165 | 166 | 167 | gp = gp + labs(x=x_title, y=y_title, title=plot_title) 168 | 169 | # Add the diagonal line 170 | 171 | if (opt$diagonal) { 172 | gp = gp + geom_abline(intercept=0, slope=1, color="grey") 173 | } 174 | 175 | # Add the regression line 176 | 177 | if (opt$linear_regression) { 178 | 179 | if (opt$verbose) {print(head(df))} 180 | if (opt$x_log) { 181 | x_col = sprintf("log10(%s)", x_col) 182 | } 183 | if (opt$y_log) { 184 | y_col = sprintf("log10(%s)", y_col) 185 | } 186 | formula = as.formula(sprintf("%s~%s", y_col, x_col)) 187 | coeff = lm(formula, df)$coefficients 188 | gp = gp + geom_abline(intercept=coeff[1], slope=coeff[2]) 189 | } 190 | 191 | 192 | # Change to log scale 193 | 194 | if (opt$x_log) {gp = gp + scale_x_log10()} 195 | if (opt$y_log) {gp = gp + scale_y_log10()} 196 | 197 | ggsave(opt$output, h=5, w=6) 198 | 199 | 200 | 201 | 202 | 203 | q(save='no') 204 | 205 | -------------------------------------------------------------------------------- /plot.network.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | opt = list() 5 | opt$input = "top500.sd.RPKM.glasso.tsv" 6 | opt$node_size = 2 7 | 8 | options(stringsAsFactors=FALSE) 9 | 10 | set.seed(123) 11 | 12 | ################## 13 | # OPTION PARSING 14 | ################## 15 | 16 | suppressPackageStartupMessages(library("optparse")) 17 | 18 | option_list <- list( 19 | 20 | make_option(c("-i", "--input"), default="stdin", 21 | help="File or stdin. Columns are node1, node2, weigth [default=%default]"), 22 | 23 | make_option(c("-o", "--output"), default="network.pdf", 24 | help="Output file name [default=%default]"), 25 | 26 | make_option(c("--nodes"), 27 | help="File with node attributes, no header"), 28 | 29 | make_option(c("--node_color"), type="integer", 30 | help="Index of the node color"), 31 | 32 | make_option(c("--label"), 33 | help="Leave empty for using the node names. \"none\" for no label."), 34 | 35 | make_option(c("--label_color"), type="integer", 36 | help="Index of the label color"), 37 | 38 | make_option(c("--node_frame_color"), type="integer", 39 | help="Index of the node frame color"), 40 | 41 | make_option(c("--node_size"), default=2, 42 | help="Size of the nodes [default=%default]"), 43 | 44 | make_option(c("--node_shape"), type='integer', 45 | help="Index of the node shape"), 46 | 47 | make_option(c("--node_palette"), default="/users/rg/abreschi/R/palettes/rainbow.15.txt", 48 | help="File with colorname in RGB format [default=%default]"), 49 | 50 | make_option(c("--node_frame_palette"), default="/users/rg/abreschi/R/palettes/rainbow.4.txt", 51 | help="File with colorname in RGB format [default=%default]"), 52 | 53 | make_option(c("--label_cex"), default=1, type="double", 54 | help="Size of the labels in cex [default=%default]"), 55 | 56 | make_option(c("--normalize"), action="store_true", default=FALSE, 57 | help="Normalize to have all 1s in the diagonal [default=%default]"), 58 | 59 | make_option(c("--diag"), action="store_true", default=FALSE, 60 | help="Report also edges to and from the same node [default=%default]"), 61 | 62 | make_option(c("--directed"), action="store_true", default=FALSE, 63 | help="Is the graph directed? [default=%default]"), 64 | 65 | make_option(c("-H", "--height"), default=9, 66 | help="Height of the plot in inches [default=%default]"), 67 | 68 | make_option(c("-W", "--width"), default=9, 69 | help="Width of the plot in inches [default=%default]"), 70 | 71 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 72 | help="if you want more output [default=%default]") 73 | ) 74 | 75 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 76 | arguments <- parse_args(parser, positional_arguments = TRUE) 77 | opt <- arguments$options 78 | if (opt$verbose) {print(opt)} 79 | 80 | 81 | suppressPackageStartupMessages(library(reshape2)) 82 | suppressPackageStartupMessages(library(igraph)) 83 | 84 | 85 | ################################################################# 86 | # triangle vertex shape 87 | mytriangle <- function(coords, v=NULL, params) { 88 | vertex.color <- params("vertex", "color") 89 | if (length(vertex.color) != 1 && !is.null(v)) { 90 | vertex.color <- vertex.color[v] 91 | } 92 | vertex.size <- 1/200 * params("vertex", "size") 93 | if (length(vertex.size) != 1 && !is.null(v)) { 94 | vertex.size <- vertex.size[v] 95 | } 96 | 97 | symbols(x=coords[,1], y=coords[,2], bg=vertex.color, 98 | stars=cbind(vertex.size, vertex.size, vertex.size), 99 | add=TRUE, inches=FALSE) 100 | } 101 | # clips as a circle 102 | add.vertex.shape("triangle", clip=vertex.shapes("circle")$clip, 103 | plot=mytriangle) 104 | 105 | 106 | 107 | # BEGIN 108 | 109 | # Read input 110 | if (opt$input == "stdin") { 111 | m = read.table(file("stdin"), h=F) 112 | } else { 113 | m = read.table(opt$input, h=F) 114 | } 115 | 116 | node_palette = read.table(opt$node_palette, h=F, comment.char="%")$V1 117 | node_frame_palette = read.table(opt$node_frame_palette, h=F, comment.char="%")$V1 118 | node_shape_palette = c("circle", "square", "csquare", "rectangle", "crectangle", "vrectangle", "none") 119 | 120 | # Creat graph 121 | g = graph.data.frame(m, directed=opt$directed) 122 | 123 | 124 | # Normalize if needed 125 | M = as.matrix(get.adjacency(g)) 126 | if (opt$normalize) { 127 | lambda = 1/sqrt(diag(M)) 128 | M = sweep(sweep(M, MARGIN=2, lambda, `*`), MARGIN=1, lambda, `*`) 129 | g = graph.adjacency(as.matrix(M), mode=c("max"), weighted=TRUE, diag=opt$diag) 130 | } 131 | 132 | 133 | # Get node attributes 134 | 135 | V(g)$color = rep("white", vcount(g)) 136 | V(g)$frame.color = rep("white", vcount(g)) 137 | V(g)$label.color = rep("black", vcount(g)) 138 | V(g)$shape = rep("circle", vcount(g)) 139 | V(g)$size = rep(opt$node_size, vcount(g)) 140 | V(g)$label.cex = rep(opt$label_cex, vcount(g)) 141 | 142 | 143 | if (!is.null(opt$nodes)) { 144 | node_attr = read.table(opt$nodes, h=F, quote="\"") 145 | # Select only metadata rows for nodes in the network 146 | node_attr = node_attr[node_attr[,1] %in% V(g)$name,] 147 | match_node = match(V(g)$name, node_attr[,1]) 148 | 149 | # Get node color 150 | if (!is.null(opt$node_color)) { 151 | node_colors = node_palette[as.factor(node_attr[,opt$node_color])] 152 | V(g)$color = node_colors[match_node] 153 | } 154 | 155 | # Get label color 156 | if (!is.null(opt$label_color)) { 157 | label_colors = palette[node_attr[,opt$label_color]] 158 | V(g)$label.color = label_colors[match_node] 159 | } 160 | 161 | # Get node frame color 162 | # if (!is.null(opt$node_frame_color)) { 163 | # node_frame_colors = node_frame_palette[as.factor(node_attr[,opt$node_frame_color])] 164 | # V(g)$frame.color = node_frame_colors[match_node] 165 | # } 166 | 167 | # Get label 168 | if (!is.null(opt$label) && opt$label != "none") { 169 | opt$label = as.integer(opt$label) 170 | node_labels = node_attr[,opt$label] 171 | V(g)$label = node_labels[match_node] 172 | } 173 | 174 | # Get node shape 175 | if (!is.null(opt$node_shape)) { 176 | node_shapes = node_shape_palette[as.factor(node_attr[,opt$node_shape])] 177 | V(g)$shape = node_shapes[match_node] 178 | } 179 | 180 | } 181 | 182 | 183 | vertex.label=NULL 184 | if (opt$label == "none") {V(g)$label = NA} 185 | 186 | #E(g)$label = m[,4] 187 | 188 | # PLOT 189 | 190 | pdf(opt$output, h=opt$height, w=opt$width) 191 | 192 | cat("PLOTTING...") 193 | 194 | plot( 195 | g, 196 | layout=layout.kamada.kawai, 197 | # layout=layout.reingold.tilford(g, root="1") 198 | # vertex.label=vertex.label, 199 | ) 200 | 201 | # Node color legend 202 | if (!is.null(opt$nodes) & !is.null(opt$node_color)) { 203 | labels = unique(node_attr[,opt$node_color]) 204 | legend( 205 | "topright", 206 | legend=labels, 207 | fill=node_colors[match(labels, node_attr[,opt$node_color])] 208 | ) 209 | } 210 | 211 | 212 | # Node frame color legend 213 | if (!is.null(opt$nodes) & !is.null(opt$node_frame_color)) { 214 | labels = unique(node_attr[,opt$node_frame_color]) 215 | legend( 216 | "topleft", 217 | legend=labels, 218 | fill="white", 219 | border=node_frame_colors[match(labels, node_attr[,opt$node_frame_color])] 220 | ) 221 | } 222 | 223 | cat(" DONE\n") 224 | 225 | dev.off() 226 | 227 | 228 | q(save='no') 229 | 230 | -------------------------------------------------------------------------------- /boxplot_expressed_isoforms.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library(reshape2)) 4 | suppressPackageStartupMessages(library(ggplot2)) 5 | suppressPackageStartupMessages(library("optparse")) 6 | 7 | options(stringsAsFactors=F) 8 | cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#000000", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 9 | 10 | 11 | ################## 12 | # OPTION PARSING 13 | ################## 14 | 15 | 16 | option_list <- list( 17 | 18 | make_option(c("-i", "--input_matrix"), 19 | help="the matrix you want to analyze"), 20 | 21 | make_option(c("-a", "--annotation"), 22 | help="two-column file with gene and tx ids, no header."), 23 | 24 | make_option(c("-o", "--output"), 25 | help="choose the name for the output file, WITHOUT extension"), 26 | 27 | make_option(c("-m", "--metadata"), 28 | help="tsv file with metadata on matrix experiment"), 29 | 30 | make_option(c("-f", "--fill_by"), 31 | help="choose what to fill by. Leave empty for no filling"), 32 | 33 | make_option(c("-c", "--color_by"), 34 | help="choose what to color by. Leave empty for no coloring") 35 | 36 | ) 37 | 38 | 39 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 40 | arguments <- parse_args(parser, positional_arguments = TRUE) 41 | opt <- arguments$options 42 | 43 | print(opt) 44 | 45 | #-------------------- 46 | # FUNCTIONS 47 | #-------------------- 48 | 49 | # FUNCTION 1 50 | # This function calculates the relative expression of the most expressed isoform of a gene in a given condition. 51 | # It returns NaN when the gene has no isoforms expressed 52 | maj_iso_rel_expr = function(x) {max(x,na.rm=T)/sum(x,na.rm=T)} 53 | 54 | # FUNCTION 2 55 | # This function calculates the Shannon entropy for expressed isoforms of a gene in a given condition 56 | # It returns NA when the gene has no isoforms expressed. 57 | # The function is taken from the script: 58 | source('~abreschi/R/functions.R') 59 | 60 | 61 | 62 | ########### 63 | ## HUMAN ## 64 | ########### 65 | 66 | 67 | # gene, tx annotation file (only protein coding gene, but all txs) 68 | human_gn_tx = read.table(opt$annotation, h=F, col.names=c('gene','tx')) 69 | 70 | # transcript matrix 71 | human_expr = read.table(opt$input_matrix, h=T) 72 | 73 | # read the metadata 74 | mdata = read.table(opt$metadata, h=T, sep="\t") 75 | mdata[,"labExpId"] <- gsub(",", ".", mdata[,"labExpId"]) 76 | 77 | # replace NAs (bad IDR) with zeros 78 | human_expr[is.na(human_expr)] <- 0 79 | 80 | # add the gene locus to each tx 81 | human_tx_gn_expr = merge(human_gn_tx, human_expr, by.x='tx', by.y='row.names') 82 | 83 | 84 | #### EXPRESSED ISOFORMS 85 | # count the number of expressed isoforms in each cell line 86 | human_expr_iso = aggregate(human_tx_gn_expr[,-(1:2)], list(gene=human_tx_gn_expr$gene), function(x) sum(!is.na(x)&x!=0)) 87 | # equivalent to: 88 | human_expr_iso_melt = melt(human_expr_iso, variable.name='sample_name', value.name='expr_iso') 89 | 90 | #### MAJOR ISOFORM EXPRESSION 91 | # calculate the relative epxression of the most expressed isoform for each gene in each sample 92 | human_rel_expr_maj = aggregate(human_tx_gn_expr[,-(1:2)], list(gene=human_tx_gn_expr$gene), maj_iso_rel_expr) 93 | human_rel_expr_maj_melt = melt(human_rel_expr_maj, variable.name='sample_name', value.name='rel_maj') 94 | 95 | #### ISOFORM ENTROPY 96 | # calculate the entropy of expressed isoforms for each gene in each sample 97 | human_entr_iso = aggregate(human_tx_gn_expr[,-(1:2)], list(gene=human_tx_gn_expr$gene), entropy) 98 | human_entr_iso_melt = melt(human_entr_iso, variable.name='sample_name', value.name='entr_iso') 99 | 100 | #### ANNOTATED ISOFORMS 101 | # count the number of annotated isoforms for each gene 102 | human_ann_iso = setNames(aggregate(tx~gene, human_tx_gn_expr, length), c("gene", "ann_isoforms")) 103 | 104 | #### MERGING 105 | # add the annotation information to the expressed isoforms 106 | human_iso_all = merge(human_expr_iso_melt, human_rel_expr_maj_melt, by=c('gene', 'sample_name')) 107 | human_iso_all = merge(human_iso_all, human_entr_iso_melt, by=c('gene', 'sample_name')) 108 | human_iso_all = merge(human_iso_all, human_ann_iso, by = 'gene') 109 | 110 | hs_mm_data_expr_ann_iso = human_iso_all 111 | hs_mm_data_expr_ann_iso = merge(unique(mdata[c("labExpId", opt$fill_by, opt$color_by)]), 112 | hs_mm_data_expr_ann_iso, by.x='labExpId', by.y='sample_name') 113 | write.table(hs_mm_data_expr_ann_iso, file=sprintf('%s.summary_isoform_expression.tsv',opt$output), sep='\t', quote=F, row.names=F) 114 | 115 | 116 | merged_labels = aggregate(gene~tx, aggregate(tx~gene, human_tx_gn_expr, length), length) 117 | 118 | ########## 119 | ## plot ## 120 | ########## 121 | 122 | 123 | # COMMENT: I have to add the number of genes in each boxplot 124 | 125 | theme_set(theme_bw(base_size=16)) 126 | 127 | pdf(sprintf("%s.pdf", opt$output), height=6, width=9) 128 | 129 | max_iso = 15 130 | 131 | data = subset(hs_mm_data_expr_ann_iso,ann_isoforms<=max_iso & expr_iso>0) 132 | gp = ggplot(data, aes(as.factor(ann_isoforms), expr_iso)) 133 | gp = gp + geom_boxplot(aes_string(fill=opt$fill_by, color=opt$color_by)) 134 | gp = gp + ylim(c(0, max_iso)) 135 | gp = gp + labs(y='Number of isoforms expressed per gene', x='Number of annotated isoforms per gene') 136 | gp = gp + geom_abline(linetype='dotted', size=2, color='grey') 137 | gp = gp + geom_text(data=subset(merged_labels, tx<=max_iso), aes(x = tx, y = 0, label = gene), angle=90) 138 | gp = gp + scale_fill_manual(values=cbbPalette) 139 | gp 140 | 141 | data = subset(hs_mm_data_expr_ann_iso,ann_isoforms<=max_iso) 142 | gp = ggplot(data, aes(as.factor(ann_isoforms), rel_maj)) 143 | gp = gp + geom_boxplot(aes_string(fill=opt$fill_by, color=opt$color_by)) 144 | gp = gp + ylim(c(0,1)) 145 | gp = gp + labs(y='Major isoform relative expression', x='Number of annotated isoforms per gene') 146 | gp = gp + geom_text(data=subset(merged_labels, tx<=max_iso), aes(x = tx, y = 0, label = gene), angle=90) 147 | gp = gp + scale_fill_manual(values=cbbPalette) 148 | gp = gp + scale_x_discrete(labels=merged_labels) + theme(axis.text.x = element_text(angle=90, vjust=0.5)) 149 | gp 150 | 151 | data = subset(hs_mm_data_expr_ann_iso,expr_iso<=15 & expr_iso>=2) 152 | gp = ggplot(data, aes(as.factor(expr_iso), entr_iso)) 153 | gp = gp + geom_boxplot(aes_string(fill=opt$fill_by, color=opt$color_by)) 154 | gp = gp + labs(y = 'Shannon entropy', x = 'Number of expressed isoforms per gene') 155 | gp = gp + geom_point(aes(x=as.factor(expr_iso), y = log(expr_iso+1)), color='red') 156 | gp = gp + scale_fill_manual(values=cbbPalette) 157 | gp 158 | 159 | data = subset(hs_mm_data_expr_ann_iso,ann_isoforms<=max_iso & ann_isoforms>=2) 160 | gp = ggplot(data, aes(as.factor(ann_isoforms), entr_iso)) 161 | gp = gp + geom_boxplot(aes_string(fill=opt$fill_by, color=opt$color_by)) 162 | #gp = gp + ylim(c(0, max_iso)) 163 | gp = gp + labs(y = 'Shannon entropy', x = 'Number of annotated isoforms per gene') 164 | gp = gp + geom_point(aes(x=as.factor(ann_isoforms), y = log(ann_isoforms+1)), color='red') 165 | gp = gp + geom_text(data=subset(merged_labels, tx<=max_iso), aes(x = tx, y = 0, label = gene), angle=90) 166 | gp = gp + scale_fill_manual(values=cbbPalette) 167 | gp 168 | 169 | dev.off() 170 | 171 | ggsave(sprintf("%s.jpeg", opt$output), h=6, w=9) 172 | 173 | -------------------------------------------------------------------------------- /scale_matrix.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | 4 | # This script is useful for: 5 | # normalizing samples by row scaling 6 | 7 | ##------------ 8 | ## LIBRARIES 9 | ##------------ 10 | suppressPackageStartupMessages(library("reshape2")) 11 | suppressPackageStartupMessages(library("ggplot2")) 12 | suppressPackageStartupMessages(library("optparse")) 13 | suppressPackageStartupMessages(library("scales")) 14 | 15 | options(stringsAsFactors=F) 16 | 17 | ################## 18 | # OPTION PARSING 19 | ################## 20 | 21 | option_list <- list( 22 | make_option(c("-i", "--input_matrix"), default="stdin", 23 | help="the matrix you want to analyze. \"stdin\" for reading from standard input [default=%default]"), 24 | 25 | make_option(c("-l", "--log10"), action="store_true", default=FALSE, 26 | help="apply the log before scaling [default=FALSE]"), 27 | 28 | make_option(c("-p", "--pseudocount"), type="double", default=0, 29 | help="specify a pseudocount for the log [default=%default]"), 30 | 31 | make_option(c("-r", "--row_first"), action="store_true", default=FALSE, 32 | help="scale first by rows then by columns"), 33 | 34 | make_option(c("-a", "--all"), action="store_true", default=FALSE, 35 | help="scale the whole matrix by overall mean and sd"), 36 | 37 | make_option(c("--range"), action="store_true", default=FALSE, 38 | help="Normalize so the values are between 0 and 1"), 39 | 40 | make_option(c("-k", "--keep_NA"), action="store_true", default=FALSE, 41 | help="NAs are not replaced by zero [default=%default]"), 42 | 43 | make_option(c("-C", "--center"), action="store_true", default=FALSE, 44 | help="subtract the mean [default=%default]"), 45 | 46 | make_option(c("-S", "--scale"), action="store_true", default=FALSE, 47 | help="divide by the standard deviation [default=%default]"), 48 | 49 | make_option(c("-n", "--n_iter"), type='integer', default=20, 50 | help="how many times to iterate. Choose 0 for one-dimension scaling [default=%default]"), 51 | 52 | make_option(c("-m", "--metadata"), 53 | help="tsv file with the metadata"), 54 | 55 | make_option(c("-s", "--scale_by"), 56 | help="choose one or multiple attributes you want to scale by"), 57 | 58 | make_option(c("-o", "--output"), default="stdout", 59 | help="output file name. \"stdout\" to redirect to stdout. [default=%default]"), 60 | 61 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 62 | help="verbose output. [default=%default]") 63 | ) 64 | 65 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) 66 | arguments <- parse_args(parser, positional_arguments = TRUE) 67 | opt <- arguments$options 68 | if (opt$verbose) {print(opt)} 69 | 70 | 71 | ############### 72 | # BEGIN 73 | ############### 74 | 75 | # read options 76 | inF = opt$input_matrix 77 | if (opt$input_matrix == "stdin") { 78 | inF = file("stdin") 79 | } 80 | m <- read.table(inF, h=T, sep="\t", check.names=F) 81 | 82 | # Remove character columns 83 | char_cols <- which(sapply(m, class) == 'character') 84 | if (opt$verbose) {sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols)} 85 | if (length(char_cols) == 0) {genes = rownames(m)} 86 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]} 87 | 88 | # replace or not NAs 89 | if (!opt$keep_NA) {m <- replace(m, is.na(m), 0)} 90 | 91 | # apply the log if required 92 | if (opt$log10) {m = log10(m + opt$pseudocount)} 93 | 94 | # Read the metadata 95 | if (!is.null(opt$metadata)) { 96 | mdata <- read.table(opt$metadata, h=T, row.names=NULL, sep="\t", comment.char="", quote="\"") 97 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x)) 98 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(":", ".", x)) 99 | mdata = subset(mdata, labExpId %in% colnames(m)) 100 | } 101 | 102 | 103 | 104 | 105 | ########## 106 | # EQUILIBRATION 107 | ########## 108 | 109 | # Comments on scale() function from R: 110 | # - if the vector has sd==0, all the scaled values will be NAs. 111 | # - if the vector contains NAs, scale omits them. 112 | 113 | # scaling by rows and by columns in this order for 20 times 114 | 115 | if (opt$row_first) { 116 | # scale first by rows then by columns iteratively 117 | equil = function(matr) { 118 | if (opt$n_iter == 0) {matr = t(scale(t(matr), center=opt$center, scale=opt$scale)) 119 | }else{ 120 | for (i in 1:opt$n_iter) { 121 | matr = t(scale(t(matr), center=opt$center, scale=opt$scale)) 122 | matr = scale(matr, center=opt$center, scale=opt$scale) 123 | } 124 | } 125 | return(matr) 126 | } 127 | }else{ 128 | # scale first by columns then by rows iteratively 129 | equil = function(matr) { 130 | if (opt$n_iter == 0) {matr = scale(matr, center=opt$center, scale=opt$scale) 131 | }else{ 132 | for (i in 1:opt$n_iter) { 133 | matr = scale(matr, center=opt$center, scale=opt$scale) 134 | matr = t(scale(t(matr), center=opt$center, scale=opt$scale)) 135 | } 136 | } 137 | return(matr) 138 | } 139 | } 140 | 141 | scale_all = function(new_m) { 142 | new_m = (new_m - mean(as.matrix(new_m), na.rm=T)) / sd(as.matrix(new_m), na.rm=T) 143 | return(new_m) 144 | } 145 | 146 | 147 | new_m = m 148 | 149 | # Choose function 150 | scale_func = function(new_m, opt) { 151 | if (opt$all & opt$range) { 152 | new_m = as.data.frame(rescale(as.matrix(new_m))) 153 | return(new_m) 154 | } 155 | if (!opt$all & opt$range) { 156 | new_m = t(apply(new_m, 1, rescale, to=c(0,1))) 157 | return (new_m) 158 | } 159 | if (opt$all & !opt$range) { 160 | new_m = scale_all(new_m) 161 | return (new_m) 162 | } 163 | new_m = equil(new_m) 164 | return(new_m) 165 | } 166 | 167 | # scale the whole matrix if no value is provided 168 | if (is.null(opt$scale_by)) { 169 | new_m = scale_func(new_m, opt) 170 | # if (opt$all) { 171 | # new_m = scale_all(new_m) 172 | # } else{ 173 | # new_m = equil(new_m) 174 | # } 175 | } 176 | 177 | # scale the sub-matrices defined the scale_by option 178 | if (!is.null(opt$scale_by)) { 179 | 180 | # Select the metadata columns of interset 181 | scale_by <- strsplit(opt$scale_by, ",")[[1]]; 182 | if (opt$verbose) {cat("Select metadata... ")} 183 | mdata = unique(mdata[,unique(c("labExpId", scale_by))]) 184 | if (opt$verbose) {cat(dim(mdata), "\n")} 185 | 186 | # Get the levels by which scale 187 | lev = levels(interaction(mdata[,scale_by])) 188 | if (opt$verbose) {print(lev)} 189 | 190 | # Add a column to the metadata with the interaction 191 | mdata$interaction = apply(mdata[scale_by], 1, paste, collapse=".") 192 | 193 | for (i in 1:length(lev)) { 194 | ids = mdata[lev[i] == mdata$interaction, "labExpId"] 195 | subm = m[,ids] 196 | equilm = scale_func(subm, opt) 197 | #if (opt$all) { 198 | # equilm = scale_all(subm) 199 | #} else{ 200 | # equilm = equil(subm) 201 | #} 202 | #equilm = equil(subm) 203 | new_m[,ids] = equilm 204 | } 205 | 206 | ## if (length(scale_by) != 1){ 207 | # ids = apply(mdata, 1, function(x) unique(merge(t(as.data.frame(x)), mdata, by=scale_by)$labExpId ))} 208 | # if (length(scale_by) == 1){ 209 | # ids = sapply(unique(mdata[, scale_by]), function(x) unique(mdata[ mdata[,scale_by] == x,]$labExpId))} 210 | # 211 | ## apply normalization 212 | # if (length(scale_by) != 1){for (i in 1:length(ids)) { new_m[, ids[[i]]] <- equil(new_m[,ids[[i]]])} } 213 | # if (length(scale_by) == 1){for (i in 1:ncol(ids)) { new_m[, ids[[i]]] <- equil(new_m[,ids[[i]]])} } 214 | } 215 | 216 | # Round the values for the output 217 | new_m = round(new_m, 4) 218 | 219 | print_rownames = TRUE 220 | if (length(char_cols) != 0) {new_m <- cbind(genes, new_m); print_rownames=FALSE} 221 | 222 | 223 | # print output 224 | #-------------- 225 | 226 | if (opt$output == "stdout") { 227 | output = "" 228 | } else { 229 | output = opt$output 230 | } 231 | 232 | 233 | write.table(new_m, output, quote=F, sep="\t", row.names=print_rownames) 234 | 235 | 236 | # EXIT 237 | 238 | q(save='no') 239 | 240 | -------------------------------------------------------------------------------- /removeBatchEffect.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | options(stringsAsFactors=F) 4 | 5 | ################## 6 | # OPTION PARSING 7 | ################## 8 | 9 | suppressPackageStartupMessages(library("optparse")) 10 | 11 | option_list <- list( 12 | make_option(c("-i", "--input_matrix"), default="stdin", 13 | help="the matrix you want to analyze [default=%default]"), 14 | 15 | 16 | make_option(c("-l", "--log"), default=NULL, 17 | help="The base of the logarithm to tranform the data before batch removal. If NULL no log-transformation is applied [default=%default]"), 18 | 19 | make_option(c("-p", "--pseudocount"), type="double", default=0, 20 | help="A pseudocount to add when log-transforming [default=%default]"), 21 | 22 | make_option(c("-M", "--method"), default="voom", help="The method you want to use for removing the batch effect [default=%default] 23 | 24 | voom : apply voom to a matrix of read counts to estimate the weigths 25 | and then uses the function removeBatchEffect() from limma 26 | limma : uses the function removeBatchEffect() from limma on a matrix of 27 | already normalized values 28 | combat : uses the function combat() from sva on a matrix of already normalized values 29 | "), 30 | 31 | make_option(c("-s", "--scaling_factors"), default="TMM", help="How to compute scaling factors, if the method is voom [default=%default] 32 | 33 | TMM : 34 | none : 35 | "), 36 | 37 | 38 | make_option(c("--d1"), default="~1", help="Design for voom [default=%default]"), 39 | make_option(c("--d2"), help="Design for removing the batch effect (not including the batch effect)"), 40 | make_option(c("-b", "--batch"), help="Column with the batch info"), 41 | 42 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"), 43 | make_option(c("-G", "--merge_mdata_on"), default="labExpId", 44 | help="Column in the metadata with the header of the input matrix [default=%default]"), 45 | 46 | make_option(c("-t", "--total"), type="integer", help="Filter by total count per gene > t [default=%default]"), 47 | #make_option(c("-F", "--fields"), help="choose the fields you want to use in the differential expression, comma-separated"), 48 | make_option(c("-S", "--lib.sizes"), help="Two-column file with no header. col1: header of matrix, col2: library sizes"), 49 | make_option(c("-N", "--output.norm"), help="File name for normalization factors"), 50 | make_option(c("-R", "--read_counts"), default=FALSE, action="store_true", help="Output reads counts instead of log2(cpm) [default=%default]"), 51 | make_option(c("-o", "--output"), default="stdout", help="output file name [default=%default]"), 52 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, help="verbose output [default=%default]") 53 | ) 54 | 55 | parser <- OptionParser( 56 | usage = "%prog [options] file", 57 | option_list=option_list, 58 | description="\nRemove batch effect from a matrix of read counts, or normalized values" 59 | ) 60 | arguments <- parse_args(parser, positional_arguments = TRUE) 61 | opt <- arguments$options 62 | if (opt$verbose) {print(opt)} 63 | 64 | 65 | 66 | # LIBRARIES 67 | 68 | suppressPackageStartupMessages(library(edgeR)) 69 | suppressPackageStartupMessages(library(limma)) 70 | 71 | 72 | ##--------------------## 73 | ## BEGIN ## 74 | ##--------------------## 75 | 76 | 77 | # read the matrix from the command line 78 | if(opt$input_matrix == "stdin"){inF=file("stdin")}else{inF=opt$input_matrix} 79 | m = read.table(inF, h=T, sep="\t") 80 | 81 | # Replace missing values with 0 82 | m = replace(m, is.na(m), 0) 83 | 84 | # Log-transform the values if needed 85 | if (!is.null(opt$log)) { 86 | base = ifelse(opt$log == "e", exp(1), as.double(opt$log)) 87 | m = log(m+opt$pseudocount, base) 88 | } 89 | 90 | inputToBatchRm <- m 91 | 92 | # ?TODO: Error if there is attempt to log-transform integer 93 | 94 | # =========================== Metadata ======================= 95 | 96 | merge_mdata_on = opt$merge_mdata_on 97 | 98 | # read the metadata 99 | mdata = read.table(opt$metadata, h=T, sep="\t", quote=NULL) 100 | 101 | # Get the fields from the formula 102 | if (is.null(opt$batch)) { 103 | cat("ERROR: please specify the batch variable\n") 104 | q(save='no') 105 | } 106 | fields = opt$batch 107 | if (opt$d1 != "~1") { 108 | fields1 = strsplit(sub("~", "", opt$d1), split="[+:*]")[[1]] 109 | fields = c(fields, fields1) 110 | } 111 | if (opt$d2 != "~1") { 112 | fields2 = strsplit(sub("~", "", opt$d2), split="[+:*]")[[1]] 113 | fields = c(fields, fields2) 114 | } 115 | mdata[opt$merge_mdata_on] <- gsub(",", ".", mdata[,opt$merge_mdata_on]) 116 | 117 | 118 | # Check if all the columns are in the metadata 119 | if (sum(!(colnames(m) %in% mdata[,merge_mdata_on])) >0 ) { 120 | cat("ERROR: Not all column names in the metadata\n") 121 | q(save="no") 122 | } 123 | 124 | # Format the metadata 125 | mdata = unique(mdata[unique(c(merge_mdata_on, fields))]) 126 | rownames(mdata) <- mdata[,merge_mdata_on] 127 | mdata <- mdata[match(colnames(m), mdata[,merge_mdata_on]),, drop=FALSE] 128 | if (opt$verbose) { 129 | print(mdata) 130 | print(dim(mdata)) 131 | } 132 | 133 | 134 | # **************** 135 | # voom+limma 136 | # **************** 137 | 138 | if (opt$method == "voom") { 139 | 140 | # Filter by total number of reads per gene if asked 141 | if (!is.null(opt$total)) { 142 | m = m[rowSums(m)>opt$total, ] 143 | } 144 | 145 | # Convert all the values of the matrix to integer (because we want counts) 146 | m[1:ncol(m)] <- apply(m, 2, as.integer) 147 | # Create count object for edgeR 148 | M = DGEList(m) 149 | 150 | 151 | # Check for user-provided library sizes 152 | if (!is.null(opt$lib.sizes)) { 153 | lib.sizes = read.table(opt$lib.sizes, h=F, sep="\t") 154 | lib.sizes = lib.sizes[match(lib.sizes$V1, colnames(m)), "V2"] 155 | M$samples$lib.size <- lib.sizes 156 | } 157 | 158 | # **************** 159 | # TMM 160 | # **************** 161 | 162 | if (opt$scaling == "TMM") { 163 | M <- calcNormFactors(M, method="TMM") 164 | if (!is.null(opt$output.norm)) { 165 | normFactors = data.frame(a=colnames(m), b=M$samples$norm.factors) 166 | write.table(normFactors, file=opt$output.norm, col.names=FALSE, row.names=FALSE, sep="\t", quote=FALSE) 167 | } 168 | } 169 | 170 | # **************** 171 | # none 172 | # **************** 173 | 174 | if (opt$scaling == "none") { 175 | M$samples$norm.factors <- rep(1, ncol(m)) 176 | } 177 | 178 | 179 | # **************** 180 | # voom 181 | # **************** 182 | 183 | design1 <- model.matrix(as.formula(opt$d1), data=mdata) 184 | 185 | if (opt$d1 != "~1") { 186 | design1 <- design1[match(colnames(m), rownames(design1)),] 187 | } 188 | 189 | v <- voom(M, design1, plot=FALSE) 190 | 191 | inputToBatchRm <- v 192 | } 193 | 194 | 195 | 196 | # ********************** 197 | # removeBatchEffect 198 | # ********************** 199 | 200 | design2 <- model.matrix(as.formula(opt$d2), data=mdata) 201 | 202 | if (opt$d2 != "~1") { 203 | design2 <- design2[match(colnames(m), rownames(design2)),] 204 | } 205 | 206 | batch = mdata[match(colnames(m), mdata[, opt$merge_mdata_on]), opt$batch] 207 | 208 | if (opt$method == "voom" | opt$method == "limma") { 209 | out = removeBatchEffect(inputToBatchRm, batch=batch, design=design2) 210 | } 211 | 212 | # Convert back to read counts 213 | if (opt$read_counts) { 214 | out = pmax(sweep(2**out, 2, M$samples$norm.factors * M$samples$lib.size, FUN="*")/1e+06 - 0.5, 0) 215 | } 216 | 217 | 218 | # **************** 219 | # ComBat 220 | # **************** 221 | 222 | 223 | if (opt$method == "combat") { 224 | 225 | suppressPackageStartupMessages(library(sva)) 226 | out = ComBat(dat=m, batch=batch, mod=design2) 227 | 228 | } 229 | 230 | 231 | # =================== OUTPUT ====================== 232 | 233 | out = round(out, digits=5) 234 | outF = ifelse(opt$output=="stdout", "", opt$output) 235 | write.table(out, file=outF, quote=FALSE, sep="\t") 236 | 237 | q(save='no') 238 | --------------------------------------------------------------------------------