├── README.md
├── matrix_matrix_operation.R
├── k-means.R
├── flux.profile.R
├── PFAM_enrichment.R
├── mann-whitney.R
├── gene.list.enrichment.R
├── missing.values.R
├── functions.R
├── plot.isoform.levels.R
├── matrix_to_dist.R
├── element.detected.R
├── gene.pair.correlation.R
├── plot.bigWig.profile.R
├── modularity.R
├── sort.rpkm.lines.R
├── ggpie.R
├── VennDiagram.R
├── KEGG_enrichment.R
├── wordcloud.R
├── read.genomic.coverage.R
├── normalization.DESeq.R
├── cutree.R
├── DESeq.analysis.R
├── anova.R
├── DEXSeq.analysis.R
├── quantile_normalization.R
├── differential_coSI.R
├── barplot.GO.R
├── ggSOM.R
├── matrix_wilcox.R
├── projection.score.R
├── matrix_matrix_correlation.R
├── plot.mean.sd.R
├── GO_enrichment.R
├── SOM.R
├── Rtsne.R
├── major.isoform.across.samples.R
├── add_quantile.R
├── nt.coverage.R
├── dimensRed.R
├── gglines.R
├── matrix_funct.R
├── rpkm_fraction.R
├── scatterplot.R
├── plot.network.R
├── boxplot_expressed_isoforms.R
├── scale_matrix.R
└── removeBatchEffect.R


/README.md:
--------------------------------------------------------------------------------
1 | # Rscripts
2 | Several R scripts for exploratory data analysis
3 | 


--------------------------------------------------------------------------------
/matrix_matrix_operation.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | ##------------
 4 | ## LIBRARIES
 5 | ##------------
 6 | 
 7 | suppressPackageStartupMessages(library("optparse"))
 8 | 
 9 | options(stringsAsFactors=F)
10 | 
11 | ##################
12 | # OPTION PARSING
13 | ##################
14 | 
15 | option_list <- list(
16 | 
17 | make_option(c("-A", "--matrix_A"), 
18 | 	help="the matrix you want to subtract from, WITH header (A-B)"),
19 | 
20 | make_option(c("-B", "--matrix_B"), 
21 | 	help="the matrix you want to subtract, WITH header (A-B)"),
22 | 
23 | make_option(c("-r", "--replace_NA_with"), type="numeric",
24 | 	help="value you want to replace NA with, if null NAs are not replaced and difference will be NA"),
25 | 
26 | make_option(c("-o", "--output"), default="out.tsv",
27 | 	help="additional prefix for otuput [default=%default]"),
28 | 
29 | make_option(c("-e", "--expression"), 
30 | 	help="expression you want to corresponding cells of the matrices, e.g. A-B"),
31 | 
32 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
33 | 	help="verbose output [default=%default]")
34 | 
35 | )
36 | 
37 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
38 | arguments <- parse_args(parser, positional_arguments = TRUE)
39 | opt <- arguments$options
40 | 
41 | if(opt$verbose) {print(opt)}
42 | 
43 | ###################
44 | # BEGIN           #
45 | ###################
46 | 
47 | A = read.table(opt$matrix_A, h=T)
48 | B = read.table(opt$matrix_B, h=T)
49 | 
50 | if (!is.null(opt$replace_NA_with)) {
51 | A <- replace(A, is.na(A), opt$replace_NA_with)
52 | B <- replace(B, is.na(B), opt$replace_NA_with)
53 | }
54 | 
55 | M = eval(parse(text=opt$expression))
56 | 
57 | write.table(M, opt$output, quote=F, row.names=T, sep="\t")
58 | 
59 | q(save='no')
60 | 


--------------------------------------------------------------------------------
/k-means.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | options(stringsAsFactors=FALSE)
 4 | 
 5 | ##################
 6 | # OPTION PARSING #
 7 | ##################
 8 | 
 9 | suppressPackageStartupMessages(library("optparse"))
10 | 
11 | option_list <- list(
12 | 
13 | make_option(c("-i", "--input"), default="stdin",
14 | 	help="File or stdin [default=%default]"),
15 | 
16 | make_option(c("-o", "--output"), default="Kmeans.tsv",
17 | 	help="Output file name. Can be stdout [default=%default]"),
18 | 
19 | make_option(c("--header"), action="store_true", default=FALSE,
20 | 	help="The input matrix has a header [default=%default]"),
21 | 
22 | make_option(c("-l", "--log10"), action="store_true", default=FALSE,
23 | 	help="Apply the log10 to the whole matrix as pre-processing step [default=%default]"),	
24 | 
25 | make_option(c("-p", "--pseudocount"), default=0.001,
26 | 	help="Pseudocount to add when applying the log [default=%default]"),
27 | 
28 | make_option(c("-k", "--nb_clusters"), default=3,
29 | 	help="Number of desired clusters [default=%default]"),
30 | 
31 | make_option(c("-B", "--iterations"), default=50,
32 | 	help="Number of initializations to determine the best clustering [default=%default]"),
33 | 
34 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
35 | 	help="if you want more output [default=%default]")
36 | )
37 | 
38 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
39 | arguments <- parse_args(parser, positional_arguments = TRUE)
40 | opt <- arguments$options
41 | if (opt$verbose) {print(opt)}
42 | 
43 | 
44 | suppressPackageStartupMessages(library("ggplot2"))
45 | 
46 | 
47 | ##############
48 | # BEGIN
49 | ##############
50 | 
51 | 
52 | if (opt$input == "stdin") {
53 | 	m = read.table(file("stdin"), h=T)
54 | } else {
55 | 	m = read.table(opt$input, h=T)
56 | }
57 | 
58 | if (opt$log10) {
59 | 	m = log10(m + opt$pseudocount)
60 | }
61 | 
62 | 
63 | set.seed(123)
64 | 
65 | # Find the clusters from multiple random initializations
66 | Klist = replicate(opt$iterations, kmeans(m, opt$nb_clusters), simplify=F)
67 | 
68 | # Choose the best
69 | K = Klist[[which.max(sapply(1:length(Klist), function(i) {Klist[[i]]$betweenss/Klist[[i]]$totss}))]]$cluster
70 | 
71 | m$Kmeans = K
72 | 
73 | 
74 | # OUTPUT
75 | 
76 | if (opt$output == "stdout") { 
77 | 	output = ""
78 | } else {
79 | 	output = opt$output
80 | }
81 | 
82 | write.table(m, output, quote=FALSE, col.names=TRUE, row.names=TRUE, sep='\t')
83 | 
84 | q(save='no')
85 | 


--------------------------------------------------------------------------------
/flux.profile.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | options(stringsAsFactors=FALSE)
  4 | 
  5 | ##################
  6 | # OPTION PARSING
  7 | ##################
  8 | 
  9 | suppressPackageStartupMessages(library("optparse"))
 10 | 
 11 | option_list <- list(
 12 | 
 13 | make_option(c("-i", "--input"),
 14 | 	help="Flux profile, .json"),
 15 | 
 16 | make_option(c("-o", "--output"), default="flux.profile.out.pdf",
 17 | 	help="Output file name [default=%default]"),
 18 | 
 19 | make_option(c("-t", "--title"), default="",
 20 | 	help="Plot title [default=%default]"),
 21 | 
 22 | make_option(c("--palette"), type="character", default="/users/rg/abreschi/R/palettes/rainbow.2.txt",
 23 | 	help="file with the palette [default=%default]"),
 24 | 
 25 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
 26 | 	help="if you want more output [default=%default]")
 27 | 
 28 | )
 29 | 
 30 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 31 | arguments <- parse_args(parser, positional_arguments = TRUE)
 32 | opt <- arguments$options
 33 | if (opt$verbose) {print(opt)}
 34 | 
 35 | #################
 36 | # LIBRARIES 
 37 | #################
 38 | 
 39 | suppressPackageStartupMessages(library("RJSONIO"))
 40 | suppressPackageStartupMessages(library("ggplot2"))
 41 | 
 42 | 
 43 | 
 44 | # BEGIN
 45 | 
 46 | palette = read.table(opt$palette, h=F, comment.char="%")$V1
 47 | 
 48 | 
 49 | m = fromJSON(opt$input)
 50 | 
 51 | 
 52 | dfSense = (do.call(
 53 | 	rbind, sapply(
 54 | 		1:5, function(i) 
 55 | 		data.frame(
 56 | 			x=seq_along(m$masters[[i]]$sense), 
 57 | 			y=m$masters[[i]]$sense,
 58 | 			strand="sense",
 59 | 			bin=i
 60 | 			), 
 61 | 		simplify=F
 62 | 		)
 63 | 	)
 64 | )
 65 | 
 66 | dfAntisense = (do.call(
 67 | 	rbind, sapply(
 68 | 		1:5, function(i) 
 69 | 		data.frame(
 70 | 			x=seq_along(m$masters[[i]]$asense), 
 71 | 			y=m$masters[[i]]$asense,
 72 | 			strand="antisense",
 73 | 			bin=i
 74 | 			), 
 75 | 		simplify=F
 76 | 		)
 77 | 	)
 78 | )
 79 | 
 80 | 
 81 | df = rbind(dfSense, dfAntisense)
 82 | 
 83 | 
 84 | # PLOT
 85 | theme_set(theme_bw(base_size=16))
 86 | 
 87 | gp = ggplot(df, aes(x=x, y=y))
 88 | gp = gp + geom_line(aes(group=bin, color=strand))
 89 | gp = gp + facet_wrap(~bin, scale="free_x", nrow=1)
 90 | gp = gp + scale_y_log10()
 91 | gp = gp + labs(y="log10(signal)", x="position", title=opt$title)
 92 | gp = gp + scale_color_manual(values=palette)
 93 | gp = gp + theme(axis.text.x=element_text(angle=45, hjust=1))
 94 | 
 95 | 
 96 | ggsave(opt$output, w=10, h=3)
 97 | 
 98 | q(save="no")
 99 | 
100 | 


--------------------------------------------------------------------------------
/PFAM_enrichment.R:
--------------------------------------------------------------------------------
 1 | ##------------
 2 | ## LIBRARIES
 3 | ##------------ 
 4 | cat("Loading libraries... ")
 5 | suppressPackageStartupMessages(library("PFAM.db"))
 6 | suppressPackageStartupMessages(library("org.Hs.eg.db"))
 7 | suppressPackageStartupMessages(library("GOstats"))
 8 | suppressPackageStartupMessages(library("optparse"))
 9 | suppressPackageStartupMessages(library("plyr"))
10 | cat("DONE\n\n")
11 | 
12 | options(stringsAsFactors=F)
13 | pseudocount = 1e-04
14 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 
15 | 
16 | ##################
17 | # OPTION PARSING
18 | ##################
19 | 
20 | 
21 | option_list <- list(
22 | make_option(c("-u", "--universe"), help="a list of human gene identifiers (ensEMBL ids), NO header"),
23 | make_option(c("-G", "--genes"), help="a list of human gene identifiers for the foreground (ensEMBL ids), WITH header"),
24 | make_option(c("-o", "--output"), help="additional tags for otuput [default=out]")
25 | )
26 | 
27 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
28 | arguments <- parse_args(parser, positional_arguments = TRUE)
29 | opt <- arguments$options
30 | print(opt)
31 | 
32 | na2null = function(x) if(is.na(x)) {return(NULL)}else{return(x)}
33 | 
34 | 
35 | ############################
36 | # BEGIN
37 | ############################
38 | 
39 | U = read.table(opt$universe, h=F, col.names='hs')
40 | G = read.table(opt$genes, h=T, col.names='hs')
41 | 
42 | 
43 | # take the entrez gene ids for all the orthologous genes which will be my universe (the same for all the sets)
44 | universe = unlist(mget(U$hs, org.Hs.egENSEMBL2EG, ifnotfound=NA))
45 | 
46 | sprintf("%s background genes; %s with a corresponding entrez id", nrow(U), length(unique(universe)))
47 | # how many genes am I able to map?
48 | # First thing notice that also ensembl gene ids longer than 15 characters are included
49 | # if I remove these genes I end up with:
50 | # length(unique(as.character(universe[which(nchar(names(universe)) == 15)]))) ----> 15593
51 | 
52 | 
53 | createParams = function(x) {
54 | 	geneset = unlist(mget(x, org.Hs.egENSEMBL2EG, ifnotfound=NA))
55 | 	sprintf("%s foreground genes; %s with a corresponding entrez id", length(x), length(unique(geneset)))
56 | 	pv = 1-(1-0.05)**(1/length(x))
57 | 	params = new("PFAMHyperGParams",
58 | 		geneIds = geneset,
59 | 		universeGeneIds = universe,
60 | 		annotation = 'org.Hs.eg.db',
61 | 		pvalueCutoff = pv,
62 | 		testDirection='over')
63 | 	return(params)}
64 | 
65 | res = hyperGTest(createParams(G$hs))
66 | write.table(summary(res), file=sprintf("%s.tsv", opt$output), quote=F, sep="\t", row.names=F)
67 | htmlReport(res, file=sprintf("%s.html", opt$output))
68 | 
69 | q(save='no')
70 | 


--------------------------------------------------------------------------------
/mann-whitney.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | options(stringsAsFactors=F)
  4 | 
  5 | ##################
  6 | # OPTION PARSING
  7 | ##################
  8 | 
  9 | suppressPackageStartupMessages(library("optparse"))
 10 | 
 11 | option_list <- list(
 12 | 
 13 | make_option(c("-i", "--input"), type="character", default='stdin',
 14 | 	help="tab-separated file. Can be stdin [default=%default]"),
 15 | 
 16 | make_option(c("-o", "--output"), default="stdout", 
 17 | 	help="output file name with extension [default=%default]"),
 18 | 
 19 | make_option(c("--header"), action="store_true", default=FALSE, 
 20 | 	help="The file has header [default=%default]"),
 21 | 
 22 | make_option(c("-V", "--values"), type='numeric', default=1,
 23 | 	help="Column index with the values [default=%default]"),
 24 | 
 25 | make_option(c("-f", "--factor"), type='numeric', default=1,
 26 | 	help="Column index with the factor [default=%default]"),
 27 | 
 28 | make_option(c("-v", "--verbose"), action='store_true', default=FALSE,
 29 | 	help="Verbose output [default=%default]")
 30 | 
 31 | #make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"),
 32 | )
 33 | 
 34 | parser <- OptionParser(
 35 | 	usage = "%prog [options] file", 
 36 | 	option_list = option_list,
 37 | 	description = "Compute wilcox.test for each pair of group in the column factor"
 38 | )
 39 | 
 40 | 
 41 | arguments <- parse_args(parser, positional_arguments = TRUE)
 42 | opt <- arguments$options
 43 | if (opt$verbose) {print(opt)}
 44 | 
 45 | 
 46 | ##------------
 47 | ## LIBRARIES
 48 | ##------------
 49 |  
 50 | #if (opt$verbose) {cat("Loading libraries... ")}
 51 | #suppressPackageStartupMessages(library(reshape2))
 52 | #suppressPackageStartupMessages(library(ggplot2))
 53 | ##suppressPackageStartupMessages(library(plyr))
 54 | #if (opt$verbose) {cat("DONE\n\n")}
 55 | 
 56 | 
 57 | # ========
 58 | # BEGIN
 59 | # ========
 60 | 
 61 | 
 62 | # Read data
 63 | if (opt$input == "stdin") {input=file('stdin')} else {input=opt$input}
 64 | m = read.table(input, h=opt$header, sep="\t")
 65 | if(opt$verbose) {print(head(m))}
 66 | 
 67 | # Read the axes
 68 | values = colnames(m)[opt$values]
 69 | fact = colnames(m)[opt$factor]
 70 | # Build the formula
 71 | form = as.formula(sprintf("%s~%s", values, fact))
 72 | 
 73 | if(opt$verbose) {
 74 | 	cat("Formula: ")
 75 | 	print(form)
 76 | }
 77 | 
 78 | # Vector of groups
 79 | groups = unique(as.character(m[,fact]))
 80 | 
 81 | # Perform mann-whitney on all pairs of groups
 82 | res = apply(combn(groups, 2), 2, 
 83 | 	function(x) {
 84 | 		df = m[m[,fact] %in% x,];
 85 | 		res.test = wilcox.test(form, data=df);
 86 | 		data.frame(
 87 | 			group1 = x[1],
 88 | 			group2 = x[2],
 89 | 			W = res.test$statistic,
 90 | 			p.value = format(res.test$p.value, digits=4)
 91 | 		)
 92 | 	}
 93 | )
 94 | 
 95 | # Concatenate the data.frame
 96 | res = do.call(rbind, res)
 97 | 
 98 | # OUTPUT
 99 | output = ifelse(opt$output == "stdout", "", opt$output)
100 | write.table(res, file=output, row.names=FALSE, quote=FALSE, sep="\t")
101 | 
102 | # EXIT
103 | quit(save='no')
104 | 


--------------------------------------------------------------------------------
/gene.list.enrichment.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | 
 4 | options(stringsAsFactors=FALSE)
 5 | 
 6 | ##################
 7 | # OPTION PARSING
 8 | ##################
 9 | 
10 | suppressPackageStartupMessages(library("optparse"))
11 | 
12 | option_list <- list(
13 | 
14 | make_option(c("-i", "--input"), default="stdin",
15 | 	help="File or stdin with a list of genes to test for enrichment [default=%default]"),
16 | 
17 | make_option(c("-d", "--db"), 
18 | 	help="A tab-separated database with the annotation. It has a header. First two columns are <gene_set> <gene_id>. \"%\" is the comment char."),
19 | 
20 | make_option(c("--header"), action="store_true", default=FALSE,
21 | 	help="Use this if the input has a header [default=%default]"),
22 | 
23 | #make_option(c("-U", "--all_genes_as_universe"), action="store_true", default=FALSE,
24 | #	help="Use all genes in the list as universe, otherwise it uses only the genes annotated in db [default=%default]"),
25 | #
26 | make_option(c("-o", "--output"), default="gene.list.enrich.out.tsv",
27 | 	help="Output file name, stdout for printing on stdout [default=%default]"),
28 | 
29 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
30 | 	help="if you want more output [default=%default]")
31 | 
32 | )
33 | 
34 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
35 | arguments <- parse_args(parser, positional_arguments = TRUE)
36 | opt <- arguments$options
37 | if (opt$verbose) {print(opt)}
38 | 
39 | #------------
40 | # LIBRARIES
41 | #------------ 
42 | 
43 | if (opt$verbose) {cat("Loading libraries... ")}
44 | suppressPackageStartupMessages(library(reshape2))
45 | suppressPackageStartupMessages(library(ggplot2))
46 | if (opt$verbose) {cat("DONE\n\n")}
47 | 
48 | 
49 | # Wrapper for hypergeometric probaility
50 | 
51 | hyper.test = function(x) {
52 | 	x = as.numeric(x)
53 | 	q = x[1];
54 | 	m = x[2];
55 | 	n = x[4]-m;
56 | 	k = x[3];
57 | 	phyper(q, m, n, k, lower.tail=FALSE)
58 | }
59 | 
60 | 
61 | 
62 | # ======== #
63 | # BEGIN    #
64 | # ======== #
65 | 
66 | 
67 | # Read data
68 | 
69 | if (opt$input == "stdin") {inF = file("stdin")} else {inF = opt$input}
70 | m = read.table(inF, h=opt$header)
71 | 
72 | 
73 | db = read.table(opt$db, h=T, sep="\t", quote="", comment.char="%")
74 | 
75 | formula = as.formula(paste(colnames(db)[2],"~",colnames(db)[1]))
76 | db_counts = setNames(aggregate(formula, db, length), c("feature", "total"))
77 | list_db = db[db[,2] %in% m[,1], ]
78 | list_counts = setNames(aggregate(formula, list_db, length), c("feature", "counts"))
79 | 
80 | df = merge(list_counts, db_counts, by="feature")
81 | df$gene_list = length(unique(list_db[,2]))
82 | df$universe = length(unique(db[,2]))
83 | df$p.value = apply(df[-1], 1, hyper.test)
84 | df$FDR = p.adjust(df$p.value, method="BH")
85 | 
86 | df = df[order(df$FDR),]
87 | 
88 | # Format and write output
89 | 
90 | df$p.value <- format(df$p.value, digits=2)
91 | df$FDR <- format(df$FDR, digits=2)
92 | 
93 | output = ifelse(opt$output == "stdout", "", opt$output)
94 | write.table(df, output, quote=FALSE, row.names=FALSE, sep="\t")
95 | 
96 | 
97 | q(save='no')
98 | 


--------------------------------------------------------------------------------
/missing.values.R:
--------------------------------------------------------------------------------
 1 | 
 2 | ##------------
 3 | ## LIBRARIES
 4 | ##------------ 
 5 | suppressPackageStartupMessages(library(reshape2))
 6 | suppressPackageStartupMessages(library(ggplot2))
 7 | suppressPackageStartupMessages(library("optparse"))
 8 | suppressPackageStartupMessages(library(plyr))
 9 | 
10 | 
11 | options(stringsAsFactors=F)
12 | pseudocount = 1e-04
13 | cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#000000", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 
14 | 
15 | ##################
16 | # OPTION PARSING
17 | ##################
18 | 
19 | 
20 | option_list <- list(
21 | make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze"),
22 | make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"),
23 | make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount),
24 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
25 | make_option(c("-o", "--output"), help="additional tags for otuput [default=%default]", default='out'),
26 | make_option(c("-t", "--tags"), help="comma-separated field names you want to display in the labels", default="cell,sex,age")
27 | )
28 | 
29 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
30 | arguments <- parse_args(parser, positional_arguments = TRUE)
31 | opt <- arguments$options
32 | print(opt)
33 | 
34 | 
35 | 
36 | ##--------------------##
37 | ## CLUSTERING SAMPLES ##
38 | ##--------------------##
39 | 
40 | # read the matrix from the command line
41 | m = read.table(opt$input_matrix, h=T)
42 | 
43 | # remove potential gene id columns
44 | char_cols <- which(sapply(m, class) == 'character')
45 | sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols)
46 | if (length(char_cols) == 0) {genes = rownames(m)}
47 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]}
48 | 
49 | # substitute the matrix with its log if required by the user
50 | if (opt$log) {m = log2(replace(m, is.na(m), 0) + opt$pseudocount)}
51 | 
52 | # read the metadata from the metadata file
53 | mdata = read.table(opt$metadata, h=T, sep='\t')
54 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x))
55 | 
56 | # plot the number of missing values and zeros
57 | stat = data.frame(NAs = apply(m,2,function(x) sum(is.na(x))), 
58 | zeros = apply(m,2,function(x) sum(x==0,na.rm=T)), pos_values = apply(m,2,function(x) sum(!is.na(x)&x!=0)))
59 | labels = strsplit(opt$tags, ',')[[1]]
60 | df = merge(unique(mdata[c('labExpId', labels)]), stat, by.y='row.names', by.x='labExpId')
61 | df = melt(df, ids = c('labExpId',labels), variable = 'values', value.name="Number_of_genes")
62 | df$labels = apply(df[labels], 1, paste, collapse='_' )
63 | 
64 | 
65 | output = sprintf("log_%s.psd_%s.%s.missing.values", opt$log, opt$pseudocount, opt$output)
66 | pdf(sprintf('%s.pdf', output), h=log2(ncol(m)), w=7)
67 | gp = ggplot(df, aes(x=labExpId, y=Number_of_genes)) 
68 | gp = gp + geom_bar(aes(fill=values), stat='identity') 
69 | gp = gp + coord_flip()
70 | gp = gp + scale_x_discrete(labels=df$labels)
71 | gp = gp + labs(x='')
72 | gp = gp + theme(axis.text=element_text(size=10/log10(ncol(m))))
73 | gp
74 | dev.off()
75 | 
76 | q(save='no')
77 | 


--------------------------------------------------------------------------------
/functions.R:
--------------------------------------------------------------------------------
 1 | 
 2 | # 
 3 | entropy = function(x, na.rm=T)  {x = x[!is.na(x)]; x =x[x!=0]; p=x/sum(x); if (length(x) != 0) {return(-sum(p*log(p)))} else {NA}}
 4 | 
 5 | #
 6 | nentropy = function(x, na.rm=T)  {y = x[!is.na(x)]; y =y[y!=0]; p=y/sum(y); if (length(y) != 0) {return(-sum(p*log(p))/log(length(x)))} else {NA}}
 7 | 
 8 | #
 9 | tau = function(x, na.rm=T) {return(sum(1-x/max(x))/(length(x)-1))}
10 | 
11 | # 
12 | cv = function(x, na.rm=T) {sd(x, na.rm=na.rm)/mean(x, na.rm=na.rm)}
13 | 
14 | # 
15 | logit = function(x, base=exp(1)) {return(log(x/(1-x), base=base))}
16 | 
17 | # range
18 | range = function(x, na.rm=T) {return(max(x, na.rm=na.rm) - min(x, na.rm=na.rm))}
19 | 
20 | # Dynamic range
21 | dynRange = function(x, na.rm=T) {
22 | 	#if (is.character(x) || is.factor(x)) {cat('char\n');return(NA)}
23 | 	x = as.numeric(x);
24 | 	x = replace(x, x==0, NA);
25 | 	if (sum(!is.na(x)) < 2) {
26 | 		return(NA)
27 | 	} else{
28 | 		return(log10(max(x, na.rm=T)) - log10(min(x, na.rm=T)))
29 | 	}
30 | }
31 | 
32 | 
33 | # Shuffle data.frame
34 | shuffleData = function(df) {
35 | 	s = matrix(sample(unlist(m), ncol(m)*nrow(m)), ncol=ncol(m), nrow=nrow(m))
36 | 	rownames(s) <- rownames(df)
37 | 	colnames(s) <- colnames(df)
38 | 	return(as.data.frame(s))
39 | }
40 | 
41 | # Extract the legend from ggplot
42 | g_legend<-function(a.gplot){
43 |     tmp <- ggplot_gtable(ggplot_build(a.gplot))
44 |     leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
45 |     legend <- tmp$grobs[[leg]]
46 |     return(legend)
47 | }
48 | 
49 | 
50 | 
51 | #~~~~~~~~~~~~~~~~~~~~~~
52 | # ggplot2 custom theme
53 | #~~~~~~~~~~~~~~~~~~~~~~
54 | 
55 | # Modified from theme_bw() from this site http://sape.inf.usi.ch/quick-reference/ggplot2/themes
56 | 
57 | #theme_minimal <- function(base_size = 12) {
58 | #  library(grid)
59 | #  structure(list(
60 | #    axis.line =         element_blank(),
61 | #    axis.text.x =       element_text(size = base_size * 0.8 , lineheight = 0.9, vjust = 1),
62 | #    axis.text.y =       element_text(size = base_size * 0.8, lineheight = 0.9, hjust = 1),
63 | #    axis.ticks =        element_line(colour = "black", size = 0.2),
64 | #    axis.title.x =      element_text(size = base_size, vjust = 1),
65 | #    axis.title.y =      element_text(size = base_size, angle = 90, vjust = 0.5),
66 | #    axis.ticks.length = unit(0.3, "lines"),
67 | #    axis.ticks.margin = unit(0.5, "lines"),
68 | # 
69 | #    legend.background = element_rect(colour=NA), 
70 | #    legend.key =        element_rect(colour = "grey80"),
71 | #    legend.key.size =   unit(1.2, "lines"),
72 | #    legend.text =       element_text(size = base_size * 0.8),
73 | #    legend.title =      element_text(size = base_size * 0.8, face = "bold", hjust = 0),
74 | #    legend.position =   "right",
75 | # 
76 | #    panel.background =  element_rect(fill = "white", colour = NA), 
77 | #    panel.border =      element_rect(fill = NA, colour="grey50"), 
78 | #    panel.grid.major =  element_line(colour = "grey90", size = 0.2),
79 | #    panel.grid.minor =  element_line(colour = "grey98", size = 0.5),
80 | #    panel.margin =      unit(0.25, "lines"),
81 | # 
82 | #    strip.background =  element_rect(fill = "grey80", colour = "grey50"), 
83 | #    strip.text.x =      element_text(size = base_size * 0.8),
84 | #    strip.text.y =      element_text(size = base_size * 0.8, angle = -90),
85 | # 
86 | #    plot.background =   element_rect(colour = NA),
87 | #    plot.title =        element_text(size = base_size * 1.2),
88 | #    plot.margin =       unit(c(1, 1, 0.5, 0.5), "lines")
89 | #  ), class = "options")
90 | #}
91 | 
92 | 
93 | 


--------------------------------------------------------------------------------
/plot.isoform.levels.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | suppressPackageStartupMessages(library("reshape2"))
 4 | suppressPackageStartupMessages(library("ggplot2"))
 5 | suppressPackageStartupMessages(library("optparse"))
 6 | suppressPackageStartupMessages(library("plyr"))
 7 | 
 8 | options(stringsAsFactors=FALSE)
 9 | 
10 | 
11 | ##################
12 | # OPTION PARSING
13 | ##################
14 | 
15 | 
16 | option_list <- list(
17 | make_option(c("-i", "--input_matrix"), help="the matrix with the transcript rpkms and gene id, columns names are trid, gnid, labExpId1, labExpId2, ..."),
18 | make_option(c("-l", "--log"), help="apply the log"),
19 | make_option(c("-G", "--gene"), help="the gene of interest"),
20 | make_option(c("-o", "--output"), default="isoforms.pdf", help="choose the name for the output file (with extension) [default=%default]"),
21 | make_option(c("-m", "--metadata"), help="metadata file"),
22 | make_option(c("--merge_mdata_on"), default="labExpId", help="metadata column with headers of the input matrix [default=%default]"),
23 | make_option(c("-f", "--field"), default="cell", help="column name of label [default=%default]"),
24 | make_option(c("-c", "--colors"), default="~/R/palettes/rainbow.6.txt", help="color file, the format is RGB [default=%default]"),
25 | make_option(c("-W", "--width"), default=7, help="width of the plot, in inches [default=%default]"),
26 | make_option(c("-r", "--representation"), default="stack", help="visualization method: stack | [defuault=%default]")
27 | 
28 | #make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
29 | #make_option(c("-d", "--de"), help='output of edgeR for differentially expressed genes')
30 | )
31 | 
32 | 
33 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
34 | arguments <- parse_args(parser, positional_arguments = TRUE)
35 | opt <- arguments$options
36 | 
37 | #print(opt)
38 | 
39 | #--------------------
40 | # FUNCTIONS
41 | #--------------------
42 | 
43 | palette = as.character(read.table(opt$colors, h=F, comment.char="%")$V1)
44 | 
45 | m = read.table(opt$input_matrix, h=T)
46 | colnames(m)[1:2] <- c("trid", "gnid")
47 | m = subset(m, gnid == opt$gene)
48 | 
49 | if (nrow(m) == 1) {
50 | 	cat("The gene has only one transcript! \n")
51 | 	cat("EXIT \n")
52 | 	q(save="no")
53 | }
54 | 
55 | m = melt(m)
56 | 
57 | m = ddply(m, .(gnid, variable), transform, sum=sum(value, na.rm=T))
58 | m$ratio = with(m, value/sum*100)
59 | m$sum = signif(m[,"sum"], 3)
60 | 
61 | print(head(m))
62 | 
63 | # Read metadata
64 | if (!is.null(opt$metadata)) {
65 | 	field = opt$field
66 | #	opt$merge_mdata_on = "labExpId"
67 | 	mdata = read.table(opt$metadata, h=T, sep="\t")
68 | 	mdata[,opt$merge_mdata_on] <- gsub(",",".", mdata[,opt$merge_mdata_on])
69 | 	mdata = unique(mdata[,unique(c(field, opt$merge_mdata_on))])
70 | 	m = merge(m, mdata, by.x="variable", by.y=opt$merge_mdata_on)
71 | }
72 | 
73 | print(head(m))
74 | 
75 | if (opt$representation == "stack") {
76 | 	gp = ggplot(m, aes_string(x=field, y="ratio")) 
77 | #	gp = gp + geom_bar(position=position_stack(width=1), stat="identity", aes(fill=trid), color="black")
78 | 	gp = gp + geom_bar(position="stack", stat="identity", aes(fill=trid), color="black")
79 | 	gp = gp + geom_text(aes_string(x=field, y=102, label="sum"), angle=45, hjust=0)
80 | 	gp = gp + scale_fill_manual(values=palette, opt$gene)
81 | 	gp = gp + theme(axis.text.x = element_text(angle=45, hjust=1))
82 | 	gp = gp + labs(x="")
83 | 	gp = gp + scale_y_continuous(limits=c(0,105))
84 | }
85 | 
86 | 
87 | ggsave(opt$output, w=opt$width, h=6)
88 | 
89 | q(save="no")
90 | 


--------------------------------------------------------------------------------
/matrix_to_dist.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | options(stringsAsFactors=F)
  4 | 
  5 | # DEBUGGING OPTIONS
  6 | 
  7 | opt = list()
  8 | opt$input_matrix = "~abreschi/Documents/db/human/gencode18/Flux/encode.promo.human.gene.RPKM.idr_01.thr_0.names_False.tsv" 
  9 | opt$verbose = TRUE
 10 | opt$log10 = FALSE
 11 | opt$pseudocount = 1e-03
 12 | opt$cor = "pearson"
 13 | 
 14 | ##################
 15 | # OPTION PARSING
 16 | ##################
 17 | 
 18 | suppressPackageStartupMessages(library("optparse"))
 19 | 
 20 | option_list <- list(
 21 | make_option(c("-i", "--input_matrix"), default="stdin",
 22 | 	help="the matrix you want to analyze. \"stdin\" for reading from standard input [default=%default]"),
 23 | make_option(c("-v", "--verbose"),action="store_true", default=FALSE, 
 24 | 	help="if you want detailed output"),
 25 | make_option(c("-l", "--log10"), action="store_true", default=FALSE, 
 26 | 	help="apply the log10"),
 27 | make_option(c("-p", "--pseudocount"), type="double", default=0,
 28 | 	help="specify a pseudocount for the log [default=%default]. NAs are replaced by 0s"),
 29 | make_option(c("-c", "--cor"),
 30 | 	help="choose the correlation method"),
 31 | make_option(c("-d", "--dist"), 
 32 | 	help="choose the distance method"),
 33 | make_option(c("-k", "--keep_na"), action="store_true", default=FALSE,
 34 | 	help="use this if you want to keep the NAs. By default they are converted to 0. [default=%default]"),
 35 | make_option(c("-o", "--output"), default="stdout",
 36 | 	help="a name for the output. \"stdout\" to print on standard output [default=%default]")
 37 | )
 38 | 
 39 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 40 | arguments <- parse_args(parser, positional_arguments = TRUE)
 41 | opt <- arguments$options
 42 | 
 43 | # Stop if both the correlation method and the distance are different from NULL
 44 | if (!is.null(opt$cor) & !is.null(opt$dist)) {
 45 | 	cat("ERROR: cannot compute both the correlation and the distance\n")
 46 | 	q(save="no")
 47 | }
 48 | 
 49 | if (opt$verbose) {print(opt)}
 50 | 
 51 | ##############
 52 | # BEGIN
 53 | ##############
 54 | 
 55 | # read input table
 56 | if (opt$input_matrix == "stdin") {input=file("stdin")} else {input=opt$input_matrix}
 57 | m = read.table(input, h=T)
 58 | 
 59 | 
 60 | # remove potential gene id columns
 61 | char_cols <- which(sapply(m, class) == 'character')
 62 | if (opt$verbose) {sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols)}
 63 | if (length(char_cols) == 0) {genes = rownames(m)}
 64 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]}
 65 | 
 66 | # Replace NAs with 0s if not asked otherwise
 67 | if (!(opt$keep_na)) {m = replace(m, is.na(m), 0)}
 68 | 
 69 | # apply the log if required
 70 | if (opt$log10) {m = log10(m + opt$pseudocount)}
 71 | 
 72 | 
 73 | # compute the correlation
 74 | if (!is.null(opt$cor)) {
 75 | 	df = cor(m, use='p', method=opt$cor)
 76 | }
 77 | 
 78 | # compute the distance
 79 | if (!is.null(opt$dist)) {
 80 | 	if (opt$dist == "p") {opt$dist <- "pearson"}
 81 | 	if (opt$dist == "s") {opt$dist <- "spearman"}
 82 | 	if (opt$dist != "spearman" & opt$dist != "pearson") {
 83 | 		df = as.matrix(dist(t(m), method=opt$dist))
 84 | 	} else {
 85 | 		df = 1 - abs(cor(m, use="p", method=opt$dist))
 86 | 	}
 87 | }
 88 | 
 89 | dec = 3
 90 | 
 91 | df = round(df, dec)
 92 | 
 93 | # print the results
 94 | if (opt$output == "stdout") {
 95 | 	write.table(df, file="", quote=FALSE, sep="\t")
 96 | } else {
 97 | 	write.table(df, file=opt$output, quote=FALSE, sep="\t")
 98 | }
 99 | 
100 | q(save="no")
101 | 


--------------------------------------------------------------------------------
/element.detected.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | ##------------
 4 | ## LIBRARIES
 5 | ##------------ 
 6 | suppressPackageStartupMessages(library(reshape2))
 7 | suppressPackageStartupMessages(library(ggplot2))
 8 | suppressPackageStartupMessages(library("optparse"))
 9 | suppressPackageStartupMessages(library(plyr))
10 | 
11 | 
12 | #options(stringsAsFactors=F)
13 | pseudocount = 1e-04
14 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 
15 | 
16 | ##################
17 | # OPTION PARSING
18 | ##################
19 | 
20 | 
21 | option_list <- list(
22 | make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze"),
23 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
24 | make_option(c("-o", "--output"), help="additional flags for otuput", default="out"),
25 | make_option(c("-W", "--width"), default=5, help="width in inches [default=%default]"),
26 | make_option(c("-H", "--height"), default=5, help="height in inches [default=%default]"),
27 | #make_option(c("-c", "--color_by"), help="choose the color you want to color by [default=NA]", type='character', default=NA),
28 | make_option(c("-f", "--field"), help="dashboard field by which the individuals are grouped")
29 | #make_option(c("-t", "--tags"), help="comma-separated field names you want to display in the labels", default="cell,sex,age")
30 | )
31 | 
32 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
33 | arguments <- parse_args(parser, positional_arguments = TRUE)
34 | opt <- arguments$options
35 | print(opt)
36 | 
37 | na2null = function(x) if(is.na(x)) {return(NULL)}else{return(x)}
38 | 
39 | 
40 | ##--------------------##
41 | ## CLUSTERING SAMPLES ##
42 | ##--------------------##
43 | output = sprintf("%s.%s", basename(opt$input_matrix), opt$output)
44 | 
45 | # read the matrix from the command line
46 | m = read.table(opt$input_matrix, h=F, col.names=c("element","labExpId","n_det_el","prop","tag"))
47 | 
48 | # read the metadata from the metadata file
49 | mdata = read.table(opt$metadata, h=T, sep='\t')
50 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x))
51 | 
52 | # prepare data.frame for ggplot
53 | df = merge(subset(m, tag=="individual"), unique(mdata[c("labExpId", opt$field)]), by='labExpId')
54 | 
55 | 
56 | # duplicate the data.frame to plot all if an opt$field is specified
57 | if (!is.null(opt$field)) {
58 | df_copy = df;
59 | df_copy[opt$field]= "all";
60 | x_axis = opt$field
61 | # attach the two data frames
62 | new_df = rbind(df, df_copy)
63 | }else{
64 | x_axis='x_lab';
65 | df$x_lab = 'all'
66 | new_df = df}
67 | 
68 | 
69 | # order the elements with the hierarchy: junction, exon, transcript, gene
70 | new_df$element <- factor(new_df$element, levels=c('junction','exon','transcript','gene'))
71 | 
72 | n_colors = length(unique(subset(m, tag=='cumulative')$labExpId))
73 | gp = ggplot(new_df, aes_string(x=x_axis, y="prop"))
74 | gp = gp + geom_boxplot() 
75 | gp = gp + facet_grid(element~.) 
76 | gp = gp + geom_point(data=subset(m, tag=='cumulative'), aes_string(x="labExpId", y="prop"), size = 4, alpha = 0.7, col=cbbPalette[2:2+n_colors])
77 | gp = gp + labs(y='Proportion of detected elements (%)', x="")
78 | gp = gp + theme(axis.text = element_text(size=13))
79 | gp = gp + theme(axis.text.x = element_text(angle=45, hjust=1))
80 | #gp
81 | 
82 | w=opt$width
83 | h=opt$height
84 | 
85 | ggsave(filename=sprintf("%s.pdf", output), h=h, w=w)
86 | ggsave(filename=sprintf("%s.png", output), h=h, w=w)
87 | ggsave(filename=sprintf("%s.eps", output), h=h, w=w)
88 | 
89 | 
90 | q(save='no')
91 | 


--------------------------------------------------------------------------------
/gene.pair.correlation.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | 
  4 | options(stringsAsFactors=FALSE)
  5 | cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#000000", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 
  6 | 
  7 | ##################
  8 | # OPTION PARSING
  9 | ##################
 10 | 
 11 | suppressPackageStartupMessages(library("optparse"))
 12 | 
 13 | option_list <- list(
 14 | 
 15 | make_option(c("-i", "--input"),
 16 | 	help="Matrix with expression values"),
 17 | 
 18 | make_option(c("-G", "--gene_pairs"), default="stdin",
 19 | 	help="File or stdin. Two-column files with pairs of elements [default=%default]"),
 20 | 
 21 | make_option(c("-o", "--output"), default="cor.out.tsv",
 22 | 	help="Output file name or stdout [default=%default]"),
 23 | 
 24 | make_option(c("--header"), action="store_true", default=FALSE,
 25 | 	help="Use this if the input has a header [default=%default]"),
 26 | 
 27 | make_option(c("-l", "--log"), action="store_true", default=FALSE,
 28 | 	help="log10 of expression values [default=%default]"),
 29 | 
 30 | make_option(c("-p", "--pseudocount"), default=1e-04,
 31 | 	help="pseudocount"),
 32 | 
 33 | make_option(c("-k", "--keep_na"), action="store_true", default=FALSE,
 34 | 	help="do not replace NAs with 0 [default=%default]"),
 35 | 
 36 | make_option(c("-m", "--method"), default="pearson",
 37 | 	help="method for correlation: pearson | spearman [default=%default]"),
 38 | 
 39 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
 40 | 	help="if you want more output [default=%default]")
 41 | 
 42 | )
 43 | 
 44 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 45 | arguments <- parse_args(parser, positional_arguments = TRUE)
 46 | opt <- arguments$options
 47 | if (opt$verbose) {print(opt)}
 48 | 
 49 | #------------
 50 | # LIBRARIES
 51 | #------------ 
 52 | 
 53 | if (opt$verbose) {cat("Loading libraries... ")}
 54 | suppressPackageStartupMessages(library(reshape2))
 55 | suppressPackageStartupMessages(library(ggplot2))
 56 | #suppressPackageStartupMessages(library(plyr))
 57 | if (opt$verbose) {cat("DONE\n\n")}
 58 | 
 59 | 
 60 | # ======== #
 61 | # BEGIN    #
 62 | # ======== #
 63 | 
 64 | #input_files = strsplit(opt$input_files, ",")[[1]]
 65 | #labels = strsplit(opt$labels, ",")[[1]]
 66 | #vertical_lines = as.numeric(strsplit(opt$vertical_lines, ",")[[1]])
 67 | 
 68 | 
 69 | # Read data
 70 | 
 71 | if (opt$gene_pairs == "stdin") {
 72 | 	pairs = read.table(file("stdin"), h=opt$header) 
 73 | } else {
 74 | 	pairs = read.table(opt$gene_pairs, h=opt$header)
 75 | }
 76 | 
 77 | m = read.table(opt$input, h=T)
 78 | 
 79 | if (!(opt$keep_na)) {
 80 | 	m = replace(m, is.na(m), 0)
 81 | }
 82 | 
 83 | if (opt$log) {
 84 | 	m = log10(m + opt$pseudocount)
 85 | }
 86 | 
 87 | 
 88 | df = melt(as.matrix(m), value.name="value1", varnames=c("Var1", "variable"))
 89 | df = merge(df, pairs, by.x="Var1", by.y=colnames(pairs)[1])
 90 | df = merge(df, melt(as.matrix(m), value.name="value2", varnames=c("Var2", "variable")), 
 91 | 	by.x=c(colnames(pairs)[2], "variable"), by.y=c("Var2","variable"))
 92 | df$split_by = apply(df[c(colnames(pairs)[2],"Var1")], 1, paste, collapse="_")
 93 | c = sapply(split(df, df$split_by), function(x) cor(x$value1, x$value2, u="p", m=opt$method))
 94 | 
 95 | df = merge(data.frame(c=c, split_by=names(c)), df)
 96 | df = unique(df[c("Var1", colnames(pairs)[2], "c")])
 97 | 
 98 | #print(head(df))
 99 | output = ifelse(opt$output == "stdout", "", opt$output)
100 | write.table(df, output, row.names=FALSE, col.names=FALSE, sep="\t", quote=FALSE)
101 | 
102 | #warnings()
103 | 
104 | 
105 | q(save='no')
106 | 
107 | 


--------------------------------------------------------------------------------
/plot.bigWig.profile.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ##------------
  4 | ## LIBRARIES
  5 | ##------------ 
  6 | cat("Loading libraries... ")
  7 | suppressPackageStartupMessages(library(reshape2))
  8 | suppressPackageStartupMessages(library(ggplot2))
  9 | suppressPackageStartupMessages(library("optparse"))
 10 | #suppressPackageStartupMessages(library(plyr))
 11 | cat("DONE\n\n")
 12 | 
 13 | options(stringsAsFactors=F)
 14 | pseudocount = 1e-04
 15 | cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#000000", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 
 16 | 
 17 | ##################
 18 | # OPTION PARSING
 19 | ##################
 20 | 
 21 | option_list <- list(
 22 | 
 23 | make_option(c("-i", "--input_files"), type="character", 
 24 | 	help="list of files with the bigWig profiles, comma-separated. If only one file is given, can be stdin"),
 25 | make_option(c("--header"), action="store_true", default=FALSE, 
 26 | 	help="The files have header [default=%default]"),
 27 | make_option(c("-L", "--labels"), help="list of labels with the labels of each file, commma-separated.\n
 28 | They must be in the same order as the file list", type="character"),
 29 | make_option(c("-v", "--vertical_lines"), help="specify where you want the vertical lines [default=%default]", type="character", default="0"),
 30 | #make_option(c("-f", "--facet"), type="integer", help="column index to facet"),
 31 | make_option(c("-t", "--title"), help="Main title for the plot [default=%default]", default=""),
 32 | make_option(c("-o", "--output"), help="output file name with extension [default=%default]", default="profile.pdf"),
 33 | make_option(c("-y", "--y_title"), help="title for the y-axis [default=%default]", default="norm_read_density")
 34 | #make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"),
 35 | )
 36 | 
 37 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 38 | arguments <- parse_args(parser, positional_arguments = TRUE)
 39 | opt <- arguments$options
 40 | #print(opt)
 41 | 
 42 | # ========
 43 | # BEGIN
 44 | # ========
 45 | if (!is.null(opt$labels)) {labels = strsplit(opt$labels, ",")[[1]]}
 46 | 
 47 | vertical_lines = as.numeric(strsplit(opt$vertical_lines, ",")[[1]])
 48 | 
 49 | 
 50 | # Read data
 51 | 
 52 | if (opt$input_files != "stdin") {
 53 | 	input_files = strsplit(opt$input_files, ",")[[1]]
 54 | 	i = 1
 55 | 	for (f in input_files) {
 56 | 		if (i == 1 ) {
 57 | 			m = read.table(f, h=opt$header)
 58 | 			colnames(m)[1] <- "position"
 59 | 			colnames(m)[2] <- labels[i]
 60 | 		}
 61 | 		if (i != 1) {
 62 | 			tmp = read.table(f, h=F)
 63 | 			colnames(tmp)[1] <- "position"
 64 | 			colnames(tmp)[2] <- labels[i]
 65 | 			m = merge(m, tmp, by = "position")
 66 | 		}
 67 | 		i = i+1
 68 | 	}
 69 | } else {
 70 | 	m = read.table(file('stdin'), h=opt$header)
 71 | 	colnames(m)[1] <- "position"
 72 | 	if (is.null(opt$labels)) {
 73 | 		colnames(m)[2] <- 'label'
 74 | 	} else {
 75 | 		colnames(m)[2] <- labels[1]
 76 | 	}
 77 | }
 78 | 
 79 | 
 80 | # Melt data
 81 | 
 82 | df = melt(m, id.vars="position")
 83 | 
 84 | 
 85 | # GGPLOT
 86 | 
 87 | theme_set(theme_bw(base_size=20))
 88 | 
 89 | gp = ggplot(df, aes(x=position, y=value)) 
 90 | gp = gp + geom_line(aes(group=variable, color=variable), alpha=0.5, size=1)
 91 | gp = gp + geom_vline(xintercept=vertical_lines, color="grey", linetype="longdash")
 92 | gp = gp + labs(title=opt$title, y=opt$y_title)
 93 | gp = gp + scale_color_manual(values=cbbPalette)
 94 | if (is.null(opt$label)) {gp = gp + theme(legend.position='none')}
 95 | 
 96 | ggsave(opt$output, h=5, w=7)
 97 | 
 98 | # EXIT
 99 | quit(save='no')
100 | 


--------------------------------------------------------------------------------
/modularity.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript 
  2 | 
  3 | options(stringsAsFactors=F)
  4 | 
  5 | ##################
  6 | # OPTION PARSING
  7 | ##################
  8 | 
  9 | suppressPackageStartupMessages(library("optparse"))
 10 | 
 11 | option_list <- list(
 12 | make_option(c("-i", "--input_matrix"), default="stdin", 
 13 | 	help="Adjacency matrix in R format: the header is N-1 columns [default=%default]
 14 | 		
 15 | 		Example:
 16 | 		
 17 | 		sample1\tsample2\tsample3
 18 | 		sample1\t1\t0\t1
 19 | 		sample2\t1\t1\t0
 20 | 		sample3\t0\t0\t1
 21 | 	"),
 22 | 
 23 | make_option(c("--diag"), default=FALSE, action="store_true",
 24 | 	help="Consider self-edges. [default=%default]"),
 25 | 
 26 | #make_option(c("-l", "--log"), action="store_true", default=FALSE, help="Log-transform the data [default=%default]"),
 27 | 
 28 | #make_option(c("-p", "--pseudocount"), type="double", help="Pseudocount for the log [default=%default]"),
 29 | 
 30 | #make_option(c("--d1"), help="Design for voom"),
 31 | #make_option(c("-d", "--design"), help="Design for removing the batch effect (not including the batch effect)"),
 32 | make_option(c("-C", "--community"), help="Column in the metadata with the information of the community"),
 33 | 
 34 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
 35 | make_option(c("-G", "--merge_mdata_on"), default="labExpId",
 36 | 	help="Column in the metadata with the header of the input matrix [default=%default]"),
 37 | 
 38 | make_option(c("-o", "--output"), default="stdout", help="output file name [default=%default]"),
 39 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, help="verbose output [default=%default]")
 40 | )
 41 | 
 42 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list, description="\nNormalize a matrix")
 43 | arguments <- parse_args(parser, positional_arguments = TRUE)
 44 | opt <- arguments$options
 45 | if (opt$verbose) {print(opt)}
 46 | 
 47 | 
 48 | 
 49 | # LIBRARIES
 50 | 
 51 | suppressPackageStartupMessages(library(igraph))
 52 | #suppressPackageStartupMessages(library(limma))
 53 | 
 54 | 
 55 | ##--------------------##
 56 | ## BEGIN              ##
 57 | ##--------------------##
 58 | 
 59 | 
 60 | # read the matrix from the command line
 61 | if(opt$input_matrix == "stdin"){inF=file("stdin")}else{inF=opt$input_matrix}
 62 | m = read.table(inF, h=T, sep="\t", quote=NULL)
 63 | 
 64 | # Replace missing values with 0
 65 | m = replace(m, is.na(m), 0)
 66 | 
 67 | #if (opt$log) {m = log10(m + opt$pseudocount)}
 68 | 
 69 | g = graph.adjacency(as.matrix(m), mode='directed', diag=opt$diag, add.colnames=TRUE)
 70 | V = vertex.attributes(g)[[1]]
 71 | 
 72 | # =========================== Metadata =======================
 73 | 
 74 | merge_mdata_on = opt$merge_mdata_on
 75 | # read the metadata
 76 | mdata = read.table(opt$metadata, h=T, sep="\t", quote=NULL, comment.char="")
 77 | # Get the fields from the formula 
 78 | if (is.null(opt$community)) {
 79 | 	cat("ERROR: please specify the batch variable\n")
 80 | 	q(save='no')
 81 | }
 82 | fields = opt$community
 83 | mdata[opt$merge_mdata_on] <- gsub("[-,+]", ".", mdata[,opt$merge_mdata_on])
 84 | # Format the metadata
 85 | mdata = unique(mdata[unique(c(merge_mdata_on, fields))])
 86 | rownames(mdata) <- mdata[,merge_mdata_on]
 87 | mdata <- mdata[match(colnames(m), mdata[,merge_mdata_on]),, drop=FALSE]
 88 | 
 89 | community = mdata[match(colnames(m), mdata[, opt$merge_mdata_on]), opt$community]
 90 | 
 91 | 
 92 | modul = modularity(g, as.numeric(factor(community)))
 93 | 
 94 | # =================== OUTPUT ======================
 95 | 
 96 | out = paste(ecount(g), modul, sep="\t")
 97 | cat(out,"\n")
 98 | #cat(modul, "\n")
 99 | 
100 | q(save='no')
101 | 


--------------------------------------------------------------------------------
/sort.rpkm.lines.R:
--------------------------------------------------------------------------------
 1 | 
 2 | ##------------
 3 | ## LIBRARIES
 4 | ##------------ 
 5 | suppressPackageStartupMessages(library(reshape2))
 6 | suppressPackageStartupMessages(library(ggplot2))
 7 | suppressPackageStartupMessages(library("optparse"))
 8 | suppressPackageStartupMessages(library(plyr))
 9 | 
10 | 
11 | options(stringsAsFactors=F)
12 | pseudocount = 1e-04
13 | cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#000000", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 
14 | 
15 | ##################
16 | # OPTION PARSING
17 | ##################
18 | 
19 | 
20 | option_list <- list(
21 | make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze"),
22 | make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"),
23 | make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount),
24 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
25 | make_option(c("-o", "--output"), help="additional tags for otuput", default="out"),
26 | make_option(c("-c", "--color_by"), help="choose the factor you want to color by", type='character'),
27 | make_option(c("-L", "--linetype_by"), help="choose the factor you want to change the linetype by. Leave empty for none."),
28 | make_option(c("-v", "--value"), help="the value the you are plotting [default=%default]", default="RPKM"),
29 | make_option(c("-t", "--tags"), help="comma-separated field names you want to display in the labels", default="cell,sex,age")
30 | )
31 | 
32 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
33 | arguments <- parse_args(parser, positional_arguments = TRUE)
34 | opt <- arguments$options
35 | print(opt)
36 | 
37 | na2null = function(x) if(is.na(x)) {return(NULL)}else{return(x)}
38 | 
39 | 
40 | ##--------------------##
41 | ## CLUSTERING SAMPLES ##
42 | ##--------------------##
43 | output = sprintf("log_%s.pseudo_%s.colby_%s.%s", opt$log, opt$pseudocount, opt$color_by, opt$output)
44 | 
45 | # read the matrix from the command line
46 | m = read.table(opt$input_matrix, h=T)
47 | ylab = 'rpkm'
48 | 
49 | # remove potential gene id columns
50 | char_cols <- which(sapply(m, class) == 'character')
51 | sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols)
52 | if (length(char_cols) == 0) {genes = rownames(m)}
53 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]}
54 | 
55 | #substitute the matrix with its log if required by the user
56 | if (opt$log) {
57 | m = log10(replace(m, is.na(m), 0) + opt$pseudocount);
58 | ylab = sprintf('log10(%s+%s)', opt$value, opt$pseudocount)}
59 | 
60 | # read the metadata from the metadata file
61 | mdata = read.table(opt$metadata, h=T, sep='\t')
62 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x))
63 | 
64 | # prepare data.frame for ggplot
65 | df = melt(m, variable.name = "labExpId", value.name="rpkm")
66 | df = merge(unique(mdata[unique(c("labExpId", strsplit(opt$tags, ",")[[1]], opt$color_by, opt$linetype_by))]), df, by="labExpId")
67 | df$labels = apply(df[strsplit(opt$tags, ",")[[1]]], 1, paste, collapse="_")
68 | # add a column with the x index
69 | df  = ddply(df, .(labels), transform, x=seq_along(labels), y=sort(rpkm, na.last=T,d=T))
70 |  
71 | 
72 | ###############
73 | # OUTPUT 
74 | ###############
75 | 
76 | # plotting...
77 | pdf(sprintf("%s.pdf",output), h=5, w=5)
78 | 
79 | theme_set(theme_bw())
80 | 
81 | gp = ggplot(df, aes(x=x, y=y, group=labels))
82 | gp = gp + geom_line(aes_string(color=opt$color_by, linetype=opt$linetype_by))
83 | gp = gp + labs(y=ylab, x=sprintf('rank(%s)', opt$value))
84 | gp = gp + scale_color_manual(values = cbbPalette)
85 | gp = gp + scale_x_log10(expand=c(0,0))
86 | gp = gp + annotation_logticks(sides="b")
87 | gp = gp + theme(axis.text = element_text(size=15))
88 | gp
89 | 
90 | dev.off()
91 | 
92 | q(save='no')
93 | 


--------------------------------------------------------------------------------
/ggpie.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | 
  4 | # Seed for jittering
  5 | set.seed(123)
  6 | 
  7 | 
  8 | #~~~~~~~~~~~~
  9 | # Libraries
 10 | #~~~~~~~~~~~~
 11 | 
 12 | suppressPackageStartupMessages(library("ggplot2"))
 13 | suppressPackageStartupMessages(library("optparse"))
 14 | suppressPackageStartupMessages(library("grid"))
 15 | 
 16 | 
 17 | # OPTION PARSING
 18 | 
 19 | option_list <- list(
 20 | 
 21 | make_option(c("-i", "--input"), default="stdin",
 22 | 	help="input file. [default=%default]"),
 23 | 
 24 | make_option(c("--header"), default=FALSE,
 25 | 	help="the file has header [deafult=%default]"),
 26 | 
 27 | make_option(c("-o", "--output"), default="ggpie.out.pdf",
 28 | 	help="output file name. Must have a proper image extension (e.g. .pdf, .png) [deafult=%default]"),
 29 | 
 30 | make_option(c("-p", "--percentages"), type='integer',
 31 | 	help="column with the proportions as percentages"),
 32 | 
 33 | make_option(c("-n", "--counts"), type='integer',
 34 | 	help="column with the counts. If you provide counts instead of percentages"),
 35 | 
 36 | make_option(c("-f", "--fill_by"), default=2, type="integer",
 37 | 	help="column with the levels for filling [default=%default]"),
 38 | 
 39 | make_option(c("-P", "--palette"), default="/users/rg/abreschi/R/palettes/Set2.8.txt",
 40 | 	help="palette file [default=%default]"),
 41 | 
 42 | make_option(c("-T", "--ggtitle"),
 43 | 	help="Main title. Leave empty for no title"),
 44 | 
 45 | make_option(c("-t", "--fill_title"), 
 46 | 	help="title for the fill legend")
 47 | 
 48 | )
 49 | 
 50 | 
 51 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 52 | arguments <- parse_args(parser, positional_arguments = TRUE)
 53 | opt <- arguments$options
 54 | 
 55 | #~~~~~~~~~~~~
 56 | # BEGIN
 57 | #~~~~~~~~~~~~
 58 | 
 59 | # Check that counts and percentages are not provided simultaneously
 60 | if (!is.null(opt$percentages) & !is.null(opt$counts)) {
 61 | 	cat("ERROR: cannot provide both percentages and counts\nAborted\n")
 62 | 	q(save='no')
 63 | }
 64 | 
 65 | if (is.null(opt$percentages) & is.null(opt$counts)) {
 66 | 	cat("MISSING INPUT: provide either percentages or counts\nAborted\n")
 67 | 	q(save='no')
 68 | }
 69 | 	
 70 | 
 71 | 
 72 | # read input
 73 | if (opt$input == "stdin") {
 74 | 	m = read.table(file("stdin"), h=opt$header, sep="\t")
 75 | } else {
 76 | 	m = read.table(opt$input, h=opt$header, sep="\t")
 77 | }
 78 | 
 79 | # read palette file
 80 | palette = as.character(read.table(opt$palette, h=FALSE, comment.char="%")$V1)
 81 | 
 82 | # Compute proportions if counts are given
 83 | if (!is.null(opt$counts)) {
 84 | 	counts = colnames(m)[opt$counts]
 85 | 	m$perc = m[,counts]/sum(m[,counts])*100
 86 | 	opt$percentages = which(colnames(m) == "perc") 
 87 | }
 88 | 
 89 | 
 90 | #~~~~~~~~~~~
 91 | # Plot
 92 | #~~~~~~~~~~~
 93 | 
 94 | theme_set(theme_bw(base_size=20))
 95 | 
 96 | theme_update(
 97 | 	panel.grid = element_blank(),
 98 | 	panel.border = element_blank(),
 99 | 	axis.ticks = element_blank(),
100 | 	axis.ticks.margin = unit(0,'in'),
101 | 	axis.title = element_blank(),
102 | 	plot.margin = unit(c(0.01, 0.01, 0.01, 0.01),'in'),
103 | 	legend.key = element_blank()
104 | )
105 | 
106 | 
107 | 
108 | l = cumsum(m[,opt$percentages]) - c(m[1,opt$percentages], diff(cumsum(m[,opt$percentages])))/2
109 | x_labels = paste(round(m[,opt$percentages], 1), "%", sep="")
110 | 
111 | 
112 | gp = ggplot(m, aes_string(x=1, y=colnames(m)[opt$percentages]))
113 | gp = gp + geom_bar(stat="identity", position="stack", aes_string(fill=colnames(m)[opt$fill_by]))
114 | gp = gp + coord_polar(theta="y")
115 | gp = gp + scale_fill_manual(values=palette, name=opt$fill_title)
116 | gp = gp + scale_x_continuous(labels=NULL)
117 | #gp = gp + scale_y_continuous(labels=x_labels, breaks=l)
118 | gp = gp + scale_y_continuous(labels=NULL, breaks=NULL)
119 | gp = gp + geom_text(aes(x=1.5, y=l, label=x_labels), position=position_jitter(w=0.0, h=2.0))
120 | gp = gp + labs(title=opt$ggtitle)
121 | 
122 | 
123 | ggsave(opt$output, w=6, h=3.5)
124 | 
125 | q(save='no')
126 | 


--------------------------------------------------------------------------------
/VennDiagram.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript 
  2 | 
  3 | 
  4 | # -- Variables --
  5 | 
  6 | options(stringsAsFactors=F)
  7 | 
  8 | 
  9 | ##################
 10 | # OPTION PARSING
 11 | ##################
 12 | 
 13 | suppressPackageStartupMessages(library("optparse"))
 14 | 
 15 | option_list <- list(
 16 | make_option(c("-l", "--lcol"), help="Comma-separeted colors for the lines of the sets. Only names accepted for the moment! [default: black]"),
 17 | make_option(c("-f", "--fcol"), help="Comma-separated colors for the surfaces of the sets. Only names accepted for the moment! [default:palette]"),
 18 | make_option(c("-L", "--Lcol"), help="Comma-separated colors for the labels of the sets. Only names accepted for the moment! [default:black]"),
 19 | 
 20 | make_option(c("-W", "--width"), default=3, type='integer',
 21 | 	help="width of the plot in inches [default=%default]"),
 22 | 
 23 | make_option(c("-H", "--height"), default=3, type='integer',
 24 | 	help="height of the plot in inches [default=%default]"),
 25 | 
 26 | make_option(c("-o", "--output"), help="output file name WITHOUT extension [default=Venn.out]", default="venn.out")
 27 | )
 28 | 
 29 | cat("\nNOTE: NAs are treated as strings and not as missing values\n\n", file=stderr())
 30 | 
 31 | parser <- OptionParser(usage = "%prog [options] file(s)", option_list=option_list)
 32 | arguments <- parse_args(parser, positional_arguments = TRUE)
 33 | opt <- arguments$options
 34 | arg <- arguments$args 
 35 | #print(arguments)
 36 | 
 37 | 
 38 | 
 39 | ##------------
 40 | ## LIBRARIES
 41 | ##------------ 
 42 | 
 43 | cat("Loading libraries... ")
 44 | suppressPackageStartupMessages(library('VennDiagram'))
 45 | cat("DONE\n\n")
 46 | 
 47 | 
 48 | ################
 49 | # BEGIN
 50 | ################
 51 | 
 52 | # read the lists of elements from args
 53 | venn_list=list()
 54 | for (f in arg) {
 55 | 	l = as.list(read.table(f, h=T))
 56 | 	venn_list = modifyList(venn_list, l)
 57 | }
 58 | 
 59 | for (i in seq_along(arg)) {
 60 | 	l = as.list(read.table(arg[i], h=T, na.strings=NULL))
 61 | 	if (i==1) {
 62 | 		venn_list = l
 63 | 		merged = l[[1]]
 64 | 		}
 65 | 	if (i!=1) {
 66 | 		venn_list = c(venn_list, l)
 67 | 		merged = intersect(merged, l[[1]])
 68 | 	}
 69 | }
 70 | 
 71 | 
 72 | 
 73 | # graphical parameters
 74 | #-----------------------
 75 | 
 76 | # change the line colors
 77 | if (is.null(opt$lcol)) {
 78 | 	col <- rep('black', length(venn_list))
 79 | 	
 80 | }else{
 81 | 	col <- strsplit(opt$lcol, ',')[[1]]  
 82 | }
 83 | 
 84 | 
 85 | # change the surface colors
 86 | if (is.null(opt$fcol)) {
 87 | 	face_col <- rainbow(length(venn_list))
 88 | }else{
 89 | 	face_col = strsplit(opt$fcol, ',')[[1]]
 90 | }
 91 | 
 92 | 
 93 | # change the label colors
 94 | if (is.null(opt$Lcol)) {
 95 | 	label_col = rep('black', length(venn_list))
 96 | }else{
 97 | 	label_col <- strsplit(opt$Lcol, ',')[[1]]  
 98 | }
 99 | 
100 | 
101 | # ===============
102 | # ** plotting...
103 | # ===============
104 | 
105 | image_type = 'png'
106 | w = opt$width
107 | h = opt$height
108 | 
109 | 
110 | # Set the label distances
111 | cat.dist = (w*c(0.01,0.01,0,0,0))[1:length(venn_list)]
112 | 
113 | if (length(venn_list) == 4) {
114 | 	cat.dist = (w*c(0.08, 0.08, 0.02, 0.02))
115 | }
116 | 
117 | # Set the label position
118 | if (length(venn_list) == 5) {
119 | 	cat.pos = c(20, 40, 40, 30, 10)
120 | }
121 | 
122 | if (length(venn_list) == 4) {
123 | 	cat.pos = c(180, 180, 0, 0)
124 | }
125 | 
126 | if (length(venn_list) == 3) {
127 | 	cat.pos = c(-20, 20, 20)
128 | }
129 | 	
130 | if (length(venn_list) == 2) {
131 | 	cat.pos = c(20, 30)
132 | }
133 | 
134 | 
135 | 
136 | venn.diagram(venn_list, 
137 | 	filename=sprintf("%s.%s", opt$output, image_type), 
138 | 	imagetype=image_type,
139 | 	units='in',
140 | 	width=w,
141 | 	height=h,
142 | 	col=col,
143 | 	cat.col = label_col,
144 | 	cat.pos = cat.pos,
145 | 	cat.dist = cat.dist,
146 | 	cat.cex=c(0.8,0.8,0.8,0.8,0.8)[1:length(venn_list)],
147 | 	fill=face_col
148 | )
149 | 
150 | # writing the intersection
151 | write.table(data.frame(merged), file=sprintf("%s.tsv", opt$output), quote=F, row.names=F)
152 | 
153 | q(save='no')
154 | 
155 | 
156 | 


--------------------------------------------------------------------------------
/KEGG_enrichment.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | 
 4 | suppressPackageStartupMessages(library("optparse"))
 5 | options(stringsAsFactors=F)
 6 | 
 7 | ##################
 8 | # OPTION PARSING
 9 | ##################
10 | 
11 | 
12 | option_list <- list(
13 | make_option(c("-u", "--universe"), help="a list of human gene identifiers (ensEMBL ids), NO header"),
14 | make_option(c("-G", "--genes"), help="a list of human gene identifiers for the foreground (ensEMBL ids), WITH header"),
15 | #make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"),
16 | #make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount),
17 | #make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
18 | make_option(c("-o", "--output"), help="additional tags for otuput [default=out]")
19 | #make_option(c("-f", "--fill_by"), help="choose the color you want to fill by [default=NA]", type='character', default=NA)
20 | )
21 | 
22 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
23 | arguments <- parse_args(parser, positional_arguments = TRUE)
24 | opt <- arguments$options
25 | print(opt)
26 | 
27 | ##------------
28 | ## LIBRARIES
29 | ##------------ 
30 | suppressPackageStartupMessages(library("KEGG.db"))
31 | suppressPackageStartupMessages(library("org.Hs.eg.db"))
32 | suppressPackageStartupMessages(library("GOstats"))
33 | suppressPackageStartupMessages(library("plyr"))
34 | 
35 | na2null = function(x) if(is.na(x)) {return(NULL)}else{return(x)}
36 | 
37 | 
38 | ############################
39 | # BEGIN
40 | ############################
41 | 
42 | U = read.table(opt$universe, h=F, col.names='hs')
43 | G = read.table(opt$genes, h=T, col.names='hs')
44 | 
45 | 
46 | #orth = read.table('~/Documents/db/human-mouse/orthologs/merged_orthologs_HumanMouse_1_1_genes.tsv', h=F, col.names=c('hs','mm'))
47 | #genes = read.table('~/Documents/human-mouse/tissues/maxdiff_tissuespecific.tsv', h=F, col.names=c('hs','mm','sample'))
48 | 
49 | # I want to create a list of parameters to perform GO enrichment on different gene sets
50 | 
51 | # take the entrez gene ids for all the orthologous genes which will be my universe (the same for all the sets)
52 | universe = unlist(mget(U$hs, org.Hs.egENSEMBL2EG, ifnotfound=NA))
53 | 
54 | sprintf("%s background genes; %s with a corresponding entrez id", nrow(U), length(unique(universe)))
55 | # how many genes am I able to map?
56 | # First thing notice that also ensembl gene ids longer than 15 characters are included
57 | # if I remove these genes I end up with:
58 | # length(unique(as.character(universe[which(nchar(names(universe)) == 15)]))) ----> 15593
59 | 
60 | 
61 | createParams = function(x) {
62 | 	geneset = unlist(mget(x, org.Hs.egENSEMBL2EG, ifnotfound=NA))
63 | 	sprintf("%s foreground genes; %s with a corresponding entrez id", length(x), length(unique(geneset)))
64 | 	pv = 1-(1-0.05)**(1/length(x))
65 | 	params = new("KEGGHyperGParams",
66 | 		geneIds = geneset,
67 | 		universeGeneIds = universe,
68 | 		annotation = 'org.Hs.eg.db',
69 | 		pvalueCutoff = pv,
70 | 		testDirection='over')
71 | 	return(params)}
72 | 
73 | res = hyperGTest(createParams(G$hs))
74 | write.table(summary(res), file=sprintf("%s.tsv", opt$output), quote=F, sep="\t", row.names=F)
75 | htmlReport(res, file=sprintf("%s.html", opt$output))
76 | 
77 | q(save='no')
78 | 
79 | 
80 | #listOfParamObjs = aggregate(G$hs, list(G$sample), createParams)$x
81 | 
82 | # run it (and hope it is ok! )
83 | #resultList = lapply(listOfParamObjs, hyperGTest)
84 | 
85 | # give the name to each GO enrichment
86 | #names(resultList) <- aggregate(genes$hs, list(genes$sample), length)[,1]
87 | 
88 | # put the results in a data.frame
89 | #resultDf = rbind.fill(lapply(as.list(names(resultList)[-which(lapply(resultList, function(x) nrow(summary(x))) == 0)]), 
90 | #	function(n)data.frame(summary(resultList[[n]]), n_genes=length(geneIds(resultList[[n]])), sample=n)))
91 | 
92 | # save the data.frame in a file
93 | #write.table(resultDf, file='maxdiff_tissuespecific_GO.tsv', quote=F, row.names=F, col.names=T, sep='\t')
94 | 
95 | 


--------------------------------------------------------------------------------
/wordcloud.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | 
  4 | options(stringsAsFactors=FALSE)
  5 | 
  6 | ##################
  7 | # OPTION PARSING
  8 | ##################
  9 | 
 10 | suppressPackageStartupMessages(library("optparse"))
 11 | 
 12 | option_list <- list(
 13 | 
 14 | make_option(c("-i", "--input"), default="stdin",
 15 | 	help="File or stdin [default=%default]"),
 16 | 
 17 | make_option(c("--header"), action="store_true", default=FALSE,
 18 | 	help="Use this if the input has a header [default=%default]"),
 19 | 
 20 | make_option(c("-o", "--output"), default="wordcloud.pdf",
 21 | 	help="Output file name [default=%default]"),
 22 | 
 23 | make_option(c("-w", "--words"), default=1,
 24 | 	help="Index of the column with the words [default=%default]"),
 25 | 
 26 | make_option(c("-e", "--weights"), default=2, type="integer",
 27 | 	help="Index of the column with weights [default=%default]"),
 28 | 
 29 | make_option(c("-C", "--color_by"), type="integer", 
 30 | 	help="Index of the factor for coloring"),
 31 | 
 32 | make_option(c("-P", "--palette"), default="/users/rg/abreschi/R/palettes/Spectral.11.txt",
 33 |         help='File with colors [default=%default]'),
 34 | 
 35 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
 36 | 	help="if you want more output [default=%default]")
 37 | 
 38 | #make_option(c("--position"), default='dodge',
 39 | #	help="Position for histogram [default=%default]"),
 40 | #
 41 | #make_option(c("--scale_x_log10"), action="store_true", default=FALSE,
 42 | #	help="log10-transform x scale [default=%default]"),
 43 | #
 44 | #make_option(c("--scale_y_log10"), action="store_true", default=FALSE,
 45 | #	help="log10-transform y scale [default=%default]"),
 46 | #
 47 | #make_option(c("--y_title"), type="character", default="count",
 48 | #	help="Title for the y axis [default=%default]"),
 49 | #
 50 | #make_option(c("--x_title"), type="character", default="",
 51 | #	help="Title for the x axis [default=%default]"),
 52 | #
 53 | #make_option(c("-f", "--fill"), default="aquamarine",
 54 | #	help="choose the color which you want to fill the histogram with"),
 55 | #
 56 | #make_option(c("-c", "--color"), default="white",
 57 | #	help="choose the color which you want to contour the histogram with"),
 58 | #
 59 | #make_option(c("-F", "--fill_by"), type='numeric',
 60 | #	help="the column index with the factor to fill by. Leave empty for no factor."),
 61 | #
 62 | #make_option(c("--facet_by"), type='numeric',
 63 | #	help="the column index with the factor to facet by. Leave empty for no factor."),
 64 | #
 65 | #make_option(c("-W", "--width"), default=7,
 66 | #	help="width of the plot in inches. [default=%default]"),
 67 | #
 68 | #make_option(c("-b", "--binwidth"), type="double", 
 69 | #	help="Specify binwidth. Leave empty for default")
 70 | 
 71 | )
 72 | 
 73 | parser <- OptionParser(
 74 | 	usage = "%prog [options] file", 
 75 | 	option_list=option_list,
 76 | 	description = "Reads the values on the first column and outputs a histogram"
 77 | )
 78 | arguments <- parse_args(parser, positional_arguments = TRUE)
 79 | opt <- arguments$options
 80 | if (opt$verbose) {print(opt)}
 81 | 
 82 | #------------
 83 | # LIBRARIES
 84 | #------------ 
 85 | 
 86 | if (opt$verbose) {cat("Loading libraries... ")}
 87 | #suppressPackageStartupMessages(library(reshape2))
 88 | #suppressPackageStartupMessages(library(ggplot2))
 89 | suppressPackageStartupMessages(library(wordcloud))
 90 | if (opt$verbose) {cat("DONE\n\n")}
 91 | 
 92 | 
 93 | # ======== #
 94 | # BEGIN    #
 95 | # ======== #
 96 | 
 97 | set.seed(123)
 98 | 
 99 | # Read data
100 | if (opt$input == "stdin") {input=file("stdin")} else {input=opt$input}
101 | m = read.table(input, h=opt$header, sep="\t") 
102 | 
103 | df = m
104 | 
105 | words = df[,opt$words] 
106 | freq = df[,opt$weights]
107 | 
108 | if (!is.null(opt$color_by)) {
109 | 	palette = as.character(read.table(opt$palette, comment.char="%", h=F)$V1)
110 | 	color_by = df[,opt$color_by]
111 | 	colors = palette[as.factor(color_by)]
112 | }
113 | 
114 | 
115 | pdf(opt$output, width=7, height=7)
116 | wordcloud(
117 | 	words = words, 
118 | 	freq = freq, 
119 | 	min.freq = -1,
120 | 	colors = colors,
121 | 	ordered.colors = TRUE
122 | )
123 | dev.off()
124 | 
125 | q(save='no')
126 | 


--------------------------------------------------------------------------------
/read.genomic.coverage.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | #options(stringsAsFactors=F)
  4 | pseudocount = 1e-04
  5 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 
  6 | 
  7 | 
  8 | 
  9 | ##################
 10 | # OPTION PARSING
 11 | ##################
 12 | 
 13 | suppressPackageStartupMessages(library("optparse"))
 14 | 
 15 | option_list <- list(
 16 | 
 17 | make_option(c("-i", "--input_matrix"), 
 18 | 	help="the matrix you want to analyze. It must have not a header. It is the output of read.genome.coverage.py"),
 19 | 
 20 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
 21 | make_option(c("-M", "--merge_mdata_on"), help="which field corresponds to the ids in the summary file [default=%default]", default="labExpId"),
 22 | make_option(c("-o", "--output"), help="output file name (without extension) [default=%default]", default="summary.out"),
 23 | 
 24 | make_option(c("-H", "--height"), default=6,
 25 | 	help="height of the plot in inches [default=%default]"),
 26 | 
 27 | make_option(c("-W", "--width"), default=8,
 28 | 	help="width of the plot in inches [default=%default]"),
 29 | 
 30 | make_option(c("--facet_nrow"), type="integer",
 31 | 	help="number of rows when faceting"),
 32 | 
 33 | make_option(c("-f", "--facet"), help="dashboard field by which the individuals are faceted"),
 34 | 
 35 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
 36 | 	help="verbose output [default=%default]")
 37 | )
 38 | 
 39 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 40 | arguments <- parse_args(parser, positional_arguments = TRUE)
 41 | opt <- arguments$options
 42 | if( opt$verbose) {print(opt)}
 43 | 
 44 | 
 45 | ##------------
 46 | ## LIBRARIES
 47 | ##------------ 
 48 | 
 49 | cat("Loading libraries... ")
 50 | suppressPackageStartupMessages(library(reshape2))
 51 | suppressPackageStartupMessages(library(ggplot2))
 52 | suppressPackageStartupMessages(library(plyr))
 53 | cat("DONE\n\n")
 54 | 
 55 | 
 56 | 
 57 | ##--------------------##
 58 | ## BEGIN              ##
 59 | ##--------------------##
 60 | 
 61 | # read the matrix from the command line
 62 | m = read.table(opt$input_matrix, h=F, col.names=c("labExpId", "type", "region", "nb_reads"))
 63 | 
 64 | # read the metadata from the metadata file
 65 | if (!is.null(opt$metadata)) {
 66 | 	mdata = read.table(opt$metadata, h=T, sep='\t')
 67 | 	mdata[opt$merge_mdata_on] <- sapply(mdata[opt$merge_mdata_on], function(x) gsub(",", ".", x))
 68 | }
 69 | 
 70 | if (opt$verbose) {print(head(m))}
 71 | 
 72 | # separate total from the rest
 73 | df = merge(m, setNames(subset(m, type=="total")[c(1,4)], c("labExpId", "total")), by="labExpId")
 74 | 
 75 | # merge split and continuous
 76 | all = merge(aggregate(nb_reads~labExpId+region, subset(df, region!="total"), sum), subset(m, type=="total")[c(1,2,4)], by="labExpId")
 77 | colnames(all)[c(3,5)] <- c("nb_reads", "total")
 78 | all$type <- "all"
 79 | df = rbind(df, all)
 80 | 
 81 | if (opt$verbose) {print(head(df))}
 82 | 
 83 | # attach the metadata
 84 | if (!is.null(opt$metadata)) {
 85 | 	mdata_header = unique(c(opt$facet, opt$merge_mdata_on))
 86 | 	df = merge(df, unique(mdata[mdata_header]), by.x='labExpId', by.y=opt$merge_mdata_on)
 87 | }
 88 | 
 89 | 
 90 | if (opt$verbose) {print(head(df))}
 91 | 
 92 | # ----------------- ggplot options ------------------------------
 93 | 
 94 | theme_set(theme_bw(base_size=18))
 95 | 
 96 | gp = ggplot(subset(df, type!="total"), aes(y=nb_reads/total*100, x=region))
 97 | gp = gp + geom_boxplot(aes(color=type, fill=type), alpha=0.5)
 98 | if (!is.null(opt$metadata)) {
 99 | 	gp = gp + facet_wrap(as.formula(sprintf("~%s", opt$facet)), nrow=opt$facet_nrow)
100 | #	gp = gp + facet_grid(as.formula(sprintf("~%s", opt$facet)))
101 | }
102 | gp = gp + labs(y='Proportion of mapped reads (%)', x="")
103 | gp = gp + theme(axis.text = element_text(size=13, angle=45, h=1))
104 | gp = gp + scale_color_brewer(palette="Set1")
105 | 
106 | w = opt$width
107 | h = opt$height
108 | 
109 | ggsave(filename=sprintf("%s.pdf", opt$output), h=h, w=w)
110 | ggsave(filename=sprintf("%s.png", opt$output), h=h, w=w)
111 | ggsave(filename=sprintf("%s.eps", opt$output), h=h, w=w)
112 | 
113 | 
114 | q(save='no')
115 | 


--------------------------------------------------------------------------------
/normalization.DESeq.R:
--------------------------------------------------------------------------------
  1 | 
  2 | # This script is useful for:
  3 | # normalizing samples by row scaling
  4 | 
  5 | 
  6 | ##------------
  7 | ## LIBRARIES
  8 | ##------------ 
  9 | suppressPackageStartupMessages(library(reshape2))
 10 | suppressPackageStartupMessages(library(ggplot2))
 11 | suppressPackageStartupMessages(library(DESeq))
 12 | suppressPackageStartupMessages(library("optparse"))
 13 | 
 14 | options(stringsAsFactors=F)
 15 | pseudocount = 1e-05
 16 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 
 17 | 
 18 | ##################
 19 | # OPTION PARSING
 20 | ##################
 21 | 
 22 | option_list <- list(
 23 | make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze"),
 24 | make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"),
 25 | make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount),
 26 | make_option(c("-m", "--metadata"), help="tsv file with the metadata"),
 27 | make_option(c("-s", "--scale_by"), help="choose one or multiple attributes you want to scale by. Null for normalize the whole matrix")
 28 | #make_option(c("-r", "--row_first"), action="store_true", help="scale first by rows then by columns", default=FALSE),
 29 | #make_option(c("-n", "--n_iter"), type='integer', help="how many times to iterate [default=%default]", default=20)
 30 | )
 31 | 
 32 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 33 | arguments <- parse_args(parser, positional_arguments = TRUE)
 34 | opt <- arguments$options
 35 | print(opt)
 36 | 
 37 | 
 38 | ###############
 39 | # BEGIN
 40 | ###############
 41 | 
 42 | # read options
 43 | m <- read.table(opt$input_matrix, h=T)
 44 | 
 45 | mdata <- read.table(opt$metadata, h=T, row.names=NULL, sep="\t")
 46 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x))
 47 | 
 48 | char_cols <- which(sapply(m, class) == 'character')
 49 | sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols)
 50 | 
 51 | if (length(char_cols) == 0) {genes = rownames(m)}
 52 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]}
 53 | 
 54 | m = (apply(m, 2, as.integer))
 55 | 
 56 | # apply the log if required
 57 | if (opt$log) { m = log2(replace(m, is.na(m), 0) + pseudocount) }
 58 | 
 59 | mdata = subset(mdata, labExpId %in% colnames(m))
 60 | 
 61 | 
 62 | 
 63 | # store the plot of the distribution with input matrix
 64 | gp1 = ggplot(melt(m), aes(x=value)) + geom_density() + facet_wrap(~variable)
 65 | 
 66 | 
 67 | 
 68 | ##########
 69 | # EQUILIBRATION
 70 | ##########
 71 | 
 72 | 
 73 | DESeq_norm = function(matr) {
 74 | 	condition = rep('group',ncol(matr))
 75 | 	cds = newCountDataSet(matr, condition)
 76 | 	cds = estimateSizeFactors(cds)
 77 | 	return(counts(cds, normalized=T))
 78 | }
 79 | 
 80 | new_m = m
 81 | 
 82 | # scale the whole matrix if no value is provided
 83 | if (is.null(opt$scale_by)) {
 84 | new_m = DESeq_norm(new_m)
 85 | }
 86 | 
 87 | # scale the sub-matrices defined the scale_by option
 88 | if (!is.null(opt$scale_by)) {
 89 | scale_by <- strsplit(opt$scale_by, ",")[[1]]
 90 | if (length(scale_by) != 1){
 91 | ids = apply(data.frame(unique(mdata[, scale_by])), 1, function(x) unique(merge(t(as.data.frame(x)), mdata, by=scale_by)$labExpId ))}
 92 | if (length(scale_by) == 1){
 93 | ids = sapply(unique(mdata[, scale_by]), function(x) unique(mdata[ mdata[,scale_by] == x,]$labExpId))}
 94 | # apply normalization
 95 | if (length(scale_by) != 1){for (i in 1:length(ids)) { new_m[, ids[[i]]] <- DESeq_norm(new_m[,ids[[i]]])}  }
 96 | if (length(scale_by) == 1){for (i in 1:ncol(ids)) { new_m[, ids[[i]]] <- DESeq_norm(new_m[,ids[[i]]])}  }
 97 | }
 98 | 
 99 | 
100 | if (is.null(opt$scale_by)) {gp2 = ggplot(melt(new_m), aes(x=value)) + geom_density() + facet_wrap(~Var2)}else{
101 | gp2 = ggplot(melt(new_m), aes(x=value)) + geom_density() + facet_wrap(~variable)}
102 | if (length(char_cols) != 0) {new_m <- cbind(genes, new_m)}
103 | 
104 | 
105 | # print output
106 | #--------------
107 | output = sprintf('DESeq_norm.log_%s.pscnt_%s', opt$log, opt$pseudocount)
108 | write.table(new_m, sprintf('%s.tsv',output), quote=F, sep='\t', row.names=F)
109 | pdf(sprintf("%s.pdf", output)); gp1; gp2; dev.off()
110 | q(save='no')
111 | 


--------------------------------------------------------------------------------
/cutree.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | options(stringsAsFactors=FALSE)
  4 | 
  5 | ##################
  6 | # OPTION PARSING #
  7 | ##################
  8 | 
  9 | suppressPackageStartupMessages(library("optparse"))
 10 | 
 11 | option_list <- list(
 12 | 
 13 | make_option(c("-i", "--input"), default="stdin",
 14 | 	help="File or stdin [default=%default]"),
 15 | 
 16 | make_option(c("-o", "--output"), default="cutree.tsv",
 17 | 	help="Output file name. Can be stdout [default=%default]"),
 18 | 
 19 | make_option(c("-r", "--replace_na"), default=FALSE, action="store_true",
 20 | 	help="Replace NAs with 0, before adding the pseudocount and applying the log if asked [default=%default]"),
 21 | 
 22 | make_option(c("-l", "--log10"), action="store_true", default=FALSE,
 23 | 	help="Apply the log10 to the whole matrix as pre-processing step [default=%default]"),	
 24 | 
 25 | make_option(c("-p", "--pseudocount"), default=0.001,
 26 | 	help="Pseudocount to add when applying the log [default=%default]"),
 27 | 
 28 | make_option(c("-k", "--nb_clusters"), 
 29 | 	help="Number of desired clusters [default=%default]"),
 30 | 
 31 | make_option(c("--order"), default=FALSE, action="store_true",
 32 | 	help="Give only the order of elements without cutting the tree [default=%defult]"),
 33 | 
 34 | make_option(c("--index"), action="store_true", default=FALSE,
 35 | 	help="Output only the row names and the final index [default=%default]"),
 36 | 
 37 | make_option(c("-H", "--cluster_height"), 
 38 | 	help="Height for cutting the tree [default=%default]"),
 39 | 
 40 | make_option(c("-d", "--dist"), default="euclidean",
 41 | 	help="Distance measure for clustering. Supports spearman (s) and pearson (p) as distance metrics [default=%default]"),	
 42 | 
 43 | make_option(c("-c", "--hclust"), default="complete",
 44 | 	help="Algorithm for the hierarchical clustering [default=%default]"),
 45 | 
 46 | make_option(c("--margin"), default=1,
 47 | 	help="Cluster the rows (1) or the columns (2). Only rows can be clustered [default=%default]"),
 48 | 
 49 | #make_option(c("-B", "--iterations"), default=50,
 50 | #	help="Number of initializations to determine the best clustering [default=%default]"),
 51 | #
 52 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
 53 | 	help="if you want more output [default=%default]")
 54 | )
 55 | 
 56 | parser <- OptionParser(
 57 | 	usage = "%prog [options] file", 
 58 | 	option_list = option_list,
 59 | 	description = "Given a matrix cluster the element and cut the tree"
 60 | )
 61 | 
 62 | arguments <- parse_args(parser, positional_arguments = TRUE)
 63 | opt <- arguments$options
 64 | if (opt$verbose) {print(opt)}
 65 | 
 66 | 
 67 | # TODO: cluster the columns
 68 | 
 69 | 
 70 | 
 71 | ##############
 72 | # BEGIN
 73 | ##############
 74 | 
 75 | inF = opt$input; if(opt$input == "stdin") {inF=file("stdin")}
 76 | m = read.table(inF, h=T)
 77 | 
 78 | 
 79 | if (opt$replace_na) {
 80 | 	m = replace(m, is.na(m), 0)
 81 | }
 82 | 
 83 | 
 84 | if (opt$log10) {
 85 | 	m = log10(m + opt$pseudocount)
 86 | }
 87 | 
 88 | 
 89 | 
 90 | # -------------------------- Distance -------------------------------
 91 | 
 92 | # Compute the distance between columns
 93 | if (opt$margin == 2) {
 94 | 	if (opt$dist == "p" || opt$dist =="s") {
 95 | 	    Dist = as.dist(1-cor(m, method=opt$dist, use="p"))
 96 | 	} else {
 97 | 	    Dist = dist(t(m), method=opt$dist)
 98 | 	}
 99 | }
100 | 
101 | # Compute the distance between rows
102 | if (opt$margin == 1) {
103 | 	if (opt$dist == "p" || opt$dist =="s") {
104 | 	    Dist = as.dist(1-cor(t(m), method=opt$dist, use="p"))
105 | 	} else {
106 | 	    Dist = dist(m, method=opt$dist)
107 | 	}
108 | }
109 | 
110 | 
111 | # -------------------------- Clustering ------------------------------
112 | 
113 | klust = hclust(Dist, method=opt$hclust)
114 | if (opt$order) {
115 | 	K = order(klust$order)
116 | 	#print(klust$labels)
117 | 	#print(klust$order)
118 | 	#print (K)
119 | 	m$K = K
120 | } else {
121 | 	K = cutree(klust, k=opt$nb_clusters, h=opt$cluster_height)
122 | 	m$K = K
123 | }
124 | 
125 | 
126 | 
127 | # OUTPUT
128 | 
129 | output = ifelse(opt$output == "stdout", "", opt$output)
130 | 
131 | if (opt$index) {
132 | 	write.table(m["K"], output, quote=FALSE, col.names=FALSE, row.names=TRUE, sep='\t')
133 | 	q(save='no')
134 | }
135 | 
136 | write.table(m, output, quote=FALSE, col.names=TRUE, row.names=TRUE, sep='\t')
137 | 
138 | q(save='no')
139 | 


--------------------------------------------------------------------------------
/DESeq.analysis.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript 
  2 | 
  3 | 
  4 | ##------------
  5 | ## LIBRARIES
  6 | ##------------ 
  7 | 
  8 | cat('Loading libraries... ')
  9 | 
 10 | #suppressPackageStartupMessages(library(reshape2))
 11 | #suppressPackageStartupMessages(library(ggplot2))
 12 | suppressPackageStartupMessages(library("optparse"))
 13 | #suppressPackageStartupMessages(library(plyr))
 14 | suppressPackageStartupMessages(library('DESeq'))
 15 | 
 16 | cat('DONE\n\n')
 17 | 
 18 | options(stringsAsFactors=F)
 19 | pseudocount = 1e-04
 20 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 
 21 | 
 22 | ##################
 23 | # OPTION PARSING
 24 | ##################
 25 | 
 26 | 
 27 | option_list <- list(
 28 | make_option(c("-i", "--input_matrix"), help="the matrix with READ COUNTS you want to analyze"),
 29 | make_option(c("-r", "--replace_NAs"), action="store_true", default=FALSE, help="replace NAs with 0 [default=%default]"),
 30 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
 31 | make_option(c("-f", "--fields"), help="choose the fields you want to use in the differential expression, comma-separated"),
 32 | make_option(c("-o", "--output"), help="output file name"),
 33 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE)
 34 | )
 35 | 
 36 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 37 | arguments <- parse_args(parser, positional_arguments = TRUE)
 38 | opt <- arguments$options
 39 | if(opt$verbose) {print(opt)}
 40 | 
 41 | 
 42 | 
 43 | ##--------------------##
 44 | ## CLUSTERING SAMPLES ##
 45 | ##--------------------##
 46 | 
 47 | # read the matrix from the command line
 48 | m = read.table(opt$input_matrix, h=T)
 49 | genes = rownames(m)
 50 | m = (apply(m, 2, as.integer))
 51 | rownames(m) <- genes
 52 | 
 53 | if (opt$replace_NAs) {m = replace(m, is.na(m), 0)}
 54 | 
 55 | # read the metadata from the metadata file
 56 | mdata = read.table(opt$metadata, h=T, sep='\t')
 57 | mdata["labExpId"] <- gsub("[,-]", ".", mdata[,"labExpId"])
 58 | 
 59 | 
 60 | # specify the design to the program
 61 | fields = strsplit(opt$fields, ",")[[1]]
 62 | #mdata = unique(mdata[c("labExpId", opt$fields)])
 63 | #print(match(colnames(m), mdata[,"labExpId"]))
 64 | if (length(fields) == 1) {
 65 | condition = factor(sapply(colnames(m), function(x) unique(subset(mdata, labExpId == x)[,opt$fields])))
 66 | }else{print('cannot handle multiple fields yet');q(save='no')}
 67 | 
 68 | 
 69 | #colData = mdata[mdata[,"labExpId"] %in% colnames(m) & mdata[,"view"]=="Alignments",]
 70 | #rownames(colData) <- colData[,"labExpId"]
 71 | #colData = colData[match(colnames(m), rownames(colData)),]
 72 | #dds = DESeqDataSetFromMatrix(countData = m, colData= colData, design=~cell)
 73 | 
 74 | 
 75 | 
 76 | # create count object for DESeq
 77 | cds = newCountDataSet(m, condition)
 78 | 
 79 | # normalization
 80 | if(opt$verbose) {cat("Estimating size factors... ")}
 81 | cds = estimateSizeFactors(cds)
 82 | if(opt$verbose) {cat("DONE\n")}
 83 | 
 84 | # variance estimation
 85 | if(opt$verbose) {cat("Estimating dispersions... ")}
 86 | cds = estimateDispersions(cds)
 87 | if(opt$verbose) {cat("DONE\n")}
 88 | 
 89 | # plot dispersion estimates
 90 | plotDispEsts = function(x) {
 91 | norm_means = rowMeans(counts(x, normalized=T))
 92 | plot(sort(norm_means), fitInfo(x)$perGeneDispEsts[order(norm_means)], log='xy')
 93 | lines(sort(norm_means), fData(x)$disp_pooled[order(norm_means)],  col=2)
 94 | }
 95 | 
 96 | # calling differential expression
 97 | if(opt$verbose) {cat("Binomial test... ")}
 98 | res = nbinomTest(cds, levels(condition)[1], levels(condition)[2])
 99 | if(opt$verbose) {cat("DONE\n")}
100 | 
101 | # plot MA
102 | plotMA = function(x) {
103 | plot(sort(log10(x$baseMean)), x$log2FoldChange[order(x$baseMean)])
104 | points(log10(subset(x, padj<=0.01)[,'baseMean']), subset(x, padj<=0.01)[,'log2FoldChange'], col='red')
105 | }
106 | 
107 | 
108 | 
109 | # output on file
110 | toTable = data.frame("id"=res[,1], "log2FC"=res[,6], "log2CPM"=log2(res[,2]), "PValue"=res[,7], "FDR"=res[,8])
111 | write.table(toTable, file=opt$output, quote=F, sep='\t', row.names=F)
112 | 
113 | q(save='no')
114 | 
115 | 
116 | # for debugging
117 | opt = list()
118 | opt$input_matrix = '~/Documents/blueprint/pilot/Flux/bp.human.gene.reads.idr_01.thr_0.names_False.tsv'
119 | opt$metadata = '~/Documents/blueprint/pilot/bp_rna_dashboard_temp.columns'
120 | opt$fields = 'cell'
121 | opt$genes = '~/Documents/blueprint/pilot/gen15.gene.pc.txt'
122 | 
123 | 


--------------------------------------------------------------------------------
/anova.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | options(stringsAsFactors=FALSE)
  4 | 
  5 | ##################
  6 | # OPTION PARSING #
  7 | ##################
  8 | 
  9 | suppressPackageStartupMessages(library("optparse"))
 10 | 
 11 | option_list <- list(
 12 | 
 13 | make_option(c("-i", "--input"), default="stdin",
 14 | 	help="Input matrix (R-friendly, the header has n-1 columns [default=%default]"),
 15 | 
 16 | make_option(c("-o", "--output"), default="stdout",
 17 | 	help="Output file name. [default=%default]"),
 18 | 
 19 | make_option(c("-l", "--log10"), action="store_true", default=FALSE,
 20 | 	help="Apply the log10 to the whole matrix as pre-processing step [default=%default]"),	
 21 | 
 22 | make_option(c("-p", "--pseudocount"), default=0.01,
 23 | 	help="Pseudocount to add when applying the log [default=%default]"),
 24 | 
 25 | make_option(c("-r", "--replace_NA"), default=FALSE, action="store_true",
 26 | 	help="Replace NA with 0 [default=%default]"),
 27 | 
 28 | make_option(c("-m", "--metadata"),
 29 | 	help="Matrix with the metadata. It contains the information about the columns of the input matrix"),
 30 | 
 31 | make_option(c("--merge_mdata_on"), default="labExpId",
 32 | 	help="Metadata field which contains the column names of the input matrix [default=%default]"),
 33 | 
 34 | make_option(c("-F", "--factors"), 
 35 | 	help="Factors for anova (right part of the formula after \"~\"), can also be interactions, e.g. value~cell+organism"),
 36 | 
 37 | make_option(c("--p_adj"), default="BH",
 38 | 	help="Method for correcting the pvalue for multiple testing [default=%default]"),
 39 | 
 40 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
 41 | 	help="if you want more output [default=%default]")
 42 | )
 43 | 
 44 | parser <- OptionParser(
 45 | 	usage = "%prog [options] file", 
 46 | 	option_list=option_list,
 47 | 	description = "
 48 | 	Computes anova for each row of the input matrix given a design model. 
 49 | 	The SS for each factor of the design can be used to compute the 
 50 | 	proportion of variance explained by that factor
 51 | 
 52 | 	Requires package \"reshape2\"
 53 | 	"
 54 | )
 55 | arguments <- parse_args(parser, positional_arguments = TRUE)
 56 | opt <- arguments$options
 57 | if (opt$verbose) {print(opt)}
 58 | 
 59 | 
 60 | suppressPackageStartupMessages(library("reshape2"))
 61 | 
 62 | 
 63 | ##############
 64 | # BEGIN
 65 | ##############
 66 | 
 67 | # Read input
 68 | #input = ifelse(opt$input == "stdin", file("stdin"), opt$input) # why is this line never working?
 69 | if (opt$input == "stdin") {
 70 |         input = file("stdin")
 71 | } else {
 72 |         input = opt$input
 73 | }
 74 | m = read.table(input, h=T)
 75 | if (opt$verbose) {
 76 | 	cat("Data sample:\n")
 77 | 	print(head(m))
 78 | }
 79 | 
 80 | # Process the data according to the user
 81 | if (opt$replace_NA) {m = replace(m, is.na(m), 0)}
 82 | if (opt$log10) {m = log10(m + opt$pseudocount)}
 83 | 
 84 | 
 85 | # Read the metadata
 86 | mdata = read.table(opt$metadata, h=T, sep="\t", quote=NULL, check.names=F)
 87 | mdata[,opt$merge_mdata_on] = gsub("[,:-]", ".", mdata[,opt$merge_mdata_on])
 88 | mdata_col = unique(c(opt$merge_mdata_on, strsplit(opt$factors, "[+*:]")[[1]]))
 89 | mdata = unique(mdata[,mdata_col])
 90 | if (opt$verbose) {
 91 | 	cat("Metadata sample:\n")
 92 | 	print(head(mdata))
 93 | }
 94 | 
 95 | # Read the formula
 96 | F = as.formula(sprintf("value~`%s`", opt$factors))
 97 | 
 98 | #m = m[1:100,]
 99 | 
100 | # Apply anova on each gene
101 | res = t(sapply(1:nrow(m), 
102 | 	function(i) {
103 | 		mm = suppressMessages(melt(m[i,]))
104 | 		tmp = merge(mm, mdata, by.x="variable", by.y=opt$merge_mdata_on);
105 | #		print(tmp)
106 | 		aov_res = anova(lm(F, tmp));
107 | 		return(unlist(aov_res[,-1]))
108 | 	}
109 | ))
110 | 
111 | tmp = merge(melt(m[1,]), mdata, by.x="variable", by.y=opt$merge_mdata_on);
112 | aov_res = anova(lm(F, tmp));
113 | labels = apply(expand.grid(rownames(aov_res), c("SS", "MeanSq", "F", "pvalue")), 1, paste, collapse="_") 
114 | 
115 | res = data.frame(res)
116 | colnames(res) <- labels
117 | 
118 | 
119 | # Adjust pvalue
120 | for (i in grep("pvalue", colnames(res))) {
121 | 	adj_header = paste(colnames(res)[i], "adj", sep=".")
122 | 	res[,adj_header] = p.adjust(res[,i], method=opt$p_adj)
123 | }
124 | 
125 | res = sapply(res, round, 4)
126 | rownames(res) <- rownames(m)
127 | 
128 | # If the variance is zero set the results to NA
129 | res[apply(m, 1, var)==0,] <- NA
130 | 
131 | 
132 | # OUTPUT
133 | 
134 | output = ifelse(opt$output == "stdout", "", opt$output)
135 | write.table(res, output, quote=FALSE, col.names=TRUE, row.names=TRUE, sep='\t')
136 | 
137 | q(save='no')
138 | 


--------------------------------------------------------------------------------
/DEXSeq.analysis.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | # == NOTES ==
  4 | # 1) the gtf should NOT have the trailing ";"
  5 | # 2) the list of files with the exon read counts should have full paths
  6 | # 3) the order of the id should be the same as the files
  7 | 
  8 | 
  9 | ##------------
 10 | ## LIBRARIES
 11 | ##------------ 
 12 | 
 13 | cat('Loading libraries... ')
 14 | 
 15 | #suppressPackageStartupMessages(library(reshape2))
 16 | #suppressPackageStartupMessages(library(ggplot2))
 17 | suppressPackageStartupMessages(library("parallel"))
 18 | suppressPackageStartupMessages(library("optparse"))
 19 | suppressPackageStartupMessages(library("plyr"))
 20 | suppressPackageStartupMessages(library('DEXSeq'))
 21 | 
 22 | cat('DONE\n\n')
 23 | 
 24 | options(stringsAsFactors=F)
 25 | pseudocount = 1e-04
 26 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 
 27 | 
 28 | # =====================
 29 | # == DEBUG OPTIONS ==
 30 | # ====================
 31 | 
 32 | opt = list()
 33 | opt$metadata = "~/Documents/blueprint/pilot/bp_rna_dashboard_mmaps.crg.tsv"
 34 | opt$countfiles = "ERR180942.exon.count.txt,ERR180943.exon.count.txt,ERR180944.exon.count.txt,ERR180945.exon.count.txt,ERR180948.exon.count.txt,ERR180950.exon.count.txt,ERR180951.exon.count.txt,ERR186015.exon.count.txt,ERR230581.exon.count.txt,ERR232403.exon.count.txt,ERR232404.exon.count.txt,ERR244135.exon.count.txt"
 35 | opt$labExpId = "ERR180942,ERR180943,ERR180944,ERR180945,ERR180948,ERR180950,ERR180951,ERR186015,ERR230581,ERR232403,ERR232404,ERR244135"
 36 | opt$DE_factor = "cell"
 37 | opt$annotationfile = "~/Documents/db/human/gencode15/Long/DEXSeq/gen15.long.DEXSeq.gtf"
 38 | 
 39 | 
 40 | ##################
 41 | # OPTION PARSING
 42 | ##################
 43 | 
 44 | 
 45 | option_list <- list(
 46 | make_option(c("-i", "--countfiles"), help="list of files with the counts for exons"),
 47 | #make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"),
 48 | #make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount),
 49 | make_option(c("-I", "--labExpId"), help="list of ids in the same order as the files"),
 50 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
 51 | make_option(c("-f", "--DE_factor"), help="choose the field you want to use for the DE"),
 52 | make_option(c("-a", "--annotationfile"), help="gtf file"),
 53 | make_option(c("-o", "--output"), help="output file name"),
 54 | #make_option(c("-f", "--fill_by"), help="choose the color you want to fill by [default=NA]", type='character', default=NA)
 55 | make_option(c("-g", "--genes"), help='a file with a list of genes to filter', type='character', default=NA)
 56 | )
 57 | 
 58 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 59 | arguments <- parse_args(parser, positional_arguments = TRUE)
 60 | opt <- arguments$options
 61 | print(opt)
 62 | 
 63 | 
 64 | # == BEGIN ==
 65 | 
 66 | minCount = 24
 67 | maxExon = 20
 68 | nCores = 6
 69 | progressFile = "DEXSeq.progress"
 70 | 
 71 | mdata = read.table(opt$metadata, h=T, stringsAsFactors=T)
 72 | countfiles = strsplit(opt$countfiles, ",")[[1]]
 73 | labExpId = strsplit(opt$labExpId, ",")[[1]]
 74 | 
 75 | # Prepare the design matrix from metadata
 76 | df = unique(subset(mdata, labExpId %in% labExpId, select=c("labExpId", opt$DE_factor)))
 77 | df = df[match(labExpId, df$labExpId),]
 78 | rownames(df) <- df$labExpId
 79 | colnames(df)[colnames(df) == opt$DE_factor] <- "condition"
 80 | 
 81 | ecs = read.HTSeqCounts(countfiles = countfiles, design = df, flattenedfile=opt$annotationfile)
 82 | sampleNames(ecs) <- labExpId
 83 | 
 84 | # Run the test on a subset of genes
 85 | test = subsetByGenes(ecs, genes=sample(geneIDs(ecs), 500))
 86 | 
 87 | # Estimate the scaling factors
 88 | test <- estimateSizeFactors(test)
 89 | 
 90 | # Estimate dispersions
 91 | test <- estimateDispersions(test, minCount=minCount, maxExon=maxExon, nCores=nCores, file=progressFile)
 92 | 
 93 | # Fit dispersions
 94 | test <- fitDispersionFunction(test)
 95 | 
 96 | # Make sure the regression line is ok
 97 | plot(log10(rowMeans(counts(test, normalized=T))), log10(fData(test)$dispBeforeSharing))
 98 | points(log10(rowMeans(counts(test, normalized=T))), log10(fData(test)$dispFitted), col='red')
 99 | 
100 | # Test the DE
101 | test <- testForDEU(test, nCores=nCores)
102 | 
103 | # Store the results in a data.frame
104 | res = DEUresultTable(test)
105 | plotDEXSeq(test, "ENSG00000139505.10", displayTranscripts=T, cex.axis=1.2, cex=1.3, lwd=2, legend=T, norCounts=T)
106 | 
107 | 
108 | 


--------------------------------------------------------------------------------
/quantile_normalization.R:
--------------------------------------------------------------------------------
  1 | # This script is useful for:
  2 | # normalizing samples by quantile
  3 | 
  4 | 
  5 | ##------------
  6 | ## LIBRARIES
  7 | ##------------ 
  8 | 
  9 | suppressPackageStartupMessages(library(reshape))
 10 | suppressPackageStartupMessages(library(reshape2))
 11 | suppressPackageStartupMessages(library(ggplot2))
 12 | suppressPackageStartupMessages(library("optparse"))
 13 | 
 14 | options(stringsAsFactors=F)
 15 | pseudocount = 1e-05
 16 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
 17 | 
 18 | ##################
 19 | # OPTION PARSING
 20 | ##################
 21 | 
 22 | option_list <- list(
 23 | make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze"),
 24 | make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"),
 25 | make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount),
 26 | make_option(c("-m", "--metadata"), help="tsv file with the metadata"),
 27 | make_option(c("-b", "--bottom"), type='double', help="fraction to remove from the bottom sorted distribution column-wise"),
 28 | make_option(c("-t", "--top"), type='double', help="fraction to remove from the top sorted distribution column-wise"),
 29 | make_option(c("-o", "--output"), help="output file name WIHTOUT extension [default=%default]", default="qnorm")
 30 | #make_option(c("-s", "--scale_by"), help="choose one or multiple attributes you want to scale by"),
 31 | #make_option(c("-r", "--row_first"), action="store_true", help="scale first by rows then by columns", default=FALSE),
 32 | #make_option(c("-n", "--n_iter"), type='integer', help="how many times to iterate [default=%default]", default=20)
 33 | )
 34 | 
 35 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 36 | arguments <- parse_args(parser, positional_arguments = TRUE)
 37 | opt <- arguments$options
 38 | print(opt)
 39 | 
 40 | ###############
 41 | # BEGIN
 42 | ###############
 43 | 
 44 | # read options
 45 | m <- read.table(opt$input_matrix, h=T)
 46 | mdata <- read.table(opt$metadata, h=T, row.names=NULL, sep="\t")
 47 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x))
 48 | char_cols <- which(sapply(m, class) == 'character')
 49 | sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols)
 50 | 
 51 | if (length(char_cols) == 0) {genes = rownames(m)}
 52 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]}
 53 | 
 54 | 
 55 | # apply the log if required
 56 | #if (opt$log) { m = log2(replace(m, is.na(m), 0) + pseudocount) }
 57 | 
 58 | gp1 = ggplot(melt(m), aes(x=value)) + geom_density() + facet_wrap(~variable)
 59 | 
 60 | # sort the columns of m, be careful to keep the labels
 61 | # I will create two matrices, one with the sorted values, the other with the sorted labels
 62 | 
 63 | sort_m = apply(m, 2, sort, na.last=T, d=F)
 64 | sort_genes_m = apply(m, 2, function(x) as.numeric(rownames(sort_df(as.data.frame(x)))))
 65 | 
 66 | 
 67 | shift1 = opt$bottom
 68 | shift2 = opt$top
 69 | 
 70 | # from this extract a range 
 71 | sort_m_range = sort_m[floor(nrow(sort_m)*(shift1)) : floor(nrow(sort_m)*(shift2)),]
 72 | sort_genes_m_range = sort_genes_m[floor(nrow(sort_genes_m)*(shift1)) : floor(nrow(sort_genes_m)*(shift2)),]
 73 | 
 74 | gp2 = ggplot(melt(as.data.frame(sort_m_range)), aes(x=value)) + geom_density() + facet_wrap(~variable)
 75 | 
 76 | # calculate mean and standard deviation for the gated matrix
 77 | #scl_mean = apply(sort_m_range, 2, mean) 
 78 | #scl_sd = apply(sort_m_range, 2, sd)
 79 | 
 80 | # with mean and standard deviation scale the whole sorted and plot
 81 | #scl_sort_m = sapply(1:ncol(sort_m), function(i) (sort_m[,i]-scl_mean[i])/scl_sd[i]) 
 82 | #colnames(scl_sort_m) <- colnames(sort_m)
 83 | 
 84 | scl_sort_m = sort_m_range
 85 | 
 86 | #gp3 = ggplot(melt(as.data.frame(scl_sort_m)), aes(x=value)) + geom_density() + facet_wrap(~variable)
 87 | 
 88 | # do the average by rows and reassign the gene labels
 89 | av = apply(scl_sort_m, 1, mean)
 90 | qnorm_m = apply(sort_genes_m_range, 2, function(col) av[order(col)])
 91 | gp4 = ggplot(melt(as.data.frame(qnorm_m)), aes(x=value)) + geom_density() + facet_wrap(~variable)
 92 | 
 93 | # write the output matrix to a file
 94 | #if (length(char_cols) != 0) {qnorm_m <- cbind(genes, qnorm_m)}
 95 | write.table(qnorm_m, sprintf("%s.tsv", opt$output), quote=F, sep='\t', row.names=F)
 96 | 
 97 | # plot all the distributions after each of the steps
 98 | pdf(sprintf("%s.pdf", opt$output))
 99 | gp1;gp2;#gp3;
100 | gp4
101 | dev.off()
102 | 
103 | q(save='no')
104 | 


--------------------------------------------------------------------------------
/differential_coSI.R:
--------------------------------------------------------------------------------
  1 | 
  2 | ##------------
  3 | ## LIBRARIES
  4 | ##------------ 
  5 | 
  6 | cat("Loading libraries...")
  7 | 
  8 | suppressPackageStartupMessages(library(reshape2))
  9 | suppressPackageStartupMessages(library(ggplot2))
 10 | suppressPackageStartupMessages(library("optparse"))
 11 | suppressPackageStartupMessages(library(plyr))
 12 | 
 13 | cat("DONE\n\n")
 14 | 
 15 | options(stringsAsFactors=F)
 16 | pseudocount = 1e-04
 17 | cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#000000", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 
 18 | 
 19 | ##################
 20 | # OPTION PARSING
 21 | ##################
 22 | 
 23 | 
 24 | option_list <- list(
 25 | make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze"),
 26 | #make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"),
 27 | #make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount),
 28 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
 29 | make_option(c("-o", "--output"), help="additional tags for otuput", default="out"),
 30 | make_option(c("-c", "--diff_by"), help="choose the factor you want to differ by. Only one factor", type='character')
 31 | )
 32 | 
 33 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 34 | arguments <- parse_args(parser, positional_arguments = TRUE)
 35 | opt <- arguments$options
 36 | print(opt)
 37 | 
 38 | 
 39 | 
 40 | ##--------------------##
 41 | ## CLUSTERING SAMPLES ##
 42 | ##--------------------##
 43 | output = sprintf("coSI_DE.%s", opt$output)
 44 | 
 45 | # 1. read the matrix from the command line
 46 | m = read.table(opt$input_matrix, h=T)
 47 | 
 48 | # remove potential gene id columns
 49 | char_cols <- which(sapply(m, class) == 'character')
 50 | sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols)
 51 | if (length(char_cols) == 0) {genes = rownames(m)}
 52 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]}
 53 | 
 54 | # 2. remove missing values
 55 | m <- na.omit(m)
 56 | 
 57 | # 3. read the metadata from the metadata file
 58 | mdata = read.table(opt$metadata, h=T, sep='\t')
 59 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x))
 60 | if (!is.null(opt$diff_by)) {opt$diff_by = strsplit(opt$diff_by, ",")[[1]]}
 61 | 
 62 | # 4. prepare data.frame for ggplot
 63 | df = melt(as.matrix(m), varnames=c("variable", "labExpId"), value.name="value")
 64 | df = merge(unique(mdata[unique(c("labExpId", opt$diff_by))]), df, by="labExpId")
 65 | 
 66 | # 5. do the average according to factor
 67 | new_df = aggregate( as.formula(sprintf("value~%s+variable", opt$diff_by)), df, FUN=mean, na.action=na.omit)
 68 | new_df = dcast(new_df, as.formula(sprintf("variable~%s", opt$diff_by)))
 69 | 
 70 | # compute M and A
 71 | new_df$A = 0.5*(new_df[,2]+new_df[,3])
 72 | new_df$M = (new_df[,3]) - (new_df[,2])
 73 | 
 74 | 
 75 | # Randomization
 76 | set.seed(123)
 77 | 
 78 | dfr = df
 79 | dfr$value = sample(df$value)
 80 | new_dfr = aggregate( as.formula(sprintf("value~%s+variable", opt$diff_by)), dfr, FUN=mean, na.action=na.omit)
 81 | new_dfr = dcast(new_dfr, as.formula(sprintf("variable~%s", opt$diff_by)))
 82 | 
 83 | new_dfr$A = 0.5*(new_dfr[,2]+new_dfr[,3])
 84 | new_dfr$M = (new_dfr[,3])-(new_dfr[,2])
 85 | 
 86 | 
 87 | # Add statistical significance after randomizing
 88 | new_df$Z = (new_df$M - mean(new_dfr$M))/sd(new_dfr$M)
 89 | new_df$pv = 1 - pnorm(abs(new_df$Z))
 90 | 
 91 |  
 92 | 
 93 | ###############
 94 | # OUTPUT 
 95 | ###############
 96 | 
 97 | # WRITE TABLE
 98 | 
 99 | write.table(new_df, sprintf("%s.tsv", output), row.names=F, quote=F, sep="\t")
100 | 
101 | # plotting...
102 | 
103 | pdf(sprintf("%s.pdf",output), h=5, w=6)
104 | 
105 | theme_set(theme_bw())
106 | 
107 | gp = ggplot(new_df, aes(x=A, y=M)) 
108 | gp = gp + geom_point(aes(color=cut(abs(Z), breaks =c(0, 1, 1.5, 2, 2.5 ,3 , Inf))))
109 | gp = gp + labs(y="cosi1-cosi2", x='1/2(cosi1+cosi2)')
110 | gp = gp + scale_color_manual(name = "Z-score", values = cbbPalette)
111 | gp = gp + scale_y_continuous(expand=c(0.01,0))
112 | gp = gp + theme(axis.text = element_text(size=15))
113 | gp
114 | 
115 | gp = ggplot(new_df, aes(x=A, y=M)) 
116 | gp = gp + geom_point(aes(color=cut(abs(pv), breaks =c(-Inf, 0, 0.001, 0.01, 0.05 ,  1))))
117 | gp = gp + labs(y="cosi1-cosi2", x='1/2(cosi1+cosi2)')
118 | gp = gp + scale_color_manual(name = "p-value", values = cbbPalette)
119 | gp = gp + scale_y_continuous(expand=c(0.01,0))
120 | gp = gp + theme(axis.text = element_text(size=15))
121 | gp
122 | 
123 | dev.off()
124 | 
125 | q(save='no')
126 | 


--------------------------------------------------------------------------------
/barplot.GO.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | 
  4 | options(stringsAsFactors=FALSE)
  5 | 
  6 | ##################
  7 | # OPTION PARSING
  8 | ##################
  9 | 
 10 | suppressPackageStartupMessages(library("optparse"))
 11 | 
 12 | option_list <- list(
 13 | 
 14 | make_option(c("-i", "--input"), default="stdin",
 15 | 	help="File or stdin [default=%default]"),
 16 | 
 17 | make_option(c("-o", "--output"), default="barplot.GO.pdf",
 18 | 	help="Output file name [default=%default]"),
 19 | 
 20 | make_option(c("--header"), action="store_true", default=FALSE,
 21 | 	help="Use this if the input has a header [default=%default]"),
 22 | 
 23 | make_option(c("-f", "--fill"), default="dodgerblue",
 24 | 	help="choose the color which you want to fill the histogram with [default=%default]"),
 25 | 
 26 | make_option(c("-F", "--fill_by"), type='integer', default=NULL, 
 27 | 	help="the column with the factor for the color [default=%default]"),
 28 | 
 29 | make_option(c("-P", "--palette"), default=NULL, 
 30 | 	help="palette file [default=%default]"),
 31 | 
 32 | make_option(c("-H", "--height"), default=7,
 33 | 	help="Height of the plot in inches [default=%default]"),
 34 | 
 35 | make_option(c("-W", "--width"), default=7,
 36 | 	help="Width of the plot in inches [default=%default]"),
 37 | 
 38 | make_option(c("-B", "--base_size"), default=20,
 39 | 	help="Base size [default=%default]"),
 40 | 
 41 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
 42 | 	help="if you want more output [default=%default]")
 43 | 
 44 | )
 45 | 
 46 | parser <- OptionParser(
 47 | 	usage = "%prog [options] file", 
 48 | 	option_list=option_list,
 49 | 	description = "Reads the values on the first column and outputs a histogram"
 50 | )
 51 | arguments <- parse_args(parser, positional_arguments = TRUE)
 52 | opt <- arguments$options
 53 | if (opt$verbose) {print(opt)}
 54 | 
 55 | #------------
 56 | # LIBRARIES
 57 | #------------ 
 58 | 
 59 | if (opt$verbose) {cat("Loading libraries... ")}
 60 | suppressPackageStartupMessages(library(reshape2))
 61 | suppressPackageStartupMessages(library(ggplot2))
 62 | suppressPackageStartupMessages(library(stringr))
 63 | #suppressPackageStartupMessages(library(plyr))
 64 | if (opt$verbose) {cat("DONE\n\n")}
 65 | 
 66 | 
 67 | # ======== #
 68 | # BEGIN    #
 69 | # ======== #
 70 | 
 71 | 
 72 | # Read data
 73 | if (opt$input == "stdin") {input=file("stdin")} else {input=opt$input}
 74 | m = read.table(input, h=opt$header, sep="\t") 
 75 | 
 76 | df = m
 77 | 
 78 | # Read palette file
 79 | if (!is.null(opt$palette)) {
 80 | 	palette = read.table(opt$palette, h=F, sep="\t", comment.char="")$V1
 81 | }
 82 | 
 83 | 
 84 | # Read columns
 85 | y_col = 2
 86 | y_col = colnames(df)[y_col]
 87 | df[y_col] <- -log10(df[y_col])
 88 | opt$x_axis <- 7
 89 | x_col = colnames(df)[opt$x_axis]
 90 | 
 91 | # Insert newlines in labels if they are too long
 92 | max_nchar = 25
 93 | toNL = which(sapply(as.character(df[,x_col]), nchar) > max_nchar)
 94 | nbSpaces = str_count(as.character(df[,x_col]), " ") 
 95 | toNL = intersect(which(nbSpaces > 1), toNL)
 96 | #print(df)
 97 | matches = gregexpr(" ", as.character(df[, x_col]), fixed=TRUE)
 98 | pos = as.numeric(sapply(matches, function(x) {i=floor((length(as.numeric(x))+1)/2); x[i]}))
 99 | subst = sapply(1:length(df[,x_col]), function(i) {x=as.character(df[i,x_col]); substring(x, pos[i])<- "\n"; x }) 
100 | df[toNL, x_col] <- subst[toNL]
101 | 
102 | # Sort the labels by p-value
103 | levs = df[order(df[,y_col]),x_col]
104 | df[,x_col] <- factor(df[,x_col], levels=levs)
105 | 
106 | #================
107 | # GGPLOT
108 | #================
109 | 
110 | theme_set(theme_bw(base_size=opt$base_size))
111 | theme_update(
112 | 	axis.text.x=element_text(angle=45, hjust=1, vjust=1),
113 | 	legend.key = element_rect(color='white'),
114 | 	panel.grid.minor = element_blank(),
115 | 	panel.grid.major = element_blank()
116 | )
117 | 
118 | #opt$fill = "dodgerblue"
119 | opt$color = "black"
120 | 
121 | geom_params = list()
122 | 
123 | if (is.null(opt$fill_by)) {
124 | 	geom_params$fill = opt$fill
125 | 	geom_params$color = opt$color
126 | }
127 | 
128 | # specify fill column
129 | if (!is.null(opt$fill_by)) {
130 | 	F_col = colnames(df)[opt$fill_by]
131 | 	mapping <- aes_string(fill=F_col)
132 | } else {
133 | 	mapping = NULL
134 | }
135 | 
136 | # define histogram layer 
137 | histLayer <- layer(
138 |     geom = "bar",
139 | #    geom_params = geom_params,
140 | 	params = geom_params,
141 | 	position = "identity",
142 | 	mapping = mapping,
143 |     stat = "identity"
144 | )
145 | 
146 | 
147 | # start the plot
148 | gp = ggplot(df, aes_string(x = x_col, y = y_col))
149 | gp = gp + histLayer
150 | gp = gp + coord_flip()
151 | gp = gp + labs(y="-log10(pvalue)", x=NULL)
152 | if (!is.null(opt$palette)) {
153 | 	gp = gp + scale_fill_manual(values=palette)
154 | }
155 | ggsave(opt$output, h=opt$height, w=opt$width, title=opt$output)
156 | 
157 | q(save='no')
158 | 


--------------------------------------------------------------------------------
/ggSOM.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript 
  2 | 
  3 | options(stringsAsFactors=F)
  4 | 
  5 | ##################
  6 | # OPTION PARSING
  7 | ##################
  8 | 
  9 | suppressPackageStartupMessages(library("optparse"))
 10 | 
 11 | option_list <- list(
 12 | make_option(c("-i", "--input"), default="stdin",
 13 | 	help="Tab-delimited file with unit id and attribute, no header. Can be stdin [default=%default]"),
 14 | 
 15 | #make_option(c("-r", "--replace_NAs"), default=FALSE, action='store_true',
 16 | #        help="Replace NAs by 0s. If FALSE, rows containing NAs are omitted. [default=%default]"),
 17 | #
 18 | make_option(c("-G", "--grid"), default="5x4",
 19 | 	help="Grid structure. Format: gridRowsxgridCols. [default=%default]"),
 20 | 
 21 | make_option(c("-k", "--cluster"), type='integer',
 22 | 	help="Index of the column with the cluster id"),
 23 | 
 24 | make_option(c("-f", "--factor"), type='integer',
 25 | 	help="Index of the column with the factor"),
 26 | 
 27 | make_option(c("--func"), default="median",
 28 | 	help="Function to aggregate in each cluster [default=%default]"),
 29 | 
 30 | make_option(c("-d", "--is_discrete"), default=FALSE, action="store_true",
 31 | 	help="The factor is a discrete class. [default=%default]"),
 32 | 
 33 | make_option(c("-p", "--palette"), 
 34 | 	help="File with custom palette. If no palette is given, the ggplot default is used."),
 35 | 
 36 | make_option(c("-T", "--title"),
 37 | 	help="Main title"),
 38 | 
 39 | #make_option(c("-t", "--topology"), default="hexagonal",
 40 | #        help="Grid topology. <hexagonal> | <rectangular> [default=%default]"),
 41 | #
 42 | #make_option(c("-m", "--metadata"), 
 43 | #        help="tsv file with metadata on matrix experiment"),
 44 | #
 45 | #make_option(c("-f", "--fields"), 
 46 | #        help="choose the fields you want to use for super-organized maps, comma-separated. Needs a metadata file"),
 47 | #
 48 | make_option(c("-o", "--output"), default="ggSOM.pdf",
 49 |         help="Output file name [default=%default]"),
 50 | 
 51 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 
 52 | 	help="verbose output")
 53 | )
 54 | 
 55 | parser <- OptionParser(
 56 | 	usage = "%prog [options] file", 
 57 | 	description = "
 58 | 	Plot the output of SOM as hexagonal grid.
 59 | 	",
 60 | 	option_list=option_list)
 61 | arguments <- parse_args(parser, positional_arguments = TRUE)
 62 | opt <- arguments$options
 63 | if (opt$verbose) {print(opt)}
 64 | 
 65 | 
 66 | ##################
 67 | # LIBRARIES      #
 68 | ##################
 69 | 
 70 | suppressPackageStartupMessages(library('ggplot2'))
 71 | 
 72 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 73 | # Functions to compute the hexagon borders
 74 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 75 | 
 76 | hex.x = function(lx) {
 77 | 	c(
 78 | 	lx + 0, 
 79 | 	lx + 0.5, 
 80 | 	lx + 0.5, 
 81 | 	lx + 0, 
 82 | 	lx - 0.5, 
 83 | 	lx - 0.5 
 84 | 	)
 85 | }
 86 | 
 87 | hex.y = function(ly) {
 88 | 	c(
 89 | 	ly - 1/(sqrt(3)), 
 90 | 	ly - 1/(2*sqrt(3)), 
 91 | 	ly + 1/(2*sqrt(3)), 
 92 | 	ly + 1/(sqrt(3)), 
 93 | 	ly + 1/(2*sqrt(3)), 
 94 | 	ly - 1/(2*sqrt(3))
 95 | 	)
 96 | }
 97 | 
 98 | 
 99 | ##################
100 | # BEGIN
101 | ##################
102 | 
103 | 
104 | # read the SOM grid 
105 | gridStruct = as.numeric(strsplit(opt$grid, "x")[[1]])
106 | gridRows = gridStruct[1]
107 | gridCols = gridStruct[2]
108 | if(opt$verbose) {cat("SOM grid:", gridStruct, "\n")}
109 | 
110 | # Read the id attributes
111 | if (opt$input == "stdin") {input = file('stdin')} else {input = opt$input}
112 | m = read.table(input, h=F)
113 | 
114 | # Read the palette if provided
115 | if (!is.null(opt$palette)) {
116 | 	palette = read.table(opt$palette, h=F, comment.char="%")[,1]
117 | }
118 | 
119 | # Name the cluster column
120 | colnames(m)[opt$cluster] <- 'id'
121 | 
122 | # Build the grid
123 | hex = data.frame()
124 | k=0
125 | 
126 | for (i in seq(gridRows)) {
127 | 	for (j in seq(gridCols)) {
128 | 		if (i%%2 != 0) {j = j+0.5}
129 | 		k = k+1
130 | 		hex = rbind(hex, data.frame(x = hex.x(j), y = hex.y(i), id=k))
131 | 	}
132 | }
133 | 
134 | # Aggregate the attribute by cluster
135 | factor_col = colnames(m)[opt$factor]
136 | 
137 | 
138 | if (!opt$is_discrete) {
139 | 	formula_agg = as.formula(sprintf("%s~id", factor_col))
140 | 	m_agg = aggregate(formula_agg, m, eval(opt$func), na.rm=TRUE)
141 | 	# Merge the attribute with the hex grid
142 | 	df = merge(hex, m_agg)
143 | } else {
144 | 	df = merge(hex, m)
145 | }
146 | 
147 | 
148 | 
149 | ########
150 | # PLOT
151 | ########
152 | 
153 | #gp = ggplot(hex, aes(x, y)) + geom_polygon(aes(group=id, fill=id))
154 | gp = ggplot(df, aes(x, y)) + geom_polygon(aes_string(group="id", fill=factor_col), color='black')
155 | gp = gp + labs(title=opt$title, x=NULL, y=NULL)
156 | if (!is.null(opt$palette)) {
157 | 	if (opt$is_discrete) {
158 | 		gp = gp + scale_fill_manual(values=palette)
159 | 	} else {
160 | 		gp = gp + scale_fill_gradientn(colours=palette)
161 | 	}
162 | }
163 | 
164 | ggsave(opt$output, h=5, w=8)
165 | 
166 | 
167 | 


--------------------------------------------------------------------------------
/matrix_wilcox.R:
--------------------------------------------------------------------------------
  1 | 
  2 | # This script is useful for:
  3 | # normalizing samples by row scaling
  4 | 
  5 | 
  6 | ##------------
  7 | ## LIBRARIES
  8 | ##------------ 
  9 | cat("Loading libraries... ")
 10 | suppressPackageStartupMessages(library(reshape2))
 11 | suppressPackageStartupMessages(library(ggplot2))
 12 | suppressPackageStartupMessages(library("optparse"))
 13 | cat("DONE\n\n")
 14 | 
 15 | 
 16 | options(stringsAsFactors=F)
 17 | pseudocount = 1e-05
 18 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 
 19 | 
 20 | ##################
 21 | # OPTION PARSING
 22 | ##################
 23 | 
 24 | option_list <- list(
 25 | make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze"),
 26 | #make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"),
 27 | #make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount),
 28 | make_option(c("-m", "--metadata"), help="tsv file with the metadata"),
 29 | make_option(c("-b", "--by"), help="choose ONE factor you want to do the test by"),
 30 | make_option(c("-o", "--output_suffix"), help="additional output suffix [default=%default]", default='out'),
 31 | #make_option(c("-f", "--func"), help="choose the function <mean>, <sd> [default=%default]", default='mean'),
 32 | make_option(c("-n", "--not_na"), help="fraction of not NA values in the vector for the mean [default=%default]", default=1, type='double')
 33 | #make_option(c("-r", "--row_first"), action="store_true", help="scale first by rows then by columns", default=FALSE),
 34 | #make_option(c("-n", "--n_iter"), type='integer', help="how many times to iterate [default=%default]", default=20)
 35 | )
 36 | 
 37 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 38 | arguments <- parse_args(parser, positional_arguments = TRUE)
 39 | opt <- arguments$options
 40 | print(opt)
 41 | 
 42 | 
 43 | ###############
 44 | # BEGIN
 45 | ###############
 46 | 
 47 | # read options
 48 | m <- read.table(opt$input_matrix, h=T)
 49 | mdata <- read.table(opt$metadata, h=T, row.names=NULL, sep="\t")
 50 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x))
 51 | char_cols <- which(sapply(m, class) == 'character')
 52 | sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols)
 53 | 
 54 | if (length(char_cols) == 0) {genes = rownames(m)}
 55 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]}
 56 | 
 57 | # apply the log if required
 58 | #if (opt$log) { m = log2(replace(m, is.na(m), 0) + pseudocount) }
 59 | 
 60 | mdata = subset(mdata, labExpId %in% colnames(m))
 61 | 
 62 | # ----------------
 63 | # Functions
 64 | # ----------------
 65 | 
 66 | #my_mean = function(x) {
 67 | #ifelse((sum(!is.na(x)) < (opt$not_na*length(x))), NA, mean(x,na.rm=T) )
 68 | #}
 69 | #
 70 | #my_sd = function(x) {
 71 | #ifelse((sum(!is.na(x)) < (opt$not_na*length(x))), NA, sd(x,na.rm=T) )
 72 | #}
 73 | #
 74 | #if (opt$func == "mean") {func = my_mean}
 75 | #if (opt$func == "sd") {func = my_sd}
 76 | 
 77 | 
 78 | # apply the function to the whole matrix if no value is provided
 79 | #if (is.null(opt$mean_by)) {
 80 | #new_m = rowMeans(m, na.rm=T); colnames(new_m) <- c("mean")
 81 | #}
 82 | 
 83 | # scale the sub-matrices defined the scale_by option
 84 | if (!is.null(opt$by)) {
 85 | by <- strsplit(opt$by, ",")[[1]]
 86 | if (length(by) != 1){
 87 | #ids = apply(data.frame(unique(mdata[, mean_by])), 1, function(x) unique(merge(t(as.data.frame(x)), mdata, by=mean_by)$labExpId ))
 88 | cat("ERROR: Only ONE factor must be provided!\n"); q(save='no')}
 89 | if (length(by) == 1){
 90 | ids = sapply(unique(mdata[, by]), function(x) unique(mdata[ mdata[,by] == x,]$labExpId))}
 91 | # apply normalization
 92 | #if (length(mean_by) != 1){cat('to be implemented\n\n');for (i in 1:length(ids)) { new_m[, ids[[i]]] <- equil(new_m[,ids[[i]]])}  }
 93 | #if (length(by) == 1){new_m = matrix(ncol=ncol(ids), nrow=nrow(m)); for (i in 1:ncol(ids)) {  
 94 | #colnames(new_m) <- colnames(ids); new_m[,i] <- apply(m[,ids[,i]], 1, mean, na.rm=T)}  }
 95 | }
 96 | 
 97 | wilc_pv = apply(m, 1, function(x) tryCatch({wt = wilcox.test(x[ids[,1]], x[ids[,2]]);wt$p.value}, error = function(e) NA_real_))
 98 | ttes_pv = apply(m, 1, function(x) tryCatch({wt = t.test(x[ids[,1]], x[ids[,2]]);wt$p.value}, error = function(e) NA_real_))
 99 | 
100 | 
101 | #if (is.null(opt$scale_by)) {gp2 = ggplot(melt(new_m), aes(x=value)) + geom_density() + facet_wrap(~Var2)}else{
102 | #gp2 = ggplot(melt(new_m), aes(x=value)) + geom_density() + facet_wrap(~variable)}
103 | 
104 | #if (length(char_cols) != 0) {new_m <- cbind(genes, new_m)}
105 | df = data.frame(cbind(genes, wilc_pv))
106 | 
107 | 
108 | #--------------
109 | # print output
110 | #--------------
111 | 
112 | output = sprintf('%s.by_%s.n_%s.%s', opt$func, opt$by, opt$not_na, opt$output_suffix)
113 | write.table(new_m, sprintf('%s.tsv',output), quote=F, sep='\t', row.names=F)
114 | #pdf(sprintf("%s.pdf", output)); gp1; gp2; dev.off()
115 | q(save='no')
116 | 


--------------------------------------------------------------------------------
/projection.score.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | options(stringsAsFactors=FALSE)
  4 | 
  5 | ##################
  6 | # OPTION PARSING #
  7 | ##################
  8 | 
  9 | suppressPackageStartupMessages(library("optparse"))
 10 | 
 11 | option_list <- list(
 12 | 
 13 | make_option(c("-i", "--input"), default="stdin",
 14 | 	help="File or stdin [default=%default]"),
 15 | 
 16 | make_option(c("-o", "--output"), default="proj.score",
 17 | 	help="Output file name WITHOUT extension [default=%default]"),
 18 | 
 19 | make_option(c("-s", "--statistics"), 
 20 | 	help="Two-column file with header, with the row names of the input and an associated statistics by which to filter"),
 21 | 
 22 | make_option(c("-l", "--log10"), action="store_true", default=FALSE,
 23 | 	help="Apply the log10 to the whole matrix as pre-processing step [default=%default]"),	
 24 | 
 25 | make_option(c("-p", "--pseudocount"), default=0.001,
 26 | 	help="Pseudocount to add when applying the log [default=%default]"),
 27 | 
 28 | make_option(c("-t", "--thresholds"), default="0.1,0.2,0.25,0.3,0.35,0.4,0.5,0.6,0.7,0.8",
 29 | 	help="A comma-separated list of different thresholds (percentages of the max value) [default=%default]"),
 30 | 
 31 | make_option(c("-B", "--iterations"), default=10,
 32 | 	help="Number of row permutations [default=%default]"),
 33 | 
 34 | make_option(c("-S", "--nb_components"), default=3,
 35 | 	help="Number of principal components you want to compute the projection score [default=%default]"),
 36 | 
 37 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
 38 | 	help="if you want more output [default=%default]")
 39 | )
 40 | 
 41 | parser <- OptionParser(
 42 | 	usage = "%prog [options] file", 
 43 | 	option_list=option_list,
 44 | 	description = "Compute the projection score on a matrix given a vector of values to use as thresholds. 
 45 | 	NOTE: The matrix is used as it comes with no normalization"
 46 | )
 47 | 
 48 | arguments <- parse_args(parser, positional_arguments = TRUE)
 49 | opt <- arguments$options
 50 | if (opt$verbose) {print(opt)}
 51 | 
 52 | 
 53 | suppressPackageStartupMessages(library("ggplot2"))
 54 | 
 55 | 
 56 | ##############
 57 | # BEGIN
 58 | ##############
 59 | 
 60 | # Read options
 61 | var_thresholds = strsplit(opt$thresholds, ",")[[1]]
 62 | B = opt$iterations
 63 | S = opt$nb_components
 64 | 
 65 | 
 66 | if (opt$input == "stdin") {
 67 | 	m = read.table(file("stdin"), h=T)
 68 | } else {
 69 | 	m = read.table(opt$input, h=T)
 70 | }
 71 | 
 72 | if (opt$log10) {
 73 | 	m = log10(m + opt$pseudocount)
 74 | }
 75 | 
 76 | # Read the statistics
 77 | stats = read.table(opt$statistics, h=T)
 78 | # Order them to be in the same order as the input matrix row names
 79 | stats = stats[match(stats[,1], rownames(m)),]
 80 | 
 81 | # Normalize the variance by the maximum variance
 82 | variancen = stats[,2]/max(stats[,2], na.rm=T)
 83 | #print(head(variancen))
 84 | names(variancen) <- stats[,1]
 85 | 
 86 | # Function to compute the alpha_2 measure
 87 | alpha_2 = function(lambda, S) {
 88 | 	sqrt(sum(lambda[1:S]^2)/sum(lambda^2))
 89 | }
 90 | 
 91 | # Initialize the vector of projection scores
 92 | proj_scores = array(numeric(0))
 93 | 
 94 | # Iterate over different variance thresholds
 95 | for (var_t in var_thresholds) {
 96 | 
 97 | 	if (opt$verbose) {cat("Variance threshold: ", var_t, "\n")}
 98 | 
 99 | 	# Get the actual submatrix for this variance threshold
100 | 	m_t = m[which(variancen>var_t),]
101 | 
102 | 	# Compute the pca on this submatrix
103 | 	pca1 = prcomp(t(m_t), center=FALSE, scale.=FALSE)
104 | 
105 | 	# Obtain a matrix of lambdas (sdev) for each permutation
106 | 	# Rows are the components and columns are the iterations
107 | 	set.seed(123)
108 | 	lambda = replicate(B, prcomp(apply(m_t, 1, sample), center=FALSE, scale.=FALSE)$sdev) 
109 | 	# Count how many times the stdev for each component in the permutation is lower than the observed one
110 | 	lambda_counts = rowSums(apply(lambda, 2, function(x) pca1$sdev>=x))
111 | 	
112 | 	# If at least one observed stdev is not higher than 95% of the permutated lambdas 
113 | 	# the submatrix does not support S, and the projection score is assigned to NA
114 | 	if (sum(lambda_counts[1:S] < 0.05*B) != 0) {
115 | 		proj_score = NA
116 | 	} else {
117 | 		exp_l = mean(apply(lambda, 2, alpha_2, S=S))
118 | 		obs_l = alpha_2(pca1$sdev, S)
119 | 		proj_score = obs_l - exp_l
120 | 	}
121 | 	proj_scores = c(proj_scores, proj_score)
122 | }
123 | 
124 | 
125 | ###############
126 | # OUTPUT
127 | ###############
128 | 
129 | selected = names(which(variancen >= var_thresholds[which.max(proj_scores)]))
130 | write.table(selected, sprintf("%s.txt", opt$output), quote=FALSE, col.names=FALSE, row.names=FALSE, sep='\t')
131 | 
132 | # Plot the projection score as a function of the variance
133 | df = data.frame(var_thresholds, proj_scores)
134 | if (opt$verbose) {print(df)}
135 | 
136 | theme_set(theme_bw(base_size=18))
137 | 
138 | gp = ggplot(df, aes(x=as.numeric(var_thresholds), y=proj_scores)) + geom_point(size=3) + geom_line()
139 | gp = gp + labs(x="Thresholds (fraction of max)")
140 | 
141 | ggsave(sprintf("%s.pdf", opt$output), h=5, w=7)
142 | 
143 | 
144 | 
145 | 
146 | 


--------------------------------------------------------------------------------
/matrix_matrix_correlation.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | ##------------
  4 | ## LIBRARIES
  5 | ##------------ 
  6 | cat("Loading libraries... ")
  7 | suppressPackageStartupMessages(library(reshape2))
  8 | suppressPackageStartupMessages(library(ggplot2))
  9 | suppressPackageStartupMessages(library("optparse"))
 10 | suppressPackageStartupMessages(library(plyr))
 11 | cat("DONE\n\n")
 12 | 
 13 | options(stringsAsFactors=F)
 14 | pseudocount = 1e-04
 15 | cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#000000", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 
 16 | 
 17 | ##################
 18 | # OPTION PARSING
 19 | ##################
 20 | 
 21 | 
 22 | option_list <- list(
 23 | make_option(c("-A", "--input1"), help="the first matrix of samples"),
 24 | make_option(c("-B", "--input2"), help="the second matrix of samples, columns must have same order as in input1"),
 25 | make_option(c("--tag1"), help="short tag describing the first input"),
 26 | make_option(c("--tag2"), help="short tag describing the second input"),
 27 | make_option(c("-L", "--labels"), help="select the labels from the dashboard fields, comma-separated"),
 28 | make_option(c("-l", "--log"), help="apply the log to <both>, <A>, <B>, <none> [default=%default]", default="none"),
 29 | make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount),
 30 | make_option(c("-m", "--metadata"), default=NULL, help="tsv file with metadata on matrix experiment"),
 31 | make_option(c("-r", "--representation"), help="choose <scatter>, <hex> [default=%default]", default="hex"),
 32 | make_option(c("-n", "--replace_na_with_0"), action="store_true", help="replace NAs with 0 [default=%default]", default=FALSE),
 33 | make_option(c("--facet_nrow"), default=NULL, type="integer", help="Number of rows for faceting [default=%default]"),
 34 | make_option(c("-o", "--output"), help="output file name without extension", default='out'),
 35 | make_option(c("-v", "--verbose"), default=FALSE, action="store_true", help="verbose")
 36 | )
 37 | 
 38 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 39 | arguments <- parse_args(parser, positional_arguments = TRUE)
 40 | opt <- arguments$options
 41 | if (opt$verbose) {
 42 | 	print(opt)
 43 | }
 44 | 
 45 | 
 46 | ##--------------------##
 47 | ## BEGIN              ##
 48 | ##--------------------##
 49 | 
 50 | #output = sprintf("log_%s.pseudo_%s.fillby_%s.%s", opt$log, opt$pseudocount, opt$fill_by, opt$output)
 51 | 
 52 | m1 = read.table(opt$input1, h=T)
 53 | m2 = read.table(opt$input2, h=T)
 54 | 
 55 | if (opt$replace_na_with_0) {
 56 | m1 = replace(m1, is.na(m1), 0)
 57 | m2 = replace(m2, is.na(m2), 0)
 58 | }
 59 | 
 60 | # check order of headers
 61 | if (!all(colnames(m1) == colnames(m2))) {print('Headers are not the same!'); q(save='no') }
 62 | 
 63 | 
 64 | mm1 = melt(cbind(m1, gene=rownames(m1)), value.name="value1", variable.name='labExpId')
 65 | mm2 = melt(cbind(m2, gene=rownames(m2)), value.name="value2", variable.name='labExpId')
 66 | 
 67 | df = merge(mm1, mm2, by=c('gene', 'labExpId'))
 68 | 
 69 | if (opt$verbose) {print(head(df))}
 70 | 
 71 | 
 72 | # Log-transformation
 73 | if (opt$log=="both") {df$value1 <- log10(df$value1+opt$pseudocount); df$value2 <- log10(df$value2+opt$pseudocount)}
 74 | if (opt$log=="A") {df$value1 <- log10(df$value1+opt$pseudocount)}
 75 | if (opt$log=="B") {df$value2 <- log10(df$value2+opt$pseudocount)}
 76 | 
 77 | 
 78 | # read the metadata
 79 | if (!is.null(opt$metadata)) {
 80 | 	mdata = read.table(opt$metadata, h=T, sep='\t', row.names=NULL)
 81 | 	# select only the requested ones
 82 | 	labels = strsplit(opt$labels, ',')[[1]]
 83 | 	mdata = unique(mdata[c('labExpId', labels)])
 84 | 	mdata$labels = apply(mdata[labels], 1, function(x) paste(x, collapse='\n'))
 85 | 	mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub("[,-]", ".", x))
 86 | 	# add this info to the data.frame
 87 | 	df = merge(df, mdata[c('labExpId',"labels")], by='labExpId')
 88 | 
 89 | # calculate correlation coefficient
 90 | df = ddply(df, .(labExpId,labels), transform, pearson = cor(value1, value2, m='p', use='p'))
 91 | } else {
 92 | # calculate correlation coefficient
 93 | df = ddply(df, .(labExpId), transform, pearson = cor(value1, value2, m='p', use='p'))
 94 | df$labels = df$labExpId
 95 | }
 96 | 
 97 | 
 98 | ##########
 99 | # PLOTTING
100 | ##########
101 | 
102 | gp = ggplot(df, aes(x=value1, y=value2))
103 | if (opt$representation == "hex") {
104 | gp = gp + geom_hex(aes(fill=cut(..count..,c(0,1,2,5,10,25,50,75,100,500,Inf))),binwidth=c(.2,.2))
105 | gp = gp + scale_fill_manual('counts',values=terrain.colors(10))}
106 | if (opt$representation == "scatter") {
107 | gp = gp + geom_point(shape=".")}
108 | gp = gp + facet_wrap(~labels, nrow=opt$facet_nrow)
109 | gp = gp + geom_abline(slope=1, intercept=0, col='blue')
110 | gp = gp + geom_text(aes(x=min(value1,na.rm=T), y=max(value2,na.rm=T)-0.5, label=sprintf("r=%0.3f",pearson)), color='blue', hjust=0)
111 | gp = gp + labs(x=opt$tag1, y=opt$tag2)
112 | #gp
113 | ggsave(filename=sprintf('%s.pdf', opt$output), h=7, w=7)
114 | ggsave(filename=sprintf('%s.jpg', opt$output), h=7, w=7)
115 | 
116 | 
117 | 
118 | q(save='no')
119 | 
120 | 
121 | 
122 | 


--------------------------------------------------------------------------------
/plot.mean.sd.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | options(stringsAsFactors=F)
  4 | 
  5 | # ==================
  6 | # DEBUG OPTIONS
  7 | # ==================
  8 | 
  9 | opt = list()
 10 | opt$output = "detected.features.pdf"
 11 | opt$merge_mdata_on = "labExpId"
 12 | opt$base_size = 16
 13 | 
 14 | ##################
 15 | # OPTION PARSING
 16 | ##################
 17 | 
 18 | suppressPackageStartupMessages(library("optparse"))
 19 | 
 20 | option_list <- list(
 21 | 
 22 | make_option(c("-i", "--input_matrix"), default="stdin", 
 23 |     help="the matrix you want to analyze [default=%default]"),
 24 | 
 25 | make_option(c("-o", "--output"), default="detected.features.pdf",
 26 |     help="output file name. [default=%default]"),
 27 | 
 28 | make_option(c("--header"), action="store_true", default=FALSE,
 29 | 	help="input file has header [default=%default]"),
 30 | 
 31 | make_option(c('--log10'), action="store_true", default=FALSE,
 32 | 	help="Apply the log to the data [default=%default]"),
 33 | 
 34 | make_option(c("-x", "--x_index"), default=1, type="character",
 35 | 	help="column index (or indeces) for the x axis. [default=%default]"),
 36 | 
 37 | make_option(c("-y", "--y_index"), default=2, type="integer",
 38 | 	help="column index for the y axis. [default=%default]"),
 39 | 
 40 | make_option(c("-F", "--fun"), type="character", default="mean_sdl",
 41 | 	help="function to aggregate [default=%default]"),
 42 | 
 43 | make_option(c("-G", "--geom"), type="character", default="pointrange",
 44 | 	help="function to aggregate: pointrange | bar | crossbar [default=%default]"),
 45 | 
 46 | make_option(c("-c", "--color"), type="character", default="orange",
 47 | 	help="color of the point or bar [default=%default]"),
 48 | 
 49 | make_option(c("--fill"), type="character", default="orange",
 50 | 	help="fill of the point or bar. [default=%default]"),
 51 | #	help="fill of the point or bar. If an integer is provided, is the index of the column for the fill factor [default=%default]"),
 52 | 
 53 | make_option(c("-f", "--facet_by"),  
 54 | 	help="column index to facet by"),
 55 | 
 56 | make_option(c("--facet_scale"), default="fixed",
 57 | 	help="facet scale: fixed | free | free_x | free_y"),
 58 | 
 59 | make_option(c("--y_title"), default="Percentage of detected features",
 60 | 	help="title for the y axis [default=%default]"),
 61 | 
 62 | make_option(c("-B", "--base_size"), default=16,
 63 | 	help="font base size [default=%default]"),
 64 | 
 65 | make_option(c("-W", "--width"), default=7,
 66 | 	help="width of the plot in inches [default=%default]"),
 67 | 
 68 | make_option(c("-H", "--height"), default=5,
 69 | 	help="height of the plot in inches [default=%default]")
 70 | 
 71 | )
 72 | 
 73 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 74 | arguments <- parse_args(parser, positional_arguments = TRUE)
 75 | opt <- arguments$options
 76 | 
 77 | ##------------
 78 | ## LIBRARIES
 79 | ##------------
 80 |  
 81 | cat("Loading libraries... ")
 82 | #suppressPackageStartupMessages(library(reshape2))
 83 | suppressPackageStartupMessages(library(ggplot2))
 84 | source("~/R/functions.R")
 85 | cat("DONE\n\n")
 86 | 
 87 | 
 88 | #########
 89 | # BEGIN #
 90 | #########
 91 | 
 92 | # read input matrix (decide if including the header or not)
 93 | if (opt$input_matrix == "stdin") {
 94 | 	m = read.table(file("stdin"), h=opt$header, sep="\t")
 95 | } else {
 96 | 	m = read.table(opt$input_matrix, h=opt$header, sep="\t")
 97 | }
 98 | 
 99 | df = m
100 | 
101 | 
102 | # Define x and y according to user indeces
103 | #x = colnames(df)[opt$x_index]
104 | x_indeces = as.numeric(strsplit(opt$x_index, split=",")[[1]])
105 | df$x_labels = apply(df[x_indeces], 1, paste, collapse=',')
106 | x = 'x_labels'
107 | y = colnames(df)[opt$y_index]
108 | 
109 | # Apply the log if needed
110 | if (opt$log10) {df[y] <- log10(df[y])}
111 | 
112 | # Apply the function outside ggplot
113 | #tmp = as.data.frame(as.list(aggregate(formula, df, smedian.hilow)))
114 | #tmp = rename(tmp, replace=c("V4.Median" = "y", "V4.Lower"="ymin", "V4.Upper"="ymax"))
115 | 
116 | 
117 | # sort by mean
118 | formula = as.formula(sprintf("%s~%s", y, x))
119 | lev = aggregate(formula, df, mean, na.rm=T)
120 | lev = lev[order(lev[,y], decreasing=T),x]
121 | df[x] = factor(df[,x], levels=lev)
122 | 
123 | # ~~~~~~~~~
124 | # PLOT
125 | # ~~~~~~~~~
126 | 
127 | theme_set(theme_bw(base_size=opt$base_size))
128 | 
129 | gp = ggplot(df, aes_string(x=x, y=y))
130 | if (opt$geom == 'crossbar') {
131 | 	gp = gp + stat_summary(fun.data=opt$fun, mult=1, shape=15, size=1, width=0.4, color=opt$color, fill=opt$fill, geom=opt$geom)
132 | } else {
133 | 	gp = gp + stat_summary(fun.data=opt$fun, mult=1, shape=15, size=1, width=0.4, color=opt$color, geom="errorbar")
134 | 	gp = gp + stat_summary(fun.data=opt$fun, mult=1, shape=15, size=1, color=opt$color, fill=opt$fill, geom=opt$geom)
135 | }
136 | gp = gp + theme(axis.text.x=element_text(angle=45, hjust=1))
137 | gp = gp + labs(y=opt$y_title, x="")
138 | 
139 | 
140 | if (!is.null(opt$facet_by)) {
141 | 	facet = as.formula(sprintf("~%s", colnames(df)[as.numeric(opt$facet_by)]))
142 | 	gp = gp + facet_wrap(facet, nrow=1, scale=opt$facet_scale)
143 | }
144 | 
145 | 
146 | ggsave(opt$output, h=opt$height, w=opt$width)
147 | 
148 | q(save="no")
149 | 
150 | 
151 | 


--------------------------------------------------------------------------------
/GO_enrichment.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | # -- Variables --
  4 | 
  5 | options(stringsAsFactors=F)
  6 | pseudocount = 1e-04
  7 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 
  8 | 
  9 | 
 10 | ##################
 11 | # OPTION PARSING
 12 | ##################
 13 | 
 14 | suppressPackageStartupMessages(library("optparse"))
 15 | 
 16 | option_list <- list(
 17 | make_option(c("-u", "--universe"), help="a list of gene identifiers (ensEMBL ids), NO header"),
 18 | make_option(c("-G", "--genes"), default="stdin",
 19 | 	help="a list of gene identifiers for the foreground (ensEMBL ids), WITHOUT header [default=%default]"),
 20 | make_option(c("-c", "--categ"), help="choose the GO category < BP | MF | CC > [default=%default]", default="BP"),
 21 | make_option(c("-s", "--species"), help="choose the species < human | mouse | dmel > [default=%default]", default="human"),
 22 | make_option(c("-o", "--output"), help="additional tags for otuput [default=%default]", default="out"),
 23 | make_option(c("-d", "--output_dir"), default="./", help="directory for the output [default=%default]"),
 24 | make_option(c("--output_genes"), action="store_true", default=FALSE, help="Output the list of genes from enriched GO terms in a separate file [default=%default]"),
 25 | make_option(c("-v", "--verbose"), default=FALSE, action="store_true", help="verbose")
 26 | )
 27 | 
 28 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 29 | arguments <- parse_args(parser, positional_arguments = TRUE)
 30 | opt <- arguments$options
 31 | if (opt$verbose) {print(opt)}
 32 | 
 33 | 
 34 | 
 35 | ##------------
 36 | ## LIBRARIES
 37 | ##------------ 
 38 | 
 39 | if (opt$verbose) {cat("Loading libraries... ")}
 40 | 
 41 | 
 42 | suppressPackageStartupMessages(library("GO.db"))
 43 | if (opt$species == "human") {ann = "org.Hs.eg.db"; suppressPackageStartupMessages(library("org.Hs.eg.db"))}
 44 | if (opt$species == "mouse") {ann = "org.Mm.eg.db"; suppressPackageStartupMessages(library("org.Mm.eg.db"))}
 45 | if (opt$species == "dmel") {ann = "org.Dm.eg.db"; suppressPackageStartupMessages(library("org.Dm.eg.db"))}
 46 | suppressPackageStartupMessages(library("GOstats"))
 47 | suppressPackageStartupMessages(library("plyr"))
 48 | 
 49 | if (opt$verbose) {cat("DONE\n\n"); sessionInfo()}
 50 | 
 51 | ############################
 52 | # BEGIN
 53 | ############################
 54 | 
 55 | U = read.table(opt$universe, h=F, col.names='hs')
 56 | 
 57 | U$hs = unique(U$hs)
 58 | 
 59 | #if (opt$genes == "stdin") {
 60 | #	G = read.table(file("stdin"), h=T, col.names='hs')	
 61 | #} else {
 62 | #	G = read.table(opt$genes, h=T, col.names='hs')
 63 | #}
 64 | 
 65 | if (opt$genes == "stdin") {
 66 | 	G = read.table(file("stdin"), h=F, col.names='hs')	
 67 | } else {
 68 | 	G = read.table(opt$genes, h=F, col.names='hs')
 69 | }
 70 | 
 71 | 
 72 | # I want to create a list of parameters to perform GO enrichment on different gene sets
 73 | 
 74 | # take the entrez gene ids for all the orthologous genes which will be my universe (the same for all the sets)
 75 | if (opt$species == "human") {
 76 | universe = unlist(mget(U$hs, org.Hs.egENSEMBL2EG, ifnotfound=NA))}
 77 | 
 78 | if (opt$species == "mouse") {
 79 | universe = unlist(mget(U$hs, org.Mm.egENSEMBL2EG, ifnotfound=NA))}
 80 | 
 81 | if (opt$species == "dmel") {
 82 | universe = unlist(mget(U$hs, org.Dm.egENSEMBL2EG, ifnotfound=NA))}
 83 | 
 84 | 
 85 | if (opt$verbose) {sprintf("%s background genes; %s with a corresponding entrez id", nrow(U), length(unique(universe)))}
 86 | # how many genes am I able to map?
 87 | # First thing notice that also ensembl gene ids longer than 15 characters are included
 88 | # if I remove these genes I end up with:
 89 | # length(unique(as.character(universe[which(nchar(names(universe)) == 15)]))) ----> 15593
 90 | 
 91 | 
 92 | createParams = function(x, species="human") {
 93 | 	if (species == "human") {
 94 | 		geneset = unlist(mget(x, org.Hs.egENSEMBL2EG, ifnotfound=NA))
 95 | 	}
 96 | 	if (species == "mouse") {
 97 | 		geneset = unlist(mget(x, org.Mm.egENSEMBL2EG, ifnotfound=NA))
 98 | 	}
 99 | 	if (species == "dmel") {
100 | 		geneset = unlist(mget(x, org.Dm.egENSEMBL2EG, ifnotfound=NA))
101 | 	}
102 | 	sprintf("%s foreground genes; %s with a corresponding entrez id", length(x), length(unique(geneset)))
103 | 	pv = 1-(1-0.05)**(1/length(x))
104 | 	params = new("GOHyperGParams",
105 | 		geneIds = geneset,
106 | 		universeGeneIds = universe,
107 | 		annotation = ann,
108 | 		ontology = opt$categ,
109 | 		pvalueCutoff = pv,
110 | 		conditional = TRUE,
111 | 		testDirection='over')
112 | 	return(params)}
113 | 
114 | res = hyperGTest(createParams(unique(G$hs), opt$species))
115 | 
116 | if (opt$verbose) {
117 | cat ("Finished Hypergeometric test\n")
118 | }
119 | 
120 | # Reformat the output table
121 | df = summary(res)
122 | df$Pvalue = format(df$Pvalue, digits=1) 
123 | df$OddsRatio <- round(df$OddsRatio, 2)
124 | df$ExpCount <- round(df$ExpCount, 2)
125 | 
126 | 
127 | 
128 | # Get the genes for the enriched GO terms
129 | if (opt$output_genes) {
130 | 	enrichGenes <- ldply(geneIdsByCategory(res, catids=sigCategories(res, pvalueCutoff(res))), data.frame)
131 | 	enrichGenes[[2]] <- mapIds(eval(parse(text=ann)), keys=enrichGenes[[2]], keytype="ENTREZID", column=c("ENSEMBL"))
132 | 	colnames(enrichGenes) <- c("GO", "gene_id")
133 | 	output = sprintf("%s/%s.%s.genes", opt$output_dir, opt$output, opt$categ)
134 | 	write.table(enrichGenes, file=output, quote=F, sep="\t", row.names=F)
135 | }
136 | 
137 | 
138 | # Print output
139 | output = sprintf("%s/%s.%s", opt$output_dir, opt$output, opt$categ)
140 | write.table(df, file=sprintf("%s.tsv", output), quote=F, sep="\t", row.names=F)
141 | htmlReport(res, file=sprintf("%s.html", output))
142 | 
143 | 
144 | 
145 | 
146 | q(save='no')
147 | 
148 | 
149 | 


--------------------------------------------------------------------------------
/SOM.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript 
  2 | 
  3 | options(stringsAsFactors=F)
  4 | set.seed(1)
  5 | 
  6 | ##################
  7 | # OPTION PARSING
  8 | ##################
  9 | 
 10 | suppressPackageStartupMessages(library("optparse"))
 11 | 
 12 | option_list <- list(
 13 | make_option(c("-i", "--input_matrix"), default="stdin",
 14 | 	help="Columns are samples and rows are dimensions. Can be stdin [default=%default]"),
 15 | 
 16 | make_option(c("-r", "--replace_NAs"), default=FALSE, action='store_true',
 17 | 	help="Replace NAs by 0s. If FALSE, rows containing NAs are omitted. [default=%default]"),
 18 | 
 19 | make_option(c("-G", "--grid"), default="5x4",
 20 | 	help="Grid structure. Format: gridRowsxgridCols. [default=%default]"),
 21 | 
 22 | make_option(c("-t", "--topology"), default="hexagonal",
 23 | 	help="Grid topology. <hexagonal> | <rectangular> [default=%default]"),
 24 | 
 25 | make_option(c("-T", "--toroidal"), default=FALSE, action="store_true",
 26 | 	help="Toroidal strucure [default=%default]"),
 27 | 
 28 | make_option(c("-n", "--iterations"), default=100, 
 29 | 	help="Number of iteration [default=%default]"),
 30 | 
 31 | make_option(c("-m", "--metadata"), 
 32 | 	help="tsv file with metadata on matrix experiment"),
 33 | 
 34 | make_option(c("-f", "--fields"), 
 35 | 	help="choose the fields you want to use for super-organized maps, comma-separated. Needs a metadata file"),
 36 | 
 37 | make_option(c("-o", "--output"), default="SOM.out.tsv",
 38 | 	help="Output file name, with extension .tsv [default=%default]"),
 39 | 
 40 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, help="verbose output")
 41 | )
 42 | 
 43 | parser <- OptionParser(
 44 | 	usage = "%prog [options] file", 
 45 | 	description = "
 46 | 	Apply SOM to a matrix of values.
 47 | 	The resulting cluster unit is appended as last column to the original matrix.",
 48 | 	option_list=option_list)
 49 | arguments <- parse_args(parser, positional_arguments = TRUE)
 50 | opt <- arguments$options
 51 | if (opt$verbose) {print(opt)}
 52 | 
 53 | 
 54 | ##------------
 55 | ## LIBRARIES
 56 | ##------------ 
 57 | 
 58 | if(opt$verbose) {cat('Libraries loading... ')}
 59 | 
 60 | suppressPackageStartupMessages(library('reshape2'))
 61 | suppressPackageStartupMessages(library('ggplot2'))
 62 | suppressPackageStartupMessages(library("kohonen"))
 63 | 
 64 | if (opt$verbose) {cat('DONE\n\n')}
 65 | 
 66 | ##-------------##
 67 | ## BEGIN       ##
 68 | ##-------------##
 69 | 
 70 | # read the matrix from the command line
 71 | if (opt$input_matrix == "stdin") {
 72 | 	input = "stdin"
 73 | } else {
 74 | 	input = opt$input_matrix
 75 | }
 76 | 
 77 | #input = ifelse(opt$input_matrix == "stdin", file("stdin"), opt$input_matrix)
 78 | 
 79 | data = read.table(input, h=T)
 80 | init_nrow = nrow(data)
 81 | if (opt$verbose) {print(head(data))}
 82 | 
 83 | # Deal with NAs
 84 | if (opt$replace_NAs) {
 85 | 	m = replace(data, is.na(data), 0)
 86 | } else {
 87 | 	keep_index = which(rowSums(is.na(data)) == 0)
 88 | 	m = na.omit(data)
 89 | 	if(opt$verbose) {cat("Dimensions retained after omittig NAs:", nrow(m), "of", init_nrow, "\n")}
 90 | }
 91 | 
 92 | # Initialize the column with the clusters
 93 | data$K <- NA
 94 | 
 95 | # read the SOM grid 
 96 | gridStruct = as.numeric(strsplit(opt$grid, "x")[[1]])
 97 | gridRows = gridStruct[1]
 98 | gridCols = gridStruct[2]
 99 | if(opt$verbose) {cat("SOM grid:", gridStruct, "\n")}
100 | 
101 | 
102 | # read the metadata from the metadata file if provided
103 | if (!is.null(opt$metadata)) {
104 | 	merge_mdata_on = 'labExpId'
105 | 	toSuperSOM = list()
106 | 	mdata = read.table(opt$metadata, h=T, sep='\t')
107 | 	mdata[merge_mdata_on] <- gsub(",", ".", mdata[,merge_mdata_on])
108 | 	# Take only mdata rows which are in the columns of the input matrix
109 | 	mdata = mdata[which(mdata[,merge_mdata_on] %in% colnames(data)),]
110 | 	fields = strsplit(opt$fields, ",")[[1]]
111 | 	mdata = unique(mdata[,c(merge_mdata_on, fields)])
112 | 	levs = levels(as.factor(mdata[,fields]))
113 | 	for (lev in levs) {
114 | 		ids = mdata[which(mdata[, fields] == lev), merge_mdata_on]
115 | 		toSuperSOM[[lev]] <- as.matrix(m[ids])
116 | 	}
117 | 	SOM = supersom(
118 | 		toSuperSOM,
119 | 		grid = somgrid(gridCols, gridRows, opt$topology), 
120 | 		toroidal=opt$toroidal,
121 | 		rlen = opt$iterations
122 | 	)
123 | 	df_changes = melt(as.matrix(SOM$changes), varnames=c("x", "f"), value.name="y")
124 | 	data[keep_index, "K"] <- SOM$unit.classif
125 | } else {
126 | 	SOM = som(
127 | 		as.matrix(m), 
128 | 		grid = somgrid(gridCols, gridRows, opt$topology), 
129 | 		toroidal = opt$toroidal,
130 | 		rlen = opt$iterations
131 | 	)
132 | 	df_changes = data.frame(x = seq_along(SOM$changes), y = SOM$changes)
133 | 	data[keep_index, "K"] <- SOM$unit.classif
134 | }
135 | 
136 | 
137 | 
138 | # ~~~~~~~~~~~~ #
139 | # OUTPUT       #
140 | # ~~~~~~~~~~~~ #
141 | 
142 | write.table(data, file=opt$output, quote=FALSE, sep="\t")
143 | 
144 | # ~~~~~~~~~~ #
145 | # PLOT       #
146 | # ~~~~~~~~~~ #
147 | 
148 | out.prefix = gsub(".tsv", "", opt$output)
149 | 
150 | theme_set(theme_bw(base_size=16))
151 | 
152 | # convergence
153 | gp = ggplot(df_changes, aes(x, y)) 
154 | if (!is.null(opt$metadata)) {
155 | 	gp = gp + geom_line(aes(color=f, group=f))
156 | } else {
157 | 	gp = gp + geom_line()
158 | }
159 | gp = gp + labs(x='Iteration', y='Mean distance to unit')
160 | ggsave(sprintf('%s.convergence.pdf', out.prefix), h=5, w=5)
161 | 
162 | # Number of elements per unit
163 | df = setNames(as.data.frame(table(SOM$unit.classif)), c('x', 'y'))
164 | gp = ggplot(df, aes(x, y))
165 | gp = gp + stat_identity(geom='bar', color='blue', fill='white')
166 | gp = gp + labs(x='Unit', y='Number of elements')
167 | gp = gp + theme(axis.text.x = element_text(angle=45, hjust=1))
168 | ggsave(sprintf('%s.units.pdf', out.prefix), h=5, w=8)
169 | 
170 | 
171 | q(save='no')
172 | 


--------------------------------------------------------------------------------
/Rtsne.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | set.seed(1555)
  4 | 
  5 | # DEFAULT OPTIONS
  6 | 
  7 | opt = list()
  8 | opt$log10 = FALSE
  9 | opt$pseudocount = 1e-04
 10 | opt$row_as_variables = FALSE
 11 | 
 12 | ##------------
 13 | ## LIBRARIES
 14 | ##------------ 
 15 | suppressPackageStartupMessages(library("optparse"))
 16 | 
 17 | options(stringsAsFactors=F)
 18 | 
 19 | ##################
 20 | # OPTION PARSING
 21 | ##################
 22 | 
 23 | option_list <- list(
 24 | make_option(c("-i", "--input_matrix"), default="stdin", help="Matrix of observations (rows are the obs) or distances [default=%default]"),
 25 | make_option(c("-l", "--log10"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"),
 26 | make_option(c("-p", "--pseudocount"), type="double", help="specify a pseudocount for the log [default=%default]", default=0),
 27 | 
 28 | make_option(c("-T", "--input_type"), default="dist", help="Choose between dist and obs. [default=%default]"),
 29 | 
 30 | make_option(c("-n", "--iter"), default=1000, help="Number of iterations [default=%default]"),
 31 | 
 32 | make_option(c("-m", "--metadata"), help="Tab-separated file with the metadata. Can be omitted"),
 33 | 
 34 | make_option(c("--merge_mdata_on"), default="labExpId",
 35 | 	help="[default=%default]"),
 36 | 
 37 | make_option(c("--perplexity"), default=30, help="[default=%default]"),
 38 | 
 39 | #make_option(c("-o", "--output"), help="additional info you want to put in the output file name", default="out"),
 40 | make_option(c("-c", "--color_by"), help="choose the fields in the metadata you want to color by", type='character'),
 41 | 
 42 | make_option(c("-s", "--shape_by"), help="choose the fields in the metadata you want to shape by", type='character'),
 43 | 
 44 | make_option(c("-P", "--palette"), help="palette file [default=%default]"),
 45 | 
 46 | make_option(c("-W", "--width"), default=7,
 47 | 	help="Width of the plot in inches [default=%default]"),
 48 | 
 49 | make_option(c("-H", "--height"), default=7,
 50 | 	help="Height of the plot in inches [default=%default]"),
 51 | 
 52 | make_option(c("-o", "--output"), default="Rtsne",
 53 | 	help="Prefix for the file names. [default=%default]"),
 54 | 
 55 | make_option(c("-v", "--verbose"), action='store_true', default=FALSE,
 56 | 	help="verbose output [default=%default]")
 57 | )
 58 | 
 59 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list,
 60 | 	description="\nPerform Barnes Hut t-SNE. Return a plot with the points in the new 2D and a plot of the cost")
 61 | arguments <- parse_args(parser, positional_arguments = TRUE)
 62 | opt <- arguments$options
 63 | 
 64 | if (opt$verbose) {print(opt)}
 65 | 
 66 | suppressPackageStartupMessages(library("reshape"))
 67 | suppressPackageStartupMessages(library("ggplot2"))
 68 | suppressPackageStartupMessages(library("Rtsne"))
 69 | suppressPackageStartupMessages(library("plyr"))
 70 | 
 71 | ###############
 72 | # BEGIN
 73 | ##############
 74 | 
 75 | # read input matrix
 76 | if (opt$input_matrix == "stdin") {input=file("stdin")} else {input=opt$input_matrix}
 77 | m = read.table(input, h=T, sep="\t")
 78 | id = colnames(m)
 79 | 
 80 | # --- dist ---
 81 | if (opt$input_type == "dist") {
 82 | 	m = as.dist(m)
 83 | }
 84 | 
 85 | # --- obs ---
 86 | if (opt$input_type == "obs") {
 87 | 	# apply the log if required
 88 | 	if (opt$log10) {m = log10(replace(m, is.na(m), 0) + opt$pseudocount)}
 89 | 	m = t(m)
 90 | }
 91 | 
 92 | 
 93 | ## Read the color palette
 94 | if (!is.null(opt$palette)) {my_palette = read.table(opt$palette, h=F, comment.char="%", sep="\t")$V1}
 95 | 
 96 | #
 97 | ## Read the shapes
 98 | #if (!is.null(opt$shapes)) {
 99 | #	my_shapes = read.table(opt$shapes, h=F, comment.char="%")$V1
100 | #}
101 | #
102 | tsne = Rtsne(m, max_iter=opt$iter, perplexity=opt$perplexity, pca=FALSE, is.distance=(opt$input_type=="dist"))
103 | 
104 | # Plot the cost at each iteration
105 | cost_df = data.frame(iter=seq_along(tsne$itercosts), cost=tsne$itercosts)
106 | gp = ggplot(cost_df, aes(x=iter, y=cost)) + geom_line()
107 | ggsave(sprintf("%s.cost.pdf", opt$output), h=5, w=7)
108 | 
109 | 
110 | # ***************
111 | # Plot
112 | # ***************
113 | 
114 | map_df = data.frame(x=tsne$Y[,1], y=tsne$Y[,2], id=id)
115 | 
116 | 
117 | # HANDLING METADATA
118 | 
119 | if (!is.null(opt$metadata)){
120 | 	mdata = read.table(opt$metadata, h=T, sep="\t", row.names=NULL, quote="");
121 | 	mdata[,opt$merge_mdata_on] <- gsub("[,-]", ".", mdata[,opt$merge_mdata_on])
122 | 	mdata <- unique(mdata[,c(opt$merge_mdata_on, opt$color_by, opt$shape_by)])
123 | 	map_df <- merge(map_df, mdata, by.x="id", by.y=opt$merge_mdata_on)
124 | }
125 | 
126 | 
127 | # PLOTTING
128 | 
129 | theme_set(theme_bw(base_size=18))
130 | theme_update(
131 | 	legend.key = element_blank()
132 | )
133 | 
134 | 
135 | geom_params = list()
136 | mapping = list()
137 | 
138 | geom_params$size = 4
139 | 
140 | if (is.null(opt$color_by)) {
141 | 	geom_params$color = "black"
142 | } else {
143 | 	mapping = modifyList(mapping, aes_string(color=opt$color_by))
144 | }
145 | 
146 | if (is.null(opt$shape_by)) {
147 | 	geom_params$shape = 19
148 | } else {
149 | 	mapping = modifyList(mapping, aes_string(shape=opt$shape_by))
150 | }
151 | 
152 | class(mapping) <- "uneval"
153 | 
154 | pointLayer <- layer(
155 | 	geom = "point",
156 | #	geom_params = geom_params,
157 | 	params = geom_params,
158 | 	stat = "identity",
159 | 	position = "identity",
160 | 	mapping = mapping	
161 | )
162 | 
163 | 
164 | gp = ggplot(map_df, aes(x=x, y=y)) + pointLayer
165 | gp = gp + labs(x=NULL, y=NULL)
166 | if (!is.null(opt$palette)) {
167 | 	if (is.discrete(map_df[,opt$color_by])) {
168 | 		gp = gp + scale_color_manual(values=my_palette)
169 | 	} else {
170 | 		gp = gp + scale_color_gradientn(colours=my_palette)
171 | 	}
172 | }
173 | ggsave(sprintf("%s.map.pdf", opt$output), w=opt$width, h=opt$height)
174 | 
175 | q(save='no')
176 | 


--------------------------------------------------------------------------------
/major.isoform.across.samples.R:
--------------------------------------------------------------------------------
  1 | 
  2 | #############################
  3 | # LIBRARIES
  4 | #####################
  5 | 
  6 | suppressPackageStartupMessages(library(reshape2))
  7 | suppressPackageStartupMessages(library(ggplot2))
  8 | suppressPackageStartupMessages(library("optparse"))
  9 | 
 10 | options(stringsAsFactors=F)
 11 | pseudocount = 1e-04
 12 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 
 13 | 
 14 | ##################
 15 | # OPTION PARSING
 16 | ##################
 17 | 
 18 | 
 19 | option_list <- list(
 20 | make_option(c("-i", "--input_matrix"), help="the matrix with transcript expression you want to analyze"),
 21 | make_option(c("-a", "--annotation"), help="two-column file with gene and tx ids"),
 22 | make_option(c("-o", "--output"), help="choose the name for the output file WITHOUT extension"),
 23 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
 24 | make_option(c("-c", "--compare_by"), help="field of the metadata by which you want to compare")
 25 | )
 26 | 
 27 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 28 | arguments <- parse_args(parser, positional_arguments = TRUE)
 29 | opt <- arguments$options
 30 | 
 31 | #q(save='no')
 32 | 
 33 | 
 34 | entropy = function(x)  {x = x[!is.na(x)]; x =x[x!=0]; p=x/sum(x); if (length(x) != 0) {return(-sum(p*log(p)))} else {NA}}
 35 | 
 36 | which_is_maj_iso = function(x) if (sum(x==0)==length(x)) {return(0)} else {return(which.max(x))}
 37 | gn_fun1 = function(x) max(table(as.numeric(x)))  # to be revised!!!
 38 | gn_fun2 = function(x) length(unique(x[x!=0]))
 39 | 
 40 | 
 41 | ###########
 42 | ## HUMAN ##
 43 | ###########
 44 | 
 45 | # using only protein coding genes AND transcripts
 46 | human_gn_tx = read.table(opt$annotation, h=F, col.names=c('gene','tx'))
 47 | # transcript matrix
 48 | human_expr = read.table(opt$input_matrix, h=T)
 49 | human_expr[is.na(human_expr)] <- 0
 50 | 
 51 | # transcript matrix only for protein coding transcripts with gene id appended
 52 | human_tx_gn_expr = merge(human_gn_tx,human_expr,by.x='tx',by.y='row.names')
 53 | human_expr_iso = aggregate(human_tx_gn_expr[,-(1:2)], list(human_tx_gn_expr$gene), which_is_maj_iso)
 54 | rownames(human_expr_iso) = human_expr_iso[,1]
 55 | human_expr_iso[,1] = NULL
 56 | human_expr_iso_fun = data.frame(n_samples=apply(human_expr_iso,1,gn_fun1),n_maj_iso=apply(human_expr_iso,1,gn_fun2), organism='Human')
 57 | 
 58 | # add the number of annotated for human
 59 | hs_ann_iso = data.frame(table(human_gn_tx$gene))
 60 | names(hs_ann_iso) = c('gene','ann_iso')
 61 | human_expr_iso_fun_ann = merge(hs_ann_iso, human_expr_iso_fun, by.x='gene', by.y='row.names')
 62 | 
 63 | hs_mm_expr_iso_fun_ann = human_expr_iso_fun_ann
 64 | 
 65 | 
 66 | # read the metadata
 67 | mdata = read.table(opt$metadata, h=T, sep='\t')
 68 | 
 69 | variables = unique(mdata[,opt$compare_by])
 70 | 
 71 | shift = sapply(variables, function(x) {ids = unique(mdata[which(mdata[opt$compare_by]==x), 'labExpId']);
 72 | apply(human_expr_iso[ids], 1, function(x) if (length(unique(x))==1) {return(unique(x))}else{return(NA)} )
 73 | })
 74 | 
 75 | for (el in variables){
 76 | ids = unique(mdata[which(mdata[opt$compare_by]==el), 'labExpId'])
 77 | human_expr_iso[el] = apply(human_expr_iso[ids], 1, function(x) if (length(unique(x))==1) {return(unique(x))}else{return(NA)} )
 78 | }
 79 | 
 80 | shift2ggplot = data.frame(
 81 | # Total genes in the annotation
 82 | Total_genes = nrow(shift), 
 83 | # Genes with the same major isoform within the same group
 84 | same_maj_iso_within = nrow(shift[which(rowSums(shift==0) ==0),]),
 85 | # Genes with the same major isoform consistent within the group, but different between
 86 | diff_maj_iso_between = sum(apply(shift[which(rowSums(shift==0) ==0),], 1, function(x) length(unique(x)))>1) 
 87 | )
 88 | 
 89 | write.table(shift2ggplot, sprintf("%s.tsv", opt$output), sep='\t', quote=F, row.names=F)
 90 | 
 91 | ## PLOT ##
 92 | 
 93 | library(ggplot2)
 94 | pdf(sprintf("%s.pdf", opt$output),width=12)
 95 | 
 96 | #ggplot(subset(hs_mm_expr_iso_fun_ann,ann_iso <= 25),aes(x=as.factor(ann_iso), y= n_samples)) + geom_boxplot() + facet_wrap(~organism,scale='free') + xlab('Number of annotated isoforms') + ylab('Max number of samples where the same major isoform is expressed') + opts(title='Most diffused major isoform')
 97 | 
 98 | ggplot(subset(hs_mm_expr_iso_fun_ann, ann_iso >= 2 & n_maj_iso>0),aes(x=as.factor(n_maj_iso),fill=cut(ann_iso, c(1,2,3,4,5,10,15,25,Inf)))) + geom_histogram(position='stack') + guides(fill = guide_legend(reverse = TRUE)) + scale_fill_hue(name='Total isoforms per gene',labels=c(2,3,4,5,'6-10','11-15','16-25','>25')) + labs(x='Major isoforms per gene', y='Number of protein coding genes') 
 99 | 
100 | ggplot(subset(hs_mm_expr_iso_fun_ann,ann_iso<=35 & ann_iso>=2 & n_maj_iso>=1 ),aes(x=as.numeric(ann_iso))) + geom_histogram(aes(fill=cut(n_maj_iso, c(0,1,2,3,4,5,6,7,Inf))),position='stack',binwidth=1) + guides(fill=guide_legend(reverse=TRUE)) + stat_bin(aes(y=..count../x),breaks=seq(2,34,by=1),geom='line') + labs(x='Annotated isoforms per gene', y='Number of protein coding genes') + scale_fill_hue(name='Major isoforms per gene',labels=c(1,2,3,4,5,6,7,'>8'))
101 | 
102 | gp = ggplot(subset(hs_mm_expr_iso_fun_ann, ann_iso >= 2 & n_maj_iso>0), aes(x=ann_iso, y=n_maj_iso)) 
103 | gp = gp + stat_bin2d(binwidth=c(1,1),aes(fill=cut(..count.., c(0,1,5,10,50,100,500,1000,2000,3000,Inf))),colour="black",size=.2) 
104 | gp = gp + scale_fill_brewer(name='Number\nof genes', palette='Greens') + xlim(c(2,20)) 
105 | gp = gp + ylim(c(0,20)) 
106 | gp = gp + labs(x='Annotated isoforms per gene', y='Major isoforms per gene')
107 | gp
108 | 
109 | #ggplot(subset(hs_mm_expr_iso_median_ann,hs_ann_iso!=1 & mm_ann_iso!=1 ),aes(x=floor(mm_entropy*10),y=floor(hs_entropy*10))) + stat_bin2d(binwidth=c(1,1),aes(fill=cut(..count.., c(0,1,5,10,100,500,1000,2000,3000,Inf))),colour ="black",size=.2) + scale_fill_brewer("count") + geom_abline(linetype=4) + opts(title="Median of entropy of isoform expression") + xlab('Mouse (entropy*10)') + ylab('Human (entropy*10)')
110 | dev.off()
111 | 


--------------------------------------------------------------------------------
/add_quantile.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | options(stringsAsFactors=FALSE)
  4 | 
  5 | ##################
  6 | # OPTION PARSING
  7 | ##################
  8 | 
  9 | suppressPackageStartupMessages(library("optparse"))
 10 | 
 11 | option_list <- list(
 12 | 
 13 | make_option(c("-i", "--input"), default="stdin",
 14 | 	help="File or stdin [default=%default]"),
 15 | 
 16 | make_option(c("-o", "--output"), default="quantile.out.tsv",
 17 | 	help="Output file name. <stdout> for printing on stdout [default=%default]"),
 18 | 
 19 | make_option(c("--header"), action="store_true", default=FALSE,
 20 | 	help="Use this if the input has a header [default=%default]"),
 21 | 
 22 | make_option(c("-m", "--method"), default="quantiles",
 23 | 	help="Choose the way to bin the data: < quantiles | breaks > [default=%default]"),
 24 | 
 25 | make_option(c("-c", "--column"), default=1,
 26 | 	help="The column with the values to divide in quantiles [default=%default]"),
 27 | 
 28 | make_option(c("-q", "--quantiles"), default=4, type="integer",
 29 | 	help="Number of quantiles [default=%default]"),
 30 | 
 31 | make_option(c("-s", "--resolve_breaks"), default=FALSE, action="store_true",
 32 | 	help="When breaks are not unique, don't crash but create fewer quantiles [default=%default]"),
 33 | 
 34 | make_option(c("-b", "--breaks"), default="c(-Inf,1,2,3,Inf)", 
 35 | 	help="Breaks for the intervals, comma-separated, with c notation [default=%default]"), 
 36 | 
 37 | make_option(c("--paste"), action="store_true", default=FALSE,
 38 | 	help="Paste index and interval in a unique column [default=%default]"),
 39 | 
 40 | make_option(c("--rowNames"), action="store_true", default=FALSE,
 41 | 	help="Print row names in the output [default=%default]"),
 42 | 
 43 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
 44 | 	help="if you want more output [default=%default]")
 45 | 
 46 | )
 47 | 
 48 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 49 | arguments <- parse_args(parser, positional_arguments = TRUE)
 50 | opt <- arguments$options
 51 | if (opt$verbose) {print(opt)}
 52 | 
 53 | #------------
 54 | # LIBRARIES
 55 | #------------ 
 56 | 
 57 | if (opt$verbose) {cat("Loading libraries... ")}
 58 | #suppressPackageStartupMessages(library(reshape2))
 59 | suppressPackageStartupMessages(library(ggplot2))
 60 | #suppressPackageStartupMessages(library(plyr))
 61 | if (opt$verbose) {cat("DONE\n\n")}
 62 | 
 63 | 
 64 | # ============== #
 65 | # Functions      #
 66 | # ============== #
 67 | 
 68 | formatInterval = function(interval) {
 69 | 
 70 | 	interval = as.character(interval)
 71 | 	if (length(grep("-Inf", interval) != 0)) {
 72 | 		string = strsplit(interval, split=",")[[1]][2]
 73 | 		outInterval = sprintf("<=%s", gsub("[\\)\\]]", "", string, perl=TRUE))
 74 | 		#outInterval = sprintf("<=%s", gsub("\\)", "", strsplit(interval, split=",")[[1]][2]))
 75 | 		return(outInterval)
 76 | 	}
 77 | 	if (length(grep("Inf", interval) != 0)) {
 78 | 		string = strsplit(interval, split=",")[[1]][1]
 79 | 		outInterval = sprintf(">%s", gsub("[\\(\\[]", "", string, perl=TRUE))
 80 | 		return(outInterval)
 81 | 	}
 82 | 
 83 | #	left = gsub("[\\(\\]]", "", strsplit(interval, split=",")[[1]][1], perl=TRUE)
 84 | #	right = gsub("\\)", "", strsplit(interval, split=",")[[1]][2])
 85 | #	outInterval = sprintf("%s <= x < %s", left, right)
 86 | 	outInterval = interval
 87 | 	return(outInterval)
 88 | }
 89 | 
 90 | 
 91 | # ======== #
 92 | # BEGIN    #
 93 | # ======== #
 94 | 
 95 | 
 96 | # Read data
 97 | 
 98 | if (opt$input == "stdin") {inF = file("stdin")} else {inF = opt$input}
 99 | m = read.table(inF, h=opt$header, sep="\t") 
100 | 
101 | pasteSEP= ":"
102 | 
103 | # Quantiles
104 | 
105 | if (opt$method == "quantiles") {
106 | 
107 | quantile_header = sprintf("quantile_%s", colnames(m)[opt$column])
108 | quant_index_header = sprintf("quant_index_%s", colnames(m)[opt$column])
109 | quantile_paste_header = sprintf("quant_paste_%s", colnames(m)[opt$column])
110 | 
111 | if (opt$resolve_breaks) {
112 | 	breaks <- c(unique(quantile(m[,opt$column], probs=seq(0,1,1/opt$quantiles), na.rm=T)))
113 | 	m[,quantile_header] = cut(m[,opt$column], breaks, include.lowest=TRUE)
114 | 	m[,quant_index_header] = match(m[,quantile_header], levels(m[,quantile_header]))
115 | } else {
116 | 	m[,quantile_header] = cut_number(m[,opt$column], opt$quantiles)
117 | 	m[,quant_index_header] = as.numeric(cut_number(m[,opt$column], opt$quantile))
118 | }
119 | 
120 | 
121 | if (opt$paste) {
122 | 	m[,	quantile_paste_header] <- with(m, paste(quant_index_header, quantile_header, sep=pasteSEP)) 
123 | #	if (max(m[,quant_index_header]) > 9 & max(m[,quant_index_header]) < 100) {
124 | #		m[m$quant_index_header<9, quantile_paste_header] <- with(m{}, paste0("0", quantile_paste_header))
125 | #	}
126 | 	m[, quantile_header] <- NULL; m[, quant_index_header] <- NULL
127 | }
128 | 
129 | 
130 | }
131 | 
132 | # Intervals
133 | 
134 | if (opt$method == "breaks") {
135 | 
136 | interval_header = sprintf("interval_%s", colnames(m)[opt$column])
137 | interval_index_header = sprintf("interval_index_%s", colnames(m)[opt$column])
138 | interval_paste_header = sprintf("interval_paste_%s", colnames(m)[opt$column])
139 | 
140 | breaks = eval(parse(text=opt$breaks))
141 | m[,interval_header] = cut(m[,opt$column], breaks=breaks, include.lowest=TRUE, right=TRUE, dig.lab=4)
142 | m[,interval_index_header] = cut(m[,opt$column], breaks=breaks, include.lowest=TRUE, right=TRUE, labels=FALSE)
143 | 
144 | # Format the interval
145 | m[,interval_header] = sapply(m[,interval_header], formatInterval)
146 | 
147 | if (opt$paste) {
148 | 	m[,	interval_paste_header] <- paste( as.character( m[, interval_index_header] ), m[, interval_header], sep=pasteSEP)
149 | 	if (max(m[,interval_index_header]) > 9 & max(m[,interval_index_header]) < 100) {
150 | 		m[m[,interval_index_header]<10, interval_paste_header] <- paste0("0", m[m[,interval_index_header]<10, interval_paste_header])
151 | 	}
152 | 	m[, interval_header] <- NULL; m[, interval_index_header] <- NULL
153 | }
154 | 
155 | }
156 | 
157 | 
158 | # Print output
159 | 
160 | output = ifelse(opt$output == "stdout", "", opt$output)
161 | write.table(m, file=output, row.names=opt$rowNames, quote=FALSE, col.names=opt$header, sep="\t")
162 | 
163 | 
164 | # EXIT
165 | quit(save='no')
166 | 


--------------------------------------------------------------------------------
/nt.coverage.R:
--------------------------------------------------------------------------------
  1 | 
  2 | ##------------
  3 | ## LIBRARIES
  4 | ##------------ 
  5 | suppressPackageStartupMessages(library(reshape2))
  6 | suppressPackageStartupMessages(library(ggplot2))
  7 | suppressPackageStartupMessages(library("optparse"))
  8 | suppressPackageStartupMessages(library(plyr))
  9 | 
 10 | 
 11 | #options(stringsAsFactors=F)
 12 | pseudocount = 1e-04
 13 | cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 
 14 | 
 15 | ##################
 16 | # OPTION PARSING
 17 | ##################
 18 | 
 19 | 
 20 | option_list <- list(
 21 | make_option(c("-i", "--input_matrix"), help="the matrix you want to analyze"),
 22 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
 23 | make_option(c("-o", "--output"), help="additional flags for otuput", default="out"),
 24 | make_option(c("-d", "--outdir"), help="directory for the output", default="./"),
 25 | #make_option(c("-c", "--color_by"), help="choose the color you want to color by [default=NA]", type='character', default=NA),
 26 | make_option(c("-f", "--field"), help="dashboard field by which the individuals are grouped"),
 27 | make_option(c("-H", "--height"), default=5, help="output height in inches [default=%default]"),
 28 | make_option(c("-W", "--width"), default=9, help="output width in inches [default=%default]"),
 29 | make_option(c("--facet_nrow"), default=1, help="number of rows when faceting [default=%default]")
 30 | 
 31 | #make_option(c("-t", "--tags"), help="comma-separated field names you want to display in the labels", default="cell,sex,age")
 32 | )
 33 | 
 34 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 35 | arguments <- parse_args(parser, positional_arguments = TRUE)
 36 | opt <- arguments$options
 37 | print(opt)
 38 | 
 39 | na2null = function(x) if(is.na(x)) {return(NULL)}else{return(x)}
 40 | 
 41 | 
 42 | ##--------------------##
 43 | ## CLUSTERING SAMPLES ##
 44 | ##--------------------##
 45 | output = sprintf("%s.%s", basename(opt$input_matrix), opt$output)
 46 | 
 47 | # read the matrix from the command line
 48 | m = read.table(opt$input_matrix, h=F, col.names=c("element","labExpId","n_det_el","prop","tag","det"))
 49 | 
 50 | # read the metadata from the metadata file
 51 | mdata = read.table(opt$metadata, h=T, sep='\t')
 52 | mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x))
 53 | 
 54 | # prepare data.frame for ggplot
 55 | df = merge(subset(m, tag=="individual"), unique(mdata[c("labExpId", opt$field)]), by='labExpId')
 56 | 
 57 | # duplicate the data.frame to plot all
 58 | df_copy = df
 59 | df_copy[opt$field]= "all"
 60 | 
 61 | # attach the two data frames
 62 | new_df = rbind(df, df_copy)
 63 | 
 64 | 
 65 | # order the elements with the hierarchy: exonic, intronic, intergenic, total
 66 | new_df$element <- factor(new_df$element, levels=c('exonic','intronic','intergenic','total'))
 67 | #new_df$labels <- factor(new_df$det, labels = c("Genome covered", "Covered by RNA-seq", "RNA-seq distribution"))
 68 | new_df$facet2 = interaction(new_df$element, new_df$det)
 69 | 
 70 | # create a separate data.frame for cumulative coverage
 71 | cumul_df = subset(m, tag=='cumulative')
 72 | cumul_df$facet2 = interaction(cumul_df$element, cumul_df$det)
 73 | cumul_df[opt$field] = cumul_df$labExpId
 74 | #cumul_df$cell = cumul_df$labExpId
 75 | 
 76 | 
 77 | #gp = ggplot(new_df, aes(x=element, y=prop))
 78 | #gp = gp + geom_boxplot() 
 79 | #gp = gp + facet_grid(cell~det)
 80 | ##gp = gp + facet_grid(sprintf(".~%s", opt$field), scales = 'free_y') 
 81 | ##gp = gp + facet_grid(new_df$cell~facet, scales = 'free_y') 
 82 | ##gp = gp + geom_point(data=subset(m, tag=='cumulative'), aes_string(x="element", y="prop"), size = 4, alpha = 0.7)
 83 | #gp = gp + labs(y='Proportion of detected nucleotides', x="")
 84 | #gp = gp + theme(axis.text = element_text(size=15), axis.text.x=element_text(angle=45))
 85 | #gp
 86 | #
 87 | #gp = ggplot(new_df, aes(x=element, y=prop))
 88 | #gp = gp + geom_boxplot(aes(col=cell)) 
 89 | #gp = gp + facet_grid(.~labels, scales='free_x')
 90 | ##gp = gp + facet_grid(sprintf(".~%s", opt$field), scales = 'free_y') 
 91 | ##gp = gp + facet_grid(new_df$cell~facet, scales = 'free_y') 
 92 | ##gp = gp + geom_point(data=subset(m, tag=='cumulative'), aes_string(x="element", y="prop"), size = 4, alpha = 0.7)
 93 | #gp = gp + labs(y='Proportion of detected nucleotides', x="Genomic domain (D)")
 94 | #gp = gp + theme(axis.text = element_text(size=15), axis.text.x=element_text(angle=45, hjust=1))
 95 | #gp
 96 | #
 97 | ## almost final without cumulative
 98 | #gp = ggplot(new_df, aes(x=facet2, y=prop))
 99 | #gp = gp + geom_boxplot(aes(col=labels)) 
100 | #gp = gp + facet_grid(.~cell, scales='free_x')
101 | ##gp = gp + facet_grid(sprintf(".~%s", opt$field), scales = 'free_y') 
102 | ##gp = gp + facet_grid(new_df$cell~facet, scales = 'free_y') 
103 | ##gp = gp + geom_point(data=subset(m, tag=='cumulative'), aes_string(x="element", y="prop"), size = 4, alpha = 0.7)
104 | #gp = gp + labs(y='Proportion of nucleotides (%)', x="Genomic domain (D)")
105 | #gp = gp + theme(axis.text = element_text(size=15), axis.text.x=element_text(angle=45, hjust=1))
106 | #gp = gp + scale_x_discrete(labels=c('total','exonic','intronic','intergenic','exonic','intronic','intergenic'))
107 | #gp
108 | 
109 | # 
110 | 
111 | gp = ggplot(new_df, aes(x=facet2, y=prop))
112 | gp = gp + geom_boxplot(aes(fill=as.character(det)) )
113 | #gp = gp + facet_grid(.~cell, scales='free_x')
114 | if (!is.null(opt$field)) {
115 | #gp = gp + facet_grid(sprintf(".~%s", opt$field), scales='free_x')}
116 | gp = gp + facet_wrap(as.formula(sprintf("~%s", opt$field)), nrow=opt$nrow, scales='free_x')}
117 | gp = gp + geom_point(data=cumul_df, aes(x=facet2, y=prop, col=as.character(det)), size = 4, alpha = 0.7)
118 | gp = gp + geom_point(data=cumul_df, aes(x=facet2, y=prop), col='black', size = 4, alpha = 0.7, shape=1)
119 | gp = gp + labs(y='Proportion of nucleotides (%)', x="Genomic domain (D)")
120 | gp = gp + theme(axis.text = element_text(size=15), axis.text.x=element_text(angle=45, hjust=1))
121 | gp = gp + scale_x_discrete(labels=c('total','exonic','intronic','intergenic','exonic','intronic','intergenic'))
122 | gp = gp + scale_fill_hue(name='',labels = c("Genome covered", "Covered by RNA-seq", "RNA-seq distribution"))
123 | gp = gp + scale_color_hue(name='Cumulative',labels = c("Genome covered", "Covered by RNA-seq", "RNA-seq distribution"))
124 | 
125 | 
126 | w = opt$width
127 | h = opt$height
128 | 
129 | ggsave(filename=sprintf("%s/%s.pdf", opt$outdir, output), w=w, h=h)
130 | ggsave(filename=sprintf("%s/%s.png", opt$outdir, output), w=w, h=h)
131 | ggsave(filename=sprintf("%s/%s.eps", opt$outdir, output), w=w, h=h)
132 | 
133 | 
134 | q(save='no')
135 | 


--------------------------------------------------------------------------------
/dimensRed.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | set.seed(1555)
  4 | 
  5 | # DEFAULT OPTIONS
  6 | 
  7 | opt = list()
  8 | opt$log10 = FALSE
  9 | opt$pseudocount = 1e-04
 10 | opt$row_as_variables = FALSE
 11 | 
 12 | ##------------
 13 | ## LIBRARIES
 14 | ##------------ 
 15 | suppressPackageStartupMessages(library("optparse"))
 16 | 
 17 | options(stringsAsFactors=F)
 18 | 
 19 | ##################
 20 | # OPTION PARSING
 21 | ##################
 22 | 
 23 | option_list <- list(
 24 | make_option(c("-i", "--input_matrix"), default="stdin", help="Matrix of observations (rows are the obs) or distances [default=%default]"),
 25 | make_option(c("-l", "--log10"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"),
 26 | make_option(c("-p", "--pseudocount"), type="double", help="specify a pseudocount for the log [default=%default]", default=0),
 27 | 
 28 | make_option(c("-T", "--input_type"), default="dist", help="Choose between dist and obs. [default=%default]"),
 29 | 
 30 | make_option(c("-n", "--iter"), default=1000, help="Number of iterations [default=%default]"),
 31 | 
 32 | make_option(c("-m", "--metadata"), help="Tab-separated file with the metadata. Can be omitted"),
 33 | 
 34 | make_option(c("-M", "--method"), help="Choose one of: <MDS> <tSNE>"),
 35 | 
 36 | make_option(c("--merge_mdata_on"), default="labExpId",
 37 | 	help="[default=%default]"),
 38 | 
 39 | make_option(c("--perplexity"), default=30, help="[default=%default]"),
 40 | 
 41 | #make_option(c("-o", "--output"), help="additional info you want to put in the output file name", default="out"),
 42 | make_option(c("-c", "--color_by"), help="choose the fields in the metadata you want to color by", type='character'),
 43 | 
 44 | make_option(c("-s", "--shape_by"), help="choose the fields in the metadata you want to shape by", type='character'),
 45 | 
 46 | make_option(c("-P", "--palette"), help="palette file [default=%default]"),
 47 | 
 48 | make_option(c("--shapes"), 
 49 | 	help="File with the shapes [default=%default]"),
 50 | 
 51 | make_option(c("-b", "--base-size"), dest="base_size", default=18,
 52 | 	help="Text size [default=%default]"),
 53 | 
 54 | make_option(c("-W", "--width"), default=7,
 55 | 	help="Width of the plot in inches [default=%default]"),
 56 | 
 57 | make_option(c("-H", "--height"), default=7,
 58 | 	help="Height of the plot in inches [default=%default]"),
 59 | 
 60 | make_option(c("-o", "--output"), default="Rtsne",
 61 | 	help="Prefix for the file names. [default=%default]"),
 62 | 
 63 | make_option(c("-v", "--verbose"), action='store_true', default=FALSE,
 64 | 	help="verbose output [default=%default]")
 65 | )
 66 | 
 67 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list,
 68 | 	description="\nPerform Barnes Hut t-SNE. Return a plot with the points in the new 2D and a plot of the cost")
 69 | arguments <- parse_args(parser, positional_arguments = TRUE)
 70 | opt <- arguments$options
 71 | 
 72 | if (opt$verbose) {print(opt)}
 73 | 
 74 | suppressPackageStartupMessages(library("reshape"))
 75 | suppressPackageStartupMessages(library("ggplot2"))
 76 | suppressPackageStartupMessages(library("Rtsne"))
 77 | suppressPackageStartupMessages(library("plyr"))
 78 | 
 79 | ###############
 80 | # BEGIN
 81 | ##############
 82 | 
 83 | # read input matrix
 84 | if (opt$input_matrix == "stdin") {input=file("stdin")} else {input=opt$input_matrix}
 85 | m = read.table(input, h=T, sep="\t")
 86 | id = colnames(m)
 87 | 
 88 | # --- dist ---
 89 | if (opt$input_type == "dist") {
 90 | 	m = as.dist(m)
 91 | }
 92 | 
 93 | # --- obs ---
 94 | if (opt$input_type == "obs") {
 95 | 	# apply the log if required
 96 | 	if (opt$log10) {m = log10(replace(m, is.na(m), 0) + opt$pseudocount)}
 97 | 	m = t(m)
 98 | }
 99 | 
100 | 
101 | ## Read the color palette
102 | if (!is.null(opt$palette)) {my_palette = read.table(opt$palette, h=F, comment.char="%", sep="\t")$V1}
103 | 
104 | 
105 | # Read the shapes
106 | if (!is.null(opt$shapes)) {
107 | 	my_shapes = read.table(opt$shapes, h=F, comment.char="%")$V1
108 | }
109 | 
110 | if (opt$method == "tSNE") {
111 | 	tsne = Rtsne(m, max_iter=opt$iter, perplexity=opt$perplexity, pca=FALSE, is.distance=(opt$input_type=="dist"))
112 | 	x = tsne$Y[,1]
113 | 	y = tsne$Y[,2]
114 | 
115 | 	# Plot the cost at each iteration
116 | 	cost_df = data.frame(iter=seq_along(tsne$itercosts), cost=tsne$itercosts)
117 | 	gp = ggplot(cost_df, aes(x=iter, y=cost)) + geom_line()
118 | 	ggsave(sprintf("%s.cost.pdf", opt$output), h=5, w=7)
119 | 
120 | }
121 | 
122 | if (opt$method == "MDS") {
123 | 	fit <- cmdscale(m, eig=TRUE, k=2) 
124 | 	x <- fit$points[,1]
125 | 	y <- fit$points[,2]
126 | }
127 | 
128 | 
129 | # ***************
130 | # Plot
131 | # ***************
132 | 
133 | map_df = data.frame(x=x, y=y, id=id)
134 | 
135 | 
136 | # HANDLING METADATA
137 | 
138 | if (!is.null(opt$metadata)){
139 | 	mdata = read.table(opt$metadata, h=T, sep="\t", row.names=NULL, quote="", check.names=F);
140 | 	mdata[,opt$merge_mdata_on] <- gsub("[,-]", ".", mdata[,opt$merge_mdata_on])
141 | 	mdata <- unique(mdata[,c(opt$merge_mdata_on, opt$color_by, opt$shape_by)])
142 | 	map_df <- merge(map_df, mdata, by.x="id", by.y=opt$merge_mdata_on)
143 | }
144 | 
145 | 
146 | # PLOTTING
147 | 
148 | theme_set(theme_bw(base_size=opt$base_size))
149 | theme_update(
150 | 	legend.title = element_text(size=opt$base_size),
151 | 	legend.key = element_blank(),
152 | 	panel.grid = element_blank()
153 | )
154 | 
155 | 
156 | geom_params = list()
157 | mapping = list()
158 | 
159 | geom_params$size = 4
160 | 
161 | if (is.null(opt$color_by)) {
162 | 	geom_params$color = "black"
163 | } else {
164 | 	mapping = modifyList(mapping, aes_string(color=paste("`", opt$color_by, "`", sep="")))
165 | }
166 | 
167 | if (is.null(opt$shape_by)) {
168 | 	geom_params$shape = 19
169 | } else {
170 | 	mapping = modifyList(mapping, aes_string(shape=paste("`", opt$shape_by, "`", sep="")))
171 | }
172 | 
173 | class(mapping) <- "uneval"
174 | 
175 | pointLayer <- layer(
176 | 	geom = "point",
177 | #	geom_params = geom_params,
178 | 	params = geom_params,
179 | 	stat = "identity",
180 | 	position = "identity",
181 | 	mapping = mapping	
182 | )
183 | 
184 | 
185 | gp = ggplot(map_df, aes(x=x, y=y)) + pointLayer
186 | gp = gp + labs(x=paste(opt$method, "1", sep=" "), y=paste(opt$method, "2", sep=" "), title="")
187 | if (!is.null(opt$palette)) {
188 | 	if (is.discrete(map_df[,opt$color_by])) {
189 | 		gp = gp + scale_color_manual(values=my_palette)
190 | 	} else {
191 | 		gp = gp + scale_color_gradientn(colours=my_palette)
192 | 	}
193 | }
194 | if (!is.null(opt$shapes)) {
195 | 	gp = gp + scale_shape_manual(name=opt$shape_by, values=my_shapes);
196 | }
197 | 
198 | 
199 | ggsave(sprintf("%s.map.pdf", opt$output), w=opt$width, h=opt$height)
200 | 
201 | q(save='no')
202 | 


--------------------------------------------------------------------------------
/gglines.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | options(stringsAsFactors=F)
  4 | 
  5 | ##################
  6 | # OPTION PARSING
  7 | ##################
  8 | 
  9 | suppressPackageStartupMessages(library("optparse"))
 10 | 
 11 | option_list <- list(
 12 | 
 13 | make_option(c("-i", "--input"), type="character", default='stdin',
 14 | 	help="tab-separated file. Can be stdin [default=%default]"),
 15 | 
 16 | make_option(c("-o", "--output"), default="profile.pdf", 
 17 | 	help="output file name with extension [default=%default]"),
 18 | 
 19 | make_option(c("--header"), action="store_true", default=FALSE, 
 20 | 	help="The file has header [default=%default]"),
 21 | 
 22 | make_option(c("--group"), type="numeric",
 23 | 	help="Column index with the group factor [default=%default]"),
 24 | 
 25 | make_option(c("-c", "--color_by"), type='numeric',
 26 | 	help="Column index with the color factor. Leave empty for no color"),
 27 | 
 28 | make_option(c("-L", "--linetype_by"), type='numeric', 
 29 | 	help="Column index of the linetype factor, Leave empty for no linetype"),
 30 | 
 31 | make_option(c("-x", "--x_col"), type='numeric', default=1,
 32 | 	help="Column index of the x axis [default=%default]"),
 33 | 
 34 | make_option(c("-y", "--y_col"), type='numeric', default=2,
 35 | 	help="Column index of the y axis [default=%default]"),
 36 | 
 37 | make_option(c("-V", "--vertical_lines"), type='character', 
 38 | 	help="specify where you want the vertical lines [default=%default]"),
 39 | 
 40 | make_option(c("-a", "--alpha"), type="numeric", default=1,
 41 | 	help="Set transparency [default=%default]"),
 42 | 
 43 | #make_option(c("-f", "--facet"), type="integer", help="column index to facet"),
 44 | 
 45 | make_option(c("-t", "--title"), type="character", 
 46 | 	help="Main title for the plot. Leave emtpy for no title"),
 47 | 
 48 | make_option(c("--y_title"), default="norm_read_density", 
 49 | 	help="title for the y-axis [default=%default]"),
 50 | 
 51 | make_option(c("--x_title"), default="position", 
 52 | 	help="title for the x-axis [default=%default]"),
 53 | 
 54 | make_option(c("--scale_x_log10"), action="store_true", default=FALSE,
 55 | 	help="Change the x axis to log10 [default=%default]"),
 56 | 
 57 | make_option(c("--x_limits"), default=NULL,
 58 | 	help="Limits for the x axis [default=%default]"),
 59 | 
 60 | make_option(c("-P", "--palette"), 
 61 | 	help='File with colors for the lines. Leave empty to use even color spacing'),
 62 | 
 63 | make_option(c("-H", "--height"), default=5,
 64 | 	help="Height of the plot in inches [default=%default]"),
 65 | 
 66 | make_option(c("-W", "--width"), default=7,
 67 | 	help="Width of the plot in inches [default=%default]"),
 68 | 
 69 | make_option(c("-v", "--verbose"), action='store_true', default=FALSE,
 70 | 	help="Verbose output [default=%default]")
 71 | 
 72 | #make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"),
 73 | )
 74 | 
 75 | parser <- OptionParser(
 76 | 	usage = "%prog [options] file", 
 77 | 	option_list = option_list,
 78 | 	description = "From a column file, plot a column vs another as lines"
 79 | )
 80 | arguments <- parse_args(parser, positional_arguments = TRUE)
 81 | opt <- arguments$options
 82 | if (opt$verbose) {print(opt)}
 83 | 
 84 | 
 85 | ##------------
 86 | ## LIBRARIES
 87 | ##------------
 88 |  
 89 | if (opt$verbose) {cat("Loading libraries... ")}
 90 | suppressPackageStartupMessages(library(reshape2))
 91 | suppressPackageStartupMessages(library(ggplot2))
 92 | #suppressPackageStartupMessages(library(plyr))
 93 | if (opt$verbose) {cat("DONE\n\n")}
 94 | 
 95 | 
 96 | # ========
 97 | # BEGIN
 98 | # ========
 99 | 
100 | 
101 | 
102 | # Read data
103 | if (opt$input == "stdin") {input=file('stdin')} else {input=opt$input}
104 | m = read.table(input, h=opt$header, sep="\t")
105 | if(opt$verbose) {print(head(m))}
106 | 
107 | # Read palette
108 | if (!is.null(opt$palette)) {
109 | 	palette = read.table(opt$palette, h=FALSE, comment.char='%')$V1
110 | }
111 | 
112 | # Read coloring factor
113 | if (!is.null(opt$color_by)) {
114 | 	if (ncol(m)<opt$color_by) {
115 | 		cat("ERROR: color factor index out of range\n")
116 | 		q(save='no')
117 | 	}
118 | }
119 | 
120 | 
121 | #~~~~~~~~~~~~
122 | # GGPLOT
123 | #~~~~~~~~~~~~
124 | 
125 | theme_set(theme_bw(base_size=20))
126 | theme_update(
127 | 	panel.grid.minor = element_blank(),
128 | 	panel.grid.major = element_blank(),
129 | 	legend.key = element_blank()
130 | )
131 | 
132 | x = colnames(m)[opt$x_col]
133 | y = colnames(m)[opt$y_col]
134 | 
135 | 
136 | 
137 | alpha = opt$alpha
138 | 
139 | # -------------------------
140 | # geom_params and mapping
141 | # -------------------------
142 | 
143 | geom_params = list()
144 | mapping = list()
145 | 
146 | 
147 | # >> color <<
148 | 
149 | if (is.null(opt$color_by)) {
150 | 	geom_params$color = "black"
151 | } else {
152 | 	mapping = modifyList(mapping, aes_string(color=colnames(m)[opt$color_by]))
153 | }
154 | 
155 | 
156 | # >> linetype <<
157 | 
158 | if (is.null(opt$linetype_by)) {
159 | 	geom_params$linetype = 1
160 | } else {
161 | 	mapping = modifyList(mapping, aes_string(linetype=colnames(m)[opt$linetype_by]))
162 | }
163 | 
164 | 
165 | # >> group <<
166 | 
167 | if (!is.null(opt$group)) {
168 | 	group = colnames(m)[opt$group]
169 | 	mapping = modifyList(mapping, aes_string(group=colnames(m)[opt$group]))
170 | 	m[,group] <- gsub("\\\\n", "\n", m[,group])
171 | }
172 | 
173 | geom_params$alpha = alpha
174 | geom_params$size = 1
175 | 
176 | class(mapping) <- "uneval"
177 | 
178 | # -----------
179 | # lineLayer
180 | # -----------
181 | 
182 | lineLayer <- layer(
183 | 	geom = "line",
184 | 	params = geom_params,
185 | #	geom_params = geom_params,
186 | 	mapping = mapping,
187 | 	stat = "identity",
188 | 	position = "identity"
189 | )
190 | 
191 | 
192 | # plot 
193 | gp = ggplot(m, aes_string(x=x, y=y)) 
194 | 
195 | # Add line layer
196 | gp = gp + lineLayer
197 | 
198 | # Add the labels
199 | gp = gp + labs(title=opt$title, y=opt$y_title, x=opt$x_title)
200 | 
201 | # Color scale
202 | if (!is.null(opt$palette)) {
203 | 	gp = gp + scale_color_manual(values=palette)
204 | } else {
205 | 	gp = gp + scale_color_hue()
206 | }
207 | 
208 | # Vertical lines
209 | if (!is.null(opt$vertical_lines)) {
210 | 	vertical_lines = as.numeric(strsplit(opt$vertical_lines, ",")[[1]])
211 | 	gp = gp + geom_vline(xintercept=vertical_lines, color="grey", linetype="longdash")
212 | }
213 | 
214 | # Scale x log10
215 | if (opt$scale_x_log10) {
216 | 	gp = gp + scale_x_log10()
217 | }
218 | 
219 | # Read x limits
220 | if (!is.null(opt$x_limits)) {
221 | 	x_lim = as.numeric(strsplit(opt$x_limits, ",")[[1]])
222 | 	gp = gp + xlim(x_lim)
223 | }
224 | 
225 | ggsave(opt$output, h=opt$height, w=opt$width)
226 | 
227 | # EXIT
228 | quit(save='no')
229 | 


--------------------------------------------------------------------------------
/matrix_funct.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | cat("NOTE: Treat Inf and -Inf as NAs\n\n", file=stderr())
  4 |  
  5 | suppressPackageStartupMessages(library("optparse"))
  6 | 
  7 | options(stringsAsFactors=F)
  8 | 
  9 | 
 10 | # ==================
 11 | # DEBUG OPTIONS
 12 | # ==================
 13 | 
 14 | opt = list()
 15 | opt$input_matrix = "/users/rg/abreschi/Documents/human-mouse/antisense-transcription/s_as_read_counts/corr-mean/log2_FALSE.pdcn_1e-04.NAs_TRUE.human.mouse.S_AS_ratio.tsv"
 16 | opt$metadata = '/users/rg/abreschi/Documents/human-mouse/paper-sample-clustering/merged_RNA_dashboard_files.crg.tsv'
 17 | opt$mean_by="organism"
 18 | opt$output = "s_as_ratio"
 19 | opt$func = "mean"
 20 | opt$not_na = 0.7
 21 | opt$log=FALSE
 22 | opt$replace_na = FALSE
 23 | 
 24 | ##################
 25 | # OPTION PARSING
 26 | ##################
 27 | 
 28 | option_list <- list(
 29 | 
 30 | make_option(c("-i", "--input_matrix"), default="stdin",
 31 | 	help="the matrix you want to analyze. Stdin to read from stdin"),
 32 | 
 33 | make_option(c("--dt"), action="store_true", default=F,
 34 | 	help="read the matrix as data.table"),
 35 | 
 36 | make_option(c("-l", "--log"), action="store_true", default=FALSE, 
 37 | 	help="apply the log10, before applying the function [default=%default]"),
 38 | 
 39 | make_option(c("-k", "--replace_na"), action="store_true", default=FALSE, 
 40 | 	help="use this if you want NAs to be replaced by 0 [default=%default]"),
 41 | 
 42 | make_option(c("-p", "--pseudocount"), type="double", default=1e-04,
 43 | 	help="specify a pseudocount for the log [default=%default]"),
 44 | 
 45 | make_option(c("-m", "--metadata"), 
 46 | 	help="tsv file with the metadata"),
 47 | 
 48 | make_option(c("-s", "--mean_by"), 
 49 | 	help="choose one or multiple attributes you want to average by"),
 50 | 
 51 | make_option(c("-o", "--output"), default="out.tsv", 
 52 | 	help="Output file name. stdout for printing on standard output [default=%default]"),
 53 | 
 54 | make_option(c("-f", "--func"), default="mean",
 55 | 	help="choose the function <mean>, <sd>, <sum>, <median>, <entropy>, <nentropy>, <tau>, <cv> [default=%default]"),
 56 | 
 57 | make_option(c("-C", "--byColumns"), action="store_true", default=FALSE,
 58 | 	help="apply the function to the columns, instead of rows [default=%default]"),
 59 | 
 60 | make_option(c("-n", "--not_na"), type="double", default=1, 
 61 | 	help="fraction of not NA values in the vector for the mean. If NAs are replaced they are not counted [default=%default]"),
 62 | 
 63 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
 64 | 	help="verbose output")
 65 | )
 66 | 
 67 | parser <- OptionParser(
 68 | 	usage = "%prog [options] file", 
 69 | 	option_list=option_list,
 70 | 	description = "\nExecute a function on matrix rows or columns"
 71 | 	)
 72 | arguments <- parse_args(parser, positional_arguments = TRUE)
 73 | opt <- arguments$options
 74 | if (opt$verbose) {print(opt)}
 75 | 
 76 | 
 77 | ##------------
 78 | ## LIBRARIES
 79 | ##------------
 80 | 
 81 | if (opt$verbose) {cat("Loading libraries... ", file=stderr())}
 82 | suppressPackageStartupMessages(library(reshape2))
 83 | suppressPackageStartupMessages(library(data.table))
 84 | source("./functions.R")
 85 | if (opt$verbose) {cat("DONE\n\n", file=stderr())}
 86 | 
 87 | 
 88 | # ==========================================
 89 | # Function for loading Rdata
 90 | # ==========================================
 91 | 
 92 | load_obj <- function(f)
 93 | {
 94 |     env <- new.env()
 95 |     nm <- load(f, env)[1]
 96 |     env[[nm]]
 97 | }
 98 | 
 99 | ###############
100 | # BEGIN
101 | ###############
102 | 
103 | # read table
104 | if (opt$verbose) {cat(sprintf("%s: ", Sys.time()), "Reading matrix... ")}
105 | if (opt$dt) {
106 | 	inF = ifelse(opt$input_matrix == "stdin", "file:///dev/stdin", opt$input_matrix)
107 | 	m = fread(inF)
108 | 	row.names = m[,1][[1]]
109 | 	m = m[,-1]
110 | } else {
111 | 	if (opt$input_matrix == "stdin") {
112 | 	    m = read.table(file("stdin"), h=T)
113 | 	} else {
114 | 		m <- try(load_obj(opt$input_matrix), silent=T)
115 | 	    if (class(m) == "try-error") {m <- read.table(opt$input_matrix)}
116 | 	}
117 | 	row.names = rownames(m)
118 | }
119 | if (opt$verbose) {cat("DONE\n")}
120 | 
121 | # apply the log if required
122 | if (opt$replace_na) {m <- replace(m, is.na(m), 0)}
123 | if (opt$log) {m = log10(m + opt$pseudocount)}
124 | 
125 | # Set the result to NA when too many missing values are present
126 | func = function(x) {
127 | 	ifelse((sum(!is.na(x)) < (opt$not_na*length(x))), NA, format(eval(parse(text=opt$func))(x,na.rm=T), digits=5))
128 | }
129 | 
130 | 
131 | # Apply the function by columns
132 | if (opt$byColumns) {
133 | 	if (!is.null(opt$metadata)) {
134 | 		mdata <- read.table(opt$metadata, h=T, sep="\t", quote="", check.names=FALSE)
135 | 	}
136 | 	if (!is.null(opt$mean_by)) {
137 | 		mean_by = strsplit(opt$mean_by, ",")[[1]]
138 | 		m[,ncol(m)+1] <- mdata[,mean_by][match(rownames(m), mdata[,"gene"])]
139 | 		colnames(m)[ncol(m)] <- mean_by
140 | 		form = as.formula(sprintf(".~%s", mean_by))
141 | 		new_m <- aggregate(form, m, func)
142 | 	} else {
143 | 		new_m = data.frame(id=colnames(m), fun=apply(m, 2, func))
144 | 		
145 | 	}
146 | 
147 | } else {
148 | 
149 | 	# apply the function to the whole matrix if no value is provided
150 | 	if (is.null(opt$mean_by)) {
151 | 		new_m = setNames(data.table("id"=row.names, "func"=apply(m, 1, func)), c("id", opt$func))
152 | 	} else {
153 | 	
154 | 		# apply the function to the levels of the specified factors
155 | 		mean_by = strsplit(opt$mean_by, ",")[[1]]
156 | 		df = melt(as.matrix(m), varnames = c("gene_index", "labExpId"))
157 | 		
158 | 		# read metadata and merge with data.frame if needed
159 | 		if (!is.null(opt$metadata)) {
160 | 			mdata <- read.table(opt$metadata, h=T, row.names=NULL, sep="\t", quote="", check.names=FALSE)
161 | 			mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub("[-,]", ".", x))
162 | 			mdata = subset(mdata, labExpId %in% colnames(m))
163 | 			df = merge(df, unique(mdata[c("labExpId", mean_by)]), by = "labExpId")
164 | 		}
165 | 		
166 | 		df$value[abs(df$value)==Inf] <- NA
167 | 		aggr = aggregate(as.formula(sprintf("value~gene_index+`%s`", paste(mean_by,collapse="+"))), df, func, na.action="na.pass")
168 | 		aggr = dcast(aggr, as.formula(sprintf("gene_index~`%s`", paste(mean_by,collapse="+"))))
169 | 		new_m = aggr
170 | 	
171 | 	#	if (length(char_cols)==0) {new_m = cbind(gene=genes, new_m)}
172 | 	#	if (length(char_cols)!=0) {new_m = merge(genes, new_m, by.y="gene_index", by.x="row.names")[,-1]}
173 | 	}
174 | }	
175 | 
176 | 
177 | #--------------
178 | # print output
179 | #--------------
180 | 
181 | output = ifelse(opt$output == "stdout", "", opt$output)
182 | write.table(new_m, file = output, quote=F, sep='\t', row.names=F)
183 | 
184 | q(save='no')
185 | 
186 | 


--------------------------------------------------------------------------------
/rpkm_fraction.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | 
  4 | ##------------
  5 | ## LIBRARIES
  6 | ##------------ 
  7 | suppressPackageStartupMessages(library(reshape2))
  8 | suppressPackageStartupMessages(library(ggplot2))
  9 | suppressPackageStartupMessages(library("optparse"))
 10 | suppressPackageStartupMessages(library(plyr))
 11 | 
 12 | 
 13 | options(stringsAsFactors=F)
 14 | 
 15 | ##################
 16 | # OPTION PARSING
 17 | ##################
 18 | 
 19 | 
 20 | option_list <- list(
 21 | make_option(c("-i", "--input_matrix"), default="stdin", help="the matrix you want to analyze [default=%default]"),
 22 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
 23 | make_option(c("-o", "--output"), help="additional tags for otuput", default="out"),
 24 | make_option(c("-c", "--color_by"), help="choose the color you want to color by. Leave empty for no color", type='character'),
 25 | make_option(c("-y", "--linetype_by"), help="choose the factor you want the linetype by. Leave empty for no linetype", type="character"),
 26 | make_option(c("-f", "--file_sel"), help="list of elements of which computing the proportion at each point"),
 27 | make_option(c("--out_file"), help="store the coordinates in a file [default=%default]"),
 28 | make_option(c("-P", "--palette"), help="file with the colors"),
 29 | make_option(c("-t", "--tags"), help="choose the factor by which grouping the lines [default=%default]", default="labExpId")
 30 | )
 31 | 
 32 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 33 | arguments <- parse_args(parser, positional_arguments = TRUE)
 34 | opt <- arguments$options
 35 | #print(opt)
 36 | 
 37 | 
 38 | 
 39 | ##--------------------##
 40 | ## CLUSTERING SAMPLES ##
 41 | ##--------------------##
 42 | output = sprintf("rpkm_fraction.%s", opt$output)
 43 | 
 44 | # 1. read the matrix from the command line
 45 | if (opt$input_matrix == "stdin") {inF = file("stdin")} else {inF = opt$input_matrix}
 46 | m = read.table(inF, h=T, sep="\t")
 47 | 
 48 | 
 49 | # Read color palette if present
 50 | if (!is.null(opt$palette)) {palette = read.table(opt$palette, h=F, comment.char="%", sep="\t")$V1}
 51 | 
 52 | # remove potential gene id columns
 53 | char_cols <- which(sapply(m, class) == 'character')
 54 | sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols)
 55 | if (length(char_cols) == 0) {genes = rownames(m)}
 56 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]}
 57 | 
 58 | 
 59 | # 2. sort all the values for each column
 60 | sortm <- apply(m, 2, sort, na.last=T, d=T)
 61 | #sortm <- sortm[1:min(which(sortm==0))-1,]
 62 | 
 63 | # 3. calculate cumulative sum
 64 | cumm <- as.data.frame(apply(sortm, 2, function(x) cumsum(x/sum(x,na.rm=T))))
 65 | 
 66 | # 4. read the metadata from the metadata file
 67 | mdata = read.table(opt$metadata, h=T, sep='\t')
 68 | mdata[,"labExpId"] <- sapply(mdata[,"labExpId"], function(x) gsub(",", ".", x))
 69 | if (!is.null(opt$color_by)) {opt$color_by <- strsplit(opt$color_by, ",")[[1]]}
 70 | 
 71 | # prepare data.frame for ggplot
 72 | df = melt(cumm, variable.name = "labExpId", value.name="rpkm_fraction")
 73 | fields = unique(c("labExpId", strsplit(opt$tags, ",")[[1]], opt$color_by, opt$linetype_by))
 74 | df = merge(unique(mdata[fields]), df, by="labExpId")
 75 | df$labels = apply(df[strsplit(opt$tags, ",")[[1]]], 1, paste, collapse="_")
 76 | # add a column with the x index
 77 | df  = ddply(df, .(labels), transform, x=seq_along(labels), y=sort(rpkm_fraction, na.last=T, d=F))
 78 | 
 79 | # ===== Add an extra dataframe with the percentage of genes after the union =====
 80 | 
 81 | if (!is.null(opt$file_sel)) {
 82 | 
 83 | 	prop_df = data.frame()
 84 | 	sel = read.table(opt$file_sel, h=F)[,1]
 85 | 	# Order the data
 86 | 	orderm = apply(-m, 2, rank, na.last=T, ties.method='first')
 87 | 	# NB: orderm has lost the rownames 
 88 | 	thresholds = c(1:10 %o% 10^(0:4))
 89 | 	thresholds = thresholds[which(thresholds < nrow(m))]
 90 | 
 91 | 	props = sapply(thresholds, function(thr) {
 92 | 		u = rownames(m)[which(rowSums(orderm <= thr) > 0)];
 93 | 		length(intersect(u, sel))/length(u)
 94 | 		}
 95 | 	)
 96 | 	prop_df = data.frame(x=thresholds, y=props)
 97 | }
 98 | 
 99 | 
100 | 
101 | 
102 | 
103 | ###############
104 | # OUTPUT 
105 | ###############
106 | 
107 | # coordinate file
108 | 
109 | if (!is.null(opt$out_file)) {
110 | 	write.table(df, file=opt$out_file, row.names=FALSE, quote=FALSE, sep="\t")
111 | }
112 | 
113 | # plotting...
114 | base_size=16
115 | legend_nrow = 18
116 | #legend_nrow = 4
117 | theme_set(theme_bw(base_size=base_size))
118 | legend_text_inch = theme_get()$legend.text$size * base_size / 72.72
119 | add_w = legend_text_inch * max(nchar(df$labels)) * ceiling(length(levels(as.factor(df$labels)))/legend_nrow)
120 | 
121 | geom_params = list()
122 | geom_params$size = opt$size
123 | geom_params$alpha = opt$alpha
124 | 
125 | 
126 | mapping = list()
127 | mapping <- modifyList(mapping, aes(x=x, y=y, group=labels))
128 | 
129 | if (!is.null(opt$color_by)) {
130 | 	gp_color_by = interaction(df[opt$color_by])
131 |     mapping = modifyList(mapping, aes(color=gp_color_by))
132 | }
133 | 
134 | if (!is.null(opt$linetype_by)) {
135 | 	gp_linetype_by = interaction(df[opt$linetype_by])
136 | 	mapping = modifyList(mapping, aes(linetype=gp_linetype_by))
137 | }
138 | 	
139 | 
140 | class(mapping) <- "uneval"
141 | 
142 | lineLayer <- layer(
143 |         geom = "line",
144 | #       geom_params = geom_params,
145 |         params = geom_params,
146 |         mapping = mapping,
147 |         stat = "identity",
148 |         position = "identity"
149 | )
150 | 
151 | 
152 | 
153 | # GGPLOT
154 | 
155 | gp = ggplot(df) + lineLayer
156 | 
157 | #if (!is.null(opt$color_by)) {gp_color_by=interaction(df[opt$color_by])} else {gp_color_by=NULL}
158 | #if (!is.null(opt$linetype_by)) {gp_linetype_by=interaction(df[opt$linetype_by])} else {gp_linetype_by=NULL}
159 | #gp = gp + geom_line(aes(color=gp_color_by, linetype=gp_linetype_by, group=labels))
160 | 
161 | #gp = gp + scale_color_hue(name=paste(opt$color_by, collapse="."))
162 | if (!is.null(opt$color_by)) {
163 | 	if (!is.null(opt$palette)) {
164 | 		gp = gp + scale_color_manual(values=palette)
165 | 	} else {
166 | 		gp = gp + scale_color_hue()
167 | 	}
168 | }
169 | 
170 | gp = gp + labs(y="Fraction of gene rpkm", x='Number of genes')
171 | gp = gp + scale_linetype_manual(values=c(2,1))
172 | gp = gp + guides(col = guide_legend(nrow = legend_nrow, title=opt$color_by))
173 | gp = gp + scale_x_log10(expand=c(0,0))
174 | gp = gp + scale_y_continuous(expand=c(0.01,0), limits=c(0,1))
175 | gp = gp + annotation_logticks(sides="b")
176 | 
177 | if (!is.null(opt$file_sel)) {
178 | gp = gp + geom_point(data=prop_df, aes(x,y), shape=18, size=2)
179 | gp = gp + geom_point(data=prop_df, aes(x,y), shape=18, size=1.7, color='yellow')
180 | }
181 | 
182 | ggsave(sprintf("%s.pdf",output), h=5, w=6+add_w, title=output)
183 | 
184 | q(save='no')
185 | 


--------------------------------------------------------------------------------
/scatterplot.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | options(stringsAsFactors=F)
  4 | x_psd = 1e-03
  5 | y_psd = 1e-03
  6 | 
  7 | 
  8 | 
  9 | 
 10 | ##################
 11 | # OPTION PARSING
 12 | ##################
 13 | suppressPackageStartupMessages(library("optparse"))
 14 | 
 15 | 
 16 | option_list <- list(
 17 | make_option(c("-i", "--input_matrix"), default="stdin",
 18 | 	help="the matrix you want to analyze. \"stdin\" for stdin [default=%default]"),
 19 | 
 20 | make_option(c("--header"), action="store_true", default=FALSE, help="The file has header [default=%default]"),
 21 | 
 22 | make_option(c("-r", "--replace_NAs"), action="store_true", default=FALSE, 
 23 | 	help="Replace NAs with 0 [default=%default]"),
 24 | 
 25 | make_option(c("-x", "--x_axis"), type='integer', default=1,
 26 | 	help="the index (1-based) of the column you want on the x axis [default=%default]"),
 27 | 
 28 | make_option(c("-y", "--y_axis"), type='integer', default=2,
 29 | 	help="the index (1-based) of the column you want on the y axis [default=%default]"),
 30 | 
 31 | make_option(c("-C", "--color_by"), type="integer", 
 32 | 	help="Index of the column by which to color the dots [default=%default]"),
 33 | 
 34 | make_option(c("--color_as_factor"), action="store_true", default=FALSE, 
 35 | 	help="Convert the color_by column to factor [default=%default]"),
 36 | 
 37 | make_option(c("-o", "--output_suffix"), help="output filename [default=%default]", default='scatterplot.out.pdf'),
 38 | make_option(c("-t", "--type"), help="<tile>, <hex>, <scatter> [default=%default]", default="tile"),
 39 | make_option(c("-b", "--binwidth"), help="comma-separated values for binwidth x,y [default=%default]", default="1,1"),
 40 | make_option(c("--x_log"), action="store_true", help="x values log10 transformed [default=%default]", default=FALSE),
 41 | make_option(c("--y_log"), action="store_true", help="y values log10 transformed [default=%default]", default=FALSE),
 42 | make_option(c("--x_psd"), help="pseudocount for x values [default=%default]", default=x_psd, type='double'),
 43 | make_option(c("--y_psd"), help="pseudocount for y values [default=%default]", default=y_psd, type='double'),
 44 | make_option("--x_title", help="write a title for x axis"),
 45 | make_option("--y_title", help="write a title for y axis"),
 46 | make_option("--legend_title", help="write a title for the legend [default=%default]", default="count"),
 47 | 
 48 | make_option(c("--highlight"), 
 49 | 	help="a list of element you want to overlay as extra dots"),
 50 | 
 51 | make_option(c("--id_col"), default=1,
 52 | 	help="column with ids"),
 53 | 
 54 | make_option("--title", default="", 
 55 | 	help="write a title for the plot [default=%default]"),
 56 | 
 57 | make_option("--diagonal", action="store_true", default=FALSE, 
 58 | 	help="plot the diagonal [default=%default]"),
 59 | 
 60 | make_option(c("-R", "--linear_regression"), action="store_true", default=FALSE, 
 61 | 	help="plot the regression line [default=%default]"),
 62 | 
 63 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, 
 64 | 	help="verbose output [default=%default]")
 65 | )
 66 | 
 67 | 
 68 | parser <- OptionParser(
 69 | 	usage = "%prog [options] file", 
 70 | 	option_list=option_list,
 71 | 	description = "\n\nPlot a density scatterplot"
 72 | )
 73 | 
 74 | arguments <- parse_args(parser, positional_arguments = TRUE)
 75 | opt <- arguments$options
 76 | if (opt$verbose) {print(opt)}
 77 | 
 78 | ##------------
 79 | ## LIBRARIES
 80 | ##------------ 
 81 | 
 82 | if (opt$verbose) {cat("Loading libraries... ")}
 83 | suppressPackageStartupMessages(library(reshape2))
 84 | suppressPackageStartupMessages(library(ggplot2))
 85 | suppressPackageStartupMessages(library(plyr))
 86 | if (opt$verbose) {cat("DONE\n\n")}
 87 | 
 88 | 
 89 | 
 90 | 
 91 | ###################
 92 | # BEGIN           #
 93 | ###################
 94 | 
 95 | if (opt$input_matrix == "stdin") {
 96 | 	m = read.table(file("stdin"), h=opt$header, sep="\t")
 97 | } else {
 98 | 	m = read.table(opt$input_matrix, h=opt$header, sep="\t")
 99 | }
100 | 
101 | # Replace NAs with 0 if needed
102 | if (opt$replace_NAs) {m <- replace(m, is.na(m), 0)}
103 | 
104 | if (opt$x_log) {m[,opt$x_axis] <- m[,opt$x_axis] + opt$x_psd}
105 | if (opt$y_log) {m[,opt$y_axis] <- m[,opt$y_axis] + opt$y_psd}
106 | 
107 | df = m
108 | 
109 | # Pearson correlation coefficient
110 | pearson = round(cor(sapply(df[,opt$x_axis], function(x) ifelse(opt$x_log, log10(x), x)), 
111 | 	sapply(df[,opt$y_axis], function(x) ifelse(opt$y_log, log10(x), x)), method='p', use='p'), 2)
112 | spearman = round(cor(sapply(df[,opt$x_axis], function(x) ifelse(opt$x_log, log10(x), x)), 
113 | 	sapply(df[,opt$y_axis], function(x) ifelse(opt$y_log, log10(x), x)), method='s', use='p'), 2)
114 | 
115 | if (!is.null(opt$color_by) && opt$color_as_factor) {
116 |     df[, opt$color_by] <- as.factor(df[, opt$color_by])
117 | }
118 | 
119 | 
120 | # PLOTTING ...
121 | 
122 | theme_set(theme_bw(base_size=16))
123 | 
124 | bwidth = as.numeric(strsplit(opt$binwidth, ",")[[1]])
125 | plot_title = sprintf("%s (p_r=%s; s_r=%s)", opt$title, pearson, spearman)
126 | x_col = colnames(df[opt$x_axis])
127 | y_col = colnames(df[opt$y_axis])
128 | 
129 | # Read the subset of elements you want to highlight
130 | if (!is.null(opt$highlight)) {
131 | 	highlight = read.table(opt$highlight, h=F)$V1
132 | 	df_h = df[ df[,opt$id_col] %in%  highlight, ]
133 | 	if (opt$verbose) {
134 | 		print(head(highlight))
135 | 		print(head(df_h))
136 | 	}
137 | }
138 | 
139 | 
140 | # Read the axis titles
141 | if (is.null(opt$x_title)) {x_title = x_col} else {x_title = opt$x_title}
142 | if (is.null(opt$y_title)) {y_title = y_col} else {y_title = opt$y_title}
143 | 
144 | 
145 | 
146 | countBins <- c(0,1,2,5,10,25,50,75,100,500,Inf)
147 | 
148 | gp = ggplot(df, aes_string(x=x_col, y=y_col)) 
149 | 
150 | if (opt$type == 'tile') {
151 | 	gp = gp + stat_bin2d(bins=100)
152 | 	gp = gp + scale_fill_gradientn(colours=terrain.colors(20), name=opt$legend_title)
153 | 	if (!is.null(opt$highlight)) {
154 | 		gp = gp + geom_point(data=df_h, aes_string(x=x_col, y=y_col))
155 | 	}
156 | }
157 | 
158 | if (opt$type == 'hex') {
159 | gp = gp + geom_hex(aes(fill=cut(..count.., c(0,1,2,5,10,25,50,75,100,500,Inf))), binwidth=bwidth)
160 | gp = gp + scale_fill_manual('counts', values=terrain.colors(length(countBins))) }
161 | 
162 | if (opt$type == "scatter") {
163 |     gp = gp + geom_point(aes_string(colour=colnames(df)[opt$color_by]), size=1)
164 | }
165 | 
166 | 
167 | gp = gp + labs(x=x_title, y=y_title, title=plot_title)
168 | 
169 | # Add the diagonal line
170 | 
171 | if (opt$diagonal) {
172 | 	gp = gp + geom_abline(intercept=0, slope=1, color="grey")
173 | }
174 | 
175 | # Add the regression line
176 | 
177 | if (opt$linear_regression) {
178 | 	
179 | 	if (opt$verbose) {print(head(df))}
180 | 	if (opt$x_log) {
181 | 		x_col = sprintf("log10(%s)", x_col)
182 | 	}
183 | 	if (opt$y_log) {
184 | 		y_col = sprintf("log10(%s)", y_col)
185 | 	}
186 | 	formula = as.formula(sprintf("%s~%s", y_col, x_col))
187 | 	coeff = lm(formula, df)$coefficients
188 | 	gp = gp + geom_abline(intercept=coeff[1], slope=coeff[2])
189 | }
190 | 
191 | 
192 | # Change to log scale
193 | 
194 | if (opt$x_log) {gp = gp + scale_x_log10()}
195 | if (opt$y_log) {gp = gp + scale_y_log10()}
196 | 
197 | ggsave(opt$output, h=5, w=6)
198 | 
199 | 
200 | 
201 | 
202 | 
203 | q(save='no')
204 | 
205 | 


--------------------------------------------------------------------------------
/plot.network.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | 
  4 | opt = list()
  5 | opt$input = "top500.sd.RPKM.glasso.tsv"
  6 | opt$node_size = 2
  7 | 
  8 | options(stringsAsFactors=FALSE)
  9 | 
 10 | set.seed(123)
 11 | 
 12 | ##################
 13 | # OPTION PARSING
 14 | ##################
 15 | 
 16 | suppressPackageStartupMessages(library("optparse"))
 17 | 
 18 | option_list <- list(
 19 | 
 20 | make_option(c("-i", "--input"), default="stdin",
 21 | 	help="File or stdin. Columns are node1, node2, weigth [default=%default]"),
 22 | 
 23 | make_option(c("-o", "--output"), default="network.pdf",
 24 | 	help="Output file name [default=%default]"),
 25 | 
 26 | make_option(c("--nodes"), 
 27 | 	help="File with node attributes, no header"),
 28 | 
 29 | make_option(c("--node_color"), type="integer",
 30 | 	help="Index of the node color"),
 31 | 
 32 | make_option(c("--label"),
 33 | 	help="Leave empty for using the node names. \"none\" for no label."),
 34 | 
 35 | make_option(c("--label_color"), type="integer",
 36 | 	help="Index of the label color"),
 37 | 
 38 | make_option(c("--node_frame_color"), type="integer",
 39 | 	help="Index of the node frame color"),
 40 | 
 41 | make_option(c("--node_size"), default=2,
 42 | 	help="Size of the nodes [default=%default]"),
 43 | 
 44 | make_option(c("--node_shape"), type='integer',
 45 | 	help="Index of the node shape"),
 46 | 
 47 | make_option(c("--node_palette"), default="/users/rg/abreschi/R/palettes/rainbow.15.txt",
 48 | 	help="File with colorname in RGB format [default=%default]"),
 49 | 
 50 | make_option(c("--node_frame_palette"), default="/users/rg/abreschi/R/palettes/rainbow.4.txt",
 51 | 	help="File with colorname in RGB format [default=%default]"),
 52 | 
 53 | make_option(c("--label_cex"), default=1, type="double",
 54 | 	help="Size of the labels in cex [default=%default]"),
 55 | 
 56 | make_option(c("--normalize"), action="store_true", default=FALSE,
 57 | 	help="Normalize to have all 1s in the diagonal [default=%default]"),
 58 | 
 59 | make_option(c("--diag"), action="store_true", default=FALSE,
 60 | 	help="Report also edges to and from the same node [default=%default]"),
 61 | 
 62 | make_option(c("--directed"), action="store_true", default=FALSE,
 63 | 	help="Is the graph directed? [default=%default]"),
 64 | 
 65 | make_option(c("-H", "--height"), default=9,
 66 | 	help="Height of the plot in inches [default=%default]"),
 67 | 
 68 | make_option(c("-W", "--width"), default=9,
 69 | 	help="Width of the plot in inches [default=%default]"),
 70 | 
 71 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
 72 | 	help="if you want more output [default=%default]")
 73 | )
 74 | 
 75 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 76 | arguments <- parse_args(parser, positional_arguments = TRUE)
 77 | opt <- arguments$options
 78 | if (opt$verbose) {print(opt)}
 79 | 
 80 | 
 81 | suppressPackageStartupMessages(library(reshape2))
 82 | suppressPackageStartupMessages(library(igraph))
 83 | 
 84 | 
 85 | #################################################################
 86 | # triangle vertex shape
 87 | mytriangle <- function(coords, v=NULL, params) {
 88 |   vertex.color <- params("vertex", "color")
 89 |   if (length(vertex.color) != 1 && !is.null(v)) {
 90 |     vertex.color <- vertex.color[v]
 91 |   }
 92 |   vertex.size <- 1/200 * params("vertex", "size")
 93 |   if (length(vertex.size) != 1 && !is.null(v)) {
 94 |     vertex.size <- vertex.size[v]
 95 |   }
 96 |  
 97 |   symbols(x=coords[,1], y=coords[,2], bg=vertex.color,
 98 |           stars=cbind(vertex.size, vertex.size, vertex.size),
 99 |           add=TRUE, inches=FALSE)
100 | }
101 | # clips as a circle
102 | add.vertex.shape("triangle", clip=vertex.shapes("circle")$clip,
103 |                  plot=mytriangle)
104 | 
105 | 
106 | 
107 | # BEGIN
108 | 
109 | # Read input
110 | if (opt$input == "stdin") {
111 | 	m = read.table(file("stdin"), h=F)
112 | } else {
113 | 	m = read.table(opt$input, h=F)
114 | }
115 | 
116 | node_palette = read.table(opt$node_palette, h=F, comment.char="%")$V1
117 | node_frame_palette = read.table(opt$node_frame_palette, h=F, comment.char="%")$V1
118 | node_shape_palette = c("circle", "square", "csquare", "rectangle", "crectangle", "vrectangle", "none")
119 | 
120 | # Creat graph
121 | g = graph.data.frame(m, directed=opt$directed)
122 | 
123 | 
124 | # Normalize if needed
125 | M = as.matrix(get.adjacency(g))
126 | if (opt$normalize) {
127 | 	lambda = 1/sqrt(diag(M))
128 | 	M = sweep(sweep(M, MARGIN=2, lambda, `*`), MARGIN=1, lambda, `*`)
129 | 	g = graph.adjacency(as.matrix(M), mode=c("max"), weighted=TRUE, diag=opt$diag)
130 | }
131 | 
132 | 
133 | # Get node attributes
134 | 
135 | V(g)$color = rep("white", vcount(g))
136 | V(g)$frame.color = rep("white", vcount(g))
137 | V(g)$label.color = rep("black", vcount(g))
138 | V(g)$shape = rep("circle", vcount(g))
139 | V(g)$size = rep(opt$node_size, vcount(g))
140 | V(g)$label.cex = rep(opt$label_cex, vcount(g))
141 | 
142 | 
143 | if (!is.null(opt$nodes)) {
144 | 	node_attr = read.table(opt$nodes, h=F, quote="\"")
145 | 	# Select only metadata rows for nodes in the network
146 | 	node_attr = node_attr[node_attr[,1] %in% V(g)$name,]
147 | 	match_node = match(V(g)$name, node_attr[,1])
148 | 
149 | 	# Get node color
150 | 	if (!is.null(opt$node_color)) {
151 | 		node_colors = node_palette[as.factor(node_attr[,opt$node_color])]
152 | 		V(g)$color = node_colors[match_node]
153 | 	}
154 | 
155 | 	# Get label color
156 | 	if (!is.null(opt$label_color)) {
157 | 		label_colors = palette[node_attr[,opt$label_color]]
158 | 		V(g)$label.color = label_colors[match_node]
159 | 	}
160 | 
161 | 	# Get node frame color
162 | #	if (!is.null(opt$node_frame_color)) {
163 | #		node_frame_colors = node_frame_palette[as.factor(node_attr[,opt$node_frame_color])]
164 | #		V(g)$frame.color = node_frame_colors[match_node]
165 | #	}
166 | 
167 | 	# Get label
168 | 	if (!is.null(opt$label) && opt$label != "none") {
169 | 		opt$label = as.integer(opt$label)
170 | 		node_labels = node_attr[,opt$label]
171 | 		V(g)$label = node_labels[match_node]
172 | 	}
173 | 	
174 | 	# Get node shape
175 | 	if (!is.null(opt$node_shape)) {
176 | 		node_shapes = node_shape_palette[as.factor(node_attr[,opt$node_shape])]
177 | 		V(g)$shape = node_shapes[match_node]
178 | 	}
179 | 
180 | }
181 | 
182 | 
183 | vertex.label=NULL
184 | if (opt$label == "none") {V(g)$label = NA}
185 | 
186 | #E(g)$label = m[,4]
187 | 
188 | # PLOT
189 | 
190 | pdf(opt$output, h=opt$height, w=opt$width)
191 | 
192 | cat("PLOTTING...")
193 | 
194 | plot(
195 | 	g, 
196 | 	layout=layout.kamada.kawai, 
197 | #	layout=layout.reingold.tilford(g, root="1")
198 | #	vertex.label=vertex.label, 
199 | )
200 | 
201 | # Node color legend
202 | if (!is.null(opt$nodes) & !is.null(opt$node_color)) {
203 | 	labels = unique(node_attr[,opt$node_color])
204 | 	legend(
205 | 		"topright", 
206 | 		legend=labels, 
207 | 		fill=node_colors[match(labels, node_attr[,opt$node_color])]
208 | 	)
209 | }
210 | 
211 | 
212 | # Node frame color legend
213 | if (!is.null(opt$nodes) & !is.null(opt$node_frame_color)) {
214 | 	labels = unique(node_attr[,opt$node_frame_color])
215 | 	legend(
216 | 		"topleft", 
217 | 		legend=labels, 
218 | 		fill="white",
219 | 		border=node_frame_colors[match(labels, node_attr[,opt$node_frame_color])]
220 | 	)
221 | }
222 | 
223 | cat(" DONE\n")
224 | 
225 | dev.off()
226 | 
227 | 
228 | q(save='no')
229 | 
230 | 


--------------------------------------------------------------------------------
/boxplot_expressed_isoforms.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | suppressPackageStartupMessages(library(reshape2))
  4 | suppressPackageStartupMessages(library(ggplot2))
  5 | suppressPackageStartupMessages(library("optparse"))
  6 | 
  7 | options(stringsAsFactors=F)
  8 | cbbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#000000", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") 
  9 | 
 10 | 
 11 | ##################
 12 | # OPTION PARSING
 13 | ##################
 14 | 
 15 | 
 16 | option_list <- list(
 17 | 
 18 | make_option(c("-i", "--input_matrix"), 
 19 | 	help="the matrix you want to analyze"),
 20 | 
 21 | make_option(c("-a", "--annotation"), 
 22 | 	help="two-column file with gene and tx ids, no header."),
 23 | 
 24 | make_option(c("-o", "--output"),
 25 | 	help="choose the name for the output file, WITHOUT extension"),
 26 | 
 27 | make_option(c("-m", "--metadata"),
 28 | 	help="tsv file with metadata on matrix experiment"),
 29 | 
 30 | make_option(c("-f", "--fill_by"),
 31 | 	help="choose what to fill by. Leave empty for no filling"),
 32 | 
 33 | make_option(c("-c", "--color_by"),
 34 | 	help="choose what to color by. Leave empty for no coloring")
 35 | 
 36 | )
 37 | 
 38 | 
 39 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 40 | arguments <- parse_args(parser, positional_arguments = TRUE)
 41 | opt <- arguments$options
 42 | 
 43 | print(opt)
 44 | 
 45 | #--------------------
 46 | # FUNCTIONS
 47 | #--------------------
 48 | 
 49 | # FUNCTION 1
 50 | # This function calculates the relative expression of the most expressed isoform of a gene in a given condition.
 51 | # It returns NaN when the gene has no isoforms expressed
 52 | maj_iso_rel_expr = function(x) {max(x,na.rm=T)/sum(x,na.rm=T)}
 53 | 
 54 | # FUNCTION 2
 55 | # This function calculates the Shannon entropy for expressed isoforms of a gene in a given condition
 56 | # It returns NA when the gene has no isoforms expressed.
 57 | # The function is taken from the script:
 58 | source('~abreschi/R/functions.R')
 59 | 
 60 | 
 61 | 
 62 | ###########
 63 | ## HUMAN ##
 64 | ###########
 65 | 
 66 | 
 67 | # gene, tx annotation file (only protein coding gene, but all txs)
 68 | human_gn_tx = read.table(opt$annotation, h=F, col.names=c('gene','tx'))
 69 | 
 70 | # transcript matrix
 71 | human_expr = read.table(opt$input_matrix, h=T)
 72 | 
 73 | # read the metadata
 74 | mdata = read.table(opt$metadata, h=T, sep="\t")
 75 | mdata[,"labExpId"] <- gsub(",", ".", mdata[,"labExpId"]) 
 76 | 
 77 | # replace NAs (bad IDR) with zeros
 78 | human_expr[is.na(human_expr)] <- 0
 79 | 
 80 | # add the gene locus to each tx
 81 | human_tx_gn_expr = merge(human_gn_tx, human_expr, by.x='tx', by.y='row.names')
 82 | 
 83 | 
 84 | #### EXPRESSED ISOFORMS
 85 | # count the number of expressed isoforms in each cell line
 86 | human_expr_iso = aggregate(human_tx_gn_expr[,-(1:2)], list(gene=human_tx_gn_expr$gene), function(x) sum(!is.na(x)&x!=0))
 87 | # equivalent to:
 88 | human_expr_iso_melt = melt(human_expr_iso, variable.name='sample_name', value.name='expr_iso')
 89 | 
 90 | #### MAJOR ISOFORM EXPRESSION
 91 | # calculate the relative epxression of the most expressed isoform for each gene in each sample
 92 | human_rel_expr_maj = aggregate(human_tx_gn_expr[,-(1:2)], list(gene=human_tx_gn_expr$gene), maj_iso_rel_expr)
 93 | human_rel_expr_maj_melt = melt(human_rel_expr_maj, variable.name='sample_name', value.name='rel_maj')
 94 | 
 95 | #### ISOFORM ENTROPY
 96 | # calculate the entropy of expressed isoforms for each gene in each sample
 97 | human_entr_iso = aggregate(human_tx_gn_expr[,-(1:2)], list(gene=human_tx_gn_expr$gene), entropy)
 98 | human_entr_iso_melt = melt(human_entr_iso, variable.name='sample_name', value.name='entr_iso')
 99 | 
100 | #### ANNOTATED ISOFORMS
101 | # count the number of annotated isoforms for each gene
102 | human_ann_iso = setNames(aggregate(tx~gene, human_tx_gn_expr, length), c("gene", "ann_isoforms"))
103 | 
104 | #### MERGING
105 | # add the annotation information to the expressed isoforms
106 | human_iso_all = merge(human_expr_iso_melt, human_rel_expr_maj_melt, by=c('gene', 'sample_name'))
107 | human_iso_all = merge(human_iso_all, human_entr_iso_melt, by=c('gene', 'sample_name'))
108 | human_iso_all = merge(human_iso_all, human_ann_iso, by = 'gene')
109 | 
110 | hs_mm_data_expr_ann_iso = human_iso_all
111 | hs_mm_data_expr_ann_iso = merge(unique(mdata[c("labExpId", opt$fill_by, opt$color_by)]),
112 | hs_mm_data_expr_ann_iso, by.x='labExpId', by.y='sample_name')
113 | write.table(hs_mm_data_expr_ann_iso, file=sprintf('%s.summary_isoform_expression.tsv',opt$output), sep='\t', quote=F, row.names=F)
114 | 
115 | 
116 | merged_labels = aggregate(gene~tx, aggregate(tx~gene, human_tx_gn_expr, length), length)
117 | 
118 | ##########
119 | ## plot ##
120 | ##########
121 | 
122 | 
123 | # COMMENT: I have to add the number of genes in each boxplot
124 | 
125 | theme_set(theme_bw(base_size=16))
126 | 
127 | pdf(sprintf("%s.pdf", opt$output), height=6, width=9)
128 | 
129 | max_iso = 15
130 | 
131 | data = subset(hs_mm_data_expr_ann_iso,ann_isoforms<=max_iso & expr_iso>0)
132 | gp = ggplot(data, aes(as.factor(ann_isoforms), expr_iso)) 
133 | gp = gp + geom_boxplot(aes_string(fill=opt$fill_by, color=opt$color_by)) 
134 | gp = gp + ylim(c(0, max_iso)) 
135 | gp = gp + labs(y='Number of isoforms expressed per gene', x='Number of annotated isoforms per gene') 
136 | gp = gp + geom_abline(linetype='dotted', size=2, color='grey')
137 | gp = gp + geom_text(data=subset(merged_labels, tx<=max_iso), aes(x = tx, y = 0, label = gene), angle=90)
138 | gp = gp + scale_fill_manual(values=cbbPalette)
139 | gp
140 | 
141 | data = subset(hs_mm_data_expr_ann_iso,ann_isoforms<=max_iso)
142 | gp = ggplot(data, aes(as.factor(ann_isoforms), rel_maj))
143 | gp = gp + geom_boxplot(aes_string(fill=opt$fill_by, color=opt$color_by))
144 | gp = gp + ylim(c(0,1)) 
145 | gp = gp + labs(y='Major isoform relative expression', x='Number of annotated isoforms per gene')
146 | gp = gp + geom_text(data=subset(merged_labels, tx<=max_iso), aes(x = tx, y = 0, label = gene), angle=90)
147 | gp = gp + scale_fill_manual(values=cbbPalette)
148 | gp = gp + scale_x_discrete(labels=merged_labels) + theme(axis.text.x = element_text(angle=90, vjust=0.5))
149 | gp
150 | 
151 | data = subset(hs_mm_data_expr_ann_iso,expr_iso<=15 & expr_iso>=2)
152 | gp = ggplot(data, aes(as.factor(expr_iso), entr_iso))
153 | gp = gp + geom_boxplot(aes_string(fill=opt$fill_by, color=opt$color_by))
154 | gp = gp + labs(y = 'Shannon entropy', x = 'Number of expressed isoforms per gene')
155 | gp = gp + geom_point(aes(x=as.factor(expr_iso), y = log(expr_iso+1)), color='red') 
156 | gp = gp + scale_fill_manual(values=cbbPalette)
157 | gp
158 | 
159 | data = subset(hs_mm_data_expr_ann_iso,ann_isoforms<=max_iso & ann_isoforms>=2)
160 | gp = ggplot(data, aes(as.factor(ann_isoforms), entr_iso))
161 | gp = gp + geom_boxplot(aes_string(fill=opt$fill_by, color=opt$color_by))
162 | #gp = gp + ylim(c(0, max_iso)) 
163 | gp = gp + labs(y = 'Shannon entropy', x = 'Number of annotated isoforms per gene')
164 | gp = gp + geom_point(aes(x=as.factor(ann_isoforms), y = log(ann_isoforms+1)), color='red') 
165 | gp = gp + geom_text(data=subset(merged_labels, tx<=max_iso), aes(x = tx, y = 0, label = gene), angle=90)
166 | gp = gp + scale_fill_manual(values=cbbPalette)
167 | gp
168 | 
169 | dev.off()
170 | 
171 | ggsave(sprintf("%s.jpeg", opt$output), h=6, w=9)
172 | 
173 | 


--------------------------------------------------------------------------------
/scale_matrix.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | 
  3 | 
  4 | # This script is useful for:
  5 | # normalizing samples by row scaling
  6 | 
  7 | ##------------
  8 | ## LIBRARIES
  9 | ##------------ 
 10 | suppressPackageStartupMessages(library("reshape2"))
 11 | suppressPackageStartupMessages(library("ggplot2"))
 12 | suppressPackageStartupMessages(library("optparse"))
 13 | suppressPackageStartupMessages(library("scales"))
 14 | 
 15 | options(stringsAsFactors=F)
 16 | 
 17 | ##################
 18 | # OPTION PARSING
 19 | ##################
 20 | 
 21 | option_list <- list(
 22 | make_option(c("-i", "--input_matrix"), default="stdin", 
 23 | 	help="the matrix you want to analyze. \"stdin\" for reading from standard input [default=%default]"),
 24 | 
 25 | make_option(c("-l", "--log10"), action="store_true", default=FALSE, 
 26 | 	help="apply the log before scaling [default=FALSE]"),
 27 | 
 28 | make_option(c("-p", "--pseudocount"), type="double", default=0,
 29 | 	help="specify a pseudocount for the log [default=%default]"),
 30 | 
 31 | make_option(c("-r", "--row_first"), action="store_true", default=FALSE,
 32 | 	help="scale first by rows then by columns"),
 33 | 
 34 | make_option(c("-a", "--all"), action="store_true", default=FALSE,
 35 | 	help="scale the whole matrix by overall mean and sd"),
 36 | 
 37 | make_option(c("--range"), action="store_true", default=FALSE,
 38 | 	help="Normalize so the values are between 0 and 1"),
 39 | 
 40 | make_option(c("-k", "--keep_NA"), action="store_true", default=FALSE,
 41 | 	help="NAs are not replaced by zero [default=%default]"),
 42 | 
 43 | make_option(c("-C", "--center"), action="store_true", default=FALSE, 
 44 | 	help="subtract the mean [default=%default]"),
 45 | 
 46 | make_option(c("-S", "--scale"), action="store_true", default=FALSE,
 47 | 	help="divide by the standard deviation [default=%default]"),
 48 | 
 49 | make_option(c("-n", "--n_iter"), type='integer', default=20,
 50 | 	help="how many times to iterate. Choose 0 for one-dimension scaling [default=%default]"),
 51 | 
 52 | make_option(c("-m", "--metadata"), 
 53 | 	help="tsv file with the metadata"),
 54 | 
 55 | make_option(c("-s", "--scale_by"), 
 56 | 	help="choose one or multiple attributes you want to scale by"),
 57 | 
 58 | make_option(c("-o", "--output"), default="stdout", 
 59 | 	help="output file name. \"stdout\" to redirect to stdout. [default=%default]"),
 60 | 
 61 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
 62 | 	help="verbose output. [default=%default]")
 63 | )
 64 | 
 65 | parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
 66 | arguments <- parse_args(parser, positional_arguments = TRUE)
 67 | opt <- arguments$options
 68 | if (opt$verbose) {print(opt)}
 69 | 
 70 | 
 71 | ###############
 72 | # BEGIN
 73 | ###############
 74 | 
 75 | # read options
 76 | inF = opt$input_matrix
 77 | if (opt$input_matrix == "stdin") {
 78 | 	inF = file("stdin")
 79 | } 
 80 | m <- read.table(inF, h=T, sep="\t", check.names=F)
 81 | 
 82 | # Remove character columns
 83 | char_cols <- which(sapply(m, class) == 'character')
 84 | if (opt$verbose) {sprintf("WARNING: column %s is character, so it is removed from the analysis", char_cols)}
 85 | if (length(char_cols) == 0) {genes = rownames(m)}
 86 | if (length(char_cols) != 0) {genes = m[,char_cols]; m = m[,-(char_cols)]}
 87 | 
 88 | # replace or not NAs
 89 | if (!opt$keep_NA) {m <- replace(m, is.na(m), 0)}
 90 | 
 91 | # apply the log if required
 92 | if (opt$log10) {m = log10(m + opt$pseudocount)}
 93 | 
 94 | # Read the metadata
 95 | if (!is.null(opt$metadata)) {
 96 | 	mdata <- read.table(opt$metadata, h=T, row.names=NULL, sep="\t", comment.char="", quote="\"")
 97 | 	mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(",", ".", x))
 98 | 	mdata$labExpId <- sapply(mdata$labExpId, function(x) gsub(":", ".", x))
 99 | 	mdata = subset(mdata, labExpId %in% colnames(m))
100 | }
101 | 
102 | 
103 | 
104 | 
105 | ##########
106 | # EQUILIBRATION
107 | ##########
108 | 
109 | # Comments on scale() function from R:
110 | # - if the vector has sd==0, all the scaled values will be NAs.
111 | # - if the vector contains NAs, scale omits them.
112 | 
113 | # scaling by rows and by columns in this order for 20 times
114 | 
115 | if (opt$row_first) {
116 | # scale first by rows then by columns iteratively
117 | equil = function(matr) {
118 | 	if (opt$n_iter == 0) {matr = t(scale(t(matr), center=opt$center, scale=opt$scale))
119 | 		}else{
120 | 		for (i in 1:opt$n_iter) {
121 | 			matr = t(scale(t(matr), center=opt$center, scale=opt$scale))
122 | 			matr = scale(matr, center=opt$center, scale=opt$scale)
123 | 			}
124 | 		}
125 | 	return(matr)
126 | 	}
127 | }else{
128 | # scale first by columns then by rows iteratively
129 | equil = function(matr) {
130 | 	if (opt$n_iter == 0) {matr = scale(matr, center=opt$center, scale=opt$scale)
131 | 		}else{
132 | 		for (i in 1:opt$n_iter) {
133 | 			matr = scale(matr, center=opt$center, scale=opt$scale)
134 | 			matr = t(scale(t(matr), center=opt$center, scale=opt$scale))
135 | 			}
136 | 		}
137 | 	return(matr)
138 | 	}
139 | }
140 | 
141 | scale_all = function(new_m) {
142 | 	new_m = (new_m - mean(as.matrix(new_m), na.rm=T)) / sd(as.matrix(new_m), na.rm=T)
143 | 	return(new_m)
144 | }
145 | 
146 | 
147 | new_m = m
148 | 
149 | # Choose function
150 | scale_func = function(new_m, opt) {
151 | 	if (opt$all & opt$range) {
152 | 		new_m = as.data.frame(rescale(as.matrix(new_m)))
153 | 		return(new_m)
154 | 	}
155 | 	if (!opt$all & opt$range) {
156 | 		new_m = t(apply(new_m, 1, rescale, to=c(0,1)))
157 | 		return (new_m)
158 | 	}
159 | 	if (opt$all & !opt$range) {
160 | 		new_m = scale_all(new_m)
161 | 		return (new_m)
162 | 	}
163 | 	new_m = equil(new_m)
164 | 	return(new_m)
165 | }
166 | 
167 | # scale the whole matrix if no value is provided
168 | if (is.null(opt$scale_by)) {
169 | 	new_m = scale_func(new_m, opt)
170 | #	if (opt$all) {
171 | #		new_m = scale_all(new_m)	
172 | #	} else{ 
173 | #		new_m = equil(new_m)
174 | #	}
175 | }
176 | 
177 | # scale the sub-matrices defined the scale_by option
178 | if (!is.null(opt$scale_by)) {
179 | 	
180 | 	# Select the metadata columns of interset
181 | 	scale_by <- strsplit(opt$scale_by, ",")[[1]];
182 | 	if (opt$verbose) {cat("Select metadata...  ")}
183 | 	mdata = unique(mdata[,unique(c("labExpId", scale_by))])
184 | 	if (opt$verbose) {cat(dim(mdata), "\n")}
185 | 
186 | 	# Get the levels by which scale
187 | 	lev = levels(interaction(mdata[,scale_by]))
188 | 	if (opt$verbose) {print(lev)}
189 | 
190 | 	# Add a column to the metadata with the interaction
191 | 	mdata$interaction = apply(mdata[scale_by], 1, paste, collapse=".")
192 | 	
193 | 	for (i in 1:length(lev)) {
194 | 		ids = mdata[lev[i] == mdata$interaction, "labExpId"]
195 | 		subm = m[,ids]
196 | 		equilm = scale_func(subm, opt)
197 | 		#if (opt$all) {
198 | 		#	equilm = scale_all(subm)	
199 | 		#} else{ 
200 | 		#	equilm = equil(subm)
201 | 		#}
202 | 		#equilm = equil(subm)
203 | 		new_m[,ids] = equilm
204 | 	}
205 | 	
206 | ##	if (length(scale_by) != 1){
207 | #		ids = apply(mdata, 1, function(x) unique(merge(t(as.data.frame(x)), mdata, by=scale_by)$labExpId ))}
208 | #	if (length(scale_by) == 1){
209 | #	ids = sapply(unique(mdata[, scale_by]), function(x) unique(mdata[ mdata[,scale_by] == x,]$labExpId))}
210 | #
211 | ## apply normalization
212 | #	if (length(scale_by) != 1){for (i in 1:length(ids)) { new_m[, ids[[i]]] <- equil(new_m[,ids[[i]]])}  }
213 | #	if (length(scale_by) == 1){for (i in 1:ncol(ids)) { new_m[, ids[[i]]] <- equil(new_m[,ids[[i]]])}  }
214 | }
215 | 
216 | # Round the values for the output
217 | new_m = round(new_m, 4)
218 | 
219 | print_rownames = TRUE
220 | if (length(char_cols) != 0) {new_m <- cbind(genes, new_m); print_rownames=FALSE}
221 | 
222 | 
223 | # print output
224 | #--------------
225 | 
226 | if (opt$output == "stdout") {
227 | 	output = ""
228 | } else {
229 | 	output = opt$output
230 | }
231 | 
232 | 
233 | write.table(new_m, output, quote=F, sep="\t", row.names=print_rownames)
234 | 
235 | 
236 | # EXIT
237 | 
238 | q(save='no')
239 | 
240 | 


--------------------------------------------------------------------------------
/removeBatchEffect.R:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript 
  2 | 
  3 | options(stringsAsFactors=F)
  4 | 
  5 | ##################
  6 | # OPTION PARSING
  7 | ##################
  8 | 
  9 | suppressPackageStartupMessages(library("optparse"))
 10 | 
 11 | option_list <- list(
 12 | make_option(c("-i", "--input_matrix"), default="stdin", 
 13 | 	help="the matrix you want to analyze [default=%default]"),
 14 | 
 15 | 
 16 | make_option(c("-l", "--log"), default=NULL, 
 17 | 	help="The base of the logarithm to tranform the data before batch removal. If NULL no log-transformation is applied [default=%default]"),
 18 | 
 19 | make_option(c("-p", "--pseudocount"), type="double", default=0, 
 20 | 	help="A pseudocount to add when log-transforming [default=%default]"),
 21 | 
 22 | make_option(c("-M", "--method"), default="voom", help="The method you want to use for removing the batch effect [default=%default]
 23 | 
 24 | 		voom   : apply voom to a matrix of read counts to estimate the weigths
 25 | 			     and then uses the function removeBatchEffect() from limma
 26 | 		limma  : uses the function removeBatchEffect() from limma on a matrix of 
 27 | 		         already normalized values
 28 | 		combat : uses the function combat() from sva on a matrix of already normalized values
 29 | "),
 30 | 
 31 | make_option(c("-s", "--scaling_factors"), default="TMM", help="How to compute scaling factors, if the method is voom [default=%default]
 32 | 	
 33 | 		TMM    :
 34 | 		none   :
 35 | "),
 36 | 	
 37 | 
 38 | make_option(c("--d1"), default="~1", help="Design for voom [default=%default]"),
 39 | make_option(c("--d2"), help="Design for removing the batch effect (not including the batch effect)"),
 40 | make_option(c("-b", "--batch"), help="Column with the batch info"),
 41 | 
 42 | make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
 43 | make_option(c("-G", "--merge_mdata_on"), default="labExpId",
 44 | 	help="Column in the metadata with the header of the input matrix [default=%default]"),
 45 | 
 46 | make_option(c("-t", "--total"), type="integer", help="Filter by total count per gene > t [default=%default]"),
 47 | #make_option(c("-F", "--fields"), help="choose the fields you want to use in the differential expression, comma-separated"),
 48 | make_option(c("-S", "--lib.sizes"), help="Two-column file with no header. col1: header of matrix, col2: library sizes"),
 49 | make_option(c("-N", "--output.norm"), help="File name for normalization factors"),
 50 | make_option(c("-R", "--read_counts"), default=FALSE, action="store_true", help="Output reads counts instead of log2(cpm) [default=%default]"),
 51 | make_option(c("-o", "--output"), default="stdout", help="output file name [default=%default]"),
 52 | make_option(c("-v", "--verbose"), action="store_true", default=FALSE, help="verbose output [default=%default]")
 53 | )
 54 | 
 55 | parser <- OptionParser(
 56 | 	usage = "%prog [options] file", 
 57 | 	option_list=option_list, 
 58 | 	description="\nRemove batch effect from a matrix of read counts, or normalized values"
 59 | )
 60 | arguments <- parse_args(parser, positional_arguments = TRUE)
 61 | opt <- arguments$options
 62 | if (opt$verbose) {print(opt)}
 63 | 
 64 | 
 65 | 
 66 | # LIBRARIES
 67 | 
 68 | suppressPackageStartupMessages(library(edgeR))
 69 | suppressPackageStartupMessages(library(limma))
 70 | 
 71 | 
 72 | ##--------------------##
 73 | ## BEGIN              ##
 74 | ##--------------------##
 75 | 
 76 | 
 77 | # read the matrix from the command line
 78 | if(opt$input_matrix == "stdin"){inF=file("stdin")}else{inF=opt$input_matrix}
 79 | m = read.table(inF, h=T, sep="\t")
 80 | 
 81 | # Replace missing values with 0
 82 | m = replace(m, is.na(m), 0)
 83 | 
 84 | # Log-transform the values if needed
 85 | if (!is.null(opt$log)) {
 86 | 	base = ifelse(opt$log == "e", exp(1), as.double(opt$log))
 87 | 	m = log(m+opt$pseudocount, base)
 88 | }
 89 | 
 90 | inputToBatchRm <- m
 91 | 
 92 | # ?TODO: Error if there is attempt to log-transform integer
 93 | 
 94 | # =========================== Metadata =======================
 95 | 
 96 | merge_mdata_on = opt$merge_mdata_on
 97 | 
 98 | # read the metadata
 99 | mdata = read.table(opt$metadata, h=T, sep="\t", quote=NULL)
100 | 
101 | # Get the fields from the formula 
102 | if (is.null(opt$batch)) {
103 | 	cat("ERROR: please specify the batch variable\n")
104 | 	q(save='no')
105 | }
106 | fields = opt$batch
107 | if (opt$d1 != "~1") {
108 | 	fields1 = strsplit(sub("~", "", opt$d1), split="[+:*]")[[1]]
109 | 	fields = c(fields, fields1)
110 | }
111 | if (opt$d2 != "~1") {
112 | 	fields2 = strsplit(sub("~", "", opt$d2), split="[+:*]")[[1]]
113 | 	fields = c(fields, fields2)
114 | }
115 | mdata[opt$merge_mdata_on] <- gsub(",", ".", mdata[,opt$merge_mdata_on])
116 | 
117 | 
118 | # Check if all the columns are in the metadata
119 | if (sum(!(colnames(m) %in% mdata[,merge_mdata_on])) >0 ) {
120 | 	cat("ERROR: Not all column names in the metadata\n")
121 | 	q(save="no")
122 | }
123 | 
124 | # Format the metadata
125 | mdata = unique(mdata[unique(c(merge_mdata_on, fields))])
126 | rownames(mdata) <- mdata[,merge_mdata_on]
127 | mdata <- mdata[match(colnames(m), mdata[,merge_mdata_on]),, drop=FALSE]
128 | if (opt$verbose) {
129 | 	print(mdata)
130 | 	print(dim(mdata))
131 | }
132 | 
133 | 
134 | # ****************
135 | #    voom+limma       
136 | # ****************     
137 | 
138 | if (opt$method == "voom") {
139 | 
140 | 	# Filter by total number of reads per gene if asked
141 | 	if (!is.null(opt$total)) {
142 | 		m = m[rowSums(m)>opt$total, ]
143 | 	}
144 | 
145 | 	# Convert all the values of the matrix to integer (because we want counts)
146 | 	m[1:ncol(m)] <- apply(m, 2, as.integer)
147 | 	# Create count object for edgeR
148 | 	M = DGEList(m)
149 | 	
150 | 	
151 | 	# Check for user-provided library sizes
152 | 	if (!is.null(opt$lib.sizes)) {
153 | 		lib.sizes = read.table(opt$lib.sizes, h=F, sep="\t")
154 | 		lib.sizes = lib.sizes[match(lib.sizes$V1, colnames(m)), "V2"]
155 | 		M$samples$lib.size <- lib.sizes
156 | 	}
157 | 	
158 | 	# ****************
159 | 	#      TMM       
160 | 	# ****************     
161 | 	
162 | 	if (opt$scaling == "TMM") {
163 | 		M <- calcNormFactors(M, method="TMM")
164 | 		if (!is.null(opt$output.norm)) {
165 | 			normFactors = data.frame(a=colnames(m), b=M$samples$norm.factors)
166 | 			write.table(normFactors, file=opt$output.norm, col.names=FALSE, row.names=FALSE, sep="\t", quote=FALSE)
167 | 		}
168 | 	}
169 | 	
170 | 	# ****************
171 | 	#      none       
172 | 	# ****************     
173 | 	
174 | 	if (opt$scaling == "none") {
175 | 		M$samples$norm.factors <- rep(1, ncol(m))
176 | 	}
177 | 	
178 | 	
179 | 	# ****************
180 | 	#      voom
181 | 	# ****************
182 | 	
183 | 	design1 <- model.matrix(as.formula(opt$d1), data=mdata)
184 | 	
185 | 	if (opt$d1 != "~1") {
186 | 		design1 <- design1[match(colnames(m), rownames(design1)),]
187 | 	}
188 | 	
189 | 	v <- voom(M, design1, plot=FALSE)
190 | 
191 | 	inputToBatchRm <- v
192 | }
193 | 
194 | 
195 | 
196 | # **********************
197 | #   removeBatchEffect
198 | # **********************
199 | 
200 | design2 <- model.matrix(as.formula(opt$d2), data=mdata)
201 | 
202 | if (opt$d2 != "~1") {
203 | 	design2 <- design2[match(colnames(m), rownames(design2)),]
204 | }
205 | 
206 | batch = mdata[match(colnames(m), mdata[, opt$merge_mdata_on]), opt$batch]
207 | 
208 | if (opt$method == "voom" | opt$method == "limma") {
209 | 	out = removeBatchEffect(inputToBatchRm, batch=batch, design=design2)
210 | }
211 | 
212 | # Convert back to read counts
213 | if (opt$read_counts) {
214 | 	out = pmax(sweep(2**out, 2, M$samples$norm.factors * M$samples$lib.size, FUN="*")/1e+06 - 0.5, 0)
215 | }
216 | 	
217 | 	
218 | # ****************
219 | #    ComBat
220 | # ****************     
221 | 
222 | 
223 | if (opt$method == "combat") {
224 | 
225 | 	suppressPackageStartupMessages(library(sva))
226 | 	out = ComBat(dat=m, batch=batch, mod=design2)
227 | 
228 | }
229 | 
230 | 
231 | # =================== OUTPUT ======================
232 | 
233 | out = round(out, digits=5)
234 | outF = ifelse(opt$output=="stdout", "", opt$output)
235 | write.table(out, file=outF, quote=FALSE, sep="\t")
236 | 
237 | q(save='no')
238 | 


--------------------------------------------------------------------------------