├── Analysis ├── PeakAnnotateHomerSummary.r ├── PlotChart.r └── ResSummary.r ├── IDR_Codes ├── IDRAnalysis.sh ├── IDRMain.sh ├── IDRScatterPlot.r ├── IDRSummary.r ├── IDR_SubSampleBAM.sh └── IDR_SubSampleBAM_Main.sh ├── Imp_Scripts ├── Footprint_HINT_ATAC.R ├── Motif_HOMER.R └── Peak_Enrichment.R ├── README.md ├── bin ├── ATACSeqQC.r ├── BigWigTrackCreate.sh ├── CorrelationBAMPeak.sh ├── CorrelationPeakPlot.r ├── Sample_ATACseqQC_script.r ├── TagAlign.sh ├── bam_to_bigwig.sh └── pipeline.sh ├── configfile ├── configfile_hg19 ├── configfile_hg38 ├── configfile_mm10 ├── configfile_mm9 ├── pipeline_exec.sh ├── sample_IDRScript.sh └── src ├── PlotSample.py ├── assign_multimappers.py ├── peak_distribution.py └── trim_adapters.py /Analysis/PeakAnnotateHomerSummary.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | #============================= 4 | # this code plots the percentage of genomic annotations 5 | # for each peak file generated from the MACS2 command 6 | # of a ChIP-seq pipeline 7 | # input is a HOMER annotation of the peak file 8 | # corresponding to different genomic segments 9 | #============================= 10 | 11 | args <- commandArgs(TRUE) 12 | if(length(args)<1) { 13 | q("no") 14 | } 15 | 16 | HomerAnnotFile <- args[1] 17 | HomerPeakTSSDistFile <- args[2] 18 | 19 | InpDir <- dirname(HomerAnnotFile) 20 | 21 | #==================================== 22 | # first process the peak annotations 23 | # produced by HOMER 24 | #==================================== 25 | 26 | # first initialize different annotation categories 27 | npeak_3UTR <- 0 28 | npeak_TTS <- 0 29 | npeak_Exon <- 0 30 | npeak_Intron <- 0 31 | npeak_Intergenic <- 0 32 | npeak_Promoter <- 0 33 | npeak_5UTR <- 0 34 | 35 | # open the file and read line by line 36 | # extract performance values 37 | finp <- file(HomerAnnotFile, "r") 38 | lineset <- readLines(finp) 39 | for (i in 1:length(lineset)) { 40 | curr_line <- trimws(lineset[i], which = "both") 41 | curr_line_split <- strsplit(curr_line,"\t")[[1]] 42 | # cat(sprintf("\n curr_line : %s ", curr_line_split)) 43 | # cat(sprintf("\n elem 1: %s elem 2: %s elem 3: %s elem 4: %s ", curr_line_split[1], curr_line_split[2], curr_line_split[3], curr_line_split[4])) 44 | if (regexpr("3UTR", curr_line) > 0) { 45 | npeak_3UTR <- as.numeric(curr_line_split[2]) 46 | } else if (regexpr("TTS", curr_line) > 0) { 47 | npeak_TTS <- as.numeric(curr_line_split[2]) 48 | } else if (regexpr("Exon", curr_line) > 0) { 49 | npeak_Exon <- as.numeric(curr_line_split[2]) 50 | } else if (regexpr("Intron", curr_line) > 0) { 51 | npeak_Intron <- as.numeric(curr_line_split[2]) 52 | } else if (regexpr("Intergenic", curr_line) > 0) { 53 | npeak_Intergenic <- as.numeric(curr_line_split[2]) 54 | } else if (regexpr("Promoter", curr_line) > 0) { 55 | npeak_Promoter <- as.numeric(curr_line_split[2]) 56 | } else if (regexpr("5UTR", curr_line) > 0) { 57 | npeak_5UTR <- as.numeric(curr_line_split[2]) 58 | } 59 | } 60 | 61 | # close the input file 62 | close(finp) 63 | 64 | cat(sprintf("\n npeak_3UTR : %s ", npeak_3UTR)) 65 | cat(sprintf("\n npeak_TTS : %s ", npeak_TTS)) 66 | cat(sprintf("\n npeak_Exon : %s ", npeak_Exon)) 67 | cat(sprintf("\n npeak_Intron : %s ", npeak_Intron)) 68 | cat(sprintf("\n npeak_Intergenic : %s ", npeak_Intergenic)) 69 | cat(sprintf("\n npeak_Promoter : %s ", npeak_Promoter)) 70 | cat(sprintf("\n npeak_5UTR : %s ", npeak_5UTR)) 71 | 72 | # now prepare a vector of the above categories (provided non zero instances) 73 | # 
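# (note - assumed input format) the HOMER annotation-statistics file parsed
# above is expected to be tab-separated, one category per line, e.g.:
#   3UTR    <number of peaks>    <total size in bp>    <log2 enrichment>
# only the first two columns (category name and peak count) are used here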
# to create a pie chart
74 | slices <- c()
75 | lbls <- c()
76 | if (npeak_3UTR > 0) {
77 | slices <- c(slices, npeak_3UTR)
78 | lbls <- c(lbls, "3UTR")
79 | }
80 | if (npeak_TTS > 0) {
81 | slices <- c(slices, npeak_TTS)
82 | lbls <- c(lbls, "TTS")
83 | }
84 | if (npeak_Exon > 0) {
85 | slices <- c(slices, npeak_Exon)
86 | lbls <- c(lbls, "Exon")
87 | }
88 | if (npeak_Intron > 0) {
89 | slices <- c(slices, npeak_Intron)
90 | lbls <- c(lbls, "Intron")
91 | }
92 | if (npeak_Intergenic > 0) {
93 | slices <- c(slices, npeak_Intergenic)
94 | lbls <- c(lbls, "Intergenic")
95 | }
96 | if (npeak_Promoter > 0) {
97 | slices <- c(slices, npeak_Promoter)
98 | lbls <- c(lbls, "Promoter")
99 | }
100 | if (npeak_5UTR > 0) {
101 | slices <- c(slices, npeak_5UTR)
102 | lbls <- c(lbls, "5UTR")
103 | }
104 | 
105 | # convert the vector to include the percentage values as well
106 | # for displaying in the pie chart
107 | pct <- round(slices/sum(slices)*100)
108 | lbls <- paste(lbls, pct) # add percents to labels
109 | lbls <- paste(lbls,"%",sep="") # add % to labels
110 | 
111 | OutPlotFile <- paste0(InpDir, "/Pie_Chart_Peak_Annotation.pdf")
112 | pdf(OutPlotFile, width=6, height=4)
113 | pie(slices, labels = lbls, col=rainbow(length(lbls)), main="Pie Chart of peak annotation", radius = 1, cex = 0.5)
114 | dev.off()
115 | 
116 | #====================================
117 | # then process the distance from the (nearest) TSS sites
118 | # for individual peaks
119 | # the histogram data is already provided
120 | #====================================
121 | # first remove the header line from the input file
122 | tempfile <- paste0(InpDir, '/temp_TSSDistFile.bed')
123 | system(paste("awk \'NR>1\'", HomerPeakTSSDistFile, ">", tempfile))
124 | 
125 | # now process the temporary file
126 | PeakTSSData <- read.table(tempfile, header=T)
127 | OutPlotFile <- paste0(InpDir, "/Peak_TSS_Distance.pdf")
128 | pdf(OutPlotFile, width=6, height=4)
129 | plot(PeakTSSData[,1], PeakTSSData[,2], cex=0.5, col="red", xlab="Distance from TSS", ylab="ChIP fragment depth (per bp per peak)")
130 | title("Peak distribution near TSS sites")
131 | dev.off()
132 | 
133 | # then remove the temporary file
134 | system(paste("rm", tempfile))
135 | 
136 | 
-------------------------------------------------------------------------------- /Analysis/PlotChart.r: --------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | 
3 | args <- commandArgs(TRUE)
4 | 
5 | data <- read.table(args[1])
6 | 
7 | png(file = args[2], width=15, height=10, units="cm", res=1200)
8 | 
9 | barplot(data[,2], names.arg = data[,1], xlab = "Chromosome", ylab = "Count", col = "blue", main = paste0("Chromosome distribution_", args[3]))
10 | 
11 | dev.off()
-------------------------------------------------------------------------------- /Analysis/ResSummary.r: --------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | 
3 | #==================================
4 | # used to print the summary result statistics
5 | # from a collection of ATAC seq samples
6 | # this script is to be called on the topmost directory structure
7 | # containing all the ATAC seq sample folders
8 | 
9 | # author: Sourya Bhattacharyya
10 | # Vijay-AY lab
11 | #==================================
12 | 
13 | library(optparse)
14 | library(ggplot2)
15 | library(plotly)
16 | 
17 | #==============
18 | # function to plot scatter using plotly package
19 | #==============
20 | PlotScatter_Data <- function(InpDF, ylabel, plotfile) { 21
| colnames(InpDF) <- c('X', 'Y') 22 | currplot <- plotly::plot_ly(InpDF, x= ~X, y= ~Y, name=ylabel, type="scatter", mode="markers", marker=list(size=10, color= "blue")) %>% layout(xaxis = list(title = 'Samples', zeroline = FALSE, showticklabels = FALSE), yaxis = list(title = ylabel, zeroline = FALSE)) 23 | htmlwidgets::saveWidget(currplot, plotfile) 24 | } 25 | 26 | #=========================================================== 27 | option_list = list( 28 | make_option(c("--BaseDir"), type="character", default=NULL, help="Base directory containing all the ATAC-seq samples. Mandatory parameter."), 29 | make_option(c("--OutDir"), type="character", default=NULL, help="Output directory to contain the summary results. If empty, current directory is used.") 30 | ); 31 | 32 | opt_parser = OptionParser(option_list=option_list); 33 | opt = parse_args(opt_parser); 34 | 35 | if (is.null(opt$BaseDir)) { 36 | print_help(opt_parser) 37 | stop("ERROR !!!!!!! Base output directory is not provided - check the option --BaseDir \n", call.=FALSE) 38 | } 39 | 40 | if (is.null(opt$OutDir)) { 41 | OutDir <- getwd() 42 | } else { 43 | OutDir <- opt$OutDir 44 | system(paste("mkdir -p", OutDir)) 45 | } 46 | 47 | # check if the last character of the output base directory is '/' 48 | # unless append that character 49 | baseresdir <- opt$BaseDir 50 | if (substr(baseresdir,nchar(baseresdir),nchar(baseresdir)) != '/') { 51 | baseresdir <- paste0(baseresdir, '/') 52 | } 53 | 54 | # template name of the directories containing MACS2 results 55 | # either default parameters are used 56 | # or extsize based parameters are employed 57 | MACS2_def_dir <- 'MACS2_Default_Tag' 58 | MACS2_ext_dir <- 'MACS2_Ext_Tag' 59 | # MACS2_noduprem_ext_dir <- 'MACS2_NoDupRem_Align_Ext_Tag' 60 | 61 | # template name of the folders (peak outputs) depending on whether control (input) are used for peak detection 62 | Ctrl_0_Fold <- '_No_Control' 63 | Ctrl_1_Fold <- '_with_Control' 64 | 65 | # file formats of NRF, read statistics, FRiP and Peak statistics 66 | # which are present for every sample 67 | ReadStatFileNameFmt <- 'Read_Count_Stat.txt' 68 | NRFfilenamefmt <- 'out_NRF' 69 | FRiPFileNameFmt <- 'out_FRiP.txt' 70 | PeakCountFileFmt <- 'Peak_Statistics.txt' 71 | 72 | # output text file to contain the summary results 73 | outtextfile <- paste0(baseresdir, 'Results_All_Samples_Summary.txt') 74 | 75 | #================================= 76 | file_process <- FALSE 77 | 78 | # process individual directories under the main results directory 79 | dir.list <- list.dirs(path = baseresdir, full.names = FALSE, recursive = FALSE) 80 | for (dr in dir.list) { 81 | 82 | # cat(sprintf("\n Examining directory: %s \n", dr)) 83 | 84 | #================== 85 | # following file stores the count of reads throughout various stages of filtering 86 | #================== 87 | ReadStatFile <- paste0(baseresdir, dr, "/", ReadStatFileNameFmt) 88 | if (file.exists(ReadStatFile) && (file.access(ReadStatFile, 4) == 0)) { 89 | cat(sprintf("\n Found the file: %s \n", ReadStatFile)) 90 | 91 | # search for a file with name *_R1*.fastq.gz in the current file 92 | filenames <- Sys.glob(paste0(baseresdir, dr, "/*_R1*.fastq.gz")) 93 | currSampleName <- substr(basename(filenames[1]), start=1, stop=regexpr("_R1", basename(filenames[1]))-1) 94 | 95 | x <- readLines(ReadStatFile) 96 | lastline <- strsplit(x[length(x)], "\t")[[1]] 97 | 98 | # the line should have 8 or 7 fields 99 | # 8 fields if fastq file is used in the pipeline 100 | # 7 fields if already aligned file is used in the 
pipeline
101 | nfields <- length(lastline)
102 | cat(sprintf("\n No of fields in the read statistics file: %s ", nfields))
103 | 
104 | TotRead <- as.integer(lastline[nfields-6]) #lastline[2]
105 | MappableRead <- as.integer(lastline[nfields-5]) #lastline[3]
106 | Frac_Mappable_Read <- ((MappableRead * 1.0) / TotRead)
107 | Frac_Unmappable_Read <- (((TotRead - MappableRead) * 1.0) / TotRead)
108 | Read_remain_after_RandomDel <- as.integer(lastline[nfields-4]) #lastline[4]
109 | Frac_reads_remain_after_RandomDel <- ((Read_remain_after_RandomDel * 1.0) / TotRead)
110 | Frac_reads_deleted_random <- (((MappableRead - Read_remain_after_RandomDel) * 1.0) / TotRead)
111 | Read_remain_after_Mitochondrial_Read_Del <- as.integer(lastline[nfields-3]) #lastline[5]
112 | Frac_reads_remain_after_MtReadDel <- ((Read_remain_after_Mitochondrial_Read_Del * 1.0) / TotRead)
113 | Frac_reads_deleted_MtRead <- (((Read_remain_after_RandomDel - Read_remain_after_Mitochondrial_Read_Del) * 1.0) / TotRead)
114 | UniqMappedRead <- as.integer(lastline[nfields-2]) #lastline[6]
115 | Frac_reads_unique_mapped <- ((UniqMappedRead * 1.0) / TotRead)
116 | Frac_reads_del_multimap <- (((Read_remain_after_Mitochondrial_Read_Del - UniqMappedRead) * 1.0) / TotRead)
117 | ReadQualThr <- as.integer(lastline[nfields-1]) #lastline[7]
118 | Frac_reads_remain_QualThr <- ((ReadQualThr * 1.0) / TotRead)
119 | Frac_reads_del_QualThr <- (((UniqMappedRead - ReadQualThr) * 1.0) / TotRead)
120 | Dupl_Rem_Read <- as.integer(lastline[nfields]) #lastline[9]
121 | Frac_reads_remain_Dupl <- ((Dupl_Rem_Read * 1.0) / TotRead)
122 | Frac_reads_del_Dupl <- (((ReadQualThr - Dupl_Rem_Read) * 1.0) / TotRead)
123 | 
124 | # append the entries in the final vector
125 | CurrOutVec <- c(basename(dr), currSampleName, TotRead, MappableRead, Frac_Mappable_Read, Frac_Unmappable_Read, Read_remain_after_RandomDel, Frac_reads_remain_after_RandomDel, Frac_reads_deleted_random, Read_remain_after_Mitochondrial_Read_Del, Frac_reads_remain_after_MtReadDel, Frac_reads_deleted_MtRead, UniqMappedRead, Frac_reads_unique_mapped, Frac_reads_del_multimap, ReadQualThr, Frac_reads_remain_QualThr, Frac_reads_del_QualThr, Dupl_Rem_Read, Frac_reads_remain_Dupl, Frac_reads_del_Dupl)
126 | 
127 | #==================
128 | # the following file stores the NRF / library complexity value
129 | #==================
130 | filenames <- Sys.glob(paste0(baseresdir, dr, "/*", NRFfilenamefmt, "*.txt"))
131 | if (length(filenames) > 0) {
132 | NRF_textfile <- filenames[1]
133 | if (file.exists(NRF_textfile) && (file.access(NRF_textfile, 4) == 0)){
134 | x <- readLines(NRF_textfile)
135 | # the 2nd line, in string-split structure
136 | lastline <- strsplit(x[length(x)], "\t")[[1]]
137 | UniqMappedPos <- lastline[2]
138 | NRF_val <- lastline[3]
139 | M1 <- lastline[4]
140 | M2 <- lastline[5]
141 | PBC1 <- lastline[6]
142 | PBC2 <- lastline[7]
143 | # adjust the output vector
144 | CurrOutVec <- c(CurrOutVec, UniqMappedPos, NRF_val, M1, M2, PBC1, PBC2)
145 | } else {
146 | CurrOutVec <- c(CurrOutVec, rep('NA', 6))
147 | }
148 | } else {
149 | CurrOutVec <- c(CurrOutVec, rep('NA', 6))
150 | }
151 | 
152 | #==================
153 | # check the peak directories and find corresponding statistics
154 | #==================
155 | 
156 | #==================
157 | # FRiP and Peak count measures - default peak calling - no control
158 | #==================
159 | FRiP_textfile_def_noctrl <- paste0(baseresdir, dr, "/", MACS2_def_dir, Ctrl_0_Fold, "/", FRiPFileNameFmt)
160 | if (file.exists(FRiP_textfile_def_noctrl) &&
(file.access(FRiP_textfile_def_noctrl, 4) == 0)){ 161 | x <- readLines(FRiP_textfile_def_noctrl) 162 | lastline <- strsplit(x[length(x)], "\t")[[1]] 163 | MappedReadPeak_def_noctrl <- lastline[2] 164 | FRiP_def_noctrl <- lastline[3] 165 | # adjust the output vector 166 | CurrOutVec <- c(CurrOutVec, MappedReadPeak_def_noctrl, FRiP_def_noctrl) 167 | } else { 168 | CurrOutVec <- c(CurrOutVec, rep('NA', 2)) 169 | } 170 | 171 | PeakCount_TextFile_def_noctrl <- paste0(baseresdir, dr, "/", MACS2_def_dir, Ctrl_0_Fold, "/", PeakCountFileFmt) 172 | if (file.exists(PeakCount_TextFile_def_noctrl) && (file.access(PeakCount_TextFile_def_noctrl, 4) == 0)){ 173 | x <- readLines(PeakCount_TextFile_def_noctrl) 174 | lastline <- strsplit(x[length(x)], "\t")[[1]] 175 | TotPeak_def_noctrl <- lastline[1] 176 | TotPeak_Q_Five_Pct_def_noctrl <- lastline[2] 177 | TotPeak_Q_One_Pct_def_noctrl <- lastline[3] 178 | # adjust the output vector 179 | CurrOutVec <- c(CurrOutVec, TotPeak_def_noctrl, TotPeak_Q_Five_Pct_def_noctrl, TotPeak_Q_One_Pct_def_noctrl) 180 | } else { 181 | CurrOutVec <- c(CurrOutVec, rep('NA', 3)) 182 | } 183 | 184 | #================== 185 | # FRiP and Peak count measures - Ext peak calling - no control 186 | #================== 187 | FRiP_textfile_ext_noctrl <- paste0(baseresdir, dr, "/", MACS2_ext_dir, Ctrl_0_Fold, "/", FRiPFileNameFmt) 188 | if (file.exists(FRiP_textfile_ext_noctrl) && (file.access(FRiP_textfile_ext_noctrl, 4) == 0)){ 189 | x <- readLines(FRiP_textfile_ext_noctrl) 190 | lastline <- strsplit(x[length(x)], "\t")[[1]] 191 | MappedReadPeak_ext_noctrl <- lastline[2] 192 | FRiP_ext_noctrl <- lastline[3] 193 | # adjust the output vector 194 | CurrOutVec <- c(CurrOutVec, MappedReadPeak_ext_noctrl, FRiP_ext_noctrl) 195 | } else { 196 | CurrOutVec <- c(CurrOutVec, rep('NA', 2)) 197 | } 198 | 199 | PeakCount_TextFile_ext_noctrl <- paste0(baseresdir, dr, "/", MACS2_ext_dir, Ctrl_0_Fold, "/", PeakCountFileFmt) 200 | if (file.exists(PeakCount_TextFile_ext_noctrl) && (file.access(PeakCount_TextFile_ext_noctrl, 4) == 0)){ 201 | x <- readLines(PeakCount_TextFile_ext_noctrl) 202 | lastline <- strsplit(x[length(x)], "\t")[[1]] 203 | TotPeak_ext_noctrl <- lastline[1] 204 | TotPeak_Q_Five_Pct_ext_noctrl <- lastline[2] 205 | TotPeak_Q_One_Pct_ext_noctrl <- lastline[3] 206 | # adjust the output vector 207 | CurrOutVec <- c(CurrOutVec, TotPeak_ext_noctrl, TotPeak_Q_Five_Pct_ext_noctrl, TotPeak_Q_One_Pct_ext_noctrl) 208 | } else { 209 | CurrOutVec <- c(CurrOutVec, rep('NA', 3)) 210 | } 211 | 212 | #================== 213 | # FRiP and Peak count measures - default peak calling - with control 214 | #================== 215 | FRiP_textfile_def_ctrl <- paste0(baseresdir, dr, "/", MACS2_def_dir, Ctrl_1_Fold, "/", FRiPFileNameFmt) 216 | if (file.exists(FRiP_textfile_def_ctrl) && (file.access(FRiP_textfile_def_ctrl, 4) == 0)){ 217 | x <- readLines(FRiP_textfile_def_ctrl) 218 | lastline <- strsplit(x[length(x)], "\t")[[1]] 219 | MappedReadPeak_def_ctrl <- lastline[2] 220 | FRiP_def_ctrl <- lastline[3] 221 | # adjust the output vector 222 | CurrOutVec <- c(CurrOutVec, MappedReadPeak_def_ctrl, FRiP_def_ctrl) 223 | } else { 224 | CurrOutVec <- c(CurrOutVec, rep('NA', 2)) 225 | } 226 | 227 | PeakCount_TextFile_def_ctrl <- paste0(baseresdir, dr, "/", MACS2_def_dir, Ctrl_1_Fold, "/", PeakCountFileFmt) 228 | if (file.exists(PeakCount_TextFile_def_ctrl) && (file.access(PeakCount_TextFile_def_ctrl, 4) == 0)){ 229 | x <- readLines(PeakCount_TextFile_def_ctrl) 230 | lastline <- strsplit(x[length(x)], "\t")[[1]] 231 | 
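# (note - assumed file format) Peak_Statistics.txt is expected to hold a single
# tab-separated record: <total peaks> <peaks with Q<0.05> <peaks with Q<0.01>;
# the three fields are read positionally below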
TotPeak_def_ctrl <- lastline[1]
232 | TotPeak_Q_Five_Pct_def_ctrl <- lastline[2]
233 | TotPeak_Q_One_Pct_def_ctrl <- lastline[3]
234 | # adjust the output vector
235 | CurrOutVec <- c(CurrOutVec, TotPeak_def_ctrl, TotPeak_Q_Five_Pct_def_ctrl, TotPeak_Q_One_Pct_def_ctrl)
236 | } else {
237 | CurrOutVec <- c(CurrOutVec, rep('NA', 3))
238 | }
239 | 
240 | #==================
241 | # FRiP and Peak count measures - Ext peak calling - with control
242 | #==================
243 | FRiP_textfile_ext_ctrl <- paste0(baseresdir, dr, "/", MACS2_ext_dir, Ctrl_1_Fold, "/", FRiPFileNameFmt)
244 | if (file.exists(FRiP_textfile_ext_ctrl) && (file.access(FRiP_textfile_ext_ctrl, 4) == 0)){
245 | x <- readLines(FRiP_textfile_ext_ctrl)
246 | lastline <- strsplit(x[length(x)], "\t")[[1]]
247 | MappedReadPeak_ext_ctrl <- lastline[2]
248 | FRiP_ext_ctrl <- lastline[3]
249 | # adjust the output vector
250 | CurrOutVec <- c(CurrOutVec, MappedReadPeak_ext_ctrl, FRiP_ext_ctrl)
251 | } else {
252 | CurrOutVec <- c(CurrOutVec, rep('NA', 2))
253 | }
254 | 
255 | PeakCount_TextFile_ext_ctrl <- paste0(baseresdir, dr, "/", MACS2_ext_dir, Ctrl_1_Fold, "/", PeakCountFileFmt)
256 | if (file.exists(PeakCount_TextFile_ext_ctrl) && (file.access(PeakCount_TextFile_ext_ctrl, 4) == 0)){
257 | x <- readLines(PeakCount_TextFile_ext_ctrl)
258 | lastline <- strsplit(x[length(x)], "\t")[[1]]
259 | TotPeak_ext_ctrl <- lastline[1]
260 | TotPeak_Q_Five_Pct_ext_ctrl <- lastline[2]
261 | TotPeak_Q_One_Pct_ext_ctrl <- lastline[3]
262 | # adjust the output vector
263 | CurrOutVec <- c(CurrOutVec, TotPeak_ext_ctrl, TotPeak_Q_Five_Pct_ext_ctrl, TotPeak_Q_One_Pct_ext_ctrl)
264 | } else {
265 | CurrOutVec <- c(CurrOutVec, rep('NA', 3))
266 | }
267 | 
268 | # now convert the current vector to a single-row data frame
269 | CurrDF <- data.frame(matrix(CurrOutVec, nrow=1), stringsAsFactors=FALSE)
270 | colnames(CurrDF) <- c('Dir', 'SampleName', 'Total_Read', 'Number_of_Mappable_Reads', 'Fraction_of_Mappable_Reads', 'Fraction_of_Unmappable_Reads', 'Reads_after_random_chromosome_deletion', 'Fraction_reads_remain_after_random_chromosome_deletion', 'Fraction_reads_in_random_chromosome', 'Reads_excluding_Mitochondrial_Reads', 'Fraction_reads_excluding_MtRead', 'Fraction_mitochondrial_reads', 'UniqMappedRead', 'Fraction_reads_unique_mapped', 'Fraction_reads_multimap', 'Reads_remain_after_QualThr', 'Fraction_reads_remain_QualThr', 'Frac_reads_low_qual', 'Reads_after_dupl_remove', 'Fraction_de-duplicated_reads', 'Fraction_duplicate_reads', 'UniqMappedPos', 'NRF', 'M1', 'M2', 'PBC1', 'PBC2', 'MappedReadPeak_Def_noctrl(Q<0.05)', 'FRiP_Def_NoCtrl(Q<0.05)', 'nPeak_Def_NoCtrl', 'nPeak_Def_NoCtrl(Q<0.05)', 'nPeak_Def_NoCtrl(Q<0.01)', 'MapReadPeak_Ext_NoCtrl(Q<0.05)', 'FRiP_Ext_NoCtrl(Q<0.05)', 'nPeak_Ext_NoCtrl', 'nPeak_Ext_NoCtrl(Q<0.05)', 'nPeak_Ext_NoCtrl(Q<0.01)', 'MapReadPeak_Def_Ctrl(Q<0.05)', 'FRiP_Def_Ctrl(Q<0.05)', 'nPeak_Def_Ctrl', 'nPeak_Def_Ctrl(Q<0.05)', 'nPeak_Def_Ctrl(Q<0.01)', 'MapReadPeak_Ext_Ctrl(Q<0.05)', 'FRiP_Ext_Ctrl(Q<0.05)', 'nPeak_Ext_Ctrl', 'nPeak_Ext_Ctrl(Q<0.05)', 'nPeak_Ext_Ctrl(Q<0.01)')
271 | 
272 | if (file_process == FALSE) {
273 | FinalDF <- CurrDF
274 | file_process <- TRUE
275 | } else {
276 | FinalDF <- rbind.data.frame(FinalDF, CurrDF)
277 | }
278 | 
279 | } # end processing current sample condition
280 | 
281 | } # end directory traverse
282 | 
283 | # now remove one or more columns of this data frame, if they are all 'NA'
284 | NA_ColList <- c()
285 | for (i in (1:ncol(FinalDF))) {
286 | idx <- which(FinalDF[, i] == 'NA')
287 | if (length(idx) == nrow(FinalDF)) {
288 | # every entry of this column is NA. So discard this column
289 | NA_ColList <- c(NA_ColList, i)
290 | }
291 | }
292 | # write the summary table; all columns are kept, since the fixed column
293 | # indices used for the plots below assume that no column has been dropped
294 | if (length(NA_ColList) > 0) {
295 | cat(sprintf("\n *** Note: %s column(s) contain only NA entries ", length(NA_ColList)))
296 | }
297 | write.table(FinalDF, outtextfile, row.names=F, col.names=T, sep="\t", quote=F, append=F)
298 | 
299 | 
300 | #============================
301 | # a few summary statements for the output text file
302 | #============================
303 | 
304 | CommentsFile <- paste0(OutDir, '/Field_Description.txt')
305 | 
306 | # open the output text file
307 | con <- file(CommentsFile, "a")
308 | 
309 | outtext <- paste0("\n\n\n\n\n *** Important parameters ***** \n\n\n")
310 | writeLines(outtext, con=con, sep="\n")
311 | 
312 | outtext <- paste0("\n\n Total_Read: number of reads in individual fastq file(s)")
313 | writeLines(outtext, con=con, sep="\n")
314 | 
315 | outtext <- paste0("\n\n Number_of_Mappable_Reads, Fraction_of_Mappable_Reads, and Fraction_of_Unmappable_Reads: number (and fraction) of reads mappable and unmappable to the reference genome. May not be uniquely mappable reads.")
316 | writeLines(outtext, con=con, sep="\n")
317 | 
318 | outtext <- paste0("\n\n Reads_after_random_chromosome_deletion, Fraction_reads_remain_after_random_chromosome_deletion, and Fraction_reads_in_random_chromosome: number (and fraction) of reads remaining (and deleted) after deleting reads from random chromosomes such as chr1_*, chr2_*, chrUN, ....")
319 | writeLines(outtext, con=con, sep="\n")
320 | 
321 | outtext <- paste0("\n\n Reads_excluding_Mitochondrial_Reads, Fraction_reads_excluding_MtRead and Fraction_mitochondrial_reads: number (and fraction) of reads remaining (and deleted) after removing the mitochondrial reads")
322 | writeLines(outtext, con=con, sep="\n")
323 | 
324 | outtext <- paste0("\n\n UniqMappedRead, Fraction_reads_unique_mapped, and Fraction_reads_multimap: number (and fraction) of reads uniquely mapped (and multimapped) to the reference genome.")
325 | writeLines(outtext, con=con, sep="\n")
326 | 
327 | outtext <- paste0("\n\n Reads_remain_after_QualThr, Fraction_reads_remain_QualThr and Frac_reads_low_qual: number (and fraction) of reads remaining (and deleted) after removing low quality reads (MAPQ threshold)")
328 | writeLines(outtext, con=con, sep="\n")
329 | 
330 | outtext <- paste0("\n\n Reads_not_in_blackList_genome, Fraction_reads_not_in_blackList_genome, and Fraction_reads_in_blackList_genome: number (and fraction) of reads not in (and in) blacklist segments.")
331 | writeLines(outtext, con=con, sep="\n")
332 | 
333 | outtext <- paste0("\n\n Reads_after_dupl_remove, Fraction_de-duplicated_reads and Fraction_duplicate_reads: number (and fraction) of reads remaining (and deleted) after removing duplicate reads")
334 | writeLines(outtext, con=con, sep="\n")
335 | 
336 | outtext <- paste0("\n\n UniqMapPos: number of distinct genome positions where at least one read maps uniquely.")
337 | writeLines(outtext, con=con, sep="\n")
338 | 
339 | outtext <- paste0("\n\n NRF (Non redundant fraction): number of distinct genome positions for uniquely mapped reads / number of uniquely mapped reads")
340 | writeLines(outtext, con=con, sep="\n")
341 | 
342 | outtext <-
paste0("\n\n M1: number of genomic locations where exactly one read maps uniquely.") 343 | writeLines(outtext, con=con, sep="\n") 344 | 345 | outtext <- paste0("\n\n M2: number of genomic locations where exactly two reads map uniquely.") 346 | writeLines(outtext, con=con, sep="\n") 347 | 348 | outtext <- paste0("\n\n PBC1: M1 / UniqMapPos ") 349 | writeLines(outtext, con=con, sep="\n") 350 | 351 | outtext <- paste0("\n\n PBC2: M1 / M2") 352 | writeLines(outtext, con=con, sep="\n") 353 | 354 | outtext <- paste0("\n\n\n\n\n MACS2 outputs corresponding to peaks with default MACS2 command and no control ----- input missing values are replaced by NA \n\n") 355 | writeLines(outtext, con=con, sep="\n") 356 | 357 | outtext <- paste0("\n\n MappedReadPeak_Def_noctrl: mapped reads in peaks ") 358 | writeLines(outtext, con=con, sep="\n") 359 | 360 | outtext <- paste0("\n\n FRiP_Def_NoCtrl: MappedReadPeak_Def_noctrl / UniqMappedRead ") 361 | writeLines(outtext, con=con, sep="\n") 362 | 363 | outtext <- paste0("\n\n nPeak_Def_NoCtrl: number of peaks (determined by p value threshold of 0.01) ") 364 | writeLines(outtext, con=con, sep="\n") 365 | 366 | outtext <- paste0("\n\n nPeak_Def_NoCtrl(Q<0.05): number of peaks (determined by q value threshold of 0.05) ") 367 | writeLines(outtext, con=con, sep="\n") 368 | 369 | outtext <- paste0("\n\n nPeak_Def_NoCtrl(Q<0.01): number of peaks (determined by q value threshold of 0.01) ") 370 | writeLines(outtext, con=con, sep="\n") 371 | 372 | outtext <- paste0("\n\n\n\n\n MACS2 outputs corresponding to peaks with --extsize option (recommended in existing ATAC seq pipeline) and no control input ----- missing values are replaced by NA \n\n") 373 | writeLines(outtext, con=con, sep="\n") 374 | 375 | outtext <- paste0("\n\n MapReadPeak_Ext_NoCtrl: mapped reads in peaks ") 376 | writeLines(outtext, con=con, sep="\n") 377 | 378 | outtext <- paste0("\n\n FRiP_Ext_NoCtrl: MapReadPeak_Ext_NoCtrl / UniqMapRead ") 379 | writeLines(outtext, con=con, sep="\n") 380 | 381 | outtext <- paste0("\n\n nPeak_Ext_NoCtrl: number of peaks (determined by p value threshold of 0.01) ") 382 | writeLines(outtext, con=con, sep="\n") 383 | 384 | outtext <- paste0("\n\n nPeak_Ext_NoCtrl(Q<0.05): number of peaks (determined by q value threshold of 0.05) ") 385 | writeLines(outtext, con=con, sep="\n") 386 | 387 | outtext <- paste0("\n\n nPeak_Ext_NoCtrl(Q<0.01): number of peaks (determined by q value threshold of 0.01) ") 388 | writeLines(outtext, con=con, sep="\n") 389 | 390 | outtext <- paste0("\n\n\n\n\n MACS2 outputs corresponding to peaks with default MACS2 command ----- but here control input is present ----- missing values are replaced by NA \n\n") 391 | writeLines(outtext, con=con, sep="\n") 392 | 393 | outtext <- paste0("\n\n MapReadPeak_Def_Ctrl: mapped reads in peaks ") 394 | writeLines(outtext, con=con, sep="\n") 395 | 396 | outtext <- paste0("\n\n FRiP_Def_Ctrl: MapReadPeak_Def_Ctrl / UniqMapRead ") 397 | writeLines(outtext, con=con, sep="\n") 398 | 399 | outtext <- paste0("\n\n nPeak_Def_Ctrl: number of peaks (determined by p value threshold of 0.01) ") 400 | writeLines(outtext, con=con, sep="\n") 401 | 402 | outtext <- paste0("\n\n nPeak_Def_Ctrl(Q<0.05): number of peaks (determined by q value threshold of 0.05) ") 403 | writeLines(outtext, con=con, sep="\n") 404 | 405 | outtext <- paste0("\n\n nPeak_Def_Ctrl(Q<0.01): number of peaks (determined by q value threshold of 0.01) ") 406 | writeLines(outtext, con=con, sep="\n") 407 | 408 | outtext <- paste0("\n\n\n\n\n MACS2 outputs 
corresponding to peaks with --extsize option (recommended in existing ATAC seq pipeline) ----- here control input is provided ----- missing values are replaced by NA \n\n") 409 | writeLines(outtext, con=con, sep="\n") 410 | 411 | outtext <- paste0("\n\n MapReadPeak_Ext_Ctrl: mapped reads in peaks ") 412 | writeLines(outtext, con=con, sep="\n") 413 | 414 | outtext <- paste0("\n\n FRiP_Ext_Ctrl: MapReadPeak_Ext_Ctrl / UniqMapRead ") 415 | writeLines(outtext, con=con, sep="\n") 416 | 417 | outtext <- paste0("\n\n nPeak_Ext_Ctrl: number of peaks (determined by p value threshold of 0.01) ") 418 | writeLines(outtext, con=con, sep="\n") 419 | 420 | outtext <- paste0("\n\n nPeak_Ext_Ctrl(Q<0.05): number of peaks (determined by q value threshold of 0.05) ") 421 | writeLines(outtext, con=con, sep="\n") 422 | 423 | outtext <- paste0("\n\n nPeak_Ext_Ctrl(Q<0.01): number of peaks (determined by q value threshold of 0.01) ") 424 | writeLines(outtext, con=con, sep="\n") 425 | 426 | 427 | # close output summary text file 428 | close(con) 429 | 430 | #=================== 431 | # now read the summary file once more, and plot different statistics 432 | #=================== 433 | FinalDF <- read.table(outtextfile, header=T, sep="\t", stringsAsFactors=F) 434 | 435 | # plot total number of reads for each sample 436 | plotfile <- paste0(OutDir, '/TotalReadCount_Distribution.html') 437 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 3]) 438 | PlotScatter_Data(plotdf, "Total Reads", plotfile) 439 | 440 | # plot fraction of mappable reads for each sample 441 | plotfile <- paste0(OutDir, '/Fraction_MappableReadCount_Distribution.html') 442 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 5]) 443 | PlotScatter_Data(plotdf, "Fraction Mappable Reads", plotfile) 444 | 445 | # plot fraction of mitochondrial reads for each sample 446 | plotfile <- paste0(OutDir, '/Fraction_MitochondrialReadCount_Distribution.html') 447 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 12]) 448 | PlotScatter_Data(plotdf, "Fraction mitochondrial Reads", plotfile) 449 | 450 | # plot fraction of uniquely mapped reads for each sample 451 | plotfile <- paste0(OutDir, '/Fraction_UniqueMappReadCount_Distribution.html') 452 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 14]) 453 | PlotScatter_Data(plotdf, "Fraction unique mapped Reads", plotfile) 454 | 455 | # plot fraction of low quality reads for each sample 456 | plotfile <- paste0(OutDir, '/Fraction_LowQualReadCount_Distribution.html') 457 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 18]) 458 | PlotScatter_Data(plotdf, "Fraction low quality Reads", plotfile) 459 | 460 | # plot fraction of duplicate reads for each sample 461 | plotfile <- paste0(OutDir, '/Fraction_DuplicateReadCount_Distribution.html') 462 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 21]) 463 | PlotScatter_Data(plotdf, "Fraction duplicates", plotfile) 464 | 465 | # plot NRF for each sample 466 | plotfile <- paste0(OutDir, '/NRF_Distribution.html') 467 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 23]) 468 | PlotScatter_Data(plotdf, "NRF", plotfile) 469 | 470 | # plot M1 for each sample 471 | plotfile <- paste0(OutDir, '/M1_Distribution.html') 472 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 24]) 473 | PlotScatter_Data(plotdf, "M1", plotfile) 474 | 475 | # plot M2 for each sample 476 | plotfile <- paste0(OutDir, '/M2_Distribution.html') 477 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 25]) 478 | PlotScatter_Data(plotdf, "M2", plotfile) 479 | 480 | # plot PBC1 for each sample 481 | plotfile 
<- paste0(OutDir, '/PBC1_Distribution.html') 482 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 26]) 483 | PlotScatter_Data(plotdf, "PBC1", plotfile) 484 | 485 | # plot PBC2 for each sample 486 | plotfile <- paste0(OutDir, '/PBC2_Distribution.html') 487 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 27]) 488 | PlotScatter_Data(plotdf, "PBC2", plotfile) 489 | 490 | # plot FRiP for each sample - no control, default MACS2 peaks 491 | # provided the column is not filled with NA 492 | colno <- 29 493 | if ((colno %in% NA_ColList) == FALSE) { 494 | plotfile <- paste0(OutDir, '/FRiP_Def_NoCtrl_Distribution.html') 495 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, colno]) 496 | PlotScatter_Data(plotdf, "FRiP_Def_NoCtrl", plotfile) 497 | } 498 | 499 | # plot number of peaks for each sample - FDR = 0.05 500 | # no control, MACS2 default peaks 501 | # provided the column is not filled with NA 502 | colno <- 31 503 | if ((colno %in% NA_ColList) == FALSE) { 504 | plotfile <- paste0(OutDir, '/NumPeak_Def_NoCtrl_Distribution.html') 505 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, colno]) 506 | PlotScatter_Data(plotdf, "NumPeak_Def_NoCtrl", plotfile) 507 | } 508 | 509 | # plot FRiP for each sample - no control, MACS2 Extsize peaks 510 | # provided the column is not filled with NA 511 | colno <- 34 512 | if ((colno %in% NA_ColList) == FALSE) { 513 | plotfile <- paste0(OutDir, '/FRiP_Ext_NoCtrl_Distribution.html') 514 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, colno]) 515 | PlotScatter_Data(plotdf, "FRiP_Ext_NoCtrl", plotfile) 516 | } 517 | 518 | # plot number of peaks for each sample - FDR = 0.05 519 | # no control, MACS2 Extsize peaks 520 | # provided the column is not filled with NA 521 | colno <- 36 522 | if ((colno %in% NA_ColList) == FALSE) { 523 | plotfile <- paste0(OutDir, '/NumPeak_Ext_NoCtrl_Distribution.html') 524 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, colno]) 525 | PlotScatter_Data(plotdf, "NumPeak_Ext_NoCtrl", plotfile) 526 | } 527 | 528 | # plot FRiP for each sample - with control, MACS2 default peaks 529 | # provided the column is not filled with NA 530 | colno <- 39 531 | if ((colno %in% NA_ColList) == FALSE) { 532 | plotfile <- paste0(OutDir, '/FRiP_Def_Ctrl_Distribution.html') 533 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, colno]) 534 | PlotScatter_Data(plotdf, "FRiP_Def_Ctrl", plotfile) 535 | } 536 | 537 | # plot number of peaks for each sample - FDR = 0.05 538 | # no control, MACS2 Extsize peaks 539 | # provided the column is not filled with NA 540 | colno <- 41 541 | if ((colno %in% NA_ColList) == FALSE) { 542 | plotfile <- paste0(OutDir, '/NumPeak_Def_Ctrl_Distribution.html') 543 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, colno]) 544 | PlotScatter_Data(plotdf, "NumPeak_Def_Ctrl", plotfile) 545 | } 546 | 547 | # plot FRiP for each sample - with control, MACS2 Extsize peaks 548 | # provided the column is not filled with NA 549 | colno <- 44 550 | if ((colno %in% NA_ColList) == FALSE) { 551 | plotfile <- paste0(OutDir, '/FRiP_Ext_Ctrl_Distribution.html') 552 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, colno]) 553 | PlotScatter_Data(plotdf, "FRiP_Ext_Ctrl", plotfile) 554 | } 555 | 556 | # plot number of peaks for each sample - FDR = 0.05 557 | # with control, MACS2 Extsize peaks 558 | # provided the column is not filled with NA 559 | colno <- 46 560 | if ((colno %in% NA_ColList) == FALSE) { 561 | plotfile <- paste0(OutDir, '/NumPeak_Ext_Ctrl_Distribution.html') 562 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, colno]) 563 | 
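# column 46 corresponds to 'nPeak_Ext_Ctrl(Q<0.05)' in the summary table,
# i.e. the FDR 0.05 peak count from --extsize peak calling with control input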
PlotScatter_Data(plotdf, "NumPeak_Ext_Ctrl", plotfile) 564 | } 565 | 566 | -------------------------------------------------------------------------------- /IDR_Codes/IDRAnalysis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #================================= 4 | # this script is used to perform IDR analysis on a given ATAC seq replicates 5 | # it uses the idrcode package provided by Anshul Kundaje et. al. 6 | #================================= 7 | # developed by - Sourya Bhattacharyya 8 | # date: 11th july 2017 9 | # Vijay-AY lab 10 | # La Jolla Institute for Allergy and Immunology 11 | #================================= 12 | 13 | # usage info 14 | usage(){ 15 | cat << EOF 16 | 17 | Options: 18 | 19 | -- required: 20 | -a FILE1 First file containing peak information (in either narrowpeak format or narrowpeak.gz format) 21 | -b FILE2 Second file containing peak information (in either narrowpeak format or narrowpeak.gz format) 22 | -d OutDir Output directory containing the IDR results 23 | -P PathIDRCode Path of the IDRCode package (Kundaje et. al. after its installation) 24 | -n PREFIX Prefix of output file 25 | -c SampledPeakCount Number of peaks which will be sampled from the input peak files (default 25000) 26 | EOF 27 | } 28 | 29 | # # name of the folder containing IDR results 30 | # IDR_OutFold='IDR_Overlap0_PVal' 31 | 32 | # executable (R code) of the batch consistency analysis 33 | # IDRCodeDir='/home/sourya/packages/idrCode/' 34 | exec1='batch-consistency-analysis.r' 35 | 36 | # default values 37 | PREFIX='IDR_ATAC' 38 | 39 | OutDir=`pwd` 40 | 41 | # Number of peaks sampled from the original peak detection output 42 | SampledPeakCount=25000 43 | 44 | while getopts "a:b:d:n:c:P:" opt; 45 | do 46 | case "$opt" in 47 | a) FILE1=$OPTARG;; 48 | b) FILE2=$OPTARG;; 49 | d) OutDir=$OPTARG;; 50 | n) PREFIX=$OPTARG;; 51 | c) SampledPeakCount=$OPTARG;; 52 | P) IDRCodeDir=$OPTARG;; 53 | \?) usage 54 | echo "error: unrecognized option -$OPTARG"; 55 | exit 1 56 | ;; 57 | esac 58 | done 59 | 60 | if [[ -z $FILE1 ]]; then 61 | echo 'User should provide two input peak files (in a bed file or in gzipped bed file) !!' 62 | exit 1 63 | else 64 | echo 'Input peak file 1: '$FILE1 65 | fi 66 | 67 | if [[ -z $FILE2 ]]; then 68 | echo 'User should provide two input peak files (in a bed file or in gzipped bed file) !!' 69 | exit 1 70 | else 71 | echo 'Input peak file 2: '$FILE2 72 | fi 73 | 74 | if [[ -z $IDRCodeDir ]]; then 75 | echo 'User did not provide the path of IDRCode package (Kundaje et. al.) - exit for the moment !!' 76 | exit 1 77 | fi 78 | 79 | echo $OutDir 80 | mkdir -p $OutDir 81 | 82 | #---------------------------------- 83 | # important - sourya 84 | # change the current directory as the dir containing this executable 85 | # since other source files relative to the current directory needs to be called 86 | current_dir=$(pwd) 87 | script_dir=$(dirname $0) 88 | cd $script_dir 89 | #---------------------------------- 90 | 91 | # log the replicates 92 | # if [ ! 
-f $OutDir'/ReplicaNames.txt' ]; then 93 | echo -e "IDR Analysis of the following two peak files: \n File 1: ${FILE1} \n File 2: ${FILE2}" > $OutDir'/ReplicaNames.txt' 94 | # fi 95 | 96 | # check the extension of both input files 97 | filebase1=$(basename "$FILE1") 98 | filebase2=$(basename "$FILE2") 99 | 100 | #=========================================== 101 | # extract first 25K (or the count provided) significant peaks from the given peak files and store them 102 | # this will be actually used for IDR analysis 103 | # such significance is decided by the 8th field (P value) 104 | # as instructed in the ENCODE IDR documentation 105 | 106 | if [[ $filebase1 =~ \.gz$ ]]; then 107 | # first file is a gzipped file 108 | #ConvFile1=${FILE1%.gz}'_first_'$SampledPeakCount'.gz' 109 | ConvFile1=${FILE1%.gz}'_first_'$SampledPeakCount 110 | echo 'Subsampled peak file corresponding to the first input: '$ConvFile1 111 | #if [ ! -f $ConvFile1 ]; then 112 | zcat $FILE1 | sort -k8,8nr > $OutDir'/temp1.txt' 113 | #head -n $SampledPeakCount $OutDir'/temp1.txt' | gzip -c > $ConvFile1 114 | head -n $SampledPeakCount $OutDir'/temp1.txt' > $ConvFile1 115 | rm $OutDir'/temp1.txt' 116 | #fi 117 | else 118 | #ConvFile1=${FILE1}'_first_'$SampledPeakCount'.gz' 119 | ConvFile1=${FILE1}'_first_'$SampledPeakCount 120 | echo 'Subsampled peak file corresponding to the first input: '$ConvFile1 121 | #if [ ! -f $ConvFile1 ]; then 122 | cat $FILE1 | sort -k8,8nr > $OutDir'/temp1.txt' 123 | #head -n $SampledPeakCount $OutDir'/temp1.txt' | gzip -c > $ConvFile1 124 | head -n $SampledPeakCount $OutDir'/temp1.txt' > $ConvFile1 125 | rm $OutDir'/temp1.txt' 126 | #fi 127 | fi 128 | 129 | if [[ $filebase2 =~ \.gz$ ]]; then 130 | # first file is a gzipped file 131 | #ConvFile2=${FILE2%.gz}'_first_'$SampledPeakCount'.gz' 132 | ConvFile2=${FILE2%.gz}'_first_'$SampledPeakCount 133 | echo 'Subsampled peak file corresponding to the second input: '$ConvFile2 134 | #if [ ! -f $ConvFile2 ]; then 135 | zcat $FILE2 | sort -k8,8nr > $OutDir'/temp2.txt' 136 | #head -n $SampledPeakCount $OutDir'/temp2.txt' | gzip -c > $ConvFile2 137 | head -n $SampledPeakCount $OutDir'/temp2.txt' > $ConvFile2 138 | rm $OutDir'/temp2.txt' 139 | #fi 140 | else 141 | #ConvFile2=${FILE2}'_first_'$SampledPeakCount'.gz' 142 | ConvFile2=${FILE2}'_first_'$SampledPeakCount 143 | echo 'Subsampled peak file corresponding to the second input: '$ConvFile2 144 | #if [ ! -f $ConvFile2 ]; then 145 | cat $FILE2 | sort -k8,8nr > $OutDir'/temp2.txt' 146 | #head -n $SampledPeakCount $OutDir'/temp2.txt' | gzip -c > $ConvFile2 147 | head -n $SampledPeakCount $OutDir'/temp2.txt' > $ConvFile2 148 | rm $OutDir'/temp2.txt' 149 | #fi 150 | fi 151 | 152 | #=========================================== 153 | 154 | # we employ p value as the measure for rank determination of the peaks 155 | # We note that only narrow peaks are analyzed for the significance test - so the 6th argument is F (no broadpeak) 156 | # we also note that the criteria of peak overlap is set as 1 bp. 157 | # So the 5th argument is placed as 0 - if it is 0.5, 50% overlap criteria is imposed 158 | 159 | # this output directory also notes the settings used for this IDR 160 | CurrOutDir=$OutDir #'/' #$IDR_OutFold 161 | mkdir -p $CurrOutDir 162 | 163 | # the prefix also contains the output directory where all results will be stored 164 | CurrOutPrefix=$CurrOutDir'/'$PREFIX 165 | 166 | # parameter description is provided in the ENCODE IDR documentation 167 | if [ ! 
-f $CurrOutPrefix'-overlapped-peaks.txt' ]; then 168 | # command for batch consistency analysis 169 | # Note: the input to this program should be uncompressed narrow peak file 170 | 171 | # first unzip the files 172 | #gunzip $ConvFile1 173 | #file1=${ConvFile1%.gz} 174 | #gunzip $ConvFile2 175 | #file2=${ConvFile2%.gz} 176 | 177 | # then call the batch consistency command 178 | cd $IDRCodeDir 179 | #Rscript $exec1 $file1 $file2 -1 $CurrOutPrefix 0 F p.value 180 | Rscript $exec1 $ConvFile1 $ConvFile2 -1 $CurrOutPrefix 0 F p.value 181 | cd - 182 | 183 | # now re-zip the peak files 184 | #gzip $file1 185 | #gzip $file2 186 | 187 | fi 188 | 189 | #====================== 190 | # add - sourya 191 | # here we call a custom R function 192 | # which plots IDR scatter analysis between this pair of samples 193 | 194 | Rscript IDRScatterPlot.r $IDRCodeDir $ConvFile1 $ConvFile2 $IDRCodeDir'/genome_table.txt' $CurrOutPrefix 195 | 196 | #---------------------------------- 197 | # after generating the IDR statistics for this pair of replicates, 198 | # now quantify the similarities 199 | 200 | # number of peaks in the input peak files 201 | # and also the number of overlapped peaks 202 | #npeak1=`zcat $ConvFile1 | wc -l` 203 | #npeak2=`zcat $ConvFile2 | wc -l` 204 | npeak1=`cat $ConvFile1 | wc -l` 205 | npeak2=`cat $ConvFile2 | wc -l` 206 | 207 | Rscript IDRSummary.r $CurrOutPrefix'-overlapped-peaks.txt' $npeak1 $npeak2 208 | 209 | #---------------------------------- 210 | # important - sourya 211 | # now restore the original directory 212 | cd $current_dir 213 | #---------------------------------- 214 | -------------------------------------------------------------------------------- /IDR_Codes/IDRMain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #================================= 4 | # this script encapsulates IDR analysis between different replicates into a single script 5 | # It has input peak files (2 or more) 6 | # The script calls IDRAnalysis.sh for pairwise analysis 7 | #================================= 8 | # developed by - Sourya Bhattacharyya 9 | # Vijay-AY lab 10 | # La Jolla Institute for Allergy and Immunology 11 | #================================= 12 | 13 | # usage info 14 | usage(){ 15 | cat << EOF 16 | 17 | usage: ./IDRMain.sh [-h] [-I peakfile1.narrowpeak] [-I peakfile2.narrowpeak] [-P PathIDRCode] [-d OutDir] [-n PREFIXSTR] 18 | Example: 19 | ./IDRMain.sh -I peak1.narrowPeak -I peak2.narrowPeak -I peak3.narrowPeak -P /home/sourya/packages/idrCode/ -d /home/sourya/OutDir_IDR -n 'IDR_test' 20 | 21 | Options: 22 | 23 | -- required: 24 | -I InpFile A list of input peak files (obtained from MACS2 - in .narrowPeak or .narrowPeak.gz format). 25 | At least two peak files are required. 26 | -P PathIDRCode Path of the IDRCode package (Kundaje et. al. after its installation) 27 | -d OutDir Output directory (absolute path preferred) which will store the IDR results. 28 | -n PREFIX Prefix of output files. Default 'IDR_ATAC'. 
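 Note: the -I option may be repeated; every pairwise combination of the
 supplied peak files is analyzed.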
29 | EOF 30 | } 31 | 32 | # default variables and values 33 | IDR_code='./IDRAnalysis.sh' 34 | PREFIX='IDR_ATAC' 35 | 36 | # code containing the IDR + consistency plot 37 | # dir2='/home/sourya/packages/idrCode/' 38 | exec2='batch-consistency-plot.r' 39 | 40 | # Sourya - Note the processing of input file argument since it can be more than one file 41 | # Note the change of notations 42 | while getopts "I:n:d:P:" opt; 43 | do 44 | case "$opt" in 45 | I) InpFile+=($OPTARG);; 46 | n) PREFIX=$OPTARG;; 47 | d) OutDir=$OPTARG;; 48 | P) dir2=$OPTARG;; 49 | \?) usage 50 | echo "error: unrecognized option -$OPTARG"; 51 | exit 1 52 | ;; 53 | esac 54 | done 55 | 56 | if [[ -z $InpFile ]]; then 57 | echo 'User did not provide any input peak file - exit for the moment !!' 58 | exit 1 59 | fi 60 | 61 | if [[ -z $dir2 ]]; then 62 | echo 'User did not provide the path of IDRCode package (Kundaje et. al.) - exit for the moment !!' 63 | exit 1 64 | fi 65 | 66 | if [[ -z $OutDir ]]; then 67 | echo 'User did not provide output directory for storing the results - exit for the moment !!' 68 | exit 1 69 | fi 70 | 71 | # number of input files provided 72 | nsample=${#InpFile[@]} 73 | echo 'Number of input files : '$nsample 74 | 75 | if [ $nsample -lt 2 ]; then 76 | echo 'User needs to provide at least two peak files for comparison - exit for the moment !!' 77 | exit 1 78 | fi 79 | 80 | # generate the output directory 81 | mkdir -p $OutDir 82 | 83 | #---------------------------------- 84 | # important - sourya 85 | # change the current directory as the dir containing this executable 86 | # since other source files relative to the current directory needs to be called 87 | current_dir=$(pwd) 88 | script_dir=$(dirname $0) 89 | cd $script_dir 90 | #---------------------------------- 91 | 92 | #================================= 93 | # batch replicate analysis 94 | #================================= 95 | # if [ ! 
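# (note) each -I option appends to the bash array InpFile ('InpFile+=($OPTARG)'
# in the getopts loop above); e.g. three -I flags give ${#InpFile[@]} = 3,
# and hence 3 pairwise IDR comparisons below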
# -f $OutDir'/Replicate_Names.txt' ]; then
96 | echo 'Analyzing the following '$nsample' replicates --- ' > $OutDir'/Replicate_Names.txt'
97 | for (( i=0; i<${nsample}; i++ ))
98 | do
99 | echo 'Sample '$i' is : '${InpFile[$i]} >> $OutDir'/Replicate_Names.txt'
100 | done
101 | # fi
102 | 
103 | #===================
104 | # add - sourya
105 | # here we analyze individual peak files (input)
106 | # and accordingly assign the number of common peaks to be considered
107 | PeakStatFile=$OutDir'/Input_Peak_Statistics.txt'
108 | 
109 | echo 'Summarizing the peak count statistics for individual input files: ' > $PeakStatFile
110 | 
111 | # first get the minimum no of peaks across all the samples
112 | for (( i=0; i<${nsample}; i++ ))
113 | do
114 | peakfile=${InpFile[$i]}
115 | pc=`cat $peakfile | wc -l`
116 | echo "Analyzing the peak file: $peakfile " >> $PeakStatFile
117 | echo "Peak count: $pc " >> $PeakStatFile
118 | if [ $i == 0 ]; then
119 | minpc=$pc
120 | else
121 | if [ $pc -lt $minpc ]; then
122 | minpc=$pc
123 | fi
124 | fi
125 | done
126 | 
127 | # assign the minimum number of peaks for consideration
128 | if [[ $minpc -gt 200000 ]]; then
129 | CountPeak=150000
130 | elif [[ $minpc -gt 150000 ]]; then
131 | CountPeak=100000
132 | elif [[ $minpc -gt 100000 ]]; then
133 | CountPeak=75000
134 | elif [[ $minpc -gt 75000 ]]; then
135 | CountPeak=50000
136 | else
137 | CountPeak=25000
138 | fi
139 | 
140 | echo "Value of CountPeak (number of common peaks to be analyzed for all replicates): $CountPeak " >> $PeakStatFile
141 | #===================
142 | 
143 | # loop for pairwise execution of samples
144 | for (( i=0; i<${nsample}-1; i++ ))
145 | do
146 | for (( j=$i+1; j<${nsample}; j++ ))
147 | do
148 | # pair of samples
149 | sample1=${InpFile[$i]}
150 | sample2=${InpFile[$j]}
151 | # execute the sample pairs
152 | # Note the output directory name - it is the sample directory plus the pairwise comparison
153 | $IDR_code -a $sample1 -b $sample2 -P $dir2 -d $OutDir'/'$i'_and_'$j -n $PREFIX -c $CountPeak
154 | done
155 | done
156 | 
157 | #=================================
158 | # batch consistency plots
159 | #=================================
160 | if [ !
-f $OutDir'/IDR_Batch_Plot-plot.pdf' ]; then 161 | 162 | # the pattern of input prefix present in every replicate 163 | #inppfx=$IDR_OutFold'/'$PREFIX 164 | inppfx=$PREFIX 165 | 166 | # no of pairs of samples 167 | x=$nsample 168 | y=`expr $nsample - 1` 169 | z=`expr $x \* $y` 170 | npairs=`expr $z / 2` 171 | echo 'npairs: '$npairs 172 | 173 | # output command for IDR plot 174 | cmd='Rscript '$exec2' '$npairs' '$OutDir'/IDR_Batch_Plot' 175 | for (( i=0; i<${nsample}-1; i++ )) 176 | do 177 | for (( j=$i+1; j<${nsample}; j++ )) 178 | do 179 | cmd=$cmd' '$OutDir'/'$i'_and_'$j'/'$inppfx 180 | done 181 | done 182 | echo 'cmd: '$cmd 183 | 184 | # execute the command 185 | # first go to the directory containing the R code of the IDR 186 | cd $dir2 187 | $cmd 188 | cd - 189 | 190 | # now convert the generated postscript plot file to a pdf file 191 | ps2pdf $OutDir'/IDR_Batch_Plot-plot.ps' $OutDir'/IDR_Batch_Plot-plot.pdf' 192 | 193 | fi 194 | 195 | #---------------------------------- 196 | # important - sourya 197 | # now restore the original directory 198 | cd $current_dir 199 | #---------------------------------- 200 | 201 | 202 | -------------------------------------------------------------------------------- /IDR_Codes/IDRScatterPlot.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | #=========================================================== 4 | # R script for scatter plot between a pair of peak files 5 | 6 | #Author: Sourya Bhattacharyya 7 | #Vijay-Ay lab, LJI 8 | 9 | # usage: Rscript result_summary.r $inpfile 10 | #=========================================================== 11 | 12 | args <- commandArgs(TRUE) 13 | 14 | # directory containing IDR code package 15 | IDRCodeDir <- args[1] #"/home/sourya/packages/idrCode/" 16 | 17 | # the pair of peak outputs for comparison 18 | peakfile1 <- args[2] 19 | peakfile2 <- args[3] 20 | # genome table.txt file provided in the IDR code package 21 | genometablefile <- args[4] 22 | # output prefix including the directory path 23 | # and the prefix string of the output plot file names 24 | curroutprefix <- args[5] 25 | 26 | # system path includes the path of IDR code 27 | source(paste0(IDRCodeDir, "functions-all-clayton-12-13.r")) 28 | 29 | chr.size <- read.table(genometablefile) 30 | 31 | half.width <- NULL 32 | overlap.ratio <- 0 33 | is.broadpeak <- F 34 | sig.value <- "p.value" 35 | 36 | # width and height values employed in these plots 37 | plotwidth <- 8 38 | plotheight <- 6 39 | 40 | rep1 <- process.narrowpeak(paste(peakfile1, sep=""), chr.size, 41 | half.width=half.width, summit="offset", broadpeak=is.broadpeak) 42 | 43 | rep2 <- process.narrowpeak(paste(peakfile2, sep=""), chr.size, 44 | half.width=half.width, summit="offset", broadpeak=is.broadpeak) 45 | 46 | uri.output <- compute.pair.uri(rep1$data.cleaned, rep2$data.cleaned, 47 | sig.value1=sig.value, sig.value2=sig.value, overlap.ratio=overlap.ratio) 48 | 49 | em.output <- fit.em(uri.output$data12.enrich, fix.rho2=T) 50 | idr.local <- 1-em.output$em.fit$e.z 51 | IDR <- c() 52 | o <- order(idr.local) 53 | IDR[o] <- cumsum(idr.local[o])/c(1:length(o)) 54 | 55 | idr_output <- data.frame(chr1=em.output$data.pruned$sample1[, "chr"], start1=em.output$data.pruned$sample1[, "start.ori"], stop1=em.output$data.pruned$sample1[, "stop.ori"], sig.value1=em.output$data.pruned$sample1[, "sig.value"], chr2=em.output$data.pruned$sample2[, "chr"], start2=em.output$data.pruned$sample2[, "start.ori"], stop2=em.output$data.pruned$sample2[, 
"stop.ori"], sig.value2=em.output$data.pruned$sample2[, "sig.value"], idr.local=1-em.output$em.fit$e.z, IDR=IDR) 56 | 57 | # this idr_output is already placed in the file "idr_overlapped_peaks.txt" 58 | 59 | filtered_peaks <- idr_output[idr_output[,10]<=0.01,] 60 | dim(filtered_peaks) # get the number of peaks 61 | 62 | ez.list <- get.ez.tt.all(em.output, uri.output$data12.enrich$merge1, uri.output$data12.enrich$merge2) 63 | 64 | par(mar=c(5,5,0,0.5), mfrow = c(1,3), oma=c(5,0,2,0)) 65 | 66 | idr_output$col[idr_output[,10]<=0.01]="black" 67 | 68 | idr_output$col[idr_output[,10]>=0.01]="red" 69 | 70 | # first graph 71 | pdf(paste0(curroutprefix,'_Signal_Replicates.pdf'), width=plotwidth, height=plotheight) 72 | plot(log(idr_output[,4]),log(idr_output[,8]),col=idr_output[,11], pch=19, cex = 0.05, xlab="log(signal) Rep1", ylab="log(signal) Rep2") 73 | legend("topleft", c("IDR=>0.01","IDR<=0.01"), col=c("red","black"), pch=19, bty="n", lty=c(1,1), lwd=c(2,2)) 74 | dev.off() 75 | 76 | # second graph 77 | pdf(paste0(curroutprefix,'_Peak_Rank_Replicates.pdf'), width=plotwidth, height=plotheight) 78 | plot(rank(-idr_output[,4]),rank(-idr_output[,8]),col=idr_output[,11], pch=19, cex = 0.05, xlab="Peak rank Rep1", ylab="Peak rank Rep2") 79 | legend("topleft", c("IDR=>0.01","IDR<=0.01"), col=c("red","black"), pch=19, bty="n", lty=c(1,1), lwd=c(1,1)) 80 | dev.off() 81 | 82 | # third graph 83 | pdf(paste0(curroutprefix,'_SignificantPeaks_vs_IDR.pdf'), width=plotwidth, height=plotheight) 84 | plot(ez.list$IDR, ylab="IDR", xlab="num of significant peaks") 85 | dev.off() 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /IDR_Codes/IDRSummary.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | #=========================================================== 4 | # R script for summarizing the results of IDR analysis between different sample replicates 5 | 6 | #Author: Sourya Bhattacharyya 7 | #Vijay-Ay lab, LJI 8 | 9 | # usage: Rscript result_summary.r $inpfile 10 | #=========================================================== 11 | 12 | args <- commandArgs(TRUE) 13 | 14 | # file containing the overlapped peak information 15 | CommonPeakFile <- args[1] 16 | inpdir <- dirname(CommonPeakFile) 17 | 18 | npeak1 <- as.integer(args[2]) 19 | npeak2 <- as.integer(args[3]) 20 | 21 | # print(sprintf("\n CommonPeakFile: %s ", CommonPeakFile)) 22 | # print(sprintf("\n npeak1: %s ", npeak1)) 23 | # print(sprintf("\n npeak2: %s ", npeak2)) 24 | 25 | # information of the common peak 26 | # Note: the file contains a header line 27 | CommonPeakInfo <- read.table(CommonPeakFile, header=TRUE) 28 | 29 | # number of overlapped peaks (considering all the IDR values) 30 | ncommonpeak <- length(CommonPeakInfo[,1]) 31 | fracpeak1 <- (ncommonpeak * 1.0) / npeak1 32 | fracpeak2 <- (ncommonpeak * 1.0) / npeak2 33 | 34 | # print(sprintf("\n ncommonpeak: %s ", ncommonpeak)) 35 | # print(sprintf("\n fracpeak1: %s ", fracpeak1)) 36 | # print(sprintf("\n fracpeak2: %s ", fracpeak2)) 37 | 38 | # find the rows where IDR is lower than a specified threshold 39 | # we employ three different thresholds: 40 | # 1) 0.01, 2) 0.05, and 3) 0.1 41 | # Note: Threshold of 0.01 (newly added) is recommended in the ENCODE 42 | 43 | NumIDRPass0 <- length(which(CommonPeakInfo[,10] <= 0.01)) 44 | FracIDRPass0 <- (NumIDRPass0 * 1.0) / ncommonpeak 45 | NumIDRPass1 <- length(which(CommonPeakInfo[,10] <= 0.05)) 46 | FracIDRPass1 <- 
(NumIDRPass1 * 1.0) / ncommonpeak 47 | NumIDRPass2 <- length(which(CommonPeakInfo[,10] <= 0.1)) 48 | FracIDRPass2 <- (NumIDRPass2 * 1.0) / ncommonpeak 49 | 50 | # print(sprintf("\n NumIDRPass0: %s ", NumIDRPass0)) 51 | # print(sprintf("\n FracIDRPass0: %s ", FracIDRPass0)) 52 | # print(sprintf("\n NumIDRPass1: %s ", NumIDRPass1)) 53 | # print(sprintf("\n FracIDRPass1: %s ", FracIDRPass1)) 54 | # print(sprintf("\n NumIDRPass2: %s ", NumIDRPass2)) 55 | # print(sprintf("\n FracIDRPass2: %s ", FracIDRPass2)) 56 | 57 | # divide the input overlapped peak files into two different structures 58 | # corresponding to the peak information of two different inputs 59 | # the seq() function also includes the row number for every interaction 60 | # this row number serves as the id of peaks 61 | PeakInfoInput1 <- cbind(seq(1:ncommonpeak), CommonPeakInfo[,1:4]) 62 | PeakInfoInput2 <- cbind(seq(1:ncommonpeak), CommonPeakInfo[,5:8]) 63 | 64 | # sort the data according to the significance value (last column of both the data) 65 | # decreasing order is employed 66 | PeakInfoInput1_Sort <- PeakInfoInput1[ order(-PeakInfoInput1[,5]),] 67 | PeakInfoInput2_Sort <- PeakInfoInput2[ order(-PeakInfoInput2[,5]),] 68 | 69 | # we check the cumulative percent of samples in both peak sets 70 | # and find out the overlap of peaks 71 | fraction_overlap <- c() 72 | 73 | for (x in seq(0, 1, 0.1)) { 74 | if ((x != 0) && (x != 1)) { 75 | # number of elements of both peak lists 76 | nsample <- as.integer(ncommonpeak * x) 77 | # common elements in both peak lists 78 | # the common factor is the first column: peak id 79 | OverlapSet <- PeakInfoInput1_Sort[1:nsample, 1] %in% PeakInfoInput2_Sort[1:nsample, 1] 80 | ncommon <- length(OverlapSet[OverlapSet==TRUE]) 81 | frac_common <- (ncommon * 1.0 / nsample) 82 | fraction_overlap <- c(fraction_overlap, frac_common) 83 | 84 | # we also note down two different fraction overlap statistics 85 | # corresponding to 10\%, 20% and 50% strongest peaks 86 | if (x == 0.1) { 87 | frac_overlap_10Pct = frac_common 88 | } 89 | if (x == 0.2) { 90 | frac_overlap_20Pct = frac_common 91 | } 92 | if (x == 0.5) { 93 | frac_overlap_50Pct = frac_common 94 | } 95 | 96 | # print(sprintf("\n Percentile value: %s ", x)) 97 | # print(sprintf("\n nsample: %s ", nsample)) 98 | # print(sprintf("\n ncommon: %s ", ncommon)) 99 | # print(sprintf("\n frac_common: %s ", frac_common)) 100 | } 101 | } 102 | 103 | # print(sprintf("\n Mean of fraction overlap: %s ", mean(fraction_overlap))) 104 | 105 | # # we check the percent of samples in both peak sets 106 | # # and find out the overlap of peaks 107 | # # for individual 10% bins 108 | 109 | # nbins <- 5 #10 110 | # fraction_overlap2 <- c() 111 | # sampleperbin <- as.integer(ncommonpeak / nbins) 112 | 113 | # for (b in (1:nbins)) { 114 | # if (b == 1) { 115 | # si <- 1 116 | # ei <- si + sampleperbin - 1 117 | # } else { 118 | # si <- ei + 1 119 | # if (b == nbins) { 120 | # ei <- ncommonpeak 121 | # } else { 122 | # ei <- si + sampleperbin - 1 123 | # } 124 | # } 125 | # OverlapSet <- PeakInfoInput1_Sort[si:ei, 1] %in% PeakInfoInput2_Sort[si:ei, 1] 126 | # ncommon <- length(OverlapSet[OverlapSet==TRUE]) 127 | # frac_common <- (ncommon * 1.0 / (ei-si+1)) 128 | # fraction_overlap2 <- c(fraction_overlap2, frac_common) 129 | # print(sprintf("\n si: %s ", si)) 130 | # print(sprintf("\n ei: %s ", ei)) 131 | # print(sprintf("\n ncommon: %s ", ncommon)) 132 | # print(sprintf("\n frac_common: %s ", frac_common)) 133 | # } 134 | 135 | # print(sprintf("\n Mean of fraction overlap2: 
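# (sketch - assuming the Stat.tab layout written below) the summary record can
# be read back for downstream aggregation with, e.g.:
#   stat <- read.table(file.path(inpdir, 'Stat.tab'), header=TRUE, sep='\t', check.names=FALSE)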
%s ", mean(fraction_overlap2))) 136 | 137 | 138 | # write the results in a text file 139 | OutFilename <- paste0(inpdir, '/Stat.tab') 140 | 141 | fp <- file(OutFilename, open="w") 142 | write(paste0('NPeak1', '\t', 'NPeak2', '\t', 'CommonPeak', '\t', 'FracPeak1', '\t', 'FracPeak2', '\t', 'IDR_0.01_Peak', '\t', 'Frac_IDR_0.01_Peak', '\t', 'IDR_0.05_Peak', '\t', 'Frac_IDR_0.05_Peak', '\t', 'IDR_0.1_Peak', '\t', 'Frac_IDR_0.1_Peak', '\t', 'MeanOverlap', '\t', 'Overlap10', '\t', 'Overlap20', '\t', 'Overlap50'), file=fp, append=T) 143 | write(paste(npeak1, '\t', npeak2, '\t', ncommonpeak, '\t', fracpeak1, '\t', fracpeak2, '\t', NumIDRPass0, '\t', FracIDRPass0, '\t', NumIDRPass1, '\t', FracIDRPass1, '\t', NumIDRPass2, '\t', FracIDRPass2, '\t', mean(fraction_overlap), '\t', frac_overlap_10Pct, '\t', frac_overlap_20Pct, '\t', frac_overlap_50Pct), file=fp, append=T) 144 | close(fp) 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /IDR_Codes/IDR_SubSampleBAM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #================================= 4 | # this script encapsulates IDR analysis between two replicates into a single script 5 | # First, the input BAM files are analyzed to check their read similarity 6 | # and the BAM file with higher read is subsampled 7 | # then these modified BAM files are used for peak calling and subsequent IDR analysis 8 | #================================= 9 | # developed by - Sourya Bhattacharyya 10 | # Vijay-AY lab 11 | # La Jolla Institute for Allergy and Immunology 12 | #================================= 13 | 14 | # usage info 15 | usage(){ 16 | cat << EOF 17 | 18 | usage: ./IDR_SubSampleBAM.sh [-h] [-A BamFile1] [-B BamFile2] [-d OutDir] [-n 'IDR_test'] [-P PathIDRCode] [-c 25000] [-C control.bam] 19 | Example: 20 | 21 | Options: 22 | 23 | -- required: 24 | -A BamFile1 First BAM file 25 | -B BamFile2 Second BAM file 26 | -d OutDir Output directory (absolute path preferred) which will store the IDR results. 27 | -P PathIDRCode Path of the IDRCode package (Kundaje et. al. after its installation) 28 | -n PREFIX Prefix of output files. Default 'IDR_ATAC'. 29 | -c CountPeak No of peaks in both replicates that will be compared. Default 25000. 30 | -C CONTROLBAM Control file (in eiher .BAM or tagalign file in .gz format) 31 | EOF 32 | } 33 | 34 | # default values of peaks that need to be retained 35 | CountPeak=25000 36 | 37 | # executable containing the tag align shift code 38 | TagAlignExec='../bin/TagAlign.sh' 39 | 40 | # default control bam file 41 | CONTROLBAM="" 42 | 43 | # IDR analysis code using a pair of peak files 44 | IDR_code='./IDRAnalysis.sh' 45 | 46 | # default prefix string 47 | PREFIX='IDR_ATAC' 48 | 49 | # executable of sambamba 50 | # for subsampling of the bam file, samtools has a bug 51 | # so using this package 52 | sambamba_exec=`which sambamba` 53 | 54 | # Sourya - Note the processing of input file argument since it can be more than one file 55 | # Note the change of notations 56 | while getopts "A:B:n:d:c:C:P:" opt; 57 | do 58 | case "$opt" in 59 | A) BamFile1=$OPTARG;; 60 | B) BamFile2=$OPTARG;; 61 | n) PREFIX=$OPTARG;; 62 | d) OutDir=$OPTARG;; 63 | c) CountPeak=$OPTARG;; 64 | C) CONTROLBAM=$OPTARG;; 65 | P) IDRCodeDir=$OPTARG;; 66 | \?) 
67 |             echo "error: unrecognized option -$OPTARG";
68 |             exit 1
69 |             ;;
70 |     esac
71 | done
72 | 
73 | if [[ -z $BamFile1 ]]; then
74 |     echo 'User did not provide the first BAM file - exit for the moment !!'
75 |     exit 1
76 | fi
77 | 
78 | if [[ -z $BamFile2 ]]; then
79 |     echo 'User did not provide the second BAM file - exit for the moment !!'
80 |     exit 1
81 | fi
82 | 
83 | if [[ -z $IDRCodeDir ]]; then
84 |     echo 'User did not provide the path of the IDRCode package (Kundaje et al.) - exit for the moment !!'
85 |     exit 1
86 | fi
87 | 
88 | if [[ -z $OutDir ]]; then
89 |     echo 'User did not provide an output directory for storing the results - exit for the moment !!'
90 |     exit 1
91 | fi
92 | 
93 | # create the output directory
94 | mkdir -p $OutDir
95 | 
96 | #----------------------------------
97 | # important - sourya
98 | # change the current directory to the dir containing this executable
99 | # since other source files relative to the current directory need to be called
100 | current_dir=$(pwd)
101 | script_dir=$(dirname $0)
102 | cd $script_dir
103 | #----------------------------------
104 | 
105 | # count of reads for the two BAM files
106 | readcount1=`samtools view $BamFile1 | wc -l`
107 | readcount2=`samtools view $BamFile2 | wc -l`
108 | 
109 | TagAlignFile1=$OutDir'/temp_1_tagalign.gz'
110 | TagAlignFile2=$OutDir'/temp_2_tagalign.gz'
111 | 
112 | if [[ $readcount1 -gt $readcount2 ]]; then
113 |     # the first BAM file needs to be subsampled, followed by its conversion to TAG Align format
114 |     if [ ! -f $OutDir'/temp_1.bam' ]; then
115 |         # fraction of subsampling
116 |         # Note: we did not use the expr operator - simple expr does not work for float
117 |         f=$(echo "scale=2;$readcount2/$readcount1" | bc)
118 |         # we use sambamba for the subsampling
119 |         # and use 8 threads for faster operation
120 |         $sambamba_exec view -h -t 8 -s $f -f bam $BamFile1 -o $OutDir'/temp_1.bam'
121 |     fi
122 |     # conversion to the TAG Align format
123 |     if [ ! -f $TagAlignFile1 ]; then
124 |         $TagAlignExec -I $OutDir'/temp_1.bam' -N 0 -O $TagAlignFile1
125 |     fi
126 |     if [ ! -f $TagAlignFile2 ]; then
127 |         $TagAlignExec -I $BamFile2 -N 0 -O $TagAlignFile2
128 |     fi
129 | else
130 |     # the second BAM file needs to be subsampled, followed by its conversion to TAG Align format
131 |     if [ ! -f $OutDir'/temp_2.bam' ]; then
132 |         # fraction of subsampling
133 |         # Note: we did not use the expr operator - simple expr does not work for float
134 |         f=$(echo "scale=2;$readcount1/$readcount2" | bc)
135 |         # we use sambamba for the subsampling
136 |         # and use 8 threads for faster operation
137 |         $sambamba_exec view -h -t 8 -s $f -f bam $BamFile2 -o $OutDir'/temp_2.bam'
138 |     fi
139 |     if [ ! -f $TagAlignFile2 ]; then
140 |         $TagAlignExec -I $OutDir'/temp_2.bam' -N 0 -O $TagAlignFile2
141 |     fi
142 |     if [ ! -f $TagAlignFile1 ]; then
143 |         $TagAlignExec -I $BamFile1 -N 0 -O $TagAlignFile1
144 |     fi
145 | fi
146 | 
147 | #==============================================
148 | # calling MACS2 using the generated tag align files
149 | #==============================================
150 | 
151 | # first we have to fix the output folders containing the MACS2 output for both the samples
152 | # the output folder name is like MACS2_0/1_C
153 | # (where 0/1 indicates first or second sample)
154 | # _C is optional and included only when a control bam file is provided as input
155 | 
156 | MACS2_outdir1=$OutDir'/MACS2_0'
157 | MACS2_outdir2=$OutDir'/MACS2_1'
158 | if [[ ! -z $CONTROLBAM ]]; then
159 |     MACS2_outdir1=$MACS2_outdir1'_C'
160 |     MACS2_outdir2=$MACS2_outdir2'_C'
161 | fi
162 | MACS2_outdir1=$MACS2_outdir1'/'
163 | MACS2_outdir2=$MACS2_outdir2'/'
164 | mkdir -p $MACS2_outdir1
165 | mkdir -p $MACS2_outdir2
166 | 
167 | 
168 | # first file - MACS2
169 | MACS2PeakOutFile1=$MACS2_outdir1$PREFIX'.macs2_peaks.narrowPeak'
170 | if [ ! -f $MACS2PeakOutFile1 ]; then
171 |     MACS2_cmd='macs2 callpeak -t '$TagAlignFile1' -f BED -n '$PREFIX'.macs2 --nomodel --nolambda --shift -100 --extsize 200 --outdir '$MACS2_outdir1
172 |     if [[ ! -z $CONTROLBAM ]]; then
173 |         # include the control file also
174 |         MACS2_cmd=$MACS2_cmd' -c '$CONTROLBAM
175 |     fi
176 |     # execute the command
177 |     $MACS2_cmd
178 | fi
179 | 
180 | # second file - MACS2
181 | MACS2PeakOutFile2=$MACS2_outdir2$PREFIX'.macs2_peaks.narrowPeak'
182 | if [ ! -f $MACS2PeakOutFile2 ]; then
183 |     MACS2_cmd='macs2 callpeak -t '$TagAlignFile2' -f BED -n '$PREFIX'.macs2 --nomodel --nolambda --shift -100 --extsize 200 --outdir '$MACS2_outdir2
184 |     if [[ ! -z $CONTROLBAM ]]; then
185 |         # include the control file also
186 |         MACS2_cmd=$MACS2_cmd' -c '$CONTROLBAM
187 |     fi
188 |     # execute the command
189 |     $MACS2_cmd
190 | fi
191 | 
192 | #====================================
193 | # now call the IDR analysis using the generated peak files
194 | #====================================
195 | # we have to fix the output directory where the results of IDR will be stored
196 | # depending on the presence of control parameters
197 | # the folders will vary
198 | # the folders have the following format: C(0/1) depending on the input options
199 | 
200 | IDR_OutDir=$OutDir'/'
201 | if [[ ! -z $CONTROLBAM ]]; then
202 |     IDR_OutDir=$IDR_OutDir'C1'
203 | else
204 |     IDR_OutDir=$IDR_OutDir'C0'
205 | fi
206 | IDR_OutDir=$IDR_OutDir'_Peak'$CountPeak'/'
207 | mkdir -p $IDR_OutDir
208 | 
209 | $IDR_code -a $MACS2PeakOutFile1 -b $MACS2PeakOutFile2 -P $IDRCodeDir -d $IDR_OutDir -n $PREFIX -c $CountPeak
210 | 
211 | #----------------------------------
212 | # important - sourya
213 | # now restore the original directory
214 | cd $current_dir
215 | #----------------------------------
216 | 
--------------------------------------------------------------------------------
/IDR_Codes/IDR_SubSampleBAM_Main.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #=================================
4 | # this script encapsulates IDR analysis between different replicates into a single script
5 | # provided that the inputs are in BAM format
6 | # i.e., the BAM files are first subsampled, then peaks are called, and finally IDR is performed
7 | #=================================
8 | # developed by - Sourya Bhattacharyya
9 | # Vijay-AY lab
10 | # La Jolla Institute for Allergy and Immunology
11 | #=================================
12 | 
13 | # usage info
14 | usage(){
15 | cat << EOF
16 | 
17 | usage: ./IDR_SubSampleBAM_Main.sh [-h] [-I inpfile1.bam] [-I inpfile2.bam] [-d OutDir] [-P PathIDRCode] [-n 'IDR_test'] [-c 25000] [-C control.bam]
18 | 
19 | Options:
20 | 
21 | -- required:
22 |     -I  InpFile         A list of input bam files. At least two bam files are required.
23 |     -d  OutDir          Output directory (absolute path preferred) which will store the IDR results.
24 |     -P  PathIDRCode     Path of the IDRCode package (Kundaje et al., after its installation)
25 |     -n  PREFIX          Prefix of output files. Default 'IDR_ATAC'.
26 |     -c  CountPeak       No of peaks in both replicates that will be compared. Default 25000.
27 |     -C  CONTROLBAM      Control file (either a .BAM file or a tagalign file in .gz format)
28 | EOF
29 | }
30 | 
31 | # default values of peaks that need to be retained
32 | CountPeak=25000
33 | 
34 | # code containing the IDR + consistency plot
35 | # dir2='/home/sourya/packages/idrCode/'
36 | exec2='batch-consistency-plot.r'
37 | 
38 | # default control bam file
39 | CONTROLBAM=""
40 | 
41 | # default prefix string
42 | PREFIX='IDR_ATAC'
43 | 
44 | # Sourya - note the processing of the input file argument, since there can be more than one file
45 | # note the change of notations
46 | while getopts "I:n:d:c:C:P:" opt;
47 | do
48 |     case "$opt" in
49 |         I) InpFile+=($OPTARG);;
50 |         n) PREFIX=$OPTARG;;
51 |         d) OutDir=$OPTARG;;
52 |         c) CountPeak=$OPTARG;;
53 |         C) CONTROLBAM=$OPTARG;;
54 |         P) dir2=$OPTARG;;
55 |         \?) usage
56 |             echo "error: unrecognized option -$OPTARG";
57 |             exit 1
58 |             ;;
59 |     esac
60 | done
61 | 
62 | if [[ -z $InpFile ]]; then
63 |     echo 'User did not provide any input BAM file - exit for the moment !!'
64 |     exit 1
65 | fi
66 | 
67 | if [[ -z $dir2 ]]; then
68 |     echo 'User did not provide the path of the IDRCode package (Kundaje et al.) - exit for the moment !!'
69 |     exit 1
70 | fi
71 | 
72 | if [[ -z $OutDir ]]; then
73 |     echo 'User did not provide an output directory for storing the results - exit for the moment !!'
74 |     exit 1
75 | fi
76 | 
77 | # number of input files provided
78 | nsample=${#InpFile[@]}
79 | echo 'Number of input files : '$nsample
80 | 
81 | if [ $nsample -lt 2 ]; then
82 |     echo 'User needs to provide at least two peak files for comparison - exit for the moment !!'
83 |     exit 1
84 | fi
85 | 
86 | # generate the output directory
87 | mkdir -p $OutDir
88 | 
89 | #----------------------------------
90 | # important - sourya
91 | # change the current directory to the dir containing this executable
92 | # since other source files relative to the current directory need to be called
93 | current_dir=$(pwd)
94 | script_dir=$(dirname $0)
95 | cd $script_dir
96 | #----------------------------------
97 | 
98 | #=================================
99 | # batch replicate analysis
100 | #=================================
101 | if [ ! -f $OutDir'/Replicate_Names.txt' ]; then
102 |     echo 'Analyzing the '$nsample' Number of replicates --- ' > $OutDir'/Replicate_Names.txt'
103 |     for (( i=0; i<${nsample}; i++ ))
104 |     do
105 |         echo 'Sample '$i' is : '${InpFile[$i]} >> $OutDir'/Replicate_Names.txt'
106 |     done
107 | fi
108 | 
109 | # loop for pairwise execution of samples
110 | for (( i=0; i<${nsample}-1; i++ ))
111 | do
112 |     for (( j=$i+1; j<${nsample}; j++ ))
113 |     do
114 |         # pair of samples
115 |         sample1=${InpFile[$i]}
116 |         sample2=${InpFile[$j]}
117 |         # execute the sample pairs
118 |         # Note the output directory name
119 |         if [[ ! -z $CONTROLBAM ]]; then
120 |             ./IDR_SubSampleBAM.sh -A $sample1 -B $sample2 -d $OutDir'/'$i'_and_'$j -P $dir2 -n $PREFIX -c $CountPeak -C $CONTROLBAM
121 |         else
122 |             ./IDR_SubSampleBAM.sh -A $sample1 -B $sample2 -d $OutDir'/'$i'_and_'$j -P $dir2 -n $PREFIX -c $CountPeak
123 |         fi
124 |     done
125 | done
126 | 
127 | #=================================
128 | # batch consistency plots
129 | #=================================
130 | 
131 | # the pattern of input prefix present in every replicate
132 | # depends on the control sample and tagmentation option
133 | 
134 | if [[ ! -z $CONTROLBAM ]]; then
135 |     inppfx='C1'
136 | else
137 |     inppfx='C0'
138 | fi
139 | inppfx=$inppfx'_Peak'$CountPeak'/'$PREFIX
140 | 
141 | # basic plotting file name format
142 | # without the extension '-plot.pdf'
143 | plotfilename='IDR_Batch_Plot'
144 | if [[ ! -z $CONTROLBAM ]]; then
145 |     plotfilename=$plotfilename'_C1'
146 | else
147 |     plotfilename=$plotfilename'_C0'
148 | fi
149 | 
150 | #if [ ! -f $OutDir'/'$plotfilename'-plot.pdf' ]; then
151 | 
152 | # no of pairs of samples
153 | x=$nsample
154 | y=`expr $nsample - 1`
155 | z=`expr $x \* $y`
156 | npairs=`expr $z / 2`
157 | echo 'npairs: '$npairs
158 | 
159 | # output command for IDR plot
160 | cmd='Rscript '$exec2' '$npairs' '$OutDir'/'$plotfilename
161 | for (( i=0; i<${nsample}-1; i++ ))
162 | do
163 |     for (( j=$i+1; j<${nsample}; j++ ))
164 |     do
165 |         cmd=$cmd' '$OutDir'/'$i'_and_'$j'/'$inppfx
166 |     done
167 | done
168 | echo 'cmd: '$cmd
169 | 
170 | # execute the command
171 | # first go to the directory containing the R code of the IDR
172 | cd $dir2
173 | $cmd
174 | cd -
175 | 
176 | # now convert the generated postscript plot file to a pdf file
177 | ps2pdf $OutDir'/'$plotfilename'-plot.ps' $OutDir'/'$plotfilename'-plot.pdf'
178 | 
179 | #fi
180 | 
181 | #----------------------------------
182 | # important - sourya
183 | # now restore the original directory
184 | cd $current_dir
185 | #----------------------------------
186 | 
--------------------------------------------------------------------------------
/Imp_Scripts/Footprint_HINT_ATAC.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | 
3 | #==================================
4 | # footprinting using the HINT-ATAC package
5 | # http://www.regulatory-genomics.org/hint/introduction/
6 | 
7 | # author: Sourya Bhattacharyya
8 | # Vijay-AY lab
9 | 
10 | # check
11 | # http://www.regulatory-genomics.org/hint/tutorial/
12 | #==================================
13 | 
14 | library(optparse)
15 | 
16 | #===========================================================
17 | option_list = list(
18 | 
19 |     make_option(c("--AllRead"), type="character", default=NULL, help="Alignment file containing all reads."),
20 |     make_option(c("--NFRRead"), type="character", default=NULL, help="Alignment file containing nucleosome free regions (NFR) reads."),
21 |     make_option(c("--NFRANDNuclRead"), type="character", default=NULL, help="Alignment file containing nucleosome free regions (NFR) plus all nucleosome (1M, 2M, 3M) merged reads."),
22 |     make_option(c("--RefGenome"), type="character", default=NULL, help="Reference genome name."),
23 |     make_option(c("--OutDir"), type="character", default=NULL, help="Output directory to contain the motif."),
24 |     make_option(c("--PE"), type="integer", action="store", default=0, help="If 1, indicates paired end input data. Default = 0"),
25 |     make_option(c("--FP"), type="integer", action="store", default=1, help="Footprinting option. Value can be 1 (default), 2, or 3. (1): footprint using the nucleosome free reads (NFR) will be computed. Default setting. Best for the default ATAC-seq protocol (check Li et al. Genome Biology 2019). (2): footprint using the nucleosome free reads (NFR) and also the nucleosome containing reads (NFR + 1N + 2N + 3N ...) will be computed (two different footprint outputs - time consuming). Best for the Omni-ATAC protocol (check Li et al. Genome Biology 2019). 
(3): footprint using NFR, NFR with nucleosome reads, and all reads will be computed (three different footprint outputs - highly time consuming). Default = 1"),
26 |     make_option(c("--MotifPeak"), type="character", default=NULL, help="Peak or summit file which was used by HOMER to generate corresponding motifs. Mandatory parameter.")
27 | );
28 | 
29 | opt_parser = OptionParser(option_list=option_list);
30 | opt = parse_args(opt_parser);
31 | 
32 | # create the output directory
33 | system(paste("mkdir -p", opt$OutDir))
34 | 
35 | # prefix string of output file name
36 | OUTPREFIX <- 'footprints_HINT_ATAC'
37 | 
38 | 
39 | ##===========
40 | ## processing all reads
41 | ## only if FP option > 2
42 | ##===========
43 | if (opt$FP > 2) {
44 |     if (!is.null(opt$AllRead)) {
45 |         curroutdir <- paste0(opt$OutDir, '/all')
46 |         system(paste("mkdir -p", curroutdir))
47 |         if ((file.exists(paste0(curroutdir, '/', OUTPREFIX, '.bed')) == FALSE) | (file.exists(paste0(curroutdir, '/', OUTPREFIX, '.info')) == FALSE)) {
48 |             if (opt$PE == 1) {
49 |                 cat(sprintf("\n start footprint HINT ATAC PE reads - all reads"))
50 |                 system(paste("rgt-hint footprinting --atac-seq --organism ", opt$RefGenome, " --paired-end --output-location ", curroutdir, " --output-prefix ", OUTPREFIX, " ", opt$AllRead, opt$MotifPeak))
51 |             } else {
52 |                 system(paste("rgt-hint footprinting --atac-seq --organism ", opt$RefGenome, " --output-location ", curroutdir, " --output-prefix ", OUTPREFIX, " ", opt$AllRead, opt$MotifPeak))
53 |             }
54 |         }
55 |         # now call motif matching for the obtained footprints
56 |         # JASPAR database is used by default for motif finding
57 |         # 10% random background region is tested - using the option --rand-proportion 10
58 |         cat(sprintf("\n start motifanalysis of HINT ATAC - all reads"))
59 |         motifoutdir <- paste0(curroutdir, '/motifanalysis_matching_out')
60 |         system(paste("mkdir -p", motifoutdir))
61 |         system(paste("rgt-motifanalysis matching --organism ", opt$RefGenome, " --rand-proportion 10 --input-files ", paste0(curroutdir, '/', OUTPREFIX, '.bed'), " --output-location ", motifoutdir))
62 |     }
63 | }
64 | 
65 | ##===========
66 | ## processing NFR and nucleosome reads (1N, 2N, ...)
67 | ## only if FP option > 1
68 | ##===========
69 | if (opt$FP > 1) {
70 |     if (!is.null(opt$NFRANDNuclRead)) {
71 |         curroutdir <- paste0(opt$OutDir, '/NFRANDNucl')
72 |         system(paste("mkdir -p", curroutdir))
73 |         if ((file.exists(paste0(curroutdir, '/', OUTPREFIX, '.bed')) == FALSE) | (file.exists(paste0(curroutdir, '/', OUTPREFIX, '.info')) == FALSE)) {
74 |             if (opt$PE == 1) {
75 |                 cat(sprintf("\n start footprint HINT ATAC PE reads - nucleosome free and nucleosome reads"))
76 |                 system(paste("rgt-hint footprinting --atac-seq --organism ", opt$RefGenome, " --paired-end --output-location ", curroutdir, " --output-prefix ", OUTPREFIX, " ", opt$NFRANDNuclRead, opt$MotifPeak))
77 |             } else {
78 |                 system(paste("rgt-hint footprinting --atac-seq --organism ", opt$RefGenome, " --output-location ", curroutdir, " --output-prefix ", OUTPREFIX, " ", opt$NFRANDNuclRead, opt$MotifPeak))
79 |             }
80 |         }
81 |         # now call motif matching for the obtained footprints
82 |         # JASPAR database is used by default for motif finding
83 |         # 10% random background region is tested - using the option --rand-proportion 10
84 |         cat(sprintf("\n start motifanalysis of HINT ATAC - nucleosome free and nucleosome reads"))
85 |         motifoutdir <- paste0(curroutdir, '/motifanalysis_matching_out')
86 |         system(paste("mkdir -p", motifoutdir))
87 |         system(paste("rgt-motifanalysis matching --organism ", opt$RefGenome, " --rand-proportion 10 --input-files ", paste0(curroutdir, '/', OUTPREFIX, '.bed'), " --output-location ", motifoutdir))
88 |     }
89 | }
90 | 
91 | ##===========
92 | ## processing NFR reads
93 | ## default option
94 | ##===========
95 | if (!is.null(opt$NFRRead)) {
96 |     curroutdir <- paste0(opt$OutDir, '/NFR')
97 |     system(paste("mkdir -p", curroutdir))
98 |     if ((file.exists(paste0(curroutdir, '/', OUTPREFIX, '.bed')) == FALSE) | (file.exists(paste0(curroutdir, '/', OUTPREFIX, '.info')) == FALSE)) {
99 |         if (opt$PE == 1) {
100 |             cat(sprintf("\n start footprint HINT ATAC PE reads - nucleosome free reads"))
101 |             system(paste("rgt-hint footprinting --atac-seq --organism ", opt$RefGenome, " --paired-end --output-location ", curroutdir, " --output-prefix ", OUTPREFIX, " ", opt$NFRRead, opt$MotifPeak))
102 |         } else {
103 |             system(paste("rgt-hint footprinting --atac-seq --organism ", opt$RefGenome, " --output-location ", curroutdir, " --output-prefix ", OUTPREFIX, " ", opt$NFRRead, opt$MotifPeak))
104 |         }
105 |     }
106 |     # now call motif matching for the obtained footprints
107 |     # JASPAR database is used by default for motif finding
108 |     # 10% random background region is tested - using the option --rand-proportion 10
109 |     cat(sprintf("\n start motifanalysis of HINT ATAC - nucleosome free reads"))
110 |     motifoutdir <- paste0(curroutdir, '/motifanalysis_matching_out')
111 |     system(paste("mkdir -p", motifoutdir))
112 |     system(paste("rgt-motifanalysis matching --organism ", opt$RefGenome, " --rand-proportion 10 --input-files ", paste0(curroutdir, '/', OUTPREFIX, '.bed'), " --output-location ", motifoutdir))
113 | }
114 | 
115 | 
116 | 
117 | 
--------------------------------------------------------------------------------
/Imp_Scripts/Motif_HOMER.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | 
3 | #=========================
4 | # call motifs from ATAC-seq peaks using HOMER
5 | #=========================
6 | 
7 | library(optparse)
8 | library(data.table)
9 | 
10 | options(scipen = 999)
11 | options(datatable.fread.datatable=FALSE)
12 | 
13 | #=================================
14 | # function to create peak summit information
15 | #=================================
16 | GeneratePeakSummitFile <- function(PeakSummitFile, PeakData, offset=500) {
17 | 
18 |     if (ncol(PeakData) > 9) {
19 |         # use the relative peak summit information (10th field) and generate an offset of +/- 'offset' bp around it
20 |         outDF <- cbind.data.frame(PeakData[,1], (PeakData[,2] + PeakData[,10] - offset), (PeakData[,2] + PeakData[,10] + offset))
21 |     } else {
22 |         # use the midpoint of the peaks as the summit
23 |         outDF <- cbind.data.frame(PeakData[,1], (as.integer((PeakData[,2] + PeakData[,3])/2) - offset), (as.integer((PeakData[,2] + PeakData[,3])/2) + offset))
24 |     }
25 |     write.table(outDF, PeakSummitFile, row.names=F, col.names=F, sep="\t", quote=F, append=F)
26 | 
27 | } # end function
28 | 
29 | #===========================================================
30 | option_list = list(
31 | 
32 |     make_option(c("--MotifFindExec"), type="character", default=NULL, help="HOMER motif finding executable"),
33 |     make_option(c("--RefGenome"), type="character", default=NULL, help="Reference genome name."),
34 |     make_option(c("--PeakFile"), type="character", default=NULL, help="ATAC-seq Peak file."),
35 |     make_option(c("--PValThr"), type="numeric", default=0, help="Threshold of -log10(p-value) above which peaks will be considered. Default = 0, means no threshold is imposed."),
36 |     make_option(c("--QValThr"), type="numeric", default=0, help="Threshold of -log10(q-value) above which peaks will be considered. Default = 0, means no threshold is imposed."),
37 |     make_option(c("--OutDir"), type="character", default=NULL, help="Output directory."),
38 |     make_option(c("--SizeVal"), type="integer", action="store", default=200, help="Size argument of HOMER motif finding. Default = 200"),
39 |     make_option(c("--SummitOffset"), type="integer", action="store", default=500, help="Offset around the peak summit position to be considered for motif finding. Default = 500")
Default = 500") 40 | ); 41 | 42 | opt_parser = OptionParser(option_list=option_list); 43 | opt = parse_args(opt_parser); 44 | 45 | system(paste("mkdir -p", opt$OutDir)) 46 | 47 | PValThr <- as.numeric(opt$PValThr) 48 | QValThr <- as.numeric(opt$QValThr) 49 | if (QValThr > 0) { 50 | PValThr <- 0 51 | } 52 | 53 | if ((PValThr == 0) & (QValThr == 0)) { 54 | # CurrOutDir <- paste0(opt$OutDir, '/Motif_Complete_Peaks_Size_', opt$SizeVal, '_SummitOffset_', opt$SummitOffset) 55 | CurrOutDir <- paste0(opt$OutDir, '/Motif_Complete_Peaks_SummitOffset_', opt$SummitOffset) 56 | } else if (QValThr > 0) { 57 | # CurrOutDir <- paste0(opt$OutDir, '/Motif_Peaks_QvalThr_', QValThr, '_Size_', opt$SizeVal, '_SummitOffset_', opt$SummitOffset) 58 | CurrOutDir <- paste0(opt$OutDir, '/Motif_Peaks_QvalThr_', QValThr, '_SummitOffset_', opt$SummitOffset) 59 | } else { 60 | # CurrOutDir <- paste0(opt$OutDir, '/Motif_Peaks_PvalThr_', PValThr, '_Size_', opt$SizeVal, '_SummitOffset_', opt$SummitOffset) 61 | CurrOutDir <- paste0(opt$OutDir, '/Motif_Peaks_PvalThr_', PValThr, '_SummitOffset_', opt$SummitOffset) 62 | } 63 | system(paste("mkdir -p", CurrOutDir)) 64 | 65 | # read the complete peak data 66 | PeakData <- data.table::fread(opt$PeakFile) 67 | 68 | # filter peaks if there is any p-value or q-value specific threshold is provided 69 | # then call the motif finding routine 70 | if (PValThr > 0) { 71 | PeakData_Filt <- PeakData[which(PeakData[, 8] > PValThr), ] 72 | if (nrow(PeakData_Filt) > 0) { 73 | # write the filtered peaks 74 | FiltPeakFileName <- paste0(CurrOutDir, '/Filtered_Peaks_PvalThr.bed') 75 | write.table(PeakData_Filt, FiltPeakFileName, row.names=F, col.names=F, sep="\t", quote=F, append=F) 76 | # extract the peak summits and +/- opt$SummitOffset bp from the summits 77 | FiltPeakFileNameSummit <- paste0(CurrOutDir, '/Filtered_Peaks_PvalThr_Summit_Offset_', opt$SummitOffset, 'bp.bed') 78 | GeneratePeakSummitFile(FiltPeakFileNameSummit, PeakData_Filt, offset=opt$SummitOffset) 79 | # now call motif using these summit information 80 | # currently commented - sourya 81 | # system(paste(opt$MotifFindExec, FiltPeakFileNameSummit, opt$RefGenome, CurrOutDir, " -size ", opt$SizeVal, " -mask")) 82 | } 83 | } else if (QValThr > 0) { 84 | PeakData_Filt <- PeakData[which(PeakData[, 9] > QValThr), ] 85 | if (nrow(PeakData_Filt) > 0) { 86 | # write the filtered peaks 87 | FiltPeakFileName <- paste0(CurrOutDir, '/Filtered_Peaks_QvalThr.bed') 88 | write.table(PeakData_Filt, FiltPeakFileName, row.names=F, col.names=F, sep="\t", quote=F, append=F) 89 | # extract the peak summits and +/- opt$SummitOffset bp from the summits 90 | FiltPeakFileNameSummit <- paste0(CurrOutDir, '/Filtered_Peaks_QvalThr_Summit_Offset_', opt$SummitOffset, 'bp.bed') 91 | GeneratePeakSummitFile(FiltPeakFileNameSummit, PeakData_Filt, offset=opt$SummitOffset) 92 | # now call motif using these summit information 93 | # currently commented - sourya 94 | # system(paste(opt$MotifFindExec, FiltPeakFileNameSummit, opt$RefGenome, CurrOutDir, " -size ", opt$SizeVal, " -mask")) 95 | } 96 | } else { 97 | # extract the peak summits and +/- opt$SummitOffset bp from the summits 98 | FiltPeakFileNameSummit <- paste0(CurrOutDir, '/Peaks_Summit_Offset_', opt$SummitOffset, 'bp.bed') 99 | GeneratePeakSummitFile(FiltPeakFileNameSummit, PeakData, offset=opt$SummitOffset) 100 | 101 | # now call motif using these summit information 102 | # currently commented - sourya 103 | # system(paste(opt$MotifFindExec, FiltPeakFileNameSummit, opt$RefGenome, CurrOutDir, " -size ", 
104 | }
105 | 
106 | 
107 | 
108 | 
109 | 
--------------------------------------------------------------------------------
/Imp_Scripts/Peak_Enrichment.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | 
3 | #=========================
4 | # analyze ATAC-seq peaks and plot the enrichment for peaks and surrounding regions
5 | # also analyze separately for promoter peaks and enhancer peaks
6 | # using reference TSS information
7 | #=========================
8 | 
9 | suppressMessages(library(GenomicRanges))
10 | library(optparse)
11 | library(data.table)
12 | 
13 | options(scipen = 999)
14 | options(datatable.fread.datatable=FALSE)
15 | 
16 | #=================================
17 | # function to create peak summit information
18 | #=================================
19 | GeneratePeakSummitFile <- function(PeakSummitFile, PeakData) {
20 | 
21 |     if (ncol(PeakData) > 9) {
22 |         # use the relative peak summit information (10th field) and generate an offset of +/- 5 bp around it
23 |         outDF <- cbind.data.frame(PeakData[,1], (PeakData[,2] + PeakData[,10] - 5), (PeakData[,2] + PeakData[,10] + 5))
24 |     } else {
25 |         # use the midpoint of the peaks as the summit
26 |         outDF <- cbind.data.frame(PeakData[,1], (as.integer((PeakData[,2] + PeakData[,3])/2) - 5), (as.integer((PeakData[,2] + PeakData[,3])/2) + 5))
27 |     }
28 |     write.table(outDF, PeakSummitFile, row.names=F, col.names=F, sep="\t", quote=F, append=F)
29 | 
30 | } # end function
31 | 
32 | 
33 | #=================================
34 | # function to compute overlap of 1D bins
35 | #=================================
36 | Overlap1D <- function(Inpdata1, Inpdata2, boundary=1, offset=0, uniqov=TRUE) {
37 | 
38 |     ov1 <- as.data.frame(findOverlaps(GRanges(Inpdata1[,1], IRanges(Inpdata1[,2]+boundary-offset, Inpdata1[,3]-boundary+offset)),GRanges(Inpdata2[,1], IRanges(Inpdata2[,2]+boundary-offset, Inpdata2[,3]-boundary+offset))))
39 |     if (uniqov == TRUE) {
40 |         ov_idx_file1 <- unique(ov1[,1])
41 |         ov_idx_file2 <- unique(ov1[,2])
42 |     } else {
43 |         ov_idx_file1 <- ov1[,1]
44 |         ov_idx_file2 <- ov1[,2]
45 |     }
46 |     nonov_idx_file1 <- setdiff(seq(1, nrow(Inpdata1)), ov_idx_file1)
47 |     nonov_idx_file2 <- setdiff(seq(1, nrow(Inpdata2)), ov_idx_file2)
48 | 
49 |     # return the overlapping and non-overlapping set of indices
50 |     newList <- list(A_AND_B = ov_idx_file1, B_AND_A = ov_idx_file2, A_MINUS_B = nonov_idx_file1, B_MINUS_A = nonov_idx_file2)
51 |     return(newList)
52 | 
53 | }
54 | 
55 | #=================================
56 | # function to plot the heatmap using deeptools
57 | #=================================
58 | PlotHeatMap <- function(CurrOutDir, DeepToolsDir, outmatfile, Label) {
59 | 
60 |     # then use this matrix to plot profile
61 |     outprofileplotfile <- paste0(CurrOutDir, '/out_mat_profile_plot.pdf')
62 |     outprofileplotfile1 <- paste0(CurrOutDir, '/out_mat_profile_plot_1.pdf')
63 |     outprofileplotfile2 <- paste0(CurrOutDir, '/out_mat_heatmap_plot.pdf')
64 |     outprofileplotfile3 <- paste0(CurrOutDir, '/out_mat_heatmap_plot_1.pdf')
65 | 
66 |     system(paste0(DeepToolsDir, "/plotProfile --matrixFile ", outmatfile, " --outFileName ", outprofileplotfile, " --plotHeight 7 --plotWidth 10 --samplesLabel ", Label, " --plotTitle ATACPeakTSSEnrichment --plotFileFormat pdf --yMin 0.5 --yMax 40 --colors red yellow blue"))
67 | 
68 |     system(paste0(DeepToolsDir, "/plotProfile --matrixFile ", outmatfile, " --outFileName ", outprofileplotfile1, " --plotHeight 7 --plotWidth 10 --samplesLabel ", Label, " --plotTitle ATACPeakTSSEnrichment --plotFileFormat pdf --yMin 0.5 --yMax 5 --colors red yellow blue"))
69 | 
70 |     system(paste0(DeepToolsDir, "/plotHeatmap --matrixFile ", outmatfile, " --outFileName ", outprofileplotfile2, " --heatmapHeight 10 --heatmapWidth 8 --samplesLabel ", Label, " --plotTitle ATACPeakTSSEnrichment --plotFileFormat pdf --yMin 0.5 --yMax 40 --zMin 0 --zMax 50"))
71 | 
72 |     system(paste0(DeepToolsDir, "/plotHeatmap --matrixFile ", outmatfile, " --outFileName ", outprofileplotfile3, " --heatmapHeight 10 --heatmapWidth 8 --samplesLabel ", Label, " --plotTitle ATACPeakTSSEnrichment --plotFileFormat pdf --yMin 0.5 --yMax 5 --zMin 0 --zMax 50"))
73 | 
74 | } # end function
75 | 
76 | 
77 | #===========================================================
78 | option_list = list(
79 | 
80 |     make_option(c("--BigWigFile"), type="character", default=NULL, help="BigWig file of ATAC-seq data."),
81 |     make_option(c("--Label"), type="character", default=NULL, help="Label or sample name of ATAC-seq data."),
82 |     make_option(c("--DeepToolsDir"), type="character", default=NULL, help="Deeptools executable directory."),
83 |     make_option(c("--TSSFile"), type="character", default=NULL, help="File containing reference genome TSS information."),
84 |     make_option(c("--PeakFile"), type="character", default=NULL, help="File containing ATAC-seq peak information."),
85 |     make_option(c("--OutDir"), type="character", default=NULL, help="Output directory."),
86 |     make_option(c("--Offset"), type="integer", action="store", default=5000, help="Offset with respect to summit (in bp) to compute enrichment. Default = 5000 means 5 Kb around peak summits would be used for enrichment.")
87 | );
88 | 
89 | opt_parser = OptionParser(option_list=option_list);
90 | opt = parse_args(opt_parser);
91 | 
92 | system(paste("mkdir -p", opt$OutDir))
93 | 
94 | # read the input peaks
95 | PeakData <- data.table::fread(opt$PeakFile)
96 | 
97 | # extract the peak summits of the complete peak file
98 | CurrOutDir <- paste0(opt$OutDir, '/Complete_Peaks')
99 | system(paste("mkdir -p", CurrOutDir))
100 | 
101 | PeakSummitFile <- paste0(CurrOutDir, '/Peak_Summits.bed')
102 | if (file.exists(PeakSummitFile) == FALSE) {
103 |     GeneratePeakSummitFile(PeakSummitFile, PeakData)
104 | }
105 | 
106 | # now apply deeptools utility to compute enrichment
107 | outmatfile <- paste0(CurrOutDir, '/deeptools_out_mat_TSS.gz')
108 | if (file.exists(outmatfile) == FALSE) {
109 |     system(paste0(opt$DeepToolsDir, "/computeMatrix reference-point -R ", PeakSummitFile, " -S ", opt$BigWigFile, " -a ", opt$Offset, " -b ", opt$Offset, " --skipZeros --outFileName ", outmatfile))
110 | }
111 | PlotHeatMap(CurrOutDir, opt$DeepToolsDir, outmatfile, opt$Label)
112 | 
113 | # if TSS information is also provided, find the enrichment of promoter and
114 | # enhancer peaks separately
115 | if (!is.null(opt$TSSFile)) {
116 |     TSSData <- data.table::fread(opt$TSSFile)
117 |     # 2.5 Kb overlap on both sides of the TSS data
118 |     ov <- Overlap1D(PeakData[,1:3], cbind.data.frame(TSSData[,1:2],TSSData[,2]), boundary=0, offset=2500, uniqov=TRUE)
119 |     PromPeakData <- PeakData[ov$A_AND_B, ]
120 |     EnhPeakData <- PeakData[ov$A_MINUS_B, ]
121 | 
122 |     # process the promoter peaks
123 |     if (nrow(PromPeakData) > 0) {
124 |         CurrOutDir <- paste0(opt$OutDir, '/Promoter_Peaks')
125 |         system(paste("mkdir -p", CurrOutDir))
126 |         PeakSummitFile <- paste0(CurrOutDir, '/Peak_Summits.bed')
127 |         if (file.exists(PeakSummitFile) == FALSE) {
128 |             GeneratePeakSummitFile(PeakSummitFile, PromPeakData)
129 |         }
130 |         # now apply deeptools utility to compute enrichment
131 |         outmatfile <- paste0(CurrOutDir, '/deeptools_out_mat_TSS.gz')
132 |         if (file.exists(outmatfile) == FALSE) {
133 |             system(paste0(opt$DeepToolsDir, "/computeMatrix reference-point -R ", PeakSummitFile, " -S ", opt$BigWigFile, " -a ", opt$Offset, " -b ", opt$Offset, " --skipZeros --outFileName ", outmatfile))
134 |         }
135 |         PlotHeatMap(CurrOutDir, opt$DeepToolsDir, outmatfile, opt$Label)
136 |     }
137 | 
138 |     # process the enhancer peaks
139 |     if (nrow(EnhPeakData) > 0) {
140 |         CurrOutDir <- paste0(opt$OutDir, '/Enhancer_Peaks')
141 |         system(paste("mkdir -p", CurrOutDir))
142 |         PeakSummitFile <- paste0(CurrOutDir, '/Peak_Summits.bed')
143 |         if (file.exists(PeakSummitFile) == FALSE) {
144 |             GeneratePeakSummitFile(PeakSummitFile, EnhPeakData)
145 |         }
146 |         # now apply deeptools utility to compute enrichment
147 |         outmatfile <- paste0(CurrOutDir, '/deeptools_out_mat_TSS.gz')
148 |         if (file.exists(outmatfile) == FALSE) {
149 |             system(paste0(opt$DeepToolsDir, "/computeMatrix reference-point -R ", PeakSummitFile, " -S ", opt$BigWigFile, " -a ", opt$Offset, " -b ", opt$Offset, " --skipZeros --outFileName ", outmatfile))
150 |         }
151 |         PlotHeatMap(CurrOutDir, opt$DeepToolsDir, outmatfile, opt$Label)
152 |     }
153 | }
154 | 
155 | 
156 | 
157 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ATACProc - a pipeline for processing ATAC-seq data
2 | 
3 | Developer: Sourya Bhattacharyya
4 | 
5 | Supervisors: Dr. Ferhat Ay and Dr. Pandurangan Vijayanand
6 | 
7 | La Jolla Institute for Immunology, CA 92037, USA
8 | 
9 | 
10 | #######################
11 | 
12 | ATACProc is a pipeline to analyze ATAC-seq data. Currently, datasets involving one of four reference genomes, namely hg19, hg38, mm9 and mm10, are supported. Important features of this pipeline are:
13 | 
14 | 1) Supports single or paired-end fastq or BAM formatted data.
15 | 
16 | 2) Generates alignment summary and QC statistics.
17 | 
18 | 3) Calls peaks using MACS2, for multiple FDR thresholds (0.01 and 0.05).
19 | 
20 | 4) Generates raw and coverage-normalized BigWig tracks for visualizing the data in the UCSC genome browser.
21 | 
22 | 5) Irreproducible Discovery Rate (IDR) analysis (https://github.com/nboley/idr) between a set of peak calls or even a set of input alignment (BAM) files (in which case, peaks are estimated first) corresponding to a set of biological or technical ATAC-seq replicates.
23 | 
24 | 6) **New in version 2.0:** Supports discarding reads falling in blacklisted genomic regions.
25 | 
26 | 7) **New in version 2.0:** Supports extracting nucleosome free reads (NFR) and one or more nucleosome containing regions (denoted as +1M) for TF footprinting analysis.
27 | 
28 | 8) **New in version 2.0:** Compatibility with the package ATAQV (https://github.com/ParkerLab/ataqv) for generating summary statistics across a set of samples.
29 | 
30 | #######################
31 | 
32 | Release notes
33 | -----------------
34 | 
35 | **Version 2.2 - April 2022**
36 | 
37 | Added -F option - corresponds to using different types of reads for footprinting.
38 | 
39 | Default = 1, means footprinting with nucleosome free reads (NFR) will be done.
40 | 
41 | Best for standard ATAC-seq protocols (Li et al. Genome Biology, 2019)
42 | 
43 | If -F option is 2, footprinting with nucleosome reads will also be separately computed in addition to the NFR based footprints (two different footprinting outputs).
44 | 
45 | If -F option is 3, footprinting with all the reads will also be separately computed in addition to the NFR based and nucleosome read based footprints (three different footprinting outputs).
46 | 
47 | **Version 2.1 - July 2020**
48 | 
49 | Minor change of the picard duplicate removal syntax, according to the picard tool version 2.8.14.
50 | We recommend using this (or a later) version.
51 | 
52 | **Version 2.0 - November 2019**
53 | 
54 | 1) Included TF footprinting, optional discarding of blacklisted genomic regions, and motif analysis.
55 | 
56 | 2) Updated summary statistics incorporating support for the ATAQV package (https://github.com/ParkerLab/ataqv)
57 | 
58 | 3) Discarded the R package ATACseqQC (https://bioconductor.org/packages/release/bioc/html/ATACseqQC.html) and corresponding operations, mainly due to its time complexity and reliability issues.
59 | 
60 | 
61 | *Version 1.0 - July 2018:*
62 | 
63 | 1) Released the first version of the ATAC-seq pipeline, supporting generation of QC metrics, peak calls, and signal tracks for visualizing in the UCSC genome browser.
64 | 
65 | 2) Also supports IDR between a set of peaks / alignments for a set of replicates.
66 | 
67 | 
68 | Theory
69 | ----------
70 | 
71 | Papers / links for understanding ATAC-seq QCs:
72 | 
73 | 1) https://github.com/crazyhottommy/ChIP-seq-analysis (very useful; contains many papers
74 | and links for understanding ChIP-seq and ATAC-seq data)
75 | 
76 | 2) https://www.encodeproject.org/data-standards/terms/#library
77 | 
78 | 3) https://www.biostars.org/p/187204/
79 | 
80 | 4) http://seqanswers.com/forums/archive/index.php/t-59219.html
81 | 
82 | 5) https://github.com/kundajelab/atac_dnase_pipelines
83 | 
84 | 6) https://github.com/ParkerLab/bioinf525#sifting
85 | 
86 | 7) https://github.com/taoliu/MACS/issues/145
87 | 
88 | 8) https://www.biostars.org/p/207318/
89 | 
90 | 9) https://www.biostars.org/p/209592/
91 | 
92 | 10) https://www.biostars.org/p/205576/
93 | 
94 | 
95 | Understanding peak calling
96 | 
97 | 1) https://genomebiology.biomedcentral.com/articles/10.1186/gb-2008-9-9-r137
98 | 
99 | Understanding TF footprinting
100 | 
101 | 1) https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1642-2
102 | 
103 | Understanding IDR analysis
104 | 
105 | 1) https://github.com/nboley/idr
106 | 
107 | 
108 | 
109 | Installation
110 | -------------
111 | 
112 | The following packages / libraries should be installed before running this pipeline:
113 | 
114 | 1) Python 2.7
115 | 
116 | 2) R environment (we have used 3.4.3)
117 | 
118 | User should also install the following R packages, by running the following command inside the R prompt:
119 | 
120 | 	install.packages(c("optparse", "ggplot2", "data.table", "plotly"))
121 | 
122 | Also, the user needs to install the Bioconductor package GenomicRanges.
123 | 
124 | 3) Bowtie2 (we have used version 2.3.3.1)
125 | 
126 | 4) samtools (we have used version 1.6)
127 | 
128 | 5) PICARD tools (we have used version 2.8.14 now; previously we were using version 2.7.1)
129 | 
130 | 6) Utilities "bedGraphToBigWig", "bedSort", "bigBedToBed", "hubCheck" and "fetchChromSizes" - to be downloaded from the UCSC repository
131 | 
132 | 7) deepTools (we have used version 2.0)
133 | 
134 | 8) MACS2 (we have used version 2.1.1) https://github.com/taoliu/MACS
135 | 
136 | 9) HOMER (we recommend using the latest version) http://homer.ucsd.edu/homer/
137 | 
138 | 10) The package *ataqv* (https://github.com/ParkerLab/ataqv). User needs to download the GitHub release (.tar.gz) file in a convenient location, extract it, and provide the corresponding path in a configuration file (mentioned below).
139 | 
140 | 11) Regulatory genomics toolbox (https://www.regulatory-genomics.org/)
141 | 
142 | First, the user needs to install the module *RGT* using the following commands:
143 | 
144 | 	pip install --user cython numpy scipy
145 | 	pip install --user RGT
146 | 
147 | A folder *rgtdata* would be created inside the home directory. The next step is to configure that folder by typing the following commands:
148 | 
149 | 	cd ~/rgtdata
150 | 	python setupGenomicData.py --hg19
151 | 	python setupGenomicData.py --hg38
152 | 	python setupGenomicData.py --mm9
153 | 	python setupGenomicData.py --mm10
154 | 
155 | (Note: it is better to run the last four commands together in a qsub / cluster environment, otherwise it'll be time consuming).
156 | 
157 | 
158 | Then, the user needs to set up the motif configuration data, via executing the following commands (preferable to run in a qsub / cluster environment):
159 | 
160 | 	cd ~/rgtdata
161 | 	python setupLogoData.py --all
162 | 
163 | 
164 | **User should include the PATH of the above mentioned libraries / packages inside their SYSTEM PATH variable. Alternatively, installation PATHS for some of these packages are to be mentioned in a separate configuration file (described below)**
165 | 
166 | **The following packages / libraries are to be installed for executing the IDR code**
167 | 
168 | 1) sambamba (we have used version 0.6.7)
169 | 
170 | 2) IDRCode (https://drive.google.com/file/d/0B_ssVVyXv8ZSX3luT0xhV3ZQNWc/view?usp=sharing). User should unzip the archive and store it in a convenient location. The path of this archive is to be provided when executing the IDR code.
171 | 
172 | 
173 | 
174 | Execution
175 | ----------
176 | 
177 | User should first clone this pipeline in a convenient location, using the following command:
178 | 
179 | 	git clone https://github.com/ay-lab/ATACProc.git
180 | 
181 | A sample script "pipeline_exec.sh" contains basic execution commands, to invoke the main executable "pipeline.sh" (located inside the folder "bin"). The executable has the following command line options (a minimal example invocation is sketched after this list):
182 | 
183 | Options:
184 | 
185 | Mandatory parameters:
186 | 
187 | -C  ConfigFile
188 |     Configuration file to be separately provided. Mandatory parameter. The current package includes four sample configuration files named "configfile_*" corresponding to the reference genomes hg19, hg38, mm9 and mm10. A detailed description of the entries in this configuration file is provided later.
189 | 
190 | -f  FASTQ1
191 |     Read 1 (or forward strand) of paired-end sequencing data [.fq|.gz|.bz2]. Alternatively, an aligned BAM file (single- or paired-end alignment) can be provided.
192 | 
193 | -r  FASTQ2
194 |     Read 2 of paired-end sequencing data [.fq|.gz|.bz2]. If not provided, and the -f parameter is not a BAM file, the input is assumed to be single-ended.
195 | 
196 | -n  PREFIX
197 |     Prefix string of output files. For example, -n "TEST" means that the output filenames start with the string "TEST". Generally, sample names with run ID, lane information, etc. can be used as a prefix string.
198 | 
199 | -g  BOWTIE2_GENOME
200 |     Bowtie2 indexed reference genome. Basically, the folder containing the bwt2 indices (corresponding to the reference genome) is to be provided. Mandatory parameter if the user provides fastq files as input (-f and -r options). If the user provides .bam files as an input (-f option), then this field is optional.
201 | 
202 | -d  OutDir
203 |     Output directory to store the results for the current sample.
204 | 
205 | -c  CONTROLBAM
206 |     Control file(s) used for peak calling using MACS2. One or more alignment files can be provided to be used as a control. It may not be specified at all, in which case MACS2 operates without any control. The control file can be either in *BAM* or in *tagalign.gz* format (the standalone script *bin/TagAlign.sh* in this repository converts a BAM file to tagalign.gz format). For multiple control files, they all are required to be of the same format (i.e. either all BAM or all tagalign.gz). Example: -c control1.bam -c control2.bam puts two control files for using in MACS2.
207 | 
208 | -w  BigWigGenome
209 |     Reference genome as a string. Allowed values are hg19 (default), hg38, mm9 and mm10. If the -g option is enabled (i.e. the Bowtie2 index genome is provided), this field is optional. Otherwise, mandatory parameter.
210 | 
211 | -D  DEBUG_TXT
212 |     Binary variable. If 1 (recommended), dumps QC statistics. For a set of samples, those QC statistics can be used later to profile QC variation among different samples.
213 | 
214 | -O  Overwrite
215 |     Binary variable. If 1, overwrites the existing files (if any). Default = 0.
216 | 
217 | -F  Footprint
218 |     This flag specifies the footprinting option. Value can be 1 (default), 2, or 3.
219 |     1: footprint using the nucleosome free reads (NFR) will be computed.
220 |     Default setting. Best for the default ATAC-seq protocol (check Li et al. Genome Biology 2019)
221 |     2: footprint using the nucleosome free reads (NFR) and also the nucleosome containing reads (NFR + 1N + 2N + 3N ...)
222 |     will be computed (two different footprint outputs - time consuming).
223 |     Best for the Omni-ATAC protocol (check Li et al. Genome Biology 2019)
224 |     3: footprint using NFR, NFR with nucleosome reads, and all reads will be computed
225 |     (three different footprint outputs - highly time consuming).
226 | 
227 | Optional parameters:
228 | -q  MAPQ_THR
229 |     Mapping quality threshold for bowtie2 alignment. Aligned reads with quality below this threshold are discarded. Default = 30.
230 | 
231 | -t  NUMTHREADS
232 |     Number of sorting, Bowtie2 mapping THREADS [Default = 1]. If multiple processing cores are available, user should specify values > 1 such as 4 or 8, for faster execution of Bowtie2.
233 | 
234 | -m  MAX_MEM
235 |     Set max memory used for PICARD duplication removal [Default = 8G].
236 | 
237 | -a  ALIGNVALIDMAX
238 |     Set the number of (max) valid alignments which will be searched [Default = 4] for Bowtie2.
239 | 
240 | -l  MAXFRAGLEN
241 |     Set the maximum fragment length to be used for Bowtie2 alignment [Default = 2000]
242 | 
243 | 
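As an illustration, a minimal single-sample invocation could look as follows. This is only a sketch: the fastq file names, sample prefix, and the genome / output paths are hypothetical placeholders, and the sample script "pipeline_exec.sh" in this package demonstrates the same pattern:

	bash bin/pipeline.sh -C configfile_hg38 \
		-f sample_R1.fastq.gz -r sample_R2.fastq.gz \
		-n 'SAMPLE1' \
		-g /home/sourya/genomes/bowtie2_index/hg38/ \
		-d /home/sourya/Results/SAMPLE1 \
		-w hg38 -D 1 -O 0 -F 1 -t 4 -m 8G
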
244 | Entries in the configuration file (first parameter)
245 | ---------------------------------------------------
246 | 
247 | The configuration file follows the format parameter=value
248 | 
249 | It is to be filled with the following entries (a filled-in sample follows this list):
250 | 
251 | picardexec=
252 |     Path of the Picard tool executable
253 |     Example: /home/sourya/packages/picard-tools/picard-tools-2.7.1/picard.jar
254 | 
255 | HOMERPath=
256 |     Path of HOMER (after installation)
257 |     Example: /home/sourya/packages/HOMER/bin/
258 | 
259 | DeepToolsDir=
260 |     Path of the deepTools executable
261 |     Example: /home/sourya/packages/deepTools/deepTools2.0/bin/
262 | 
263 | NarrowPeakASFile=
264 |     file (SQL) required to convert the narrowPeak file to the bigBed format
265 |     Download the file from this link (and save):
266 |     https://genome-source.gi.ucsc.edu/gitlist/kent.git/blob/master/src/hg/lib/encode/narrowPeak.as
267 |     Specify the location of this downloaded file:
268 |     Example: /home/sourya/genomes/chrsize/narrowPeak.as
269 | 
270 | BigNarrowPeakASFile=
271 |     file (SQL) required to convert the bigNarrowPeak file to the bigBed format
272 |     Download the file from this link (and save):
273 |     https://genome.ucsc.edu/goldenPath/help/examples/bigNarrowPeak.as
274 |     Specify the location of this downloaded file:
275 |     Example: /home/sourya/genomes/chrsize/bigNarrowPeak.as
276 | 
277 | BroadPeakASFile=
278 |     file (SQL) required to convert the broadPeak file to the bigBed format
279 |     Download the file from this link (and save):
280 |     https://genome-source.gi.ucsc.edu/gitlist/kent.git/blob/master/src/hg/lib/encode/broadPeak.as
281 |     Specify the location of this downloaded file:
282 |     Example: /home/sourya/genomes/chrsize/broadPeak.as
283 | 
284 | RefChrSizeFile=
285 |     file containing chromosome size information:
286 |     a two-column file storing the size of individual chromosomes
287 |     Downloaded from the link (depends on the reference genome employed):
288 |     For example, the hg38.chrom.sizes file for the hg38 database is located at
289 |     http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes.
290 |     Alternatively, use the "fetchChromSizes" script from the UCSC repository
291 |     to get the appropriate chromosome size file.
292 |     Specify the location of this downloaded file:
293 |     Example: /home/sourya/genomes/chrsize/hg38.chrom.sizes
294 | 
295 | RefChrFastaFile=
296 |     FASTA file of the reference genome.
297 |     Can be downloaded from the link: http://hgdownload.cse.ucsc.edu/downloads.html
298 |     Example: /home/sourya/genomes/Complete_Genome/hg38/hg38.fa
299 | 
300 | RefChrAnnotFile=
301 |     file containing reference genome specific annotation (.gtf format).
302 |     To be downloaded from the following links:
303 |     hg38: ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/
304 |     hg19: ftp://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/
305 |     mm9: ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/
306 |     mm10: ftp://ftp.ensembl.org/pub/release-97/gtf/mus_musculus/
307 |     Example: /home/sourya/genomes/Annotation/hg38/hg38.gtf
308 | 
309 | BlackListFile=
310 |     file containing blacklisted regions corresponding to the reference genome.
311 |     To be downloaded from the link: https://github.com/Boyle-Lab/Blacklist/tree/master/lists (v2)
312 |     The file can be in gzipped or normal text format.
313 |     *Note: This parameter is optional.*
314 |     Example: /home/sourya/genomes/BlackListed_Regions/hg38-blacklist.v2.bed
315 | 
316 | ATAQVPath=
317 |     Path of the ataqv package (https://github.com/ParkerLab/ataqv) executable.
318 |     User needs to download the GitHub release (.tar.gz) file, extract it, and provide the ataqv executable path here.
319 |     Example: /home/sourya/packages/ataqv/ataqv-1.0.0/bin/ataqv
320 | 
321 | TSSFile=
322 |     File containing TSS information for the reference genome. Obtained using the gene annotation (GTF) file.
323 |     Example: /home/sourya/genomes/Annotation/hg38/hg38_TSS.gtf
324 | 
325 | 
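Putting these entries together, a filled-in configuration file for hg38 might look like the sketch below. Every value is simply the illustrative example path from the corresponding entry above; substitute the paths of your own installation:

	picardexec=/home/sourya/packages/picard-tools/picard-tools-2.7.1/picard.jar
	HOMERPath=/home/sourya/packages/HOMER/bin/
	DeepToolsDir=/home/sourya/packages/deepTools/deepTools2.0/bin/
	NarrowPeakASFile=/home/sourya/genomes/chrsize/narrowPeak.as
	BigNarrowPeakASFile=/home/sourya/genomes/chrsize/bigNarrowPeak.as
	BroadPeakASFile=/home/sourya/genomes/chrsize/broadPeak.as
	RefChrSizeFile=/home/sourya/genomes/chrsize/hg38.chrom.sizes
	RefChrFastaFile=/home/sourya/genomes/Complete_Genome/hg38/hg38.fa
	RefChrAnnotFile=/home/sourya/genomes/Annotation/hg38/hg38.gtf
	BlackListFile=/home/sourya/genomes/BlackListed_Regions/hg38-blacklist.v2.bed
	ATAQVPath=/home/sourya/packages/ataqv/ataqv-1.0.0/bin/ataqv
	TSSFile=/home/sourya/genomes/Annotation/hg38/hg38_TSS.gtf
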
326 | The last parameter, *TSSFile*, needs a special mention. User can apply the following awk script to the reference genome annotation file (indicated in the parameter *RefChrAnnotFile*) to produce a file with TSS information.
327 | 
328 | Assuming the user has downloaded the reference genome specific gene annotation file using one of the ftp links provided above, when the reference genome is either hg19, hg38 or mm10, the user can apply the following awk script to obtain a TSS file (input_TSS.gtf) from the gene annotation file (input.gtf) (Note: it is always best to check the .gtf file format):
329 | 
330 | 	awk -F'[\t]' '{if ((substr($1,1,1)!="#") && ($3=="transcript")) {if ($7=="+") {print "chr"$1"\t"$4"\t"$4"\t"$3"\t"$4"\t"$5"\t"$7"\t"$9} else {print "chr"$1"\t"$5"\t"$5"\t"$3"\t"$4"\t"$5"\t"$7"\t"$9}}}' input.gtf > input_TSS.gtf
331 | 
332 | When the reference genome is mm9, the user can apply the following script (it is best to check the .gtf file format):
333 | 
334 | 	awk -F'[\t]' '{if ((substr($1,1,1)!="#") && ($3=="exon")) {if ($7=="+") {print "chr"$1"\t"$4"\t"$4"\t"$3"\t"$4"\t"$5"\t"$7"\t"$9} else {print "chr"$1"\t"$5"\t"$5"\t"$3"\t"$4"\t"$5"\t"$7"\t"$9}}}' mm9.gtf > mm9_TSS.gtf
335 | 
336 | Describing output of ATAC-seq pipeline
337 | -----------------------------------------
338 | 
339 | Within the folder *OutDir* (specified by the configuration option -d), the following files (f) and folders (F) exist:
340 | 
341 | F1: Alignment_MAPQ${MAPQ_THR}
342 | 
343 | 	f1-1: Bowtie2_Init_Align.sam
344 | 		Initial alignment by Bowtie2 (if fastq files are provided as the input).
345 | 	f1-2: UniqMappedRead.bam
346 | 		Uniquely mapped reads.
347 | 	f1-3: Bowtie2_del_Random.bam
348 | 		Alignment after excluding reads from chromosomes other than autosomal chromosomes, chrX, and chrM.
349 | 	f1-4: Bowtie2_del_Mitch.bam:
350 | 		Alignment after excluding reads from chrM.
351 | 	f1-5: ${PREFIX}.align.sort.MAPQ${MAPQ_THR}.bam
352 | 		Sorted and MAPQ-thresholded alignment.
353 | 	f1-6: ${PREFIX}.align.sort.MAPQ${MAPQ_THR}.rmdup.bam
354 | 		De-duplicated alignment (used for subsequent operations)
355 | 	f1-7: ${PREFIX}.align.sort.MAPQ${MAPQ_THR}.picard_metrics.txt
356 | 		PICARD metrics log file corresponding to the duplicate removal operation.
357 | 	f1-8: ${PREFIX}.align.sort.MAPQ${MAPQ_THR}_TN5_Shift.bam
358 | 		**New in version 2.0:** De-duplicated reads whose forward and reverse strands are shifted by +4 bp and -5 bp, respectively, to account for the Tn5 transposase. Used to extract the nucleosome free and nucleosome containing regions.
359 | 	f1-9: ${PREFIX}.align.sort.MAPQ${MAPQ_THR}_TN5_Shift.bed
360 | 		**New in version 2.0:** BED-converted version of f1-8, used for MACS2 peak calling.
361 | 	f1-10: NucleosomeFree.bam
362 | 		**New in version 2.0:** Alignment with nucleosome free regions (NFR)
363 | 	f1-11: mononucleosome.bam
364 | 		**New in version 2.0:** Alignment with mononucleosome fragments
365 | 	f1-12: dinucleosome.bam
366 | 		**New in version 2.0:** Alignment with dinucleosome fragments
367 | 	f1-13: trinucleosome.bam
368 | 		**New in version 2.0:** Alignment with trinucleosome fragments
369 | 	f1-14: Merged_nucleosome.bam
370 | 		**New in version 2.0:** File containing fragments of nucleosome free and one or more nucleosomes (denoted as NFR +1M, in the HINT-ATAC genome biology paper). Generated by merging files f1-10 to f1-13.
371 | 
372 | F2: Out_BigWig
373 | 	f2-1: ${PREFIX}.bw
374 | 		bigwig file for track visualization.
375 | 
376 | F3: Out_BigWig_NormCov:
377 | 	f3-1: ${PREFIX}_NormCov.bw
378 | 		bigwig file for track visualization (after normalizing the coverage). Recommended to use this file for visualizing tracks in the UCSC genome browser.
379 | 
380 | F4: MACS2_Ext_*
381 | 	Contains peaks employing MACS2 with the parameters:
382 | 	--nomodel --nolambda --shift -100 --extsize 200 --keep-dup all --call-summits
383 | 	*Note: this parameter set is recommended for ATAC-seq, as mostly followed in existing studies.*
384 | 
385 | 	If the folder name is "*_No_Control", no control BAM file was used to infer the peaks. Otherwise, if the folder name is "*_With_Control", one or more control alignment files were used for inferring the peaks.
386 | 
387 | 	f4-1: *.narrowPeak: narrow peaks with p-value threshold of 0.01
388 | 	f4-2: *.narrowPeak_Q0.05filt: narrow peaks with FDR (q-value) threshold = 0.05
389 | 	f4-3: *.narrowPeak_Q0.01filt: narrow peaks with FDR threshold = 0.01
390 | 	f4-4: *.broadPeak: broad peaks with p-value threshold of 0.01
391 | 	f4-5: *.broadPeak_Q0.05filt: broad peaks with FDR threshold = 0.05
392 | 	f4-6: *.broadPeak_Q0.01filt: broad peaks with FDR threshold = 0.01
393 | 	f4-7: out_FRiP.txt: FRiP (fraction of reads in peaks) statistics for the narrow and broad peaks.
394 | 	f4-8: Peak_Statistics.txt: number of peaks in different settings.
395 | 	F4-9: Peak_Annotate_Q*:
396 | 		HOMER based annotations corresponding to the narrow peaks inferred by the corresponding FDR threshold (0.01 or 0.05). Contains the following files:
397 | 		f4-9-1: Out_Summary.log: summary text file containing the HOMER annotation.
398 | 		f4-9-2: Annotated_Peak_Q*filt.txt: Detailed HOMER annotation of the corresponding peaks.
399 | 		f4-9-3: Pie_Chart_Peak_Annotation.pdf: pie chart of peaks containing different annotations.
400 | 		f4-9-4: Peak_TSS_Distance.pdf: Histogram of distance between peaks and the closest TSS
401 | 	f4-10: Files of *.bb extension are big-bed formatted peaks, used to visualize those peaks in UCSC tracks.
402 | 
403 | F5: MACS2_Default_*
404 | 	Contains peaks employing default MACS2 parameters (generally not used for ATAC-seq processing, but we've kept it for comparison).
405 | 	File and folder structure is similar to F4.
406 | 
407 | f8: out_NRF_MAPQ${MAPQ_THR}.txt
408 | 	Metric NRF
409 | 
410 | f9: Read_Count_Stat.txt
411 | 	Read count statistics.
412 | 
413 | F10: QC_ataqv_ParkerLab_Test
414 | 	**New in version 2.0:** Folder containing the summary .json files generated by the package ATAQV, which, for different samples, can be combined into a summary statistic and displayed in a web browser.
415 | 
416 | F11: TSS_Enrichment_Peaks
417 | 	**New in version 2.0:** Processes the narrow peaks from the folder F4, and computes the TSS enrichment of these peaks. The underlying file structure is:
The underlying file structure is:
418 | 
419 | MACS2_Ext_*${CONTROLSTR}/macs2_narrowPeak_Q${FDRTHR}filt_Offset_${OFFSETVAL}/${PEAKTYPE}/*.pdf
420 | 
421 | where,
422 | ${CONTROLSTR}: "*_No_Control" or "*_With_Control", depending on the use of a control BAM file in inferring the peaks.
423 | ${FDRTHR}: FDR threshold. Can be either 0.01 or 0.05.
424 | ${OFFSETVAL}: can be either 1000 (1 Kb) or 5000 (5 Kb) (1 Kb or 5 Kb regions surrounding the TSS are checked for computing the TSS enrichment).
425 | ${PEAKTYPE}: can be "Complete_Peaks" (the complete set of peaks is used), "Promoter_Peaks" (only peaks located within 5 Kb of a TSS are considered), or "Enhancer_Peaks" (peaks excluding the promoter peaks).
426 | 
427 | 
428 | F12: Motif_MACS2_Ext_*${CONTROLSTR}_narrowPeak_Q${FDRTHR}filt
429 | **New in version 2.0:** TF footprinting analysis corresponding to the ATAC-seq peaks stored in F4. Here, ${CONTROLSTR} is either "*_No_Control" or "*_With_Control", depending on the use of a control BAM file in inferring the peaks. ${FDRTHR} is either 0.01 or 0.05.
430 | 
431 | The principle is to extract the peak summits and their surroundings (a number of bp, defined as an offset), and to compute the TF footprinting regions and the underlying motifs within these regions.
432 | 
433 | Within this folder, the file structure is as follows:
434 | Motif_${PEAKS_ANALYZED}_SummitOffset_${OFFSET}/Footprint_HINT_ATAC/${READTYPE}/footprints_HINT_ATAC.bed
435 | 
436 | where,
437 | ${PEAKS_ANALYZED}: can be "Complete_Peaks" (the complete set of peaks) or "Peaks_PvalThr_50" (only peaks with -log10(p-value) > 50 are considered).
438 | ${OFFSET}: can be either 200 or 500; the summit +/- offset bp regions are used for TF footprinting.
439 | ${READTYPE}: can be one of the following:
440 | "all" (all de-duplicated reads in the file f1-8 are considered),
441 | "NFR" (only the nucleosome free reads in the file f1-10 are considered),
442 | "NFRANDNucl" (the NFR regions and the +1M reads, indicated by the file f1-14, are considered).
443 | 
444 | In each case, the output file "footprints_HINT_ATAC.bed" contains the TF footprinting regions.
445 | 
446 | 
447 | Summarizing a set of ATAC-seq samples
448 | ---------------------------------------
449 | 
450 | Suppose a directory "/home/sourya/Results" contains the folders
451 | 1, 2, 3, 4, ..., each corresponding to the output of processing an individual ATAC-seq sample.
452 | 
453 | To get a summarized list of performance metrics for these samples, use the script *Analysis/ResSummary.r* with the following syntax:
454 | 
455 | Rscript ResSummary.r --BaseDir ${BaseDir} --OutDir ${OutDir}
456 | 
457 | where,
458 | 1) ${BaseDir}:
459 | Directory containing the results of all ATAC-seq sample analysis
460 | (like /home/sourya/Results as mentioned above). Mandatory parameter.
461 | 
462 | 2) ${OutDir}:
463 | Output directory to contain the summarized results. Default: current working directory.
464 | 
465 | For details of the ATAC-seq QC measures, the user may check this link:
466 | https://www.encodeproject.org/atac-seq/
467 | 
468 | Upon executing the R script, the following files are created within the specified ${OutDir}:
469 | 
470 | 1) Results_All_Samples_Summary.txt: summarized statistics for all samples.
471 | 2) Field_Description.txt: Summary description of individual fields / parameters.
472 | 3) TotalReadCount_Distribution.html: To be loaded in any web browser. Plot depicting the distribution of total reads for all samples.
473 | 4) Fraction_MappableReadCount_Distribution.html: Fraction of mappable reads for all samples.
474 | 5) Fraction_MitochondrialReadCount_Distribution.html: Fraction of mitochondrial reads for all samples.
475 | 6) Fraction_UniqueMappReadCount_Distribution.html: Fraction of uniquely mapped reads for all samples.
476 | 7) Fraction_LowQualReadCount_Distribution.html: Fraction of low quality reads for all samples.
477 | 8) Fraction_DuplicateReadCount_Distribution.html: Fraction of duplicate reads for all samples.
478 | 9) NRF_Distribution.html: NRF for all samples.
479 | 10) M1_Distribution.html: M1 metric for all samples.
480 | 11) M2_Distribution.html: M2 metric for all samples.
481 | 12) PBC1_Distribution.html: PBC1 metric for all samples.
482 | 13) PBC2_Distribution.html: PBC2 metric for all samples.
483 | 14) FRiP_Def_NoCtrl_Distribution.html: FRiP statistics for MACS2 peaks with the default command, and without using any control BAM files.
484 | 15) NumPeak_Def_NoCtrl_Distribution.html: Number of MACS2 peaks with the default command, and without using any control BAM files.
485 | 16) FRiP_Ext_NoCtrl_Distribution.html: FRiP statistics for MACS2 peaks with the --extsize option (recommended), and without using any control BAM files.
486 | 17) NumPeak_Ext_NoCtrl_Distribution.html: Number of MACS2 peaks with the --extsize option (recommended), and without using any control BAM files.
487 | 18) FRiP_Def_Ctrl_Distribution.html: FRiP statistics for MACS2 peaks with the default command, and when one or more control BAM files are used.
488 | 19) NumPeak_Def_Ctrl_Distribution.html: Number of MACS2 peaks with the default command, and when one or more control BAM files are used.
489 | 20) FRiP_Ext_Ctrl_Distribution.html: FRiP statistics for MACS2 peaks with the --extsize option (recommended), and when one or more control BAM files are used.
490 | 21) NumPeak_Ext_Ctrl_Distribution.html: Number of MACS2 peaks with the --extsize option (recommended), and when one or more control BAM files are used.
491 | 
492 | Command for executing IDR codes
493 | ---------------------------------
494 | 
495 | The current pipeline supports IDR analysis between either a list of ATAC-seq peak files
496 | or a list of alignment (BAM) files. In the second case, the BAM files are first
497 | subsampled to contain an equal number of reads (the minimum number of reads
498 | among the inputs), and subsequently, peaks are estimated from these
499 | (subsampled) BAM files using MACS2. These peaks are then used for the IDR analysis.
500 | 
501 | The script "sample_IDRScript.sh" included within this package
502 | shows how to call the following two scripts (both are included within the folder
503 | "IDR_Codes"):
504 | 
505 | 1) IDRMain.sh
506 | 
507 | 2) IDR_SubSampleBAM_Main.sh
508 | 
509 | The first script, IDRMain.sh, performs IDR between two or more
510 | input peak files (we have used peaks estimated by MACS2). The parameters
511 | corresponding to this script are as follows:
512 | 
513 | -I InpFile
514 | A list of input peak files (obtained from MACS2 - in .narrowPeak or .narrowPeak.gz format).
515 | At least two peak files are required.
516 | 
517 | -P PathIDRCode
518 | Path of the IDRCode package (Kundaje et al.) after its installation.
519 | Please check the "Required packages" section for the details.
520 | 
521 | -d OutDir
522 | Output directory (absolute path preferred) which will store the IDR results.
523 | 
524 | -n PREFIX
525 | Prefix of output files. Default 'IDR_ATAC'.
526 | 
527 | A sample execution of this script is as follows:
528 | 
529 | ./IDRMain.sh -I peak1.narrowPeak -I peak2.narrowPeak -I peak3.narrowPeak -P /home/sourya/packages/idrCode/ -d /home/sourya/OutDir_IDR -n 'IDR_test'
530 | 
531 | 
532 | 
533 | The second script, IDR_SubSampleBAM_Main.sh, takes as input two or more BAM files,
534 | estimates peaks from these BAM files, and then performs the IDR analysis. The parameters
535 | corresponding to this script are as follows:
536 | 
537 | -I InpFile
538 | A list of input BAM files. At least two BAM files are required.
539 | 
540 | -P PathIDRCode
541 | Path of the IDRCode package (Kundaje et al.) after its installation.
542 | Please check the "Required packages" section for the details.
543 | 
544 | -d OutDir
545 | Output directory (absolute path preferred) which will store the IDR results.
546 | 
547 | -n PREFIX
548 | Prefix of output files. Default 'IDR_ATAC'.
549 | 
550 | -c CountPeak
551 | Number of peaks in both replicates that will be compared for the IDR analysis.
552 | Default 25000.
553 | 
554 | -C CONTROLBAM
555 | Control file (either a .bam file or a tagalign file in .gz format)
556 | used to estimate the peaks with MACS2. The user may leave this field
557 | blank if no control file is available.
558 | 
559 | A sample execution of this script is as follows:
560 | 
561 | ./IDR_SubSampleBAM_Main.sh -I inpfile1.bam -I inpfile2.bam -P /home/sourya/packages/idrCode/ -d /home/sourya/OutDir_IDR -n 'IDR_test' -c 25000 -C control.bam
562 | 
563 | 
564 | Describing output of IDR analysis
565 | ----------------------------------
566 | 
567 | In the output directory "OutDir" specified in the IDR script, the following
568 | files (f) and folders (F) exist:
569 | 
570 | F1: Folders named ${i}_and_${j}, where 0 <= i < j <= N-1 and N is
571 | the number of replicates analyzed. Individual folders contain the results of the
572 | pairwise IDR analysis. For example, the folder 0_and_1 contains the IDR analysis
573 | for sample 0 (first replicate) and sample 1 (second replicate).
574 | 
575 | f1: "Replicate_Names.txt": names of the replicate samples used for the IDR analysis.
576 | 
577 | f2: Input_Peak_Statistics.txt: number of peaks in each of the compared replicates.
578 | 
579 | f3: IDR_Batch_Plot-plot.pdf: final IDR plot. Here the individual pairs (whose results
580 | are stored in the above mentioned folders) are numbered 1, 2, ...
581 | Considering N = 3, the number of possible pairs is also 3. Here,
582 | the number 1 denotes the folder (pair) 0_and_1,
583 | 2 denotes the folder (pair) 0_and_2, and 3 denotes the
584 | folder (pair) 1_and_2.
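For reference, the following small shell sketch (illustrative only; it is not part of the pipeline, and N=3 is just an example value) enumerates this mapping between the plot indices and the pair folders for any number of replicates N:

N=3
k=1
for (( i=0; i<N-1; i++ )); do
  for (( j=i+1; j<N; j++ )); do
    echo "plot index $k -> folder ${i}_and_${j}"
    k=$((k+1))
  done
done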
585 | 
586 | 
587 | Contact
588 | -----------
589 | 
590 | For any queries, please generate a GitHub issue, or alternatively, e-mail us:
591 | 
592 | Sourya Bhattacharyya (sourya@lji.org)
593 | 
594 | Ferhat Ay (ferhatay@lji.org)
595 | 
596 | Pandurangan Vijayanand (vijay@lji.org)
597 | 
598 | 
--------------------------------------------------------------------------------
/bin/ATACSeqQC.r:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | 
3 | #==================================
4 | # used for ATAC seq quality analysis
5 | # adapted from the link:
6 | # https://bioconductor.org/packages/release/bioc/vignettes/ATACseqQC/inst/doc/ATACseqQC.html
7 | 
8 | # check the code Install_Bioconductor_Packages_using_BioCLite.R
9 | # to find out the required packages
10 | 
11 | #==================================
12 | # author: Sourya Bhattacharyya
13 | # Vijay-AY lab
14 | #==================================
15 | 
16 | suppressPackageStartupMessages({
17 | library(ATACseqQC)
18 | library(ChIPpeakAnno)
19 | library(MotifDb)
20 | library(GenomicRanges)
21 | library(GenomicAlignments)
22 | library(optparse)
23 | })
24 | 
25 | #===========================================================
26 | option_list = list(
27 | make_option(c("--AlignFile"), type="character", default=NULL, help="Input alignment file. Must be accompanied by an index file (generated by samtools)."),
28 | make_option(c("--RefGenome"), type="character", default=NULL, help="Reference genome name."),
29 | make_option(c("--OutDirQC"), type="character", default=NULL, help="Output directory to contain the QC related statistics."),
30 | make_option(c("--PE"), type="integer", action="store", default=0, help="If 1, indicates paired end input data. Default = 0")
31 | );
32 | 
33 | opt_parser = OptionParser(option_list=option_list);
34 | opt = parse_args(opt_parser);
35 | 
36 | # dynamic loading of libraries based on the reference genome
37 | if (opt$RefGenome == 'hg19') {
38 | library(BSgenome.Hsapiens.UCSC.hg19)
39 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
40 | library(phastCons100way.UCSC.hg19)
41 | } else if (opt$RefGenome == 'hg38') {
42 | library(BSgenome.Hsapiens.UCSC.hg38)
43 | library(TxDb.Hsapiens.UCSC.hg38.knownGene)
44 | library(phastCons100way.UCSC.hg38)
45 | } else if (opt$RefGenome == 'mm10') {
46 | library(BSgenome.Mmusculus.UCSC.mm10)
47 | library(TxDb.Mmusculus.UCSC.mm10.knownGene)
48 | library(phastCons60way.UCSC.mm10)
49 | } else if (opt$RefGenome == 'mm9') {
50 | library(BSgenome.Mmusculus.UCSC.mm9)
51 | library(TxDb.Mmusculus.UCSC.mm9.knownGene)
52 | library(phastCons60way.UCSC.mm9)
53 | }
54 | 
55 | 
56 | # create the output directory for QC
57 | outdir <- opt$OutDirQC
58 | system(paste("mkdir -p", outdir))
59 | 
60 | # input the bamFile from the ATACseqQC package
61 | # bamfile <- system.file("extdata", inpbamfilename, package="ATACseqQC", mustWork=TRUE)
62 | bamfile <- opt$AlignFile
63 | bamfile.labels <- gsub(".bam", "", basename(bamfile))
64 | 
65 | # store the current directory
66 | currdir <- getwd()
67 | 
68 | # go to the directory "outdir"
69 | setwd(outdir)
70 | 
71 | #=====================
72 | # Check alignment metrics and mapping quality
73 | bam_QC_textfile <- paste0(outdir, '/Summary_QC.txt')
74 | if (file.exists(bam_QC_textfile) == FALSE) {
75 | fp_out <- file(bam_QC_textfile, "w")
76 | outtext <- paste0("\n *** Quality control measures for the alignment file ", opt$AlignFile, " are **** \n")
77 | writeLines(outtext, con=fp_out, sep="\n")
78 | close(fp_out)
79 | # quality control summary
80 | capture.output(bamQC(bamfile, outPath=NULL), file=bam_QC_textfile, append=TRUE)
81 | cat(sprintf("\n *** Computed the summary statistics of the input ATAC seq file **** \n"))
82 | }
83 | 
84 | # #=====================
85 | # Estimate the library complexity
86 | # sourya - commented since old bioconductor version does not support this function
87 | if (0) {
88 | plotfile <- paste0(outdir, '/Library_Complexity.pdf')
89 | pdf(plotfile, width=6, height=4)
90 | estimateLibComplexity(readsDupFreq(bamfile))
91 | dev.off()
92 | cat(sprintf("\n *** Computed library complexity **** \n"))
93 | }
94 | 
95 | #=====================
96 | # works only when the input is single end
97 | if (opt$PE == 0) {
98 | # shift the BAM file - forward strand by +4 bp
99 | possibleTag <- c("AS", "XN", "XM", "XO", "XG", "NM", "MD", "YS", "YT")
100 | gal <- readBamFile(bamfile, asMates=FALSE)
101 | shiftedBamfile <- paste0(outdir, '/shifted.bam')
102 | gal1 <- shiftGAlignmentsList(gal)
103 | export(gal1, shiftedBamfile)
104 | cat(sprintf("\n *** shifted bam file **** \n"))
105 | }
106 | 
107 | #=====================
108 | # works only when the input is paired end
109 | if (opt$PE == 1) {
110 | 
111 | # fragment size distribution (main QC metric)
112 | plotfile <- paste0(outdir, '/Fragment_Size_Distribution.pdf')
113 | if (file.exists(plotfile) == FALSE) {
114 | pdf(plotfile, width=6, height=4)
115 | fragSizeDist(bamfile, bamfile.labels)
116 | dev.off()
117 | cat(sprintf("\n *** Computed the fragment size distribution (paired end read) **** \n"))
118 | }
119 | 
120 | # prepare the tags
121 | # obtained from:
122 | # https://bioinformatics-core-shared-training.github.io/cruk-summer-school-2019/ChIPSeq/Materials/Practicals/Day5/Practical01_ATAC-seq_analysis_SS.html
123 | # and from
124 | # https://bioconductor.org/packages/release/bioc/vignettes/ATACseqQC/inst/doc/ATACseqQC.html
125 | 
126 | # option 1: all combination
127 | # possibleTag <- combn(LETTERS, 2)
128 | # possibleTag <- c(paste0(possibleTag[1, ], possibleTag[2, ]), paste0(possibleTag[2, ], possibleTag[1, ]))
129 | # cat(sprintf("\n length of possibleTag : %s ", length(possibleTag)))
130 | 
131 | # option 2: specified combinations
132 | possibleTag <- c("AS", "XN", "XM", "XO", "XG", "NM", "MD", "YS", "YT")
133 | 
134 | # prepare the seqlev input
135 | # by default seqlev = paste0("chr", c(1:22, "X", "Y"))
136 | # but it should be checked with the BAM header
137 | seqlevset_default <- paste0("chr", c(1:22, "X", "Y"))
138 | tempBAMHeaderFile <- paste0(outdir, '/t.bed')
139 | system(paste("samtools view -H ", bamfile, " | awk -F\'[\t]\' \'($1==\"@SQ\")\' - > ", tempBAMHeaderFile))
140 | nline <- as.integer(system(paste("cat", tempBAMHeaderFile, "| wc -l"), intern = TRUE))
141 | if (nline > 0) {
142 | x <- read.table(tempBAMHeaderFile, header=F, sep="\t", stringsAsFactors=F)
143 | y <- x[,2]
144 | z <- as.vector(unlist(strsplit(y, ":")))
145 | seqlevset_final <- intersect(seqlevset_default, z)
146 | cat(sprintf("\n ==>> chromosomes considered : %s ", paste(as.vector(seqlevset_final), sep="\t")))
147 | } else {
148 | seqlevset_final <- seqlevset_default
149 | }
150 | system(paste("rm", tempBAMHeaderFile))
151 | 
152 | # compute promoter transcript body score
153 | if (opt$RefGenome == 'hg19') {
154 | txs <- transcripts(TxDb.Hsapiens.UCSC.hg19.knownGene)
155 | } else if (opt$RefGenome == 'hg38') {
156 | txs <- transcripts(TxDb.Hsapiens.UCSC.hg38.knownGene)
157 | } else if (opt$RefGenome == 'mm10') {
158 | txs <-
transcripts(TxDb.Mmusculus.UCSC.mm10.knownGene) 159 | } else if (opt$RefGenome == 'mm9') { 160 | txs <- transcripts(TxDb.Mmusculus.UCSC.mm9.knownGene) 161 | } 162 | cat(sprintf("\n ==>> read transcript information ")) 163 | 164 | for (chrIdx in 1:length(seqlevset_final)) { 165 | currChr_seqlev <- seqlevset_final[chrIdx] 166 | cat(sprintf("\n extracting nucleosome specific BAM - chromosome : %s ", currChr_seqlev)) 167 | 168 | # extract current chromosome specific genome information 169 | if ((opt$RefGenome == 'hg19') | (opt$RefGenome == 'hg38')) { 170 | which_currChr <- as(seqinfo(Hsapiens)[currChr_seqlev], "GRanges") 171 | } else if ((opt$RefGenome == 'mm10') | (opt$RefGenome == 'mm9')) { 172 | which_currChr <- as(seqinfo(Mmusculus)[currChr_seqlev], "GRanges") 173 | } 174 | 175 | # read the BAM file for the current chromosome 176 | # and shift it in both ends 177 | gal <- readBamFile(bamfile, tag=possibleTag, which=which_currChr, asMates=TRUE) 178 | cat(sprintf("\n read bam file for the current chromosome : %s ", currChr_seqlev)) 179 | 180 | # shift the GAlignmentsLists by 5' ends. 181 | # All reads aligning to the positive strand will be offset by +4bp, 182 | # and all reads aligning to the negative strand will be offset -5bp by default. 183 | shiftedBamfile <- file.path(outdir, 'shifted.bam') 184 | gal1 <- shiftGAlignmentsList(gal) 185 | export(gal1, shiftedBamfile) 186 | cat(sprintf("\n shifted bam file for the current chromosome : %s ", currChr_seqlev)) 187 | 188 | # following code is to get the nucleosome specific 189 | # and nucleosome free regions 190 | # but this is giving some problems - comment - sourya 191 | if (0) { 192 | 193 | # extract transcripts for the current chromosome 194 | txs_currChr_seqlev <- txs[seqnames(txs) %in% currChr_seqlev] 195 | cat(sprintf("\n extracted transcripts for the current chromosome : %s ", currChr_seqlev)) 196 | 197 | # call the split BAM routine for the current chromosome 198 | if (opt$RefGenome == 'hg19') { 199 | objs <- splitGAlignmentsByCut(gal1, txs=txs_currChr_seqlev, genome=Hsapiens, conservation=phastCons100way.UCSC.hg19) 200 | } else if (opt$RefGenome == 'hg38') { 201 | objs <- splitGAlignmentsByCut(gal1, txs=txs_currChr_seqlev, genome=Hsapiens, conservation=phastCons100way.UCSC.hg38) 202 | } else if (opt$RefGenome == 'mm10') { 203 | objs <- splitGAlignmentsByCut(gal1, txs=txs_currChr_seqlev, genome=Mmusculus, conservation=phastCons60way.UCSC.mm10) 204 | } else if (opt$RefGenome == 'mm9') { 205 | objs <- splitGAlignmentsByCut(gal1, txs=txs_currChr_seqlev, genome=Mmusculus, conservation=phastCons60way.UCSC.mm9) 206 | } 207 | cat(sprintf("\n created objs for the current chromosome : %s ", currChr_seqlev)) 208 | 209 | # export the binned alignments into bam files. 
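# (writeListOfGAlignments, called next, writes one BAM file per fragment category: NucleosomeFree, mono-/di-/tri-nucleosome, inter1-3, and others)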
210 | null <- writeListOfGAlignments(objs, outdir) 211 | cat(sprintf("\n extracted nucleosome specific reads for the current chromosome : %s ", currChr_seqlev)) 212 | 213 | # now rename the files so that chromosome specific information is preserved 214 | system(paste("mv NucleosomeFree.bam", paste0("NucleosomeFree_", currChr_seqlev, ".bam"))) 215 | system(paste("mv NucleosomeFree.bam.bai", paste0("NucleosomeFree_", currChr_seqlev, ".bam.bai"))) 216 | system(paste("mv mononucleosome.bam", paste0("mononucleosome_", currChr_seqlev, ".bam"))) 217 | system(paste("mv mononucleosome.bam.bai", paste0("mononucleosome_", currChr_seqlev, ".bam.bai"))) 218 | system(paste("mv dinucleosome.bam", paste0("dinucleosome_", currChr_seqlev, ".bam"))) 219 | system(paste("mv dinucleosome.bam.bai", paste0("dinucleosome_", currChr_seqlev, ".bam.bai"))) 220 | system(paste("mv trinucleosome.bam", paste0("trinucleosome_", currChr_seqlev, ".bam"))) 221 | system(paste("mv trinucleosome.bam.bai", paste0("trinucleosome_", currChr_seqlev, ".bam.bai"))) 222 | system(paste("mv inter1.bam", paste0("inter1_", currChr_seqlev, ".bam"))) 223 | system(paste("mv inter1.bam.bai", paste0("inter1_", currChr_seqlev, ".bam.bai"))) 224 | system(paste("mv inter2.bam", paste0("inter2_", currChr_seqlev, ".bam"))) 225 | system(paste("mv inter2.bam.bai", paste0("inter2_", currChr_seqlev, ".bam.bai"))) 226 | system(paste("mv inter3.bam", paste0("inter3_", currChr_seqlev, ".bam"))) 227 | system(paste("mv inter3.bam.bai", paste0("inter3_", currChr_seqlev, ".bam.bai"))) 228 | system(paste("mv others.bam", paste0("others_", currChr_seqlev, ".bam"))) 229 | system(paste("mv others.bam.bai", paste0("others_", currChr_seqlev, ".bam.bai"))) 230 | 231 | } # end dummy if 232 | # end code comment - sourya 233 | 234 | # only rename the shifted files 235 | system(paste("mv shifted.bam", paste0("shifted_", currChr_seqlev, ".bam"))) 236 | system(paste("mv shifted.bam.bai", paste0("shifted_", currChr_seqlev, ".bam.bai"))) 237 | 238 | } # end chromosome loop 239 | 240 | cat(sprintf("\n *** Split alignments **** \n")) 241 | 242 | # sourya - commented since old bioconductor version does not support this function 243 | if (0) { 244 | pt <- PTscore(gal1, txs) 245 | plotfile <- paste0(outdir, '/promoter_transcript_body_score.pdf') 246 | pdf(plotfile, width=6, height=4) 247 | plot(pt$log2meanCoverage, pt$PT_score, xlab="log2 mean coverage", ylab="Promoter vs Transcript") 248 | dev.off() 249 | } 250 | 251 | # Nucleosome Free Regions (NFR) score 252 | # sourya - commented since old bioconductor version does not support this function 253 | if (0) { 254 | nfr <- NFRscore(gal1, txs) 255 | plotfile <- paste0(outdir, '/Nucleosome_Free_Regions_score.pdf') 256 | pdf(plotfile, width=6, height=4) 257 | plot(nfr$log2meanCoverage, nfr$NFR_score, xlab="log2 mean coverage", ylab="Nucleosome Free Regions score", main="NFRscore for 200bp flanking TSSs", xlim=c(-10, 0), ylim=c(-5, 5)) 258 | dev.off() 259 | } 260 | 261 | # Transcription Start Site (TSS) Enrichment Score 262 | # sourya - commented since old bioconductor version does not support this function 263 | if (0) { 264 | tsse <- TSSEscore(gal1, txs) 265 | capture.output(summary(tsse$TSS.enrichment.score), file=paste0(outdir, '/Summary_TSS_Enrichment_Score.txt'), append=FALSE) 266 | } 267 | 268 | 269 | # Heatmap and coverage curve for nucleosome positions 270 | # sourya - commented since old bioconductor version does not support this function 271 | if (0) { 272 | nucleosome_positions_bamfiles <- file.path(outdir, 
c("NucleosomeFree.bam", "mononucleosome.bam", "dinucleosome.bam", "trinucleosome.bam")) 273 | plotfile <- paste0(outdir, '/Nucleosome_cumulative_Percentage.pdf') 274 | pdf(plotfile, width=6, height=4) 275 | if ((opt$RefGenome == 'hg19') | (opt$RefGenome == 'hg38')) { 276 | cumulativePercentage(nucleosome_positions_bamfiles[1:2], as(seqinfo(Hsapiens), "GRanges")) 277 | } else if (opt$RefGenome == 'mm10') { 278 | cumulativePercentage(nucleosome_positions_bamfiles[1:2], as(seqinfo(Mmusculus), "GRanges")) 279 | } else if (opt$RefGenome == 'mm9') { 280 | cumulativePercentage(nucleosome_positions_bamfiles[1:2], as(seqinfo(Mmusculus), "GRanges")) 281 | } 282 | dev.off() 283 | cat(sprintf("\n *** Performed function - Nucleosome_cumulative_Percentage **** \n")) 284 | } 285 | 286 | # TSS statistics 287 | # sourya - commented since old bioconductor version does not support this function 288 | if (0) { 289 | TSS <- promoters(txs, upstream=0, downstream=1) 290 | TSS <- unique(TSS) 291 | # estimate the library size for normalization 292 | (librarySize <- estLibSize(nucleosome_positions_bamfiles)) 293 | # calculate the signals around TSSs 294 | NTILE <- 101 295 | dws <- ups <- 1010 296 | sigs <- enrichedFragments(gal=objs[c("NucleosomeFree", "mononucleosome", "dinucleosome", "trinucleosome")], TSS=TSS, librarySize=librarySize, TSS.filter=0.5, n.tile = NTILE, upstream = ups, downstream = dws) 297 | # log2 transformed signals 298 | sigs.log2 <- lapply(sigs, function(.ele) log2(.ele+1)) 299 | # plot heatmap 300 | plotfile <- paste0(outdir, '/Heatmap_signal_around_TSS.pdf') 301 | pdf(plotfile, width=6, height=4) 302 | featureAlignedHeatmap(sigs.log2, reCenterPeaks(TSS, width=ups+dws), zeroAt=.5, n.tile=NTILE) 303 | dev.off() 304 | cat(sprintf("\n *** Performed featureAlignedHeatmap **** \n")) 305 | 306 | # get signals normalized for nucleosome-free and nucleosome-bound regions 307 | out_Align_Distr <- featureAlignedDistribution(sigs, reCenterPeaks(TSS, width=ups+dws), zeroAt=.5, n.tile=NTILE, type="l", ylab="Averaged coverage") 308 | # rescale the nucleosome-free and nucleosome signals to 0~1 309 | range01 <- function(x){(x-min(x))/(max(x)-min(x))} 310 | out_Align_Distr <- apply(out_Align_Distr, 2, range01) 311 | plotfile <- paste0(outdir, '/nucleosome_free_and_nucleosome_signals.pdf') 312 | pdf(plotfile, width=6, height=4) 313 | matplot(out_Align_Distr, type="l", xaxt="n", xlab="Position (bp)", ylab="Fraction of signal") 314 | axis(1, at=seq(0, 100, by=10)+1, labels=c("-1K", seq(-800, 800, by=200), "1K"), las=2) 315 | abline(v=seq(0, 100, by=10)+1, lty=2, col="gray") 316 | dev.off() 317 | cat(sprintf("\n *** Performed featureAlignedDistribution **** \n")) 318 | } 319 | 320 | } # paired end condition 321 | 322 | # return to the original directory 323 | setwd(currdir) 324 | 325 | -------------------------------------------------------------------------------- /bin/BigWigTrackCreate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | #PBS -l nodes=1:ppn=4 3 | #PBS -l mem=10GB 4 | #PBS -l walltime=24:00:00 5 | #PBS -m ae 6 | #PBS -j eo 7 | #PBS -V 8 | source ~/.bashrc 9 | #source ~/.bash_profile 10 | hostname 11 | TMPDIR=/scratch 12 | cd $PBS_O_WORKDIR 13 | 14 | #================================= 15 | # this program denotes a sample pipeline for ATAC-seq data 16 | # applicable only a single fastq or alignment file is provided 17 | #================================= 18 | # developed by - Sourya Bhattacharyya 19 | # Vijay-AY lab 20 | # La Jolla Institute 
21 | #=================================
22 | 
23 | # usage info
24 | usage(){
25 | cat << EOF
26 | 
27 | usage:
28 | 
29 | Options:
30 | 
31 | -- required:
32 | -I InpFile Input alignment file (Bowtie2 aligned)
33 | -n PREFIX Prefix of output files.
34 | -d OutDir Set the output directory which will contain all the results
35 | -w BigWigGenome The reference genome which is used to convert the BAM file to a BigWig file (such as 'hg19', 'mm9', etc.)
36 | 
37 | EOF
38 | }
39 | 
40 | while getopts "n:I:d:w:" opt;
41 | do
42 | case "$opt" in
43 | n) PREFIX=$OPTARG;;
44 | I) InpFile=$OPTARG;;
45 | d) OutDir=$OPTARG;;
46 | w) BigWigGenome=$OPTARG;;
47 | \?) usage
48 | echo "error: unrecognized option -$OPTARG";
49 | exit 1
50 | ;;
51 | esac
52 | done
53 | 
54 | # executable to convert the sorted bam file to the bigwig format
55 | BigWigCreateExec='/home/sourya/proj/utils/bam_to_bigwig.sh'
56 | 
57 | #======================
58 | # convert the alignment file to the bigwig data format
59 | # for track visualization
60 | #======================
61 | BigWigoutdir=$OutDir'/Out_BigWig'
62 | mkdir -p $BigWigoutdir
63 | 
64 | # we use the sorted (before duplicate removal) bam file
65 | $BigWigCreateExec -I $InpFile -g $BigWigGenome -d $BigWigoutdir -n $PREFIX
66 | 
--------------------------------------------------------------------------------
/bin/CorrelationBAMPeak.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #=================================
4 | # this program is a supporting script for the ATAC seq pipeline
5 | 
6 | # inputs:
7 | # 1) a set of input bam files (requires sorted bam files, and possibly indexed as well)
8 | # 2) a set of peak files
9 | # 3) max no of peaks to be considered
10 | 
11 | # the union of the given peak files is used to compute the coverage with respect to the individual input bam files;
12 | # subject to a minimum coverage (number of reads) threshold
13 | # and a given threshold on the max number of peaks to be considered,
14 | # the bam files are subsampled to cover only the selected subset of the union peak set,
15 | # and the correlation between these subsampled bam files is computed
16 | 
17 | #=================================
18 | # developed by - Sourya Bhattacharyya
19 | # Vijay-AY lab
20 | # La Jolla Institute for Allergy and Immunology
21 | #=================================
22 | 
23 | 
24 | # usage info
25 | usage(){
26 | cat << EOF
27 | 
28 | Options:
29 | 
30 | -- required:
31 | -B BAM One or more input bam files (sorted)
32 | -P NarrowPeak One or more input narrow peak files (corresponding to the input bam files)
33 | and in the same order as the input bam files
34 | -L Labels One or more strings (labels) corresponding to the input bam files.
35 | -D OutDir Output directory storing the correlation results
36 | -r ReadCount A threshold (integer) on the minimum number of reads (coverage) that each peak should
37 | contain. Default = 5 (according to the Greenleaf 2018 paper)
38 | -c PeakCount Number of peaks to be randomly selected from the union set of peaks.
39 | Default = 50000 (according to the Greenleaf 2018 paper)
40 | -O Overwrite this boolean option signifies whether existing output files would
41 | be overwritten (1) or not (0).
42 | Default = 0
43 | 
44 | EOF
45 | }
46 | 
47 | # default minimum coverage threshold for each peak
48 | ReadCountThr=5
49 | 
50 | # default threshold of the number of peaks
51 | PeakCountThr=50000
52 | 
53 | # default output directory
54 | OutDir=`pwd`'/'
55 | 
56 | # this boolean option signifies whether existing output
57 | # files would be overwritten (1) or not (0).
58 | # Default = 0
59 | Overwrite=0
60 | 
61 | while getopts "B:P:D:r:c:O:L:" opt;
62 | do
63 | case "$opt" in
64 | B) BAMFILES+=($OPTARG);; # one or more bam input files can be provided
65 | P) PEAKFILES+=($OPTARG);; # one or more peak input files can be provided
66 | L) Labels+=($OPTARG);; # labels corresponding to individual input bam files
67 | D) OutDir=$OPTARG;;
68 | r) ReadCountThr=$OPTARG;;
69 | c) PeakCountThr=$OPTARG;;
70 | O) Overwrite=$OPTARG;;
71 | \?) usage
72 | echo "error: unrecognized option -$OPTARG";
73 | exit 1
74 | ;;
75 | esac
76 | done
77 | 
78 | 
79 | #----------------------------------
80 | # important - sourya
81 | # change the current directory as the dir containing this executable
82 | # since other source files relative to the current directory need to be called
83 | current_dir=$(pwd)
84 | script_dir=$(dirname $0)
85 | cd $script_dir
86 | #----------------------------------
87 | 
88 | nbamfiles=${#BAMFILES[@]}
89 | echo 'number of bam files provided: '$nbamfiles
90 | 
91 | npeakfiles=${#PEAKFILES[@]}
92 | echo 'number of peak files provided: '$npeakfiles
93 | 
94 | nlabels=${#Labels[@]}
95 | echo 'number of labels provided: '$nlabels
96 | 
97 | # if [[ $nbamfiles != $npeakfiles ]]; then
98 | # echo "Number of input bam files and the number of peak files do not match - return !!!"
99 | # exit 1
100 | # fi
101 | 
102 | # check if the input bam files are all indexed
103 | # otherwise index the bam files
104 | listbamfiles=''
105 | for (( i=0; i<${nbamfiles}; i++ ));
106 | do
107 | currbamfile=${BAMFILES[i]}
108 | if [[ $i == 0 ]]; then
109 | listbamfiles=$currbamfile
110 | else
111 | listbamfiles=$listbamfiles' '$currbamfile
112 | fi
113 | echo 'processing the bam file index: '$i' name: '$currbamfile
114 | if [ !
-f $currbamfile'.bai' ]; then 115 | samtools index $currbamfile 116 | fi 117 | done 118 | echo 'listbamfiles: '$listbamfiles 119 | 120 | # list of labels 121 | if [[ $nlabels == $nbamfiles ]]; then 122 | listlabels='' 123 | # also required a colon separated list 124 | listlabelsRscript='' 125 | for (( i=0; i<${nlabels}; i++ )); 126 | do 127 | if [[ $i == 0 ]]; then 128 | listlabels=${Labels[i]} 129 | listlabelsRscript=${Labels[i]} 130 | else 131 | listlabels=$listlabels' '${Labels[i]} 132 | listlabelsRscript=$listlabelsRscript':'${Labels[i]} 133 | fi 134 | done 135 | echo 'listlabels: '$listlabels 136 | echo 'listlabelsRscript: '$listlabelsRscript 137 | fi 138 | 139 | # list of peak files (if provided) 140 | if [[ $npeakfiles == $nbamfiles ]]; then 141 | listpeakfiles='' 142 | for (( i=0; i<${npeakfiles}; i++ )); 143 | do 144 | echo 'processing the peak file index: '$i' name: '${PEAKFILES[i]} 145 | if [[ $i == 0 ]]; then 146 | listpeakfiles=${PEAKFILES[i]} 147 | else 148 | listpeakfiles=$listpeakfiles' '${PEAKFILES[i]} 149 | fi 150 | done 151 | echo 'listpeakfiles: '$listpeakfiles 152 | fi 153 | 154 | #============================= 155 | # two cases: 156 | # 1) when peak files are provided, and the subsampled union of peaks are used for correlation 157 | # 2) or, when peaks are not provided, and whole bam files are used for correlation 158 | #============================== 159 | if [[ $npeakfiles == $nbamfiles ]]; then 160 | 161 | # union of the input peak files 162 | UnionPeakFile=$OutDir'/Union_Peaks_Original.bed' 163 | 164 | echo '***** before computing '$UnionPeakFile' *****' 165 | 166 | if [[ ! -f $UnionPeakFile || $Overwrite == 1 ]]; then 167 | cat $listpeakfiles | cut -f1-3 | sort -k1,1 -k2,2n | mergeBed -i stdin > $UnionPeakFile 168 | fi 169 | 170 | echo '***** after computing '$UnionPeakFile' *****' 171 | 172 | # now perform the coverage of this union peaks 173 | # with respect to input bam files 174 | coverageoutfile=$OutDir'/Union_Peaks_Original_CoverageVal.bed' 175 | 176 | echo '***** before computing '$coverageoutfile' *****' 177 | 178 | if [[ ! 
-f $coverageoutfile || $Overwrite == 1 ]]; then
179 | # earlier command - sourya
180 | # bedtools multicov -bams ${listbamfiles} -bed ${UnionPeakFile} > ${coverageoutfile}
181 | 
182 | # modified command - sourya
183 | # we found that supplying all of the bam files together in the "multicov" function
184 | # results in errors, probably due to mismatching headers in different bam files
185 | # so we supply one bam file at a time,
186 | # compute the coverage with respect to individual bam files
187 | # and sequentially merge all the information
188 | for (( i=0; i<${nbamfiles}; i++ ));
189 | do
190 | # temp output file
191 | tempout=$OutDir'/temp_out_union_peaks_coverage.bed'
192 | bedtools multicov -bams ${BAMFILES[i]} -bed ${UnionPeakFile} > $tempout
193 | if [ $i == 0 ]; then
194 | # first iteration - rename the temporary output file to the
195 | # final output file
196 | mv $tempout $coverageoutfile
197 | else
198 | # subsequent iteration
199 | # use bedtools map function to merge the existing contents
200 | # of "coverageoutfile" with the new "tempout" contents
201 | tempout2=$OutDir'/temp_out_union_peaks_coverage2.bed'
202 | bedtools map -c 4 -o mean -null '0' -a $coverageoutfile -b $tempout > $tempout2
203 | # remove the old instance of coverage output file
204 | # and use the newly constructed file
205 | rm $coverageoutfile
206 | mv $tempout2 $coverageoutfile
207 | fi
208 | done
209 | # remove the temporary files
210 | if [ -f $tempout ]; then
211 | rm $tempout
212 | fi
213 | if [ -f $tempout2 ]; then
214 | rm $tempout2
215 | fi
216 | fi
217 | 
218 | echo '***** after computing '$coverageoutfile' *****'
219 | 
220 | coverageoutfileThr=$OutDir'/Union_Peaks_CoverageVal_MinReadThr.bed'
221 | 
222 | echo '***** before computing '$coverageoutfileThr' *****'
223 | 
224 | # select only those peaks which have coverage value >= ReadCountThr
225 | # for all the bam files considered (coverage values occupy columns 4 onward, one per BAM file)
226 | if [[ ! -f $coverageoutfileThr || $Overwrite == 1 ]]; then
227 | awk -v T="$ReadCountThr" -v N="$npeakfiles" '{f=0; for (i=0;i<N;i++) {if ($(4+i)>=T) {f++}}; if (f==N) {print $0}}' $coverageoutfile > $coverageoutfileThr
228 | fi
229 | 
230 | echo '***** after computing '$coverageoutfileThr' *****'
231 | 
232 | # subset of the peaks (randomly selected)
233 | # such that the total number of peaks = PeakCountThr
234 | # check if the coverage thresholded peaks have a higher number of peaks
235 | # than the mentioned threshold
236 | coverageoutfileThrSubSample=$OutDir'/Union_Peaks_CoverageVal_MinReadThr_Subsampled_'$PeakCountThr'.bed'
237 | 
238 | echo '***** before computing '$coverageoutfileThrSubSample' *****'
239 | 
240 | if [[ !
-f $coverageoutfileThrSubSample || $Overwrite == 1 ]]; then 241 | npeakAboveThr=`cat $coverageoutfileThr | wc -l` 242 | echo 'npeakAboveThr: '$npeakAboveThr 243 | if [[ $npeakAboveThr -gt $PeakCountThr ]]; then 244 | echo 'Above the mentioned peak count threshold - random subset' 245 | shuf $coverageoutfileThr | head -n $PeakCountThr | cut -f1-3 | sort -k1,1 -k2,2n > $coverageoutfileThrSubSample 246 | else 247 | cat $coverageoutfileThr | cut -f1-3 | sort -k1,1 -k2,2n > $coverageoutfileThrSubSample 248 | fi 249 | fi 250 | 251 | echo '***** after computing '$coverageoutfileThrSubSample' *****' 252 | 253 | # dumping intermediate results 254 | # minimum mapping quality is maintained at 30 255 | OutDumpFile=$OutDir'/results_SubsampledPeak.npz' 256 | 257 | echo '***** before computing '$OutDumpFile' *****' 258 | 259 | # --minMappingQuality 30 260 | 261 | if [[ $nlabels == $nbamfiles ]]; then 262 | multiBamSummary BED-file --BED $coverageoutfileThrSubSample --bamfiles $listbamfiles --labels $listlabels -out $OutDumpFile 263 | else 264 | multiBamSummary BED-file --BED $coverageoutfileThrSubSample --bamfiles $listbamfiles --smartLabels -out $OutDumpFile 265 | fi 266 | 267 | echo '***** after computing '$OutDumpFile' *****' 268 | 269 | # using spearman correlation 270 | 271 | # OutPlotFile=$OutDir'/Correlation_Spearman_SubsampledPeak.pdf' 272 | # OutMatrixFile=$OutDir'/Correlation_Spearman_SubsampledPeak.matrix' 273 | # --plotFile $OutPlotFile --corMethod spearman --outFileCorMatrix $OutMatrixFile 274 | 275 | # --skipZeros --removeOutliers 276 | 277 | OutPlotFileHeatMap=$OutDir'/Correlation_Spearman_SubsampledPeak_Heatmap.pdf' 278 | OutPlotFileScatter=$OutDir'/Correlation_Spearman_SubsampledPeak_Scatterplot.pdf' 279 | if [[ ! -f $OutPlotFileHeatMap || ! -f $OutPlotFileScatter || $Overwrite == 1 ]]; then 280 | plotCorrelation --corData $OutDumpFile --plotFile $OutPlotFileHeatMap --corMethod spearman --whatToPlot heatmap --colorMap RdYlBu --plotNumbers --outFileCorMatrix $OutDir'/Correlation_Spearman_SubsampledPeak_Corr.mat' 281 | plotCorrelation --corData $OutDumpFile --plotFile $OutPlotFileScatter --corMethod spearman --whatToPlot scatterplot 282 | fi 283 | 284 | # # using pearson correlation 285 | # commented - check https://www.biostars.org/p/195328/ 286 | 287 | # # OutPlotFile=$OutDir'/Correlation_Pearson_SubsampledPeak.pdf' 288 | # # OutMatrixFile=$OutDir'/Correlation_Pearson_SubsampledPeak.matrix' 289 | # # --plotFile $OutPlotFile --corMethod pearson --outFileCorMatrix $OutMatrixFile 290 | 291 | # # --skipZeros --removeOutliers 292 | 293 | # OutPlotFileHeatMap=$OutDir'/Correlation_Pearson_SubsampledPeak_Heatmap.pdf' 294 | # OutPlotFileScatter=$OutDir'/Correlation_Pearson_SubsampledPeak_Scatterplot.pdf' 295 | 296 | # if [[ ! -f $OutPlotFileHeatMap || ! 
-f $OutPlotFileScatter || $Overwrite == 1 ]]; then
297 | # plotCorrelation --corData $OutDumpFile --plotFile $OutPlotFileHeatMap --corMethod pearson --whatToPlot heatmap --colorMap RdYlBu --plotNumbers --outFileCorMatrix $OutDir'/Correlation_Pearson_SubsampledPeak_Corr.mat'
298 | # plotCorrelation --corData $OutDumpFile --plotFile $OutPlotFileScatter --corMethod pearson --whatToPlot scatterplot
299 | # fi
300 | 
301 | #========================
302 | # now extract the peak intervals (after subsampling)
303 | # and get the peak intensities for individual input peak files
304 | OnlyPeakSubsample=$OutDir'/Peak_PVal_Subsampled_'$PeakCountThr'.bed'
305 | cat $coverageoutfileThrSubSample | cut -f1-3 > $OnlyPeakSubsample
306 | 
307 | # merge with individual peak input files
308 | for (( i=0; i<${npeakfiles}; i++ ));
309 | do
310 | tempfile=$OutDir'/Peak_PVal_temp.bed'
311 | # the 8th field in the peak file contains the -log10(p) score
312 | bedtools map -c 8 -o mean -null '0' -a $OnlyPeakSubsample -b ${PEAKFILES[i]} > $tempfile
313 | rm $OnlyPeakSubsample
314 | mv $tempfile $OnlyPeakSubsample
315 | done
316 | 
317 | # now call an R script which plots the correlation
318 | # for these peaks
319 | if [[ $nlabels == $nbamfiles ]]; then
320 | Rscript CorrelationPeakPlot.r --InpPeakFile $OnlyPeakSubsample --OutDir $OutDir --InpLabels $listlabelsRscript
321 | else
322 | Rscript CorrelationPeakPlot.r --InpPeakFile $OnlyPeakSubsample --OutDir $OutDir
323 | fi
324 | #========================
325 | 
326 | else
327 | 
328 | # here no peak files are provided
329 | # so a simple correlation using the whole bam files is performed
330 | 
331 | # dumping intermediate results
332 | # minimum mapping quality is maintained at 30
333 | # --minMappingQuality 30
334 | 
335 | OutDumpFile=$OutDir'/results.npz'
336 | if [[ $nlabels == $nbamfiles ]]; then
337 | multiBamSummary bins --bamfiles $listbamfiles --labels $listlabels -out $OutDumpFile
338 | else
339 | multiBamSummary bins --bamfiles $listbamfiles --smartLabels -out $OutDumpFile
340 | fi
341 | 
342 | # using spearman correlation
343 | 
344 | # OutPlotFile=$OutDir'/Correlation_Spearman_SubsampledPeak.pdf'
345 | # OutMatrixFile=$OutDir'/Correlation_Spearman_SubsampledPeak.matrix'
346 | # --plotFile $OutPlotFile --corMethod spearman --outFileCorMatrix $OutMatrixFile
347 | 
348 | # --skipZeros --removeOutliers
349 | 
350 | OutPlotFileHeatMap=$OutDir'/Spearman_Heatmap.pdf'
351 | OutPlotFileScatter=$OutDir'/Spearman_Scatterplot.pdf'
352 | 
353 | if [[ ! -f $OutPlotFileHeatMap || ! -f $OutPlotFileScatter || $Overwrite == 1 ]]; then
354 | plotCorrelation --corData $OutDumpFile --plotFile $OutPlotFileHeatMap --corMethod spearman --whatToPlot heatmap --colorMap RdYlBu --plotNumbers --outFileCorMatrix $OutDir'/Correlation_Spearman_Corr.mat'
355 | plotCorrelation --corData $OutDumpFile --plotFile $OutPlotFileScatter --skipZeros --removeOutliers --corMethod spearman --whatToPlot scatterplot
356 | fi
357 | 
358 | 
359 | 
360 | # # using pearson correlation
361 | # commented - check https://www.biostars.org/p/195328/
362 | 
363 | # # OutPlotFile=$OutDir'/Correlation_Pearson_SubsampledPeak.pdf'
364 | # # OutMatrixFile=$OutDir'/Correlation_Pearson_SubsampledPeak.matrix'
365 | # # --plotFile $OutPlotFile --corMethod pearson --outFileCorMatrix $OutMatrixFile
366 | 
367 | # OutPlotFileHeatMap=$OutDir'/Pearson_Heatmap.pdf'
368 | # OutPlotFileScatter=$OutDir'/Pearson_Scatterplot.pdf'
369 | 
370 | # if [[ !
-f $OutPlotFileScatter || $Overwrite == 1 ]]; then 371 | # plotCorrelation --corData $OutDumpFile --plotFile $OutPlotFileHeatMap --corMethod pearson --whatToPlot heatmap --colorMap RdYlBu --plotNumbers --outFileCorMatrix $OutDir'/Correlation_Pearson_Corr.mat' 372 | # plotCorrelation --corData $OutDumpFile --plotFile $OutPlotFileScatter --skipZeros --removeOutliers --corMethod pearson --whatToPlot scatterplot 373 | # fi 374 | 375 | 376 | 377 | fi 378 | 379 | 380 | #---------------------------------- 381 | # important - sourya 382 | # now restore the original directory 383 | cd $current_dir 384 | #---------------------------------- 385 | 386 | -------------------------------------------------------------------------------- /bin/CorrelationPeakPlot.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | #=========================================================== 4 | # R script for plotting the pairwise correlation of the peak intensity values 5 | # for a given pair of input peaks 6 | # Input: one file containing the peak intervals (first three columns) 7 | # and the peak intensity values in the subsequent columns 8 | # another input is the labels of the given samples 9 | 10 | # Author: Sourya Bhattacharyya 11 | # Vijay-Ay lab, LJI 12 | # February 26, 2018 13 | #=========================================================== 14 | 15 | # used for parsing the command line arguments 16 | library(optparse) 17 | 18 | # library(ggplot2) 19 | 20 | # plot dimension values 21 | PlotWidth <- 10 22 | PlotHeight <- 7 23 | 24 | # font size used in texts 25 | FontSize <- 20 26 | 27 | # different colors used in heatmap 28 | ColorVec <- c('blue', 'cyan', 'green', 'yellow', 'orange', 'red') 29 | 30 | option_list = list( 31 | make_option(c("--InpPeakFile"), type="character", default=NULL, help="Input file containing peak locations and the peak intensity values for all the candidate input samples"), 32 | make_option(c("--OutDir"), type="character", default=NULL, help="Output directory containing the results"), 33 | make_option(c("--InpLabels"), type="character", default=NULL, help="Comma or colon separated list of labels associated with individual samples (default %default)") 34 | 35 | ); 36 | 37 | parser <- OptionParser(option_list=option_list) 38 | arguments <- parse_args(parser, positional_arguments=TRUE) 39 | opt <- arguments$options 40 | args <- arguments$args 41 | 42 | # read the input peak file 43 | InpData <- read.table(opt$InpPeakFile, header=F) 44 | 45 | # number of samples is the number of columns of the input file 46 | # minus the first three columns 47 | NumSample <- ncol(InpData) - 3 48 | 49 | # read the labels of the input samples 50 | # if not provided, assign numeric labels 1 to NumSample 51 | if (is.null(opt$InpLabels)) { 52 | InpLabelList <- as.character(seq(1, NumSample)) 53 | } else { 54 | InpLabelList <- as.character(unlist(strsplit(opt$InpLabels,"[,:]"))) 55 | } 56 | 57 | if (is.null(opt$OutDir)) { 58 | OutDir <- getwd() 59 | } else { 60 | OutDir <- opt$OutDir 61 | } 62 | 63 | cat(sprintf("\n\n *** NumSample: %s ", NumSample)) 64 | cat(sprintf("\n\n *** InpLabelList: %s ", InpLabelList)) 65 | 66 | TextFile <- paste0(OutDir, '/Correlation_Peak_Spearman.txt') 67 | con <- file(TextFile, "w") 68 | 69 | # pairwise processing of the input samples 70 | for (i in (1:(NumSample-1))) { 71 | for (j in ((i+1):NumSample)) { 72 | 73 | XAxisData <- InpData[, 3+i] 74 | YAxisData <- InpData[, 3+j] 75 | AbsDiffVec <- abs(XAxisData - YAxisData) 76 | 
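# map each peak's absolute intensity difference onto the ColorVec palette
# (linear binning between the minimum and maximum difference)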
MinDiff <- min(AbsDiffVec)
77 | MaxDiff <- max(AbsDiffVec)
78 | ColorVal_CurrData <- pmax(1, ceiling(((AbsDiffVec - MinDiff) * length(ColorVec)) / ((MaxDiff - MinDiff) * 1.0)))  # pmax ensures a valid minimum color index of 1
79 | 
80 | # plot the peak correlation
81 | plotfile1 <- paste0(OutDir, '/Correlation_Peak_', InpLabelList[i], '_', InpLabelList[j], '.pdf')
82 | pdf(plotfile1, width=PlotWidth, height=PlotHeight)
83 | plot(XAxisData, YAxisData, cex=0.25, col=ColorVal_CurrData, xlab=paste0("Peak_Log10P_", InpLabelList[i]), ylab=paste0("Peak_Log10P_", InpLabelList[j]))
84 | title("Correlation between peak intensity")
85 | dev.off()
86 | 
87 | # also print the correlation
88 | Corr_val <- cor(XAxisData, YAxisData, method="spearman")
89 | outtext <- paste0("\n\n First peak file label : ", InpLabelList[i], "\n\n Second peak file label : ", InpLabelList[j], "\n\n Correlation value: ", Corr_val)
90 | writeLines(outtext, con=con, sep="\n")
91 | }
92 | }
93 | 
94 | close(con)
95 | 
96 | 
97 | 
98 | 
--------------------------------------------------------------------------------
/bin/Sample_ATACseqQC_script.r:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | 
3 | ## ---- echo=FALSE, results="hide", warning=FALSE, message=FALSE-------------
4 | suppressPackageStartupMessages({
5 | library(ATACseqQC)
6 | library(ChIPpeakAnno)
7 | library(BSgenome.Hsapiens.UCSC.hg19)
8 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
9 | library(phastCons100way.UCSC.hg19)
10 | library(MotifDb)
11 | })
12 | knitr::opts_chunk$set(warning=FALSE, message=FALSE)
13 | 
14 | ## ---- eval=FALSE-----------------------------------------------------------
15 | # library(BiocInstaller)
16 | # biocLite(c("ATACseqQC", "ChIPpeakAnno", "MotifDb",
17 | # "BSgenome.Hsapiens.UCSC.hg19", "TxDb.Hsapiens.UCSC.hg19.knownGene",
18 | # "phastCons100way.UCSC.hg19"))
19 | 
20 | ## --------------------------------------------------------------------------
21 | ## load the library
22 | library(ATACseqQC)
23 | ## input the bamFile from the ATACseqQC package
24 | bamfile <- system.file("extdata", "GL1.bam",
25 | package="ATACseqQC", mustWork=TRUE)
26 | bamfile.labels <- gsub(".bam", "", basename(bamfile))
27 | 
28 | ## --------------------------------------------------------------------------
29 | bamQC(bamfile, outPath=NULL)
30 | 
31 | ## --------------------------------------------------------------------------
32 | ## generate fragment size distribution
33 | fragSize <- fragSizeDist(bamfile, bamfile.labels)
34 | 
35 | ## --------------------------------------------------------------------------
36 | ## bamfile tags to be read in
37 | tags <- c("AS", "XN", "XM", "XO", "XG", "NM", "MD", "YS", "YT")
38 | ## files will be output into outPath
39 | outPath <- "splited"
40 | dir.create(outPath)
41 | ## shift the coordinates of 5'ends of alignments in the bam file
42 | library(BSgenome.Hsapiens.UCSC.hg19)
43 | seqlev <- "chr1" ## subsample data for quick run
44 | which <- as(seqinfo(Hsapiens)[seqlev], "GRanges")
45 | gal <- readBamFile(bamfile, tag=tags, which=which, asMates=TRUE)
46 | gal1 <- shiftGAlignmentsList(gal)
47 | shiftedBamfile <- file.path(outPath, "shifted.bam")
48 | export(gal1, shiftedBamfile)
49 | 
50 | ## --------------------------------------------------------------------------
51 | library(phastCons100way.UCSC.hg19)
52 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
53 | txs <- transcripts(TxDb.Hsapiens.UCSC.hg19.knownGene)
54 | ## run program for chromosome 1 only
55 | txs <- txs[seqnames(txs) %in% "chr1"]
56 | genome <- Hsapiens
57 | ##
split the reads into NucleosomeFree, mononucleosome, 58 | ## dinucleosome and trinucleosome. 59 | objs <- splitGAlignmentsByCut(gal1, txs=txs, genome=genome, 60 | conservation=phastCons100way.UCSC.hg19) 61 | 62 | ## -------------------------------------------------------------------------- 63 | null <- writeListOfGAlignments(objs, outPath) 64 | ## list the files generated by splitBam. 65 | dir(outPath) 66 | 67 | ## ----eval=FALSE------------------------------------------------------------ 68 | # objs <- splitBam(bamfile, tags=tags, outPath=outPath, 69 | # txs=txs, genome=genome, 70 | # conservation=phastCons100way.UCSC.hg19) 71 | 72 | ## ----fig.height=4, fig.width=4--------------------------------------------- 73 | library(ChIPpeakAnno) 74 | bamfiles <- file.path(outPath, 75 | c("NucleosomeFree.bam", 76 | "mononucleosome.bam", 77 | "dinucleosome.bam", 78 | "trinucleosome.bam")) 79 | ## Plot the cumulative percentage of tag allocation in nucleosome-free 80 | ## and mononucleosome bam files. 81 | cumulativePercentage(bamfiles[1:2], as(seqinfo(Hsapiens)["chr1"], "GRanges")) 82 | 83 | ## ----fig.height=8, fig.width=4--------------------------------------------- 84 | TSS <- promoters(txs, upstream=0, downstream=1) 85 | TSS <- unique(TSS) 86 | ## estimate the library size for normalization 87 | (librarySize <- estLibSize(bamfiles)) 88 | ## calculate the signals around TSSs. 89 | NTILE <- 101 90 | dws <- ups <- 1010 91 | sigs <- enrichedFragments(gal=objs[c("NucleosomeFree", 92 | "mononucleosome", 93 | "dinucleosome", 94 | "trinucleosome")], 95 | TSS=TSS, 96 | librarySize=librarySize, 97 | seqlev=seqlev, 98 | TSS.filter=0.5, 99 | n.tile = NTILE, 100 | upstream = ups, 101 | downstream = dws) 102 | ## log2 transformed signals 103 | sigs.log2 <- lapply(sigs, function(.ele) log2(.ele+1)) 104 | #plot heatmap 105 | featureAlignedHeatmap(sigs.log2, reCenterPeaks(TSS, width=ups+dws), 106 | zeroAt=.5, n.tile=NTILE) 107 | 108 | ## ----fig.show="hide"------------------------------------------------------- 109 | ## get signals normalized for nucleosome-free and nucleosome-bound regions. 
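## (the returned matrix has one column per BAM category and one row per tile; it is rescaled to 0~1 below)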
110 | out <- featureAlignedDistribution(sigs,
111 | reCenterPeaks(TSS, width=ups+dws),
112 | zeroAt=.5, n.tile=NTILE, type="l",
113 | ylab="Averaged coverage")
114 | 
115 | ## --------------------------------------------------------------------------
116 | ## rescale the nucleosome-free and nucleosome signals to 0~1
117 | range01 <- function(x){(x-min(x))/(max(x)-min(x))}
118 | out <- apply(out, 2, range01)
119 | matplot(out, type="l", xaxt="n",
120 | xlab="Position (bp)",
121 | ylab="Fraction of signal")
122 | axis(1, at=seq(0, 100, by=10)+1,
123 | labels=c("-1K", seq(-800, 800, by=200), "1K"), las=2)
124 | abline(v=seq(0, 100, by=10)+1, lty=2, col="gray")
125 | 
126 | ## --------------------------------------------------------------------------
127 | ## foot prints
128 | library(MotifDb)
129 | CTCF <- query(MotifDb, c("CTCF"))
130 | CTCF <- as.list(CTCF)
131 | print(CTCF[[1]], digits=2)
132 | sigs <- factorFootprints(shiftedBamfile, pfm=CTCF[[1]],
133 | genome=genome,
134 | min.score="90%", seqlev=seqlev,
135 | upstream=100, downstream=100)
136 | 
137 | ## ----fig.height=6, fig.width=6---------------------------------------------
138 | featureAlignedHeatmap(sigs$signal,
139 | feature.gr=reCenterPeaks(sigs$bindingSites,
140 | width=200+width(sigs$bindingSites[1])),
141 | annoMcols="score",
142 | sortBy="score",
143 | n.tile=ncol(sigs$signal[[1]]))
144 | 
145 | sigs$spearman.correlation
146 | 
147 | ## ----sessionInfo-----------------------------------------------------------
148 | sessionInfo()
149 | 
150 | 
--------------------------------------------------------------------------------
/bin/TagAlign.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #=================================
4 | # this program creates a tagalign file (in .gz format) from one or more input aligned (.bam) or tagalign (.gz) files
5 | #=================================
6 | # developed by - Sourya Bhattacharyya
7 | # Vijay-AY lab
8 | # La Jolla Institute for Allergy and Immunology
9 | #=================================
10 | 
11 | # usage info
12 | usage(){
13 | cat << EOF
14 | 
15 | usage:
16 | 
17 | 1) ./TagAlign.sh [-h] [-I inpfile1] [-N 0] [-f 4] [-r 5] [-O Outfile] For ATAC seq or ChIPMentation data
18 | 2) ./TagAlign.sh [-h] [-I inpfile1] [-N 1] [-O Outfile] For standard ChIP seq data
19 | 
20 | Example to process multiple input files:
21 | ./TagAlign.sh [-h] [-I inpfile1] [-I inpfile2] [-N 1] [-O Outfile]
22 | Here all the input files will be processed separately, and their outputs will be combined into a single file
23 | 
24 | Options:
25 | 
26 | -- required:
27 | -I InpFile Input files, either aligned (in .bam format) or already in gzipped bed format (.gz).
28 | User can provide multiple input files together.
29 | -N NoShift A binary variable. If 1, the strand information of the aligned reads is not altered.
30 | For standard ChIP seq data, this should be set as 1.
31 | For ChIPMentation or ATAC seq data, this should be set as 0, since in these cases the tagalign files are formed
32 | by shifting the forward and reverse strands to cover the length of the transposon. Default 1.
33 | -f fwdshift If NoShift=0, this value signifies the amount of shift a forward strand will
34 | require to cover the length of the transposon. Default 4.
35 | -r revshift If NoShift=0, this value signifies the amount of shift a reverse strand
36 | will require to cover the length of the transposon. Default 5.
37 | -O OutFile Output tagalign file (in .gz format) combining the input files 38 | -q MAPQ_THR Quality threshold that will be applied on the given input BAM file (default 30) 39 | 40 | EOF 41 | } 42 | 43 | # threshold of mapq quality 44 | MAPQ_THR=30 45 | 46 | # default configurations 47 | NoShift=1 48 | fwdshift=4 49 | revshift=5 50 | 51 | # Sourya - Note the processing of input file argument since it can be more than one file 52 | # Note the change of notations 53 | 54 | while getopts "I:N:f:r:O:q:" opt; 55 | do 56 | case "$opt" in 57 | I) InpFile+=($OPTARG);; 58 | N) NoShift=$OPTARG;; 59 | f) fwdshift=$OPTARG;; 60 | r) revshift=$OPTARG;; 61 | O) OutFile=$OPTARG;; 62 | q) MAPQ_THR=$OPTARG;; 63 | \?) usage 64 | echo "error: unrecognized option -$OPTARG"; 65 | exit 1 66 | ;; 67 | esac 68 | done 69 | 70 | echo 'Within utility function TagAlign' 71 | 72 | # # this line should be added when processing a list of inputs using the same command line option 73 | # shift $(( OPTIND - 1 )) 74 | 75 | # number of input files provided 76 | ninp=${#InpFile[@]} 77 | echo 'Number of input files : '$ninp 78 | 79 | if [[ $ninp == 0 ]]; then 80 | echo 'User should provide one or more aligned (.bam) or existing tagalign (.gz) files to combine them - exit for the moment !!' 81 | exit 1 82 | fi 83 | 84 | # echo 'List of input files: '$InpFile 85 | 86 | if [[ -z $OutFile ]]; then 87 | echo 'User did not provide the output file name - exit for the moment !!' 88 | exit 1 89 | fi 90 | 91 | #---------------------------------- 92 | # important - sourya 93 | # change the current directory as the dir containing this executable 94 | # since other source files relative to the current directory needs to be called 95 | current_dir=$(pwd) 96 | script_dir=$(dirname $0) 97 | cd $script_dir 98 | #---------------------------------- 99 | 100 | # also check the extension of input file 101 | filebase1=$(basename "${InpFile[0]}") 102 | if [[ $filebase1 =~ \.bam$ ]]; then 103 | bamext=1 104 | echo 'Input files are provided in .bam format' 105 | else 106 | if [[ $filebase1 =~ \.gz$ ]]; then 107 | bamext=0 108 | echo 'Input files are already in .gz format' 109 | else 110 | echo 'User should provide either one or more aligned (.bam) files or previously generated TagAlign files in .gz format !! Exit ' 111 | exit 1 112 | fi 113 | fi 114 | 115 | # similarly check the extension of output file and if required, append the gzipped extension 116 | filebase2=$(basename "$OutFile") 117 | if [[ $filebase2 =~ \.gz$ ]]; then 118 | echo 'User has correctly provided gzipped outfile name' 119 | else 120 | echo 'Appending gzipped extension in the output file name' 121 | OutFile=$OutFile'.gz' 122 | fi 123 | 124 | # check the number of input files and proceed accordingly 125 | if [ $ninp == 1 ]; then 126 | # Only one input file is provided 127 | echo 'Converting the file: '${InpFile[0]} 128 | if [ $bamext == 1 ]; then 129 | # here one input bam file is provided 130 | # so convert the bam file according to the shifting / non-shifting criteria 131 | if [ $NoShift == 0 ]; then 132 | samtools view -b -F 1548 -q $MAPQ_THR ${InpFile[0]} | bamToBed -i stdin | awk -v f=$fwdshift -v r=$revshift 'BEGIN {OFS = "\t"} ; function pos(x){return ((x < 0.0) ? 
0 : x)} {if ($6 == "+") print $1, $2 + f, $3 + f, $4, $5, $6; else print $1, pos($2 - r), pos($3 - r), $4, $5, $6}' | gzip -c > $OutFile 133 | else 134 | samtools view -b -F 1548 -q $MAPQ_THR ${InpFile[0]} | bamToBed -i stdin | awk 'BEGIN{FS="\t";OFS="\t"}{$4="N"; print $0}' | gzip -c > $OutFile 135 | fi 136 | else 137 | # already a gzipped file is provided 138 | # we can just copy the file 139 | cp ${InpFile[0]} $OutFile 140 | fi 141 | else 142 | # Multiple input files are provided 143 | if [ $bamext == 1 ]; then 144 | # input files are provided in bam format 145 | # we have to convert them individually, and then combine them 146 | convfilelist='' 147 | for (( i=0; i<${ninp}; i++ )); 148 | do 149 | # convert the current file into a temporary output file 150 | echo 'Converting the file: '${InpFile[i]} 151 | curroutfile='temp_'$i'.gz' 152 | if [ $NoShift == 0 ]; then 153 | samtools view -b -F 1548 -q $MAPQ_THR ${InpFile[i]} | bamToBed -i stdin | awk -v f=$fwdshift -v r=$revshift 'BEGIN {OFS = "\t"} ; function pos(x){return ((x < 0.0) ? 0 : x)} {if ($6 == "+") print $1, $2 + f, $3 + f, $4, $5, $6; else print $1, pos($2 - r), pos($3 - r), $4, $5, $6}' | gzip -c > $curroutfile 154 | else 155 | samtools view -b -F 1548 -q $MAPQ_THR ${InpFile[i]} | bamToBed -i stdin | awk 'BEGIN{FS="\t";OFS="\t"}{$4="N"; print $0}' | gzip -c > $curroutfile 156 | fi 157 | # also update the command for combining these generated files 158 | convfilelist=$convfilelist' '$curroutfile 159 | done 160 | zcat $convfilelist | gzip -c > $OutFile 161 | # remove temporary files 162 | for (( i=0; i<${ninp}; i++ )); 163 | do 164 | rm 'temp_'$i'.gz' 165 | done 166 | else 167 | # input files are already in gzipped format 168 | # we can just combine them 169 | convfilelist='' 170 | for val in "${InpFile[@]}"; do 171 | convfilelist=$convfilelist' '$val 172 | done 173 | zcat $convfilelist | gzip -c > $OutFile 174 | fi 175 | fi 176 | 177 | 178 | #---------------------------------- 179 | # important - sourya 180 | # now restore the original directory 181 | cd $current_dir 182 | #---------------------------------- 183 | 184 | -------------------------------------------------------------------------------- /bin/bam_to_bigwig.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #======================================== 4 | # sample script for converting input bam file to bigwig format 5 | 6 | # author: Sourya Bhattacharyya 7 | # Vijay-AY lab 8 | # La Jolla Institute for Allergy and Immunology 9 | #======================================== 10 | 11 | # usage info 12 | usage(){ 13 | cat << EOF 14 | 15 | usage: 16 | ./bam_to_bigwig.sh [-h] [-I InpFile] [-g refgenome] [-d OutDir] 17 | Example: 18 | ./bam_to_bigwig.sh -I Inp.bam -g 'hg19' -d '/home/sample_ATAC' 19 | 20 | Options: 21 | -- required: 22 | -I InpFile Input BAM file. 23 | -g refgenome Reference genome for chromosome size etc. 24 | -d OutDir Output directory which will contain all the bigwig file and other data 25 | -n OutFilePrefix If specified, output bigwig file name will be 'OutFilePrefix.bw' under the directory 'OutDir' 26 | EOF 27 | } 28 | 29 | # default output directory 30 | OutDir=`pwd`'/' 31 | 32 | # initialization of the prefix string 33 | OutFilePrefix="" 34 | 35 | while getopts "I:g:d:n:" opt; 36 | do 37 | case "$opt" in 38 | I) InpFile=$OPTARG;; 39 | g) refgenome=$OPTARG;; 40 | d) OutDir=$OPTARG;; 41 | n) OutFilePrefix=$OPTARG;; 42 | \?) 
usage 
43 | echo "error: unrecognized option -$OPTARG"; 
44 | exit 1 
45 | ;; 
46 | esac 
47 | done 
48 | 
49 | if [[ -z $InpFile ]]; then 
50 | echo 'No input BAM file is provided - exit !!' 
51 | exit 1 
52 | fi 
53 | 
54 | if [[ -z $refgenome ]]; then 
55 | echo 'No reference genome is provided - exit !!' 
56 | exit 1 
57 | fi 
58 | 
59 | filebase1=$(basename "${InpFile}") 
60 | if [[ $filebase1 =~ \.bam$ ]]; then 
61 | echo 'Input files are provided in .bam format' 
62 | else 
63 | echo 'Input file is not in BAM format - exit !!' 
64 | exit 1 
65 | fi 
66 | 
67 | #---------------------------------- 
68 | # important - sourya 
69 | # change the current directory to the dir containing this executable 
70 | # since other source files relative to the current directory need to be called 
71 | current_dir=$(pwd) 
72 | script_dir=$(dirname $0) 
73 | cd $script_dir 
74 | #---------------------------------- 
75 | 
76 | if [ ! -f $refgenome'.chrom.sizes' ]; then 
77 | # this utility program from UCSC fetches the chromosome sizes of the target genome 
78 | # and stores them in the specified text file 
79 | echo 'Getting the chromosome size' 
80 | fetchChromSizes $refgenome > $refgenome'.chrom.sizes' 
81 | fi 
82 | 
83 | # convert the bam file to a bedgraph file 
84 | # ensure that the bedgraph file contains only valid chromosomes 
85 | # if [ ! -f $OutDir'/Inp.bedGraph' ]; then 
86 | genomeCoverageBed -bga -ibam $InpFile -g $refgenome'.chrom.sizes' | awk '( $1 ~ /^chr([1-9]|2[0-2]|1[0-9]|X|M|Y)$/ )' - > $OutDir'/Inp.bedGraph' 
87 | # fi 
88 | 
89 | # sort the generated bedgraph file using the bedSort utility of the UCSC genome browser 
90 | # if [ ! -f $OutDir'/Inp.Sorted.bedGraph' ]; then 
91 | bedSort $OutDir'/Inp.bedGraph' $OutDir'/Inp.Sorted.bedGraph' 
92 | # fi 
93 | 
94 | # from the bedgraph file, generate the BigWig file 
95 | # using a utility of the UCSC genome browser 
96 | if [[ -z $OutFilePrefix ]]; then 
97 | outbigwigfile=$OutDir'/Inp_BigWig.bw' 
98 | else 
99 | outbigwigfile=$OutDir'/'$OutFilePrefix'.bw' 
100 | fi 
101 | 
102 | # if [ ! 
-f $outbigwigfile ]; then 103 | bedGraphToBigWig $OutDir'/Inp.Sorted.bedGraph' $refgenome'.chrom.sizes' $outbigwigfile 104 | # fi 105 | 106 | #---------------------------------- 107 | # important - sourya 108 | # now restore the original directory 109 | cd $current_dir 110 | #---------------------------------- 111 | 112 | -------------------------------------------------------------------------------- /configfile: -------------------------------------------------------------------------------- 1 | #==================================== 2 | # Sample Configuration file for running the ATAC-seq pipeline 3 | # Contains locations of executables and a few genome specific files 4 | # required to execute the pipeline 5 | #==================================== 6 | 7 | 8 | # Picard tool executable 9 | # used for removing PCR duplicates from the ChIP-seq alignment file 10 | picardexec=/home/sourya/packages/PicardTool/picard_version_2.18.14.jar 11 | 12 | # HOMER package executable path 13 | HOMERPath=/home/sourya/packages/HOMER/bin/ 14 | 15 | # deeptools package - directory 16 | DeepToolsDir=/home/sourya/packages/deepTools/deepTools2.0/bin/ 17 | 18 | # file (SQL) required to convert the narrowPeak file to the bigBed format 19 | # check the UCSC web site to download these files 20 | NarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/narrowPeak.as 21 | 22 | # file (SQL) required to convert the bignarrowPeak file to the bigBed format 23 | # check the UCSC web site to download these files 24 | BigNarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/bigNarrowPeak.as 25 | 26 | # file (SQL) required to convert the broadPeak file to the bigBed format 27 | # check the UCSC web site to download these files 28 | BroadPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/broadPeak.as 29 | 30 | # files containing chromosome size information 31 | # two column file storing the size of individual chromosomes 32 | # example: for reference genome hg19, chrom_hg19.sizes 33 | # example: for reference genome hg38, hg38.chrom.sizes 34 | # example: for reference genome mm9, chrom_mm9.sizes 35 | # example: for reference genome mm10, mm10.chrom.sizes 36 | RefChrSizeFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/hg38.chrom.sizes 37 | 38 | # files containing reference chromosome fasta sequence 39 | # example: hg19.fa, mm9.fa, hg38.fa and mm10.fa 40 | RefChrFastaFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Complete_Genome/hg38/hg38.fa 41 | 42 | # files containing reference UCSC annotation (.gtf format) corresponding to the reference Chromosome 43 | # applied as an input to HOMER 44 | RefChrAnnotFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/hg38/hg38.gtf 45 | 46 | # file containing blacklisted regions corresponding to this reference genome hg38 47 | # can be downloaded from the link 48 | # https://github.com/Boyle-Lab/Blacklist/tree/master/lists (v2) 49 | # its OK if this parameter is void, but we recommend to provide if the file is available 50 | # file can be gzipped or normal text format 51 | BlackListFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/BlackListed_Regions/hg38-blacklist.v2.bed 52 | 53 | # executable of ataqv package (https://github.com/ParkerLab/ataqv) 54 | # download the GitHub release (.tar.gz) file, extract it and provide the ataqv executable path here 55 | ATAQVPath=/home/sourya/packages/ataqv/ataqv-1.0.0/bin/ataqv 56 | 57 | # we have to also include the TSS file, but have to specify that the user needs to convert their GTF File (gene 
annotation file) in the TSS format 
58 | # so that first two fields of TSS file includes the chromosome name and the TSS coordinate 
59 | TSSFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/hg38/hg38_TSS.gtf 
60 | 
61 | 
62 | 
63 | 
-------------------------------------------------------------------------------- /configfile_hg19: -------------------------------------------------------------------------------- 
1 | #==================================== 
2 | # Configuration file for running the ATAC-seq pipeline 
3 | # Contains locations of executables and a few genome specific files 
4 | # required to execute the pipeline 
5 | #==================================== 
6 | 
7 | 
8 | # Picard tool executable 
9 | # used for removing PCR duplicates from the ChIP-seq alignment file 
10 | picardexec=/home/sourya/packages/PicardTool/picard_version_2.18.14.jar 
11 | 
12 | # HOMER package executable path 
13 | HOMERPath=/home/sourya/packages/HOMER/bin/ 
14 | 
15 | # deeptools package - directory 
16 | DeepToolsDir=/home/sourya/packages/deepTools/deepTools2.0/bin/ 
17 | 
18 | # file (SQL) required to convert the narrowPeak file to the bigBed format 
19 | # check the UCSC web site to download these files 
20 | NarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/narrowPeak.as 
21 | 
22 | # file (SQL) required to convert the bignarrowPeak file to the bigBed format 
23 | # check the UCSC web site to download these files 
24 | BigNarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/bigNarrowPeak.as 
25 | 
26 | # file (SQL) required to convert the broadPeak file to the bigBed format 
27 | # check the UCSC web site to download these files 
28 | BroadPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/broadPeak.as 
29 | 
30 | # files containing chromosome size information 
31 | # two column file storing the size of individual chromosomes 
32 | # example: for reference genome hg19, chrom_hg19.sizes 
33 | # example: for reference genome hg38, hg38.chrom.sizes 
34 | # example: for reference genome mm9, chrom_mm9.sizes 
35 | # example: for reference genome mm10, mm10.chrom.sizes 
36 | RefChrSizeFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/chrom_hg19.sizes 
37 | 
38 | # files containing reference chromosome fasta sequence 
39 | # example: hg19.fa, mm9.fa, hg38.fa and mm10.fa 
40 | RefChrFastaFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Complete_Genome/hg19/hg19.fa 
41 | 
42 | # files containing reference UCSC annotation (.gtf format) corresponding to the reference Chromosome 
43 | # applied as an input to HOMER 
44 | RefChrAnnotFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/hg19/hg19.gtf 
45 | 
46 | # file containing blacklisted regions corresponding to this reference genome hg19 
47 | # can be downloaded from the link 
48 | # https://github.com/Boyle-Lab/Blacklist/tree/master/lists (v2) 
49 | # its OK if this parameter is void, but we recommend to provide if the file is available 
50 | # file can be gzipped or normal text format 
51 | BlackListFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/BlackListed_Regions/hg19-blacklist.v2.bed 
52 | 
53 | # executable of ataqv package (https://github.com/ParkerLab/ataqv) 
54 | # download the GitHub release (.tar.gz) file, extract it and provide the ataqv executable path here 
55 | ATAQVPath=/home/sourya/packages/ataqv/ataqv-1.0.0/bin/ataqv 
56 | 
57 | # we have to also include the TSS file, but have to specify that the user needs to convert their GTF File (gene annotation file) in the TSS format 
58 | # so that first two fields of TSS file includes 
the chromosome name and the TSS coordinate 59 | TSSFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/hg19/hg19_TSS.gtf 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /configfile_hg38: -------------------------------------------------------------------------------- 1 | #==================================== 2 | # Configuration file for running the ATAC-seq pipeline 3 | # Contains locations of executables and a few genome specific files 4 | # required to execute the pipeline 5 | #==================================== 6 | 7 | 8 | # Picard tool executable 9 | # used for removing PCR duplicates from the ChIP-seq alignment file 10 | picardexec=/home/sourya/packages/PicardTool/picard_version_2.18.14.jar 11 | 12 | # HOMER package executable path 13 | HOMERPath=/home/sourya/packages/HOMER/bin/ 14 | 15 | # deeptools package - directory 16 | DeepToolsDir=/home/sourya/packages/deepTools/deepTools2.0/bin/ 17 | 18 | # file (SQL) required to convert the narrowPeak file to the bigBed format 19 | # check the UCSC web site to download these files 20 | NarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/narrowPeak.as 21 | 22 | # file (SQL) required to convert the bignarrowPeak file to the bigBed format 23 | # check the UCSC web site to download these files 24 | BigNarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/bigNarrowPeak.as 25 | 26 | # file (SQL) required to convert the broadPeak file to the bigBed format 27 | # check the UCSC web site to download these files 28 | BroadPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/broadPeak.as 29 | 30 | # files containing chromosome size information 31 | # two column file storing the size of individual chromosomes 32 | # example: for reference genome hg19, chrom_hg19.sizes 33 | # example: for reference genome hg38, hg38.chrom.sizes 34 | # example: for reference genome mm9, chrom_mm9.sizes 35 | # example: for reference genome mm10, mm10.chrom.sizes 36 | RefChrSizeFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/hg38.chrom.sizes 37 | 38 | # files containing reference chromosome fasta sequence 39 | # example: hg19.fa, mm9.fa, hg38.fa and mm10.fa 40 | RefChrFastaFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Complete_Genome/hg38/hg38.fa 41 | 42 | # files containing reference UCSC annotation (.gtf format) corresponding to the reference Chromosome 43 | # applied as an input to HOMER 44 | RefChrAnnotFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/hg38/hg38.gtf 45 | 46 | # file containing blacklisted regions corresponding to this reference genome hg38 47 | # can be downloaded from the link 48 | # https://github.com/Boyle-Lab/Blacklist/tree/master/lists (v2) 49 | # its OK if this parameter is void, but we recommend to provide if the file is available 50 | # file can be gzipped or normal text format 51 | BlackListFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/BlackListed_Regions/hg38-blacklist.v2.bed 52 | 53 | # executable of ataqv package (https://github.com/ParkerLab/ataqv) 54 | # download the GitHub release (.tar.gz) file, extract it and provide the ataqv executable path here 55 | ATAQVPath=/home/sourya/packages/ataqv/ataqv-1.0.0/bin/ataqv 56 | 57 | # we have to also include the TSS file, but have to specify that the user needs to convert their GTF File (gene annotation file) in the TSS format 58 | # so that first two fields of TSS file includes the chromosome name and the TSS coordinate 59 | 
TSSFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/hg38/hg38_TSS.gtf 
60 | 
61 | 
62 | 
63 | 
-------------------------------------------------------------------------------- /configfile_mm10: -------------------------------------------------------------------------------- 
1 | #==================================== 
2 | # Configuration file for running the ATAC-seq pipeline 
3 | # Contains locations of executables and a few genome specific files 
4 | # required to execute the pipeline 
5 | #==================================== 
6 | 
7 | 
8 | # Picard tool executable 
9 | # used for removing PCR duplicates from the ChIP-seq alignment file 
10 | picardexec=/home/sourya/packages/PicardTool/picard_version_2.18.14.jar 
11 | 
12 | # HOMER package executable path 
13 | HOMERPath=/home/sourya/packages/HOMER/bin/ 
14 | 
15 | # deeptools package - directory 
16 | DeepToolsDir=/home/sourya/packages/deepTools/deepTools2.0/bin/ 
17 | 
18 | # file (SQL) required to convert the narrowPeak file to the bigBed format 
19 | # check the UCSC web site to download these files 
20 | NarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/narrowPeak.as 
21 | 
22 | # file (SQL) required to convert the bignarrowPeak file to the bigBed format 
23 | # check the UCSC web site to download these files 
24 | BigNarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/bigNarrowPeak.as 
25 | 
26 | # file (SQL) required to convert the broadPeak file to the bigBed format 
27 | # check the UCSC web site to download these files 
28 | BroadPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/broadPeak.as 
29 | 
30 | # files containing chromosome size information 
31 | # two column file storing the size of individual chromosomes 
32 | # example: for reference genome hg19, chrom_hg19.sizes 
33 | # example: for reference genome hg38, hg38.chrom.sizes 
34 | # example: for reference genome mm9, chrom_mm9.sizes 
35 | # example: for reference genome mm10, mm10.chrom.sizes 
36 | RefChrSizeFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/mm10.chrom.sizes 
37 | 
38 | # files containing reference chromosome fasta sequence 
39 | # example: hg19.fa, mm9.fa, hg38.fa and mm10.fa 
40 | RefChrFastaFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Complete_Genome/mm10/mm10.fa 
41 | 
42 | # files containing reference UCSC annotation (.gtf format) corresponding to the reference Chromosome 
43 | # applied as an input to HOMER 
44 | RefChrAnnotFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/mm10/mm10.gtf 
45 | 
46 | # file containing blacklisted regions corresponding to this reference genome mm10 
47 | # can be downloaded from the link 
48 | # https://github.com/Boyle-Lab/Blacklist/tree/master/lists (v2) 
49 | # its OK if this parameter is void, but we recommend to provide if the file is available 
50 | # file can be gzipped or normal text format 
51 | BlackListFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/BlackListed_Regions/mm10-blacklist.v2.bed 
52 | 
53 | # executable of ataqv package (https://github.com/ParkerLab/ataqv) 
54 | # download the GitHub release (.tar.gz) file, extract it and provide the ataqv executable path here 
55 | ATAQVPath=/home/sourya/packages/ataqv/ataqv-1.0.0/bin/ataqv 
56 | 
57 | # we have to also include the TSS file, but have to specify that the user needs to convert their GTF File (gene annotation file) in the TSS format 
58 | # so that first two fields of TSS file includes the chromosome name and the TSS coordinate 
59 | TSSFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/mm10/mm10_TSS.gtf 
60 | 
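# A hedged sketch (not part of the pipeline) for deriving such a two-column TSS file 
# from a standard Ensembl/UCSC-style GTF; field 7 (strand) decides whether the gene 
# start or the gene end is the TSS, and the file names here are placeholders: 
# awk 'BEGIN{FS=OFS="\t"} $3=="gene" { tss = ($7=="+") ? $4 : $5; print $1, tss }' mm10.gtf > mm10_TSS.gtf 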
61 | 
62 | 
63 | 
-------------------------------------------------------------------------------- /configfile_mm9: -------------------------------------------------------------------------------- 
1 | #==================================== 
2 | # Configuration file for running the ATAC-seq pipeline 
3 | # Contains locations of executables and a few genome specific files 
4 | # required to execute the pipeline 
5 | #==================================== 
6 | 
7 | 
8 | # Picard tool executable 
9 | # used for removing PCR duplicates from the ChIP-seq alignment file 
10 | picardexec=/home/sourya/packages/PicardTool/picard_version_2.18.14.jar 
11 | 
12 | # HOMER package executable path 
13 | HOMERPath=/home/sourya/packages/HOMER/bin/ 
14 | 
15 | # deeptools package - directory 
16 | DeepToolsDir=/home/sourya/packages/deepTools/deepTools2.0/bin/ 
17 | 
18 | # file (SQL) required to convert the narrowPeak file to the bigBed format 
19 | # check the UCSC web site to download these files 
20 | NarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/narrowPeak.as 
21 | 
22 | # file (SQL) required to convert the bignarrowPeak file to the bigBed format 
23 | # check the UCSC web site to download these files 
24 | BigNarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/bigNarrowPeak.as 
25 | 
26 | # file (SQL) required to convert the broadPeak file to the bigBed format 
27 | # check the UCSC web site to download these files 
28 | BroadPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/broadPeak.as 
29 | 
30 | # files containing chromosome size information 
31 | # two column file storing the size of individual chromosomes 
32 | # example: for reference genome hg19, chrom_hg19.sizes 
33 | # example: for reference genome hg38, hg38.chrom.sizes 
34 | # example: for reference genome mm9, chrom_mm9.sizes 
35 | # example: for reference genome mm10, mm10.chrom.sizes 
36 | RefChrSizeFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/chrom_mm9.sizes 
37 | 
38 | # files containing reference chromosome fasta sequence 
39 | # example: hg19.fa, mm9.fa, hg38.fa and mm10.fa 
40 | RefChrFastaFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Complete_Genome/mm9/mm9.fa 
41 | 
42 | # files containing reference UCSC annotation (.gtf format) corresponding to the reference Chromosome 
43 | # applied as an input to HOMER 
44 | RefChrAnnotFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/mm9/mm9.gtf 
45 | 
46 | # file containing blacklisted regions corresponding to this reference genome mm9 
47 | # can be downloaded from the link 
48 | # https://github.com/Boyle-Lab/Blacklist/tree/master/lists (v2) 
49 | # its OK if this parameter is void, but we recommend to provide if the file is available 
50 | # file can be gzipped or normal text format 
51 | BlackListFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/BlackListed_Regions/mm9-blacklist.bed 
52 | 
53 | # executable of ataqv package (https://github.com/ParkerLab/ataqv) 
54 | # download the GitHub release (.tar.gz) file, extract it and provide the ataqv executable path here 
55 | ATAQVPath=/home/sourya/packages/ataqv/ataqv-1.0.0/bin/ataqv 
56 | 
57 | # we have to also include the TSS file, but have to specify that the user needs to convert their GTF File (gene annotation file) in the TSS format 
58 | # so that first two fields of TSS file includes the chromosome name and the TSS coordinate 
59 | TSSFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/mm9/mm9_TSS.gtf 
60 | 
61 | 
62 | 
63 | 
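All of the configuration files above use plain key=value bash syntax, so a minimal sketch of how a script can consume one of them (the exact mechanism inside bin/pipeline.sh is not shown in this excerpt, so treat this as an assumption) is simply to source the file and read the variables: 

source ./configfile_mm9 
echo 'chromosome sizes: '$RefChrSizeFile 
echo 'blacklist: '$BlackListFile 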
-------------------------------------------------------------------------------- /pipeline_exec.sh: -------------------------------------------------------------------------------- 
1 | #!/bin/bash 
2 | 
3 | #================= 
4 | # main executable script of the ATAC seq pipeline 
5 | #================= 
6 | # developed by - Sourya Bhattacharyya 
7 | # Vijay-AY lab 
8 | # La Jolla Institute for Allergy and Immunology 
9 | #================= 
10 | 
11 | #================= 
12 | # script 1 - when fastq files of paired-end reads are provided as the input 
13 | #================= 
14 | 
15 | genome='/home/sourya/genomes/bowtie2_index/hg19/hg19' 
16 | dirdata='/home/sourya/test1/' 
17 | inpfile1=$dirdata'fastafiles/001_R1.fastq.gz' 
18 | inpfile2=$dirdata'fastafiles/001_R2.fastq.gz' 
19 | outdir=$dirdata'Sample_TEST_ATAC' 
20 | prefix='001' 
21 | 
22 | `pwd`/bin/pipeline.sh -f $inpfile1 -r $inpfile2 -C `pwd`'/configfile_hg19' -n $prefix -g $genome -d $outdir -t 8 -m "16G" -q 20 -D 1 -O 0 
23 | 
24 | #================= 
25 | # script 2 - when fastq files of single-end reads are provided as the input 
26 | #================= 
27 | 
28 | genome='/home/sourya/genomes/bowtie2_index/hg19/hg19' 
29 | dirdata='/home/sourya/test2/' 
30 | inpfile=$dirdata'merged_inp.fastq.gz' 
31 | outdir=$dirdata'Sample_TEST_ATAC' 
32 | prefix='002' 
33 | 
34 | `pwd`/bin/pipeline.sh -f $inpfile -C `pwd`'/configfile_hg19' -n $prefix -g $genome -d $outdir -t 8 -m "16G" -q 20 -D 0 -O 0 
35 | 
36 | #================= 
37 | # script 3 - when a BAM file is provided as the input 
38 | # here reference genome is not used 
39 | # however, -w parameter is used to specify the genome for 
40 | # creating UCSC compatible tracks 
41 | #================= 
42 | 
43 | dirdata='/home/sourya/test3/' 
44 | inpfile=$dirdata'inp.bam' 
45 | outdir=$dirdata'Sample_TEST_ATAC' 
46 | prefix='003' 
47 | 
48 | `pwd`/bin/pipeline.sh -f $inpfile -C `pwd`'/configfile_hg19' -n $prefix -d $outdir -t 8 -m "16G" -q 20 -D 1 -O 0 -w "hg19" 
49 | 
50 | 
51 | 
-------------------------------------------------------------------------------- /sample_IDRScript.sh: -------------------------------------------------------------------------------- 
1 | #!/bin/bash 
2 | 
3 | #================================= 
4 | # sample script for IDR execution 
5 | # where peaks generated from multiple ChIP-seq replicates are provided as input 
6 | #================================= 
7 | 
8 | # main executable of IDR script 
9 | # when peak files are used as input 
10 | IDRScript='./IDR_Codes/IDRMain.sh' 
11 | 
12 | # main executable of IDR script 
13 | # when BAM files are used as input 
14 | IDRScriptBAM='./IDR_Codes/IDR_SubSampleBAM_Main.sh' 
15 | 
16 | #****************************** 
17 | # path containing the IDRCode package by Anshul Kundaje et al. 
18 | # user should replace this path with their custom installation directory 
19 | IDRCodePackage='/home/sourya/packages/idrCode/' 
20 | #****************************** 
21 | 
22 | 
23 | #==================== 
24 | # IDR testing script 1 
25 | # examining IDR between two peak files 
26 | # top 25K common peaks between the two samples are compared 
27 | #==================== 
28 | 
29 | SampleBaseDir='/home/sourya/test1/' 
30 | 
31 | $IDRScript -I $SampleBaseDir'Sample1/MACS2_Default_Tag_No_Control/Sample1.macs2_peaks.narrowPeak_Q0.01filt' -I $SampleBaseDir'Sample2/MACS2_Default_Tag_No_Control/Sample2.macs2_peaks.narrowPeak_Q0.01filt' -d $SampleBaseDir'/Sample_IDR_Peaks' -P $IDRCodePackage 
32 | 
33 | 
34 | #==================== 
35 | # IDR testing script 2 
36 | # examining IDR between two BAM files 
37 | # first these BAM files are subsampled 
38 | # and their peaks are estimated using MACS2 
39 | # top 25K common peaks between the two samples are compared 
40 | # no control BAM file is provided 
41 | # user may specify one or more control BAM files using -C option 
42 | # like -C control1.bam -C control2.bam etc. 
43 | #==================== 
44 | 
45 | SampleBaseDir='/home/sourya/test2/' 
46 | 
47 | $IDRScriptBAM -I $SampleBaseDir'Sample1/Alignment_MAPQ30/Sample1.align.sort.MAPQ30.bam' -I $SampleBaseDir'Sample2/Alignment_MAPQ30/Sample2.align.sort.MAPQ30.bam' -d $SampleBaseDir'/Sample_IDR_BAMFiles' -P $IDRCodePackage -c 25000 
48 | 
-------------------------------------------------------------------------------- /src/PlotSample.py: -------------------------------------------------------------------------------- 
1 | #!/usr/bin/env python 
2 | 
3 | """ 
4 | This program plots the ATAC-seq fragment length distribution from the Picard insert size results 
5 | 
6 | Author: Sourya Bhattacharyya 
7 | Vijay-AY lab 
8 | """ 
9 | 
10 | import matplotlib 
11 | matplotlib.use('Agg') 
12 | 
13 | import os 
14 | from optparse import OptionParser 
15 | # import re 
16 | import matplotlib.pyplot as plt 
17 | # import numpy as np 
18 | 
19 | #----------------------------------------------------- 
20 | def parse_options(): 
21 | parser = OptionParser() 
22 | 
23 | parser.add_option("-I", "--INPFILE", \ 
24 | type="string", \ 
25 | action="store", \ 
26 | dest="INP_TEXT_FILE", \ 
27 | default="", \ 
28 | help="Input TEXT file containing the Picard Insert size results") 
29 | 
30 | opts, args = parser.parse_args() 
31 | return opts, args 
32 | 
33 | #----------------------------------------------------- 
34 | """ 
35 | main function 
36 | """ 
37 | def main(): 
38 | opts, args = parse_options() 
39 | InpFile = opts.INP_TEXT_FILE 
40 | 
41 | k = InpFile.rfind('/') 
42 | if (k == -1): 
43 | InpDir = "./" 
44 | else: 
45 | InpDir = InpFile[:(k+1)] 
46 | 
47 | outdir = InpDir + "Plots" 
48 | if (not os.path.exists(outdir)): 
49 | os.makedirs(outdir) 
50 | 
51 | fragment_length_list = [] 
52 | aligned_read_count_list = [] 
53 | 
54 | with open(InpFile) as f: 
55 | for line in f.readlines(): 
56 | #curr_line_content = re.split(r'\s', line) 
57 | curr_line_content = line.split() 
58 | if (len(curr_line_content) == 2): 
59 | str1 = str(curr_line_content[0]) 
60 | str2 = str(curr_line_content[1]) 
61 | if 0: 
62 | print 'str1: ', str1, ' str2: ', str2 
63 | if (len(str1) > 0) and (len(str2) > 0): 
64 | if (str1[0].isdigit() == True) and (str2[0].isdigit() == True): 
65 | fragment_length_list.append(int(str1)) 
66 | aligned_read_count_list.append(int(str2)) 
67 | 
68 | if 0: 
69 | print 'fragment_length_list: ',fragment_length_list 
70 | print 'aligned_read_count_list: ',aligned_read_count_list 
71 | 
72 | total_read_count = sum(aligned_read_count_list) 
73 
| if 0: 
74 | print 'total_read_count: ', total_read_count 
75 | 
76 | 
77 | """ 
78 | normalize the aligned read count 
79 | dividing by the total no of reads and 
80 | with respect to unit fragment size 
81 | """ 
82 | for i in range(len(aligned_read_count_list)): 
83 | aligned_read_count_list[i] = (aligned_read_count_list[i] * 1.0) / (fragment_length_list[i] * total_read_count) 
84 | 
85 | """ 
86 | Now plot the statistics 
87 | """ 
88 | OutPlotFile = outdir + "/Fragment_plot_LINEAR.pdf" 
89 | f = plt.figure() 
90 | plt.plot(fragment_length_list, aligned_read_count_list, ls='-', lw=0.3, color='red') 
91 | plt.xlim([0,1400]) # add - sourya - restricting the view to fragment lengths up to 1400 bp 
92 | plt.xlabel('Fragment length (bp)') 
93 | plt.ylabel('Norm Read count') 
94 | plt.title('ATAC seq - read density vs fragment length') 
95 | f.savefig(OutPlotFile, bbox_inches='tight') 
96 | 
97 | 
98 | OutPlotFile2 = outdir + "/Fragment_plot_LOG.pdf" 
99 | f = plt.figure() 
100 | plt.semilogy(fragment_length_list, aligned_read_count_list, ls='-', lw=0.3, color='red') 
101 | plt.xlim([0,1400]) # add - sourya - restricting the view to fragment lengths up to 1400 bp 
102 | plt.xlabel('Fragment length (bp)') 
103 | plt.ylabel('Norm Read count') 
104 | plt.title('ATAC seq - read density vs fragment length') 
105 | f.savefig(OutPlotFile2, bbox_inches='tight') 
106 | 
107 | 
108 | #----------------------------------------------------- 
109 | if __name__ == "__main__": 
110 | main() 
111 | 
-------------------------------------------------------------------------------- /src/assign_multimappers.py: -------------------------------------------------------------------------------- 
1 | ## assign_multimappers.py 
2 | 
3 | # this code is used to process the multi-mapped reads 
4 | # used after the Bowtie2 output is filtered to remove all the unmapped reads 
5 | 
6 | 
7 | # code by - Sourya Bhattacharyya 
8 | # taken from the standard ATAC seq pipeline 
9 | 
10 | import sys 
11 | import random 
12 | import argparse 
13 | 
14 | # function to parse the arguments 
15 | def parse_args(): 
16 | # Gives options 
17 | parser = argparse.ArgumentParser(description='Saves reads below an alignment threshold and discards all others') 
18 | parser.add_argument('-k', help='Alignment number cutoff') 
19 | parser.add_argument('--paired-end', dest='paired_ended', action='store_true', help='Data is paired-end') 
20 | 
21 | # processing the input arguments 
22 | # and return the input parameters to the main function 
23 | args = parser.parse_args() 
24 | alignment_cutoff = int(args.k) 
25 | paired_ended = args.paired_ended 
26 | return alignment_cutoff, paired_ended 
27 | 
28 | # main function 
29 | if __name__ == "__main__": 
30 | 
31 | # Runs the filtering step of choosing multimapped reads 
32 | [alignment_cutoff, paired_ended] = parse_args() 
33 | 
34 | # when paired ended input, the cutoff is adjusted 
35 | if paired_ended: 
36 | alignment_cutoff = int(alignment_cutoff) * 2 
37 | 
38 | # Store each line in sam file as a list of reads, 
39 | # where each read is a list of elements to easily 
40 | # modify or grab things 
41 | current_reads = [] 
42 | current_qname = '' 
43 | 
44 | # processing individual lines 
45 | for line in sys.stdin: 
46 | read_elems = line.strip().split('\t') 
47 | if read_elems[0].startswith('@'): 
48 | sys.stdout.write(line) 
49 | continue 
50 | 
51 | # Keep taking lines that have the same qname 
52 | if read_elems[0] == current_qname: 
53 | # Add line to current reads 
54 | current_reads.append(line) 
55 | pass 
56 | 
57 | else: 
58 | # Discard if there are more than the alignment cutoff 
59 | if len(current_reads) >= alignment_cutoff: 
60 | 
current_reads = [line] 
61 | current_qname = read_elems[0] 
62 | elif len(current_reads) > 0: 
63 | # Just output all reads, which are then filtered with samtools 
64 | for read in current_reads: 
65 | sys.stdout.write(str(read)) 
66 | # And then discard 
67 | current_reads = [line] 
68 | current_qname = read_elems[0] 
69 | else: 
70 | # First read in file 
71 | current_reads.append(line) 
72 | current_qname = read_elems[0] 
73 | 
74 | # flush the final qname group at the end of the input; 
75 | # without this, the reads of the last qname would be silently dropped 
76 | if len(current_reads) > 0 and len(current_reads) < alignment_cutoff: 
77 | for read in current_reads: 
78 | sys.stdout.write(str(read)) 
79 | 
80 | 
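A short usage sketch for the multimapper filter above (the file names and the -k value are hypothetical; the exact invocation inside this pipeline is not shown in this excerpt). The script reads SAM on stdin and writes SAM on stdout, so it slots between two samtools calls: 

samtools view -h aligned.bam | python src/assign_multimappers.py -k 4 --paired-end | samtools view -bS - > filtered.bam 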
-------------------------------------------------------------------------------- /src/peak_distribution.py: -------------------------------------------------------------------------------- 
1 | #!/usr/bin/env python 
2 | 
3 | """ 
4 | This program computes the distribution of peak fragment length and the number of aligned reads (read density) 
5 | used for benchmarking the ATAC-seq pipeline 
6 | 
7 | 
8 | Author: Sourya Bhattacharyya 
9 | Vijay-AY lab 
10 | """ 
11 | """ 
12 | these two lines force matplotlib to not choose any X-windows 
13 | This should be declared very first 
14 | see http://stackoverflow.com/questions/2801882/generating-a-png-with-matplotlib-when-display-is-undefined 
15 | """ 
16 | import matplotlib 
17 | matplotlib.use('Agg') 
18 | 
19 | import os 
20 | from optparse import OptionParser 
21 | import re 
22 | import subprocess 
23 | import matplotlib.pyplot as plt 
24 | import numpy as np 
25 | 
26 | ##----------------------------------------------------- 
27 | # this function is useful to parse various options for input data processing 
28 | def parse_options(): 
29 | parser = OptionParser() 
30 | 
31 | parser.add_option("-I", "--INPFILE", \ 
32 | type="string", \ 
33 | action="store", \ 
34 | dest="INP_BED_FILE", \ 
35 | default="", \ 
36 | help="Input BED file containing the MACS2 peak detection results") 
37 | 
38 | parser.add_option("-R", "--REFFILE", \ 
39 | type="string", \ 
40 | action="store", \ 
41 | dest="REF_BAM_FILE", \ 
42 | default="", \ 
43 | help="Reference BAM alignment file") 
44 | 
45 | opts, args = parser.parse_args() 
46 | return opts, args 
47 | 
48 | #----------------------------------------------------- 
49 | """ 
50 | main function 
51 | """ 
52 | def main(): 
53 | opts, args = parse_options() 
54 | 
55 | INP_BEDFILE = opts.INP_BED_FILE 
56 | REF_BAMFILE = opts.REF_BAM_FILE 
57 | 
58 | k = INP_BEDFILE.rfind('/') 
59 | if (k == -1): 
60 | INP_BEDFILE_DIR = "./" 
61 | Inp_BED_only_filename = INP_BEDFILE 
62 | else: 
63 | INP_BEDFILE_DIR = INP_BEDFILE[:(k+1)] 
64 | Inp_BED_only_filename = INP_BEDFILE[(k+1):] 
65 | 
66 | k1 = Inp_BED_only_filename.rfind('.') 
67 | if (k1 == -1): 
68 | OutDN = Inp_BED_only_filename 
69 | else: 
70 | OutDN = Inp_BED_only_filename[(k1+1):] 
71 | 
72 | """ 
73 | final output directory which will store the plots and data 
74 | """ 
75 | OutDir_Name = INP_BEDFILE_DIR + OutDN 
76 | if (not os.path.exists(OutDir_Name)): 
77 | os.makedirs(OutDir_Name) 
78 | 
79 | fragment_length_list = [] 
80 | aligned_read_count_list = [] 
81 | 
82 | temp_filename = OutDir_Name + "/temp.bed" 
83 | fp_temp = open(temp_filename, "w") 
84 | 
85 | """ 
86 | scan each line of the input bed file, and note the peak fragment length 
87 | also note the number of input reads (from BAM file) which are mapped in this peak 
88 | """ 
89 | with open(INP_BEDFILE) as fp_inp: 
90 | for line in fp_inp: 
91 | # note the peak fragment length 
92 | curr_line_content = re.split(r'\s', line) 
93 | if 0: 
94 | print '\n\n Current line: ', line, ' Contents: ', curr_line_content 
95 | peak_fragment_len = int(curr_line_content[2]) - int(curr_line_content[1]) # BED intervals are half-open, so end - start gives the length 
96 | if 0: 
97 | print 'peak_fragment_len: ', peak_fragment_len 
98 | # write the current peak line to a temporary bed file, overwriting the previous one 
99 | fp_temp.seek(0, os.SEEK_SET) 
100 | fp_temp.write(line); fp_temp.truncate(); fp_temp.flush() # truncate stale content and flush so samtools sees the current peak 
101 | # now count the number of mapped reads to this peak 
102 | sys_cmd = "samtools view -cL " + str(temp_filename) + " " + str(REF_BAMFILE) 
103 | read_count = int((subprocess.Popen(sys_cmd, stdout=subprocess.PIPE, shell=True)).stdout.read()) 
104 | if 0: 
105 | print 'read_count: ', read_count 
106 | # now append the values in designated lists 
107 | # maintain sorted lists 
108 | n = len(fragment_length_list) 
109 | if (n == 0): 
110 | # very first element 
111 | fragment_length_list.append(peak_fragment_len) 
112 | aligned_read_count_list.append(read_count) 
113 | else: 
114 | flag = False 
115 | for i in xrange((n-1), -1, -1): 
116 | if (peak_fragment_len == fragment_length_list[i]): 
117 | aligned_read_count_list[i] = aligned_read_count_list[i] + read_count 
118 | flag = True 
119 | break 
120 | elif (peak_fragment_len > fragment_length_list[i]): 
121 | if (i == (n-1)): 
122 | fragment_length_list.append(peak_fragment_len) 
123 | aligned_read_count_list.append(read_count) 
124 | flag = True 
125 | else: 
126 | fragment_length_list.insert(i+1, peak_fragment_len) 
127 | aligned_read_count_list.insert(i+1, read_count) 
128 | flag = True 
129 | break 
130 | 
131 | if (flag == False): 
132 | # condition for insertion at the first location 
133 | fragment_length_list.insert(0, peak_fragment_len) 
134 | aligned_read_count_list.insert(0, read_count) 
135 | 
136 | # close the temporary file 
137 | fp_temp.close() 
138 | 
139 | # remove the temporary bed file 
140 | os.system("rm " + temp_filename) 
141 | 
142 | """ 
143 | open a text file with two columns 
144 | first column will show the peak fragment length 
145 | second column displays the read count 
146 | the plot file is stored in the same directory containing macs2 results 
147 | """ 
148 | plot_data_textfile = OutDir_Name + "/plot.txt" 
149 | fp = open(plot_data_textfile, "w") 
150 | fp.write("Peak_Length" + "\t" + "Read_Count") 
151 | for i in range(len(fragment_length_list)): 
152 | fp.write("\n" + str(fragment_length_list[i]) + "\t" + str(aligned_read_count_list[i])) 
153 | fp.close() 
154 | 
155 | # """ 
156 | # create a read count list which will contain the no of read count in 1K scale 
157 | # """ 
158 | # read_count_list_1K_scale = [((aligned_read_count_list[i] * 1.0) / 1000) for i in range(len(aligned_read_count_list))] 
159 | 
160 | """ 
161 | Now plot the statistics 
162 | """ 
163 | # OutPlotFile = OutDir_Name + "/test_1K_Scale.pdf" 
164 | # f = plt.figure() 
165 | # plt.plot(fragment_length_list, read_count_list_1K_scale, ls='-', lw=2.0) 
166 | # plt.xlabel('Fragment length (bp)') 
167 | # plt.ylabel('Norm Read count') 
168 | # plt.title('ATAC seq - read density vs fragment length') 
169 | # f.savefig(OutPlotFile, bbox_inches='tight') 
170 | 
171 | OutPlotFile2 = OutDir_Name + "/test_LOG_Scale.pdf" 
172 | f = plt.figure() 
173 | plt.semilogy(fragment_length_list, aligned_read_count_list, ls='-', lw=2.0) 
174 | # plt.semilogy(fragment_length_list, np.exp(-np.asarray(aligned_read_count_list)/5.0), ls='-', lw=2.0) 
175 | # plt.plot(fragment_length_list, aligned_read_count_list, ls='-', lw=2.0) 
176 | # plt.yscale('log', basey=10) 
177 | plt.xlabel('Fragment length (bp)') 
178 | plt.ylabel('Read count') 
179 | plt.title('ATAC seq - read density vs fragment length') 
180 | f.savefig(OutPlotFile2, bbox_inches='tight') 
181 | 
182 | #----------------------------------------------------- 
183 | if __name__ == "__main__": 
184 | main() 
185 | 
186 | 
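The per-peak counting loop in peak_distribution.py launches one samtools process per peak, which becomes slow for large peak sets. A hedged alternative sketch (assuming bedtools is installed, which this repo already relies on for bamToBed; file names are placeholders) computes all per-peak read counts in a single pass and reproduces the two columns of plot.txt: 

bedtools multicov -bams sample.bam -bed sample.macs2_peaks.narrowPeak | awk 'BEGIN{OFS="\t"} {print $3-$2, $NF}' > plot.txt 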
-------------------------------------------------------------------------------- /src/trim_adapters.py: -------------------------------------------------------------------------------- 
1 | #!/usr/bin/env python2.7 
2 | 
3 | # Author: Jason Buenrostro, Stanford University 
4 | # The following program will compress daisy chain seq data into single molecules 
5 | 
6 | ##### IMPORT MODULES ##### 
7 | # import necessary for python 
8 | import os 
9 | import re 
10 | import sys 
11 | import gzip 
12 | import string 
13 | import Levenshtein 
14 | from optparse import OptionParser 
15 | 
16 | ##### DEFINE FUNCTIONS ##### 
17 | # Reverse complement 
18 | complement = string.maketrans('ATCGN', 'TAGCN') 
19 | def reverse_complement(sequence): 
20 | return sequence.upper().translate(complement)[::-1] 
21 | 
22 | # Align with mismatch, find first and move on, assumes only one 
23 | def fuzz_align(s_seq,l_seq,mismatch): 
24 | for i, base in enumerate(l_seq): # loop through equal size windows 
25 | l_subset = l_seq[i:i+len(s_seq)] 
26 | dist = Levenshtein.distance(l_subset, s_seq) 
27 | if dist <= mismatch: # find first then stop 
28 | return i, dist 
29 | # (the return above already stops at the first hit) 
30 | 
31 | # added by Jin Lee for hot fix (output name bug) 
32 | def rreplace(s, old, new, occurrence): 
33 | li = s.rsplit(old, occurrence) 
34 | return new.join(li) 
35 | 
36 | #### OPTIONS #### 
37 | # define options 
38 | opts = OptionParser() 
39 | usage = "usage: %prog [options] [inputs] This will trim adapters" 
40 | opts = OptionParser(usage=usage) 
41 | opts.add_option("-a", help=" Accepts fastq or fastq.gz") 
42 | opts.add_option("-b", help=" Accepts fastq or fastq.gz") 
43 | opts.add_option("-d", help=" Output directory storing the trimmed files") 
44 | options, arguments = opts.parse_args() 
45 | 
46 | # return usage information if no argvs given AND they're not available in the environment 
47 | # command line arguments always override environment variables 
48 | if len(sys.argv)==1: 
49 | p1_in = os.environ.get('P1_IN') 
50 | p2_in = os.environ.get('P2_IN') 
51 | # default output directory 
52 | OutDir=os.getcwd() 
53 | if (p1_in is None) or (p2_in is None): 
54 | os.system(sys.argv[0]+" --help") 
55 | sys.exit() 
56 | else: 
57 | ##### INPUTS AND OUTPUTS ##### 
58 | # name input and outputs 
59 | p1_in = options.a 
60 | p2_in = options.b 
61 | OutDir = options.d 
62 | 
63 | # name outputs and print to working dir 
64 | p1_file = p1_in.split('/')[-1] 
65 | p2_file = p2_in.split('/')[-1] 
66 | p1_out = re.sub(".fastq", ".trim.fastq", p1_file) 
67 | p2_out = re.sub(".fastq", ".trim.fastq", p2_file) 
68 | 
69 | #check for file type and open input file 
70 | append = p1_in.split('.')[-1] 
71 | if append == "fastq": 
72 | p1_rds = open(p1_in,'r') 
73 | p2_rds = open(p2_in,'r') 
74 | p1_out = re.sub(".fastq", ".trim.fastq", p1_file) 
75 | p2_out = re.sub(".fastq", ".trim.fastq", p2_file) 
76 | elif append == "fq": 
77 | p1_rds = open(p1_in,'r') 
78 | p2_rds = open(p2_in,'r') 
79 | p1_out = re.sub(".fq", ".trim.fastq", p1_file) 
80 | p2_out = re.sub(".fq", ".trim.fastq", p2_file) 
81 | elif append == "gz": 
82 | p1_rds = gzip.open(p1_in,'r') 
83 | p2_rds = gzip.open(p2_in,'r') 
84 | p1_out = re.sub(".fastq.gz", ".trim.fastq", p1_file) 
85 | p2_out = re.sub(".fastq.gz", ".trim.fastq", p2_file) 
86 | p1_out = re.sub(".fq.gz", ".trim.fastq", p1_out) 
87 | p2_out = re.sub(".fq.gz", ".trim.fastq", p2_out) 
88 | else: 
89 | sys.exit("ERROR! 
The input files must be a .fastq or .fastq.gz") 
90 | 
91 | 
92 | #================ 
93 | # output files are placed within the specified output directory 
94 | p1_out = OutDir + '/' + p1_out 
95 | p2_out = OutDir + '/' + p2_out 
96 | #================= 
97 | 
98 | ##### SCRIPT ##### 
99 | # initialize variables 
100 | i=0;j=0;k=0;tot_b=0;count=1 
101 | n=20 # match seq 
102 | mismatch=1 # only allow 0-1 mismatches for now, if allow two then gets mis indexed, to fix this need to change fuzz_align to save L as a vector and reiterate to find 2nd 
103 | 
104 | # initialize write files 
105 | r1_write = open(p1_out, 'w') 
106 | r2_write = open(p2_out, 'w') 
107 | 
108 | while 1: 
109 | # read lines 
110 | p1_line = p1_rds.readline() 
111 | p2_line = p2_rds.readline() 
112 | 
113 | # break if at end of file 
114 | if not p1_line: 
115 | break 
116 | 
117 | # load fastq into memory 
118 | if count ==1: 
119 | seqhead1 = p1_line 
120 | seqhead2 = p2_line 
121 | elif count ==2: 
122 | seq1 = p1_line.rstrip() 
123 | seq2 = p2_line.rstrip() 
124 | elif count ==3: 
125 | qualhead1 = p1_line 
126 | qualhead2 = p2_line 
127 | elif count ==4: 
128 | qual1 = p1_line.rstrip() 
129 | qual2 = p2_line.rstrip() 
130 | 
131 | # align reads to themselves 
132 | i = i+1 # total reads 
133 | rc_seq2 = reverse_complement(seq2[0:n]) 
134 | idx = seq1.rfind(rc_seq2) # look for perfect match 
135 | if idx > 0: 
136 | j = j+1 # 0 mismatches 
137 | elif mismatch>0: 
138 | hold = fuzz_align(rc_seq2,seq1,mismatch) # else allow for mismatch 
139 | if hold: 
140 | idx,mis=hold 
141 | if mis == 1: 
142 | k=k+1 # 1 mismatch 
143 | 
144 | # trim reads if idx exist 
145 | if idx > 0: 
146 | # keep track on how much trimming 
147 | tot_b = tot_b+len(seq2[idx+n:-1]) #track total bases trimmed 
148 | 
149 | # trim data 
150 | seq1 = seq1[0:idx+n-1] # modified to sub1 because some aligners (bowtie) dont like perfectly overlapping reads 
151 | seq2 = seq2[0:idx+n-1] 
152 | qual1 = qual1[0:idx+n-1] 
153 | qual2 = qual2[0:idx+n-1] 
154 | 
155 | # print read1 
156 | r1_write.write(seqhead1) 
157 | r1_write.write(seq1+"\n") 
158 | r1_write.write(qualhead1) 
159 | r1_write.write(qual1+"\n") 
160 | 
161 | # print read2 
162 | r2_write.write(seqhead2) 
163 | r2_write.write(seq2+"\n") 
164 | r2_write.write(qualhead2) 
165 | r2_write.write(qual2+"\n") 
166 | 
167 | # increment count 
168 | count = count + 1 
169 | if count == 5: 
170 | count = 1 
171 | else: 
172 | count = count 
173 | 
174 | # close files to write the file 
175 | r1_write.close() 
176 | r2_write.close() 
177 | p1_rds.close() 
178 | p2_rds.close() 
179 | 
180 | # write file output names for passing into next step of pipeline 
181 | # !!! DO NOT WRITE ANYTHING ELSE TO STDOUT AFTER THIS !!! 
182 | sys.stdout.write(p1_out + '\n') 
183 | sys.stdout.write(p2_out + '\n') 
184 | 
185 | # give summary 
186 | sys.stderr.write(str(i)+" sequences total\n") 
187 | sys.stderr.write(str(j)+" sequences trimmed with 0 mismatches\n") 
188 | sys.stderr.write(str(k)+" sequences trimmed with 1 mismatch\n") 
189 | sys.stderr.write(str(tot_b/(j+k) if (j+k) > 0 else 0)+" mean number of bases trimmed for reads requiring trimming\n") 
--------------------------------------------------------------------------------
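A short usage sketch for trim_adapters.py (paths are hypothetical). The two trimmed file names are written to stdout, one per line, for the next pipeline stage to consume, while the trimming summary goes to stderr; if no arguments are given, the script falls back to the P1_IN / P2_IN environment variables: 

python src/trim_adapters.py -a sample_R1.fastq.gz -b sample_R2.fastq.gz -d trimmed_out 1> trimmed_names.txt 2> trim_summary.log 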