├── Analysis ├── PeakAnnotateHomerSummary.r ├── PlotChart.r └── ResSummary.r ├── IDR_Codes ├── IDRAnalysis.sh ├── IDRMain.sh ├── IDRScatterPlot.r ├── IDRSummary.r ├── IDR_SubSampleBAM.sh └── IDR_SubSampleBAM_Main.sh ├── Imp_Scripts ├── Footprint_HINT_ATAC.R ├── Motif_HOMER.R └── Peak_Enrichment.R ├── README.md ├── bin ├── ATACSeqQC.r ├── BigWigTrackCreate.sh ├── CorrelationBAMPeak.sh ├── CorrelationPeakPlot.r ├── Sample_ATACseqQC_script.r ├── TagAlign.sh ├── bam_to_bigwig.sh └── pipeline.sh ├── configfile ├── configfile_hg19 ├── configfile_hg38 ├── configfile_mm10 ├── configfile_mm9 ├── pipeline_exec.sh ├── sample_IDRScript.sh └── src ├── PlotSample.py ├── assign_multimappers.py ├── peak_distribution.py └── trim_adapters.py /Analysis/PeakAnnotateHomerSummary.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | #============================= 4 | # this code plots the percentage of genomic annotations 5 | # for each peak file generated from the MACS2 command 6 | # of a ChIP-seq pipeline 7 | # input is a HOMER annotation of the peak file 8 | # corresponding to different genomic segments 9 | #============================= 10 | 11 | args <- commandArgs(TRUE) 12 | if(length(args)<1) { 13 | q("no") 14 | } 15 | 16 | HomerAnnotFile <- args[1] 17 | HomerPeakTSSDistFile <- args[2] 18 | 19 | InpDir <- dirname(HomerAnnotFile) 20 | 21 | #==================================== 22 | # first process the peak annotations 23 | # produced by HOMER 24 | #==================================== 25 | 26 | # first initialize different annotation categories 27 | npeak_3UTR <- 0 28 | npeak_TTS <- 0 29 | npeak_Exon <- 0 30 | npeak_Intron <- 0 31 | npeak_Intergenic <- 0 32 | npeak_Promoter <- 0 33 | npeak_5UTR <- 0 34 | 35 | # open the file and read line by line 36 | # extract performance values 37 | finp <- file(HomerAnnotFile, "r") 38 | lineset <- readLines(finp) 39 | for (i in 1:length(lineset)) { 40 | curr_line <- trimws(lineset[i], which = "both") 41 | curr_line_split <- strsplit(curr_line,"\t")[[1]] 42 | # cat(sprintf("\n curr_line : %s ", curr_line_split)) 43 | # cat(sprintf("\n elem 1: %s elem 2: %s elem 3: %s elem 4: %s ", curr_line_split[1], curr_line_split[2], curr_line_split[3], curr_line_split[4])) 44 | if (regexpr("3UTR", curr_line) > 0) { 45 | npeak_3UTR <- as.numeric(curr_line_split[2]) 46 | } else if (regexpr("TTS", curr_line) > 0) { 47 | npeak_TTS <- as.numeric(curr_line_split[2]) 48 | } else if (regexpr("Exon", curr_line) > 0) { 49 | npeak_Exon <- as.numeric(curr_line_split[2]) 50 | } else if (regexpr("Intron", curr_line) > 0) { 51 | npeak_Intron <- as.numeric(curr_line_split[2]) 52 | } else if (regexpr("Intergenic", curr_line) > 0) { 53 | npeak_Intergenic <- as.numeric(curr_line_split[2]) 54 | } else if (regexpr("Promoter", curr_line) > 0) { 55 | npeak_Promoter <- as.numeric(curr_line_split[2]) 56 | } else if (regexpr("5UTR", curr_line) > 0) { 57 | npeak_5UTR <- as.numeric(curr_line_split[2]) 58 | } 59 | } 60 | 61 | # close the input file 62 | close(finp) 63 | 64 | cat(sprintf("\n npeak_3UTR : %s ", npeak_3UTR)) 65 | cat(sprintf("\n npeak_TTS : %s ", npeak_TTS)) 66 | cat(sprintf("\n npeak_Exon : %s ", npeak_Exon)) 67 | cat(sprintf("\n npeak_Intron : %s ", npeak_Intron)) 68 | cat(sprintf("\n npeak_Intergenic : %s ", npeak_Intergenic)) 69 | cat(sprintf("\n npeak_Promoter : %s ", npeak_Promoter)) 70 | cat(sprintf("\n npeak_5UTR : %s ", npeak_5UTR)) 71 | 72 | # now prepare a vector of the above categories (provided non zero instances) 73 | # 
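# (note - assumed input format) the HOMER annotation-statistics file parsed
# above is expected to be tab-separated, one category per line, e.g.:
#   3UTR    <number of peaks>    <total size in bp>    <log2 enrichment>
# only the first two columns (category name and peak count) are used here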
# to create a pie chart
74 | slices <- c()
75 | lbls <- c()
76 | if (npeak_3UTR > 0) {
77 | slices <- c(slices, npeak_3UTR)
78 | lbls <- c(lbls, "3UTR")
79 | }
80 | if (npeak_TTS > 0) {
81 | slices <- c(slices, npeak_TTS)
82 | lbls <- c(lbls, "TTS")
83 | }
84 | if (npeak_Exon > 0) {
85 | slices <- c(slices, npeak_Exon)
86 | lbls <- c(lbls, "Exon")
87 | }
88 | if (npeak_Intron > 0) {
89 | slices <- c(slices, npeak_Intron)
90 | lbls <- c(lbls, "Intron")
91 | }
92 | if (npeak_Intergenic > 0) {
93 | slices <- c(slices, npeak_Intergenic)
94 | lbls <- c(lbls, "Intergenic")
95 | }
96 | if (npeak_Promoter > 0) {
97 | slices <- c(slices, npeak_Promoter)
98 | lbls <- c(lbls, "Promoter")
99 | }
100 | if (npeak_5UTR > 0) {
101 | slices <- c(slices, npeak_5UTR)
102 | lbls <- c(lbls, "5UTR")
103 | }
104 | 
105 | # convert the vector to include the percentage values as well
106 | # for displaying in the pie chart
107 | pct <- round(slices/sum(slices)*100)
108 | lbls <- paste(lbls, pct) # add percents to labels
109 | lbls <- paste(lbls,"%",sep="") # add % to labels
110 | 
111 | OutPlotFile <- paste0(InpDir, "/Pie_Chart_Peak_Annotation.pdf")
112 | pdf(OutPlotFile, width=6, height=4)
113 | pie(slices, labels = lbls, col=rainbow(length(lbls)), main="Pie Chart of peak annotation", radius = 1, cex = 0.5)
114 | dev.off()
115 | 
116 | #====================================
117 | # then process the distance from the (nearest) TSS sites
118 | # for individual peaks
119 | # the histogram data is already provided
120 | #====================================
121 | # first remove the header line from the input file
122 | tempfile <- paste0(InpDir, '/temp_TSSDistFile.bed')
123 | system(paste("awk \'NR>1\'", HomerPeakTSSDistFile, ">", tempfile))
124 | 
125 | # now process the temporary file
126 | PeakTSSData <- read.table(tempfile, header=T)
127 | OutPlotFile <- paste0(InpDir, "/Peak_TSS_Distance.pdf")
128 | pdf(OutPlotFile, width=6, height=4)
129 | plot(PeakTSSData[,1], PeakTSSData[,2], cex=0.5, col="red", xlab="Distance from TSS", ylab="ChIP fragment depth (per bp per peak)")
130 | title("Peak distribution near TSS sites")
131 | dev.off()
132 | 
133 | # then remove the temporary file
134 | system(paste("rm", tempfile))
135 | 
136 | 
-------------------------------------------------------------------------------- /Analysis/PlotChart.r: --------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | 
3 | args <- commandArgs(TRUE)
4 | 
5 | data <- read.table(args[1])
6 | 
7 | png(file = args[2], width=15, height=10, units="cm", res=1200)
8 | 
9 | barplot(data[,2], names.arg = data[,1], xlab = "Chromosome", ylab = "Count", col = "blue", main = paste0("Chromosome distribution_", args[3]))
10 | 
11 | dev.off()
-------------------------------------------------------------------------------- /Analysis/ResSummary.r: --------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | 
3 | #==================================
4 | # used to print the summary result statistics
5 | # from a collection of ATAC seq samples
6 | # this script is to be called on the topmost directory structure
7 | # containing all the ATAC seq sample folders
8 | 
9 | # author: Sourya Bhattacharyya
10 | # Vijay-AY lab
11 | #==================================
12 | 
13 | library(optparse)
14 | library(ggplot2)
15 | library(plotly)
16 | 
17 | #==============
18 | # function to plot scatter using plotly package
19 | #==============
20 | PlotScatter_Data <- function(InpDF, ylabel, plotfile) { 21
| colnames(InpDF) <- c('X', 'Y') 22 | currplot <- plotly::plot_ly(InpDF, x= ~X, y= ~Y, name=ylabel, type="scatter", mode="markers", marker=list(size=10, color= "blue")) %>% layout(xaxis = list(title = 'Samples', zeroline = FALSE, showticklabels = FALSE), yaxis = list(title = ylabel, zeroline = FALSE)) 23 | htmlwidgets::saveWidget(currplot, plotfile) 24 | } 25 | 26 | #=========================================================== 27 | option_list = list( 28 | make_option(c("--BaseDir"), type="character", default=NULL, help="Base directory containing all the ATAC-seq samples. Mandatory parameter."), 29 | make_option(c("--OutDir"), type="character", default=NULL, help="Output directory to contain the summary results. If empty, current directory is used.") 30 | ); 31 | 32 | opt_parser = OptionParser(option_list=option_list); 33 | opt = parse_args(opt_parser); 34 | 35 | if (is.null(opt$BaseDir)) { 36 | print_help(opt_parser) 37 | stop("ERROR !!!!!!! Base output directory is not provided - check the option --BaseDir \n", call.=FALSE) 38 | } 39 | 40 | if (is.null(opt$OutDir)) { 41 | OutDir <- getwd() 42 | } else { 43 | OutDir <- opt$OutDir 44 | system(paste("mkdir -p", OutDir)) 45 | } 46 | 47 | # check if the last character of the output base directory is '/' 48 | # unless append that character 49 | baseresdir <- opt$BaseDir 50 | if (substr(baseresdir,nchar(baseresdir),nchar(baseresdir)) != '/') { 51 | baseresdir <- paste0(baseresdir, '/') 52 | } 53 | 54 | # template name of the directories containing MACS2 results 55 | # either default parameters are used 56 | # or extsize based parameters are employed 57 | MACS2_def_dir <- 'MACS2_Default_Tag' 58 | MACS2_ext_dir <- 'MACS2_Ext_Tag' 59 | # MACS2_noduprem_ext_dir <- 'MACS2_NoDupRem_Align_Ext_Tag' 60 | 61 | # template name of the folders (peak outputs) depending on whether control (input) are used for peak detection 62 | Ctrl_0_Fold <- '_No_Control' 63 | Ctrl_1_Fold <- '_with_Control' 64 | 65 | # file formats of NRF, read statistics, FRiP and Peak statistics 66 | # which are present for every sample 67 | ReadStatFileNameFmt <- 'Read_Count_Stat.txt' 68 | NRFfilenamefmt <- 'out_NRF' 69 | FRiPFileNameFmt <- 'out_FRiP.txt' 70 | PeakCountFileFmt <- 'Peak_Statistics.txt' 71 | 72 | # output text file to contain the summary results 73 | outtextfile <- paste0(baseresdir, 'Results_All_Samples_Summary.txt') 74 | 75 | #================================= 76 | file_process <- FALSE 77 | 78 | # process individual directories under the main results directory 79 | dir.list <- list.dirs(path = baseresdir, full.names = FALSE, recursive = FALSE) 80 | for (dr in dir.list) { 81 | 82 | # cat(sprintf("\n Examining directory: %s \n", dr)) 83 | 84 | #================== 85 | # following file stores the count of reads throughout various stages of filtering 86 | #================== 87 | ReadStatFile <- paste0(baseresdir, dr, "/", ReadStatFileNameFmt) 88 | if (file.exists(ReadStatFile) && (file.access(ReadStatFile, 4) == 0)) { 89 | cat(sprintf("\n Found the file: %s \n", ReadStatFile)) 90 | 91 | # search for a file with name *_R1*.fastq.gz in the current file 92 | filenames <- Sys.glob(paste0(baseresdir, dr, "/*_R1*.fastq.gz")) 93 | currSampleName <- substr(basename(filenames[1]), start=1, stop=regexpr("_R1", basename(filenames[1]))-1) 94 | 95 | x <- readLines(ReadStatFile) 96 | lastline <- strsplit(x[length(x)], "\t")[[1]] 97 | 98 | # the line should have 8 or 7 fields 99 | # 8 fields if fastq file is used in the pipeline 100 | # 7 fields if already aligned file is used in the 
pipeline
101 | nfields <- length(lastline)
102 | cat(sprintf("\n No of fields in the read statistics file: %s ", nfields))
103 | 
104 | TotRead <- as.integer(lastline[nfields-6]) #lastline[2]
105 | MappableRead <- as.integer(lastline[nfields-5]) #lastline[3]
106 | Frac_Mappable_Read <- ((MappableRead * 1.0) / TotRead)
107 | Frac_Unmappable_Read <- (((TotRead - MappableRead) * 1.0) / TotRead)
108 | Read_remain_after_RandomDel <- as.integer(lastline[nfields-4]) #lastline[4]
109 | Frac_reads_remain_after_RandomDel <- ((Read_remain_after_RandomDel * 1.0) / TotRead)
110 | Frac_reads_deleted_random <- (((MappableRead - Read_remain_after_RandomDel) * 1.0) / TotRead)
111 | Read_remain_after_Mitochondrial_Read_Del <- as.integer(lastline[nfields-3]) #lastline[5]
112 | Frac_reads_remain_after_MtReadDel <- ((Read_remain_after_Mitochondrial_Read_Del * 1.0) / TotRead)
113 | Frac_reads_deleted_MtRead <- (((Read_remain_after_RandomDel - Read_remain_after_Mitochondrial_Read_Del) * 1.0) / TotRead)
114 | UniqMappedRead <- as.integer(lastline[nfields-2]) #lastline[6]
115 | Frac_reads_unique_mapped <- ((UniqMappedRead * 1.0) / TotRead)
116 | Frac_reads_del_multimap <- (((Read_remain_after_Mitochondrial_Read_Del - UniqMappedRead) * 1.0) / TotRead)
117 | ReadQualThr <- as.integer(lastline[nfields-1]) #lastline[7]
118 | Frac_reads_remain_QualThr <- ((ReadQualThr * 1.0) / TotRead)
119 | Frac_reads_del_QualThr <- (((UniqMappedRead - ReadQualThr) * 1.0) / TotRead)
120 | Dupl_Rem_Read <- as.integer(lastline[nfields]) #lastline[9]
121 | Frac_reads_remain_Dupl <- ((Dupl_Rem_Read * 1.0) / TotRead)
122 | Frac_reads_del_Dupl <- (((ReadQualThr - Dupl_Rem_Read) * 1.0) / TotRead)
123 | 
124 | # append the entries in the final vector
125 | CurrOutVec <- c(basename(dr), currSampleName, TotRead, MappableRead, Frac_Mappable_Read, Frac_Unmappable_Read, Read_remain_after_RandomDel, Frac_reads_remain_after_RandomDel, Frac_reads_deleted_random, Read_remain_after_Mitochondrial_Read_Del, Frac_reads_remain_after_MtReadDel, Frac_reads_deleted_MtRead, UniqMappedRead, Frac_reads_unique_mapped, Frac_reads_del_multimap, ReadQualThr, Frac_reads_remain_QualThr, Frac_reads_del_QualThr, Dupl_Rem_Read, Frac_reads_remain_Dupl, Frac_reads_del_Dupl)
126 | 
127 | #==================
128 | # the following file stores the NRF / library complexity value
129 | #==================
130 | filenames <- Sys.glob(paste0(baseresdir, dr, "/*", NRFfilenamefmt, "*.txt"))
131 | if (length(filenames) > 0) {
132 | NRF_textfile <- filenames[1]
133 | if (file.exists(NRF_textfile) && (file.access(NRF_textfile, 4) == 0)){
134 | x <- readLines(NRF_textfile)
135 | # the 2nd line, in string-split structure
136 | lastline <- strsplit(x[length(x)], "\t")[[1]]
137 | UniqMappedPos <- lastline[2]
138 | NRF_val <- lastline[3]
139 | M1 <- lastline[4]
140 | M2 <- lastline[5]
141 | PBC1 <- lastline[6]
142 | PBC2 <- lastline[7]
143 | # adjust the output vector
144 | CurrOutVec <- c(CurrOutVec, UniqMappedPos, NRF_val, M1, M2, PBC1, PBC2)
145 | } else {
146 | CurrOutVec <- c(CurrOutVec, rep('NA', 6))
147 | }
148 | } else {
149 | CurrOutVec <- c(CurrOutVec, rep('NA', 6))
150 | }
151 | 
152 | #==================
153 | # check the peak directories and find corresponding statistics
154 | #==================
155 | 
156 | #==================
157 | # FRiP and Peak count measures - default peak calling - no control
158 | #==================
159 | FRiP_textfile_def_noctrl <- paste0(baseresdir, dr, "/", MACS2_def_dir, Ctrl_0_Fold, "/", FRiPFileNameFmt)
160 | if (file.exists(FRiP_textfile_def_noctrl) &&
(file.access(FRiP_textfile_def_noctrl, 4) == 0)){ 161 | x <- readLines(FRiP_textfile_def_noctrl) 162 | lastline <- strsplit(x[length(x)], "\t")[[1]] 163 | MappedReadPeak_def_noctrl <- lastline[2] 164 | FRiP_def_noctrl <- lastline[3] 165 | # adjust the output vector 166 | CurrOutVec <- c(CurrOutVec, MappedReadPeak_def_noctrl, FRiP_def_noctrl) 167 | } else { 168 | CurrOutVec <- c(CurrOutVec, rep('NA', 2)) 169 | } 170 | 171 | PeakCount_TextFile_def_noctrl <- paste0(baseresdir, dr, "/", MACS2_def_dir, Ctrl_0_Fold, "/", PeakCountFileFmt) 172 | if (file.exists(PeakCount_TextFile_def_noctrl) && (file.access(PeakCount_TextFile_def_noctrl, 4) == 0)){ 173 | x <- readLines(PeakCount_TextFile_def_noctrl) 174 | lastline <- strsplit(x[length(x)], "\t")[[1]] 175 | TotPeak_def_noctrl <- lastline[1] 176 | TotPeak_Q_Five_Pct_def_noctrl <- lastline[2] 177 | TotPeak_Q_One_Pct_def_noctrl <- lastline[3] 178 | # adjust the output vector 179 | CurrOutVec <- c(CurrOutVec, TotPeak_def_noctrl, TotPeak_Q_Five_Pct_def_noctrl, TotPeak_Q_One_Pct_def_noctrl) 180 | } else { 181 | CurrOutVec <- c(CurrOutVec, rep('NA', 3)) 182 | } 183 | 184 | #================== 185 | # FRiP and Peak count measures - Ext peak calling - no control 186 | #================== 187 | FRiP_textfile_ext_noctrl <- paste0(baseresdir, dr, "/", MACS2_ext_dir, Ctrl_0_Fold, "/", FRiPFileNameFmt) 188 | if (file.exists(FRiP_textfile_ext_noctrl) && (file.access(FRiP_textfile_ext_noctrl, 4) == 0)){ 189 | x <- readLines(FRiP_textfile_ext_noctrl) 190 | lastline <- strsplit(x[length(x)], "\t")[[1]] 191 | MappedReadPeak_ext_noctrl <- lastline[2] 192 | FRiP_ext_noctrl <- lastline[3] 193 | # adjust the output vector 194 | CurrOutVec <- c(CurrOutVec, MappedReadPeak_ext_noctrl, FRiP_ext_noctrl) 195 | } else { 196 | CurrOutVec <- c(CurrOutVec, rep('NA', 2)) 197 | } 198 | 199 | PeakCount_TextFile_ext_noctrl <- paste0(baseresdir, dr, "/", MACS2_ext_dir, Ctrl_0_Fold, "/", PeakCountFileFmt) 200 | if (file.exists(PeakCount_TextFile_ext_noctrl) && (file.access(PeakCount_TextFile_ext_noctrl, 4) == 0)){ 201 | x <- readLines(PeakCount_TextFile_ext_noctrl) 202 | lastline <- strsplit(x[length(x)], "\t")[[1]] 203 | TotPeak_ext_noctrl <- lastline[1] 204 | TotPeak_Q_Five_Pct_ext_noctrl <- lastline[2] 205 | TotPeak_Q_One_Pct_ext_noctrl <- lastline[3] 206 | # adjust the output vector 207 | CurrOutVec <- c(CurrOutVec, TotPeak_ext_noctrl, TotPeak_Q_Five_Pct_ext_noctrl, TotPeak_Q_One_Pct_ext_noctrl) 208 | } else { 209 | CurrOutVec <- c(CurrOutVec, rep('NA', 3)) 210 | } 211 | 212 | #================== 213 | # FRiP and Peak count measures - default peak calling - with control 214 | #================== 215 | FRiP_textfile_def_ctrl <- paste0(baseresdir, dr, "/", MACS2_def_dir, Ctrl_1_Fold, "/", FRiPFileNameFmt) 216 | if (file.exists(FRiP_textfile_def_ctrl) && (file.access(FRiP_textfile_def_ctrl, 4) == 0)){ 217 | x <- readLines(FRiP_textfile_def_ctrl) 218 | lastline <- strsplit(x[length(x)], "\t")[[1]] 219 | MappedReadPeak_def_ctrl <- lastline[2] 220 | FRiP_def_ctrl <- lastline[3] 221 | # adjust the output vector 222 | CurrOutVec <- c(CurrOutVec, MappedReadPeak_def_ctrl, FRiP_def_ctrl) 223 | } else { 224 | CurrOutVec <- c(CurrOutVec, rep('NA', 2)) 225 | } 226 | 227 | PeakCount_TextFile_def_ctrl <- paste0(baseresdir, dr, "/", MACS2_def_dir, Ctrl_1_Fold, "/", PeakCountFileFmt) 228 | if (file.exists(PeakCount_TextFile_def_ctrl) && (file.access(PeakCount_TextFile_def_ctrl, 4) == 0)){ 229 | x <- readLines(PeakCount_TextFile_def_ctrl) 230 | lastline <- strsplit(x[length(x)], "\t")[[1]] 231 | 
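# (note - assumed file format) Peak_Statistics.txt is expected to hold a single
# tab-separated record: <total peaks> <peaks with Q<0.05> <peaks with Q<0.01>;
# the three fields are read positionally below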
TotPeak_def_ctrl <- lastline[1]
232 | TotPeak_Q_Five_Pct_def_ctrl <- lastline[2]
233 | TotPeak_Q_One_Pct_def_ctrl <- lastline[3]
234 | # adjust the output vector
235 | CurrOutVec <- c(CurrOutVec, TotPeak_def_ctrl, TotPeak_Q_Five_Pct_def_ctrl, TotPeak_Q_One_Pct_def_ctrl)
236 | } else {
237 | CurrOutVec <- c(CurrOutVec, rep('NA', 3))
238 | }
239 | 
240 | #==================
241 | # FRiP and Peak count measures - Ext peak calling - with control
242 | #==================
243 | FRiP_textfile_ext_ctrl <- paste0(baseresdir, dr, "/", MACS2_ext_dir, Ctrl_1_Fold, "/", FRiPFileNameFmt)
244 | if (file.exists(FRiP_textfile_ext_ctrl) && (file.access(FRiP_textfile_ext_ctrl, 4) == 0)){
245 | x <- readLines(FRiP_textfile_ext_ctrl)
246 | lastline <- strsplit(x[length(x)], "\t")[[1]]
247 | MappedReadPeak_ext_ctrl <- lastline[2]
248 | FRiP_ext_ctrl <- lastline[3]
249 | # adjust the output vector
250 | CurrOutVec <- c(CurrOutVec, MappedReadPeak_ext_ctrl, FRiP_ext_ctrl)
251 | } else {
252 | CurrOutVec <- c(CurrOutVec, rep('NA', 2))
253 | }
254 | 
255 | PeakCount_TextFile_ext_ctrl <- paste0(baseresdir, dr, "/", MACS2_ext_dir, Ctrl_1_Fold, "/", PeakCountFileFmt)
256 | if (file.exists(PeakCount_TextFile_ext_ctrl) && (file.access(PeakCount_TextFile_ext_ctrl, 4) == 0)){
257 | x <- readLines(PeakCount_TextFile_ext_ctrl)
258 | lastline <- strsplit(x[length(x)], "\t")[[1]]
259 | TotPeak_ext_ctrl <- lastline[1]
260 | TotPeak_Q_Five_Pct_ext_ctrl <- lastline[2]
261 | TotPeak_Q_One_Pct_ext_ctrl <- lastline[3]
262 | # adjust the output vector
263 | CurrOutVec <- c(CurrOutVec, TotPeak_ext_ctrl, TotPeak_Q_Five_Pct_ext_ctrl, TotPeak_Q_One_Pct_ext_ctrl)
264 | } else {
265 | CurrOutVec <- c(CurrOutVec, rep('NA', 3))
266 | }
267 | 
268 | # now convert the current vector to a single-row data frame
269 | CurrDF <- data.frame(matrix(CurrOutVec, nrow=1), stringsAsFactors=FALSE)
270 | colnames(CurrDF) <- c('Dir', 'SampleName', 'Total_Read', 'Number_of_Mappable_Reads', 'Fraction_of_Mappable_Reads', 'Fraction_of_Unmappable_Reads', 'Reads_after_random_chromosome_deletion', 'Fraction_reads_remain_after_random_chromosome_deletion', 'Fraction_reads_in_random_chromosome', 'Reads_excluding_Mitochondrial_Reads', 'Fraction_reads_excluding_MtRead', 'Fraction_mitochondrial_reads', 'UniqMappedRead', 'Fraction_reads_unique_mapped', 'Fraction_reads_multimap', 'Reads_remain_after_QualThr', 'Fraction_reads_remain_QualThr', 'Frac_reads_low_qual', 'Reads_after_dupl_remove', 'Fraction_de-duplicated_reads', 'Fraction_duplicate_reads', 'UniqMappedPos', 'NRF', 'M1', 'M2', 'PBC1', 'PBC2', 'MappedReadPeak_Def_noctrl(Q<0.05)', 'FRiP_Def_NoCtrl(Q<0.05)', 'nPeak_Def_NoCtrl', 'nPeak_Def_NoCtrl(Q<0.05)', 'nPeak_Def_NoCtrl(Q<0.01)', 'MapReadPeak_Ext_NoCtrl(Q<0.05)', 'FRiP_Ext_NoCtrl(Q<0.05)', 'nPeak_Ext_NoCtrl', 'nPeak_Ext_NoCtrl(Q<0.05)', 'nPeak_Ext_NoCtrl(Q<0.01)', 'MapReadPeak_Def_Ctrl(Q<0.05)', 'FRiP_Def_Ctrl(Q<0.05)', 'nPeak_Def_Ctrl', 'nPeak_Def_Ctrl(Q<0.05)', 'nPeak_Def_Ctrl(Q<0.01)', 'MapReadPeak_Ext_Ctrl(Q<0.05)', 'FRiP_Ext_Ctrl(Q<0.05)', 'nPeak_Ext_Ctrl', 'nPeak_Ext_Ctrl(Q<0.05)', 'nPeak_Ext_Ctrl(Q<0.01)')
271 | 
272 | if (file_process == FALSE) {
273 | FinalDF <- CurrDF
274 | file_process <- TRUE
275 | } else {
276 | FinalDF <- rbind.data.frame(FinalDF, CurrDF)
277 | }
278 | 
279 | } # end processing current sample condition
280 | 
281 | } # end directory traverse
282 | 
283 | # now remove one or more columns of this data frame, if they are all 'NA'
284 | NA_ColList <- c()
285 | for (i in (1:ncol(FinalDF))) {
286 | idx <- which(FinalDF[, i] == 'NA')
287 | if (length(idx) == nrow(FinalDF)) {
288 | # every entry of this column is NA. So discard this column
289 | NA_ColList <- c(NA_ColList, i)
290 | }
291 | }
292 | # write the summary table; all columns are kept, since the fixed column
293 | # indices used for the plots below assume that no column has been dropped
294 | if (length(NA_ColList) > 0) {
295 | cat(sprintf("\n *** Note: %s column(s) contain only NA entries ", length(NA_ColList)))
296 | }
297 | write.table(FinalDF, outtextfile, row.names=F, col.names=T, sep="\t", quote=F, append=F)
298 | 
299 | 
300 | #============================
301 | # a few summary statements for the output text file
302 | #============================
303 | 
304 | CommentsFile <- paste0(OutDir, '/Field_Description.txt')
305 | 
306 | # open the output text file
307 | con <- file(CommentsFile, "a")
308 | 
309 | outtext <- paste0("\n\n\n\n\n *** Important parameters ***** \n\n\n")
310 | writeLines(outtext, con=con, sep="\n")
311 | 
312 | outtext <- paste0("\n\n Total_Read: number of reads in individual fastq file(s)")
313 | writeLines(outtext, con=con, sep="\n")
314 | 
315 | outtext <- paste0("\n\n Number_of_Mappable_Reads, Fraction_of_Mappable_Reads, and Fraction_of_Unmappable_Reads: number (and fraction) of reads mappable and unmappable to the reference genome. May not be uniquely mappable reads.")
316 | writeLines(outtext, con=con, sep="\n")
317 | 
318 | outtext <- paste0("\n\n Reads_after_random_chromosome_deletion, Fraction_reads_remain_after_random_chromosome_deletion, and Fraction_reads_in_random_chromosome: number (and fraction) of reads remaining (and deleted) after deleting reads from random chromosomes such as chr1_*, chr2_*, chrUN, ....")
319 | writeLines(outtext, con=con, sep="\n")
320 | 
321 | outtext <- paste0("\n\n Reads_excluding_Mitochondrial_Reads, Fraction_reads_excluding_MtRead and Fraction_mitochondrial_reads: number (and fraction) of reads remaining (and deleted) after removing the mitochondrial reads")
322 | writeLines(outtext, con=con, sep="\n")
323 | 
324 | outtext <- paste0("\n\n UniqMappedRead, Fraction_reads_unique_mapped, and Fraction_reads_multimap: number (and fraction) of reads uniquely mapped (and multimapped) to the reference genome.")
325 | writeLines(outtext, con=con, sep="\n")
326 | 
327 | outtext <- paste0("\n\n Reads_remain_after_QualThr, Fraction_reads_remain_QualThr and Frac_reads_low_qual: number (and fraction) of reads remaining (and deleted) after removing low quality reads (MAPQ threshold)")
328 | writeLines(outtext, con=con, sep="\n")
329 | 
330 | outtext <- paste0("\n\n Reads_not_in_blackList_genome, Fraction_reads_not_in_blackList_genome, and Fraction_reads_in_blackList_genome: number (and fraction) of reads not in (and in) blacklist segments.")
331 | writeLines(outtext, con=con, sep="\n")
332 | 
333 | outtext <- paste0("\n\n Reads_after_dupl_remove, Fraction_de-duplicated_reads and Fraction_duplicate_reads: number (and fraction) of reads remaining (and deleted) after removing duplicate reads")
334 | writeLines(outtext, con=con, sep="\n")
335 | 
336 | outtext <- paste0("\n\n UniqMapPos: number of distinct genome positions where at least one read maps uniquely.")
337 | writeLines(outtext, con=con, sep="\n")
338 | 
339 | outtext <- paste0("\n\n NRF (Non redundant fraction): number of distinct genome positions for uniquely mapped reads / number of uniquely mapped reads")
340 | writeLines(outtext, con=con, sep="\n")
341 | 
342 | outtext <-
paste0("\n\n M1: number of genomic locations where exactly one read maps uniquely.") 343 | writeLines(outtext, con=con, sep="\n") 344 | 345 | outtext <- paste0("\n\n M2: number of genomic locations where exactly two reads map uniquely.") 346 | writeLines(outtext, con=con, sep="\n") 347 | 348 | outtext <- paste0("\n\n PBC1: M1 / UniqMapPos ") 349 | writeLines(outtext, con=con, sep="\n") 350 | 351 | outtext <- paste0("\n\n PBC2: M1 / M2") 352 | writeLines(outtext, con=con, sep="\n") 353 | 354 | outtext <- paste0("\n\n\n\n\n MACS2 outputs corresponding to peaks with default MACS2 command and no control ----- input missing values are replaced by NA \n\n") 355 | writeLines(outtext, con=con, sep="\n") 356 | 357 | outtext <- paste0("\n\n MappedReadPeak_Def_noctrl: mapped reads in peaks ") 358 | writeLines(outtext, con=con, sep="\n") 359 | 360 | outtext <- paste0("\n\n FRiP_Def_NoCtrl: MappedReadPeak_Def_noctrl / UniqMappedRead ") 361 | writeLines(outtext, con=con, sep="\n") 362 | 363 | outtext <- paste0("\n\n nPeak_Def_NoCtrl: number of peaks (determined by p value threshold of 0.01) ") 364 | writeLines(outtext, con=con, sep="\n") 365 | 366 | outtext <- paste0("\n\n nPeak_Def_NoCtrl(Q<0.05): number of peaks (determined by q value threshold of 0.05) ") 367 | writeLines(outtext, con=con, sep="\n") 368 | 369 | outtext <- paste0("\n\n nPeak_Def_NoCtrl(Q<0.01): number of peaks (determined by q value threshold of 0.01) ") 370 | writeLines(outtext, con=con, sep="\n") 371 | 372 | outtext <- paste0("\n\n\n\n\n MACS2 outputs corresponding to peaks with --extsize option (recommended in existing ATAC seq pipeline) and no control input ----- missing values are replaced by NA \n\n") 373 | writeLines(outtext, con=con, sep="\n") 374 | 375 | outtext <- paste0("\n\n MapReadPeak_Ext_NoCtrl: mapped reads in peaks ") 376 | writeLines(outtext, con=con, sep="\n") 377 | 378 | outtext <- paste0("\n\n FRiP_Ext_NoCtrl: MapReadPeak_Ext_NoCtrl / UniqMapRead ") 379 | writeLines(outtext, con=con, sep="\n") 380 | 381 | outtext <- paste0("\n\n nPeak_Ext_NoCtrl: number of peaks (determined by p value threshold of 0.01) ") 382 | writeLines(outtext, con=con, sep="\n") 383 | 384 | outtext <- paste0("\n\n nPeak_Ext_NoCtrl(Q<0.05): number of peaks (determined by q value threshold of 0.05) ") 385 | writeLines(outtext, con=con, sep="\n") 386 | 387 | outtext <- paste0("\n\n nPeak_Ext_NoCtrl(Q<0.01): number of peaks (determined by q value threshold of 0.01) ") 388 | writeLines(outtext, con=con, sep="\n") 389 | 390 | outtext <- paste0("\n\n\n\n\n MACS2 outputs corresponding to peaks with default MACS2 command ----- but here control input is present ----- missing values are replaced by NA \n\n") 391 | writeLines(outtext, con=con, sep="\n") 392 | 393 | outtext <- paste0("\n\n MapReadPeak_Def_Ctrl: mapped reads in peaks ") 394 | writeLines(outtext, con=con, sep="\n") 395 | 396 | outtext <- paste0("\n\n FRiP_Def_Ctrl: MapReadPeak_Def_Ctrl / UniqMapRead ") 397 | writeLines(outtext, con=con, sep="\n") 398 | 399 | outtext <- paste0("\n\n nPeak_Def_Ctrl: number of peaks (determined by p value threshold of 0.01) ") 400 | writeLines(outtext, con=con, sep="\n") 401 | 402 | outtext <- paste0("\n\n nPeak_Def_Ctrl(Q<0.05): number of peaks (determined by q value threshold of 0.05) ") 403 | writeLines(outtext, con=con, sep="\n") 404 | 405 | outtext <- paste0("\n\n nPeak_Def_Ctrl(Q<0.01): number of peaks (determined by q value threshold of 0.01) ") 406 | writeLines(outtext, con=con, sep="\n") 407 | 408 | outtext <- paste0("\n\n\n\n\n MACS2 outputs 
corresponding to peaks with --extsize option (recommended in existing ATAC seq pipeline) ----- here control input is provided ----- missing values are replaced by NA \n\n") 409 | writeLines(outtext, con=con, sep="\n") 410 | 411 | outtext <- paste0("\n\n MapReadPeak_Ext_Ctrl: mapped reads in peaks ") 412 | writeLines(outtext, con=con, sep="\n") 413 | 414 | outtext <- paste0("\n\n FRiP_Ext_Ctrl: MapReadPeak_Ext_Ctrl / UniqMapRead ") 415 | writeLines(outtext, con=con, sep="\n") 416 | 417 | outtext <- paste0("\n\n nPeak_Ext_Ctrl: number of peaks (determined by p value threshold of 0.01) ") 418 | writeLines(outtext, con=con, sep="\n") 419 | 420 | outtext <- paste0("\n\n nPeak_Ext_Ctrl(Q<0.05): number of peaks (determined by q value threshold of 0.05) ") 421 | writeLines(outtext, con=con, sep="\n") 422 | 423 | outtext <- paste0("\n\n nPeak_Ext_Ctrl(Q<0.01): number of peaks (determined by q value threshold of 0.01) ") 424 | writeLines(outtext, con=con, sep="\n") 425 | 426 | 427 | # close output summary text file 428 | close(con) 429 | 430 | #=================== 431 | # now read the summary file once more, and plot different statistics 432 | #=================== 433 | FinalDF <- read.table(outtextfile, header=T, sep="\t", stringsAsFactors=F) 434 | 435 | # plot total number of reads for each sample 436 | plotfile <- paste0(OutDir, '/TotalReadCount_Distribution.html') 437 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 3]) 438 | PlotScatter_Data(plotdf, "Total Reads", plotfile) 439 | 440 | # plot fraction of mappable reads for each sample 441 | plotfile <- paste0(OutDir, '/Fraction_MappableReadCount_Distribution.html') 442 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 5]) 443 | PlotScatter_Data(plotdf, "Fraction Mappable Reads", plotfile) 444 | 445 | # plot fraction of mitochondrial reads for each sample 446 | plotfile <- paste0(OutDir, '/Fraction_MitochondrialReadCount_Distribution.html') 447 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 12]) 448 | PlotScatter_Data(plotdf, "Fraction mitochondrial Reads", plotfile) 449 | 450 | # plot fraction of uniquely mapped reads for each sample 451 | plotfile <- paste0(OutDir, '/Fraction_UniqueMappReadCount_Distribution.html') 452 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 14]) 453 | PlotScatter_Data(plotdf, "Fraction unique mapped Reads", plotfile) 454 | 455 | # plot fraction of low quality reads for each sample 456 | plotfile <- paste0(OutDir, '/Fraction_LowQualReadCount_Distribution.html') 457 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 18]) 458 | PlotScatter_Data(plotdf, "Fraction low quality Reads", plotfile) 459 | 460 | # plot fraction of duplicate reads for each sample 461 | plotfile <- paste0(OutDir, '/Fraction_DuplicateReadCount_Distribution.html') 462 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 21]) 463 | PlotScatter_Data(plotdf, "Fraction duplicates", plotfile) 464 | 465 | # plot NRF for each sample 466 | plotfile <- paste0(OutDir, '/NRF_Distribution.html') 467 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 23]) 468 | PlotScatter_Data(plotdf, "NRF", plotfile) 469 | 470 | # plot M1 for each sample 471 | plotfile <- paste0(OutDir, '/M1_Distribution.html') 472 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 24]) 473 | PlotScatter_Data(plotdf, "M1", plotfile) 474 | 475 | # plot M2 for each sample 476 | plotfile <- paste0(OutDir, '/M2_Distribution.html') 477 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 25]) 478 | PlotScatter_Data(plotdf, "M2", plotfile) 479 | 480 | # plot PBC1 for each sample 481 | plotfile 
<- paste0(OutDir, '/PBC1_Distribution.html') 482 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 26]) 483 | PlotScatter_Data(plotdf, "PBC1", plotfile) 484 | 485 | # plot PBC2 for each sample 486 | plotfile <- paste0(OutDir, '/PBC2_Distribution.html') 487 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, 27]) 488 | PlotScatter_Data(plotdf, "PBC2", plotfile) 489 | 490 | # plot FRiP for each sample - no control, default MACS2 peaks 491 | # provided the column is not filled with NA 492 | colno <- 29 493 | if ((colno %in% NA_ColList) == FALSE) { 494 | plotfile <- paste0(OutDir, '/FRiP_Def_NoCtrl_Distribution.html') 495 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, colno]) 496 | PlotScatter_Data(plotdf, "FRiP_Def_NoCtrl", plotfile) 497 | } 498 | 499 | # plot number of peaks for each sample - FDR = 0.05 500 | # no control, MACS2 default peaks 501 | # provided the column is not filled with NA 502 | colno <- 31 503 | if ((colno %in% NA_ColList) == FALSE) { 504 | plotfile <- paste0(OutDir, '/NumPeak_Def_NoCtrl_Distribution.html') 505 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, colno]) 506 | PlotScatter_Data(plotdf, "NumPeak_Def_NoCtrl", plotfile) 507 | } 508 | 509 | # plot FRiP for each sample - no control, MACS2 Extsize peaks 510 | # provided the column is not filled with NA 511 | colno <- 34 512 | if ((colno %in% NA_ColList) == FALSE) { 513 | plotfile <- paste0(OutDir, '/FRiP_Ext_NoCtrl_Distribution.html') 514 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, colno]) 515 | PlotScatter_Data(plotdf, "FRiP_Ext_NoCtrl", plotfile) 516 | } 517 | 518 | # plot number of peaks for each sample - FDR = 0.05 519 | # no control, MACS2 Extsize peaks 520 | # provided the column is not filled with NA 521 | colno <- 36 522 | if ((colno %in% NA_ColList) == FALSE) { 523 | plotfile <- paste0(OutDir, '/NumPeak_Ext_NoCtrl_Distribution.html') 524 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, colno]) 525 | PlotScatter_Data(plotdf, "NumPeak_Ext_NoCtrl", plotfile) 526 | } 527 | 528 | # plot FRiP for each sample - with control, MACS2 default peaks 529 | # provided the column is not filled with NA 530 | colno <- 39 531 | if ((colno %in% NA_ColList) == FALSE) { 532 | plotfile <- paste0(OutDir, '/FRiP_Def_Ctrl_Distribution.html') 533 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, colno]) 534 | PlotScatter_Data(plotdf, "FRiP_Def_Ctrl", plotfile) 535 | } 536 | 537 | # plot number of peaks for each sample - FDR = 0.05 538 | # no control, MACS2 Extsize peaks 539 | # provided the column is not filled with NA 540 | colno <- 41 541 | if ((colno %in% NA_ColList) == FALSE) { 542 | plotfile <- paste0(OutDir, '/NumPeak_Def_Ctrl_Distribution.html') 543 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, colno]) 544 | PlotScatter_Data(plotdf, "NumPeak_Def_Ctrl", plotfile) 545 | } 546 | 547 | # plot FRiP for each sample - with control, MACS2 Extsize peaks 548 | # provided the column is not filled with NA 549 | colno <- 44 550 | if ((colno %in% NA_ColList) == FALSE) { 551 | plotfile <- paste0(OutDir, '/FRiP_Ext_Ctrl_Distribution.html') 552 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, colno]) 553 | PlotScatter_Data(plotdf, "FRiP_Ext_Ctrl", plotfile) 554 | } 555 | 556 | # plot number of peaks for each sample - FDR = 0.05 557 | # with control, MACS2 Extsize peaks 558 | # provided the column is not filled with NA 559 | colno <- 46 560 | if ((colno %in% NA_ColList) == FALSE) { 561 | plotfile <- paste0(OutDir, '/NumPeak_Ext_Ctrl_Distribution.html') 562 | plotdf <- data.frame(X=FinalDF[, 2], Y=FinalDF[, colno]) 563 | 
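# column 46 corresponds to 'nPeak_Ext_Ctrl(Q<0.05)' in the summary table,
# i.e. the FDR 0.05 peak count from --extsize peak calling with control input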
PlotScatter_Data(plotdf, "NumPeak_Ext_Ctrl", plotfile) 564 | } 565 | 566 | -------------------------------------------------------------------------------- /IDR_Codes/IDRAnalysis.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #================================= 4 | # this script is used to perform IDR analysis on a given ATAC seq replicates 5 | # it uses the idrcode package provided by Anshul Kundaje et. al. 6 | #================================= 7 | # developed by - Sourya Bhattacharyya 8 | # date: 11th july 2017 9 | # Vijay-AY lab 10 | # La Jolla Institute for Allergy and Immunology 11 | #================================= 12 | 13 | # usage info 14 | usage(){ 15 | cat << EOF 16 | 17 | Options: 18 | 19 | -- required: 20 | -a FILE1 First file containing peak information (in either narrowpeak format or narrowpeak.gz format) 21 | -b FILE2 Second file containing peak information (in either narrowpeak format or narrowpeak.gz format) 22 | -d OutDir Output directory containing the IDR results 23 | -P PathIDRCode Path of the IDRCode package (Kundaje et. al. after its installation) 24 | -n PREFIX Prefix of output file 25 | -c SampledPeakCount Number of peaks which will be sampled from the input peak files (default 25000) 26 | EOF 27 | } 28 | 29 | # # name of the folder containing IDR results 30 | # IDR_OutFold='IDR_Overlap0_PVal' 31 | 32 | # executable (R code) of the batch consistency analysis 33 | # IDRCodeDir='/home/sourya/packages/idrCode/' 34 | exec1='batch-consistency-analysis.r' 35 | 36 | # default values 37 | PREFIX='IDR_ATAC' 38 | 39 | OutDir=`pwd` 40 | 41 | # Number of peaks sampled from the original peak detection output 42 | SampledPeakCount=25000 43 | 44 | while getopts "a:b:d:n:c:P:" opt; 45 | do 46 | case "$opt" in 47 | a) FILE1=$OPTARG;; 48 | b) FILE2=$OPTARG;; 49 | d) OutDir=$OPTARG;; 50 | n) PREFIX=$OPTARG;; 51 | c) SampledPeakCount=$OPTARG;; 52 | P) IDRCodeDir=$OPTARG;; 53 | \?) usage 54 | echo "error: unrecognized option -$OPTARG"; 55 | exit 1 56 | ;; 57 | esac 58 | done 59 | 60 | if [[ -z $FILE1 ]]; then 61 | echo 'User should provide two input peak files (in a bed file or in gzipped bed file) !!' 62 | exit 1 63 | else 64 | echo 'Input peak file 1: '$FILE1 65 | fi 66 | 67 | if [[ -z $FILE2 ]]; then 68 | echo 'User should provide two input peak files (in a bed file or in gzipped bed file) !!' 69 | exit 1 70 | else 71 | echo 'Input peak file 2: '$FILE2 72 | fi 73 | 74 | if [[ -z $IDRCodeDir ]]; then 75 | echo 'User did not provide the path of IDRCode package (Kundaje et. al.) - exit for the moment !!' 76 | exit 1 77 | fi 78 | 79 | echo $OutDir 80 | mkdir -p $OutDir 81 | 82 | #---------------------------------- 83 | # important - sourya 84 | # change the current directory as the dir containing this executable 85 | # since other source files relative to the current directory needs to be called 86 | current_dir=$(pwd) 87 | script_dir=$(dirname $0) 88 | cd $script_dir 89 | #---------------------------------- 90 | 91 | # log the replicates 92 | # if [ ! 
-f $OutDir'/ReplicaNames.txt' ]; then 93 | echo -e "IDR Analysis of the following two peak files: \n File 1: ${FILE1} \n File 2: ${FILE2}" > $OutDir'/ReplicaNames.txt' 94 | # fi 95 | 96 | # check the extension of both input files 97 | filebase1=$(basename "$FILE1") 98 | filebase2=$(basename "$FILE2") 99 | 100 | #=========================================== 101 | # extract first 25K (or the count provided) significant peaks from the given peak files and store them 102 | # this will be actually used for IDR analysis 103 | # such significance is decided by the 8th field (P value) 104 | # as instructed in the ENCODE IDR documentation 105 | 106 | if [[ $filebase1 =~ \.gz$ ]]; then 107 | # first file is a gzipped file 108 | #ConvFile1=${FILE1%.gz}'_first_'$SampledPeakCount'.gz' 109 | ConvFile1=${FILE1%.gz}'_first_'$SampledPeakCount 110 | echo 'Subsampled peak file corresponding to the first input: '$ConvFile1 111 | #if [ ! -f $ConvFile1 ]; then 112 | zcat $FILE1 | sort -k8,8nr > $OutDir'/temp1.txt' 113 | #head -n $SampledPeakCount $OutDir'/temp1.txt' | gzip -c > $ConvFile1 114 | head -n $SampledPeakCount $OutDir'/temp1.txt' > $ConvFile1 115 | rm $OutDir'/temp1.txt' 116 | #fi 117 | else 118 | #ConvFile1=${FILE1}'_first_'$SampledPeakCount'.gz' 119 | ConvFile1=${FILE1}'_first_'$SampledPeakCount 120 | echo 'Subsampled peak file corresponding to the first input: '$ConvFile1 121 | #if [ ! -f $ConvFile1 ]; then 122 | cat $FILE1 | sort -k8,8nr > $OutDir'/temp1.txt' 123 | #head -n $SampledPeakCount $OutDir'/temp1.txt' | gzip -c > $ConvFile1 124 | head -n $SampledPeakCount $OutDir'/temp1.txt' > $ConvFile1 125 | rm $OutDir'/temp1.txt' 126 | #fi 127 | fi 128 | 129 | if [[ $filebase2 =~ \.gz$ ]]; then 130 | # first file is a gzipped file 131 | #ConvFile2=${FILE2%.gz}'_first_'$SampledPeakCount'.gz' 132 | ConvFile2=${FILE2%.gz}'_first_'$SampledPeakCount 133 | echo 'Subsampled peak file corresponding to the second input: '$ConvFile2 134 | #if [ ! -f $ConvFile2 ]; then 135 | zcat $FILE2 | sort -k8,8nr > $OutDir'/temp2.txt' 136 | #head -n $SampledPeakCount $OutDir'/temp2.txt' | gzip -c > $ConvFile2 137 | head -n $SampledPeakCount $OutDir'/temp2.txt' > $ConvFile2 138 | rm $OutDir'/temp2.txt' 139 | #fi 140 | else 141 | #ConvFile2=${FILE2}'_first_'$SampledPeakCount'.gz' 142 | ConvFile2=${FILE2}'_first_'$SampledPeakCount 143 | echo 'Subsampled peak file corresponding to the second input: '$ConvFile2 144 | #if [ ! -f $ConvFile2 ]; then 145 | cat $FILE2 | sort -k8,8nr > $OutDir'/temp2.txt' 146 | #head -n $SampledPeakCount $OutDir'/temp2.txt' | gzip -c > $ConvFile2 147 | head -n $SampledPeakCount $OutDir'/temp2.txt' > $ConvFile2 148 | rm $OutDir'/temp2.txt' 149 | #fi 150 | fi 151 | 152 | #=========================================== 153 | 154 | # we employ p value as the measure for rank determination of the peaks 155 | # We note that only narrow peaks are analyzed for the significance test - so the 6th argument is F (no broadpeak) 156 | # we also note that the criteria of peak overlap is set as 1 bp. 157 | # So the 5th argument is placed as 0 - if it is 0.5, 50% overlap criteria is imposed 158 | 159 | # this output directory also notes the settings used for this IDR 160 | CurrOutDir=$OutDir #'/' #$IDR_OutFold 161 | mkdir -p $CurrOutDir 162 | 163 | # the prefix also contains the output directory where all results will be stored 164 | CurrOutPrefix=$CurrOutDir'/'$PREFIX 165 | 166 | # parameter description is provided in the ENCODE IDR documentation 167 | if [ ! 
-f $CurrOutPrefix'-overlapped-peaks.txt' ]; then 168 | # command for batch consistency analysis 169 | # Note: the input to this program should be uncompressed narrow peak file 170 | 171 | # first unzip the files 172 | #gunzip $ConvFile1 173 | #file1=${ConvFile1%.gz} 174 | #gunzip $ConvFile2 175 | #file2=${ConvFile2%.gz} 176 | 177 | # then call the batch consistency command 178 | cd $IDRCodeDir 179 | #Rscript $exec1 $file1 $file2 -1 $CurrOutPrefix 0 F p.value 180 | Rscript $exec1 $ConvFile1 $ConvFile2 -1 $CurrOutPrefix 0 F p.value 181 | cd - 182 | 183 | # now re-zip the peak files 184 | #gzip $file1 185 | #gzip $file2 186 | 187 | fi 188 | 189 | #====================== 190 | # add - sourya 191 | # here we call a custom R function 192 | # which plots IDR scatter analysis between this pair of samples 193 | 194 | Rscript IDRScatterPlot.r $IDRCodeDir $ConvFile1 $ConvFile2 $IDRCodeDir'/genome_table.txt' $CurrOutPrefix 195 | 196 | #---------------------------------- 197 | # after generating the IDR statistics for this pair of replicates, 198 | # now quantify the similarities 199 | 200 | # number of peaks in the input peak files 201 | # and also the number of overlapped peaks 202 | #npeak1=`zcat $ConvFile1 | wc -l` 203 | #npeak2=`zcat $ConvFile2 | wc -l` 204 | npeak1=`cat $ConvFile1 | wc -l` 205 | npeak2=`cat $ConvFile2 | wc -l` 206 | 207 | Rscript IDRSummary.r $CurrOutPrefix'-overlapped-peaks.txt' $npeak1 $npeak2 208 | 209 | #---------------------------------- 210 | # important - sourya 211 | # now restore the original directory 212 | cd $current_dir 213 | #---------------------------------- 214 | -------------------------------------------------------------------------------- /IDR_Codes/IDRMain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #================================= 4 | # this script encapsulates IDR analysis between different replicates into a single script 5 | # It has input peak files (2 or more) 6 | # The script calls IDRAnalysis.sh for pairwise analysis 7 | #================================= 8 | # developed by - Sourya Bhattacharyya 9 | # Vijay-AY lab 10 | # La Jolla Institute for Allergy and Immunology 11 | #================================= 12 | 13 | # usage info 14 | usage(){ 15 | cat << EOF 16 | 17 | usage: ./IDRMain.sh [-h] [-I peakfile1.narrowpeak] [-I peakfile2.narrowpeak] [-P PathIDRCode] [-d OutDir] [-n PREFIXSTR] 18 | Example: 19 | ./IDRMain.sh -I peak1.narrowPeak -I peak2.narrowPeak -I peak3.narrowPeak -P /home/sourya/packages/idrCode/ -d /home/sourya/OutDir_IDR -n 'IDR_test' 20 | 21 | Options: 22 | 23 | -- required: 24 | -I InpFile A list of input peak files (obtained from MACS2 - in .narrowPeak or .narrowPeak.gz format). 25 | At least two peak files are required. 26 | -P PathIDRCode Path of the IDRCode package (Kundaje et. al. after its installation) 27 | -d OutDir Output directory (absolute path preferred) which will store the IDR results. 28 | -n PREFIX Prefix of output files. Default 'IDR_ATAC'. 
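 Note: the -I option may be repeated; every pairwise combination of the
 supplied peak files is analyzed.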
29 | EOF 30 | } 31 | 32 | # default variables and values 33 | IDR_code='./IDRAnalysis.sh' 34 | PREFIX='IDR_ATAC' 35 | 36 | # code containing the IDR + consistency plot 37 | # dir2='/home/sourya/packages/idrCode/' 38 | exec2='batch-consistency-plot.r' 39 | 40 | # Sourya - Note the processing of input file argument since it can be more than one file 41 | # Note the change of notations 42 | while getopts "I:n:d:P:" opt; 43 | do 44 | case "$opt" in 45 | I) InpFile+=($OPTARG);; 46 | n) PREFIX=$OPTARG;; 47 | d) OutDir=$OPTARG;; 48 | P) dir2=$OPTARG;; 49 | \?) usage 50 | echo "error: unrecognized option -$OPTARG"; 51 | exit 1 52 | ;; 53 | esac 54 | done 55 | 56 | if [[ -z $InpFile ]]; then 57 | echo 'User did not provide any input peak file - exit for the moment !!' 58 | exit 1 59 | fi 60 | 61 | if [[ -z $dir2 ]]; then 62 | echo 'User did not provide the path of IDRCode package (Kundaje et. al.) - exit for the moment !!' 63 | exit 1 64 | fi 65 | 66 | if [[ -z $OutDir ]]; then 67 | echo 'User did not provide output directory for storing the results - exit for the moment !!' 68 | exit 1 69 | fi 70 | 71 | # number of input files provided 72 | nsample=${#InpFile[@]} 73 | echo 'Number of input files : '$nsample 74 | 75 | if [ $nsample -lt 2 ]; then 76 | echo 'User needs to provide at least two peak files for comparison - exit for the moment !!' 77 | exit 1 78 | fi 79 | 80 | # generate the output directory 81 | mkdir -p $OutDir 82 | 83 | #---------------------------------- 84 | # important - sourya 85 | # change the current directory as the dir containing this executable 86 | # since other source files relative to the current directory needs to be called 87 | current_dir=$(pwd) 88 | script_dir=$(dirname $0) 89 | cd $script_dir 90 | #---------------------------------- 91 | 92 | #================================= 93 | # batch replicate analysis 94 | #================================= 95 | # if [ ! 
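# (note) each -I option appends to the bash array InpFile ('InpFile+=($OPTARG)'
# in the getopts loop above); e.g. three -I flags give ${#InpFile[@]} = 3,
# and hence 3 pairwise IDR comparisons below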
# -f $OutDir'/Replicate_Names.txt' ]; then
96 | echo 'Analyzing the following '$nsample' replicates --- ' > $OutDir'/Replicate_Names.txt'
97 | for (( i=0; i<${nsample}; i++ ))
98 | do
99 | echo 'Sample '$i' is : '${InpFile[$i]} >> $OutDir'/Replicate_Names.txt'
100 | done
101 | # fi
102 | 
103 | #===================
104 | # add - sourya
105 | # here we analyze individual peak files (input)
106 | # and accordingly assign the number of common peaks to be considered
107 | PeakStatFile=$OutDir'/Input_Peak_Statistics.txt'
108 | 
109 | echo 'Summarizing the peak count statistics for individual input files: ' > $PeakStatFile
110 | 
111 | # first get the minimum no of peaks across all the samples
112 | for (( i=0; i<${nsample}; i++ ))
113 | do
114 | peakfile=${InpFile[$i]}
115 | pc=`cat $peakfile | wc -l`
116 | echo "Analyzing the peak file: $peakfile " >> $PeakStatFile
117 | echo "Peak count: $pc " >> $PeakStatFile
118 | if [ $i == 0 ]; then
119 | minpc=$pc
120 | else
121 | if [ $pc -lt $minpc ]; then
122 | minpc=$pc
123 | fi
124 | fi
125 | done
126 | 
127 | # assign the minimum number of peaks for consideration
128 | if [[ $minpc -gt 200000 ]]; then
129 | CountPeak=150000
130 | elif [[ $minpc -gt 150000 ]]; then
131 | CountPeak=100000
132 | elif [[ $minpc -gt 100000 ]]; then
133 | CountPeak=75000
134 | elif [[ $minpc -gt 75000 ]]; then
135 | CountPeak=50000
136 | else
137 | CountPeak=25000
138 | fi
139 | 
140 | echo "Value of CountPeak (number of common peaks to be analyzed for all replicates): $CountPeak " >> $PeakStatFile
141 | #===================
142 | 
143 | # loop for pairwise execution of samples
144 | for (( i=0; i<${nsample}-1; i++ ))
145 | do
146 | for (( j=$i+1; j<${nsample}; j++ ))
147 | do
148 | # pair of samples
149 | sample1=${InpFile[$i]}
150 | sample2=${InpFile[$j]}
151 | # execute the sample pairs
152 | # Note the output directory name - it is the sample directory plus the pairwise comparison
153 | $IDR_code -a $sample1 -b $sample2 -P $dir2 -d $OutDir'/'$i'_and_'$j -n $PREFIX -c $CountPeak
154 | done
155 | done
156 | 
157 | #=================================
158 | # batch consistency plots
159 | #=================================
160 | if [ !
-f $OutDir'/IDR_Batch_Plot-plot.pdf' ]; then 161 | 162 | # the pattern of input prefix present in every replicate 163 | #inppfx=$IDR_OutFold'/'$PREFIX 164 | inppfx=$PREFIX 165 | 166 | # no of pairs of samples 167 | x=$nsample 168 | y=`expr $nsample - 1` 169 | z=`expr $x \* $y` 170 | npairs=`expr $z / 2` 171 | echo 'npairs: '$npairs 172 | 173 | # output command for IDR plot 174 | cmd='Rscript '$exec2' '$npairs' '$OutDir'/IDR_Batch_Plot' 175 | for (( i=0; i<${nsample}-1; i++ )) 176 | do 177 | for (( j=$i+1; j<${nsample}; j++ )) 178 | do 179 | cmd=$cmd' '$OutDir'/'$i'_and_'$j'/'$inppfx 180 | done 181 | done 182 | echo 'cmd: '$cmd 183 | 184 | # execute the command 185 | # first go to the directory containing the R code of the IDR 186 | cd $dir2 187 | $cmd 188 | cd - 189 | 190 | # now convert the generated postscript plot file to a pdf file 191 | ps2pdf $OutDir'/IDR_Batch_Plot-plot.ps' $OutDir'/IDR_Batch_Plot-plot.pdf' 192 | 193 | fi 194 | 195 | #---------------------------------- 196 | # important - sourya 197 | # now restore the original directory 198 | cd $current_dir 199 | #---------------------------------- 200 | 201 | 202 | -------------------------------------------------------------------------------- /IDR_Codes/IDRScatterPlot.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | #=========================================================== 4 | # R script for scatter plot between a pair of peak files 5 | 6 | #Author: Sourya Bhattacharyya 7 | #Vijay-Ay lab, LJI 8 | 9 | # usage: Rscript result_summary.r $inpfile 10 | #=========================================================== 11 | 12 | args <- commandArgs(TRUE) 13 | 14 | # directory containing IDR code package 15 | IDRCodeDir <- args[1] #"/home/sourya/packages/idrCode/" 16 | 17 | # the pair of peak outputs for comparison 18 | peakfile1 <- args[2] 19 | peakfile2 <- args[3] 20 | # genome table.txt file provided in the IDR code package 21 | genometablefile <- args[4] 22 | # output prefix including the directory path 23 | # and the prefix string of the output plot file names 24 | curroutprefix <- args[5] 25 | 26 | # system path includes the path of IDR code 27 | source(paste0(IDRCodeDir, "functions-all-clayton-12-13.r")) 28 | 29 | chr.size <- read.table(genometablefile) 30 | 31 | half.width <- NULL 32 | overlap.ratio <- 0 33 | is.broadpeak <- F 34 | sig.value <- "p.value" 35 | 36 | # width and height values employed in these plots 37 | plotwidth <- 8 38 | plotheight <- 6 39 | 40 | rep1 <- process.narrowpeak(paste(peakfile1, sep=""), chr.size, 41 | half.width=half.width, summit="offset", broadpeak=is.broadpeak) 42 | 43 | rep2 <- process.narrowpeak(paste(peakfile2, sep=""), chr.size, 44 | half.width=half.width, summit="offset", broadpeak=is.broadpeak) 45 | 46 | uri.output <- compute.pair.uri(rep1$data.cleaned, rep2$data.cleaned, 47 | sig.value1=sig.value, sig.value2=sig.value, overlap.ratio=overlap.ratio) 48 | 49 | em.output <- fit.em(uri.output$data12.enrich, fix.rho2=T) 50 | idr.local <- 1-em.output$em.fit$e.z 51 | IDR <- c() 52 | o <- order(idr.local) 53 | IDR[o] <- cumsum(idr.local[o])/c(1:length(o)) 54 | 55 | idr_output <- data.frame(chr1=em.output$data.pruned$sample1[, "chr"], start1=em.output$data.pruned$sample1[, "start.ori"], stop1=em.output$data.pruned$sample1[, "stop.ori"], sig.value1=em.output$data.pruned$sample1[, "sig.value"], chr2=em.output$data.pruned$sample2[, "chr"], start2=em.output$data.pruned$sample2[, "start.ori"], stop2=em.output$data.pruned$sample2[, 
"stop.ori"], sig.value2=em.output$data.pruned$sample2[, "sig.value"], idr.local=1-em.output$em.fit$e.z, IDR=IDR) 56 | 57 | # this idr_output is already placed in the file "idr_overlapped_peaks.txt" 58 | 59 | filtered_peaks <- idr_output[idr_output[,10]<=0.01,] 60 | dim(filtered_peaks) # get the number of peaks 61 | 62 | ez.list <- get.ez.tt.all(em.output, uri.output$data12.enrich$merge1, uri.output$data12.enrich$merge2) 63 | 64 | par(mar=c(5,5,0,0.5), mfrow = c(1,3), oma=c(5,0,2,0)) 65 | 66 | idr_output$col[idr_output[,10]<=0.01]="black" 67 | 68 | idr_output$col[idr_output[,10]>=0.01]="red" 69 | 70 | # first graph 71 | pdf(paste0(curroutprefix,'_Signal_Replicates.pdf'), width=plotwidth, height=plotheight) 72 | plot(log(idr_output[,4]),log(idr_output[,8]),col=idr_output[,11], pch=19, cex = 0.05, xlab="log(signal) Rep1", ylab="log(signal) Rep2") 73 | legend("topleft", c("IDR=>0.01","IDR<=0.01"), col=c("red","black"), pch=19, bty="n", lty=c(1,1), lwd=c(2,2)) 74 | dev.off() 75 | 76 | # second graph 77 | pdf(paste0(curroutprefix,'_Peak_Rank_Replicates.pdf'), width=plotwidth, height=plotheight) 78 | plot(rank(-idr_output[,4]),rank(-idr_output[,8]),col=idr_output[,11], pch=19, cex = 0.05, xlab="Peak rank Rep1", ylab="Peak rank Rep2") 79 | legend("topleft", c("IDR=>0.01","IDR<=0.01"), col=c("red","black"), pch=19, bty="n", lty=c(1,1), lwd=c(1,1)) 80 | dev.off() 81 | 82 | # third graph 83 | pdf(paste0(curroutprefix,'_SignificantPeaks_vs_IDR.pdf'), width=plotwidth, height=plotheight) 84 | plot(ez.list$IDR, ylab="IDR", xlab="num of significant peaks") 85 | dev.off() 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /IDR_Codes/IDRSummary.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | #=========================================================== 4 | # R script for summarizing the results of IDR analysis between different sample replicates 5 | 6 | #Author: Sourya Bhattacharyya 7 | #Vijay-Ay lab, LJI 8 | 9 | # usage: Rscript result_summary.r $inpfile 10 | #=========================================================== 11 | 12 | args <- commandArgs(TRUE) 13 | 14 | # file containing the overlapped peak information 15 | CommonPeakFile <- args[1] 16 | inpdir <- dirname(CommonPeakFile) 17 | 18 | npeak1 <- as.integer(args[2]) 19 | npeak2 <- as.integer(args[3]) 20 | 21 | # print(sprintf("\n CommonPeakFile: %s ", CommonPeakFile)) 22 | # print(sprintf("\n npeak1: %s ", npeak1)) 23 | # print(sprintf("\n npeak2: %s ", npeak2)) 24 | 25 | # information of the common peak 26 | # Note: the file contains a header line 27 | CommonPeakInfo <- read.table(CommonPeakFile, header=TRUE) 28 | 29 | # number of overlapped peaks (considering all the IDR values) 30 | ncommonpeak <- length(CommonPeakInfo[,1]) 31 | fracpeak1 <- (ncommonpeak * 1.0) / npeak1 32 | fracpeak2 <- (ncommonpeak * 1.0) / npeak2 33 | 34 | # print(sprintf("\n ncommonpeak: %s ", ncommonpeak)) 35 | # print(sprintf("\n fracpeak1: %s ", fracpeak1)) 36 | # print(sprintf("\n fracpeak2: %s ", fracpeak2)) 37 | 38 | # find the rows where IDR is lower than a specified threshold 39 | # we employ three different thresholds: 40 | # 1) 0.01, 2) 0.05, and 3) 0.1 41 | # Note: Threshold of 0.01 (newly added) is recommended in the ENCODE 42 | 43 | NumIDRPass0 <- length(which(CommonPeakInfo[,10] <= 0.01)) 44 | FracIDRPass0 <- (NumIDRPass0 * 1.0) / ncommonpeak 45 | NumIDRPass1 <- length(which(CommonPeakInfo[,10] <= 0.05)) 46 | FracIDRPass1 <- 
(NumIDRPass1 * 1.0) / ncommonpeak 47 | NumIDRPass2 <- length(which(CommonPeakInfo[,10] <= 0.1)) 48 | FracIDRPass2 <- (NumIDRPass2 * 1.0) / ncommonpeak 49 | 50 | # print(sprintf("\n NumIDRPass0: %s ", NumIDRPass0)) 51 | # print(sprintf("\n FracIDRPass0: %s ", FracIDRPass0)) 52 | # print(sprintf("\n NumIDRPass1: %s ", NumIDRPass1)) 53 | # print(sprintf("\n FracIDRPass1: %s ", FracIDRPass1)) 54 | # print(sprintf("\n NumIDRPass2: %s ", NumIDRPass2)) 55 | # print(sprintf("\n FracIDRPass2: %s ", FracIDRPass2)) 56 | 57 | # divide the input overlapped peak files into two different structures 58 | # corresponding to the peak information of two different inputs 59 | # the seq() function also includes the row number for every interaction 60 | # this row number serves as the id of peaks 61 | PeakInfoInput1 <- cbind(seq(1:ncommonpeak), CommonPeakInfo[,1:4]) 62 | PeakInfoInput2 <- cbind(seq(1:ncommonpeak), CommonPeakInfo[,5:8]) 63 | 64 | # sort the data according to the significance value (last column of both the data) 65 | # decreasing order is employed 66 | PeakInfoInput1_Sort <- PeakInfoInput1[ order(-PeakInfoInput1[,5]),] 67 | PeakInfoInput2_Sort <- PeakInfoInput2[ order(-PeakInfoInput2[,5]),] 68 | 69 | # we check the cumulative percent of samples in both peak sets 70 | # and find out the overlap of peaks 71 | fraction_overlap <- c() 72 | 73 | for (x in seq(0, 1, 0.1)) { 74 | if ((x != 0) && (x != 1)) { 75 | # number of elements of both peak lists 76 | nsample <- as.integer(ncommonpeak * x) 77 | # common elements in both peak lists 78 | # the common factor is the first column: peak id 79 | OverlapSet <- PeakInfoInput1_Sort[1:nsample, 1] %in% PeakInfoInput2_Sort[1:nsample, 1] 80 | ncommon <- length(OverlapSet[OverlapSet==TRUE]) 81 | frac_common <- (ncommon * 1.0 / nsample) 82 | fraction_overlap <- c(fraction_overlap, frac_common) 83 | 84 | # we also note down two different fraction overlap statistics 85 | # corresponding to 10\%, 20% and 50% strongest peaks 86 | if (x == 0.1) { 87 | frac_overlap_10Pct = frac_common 88 | } 89 | if (x == 0.2) { 90 | frac_overlap_20Pct = frac_common 91 | } 92 | if (x == 0.5) { 93 | frac_overlap_50Pct = frac_common 94 | } 95 | 96 | # print(sprintf("\n Percentile value: %s ", x)) 97 | # print(sprintf("\n nsample: %s ", nsample)) 98 | # print(sprintf("\n ncommon: %s ", ncommon)) 99 | # print(sprintf("\n frac_common: %s ", frac_common)) 100 | } 101 | } 102 | 103 | # print(sprintf("\n Mean of fraction overlap: %s ", mean(fraction_overlap))) 104 | 105 | # # we check the percent of samples in both peak sets 106 | # # and find out the overlap of peaks 107 | # # for individual 10% bins 108 | 109 | # nbins <- 5 #10 110 | # fraction_overlap2 <- c() 111 | # sampleperbin <- as.integer(ncommonpeak / nbins) 112 | 113 | # for (b in (1:nbins)) { 114 | # if (b == 1) { 115 | # si <- 1 116 | # ei <- si + sampleperbin - 1 117 | # } else { 118 | # si <- ei + 1 119 | # if (b == nbins) { 120 | # ei <- ncommonpeak 121 | # } else { 122 | # ei <- si + sampleperbin - 1 123 | # } 124 | # } 125 | # OverlapSet <- PeakInfoInput1_Sort[si:ei, 1] %in% PeakInfoInput2_Sort[si:ei, 1] 126 | # ncommon <- length(OverlapSet[OverlapSet==TRUE]) 127 | # frac_common <- (ncommon * 1.0 / (ei-si+1)) 128 | # fraction_overlap2 <- c(fraction_overlap2, frac_common) 129 | # print(sprintf("\n si: %s ", si)) 130 | # print(sprintf("\n ei: %s ", ei)) 131 | # print(sprintf("\n ncommon: %s ", ncommon)) 132 | # print(sprintf("\n frac_common: %s ", frac_common)) 133 | # } 134 | 135 | # print(sprintf("\n Mean of fraction overlap2: 
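# (sketch - assuming the Stat.tab layout written below) the summary record can
# be read back for downstream aggregation with, e.g.:
#   stat <- read.table(file.path(inpdir, 'Stat.tab'), header=TRUE, sep='\t', check.names=FALSE)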
%s ", mean(fraction_overlap2))) 136 | 137 | 138 | # write the results in a text file 139 | OutFilename <- paste0(inpdir, '/Stat.tab') 140 | 141 | fp <- file(OutFilename, open="w") 142 | write(paste0('NPeak1', '\t', 'NPeak2', '\t', 'CommonPeak', '\t', 'FracPeak1', '\t', 'FracPeak2', '\t', 'IDR_0.01_Peak', '\t', 'Frac_IDR_0.01_Peak', '\t', 'IDR_0.05_Peak', '\t', 'Frac_IDR_0.05_Peak', '\t', 'IDR_0.1_Peak', '\t', 'Frac_IDR_0.1_Peak', '\t', 'MeanOverlap', '\t', 'Overlap10', '\t', 'Overlap20', '\t', 'Overlap50'), file=fp, append=T) 143 | write(paste(npeak1, '\t', npeak2, '\t', ncommonpeak, '\t', fracpeak1, '\t', fracpeak2, '\t', NumIDRPass0, '\t', FracIDRPass0, '\t', NumIDRPass1, '\t', FracIDRPass1, '\t', NumIDRPass2, '\t', FracIDRPass2, '\t', mean(fraction_overlap), '\t', frac_overlap_10Pct, '\t', frac_overlap_20Pct, '\t', frac_overlap_50Pct), file=fp, append=T) 144 | close(fp) 145 | 146 | 147 | 148 | -------------------------------------------------------------------------------- /IDR_Codes/IDR_SubSampleBAM.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #================================= 4 | # this script encapsulates IDR analysis between two replicates into a single script 5 | # First, the input BAM files are analyzed to check their read similarity 6 | # and the BAM file with higher read is subsampled 7 | # then these modified BAM files are used for peak calling and subsequent IDR analysis 8 | #================================= 9 | # developed by - Sourya Bhattacharyya 10 | # Vijay-AY lab 11 | # La Jolla Institute for Allergy and Immunology 12 | #================================= 13 | 14 | # usage info 15 | usage(){ 16 | cat << EOF 17 | 18 | usage: ./IDR_SubSampleBAM.sh [-h] [-A BamFile1] [-B BamFile2] [-d OutDir] [-n 'IDR_test'] [-P PathIDRCode] [-c 25000] [-C control.bam] 19 | Example: 20 | 21 | Options: 22 | 23 | -- required: 24 | -A BamFile1 First BAM file 25 | -B BamFile2 Second BAM file 26 | -d OutDir Output directory (absolute path preferred) which will store the IDR results. 27 | -P PathIDRCode Path of the IDRCode package (Kundaje et. al. after its installation) 28 | -n PREFIX Prefix of output files. Default 'IDR_ATAC'. 29 | -c CountPeak No of peaks in both replicates that will be compared. Default 25000. 30 | -C CONTROLBAM Control file (in eiher .BAM or tagalign file in .gz format) 31 | EOF 32 | } 33 | 34 | # default values of peaks that need to be retained 35 | CountPeak=25000 36 | 37 | # executable containing the tag align shift code 38 | TagAlignExec='../bin/TagAlign.sh' 39 | 40 | # default control bam file 41 | CONTROLBAM="" 42 | 43 | # IDR analysis code using a pair of peak files 44 | IDR_code='./IDRAnalysis.sh' 45 | 46 | # default prefix string 47 | PREFIX='IDR_ATAC' 48 | 49 | # executable of sambamba 50 | # for subsampling of the bam file, samtools has a bug 51 | # so using this package 52 | sambamba_exec=`which sambamba` 53 | 54 | # Sourya - Note the processing of input file argument since it can be more than one file 55 | # Note the change of notations 56 | while getopts "A:B:n:d:c:C:P:" opt; 57 | do 58 | case "$opt" in 59 | A) BamFile1=$OPTARG;; 60 | B) BamFile2=$OPTARG;; 61 | n) PREFIX=$OPTARG;; 62 | d) OutDir=$OPTARG;; 63 | c) CountPeak=$OPTARG;; 64 | C) CONTROLBAM=$OPTARG;; 65 | P) IDRCodeDir=$OPTARG;; 66 | \?) 
67 |             echo "error: unrecognized option -$OPTARG";
68 |             exit 1
69 |             ;;
70 |     esac
71 | done
72 | 
73 | if [[ -z $BamFile1 ]]; then
74 |     echo 'User did not provide the first BAM file - exit for the moment !!'
75 |     exit 1
76 | fi
77 | 
78 | if [[ -z $BamFile2 ]]; then
79 |     echo 'User did not provide the second BAM file - exit for the moment !!'
80 |     exit 1
81 | fi
82 | 
83 | if [[ -z $IDRCodeDir ]]; then
84 |     echo 'User did not provide the path of the IDRCode package (Kundaje et al.) - exit for the moment !!'
85 |     exit 1
86 | fi
87 | 
88 | if [[ -z $OutDir ]]; then
89 |     echo 'User did not provide an output directory for storing the results - exit for the moment !!'
90 |     exit 1
91 | fi
92 | 
93 | # create the output directory
94 | mkdir -p $OutDir
95 | 
96 | #----------------------------------
97 | # important - sourya
98 | # change the current directory to the dir containing this executable
99 | # since other source files relative to the current directory need to be called
100 | current_dir=$(pwd)
101 | script_dir=$(dirname $0)
102 | cd $script_dir
103 | #----------------------------------
104 | 
105 | # count of reads for the two BAM files
106 | readcount1=`samtools view $BamFile1 | wc -l`
107 | readcount2=`samtools view $BamFile2 | wc -l`
108 | 
109 | TagAlignFile1=$OutDir'/temp_1_tagalign.gz'
110 | TagAlignFile2=$OutDir'/temp_2_tagalign.gz'
111 | 
112 | if [[ $readcount1 -gt $readcount2 ]]; then
113 |     # the first BAM file needs to be subsampled, followed by its conversion to TAG Align format
114 |     if [ ! -f $OutDir'/temp_1.bam' ]; then
115 |         # fraction of subsampling
116 |         # Note: we did not use the expr operator - simple expr does not work for float
117 |         f=$(echo "scale=2;$readcount2/$readcount1" | bc)
118 |         # we use sambamba for the subsampling
119 |         # and use 8 threads for faster operation
120 |         $sambamba_exec view -h -t 8 -s $f -f bam $BamFile1 -o $OutDir'/temp_1.bam'
121 |     fi
122 |     # conversion to the TAG Align format
123 |     if [ ! -f $TagAlignFile1 ]; then
124 |         $TagAlignExec -I $OutDir'/temp_1.bam' -N 0 -O $TagAlignFile1
125 |     fi
126 |     if [ ! -f $TagAlignFile2 ]; then
127 |         $TagAlignExec -I $BamFile2 -N 0 -O $TagAlignFile2
128 |     fi
129 | else
130 |     # the second BAM file needs to be subsampled, followed by its conversion to TAG Align format
131 |     if [ ! -f $OutDir'/temp_2.bam' ]; then
132 |         # fraction of subsampling
133 |         # Note: we did not use the expr operator - simple expr does not work for float
134 |         f=$(echo "scale=2;$readcount1/$readcount2" | bc)
135 |         # we use sambamba for the subsampling
136 |         # and use 8 threads for faster operation
137 |         $sambamba_exec view -h -t 8 -s $f -f bam $BamFile2 -o $OutDir'/temp_2.bam'
138 |     fi
139 |     if [ ! -f $TagAlignFile2 ]; then
140 |         $TagAlignExec -I $OutDir'/temp_2.bam' -N 0 -O $TagAlignFile2
141 |     fi
142 |     if [ ! -f $TagAlignFile1 ]; then
143 |         $TagAlignExec -I $BamFile1 -N 0 -O $TagAlignFile1
144 |     fi
145 | fi
146 | 
147 | #==============================================
148 | # calling MACS2 using the generated tag align files
149 | #==============================================
150 | 
151 | # first we have to fix the output folders containing the MACS2 output for both the samples
152 | # the output folder name is like MACS2_0/1_C
153 | # (where 0/1 indicates first or second sample)
154 | # _C is optional and included only when a control bam file is provided as input
155 | 
156 | MACS2_outdir1=$OutDir'/MACS2_0'
157 | MACS2_outdir2=$OutDir'/MACS2_1'
158 | if [[ ! -z $CONTROLBAM ]]; then
159 |     MACS2_outdir1=$MACS2_outdir1'_C'
160 |     MACS2_outdir2=$MACS2_outdir2'_C'
161 | fi
162 | MACS2_outdir1=$MACS2_outdir1'/'
163 | MACS2_outdir2=$MACS2_outdir2'/'
164 | mkdir -p $MACS2_outdir1
165 | mkdir -p $MACS2_outdir2
166 | 
167 | 
168 | # first file - MACS2
169 | MACS2PeakOutFile1=$MACS2_outdir1$PREFIX'.macs2_peaks.narrowPeak'
170 | if [ ! -f $MACS2PeakOutFile1 ]; then
171 |     MACS2_cmd='macs2 callpeak -t '$TagAlignFile1' -f BED -n '$PREFIX'.macs2 --nomodel --nolambda --shift -100 --extsize 200 --outdir '$MACS2_outdir1
172 |     if [[ ! -z $CONTROLBAM ]]; then
173 |         # include the control file also
174 |         MACS2_cmd=$MACS2_cmd' -c '$CONTROLBAM
175 |     fi
176 |     # execute the command
177 |     $MACS2_cmd
178 | fi
179 | 
180 | # second file - MACS2
181 | MACS2PeakOutFile2=$MACS2_outdir2$PREFIX'.macs2_peaks.narrowPeak'
182 | if [ ! -f $MACS2PeakOutFile2 ]; then
183 |     MACS2_cmd='macs2 callpeak -t '$TagAlignFile2' -f BED -n '$PREFIX'.macs2 --nomodel --nolambda --shift -100 --extsize 200 --outdir '$MACS2_outdir2
184 |     if [[ ! -z $CONTROLBAM ]]; then
185 |         # include the control file also
186 |         MACS2_cmd=$MACS2_cmd' -c '$CONTROLBAM
187 |     fi
188 |     # execute the command
189 |     $MACS2_cmd
190 | fi
191 | 
192 | #====================================
193 | # now call the IDR analysis using the generated peak files
194 | #====================================
195 | # we have to fix the output directory where the results of IDR will be stored
196 | # depending on the presence of control parameters
197 | # the folders will vary
198 | # the folders have the following format: C(0/1) depending on the input options
199 | 
200 | IDR_OutDir=$OutDir'/'
201 | if [[ ! -z $CONTROLBAM ]]; then
202 |     IDR_OutDir=$IDR_OutDir'C1'
203 | else
204 |     IDR_OutDir=$IDR_OutDir'C0'
205 | fi
206 | IDR_OutDir=$IDR_OutDir'_Peak'$CountPeak'/'
207 | mkdir -p $IDR_OutDir
208 | 
209 | $IDR_code -a $MACS2PeakOutFile1 -b $MACS2PeakOutFile2 -P $IDRCodeDir -d $IDR_OutDir -n $PREFIX -c $CountPeak
210 | 
211 | #----------------------------------
212 | # important - sourya
213 | # now restore the original directory
214 | cd $current_dir
215 | #----------------------------------
216 | 
--------------------------------------------------------------------------------
/IDR_Codes/IDR_SubSampleBAM_Main.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #=================================
4 | # this script encapsulates IDR analysis between different replicates into a single script
5 | # provided that the inputs are in BAM format
6 | # i.e., the BAM files are first subsampled, then peaks are called, and finally IDR is performed
7 | #=================================
8 | # developed by - Sourya Bhattacharyya
9 | # Vijay-AY lab
10 | # La Jolla Institute for Allergy and Immunology
11 | #=================================
12 | 
13 | # usage info
14 | usage(){
15 | cat << EOF
16 | 
17 | usage: ./IDR_SubSampleBAM_Main.sh [-h] [-I inpfile1.bam] [-I inpfile2.bam] [-d OutDir] [-P PathIDRCode] [-n 'IDR_test'] [-c 25000] [-C control.bam]
18 | 
19 | Options:
20 | 
21 | -- required:
22 |     -I  InpFile         A list of input bam files. At least two bam files are required.
23 |     -d  OutDir          Output directory (absolute path preferred) which will store the IDR results.
24 |     -P  PathIDRCode     Path of the IDRCode package (Kundaje et al., after its installation)
25 |     -n  PREFIX          Prefix of output files. Default 'IDR_ATAC'.
26 |     -c  CountPeak       No of peaks in both replicates that will be compared. Default 25000.
27 |     -C  CONTROLBAM      Control file (either a .BAM file or a tagalign file in .gz format)
28 | EOF
29 | }
30 | 
31 | # default values of peaks that need to be retained
32 | CountPeak=25000
33 | 
34 | # code containing the IDR + consistency plot
35 | # dir2='/home/sourya/packages/idrCode/'
36 | exec2='batch-consistency-plot.r'
37 | 
38 | # default control bam file
39 | CONTROLBAM=""
40 | 
41 | # default prefix string
42 | PREFIX='IDR_ATAC'
43 | 
44 | # Sourya - note the processing of the input file argument, since there can be more than one file
45 | # note the change of notations
46 | while getopts "I:n:d:c:C:P:" opt;
47 | do
48 |     case "$opt" in
49 |         I) InpFile+=($OPTARG);;
50 |         n) PREFIX=$OPTARG;;
51 |         d) OutDir=$OPTARG;;
52 |         c) CountPeak=$OPTARG;;
53 |         C) CONTROLBAM=$OPTARG;;
54 |         P) dir2=$OPTARG;;
55 |         \?) usage
56 |             echo "error: unrecognized option -$OPTARG";
57 |             exit 1
58 |             ;;
59 |     esac
60 | done
61 | 
62 | if [[ -z $InpFile ]]; then
63 |     echo 'User did not provide any input BAM file - exit for the moment !!'
64 |     exit 1
65 | fi
66 | 
67 | if [[ -z $dir2 ]]; then
68 |     echo 'User did not provide the path of the IDRCode package (Kundaje et al.) - exit for the moment !!'
69 |     exit 1
70 | fi
71 | 
72 | if [[ -z $OutDir ]]; then
73 |     echo 'User did not provide an output directory for storing the results - exit for the moment !!'
74 |     exit 1
75 | fi
76 | 
77 | # number of input files provided
78 | nsample=${#InpFile[@]}
79 | echo 'Number of input files : '$nsample
80 | 
81 | if [ $nsample -lt 2 ]; then
82 |     echo 'User needs to provide at least two peak files for comparison - exit for the moment !!'
83 |     exit 1
84 | fi
85 | 
86 | # generate the output directory
87 | mkdir -p $OutDir
88 | 
89 | #----------------------------------
90 | # important - sourya
91 | # change the current directory to the dir containing this executable
92 | # since other source files relative to the current directory need to be called
93 | current_dir=$(pwd)
94 | script_dir=$(dirname $0)
95 | cd $script_dir
96 | #----------------------------------
97 | 
98 | #=================================
99 | # batch replicate analysis
100 | #=================================
101 | if [ ! -f $OutDir'/Replicate_Names.txt' ]; then
102 |     echo 'Analyzing the '$nsample' Number of replicates --- ' > $OutDir'/Replicate_Names.txt'
103 |     for (( i=0; i<${nsample}; i++ ))
104 |     do
105 |         echo 'Sample '$i' is : '${InpFile[$i]} >> $OutDir'/Replicate_Names.txt'
106 |     done
107 | fi
108 | 
109 | # loop for pairwise execution of samples
110 | for (( i=0; i<${nsample}-1; i++ ))
111 | do
112 |     for (( j=$i+1; j<${nsample}; j++ ))
113 |     do
114 |         # pair of samples
115 |         sample1=${InpFile[$i]}
116 |         sample2=${InpFile[$j]}
117 |         # execute the sample pairs
118 |         # Note the output directory name
119 |         if [[ ! -z $CONTROLBAM ]]; then
120 |             ./IDR_SubSampleBAM.sh -A $sample1 -B $sample2 -d $OutDir'/'$i'_and_'$j -P $dir2 -n $PREFIX -c $CountPeak -C $CONTROLBAM
121 |         else
122 |             ./IDR_SubSampleBAM.sh -A $sample1 -B $sample2 -d $OutDir'/'$i'_and_'$j -P $dir2 -n $PREFIX -c $CountPeak
123 |         fi
124 |     done
125 | done
126 | 
127 | #=================================
128 | # batch consistency plots
129 | #=================================
130 | 
131 | # the pattern of input prefix present in every replicate
132 | # depends on the control sample and tagmentation option
133 | 
134 | if [[ ! -z $CONTROLBAM ]]; then
135 |     inppfx='C1'
136 | else
137 |     inppfx='C0'
138 | fi
139 | inppfx=$inppfx'_Peak'$CountPeak'/'$PREFIX
140 | 
141 | # basic plotting file name format
142 | # without the extension '-plot.pdf'
143 | plotfilename='IDR_Batch_Plot'
144 | if [[ ! -z $CONTROLBAM ]]; then
145 |     plotfilename=$plotfilename'_C1'
146 | else
147 |     plotfilename=$plotfilename'_C0'
148 | fi
149 | 
150 | #if [ ! -f $OutDir'/'$plotfilename'-plot.pdf' ]; then
151 | 
152 | # no of pairs of samples
153 | x=$nsample
154 | y=`expr $nsample - 1`
155 | z=`expr $x \* $y`
156 | npairs=`expr $z / 2`
157 | echo 'npairs: '$npairs
158 | 
159 | # output command for IDR plot
160 | cmd='Rscript '$exec2' '$npairs' '$OutDir'/'$plotfilename
161 | for (( i=0; i<${nsample}-1; i++ ))
162 | do
163 |     for (( j=$i+1; j<${nsample}; j++ ))
164 |     do
165 |         cmd=$cmd' '$OutDir'/'$i'_and_'$j'/'$inppfx
166 |     done
167 | done
168 | echo 'cmd: '$cmd
169 | 
170 | # execute the command
171 | # first go to the directory containing the R code of the IDR
172 | cd $dir2
173 | $cmd
174 | cd -
175 | 
176 | # now convert the generated postscript plot file to a pdf file
177 | ps2pdf $OutDir'/'$plotfilename'-plot.ps' $OutDir'/'$plotfilename'-plot.pdf'
178 | 
179 | #fi
180 | 
181 | #----------------------------------
182 | # important - sourya
183 | # now restore the original directory
184 | cd $current_dir
185 | #----------------------------------
186 | 
--------------------------------------------------------------------------------
/Imp_Scripts/Footprint_HINT_ATAC.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | 
3 | #==================================
4 | # footprinting using the HINT-ATAC package
5 | # http://www.regulatory-genomics.org/hint/introduction/
6 | 
7 | # author: Sourya Bhattacharyya
8 | # Vijay-AY lab
9 | 
10 | # check
11 | # http://www.regulatory-genomics.org/hint/tutorial/
12 | #==================================
13 | 
14 | library(optparse)
15 | 
16 | #===========================================================
17 | option_list = list(
18 | 
19 |     make_option(c("--AllRead"), type="character", default=NULL, help="Alignment file containing all reads."),
20 |     make_option(c("--NFRRead"), type="character", default=NULL, help="Alignment file containing nucleosome free regions (NFR) reads."),
21 |     make_option(c("--NFRANDNuclRead"), type="character", default=NULL, help="Alignment file containing nucleosome free regions (NFR) plus all nucleosome (1M, 2M, 3M) merged reads."),
22 |     make_option(c("--RefGenome"), type="character", default=NULL, help="Reference genome name."),
23 |     make_option(c("--OutDir"), type="character", default=NULL, help="Output directory to contain the motif."),
24 |     make_option(c("--PE"), type="integer", action="store", default=0, help="If 1, indicates paired end input data. Default = 0"),
25 |     make_option(c("--FP"), type="integer", action="store", default=1, help="Footprinting option. Value can be 1 (default), 2, or 3. (1): footprint using the nucleosome free reads (NFR) will be computed. Default setting. Best for the default ATAC-seq protocol (check Li et al. Genome Biology 2019). (2): footprint using the nucleosome free reads (NFR) and also the nucleosome containing reads (NFR + 1N + 2N + 3N ...) will be computed (two different footprint outputs - time consuming). Best for the Omni-ATAC protocol (check Li et al. Genome Biology 2019). 
(3): footprint using NFR, NFR with nucleosome reads, and all reads will be computed (three different footprint outputs - highly time consuming). Default = 1"),
26 |     make_option(c("--MotifPeak"), type="character", default=NULL, help="Peak or summit file which was used by HOMER to generate corresponding motifs. Mandatory parameter.")
27 | );
28 | 
29 | opt_parser = OptionParser(option_list=option_list);
30 | opt = parse_args(opt_parser);
31 | 
32 | # create the output directory
33 | system(paste("mkdir -p", opt$OutDir))
34 | 
35 | # prefix string of output file name
36 | OUTPREFIX <- 'footprints_HINT_ATAC'
37 | 
38 | 
39 | ##===========
40 | ## processing all reads
41 | ## only if FP option > 2
42 | ##===========
43 | if (opt$FP > 2) {
44 |     if (!is.null(opt$AllRead)) {
45 |         curroutdir <- paste0(opt$OutDir, '/all')
46 |         system(paste("mkdir -p", curroutdir))
47 |         if ((file.exists(paste0(curroutdir, '/', OUTPREFIX, '.bed')) == FALSE) | (file.exists(paste0(curroutdir, '/', OUTPREFIX, '.info')) == FALSE)) {
48 |             if (opt$PE == 1) {
49 |                 cat(sprintf("\n start footprint HINT ATAC PE reads - all reads"))
50 |                 system(paste("rgt-hint footprinting --atac-seq --organism ", opt$RefGenome, " --paired-end --output-location ", curroutdir, " --output-prefix ", OUTPREFIX, " ", opt$AllRead, opt$MotifPeak))
51 |             } else {
52 |                 system(paste("rgt-hint footprinting --atac-seq --organism ", opt$RefGenome, " --output-location ", curroutdir, " --output-prefix ", OUTPREFIX, " ", opt$AllRead, opt$MotifPeak))
53 |             }
54 |         }
55 |         # now call motif matching for the obtained footprints
56 |         # JASPAR database is used by default for motif finding
57 |         # 10% random background region is tested - using the option --rand-proportion 10
58 |         cat(sprintf("\n start motifanalysis of HINT ATAC - all reads"))
59 |         motifoutdir <- paste0(curroutdir, '/motifanalysis_matching_out')
60 |         system(paste("mkdir -p", motifoutdir))
61 |         system(paste("rgt-motifanalysis matching --organism ", opt$RefGenome, " --rand-proportion 10 --input-files ", paste0(curroutdir, '/', OUTPREFIX, '.bed'), " --output-location ", motifoutdir))
62 |     }
63 | }
64 | 
65 | ##===========
66 | ## processing NFR and nucleosome reads (1N, 2N, ...)
67 | ## only if FP option > 1
68 | ##===========
69 | if (opt$FP > 1) {
70 |     if (!is.null(opt$NFRANDNuclRead)) {
71 |         curroutdir <- paste0(opt$OutDir, '/NFRANDNucl')
72 |         system(paste("mkdir -p", curroutdir))
73 |         if ((file.exists(paste0(curroutdir, '/', OUTPREFIX, '.bed')) == FALSE) | (file.exists(paste0(curroutdir, '/', OUTPREFIX, '.info')) == FALSE)) {
74 |             if (opt$PE == 1) {
75 |                 cat(sprintf("\n start footprint HINT ATAC PE reads - nucleosome free and nucleosome reads"))
76 |                 system(paste("rgt-hint footprinting --atac-seq --organism ", opt$RefGenome, " --paired-end --output-location ", curroutdir, " --output-prefix ", OUTPREFIX, " ", opt$NFRANDNuclRead, opt$MotifPeak))
77 |             } else {
78 |                 system(paste("rgt-hint footprinting --atac-seq --organism ", opt$RefGenome, " --output-location ", curroutdir, " --output-prefix ", OUTPREFIX, " ", opt$NFRANDNuclRead, opt$MotifPeak))
79 |             }
80 |         }
81 |         # now call motif matching for the obtained footprints
82 |         # JASPAR database is used by default for motif finding
83 |         # 10% random background region is tested - using the option --rand-proportion 10
84 |         cat(sprintf("\n start motifanalysis of HINT ATAC - nucleosome free and nucleosome reads"))
85 |         motifoutdir <- paste0(curroutdir, '/motifanalysis_matching_out')
86 |         system(paste("mkdir -p", motifoutdir))
87 |         system(paste("rgt-motifanalysis matching --organism ", opt$RefGenome, " --rand-proportion 10 --input-files ", paste0(curroutdir, '/', OUTPREFIX, '.bed'), " --output-location ", motifoutdir))
88 |     }
89 | }
90 | 
91 | ##===========
92 | ## processing NFR reads
93 | ## default option
94 | ##===========
95 | if (!is.null(opt$NFRRead)) {
96 |     curroutdir <- paste0(opt$OutDir, '/NFR')
97 |     system(paste("mkdir -p", curroutdir))
98 |     if ((file.exists(paste0(curroutdir, '/', OUTPREFIX, '.bed')) == FALSE) | (file.exists(paste0(curroutdir, '/', OUTPREFIX, '.info')) == FALSE)) {
99 |         if (opt$PE == 1) {
100 |             cat(sprintf("\n start footprint HINT ATAC PE reads - nucleosome free reads"))
101 |             system(paste("rgt-hint footprinting --atac-seq --organism ", opt$RefGenome, " --paired-end --output-location ", curroutdir, " --output-prefix ", OUTPREFIX, " ", opt$NFRRead, opt$MotifPeak))
102 |         } else {
103 |             system(paste("rgt-hint footprinting --atac-seq --organism ", opt$RefGenome, " --output-location ", curroutdir, " --output-prefix ", OUTPREFIX, " ", opt$NFRRead, opt$MotifPeak))
104 |         }
105 |     }
106 |     # now call motif matching for the obtained footprints
107 |     # JASPAR database is used by default for motif finding
108 |     # 10% random background region is tested - using the option --rand-proportion 10
109 |     cat(sprintf("\n start motifanalysis of HINT ATAC - nucleosome free reads"))
110 |     motifoutdir <- paste0(curroutdir, '/motifanalysis_matching_out')
111 |     system(paste("mkdir -p", motifoutdir))
112 |     system(paste("rgt-motifanalysis matching --organism ", opt$RefGenome, " --rand-proportion 10 --input-files ", paste0(curroutdir, '/', OUTPREFIX, '.bed'), " --output-location ", motifoutdir))
113 | }
114 | 
115 | 
116 | 
117 | 
--------------------------------------------------------------------------------
/Imp_Scripts/Motif_HOMER.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | 
3 | #=========================
4 | # call motifs from ATAC-seq peaks using HOMER
5 | #=========================
6 | 
7 | library(optparse)
8 | library(data.table)
9 | 
10 | options(scipen = 999)
11 | options(datatable.fread.datatable=FALSE)
12 | 
13 | #=================================
14 | # function to create peak summit information
15 | #=================================
16 | GeneratePeakSummitFile <- function(PeakSummitFile, PeakData, offset=500) {
17 | 
18 |     if (ncol(PeakData) > 9) {
19 |         # use the relative peak summit information (10th field) and generate an offset of +/- 'offset' bp around it
20 |         outDF <- cbind.data.frame(PeakData[,1], (PeakData[,2] + PeakData[,10] - offset), (PeakData[,2] + PeakData[,10] + offset))
21 |     } else {
22 |         # use the midpoint of the peaks as the summit
23 |         outDF <- cbind.data.frame(PeakData[,1], (as.integer((PeakData[,2] + PeakData[,3])/2) - offset), (as.integer((PeakData[,2] + PeakData[,3])/2) + offset))
24 |     }
25 |     write.table(outDF, PeakSummitFile, row.names=F, col.names=F, sep="\t", quote=F, append=F)
26 | 
27 | } # end function
28 | 
29 | #===========================================================
30 | option_list = list(
31 | 
32 |     make_option(c("--MotifFindExec"), type="character", default=NULL, help="HOMER motif finding executable"),
33 |     make_option(c("--RefGenome"), type="character", default=NULL, help="Reference genome name."),
34 |     make_option(c("--PeakFile"), type="character", default=NULL, help="ATAC-seq Peak file."),
35 |     make_option(c("--PValThr"), type="numeric", default=0, help="Threshold of -log10(p-value) above which peaks will be considered. Default = 0, means no threshold is imposed."),
36 |     make_option(c("--QValThr"), type="numeric", default=0, help="Threshold of -log10(q-value) above which peaks will be considered. Default = 0, means no threshold is imposed."),
37 |     make_option(c("--OutDir"), type="character", default=NULL, help="Output directory."),
38 |     make_option(c("--SizeVal"), type="integer", action="store", default=200, help="Size argument of HOMER motif finding. Default = 200"),
39 |     make_option(c("--SummitOffset"), type="integer", action="store", default=500, help="Offset around the peak summit position to be considered for motif finding. Default = 500")
Default = 500") 40 | ); 41 | 42 | opt_parser = OptionParser(option_list=option_list); 43 | opt = parse_args(opt_parser); 44 | 45 | system(paste("mkdir -p", opt$OutDir)) 46 | 47 | PValThr <- as.numeric(opt$PValThr) 48 | QValThr <- as.numeric(opt$QValThr) 49 | if (QValThr > 0) { 50 | PValThr <- 0 51 | } 52 | 53 | if ((PValThr == 0) & (QValThr == 0)) { 54 | # CurrOutDir <- paste0(opt$OutDir, '/Motif_Complete_Peaks_Size_', opt$SizeVal, '_SummitOffset_', opt$SummitOffset) 55 | CurrOutDir <- paste0(opt$OutDir, '/Motif_Complete_Peaks_SummitOffset_', opt$SummitOffset) 56 | } else if (QValThr > 0) { 57 | # CurrOutDir <- paste0(opt$OutDir, '/Motif_Peaks_QvalThr_', QValThr, '_Size_', opt$SizeVal, '_SummitOffset_', opt$SummitOffset) 58 | CurrOutDir <- paste0(opt$OutDir, '/Motif_Peaks_QvalThr_', QValThr, '_SummitOffset_', opt$SummitOffset) 59 | } else { 60 | # CurrOutDir <- paste0(opt$OutDir, '/Motif_Peaks_PvalThr_', PValThr, '_Size_', opt$SizeVal, '_SummitOffset_', opt$SummitOffset) 61 | CurrOutDir <- paste0(opt$OutDir, '/Motif_Peaks_PvalThr_', PValThr, '_SummitOffset_', opt$SummitOffset) 62 | } 63 | system(paste("mkdir -p", CurrOutDir)) 64 | 65 | # read the complete peak data 66 | PeakData <- data.table::fread(opt$PeakFile) 67 | 68 | # filter peaks if there is any p-value or q-value specific threshold is provided 69 | # then call the motif finding routine 70 | if (PValThr > 0) { 71 | PeakData_Filt <- PeakData[which(PeakData[, 8] > PValThr), ] 72 | if (nrow(PeakData_Filt) > 0) { 73 | # write the filtered peaks 74 | FiltPeakFileName <- paste0(CurrOutDir, '/Filtered_Peaks_PvalThr.bed') 75 | write.table(PeakData_Filt, FiltPeakFileName, row.names=F, col.names=F, sep="\t", quote=F, append=F) 76 | # extract the peak summits and +/- opt$SummitOffset bp from the summits 77 | FiltPeakFileNameSummit <- paste0(CurrOutDir, '/Filtered_Peaks_PvalThr_Summit_Offset_', opt$SummitOffset, 'bp.bed') 78 | GeneratePeakSummitFile(FiltPeakFileNameSummit, PeakData_Filt, offset=opt$SummitOffset) 79 | # now call motif using these summit information 80 | # currently commented - sourya 81 | # system(paste(opt$MotifFindExec, FiltPeakFileNameSummit, opt$RefGenome, CurrOutDir, " -size ", opt$SizeVal, " -mask")) 82 | } 83 | } else if (QValThr > 0) { 84 | PeakData_Filt <- PeakData[which(PeakData[, 9] > QValThr), ] 85 | if (nrow(PeakData_Filt) > 0) { 86 | # write the filtered peaks 87 | FiltPeakFileName <- paste0(CurrOutDir, '/Filtered_Peaks_QvalThr.bed') 88 | write.table(PeakData_Filt, FiltPeakFileName, row.names=F, col.names=F, sep="\t", quote=F, append=F) 89 | # extract the peak summits and +/- opt$SummitOffset bp from the summits 90 | FiltPeakFileNameSummit <- paste0(CurrOutDir, '/Filtered_Peaks_QvalThr_Summit_Offset_', opt$SummitOffset, 'bp.bed') 91 | GeneratePeakSummitFile(FiltPeakFileNameSummit, PeakData_Filt, offset=opt$SummitOffset) 92 | # now call motif using these summit information 93 | # currently commented - sourya 94 | # system(paste(opt$MotifFindExec, FiltPeakFileNameSummit, opt$RefGenome, CurrOutDir, " -size ", opt$SizeVal, " -mask")) 95 | } 96 | } else { 97 | # extract the peak summits and +/- opt$SummitOffset bp from the summits 98 | FiltPeakFileNameSummit <- paste0(CurrOutDir, '/Peaks_Summit_Offset_', opt$SummitOffset, 'bp.bed') 99 | GeneratePeakSummitFile(FiltPeakFileNameSummit, PeakData, offset=opt$SummitOffset) 100 | 101 | # now call motif using these summit information 102 | # currently commented - sourya 103 | # system(paste(opt$MotifFindExec, FiltPeakFileNameSummit, opt$RefGenome, CurrOutDir, " -size ", 
104 | }
105 | 
106 | 
107 | 
108 | 
109 | 
--------------------------------------------------------------------------------
/Imp_Scripts/Peak_Enrichment.R:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | 
3 | #=========================
4 | # analyze ATAC-seq peaks and plot the enrichment for peaks and surrounding regions
5 | # also analyze separately for promoter peaks and enhancer peaks
6 | # using reference TSS information
7 | #=========================
8 | 
9 | suppressMessages(library(GenomicRanges))
10 | library(optparse)
11 | library(data.table)
12 | 
13 | options(scipen = 999)
14 | options(datatable.fread.datatable=FALSE)
15 | 
16 | #=================================
17 | # function to create peak summit information
18 | #=================================
19 | GeneratePeakSummitFile <- function(PeakSummitFile, PeakData) {
20 | 
21 |     if (ncol(PeakData) > 9) {
22 |         # use the relative peak summit information (10th field) and generate an offset of +/- 5 bp around it
23 |         outDF <- cbind.data.frame(PeakData[,1], (PeakData[,2] + PeakData[,10] - 5), (PeakData[,2] + PeakData[,10] + 5))
24 |     } else {
25 |         # use the midpoint of the peaks as the summit
26 |         outDF <- cbind.data.frame(PeakData[,1], (as.integer((PeakData[,2] + PeakData[,3])/2) - 5), (as.integer((PeakData[,2] + PeakData[,3])/2) + 5))
27 |     }
28 |     write.table(outDF, PeakSummitFile, row.names=F, col.names=F, sep="\t", quote=F, append=F)
29 | 
30 | } # end function
31 | 
32 | 
33 | #=================================
34 | # function to compute overlap of 1D bins
35 | #=================================
36 | Overlap1D <- function(Inpdata1, Inpdata2, boundary=1, offset=0, uniqov=TRUE) {
37 | 
38 |     ov1 <- as.data.frame(findOverlaps(GRanges(Inpdata1[,1], IRanges(Inpdata1[,2]+boundary-offset, Inpdata1[,3]-boundary+offset)),GRanges(Inpdata2[,1], IRanges(Inpdata2[,2]+boundary-offset, Inpdata2[,3]-boundary+offset))))
39 |     if (uniqov == TRUE) {
40 |         ov_idx_file1 <- unique(ov1[,1])
41 |         ov_idx_file2 <- unique(ov1[,2])
42 |     } else {
43 |         ov_idx_file1 <- ov1[,1]
44 |         ov_idx_file2 <- ov1[,2]
45 |     }
46 |     nonov_idx_file1 <- setdiff(seq(1, nrow(Inpdata1)), ov_idx_file1)
47 |     nonov_idx_file2 <- setdiff(seq(1, nrow(Inpdata2)), ov_idx_file2)
48 | 
49 |     # return the overlapping and non-overlapping set of indices
50 |     newList <- list(A_AND_B = ov_idx_file1, B_AND_A = ov_idx_file2, A_MINUS_B = nonov_idx_file1, B_MINUS_A = nonov_idx_file2)
51 |     return(newList)
52 | 
53 | }
54 | 
55 | #=================================
56 | # function to plot the heatmap using deeptools
57 | #=================================
58 | PlotHeatMap <- function(CurrOutDir, DeepToolsDir, outmatfile, Label) {
59 | 
60 |     # then use this matrix to plot profile
61 |     outprofileplotfile <- paste0(CurrOutDir, '/out_mat_profile_plot.pdf')
62 |     outprofileplotfile1 <- paste0(CurrOutDir, '/out_mat_profile_plot_1.pdf')
63 |     outprofileplotfile2 <- paste0(CurrOutDir, '/out_mat_heatmap_plot.pdf')
64 |     outprofileplotfile3 <- paste0(CurrOutDir, '/out_mat_heatmap_plot_1.pdf')
65 | 
66 |     system(paste0(DeepToolsDir, "/plotProfile --matrixFile ", outmatfile, " --outFileName ", outprofileplotfile, " --plotHeight 7 --plotWidth 10 --samplesLabel ", Label, " --plotTitle ATACPeakTSSEnrichment --plotFileFormat pdf --yMin 0.5 --yMax 40 --colors red yellow blue"))
67 | 
68 |     system(paste0(DeepToolsDir, "/plotProfile --matrixFile ", outmatfile, " --outFileName ", outprofileplotfile1, " --plotHeight 7 --plotWidth 10 --samplesLabel ", Label, " --plotTitle ATACPeakTSSEnrichment --plotFileFormat pdf --yMin 0.5 --yMax 5 --colors red yellow blue"))
69 | 
70 |     system(paste0(DeepToolsDir, "/plotHeatmap --matrixFile ", outmatfile, " --outFileName ", outprofileplotfile2, " --heatmapHeight 10 --heatmapWidth 8 --samplesLabel ", Label, " --plotTitle ATACPeakTSSEnrichment --plotFileFormat pdf --yMin 0.5 --yMax 40 --zMin 0 --zMax 50"))
71 | 
72 |     system(paste0(DeepToolsDir, "/plotHeatmap --matrixFile ", outmatfile, " --outFileName ", outprofileplotfile3, " --heatmapHeight 10 --heatmapWidth 8 --samplesLabel ", Label, " --plotTitle ATACPeakTSSEnrichment --plotFileFormat pdf --yMin 0.5 --yMax 5 --zMin 0 --zMax 50"))
73 | 
74 | } # end function
75 | 
76 | 
77 | #===========================================================
78 | option_list = list(
79 | 
80 |     make_option(c("--BigWigFile"), type="character", default=NULL, help="BigWig file of ATAC-seq data."),
81 |     make_option(c("--Label"), type="character", default=NULL, help="Label or sample name of ATAC-seq data."),
82 |     make_option(c("--DeepToolsDir"), type="character", default=NULL, help="Deeptools executable directory."),
83 |     make_option(c("--TSSFile"), type="character", default=NULL, help="File containing reference genome TSS information."),
84 |     make_option(c("--PeakFile"), type="character", default=NULL, help="File containing ATAC-seq peak information."),
85 |     make_option(c("--OutDir"), type="character", default=NULL, help="Output directory."),
86 |     make_option(c("--Offset"), type="integer", action="store", default=5000, help="Offset with respect to summit (in bp) to compute enrichment. Default = 5000 means 5 Kb around peak summits would be used for enrichment.")
87 | );
88 | 
89 | opt_parser = OptionParser(option_list=option_list);
90 | opt = parse_args(opt_parser);
91 | 
92 | system(paste("mkdir -p", opt$OutDir))
93 | 
94 | # read the input peaks
95 | PeakData <- data.table::fread(opt$PeakFile)
96 | 
97 | # extract the peak summits of the complete peak file
98 | CurrOutDir <- paste0(opt$OutDir, '/Complete_Peaks')
99 | system(paste("mkdir -p", CurrOutDir))
100 | 
101 | PeakSummitFile <- paste0(CurrOutDir, '/Peak_Summits.bed')
102 | if (file.exists(PeakSummitFile) == FALSE) {
103 |     GeneratePeakSummitFile(PeakSummitFile, PeakData)
104 | }
105 | 
106 | # now apply deeptools utility to compute enrichment
107 | outmatfile <- paste0(CurrOutDir, '/deeptools_out_mat_TSS.gz')
108 | if (file.exists(outmatfile) == FALSE) {
109 |     system(paste0(opt$DeepToolsDir, "/computeMatrix reference-point -R ", PeakSummitFile, " -S ", opt$BigWigFile, " -a ", opt$Offset, " -b ", opt$Offset, " --skipZeros --outFileName ", outmatfile))
110 | }
111 | PlotHeatMap(CurrOutDir, opt$DeepToolsDir, outmatfile, opt$Label)
112 | 
113 | # if TSS information is also provided, find the enrichment of promoter and
114 | # enhancer peaks separately
115 | if (!is.null(opt$TSSFile)) {
116 |     TSSData <- data.table::fread(opt$TSSFile)
117 |     # 2.5 Kb overlap on both sides of the TSS data
118 |     ov <- Overlap1D(PeakData[,1:3], cbind.data.frame(TSSData[,1:2],TSSData[,2]), boundary=0, offset=2500, uniqov=TRUE)
119 |     PromPeakData <- PeakData[ov$A_AND_B, ]
120 |     EnhPeakData <- PeakData[ov$A_MINUS_B, ]
121 | 
122 |     # process the promoter peaks
123 |     if (nrow(PromPeakData) > 0) {
124 |         CurrOutDir <- paste0(opt$OutDir, '/Promoter_Peaks')
125 |         system(paste("mkdir -p", CurrOutDir))
126 |         PeakSummitFile <- paste0(CurrOutDir, '/Peak_Summits.bed')
127 |         if (file.exists(PeakSummitFile) == FALSE) {
128 |             GeneratePeakSummitFile(PeakSummitFile, PromPeakData)
129 |         }
130 |         # now apply deeptools utility to compute enrichment
131 |         outmatfile <- paste0(CurrOutDir, '/deeptools_out_mat_TSS.gz')
132 |         if (file.exists(outmatfile) == FALSE) {
133 |             system(paste0(opt$DeepToolsDir, "/computeMatrix reference-point -R ", PeakSummitFile, " -S ", opt$BigWigFile, " -a ", opt$Offset, " -b ", opt$Offset, " --skipZeros --outFileName ", outmatfile))
134 |         }
135 |         PlotHeatMap(CurrOutDir, opt$DeepToolsDir, outmatfile, opt$Label)
136 |     }
137 | 
138 |     # process the enhancer peaks
139 |     if (nrow(EnhPeakData) > 0) {
140 |         CurrOutDir <- paste0(opt$OutDir, '/Enhancer_Peaks')
141 |         system(paste("mkdir -p", CurrOutDir))
142 |         PeakSummitFile <- paste0(CurrOutDir, '/Peak_Summits.bed')
143 |         if (file.exists(PeakSummitFile) == FALSE) {
144 |             GeneratePeakSummitFile(PeakSummitFile, EnhPeakData)
145 |         }
146 |         # now apply deeptools utility to compute enrichment
147 |         outmatfile <- paste0(CurrOutDir, '/deeptools_out_mat_TSS.gz')
148 |         if (file.exists(outmatfile) == FALSE) {
149 |             system(paste0(opt$DeepToolsDir, "/computeMatrix reference-point -R ", PeakSummitFile, " -S ", opt$BigWigFile, " -a ", opt$Offset, " -b ", opt$Offset, " --skipZeros --outFileName ", outmatfile))
150 |         }
151 |         PlotHeatMap(CurrOutDir, opt$DeepToolsDir, outmatfile, opt$Label)
152 |     }
153 | }
154 | 
155 | 
156 | 
157 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ATACProc - a pipeline for processing ATAC-seq data
2 | 
3 | Developer: Sourya Bhattacharyya
4 | 
5 | Supervisors: Dr. Ferhat Ay and Dr. Pandurangan Vijayanand
6 | 
7 | La Jolla Institute for Immunology, CA 92037, USA
8 | 
9 | 
10 | #######################
11 | 
12 | ATACProc is a pipeline to analyze ATAC-seq data. Currently, datasets involving one of four reference genomes, namely hg19, hg38, mm9 and mm10, are supported. Important features of this pipeline are:
13 | 
14 | 1) Supports single or paired-end fastq or BAM formatted data.
15 | 
16 | 2) Generates alignment summary and QC statistics.
17 | 
18 | 3) Calls peaks using MACS2, for multiple FDR thresholds (0.01 and 0.05).
19 | 
20 | 4) Generates raw and coverage-normalized BigWig tracks for visualizing the data in the UCSC genome browser.
21 | 
22 | 5) Irreproducible Discovery Rate (IDR) analysis (https://github.com/nboley/idr) between a set of peak calls or even a set of input alignment (BAM) files (in which case, peaks are estimated first) corresponding to a set of biological or technical ATAC-seq replicates.
23 | 
24 | 6) **New in version 2.0:** Supports discarding reads falling in blacklisted genomic regions.
25 | 
26 | 7) **New in version 2.0:** Supports extracting nucleosome free reads (NFR) and one or more nucleosome containing regions (denoted as +1M) for TF footprinting analysis.
27 | 
28 | 8) **New in version 2.0:** Compatibility with the package ATAQV (https://github.com/ParkerLab/ataqv) for generating summary statistics across a set of samples.
29 | 
30 | #######################
31 | 
32 | Release notes
33 | -----------------
34 | 
35 | **Version 2.2 - April 2022**
36 | 
37 | Added -F option - corresponds to using different types of reads for footprinting.
38 | 
39 | Default = 1, means footprinting with nucleosome free reads (NFR) will be done.
40 | 
41 | Best for standard ATAC-seq protocols (Li et al. Genome Biology, 2019)
42 | 
43 | If -F option is 2, footprinting with nucleosome reads will also be separately computed in addition to the NFR based footprints (two different footprinting outputs).
44 | 
45 | If -F option is 3, footprinting with all the reads will also be separately computed in addition to the NFR based and nucleosome read based footprints (three different footprinting outputs).
46 | 
47 | **Version 2.1 - July 2020**
48 | 
49 | Minor change of the picard duplicate removal syntax, according to the picard tool version 2.8.14.
50 | We recommend using this (or a later) version.
51 | 
52 | **Version 2.0 - November 2019**
53 | 
54 | 1) Included TF footprinting, optional discarding of blacklisted genomic regions, and motif analysis.
55 | 
56 | 2) Updated summary statistics incorporating support for the ATAQV package (https://github.com/ParkerLab/ataqv)
57 | 
58 | 3) Discarded the R package ATACseqQC (https://bioconductor.org/packages/release/bioc/html/ATACseqQC.html) and corresponding operations, mainly due to its time complexity and reliability issues.
59 | 
60 | 
61 | *Version 1.0 - July 2018:*
62 | 
63 | 1) Released the first version of the ATAC-seq pipeline, supporting generation of QC metrics, peak calls, and signal tracks for visualizing in the UCSC genome browser.
64 | 
65 | 2) Also supports IDR between a set of peaks / alignments for a set of replicates.
66 | 
67 | 
68 | Theory
69 | ----------
70 | 
71 | Papers / links for understanding ATAC-seq QCs:
72 | 
73 | 1) https://github.com/crazyhottommy/ChIP-seq-analysis (very useful; contains many papers
74 | and links for understanding ChIP-seq and ATAC-seq data)
75 | 
76 | 2) https://www.encodeproject.org/data-standards/terms/#library
77 | 
78 | 3) https://www.biostars.org/p/187204/
79 | 
80 | 4) http://seqanswers.com/forums/archive/index.php/t-59219.html
81 | 
82 | 5) https://github.com/kundajelab/atac_dnase_pipelines
83 | 
84 | 6) https://github.com/ParkerLab/bioinf525#sifting
85 | 
86 | 7) https://github.com/taoliu/MACS/issues/145
87 | 
88 | 8) https://www.biostars.org/p/207318/
89 | 
90 | 9) https://www.biostars.org/p/209592/
91 | 
92 | 10) https://www.biostars.org/p/205576/
93 | 
94 | 
95 | Understanding peak calling
96 | 
97 | 1) https://genomebiology.biomedcentral.com/articles/10.1186/gb-2008-9-9-r137
98 | 
99 | Understanding TF footprinting
100 | 
101 | 1) https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1642-2
102 | 
103 | Understanding IDR analysis
104 | 
105 | 1) https://github.com/nboley/idr
106 | 
107 | 
108 | 
109 | Installation
110 | -------------
111 | 
112 | The following packages / libraries should be installed before running this pipeline:
113 | 
114 | 1) Python 2.7
115 | 
116 | 2) R environment (we have used 3.4.3)
117 | 
118 | User should also install the following R packages, by running the following command inside the R prompt:
119 | 
120 | 	install.packages(c("optparse", "ggplot2", "data.table", "plotly"))
121 | 
122 | Also, the user needs to install the Bioconductor package GenomicRanges.
123 | 
124 | 3) Bowtie2 (we have used version 2.3.3.1)
125 | 
126 | 4) samtools (we have used version 1.6)
127 | 
128 | 5) PICARD tools (we have used version 2.8.14 now; previously we were using version 2.7.1)
129 | 
130 | 6) Utilities "bedGraphToBigWig", "bedSort", "bigBedToBed", "hubCheck" and "fetchChromSizes" - to be downloaded from the UCSC repository
131 | 
132 | 7) deepTools (we have used version 2.0)
133 | 
134 | 8) MACS2 (we have used version 2.1.1) https://github.com/taoliu/MACS
135 | 
136 | 9) HOMER (we recommend using the latest version) http://homer.ucsd.edu/homer/
137 | 
138 | 10) The package *ataqv* (https://github.com/ParkerLab/ataqv). User needs to download the GitHub release (.tar.gz) file in a convenient location, extract it, and provide the corresponding path in a configuration file (mentioned below).
139 | 
140 | 11) Regulatory genomics toolbox (https://www.regulatory-genomics.org/)
141 | 
142 | First, the user needs to install the module *RGT* using the following commands:
143 | 
144 | 	pip install --user cython numpy scipy
145 | 	pip install --user RGT
146 | 
147 | A folder *rgtdata* would be created inside the home directory. The next step is to configure that folder by typing the following commands:
148 | 
149 | 	cd ~/rgtdata
150 | 	python setupGenomicData.py --hg19
151 | 	python setupGenomicData.py --hg38
152 | 	python setupGenomicData.py --mm9
153 | 	python setupGenomicData.py --mm10
154 | 
155 | (Note: it is better to run the last four commands together in a qsub / cluster environment, otherwise it'll be time consuming).
156 | 
157 | 
158 | Then, the user needs to set up the motif configuration data, via executing the following commands (preferable to run in a qsub / cluster environment):
159 | 
160 | 	cd ~/rgtdata
161 | 	python setupLogoData.py --all
162 | 
163 | 
164 | **User should include the PATH of the above mentioned libraries / packages inside their SYSTEM PATH variable. Alternatively, installation PATHS for some of these packages are to be mentioned in a separate configuration file (described below)**
165 | 
166 | **The following packages / libraries are to be installed for executing the IDR code**
167 | 
168 | 1) sambamba (we have used version 0.6.7)
169 | 
170 | 2) IDRCode (https://drive.google.com/file/d/0B_ssVVyXv8ZSX3luT0xhV3ZQNWc/view?usp=sharing). User should unzip the archive and store it in a convenient location. The path of this archive is to be provided when executing the IDR code.
171 | 
172 | 
173 | 
174 | Execution
175 | ----------
176 | 
177 | User should first clone this pipeline in a convenient location, using the following command:
178 | 
179 | 	git clone https://github.com/ay-lab/ATACProc.git
180 | 
181 | A sample script "pipeline_exec.sh" contains basic execution commands, to invoke the main executable "pipeline.sh" (located inside the folder "bin"). The executable has the following command line options (a minimal example invocation is sketched after this list):
182 | 
183 | Options:
184 | 
185 | Mandatory parameters:
186 | 
187 | -C  ConfigFile
188 |     Configuration file to be separately provided. Mandatory parameter. The current package includes four sample configuration files named "configfile_*" corresponding to the reference genomes hg19, hg38, mm9 and mm10. A detailed description of the entries in this configuration file is provided later.
189 | 
190 | -f  FASTQ1
191 |     Read 1 (or forward strand) of paired-end sequencing data [.fq|.gz|.bz2]. Alternatively, an aligned BAM file (single- or paired-end alignment) can be provided.
192 | 
193 | -r  FASTQ2
194 |     Read 2 of paired-end sequencing data [.fq|.gz|.bz2]. If not provided, and the -f parameter is not a BAM file, the input is assumed to be single-ended.
195 | 
196 | -n  PREFIX
197 |     Prefix string of output files. For example, -n "TEST" means that the output filenames start with the string "TEST". Generally, sample names with run ID, lane information, etc. can be used as a prefix string.
198 | 
199 | -g  BOWTIE2_GENOME
200 |     Bowtie2 indexed reference genome. Basically, the folder containing the bwt2 indices (corresponding to the reference genome) is to be provided. Mandatory parameter if the user provides fastq files as input (-f and -r options). If the user provides .bam files as an input (-f option), then this field is optional.
201 | 
202 | -d  OutDir
203 |     Output directory to store the results for the current sample.
204 | 
205 | -c  CONTROLBAM
206 |     Control file(s) used for peak calling using MACS2. One or more alignment files can be provided to be used as a control. It may not be specified at all, in which case MACS2 operates without any control. The control file can be either in *BAM* or in *tagalign.gz* format (the standalone script *bin/TagAlign.sh* in this repository converts a BAM file to tagalign.gz format). For multiple control files, they all are required to be of the same format (i.e. either all BAM or all tagalign.gz). Example: -c control1.bam -c control2.bam puts two control files for using in MACS2.
207 | 
208 | -w  BigWigGenome
209 |     Reference genome as a string. Allowed values are hg19 (default), hg38, mm9 and mm10. If the -g option is enabled (i.e. the Bowtie2 index genome is provided), this field is optional. Otherwise, mandatory parameter.
210 | 
211 | -D  DEBUG_TXT
212 |     Binary variable. If 1 (recommended), dumps QC statistics. For a set of samples, those QC statistics can be used later to profile QC variation among different samples.
213 | 
214 | -O  Overwrite
215 |     Binary variable. If 1, overwrites the existing files (if any). Default = 0.
216 | 
217 | -F  Footprint
218 |     This flag specifies the footprinting option. Value can be 1 (default), 2, or 3.
219 |     1: footprint using the nucleosome free reads (NFR) will be computed.
220 |     Default setting. Best for the default ATAC-seq protocol (check Li et al. Genome Biology 2019)
221 |     2: footprint using the nucleosome free reads (NFR) and also the nucleosome containing reads (NFR + 1N + 2N + 3N ...)
222 |     will be computed (two different footprint outputs - time consuming).
223 |     Best for the Omni-ATAC protocol (check Li et al. Genome Biology 2019)
224 |     3: footprint using NFR, NFR with nucleosome reads, and all reads will be computed
225 |     (three different footprint outputs - highly time consuming).
226 | 
227 | Optional parameters:
228 | -q  MAPQ_THR
229 |     Mapping quality threshold for bowtie2 alignment. Aligned reads with quality below this threshold are discarded. Default = 30.
230 | 
231 | -t  NUMTHREADS
232 |     Number of sorting, Bowtie2 mapping THREADS [Default = 1]. If multiple processing cores are available, user should specify values > 1 such as 4 or 8, for faster execution of Bowtie2.
233 | 
234 | -m  MAX_MEM
235 |     Set max memory used for PICARD duplication removal [Default = 8G].
236 | 
237 | -a  ALIGNVALIDMAX
238 |     Set the number of (max) valid alignments which will be searched [Default = 4] for Bowtie2.
239 | 
240 | -l  MAXFRAGLEN
241 |     Set the maximum fragment length to be used for Bowtie2 alignment [Default = 2000]
242 | 
243 | 
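As an illustration, a minimal single-sample invocation could look as follows. This is only a sketch: the fastq file names, sample prefix, and the genome / output paths are hypothetical placeholders, and the sample script "pipeline_exec.sh" in this package demonstrates the same pattern:

	bash bin/pipeline.sh -C configfile_hg38 \
		-f sample_R1.fastq.gz -r sample_R2.fastq.gz \
		-n 'SAMPLE1' \
		-g /home/sourya/genomes/bowtie2_index/hg38/ \
		-d /home/sourya/Results/SAMPLE1 \
		-w hg38 -D 1 -O 0 -F 1 -t 4 -m 8G
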
244 | Entries in the configuration file (first parameter)
245 | ---------------------------------------------------
246 | 
247 | The configuration file follows the format parameter=value
248 | 
249 | It is to be filled with the following entries (a filled-in sample follows this list):
250 | 
251 | picardexec=
252 |     Path of the Picard tool executable
253 |     Example: /home/sourya/packages/picard-tools/picard-tools-2.7.1/picard.jar
254 | 
255 | HOMERPath=
256 |     Path of HOMER (after installation)
257 |     Example: /home/sourya/packages/HOMER/bin/
258 | 
259 | DeepToolsDir=
260 |     Path of the deepTools executable
261 |     Example: /home/sourya/packages/deepTools/deepTools2.0/bin/
262 | 
263 | NarrowPeakASFile=
264 |     file (SQL) required to convert the narrowPeak file to the bigBed format
265 |     Download the file from this link (and save):
266 |     https://genome-source.gi.ucsc.edu/gitlist/kent.git/blob/master/src/hg/lib/encode/narrowPeak.as
267 |     Specify the location of this downloaded file:
268 |     Example: /home/sourya/genomes/chrsize/narrowPeak.as
269 | 
270 | BigNarrowPeakASFile=
271 |     file (SQL) required to convert the bigNarrowPeak file to the bigBed format
272 |     Download the file from this link (and save):
273 |     https://genome.ucsc.edu/goldenPath/help/examples/bigNarrowPeak.as
274 |     Specify the location of this downloaded file:
275 |     Example: /home/sourya/genomes/chrsize/bigNarrowPeak.as
276 | 
277 | BroadPeakASFile=
278 |     file (SQL) required to convert the broadPeak file to the bigBed format
279 |     Download the file from this link (and save):
280 |     https://genome-source.gi.ucsc.edu/gitlist/kent.git/blob/master/src/hg/lib/encode/broadPeak.as
281 |     Specify the location of this downloaded file:
282 |     Example: /home/sourya/genomes/chrsize/broadPeak.as
283 | 
284 | RefChrSizeFile=
285 |     file containing chromosome size information:
286 |     a two-column file storing the size of individual chromosomes
287 |     Downloaded from the link (depends on the reference genome employed):
288 |     For example, the hg38.chrom.sizes file for the hg38 database is located at
289 |     http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes.
290 |     Alternatively, use the "fetchChromSizes" script from the UCSC repository
291 |     to get the appropriate chromosome size file.
292 |     Specify the location of this downloaded file:
293 |     Example: /home/sourya/genomes/chrsize/hg38.chrom.sizes
294 | 
295 | RefChrFastaFile=
296 |     FASTA file of the reference genome.
297 |     Can be downloaded from the link: http://hgdownload.cse.ucsc.edu/downloads.html
298 |     Example: /home/sourya/genomes/Complete_Genome/hg38/hg38.fa
299 | 
300 | RefChrAnnotFile=
301 |     file containing reference genome specific annotation (.gtf format).
302 |     To be downloaded from the following links:
303 |     hg38: ftp://ftp.ensembl.org/pub/release-98/gtf/homo_sapiens/
304 |     hg19: ftp://ftp.ensembl.org/pub/grch37/current/gtf/homo_sapiens/
305 |     mm9: ftp://ftp.ensembl.org/pub/release-67/gtf/mus_musculus/
306 |     mm10: ftp://ftp.ensembl.org/pub/release-97/gtf/mus_musculus/
307 |     Example: /home/sourya/genomes/Annotation/hg38/hg38.gtf
308 | 
309 | BlackListFile=
310 |     file containing blacklisted regions corresponding to the reference genome.
311 |     To be downloaded from the link: https://github.com/Boyle-Lab/Blacklist/tree/master/lists (v2)
312 |     The file can be in gzipped or normal text format.
313 |     *Note: This parameter is optional.*
314 |     Example: /home/sourya/genomes/BlackListed_Regions/hg38-blacklist.v2.bed
315 | 
316 | ATAQVPath=
317 |     Path of the ataqv package (https://github.com/ParkerLab/ataqv) executable.
318 |     User needs to download the GitHub release (.tar.gz) file, extract it, and provide the ataqv executable path here.
319 |     Example: /home/sourya/packages/ataqv/ataqv-1.0.0/bin/ataqv
320 | 
321 | TSSFile=
322 |     File containing TSS information for the reference genome. Obtained using the gene annotation (GTF) file.
323 |     Example: /home/sourya/genomes/Annotation/hg38/hg38_TSS.gtf
324 | 
325 | 
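Putting these entries together, a filled-in configuration file for hg38 might look like the sketch below. Every value is simply the illustrative example path from the corresponding entry above; substitute the paths of your own installation:

	picardexec=/home/sourya/packages/picard-tools/picard-tools-2.7.1/picard.jar
	HOMERPath=/home/sourya/packages/HOMER/bin/
	DeepToolsDir=/home/sourya/packages/deepTools/deepTools2.0/bin/
	NarrowPeakASFile=/home/sourya/genomes/chrsize/narrowPeak.as
	BigNarrowPeakASFile=/home/sourya/genomes/chrsize/bigNarrowPeak.as
	BroadPeakASFile=/home/sourya/genomes/chrsize/broadPeak.as
	RefChrSizeFile=/home/sourya/genomes/chrsize/hg38.chrom.sizes
	RefChrFastaFile=/home/sourya/genomes/Complete_Genome/hg38/hg38.fa
	RefChrAnnotFile=/home/sourya/genomes/Annotation/hg38/hg38.gtf
	BlackListFile=/home/sourya/genomes/BlackListed_Regions/hg38-blacklist.v2.bed
	ATAQVPath=/home/sourya/packages/ataqv/ataqv-1.0.0/bin/ataqv
	TSSFile=/home/sourya/genomes/Annotation/hg38/hg38_TSS.gtf
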
326 | The last parameter, *TSSFile*, needs a special mention. User can apply the following awk script to the reference genome annotation file (indicated in the parameter *RefChrAnnotFile*) to produce a file with TSS information.
327 | 
328 | Assuming the user has downloaded the reference genome specific gene annotation file using one of the ftp links provided above, when the reference genome is either hg19, hg38 or mm10, the user can apply the following awk script to obtain a TSS file (input_TSS.gtf) from the gene annotation file (input.gtf) (Note: it is always best to check the .gtf file format):
329 | 
330 | 	awk -F'[\t]' '{if ((substr($1,1,1)!="#") && ($3=="transcript")) {if ($7=="+") {print "chr"$1"\t"$4"\t"$4"\t"$3"\t"$4"\t"$5"\t"$7"\t"$9} else {print "chr"$1"\t"$5"\t"$5"\t"$3"\t"$4"\t"$5"\t"$7"\t"$9}}}' input.gtf > input_TSS.gtf
331 | 
332 | When the reference genome is mm9, the user can apply the following script (it is best to check the .gtf file format):
333 | 
334 | 	awk -F'[\t]' '{if ((substr($1,1,1)!="#") && ($3=="exon")) {if ($7=="+") {print "chr"$1"\t"$4"\t"$4"\t"$3"\t"$4"\t"$5"\t"$7"\t"$9} else {print "chr"$1"\t"$5"\t"$5"\t"$3"\t"$4"\t"$5"\t"$7"\t"$9}}}' mm9.gtf > mm9_TSS.gtf
335 | 
336 | Describing output of ATAC-seq pipeline
337 | -----------------------------------------
338 | 
339 | Within the folder *OutDir* (specified by the configuration option -d), the following files (f) and folders (F) exist:
340 | 
341 | F1: Alignment_MAPQ${MAPQ_THR}
342 | 
343 | 	f1-1: Bowtie2_Init_Align.sam
344 | 		Initial alignment by Bowtie2 (if fastq files are provided as the input).
345 | 	f1-2: UniqMappedRead.bam
346 | 		Uniquely mapped reads.
347 | 	f1-3: Bowtie2_del_Random.bam
348 | 		Alignment after excluding reads from chromosomes other than autosomal chromosomes, chrX, and chrM.
349 | 	f1-4: Bowtie2_del_Mitch.bam:
350 | 		Alignment after excluding reads from chrM.
351 | 	f1-5: ${PREFIX}.align.sort.MAPQ${MAPQ_THR}.bam
352 | 		Sorted and MAPQ-thresholded alignment.
353 | 	f1-6: ${PREFIX}.align.sort.MAPQ${MAPQ_THR}.rmdup.bam
354 | 		De-duplicated alignment (used for subsequent operations)
355 | 	f1-7: ${PREFIX}.align.sort.MAPQ${MAPQ_THR}.picard_metrics.txt
356 | 		PICARD metrics log file corresponding to the duplicate removal operation.
357 | 	f1-8: ${PREFIX}.align.sort.MAPQ${MAPQ_THR}_TN5_Shift.bam
358 | 		**New in version 2.0:** De-duplicated reads whose forward and reverse strands are shifted by +4 bp and -5 bp, respectively, to account for the Tn5 transposase. Used to extract the nucleosome free and nucleosome containing regions.
359 | 	f1-9: ${PREFIX}.align.sort.MAPQ${MAPQ_THR}_TN5_Shift.bed
360 | 		**New in version 2.0:** BED-converted version of f1-8, used for MACS2 peak calling.
361 | 	f1-10: NucleosomeFree.bam
362 | 		**New in version 2.0:** Alignment with nucleosome free regions (NFR)
363 | 	f1-11: mononucleosome.bam
364 | 		**New in version 2.0:** Alignment with mononucleosome fragments
365 | 	f1-12: dinucleosome.bam
366 | 		**New in version 2.0:** Alignment with dinucleosome fragments
367 | 	f1-13: trinucleosome.bam
368 | 		**New in version 2.0:** Alignment with trinucleosome fragments
369 | 	f1-14: Merged_nucleosome.bam
370 | 		**New in version 2.0:** File containing fragments of nucleosome free and one or more nucleosomes (denoted as NFR +1M, in the HINT-ATAC genome biology paper). Generated by merging files f1-10 to f1-13.
371 | 
372 | F2: Out_BigWig
373 | 	f2-1: ${PREFIX}.bw
374 | 		bigwig file for track visualization.
375 | 
376 | F3: Out_BigWig_NormCov:
377 | 	f3-1: ${PREFIX}_NormCov.bw
378 | 		bigwig file for track visualization (after normalizing the coverage). Recommended to use this file for visualizing tracks in the UCSC genome browser.
379 | 
380 | F4: MACS2_Ext_*
381 | 	Contains peaks employing MACS2 with the parameters:
382 | 	--nomodel --nolambda --shift -100 --extsize 200 --keep-dup all --call-summits
383 | 	*Note: this parameter set is recommended for ATAC-seq, as mostly followed in existing studies.*
384 | 
385 | 	If the folder name is "*_No_Control", no control BAM file was used to infer the peaks. Otherwise, if the folder name is "*_With_Control", one or more control alignment files were used for inferring the peaks.
386 | 
387 | 	f4-1: *.narrowPeak: narrow peaks with p-value threshold of 0.01
388 | 	f4-2: *.narrowPeak_Q0.05filt: narrow peaks with FDR (q-value) threshold = 0.05
389 | 	f4-3: *.narrowPeak_Q0.01filt: narrow peaks with FDR threshold = 0.01
390 | 	f4-4: *.broadPeak: broad peaks with p-value threshold of 0.01
391 | 	f4-5: *.broadPeak_Q0.05filt: broad peaks with FDR threshold = 0.05
392 | 	f4-6: *.broadPeak_Q0.01filt: broad peaks with FDR threshold = 0.01
393 | 	f4-7: out_FRiP.txt: FRiP (fraction of reads in peaks) statistics for the narrow and broad peaks.
394 | 	f4-8: Peak_Statistics.txt: number of peaks in different settings.
395 | 	F4-9: Peak_Annotate_Q*:
396 | 		HOMER based annotations corresponding to the narrow peaks inferred by the corresponding FDR threshold (0.01 or 0.05). Contains the following files:
397 | 		f4-9-1: Out_Summary.log: summary text file containing the HOMER annotation.
398 | 		f4-9-2: Annotated_Peak_Q*filt.txt: Detailed HOMER annotation of the corresponding peaks.
399 | 		f4-9-3: Pie_Chart_Peak_Annotation.pdf: pie chart of peaks containing different annotations.
400 | 		f4-9-4: Peak_TSS_Distance.pdf: Histogram of distance between peaks and the closest TSS
401 | 	f4-10: Files of *.bb extension are big-bed formatted peaks, used to visualize those peaks in UCSC tracks.
402 | 
403 | F5: MACS2_Default_*
404 | 	Contains peaks employing default MACS2 parameters (generally not used for ATAC-seq processing, but we've kept it for comparison).
405 | 	File and folder structure is similar to F4.
406 | 
407 | f8: out_NRF_MAPQ${MAPQ_THR}.txt
408 | 	Metric NRF
409 | 
410 | f9: Read_Count_Stat.txt
411 | 	Read count statistics.
412 | 
413 | F10: QC_ataqv_ParkerLab_Test
414 | 	**New in version 2.0:** Folder containing the summary .json files generated by the package ATAQV, which, for different samples, can be combined into a summary statistic and displayed in a web browser.
415 | 
416 | F11: TSS_Enrichment_Peaks
417 | 	**New in version 2.0:** Processes the narrow peaks from the folder F4, and computes the TSS enrichment of these peaks. The underlying file structure is:
The underlying file structure is:
418 | 
419 | MACS2_Ext_*${CONTROLSTR}/macs2_narrowPeak_Q${FDRTHR}filt_Offset_${OFFSETVAL}/${PEAKTYPE}/*.pdf
420 | 
421 | where,
422 | ${CONTROLSTR}: "*_No_Control" or "*_With_Control", depending on the use of a control BAM file in inferring the peaks.
423 | ${FDRTHR}: FDR threshold. Can be either 0.01 or 0.05.
424 | ${OFFSETVAL}: can be either 1000 (1 Kb) or 5000 (5 Kb) (1 Kb or 5 Kb regions surrounding the TSS are checked for computing the TSS enrichment).
425 | ${PEAKTYPE}: can be "Complete_Peaks" (the complete set of peaks is used), "Promoter_Peaks" (only peaks located within 5 Kb of a TSS are considered), or "Enhancer_Peaks" (peaks excluding the promoter peaks).
426 | 
427 | 
428 | F12: Motif_MACS2_Ext_*${CONTROLSTR}_narrowPeak_Q${FDRTHR}filt
429 | **New in version 2.0:** TF footprinting analysis corresponding to the ATAC-seq peaks stored in F4. Here, ${CONTROLSTR} is either "*_No_Control" or "*_With_Control", depending on the use of a control BAM file in inferring the peaks. ${FDRTHR} is either 0.01 or 0.05.
430 | 
431 | The principle is to extract the peak summits and their surroundings (a number of bp, defined as an offset), and to compute the TF footprinting regions and the underlying motifs within these regions.
432 | 
433 | Within this folder, the file structure is as follows:
434 | Motif_${PEAKS_ANALYZED}_SummitOffset_${OFFSET}/Footprint_HINT_ATAC/${READTYPE}/footprints_HINT_ATAC.bed
435 | 
436 | where,
437 | ${PEAKS_ANALYZED}: can be "Complete_Peaks" (the complete set of peaks) or "Peaks_PvalThr_50" (only peaks with -log10(p-value) > 50 are considered).
438 | ${OFFSET}: can be either 200 or 500; the summit +/- offset bp regions are used for TF footprinting.
439 | ${READTYPE}: can be one of the following:
440 | "all" (all de-duplicated reads in the file f1-8 are considered),
441 | "NFR" (only the nucleosome free reads in the file f1-10 are considered),
442 | "NFRANDNucl" (the NFR regions and the +1M reads, indicated by the file f1-14, are considered).
443 | 
444 | In each case, the output file "footprints_HINT_ATAC.bed" contains the TF footprinting regions.
445 | 
446 | 
447 | Summarizing a set of ATAC-seq samples
448 | ---------------------------------------
449 | 
450 | Suppose a directory "/home/sourya/Results" contains the folders
451 | 1, 2, 3, 4, ..., each corresponding to the output of processing an individual ATAC-seq sample.
452 | 
453 | To get a summarized list of performance metrics for these samples, use the script *Analysis/ResSummary.r* with the following syntax:
454 | 
455 | Rscript ResSummary.r --BaseDir ${BaseDir} --OutDir ${OutDir}
456 | 
457 | where,
458 | 1) ${BaseDir}:
459 | Directory containing the results of all ATAC-seq sample analysis
460 | (like /home/sourya/Results as mentioned above). Mandatory parameter.
461 | 
462 | 2) ${OutDir}:
463 | Output directory to contain the summarized results. Default: current working directory.
464 | 
465 | For details of the ATAC-seq QC measures, the user may check this link:
466 | https://www.encodeproject.org/atac-seq/
467 | 
468 | Upon executing the R script, the following files are created within the specified ${OutDir}:
469 | 
470 | 1) Results_All_Samples_Summary.txt: summarized statistics for all samples.
471 | 2) Field_Description.txt: Summary description of individual fields / parameters.
472 | 3) TotalReadCount_Distribution.html: To be loaded in any web browser. Plot depicting the distribution of total reads for all samples.
473 | 4) Fraction_MappableReadCount_Distribution.html: Fraction of mappable reads for all samples.
474 | 5) Fraction_MitochondrialReadCount_Distribution.html: Fraction of mitochondrial reads for all samples.
475 | 6) Fraction_UniqueMappReadCount_Distribution.html: Fraction of uniquely mapped reads for all samples.
476 | 7) Fraction_LowQualReadCount_Distribution.html: Fraction of low quality reads for all samples.
477 | 8) Fraction_DuplicateReadCount_Distribution.html: Fraction of duplicate reads for all samples.
478 | 9) NRF_Distribution.html: NRF for all samples.
479 | 10) M1_Distribution.html: M1 metric for all samples.
480 | 11) M2_Distribution.html: M2 metric for all samples.
481 | 12) PBC1_Distribution.html: PBC1 metric for all samples.
482 | 13) PBC2_Distribution.html: PBC2 metric for all samples.
483 | 14) FRiP_Def_NoCtrl_Distribution.html: FRiP statistics for MACS2 peaks with the default command, and without using any control BAM files.
484 | 15) NumPeak_Def_NoCtrl_Distribution.html: Number of MACS2 peaks with the default command, and without using any control BAM files.
485 | 16) FRiP_Ext_NoCtrl_Distribution.html: FRiP statistics for MACS2 peaks with the --extsize option (recommended), and without using any control BAM files.
486 | 17) NumPeak_Ext_NoCtrl_Distribution.html: Number of MACS2 peaks with the --extsize option (recommended), and without using any control BAM files.
487 | 18) FRiP_Def_Ctrl_Distribution.html: FRiP statistics for MACS2 peaks with the default command, and when one or more control BAM files are used.
488 | 19) NumPeak_Def_Ctrl_Distribution.html: Number of MACS2 peaks with the default command, and when one or more control BAM files are used.
489 | 20) FRiP_Ext_Ctrl_Distribution.html: FRiP statistics for MACS2 peaks with the --extsize option (recommended), and when one or more control BAM files are used.
490 | 21) NumPeak_Ext_Ctrl_Distribution.html: Number of MACS2 peaks with the --extsize option (recommended), and when one or more control BAM files are used.
491 | 
492 | Command for executing IDR codes
493 | ---------------------------------
494 | 
495 | The current pipeline supports IDR analysis between either a list of ATAC-seq peak files
496 | or a list of alignment (BAM) files. In the second case, the BAM files are first
497 | subsampled to contain an equal number of reads (the minimum number of reads
498 | among the inputs), and subsequently, peaks are estimated from these
499 | (subsampled) BAM files using MACS2. These peaks are then used for the IDR analysis.
500 | 
501 | The script "sample_IDRScript.sh" included within this package
502 | shows how to call the following two scripts (both are included within the folder
503 | "IDR_Codes"):
504 | 
505 | 1) IDRMain.sh
506 | 
507 | 2) IDR_SubSampleBAM_Main.sh
508 | 
509 | The first script, IDRMain.sh, performs IDR between two or more
510 | input peak files (we have used peaks estimated by MACS2). The parameters
511 | corresponding to this script are as follows:
512 | 
513 | -I InpFile
514 | A list of input peak files (obtained from MACS2 - in .narrowPeak or .narrowPeak.gz format).
515 | At least two peak files are required.
516 | 
517 | -P PathIDRCode
518 | Path of the IDRCode package (Kundaje et al.) after its installation.
519 | Please check the "Required packages" section for the details.
520 | 
521 | -d OutDir
522 | Output directory (absolute path preferred) which will store the IDR results.
523 | 
524 | -n PREFIX
525 | Prefix of output files. Default 'IDR_ATAC'.
526 | 
527 | A sample execution of this script is as follows:
528 | 
529 | ./IDRMain.sh -I peak1.narrowPeak -I peak2.narrowPeak -I peak3.narrowPeak -P /home/sourya/packages/idrCode/ -d /home/sourya/OutDir_IDR -n 'IDR_test'
530 | 
531 | 
532 | 
533 | The second script, IDR_SubSampleBAM_Main.sh, takes as input two or more BAM files,
534 | estimates peaks from these BAM files, and then performs the IDR analysis. The parameters
535 | corresponding to this script are as follows:
536 | 
537 | -I InpFile
538 | A list of input BAM files. At least two BAM files are required.
539 | 
540 | -P PathIDRCode
541 | Path of the IDRCode package (Kundaje et al.) after its installation.
542 | Please check the "Required packages" section for the details.
543 | 
544 | -d OutDir
545 | Output directory (absolute path preferred) which will store the IDR results.
546 | 
547 | -n PREFIX
548 | Prefix of output files. Default 'IDR_ATAC'.
549 | 
550 | -c CountPeak
551 | Number of peaks in both replicates that will be compared for the IDR analysis.
552 | Default 25000.
553 | 
554 | -C CONTROLBAM
555 | Control file (either a .bam file or a tagalign file in .gz format)
556 | used to estimate the peaks with MACS2. The user may leave this field
557 | blank if no control file is available.
558 | 
559 | A sample execution of this script is as follows:
560 | 
561 | ./IDR_SubSampleBAM_Main.sh -I inpfile1.bam -I inpfile2.bam -P /home/sourya/packages/idrCode/ -d /home/sourya/OutDir_IDR -n 'IDR_test' -c 25000 -C control.bam
562 | 
563 | 
564 | Describing output of IDR analysis
565 | ----------------------------------
566 | 
567 | In the output directory "OutDir" specified in the IDR script, the following
568 | files (f) and folders (F) exist:
569 | 
570 | F1: Folders named ${i}_and_${j}, where 0 <= i < j <= N-1 and N is
571 | the number of replicates analyzed. Individual folders contain the results of the
572 | pairwise IDR analysis. For example, the folder 0_and_1 contains the IDR analysis
573 | for sample 0 (first replicate) and sample 1 (second replicate).
574 | 
575 | f1: "Replicate_Names.txt": names of the replicate samples used for the IDR analysis.
576 | 
577 | f2: Input_Peak_Statistics.txt: number of peaks in each of the compared replicates.
578 | 
579 | f3: IDR_Batch_Plot-plot.pdf: final IDR plot. Here the individual pairs (whose results
580 | are stored in the above mentioned folders) are numbered 1, 2, ...
581 | Considering N = 3, the number of possible pairs is also 3. Here,
582 | the number 1 denotes the folder (pair) 0_and_1,
583 | 2 denotes the folder (pair) 0_and_2, and 3 denotes the
584 | folder (pair) 1_and_2.
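For reference, the following small shell sketch (illustrative only; it is not part of the pipeline, and N=3 is just an example value) enumerates this mapping between the plot indices and the pair folders for any number of replicates N:

N=3
k=1
for (( i=0; i<N-1; i++ )); do
  for (( j=i+1; j<N; j++ )); do
    echo "plot index $k -> folder ${i}_and_${j}"
    k=$((k+1))
  done
done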
585 | 
586 | 
587 | Contact
588 | -----------
589 | 
590 | For any queries, please generate a GitHub issue, or alternatively, e-mail us:
591 | 
592 | Sourya Bhattacharyya (sourya@lji.org)
593 | 
594 | Ferhat Ay (ferhatay@lji.org)
595 | 
596 | Pandurangan Vijayanand (vijay@lji.org)
597 | 
598 | 
--------------------------------------------------------------------------------
/bin/ATACSeqQC.r:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | 
3 | #==================================
4 | # used for ATAC seq quality analysis
5 | # adapted from the link:
6 | # https://bioconductor.org/packages/release/bioc/vignettes/ATACseqQC/inst/doc/ATACseqQC.html
7 | 
8 | # check the code Install_Bioconductor_Packages_using_BioCLite.R
9 | # to find out the required packages
10 | 
11 | #==================================
12 | # author: Sourya Bhattacharyya
13 | # Vijay-AY lab
14 | #==================================
15 | 
16 | suppressPackageStartupMessages({
17 | library(ATACseqQC)
18 | library(ChIPpeakAnno)
19 | library(MotifDb)
20 | library(GenomicRanges)
21 | library(GenomicAlignments)
22 | library(optparse)
23 | })
24 | 
25 | #===========================================================
26 | option_list = list(
27 | make_option(c("--AlignFile"), type="character", default=NULL, help="Input alignment file. Must be accompanied by an index file (generated by samtools)."),
28 | make_option(c("--RefGenome"), type="character", default=NULL, help="Reference genome name."),
29 | make_option(c("--OutDirQC"), type="character", default=NULL, help="Output directory to contain the QC related statistics."),
30 | make_option(c("--PE"), type="integer", action="store", default=0, help="If 1, indicates paired end input data. Default = 0")
31 | );
32 | 
33 | opt_parser = OptionParser(option_list=option_list);
34 | opt = parse_args(opt_parser);
35 | 
36 | # dynamic loading of libraries based on the reference genome
37 | if (opt$RefGenome == 'hg19') {
38 | library(BSgenome.Hsapiens.UCSC.hg19)
39 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
40 | library(phastCons100way.UCSC.hg19)
41 | } else if (opt$RefGenome == 'hg38') {
42 | library(BSgenome.Hsapiens.UCSC.hg38)
43 | library(TxDb.Hsapiens.UCSC.hg38.knownGene)
44 | library(phastCons100way.UCSC.hg38)
45 | } else if (opt$RefGenome == 'mm10') {
46 | library(BSgenome.Mmusculus.UCSC.mm10)
47 | library(TxDb.Mmusculus.UCSC.mm10.knownGene)
48 | library(phastCons60way.UCSC.mm10)
49 | } else if (opt$RefGenome == 'mm9') {
50 | library(BSgenome.Mmusculus.UCSC.mm9)
51 | library(TxDb.Mmusculus.UCSC.mm9.knownGene)
52 | library(phastCons60way.UCSC.mm9)
53 | }
54 | 
55 | 
56 | # create the output directory for QC
57 | outdir <- opt$OutDirQC
58 | system(paste("mkdir -p", outdir))
59 | 
60 | # input the bamFile from the ATACseqQC package
61 | # bamfile <- system.file("extdata", inpbamfilename, package="ATACseqQC", mustWork=TRUE)
62 | bamfile <- opt$AlignFile
63 | bamfile.labels <- gsub(".bam", "", basename(bamfile))
64 | 
65 | # store the current directory
66 | currdir <- getwd()
67 | 
68 | # go to the directory "outdir"
69 | setwd(outdir)
70 | 
71 | #=====================
72 | # Check alignment metrics and mapping quality
73 | bam_QC_textfile <- paste0(outdir, '/Summary_QC.txt')
74 | if (file.exists(bam_QC_textfile) == FALSE) {
75 | fp_out <- file(bam_QC_textfile, "w")
76 | outtext <- paste0("\n *** Quality control measures for the alignment file ", opt$AlignFile, " are **** \n")
77 | writeLines(outtext, con=fp_out, sep="\n")
78 | close(fp_out)
79 | # quality control summary
80 | capture.output(bamQC(bamfile, outPath=NULL), file=bam_QC_textfile, append=TRUE)
81 | cat(sprintf("\n *** Computed the summary statistics of the input ATAC seq file **** \n"))
82 | }
83 | 
84 | # #=====================
85 | # Estimate the library complexity
86 | # sourya - commented since old bioconductor version does not support this function
87 | if (0) {
88 | plotfile <- paste0(outdir, '/Library_Complexity.pdf')
89 | pdf(plotfile, width=6, height=4)
90 | estimateLibComplexity(readsDupFreq(bamfile))
91 | dev.off()
92 | cat(sprintf("\n *** Computed library complexity **** \n"))
93 | }
94 | 
95 | #=====================
96 | # works only when the input is single end
97 | if (opt$PE == 0) {
98 | # shift the BAM file - forward strand by +4 bp
99 | possibleTag <- c("AS", "XN", "XM", "XO", "XG", "NM", "MD", "YS", "YT")
100 | gal <- readBamFile(bamfile, asMates=FALSE)
101 | shiftedBamfile <- paste0(outdir, '/shifted.bam')
102 | gal1 <- shiftGAlignmentsList(gal)
103 | export(gal1, shiftedBamfile)
104 | cat(sprintf("\n *** shifted bam file **** \n"))
105 | }
106 | 
107 | #=====================
108 | # works only when the input is paired end
109 | if (opt$PE == 1) {
110 | 
111 | # fragment size distribution (main QC metric)
112 | plotfile <- paste0(outdir, '/Fragment_Size_Distribution.pdf')
113 | if (file.exists(plotfile) == FALSE) {
114 | pdf(plotfile, width=6, height=4)
115 | fragSizeDist(bamfile, bamfile.labels)
116 | dev.off()
117 | cat(sprintf("\n *** Computed the fragment size distribution (paired end read) **** \n"))
118 | }
119 | 
120 | # prepare the tags
121 | # obtained from:
122 | # https://bioinformatics-core-shared-training.github.io/cruk-summer-school-2019/ChIPSeq/Materials/Practicals/Day5/Practical01_ATAC-seq_analysis_SS.html
123 | # and from
124 | # https://bioconductor.org/packages/release/bioc/vignettes/ATACseqQC/inst/doc/ATACseqQC.html
125 | 
126 | # option 1: all combination
127 | # possibleTag <- combn(LETTERS, 2)
128 | # possibleTag <- c(paste0(possibleTag[1, ], possibleTag[2, ]), paste0(possibleTag[2, ], possibleTag[1, ]))
129 | # cat(sprintf("\n length of possibleTag : %s ", length(possibleTag)))
130 | 
131 | # option 2: specified combinations
132 | possibleTag <- c("AS", "XN", "XM", "XO", "XG", "NM", "MD", "YS", "YT")
133 | 
134 | # prepare the seqlev input
135 | # by default seqlev = paste0("chr", c(1:22, "X", "Y"))
136 | # but it should be checked with the BAM header
137 | seqlevset_default <- paste0("chr", c(1:22, "X", "Y"))
138 | tempBAMHeaderFile <- paste0(outdir, '/t.bed')
139 | system(paste("samtools view -H ", bamfile, " | awk -F\'[\t]\' \'($1==\"@SQ\")\' - > ", tempBAMHeaderFile))
140 | nline <- as.integer(system(paste("cat", tempBAMHeaderFile, "| wc -l"), intern = TRUE))
141 | if (nline > 0) {
142 | x <- read.table(tempBAMHeaderFile, header=F, sep="\t", stringsAsFactors=F)
143 | y <- x[,2]
144 | z <- as.vector(unlist(strsplit(y, ":")))
145 | seqlevset_final <- intersect(seqlevset_default, z)
146 | cat(sprintf("\n ==>> chromosomes considered : %s ", paste(as.vector(seqlevset_final), sep="\t")))
147 | } else {
148 | seqlevset_final <- seqlevset_default
149 | }
150 | system(paste("rm", tempBAMHeaderFile))
151 | 
152 | # compute promoter transcript body score
153 | if (opt$RefGenome == 'hg19') {
154 | txs <- transcripts(TxDb.Hsapiens.UCSC.hg19.knownGene)
155 | } else if (opt$RefGenome == 'hg38') {
156 | txs <- transcripts(TxDb.Hsapiens.UCSC.hg38.knownGene)
157 | } else if (opt$RefGenome == 'mm10') {
158 | txs <-
transcripts(TxDb.Mmusculus.UCSC.mm10.knownGene) 159 | } else if (opt$RefGenome == 'mm9') { 160 | txs <- transcripts(TxDb.Mmusculus.UCSC.mm9.knownGene) 161 | } 162 | cat(sprintf("\n ==>> read transcript information ")) 163 | 164 | for (chrIdx in 1:length(seqlevset_final)) { 165 | currChr_seqlev <- seqlevset_final[chrIdx] 166 | cat(sprintf("\n extracting nucleosome specific BAM - chromosome : %s ", currChr_seqlev)) 167 | 168 | # extract current chromosome specific genome information 169 | if ((opt$RefGenome == 'hg19') | (opt$RefGenome == 'hg38')) { 170 | which_currChr <- as(seqinfo(Hsapiens)[currChr_seqlev], "GRanges") 171 | } else if ((opt$RefGenome == 'mm10') | (opt$RefGenome == 'mm9')) { 172 | which_currChr <- as(seqinfo(Mmusculus)[currChr_seqlev], "GRanges") 173 | } 174 | 175 | # read the BAM file for the current chromosome 176 | # and shift it in both ends 177 | gal <- readBamFile(bamfile, tag=possibleTag, which=which_currChr, asMates=TRUE) 178 | cat(sprintf("\n read bam file for the current chromosome : %s ", currChr_seqlev)) 179 | 180 | # shift the GAlignmentsLists by 5' ends. 181 | # All reads aligning to the positive strand will be offset by +4bp, 182 | # and all reads aligning to the negative strand will be offset -5bp by default. 183 | shiftedBamfile <- file.path(outdir, 'shifted.bam') 184 | gal1 <- shiftGAlignmentsList(gal) 185 | export(gal1, shiftedBamfile) 186 | cat(sprintf("\n shifted bam file for the current chromosome : %s ", currChr_seqlev)) 187 | 188 | # following code is to get the nucleosome specific 189 | # and nucleosome free regions 190 | # but this is giving some problems - comment - sourya 191 | if (0) { 192 | 193 | # extract transcripts for the current chromosome 194 | txs_currChr_seqlev <- txs[seqnames(txs) %in% currChr_seqlev] 195 | cat(sprintf("\n extracted transcripts for the current chromosome : %s ", currChr_seqlev)) 196 | 197 | # call the split BAM routine for the current chromosome 198 | if (opt$RefGenome == 'hg19') { 199 | objs <- splitGAlignmentsByCut(gal1, txs=txs_currChr_seqlev, genome=Hsapiens, conservation=phastCons100way.UCSC.hg19) 200 | } else if (opt$RefGenome == 'hg38') { 201 | objs <- splitGAlignmentsByCut(gal1, txs=txs_currChr_seqlev, genome=Hsapiens, conservation=phastCons100way.UCSC.hg38) 202 | } else if (opt$RefGenome == 'mm10') { 203 | objs <- splitGAlignmentsByCut(gal1, txs=txs_currChr_seqlev, genome=Mmusculus, conservation=phastCons60way.UCSC.mm10) 204 | } else if (opt$RefGenome == 'mm9') { 205 | objs <- splitGAlignmentsByCut(gal1, txs=txs_currChr_seqlev, genome=Mmusculus, conservation=phastCons60way.UCSC.mm9) 206 | } 207 | cat(sprintf("\n created objs for the current chromosome : %s ", currChr_seqlev)) 208 | 209 | # export the binned alignments into bam files. 
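# (writeListOfGAlignments, called next, writes one BAM file per fragment category: NucleosomeFree, mono-/di-/tri-nucleosome, inter1-3, and others)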
210 | null <- writeListOfGAlignments(objs, outdir) 211 | cat(sprintf("\n extracted nucleosome specific reads for the current chromosome : %s ", currChr_seqlev)) 212 | 213 | # now rename the files so that chromosome specific information is preserved 214 | system(paste("mv NucleosomeFree.bam", paste0("NucleosomeFree_", currChr_seqlev, ".bam"))) 215 | system(paste("mv NucleosomeFree.bam.bai", paste0("NucleosomeFree_", currChr_seqlev, ".bam.bai"))) 216 | system(paste("mv mononucleosome.bam", paste0("mononucleosome_", currChr_seqlev, ".bam"))) 217 | system(paste("mv mononucleosome.bam.bai", paste0("mononucleosome_", currChr_seqlev, ".bam.bai"))) 218 | system(paste("mv dinucleosome.bam", paste0("dinucleosome_", currChr_seqlev, ".bam"))) 219 | system(paste("mv dinucleosome.bam.bai", paste0("dinucleosome_", currChr_seqlev, ".bam.bai"))) 220 | system(paste("mv trinucleosome.bam", paste0("trinucleosome_", currChr_seqlev, ".bam"))) 221 | system(paste("mv trinucleosome.bam.bai", paste0("trinucleosome_", currChr_seqlev, ".bam.bai"))) 222 | system(paste("mv inter1.bam", paste0("inter1_", currChr_seqlev, ".bam"))) 223 | system(paste("mv inter1.bam.bai", paste0("inter1_", currChr_seqlev, ".bam.bai"))) 224 | system(paste("mv inter2.bam", paste0("inter2_", currChr_seqlev, ".bam"))) 225 | system(paste("mv inter2.bam.bai", paste0("inter2_", currChr_seqlev, ".bam.bai"))) 226 | system(paste("mv inter3.bam", paste0("inter3_", currChr_seqlev, ".bam"))) 227 | system(paste("mv inter3.bam.bai", paste0("inter3_", currChr_seqlev, ".bam.bai"))) 228 | system(paste("mv others.bam", paste0("others_", currChr_seqlev, ".bam"))) 229 | system(paste("mv others.bam.bai", paste0("others_", currChr_seqlev, ".bam.bai"))) 230 | 231 | } # end dummy if 232 | # end code comment - sourya 233 | 234 | # only rename the shifted files 235 | system(paste("mv shifted.bam", paste0("shifted_", currChr_seqlev, ".bam"))) 236 | system(paste("mv shifted.bam.bai", paste0("shifted_", currChr_seqlev, ".bam.bai"))) 237 | 238 | } # end chromosome loop 239 | 240 | cat(sprintf("\n *** Split alignments **** \n")) 241 | 242 | # sourya - commented since old bioconductor version does not support this function 243 | if (0) { 244 | pt <- PTscore(gal1, txs) 245 | plotfile <- paste0(outdir, '/promoter_transcript_body_score.pdf') 246 | pdf(plotfile, width=6, height=4) 247 | plot(pt$log2meanCoverage, pt$PT_score, xlab="log2 mean coverage", ylab="Promoter vs Transcript") 248 | dev.off() 249 | } 250 | 251 | # Nucleosome Free Regions (NFR) score 252 | # sourya - commented since old bioconductor version does not support this function 253 | if (0) { 254 | nfr <- NFRscore(gal1, txs) 255 | plotfile <- paste0(outdir, '/Nucleosome_Free_Regions_score.pdf') 256 | pdf(plotfile, width=6, height=4) 257 | plot(nfr$log2meanCoverage, nfr$NFR_score, xlab="log2 mean coverage", ylab="Nucleosome Free Regions score", main="NFRscore for 200bp flanking TSSs", xlim=c(-10, 0), ylim=c(-5, 5)) 258 | dev.off() 259 | } 260 | 261 | # Transcription Start Site (TSS) Enrichment Score 262 | # sourya - commented since old bioconductor version does not support this function 263 | if (0) { 264 | tsse <- TSSEscore(gal1, txs) 265 | capture.output(summary(tsse$TSS.enrichment.score), file=paste0(outdir, '/Summary_TSS_Enrichment_Score.txt'), append=FALSE) 266 | } 267 | 268 | 269 | # Heatmap and coverage curve for nucleosome positions 270 | # sourya - commented since old bioconductor version does not support this function 271 | if (0) { 272 | nucleosome_positions_bamfiles <- file.path(outdir, 
c("NucleosomeFree.bam", "mononucleosome.bam", "dinucleosome.bam", "trinucleosome.bam")) 273 | plotfile <- paste0(outdir, '/Nucleosome_cumulative_Percentage.pdf') 274 | pdf(plotfile, width=6, height=4) 275 | if ((opt$RefGenome == 'hg19') | (opt$RefGenome == 'hg38')) { 276 | cumulativePercentage(nucleosome_positions_bamfiles[1:2], as(seqinfo(Hsapiens), "GRanges")) 277 | } else if (opt$RefGenome == 'mm10') { 278 | cumulativePercentage(nucleosome_positions_bamfiles[1:2], as(seqinfo(Mmusculus), "GRanges")) 279 | } else if (opt$RefGenome == 'mm9') { 280 | cumulativePercentage(nucleosome_positions_bamfiles[1:2], as(seqinfo(Mmusculus), "GRanges")) 281 | } 282 | dev.off() 283 | cat(sprintf("\n *** Performed function - Nucleosome_cumulative_Percentage **** \n")) 284 | } 285 | 286 | # TSS statistics 287 | # sourya - commented since old bioconductor version does not support this function 288 | if (0) { 289 | TSS <- promoters(txs, upstream=0, downstream=1) 290 | TSS <- unique(TSS) 291 | # estimate the library size for normalization 292 | (librarySize <- estLibSize(nucleosome_positions_bamfiles)) 293 | # calculate the signals around TSSs 294 | NTILE <- 101 295 | dws <- ups <- 1010 296 | sigs <- enrichedFragments(gal=objs[c("NucleosomeFree", "mononucleosome", "dinucleosome", "trinucleosome")], TSS=TSS, librarySize=librarySize, TSS.filter=0.5, n.tile = NTILE, upstream = ups, downstream = dws) 297 | # log2 transformed signals 298 | sigs.log2 <- lapply(sigs, function(.ele) log2(.ele+1)) 299 | # plot heatmap 300 | plotfile <- paste0(outdir, '/Heatmap_signal_around_TSS.pdf') 301 | pdf(plotfile, width=6, height=4) 302 | featureAlignedHeatmap(sigs.log2, reCenterPeaks(TSS, width=ups+dws), zeroAt=.5, n.tile=NTILE) 303 | dev.off() 304 | cat(sprintf("\n *** Performed featureAlignedHeatmap **** \n")) 305 | 306 | # get signals normalized for nucleosome-free and nucleosome-bound regions 307 | out_Align_Distr <- featureAlignedDistribution(sigs, reCenterPeaks(TSS, width=ups+dws), zeroAt=.5, n.tile=NTILE, type="l", ylab="Averaged coverage") 308 | # rescale the nucleosome-free and nucleosome signals to 0~1 309 | range01 <- function(x){(x-min(x))/(max(x)-min(x))} 310 | out_Align_Distr <- apply(out_Align_Distr, 2, range01) 311 | plotfile <- paste0(outdir, '/nucleosome_free_and_nucleosome_signals.pdf') 312 | pdf(plotfile, width=6, height=4) 313 | matplot(out_Align_Distr, type="l", xaxt="n", xlab="Position (bp)", ylab="Fraction of signal") 314 | axis(1, at=seq(0, 100, by=10)+1, labels=c("-1K", seq(-800, 800, by=200), "1K"), las=2) 315 | abline(v=seq(0, 100, by=10)+1, lty=2, col="gray") 316 | dev.off() 317 | cat(sprintf("\n *** Performed featureAlignedDistribution **** \n")) 318 | } 319 | 320 | } # paired end condition 321 | 322 | # return to the original directory 323 | setwd(currdir) 324 | 325 | -------------------------------------------------------------------------------- /bin/BigWigTrackCreate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | #PBS -l nodes=1:ppn=4 3 | #PBS -l mem=10GB 4 | #PBS -l walltime=24:00:00 5 | #PBS -m ae 6 | #PBS -j eo 7 | #PBS -V 8 | source ~/.bashrc 9 | #source ~/.bash_profile 10 | hostname 11 | TMPDIR=/scratch 12 | cd $PBS_O_WORKDIR 13 | 14 | #================================= 15 | # this program denotes a sample pipeline for ATAC-seq data 16 | # applicable only a single fastq or alignment file is provided 17 | #================================= 18 | # developed by - Sourya Bhattacharyya 19 | # Vijay-AY lab 20 | # La Jolla Institute 
21 | #=================================
22 | 
23 | # usage info
24 | usage(){
25 | cat << EOF
26 | 
27 | usage:
28 | 
29 | Options:
30 | 
31 | -- required:
32 | -I InpFile Input alignment file (Bowtie2 aligned)
33 | -n PREFIX Prefix of output files.
34 | -d OutDir Set the output directory which will contain all the results
35 | -w BigWigGenome The reference genome which is used to convert the BAM file to a BigWig file (such as 'hg19', 'mm9', etc.)
36 | 
37 | EOF
38 | }
39 | 
40 | while getopts "n:I:d:w:" opt;
41 | do
42 | case "$opt" in
43 | n) PREFIX=$OPTARG;;
44 | I) InpFile=$OPTARG;;
45 | d) OutDir=$OPTARG;;
46 | w) BigWigGenome=$OPTARG;;
47 | \?) usage
48 | echo "error: unrecognized option -$OPTARG";
49 | exit 1
50 | ;;
51 | esac
52 | done
53 | 
54 | # executable to convert the sorted bam file to the bigwig format
55 | BigWigCreateExec='/home/sourya/proj/utils/bam_to_bigwig.sh'
56 | 
57 | #======================
58 | # convert the alignment file to the bigwig data format
59 | # for track visualization
60 | #======================
61 | BigWigoutdir=$OutDir'/Out_BigWig'
62 | mkdir -p $BigWigoutdir
63 | 
64 | # we use the sorted (before duplicate removal) bam file
65 | $BigWigCreateExec -I $InpFile -g $BigWigGenome -d $BigWigoutdir -n $PREFIX
66 | 
--------------------------------------------------------------------------------
/bin/CorrelationBAMPeak.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #=================================
4 | # this program is a supporting script for the ATAC seq pipeline
5 | 
6 | # inputs:
7 | # 1) a set of input bam files (requires sorted bam files, and possibly indexed as well)
8 | # 2) a set of peak files
9 | # 3) max no of peaks to be considered
10 | 
11 | # the union of the given peak files is used to compute the coverage with respect to the individual input bam files;
12 | # subject to a minimum coverage (number of reads) threshold
13 | # and a given threshold on the max number of peaks to be considered,
14 | # the bam files are subsampled to cover only the selected subset of the union peak set,
15 | # and the correlation between these subsampled bam files is computed
16 | 
17 | #=================================
18 | # developed by - Sourya Bhattacharyya
19 | # Vijay-AY lab
20 | # La Jolla Institute for Allergy and Immunology
21 | #=================================
22 | 
23 | 
24 | # usage info
25 | usage(){
26 | cat << EOF
27 | 
28 | Options:
29 | 
30 | -- required:
31 | -B BAM One or more input bam files (sorted)
32 | -P NarrowPeak One or more input narrow peak files (corresponding to the input bam files)
33 | and in the same order as the input bam files
34 | -L Labels One or more strings (labels) corresponding to the input bam files.
35 | -D OutDir Output directory storing the correlation results
36 | -r ReadCount A threshold (integer) on the minimum number of reads (coverage) that each peak should
37 | contain. Default = 5 (according to the Greenleaf 2018 paper)
38 | -c PeakCount Number of peaks to be randomly selected from the union set of peaks.
39 | Default = 50000 (according to the Greenleaf 2018 paper)
40 | -O Overwrite this boolean option signifies whether existing output files would
41 | be overwritten (1) or not (0).
42 | Default = 0
43 | 
44 | EOF
45 | }
46 | 
47 | # default minimum coverage threshold for each peak
48 | ReadCountThr=5
49 | 
50 | # default threshold of the number of peaks
51 | PeakCountThr=50000
52 | 
53 | # default output directory
54 | OutDir=`pwd`'/'
55 | 
56 | # this boolean option signifies whether existing output
57 | # files would be overwritten (1) or not (0).
58 | # Default = 0
59 | Overwrite=0
60 | 
61 | while getopts "B:P:D:r:c:O:L:" opt;
62 | do
63 | case "$opt" in
64 | B) BAMFILES+=($OPTARG);; # one or more bam input files can be provided
65 | P) PEAKFILES+=($OPTARG);; # one or more peak input files can be provided
66 | L) Labels+=($OPTARG);; # labels corresponding to individual input bam files
67 | D) OutDir=$OPTARG;;
68 | r) ReadCountThr=$OPTARG;;
69 | c) PeakCountThr=$OPTARG;;
70 | O) Overwrite=$OPTARG;;
71 | \?) usage
72 | echo "error: unrecognized option -$OPTARG";
73 | exit 1
74 | ;;
75 | esac
76 | done
77 | 
78 | 
79 | #----------------------------------
80 | # important - sourya
81 | # change the current directory as the dir containing this executable
82 | # since other source files relative to the current directory need to be called
83 | current_dir=$(pwd)
84 | script_dir=$(dirname $0)
85 | cd $script_dir
86 | #----------------------------------
87 | 
88 | nbamfiles=${#BAMFILES[@]}
89 | echo 'number of bam files provided: '$nbamfiles
90 | 
91 | npeakfiles=${#PEAKFILES[@]}
92 | echo 'number of peak files provided: '$npeakfiles
93 | 
94 | nlabels=${#Labels[@]}
95 | echo 'number of labels provided: '$nlabels
96 | 
97 | # if [[ $nbamfiles != $npeakfiles ]]; then
98 | # echo "Number of input bam files and the number of peak files do not match - return !!!"
99 | # exit 1
100 | # fi
101 | 
102 | # check if the input bam files are all indexed
103 | # otherwise index the bam files
104 | listbamfiles=''
105 | for (( i=0; i<${nbamfiles}; i++ ));
106 | do
107 | currbamfile=${BAMFILES[i]}
108 | if [[ $i == 0 ]]; then
109 | listbamfiles=$currbamfile
110 | else
111 | listbamfiles=$listbamfiles' '$currbamfile
112 | fi
113 | echo 'processing the bam file index: '$i' name: '$currbamfile
114 | if [ !
-f $currbamfile'.bai' ]; then 115 | samtools index $currbamfile 116 | fi 117 | done 118 | echo 'listbamfiles: '$listbamfiles 119 | 120 | # list of labels 121 | if [[ $nlabels == $nbamfiles ]]; then 122 | listlabels='' 123 | # also required a colon separated list 124 | listlabelsRscript='' 125 | for (( i=0; i<${nlabels}; i++ )); 126 | do 127 | if [[ $i == 0 ]]; then 128 | listlabels=${Labels[i]} 129 | listlabelsRscript=${Labels[i]} 130 | else 131 | listlabels=$listlabels' '${Labels[i]} 132 | listlabelsRscript=$listlabelsRscript':'${Labels[i]} 133 | fi 134 | done 135 | echo 'listlabels: '$listlabels 136 | echo 'listlabelsRscript: '$listlabelsRscript 137 | fi 138 | 139 | # list of peak files (if provided) 140 | if [[ $npeakfiles == $nbamfiles ]]; then 141 | listpeakfiles='' 142 | for (( i=0; i<${npeakfiles}; i++ )); 143 | do 144 | echo 'processing the peak file index: '$i' name: '${PEAKFILES[i]} 145 | if [[ $i == 0 ]]; then 146 | listpeakfiles=${PEAKFILES[i]} 147 | else 148 | listpeakfiles=$listpeakfiles' '${PEAKFILES[i]} 149 | fi 150 | done 151 | echo 'listpeakfiles: '$listpeakfiles 152 | fi 153 | 154 | #============================= 155 | # two cases: 156 | # 1) when peak files are provided, and the subsampled union of peaks are used for correlation 157 | # 2) or, when peaks are not provided, and whole bam files are used for correlation 158 | #============================== 159 | if [[ $npeakfiles == $nbamfiles ]]; then 160 | 161 | # union of the input peak files 162 | UnionPeakFile=$OutDir'/Union_Peaks_Original.bed' 163 | 164 | echo '***** before computing '$UnionPeakFile' *****' 165 | 166 | if [[ ! -f $UnionPeakFile || $Overwrite == 1 ]]; then 167 | cat $listpeakfiles | cut -f1-3 | sort -k1,1 -k2,2n | mergeBed -i stdin > $UnionPeakFile 168 | fi 169 | 170 | echo '***** after computing '$UnionPeakFile' *****' 171 | 172 | # now perform the coverage of this union peaks 173 | # with respect to input bam files 174 | coverageoutfile=$OutDir'/Union_Peaks_Original_CoverageVal.bed' 175 | 176 | echo '***** before computing '$coverageoutfile' *****' 177 | 178 | if [[ ! 
-f $coverageoutfile || $Overwrite == 1 ]]; then
179 | # earlier command - sourya
180 | # bedtools multicov -bams ${listbamfiles} -bed ${UnionPeakFile} > ${coverageoutfile}
181 | 
182 | # modified command - sourya
183 | # we found that supplying all of the bam files together in the "multicov" function
184 | # results in errors, probably due to mismatching headers in different bam files
185 | # so we supply one bam file at a time,
186 | # compute the coverage with respect to individual bam files
187 | # and sequentially merge all the information
188 | for (( i=0; i<${nbamfiles}; i++ ));
189 | do
190 | # temp output file
191 | tempout=$OutDir'/temp_out_union_peaks_coverage.bed'
192 | bedtools multicov -bams ${BAMFILES[i]} -bed ${UnionPeakFile} > $tempout
193 | if [ $i == 0 ]; then
194 | # first iteration - rename the temporary output file to the
195 | # final output file
196 | mv $tempout $coverageoutfile
197 | else
198 | # subsequent iteration
199 | # use bedtools map function to merge the existing contents
200 | # of "coverageoutfile" with the new "tempout" contents
201 | tempout2=$OutDir'/temp_out_union_peaks_coverage2.bed'
202 | bedtools map -c 4 -o mean -null '0' -a $coverageoutfile -b $tempout > $tempout2
203 | # remove the old instance of coverage output file
204 | # and use the newly constructed file
205 | rm $coverageoutfile
206 | mv $tempout2 $coverageoutfile
207 | fi
208 | done
209 | # remove the temporary files
210 | if [ -f $tempout ]; then
211 | rm $tempout
212 | fi
213 | if [ -f $tempout2 ]; then
214 | rm $tempout2
215 | fi
216 | fi
217 | 
218 | echo '***** after computing '$coverageoutfile' *****'
219 | 
220 | coverageoutfileThr=$OutDir'/Union_Peaks_CoverageVal_MinReadThr.bed'
221 | 
222 | echo '***** before computing '$coverageoutfileThr' *****'
223 | 
224 | # select only those peaks which have coverage value >= ReadCountThr
225 | # for all the bam files considered (coverage values occupy columns 4 onward, one per BAM file)
226 | if [[ ! -f $coverageoutfileThr || $Overwrite == 1 ]]; then
227 | awk -v T="$ReadCountThr" -v N="$npeakfiles" '{f=0; for (i=0;i<N;i++) {if ($(4+i)>=T) {f++}}; if (f==N) {print $0}}' $coverageoutfile > $coverageoutfileThr
228 | fi
229 | 
230 | echo '***** after computing '$coverageoutfileThr' *****'
231 | 
232 | # subset of the peaks (randomly selected)
233 | # such that the total number of peaks = PeakCountThr
234 | # check if the coverage thresholded peaks have a higher number of peaks
235 | # than the mentioned threshold
236 | coverageoutfileThrSubSample=$OutDir'/Union_Peaks_CoverageVal_MinReadThr_Subsampled_'$PeakCountThr'.bed'
237 | 
238 | echo '***** before computing '$coverageoutfileThrSubSample' *****'
239 | 
240 | if [[ !
-f $coverageoutfileThrSubSample || $Overwrite == 1 ]]; then 241 | npeakAboveThr=`cat $coverageoutfileThr | wc -l` 242 | echo 'npeakAboveThr: '$npeakAboveThr 243 | if [[ $npeakAboveThr -gt $PeakCountThr ]]; then 244 | echo 'Above the mentioned peak count threshold - random subset' 245 | shuf $coverageoutfileThr | head -n $PeakCountThr | cut -f1-3 | sort -k1,1 -k2,2n > $coverageoutfileThrSubSample 246 | else 247 | cat $coverageoutfileThr | cut -f1-3 | sort -k1,1 -k2,2n > $coverageoutfileThrSubSample 248 | fi 249 | fi 250 | 251 | echo '***** after computing '$coverageoutfileThrSubSample' *****' 252 | 253 | # dumping intermediate results 254 | # minimum mapping quality is maintained at 30 255 | OutDumpFile=$OutDir'/results_SubsampledPeak.npz' 256 | 257 | echo '***** before computing '$OutDumpFile' *****' 258 | 259 | # --minMappingQuality 30 260 | 261 | if [[ $nlabels == $nbamfiles ]]; then 262 | multiBamSummary BED-file --BED $coverageoutfileThrSubSample --bamfiles $listbamfiles --labels $listlabels -out $OutDumpFile 263 | else 264 | multiBamSummary BED-file --BED $coverageoutfileThrSubSample --bamfiles $listbamfiles --smartLabels -out $OutDumpFile 265 | fi 266 | 267 | echo '***** after computing '$OutDumpFile' *****' 268 | 269 | # using spearman correlation 270 | 271 | # OutPlotFile=$OutDir'/Correlation_Spearman_SubsampledPeak.pdf' 272 | # OutMatrixFile=$OutDir'/Correlation_Spearman_SubsampledPeak.matrix' 273 | # --plotFile $OutPlotFile --corMethod spearman --outFileCorMatrix $OutMatrixFile 274 | 275 | # --skipZeros --removeOutliers 276 | 277 | OutPlotFileHeatMap=$OutDir'/Correlation_Spearman_SubsampledPeak_Heatmap.pdf' 278 | OutPlotFileScatter=$OutDir'/Correlation_Spearman_SubsampledPeak_Scatterplot.pdf' 279 | if [[ ! -f $OutPlotFileHeatMap || ! -f $OutPlotFileScatter || $Overwrite == 1 ]]; then 280 | plotCorrelation --corData $OutDumpFile --plotFile $OutPlotFileHeatMap --corMethod spearman --whatToPlot heatmap --colorMap RdYlBu --plotNumbers --outFileCorMatrix $OutDir'/Correlation_Spearman_SubsampledPeak_Corr.mat' 281 | plotCorrelation --corData $OutDumpFile --plotFile $OutPlotFileScatter --corMethod spearman --whatToPlot scatterplot 282 | fi 283 | 284 | # # using pearson correlation 285 | # commented - check https://www.biostars.org/p/195328/ 286 | 287 | # # OutPlotFile=$OutDir'/Correlation_Pearson_SubsampledPeak.pdf' 288 | # # OutMatrixFile=$OutDir'/Correlation_Pearson_SubsampledPeak.matrix' 289 | # # --plotFile $OutPlotFile --corMethod pearson --outFileCorMatrix $OutMatrixFile 290 | 291 | # # --skipZeros --removeOutliers 292 | 293 | # OutPlotFileHeatMap=$OutDir'/Correlation_Pearson_SubsampledPeak_Heatmap.pdf' 294 | # OutPlotFileScatter=$OutDir'/Correlation_Pearson_SubsampledPeak_Scatterplot.pdf' 295 | 296 | # if [[ ! -f $OutPlotFileHeatMap || ! 
-f $OutPlotFileScatter || $Overwrite == 1 ]]; then
297 | # plotCorrelation --corData $OutDumpFile --plotFile $OutPlotFileHeatMap --corMethod pearson --whatToPlot heatmap --colorMap RdYlBu --plotNumbers --outFileCorMatrix $OutDir'/Correlation_Pearson_SubsampledPeak_Corr.mat'
298 | # plotCorrelation --corData $OutDumpFile --plotFile $OutPlotFileScatter --corMethod pearson --whatToPlot scatterplot
299 | # fi
300 | 
301 | #========================
302 | # now extract the peak intervals (after subsampling)
303 | # and get the peak intensities for individual input peak files
304 | OnlyPeakSubsample=$OutDir'/Peak_PVal_Subsampled_'$PeakCountThr'.bed'
305 | cat $coverageoutfileThrSubSample | cut -f1-3 > $OnlyPeakSubsample
306 | 
307 | # merge with individual peak input files
308 | for (( i=0; i<${npeakfiles}; i++ ));
309 | do
310 | tempfile=$OutDir'/Peak_PVal_temp.bed'
311 | # the 8th field in the peak file contains the -log10(p) score
312 | bedtools map -c 8 -o mean -null '0' -a $OnlyPeakSubsample -b ${PEAKFILES[i]} > $tempfile
313 | rm $OnlyPeakSubsample
314 | mv $tempfile $OnlyPeakSubsample
315 | done
316 | 
317 | # now call an R script which plots the correlation
318 | # for these peaks
319 | if [[ $nlabels == $nbamfiles ]]; then
320 | Rscript CorrelationPeakPlot.r --InpPeakFile $OnlyPeakSubsample --OutDir $OutDir --InpLabels $listlabelsRscript
321 | else
322 | Rscript CorrelationPeakPlot.r --InpPeakFile $OnlyPeakSubsample --OutDir $OutDir
323 | fi
324 | #========================
325 | 
326 | else
327 | 
328 | # here no peak files are provided
329 | # so a simple correlation using the whole bam files is performed
330 | 
331 | # dumping intermediate results
332 | # minimum mapping quality is maintained at 30
333 | # --minMappingQuality 30
334 | 
335 | OutDumpFile=$OutDir'/results.npz'
336 | if [[ $nlabels == $nbamfiles ]]; then
337 | multiBamSummary bins --bamfiles $listbamfiles --labels $listlabels -out $OutDumpFile
338 | else
339 | multiBamSummary bins --bamfiles $listbamfiles --smartLabels -out $OutDumpFile
340 | fi
341 | 
342 | # using spearman correlation
343 | 
344 | # OutPlotFile=$OutDir'/Correlation_Spearman_SubsampledPeak.pdf'
345 | # OutMatrixFile=$OutDir'/Correlation_Spearman_SubsampledPeak.matrix'
346 | # --plotFile $OutPlotFile --corMethod spearman --outFileCorMatrix $OutMatrixFile
347 | 
348 | # --skipZeros --removeOutliers
349 | 
350 | OutPlotFileHeatMap=$OutDir'/Spearman_Heatmap.pdf'
351 | OutPlotFileScatter=$OutDir'/Spearman_Scatterplot.pdf'
352 | 
353 | if [[ ! -f $OutPlotFileHeatMap || ! -f $OutPlotFileScatter || $Overwrite == 1 ]]; then
354 | plotCorrelation --corData $OutDumpFile --plotFile $OutPlotFileHeatMap --corMethod spearman --whatToPlot heatmap --colorMap RdYlBu --plotNumbers --outFileCorMatrix $OutDir'/Correlation_Spearman_Corr.mat'
355 | plotCorrelation --corData $OutDumpFile --plotFile $OutPlotFileScatter --skipZeros --removeOutliers --corMethod spearman --whatToPlot scatterplot
356 | fi
357 | 
358 | 
359 | 
360 | # # using pearson correlation
361 | # commented - check https://www.biostars.org/p/195328/
362 | 
363 | # # OutPlotFile=$OutDir'/Correlation_Pearson_SubsampledPeak.pdf'
364 | # # OutMatrixFile=$OutDir'/Correlation_Pearson_SubsampledPeak.matrix'
365 | # # --plotFile $OutPlotFile --corMethod pearson --outFileCorMatrix $OutMatrixFile
366 | 
367 | # OutPlotFileHeatMap=$OutDir'/Pearson_Heatmap.pdf'
368 | # OutPlotFileScatter=$OutDir'/Pearson_Scatterplot.pdf'
369 | 
370 | # if [[ !
-f $OutPlotFileScatter || $Overwrite == 1 ]]; then 371 | # plotCorrelation --corData $OutDumpFile --plotFile $OutPlotFileHeatMap --corMethod pearson --whatToPlot heatmap --colorMap RdYlBu --plotNumbers --outFileCorMatrix $OutDir'/Correlation_Pearson_Corr.mat' 372 | # plotCorrelation --corData $OutDumpFile --plotFile $OutPlotFileScatter --skipZeros --removeOutliers --corMethod pearson --whatToPlot scatterplot 373 | # fi 374 | 375 | 376 | 377 | fi 378 | 379 | 380 | #---------------------------------- 381 | # important - sourya 382 | # now restore the original directory 383 | cd $current_dir 384 | #---------------------------------- 385 | 386 | -------------------------------------------------------------------------------- /bin/CorrelationPeakPlot.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | #=========================================================== 4 | # R script for plotting the pairwise correlation of the peak intensity values 5 | # for a given pair of input peaks 6 | # Input: one file containing the peak intervals (first three columns) 7 | # and the peak intensity values in the subsequent columns 8 | # another input is the labels of the given samples 9 | 10 | # Author: Sourya Bhattacharyya 11 | # Vijay-Ay lab, LJI 12 | # February 26, 2018 13 | #=========================================================== 14 | 15 | # used for parsing the command line arguments 16 | library(optparse) 17 | 18 | # library(ggplot2) 19 | 20 | # plot dimension values 21 | PlotWidth <- 10 22 | PlotHeight <- 7 23 | 24 | # font size used in texts 25 | FontSize <- 20 26 | 27 | # different colors used in heatmap 28 | ColorVec <- c('blue', 'cyan', 'green', 'yellow', 'orange', 'red') 29 | 30 | option_list = list( 31 | make_option(c("--InpPeakFile"), type="character", default=NULL, help="Input file containing peak locations and the peak intensity values for all the candidate input samples"), 32 | make_option(c("--OutDir"), type="character", default=NULL, help="Output directory containing the results"), 33 | make_option(c("--InpLabels"), type="character", default=NULL, help="Comma or colon separated list of labels associated with individual samples (default %default)") 34 | 35 | ); 36 | 37 | parser <- OptionParser(option_list=option_list) 38 | arguments <- parse_args(parser, positional_arguments=TRUE) 39 | opt <- arguments$options 40 | args <- arguments$args 41 | 42 | # read the input peak file 43 | InpData <- read.table(opt$InpPeakFile, header=F) 44 | 45 | # number of samples is the number of columns of the input file 46 | # minus the first three columns 47 | NumSample <- ncol(InpData) - 3 48 | 49 | # read the labels of the input samples 50 | # if not provided, assign numeric labels 1 to NumSample 51 | if (is.null(opt$InpLabels)) { 52 | InpLabelList <- as.character(seq(1, NumSample)) 53 | } else { 54 | InpLabelList <- as.character(unlist(strsplit(opt$InpLabels,"[,:]"))) 55 | } 56 | 57 | if (is.null(opt$OutDir)) { 58 | OutDir <- getwd() 59 | } else { 60 | OutDir <- opt$OutDir 61 | } 62 | 63 | cat(sprintf("\n\n *** NumSample: %s ", NumSample)) 64 | cat(sprintf("\n\n *** InpLabelList: %s ", InpLabelList)) 65 | 66 | TextFile <- paste0(OutDir, '/Correlation_Peak_Spearman.txt') 67 | con <- file(TextFile, "w") 68 | 69 | # pairwise processing of the input samples 70 | for (i in (1:(NumSample-1))) { 71 | for (j in ((i+1):NumSample)) { 72 | 73 | XAxisData <- InpData[, 3+i] 74 | YAxisData <- InpData[, 3+j] 75 | AbsDiffVec <- abs(XAxisData - YAxisData) 76 | 
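# map each peak's absolute intensity difference onto the ColorVec palette
# (linear binning between the minimum and maximum difference)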
MinDiff <- min(AbsDiffVec)
77 | MaxDiff <- max(AbsDiffVec)
78 | ColorVal_CurrData <- pmax(1, ceiling(((AbsDiffVec - MinDiff) * length(ColorVec)) / ((MaxDiff - MinDiff) * 1.0)))  # pmax ensures a valid minimum color index of 1
79 | 
80 | # plot the peak correlation
81 | plotfile1 <- paste0(OutDir, '/Correlation_Peak_', InpLabelList[i], '_', InpLabelList[j], '.pdf')
82 | pdf(plotfile1, width=PlotWidth, height=PlotHeight)
83 | plot(XAxisData, YAxisData, cex=0.25, col=ColorVal_CurrData, xlab=paste0("Peak_Log10P_", InpLabelList[i]), ylab=paste0("Peak_Log10P_", InpLabelList[j]))
84 | title("Correlation between peak intensity")
85 | dev.off()
86 | 
87 | # also print the correlation
88 | Corr_val <- cor(XAxisData, YAxisData, method="spearman")
89 | outtext <- paste0("\n\n First peak file label : ", InpLabelList[i], "\n\n Second peak file label : ", InpLabelList[j], "\n\n Correlation value: ", Corr_val)
90 | writeLines(outtext, con=con, sep="\n")
91 | }
92 | }
93 | 
94 | close(con)
95 | 
96 | 
97 | 
98 | 
--------------------------------------------------------------------------------
/bin/Sample_ATACseqQC_script.r:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | 
3 | ## ---- echo=FALSE, results="hide", warning=FALSE, message=FALSE-------------
4 | suppressPackageStartupMessages({
5 | library(ATACseqQC)
6 | library(ChIPpeakAnno)
7 | library(BSgenome.Hsapiens.UCSC.hg19)
8 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
9 | library(phastCons100way.UCSC.hg19)
10 | library(MotifDb)
11 | })
12 | knitr::opts_chunk$set(warning=FALSE, message=FALSE)
13 | 
14 | ## ---- eval=FALSE-----------------------------------------------------------
15 | # library(BiocInstaller)
16 | # biocLite(c("ATACseqQC", "ChIPpeakAnno", "MotifDb",
17 | # "BSgenome.Hsapiens.UCSC.hg19", "TxDb.Hsapiens.UCSC.hg19.knownGene",
18 | # "phastCons100way.UCSC.hg19"))
19 | 
20 | ## --------------------------------------------------------------------------
21 | ## load the library
22 | library(ATACseqQC)
23 | ## input the bamFile from the ATACseqQC package
24 | bamfile <- system.file("extdata", "GL1.bam",
25 | package="ATACseqQC", mustWork=TRUE)
26 | bamfile.labels <- gsub(".bam", "", basename(bamfile))
27 | 
28 | ## --------------------------------------------------------------------------
29 | bamQC(bamfile, outPath=NULL)
30 | 
31 | ## --------------------------------------------------------------------------
32 | ## generate fragment size distribution
33 | fragSize <- fragSizeDist(bamfile, bamfile.labels)
34 | 
35 | ## --------------------------------------------------------------------------
36 | ## bamfile tags to be read in
37 | tags <- c("AS", "XN", "XM", "XO", "XG", "NM", "MD", "YS", "YT")
38 | ## files will be output into outPath
39 | outPath <- "splited"
40 | dir.create(outPath)
41 | ## shift the coordinates of 5'ends of alignments in the bam file
42 | library(BSgenome.Hsapiens.UCSC.hg19)
43 | seqlev <- "chr1" ## subsample data for quick run
44 | which <- as(seqinfo(Hsapiens)[seqlev], "GRanges")
45 | gal <- readBamFile(bamfile, tag=tags, which=which, asMates=TRUE)
46 | gal1 <- shiftGAlignmentsList(gal)
47 | shiftedBamfile <- file.path(outPath, "shifted.bam")
48 | export(gal1, shiftedBamfile)
49 | 
50 | ## --------------------------------------------------------------------------
51 | library(phastCons100way.UCSC.hg19)
52 | library(TxDb.Hsapiens.UCSC.hg19.knownGene)
53 | txs <- transcripts(TxDb.Hsapiens.UCSC.hg19.knownGene)
54 | ## run program for chromosome 1 only
55 | txs <- txs[seqnames(txs) %in% "chr1"]
56 | genome <- Hsapiens
57 | ##
split the reads into NucleosomeFree, mononucleosome, 58 | ## dinucleosome and trinucleosome. 59 | objs <- splitGAlignmentsByCut(gal1, txs=txs, genome=genome, 60 | conservation=phastCons100way.UCSC.hg19) 61 | 62 | ## -------------------------------------------------------------------------- 63 | null <- writeListOfGAlignments(objs, outPath) 64 | ## list the files generated by splitBam. 65 | dir(outPath) 66 | 67 | ## ----eval=FALSE------------------------------------------------------------ 68 | # objs <- splitBam(bamfile, tags=tags, outPath=outPath, 69 | # txs=txs, genome=genome, 70 | # conservation=phastCons100way.UCSC.hg19) 71 | 72 | ## ----fig.height=4, fig.width=4--------------------------------------------- 73 | library(ChIPpeakAnno) 74 | bamfiles <- file.path(outPath, 75 | c("NucleosomeFree.bam", 76 | "mononucleosome.bam", 77 | "dinucleosome.bam", 78 | "trinucleosome.bam")) 79 | ## Plot the cumulative percentage of tag allocation in nucleosome-free 80 | ## and mononucleosome bam files. 81 | cumulativePercentage(bamfiles[1:2], as(seqinfo(Hsapiens)["chr1"], "GRanges")) 82 | 83 | ## ----fig.height=8, fig.width=4--------------------------------------------- 84 | TSS <- promoters(txs, upstream=0, downstream=1) 85 | TSS <- unique(TSS) 86 | ## estimate the library size for normalization 87 | (librarySize <- estLibSize(bamfiles)) 88 | ## calculate the signals around TSSs. 89 | NTILE <- 101 90 | dws <- ups <- 1010 91 | sigs <- enrichedFragments(gal=objs[c("NucleosomeFree", 92 | "mononucleosome", 93 | "dinucleosome", 94 | "trinucleosome")], 95 | TSS=TSS, 96 | librarySize=librarySize, 97 | seqlev=seqlev, 98 | TSS.filter=0.5, 99 | n.tile = NTILE, 100 | upstream = ups, 101 | downstream = dws) 102 | ## log2 transformed signals 103 | sigs.log2 <- lapply(sigs, function(.ele) log2(.ele+1)) 104 | #plot heatmap 105 | featureAlignedHeatmap(sigs.log2, reCenterPeaks(TSS, width=ups+dws), 106 | zeroAt=.5, n.tile=NTILE) 107 | 108 | ## ----fig.show="hide"------------------------------------------------------- 109 | ## get signals normalized for nucleosome-free and nucleosome-bound regions. 
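## (the returned matrix has one column per BAM category and one row per tile; it is rescaled to 0~1 below)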
110 | out <- featureAlignedDistribution(sigs,
111 | reCenterPeaks(TSS, width=ups+dws),
112 | zeroAt=.5, n.tile=NTILE, type="l",
113 | ylab="Averaged coverage")
114 | 
115 | ## --------------------------------------------------------------------------
116 | ## rescale the nucleosome-free and nucleosome signals to 0~1
117 | range01 <- function(x){(x-min(x))/(max(x)-min(x))}
118 | out <- apply(out, 2, range01)
119 | matplot(out, type="l", xaxt="n",
120 | xlab="Position (bp)",
121 | ylab="Fraction of signal")
122 | axis(1, at=seq(0, 100, by=10)+1,
123 | labels=c("-1K", seq(-800, 800, by=200), "1K"), las=2)
124 | abline(v=seq(0, 100, by=10)+1, lty=2, col="gray")
125 | 
126 | ## --------------------------------------------------------------------------
127 | ## foot prints
128 | library(MotifDb)
129 | CTCF <- query(MotifDb, c("CTCF"))
130 | CTCF <- as.list(CTCF)
131 | print(CTCF[[1]], digits=2)
132 | sigs <- factorFootprints(shiftedBamfile, pfm=CTCF[[1]],
133 | genome=genome,
134 | min.score="90%", seqlev=seqlev,
135 | upstream=100, downstream=100)
136 | 
137 | ## ----fig.height=6, fig.width=6---------------------------------------------
138 | featureAlignedHeatmap(sigs$signal,
139 | feature.gr=reCenterPeaks(sigs$bindingSites,
140 | width=200+width(sigs$bindingSites[1])),
141 | annoMcols="score",
142 | sortBy="score",
143 | n.tile=ncol(sigs$signal[[1]]))
144 | 
145 | sigs$spearman.correlation
146 | 
147 | ## ----sessionInfo-----------------------------------------------------------
148 | sessionInfo()
149 | 
150 | 
--------------------------------------------------------------------------------
/bin/TagAlign.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | #=================================
4 | # this program creates a tagalign file (in .gz format) from one or more input aligned (.bam) or tagalign (.gz) files
5 | #=================================
6 | # developed by - Sourya Bhattacharyya
7 | # Vijay-AY lab
8 | # La Jolla Institute for Allergy and Immunology
9 | #=================================
10 | 
11 | # usage info
12 | usage(){
13 | cat << EOF
14 | 
15 | usage:
16 | 
17 | 1) ./TagAlign.sh [-h] [-I inpfile1] [-N 0] [-f 4] [-r 5] [-O Outfile] For ATAC seq or ChIPMentation data
18 | 2) ./TagAlign.sh [-h] [-I inpfile1] [-N 1] [-O Outfile] For standard ChIP seq data
19 | 
20 | Example to process multiple input files:
21 | ./TagAlign.sh [-h] [-I inpfile1] [-I inpfile2] [-N 1] [-O Outfile]
22 | Here all the input files will be processed separately, and their outputs will be combined into a single file
23 | 
24 | Options:
25 | 
26 | -- required:
27 | -I InpFile Input files, either aligned (in .bam format) or already in gzipped bed format (.gz).
28 | User can provide multiple input files together.
29 | -N NoShift A binary variable. If 1, the strand information of the aligned reads is not altered.
30 | For standard ChIP seq data, this should be set as 1.
31 | For ChIPMentation or ATAC seq data, this should be set as 0, since in these cases the tagalign files are formed
32 | by shifting the forward and reverse strands to cover the length of the transposon. Default 1.
33 | -f fwdshift If NoShift=0, this value signifies the amount of shift a forward strand will
34 | require to cover the length of the transposon. Default 4.
35 | -r revshift If NoShift=0, this value signifies the amount of shift a reverse strand
36 | will require to cover the length of the transposon. Default 5.
37 | -O OutFile Output tagalign file (in .gz format) combining the input files 38 | -q MAPQ_THR Quality threshold that will be applied on the given input BAM file (default 30) 39 | 40 | EOF 41 | } 42 | 43 | # threshold of mapq quality 44 | MAPQ_THR=30 45 | 46 | # default configurations 47 | NoShift=1 48 | fwdshift=4 49 | revshift=5 50 | 51 | # Sourya - Note the processing of input file argument since it can be more than one file 52 | # Note the change of notations 53 | 54 | while getopts "I:N:f:r:O:q:" opt; 55 | do 56 | case "$opt" in 57 | I) InpFile+=($OPTARG);; 58 | N) NoShift=$OPTARG;; 59 | f) fwdshift=$OPTARG;; 60 | r) revshift=$OPTARG;; 61 | O) OutFile=$OPTARG;; 62 | q) MAPQ_THR=$OPTARG;; 63 | \?) usage 64 | echo "error: unrecognized option -$OPTARG"; 65 | exit 1 66 | ;; 67 | esac 68 | done 69 | 70 | echo 'Within utility function TagAlign' 71 | 72 | # # this line should be added when processing a list of inputs using the same command line option 73 | # shift $(( OPTIND - 1 )) 74 | 75 | # number of input files provided 76 | ninp=${#InpFile[@]} 77 | echo 'Number of input files : '$ninp 78 | 79 | if [[ $ninp == 0 ]]; then 80 | echo 'User should provide one or more aligned (.bam) or existing tagalign (.gz) files to combine them - exit for the moment !!' 81 | exit 1 82 | fi 83 | 84 | # echo 'List of input files: '$InpFile 85 | 86 | if [[ -z $OutFile ]]; then 87 | echo 'User did not provide the output file name - exit for the moment !!' 88 | exit 1 89 | fi 90 | 91 | #---------------------------------- 92 | # important - sourya 93 | # change the current directory as the dir containing this executable 94 | # since other source files relative to the current directory needs to be called 95 | current_dir=$(pwd) 96 | script_dir=$(dirname $0) 97 | cd $script_dir 98 | #---------------------------------- 99 | 100 | # also check the extension of input file 101 | filebase1=$(basename "${InpFile[0]}") 102 | if [[ $filebase1 =~ \.bam$ ]]; then 103 | bamext=1 104 | echo 'Input files are provided in .bam format' 105 | else 106 | if [[ $filebase1 =~ \.gz$ ]]; then 107 | bamext=0 108 | echo 'Input files are already in .gz format' 109 | else 110 | echo 'User should provide either one or more aligned (.bam) files or previously generated TagAlign files in .gz format !! Exit ' 111 | exit 1 112 | fi 113 | fi 114 | 115 | # similarly check the extension of output file and if required, append the gzipped extension 116 | filebase2=$(basename "$OutFile") 117 | if [[ $filebase2 =~ \.gz$ ]]; then 118 | echo 'User has correctly provided gzipped outfile name' 119 | else 120 | echo 'Appending gzipped extension in the output file name' 121 | OutFile=$OutFile'.gz' 122 | fi 123 | 124 | # check the number of input files and proceed accordingly 125 | if [ $ninp == 1 ]; then 126 | # Only one input file is provided 127 | echo 'Converting the file: '${InpFile[0]} 128 | if [ $bamext == 1 ]; then 129 | # here one input bam file is provided 130 | # so convert the bam file according to the shifting / non-shifting criteria 131 | if [ $NoShift == 0 ]; then 132 | samtools view -b -F 1548 -q $MAPQ_THR ${InpFile[0]} | bamToBed -i stdin | awk -v f=$fwdshift -v r=$revshift 'BEGIN {OFS = "\t"} ; function pos(x){return ((x < 0.0) ? 
0 : x)} {if ($6 == "+") print $1, $2 + f, $3 + f, $4, $5, $6; else print $1, pos($2 - r), pos($3 - r), $4, $5, $6}' | gzip -c > $OutFile 133 | else 134 | samtools view -b -F 1548 -q $MAPQ_THR ${InpFile[0]} | bamToBed -i stdin | awk 'BEGIN{FS="\t";OFS="\t"}{$4="N"; print $0}' | gzip -c > $OutFile 135 | fi 136 | else 137 | # already a gzipped file is provided 138 | # we can just copy the file 139 | cp ${InpFile[0]} $OutFile 140 | fi 141 | else 142 | # Multiple input files are provided 143 | if [ $bamext == 1 ]; then 144 | # input files are provided in bam format 145 | # we have to convert them individually, and then combine them 146 | convfilelist='' 147 | for (( i=0; i<${ninp}; i++ )); 148 | do 149 | # convert the current file into a temporary output file 150 | echo 'Converting the file: '${InpFile[i]} 151 | curroutfile='temp_'$i'.gz' 152 | if [ $NoShift == 0 ]; then 153 | samtools view -b -F 1548 -q $MAPQ_THR ${InpFile[i]} | bamToBed -i stdin | awk -v f=$fwdshift -v r=$revshift 'BEGIN {OFS = "\t"} ; function pos(x){return ((x < 0.0) ? 0 : x)} {if ($6 == "+") print $1, $2 + f, $3 + f, $4, $5, $6; else print $1, pos($2 - r), pos($3 - r), $4, $5, $6}' | gzip -c > $curroutfile 154 | else 155 | samtools view -b -F 1548 -q $MAPQ_THR ${InpFile[i]} | bamToBed -i stdin | awk 'BEGIN{FS="\t";OFS="\t"}{$4="N"; print $0}' | gzip -c > $curroutfile 156 | fi 157 | # also update the command for combining these generated files 158 | convfilelist=$convfilelist' '$curroutfile 159 | done 160 | zcat $convfilelist | gzip -c > $OutFile 161 | # remove temporary files 162 | for (( i=0; i<${ninp}; i++ )); 163 | do 164 | rm 'temp_'$i'.gz' 165 | done 166 | else 167 | # input files are already in gzipped format 168 | # we can just combine them 169 | convfilelist='' 170 | for val in "${InpFile[@]}"; do 171 | convfilelist=$convfilelist' '$val 172 | done 173 | zcat $convfilelist | gzip -c > $OutFile 174 | fi 175 | fi 176 | 177 | 178 | #---------------------------------- 179 | # important - sourya 180 | # now restore the original directory 181 | cd $current_dir 182 | #---------------------------------- 183 | 184 | -------------------------------------------------------------------------------- /bin/bam_to_bigwig.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #======================================== 4 | # sample script for converting input bam file to bigwig format 5 | 6 | # author: Sourya Bhattacharyya 7 | # Vijay-AY lab 8 | # La Jolla Institute for Allergy and Immunology 9 | #======================================== 10 | 11 | # usage info 12 | usage(){ 13 | cat << EOF 14 | 15 | usage: 16 | ./bam_to_bigwig.sh [-h] [-I InpFile] [-g refgenome] [-d OutDir] 17 | Example: 18 | ./bam_to_bigwig.sh -I Inp.bam -g 'hg19' -d '/home/sample_ATAC' 19 | 20 | Options: 21 | -- required: 22 | -I InpFile Input BAM file. 23 | -g refgenome Reference genome for chromosome size etc. 24 | -d OutDir Output directory which will contain all the bigwig file and other data 25 | -n OutFilePrefix If specified, output bigwig file name will be 'OutFilePrefix.bw' under the directory 'OutDir' 26 | EOF 27 | } 28 | 29 | # default output directory 30 | OutDir=`pwd`'/' 31 | 32 | # initialization of the prefix string 33 | OutFilePrefix="" 34 | 35 | while getopts "I:g:d:n:" opt; 36 | do 37 | case "$opt" in 38 | I) InpFile=$OPTARG;; 39 | g) refgenome=$OPTARG;; 40 | d) OutDir=$OPTARG;; 41 | n) OutFilePrefix=$OPTARG;; 42 | \?) 
usage 
43 | echo "error: unrecognized option -$OPTARG"; 
44 | exit 1 
45 | ;; 
46 | esac 
47 | done 
48 | 
49 | if [[ -z $InpFile ]]; then 
50 | echo 'No input BAM file is provided - exit !!' 
51 | exit 1 
52 | fi 
53 | 
54 | if [[ -z $refgenome ]]; then 
55 | echo 'No reference genome is provided - exit !!' 
56 | exit 1 
57 | fi 
58 | 
59 | filebase1=$(basename "${InpFile}") 
60 | if [[ $filebase1 =~ \.bam$ ]]; then 
61 | echo 'Input files are provided in .bam format' 
62 | else 
63 | echo 'Input file is not in BAM format - exit !!' 
64 | exit 1 
65 | fi 
66 | 
67 | #---------------------------------- 
68 | # important - sourya 
69 | # change the current directory to the dir containing this executable 
70 | # since other source files relative to the current directory need to be called 
71 | current_dir=$(pwd) 
72 | script_dir=$(dirname $0) 
73 | cd $script_dir 
74 | #---------------------------------- 
75 | 
76 | if [ ! -f $refgenome'.chrom.sizes' ]; then 
77 | # this utility program from UCSC fetches the chromosome sizes of the target genome 
78 | # and stores them in the specified text file 
79 | echo 'Getting the chromosome size' 
80 | fetchChromSizes $refgenome > $refgenome'.chrom.sizes' 
81 | fi 
82 | 
83 | # convert the bam file to a bedgraph file 
84 | # ensure that the bedgraph file contains only valid chromosomes 
85 | # if [ ! -f $OutDir'/Inp.bedGraph' ]; then 
86 | genomeCoverageBed -bga -ibam $InpFile -g $refgenome'.chrom.sizes' | awk '( $1 ~ /^chr([1-9]|2[0-2]|1[0-9]|X|M|Y)$/ )' - > $OutDir'/Inp.bedGraph' 
87 | # fi 
88 | 
89 | # sort the generated bedgraph file using the bedSort utility of the UCSC genome browser 
90 | # if [ ! -f $OutDir'/Inp.Sorted.bedGraph' ]; then 
91 | bedSort $OutDir'/Inp.bedGraph' $OutDir'/Inp.Sorted.bedGraph' 
92 | # fi 
93 | 
94 | # from the bedgraph file, generate the BigWig file 
95 | # using a utility of the UCSC genome browser 
96 | if [[ -z $OutFilePrefix ]]; then 
97 | outbigwigfile=$OutDir'/Inp_BigWig.bw' 
98 | else 
99 | outbigwigfile=$OutDir'/'$OutFilePrefix'.bw' 
100 | fi 
101 | 
102 | # if [ ! 
-f $outbigwigfile ]; then 103 | bedGraphToBigWig $OutDir'/Inp.Sorted.bedGraph' $refgenome'.chrom.sizes' $outbigwigfile 104 | # fi 105 | 106 | #---------------------------------- 107 | # important - sourya 108 | # now restore the original directory 109 | cd $current_dir 110 | #---------------------------------- 111 | 112 | -------------------------------------------------------------------------------- /configfile: -------------------------------------------------------------------------------- 1 | #==================================== 2 | # Sample Configuration file for running the ATAC-seq pipeline 3 | # Contains locations of executables and a few genome specific files 4 | # required to execute the pipeline 5 | #==================================== 6 | 7 | 8 | # Picard tool executable 9 | # used for removing PCR duplicates from the ChIP-seq alignment file 10 | picardexec=/home/sourya/packages/PicardTool/picard_version_2.18.14.jar 11 | 12 | # HOMER package executable path 13 | HOMERPath=/home/sourya/packages/HOMER/bin/ 14 | 15 | # deeptools package - directory 16 | DeepToolsDir=/home/sourya/packages/deepTools/deepTools2.0/bin/ 17 | 18 | # file (SQL) required to convert the narrowPeak file to the bigBed format 19 | # check the UCSC web site to download these files 20 | NarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/narrowPeak.as 21 | 22 | # file (SQL) required to convert the bignarrowPeak file to the bigBed format 23 | # check the UCSC web site to download these files 24 | BigNarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/bigNarrowPeak.as 25 | 26 | # file (SQL) required to convert the broadPeak file to the bigBed format 27 | # check the UCSC web site to download these files 28 | BroadPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/broadPeak.as 29 | 30 | # files containing chromosome size information 31 | # two column file storing the size of individual chromosomes 32 | # example: for reference genome hg19, chrom_hg19.sizes 33 | # example: for reference genome hg38, hg38.chrom.sizes 34 | # example: for reference genome mm9, chrom_mm9.sizes 35 | # example: for reference genome mm10, mm10.chrom.sizes 36 | RefChrSizeFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/hg38.chrom.sizes 37 | 38 | # files containing reference chromosome fasta sequence 39 | # example: hg19.fa, mm9.fa, hg38.fa and mm10.fa 40 | RefChrFastaFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Complete_Genome/hg38/hg38.fa 41 | 42 | # files containing reference UCSC annotation (.gtf format) corresponding to the reference Chromosome 43 | # applied as an input to HOMER 44 | RefChrAnnotFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/hg38/hg38.gtf 45 | 46 | # file containing blacklisted regions corresponding to this reference genome hg38 47 | # can be downloaded from the link 48 | # https://github.com/Boyle-Lab/Blacklist/tree/master/lists (v2) 49 | # its OK if this parameter is void, but we recommend to provide if the file is available 50 | # file can be gzipped or normal text format 51 | BlackListFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/BlackListed_Regions/hg38-blacklist.v2.bed 52 | 53 | # executable of ataqv package (https://github.com/ParkerLab/ataqv) 54 | # download the GitHub release (.tar.gz) file, extract it and provide the ataqv executable path here 55 | ATAQVPath=/home/sourya/packages/ataqv/ataqv-1.0.0/bin/ataqv 56 | 57 | # we have to also include the TSS file, but have to specify that the user needs to convert their GTF File (gene 
annotation file) in the TSS format 
58 | # so that first two fields of TSS file includes the chromosome name and the TSS coordinate 
59 | TSSFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/hg38/hg38_TSS.gtf 
60 | 
61 | 
62 | 
63 | 
-------------------------------------------------------------------------------- /configfile_hg19: -------------------------------------------------------------------------------- 
1 | #==================================== 
2 | # Configuration file for running the ATAC-seq pipeline 
3 | # Contains locations of executables and a few genome specific files 
4 | # required to execute the pipeline 
5 | #==================================== 
6 | 
7 | 
8 | # Picard tool executable 
9 | # used for removing PCR duplicates from the ChIP-seq alignment file 
10 | picardexec=/home/sourya/packages/PicardTool/picard_version_2.18.14.jar 
11 | 
12 | # HOMER package executable path 
13 | HOMERPath=/home/sourya/packages/HOMER/bin/ 
14 | 
15 | # deeptools package - directory 
16 | DeepToolsDir=/home/sourya/packages/deepTools/deepTools2.0/bin/ 
17 | 
18 | # file (SQL) required to convert the narrowPeak file to the bigBed format 
19 | # check the UCSC web site to download these files 
20 | NarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/narrowPeak.as 
21 | 
22 | # file (SQL) required to convert the bignarrowPeak file to the bigBed format 
23 | # check the UCSC web site to download these files 
24 | BigNarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/bigNarrowPeak.as 
25 | 
26 | # file (SQL) required to convert the broadPeak file to the bigBed format 
27 | # check the UCSC web site to download these files 
28 | BroadPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/broadPeak.as 
29 | 
30 | # files containing chromosome size information 
31 | # two column file storing the size of individual chromosomes 
32 | # example: for reference genome hg19, chrom_hg19.sizes 
33 | # example: for reference genome hg38, hg38.chrom.sizes 
34 | # example: for reference genome mm9, chrom_mm9.sizes 
35 | # example: for reference genome mm10, mm10.chrom.sizes 
36 | RefChrSizeFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/chrom_hg19.sizes 
37 | 
38 | # files containing reference chromosome fasta sequence 
39 | # example: hg19.fa, mm9.fa, hg38.fa and mm10.fa 
40 | RefChrFastaFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Complete_Genome/hg19/hg19.fa 
41 | 
42 | # files containing reference UCSC annotation (.gtf format) corresponding to the reference Chromosome 
43 | # applied as an input to HOMER 
44 | RefChrAnnotFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/hg19/hg19.gtf 
45 | 
46 | # file containing blacklisted regions corresponding to this reference genome hg19 
47 | # can be downloaded from the link 
48 | # https://github.com/Boyle-Lab/Blacklist/tree/master/lists (v2) 
49 | # its OK if this parameter is void, but we recommend to provide if the file is available 
50 | # file can be gzipped or normal text format 
51 | BlackListFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/BlackListed_Regions/hg19-blacklist.v2.bed 
52 | 
53 | # executable of ataqv package (https://github.com/ParkerLab/ataqv) 
54 | # download the GitHub release (.tar.gz) file, extract it and provide the ataqv executable path here 
55 | ATAQVPath=/home/sourya/packages/ataqv/ataqv-1.0.0/bin/ataqv 
56 | 
57 | # we have to also include the TSS file, but have to specify that the user needs to convert their GTF File (gene annotation file) in the TSS format 
58 | # so that first two fields of TSS file includes 
the chromosome name and the TSS coordinate 59 | TSSFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/hg19/hg19_TSS.gtf 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /configfile_hg38: -------------------------------------------------------------------------------- 1 | #==================================== 2 | # Configuration file for running the ATAC-seq pipeline 3 | # Contains locations of executables and a few genome specific files 4 | # required to execute the pipeline 5 | #==================================== 6 | 7 | 8 | # Picard tool executable 9 | # used for removing PCR duplicates from the ChIP-seq alignment file 10 | picardexec=/home/sourya/packages/PicardTool/picard_version_2.18.14.jar 11 | 12 | # HOMER package executable path 13 | HOMERPath=/home/sourya/packages/HOMER/bin/ 14 | 15 | # deeptools package - directory 16 | DeepToolsDir=/home/sourya/packages/deepTools/deepTools2.0/bin/ 17 | 18 | # file (SQL) required to convert the narrowPeak file to the bigBed format 19 | # check the UCSC web site to download these files 20 | NarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/narrowPeak.as 21 | 22 | # file (SQL) required to convert the bignarrowPeak file to the bigBed format 23 | # check the UCSC web site to download these files 24 | BigNarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/bigNarrowPeak.as 25 | 26 | # file (SQL) required to convert the broadPeak file to the bigBed format 27 | # check the UCSC web site to download these files 28 | BroadPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/broadPeak.as 29 | 30 | # files containing chromosome size information 31 | # two column file storing the size of individual chromosomes 32 | # example: for reference genome hg19, chrom_hg19.sizes 33 | # example: for reference genome hg38, hg38.chrom.sizes 34 | # example: for reference genome mm9, chrom_mm9.sizes 35 | # example: for reference genome mm10, mm10.chrom.sizes 36 | RefChrSizeFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/hg38.chrom.sizes 37 | 38 | # files containing reference chromosome fasta sequence 39 | # example: hg19.fa, mm9.fa, hg38.fa and mm10.fa 40 | RefChrFastaFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Complete_Genome/hg38/hg38.fa 41 | 42 | # files containing reference UCSC annotation (.gtf format) corresponding to the reference Chromosome 43 | # applied as an input to HOMER 44 | RefChrAnnotFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/hg38/hg38.gtf 45 | 46 | # file containing blacklisted regions corresponding to this reference genome hg38 47 | # can be downloaded from the link 48 | # https://github.com/Boyle-Lab/Blacklist/tree/master/lists (v2) 49 | # its OK if this parameter is void, but we recommend to provide if the file is available 50 | # file can be gzipped or normal text format 51 | BlackListFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/BlackListed_Regions/hg38-blacklist.v2.bed 52 | 53 | # executable of ataqv package (https://github.com/ParkerLab/ataqv) 54 | # download the GitHub release (.tar.gz) file, extract it and provide the ataqv executable path here 55 | ATAQVPath=/home/sourya/packages/ataqv/ataqv-1.0.0/bin/ataqv 56 | 57 | # we have to also include the TSS file, but have to specify that the user needs to convert their GTF File (gene annotation file) in the TSS format 58 | # so that first two fields of TSS file includes the chromosome name and the TSS coordinate 59 | 
TSSFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/hg38/hg38_TSS.gtf 
60 | 
61 | 
62 | 
63 | 
-------------------------------------------------------------------------------- /configfile_mm10: -------------------------------------------------------------------------------- 
1 | #==================================== 
2 | # Configuration file for running the ATAC-seq pipeline 
3 | # Contains locations of executables and a few genome specific files 
4 | # required to execute the pipeline 
5 | #==================================== 
6 | 
7 | 
8 | # Picard tool executable 
9 | # used for removing PCR duplicates from the ChIP-seq alignment file 
10 | picardexec=/home/sourya/packages/PicardTool/picard_version_2.18.14.jar 
11 | 
12 | # HOMER package executable path 
13 | HOMERPath=/home/sourya/packages/HOMER/bin/ 
14 | 
15 | # deeptools package - directory 
16 | DeepToolsDir=/home/sourya/packages/deepTools/deepTools2.0/bin/ 
17 | 
18 | # file (SQL) required to convert the narrowPeak file to the bigBed format 
19 | # check the UCSC web site to download these files 
20 | NarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/narrowPeak.as 
21 | 
22 | # file (SQL) required to convert the bignarrowPeak file to the bigBed format 
23 | # check the UCSC web site to download these files 
24 | BigNarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/bigNarrowPeak.as 
25 | 
26 | # file (SQL) required to convert the broadPeak file to the bigBed format 
27 | # check the UCSC web site to download these files 
28 | BroadPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/broadPeak.as 
29 | 
30 | # files containing chromosome size information 
31 | # two column file storing the size of individual chromosomes 
32 | # example: for reference genome hg19, chrom_hg19.sizes 
33 | # example: for reference genome hg38, hg38.chrom.sizes 
34 | # example: for reference genome mm9, chrom_mm9.sizes 
35 | # example: for reference genome mm10, mm10.chrom.sizes 
36 | RefChrSizeFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/mm10.chrom.sizes 
37 | 
38 | # files containing reference chromosome fasta sequence 
39 | # example: hg19.fa, mm9.fa, hg38.fa and mm10.fa 
40 | RefChrFastaFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Complete_Genome/mm10/mm10.fa 
41 | 
42 | # files containing reference UCSC annotation (.gtf format) corresponding to the reference Chromosome 
43 | # applied as an input to HOMER 
44 | RefChrAnnotFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/mm10/mm10.gtf 
45 | 
46 | # file containing blacklisted regions corresponding to this reference genome mm10 
47 | # can be downloaded from the link 
48 | # https://github.com/Boyle-Lab/Blacklist/tree/master/lists (v2) 
49 | # its OK if this parameter is void, but we recommend to provide if the file is available 
50 | # file can be gzipped or normal text format 
51 | BlackListFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/BlackListed_Regions/mm10-blacklist.v2.bed 
52 | 
53 | # executable of ataqv package (https://github.com/ParkerLab/ataqv) 
54 | # download the GitHub release (.tar.gz) file, extract it and provide the ataqv executable path here 
55 | ATAQVPath=/home/sourya/packages/ataqv/ataqv-1.0.0/bin/ataqv 
56 | 
57 | # we have to also include the TSS file, but have to specify that the user needs to convert their GTF File (gene annotation file) in the TSS format 
58 | # so that first two fields of TSS file includes the chromosome name and the TSS coordinate 
59 | TSSFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/mm10/mm10_TSS.gtf 
60 | 
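# A hedged sketch (not part of the pipeline) for deriving such a two-column TSS file 
# from a standard Ensembl/UCSC-style GTF; field 7 (strand) decides whether the gene 
# start or the gene end is the TSS, and the file names here are placeholders: 
# awk 'BEGIN{FS=OFS="\t"} $3=="gene" { tss = ($7=="+") ? $4 : $5; print $1, tss }' mm10.gtf > mm10_TSS.gtf 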
61 | 
62 | 
63 | 
-------------------------------------------------------------------------------- /configfile_mm9: -------------------------------------------------------------------------------- 
1 | #==================================== 
2 | # Configuration file for running the ATAC-seq pipeline 
3 | # Contains locations of executables and a few genome specific files 
4 | # required to execute the pipeline 
5 | #==================================== 
6 | 
7 | 
8 | # Picard tool executable 
9 | # used for removing PCR duplicates from the ChIP-seq alignment file 
10 | picardexec=/home/sourya/packages/PicardTool/picard_version_2.18.14.jar 
11 | 
12 | # HOMER package executable path 
13 | HOMERPath=/home/sourya/packages/HOMER/bin/ 
14 | 
15 | # deeptools package - directory 
16 | DeepToolsDir=/home/sourya/packages/deepTools/deepTools2.0/bin/ 
17 | 
18 | # file (SQL) required to convert the narrowPeak file to the bigBed format 
19 | # check the UCSC web site to download these files 
20 | NarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/narrowPeak.as 
21 | 
22 | # file (SQL) required to convert the bignarrowPeak file to the bigBed format 
23 | # check the UCSC web site to download these files 
24 | BigNarrowPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/bigNarrowPeak.as 
25 | 
26 | # file (SQL) required to convert the broadPeak file to the bigBed format 
27 | # check the UCSC web site to download these files 
28 | BroadPeakASFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/broadPeak.as 
29 | 
30 | # files containing chromosome size information 
31 | # two column file storing the size of individual chromosomes 
32 | # example: for reference genome hg19, chrom_hg19.sizes 
33 | # example: for reference genome hg38, hg38.chrom.sizes 
34 | # example: for reference genome mm9, chrom_mm9.sizes 
35 | # example: for reference genome mm10, mm10.chrom.sizes 
36 | RefChrSizeFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/chrsize/chrom_mm9.sizes 
37 | 
38 | # files containing reference chromosome fasta sequence 
39 | # example: hg19.fa, mm9.fa, hg38.fa and mm10.fa 
40 | RefChrFastaFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Complete_Genome/mm9/mm9.fa 
41 | 
42 | # files containing reference UCSC annotation (.gtf format) corresponding to the reference Chromosome 
43 | # applied as an input to HOMER 
44 | RefChrAnnotFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/mm9/mm9.gtf 
45 | 
46 | # file containing blacklisted regions corresponding to this reference genome mm9 
47 | # can be downloaded from the link 
48 | # https://github.com/Boyle-Lab/Blacklist/tree/master/lists (v2) 
49 | # its OK if this parameter is void, but we recommend to provide if the file is available 
50 | # file can be gzipped or normal text format 
51 | BlackListFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/BlackListed_Regions/mm9-blacklist.bed 
52 | 
53 | # executable of ataqv package (https://github.com/ParkerLab/ataqv) 
54 | # download the GitHub release (.tar.gz) file, extract it and provide the ataqv executable path here 
55 | ATAQVPath=/home/sourya/packages/ataqv/ataqv-1.0.0/bin/ataqv 
56 | 
57 | # we have to also include the TSS file, but have to specify that the user needs to convert their GTF File (gene annotation file) in the TSS format 
58 | # so that first two fields of TSS file includes the chromosome name and the TSS coordinate 
59 | TSSFile=/mnt/BioAdHoc/Groups/vd-vijay/sourya/genomes/Annotation/mm9/mm9_TSS.gtf 
60 | 
61 | 
62 | 
63 | 
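All of the configuration files above use plain key=value bash syntax, so a minimal sketch of how a script can consume one of them (the exact mechanism inside bin/pipeline.sh is not shown in this excerpt, so treat this as an assumption) is simply to source the file and read the variables: 

source ./configfile_mm9 
echo 'chromosome sizes: '$RefChrSizeFile 
echo 'blacklist: '$BlackListFile 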
-------------------------------------------------------------------------------- /pipeline_exec.sh: -------------------------------------------------------------------------------- 
1 | #!/bin/bash 
2 | 
3 | #================= 
4 | # main executable script of the ATAC seq pipeline 
5 | #================= 
6 | # developed by - Sourya Bhattacharyya 
7 | # Vijay-AY lab 
8 | # La Jolla Institute for Allergy and Immunology 
9 | #================= 
10 | 
11 | #================= 
12 | # script 1 - when fastq files of paired-end reads are provided as the input 
13 | #================= 
14 | 
15 | genome='/home/sourya/genomes/bowtie2_index/hg19/hg19' 
16 | dirdata='/home/sourya/test1/' 
17 | inpfile1=$dirdata'fastafiles/001_R1.fastq.gz' 
18 | inpfile2=$dirdata'fastafiles/001_R2.fastq.gz' 
19 | outdir=$dirdata'Sample_TEST_ATAC' 
20 | prefix='001' 
21 | 
22 | `pwd`/bin/pipeline.sh -f $inpfile1 -r $inpfile2 -C `pwd`'/configfile_hg19' -n $prefix -g $genome -d $outdir -t 8 -m "16G" -q 20 -D 1 -O 0 
23 | 
24 | #================= 
25 | # script 2 - when fastq files of single-end reads are provided as the input 
26 | #================= 
27 | 
28 | genome='/home/sourya/genomes/bowtie2_index/hg19/hg19' 
29 | dirdata='/home/sourya/test2/' 
30 | inpfile=$dirdata'merged_inp.fastq.gz' 
31 | outdir=$dirdata'Sample_TEST_ATAC' 
32 | prefix='002' 
33 | 
34 | `pwd`/bin/pipeline.sh -f $inpfile -C `pwd`'/configfile_hg19' -n $prefix -g $genome -d $outdir -t 8 -m "16G" -q 20 -D 0 -O 0 
35 | 
36 | #================= 
37 | # script 3 - when a BAM file is provided as the input 
38 | # here reference genome is not used 
39 | # however, -w parameter is used to specify the genome for 
40 | # creating UCSC compatible tracks 
41 | #================= 
42 | 
43 | dirdata='/home/sourya/test3/' 
44 | inpfile=$dirdata'inp.bam' 
45 | outdir=$dirdata'Sample_TEST_ATAC' 
46 | prefix='003' 
47 | 
48 | `pwd`/bin/pipeline.sh -f $inpfile -C `pwd`'/configfile_hg19' -n $prefix -d $outdir -t 8 -m "16G" -q 20 -D 1 -O 0 -w "hg19" 
49 | 
50 | 
51 | 
-------------------------------------------------------------------------------- /sample_IDRScript.sh: -------------------------------------------------------------------------------- 
1 | #!/bin/bash 
2 | 
3 | #================================= 
4 | # sample script for IDR execution 
5 | # where peaks generated from multiple ChIP-seq replicates are provided as input 
6 | #================================= 
7 | 
8 | # main executable of IDR script 
9 | # when peak files are used as input 
10 | IDRScript='./IDR_Codes/IDRMain.sh' 
11 | 
12 | # main executable of IDR script 
13 | # when BAM files are used as input 
14 | IDRScriptBAM='./IDR_Codes/IDR_SubSampleBAM_Main.sh' 
15 | 
16 | #****************************** 
17 | # path containing the IDRCode package by Anshul Kundaje et al. 
18 | # user should replace this path with their custom installation directory 
19 | IDRCodePackage='/home/sourya/packages/idrCode/' 
20 | #****************************** 
21 | 
22 | 
23 | #==================== 
24 | # IDR testing script 1 
25 | # examining IDR between two peak files 
26 | # top 25K common peaks between the two samples are compared 
27 | #==================== 
28 | 
29 | SampleBaseDir='/home/sourya/test1/' 
30 | 
31 | $IDRScript -I $SampleBaseDir'Sample1/MACS2_Default_Tag_No_Control/Sample1.macs2_peaks.narrowPeak_Q0.01filt' -I $SampleBaseDir'Sample2/MACS2_Default_Tag_No_Control/Sample2.macs2_peaks.narrowPeak_Q0.01filt' -d $SampleBaseDir'/Sample_IDR_Peaks' -P $IDRCodePackage 
32 | 
33 | 
34 | #==================== 
35 | # IDR testing script 2 
36 | # examining IDR between two BAM files 
37 | # first these BAM files are subsampled 
38 | # and their peaks are estimated using MACS2 
39 | # top 25K common peaks between the two samples are compared 
40 | # no control BAM file is provided 
41 | # user may specify one or more control BAM files using -C option 
42 | # like -C control1.bam -C control2.bam etc. 
43 | #==================== 
44 | 
45 | SampleBaseDir='/home/sourya/test2/' 
46 | 
47 | $IDRScriptBAM -I $SampleBaseDir'Sample1/Alignment_MAPQ30/Sample1.align.sort.MAPQ30.bam' -I $SampleBaseDir'Sample2/Alignment_MAPQ30/Sample2.align.sort.MAPQ30.bam' -d $SampleBaseDir'/Sample_IDR_BAMFiles' -P $IDRCodePackage -c 25000 
48 | 
-------------------------------------------------------------------------------- /src/PlotSample.py: -------------------------------------------------------------------------------- 
1 | #!/usr/bin/env python 
2 | 
3 | """ 
4 | This program plots the ATAC-seq fragment length distribution from the Picard insert size results 
5 | 
6 | Author: Sourya Bhattacharyya 
7 | Vijay-AY lab 
8 | """ 
9 | 
10 | import matplotlib 
11 | matplotlib.use('Agg') 
12 | 
13 | import os 
14 | from optparse import OptionParser 
15 | # import re 
16 | import matplotlib.pyplot as plt 
17 | # import numpy as np 
18 | 
19 | #----------------------------------------------------- 
20 | def parse_options(): 
21 | parser = OptionParser() 
22 | 
23 | parser.add_option("-I", "--INPFILE", \ 
24 | type="string", \ 
25 | action="store", \ 
26 | dest="INP_TEXT_FILE", \ 
27 | default="", \ 
28 | help="Input TEXT file containing the Picard Insert size results") 
29 | 
30 | opts, args = parser.parse_args() 
31 | return opts, args 
32 | 
33 | #----------------------------------------------------- 
34 | """ 
35 | main function 
36 | """ 
37 | def main(): 
38 | opts, args = parse_options() 
39 | InpFile = opts.INP_TEXT_FILE 
40 | 
41 | k = InpFile.rfind('/') 
42 | if (k == -1): 
43 | InpDir = "./" 
44 | else: 
45 | InpDir = InpFile[:(k+1)] 
46 | 
47 | outdir = InpDir + "Plots" 
48 | if (not os.path.exists(outdir)): 
49 | os.makedirs(outdir) 
50 | 
51 | fragment_length_list = [] 
52 | aligned_read_count_list = [] 
53 | 
54 | with open(InpFile) as f: 
55 | for line in f.readlines(): 
56 | #curr_line_content = re.split(r'\s', line) 
57 | curr_line_content = line.split() 
58 | if (len(curr_line_content) == 2): 
59 | str1 = str(curr_line_content[0]) 
60 | str2 = str(curr_line_content[1]) 
61 | if 0: 
62 | print 'str1: ', str1, ' str2: ', str2 
63 | if (len(str1) > 0) and (len(str2) > 0): 
64 | if (str1[0].isdigit() == True) and (str2[0].isdigit() == True): 
65 | fragment_length_list.append(int(str1)) 
66 | aligned_read_count_list.append(int(str2)) 
67 | 
68 | if 0: 
69 | print 'fragment_length_list: ',fragment_length_list 
70 | print 'aligned_read_count_list: ',aligned_read_count_list 
71 | 
72 | total_read_count = sum(aligned_read_count_list) 
73 
| if 0: 
74 | print 'total_read_count: ', total_read_count 
75 | 
76 | 
77 | """ 
78 | normalize the aligned read count 
79 | dividing by the total no of reads and 
80 | with respect to unit fragment size 
81 | """ 
82 | for i in range(len(aligned_read_count_list)): 
83 | aligned_read_count_list[i] = (aligned_read_count_list[i] * 1.0) / (fragment_length_list[i] * total_read_count) 
84 | 
85 | """ 
86 | Now plot the statistics 
87 | """ 
88 | OutPlotFile = outdir + "/Fragment_plot_LINEAR.pdf" 
89 | f = plt.figure() 
90 | plt.plot(fragment_length_list, aligned_read_count_list, ls='-', lw=0.3, color='red') 
91 | plt.xlim([0,1400]) # add - sourya - restricting the view to fragment lengths up to 1400 bp 
92 | plt.xlabel('Fragment length (bp)') 
93 | plt.ylabel('Norm Read count') 
94 | plt.title('ATAC seq - read density vs fragment length') 
95 | f.savefig(OutPlotFile, bbox_inches='tight') 
96 | 
97 | 
98 | OutPlotFile2 = outdir + "/Fragment_plot_LOG.pdf" 
99 | f = plt.figure() 
100 | plt.semilogy(fragment_length_list, aligned_read_count_list, ls='-', lw=0.3, color='red') 
101 | plt.xlim([0,1400]) # add - sourya - restricting the view to fragment lengths up to 1400 bp 
102 | plt.xlabel('Fragment length (bp)') 
103 | plt.ylabel('Norm Read count') 
104 | plt.title('ATAC seq - read density vs fragment length') 
105 | f.savefig(OutPlotFile2, bbox_inches='tight') 
106 | 
107 | 
108 | #----------------------------------------------------- 
109 | if __name__ == "__main__": 
110 | main() 
111 | 
-------------------------------------------------------------------------------- /src/assign_multimappers.py: -------------------------------------------------------------------------------- 
1 | ## assign_multimappers.py 
2 | 
3 | # this code is used to process the multi-mapped reads 
4 | # used after the Bowtie2 output is filtered to remove all the unmapped reads 
5 | 
6 | 
7 | # code by - Sourya Bhattacharyya 
8 | # taken from the standard ATAC seq pipeline 
9 | 
10 | import sys 
11 | import random 
12 | import argparse 
13 | 
14 | # function to parse the arguments 
15 | def parse_args(): 
16 | # Gives options 
17 | parser = argparse.ArgumentParser(description='Saves reads below an alignment threshold and discards all others') 
18 | parser.add_argument('-k', help='Alignment number cutoff') 
19 | parser.add_argument('--paired-end', dest='paired_ended', action='store_true', help='Data is paired-end') 
20 | 
21 | # processing the input arguments 
22 | # and return the input parameters to the main function 
23 | args = parser.parse_args() 
24 | alignment_cutoff = int(args.k) 
25 | paired_ended = args.paired_ended 
26 | return alignment_cutoff, paired_ended 
27 | 
28 | # main function 
29 | if __name__ == "__main__": 
30 | 
31 | # Runs the filtering step of choosing multimapped reads 
32 | [alignment_cutoff, paired_ended] = parse_args() 
33 | 
34 | # when paired ended input, the cutoff is adjusted 
35 | if paired_ended: 
36 | alignment_cutoff = int(alignment_cutoff) * 2 
37 | 
38 | # Store each line in sam file as a list of reads, 
39 | # where each read is a list of elements to easily 
40 | # modify or grab things 
41 | current_reads = [] 
42 | current_qname = '' 
43 | 
44 | # processing individual lines 
45 | for line in sys.stdin: 
46 | read_elems = line.strip().split('\t') 
47 | if read_elems[0].startswith('@'): 
48 | sys.stdout.write(line) 
49 | continue 
50 | 
51 | # Keep taking lines that have the same qname 
52 | if read_elems[0] == current_qname: 
53 | # Add line to current reads 
54 | current_reads.append(line) 
55 | pass 
56 | 
57 | else: 
58 | # Discard if there are more than the alignment cutoff 
59 | if len(current_reads) >= alignment_cutoff: 
60 | 
current_reads = [line] 
61 | current_qname = read_elems[0] 
62 | elif len(current_reads) > 0: 
63 | # Just output all reads, which are then filtered with samtools 
64 | for read in current_reads: 
65 | sys.stdout.write(str(read)) 
66 | # And then discard 
67 | current_reads = [line] 
68 | current_qname = read_elems[0] 
69 | else: 
70 | # First read in file 
71 | current_reads.append(line) 
72 | current_qname = read_elems[0] 
73 | 
74 | # flush the final qname group at the end of the input; 
75 | # without this, the reads of the last qname would be silently dropped 
76 | if len(current_reads) > 0 and len(current_reads) < alignment_cutoff: 
77 | for read in current_reads: 
78 | sys.stdout.write(str(read)) 
79 | 
80 | 
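A short usage sketch for the multimapper filter above (the file names and the -k value are hypothetical; the exact invocation inside this pipeline is not shown in this excerpt). The script reads SAM on stdin and writes SAM on stdout, so it slots between two samtools calls: 

samtools view -h aligned.bam | python src/assign_multimappers.py -k 4 --paired-end | samtools view -bS - > filtered.bam 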
-------------------------------------------------------------------------------- /src/peak_distribution.py: -------------------------------------------------------------------------------- 
1 | #!/usr/bin/env python 
2 | 
3 | """ 
4 | This program computes the distribution of peak fragment length and the number of aligned reads (read density) 
5 | used for benchmarking the ATAC-seq pipeline 
6 | 
7 | 
8 | Author: Sourya Bhattacharyya 
9 | Vijay-AY lab 
10 | """ 
11 | """ 
12 | these two lines force matplotlib to not choose any X-windows 
13 | This should be declared very first 
14 | see http://stackoverflow.com/questions/2801882/generating-a-png-with-matplotlib-when-display-is-undefined 
15 | """ 
16 | import matplotlib 
17 | matplotlib.use('Agg') 
18 | 
19 | import os 
20 | from optparse import OptionParser 
21 | import re 
22 | import subprocess 
23 | import matplotlib.pyplot as plt 
24 | import numpy as np 
25 | 
26 | ##----------------------------------------------------- 
27 | # this function is useful to parse various options for input data processing 
28 | def parse_options(): 
29 | parser = OptionParser() 
30 | 
31 | parser.add_option("-I", "--INPFILE", \ 
32 | type="string", \ 
33 | action="store", \ 
34 | dest="INP_BED_FILE", \ 
35 | default="", \ 
36 | help="Input BED file containing the MACS2 peak detection results") 
37 | 
38 | parser.add_option("-R", "--REFFILE", \ 
39 | type="string", \ 
40 | action="store", \ 
41 | dest="REF_BAM_FILE", \ 
42 | default="", \ 
43 | help="Reference BAM alignment file") 
44 | 
45 | opts, args = parser.parse_args() 
46 | return opts, args 
47 | 
48 | #----------------------------------------------------- 
49 | """ 
50 | main function 
51 | """ 
52 | def main(): 
53 | opts, args = parse_options() 
54 | 
55 | INP_BEDFILE = opts.INP_BED_FILE 
56 | REF_BAMFILE = opts.REF_BAM_FILE 
57 | 
58 | k = INP_BEDFILE.rfind('/') 
59 | if (k == -1): 
60 | INP_BEDFILE_DIR = "./" 
61 | Inp_BED_only_filename = INP_BEDFILE 
62 | else: 
63 | INP_BEDFILE_DIR = INP_BEDFILE[:(k+1)] 
64 | Inp_BED_only_filename = INP_BEDFILE[(k+1):] 
65 | 
66 | k1 = Inp_BED_only_filename.rfind('.') 
67 | if (k1 == -1): 
68 | OutDN = Inp_BED_only_filename 
69 | else: 
70 | OutDN = Inp_BED_only_filename[(k1+1):] 
71 | 
72 | """ 
73 | final output directory which will store the plots and data 
74 | """ 
75 | OutDir_Name = INP_BEDFILE_DIR + OutDN 
76 | if (not os.path.exists(OutDir_Name)): 
77 | os.makedirs(OutDir_Name) 
78 | 
79 | fragment_length_list = [] 
80 | aligned_read_count_list = [] 
81 | 
82 | temp_filename = OutDir_Name + "/temp.bed" 
83 | fp_temp = open(temp_filename, "w") 
84 | 
85 | """ 
86 | scan each line of the input bed file, and note the peak fragment length 
87 | also note the number of input reads (from BAM file) which are mapped in this peak 
88 | """ 
89 | with open(INP_BEDFILE) as fp_inp: 
90 | for line in fp_inp: 
91 | # note the peak fragment length 
92 | curr_line_content = re.split(r'\s', line) 
93 | if 0: 
94 | print '\n\n Current line: ', line, ' Contents: ', curr_line_content 
95 | peak_fragment_len = int(curr_line_content[2]) - int(curr_line_content[1]) # BED intervals are half-open, so end - start gives the length 
96 | if 0: 
97 | print 'peak_fragment_len: ', peak_fragment_len 
98 | # write the current peak line to a temporary bed file, overwriting the previous one 
99 | fp_temp.seek(0, os.SEEK_SET) 
100 | fp_temp.write(line); fp_temp.truncate(); fp_temp.flush() # truncate stale content and flush so samtools sees the current peak 
101 | # now count the number of mapped reads to this peak 
102 | sys_cmd = "samtools view -cL " + str(temp_filename) + " " + str(REF_BAMFILE) 
103 | read_count = int((subprocess.Popen(sys_cmd, stdout=subprocess.PIPE, shell=True)).stdout.read()) 
104 | if 0: 
105 | print 'read_count: ', read_count 
106 | # now append the values in designated lists 
107 | # maintain sorted lists 
108 | n = len(fragment_length_list) 
109 | if (n == 0): 
110 | # very first element 
111 | fragment_length_list.append(peak_fragment_len) 
112 | aligned_read_count_list.append(read_count) 
113 | else: 
114 | flag = False 
115 | for i in xrange((n-1), -1, -1): 
116 | if (peak_fragment_len == fragment_length_list[i]): 
117 | aligned_read_count_list[i] = aligned_read_count_list[i] + read_count 
118 | flag = True 
119 | break 
120 | elif (peak_fragment_len > fragment_length_list[i]): 
121 | if (i == (n-1)): 
122 | fragment_length_list.append(peak_fragment_len) 
123 | aligned_read_count_list.append(read_count) 
124 | flag = True 
125 | else: 
126 | fragment_length_list.insert(i+1, peak_fragment_len) 
127 | aligned_read_count_list.insert(i+1, read_count) 
128 | flag = True 
129 | break 
130 | 
131 | if (flag == False): 
132 | # condition for insertion at the first location 
133 | fragment_length_list.insert(0, peak_fragment_len) 
134 | aligned_read_count_list.insert(0, read_count) 
135 | 
136 | # close the temporary file 
137 | fp_temp.close() 
138 | 
139 | # remove the temporary bed file 
140 | os.system("rm " + temp_filename) 
141 | 
142 | """ 
143 | open a text file with two columns 
144 | first column will show the peak fragment length 
145 | second column displays the read count 
146 | the plot file is stored in the same directory containing macs2 results 
147 | """ 
148 | plot_data_textfile = OutDir_Name + "/plot.txt" 
149 | fp = open(plot_data_textfile, "w") 
150 | fp.write("Peak_Length" + "\t" + "Read_Count") 
151 | for i in range(len(fragment_length_list)): 
152 | fp.write("\n" + str(fragment_length_list[i]) + "\t" + str(aligned_read_count_list[i])) 
153 | fp.close() 
154 | 
155 | # """ 
156 | # create a read count list which will contain the no of read count in 1K scale 
157 | # """ 
158 | # read_count_list_1K_scale = [((aligned_read_count_list[i] * 1.0) / 1000) for i in range(len(aligned_read_count_list))] 
159 | 
160 | """ 
161 | Now plot the statistics 
162 | """ 
163 | # OutPlotFile = OutDir_Name + "/test_1K_Scale.pdf" 
164 | # f = plt.figure() 
165 | # plt.plot(fragment_length_list, read_count_list_1K_scale, ls='-', lw=2.0) 
166 | # plt.xlabel('Fragment length (bp)') 
167 | # plt.ylabel('Norm Read count') 
168 | # plt.title('ATAC seq - read density vs fragment length') 
169 | # f.savefig(OutPlotFile, bbox_inches='tight') 
170 | 
171 | OutPlotFile2 = OutDir_Name + "/test_LOG_Scale.pdf" 
172 | f = plt.figure() 
173 | plt.semilogy(fragment_length_list, aligned_read_count_list, ls='-', lw=2.0) 
174 | # plt.semilogy(fragment_length_list, np.exp(-np.asarray(aligned_read_count_list)/5.0), ls='-', lw=2.0) 
175 | # plt.plot(fragment_length_list, aligned_read_count_list, ls='-', lw=2.0) 
176 | # plt.yscale('log', basey=10) 
177 | plt.xlabel('Fragment length (bp)') 
178 | plt.ylabel('Read count') 
179 | plt.title('ATAC seq - read density vs fragment length') 
180 | f.savefig(OutPlotFile2, bbox_inches='tight') 
181 | 
182 | #----------------------------------------------------- 
183 | if __name__ == "__main__": 
184 | main() 
185 | 
186 | 
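The per-peak counting loop in peak_distribution.py launches one samtools process per peak, which becomes slow for large peak sets. A hedged alternative sketch (assuming bedtools is installed, which this repo already relies on for bamToBed; file names are placeholders) computes all per-peak read counts in a single pass and reproduces the two columns of plot.txt: 

bedtools multicov -bams sample.bam -bed sample.macs2_peaks.narrowPeak | awk 'BEGIN{OFS="\t"} {print $3-$2, $NF}' > plot.txt 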
-------------------------------------------------------------------------------- /src/trim_adapters.py: -------------------------------------------------------------------------------- 
1 | #!/usr/bin/env python2.7 
2 | 
3 | # Author: Jason Buenrostro, Stanford University 
4 | # The following program will compress daisy chain seq data into single molecules 
5 | 
6 | ##### IMPORT MODULES ##### 
7 | # import necessary for python 
8 | import os 
9 | import re 
10 | import sys 
11 | import gzip 
12 | import string 
13 | import Levenshtein 
14 | from optparse import OptionParser 
15 | 
16 | ##### DEFINE FUNCTIONS ##### 
17 | # Reverse complement 
18 | complement = string.maketrans('ATCGN', 'TAGCN') 
19 | def reverse_complement(sequence): 
20 | return sequence.upper().translate(complement)[::-1] 
21 | 
22 | # Align with mismatch, find first and move on, assumes only one 
23 | def fuzz_align(s_seq,l_seq,mismatch): 
24 | for i, base in enumerate(l_seq): # loop through equal size windows 
25 | l_subset = l_seq[i:i+len(s_seq)] 
26 | dist = Levenshtein.distance(l_subset, s_seq) 
27 | if dist <= mismatch: # find first then stop 
28 | return i, dist 
29 | # (the return above already stops at the first hit) 
30 | 
31 | # added by Jin Lee for hot fix (output name bug) 
32 | def rreplace(s, old, new, occurrence): 
33 | li = s.rsplit(old, occurrence) 
34 | return new.join(li) 
35 | 
36 | #### OPTIONS #### 
37 | # define options 
38 | opts = OptionParser() 
39 | usage = "usage: %prog [options] [inputs] This will trim adapters" 
40 | opts = OptionParser(usage=usage) 
41 | opts.add_option("-a", help=" Accepts fastq or fastq.gz") 
42 | opts.add_option("-b", help=" Accepts fastq or fastq.gz") 
43 | opts.add_option("-d", help=" Output directory storing the trimmed files") 
44 | options, arguments = opts.parse_args() 
45 | 
46 | # return usage information if no argvs given AND they're not available in the environment 
47 | # command line arguments always override environment variables 
48 | if len(sys.argv)==1: 
49 | p1_in = os.environ.get('P1_IN') 
50 | p2_in = os.environ.get('P2_IN') 
51 | # default output directory 
52 | OutDir=os.getcwd() 
53 | if (p1_in is None) or (p2_in is None): 
54 | os.system(sys.argv[0]+" --help") 
55 | sys.exit() 
56 | else: 
57 | ##### INPUTS AND OUTPUTS ##### 
58 | # name input and outputs 
59 | p1_in = options.a 
60 | p2_in = options.b 
61 | OutDir = options.d 
62 | 
63 | # name outputs and print to working dir 
64 | p1_file = p1_in.split('/')[-1] 
65 | p2_file = p2_in.split('/')[-1] 
66 | p1_out = re.sub(".fastq", ".trim.fastq", p1_file) 
67 | p2_out = re.sub(".fastq", ".trim.fastq", p2_file) 
68 | 
69 | #check for file type and open input file 
70 | append = p1_in.split('.')[-1] 
71 | if append == "fastq": 
72 | p1_rds = open(p1_in,'r') 
73 | p2_rds = open(p2_in,'r') 
74 | p1_out = re.sub(".fastq", ".trim.fastq", p1_file) 
75 | p2_out = re.sub(".fastq", ".trim.fastq", p2_file) 
76 | elif append == "fq": 
77 | p1_rds = open(p1_in,'r') 
78 | p2_rds = open(p2_in,'r') 
79 | p1_out = re.sub(".fq", ".trim.fastq", p1_file) 
80 | p2_out = re.sub(".fq", ".trim.fastq", p2_file) 
81 | elif append == "gz": 
82 | p1_rds = gzip.open(p1_in,'r') 
83 | p2_rds = gzip.open(p2_in,'r') 
84 | p1_out = re.sub(".fastq.gz", ".trim.fastq", p1_file) 
85 | p2_out = re.sub(".fastq.gz", ".trim.fastq", p2_file) 
86 | p1_out = re.sub(".fq.gz", ".trim.fastq", p1_out) 
87 | p2_out = re.sub(".fq.gz", ".trim.fastq", p2_out) 
88 | else: 
89 | sys.exit("ERROR! 
The input files must be a .fastq or .fastq.gz") 
90 | 
91 | 
92 | #================ 
93 | # output files are placed within the specified output directory 
94 | p1_out = OutDir + '/' + p1_out 
95 | p2_out = OutDir + '/' + p2_out 
96 | #================= 
97 | 
98 | ##### SCRIPT ##### 
99 | # initialize variables 
100 | i=0;j=0;k=0;tot_b=0;count=1 
101 | n=20 # match seq 
102 | mismatch=1 # only allow 0-1 mismatches for now, if allow two then gets mis indexed, to fix this need to change fuzz_align to save L as a vector and reiterate to find 2nd 
103 | 
104 | # initialize write files 
105 | r1_write = open(p1_out, 'w') 
106 | r2_write = open(p2_out, 'w') 
107 | 
108 | while 1: 
109 | # read lines 
110 | p1_line = p1_rds.readline() 
111 | p2_line = p2_rds.readline() 
112 | 
113 | # break if at end of file 
114 | if not p1_line: 
115 | break 
116 | 
117 | # load fastq into memory 
118 | if count ==1: 
119 | seqhead1 = p1_line 
120 | seqhead2 = p2_line 
121 | elif count ==2: 
122 | seq1 = p1_line.rstrip() 
123 | seq2 = p2_line.rstrip() 
124 | elif count ==3: 
125 | qualhead1 = p1_line 
126 | qualhead2 = p2_line 
127 | elif count ==4: 
128 | qual1 = p1_line.rstrip() 
129 | qual2 = p2_line.rstrip() 
130 | 
131 | # align reads to themselves 
132 | i = i+1 # total reads 
133 | rc_seq2 = reverse_complement(seq2[0:n]) 
134 | idx = seq1.rfind(rc_seq2) # look for perfect match 
135 | if idx > 0: 
136 | j = j+1 # 0 mismatches 
137 | elif mismatch>0: 
138 | hold = fuzz_align(rc_seq2,seq1,mismatch) # else allow for mismatch 
139 | if hold: 
140 | idx,mis=hold 
141 | if mis == 1: 
142 | k=k+1 # 1 mismatch 
143 | 
144 | # trim reads if idx exist 
145 | if idx > 0: 
146 | # keep track on how much trimming 
147 | tot_b = tot_b+len(seq2[idx+n:-1]) #track total bases trimmed 
148 | 
149 | # trim data 
150 | seq1 = seq1[0:idx+n-1] # modified to sub1 because some aligners (bowtie) dont like perfectly overlapping reads 
151 | seq2 = seq2[0:idx+n-1] 
152 | qual1 = qual1[0:idx+n-1] 
153 | qual2 = qual2[0:idx+n-1] 
154 | 
155 | # print read1 
156 | r1_write.write(seqhead1) 
157 | r1_write.write(seq1+"\n") 
158 | r1_write.write(qualhead1) 
159 | r1_write.write(qual1+"\n") 
160 | 
161 | # print read2 
162 | r2_write.write(seqhead2) 
163 | r2_write.write(seq2+"\n") 
164 | r2_write.write(qualhead2) 
165 | r2_write.write(qual2+"\n") 
166 | 
167 | # increment count 
168 | count = count + 1 
169 | if count == 5: 
170 | count = 1 
171 | else: 
172 | count = count 
173 | 
174 | # close files to write the file 
175 | r1_write.close() 
176 | r2_write.close() 
177 | p1_rds.close() 
178 | p2_rds.close() 
179 | 
180 | # write file output names for passing into next step of pipeline 
181 | # !!! DO NOT WRITE ANYTHING ELSE TO STDOUT AFTER THIS !!! 
182 | sys.stdout.write(p1_out + '\n') 
183 | sys.stdout.write(p2_out + '\n') 
184 | 
185 | # give summary 
186 | sys.stderr.write(str(i)+" sequences total\n") 
187 | sys.stderr.write(str(j)+" sequences trimmed with 0 mismatches\n") 
188 | sys.stderr.write(str(k)+" sequences trimmed with 1 mismatch\n") 
189 | sys.stderr.write(str(tot_b/(j+k) if (j+k) > 0 else 0)+" mean number of bases trimmed for reads requiring trimming\n") 
--------------------------------------------------------------------------------
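A short usage sketch for trim_adapters.py (paths are hypothetical). The two trimmed file names are written to stdout, one per line, for the next pipeline stage to consume, while the trimming summary goes to stderr; if no arguments are given, the script falls back to the P1_IN / P2_IN environment variables: 

python src/trim_adapters.py -a sample_R1.fastq.gz -b sample_R2.fastq.gz -d trimmed_out 1> trimmed_names.txt 2> trim_summary.log 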