├── .gitignore ├── .gitattributes ├── modules ├── module_template.bds ├── pipeline_template.bds ├── align_multimapping.bds ├── callpeak_blacklist_filter.bds ├── align_trim_fastq.bds ├── output.bds ├── git.bds ├── callpeak_bigbed.bds ├── input_tagalign.bds ├── callpeak_filter.bds ├── species.bds ├── input_peak.bds ├── input_bam.bds ├── input_fastq.bds ├── cluster.bds ├── callpeak_gem.bds ├── parallel.bds ├── callpeak_spp.bds ├── sys.bds ├── filetable.bds ├── postalign_xcor.bds ├── callpeak_peakseq.bds ├── env.bds ├── align_bwa.bds ├── conf.bds ├── input.bds └── postalign_bed.bds ├── requirements_py3.txt ├── uninstall_dependencies.sh ├── utils ├── kill_scr ├── broadpeak.py ├── narrowpeak.py ├── narrowpeak_idr.py ├── gappedpeak.py ├── clusterGeneric │ ├── run.pl │ ├── stat.pl │ ├── kill.pl │ └── postMortemInfo.pl ├── get_read_length_from_fastq.py ├── axt_dirfiles.py ├── reassemble.py ├── bds_scr_5min ├── bds_scr ├── assign_multimappers.py ├── ucsc_ensGene.py ├── ucsc_simplegene.py ├── parse_summary_qc_recursively.py └── trimfastq.py ├── examples ├── ENCSR936XTK_SE.json ├── ENCSR936XTK_PE.json ├── multiple_data_type.sh ├── example.env ├── encode_test.sh ├── start_from_peaks.sh ├── chipseq_test.sh ├── example2.sh └── scripts │ └── make_bds_cmds_PE.py ├── html ├── jquery.treetable.css └── rpt_header.html ├── etc ├── broadPeak.as ├── narrowPeak.as ├── gappedPeak.as └── Read_Distribution_ChIP-exo.txt ├── bds.config ├── requirements.txt ├── LICENSE.md ├── example_conf.json ├── default.env ├── example_conf_full.json ├── install_dependencies.sh └── species ├── scg.conf ├── sherlock.conf └── kundaje.conf /.gitignore: -------------------------------------------------------------------------------- 1 | *.chp 2 | .*.swp 3 | .nfs* 4 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.bds linguist-language=Java 2 | 
-------------------------------------------------------------------------------- /modules/module_template.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "parallel.bds" 5 | include "report.bds" 6 | -------------------------------------------------------------------------------- /modules/pipeline_template.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "git.bds" 5 | include "parallel.bds" 6 | include "report.bds" 7 | -------------------------------------------------------------------------------- /requirements_py3.txt: -------------------------------------------------------------------------------- 1 | nomkl 2 | python ==3.5.0 3 | numpy ==1.11.3 4 | idr ==2.0.3 5 | bedtools ==2.26.0 6 | pigz 7 | java-jdk ==8.0.92 8 | matplotlib ==1.5.1 9 | -------------------------------------------------------------------------------- /uninstall_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## conda environment name 4 | 5 | ENV_NAME=aquas_chipseq 6 | ENV_NAME_PY3=aquas_chipseq_py3 7 | 8 | conda env remove --name ${ENV_NAME} -y 9 | conda env remove --name ${ENV_NAME_PY3} -y 10 | -------------------------------------------------------------------------------- /utils/kill_scr: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 1 ]; then 4 | echo 5 | echo "Kill a screen with name [SCR_NAME]" 6 | echo "Usage : kill_scr [SCR_NAME]" 7 | echo 8 | screen -ls 9 | exit 1 10 | fi 11 | 12 | screen -X -R $1 quit 13 | -------------------------------------------------------------------------------- /examples/ENCSR936XTK_SE.json: -------------------------------------------------------------------------------- 1 | { 2 | "out_dir" : "ENCSR936XTK/SE", 3 | "se" : 
true, 4 | "fastq1" : "rep1.fastq.gz", 5 | "fastq2" : "rep2.fastq.gz", 6 | "ctl_fastq1" : "ctl1.fastq.gz", 7 | "ctl_fastq2" : "ctl2.fastq.gz", 8 | "species" : "hg38_ENCODE", 9 | "nth" : 8, 10 | "use_pooled_ctl" : true 11 | } 12 | -------------------------------------------------------------------------------- /examples/ENCSR936XTK_PE.json: -------------------------------------------------------------------------------- 1 | { 2 | "out_dir" : "ENCSR936XTK/PE", 3 | "pe" : true, 4 | "fastq1_1" : "rep1-R1.fastq.gz", 5 | "fastq1_2" : "rep1-R2.fastq.gz", 6 | "fastq2_1" : "rep2-R1.fastq.gz", 7 | "fastq2_2" : "rep2-R2.fastq.gz", 8 | "ctl_fastq1_1" : "ctl1-R1.fastq.gz", 9 | "ctl_fastq1_2" : "ctl1-R2.fastq.gz", 10 | "ctl_fastq2_1" : "ctl2-R1.fastq.gz", 11 | "ctl_fastq2_2" : "ctl2-R2.fastq.gz", 12 | "species" : "hg38_ENCODE", 13 | "nth" : 16, 14 | "use_pooled_ctl" : true 15 | } 16 | -------------------------------------------------------------------------------- /modules/align_multimapping.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == align multimapping settings 8 | multimapping := 0 help # alignments reported for multimapping (default: 0). 
9 | 10 | 11 | init_align_multimapping() 12 | 13 | 14 | void init_align_multimapping() { 15 | multimapping = get_conf_val_int( multimapping, ["multimapping"] ) 16 | 17 | print("\n\n== align multimapping settings\n") 18 | print( "# alignments reported for multimapping\t: $multimapping\n") 19 | } 20 | -------------------------------------------------------------------------------- /html/jquery.treetable.css: -------------------------------------------------------------------------------- 1 | table.treetable span.indenter { 2 | display: inline-block; 3 | margin: 0; 4 | padding: 0; 5 | text-align: right; 6 | 7 | /* Disable text selection of nodes (for better D&D UX) */ 8 | user-select: none; 9 | -khtml-user-select: none; 10 | -moz-user-select: none; 11 | -o-user-select: none; 12 | -webkit-user-select: none; 13 | 14 | /* Force content-box box model for indenter (Bootstrap compatibility) */ 15 | -webkit-box-sizing: content-box; 16 | -moz-box-sizing: content-box; 17 | box-sizing: content-box; 18 | 19 | width: 19px; 20 | } 21 | 22 | table.treetable span.indenter a { 23 | background-position: left center; 24 | background-repeat: no-repeat; 25 | display: inline-block; 26 | text-decoration: none; 27 | width: 19px; 28 | } 29 | -------------------------------------------------------------------------------- /utils/broadpeak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,os 4 | 5 | if len(sys.argv)!=3: 6 | print ' ' 7 | sys.exit() 8 | 9 | infile,outfile=sys.argv[1:] 10 | 11 | # all values on 9th field are -1, exclude them 12 | 13 | id=1 14 | fout=open(outfile,'w') 15 | with open(infile) as fin: 16 | for line in fin: 17 | lst=line.rstrip().split('\t') 18 | fout.write('{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]}],id:{1},'.format(lst,id)) 19 | id+=1 20 | if len(lst[3])>1: 21 | fout.write('name:"'+lst[3]+'",') 22 | if lst[5]!='.': 23 | fout.write('strand:"'+lst[5]+'",') 24 | fout.write('\n') 25 | 
fout.close() 26 | 27 | os.system('sort -k1,1 -k2,2n '+outfile+' > '+outfile+'.srt') 28 | os.system('mv '+outfile+'.srt'+' '+outfile) 29 | os.system('bgzip -f '+outfile) 30 | os.system('tabix -f -p bed '+outfile+'.gz') 31 | -------------------------------------------------------------------------------- /examples/multiple_data_type.sh: -------------------------------------------------------------------------------- 1 | FASTQ1_1=/srv/gsfs0/scratch/leepc12/data/DREAM_challenge_hidden/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS866ZXX.BSREP1.TECHREP1.FILEIDENCFF164ZYB.R1.fastq.gz 2 | FASTQ1_2=/srv/gsfs0/scratch/leepc12/data/DREAM_challenge_hidden/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS866ZXX.BSREP1.TECHREP1.FILEIDENCFF431EXX.R2.fastq.gz 3 | TAG2=/srv/gsfs0/scratch/leepc12/run/DREAM_challenge_hidden/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO/out/align/rep2/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS705BBA.BSREP2.TECHREP1.FILEIDENCFF478QIY.R1.PE2SE.nodup.tagAlign.gz 4 | CTL_BAM1=/srv/gsfs0/scratch/leepc12/run/DREAM_challenge_hidden/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO/out/align/ctl1/CONTROL.MCF-7.R1.PE2SE.bam 5 | 6 | bds /home/leepc12/bds_atac/chipseq/chipseq.bds -species hg19 -pe -fastq1_1 $FASTQ1_1 -fastq1_2 $FASTQ1_2 -tag2 $TAG2 -ctl_bam1 $CTL_BAM1 7 | -------------------------------------------------------------------------------- /utils/narrowpeak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,os 4 | 5 | if len(sys.argv)!=3: 6 | print ' ' 7 | sys.exit() 8 | 9 | infile,outfile=sys.argv[1:] 10 | 11 | id=1 12 | fout=open(outfile,'w') 13 | with open(infile) as fin: 14 | for line in fin: 15 | lst=line.rstrip().split('\t') 16 | fout.write('{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]},{0[8]}],id:{1},'.format(lst,id)) 17 | id+=1 18 | if len(lst[3])>1: 19 | fout.write('name:"'+lst[3]+'",') 20 | if lst[5]!='.': 21 | fout.write('strand:"'+lst[5]+'",') 22 | if lst[9]!='-1': 23 | 
fout.write('sbstroke:['+lst[9]+']') 24 | fout.write('\n') 25 | 26 | fout.close() 27 | 28 | os.system('sort -k1,1 -k2,2n '+outfile+' > '+outfile+'.srt') 29 | os.system('mv '+outfile+'.srt'+' '+outfile) 30 | os.system('bgzip -f '+outfile) 31 | os.system('tabix -f -p bed '+outfile+'.gz') 32 | -------------------------------------------------------------------------------- /etc/broadPeak.as: -------------------------------------------------------------------------------- 1 | table broadPeak 2 | "BED6+3 Peaks of signal enrichment based on pooled, normalized (interpreted) data." 3 | ( 4 | string chrom; "Reference sequence chromosome or scaffold" 5 | uint chromStart; "Start position in chromosome" 6 | uint chromEnd; "End position in chromosome" 7 | string name; "Name given to a region (preferably unique). Use . if no name is assigned." 8 | uint score; "Indicates how dark the peak will be displayed in the browser (0-1000)" 9 | char[1] strand; "+ or - or . for unknown" 10 | float signalValue; "Measurement of average enrichment for the region" 11 | float pValue; "Statistical significance of signal value (-log10). Set to -1 if not used." 12 | float qValue; "Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used." 13 | ) 14 | -------------------------------------------------------------------------------- /bds.config: -------------------------------------------------------------------------------- 1 | # default system (local, sge, ...) 2 | system = local 3 | 4 | # shell env. 
5 | taskShell = /bin/bash -e 6 | sysShell = /bin/bash -e -c 7 | 8 | # regex to get pid 9 | pidRegex = "(\\d+)" 10 | 11 | # checkpoint disabled, show full commands/stderr/stdout on task, filter out commands including "export" from task hint 12 | disableCheckpoint = true 13 | taskMaxHintLen = 300 14 | showTaskCode = true 15 | tailLines = 100000000 16 | filterOutTaskHint = export 17 | clusterPostMortemDisabled = true # prevent error on scg3/4 18 | 19 | # SGE 20 | sge.pe = shm 21 | sge.mem = h_vmem 22 | sge.timeout = h_rt 23 | sge.timeout2 = s_rt 24 | clusterRunAdditionalArgs = -V 25 | 26 | # SLURM (using generic cluster) 27 | clusterGenericRun = ~/.bds/clusterGeneric/run.pl 28 | clusterGenericKill = ~/.bds/clusterGeneric/kill.pl 29 | clusterGenericStat = ~/.bds/clusterGeneric/stat.pl 30 | clusterGenericPostMortemInfo = ~/.bds/clusterGeneric/postMortemInfo.pl 31 | 32 | -------------------------------------------------------------------------------- /etc/narrowPeak.as: -------------------------------------------------------------------------------- 1 | table narrowPeak 2 | "BED6+4 Peaks of signal enrichment based on pooled, normalized (interpreted) data." 3 | ( 4 | string chrom; "Reference sequence chromosome or scaffold" 5 | uint chromStart; "Start position in chromosome" 6 | uint chromEnd; "End position in chromosome" 7 | string name; "Name given to a region (preferably unique). Use . if no name is assigned" 8 | uint score; "Indicates how dark the peak will be displayed in the browser (0-1000) " 9 | char[1] strand; "+ or - or . for unknown" 10 | float signalValue; "Measurement of average enrichment for the region" 11 | float pValue; "Statistical significance of signal value (-log10). Set to -1 if not used." 12 | float qValue; "Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used." 13 | int peak; "Point-source called for this peak; 0-based offset from chromStart. Set to -1 if no point-source called." 
14 | ) 15 | -------------------------------------------------------------------------------- /utils/narrowpeak_idr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # show -log10(GLOBAL IDR SCORE) instead of narrowpeak pval 4 | 5 | import sys,os 6 | 7 | if len(sys.argv)!=3: 8 | print ' ' 9 | sys.exit() 10 | 11 | infile,outfile=sys.argv[1:] 12 | 13 | id=1 14 | fout=open(outfile,'w') 15 | with open(infile) as fin: 16 | for line in fin: 17 | lst=line.rstrip().split('\t') 18 | fout.write('{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]},{0[8]},{0[10]},{0[11]}],id:{1},'.format(lst,id)) 19 | id+=1 20 | if len(lst[3])>1: 21 | fout.write('name:"'+lst[3]+'",') 22 | else: 23 | fout.write('name:"'+str(id)+'",') 24 | if lst[5]!='.': 25 | fout.write('strand:"'+lst[5]+'",') 26 | if lst[9]!='-1': 27 | fout.write('sbstroke:['+lst[9]+']') 28 | fout.write('\n') 29 | 30 | fout.close() 31 | 32 | os.system('sort -k1,1 -k2,2n '+outfile+' > '+outfile+'.srt') 33 | os.system('mv '+outfile+'.srt'+' '+outfile) 34 | os.system('bgzip -f '+outfile) 35 | os.system('tabix -f -p bed '+outfile+'.gz') 36 | -------------------------------------------------------------------------------- /utils/gappedpeak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,os 4 | 5 | if len(sys.argv)!=3: 6 | print ' ' 7 | sys.exit() 8 | 9 | infile,outfile=sys.argv[1:] 10 | 11 | id=1 12 | fout=open(outfile,'w') 13 | with open(infile) as fin: 14 | for line in fin: 15 | lst=line.rstrip().split('\t') 16 | fout.write('{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[12]},{0[13]},{0[14]}],id:{1},struct:{{thin:[[{0[1]},{0[2]}]],thick:['.format(lst,id)) 17 | id+=1 18 | a=int(lst[1]) 19 | sizes=lst[10].split(',') 20 | starts=lst[11].split(',') 21 | for i in range(len(sizes)): 22 | fout.write('[{0},{1}],'.format(a+int(starts[i]),a+int(starts[i])+int(sizes[i]))) 23 | fout.write(']},') 24 | 25 | 
if len(lst[3])>1: 26 | fout.write('name:"'+lst[3]+'",') 27 | if lst[5]!='.': 28 | fout.write('strand:"'+lst[5]+'",') 29 | fout.write('\n') 30 | 31 | fout.close() 32 | 33 | os.system('sort -k1,1 -k2,2n '+outfile+' > '+outfile+'.srt') 34 | os.system('mv '+outfile+'.srt'+' '+outfile) 35 | os.system('bgzip -f '+outfile) 36 | os.system('tabix -f -p bed '+outfile+'.gz') 37 | -------------------------------------------------------------------------------- /modules/callpeak_blacklist_filter.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | string blacklist_filter_peak( string filetype, string peak, string o_dir, string group ) { 9 | 10 | prefix := replace_dir( rm_ext( peak, \ 11 | ["narrowPeak","narrowpeak",\ 12 | "broadPeak","broadpeak",\ 13 | "regionPeak","regionpeak",\ 14 | "gappedPeak","gappedpeak",filetype] )\ 15 | , o_dir ) 16 | filtered:= "$prefix.filt.$filetype.gz" 17 | 18 | in := [ peak ] 19 | out := filtered 20 | 21 | taskName:= "blacklist_filter " + group 22 | //timeout := 3600 // to get queued fast 23 | system := "local" 24 | 25 | wait_par( cpus ) 26 | 27 | tid := task( out<-in ) { 28 | 29 | sys $shcmd_init 30 | 31 | sys bedtools intersect -v -a <(zcat -f $peak) -b <(zcat -f $blacklist) \ 32 | | awk 'BEGIN{OFS="\t"} {if ($5>1000) $5=1000; print $0}' \ 33 | | grep -P 'chr[\dXY]+[ \t]' | gzip -nc > $filtered 34 | 35 | sys $shcmd_finalize 36 | } 37 | 38 | register_par( tid, cpus ) 39 | 40 | return out 41 | } 42 | -------------------------------------------------------------------------------- /utils/clusterGeneric/run.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use POSIX; 4 | 5 | die "Error: Missing arguments.\nUsage: run.pl timeout cpus mem queue saveStdout saveStderr cmd arg1 ... 
argN\n" if $#ARGV < 6 ; 6 | 7 | $timeout = shift @ARGV; 8 | $cpus = shift @ARGV; 9 | $mem = shift @ARGV; 10 | $queue = shift @ARGV; 11 | $saveStdout = shift @ARGV; 12 | $saveStderr = shift @ARGV; 13 | $cmd = join(' ', @ARGV); 14 | 15 | $qsub = "sbatch --export=ALL "; 16 | $qsub .= "-n 1 --ntasks-per-node=1 --cpus-per-task=$cpus " if( $cpus > 0 ); 17 | if( $mem > 0 ) { 18 | $mem = ceil($mem/1000000); # MB 19 | $qsub .= "--mem-per-cpu $mem "; 20 | } 21 | if( $timeout > 0 ) { 22 | $timeout = ceil($timeout/60); # minute 23 | $qsub .= "-t $timeout "; 24 | } 25 | if ( $queue ne "" ) { 26 | $qsub .= "-p $queue " 27 | } 28 | 29 | $pid = open QSUB, " | $qsub"; 30 | die "Cannot run command '$qsub'\n" if ! kill(0, $pid); # Check that process exists 31 | print QSUB "#!/bin/sh \n"; # SLURM sbatch needs this shebang... 32 | print QSUB "$cmd\n"; # Send cluster's task via qsub's STDIN 33 | close QSUB; 34 | 35 | exit(0); 36 | 37 | -------------------------------------------------------------------------------- /modules/align_trim_fastq.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == fastq trimmer settings 9 | trim_bp := 50 help Number of basepairs after trimming fastqs (default: 50). 
10 | 11 | grp_color_trim_fq := "skyblue" 12 | 13 | init_align_trim_fastq() 14 | 15 | void init_align_trim_fastq() { 16 | trim_bp = get_conf_val_int( trim_bp, ["trim_bp"] ) 17 | 18 | print("\n\n== fastq trimmer settings\n") 19 | print( "Number of basepairs after trimming\t\t: $trim_bp\n") 20 | } 21 | 22 | string trim_fastq( string fastq, string o_dir, string group ) { 23 | 24 | prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir ) 25 | trimmed := "$prefix.trim_"+metric_prefix(trim_bp)+"bp.fastq.gz" 26 | 27 | in := [ fastq ] 28 | out := trimmed 29 | taskName:= "trim_fq " + group 30 | wait_par( cpus ) 31 | 32 | tid := task( out<-in ) { 33 | sys $shcmd_init 34 | sys python $(which trimfastq.py) $fastq $trim_bp | gzip -nc > $trimmed 35 | sys $shcmd_finalize 36 | } 37 | 38 | add_task_to_graph( in, out, group, "TRIM-FQ", grp_color_trim_fq ) 39 | 40 | return out 41 | } -------------------------------------------------------------------------------- /utils/get_read_length_from_fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # code extracted from Daniel Kim's ATAQC module (run_ataqc.py) 3 | 4 | import os, sys, re, gzip 5 | 6 | def getFileHandle(filename, mode="r"): 7 | if (re.search('.gz$',filename) or re.search('.gzip',filename)): 8 | if (mode=="r"): 9 | mode="rb"; 10 | return gzip.open(filename,mode) 11 | else: 12 | return open(filename,mode) 13 | 14 | def get_read_length(fastq_file): 15 | ''' 16 | Get read length out of fastq file 17 | ''' 18 | total_reads_to_consider = 1000000 19 | line_num = 0 20 | total_reads_considered = 0 21 | max_length = 0 22 | with getFileHandle(fastq_file, 'rb') as fp: 23 | for line in fp: 24 | if line_num % 4 == 1: 25 | if len(line.strip()) > max_length: 26 | max_length = len(line.strip()) 27 | total_reads_considered += 1 28 | if total_reads_considered >= total_reads_to_consider: 29 | break 30 | line_num += 1 31 | 32 | return int(max_length) 33 | 34 | def main(): 
35 | print(get_read_length(sys.argv[1])) 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /html/rpt_header.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # channels : defaults, r, bioconda 2 | 3 | nomkl 4 | samtools ==1.2 5 | htslib ==1.4 # 1.5 in bioconda needed libbz2.so.1.0 6 | bedtools ==2.26.0 #2.22 # 2.21.0 7 | picard ==1.126 # wanted 1.129 here but doesn't exist. instead 1.139 has backward compatibility issue, so take 1.126 8 | ucsc-fetchchromsizes 9 | ucsc-wigtobigwig 10 | ucsc-bedgraphtobigwig 11 | ucsc-bigwiginfo 12 | ucsc-bedclip 13 | ucsc-bedtobigbed 14 | ucsc-twobittofa 15 | macs2 ==2.1.1.20160309 #2.1.0 (no binaries for OSX) 16 | boost ==1.57.0 17 | openblas ==0.2.19 18 | numpy ==1.11.3 #1.13.3 #1.10.2 (no binaries for OSX) #1.9.0, 1.8.2 conflicts with ATAQC 19 | matplotlib ==1.5.1 20 | six==1.10.0 # to fix (ImportError: cannot import name _thread) 21 | python-dateutil==2.6.1 22 | libgfortran==3.0 23 | graphviz ==2.38.0 24 | libtool 25 | ghostscript # pdf2png 26 | pigz 27 | zlib 28 | sambamba ==0.6.6 # to fix seg fault error in 0.6.1 29 | r ==3.2.2 30 | r-snow 31 | r-snowfall 32 | r-bitops 33 | r-catools 34 | bioconductor-rsamtools 35 | r-spp ==1.13 36 | #glibc #segmentation fault in conda with openssl 37 | pyfaidx ==0.4.7.1 38 | 39 | bwa ==0.7.13 40 | deeptools ==2.5.4 #2.2.3 does not support plotFingerprint --outQualityMetrics 41 | #openssl ==1.0.2g-0 42 | openssl==1.0.2p 43 | -------------------------------------------------------------------------------- /utils/clusterGeneric/stat.pl: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/perl 2 | 3 | #------------------------------------------------------------------------------- 4 | # BDS generic cluster example 5 | # 6 | # This is a trivial example of the 'cluster generic' interface implementation. 7 | # The commands implemented in this example simply pass the propper arguments 8 | # to qsub, qdel or qstat commands. 9 | # This is intended as a toy example, since bds can do this directly (but 10 | # it's a good starting point to extend your own implementation). 11 | # 12 | # This script is executed in order to show the jobID of all jobs currently 13 | # scheduled in the cluster 14 | # 15 | # Script's output: 16 | # This script is expected to print all jobs currently scheduled or 17 | # running in the cluster (e.g. qstat). One per line. The FIRST column 18 | # should be the jobID (columns are spce or tab separated). Other 19 | # columns may exists (but are currently ignored). 20 | # 21 | # Command line arguments: 22 | # None 23 | # 24 | # Pablo Cingolani 25 | #------------------------------------------------------------------------------- 26 | 27 | #--- 28 | # Execute cluster command to show all tasks 29 | #--- 30 | $exitCode = system "squeue"; 31 | 32 | # OK 33 | exit($exitCode); 34 | -------------------------------------------------------------------------------- /etc/gappedPeak.as: -------------------------------------------------------------------------------- 1 | table gappedPeak 2 | "This format is used to provide called regions of signal enrichment based on pooled, normalized (interpreted) data where the regions may be spliced or incorporate gaps in the genomic sequence. It is a BED12+3 format." 3 | ( 4 | string chrom; "Reference sequence chromosome or scaffold" 5 | uint chromStart; "Pseudogene alignment start position" 6 | uint chromEnd; "Pseudogene alignment end position" 7 | string name; "Name of pseudogene" 8 | uint score; "Score of pseudogene with gene (0-1000)" 9 | char[1] strand; "+ or - or . 
for unknown" 10 | uint thickStart; "Start of where display should be thick (start codon)" 11 | uint thickEnd; "End of where display should be thick (stop codon)" 12 | uint reserved; "Always zero for now" 13 | int blockCount; "Number of blocks" 14 | int[blockCount] blockSizes; "Comma separated list of block sizes" 15 | int[blockCount] chromStarts; "Start positions relative to chromStart" 16 | float signalValue; "Measurement of average enrichment for the region" 17 | float pValue; "Statistical significance of signal value (-log10). Set to -1 if not used." 18 | float qValue; "Statistical significance with multiple-test correction applied (FDR). Set to -1 if not used." 19 | ) 20 | -------------------------------------------------------------------------------- /modules/output.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == output/title settings 8 | out_dir := "out" help Output directory (default: out). 9 | title := "" help Prefix for HTML report and outputs without given prefix. 10 | 11 | 12 | init_output() 13 | 14 | 15 | void init_output() { 16 | out_dir = get_conf_val( out_dir, ["out_dir"] ) 17 | title = get_conf_val( title, ["title"] ) 18 | 19 | if ( title == "" ) { // if title is empty, use directory name as a title 20 | dirname := get_basename( get_path(out_dir) ) 21 | if ( dirname == "out" ) { // if output folder is default one (out), then use parent dir. 
name 22 | dirname = get_basename( rm_str_at_end( get_path(out_dir), "/out" ) ) 23 | } 24 | title = dirname 25 | } 26 | if ( !is_cmd_line_arg_empty() ) out_dir = mkdir( out_dir ) // create output directory and get absolute path for it 27 | title = replace_illegal_chrs( title ) 28 | 29 | print("\n\n== output directory/title info\n") 30 | print( "Output dir.\t\t\t: $out_dir\n" ) 31 | print( "Title (prefix)\t\t\t: $title\n" ) 32 | } 33 | 34 | string get_rel_path( string path ) { // get relative path according to $out_dir 35 | rel_path := path.path().replace( out_dir.path(), "." ) 36 | if ( rel_path == path.path() ) return path //"" 37 | else return rel_path 38 | } 39 | -------------------------------------------------------------------------------- /utils/clusterGeneric/kill.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #------------------------------------------------------------------------------- 4 | # BDS generic cluster example 5 | # 6 | # This is a trivial example of the 'cluster generic' interface implementation. 7 | # The commands implemented in this example simply pass the propper arguments 8 | # to qsub, qdel or qstat commands. 9 | # This is intended as a toy example, since bds can do this directly (but 10 | # it's a good starting point to extend your own implementation). 11 | # 12 | # The script is called when a task is killed 13 | # 14 | # Script's output: 15 | # None 16 | # 17 | # Command line arguments: 18 | # jobId: This is the jobId returned as the first line in 'clusterGenericRun' 19 | # script (i.e. 
the jobID provided by the cluster management system) 20 | # 21 | # Pablo Cingolani 22 | #------------------------------------------------------------------------------- 23 | 24 | #--- 25 | # Parse command line arguments 26 | #--- 27 | die "Error: Missing arguments.\nUsage: kill.pl jobId\n" if $#ARGV < 0 ; 28 | #$jobId = shift @ARGV; 29 | $jobId = join(' ', @ARGV); 30 | 31 | #--- 32 | # Execute cluster command to kill task 33 | #--- 34 | $exitCode = system "scancel $jobId"; 35 | 36 | # OK 37 | exit($exitCode); 38 | 39 | -------------------------------------------------------------------------------- /modules/git.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "sys.bds" 5 | 6 | 7 | latest_git_commit_sha1 := "" // to show latest git commit sha1/date 8 | latest_git_commit_date := "" 9 | 10 | 11 | init_git() 12 | 13 | 14 | void init_git() { // print latest git commit info 15 | script_file_paths := get_script_file_paths() 16 | for ( string path : script_file_paths ) { 17 | if ( path.exists() && "$path/.git".exists() ) { 18 | 19 | latest_git_commit_sha1 = get_stdout("cd $path; git rev-parse HEAD") 20 | latest_git_commit_date = get_stdout("cd $path; git show -s --format=%cd --date=local $latest_git_commit_sha1") 21 | break; 22 | } 23 | } 24 | 25 | print("\n\n== git info\n") 26 | if ( latest_git_commit_sha1 == "" ) \ 27 | print( "Latest git commit\t\t: not under git control\n" ) 28 | else \ 29 | print( "Latest git commit\t\t: $latest_git_commit_sha1 ($latest_git_commit_date)\n" ) 30 | } 31 | 32 | string html_pipeline_version( string git_url_prefix ) { 33 | string html 34 | if ( latest_git_commit_sha1 != "" ) { 35 | html += "
Pipeline version

" 36 | html += "Latest git commit SHA1: "+\ 37 | "$latest_git_commit_sha1"+\ 38 | " ($latest_git_commit_date)\n" 39 | html += "


\n" 40 | } 41 | 42 | return html 43 | } 44 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | BSD-3-Clause License 2 | 3 | Copyright (c) 2016, Kundaje Lab 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 11 | 12 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /utils/axt_dirfiles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,glob,gzip,os 4 | 5 | # axt format: http://genome.ucsc.edu/goldenPath/help/axt.html 6 | 7 | if len(sys.argv)!=3: 8 | print ' Run under the dir of gzipped Axt files, presumably one for each target chr but that doesn\'t matter' 9 | sys.exit() 10 | 11 | chrsize={} 12 | with open(sys.argv[1]) as fin: 13 | for line in fin: 14 | lst=line.rstrip().split('\t') 15 | chrsize[lst[0]]=int(lst[1]) 16 | 17 | 18 | OF=sys.argv[2] 19 | 20 | fout=open(OF,'w') 21 | 22 | id=1 23 | 24 | for f in glob.glob('*'): 25 | fin=gzip.GzipFile(f,'r') 26 | line=fin.readline() 27 | while line: 28 | if line[0]!='#': 29 | lst=line.rstrip().split() 30 | # query start/stop 31 | a=0 32 | b=0 33 | if lst[7]=='+': 34 | a=int(lst[5])-1 35 | b=lst[6] 36 | else: 37 | c=chrsize[lst[4]] 38 | a=c-int(lst[6]) 39 | b=c-int(lst[5])+1 40 | 41 | fout.write('{0[1]}\t{2}\t{0[3]}\tid:{1},genomealign:{{chr:"{0[4]}",start:{3},stop:{4},strand:"{0[7]}",targetseq:'.format( 42 | lst, 43 | id, 44 | int(lst[2])-1, 45 | a, 46 | b 47 | )) 48 | id+=1 49 | line=fin.readline().rstrip() 50 | fout.write('"'+line+'",queryseq:') 51 | line=fin.readline().rstrip() 52 | fout.write('"'+line+'"}\n') 53 | fin.readline() 54 | line=fin.readline() 55 | 56 | 57 | fout.close() 58 | 59 | 60 | os.system('sort -k1,1 -k2,2n '+OF+' > xx') 61 | os.system('mv xx '+OF) 62 | os.system('bgzip -f '+OF) 63 | os.system('tabix -f -p bed '+OF+'.gz') 64 | -------------------------------------------------------------------------------- /examples/example.env: -------------------------------------------------------------------------------- 1 | ## Get your hostname by `hostname -f` 2 | 3 | [your_hostname] 4 | 5 | mod_chipseq = bwa/0.7.7 samtools/0.1.19 bedtools/2.19.1 ucsc_tools/3.0.9 picard-tools/1.92 MACS2/2.1.0 java/latest 6 | 7 | 
addpath_chipseq = /srv/gsfs0/scratch/leepc12/software/idrCode:/srv/gsfs0/scratch/leepc12/software/phantompeakqualtools:/srv/gsfs0/scratch/leepc12/software/idr/bin:/srv/gsfs0/scratch/leepc12/software/align2rawsignal/bin:/srv/gsfs0/scratch/leepc12/software/gem:/srv/gsfs0/scratch/leepc12/software/deepTools/bin:/srv/gsfs0/scratch/leepc12/software/R-2.15.1/bin:/srv/gsfs0/scratch/leepc12/software/python3.4/bin:/srv/gsfs0/scratch/leepc12/software/python2.7/bin 8 | 9 | shcmd_chipseq = export GEMROOT=/srv/gsfs0/scratch/leepc12/software/gem; export GEM=/srv/gsfs0/scratch/leepc12/software/gem/gem.jar; export LAPACK=/srv/gsfs0/scratch/leepc12/software/blas/lapack-*/liblapack.a; export _JAVA_OPTIONS='-Xms256M -Xmx512M -XX:ParallelGCThreads=1'; export MAX_JAVA_MEM='8G'; export MALLOC_ARENA_MAX=4; MCRROOT=/srv/gsfs0/scratch/leepc12/software/MATLAB_Compiler_Runtime/v714; LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRROOT}/runtime/glnxa64; LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRROOT}/bin/glnxa64; MCRJRE=${MCRROOT}/sys/java/jre/glnxa64/jre/lib/amd64; LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}/native_threads; LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}/server; LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}; XAPPLRESDIR=${MCRROOT}/X11/app-defaults; export LD_LIBRARY_PATH; export XAPPLRESDIR; 10 | 11 | species_file = $script_dir/species/scg3.conf 12 | 13 | use_sys_default = true # unlimited resource 14 | -------------------------------------------------------------------------------- /modules/callpeak_bigbed.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | string peak_to_bigbed( string filetype, string peak, string o_dir, string group ) { 9 | 10 | prefix := replace_dir( rm_ext( peak, \ 11 | ["narrowPeak","narrowpeak",\ 12 | "broadPeak","broadpeak",\ 13 | "gappedPeak","gappedpeak",filetype] )\ 14 | , o_dir ) 15 | bigbed := 
"$prefix.$filetype.bb" 16 | 17 | bed_param := _get_bed_param( filetype ) 18 | 19 | in := [ peak ] 20 | out := bigbed 21 | 22 | taskName:= "peak_to_bigbed " + group 23 | system := "local" 24 | 25 | wait_par( cpus ) 26 | 27 | tid := task( out<-in ) { 28 | 29 | sys $shcmd_init 30 | 31 | sys cat $chrsz | grep -P 'chr[\dXY]+[ \t]' > $bigbed.chrsz.tmp 32 | sys zcat $peak | sort -k1,1 -k2,2n > $bigbed.tmp 33 | sys bedClip $bigbed.tmp $bigbed.chrsz.tmp $bigbed.tmp2 34 | 35 | sys bedToBigBed $bed_param $bigbed.tmp2 $bigbed.chrsz.tmp $bigbed 36 | sys rm -f $bigbed.tmp $bigbed.tmp2 $bigbed.chrsz.tmp 37 | 38 | sys $shcmd_finalize 39 | } 40 | 41 | register_par( tid, cpus ) 42 | 43 | return out 44 | } 45 | 46 | string _get_bed_param( string filetype ) { 47 | 48 | if ( filetype.toLower() == "narrowpeak" ) { 49 | return "-type=bed6+4 -as=$script_dir/etc/narrowPeak.as" 50 | } 51 | else if ( filetype.toLower() == "broadpeak") { 52 | return "-type=bed6+3 -as=$script_dir/etc/broadPeak.as" 53 | } 54 | else if ( filetype.toLower() == "gappedpeak") { 55 | return "-type=bed12+3 -as=$script_dir/etc/gappedPeak.as" 56 | } 57 | else { 58 | error("Unsupported peak file type! ($filetype)\n") 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /utils/clusterGeneric/postMortemInfo.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #------------------------------------------------------------------------------- 4 | # BDS generic cluster example 5 | # 6 | # This is a trivial example of the 'cluster generic' interface implementation. 7 | # The commands implemented in this example simply pass the propper arguments 8 | # to qsub, qdel or qstat commands. 9 | # This is intended as a toy example, since bds can do this directly (but 10 | # it's a good starting point to extend your own implementation). 
11 | # 12 | # The following command is executed in order to get information of a recently 13 | # finished jobId. This information is typically used for debuging and it added 14 | # to bds's output. 15 | # 16 | # Script's output: 17 | # The output is not parsed, it is stored and later shown 18 | # in bds's report. Is should contain information relevant 19 | # to the job's execution (e.g. "qstat -f $jobId" or 20 | # "checkjob -v $jobId") 21 | # 22 | # Command line arguments: 23 | # jobId: This is the jobId returned as the first line in 'clusterGenericRun' 24 | # script (i.e. the jobID provided by the cluster management system) 25 | # 26 | # Pablo Cingolani 27 | #------------------------------------------------------------------------------- 28 | 29 | #--- 30 | # Parse command line arguments 31 | #--- 32 | die "Error: Missing arguments.\nUsage: postMortemInfo.pl jobId\n" if $#ARGV < 0 ; 33 | $jobId = shift @ARGV; 34 | 35 | #--- 36 | # Execute cluster command to show task details 37 | #--- 38 | $exitCode = system "squeue -j $jobId"; 39 | 40 | # OK 41 | exit($exitCode); 42 | 43 | -------------------------------------------------------------------------------- /example_conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "screen" : "", 3 | "dry_run" : false, 4 | "type" : "TF", 5 | "final_stage" : "idr", 6 | "out_dir" : "out", 7 | "title" : "", 8 | "input_endedness" : { 9 | "se" : false, 10 | "pe" : false 11 | }, 12 | "input_files" : { 13 | }, 14 | "species" : { 15 | "species" : "" 16 | }, 17 | "cluster" : { 18 | "use_system" : "local", 19 | "q" : "" 20 | }, 21 | "resource" : { 22 | "nth" : 8, 23 | "no_par" : false, 24 | "wt" : "5h50m", 25 | "memory" : "7G", 26 | "wt_dedup" : "23h", 27 | "mem_dedup" : "12G", 28 | "mem_shuf" : "12G", 29 | "wt_bwa" : "47h", 30 | "mem_bwa" : "12G", 31 | "wt_macs2" : "23h", 32 | "mem_macs2" : "15G", 33 | "wt_spp" : "47h", 34 | "mem_spp" : "12G" 35 | }, 36 | "alignment" : { 37 | "aligner" : 
"bwa", 38 | "bwa" : { 39 | "param_bwa_aln" : "-q 5 -l 32 -k 2" 40 | }, 41 | "filter" : { 42 | "dup_marker" : "picard", 43 | "anon_filt_bam" : false, 44 | "mapq_thresh" : 30, 45 | "no_dup_removal" : false 46 | }, 47 | "subsample" : { 48 | "subsample_chip" : "0", 49 | "subsample_ctl" : "0" 50 | } 51 | }, 52 | "cross_corr_analysis" : { 53 | "no_xcor" : false, 54 | "subsample_xcor" : "15M", 55 | "speak_xcor" : -1 56 | }, 57 | "callpeak" : { 58 | "peak_caller" : "spp", 59 | "ctl_depth_ratio" : 1.2, 60 | "use_pooled_ctl" : false, 61 | "true_rep" : false, 62 | "no_pseudo_rep" : false, 63 | "spp" : { 64 | "npeak_spp" : 300000 65 | }, 66 | "macs2" : { 67 | "pval_thresh_macs2" : 0.01 68 | }, 69 | "idr" : { 70 | "idr_thresh" : 0.05 71 | }, 72 | "naive_overlap" : { 73 | "nonamecheck" : false 74 | } 75 | }, 76 | "visualization" : { 77 | "url_base" : "" 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /utils/reassemble.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys 4 | 5 | if len(sys.argv)!=3: 6 | print ' ' 7 | sys.exit() 8 | 9 | infile,outn=sys.argv[1:] 10 | 11 | aliencoord=0 12 | alienchrid=1 13 | id1=1 14 | id2=1 15 | fn1=outn+'_native' 16 | fn2=outn+'_alien' 17 | fout1=open(fn1,'w') 18 | fout2=open(fn2,'w') 19 | 20 | chrname='scaffold_' 21 | 22 | with open(infile) as fin: 23 | for line in fin: 24 | lst=line.rstrip().split('\t') 25 | if len(lst)==1: 26 | print '{2}{0}:{1}'.format(alienchrid,aliencoord,chrname) 27 | aliencoord=0 28 | alienchrid+=1 29 | continue 30 | a=int(lst[1]) 31 | b=int(lst[2]) 32 | 33 | if a>=b: 34 | print 'wrong line: '+line 35 | sys.exit() 36 | 37 | # native 38 | fout1.write('{0}\t{1}\t{2}\tid:{3},genomealign:{{chr:"{8}{4}",start:{5},stop:{6},strand:"{7}"}}\n'.format( 39 | lst[0],a,b, 40 | id1, 41 | alienchrid, 42 | aliencoord, 43 | aliencoord+b-a, 44 | lst[3], 45 | chrname 46 | )) 47 | id1+=1 48 | # alien 49 | 
fout2.write('{8}{0}\t{1}\t{2}\tid:{3},genomealign:{{chr:"{4}",start:{5},stop:{6},strand:"{7}"}}\n'.format( 50 | alienchrid, 51 | aliencoord, 52 | aliencoord+b-a, 53 | id2, 54 | lst[0],a,b, 55 | lst[3], 56 | chrname 57 | )) 58 | id2+=1 59 | aliencoord+=b-a 60 | 61 | print '{2}{0}:{1}'.format(alienchrid,aliencoord,chrname) 62 | 63 | fout1.close() 64 | fout2.close() 65 | 66 | import os 67 | 68 | os.system('sort -k1,1 -k2,2n '+fn1+' > x') 69 | os.system('mv x '+fn1) 70 | os.system('bgzip -f '+fn1) 71 | os.system('tabix -f -p bed '+fn1+'.gz') 72 | 73 | os.system('sort -k1,1 -k2,2n '+fn2+' > x') 74 | os.system('mv x '+fn2) 75 | os.system('bgzip -f '+fn2) 76 | os.system('tabix -f -p bed '+fn2+'.gz') 77 | -------------------------------------------------------------------------------- /examples/encode_test.sh: -------------------------------------------------------------------------------- 1 | TITLE=ENCSR011PEI 2 | FASTQ1=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR011PEI/rep1/ENCFF282GDI_ENCFF316FIQ.fastq.gz 3 | FASTQ2=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR011PEI/rep2/ENCFF959EDS_ENCFF740WEF.fastq.gz 4 | CTL_FASTQ1=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR011PEI/ctl1/ENCFF728HNA.fastq.gz 5 | WORKDIR=/srv/scratch/shared/surya/leepc12/run/ENCODE_test/$TITLE 6 | mkdir -p $WORKDIR; cd $WORKDIR 7 | bds_scr $TITLE /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -title $TITLE -nth 8 -species hg38 -fastq1 $FASTQ1 -fastq2 $FASTQ2 -ctl_fastq1 $CTL_FASTQ1 8 | sleep 1 9 | 10 | TITLE=ENCSR017GBO 11 | FASTQ1=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR017GBO/rep1/ENCFF697GAP_ENCFF713DPD.fastq.gz 12 | FASTQ2=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR017GBO/rep2/ENCFF987WCU.fastq.gz 13 | CTL_FASTQ1=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR017GBO/ctl1/ENCFF894RGF_ENCFF414HWA.fastq.gz 14 | WORKDIR=/srv/scratch/shared/surya/leepc12/run/ENCODE_test/$TITLE 15 | mkdir -p $WORKDIR; cd $WORKDIR 16 | bds_scr 
$TITLE /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -title $TITLE -nth 8 -species hg38 -fastq1 $FASTQ1 -fastq2 $FASTQ2 -ctl_fastq1 $CTL_FASTQ1 17 | sleep 1 18 | 19 | TITLE=ENCSR290MUH 20 | FASTQ1=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR290MUH/rep1/ENCFF861PLD_ENCFF346WZR.fastq.gz 21 | FASTQ2=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR290MUH/rep2/ENCFF701VYF_ENCFF385UYP.fastq.gz 22 | CTL_FASTQ1=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR290MUH/ctl1/ENCFF617CIJ_ENCFF322SHV.fastq.gz 23 | WORKDIR=/srv/scratch/shared/surya/leepc12/run/ENCODE_test/$TITLE 24 | mkdir -p $WORKDIR; cd $WORKDIR 25 | bds_scr $TITLE /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -title $TITLE -nth 8 -species hg38 -fastq1 $FASTQ1 -fastq2 $FASTQ2 -ctl_fastq1 $CTL_FASTQ1 26 | sleep 1 27 | 28 | -------------------------------------------------------------------------------- /utils/bds_scr_5min: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 2 ]; then 4 | echo 5 | echo "Create a detached screen for a BDS script and redirect stdout/stderr to a log file." 6 | echo "If you skip [LOG_FILE_NAME], a log file [SCR_NAME].log will be generated on the working directory." 7 | echo "If a log file already exists, stdout/stderr will be appended to it." 8 | echo "Monitor a log file with 'tail -f [LOG_FILE_NAME]'" 9 | echo 10 | echo "Usage: bds_scr [SCR_NAME] [LOG_FILE_NAME] [BDS_PARAM]" 11 | echo " Example: bds_scr TEST ~/TEST.log -s sge chipseq.bds -fastq1 ..." 12 | echo 13 | exit 0 14 | fi 15 | 16 | SCR_NAME=$1.BDS 17 | 18 | #if [ $(screen -ls $SCR_NAME | grep 'No Sockets' | wc -l) != "1" ]; then 19 | if [ $(screen -ls | grep -P "[\t ]\d+.$SCR_NAME" | wc -l) != "0" ]; then 20 | echo "error: A screen named $SCR_NAME already exists." 
21 | exit 1 22 | else 23 | echo "[SCR_NAME] : $SCR_NAME" 24 | fi 25 | 26 | if [[ $2 == -* || $2 == *.bds ]]; then # LOG_FILE_NAME skipped 27 | LOG_FILE_NAME="$PWD/$SCR_NAME.log" 28 | PARAM_START_IDX=2 29 | elif [[ $3 == -* || $3 == *.bds ]]; then 30 | LOG_FILE_NAME=$2 31 | PARAM_START_IDX=3 32 | else 33 | echo "error: [BDS_PARAM] is wrong." 34 | exit 2 35 | fi 36 | 37 | if [ $(find $LOG_FILE_NAME -mmin -5 | wc -l) != "0" ]; then 38 | echo "error: log file handle is open or very fresh (modified in past 5 minutes)." 39 | exit 3 40 | fi 41 | 42 | PARAM= 43 | 44 | for ((i=$PARAM_START_IDX;i<=$#;i++)); do 45 | PARAM="$PARAM ${!i}" 46 | done 47 | 48 | echo "[LOG_FILE_NAME] : $LOG_FILE_NAME" 49 | echo "[BDS_PARAM] : $PARAM" 50 | 51 | mkdir -p $(dirname $LOG_FILE_NAME) 52 | 53 | echo "" 54 | echo "===== Created a new screen ====" >> $LOG_FILE_NAME 55 | echo "DATE : $(date)" >> $LOG_FILE_NAME 56 | echo "[HOST] : $(hostname -f)" >> $LOG_FILE_NAME 57 | echo "[SCR_NAME] : $SCR_NAME" >> $LOG_FILE_NAME 58 | echo "[BDS_PARAM] : $PARAM" >> $LOG_FILE_NAME 59 | echo "" >> $LOG_FILE_NAME 60 | 61 | screen -Sdm $SCR_NAME bash -c "bds &>>$LOG_FILE_NAME $PARAM $>>$LOG_FILE_NAME" 62 | 63 | -------------------------------------------------------------------------------- /utils/bds_scr: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 2 ]; then 4 | echo 5 | echo "Create a detached screen for a BDS script and redirect stdout/stderr to a log file." 6 | echo "If you skip [LOG_FILE_NAME], a log file [SCR_NAME].log will be generated on the working directory." 7 | echo "If a log file already exists, stdout/stderr will be appended to it." 8 | echo "Monitor a log file with 'tail -f [LOG_FILE_NAME]'" 9 | echo 10 | echo "Usage: bds_scr [SCR_NAME] [LOG_FILE_NAME] [BDS_PARAM]" 11 | echo " Example: bds_scr TEST ~/TEST.log -s sge chipseq.bds -fastq1 ..." 
12 | echo 13 | exit 0 14 | fi 15 | 16 | SCR_NAME="$1".BDS 17 | 18 | #if [ $(screen -ls $SCR_NAME | grep 'No Sockets' | wc -l) != "1" ]; then 19 | if [ $(screen -ls | grep -P "[\t ]\d+.$SCR_NAME" | wc -l) != "0" ]; then 20 | echo "error: A screen named $SCR_NAME already exists." 21 | exit 1 22 | else 23 | echo "[SCR_NAME] : $SCR_NAME" 24 | fi 25 | 26 | if [[ $2 == -* || $2 == *.bds ]]; then # LOG_FILE_NAME skipped 27 | LOG_FILE_NAME="$PWD/$SCR_NAME.log" 28 | PARAM_START_IDX=2 29 | elif [[ $3 == -* || $3 == *.bds ]]; then 30 | LOG_FILE_NAME=$2 31 | PARAM_START_IDX=3 32 | else 33 | echo "error: [BDS_PARAM] is wrong." 34 | exit 1 35 | fi 36 | 37 | PARAM= 38 | 39 | if [ $(find $LOG_FILE_NAME -mmin -2 2> /dev/null | wc -l) != "0" ]; then 40 | echo "error: log file handle is open or very fresh (modified in past 2 minutes)." 41 | exit 3 42 | fi 43 | 44 | for ((i=$PARAM_START_IDX;i<=$#;i++)); do 45 | PARAM="$PARAM ${!i}" 46 | done 47 | 48 | echo "[HOST] : $(hostname -f)" 49 | echo "[LOG_FILE_NAME] : $LOG_FILE_NAME" 50 | echo "[BDS_PARAM] : $PARAM" 51 | 52 | mkdir -p $(dirname $LOG_FILE_NAME) 53 | 54 | echo "" 55 | echo "===== Created a new screen ====" >> $LOG_FILE_NAME 56 | echo "[DATE] : $(date)" >> $LOG_FILE_NAME 57 | echo "[HOST] : $(hostname -f)" >> $LOG_FILE_NAME 58 | echo "[SCR_NAME] : $SCR_NAME" >> $LOG_FILE_NAME 59 | echo "[BDS_PARAM] : $PARAM" >> $LOG_FILE_NAME 60 | echo "" >> $LOG_FILE_NAME 61 | 62 | screen -Sdm $SCR_NAME bash -c "bds &>>$LOG_FILE_NAME $PARAM $>>$LOG_FILE_NAME" 63 | 64 | -------------------------------------------------------------------------------- /modules/input_tagalign.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == tagalign input definition : 8 | help For replicate '-tag[REP_ID]', For control '-ctl_tag[REP_ID]'. 9 | 10 | 11 | string get_tag( int ctl, int rep ) { 12 | 13 | key := ( ctl > 0 ? 
"ctl_tag" : "tag" ) + "_rep" + rep 14 | key2 := ( ctl > 0 ? "ctl_tagalign" : "tagalign" ) + "_rep" + rep 15 | 16 | key3 := ( ctl > 0 ? "ctl_tag" : "tag" ) + rep 17 | key4 := ( ctl > 0 ? "ctl_tagalign" : "tagalign" ) + rep 18 | 19 | key5 := ( ctl > 0 ? "ctl_tag" : "tag" ) 20 | key6 := ( ctl > 0 ? "ctl_tagalign" : "tagalign" ) 21 | 22 | if ( cmd_line_arg_has_key( key ) ) { 23 | return get_path( get_cmd_line_arg_val( key ) ) 24 | } 25 | else if ( cmd_line_arg_has_key( key2 ) ) { 26 | return get_path( get_cmd_line_arg_val( key2 ) ) 27 | } 28 | else if ( cmd_line_arg_has_key( key3 ) ) { 29 | return get_path( get_cmd_line_arg_val( key3 ) ) 30 | } 31 | else if ( cmd_line_arg_has_key( key4 ) ) { 32 | return get_path( get_cmd_line_arg_val( key4 ) ) 33 | } 34 | else if ( (rep==1) && cmd_line_arg_has_key( key5 ) ) { 35 | return get_path( get_cmd_line_arg_val( key5 ) ) 36 | } 37 | else if ( (rep==1) && cmd_line_arg_has_key( key6 ) ) { 38 | return get_path( get_cmd_line_arg_val( key6 ) ) 39 | } 40 | else if ( conf.hasKey( key ) ) { 41 | return get_path( conf{ key } ) 42 | } 43 | else if ( conf.hasKey( key2 ) ) { 44 | return get_path( conf{ key2 } ) 45 | } 46 | else if ( conf.hasKey( key3 ) ) { 47 | return get_path( conf{ key3 } ) 48 | } 49 | else if ( conf.hasKey( key4 ) ) { 50 | return get_path( conf{ key4 } ) 51 | } 52 | else if ( (rep==1) && conf.hasKey( key5 ) ) { 53 | return get_path( conf{ key5 } ) 54 | } 55 | else if ( (rep==1) && conf.hasKey( key6 ) ) { 56 | return get_path( conf{ key6 } ) 57 | } 58 | return "" 59 | } 60 | 61 | string get_tag( int rep ) { 62 | 63 | return get_tag( 0, rep ) 64 | } 65 | 66 | bool is_input_tag( int ctl, int rep ) { 67 | 68 | return get_tag( ctl, rep ) != "" 69 | } 70 | 71 | bool is_input_tag( int rep ) { 72 | 73 | return is_input_tag( 0, rep ) 74 | } 75 | -------------------------------------------------------------------------------- /modules/callpeak_filter.bds: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == callpeak etc settings 9 | npeak_filt := 500000 help # top peaks filtered from a narrow peak files (default: 500000). 10 | 11 | 12 | init_callpeak_etc() 13 | 14 | 15 | void init_callpeak_etc() { 16 | 17 | npeak_filt = get_conf_val_int( npeak_filt, ["npeak_filt"] ) 18 | 19 | print("\n\n== callpeak etc settings\n") 20 | print( "# of top peaks to pick up in peak files\t: $npeak_filt\n") 21 | } 22 | 23 | // sort in a descending order of p-value and take top $npeak_filt peaks 24 | string filt_top_peaks( string filetype, string peakfile, string o_dir, string group ) { 25 | 26 | prefix := replace_dir( rm_ext( peakfile, \ 27 | ["narrowPeak","gappedPeak","broadPeak","regionPeak"] ), o_dir ) 28 | ext := get_actual_ext( peakfile ) 29 | peakfile_filt := "$prefix."+metric_prefix( npeak_filt )+".$ext" 30 | sort_param := _get_sort_param( filetype ) 31 | 32 | in := [ peakfile ] 33 | out := peakfile_filt 34 | 35 | taskName:= "filt_top_peaks " + group 36 | timeout := 3600 // to get queued fast 37 | system := "local" 38 | 39 | wait_par( cpus ) 40 | 41 | tid := task( out<-in ) { 42 | 43 | sys $shcmd_init 44 | 45 | // sort -grk8 returns non-zero exit code when 8th columns of any line pair are equal 46 | sys set +o pipefail 47 | 48 | // sort by 8th (-log10(pval) ) column and take top $npeak_filt lines 49 | sys zcat $peakfile | sort $sort_param | head -n $npeak_filt | gzip -nc > $peakfile_filt 50 | 51 | sys $shcmd_finalize 52 | } 53 | 54 | register_par( tid, cpus ) 55 | 56 | add_task_to_graph( in, out, group ) 57 | 58 | return out 59 | } 60 | 61 | string _get_sort_param( string filetype ) { 62 | 63 | if ( filetype.toLower() == "narrowpeak" || filetype.toLower() == "regionpeak" || filetype.toLower() == "broadpeak" ) { 64 | // p-value is at 8th column 65 | return "-s -grk8,8" 66 | } 
67 | else if ( filetype.toLower() == "gappedpeak") { 68 | // p-value is at 14th column 69 | return "-s -grk14,14" 70 | } 71 | else { 72 | error("Unsupport peak file type! ($filetype)\n") 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /examples/start_from_peaks.sh: -------------------------------------------------------------------------------- 1 | OUT=/srv/gsfs0/scratch/leepc12/run/DREAM_challenge_hidden/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO/out 2 | 3 | peak1=$OUT/peak/spp/rep1/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS866ZXX.BSREP1.TECHREP1.FILEIDENCFF164ZYB.R1.PE2SE.nodup.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 4 | peak2=$OUT/peak/spp/rep2/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS705BBA.BSREP2.TECHREP1.FILEIDENCFF478QIY.R1.PE2SE.nodup.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 5 | peak_pooled=$OUT/peak/spp/pooled_rep/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS866ZXX.BSREP1.TECHREP1.FILEIDENCFF164ZYB.R1.PE2SE.nodup_pooled.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 6 | peak1_pr1=$OUT/peak/spp/pseudo_reps/rep1/pr1/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS866ZXX.BSREP1.TECHREP1.FILEIDENCFF164ZYB.R1.PE2SE.nodup.pr1.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 7 | peak1_pr2=$OUT/peak/spp/pseudo_reps/rep1/pr2/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS866ZXX.BSREP1.TECHREP1.FILEIDENCFF164ZYB.R1.PE2SE.nodup.pr2.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 8 | peak2_pr1=$OUT/peak/spp/pseudo_reps/rep2/pr1/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS705BBA.BSREP2.TECHREP1.FILEIDENCFF478QIY.R1.PE2SE.nodup.pr1.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 9 | 
peak2_pr2=$OUT/peak/spp/pseudo_reps/rep2/pr2/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS705BBA.BSREP2.TECHREP1.FILEIDENCFF478QIY.R1.PE2SE.nodup.pr2.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 10 | peak_ppr1=$OUT/peak/spp/pooled_pseudo_reps/ppr1/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS866ZXX.BSREP1.TECHREP1.FILEIDENCFF164ZYB.R1.PE2SE.nodup.pr1_pooled.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 11 | peak_ppr2=$OUT/peak/spp/pooled_pseudo_reps/ppr2/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS866ZXX.BSREP1.TECHREP1.FILEIDENCFF164ZYB.R1.PE2SE.nodup.pr2_pooled.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 12 | 13 | bds $CODE/bds_atac/chipseq/chipseq.bds -species hg19 \ 14 | -peak1 $peak1 -peak2 $peak2 -peak_pooled $peak_pooled \ 15 | -peak1_pr1 $peak1_pr1 -peak1_pr2 $peak1_pr2 -peak2_pr1 $peak2_pr1 -peak2_pr2 $peak1_pr2 \ 16 | -peak_ppr1 $peak_ppr1 -peak_ppr2 $peak_ppr2 17 | 18 | -------------------------------------------------------------------------------- /modules/species.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == species settings 8 | species := "" help Species. need to specify '-species_file' too if you have not installed genome database with 'install_genome_data.sh'. 9 | species_file := "" help Species file path. 10 | species_browser := "" help Species name in WashU genome browser. 11 | 12 | ref_fa := "" help Reference genome sequence fasta. 13 | chrsz := "" help Chromosome sizes file path (use fetchChromSizes from UCSC tools). 14 | blacklist := "" help Blacklist bed. 15 | seq_dir := "" help Reference genome sequence directory path (where chr*.fa exist). 
16 | 17 | init_species() 18 | 19 | void init_species() { 20 | 21 | species = get_conf_val( species, ["species"] ) 22 | species_file = get_conf_val( species_file, ["species_file"] ) 23 | 24 | _read_species() 25 | 26 | species_browser = get_conf_val( species_browser,["species_browser"] ) 27 | 28 | ref_fa = get_conf_val( ref_fa, ["ref_fa"] ) 29 | chrsz = get_conf_val( chrsz, ["chrsz"] ) 30 | blacklist = get_conf_val( blacklist, ["blacklist"] ) 31 | seq_dir = get_conf_val( seq_dir, ["seq_dir"]) 32 | 33 | if ( species_browser == "" ) species_browser = species 34 | 35 | print("\n\n== species settings\n") 36 | print( "Species\t\t\t\t: $species\n" ) 37 | print( "Species file\t\t\t: $species_file\n\n" ) 38 | print( "Species name (WashU browser)\t: $species_browser\n" ) 39 | print( "Ref. genome seq. fasta\t\t: $ref_fa\n" ) 40 | print( "Chr. sizes file\t\t\t: $chrsz\n" ) 41 | print( "Black list bed\t\t\t: $blacklist\n" ) 42 | print( "Ref. genome seq. dir.\t\t: $seq_dir\n" ) 43 | } 44 | 45 | void _read_species() { // check for species configruation files 46 | // value for key will be overriden as loop goes. so the last element in species_paths has the priority 47 | string[] species_paths 48 | if ( env != "" ) species_paths.add( env ) 49 | if ( c != "" ) species_paths.add( c ) 50 | species_paths.add( species_file ) 51 | 52 | for ( string path : species_paths ) { 53 | if ( path.exists() ) { 54 | add_to_conf( path, species ) 55 | } 56 | } 57 | } 58 | 59 | 60 | // temp 61 | /* 62 | bwt_idx := "" help Bowtie index (full path prefix of *.1.ebwt file). 63 | bwt_idx = get_conf_val( bwt_idx, ["bwt_idx"] ) 64 | print( "Bowtie index\t\t\t: $bwt_idx\n" ) 65 | */ 66 | -------------------------------------------------------------------------------- /utils/assign_multimappers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # piped script to take multimappers and randomly assign 4 | # requires a qname sorted file!! 
5 | 6 | import sys 7 | import random 8 | import argparse 9 | 10 | def parse_args(): 11 | ''' 12 | Gives options 13 | ''' 14 | parser = argparse.ArgumentParser(description='Saves reads below a alignment threshold and discards all others') 15 | parser.add_argument('-k', help='Alignment number cutoff') 16 | parser.add_argument('--paired-end', dest='paired_ended', action='store_true', help='Data is paired-end') 17 | args = parser.parse_args() 18 | alignment_cutoff = int(args.k) 19 | paired_ended = args.paired_ended 20 | 21 | return alignment_cutoff, paired_ended 22 | 23 | 24 | if __name__ == "__main__": 25 | ''' 26 | Runs the filtering step of choosing multimapped reads 27 | ''' 28 | 29 | [alignment_cutoff, paired_ended] = parse_args() 30 | 31 | if paired_ended: 32 | alignment_cutoff = int(alignment_cutoff) * 2 33 | 34 | # Store each line in sam file as a list of reads, 35 | # where each read is a list of elements to easily 36 | # modify or grab things 37 | current_reads = [] 38 | current_qname = '' 39 | 40 | for line in sys.stdin: 41 | 42 | read_elems = line.strip().split('\t') 43 | 44 | if read_elems[0].startswith('@'): 45 | sys.stdout.write(line) 46 | continue 47 | 48 | # Keep taking lines that have the same qname 49 | if read_elems[0] == current_qname: 50 | # Add line to current reads 51 | current_reads.append(line) 52 | pass 53 | else: 54 | # Discard if there are more than the alignment cutoff 55 | if len(current_reads) >= alignment_cutoff: 56 | current_reads = [line] 57 | current_qname = read_elems[0] 58 | elif len(current_reads) > 0: 59 | # Just output all reads, which are then filtered with 60 | # samtools 61 | for read in current_reads: 62 | sys.stdout.write(str(read)) 63 | 64 | # And then discard 65 | current_reads = [line] 66 | current_qname = read_elems[0] 67 | else: 68 | # First read in file 69 | current_reads.append(line) 70 | current_qname = read_elems[0] 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- 
/modules/input_peak.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == narrow peak input definition : 8 | help For true replicates, use '-peak1' and '-peak2', 9 | help For pooled replicates, use '-peak_pooled', 10 | help For two PR (self-pseudo-replicates), use '-peak[REP_ID]_pr1' and '-peak[REP_ID]_pr2' 11 | help For two PPR (pooled pseudo-replicates), use '-peak_ppr1' and '-peak_ppr2' 12 | 13 | 14 | void chk_input_peak( bool true_rep, bool no_pseudo_rep ) { 15 | 16 | if ( !is_input_peak() ) return // read peaks here 17 | 18 | for ( int rep=0; rep<=get_num_rep_peak(); rep++) { // rep==0 : pooled 19 | if ( get_num_rep_peak() == 1 && rep==0 ) continue // if only one replicate, skip reading pooled rep 20 | 21 | for (int pse=0; pse<=2; pse++) { // pse(pseudo)==0 : true rep, pse==1,2 : self-pseudo rep 1,2 22 | if ( true_rep && pse > 0 ) continue 23 | if ( no_pseudo_rep && rep != 0 && pse > 0 ) continue 24 | 25 | peak_ := get_peak(rep,pse) 26 | suffix1 := rep==0 ? "replicate" : "replicate $rep" 27 | suffix2 := rep==0 ? "pseudo-replicate $pse" : "pseudo-replicate $pse for replicate $rep" 28 | prefix := (rep==0 ? "pooled " : "") + (pse==0 ? suffix1 : suffix2) 29 | 30 | print( "$prefix: \n\t$peak_"+"\n") 31 | if ( !path_exists( peak_ ) ) error("\t\tFile not found!\n") 32 | } 33 | } 34 | } 35 | 36 | string get_peak( int rep, int pse ) { // rep==0 : pooled peak, pse==0 : true replicate 37 | 38 | if ( pse > 2 ) error ("\nget_peak() : pse should not be larger than 2!") 39 | 40 | string key, key2 41 | if ( rep == 0 ) { 42 | key = ( pse == 0 ? "peak_pooled" : ("peak_ppr" + pse) ) 43 | key2 = key 44 | } 45 | else { 46 | key = "peak" + rep + ( pse == 0 ? "" : ("_pr" + pse) ) 47 | key2 = "peak_rep" + rep + ( pse == 0 ? 
"" : ("_pr" + pse) ) 48 | } 49 | 50 | if ( cmd_line_arg_has_key( key ) ) { 51 | return get_path( get_cmd_line_arg_val( key ) ) 52 | } 53 | else if ( cmd_line_arg_has_key( key2 ) ) { 54 | return get_path( get_cmd_line_arg_val( key2 ) ) 55 | } 56 | else if ( conf.hasKey( key ) ) { 57 | return get_path( conf{ key } ) 58 | } 59 | else if ( conf.hasKey( key2 ) ) { 60 | return get_path( conf{ key2 } ) 61 | } 62 | 63 | return "" 64 | } 65 | 66 | bool is_input_peak() { 67 | 68 | return get_peak( 1, 0 ) != "" 69 | } 70 | 71 | int get_num_rep_peak() { 72 | 73 | rep := 1 74 | 75 | while( get_peak( rep, 0 ) != "" ) rep++ 76 | 77 | return rep-1 78 | } 79 | -------------------------------------------------------------------------------- /utils/ucsc_ensGene.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,os 4 | sys.path.append('/home/xzhou/subtleKnife/script/genescript') 5 | import parseUcscgenestruct 6 | 7 | if len(sys.argv)!=3: 8 | print ' knownToEnsembl.txt and kgXref.txt must be under current dir' 9 | sys.exit() 10 | 11 | 12 | aa={} 13 | with open('knownToEnsembl.txt') as fin: 14 | for line in fin: 15 | lst=line.rstrip().split('\t') 16 | aa[lst[0]]=lst[1] 17 | 18 | symbol={} 19 | desc={} 20 | with open('kgXref.txt') as fin: 21 | for line in fin: 22 | lst=line.rstrip().split('\t') 23 | if lst[0] in aa: 24 | ens=aa[lst[0]] 25 | if len(lst[4])>0: 26 | symbol[ens]=lst[4] 27 | if len(lst[7])>0: 28 | desc[ens]=lst[7] 29 | 30 | 31 | ucsc,tkname=sys.argv[1:] 32 | 33 | 34 | 35 | # dump 36 | fout=open(tkname,'w') 37 | fout2=open(tkname+'_load','w') 38 | 39 | id=1 40 | with open(ucsc) as fin: 41 | for line in fin: 42 | lst=line.rstrip().split('\t') 43 | g=parseUcscgenestruct.parse(lst,True) 44 | name=lst[1] 45 | fout.write('{0}\t{1}\t{2}\tname:"{3}",id:{4},strand:"{5}",'.format( 46 | g['chrom'], 47 | g['start'], 48 | g['stop'], 49 | name, 50 | id, 51 | g['strand'])) 52 | id+=1 53 | if 'thin' in g or 
'thick' in g: 54 | fout.write('struct:{') 55 | if 'thin' in g: 56 | fout.write('thin:[') 57 | for x in g['thin']: 58 | fout.write('[{0},{1}],'.format(x[0],x[1])) 59 | fout.write('],') 60 | if 'thick' in g: 61 | fout.write('thick:[') 62 | for x in g['thick']: 63 | fout.write('[{0},{1}],'.format(x[0],x[1])) 64 | fout.write('],') 65 | fout.write('},') 66 | # desc 67 | if name in desc: 68 | fout.write('desc:"'+desc[name]+'",') 69 | if name in symbol: 70 | fout.write('name2:"'+symbol[name]+'"') 71 | fout2.write('{0}\t{1}\t{2}\t{3}\n'.format(g['chrom'],g['start'],g['stop'],symbol[name])) 72 | fout.write('\n') 73 | fout2.write('{0}\t{1}\t{2}\t{3}\n'.format(g['chrom'],g['start'],g['stop'],name)) 74 | 75 | 76 | fout2.close() 77 | fout.close() 78 | 79 | import os 80 | os.system('sort -k1,1 -k2,2n '+tkname+' > x') 81 | os.system('mv x '+tkname) 82 | os.system('bgzip -f '+tkname) 83 | os.system('tabix -f -p bed '+tkname+'.gz') 84 | 85 | print ''' 86 | drop table if exists {0}; 87 | create table {0} ( 88 | chrom varchar(20) not null, 89 | start int unsigned not null, 90 | stop int unsigned not null, 91 | name varchar(100) not null 92 | ) ENGINE=MyISAM DEFAULT CHARSET=latin1; 93 | load data local infile '{0}_load' into table {0}; 94 | create index name on {0} (name); 95 | '''.format(tkname) 96 | 97 | -------------------------------------------------------------------------------- /modules/input_bam.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == bam input (raw or filtered) definition : 8 | help Raw bam : For replicate '-bam[REP_ID]', For control '-ctl_bam[REP_ID]'. 9 | help Filtered bam : For replicate '-filt_bam[REP_ID]', For control '-ctl_filt_bam[REP_ID]'. 10 | 11 | 12 | string get_bam( int ctl, int rep ) { 13 | 14 | key := ( ctl > 0 ? "ctl_bam" : "bam" ) + "_rep" + rep 15 | key2 := ( ctl > 0 ? "ctl_bam" : "bam" ) + rep 16 | key3 := ( ctl > 0 ? 
"ctl_bam" : "bam" ) 17 | 18 | if ( cmd_line_arg_has_key( key ) ) { 19 | return get_path( get_cmd_line_arg_val( key ) ) 20 | } 21 | else if ( cmd_line_arg_has_key( key2 ) ) { 22 | return get_path( get_cmd_line_arg_val( key2 ) ) 23 | } 24 | else if ( (rep==1) && cmd_line_arg_has_key( key3 ) ) { 25 | return get_path( get_cmd_line_arg_val( key3 ) ) 26 | } 27 | else if ( conf.hasKey( key ) ) { 28 | return get_path( conf{ key } ) 29 | } 30 | else if ( conf.hasKey( key2 ) ) { 31 | return get_path( conf{ key2 } ) 32 | } 33 | else if ( (rep==1) && conf.hasKey( key3 ) ) { 34 | return get_path( conf{ key3 } ) 35 | } 36 | return "" 37 | } 38 | 39 | string get_bam( int rep ) { 40 | 41 | return get_bam( 0, rep ) 42 | } 43 | 44 | string get_filt_bam( int ctl, int rep ) { 45 | 46 | key := ( ctl > 0 ? "ctl_filt_bam" : "filt_bam" ) + "_rep" + rep 47 | key2 := ( ctl > 0 ? "ctl_filt_bam" : "filt_bam" ) + rep 48 | key3 := ( ctl > 0 ? "ctl_filt_bam" : "filt_bam" ) 49 | 50 | if ( cmd_line_arg_has_key( key ) ) { 51 | return get_path( get_cmd_line_arg_val( key ) ) 52 | } 53 | else if ( cmd_line_arg_has_key( key2 ) ) { 54 | return get_path( get_cmd_line_arg_val( key2 ) ) 55 | } 56 | else if ( (rep==1) && cmd_line_arg_has_key( key3 ) ) { 57 | return get_path( get_cmd_line_arg_val( key3 ) ) 58 | } 59 | else if ( conf.hasKey( key ) ) { 60 | return get_path( conf{ key } ) 61 | } 62 | else if ( conf.hasKey( key2 ) ) { 63 | return get_path( conf{ key2 } ) 64 | } 65 | else if ( (rep==1) && conf.hasKey( key3 ) ) { 66 | return get_path( conf{ key3 } ) 67 | } 68 | return "" 69 | } 70 | 71 | string get_filt_bam( int rep ) { 72 | 73 | return get_filt_bam( 0, rep ) 74 | } 75 | 76 | bool is_input_bam( int ctl, int rep ) { 77 | 78 | return get_bam( ctl, rep ) != "" 79 | } 80 | 81 | bool is_input_bam( int rep ) { 82 | 83 | return is_input_bam( 0, rep ) 84 | } 85 | 86 | bool is_input_filt_bam( int ctl, int rep ) { 87 | 88 | return get_filt_bam( ctl, rep ) != "" 89 | } 90 | 91 | bool is_input_filt_bam( int 
rep ) { 92 | 93 | return is_input_filt_bam( 0, rep ) 94 | } 95 | -------------------------------------------------------------------------------- /utils/ucsc_simplegene.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,os 4 | sys.path.append('/home/xzhou/subtleKnife/script/genescript') 5 | import parseUcscgenestruct 6 | 7 | if len(sys.argv)!=3: 8 | print ' ' 9 | sys.exit() 10 | 11 | ucsc,tkname=sys.argv[1:] 12 | 13 | 14 | symbol={} 15 | desc={} 16 | i=0 17 | if os.path.exists('refLink.txt'): 18 | ''' 19 | 0 symbol 20 | 1 desc 21 | 2 name 22 | 3 name 23 | ''' 24 | with open('refLink.txt') as fin: 25 | for line in fin: 26 | lst=line.rstrip().split('\t') 27 | if len(lst)<4: continue 28 | w=lst[1].replace('"','') 29 | #w=w.replace("'",'') 30 | desc[lst[2]]=w 31 | desc[lst[3]]=w 32 | symbol[lst[2]]=lst[0] 33 | symbol[lst[3]]=lst[0] 34 | i+=1 35 | print 'refLink: '+str(i) 36 | 37 | 38 | # dump 39 | fout=open(tkname,'w') 40 | fout2=open(tkname+'_load','w') 41 | 42 | id=1 43 | with open(ucsc) as fin: 44 | for line in fin: 45 | lst=line.rstrip().split('\t') 46 | g=parseUcscgenestruct.parse(lst,True) 47 | name=lst[1] 48 | fout.write('{0}\t{1}\t{2}\tname:"{3}",id:{4},strand:"{5}",'.format( 49 | g['chrom'], 50 | g['start'], 51 | g['stop'], 52 | name, 53 | id, 54 | g['strand'])) 55 | id+=1 56 | if 'thin' in g or 'thick' in g: 57 | fout.write('struct:{') 58 | if 'thin' in g: 59 | fout.write('thin:[') 60 | for x in g['thin']: 61 | fout.write('[{0},{1}],'.format(x[0],x[1])) 62 | fout.write('],') 63 | if 'thick' in g: 64 | fout.write('thick:[') 65 | for x in g['thick']: 66 | fout.write('[{0},{1}],'.format(x[0],x[1])) 67 | fout.write('],') 68 | fout.write('},') 69 | # desc 70 | if name in desc: 71 | fout.write('desc:"'+desc[name]+'",') 72 | if name in symbol: 73 | fout.write('name2:"'+symbol[name]+'"') 74 | fout2.write('{0}\t{1}\t{2}\t{3}\n'.format(g['chrom'],g['start'],g['stop'],symbol[name])) 75 | 
fout.write('\n') 76 | fout2.write('{0}\t{1}\t{2}\t{3}\n'.format(g['chrom'],g['start'],g['stop'],name)) 77 | 78 | 79 | fout2.close() 80 | fout.close() 81 | 82 | import os 83 | os.system('sort -k1,1 -k2,2n '+tkname+' > x') 84 | os.system('mv x '+tkname) 85 | os.system('bgzip -f '+tkname) 86 | os.system('tabix -f -p bed '+tkname+'.gz') 87 | 88 | print ''' 89 | drop table if exists {0}; 90 | create table {0} ( 91 | chrom varchar(20) not null, 92 | start int unsigned not null, 93 | stop int unsigned not null, 94 | name varchar(100) not null 95 | ) ENGINE=MyISAM DEFAULT CHARSET=latin1; 96 | load data local infile '{0}_load' into table {0}; 97 | create index name on {0} (name); 98 | '''.format(tkname) 99 | 100 | -------------------------------------------------------------------------------- /default.env: -------------------------------------------------------------------------------- 1 | ## Get hostname with the following command: 2 | ## $ hostname -f 3 | ## 4 | ## Configure an environment per hostname: 5 | ## [hostname1] 6 | ## ... 7 | ## 8 | ## Use the same environment for multiple hostnames: 9 | ## [hostname2, hostname3, ...] 10 | ## ... 11 | ## 12 | ## Using group 13 | ## [hostname1, hostname2, ... : group] 14 | ## [group] 15 | ## ... 16 | ## 17 | ## Using an asterisk in hostnames (IMPORTANT: only one * is allowed in hostnames) 18 | ## 19 | ## [host*name1] 20 | ## 21 | ## [*hostname2, hostname3*] 22 | 23 | # Stanford Kundaje group clusters (out of SGE) 24 | [vayu, mitra, durga] 25 | conda_env = aquas_chipseq 26 | conda_env_py3 = aquas_chipseq_py3 27 | conda_bin_dir = /software/miniconda3/bin 28 | species_file = $script_dir/species/kundaje.conf 29 | unlimited_mem_wt= true # unlimited max. 
memory and walltime on Kundaje clusters 30 | nice = 10 31 | nth = 4 32 | 33 | # Stanford Kundaje group clusters (controlled with SGE) 34 | [nandi, kali, amold, wotan, kadru, surya, indra, brahma] 35 | conda_env = aquas_chipseq 36 | conda_env_py3 = aquas_chipseq_py3 37 | conda_bin_dir = /software/miniconda3/bin 38 | species_file = $script_dir/species/kundaje.conf 39 | unlimited_mem_wt= true # unlimited max. memory and walltime on Kundaje clusters 40 | system = sge # force to use SGE (Sun Grid Engine) 41 | nice = 20 42 | nth = 4 43 | 44 | # Stanford NEW SCG 45 | [*.scg.stanford.edu, dper730xd*, hppsl230s*, dper910*, sgiuv*, sgisummit*, smsx10srw*] 46 | conda_env = aquas_chipseq 47 | conda_env_py3 = aquas_chipseq_py3 48 | species_file = $script_dir/species/scg.conf 49 | nth = 4 # number of threads for each pipeline 50 | system = slurm # force to use SLURM SCG 51 | q_for_slurm_account = true # use --account instead of -p (partition) 52 | cluster_task_delay = 10 # for NFS delayed write 53 | 54 | # Stanford OLD SCG : login node, computing nodes, file transfer servers 55 | [scg*.stanford.edu, scg*.local, carmack.stanford.edu, crick.stanford.edu] 56 | conda_env = aquas_chipseq 57 | conda_env_py3 = aquas_chipseq_py3 58 | species_file = $script_dir/species/scg.conf 59 | nth = 8 # number of threads for each pipeline run 60 | wt_spp = 72h # walltime for spp 61 | system = sge # force to use SGE (Sun Grid Engine) on SCG3/4 even though a user doesn't explicitly specify SGE on command line with 'bds -s sge chipseq.bds ...' 
62 | cluster_task_delay = 10 63 | 64 | # Stanford Sherlock clusters 65 | [sherlock*.stanford.edu, sh-*.local, sh-*.int, sh-ln*.stanford.edu] 66 | conda_env = aquas_chipseq 67 | conda_env_py3 = aquas_chipseq_py3 68 | species_file = $script_dir/species/sherlock.conf 69 | nth = 8 # number of threads for each pipeline run 70 | wt_spp = 47h # walltime for spp 71 | system = slurm # force to use SLURM 72 | cluster_task_delay = 30 73 | 74 | 75 | # default (if no section with hostname is found) 76 | [default] 77 | conda_env = aquas_chipseq 78 | conda_env_py3 = aquas_chipseq_py3 79 | species_file = # use your own species file here. (DEF_SPECIES_FILE: DO NOT REMOVE THIS COMMENT!) 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /example_conf_full.json: -------------------------------------------------------------------------------- 1 | { 2 | "screen" : "", 3 | "dry_run" : false, 4 | "type" : "TF", 5 | "final_stage" : "idr", 6 | "out_dir" : "out", 7 | "title" : "", 8 | "input_endedness" : { 9 | "se" : false, 10 | "pe" : false 11 | }, 12 | "input_files" : { 13 | }, 14 | "species" : { 15 | "species" : "", 16 | "species_file" : "", 17 | "species_browser" : "", 18 | "ref_fa" : "", 19 | "chrsz" : "", 20 | "blacklist" : "", 21 | "gensz" : "" 22 | }, 23 | "cluster" : { 24 | "system" : "local", 25 | "nice" : 0, 26 | "retrial" : 0, 27 | "q" : "" 28 | }, 29 | "resource" : { 30 | "nth" : 8, 31 | "no_par" : false, 32 | "wt" : "5h50m", 33 | "memory" : "7G", 34 | "unlimited_mem_wt" : false, 35 | "wt_dedup" : "23h", 36 | "mem_dedup" : "12G", 37 | "mem_shuf" : "12G", 38 | "wt_bwa" : "47h", 39 | "mem_bwa" : "12G", 40 | "wt_macs2" : "23h", 41 | "mem_macs2" : "15G", 42 | "wt_spp" : "47h", 43 | "mem_spp" : "12G" 44 | }, 45 | "alignment" : { 46 | "aligner" : "bwa", 47 | "bwa" : { 48 | "param_bwa_aln" : "-q 5 -l 32 -k 2", 49 | "bwa_idx" : "" 50 | }, 51 | "filter" : { 52 | "dup_marker" : "picard", 53 | "anon_filt_bam" : false, 54 | "mapq_thresh" : 
30, 55 | "rm_chr_from_tag" : "", 56 | "no_dup_removal" : false 57 | }, 58 | "subsample" : { 59 | "subsample_chip" : "0", 60 | "subsample_ctl" : "0" 61 | } 62 | }, 63 | "cross_corr_analysis" : { 64 | "no_xcor" : false, 65 | "subsample_xcor" : "15M", 66 | "speak_xcor" : -1, 67 | "extra_param_xcor" : "" 68 | }, 69 | "callpeak" : { 70 | "peak_caller" : "spp", 71 | "ctl_depth_ratio" : 1.2, 72 | "use_pooled_ctl" : false, 73 | "true_rep" : false, 74 | "no_pseudo_rep" : false, 75 | "spp" : { 76 | "cap_num_peak_spp" : 300000, 77 | "max_ppsize_spp" : "", 78 | "speak_spp" : -1, 79 | "extra_param_spp" : "" 80 | }, 81 | "macs2" : { 82 | "pval_thresh_macs2" : 0.01, 83 | "keep_dup_macs2" : "all", 84 | "extsize_macs2" : -1, 85 | "shift_macs2" : 0, 86 | "extra_param_macs2" : "" 87 | }, 88 | "idr" : { 89 | "idr_suffix" : false, 90 | "idr_rank" : "", 91 | "idr_thresh" : 0.05 92 | }, 93 | "naive_overlap" : { 94 | "nonamecheck" : false 95 | } 96 | }, 97 | "signal_track" : { 98 | "sig_trk_for_pooled_rep_only" : false 99 | }, 100 | "bds_configuration" : { 101 | "env" : "$script_dir/default.env" 102 | }, 103 | "visualization" : { 104 | "url_base" : "" 105 | }, 106 | "ENCODE_accession" : { 107 | "ENCODE_accession" : "", 108 | "ENCODE_award_rfa" : "", 109 | "ENCODE_assay_category" : "", 110 | "ENCODE_assay_title" : "", 111 | "ENCODE_award" : "", 112 | "ENCODE_lab" : "", 113 | "ENCODE_assembly" : "", 114 | "ENCODE_alias_prefix" : "KLAB_PIPELINE" 115 | }, 116 | "shell_environment" : { 117 | "conda" : { 118 | "conda_env" : "", 119 | "conda_env_py3" : "", 120 | "conda_bin_dir" : "" 121 | }, 122 | "modules" : { 123 | "mod" : "", 124 | "shcmd" : "", 125 | "addpath" : "" 126 | } 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /modules/input_fastq.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == fastq input definition 
: 8 | help Single-ended : For replicate '-fastq[REP_ID]', For control '-ctl_fastq[REP_ID]' 9 | help Paired end : For replicate '-fastq[REP_ID]_[PAIR_ID]', For control '-ctl_fastq[REP_ID]_[PAIR_ID]' 10 | 11 | 12 | 13 | string[] get_fastqs( int ctl, int rep ) { // if paired-end return [PE1, PE2], elseif single-end else return [PE1], else [] 14 | 15 | string[] ret 16 | for ( int pe=1; pe<=2; pe++ ) { 17 | ret += get_fastq( ctl, rep, pe ) 18 | } 19 | 20 | return ret 21 | } 22 | 23 | string[] get_fastqs( int rep ) { // if paired-end return [PE1, PE2], elseif single-end else return [PE1], else [] 24 | 25 | return get_fastqs( 0, rep ) 26 | } 27 | 28 | string[] get_fastq( int ctl, int rep, int p ) { 29 | 30 | // allow up to 10 fastqs to be pooled (i.e. fastq1 fastq1:2 fastq1:3, ...) 31 | string[] suffix 32 | suffix.add("") 33 | for ( int i=1; i<=99; i++ ) { 34 | suffix.add(":$i") 35 | } 36 | 37 | string[] result 38 | for ( int i=0; i 0 ? "ctl_fastq" : "fastq" ) + "_rep" + rep 40 | key := key_wo_p + "_p" + p + suffix[i] 41 | key_wo_p += suffix[i] 42 | 43 | key_wo_p2 := ( ctl > 0 ? "ctl_fastq" : "fastq" ) + rep 44 | key2 := key_wo_p2 + "_" + p + suffix[i] 45 | key_wo_p2 += suffix[i] 46 | 47 | key_wo_p3 := ( ctl > 0 ? 
"ctl_fastq" : "fastq" ) 48 | key3 := key_wo_p3 + "_" + p + suffix[i] 49 | key_wo_p3 += suffix[i] 50 | 51 | if ( (p==1) && cmd_line_arg_has_key( key_wo_p ) ) { 52 | result.add( get_path( get_cmd_line_arg_val( key_wo_p ) ) ) 53 | } 54 | else if ( (p==1) && cmd_line_arg_has_key( key_wo_p2 ) ) { 55 | result.add( get_path( get_cmd_line_arg_val( key_wo_p2 ) ) ) 56 | } 57 | else if ( (p==1) && (rep==1) && cmd_line_arg_has_key( key_wo_p3 ) ) { 58 | result.add( get_path( get_cmd_line_arg_val( key_wo_p3 ) ) ) 59 | } 60 | else if ( cmd_line_arg_has_key( key ) ) { 61 | result.add( get_path( get_cmd_line_arg_val( key ) ) ) 62 | } 63 | else if ( cmd_line_arg_has_key( key2 ) ) { 64 | result.add( get_path( get_cmd_line_arg_val( key2 ) ) ) 65 | } 66 | else if ( (rep==1) && cmd_line_arg_has_key( key3 ) ) { 67 | result.add( get_path( get_cmd_line_arg_val( key3 ) ) ) 68 | } 69 | else if ( (p==1) && conf.hasKey( key_wo_p ) ) { 70 | result.add( get_path( conf{ key_wo_p } ) ) 71 | } 72 | else if ( (p==1) && conf.hasKey( key_wo_p2 ) ) { 73 | result.add( get_path( conf{ key_wo_p2 } ) ) 74 | } 75 | else if ( (p==1) && (rep==1) && conf.hasKey( key_wo_p3 ) ) { 76 | result.add( get_path( conf{ key_wo_p3 } ) ) 77 | } 78 | else if ( conf.hasKey( key ) ) { 79 | result.add( get_path( conf{ key } ) ) 80 | } 81 | else if ( conf.hasKey( key2 ) ) { 82 | result.add( get_path( conf{ key2 } ) ) 83 | } 84 | else if ( (rep==1) && conf.hasKey( key3 ) ) { 85 | result.add( get_path( conf{ key3 } ) ) 86 | } 87 | } 88 | 89 | return result 90 | } 91 | 92 | string[] get_fastq( int rep, int p ) { 93 | 94 | return get_fastq( 0, rep, p ) 95 | } 96 | 97 | bool is_input_fastq( int ctl, int rep ) { 98 | 99 | fastqs := get_fastqs( ctl, rep ) 100 | if ( fastqs.size() > 0 ) return true 101 | return false 102 | } 103 | 104 | bool is_input_fastq( int rep ) { 105 | 106 | return is_input_fastq( 0, rep ) 107 | } 108 | -------------------------------------------------------------------------------- /modules/cluster.bds: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == cluster/system/resource settings 8 | wt := "5h50m" help Walltime for all single-threaded tasks (example: 8:10:00, 3h, 3600, default: 5h50m, 5:50:00). 9 | memory := "7G" help Maximum memory for all single-threaded tasks (equivalent to '-mem', example: 4.5G, 1024M, default: 7G). 10 | use_system := "local" help Force to use a system (equivalent to 'bds -s [SYSTEM_NAME] ...', any system defined in bds.config can be used). 11 | nice := 0 help Set process priority for all tasks (default: 0; -20 (highest) ~ 19 (lowest) ). 12 | retrial := 0 help # of Retrial for failed tasks (default: 0). 13 | q := "" help Submit tasks to a specified cluster queue. 14 | q_for_slurm_account := false help Use --account instead of -p (partition) for SLURM only. 15 | unlimited_mem_wt:= false help Use unlimited max. memory and walltime. 16 | java_tmp_dir := "\${TMPDIR}" help Java temporary directory. (change it when you get 'Disk quota exceeded' error in Java, default: ${TMPDIR}). 
17 | 18 | init_cluster() 19 | 20 | 21 | void init_cluster() { 22 | wt = get_conf_val( wt, ["wt"] ) 23 | memory = get_conf_val( memory, ["memory","mem"] ) 24 | use_system = get_conf_val( use_system, ["use_system","system"] ) 25 | nice = get_conf_val_int( nice, ["nice"] ) 26 | retrial = get_conf_val_int( retrial, ["retrial","retry"] ) 27 | q = get_conf_val( q, ["q"] ) 28 | unlimited_mem_wt= get_conf_val_bool( unlimited_mem_wt, ["unlimited_mem_wt"] ) 29 | q_for_slurm_account= get_conf_val_bool( q_for_slurm_account, ["q_for_slurm_account"] ) 30 | java_tmp_dir = get_conf_val( java_tmp_dir, ["java_tmp_dir"] ) 31 | 32 | if ( cmd_line_arg_has_key("mem") ) memory = get_cmd_line_arg_val( "mem" ) 33 | if ( cmd_line_arg_has_key("system") ) use_system = get_cmd_line_arg_val( "system" ) 34 | if ( nice <= -20 ) nice = -20 35 | if ( nice > 19 ) nice = 19 36 | if ( use_system != "" ) system = use_system.toLower() 37 | if ( system == "slurm" || system == "generic" ) { // for new SCG, which uses --account instead of -p (partition) 38 | system = "generic" 39 | if ( q != "" ) { 40 | if ( q_for_slurm_account ) { 41 | queue = "--account $q" 42 | } 43 | else { 44 | queue = "-p $q" 45 | } 46 | } 47 | } 48 | else if ( q != "" ) { 49 | queue = q 50 | } 51 | 52 | // cpus, mem and timeout are pre-declared BDS variables for default resource settings 53 | mem = get_res_mem(memory,1) 54 | timeout = get_res_wt(wt) 55 | retry = retrial 56 | 57 | // do not modify this (BDS timeout; how long BDS will wait for tasks to be queued on the cluster) 58 | walltimeout = 3600*24*100 // timeout var. in BigDataScript (100 days, jobs will never be stopped by BDS due to BDS timeout) 59 | 60 | print("\n\n== cluster/system info\n") 61 | print( "Walltime (general)\t\t: $wt\n" ) 62 | print( "Max. 
memory (general)\t\t: $memory\n" ) 63 | print( "Force to use a system\t\t: $use_system\n" ) 64 | print( "Process priority (niceness)\t: $nice\n" ) 65 | print( "Retiral for failed tasks\t: $retrial\n" ) 66 | print( "Submit tasks to a cluster queue\t: $q\n" ) 67 | print( "Unlimited cluster mem./walltime\t: $unlimited_mem_wt\n") 68 | print( "Use --acount instead of SLURM partition\t\t: $q_for_slurm_account\n") 69 | print( "Java temporary directory\t\t: $java_tmp_dir\n") 70 | } 71 | 72 | int get_res_wt( string str ) { 73 | return (unlimited_mem_wt || is_system_local() ) ? -1 : parse_time( str ) 74 | } 75 | 76 | int get_res_mem( string str, int n ) { 77 | if ( n < 1 ) n = 1 78 | return (unlimited_mem_wt || is_system_local() ) ? -1 : parse_mem( str )/n 79 | } 80 | 81 | int get_res_mem( string str ) { 82 | return get_res_mem( str , 1 ) 83 | } 84 | 85 | bool is_system_sge() { 86 | return system == "sge" 87 | } 88 | 89 | bool is_system_local() { 90 | return system == "local" 91 | } 92 | 93 | bool is_system_generic() { 94 | return system == "generic" 95 | } 96 | 97 | bool is_system_slurm() { 98 | // slurm uses generic cluster, it's configured in bds.config and ./utils/clusterGeneral 99 | return system == "generic" 100 | } 101 | -------------------------------------------------------------------------------- /modules/callpeak_gem.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == callpeak gem settings 9 | npeak_gem := 300000 help Threshold on # of peaks for GEM (default: 300000). 10 | k_min_gem := 6 help Minimum length of k-mers (--k_min in GEM, default: 6). 11 | k_max_gem := 13 help Maximum length of k-mers (--k_max in GEM, default: 13). 12 | q_val_thresh_gem:= 0.0 help Q-value threshold (--q in GEM, default: 0). 
13 | read_dist_gem := "$script_dir/etc/Read_Distribution_default.txt" help Read distribution txt file for GEM (default: $script_dir/etc/Read_Distribution_default.txt). 14 | extra_param_gem := "" help Extra parameters for GEM. 15 | wt_gem := "47h" help Walltime for GEM (default: 47h, 47:00:00). 16 | mem_gem := "15G" help Max. memory for GEM (default: 15G). 17 | 18 | grp_color_gem := "skyblue" 19 | 20 | 21 | init_callpeak_gem() 22 | 23 | 24 | void init_callpeak_gem() { 25 | 26 | npeak_gem = get_conf_val_int( npeak_gem, ["npeak_gem"] ) 27 | k_min_gem = get_conf_val_int( k_min_gem, ["k_min_gem"] ) 28 | k_max_gem = get_conf_val_int( k_max_gem, ["k_max_gem"] ) 29 | q_val_thresh_gem= get_conf_val_real( q_val_thresh_gem, ["q_val_thresh_gem"] ) 30 | read_dist_gem = get_conf_val( read_dist_gem, ["read_dist_gem"] ) 31 | extra_param_gem = get_conf_val( extra_param_gem, ["extra_param_gem"] ) 32 | wt_gem = get_conf_val( wt_gem, ["walltime_gem", "wt_gem", "timeout_gem"] ) 33 | mem_gem = get_conf_val( mem_gem, ["memory_gem", "mem_gem"] ) 34 | 35 | print("\n\n== callpeak gem settings\n") 36 | print( "Threshold for # peak in GEM\t\t: $npeak_gem\n") 37 | print( "Min. length of k-mers in GEM\t\t: $k_min_gem\n") 38 | print( "Max. length of k-mers in GEM\t\t: $k_max_gem\n") 39 | print( "Q-value threshold for GEM\t\t: $q_val_thresh_gem\n") 40 | print( "Read distribution txt for GEM\t\t: $read_dist_gem\n") 41 | print( "Extra parameters for GEM\t:$extra_param_gem\n") 42 | print( "Walltime (GEM)\t\t\t: $wt_gem\n") 43 | print( "Max. memory (GEM)\t\t: $mem_gem\n") 44 | } 45 | 46 | void chk_callpeak_gem() { 47 | if ( !path_exists( "$seq_dir/chr1.fa") && !path_exists( "$seq_dir/chr1.fasta") ) \ 48 | error("\nReference genome sequence directory doesn't exists! (file: $seq_dir/chr1.fa)\n") 49 | } 50 | 51 | string[] gem( string tag, string ctl_tag, string o_dir, string group, int nth_gem ) { 52 | prefix := ctl_tag ? 
("$o_dir/" + make_x_basename_wo_gz( tag, ctl_tag, "" ) ) \ 53 | : replace_dir( rm_ext( tag, "tagAlign" ), o_dir ) 54 | tag_tmp := replace_dir( rm_ext( tag, "tagAlign" ), o_dir ) + ".tmp.bed" 55 | ctl_tag_tmp := replace_dir( rm_ext( ctl_tag, "tagAlign" ), o_dir ) + ".tmp.bed" 56 | npeakfile := "$prefix.narrowPeak.gz" 57 | npeakfile_tmp := "$prefix/"+prefix.baseName()+".GEM_events.narrowPeak" 58 | ctl_cmd := ctl_tag ? "zcat $ctl_tag > $ctl_tag_tmp" : "echo" 59 | ctl_param := ctl_tag ? "--ctrl $ctl_tag_tmp" : "" 60 | 61 | in := [ tag, ctl_tag ] 62 | out := [ npeakfile ] 63 | 64 | max_java_heap := binary_prefix( (mem==-1) ? parse_mem( mem_gem ) : (mem*4)/5 ) 65 | taskName:= "gem " + group 66 | cpus := (nth_gem==1) ? -1 : nth_gem; mem := get_res_mem(mem_gem,nth_gem); timeout := get_res_wt(wt_gem) 67 | 68 | wait_par( cpus ) 69 | 70 | tid := task( out<-in ) { 71 | 72 | sys $shcmd_init_py3 73 | 74 | sys zcat $tag > $tag_tmp 75 | sys $ctl_cmd 76 | // # ============================= 77 | // # See http://wiki.encodedcc.org/index.php/GPS/GEM of additional information 78 | // # ============================= 79 | sys export _JAVA_OPTIONS="-Xms256M -Xmx$max_java_heap -XX:ParallelGCThreads=1" 80 | 81 | // removed --s 2400000000 since, can guess from chrsz 82 | sys java -jar $(which gem.jar) --g $chrsz --d $read_dist_gem \ 83 | --expt $tag_tmp $ctl_param --f BED --out $prefix \ 84 | --genome $seq_dir --k_min $k_min_gem --k_max $k_max_gem --outNP \ 85 | --t $nth_gem --q $q_val_thresh_gem $extra_param_gem 86 | 87 | // # ============================= 88 | // # Sort peaks by signal value and truncate peaks to top 300K 89 | // # ============================= 90 | sys sort -k7nr,7nr $npeakfile_tmp | head -n $npeak_gem | gzip -nc > $npeakfile 91 | sys rm -f $tag_tmp $ctl_tag_tmp 92 | 93 | sys $shcmd_finalize 94 | } 95 | 96 | register_par( tid, cpus ) 97 | 98 | add_task_to_graph( in, out, group, "GEM", grp_color_gem ) 99 | 100 | return out 101 | } 102 | 
-------------------------------------------------------------------------------- /modules/parallel.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == parallelization settings 8 | no_par := false help Serialize all tasks (individual tasks can still use multiple threads up to '-nth'). 9 | nth := 8 help Maximum # threads for a pipeline. (default: 8). 10 | 11 | string[] _tids_all // array of task ids currently running 12 | int{} _nth_tasks // key: task id, value: # of threads for the task 13 | 14 | 15 | init_parallel() 16 | 17 | 18 | void init_parallel() { 19 | no_par = get_conf_val_bool( no_par, ["no_par"] ) 20 | nth = get_conf_val_int( nth, ["nth"] ) 21 | 22 | if ( nth > 32 ) error("Maximum # threads (-nth) for a pipeline should not exceed 32!") 23 | if ( nth <= 1 ) { 24 | print("\nWarning: Maximum # threads (-nth) for a pipeline is <= 1. Turning off parallelization... 
(-no_par)") 25 | nth = 1 26 | no_par = true 27 | } 28 | 29 | // pre-declared BDS variable 30 | cpus = -1 // With cpus==-1, BDS does not pass number of threads to cluster engine (SGE, SLURM, ...), which means single-threaded 31 | 32 | print("\n\n== parallelization info\n") 33 | print( "No parallel jobs\t\t: $no_par\n" ) 34 | print( "Maximum # threads \t\t: $nth\n" ) 35 | } 36 | 37 | void wait_par( int nth_task ) { 38 | if ( nth_task < 1 ) nth_task = 1 39 | 40 | while ( true ) { 41 | sleep( rand()*1.0 + 0.5 ) 42 | _tids_all_ := _tids_all // make dummy array for thread safety 43 | 44 | string[] tids_running 45 | int nth_running 46 | for ( string tid : _tids_all_ ) { // get total # threads for currently running tasks, and find the oldest task 47 | if ( !tid.isDone() ) { 48 | tids_running.add( tid ) 49 | nth_running = nth_running + _nth_tasks{tid} 50 | } 51 | } 52 | 53 | if ( tids_running.size() == 0 ) { 54 | break 55 | } 56 | else if ( no_par || (nth_running+nth_task) > nth ) { 57 | loop_cnt := 0 58 | while( true ) { // wait until one of running tasks finishes 59 | break_loop := false 60 | for ( string tid : tids_running ) { 61 | if ( tid.isDone() ) { 62 | break_loop = true 63 | break 64 | } 65 | } 66 | if ( break_loop ) break 67 | sleep( rand() + 0.5 ) 68 | } 69 | sleep( rand()*1.0 + 0.5 ) 70 | } 71 | else { 72 | break 73 | } 74 | } 75 | } 76 | 77 | void register_par( string tid, int nth_task ) { 78 | if ( nth_task < 1 ) nth_task = 1 79 | if ( tid == "" ) return 80 | 81 | _tids_all.add(tid) 82 | _nth_tasks{tid} = nth_task 83 | } 84 | 85 | int{} distribute_nonzero( int n, int{} weight ) { // distribute integer n according to weight 86 | int{} ret 87 | 88 | int sum 89 | for ( int w : weight ) sum += w 90 | if ( sum == 0 ) error("distribute_nth: sum is zero. 
check if input file size is 0?\n") 91 | for ( string key : weight.keys() ) { 92 | w := weight{key} 93 | ret{key} = (n*w)/sum 94 | 95 | if ( ret{key} == 0 ) ret{key} = 1 96 | } 97 | 98 | while( true ) { 99 | int sum2 100 | for ( string key : weight.keys() ) sum2 += ret{key} 101 | if ( n > sum2 ) { 102 | string key_to_plus 103 | int max_diff = 0 104 | for ( string key : weight.keys() ) { 105 | diff := n*weight{key}-ret{key}*sum 106 | if ( diff > max_diff ) { 107 | key_to_plus = key 108 | max_diff = diff 109 | } 110 | } 111 | ret{key_to_plus}++ 112 | } 113 | else { 114 | break 115 | } 116 | } 117 | 118 | print("Distributing $n to ... \n") 119 | print(ret) 120 | print("\n") 121 | return ret 122 | } 123 | 124 | int[] distribute_nonzero( int n, int[] weight ) { // distribute integer n according to weight 125 | int[] ret 126 | 127 | int sum 128 | for ( int w : weight ) sum += w 129 | if ( sum == 0 ) error("distribute_nth: sum is zero. check if input file size is 0?\n") 130 | for ( int i=0; i sum2 ) { 140 | int id_to_plus 141 | int max_diff = 0 142 | for ( int i=0; i max_diff ) { 145 | id_to_plus = i 146 | max_diff = diff 147 | } 148 | } 149 | ret[id_to_plus]++ 150 | } 151 | else { 152 | break 153 | } 154 | } 155 | 156 | print("Distributing $n to ... 
\n") 157 | print(ret) 158 | print("\n") 159 | return ret 160 | } 161 | -------------------------------------------------------------------------------- /install_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Stop on error 3 | set -e 4 | 5 | ## conda environment name 6 | 7 | ENV_NAME=aquas_chipseq 8 | ENV_NAME_PY3=aquas_chipseq_py3 9 | 10 | INSTALL_GEM=1 11 | INSTALL_PEAKSEQ=1 12 | 13 | ## install packages from official channels (bioconda and r) 14 | 15 | conda create -n ${ENV_NAME} --file requirements.txt -y -c defaults -c bioconda -c r -c bcbio -c daler -c asmeurer 16 | conda create -n ${ENV_NAME_PY3} --file requirements_py3.txt -y -c defaults -c bioconda -c r -c bcbio -c daler -c asmeurer 17 | 18 | ### bash function definition 19 | 20 | function add_to_activate { 21 | if [[ ! -f $CONDA_INIT ]]; then 22 | echo > $CONDA_INIT 23 | fi 24 | for i in "${CONTENTS[@]}"; do 25 | if [[ $(grep "$i" "$CONDA_INIT" | wc -l ) == 0 ]]; then 26 | echo $i >> "$CONDA_INIT" 27 | fi 28 | done 29 | } 30 | 31 | ## install useful tools for BigDataScript 32 | 33 | mkdir -p $HOME/.bds 34 | cp -f ./utils/bds_scr ./utils/bds_scr_5min ./utils/kill_scr bds.config $HOME/.bds/ 35 | cp -rf ./utils/clusterGeneric/ $HOME/.bds/ 36 | 37 | ## install additional packages 38 | 39 | source activate ${ENV_NAME} 40 | 41 | conda uninstall graphviz -y # graphviz in bioconda has segmentation fault bug 42 | conda install graphviz -c anaconda -y 43 | 44 | conda install ucsc-bedgraphtobigwig -c bioconda -y 45 | conda install ucsc-bedtobigbed -c bioconda -y 46 | 47 | #CONDA_BIN=$(dirname $(which activate))/../envs/${ENV_NAME}/bin 48 | #CONDA_BIN=$(dirname $(which activate)) 49 | CONDA_BIN=$(dirname $(which bedtools)) 50 | CONDA_EXTRA="$CONDA_BIN/../extra" 51 | CONDA_ACTIVATE_D="$CONDA_BIN/../etc/conda/activate.d" 52 | CONDA_INIT="$CONDA_ACTIVATE_D/init.sh" 53 | CONDA_LIB="$CONDA_BIN/../lib" 54 | if [[ $(find $CONDA_LIB -name '*egg-info*' 
-not -perm -o+r | wc -l ) > 0 ]]; then 55 | find $CONDA_LIB -name '*egg-info*' -not -perm -o+r -exec dirname {} \; | xargs chmod o+r -R 56 | fi 57 | 58 | mkdir -p $CONDA_EXTRA $CONDA_ACTIVATE_D 59 | 60 | ### install Anshul's phantompeakqualtool 61 | echo $CONDA_EXTRA 62 | cd $CONDA_EXTRA 63 | git clone https://github.com/kundajelab/phantompeakqualtools 64 | chmod 755 -R phantompeakqualtools 65 | CONTENTS=("export PATH=$CONDA_EXTRA/phantompeakqualtools:\$PATH") 66 | add_to_activate 67 | 68 | ### disable locally installed python package lookup 69 | CONTENTS=("export PYTHONNOUSERSITE=True") 70 | add_to_activate 71 | #CONTENTS=("export PYTHONPATH=$CONDA_LIB/python2.7/site-packages:\$PYTHONPATH") 72 | #add_to_activate 73 | 74 | ### decompress MACS2 python egg 75 | #cd $CONDA_LIB/python2.7/site-packages 76 | #unzip -o MACS2-2.1.1.20160309-py2.7-linux-x86_64.egg 77 | 78 | # install PeakSeq 79 | if [[ ${INSTALL_PEAKSEQ} == 1 ]]; then 80 | cd $CONDA_EXTRA 81 | wget http://archive.gersteinlab.org/proj/PeakSeq/Scoring_ChIPSeq/Code/C/PeakSeq_1.31.zip -N --no-check-certificate 82 | unzip PeakSeq_1.31.zip 83 | rm -f PeakSeq_1.31.zip 84 | cd PeakSeq 85 | make 86 | chmod 755 bin/PeakSeq 87 | cd $CONDA_BIN 88 | ln -s $CONDA_EXTRA/PeakSeq/bin/PeakSeq 89 | fi 90 | 91 | source deactivate 92 | 93 | 94 | source activate ${ENV_NAME_PY3} 95 | 96 | # CONDA_BIN=$(dirname $(which activate))/../envs/${ENV_NAME_PY3}/bin 97 | #CONDA_BIN=$(dirname $(which activate)) 98 | CONDA_BIN=$(dirname $(which bedtools)) 99 | CONDA_EXTRA="$CONDA_BIN/../extra" 100 | CONDA_ACTIVATE_D="$CONDA_BIN/../etc/conda/activate.d" 101 | CONDA_INIT="$CONDA_ACTIVATE_D/init.sh" 102 | CONDA_LIB="$CONDA_BIN/../lib" 103 | if [[ $(find $CONDA_LIB -name '*egg-info*' -not -perm -o+r | wc -l ) > 0 ]]; then 104 | find $CONDA_LIB -name '*egg-info*' -not -perm -o+r -exec dirname {} \; | xargs chmod o+r -R 105 | fi 106 | 107 | mkdir -p $CONDA_EXTRA $CONDA_ACTIVATE_D 108 | 109 | ### uninstall IDR 2.0.4 and install the latest one 110 | 
conda uninstall idr -y 111 | cd $CONDA_EXTRA 112 | git clone --branch 2.0.4.2 git://github.com/kundajelab/idr 113 | cd idr 114 | python3 setup.py install 115 | cd $CONDA_EXTRA 116 | rm -rf idr 117 | 118 | ### disable locally installed python package lookup 119 | CONTENTS=("export PYTHONNOUSERSITE=True") 120 | add_to_activate 121 | CONTENTS=("export PYTHONPATH=$CONDA_LIB/python3.5/site-packages:\$PYTHONPATH") 122 | add_to_activate 123 | 124 | # install GEM 125 | if [[ ${INSTALL_GEM} == 1 ]]; then 126 | cd $CONDA_EXTRA 127 | wget http://groups.csail.mit.edu/cgs/gem/download/gem.v3.0.tar.gz -N --no-check-certificate 128 | tar zxvf gem.v3.0.tar.gz 129 | rm -f gem.v3.0.tar.gz 130 | cd gem 131 | chmod 755 gem.jar 132 | cd $CONDA_BIN 133 | ln -s $CONDA_EXTRA/gem/gem.jar 134 | fi 135 | 136 | source deactivate 137 | 138 | 139 | echo == Installing dependencies has been successfully done. == 140 | -------------------------------------------------------------------------------- /modules/callpeak_spp.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == callpeak spp settings 9 | cap_num_peak_spp := "300K" help Cap number of peaks (-npeak= in run_spp.R) (default: 300000). 10 | max_ppsize_spp := "" help R stack size (R parameter --max-ppsize=; between 5000 and 5000000) for SPP. 11 | speak_spp := -1 help User-defined cross-corr. peak strandshift (-speak= in run_spp.R). Use -1 to get from upstream cross-corr. analysis (default: -1). 12 | extra_param_spp := "" help Extra parameters for SPP (run_spp.R, peak calling only). 13 | wt_spp := "47h" help Walltime for spp (default: 47h, 47:00:00). 14 | mem_spp := "12G" help Max. memory for spp (default: 12G). 
15 | 16 | 17 | grp_color_spp := "skyblue" 18 | 19 | 20 | init_callpeak_spp() 21 | 22 | 23 | void init_callpeak_spp() { 24 | 25 | cap_num_peak_spp = get_conf_val( cap_num_peak_spp, ["cap_num_peak_spp"] ) 26 | wt_spp = get_conf_val( wt_spp, ["walltime_spp", "wt_spp", "timeout_spp"] ) 27 | mem_spp = get_conf_val( mem_spp, ["memory_spp", "mem_spp"] ) 28 | max_ppsize_spp = get_conf_val( max_ppsize_spp, ["max_ppsize_spp"] ) 29 | speak_spp = get_conf_val_int( speak_spp, ["speak_spp"] ) 30 | extra_param_spp = get_conf_val( extra_param_spp,["extra_param_spp"] ) 31 | 32 | print("\n\n== callpeak spp settings\n") 33 | print( "Threshold for # peak\t\t: $cap_num_peak_spp\n") 34 | print( "Walltime (spp)\t\t\t: $wt_spp\n") 35 | print( "Max. memory (spp)\t\t: $mem_spp\n") 36 | print( "Stack size for run_spp.R\t\t:$max_ppsize_spp\n") 37 | print( "Use-defined cross-corr. peak strandshift; if -1, use frag. len.\t:$speak_spp\n") 38 | print( "Extra parameters for run_spp.R\t:$extra_param_spp\n") 39 | } 40 | 41 | string[] spp( string tag, string ctl_tag, string frag_len, string o_dir, string group, int nth_spp ) { 42 | 43 | if ( ctl_tag == "" ) error("missing file: control tagalign!") 44 | if ( frag_len == "" ) error("missing parameter: fragment length!") 45 | 46 | int_cap_num_peak_spp := parse_number( cap_num_peak_spp ) 47 | 48 | prefix_vs := "$o_dir/" + make_vs_basename_wo_gz( tag, ctl_tag, "" ) 49 | prefix_x := "$o_dir/" + make_x_basename_wo_gz( tag, ctl_tag, "" ) 50 | rpeakfile_vs := "$prefix_vs.regionPeak.gz" 51 | rpeakfile := "$prefix_x.regionPeak.gz" 52 | filt_rpeakfile := "$prefix_x.filt.regionPeak.gz" 53 | ccscore := "$prefix_x.ccscore" 54 | pdf_tmp := replace_dir( rm_ext( tag, ["gz"] ), o_dir ) + ".pdf" 55 | pdf := "$prefix_x.pdf" 56 | param_speak := speak_spp > -1 ? "-speak=$speak_spp" : "-speak=$frag_len" 57 | extra_param := max_ppsize_spp ? 
"--max-ppsize=$max_ppsize_spp " : "" 58 | if ( extra_param_spp ) extra_param += extra_param_spp 59 | 60 | blacklist_exists := path_exists(blacklist) 61 | 62 | in := [ tag, ctl_tag ] 63 | out := [ rpeakfile, ccscore, pdf ] 64 | 65 | taskName:= "spp " + group 66 | cpus := (nth_spp==1) ? -1 : nth_spp; mem := get_res_mem(mem_spp,nth_spp); timeout := get_res_wt(wt_spp) 67 | 68 | wait_par( cpus ) 69 | 70 | tid := task( out<-in ) { 71 | 72 | sys $shcmd_init 73 | 74 | // # if phantompeakqualtools is an old version, use run_spp_nodups.R. new version has run_spp.R only 75 | sys if [ $(which run_spp_nodups.R 2> /dev/null | wc -l || echo) == "1" ]; then RUN_SPP=$(which run_spp_nodups.R); \ 76 | else RUN_SPP=$(which run_spp.R); \ 77 | fi 78 | 79 | sys Rscript $extra_param ${RUN_SPP} -c=$tag -p=$nth_spp -i=$ctl_tag \ 80 | -npeak=$int_cap_num_peak_spp -odir=$o_dir $param_speak -savr -savp -rf -out=$ccscore 81 | 82 | // Bug fix (we have scientific representation of chr coord., possible bug in run_spp.R?): 83 | sys zcat $rpeakfile_vs | awk 'BEGIN{OFS="\t"}{ if ($2<0) $2=0; print $1,int($2),int($3),$4,$5,$6,$7,$8,$9,$10;}' | gzip -f -nc > $rpeakfile 84 | 85 | sys rm -f $rpeakfile_vs 86 | 87 | sys mv $pdf_tmp $pdf 88 | 89 | // if compressed output file is empty (spp error), remove it 90 | sys if [ $(zcat $rpeakfile | wc -l ) == "0" ]; then rm -f $rpeakfile; fi 91 | 92 | // if no rpeak file, do something to return non-zero exit code 93 | sys if [ ! 
-f $rpeakfile ]; then error_in_spp_output_peak_does_not_exist; fi 94 | 95 | sys if [[ $blacklist_exists == "true" ]]; then \ 96 | bedtools intersect -v -a <(zcat -f $rpeakfile) -b <(zcat -f $blacklist) \ 97 | | awk 'BEGIN{OFS="\t"} {if ($5>1000) $5=1000; print $0}' | grep -P 'chr[\dXY]+[ \t]' \ 98 | | gzip -nc > $filt_rpeakfile; \ 99 | fi 100 | 101 | sys $shcmd_finalize 102 | } 103 | 104 | register_par( tid, cpus ) 105 | 106 | add_task_to_graph( in, out, group, "SPP", grp_color_spp ) 107 | 108 | return out 109 | } 110 | -------------------------------------------------------------------------------- /modules/sys.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "string.bds" 5 | 6 | helpUnsorted := true // do not sort help 7 | 8 | 9 | script_path := "" 10 | script_dir := "" 11 | 12 | hostname := "" 13 | 14 | // pipeline seeks for executables in the BDS script directory (local git repo) and $PATH 15 | // Add more relative path here if you want to keep your .py .sh .R visible to UNIX `which` as executables. 
16 | // Relative paths defined here are according to your script path (not your working directory but where .bds exists) 17 | // Make sure that you chmod 755 your .py .R .sh 18 | _rel_script_file_paths := [".","modules","utils"] 19 | 20 | 21 | init_base() 22 | 23 | 24 | void init_base() { 25 | script_path = "$ppwd/$programPath" 26 | if (!script_path.exists()) script_path = "$programPath" 27 | 28 | script_dir = script_path.dirName() 29 | hostname = get_hostname() 30 | } 31 | 32 | //// script file path 33 | 34 | string[] get_script_file_paths( string suffix ) { 35 | string[] ret 36 | for ( string path : _rel_script_file_paths ) { 37 | path = "$script_dir/$path" 38 | if ( path.exists() ) { 39 | ret.add( path + suffix ) 40 | if ( path.dirName().endsWith( "modules" ) ) ret.add( "$path/../$suffix" ) 41 | } 42 | } 43 | return ret 44 | } 45 | 46 | string[] get_script_file_paths() { 47 | return get_script_file_paths( "" ) 48 | } 49 | 50 | //// command line argument functions 51 | 52 | bool cmd_line_arg_has_key( string key ) { 53 | key = key.toLower() 54 | for ( string arg : args ) { 55 | if ( ("-"+key) == arg.toLower().trim() ) return true 56 | } 57 | return false 58 | } 59 | 60 | bool is_cmd_line_arg_empty() { 61 | return args.size()==0 62 | } 63 | 64 | bool is_first_arg_conf() { 65 | if ( (args.size()>0) && (!args[0].startsWith("-")) ) { 66 | if ( args.size()==1 ) { 67 | return true 68 | } 69 | else { 70 | return args[1].startsWith("-") 71 | } 72 | } 73 | return false 74 | } 75 | 76 | string get_cmd_line_arg_val( string key ) { 77 | key = key.toLower() 78 | for (int i=0; i< args.size(); i++) { 79 | arg := args[i] 80 | if ( ("-"+key) == arg.toLower().trim() ) { 81 | if ( i==(args.size()-1) ) break 82 | next_arg := args[i+1] 83 | 84 | if ( next_arg.startsWith("-") ) break 85 | return next_arg 86 | } 87 | } 88 | return "" 89 | } 90 | 91 | //// functions for file I/O 92 | 93 | string get_path( string str ) { // get absolute path (remove / if exists at end) 94 | if (str.trim() 
== "") return "" 95 | base := rm_str_at_end( str, "/" ).path() 96 | return base 97 | } 98 | 99 | string mkdir( string str ) { 100 | if (str.trim() == "") return "" 101 | // make filename full path and mkdir -p 102 | path := get_path( str ) 103 | if ( path.exists() ) { 104 | return path 105 | } 106 | else { 107 | path.mkdir() 108 | return path 109 | } 110 | } 111 | 112 | bool path_exists( string path ) { 113 | if ( path!="" ) { 114 | if ( path.exists() ) { 115 | if ( path.isFile() ) { 116 | if ( path.size() > 0 ) return true 117 | } 118 | else { 119 | return true 120 | } 121 | } 122 | } 123 | return false 124 | } 125 | 126 | string copy( string file, string o_dir ) { 127 | file_new := replace_dir( file, o_dir ) 128 | system := "local" // do not use cluster engine for this task 129 | taskName:= "copy file" 130 | 131 | task ( file_new <- file ) { 132 | 133 | sys cp --remove-destination $file $file_new 134 | sys while [ ! -f $file_new ]; do echo FOUND DELAYED WRITE, WAITING...; sleep 0.1; done 135 | } 136 | 137 | return file_new 138 | } 139 | 140 | string get_stdout( string cmd ) { 141 | rnd := randInt() 142 | cmd_ := "cmd_$rnd".path() 143 | sys $cmd &> $cmd_ || true 144 | ret := cmd_.read() 145 | sys rm -f $cmd_ 146 | return rm_str_at_end(ret,"\n") 147 | } 148 | 149 | string get_shell_var( string var ) { 150 | var_ := "var_$var".path() 151 | sys echo "${$var}" > $var_ 152 | ret := var_.read() 153 | sys rm -f $var_ 154 | return ret 155 | } 156 | 157 | string get_md5sum( string file ) { 158 | return get_stdout( "md5sum $file | awk '{print $1}'" ) 159 | } 160 | 161 | int get_num_lines( string file ) { 162 | if ( !path_exists( file ) ) { 163 | error("get_no_lines(): File doesn't exist! 
($file)") 164 | } 165 | else { 166 | if ( file.toLower().endsWith(".gz") ) { // check if compressed or not 167 | return get_stdout( "zcat $file | wc -l" ).parseInt() 168 | } 169 | else { 170 | return get_stdout( "cat $file | wc -l" ).parseInt() 171 | } 172 | } 173 | } 174 | 175 | string get_hostname() { 176 | out := get_stdout("hostname -f").replace("\n","") 177 | if (out.startsWith("hostname: ")) return "default" 178 | else return out 179 | } -------------------------------------------------------------------------------- /modules/filetable.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "output.bds" 5 | 6 | 7 | int{} _label_rank 8 | 9 | string{} _filetable_label // key: hierarchy 10 | string{} _filetable_path 11 | int{} _filetable_rank 12 | 13 | string{} _filetable_input 14 | int _curr_rank = 0 15 | 16 | 17 | 18 | void add_label_to_table( string label ) { 19 | _label_rank{ label } = _curr_rank++ 20 | } 21 | 22 | void add_file_to_table( string[] paths, string[] hrchys ) { 23 | for ( int i=0; iExpand all   " + \ 50 | "Collapse all" + \ 51 | " FilesPath " 52 | 53 | _construct_filetable() 54 | 55 | sorted_hrchy := _find_children_and_sort( "" ) 56 | for ( string hrchy : sorted_hrchy ) { 57 | parent := _get_parent( hrchy ) 58 | label := _filetable_label{ hrchy } 59 | path := _filetable_path.hasKey( hrchy ) ? _filetable_path{ hrchy } : "" 60 | if ( parent == "" ) \ 61 | html += " $label "+ html_link_url( path ) +"" 62 | else \ 63 | html += " $label "+ html_link_url( path ) +"" 64 | } 65 | html += "" 66 | html += "
\n" 67 | return html 68 | } 69 | 70 | string html_link_url( string path ) { 71 | rel_path := get_rel_path( path ) 72 | if ( rel_path.startsWith("./") ) \ 73 | return "" + rel_path + "
" 74 | else \ 75 | return rel_path + "
" 76 | } 77 | 78 | void _construct_filetable() { 79 | for( string hrchy : _filetable_input.keys() ) { 80 | _construct_filetable( hrchy, _filetable_input{ hrchy } ) 81 | } 82 | } 83 | 84 | // returns rank of item 85 | void _construct_filetable( string hrchy, string path ) { 86 | if ( hrchy == "" ) return 87 | if ( _filetable_label.hasKey( hrchy ) ) return 88 | 89 | curr := _get_curr( hrchy ) 90 | parent := _get_parent( hrchy ) 91 | _filetable_label{hrchy} = curr //map_label.hasKey(curr) ? map_label{curr} : curr 92 | _filetable_path{hrchy} = path 93 | if ( parent != "" ) _construct_filetable( parent, "" ) 94 | } 95 | 96 | string[] _get_children( string hrchy ) { // not including grand ones 97 | string[] children 98 | 99 | for ( string hrchy_ : _filetable_label.keys() ) { 100 | if ( hrchy == "" ) { 101 | if ( hrchy_.indexOf("/") < 0 ) \ 102 | children.push( hrchy_ ) 103 | } 104 | else if ( hrchy_.toLower().startsWith( hrchy.toLower() + "/" ) ) { 105 | 106 | if ( hrchy_.lastIndexOf("/") <= hrchy.length() ) \ 107 | children.push( hrchy_ ) 108 | } 109 | } 110 | return children 111 | } 112 | 113 | string[] _find_children_and_sort( string hrchy ) { 114 | string[] ret 115 | children := _get_children( hrchy ) 116 | if ( children.size() == 0 ) return ret 117 | 118 | // for bubble sort 119 | int[] ranks 120 | for ( string child : children ) { 121 | curr := _get_curr( child ) 122 | ranks.add( _label_rank.hasKey(curr) ? 
_label_rank{curr} : 0 ) 123 | } 124 | sorted := _bubble_sort( ranks, children ) 125 | for ( string child : sorted ) { 126 | ret = ret + [child] + _find_children_and_sort( child ) 127 | } 128 | return ret 129 | } 130 | 131 | string _get_parent( string hrchy ) { // "a/b/c" return a/b 132 | return hrchy.substr( 0, hrchy.lastIndexOf("/") ) 133 | } 134 | 135 | string _get_curr( string hrchy ) { // "a/b/c" return c 136 | return hrchy.substr( hrchy.lastIndexOf("/")+1 ) 137 | } 138 | 139 | string[] _bubble_sort( int[] a, string[] s ) { // sorting algorithm 140 | if ( a.size() != s.size() ) error("Array sizes do not match in _bubble_sort()!") 141 | 142 | int temp; //for swapping 143 | string temp2; 144 | n := a.size() 145 | for (int i = 0 ; i < n - 1 ; i++) { 146 | 147 | for (int j = 0 ; j < n - 1 ; j++) { 148 | 149 | if ( a[j] > a[j + 1] ) { 150 | temp = a[j]; 151 | a[j]=a[j + 1]; 152 | a[j + 1] = temp; 153 | 154 | temp2 = s[j]; 155 | s[j]=s[j + 1]; 156 | s[j + 1] = temp2; 157 | } 158 | } 159 | } 160 | return s 161 | } 162 | -------------------------------------------------------------------------------- /modules/postalign_xcor.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == postalign bed/tagalign settings 9 | fraglen0 := false help (LEGACY PARAM) Set predefined fragment length as zero for cross corr. analysis (add -speak=0 to run_spp.R). 10 | speak_xcor := -1 help Set user-defined cross-corr. peak strandshift (-speak= in run_spp.R). Use -1 to disable (default: -1). 11 | max_ppsize_xcor := "" help R stack size (R parameter --max-ppsize=; between 5000 and 5000000) for cross corr. analysis. 12 | extra_param_xcor := "" help Set extra parameters for run_spp.R (cross-corr. analysis only). 13 | mem_xcor := "15G" help Max. memory for cross-corr. analysis (default: 15G). 
14 | 15 | grp_color_xcor := "yellowgreen" 16 | 17 | init_postalign_xcor() 18 | 19 | 20 | void init_postalign_xcor() { 21 | 22 | fraglen0 = get_conf_val_bool( fraglen0, ["fraglen0"] ) 23 | speak_xcor = get_conf_val_int( speak_xcor, ["speak_xcor"] ) 24 | extra_param_xcor= get_conf_val( extra_param_xcor, ["extra_param_xcor"] ) 25 | mem_xcor = get_conf_val( mem_xcor, ["mem_xcor"] ) 26 | max_ppsize_xcor = get_conf_val( max_ppsize_xcor, ["max_ppsize_xcor"] ) 27 | 28 | // backward compatibility 29 | if ( speak_xcor == -1 && fraglen0 ) speak_xcor = 0 30 | 31 | print("\n\n== postalign cross-corr. analysis settings\n") 32 | print( "Max. memory for UNIX shuf\t\t\t: $mem_shuf\n") 33 | print( "User-defined cross-corr. peak strandshift\t: $speak_xcor\n") 34 | print( "Extra parameters for cross-corr. analysis\t: $extra_param_xcor\n") 35 | print( "Max. memory for cross-corr. analysis\t\t: $mem_xcor\n") 36 | print( "Stack size for cross-corr. analysis\t\t:$max_ppsize_xcor\n") 37 | } 38 | 39 | string subsample_tag_PE_for_xcor( string tag, int nlines, bool non_mito, string o_dir, string group ) { 40 | 41 | prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir ) 42 | nreads_per_mill := metric_prefix( nlines ) 43 | 44 | subsampled_tag := "$prefix."+(non_mito?"no_chrM.":"")+"$nreads_per_mill.R1.tagAlign.gz" 45 | non_mito_param := non_mito ? 
"grep -v \"chrM\" | " : "" 46 | 47 | joined := "$prefix.joined" // temporary file 48 | joined_subsampled := "$prefix.joined.subsampled" // temporary file 49 | 50 | in := [ tag ] 51 | out := subsampled_tag 52 | 53 | taskName:= "subsample_tag_PE_4_xcor " + group 54 | mem := get_res_mem(mem_shuf,1) 55 | 56 | wait_par( cpus ) 57 | 58 | tid := task( out<-in ) { 59 | 60 | sys $shcmd_init 61 | 62 | // join consecutive two lines into one 63 | sys zcat $tag | sed 'N;s/\n/\t/' > $joined 64 | 65 | //# Shuffle and split temporary combined file into 2 equal parts 66 | //# Will produce $PR_PREFIX00 and $PR_PREFIX01 67 | sys cat $joined | $non_mito_param shuf -n $nlines --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null) > $joined_subsampled 68 | 69 | //# Subsample tagAlign file 70 | sys awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$4,$5,$6}' $joined_subsampled | \ 71 | gzip -nc > $subsampled_tag 72 | 73 | sys rm -f $joined $joined_subsampled 74 | 75 | sys $shcmd_finalize 76 | } 77 | 78 | register_par( tid, cpus ) 79 | 80 | add_task_to_graph( in, out, group ) 81 | 82 | return out 83 | } 84 | 85 | string[] xcor( string tag, string o_dir, string group, int nth_xcor ) { 86 | 87 | // misc. 88 | prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir ) 89 | xcor_score := "$prefix.cc.qc" 90 | xcor_plot := "$prefix.cc.plot.pdf" 91 | param_speak := speak_xcor > -1 ? "-speak=$speak_xcor" : "" 92 | extra_param := max_ppsize_xcor ? "--max-ppsize=$max_ppsize_xcor " : "" 93 | 94 | in := [ tag ] 95 | out := [ xcor_score, xcor_plot ] 96 | 97 | taskName:= "xcor " + group 98 | cpus := (nth_xcor==1) ? -1 : nth_xcor; mem := get_res_mem(mem_xcor,nth_xcor); 99 | 100 | wait_par( cpus ) 101 | 102 | tid := task( out<-in ) { 103 | 104 | sys $shcmd_init 105 | 106 | // # if phantompeakqualtools is an old version, use run_spp_nodups.R. 
new version has run_spp.R only 107 | sys if [[ $(which run_spp_nodups.R 2> /dev/null | wc -l || echo) == "1" ]]; then RUN_SPP=$(which run_spp_nodups.R); \ 108 | else RUN_SPP=$(which run_spp.R); \ 109 | fi 110 | 111 | //# CCSCORE FILE format 112 | //# Filename numReads estFragLen correstFragLen PhantomPeak corrphantomPeak argmincorr mincorr phantomPeakCoef relPhantomPeakCoef QualityTag 113 | sys Rscript $extra_param ${RUN_SPP} -rf \ 114 | -c=$tag -p=$nth_xcor \ 115 | -filtchr=chrM -savp=$xcor_plot -out=$xcor_score $param_speak $extra_param_xcor 116 | sys sed -r 's/,[^\t]+//g' $xcor_score > $xcor_score.tmp 117 | sys mv $xcor_score.tmp $xcor_score 118 | 119 | sys $shcmd_finalize 120 | } 121 | 122 | register_par( tid, cpus ) 123 | 124 | add_task_to_graph( in, out, group, "XCOR", grp_color_xcor ) 125 | 126 | return out 127 | } 128 | 129 | string get_fraglen( string xcor_score ) { // get FRAGLEN (3rd column of cc score file) for spp(-speak=$FRAGLEN) 130 | 131 | cols := xcor_score.read().split("\t") 132 | return cols[2] 133 | } 134 | -------------------------------------------------------------------------------- /examples/chipseq_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORK=/srv/scratch/shared/mitra/leepc12/run/chipseq_test/CTCF/Snyder_CTCF_GM12878_PE 4 | mkdir -p $WORK; cd $WORK 5 | bds_scr Snyder_CTCF_GM12878_PE /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -pe -nth 12 -species hg19 -title CTCF_Snyder_CTCF_GM12878_PE \ 6 | -fastq1_1 /srv/scratch/shared/mitra/leepc12/data/CTCF/PE/Snyder_CTCF_GM12878_PE_Rep1_1.fastq.gz \ 7 | -fastq1_2 /srv/scratch/shared/mitra/leepc12/data/CTCF/PE/Snyder_CTCF_GM12878_PE_Rep1_2.fastq.gz \ 8 | -fastq2_1 /srv/scratch/shared/mitra/leepc12/data/CTCF/PE/Snyder_CTCF_GM12878_PE_Rep2_1.fastq.gz \ 9 | -fastq2_2 /srv/scratch/shared/mitra/leepc12/data/CTCF/PE/Snyder_CTCF_GM12878_PE_Rep2_2.fastq.gz \ 10 | -ctl_fastq1_1 
/srv/scratch/shared/mitra/leepc12/data/CTCF/PE/Snyder_Input_GM12878_PE_1.fastq.gz \ 11 | -ctl_fastq1_2 /srv/scratch/shared/mitra/leepc12/data/CTCF/PE/Snyder_Input_GM12878_PE_2.fastq.gz \ 12 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/CTCF/Snyder_CTCF_GM12878_PE/out 13 | 14 | WORK=/srv/scratch/shared/mitra/leepc12/run/chipseq_test/CTCF/Snyder_CTCF_GM12878_SE 15 | mkdir -p $WORK; cd $WORK 16 | bds_scr Snyder_CTCF_GM12878_SE /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -se -nth 6 -species hg19 -title CTCF_Snyder_CTCF_GM12878_SE \ 17 | -fastq1 /srv/scratch/shared/mitra/leepc12/data/CTCF/SE/Snyder_CTCF_GM12878_SE_Rep1.fastq.gz \ 18 | -fastq2 /srv/scratch/shared/mitra/leepc12/data/CTCF/SE/Snyder_CTCF_GM12878_SE_Rep2.fastq.gz \ 19 | -ctl_fastq1 /srv/scratch/shared/mitra/leepc12/data/CTCF/SE/Snyder_Input_GM12878_SE.fastq.gz \ 20 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/CTCF/Snyder_CTCF_GM12878_SE/out 21 | 22 | WORK=/srv/scratch/shared/mitra/leepc12/run/chipseq_test/GATA2/HAIB_GATA2_K562_SE 23 | mkdir -p $WORK; cd $WORK 24 | bds_scr HAIB_GATA2_K562_SE /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -se -nth 6 -species hg19 -title GATA2_HAIB_GATA2_K562_SE \ 25 | -fastq1 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_GATA2_K562_SE_Rep1.fastq.gz \ 26 | -fastq2 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_GATA2_K562_SE_Rep2.fastq.gz \ 27 | -ctl_fastq1 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_Input_K562_SE_Rep1.fastq.gz \ 28 | -ctl_fastq2 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_Input_K562_SE_Rep2.fastq.gz \ 29 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/GATA2/HAIB_GATA2_K562_SE/out 30 | 31 | WORK=/srv/scratch/shared/mitra/leepc12/run/chipseq_test/GATA2/UCD_GATA2_K562_SE 32 | mkdir -p $WORK; cd $WORK 33 | bds_scr UCD_GATA2_K562_SE /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -se 
-nth 6 -species hg19 -title GATA2_UCD_GATA2_K562_SE \ 34 | -fastq1 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_GATA2_K562_SE_Rep1.fastq.gz \ 35 | -fastq2 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_GATA2_K562_SE_Rep2.fastq.gz \ 36 | -ctl_fastq1 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_Input_K562_SE_Rep1.fastq.gz \ 37 | -ctl_fastq2 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_Input_K562_SE_Rep2.fastq.gz \ 38 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/GATA2/UCD_GATA2_K562_SE/out 39 | 40 | WORK=/srv/scratch/shared/mitra/leepc12/run/chipseq_test/GATA2/UChicago_GATA2_K562_SE 41 | mkdir -p $WORK; cd $WORK 42 | bds_scr UChicago_GATA2_K562_SE /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -se -nth 6 -species hg19 -title GATA2_UChicago_GATA2_K562_SE \ 43 | -fastq1 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/UChicago_GATA2_K562_SE/UChicago_GATA2_K562_SE_Rep1.fastq.gz \ 44 | -fastq2 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/UChicago_GATA2_K562_SE/UChicago_GATA2_K562_SE_Rep2.fastq.gz \ 45 | -ctl_fastq1 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/UChicago_GATA2_K562_SE/UChicago_Input_K562_SE_Rep1.fastq.gz \ 46 | -ctl_fastq2 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/UChicago_GATA2_K562_SE/UChicago_Input_K562_SE_Rep2.fastq.gz \ 47 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/GATA2/UChicago_GATA2_K562_SE/out 48 | 49 | WORK=/srv/scratch/shared/mitra/leepc12/run/chipseq_test/YY1/HudsonAlpha 50 | mkdir -p $WORK; cd $WORK 51 | bds_scr HudsonAlpha /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -se -nth 4 -species hg19 -title YY1_HudsonAlpha \ 52 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/YY1/HudsonAlpha/out \ 53 | -fastq1 /srv/scratch/shared/mitra/leepc12/data/YY1/HudsonAlpha/ENCFF000OHH.fastq.gz \ 54 | -fastq2 
/srv/scratch/shared/mitra/leepc12/data/YY1/HudsonAlpha/ENCFF000OHO.fastq.gz \ 55 | -ctl_fastq1 /srv/scratch/shared/mitra/leepc12/data/YY1/HudsonAlpha/ENCFF000ODP.fastq.gz 56 | 57 | WORK=/srv/scratch/shared/mitra/leepc12/run/chipseq_test/YY1/Sydh 58 | mkdir -p $WORK; cd $WORK 59 | bds_scr Sydh /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -se -nth 6 -species hg19 -title YY1_Sydh \ 60 | -fastq1 /srv/scratch/shared/mitra/leepc12/data/YY1/Sydh/ENCFF000WGS.fastq.gz \ 61 | -fastq2 /srv/scratch/shared/mitra/leepc12/data/YY1/Sydh/ENCFF000WGT.fastq.gz \ 62 | -ctl_fastq1 /srv/scratch/shared/mitra/leepc12/data/YY1/Sydh/ENCFF000VWV.fastq.gz \ 63 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/YY1/Sydh/out 64 | 65 | 66 | -------------------------------------------------------------------------------- /modules/callpeak_peakseq.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == callpeak PeakSeq settings 9 | target_fdr_peakseq := 0.05 help Target FDR for PeakSeq (default: 0.05). 10 | n_sim_peakseq := 10 help Number of simulations for PeakSeq (default: 10). 11 | enrich_mapped_fraglen_peakseq := -1 help Enrichment mapped fragment length for PeakSeq. Use -1 to get from upstream cross-corr. analysis (default: -1). 12 | min_interpeak_dist_peakseq := -1 help Minimum interpeak distance for PeakSeq. Use -1 to get from upstream cross-corr. analysis (default: -1). 13 | mappability_map_peakseq := "" help Mappability map file for PeakSeq (http://archive.gersteinlab.org/proj/PeakSeq/Mappability_Map). 14 | max_qval_peakseq := 0.1 help Maximum Q-value for PeakSeq (default: 0.1). 15 | bckgrnd_model_peakseq := "Simulated" help Background model for PeakSeq (default: Simulated). 16 | extra_param_peakseq := "" help Extra parameters for PeakSeq. 
17 | wt_peakseq := "47h" help Walltime for PeakSeq (default: 47h, 47:00:00). 18 | mem_peakseq := "12G" help Max. memory for PeakSeq (default: 12G). 19 | 20 | 21 | grp_color_peakseq := "pink" 22 | 23 | 24 | init_callpeak_peakseq() 25 | 26 | 27 | void init_callpeak_peakseq() { 28 | 29 | target_fdr_peakseq = get_conf_val_real( target_fdr_peakseq, ["target_fdr_peakseq"] ) 30 | n_sim_peakseq = get_conf_val_int( n_sim_peakseq, ["n_sim_peakseq"] ) 31 | enrich_mapped_fraglen_peakseq = get_conf_val_int( enrich_mapped_fraglen_peakseq, ["enrich_mapped_fraglen_peakseq"] ) 32 | min_interpeak_dist_peakseq = get_conf_val_int( min_interpeak_dist_peakseq, ["min_interpeak_dist_peakseq"] ) 33 | mappability_map_peakseq = get_conf_val( mappability_map_peakseq, ["mappability_map_peakseq"] ) 34 | max_qval_peakseq = get_conf_val_real( max_qval_peakseq, ["max_qval_peakseq"] ) 35 | bckgrnd_model_peakseq = get_conf_val( bckgrnd_model_peakseq, ["bckgrnd_model_peakseq"] ) 36 | extra_param_peakseq = get_conf_val( extra_param_peakseq, ["extra_param_peakseq"] ) 37 | wt_peakseq = get_conf_val( wt_peakseq, ["walltime_peakseq", "wt_peakseq", "timeout_peakseq"] ) 38 | mem_peakseq = get_conf_val( mem_peakseq, ["memory_peakseq", "mem_peakseq"] ) 39 | 40 | print("\n\n== callpeak PeakSeq settings\n") 41 | print( "Target FDR for PeakSeq\t\t\t:$target_fdr_peakseq\n") 42 | print( "Number of simulations for PeakSeq\t:$n_sim_peakseq\n") 43 | print( "Enrichment mapped frag. len. for PeakSeq\t:$enrich_mapped_fraglen_peakseq\n") 44 | print( "Minimum interpeak distance for PeakSeq\t:$min_interpeak_dist_peakseq\n") 45 | print( "Mappability map file for PeakSeq\t:$mappability_map_peakseq\n") 46 | print( "Maximum Q-value for PeakSeq\t\t:$max_qval_peakseq\n") 47 | print( "Background model for PeakSeq\t\t:$bckgrnd_model_peakseq\n") 48 | print( "Extra parameters for PeakSeq\t\t:$extra_param_peakseq\n") 49 | print( "Walltime (PeakSeq)\t\t\t: $wt_peakseq\n") 50 | print( "Max. 
memory (PeakSeq)\t\t: $mem_peakseq\n") 51 | } 52 | 53 | void chk_callpeak_peakseq() { 54 | if ( !path_exists( mappability_map_peakseq ) ) \ 55 | error("\nMappability map file for PeakSeq does not exists! (file: $mappability_map_peakseq)\n") 56 | } 57 | 58 | string[] peakseq( string tag, string ctl_tag, string frag_len, string o_dir, string group ) { 59 | if ( frag_len == "" ) error("missing parameter: fragment length!") 60 | prefix := ctl_tag ? ("$o_dir/" + make_x_basename_wo_gz( tag, ctl_tag, "" ) ) \ 61 | : replace_dir( rm_ext( tag, "tagAlign" ), o_dir ) 62 | tmp_chip_dir := "$prefix.tmp_chip_dir" 63 | tmp_ctl_dir := ctl_tag ? "$prefix.tmp_ctl_dir" : "" 64 | config_file := get_peakseq_conf_dat( prefix, frag_len, tmp_chip_dir, tmp_ctl_dir ) 65 | rpeakfile := "$prefix.regionPeak.gz" 66 | make_tmp_ctl_dir := ctl_tag ? "zcat $ctl_tag | PeakSeq -preprocess tagAlign stdin $tmp_ctl_dir" : "" 67 | 68 | in := [ tag, ctl_tag ] 69 | out := [ rpeakfile ] 70 | 71 | taskName:= "peakseq " + group 72 | mem := get_res_mem(mem_peakseq,1); timeout := get_res_wt(wt_peakseq) 73 | 74 | wait_par( cpus ) 75 | 76 | tid := task( out<-in ) { 77 | 78 | sys $shcmd_init 79 | 80 | // # ============================= 81 | // # The chip and input reads (chip.bam and input.bam) should be preprocessed before running: 82 | // # ============================= 83 | sys cd $o_dir 84 | sys mkdir -p $tmp_chip_dir 85 | sys mkdir -p $tmp_ctl_dir 86 | sys zcat $tag | PeakSeq -preprocess tagAlign stdin $tmp_chip_dir 87 | sys $make_tmp_ctl_dir 88 | 89 | // # ============================= 90 | // # Then it is necessary to setup the configuration file (config.dat). An example configuration file is included with the PeakSeq download. 
An example: 91 | // # ============================= 92 | 93 | // # ============================= 94 | // #Finally, the peaks are called using the configuration file: 95 | // # ============================= 96 | sys PeakSeq -peak_select $config_file $extra_param_peakseq 97 | sys rm -rf $tmp_chip_dir $tmp_ctl_dir 98 | 99 | sys $shcmd_finalize 100 | } 101 | 102 | register_par( tid, cpus ) 103 | 104 | add_task_to_graph( in, out, group, "PEAKSEQ", grp_color_peakseq ) 105 | 106 | return out 107 | } 108 | 109 | string get_peakseq_conf_dat( string prefix, string frag_len, string chipseq_dir, string ctl_dir ) { 110 | out := "$prefix.peakseq.config.dat" 111 | basename := prefix.baseName() 112 | contents := "" 113 | contents += "Experiment_id $basename\n" 114 | contents += "Enrichment_mapped_fragment_length $frag_len\n" 115 | contents += "target_FDR $target_fdr_peakseq\n" 116 | contents += "N_Simulations $n_sim_peakseq\n" 117 | contents += "Minimum_interpeak_distance $frag_len\n" 118 | contents += "Mappability_map_file $mappability_map_peakseq\n" 119 | contents += "ChIP_Seq_reads_data_dirs $chipseq_dir\n" 120 | if ( ctl_dir ) contents += "Input_reads_data_dirs $ctl_dir\n" 121 | contents += "max_Qvalue $max_qval_peakseq\n" 122 | contents += "Background_model $bckgrnd_model_peakseq\n" 123 | out.write(contents) 124 | return out 125 | } -------------------------------------------------------------------------------- /utils/parse_summary_qc_recursively.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # written by Jin Lee, 2016 4 | 5 | import os 6 | import sys 7 | import re 8 | import argparse 9 | import json 10 | import subprocess 11 | from collections import OrderedDict 12 | 13 | parser = argparse.ArgumentParser(prog='ENCODE_summary.json parser for QC', \ 14 | description='Recursively find ENCODE_summary.json, parse it and make a TSV spreadsheet of QC metrics.') 15 | parser.add_argument('--out-file', 
type=argparse.FileType('w'), default=sys.stdout, \ 16 | help='Output TSV filename)') 17 | parser.add_argument('--search-dir', type=str, default='.', \ 18 | help='Root directory to search for ENCODE_summary.json') 19 | parser.add_argument('--json-file', type=str, default='ENCODE_summary.json', \ 20 | help='Specify json file name to be parsed') 21 | 22 | args = parser.parse_args() 23 | 24 | # find all qc_summary.json recursively 25 | # json_files = [os.path.join(dp, f) for dp, dn, filenames in os.walk(os.getcwd()) \ 26 | # for f in filenames if os.path.splitext(f)[1] == 'qc_summary.json'] 27 | 28 | # find all ENCODE_summary.json recursively 29 | json_files = subprocess.check_output("find -L %s -name %s" % (args.search_dir,args.json_file), \ 30 | shell=True ).strip().split('\n') 31 | # read json 32 | jsons = [] 33 | for json_file in json_files: 34 | with open(json_file,'r') as f: 35 | jsons.append( json.load(f, object_pairs_hook=OrderedDict) ) 36 | 37 | # sort 38 | # sorted_jsons = sorted(jsons, key = lambda x: (\ 39 | # x['ENCODE_award_rfa'], \ 40 | # x['ENCODE_assay_category'], \ 41 | # x['ENCODE_assay_title'], \ 42 | # x['species'], \ 43 | # x['title'])) 44 | 45 | # look at headers first 46 | headers = OrderedDict() 47 | headers['common'] = [\ 48 | 'ENCODE award rfa',\ 49 | 'ENCODE assay category',\ 50 | 'ENCODE assay title',\ 51 | 'species',\ 52 | 'title',\ 53 | 'replicate'] 54 | 55 | # first take longest header for each qc_type 56 | for json in jsons: 57 | for qc_file in json['qc_files']: 58 | qc_type = qc_file['qc_type'] 59 | if qc_type == 'pbc_PE': 60 | qc_type = 'pbc' 61 | qc_file['qc_type'] = qc_type 62 | header_list = qc_file['header'].split('\t') 63 | if not qc_type in headers or len(headers[qc_type])=0) || (k=="mod") ) { // concat. 
module 58 | if ( init_mod != "" ) { 59 | trimmed := val.trim().replace("module add ","").replace( ":", " " ).replace( ";", " " ).replace( ",", " " ).trim() 60 | trimmed = trimmed.replace( " ", " ").replace( " ", " ") 61 | module = module + " " + trimmed 62 | } 63 | } 64 | else if ( k.indexOf("shcmd")>=0 ) { 65 | shellcmd = shellcmd + " " + val + ";" 66 | } 67 | else if ( k.indexOf("addpath")>=0 ) { 68 | path = path + val.trim().replace(",",":").replace(";",":").replace(" ",":").replace(":::",":").replace("::",":") + ":" 69 | } 70 | } 71 | 72 | // read from cmd. line arg. 73 | if ( mod!="" ) { 74 | string module_header = ". $init_mod;" 75 | if ( init_mod != "" ) { // if /etc/profile.d/modules.sh exists 76 | trimmed := mod.trim().replace("module add ","").replace( ":", " " ).replace( ";", " " ).replace(","," " ).trim() 77 | trimmed = trimmed.replace( " ", " ").replace( " ", " ") 78 | module = module + " " + trimmed 79 | } 80 | } 81 | if ( shcmd!="" ) shellcmd = shellcmd + shcmd.trim() + "; " 82 | if ( addpath!="" ) path = path + \ 83 | addpath.trim().replace(",",":").replace(";",":").replace(" ",":").replace(":::",":").replace("::",":") + ":" 84 | if ( module !="" ) module = ". 
$init_mod; module add " + module + ";" 85 | 86 | // check script directories to add to PATH 87 | script_file_paths := get_script_file_paths() 88 | for ( string _path : script_file_paths ) { 89 | if ( _path.exists() ) { 90 | path = path + _path + ":" 91 | } 92 | } 93 | 94 | if ( conda_bin_dir ) conda_bin_dir += "/" 95 | if ( path !="" ) path = " export PATH=$path:\${PATH}:/bin:/usr/bin:/usr/local/bin:\${HOME}/.bds;" 96 | // add conda env 97 | if ( conda_env != "" ) conda_py2 = \ 98 | "if [[ -f $(which $conda_bin_dir"+"conda) && $($conda_bin_dir"+"conda env list | grep $conda_env | wc -l) != \"0\" ]];"+\ 99 | " then source $conda_bin_dir"+"activate $conda_env; sleep $delay_conda_env; fi; " 100 | if ( conda_env_py3 != "" ) conda_py3 = \ 101 | "if [[ -f $(which $conda_bin_dir"+"conda) && $($conda_bin_dir"+"conda env list | grep $conda_env_py3 | wc -l) != \"0\" ]];"+\ 102 | " then source $conda_bin_dir"+"activate $conda_env_py3; sleep $delay_conda_env; fi; " 103 | 104 | // additional initialization 105 | shcmd_init_ := module + path + shellcmd 106 | shcmd_init_ += "; set -o pipefail" // to catch and stop on non-zero exit code in a UNIX pipe 107 | shcmd_init_ += "; STARTTIME=$(date +%s)" // to check running time for a task 108 | if ( nice != 0 ) shcmd_init_ += "; if (( $(nice)<$nice )); then renice -n $nice $$; fi" // to set process priority (niceness) 109 | 110 | shcmd_init_ = shcmd_init_.replace( ": :", ":" ).replace( "::", ":" ).replace( "; ;", ";" ).replace( ";;", ";" ) 111 | shcmd_init = conda_py2 + shcmd_init_ 112 | shcmd_init_py3 = conda_py3 + shcmd_init_ 113 | 114 | if ( is_system_local() ) { 115 | shcmd_finalize = "TASKTIME=$[$(date +%s)-${STARTTIME}]; echo \"Task has finished (${TASKTIME} seconds).\"; "+\ 116 | "sleep $cluster_task_delay" 117 | } 118 | else { 119 | shcmd_finalize = "TASKTIME=$[$(date +%s)-${STARTTIME}]; if [ ${TASKTIME} -lt $cluster_task_min_len ]; "+\ 120 | "then echo \"Waiting for $[$cluster_task_min_len-${TASKTIME}] seconds.\";"+\ 121 | " 
sleep $[$cluster_task_min_len-${TASKTIME}]; sleep $cluster_task_delay; fi" 122 | } 123 | 124 | print("\n\n== shell environment info\n") 125 | print( "Conda env. \t\t\t: $conda_env\n" ) 126 | print( "Conda env. for python3\t\t: $conda_env_py3\n" ) 127 | print( "Conda bin. directory\t\t: $conda_bin_dir\n" ) 128 | print( "\nShell cmd. for init.\t\t: $shcmd_init\n" ) 129 | print( "\nShell cmd. for init.(py3)\t: $shcmd_init_py3\n" ) 130 | print( "\nShell cmd. for fin.\t\t: $shcmd_finalize\n" ) 131 | print( "\nCluster task min. len.\t\t: $cluster_task_min_len\n" ) 132 | print( "\nCluster task delay\t\t\t: $cluster_task_delay\n" ) 133 | } 134 | 135 | -------------------------------------------------------------------------------- /examples/example2.sh: -------------------------------------------------------------------------------- 1 | screen -RD HAIB_GATA2_K562_SE 2 | 3 | DATA_ROOT=/srv/scratch/leepc12/data; WORK_DIR=/srv/scratch/leepc12/run/chipseq_test; SUFFIX=GATA2/HAIB_GATA2_K562_SE 4 | mkdir -p ${WORK_DIR}/$SUFFIX; cd ${WORK_DIR}/$SUFFIX; 5 | bds /users/leepc12/code/bds_atac/chipseq/chipseq.bds \ 6 | -fastq1 ${DATA_ROOT}/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_GATA2_K562_SE_Rep1.fastq.gz \ 7 | -fastq2 ${DATA_ROOT}/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_GATA2_K562_SE_Rep2.fastq.gz \ 8 | -ctl_fastq1 ${DATA_ROOT}/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_Input_K562_SE_Rep1.fastq.gz \ 9 | -ctl_fastq2 ${DATA_ROOT}/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_Input_K562_SE_Rep2.fastq.gz \ 10 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/${SUFFIX}/out \ 11 | -species hg19 -bwa_idx ${DATA_ROOT}/CTCF/bwa_index/male.hg19.fa.gz -chrsz ${DATA_ROOT}/CTCF/male.hg19.chrom.sizes 12 | 13 | screen -RD Snyder_CTCF_GM12878_PE 14 | 15 | DATA_ROOT=/srv/scratch/leepc12/data; WORK_DIR=/srv/scratch/leepc12/run/chipseq_test; SUFFIX=CTCF/Snyder_CTCF_GM12878_PE 16 | mkdir -p ${WORK_DIR}/$SUFFIX; cd ${WORK_DIR}/$SUFFIX; 17 | bds /users/leepc12/code/bds_atac/chipseq/chipseq.bds \ 18 | -fastq1_1 
${DATA_ROOT}/CTCF/PE/Snyder_CTCF_GM12878_PE_Rep1_1.fastq.gz \ 19 | -fastq1_2 ${DATA_ROOT}/CTCF/PE/Snyder_CTCF_GM12878_PE_Rep1_2.fastq.gz \ 20 | -fastq2_1 ${DATA_ROOT}/CTCF/PE/Snyder_CTCF_GM12878_PE_Rep2_1.fastq.gz \ 21 | -fastq2_2 ${DATA_ROOT}/CTCF/PE/Snyder_CTCF_GM12878_PE_Rep2_2.fastq.gz \ 22 | -ctl_fastq1_1 ${DATA_ROOT}/CTCF/PE/Snyder_Input_GM12878_PE_1.fastq.gz \ 23 | -ctl_fastq1_2 ${DATA_ROOT}/CTCF/PE/Snyder_Input_GM12878_PE_2.fastq.gz \ 24 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/${SUFFIX}/out \ 25 | -species hg19 -bwa_idx ${DATA_ROOT}/CTCF/bwa_index/male.hg19.fa.gz -chrsz ${DATA_ROOT}/CTCF/male.hg19.chrom.sizes 26 | 27 | screen -RD UCD_GATA2_K562_SE 28 | 29 | DATA_ROOT=/srv/scratch/leepc12/data; WORK_DIR=/srv/scratch/leepc12/run/chipseq_test; SUFFIX=GATA2/UCD_GATA2_K562_SE 30 | mkdir -p ${WORK_DIR}/$SUFFIX; cd ${WORK_DIR}/$SUFFIX; 31 | bds /users/leepc12/code/bds_atac/chipseq/chipseq.bds \ 32 | -species hg19 -bwa_idx ${DATA_ROOT}/CTCF/bwa_index/male.hg19.fa.gz -chrsz ${DATA_ROOT}/CTCF/male.hg19.chrom.sizes \ 33 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/${SUFFIX}/out \ 34 | -fastq1 ${DATA_ROOT}/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_GATA2_K562_SE_Rep1.fastq.gz \ 35 | -fastq2 ${DATA_ROOT}/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_GATA2_K562_SE_Rep2.fastq.gz \ 36 | -ctl_fastq1 ${DATA_ROOT}/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_Input_K562_SE_Rep1.fastq.gz \ 37 | -ctl_fastq2 ${DATA_ROOT}/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_Input_K562_SE_Rep2.fastq.gz 38 | 39 | screen -RD UChicago_GATA2_K562_SE 40 | 41 | DATA_ROOT=/srv/scratch/leepc12/data; WORK_DIR=/srv/scratch/leepc12/run/chipseq_test; SUFFIX=GATA2/UChicago_GATA2_K562_SE 42 | mkdir -p ${WORK_DIR}/$SUFFIX; cd ${WORK_DIR}/$SUFFIX; 43 | bds /users/leepc12/code/bds_atac/chipseq/chipseq.bds \ 44 | -species hg19 -bwa_idx ${DATA_ROOT}/CTCF/bwa_index/male.hg19.fa.gz -chrsz ${DATA_ROOT}/CTCF/male.hg19.chrom.sizes \ 45 | -url_base 
http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/${SUFFIX}/out \ 46 | -fastq1 ${DATA_ROOT}/GATA2/SE/UChicago_GATA2_K562_SE/UChicago_GATA2_K562_SE_Rep1.fastq.gz \ 47 | -fastq2 ${DATA_ROOT}/GATA2/SE/UChicago_GATA2_K562_SE/UChicago_GATA2_K562_SE_Rep2.fastq.gz \ 48 | -ctl_fastq1 ${DATA_ROOT}/GATA2/SE/UChicago_GATA2_K562_SE/UChicago_Input_K562_SE_Rep1.fastq.gz \ 49 | -ctl_fastq2 ${DATA_ROOT}/GATA2/SE/UChicago_GATA2_K562_SE/UChicago_Input_K562_SE_Rep2.fastq.gz 50 | 51 | screen -RD Snyder_CTCF_GM12878_SE 52 | 53 | DATA_ROOT=/srv/scratch/leepc12/data; WORK_DIR=/srv/scratch/leepc12/run/chipseq_test; SUFFIX=CTCF/Snyder_CTCF_GM12878_SE 54 | mkdir -p ${WORK_DIR}/$SUFFIX; cd ${WORK_DIR}/$SUFFIX; 55 | bds /users/leepc12/code/bds_atac/chipseq/chipseq.bds \ 56 | -species hg19 -bwa_idx ${DATA_ROOT}/CTCF/bwa_index/male.hg19.fa.gz -chrsz ${DATA_ROOT}/CTCF/male.hg19.chrom.sizes \ 57 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/${SUFFIX}/out \ 58 | -fastq1 ${DATA_ROOT}/CTCF/SE/Snyder_CTCF_GM12878_SE_Rep1.fastq.gz \ 59 | -fastq2 ${DATA_ROOT}/CTCF/SE/Snyder_CTCF_GM12878_SE_Rep2.fastq.gz \ 60 | -ctl_fastq1 ${DATA_ROOT}/CTCF/SE/Snyder_Input_GM12878_SE.fastq.gz 61 | 62 | screen -RD HudsonAlpha 63 | 64 | DATA_ROOT=/srv/scratch/leepc12/data; WORK_DIR=/srv/scratch/leepc12/run/chipseq_test; SUFFIX=YY1/HudsonAlpha 65 | mkdir -p ${WORK_DIR}/$SUFFIX; cd ${WORK_DIR}/$SUFFIX; 66 | bds /users/leepc12/code/bds_atac/chipseq/chipseq.bds \ 67 | -species hg19 -bwa_idx ${DATA_ROOT}/CTCF/bwa_index/male.hg19.fa.gz -chrsz ${DATA_ROOT}/CTCF/male.hg19.chrom.sizes \ 68 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/${SUFFIX}/out \ 69 | -fastq1 ${DATA_ROOT}/YY1/HudsonAlpha/ENCFF000OHH.fastq.gz \ 70 | -fastq2 ${DATA_ROOT}/YY1/HudsonAlpha/ENCFF000OHO.fastq.gz \ 71 | -ctl_fastq1 ${DATA_ROOT}/YY1/HudsonAlpha/ENCFF000ODP.fastq.gz 72 | 73 | screen -RD Sydh 74 | 75 | DATA_ROOT=/srv/scratch/leepc12/data; WORK_DIR=/srv/scratch/leepc12/run/chipseq_test; SUFFIX=YY1/Sydh 76 | 
mkdir -p ${WORK_DIR}/$SUFFIX; cd ${WORK_DIR}/$SUFFIX; 77 | bds /users/leepc12/code/bds_atac/chipseq/chipseq.bds \ 78 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/${SUFFIX}/out \ 79 | -species hg19 -bwa_idx ${DATA_ROOT}/CTCF/bwa_index/male.hg19.fa.gz -chrsz ${DATA_ROOT}/CTCF/male.hg19.chrom.sizes \ 80 | -fastq1 ${DATA_ROOT}/YY1/Sydh/ENCFF000WGS.fastq.gz \ 81 | -fastq2 ${DATA_ROOT}/YY1/Sydh/ENCFF000WGT.fastq.gz \ 82 | -ctl_fastq1 ${DATA_ROOT}/YY1/Sydh/ENCFF000VWV.fastq.gz 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | WORK_DIR_OLD=/srv/scratch/leepc12/run/TF_chipseq_pipeline_test/$SUFFIX 115 | mkdir -p $WORK_DIR/$SUFFIX/out/align/rep1; cp -pr $WORK_DIR_OLD/out/align_rep1/*.bam $WORK_DIR/$SUFFIX/out/align/rep1/ 116 | mkdir -p $WORK_DIR/$SUFFIX/out/align/rep1; cp -pr $WORK_DIR_OLD/out/align_rep1/*.flagstat.qc $WORK_DIR/$SUFFIX/out/align/rep1/ 117 | mkdir -p $WORK_DIR/$SUFFIX/out/align/rep2; cp -pr $WORK_DIR_OLD/out/align_rep2/*.bam $WORK_DIR/$SUFFIX/out/align/rep2/ 118 | mkdir -p $WORK_DIR/$SUFFIX/out/align/rep2; cp -pr $WORK_DIR_OLD/out/align_rep2/*.flagstat.qc $WORK_DIR/$SUFFIX/out/align/rep2/ 119 | mkdir -p $WORK_DIR/$SUFFIX/out/align/ctl1; cp -pr $WORK_DIR_OLD/out/align_ctl_rep1/*.bam $WORK_DIR/$SUFFIX/out/align/ctl1/ 120 | mkdir -p $WORK_DIR/$SUFFIX/out/align/ctl1; cp -pr $WORK_DIR_OLD/out/align_ctl_rep1/*.flagstat.qc $WORK_DIR/$SUFFIX/out/align/ctl1/ 121 | mkdir -p $WORK_DIR/$SUFFIX/out/align/ctl2; cp -pr $WORK_DIR_OLD/out/align_ctl_rep2/*.bam $WORK_DIR/$SUFFIX/out/align/ctl2/ 122 | mkdir -p $WORK_DIR/$SUFFIX/out/align/ctl2; cp -pr $WORK_DIR_OLD/out/align_ctl_rep2/*.flagstat.qc $WORK_DIR/$SUFFIX/out/align/ctl2/ 123 | find . -name '*.nmsrt.bam' -delete 124 | find . 
-name '*.nodup.bam' -delete 125 | 126 | -------------------------------------------------------------------------------- /species/scg.conf: -------------------------------------------------------------------------------- 1 | [mm9] 2 | chrsz = /reference/ENCODE/pipeline_genome_data/mm9/mm9.chrom.sizes 3 | seq_dir = /reference/ENCODE/pipeline_genome_data/mm9/seq 4 | gensz = mm 5 | umap = /reference/ENCODE/pipeline_genome_data/mm9/globalmap_k20tok54 6 | bwa_idx = /reference/ENCODE/pipeline_genome_data/mm9/bwa_index/mm9.fa 7 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/mm9/bowtie2_index/mm9.fa 8 | ref_fa = /reference/ENCODE/pipeline_genome_data/mm9/mm9.fa 9 | blacklist = /reference/ENCODE/pipeline_genome_data/mm9/mm9-blacklist.bed.gz 10 | # data for ATAQC 11 | tss_enrich = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/mm9_gencode_tss_unique.bed.gz 12 | dnase = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/mm9_univ_dhs_ucsc.from_mm10.bed.gz 13 | prom = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/tss_mm9_master.from_mm10.bed.gz 14 | enh = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/mm9_enh_dhs_ucsc.from_mm10.bed.gz 15 | reg2map = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/dnase_avgs_merged_named.fseq.vals.gz 16 | reg2map_bed = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/mm9_dhs_universal_ucsc_v1.bed.gz 17 | roadmap_meta = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/accession_to_name.txt 18 | 19 | [mm10, mm10_ENCODE, mm10_ENCODE3] # from ENCODE portal 20 | chrsz = /reference/ENCODE/pipeline_genome_data/mm10/mm10.chrom.sizes 21 | seq_dir = /reference/ENCODE/pipeline_genome_data/mm10/seq 22 | gensz = mm 23 | bwa_idx = /reference/ENCODE/pipeline_genome_data/mm10/bwa_index/mm10_no_alt_analysis_set_ENCODE.fasta 24 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/mm10/bowtie2_index/mm10_no_alt_analysis_set_ENCODE.fasta 25 | ref_fa = /reference/ENCODE/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta 26 | blacklist = 
/reference/ENCODE/pipeline_genome_data/mm10/mm10.blacklist.bed.gz 27 | # data for ATAQC 28 | tss_enrich = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_gencode_tss_unique.bed.gz 29 | dnase = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_univ_dhs_ucsc.bed.gz 30 | prom = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/tss_mm10_master.bed.gz 31 | enh = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_enh_dhs_ucsc.bed.gz 32 | reg2map = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_formatted.txt.gz 33 | reg2map_bed = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_celltype_compare_subsample.bed.gz 34 | roadmap_meta = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_metadata.txt 35 | ENCODE_assembly = mm10 36 | 37 | [hg19] 38 | chrsz = /reference/ENCODE/pipeline_genome_data/hg19/hg19.chrom.sizes 39 | seq_dir = /reference/ENCODE/pipeline_genome_data/hg19/seq 40 | gensz = hs 41 | umap = /reference/ENCODE/pipeline_genome_data/hg19/globalmap_k20tok54 42 | bwa_idx = /reference/ENCODE/pipeline_genome_data/hg19/bwa_index/male.hg19.fa 43 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/hg19/bowtie2_index/male.hg19.fa 44 | ref_fa = /reference/ENCODE/pipeline_genome_data/hg19/male.hg19.fa 45 | blacklist = /reference/ENCODE/pipeline_genome_data/hg19/wgEncodeDacMapabilityConsensusExcludable.bed.gz 46 | # data for ATAQC 47 | tss_enrich = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/hg19_gencode_tss_unique.bed.gz 48 | dnase = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.bed.gz 49 | prom = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_prom_p2.bed.gz 50 | enh = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_enh_p2.bed.gz 51 | reg2map = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/dnase_avgs_reg2map_p10_merged_named.pvals.gz 52 | roadmap_meta = 
/reference/ENCODE/pipeline_genome_data/hg19/ataqc/eid_to_mnemonic.txt 53 | 54 | [hg38, hg38_ENCODE, hg38_ENCODE3] # from ENCODE portal 55 | chrsz = /reference/ENCODE/pipeline_genome_data/hg38/hg38.chrom.sizes 56 | seq_dir = /reference/ENCODE/pipeline_genome_data/hg38/seq 57 | gensz = hs 58 | bwa_idx = /reference/ENCODE/pipeline_genome_data/hg38/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 59 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/hg38/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 60 | ref_fa = /reference/ENCODE/pipeline_genome_data/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 61 | blacklist = /reference/ENCODE/pipeline_genome_data/hg38/hg38.blacklist.bed.gz 62 | # data for ATAQC 63 | tss_enrich = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/hg38_gencode_tss_unique.bed.gz 64 | dnase = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz 65 | prom = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz 66 | enh = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz 67 | reg2map = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz 68 | reg2map_bed = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz 69 | roadmap_meta = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt 70 | ENCODE_assembly = GRCh38 71 | 72 | [dm3] # installed by install_genome_data.sh 73 | chrsz = /reference/ENCODE/pipeline_genome_data/dm3/dm3.chrom.sizes 74 | seq_dir = /reference/ENCODE/pipeline_genome_data/dm3/seq 75 | gensz = 168736537 76 | bwa_idx = /reference/ENCODE/pipeline_genome_data/dm3/bwa_index/dm3.fa 77 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/dm3/bowtie2_index/dm3.fa 78 | ref_fa = /reference/ENCODE/pipeline_genome_data/dm3/dm3.fa 79 | 80 | 
[pantro5] # installed by install_genome_data.sh 81 | chrsz = /reference/ENCODE/pipeline_genome_data/pantro5/pantro5.chrom.sizes 82 | seq_dir = /reference/ENCODE/pipeline_genome_data/pantro5/seq 83 | gensz = 3231170666 84 | bwa_idx = /reference/ENCODE/pipeline_genome_data/pantro5/bwa_index/panTro5.fa 85 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/pantro5/bowtie2_index/panTro5.fa 86 | ref_fa = /reference/ENCODE/pipeline_genome_data/pantro5/panTro5.fa 87 | 88 | [macam7] # installed by install_genome_data.sh 89 | chrsz = /reference/ENCODE/pipeline_genome_data/macam7/macam7.chrom.sizes 90 | seq_dir = /reference/ENCODE/pipeline_genome_data/macam7/seq 91 | gensz = 2817542206 92 | bwa_idx = /reference/ENCODE/pipeline_genome_data/macam7/bwa_index/MacaM_Rhesus_Genome_v7.fasta 93 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/macam7/bowtie2_index/MacaM_Rhesus_Genome_v7.fasta 94 | ref_fa = /reference/ENCODE/pipeline_genome_data/macam7/MacaM_Rhesus_Genome_v7.fasta 95 | nonamecheck = true # for bedtools >= 2.24. 
this prevents name convention error in bedtools intersect 96 | 97 | [saccer3] # installed by install_genome_data.sh 98 | chrsz = /reference/ENCODE/pipeline_genome_data/saccer3/saccer3.chrom.sizes 99 | seq_dir = /reference/ENCODE/pipeline_genome_data/saccer3/seq 100 | gensz = 12157105 101 | bwa_idx = /reference/ENCODE/pipeline_genome_data/saccer3/bwa_index/sacCer3.fa 102 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/saccer3/bowtie2_index/sacCer3.fa 103 | ref_fa = /reference/ENCODE/pipeline_genome_data/saccer3/sacCer3.fa 104 | 105 | -------------------------------------------------------------------------------- /modules/align_bwa.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == align bwa settings (requirements: -bwa_idx) 9 | param_bwa_aln := "-q 5 -l 32 -k 2" help Parameters for bwa aln (default: "-q 5 -l 32 -k 2"). 10 | bwa_idx := "" help BWA index (full path prefix of *.bwt file) . 11 | wt_bwa := "47h" help Walltime for bwa (default: 47, 47:00:00). 12 | mem_bwa := "12G" help Max. memory for bwa (default: 12G). 13 | 14 | 15 | grp_color_bwa := "salmon" 16 | 17 | 18 | init_align_bwa() 19 | 20 | 21 | void init_align_bwa() { 22 | 23 | param_bwa_aln = get_conf_val( param_bwa_aln, ["param_bwa_aln"] ) 24 | bwa_idx = get_conf_val( bwa_idx, ["bwa_idx"] ) 25 | wt_bwa = get_conf_val( wt_bwa, ["wt_bwa"] ) 26 | mem_bwa = get_conf_val( mem_bwa, ["mem_bwa"] ) 27 | 28 | print("\n\n== align bwa settings\n") 29 | print( "Param. for bwa\t\t\t: $param_bwa_aln\n") 30 | print( "BWA index\t\t\t: $bwa_idx\n" ) 31 | print( "Walltime (bwa)\t\t\t: $wt_bwa\n") 32 | print( "Max. memory (bwa)\t\t: $mem_bwa\n") 33 | } 34 | 35 | void chk_align_bwa() { 36 | 37 | if ( !path_exists("$bwa_idx.bwt") ) error("\nBwa index (-bwa_idx) doesn't exists!
(file: $bwa_idx.bwt)\n") 38 | } 39 | 40 | string[] bwa( string fastq, string o_dir, string log_o_dir, string group, int nth_bwa ) { 41 | 42 | prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir ) 43 | prefix2 := replace_dir( prefix, log_o_dir ) 44 | bam := "$prefix.bam" 45 | qc := "$prefix2.flagstat.qc" 46 | 47 | in := [ fastq ] 48 | out := [ bam, qc ] 49 | 50 | if ( out <- in ) { // compare file timestamps of in and out (to check if job is already done or not) 51 | 52 | sai := bwa_aln( fastq, o_dir, group, nth_bwa ) 53 | wait 54 | 55 | bwa_sam( fastq, sai, o_dir, log_o_dir, group, nth_bwa ) 56 | wait 57 | 58 | sai.rm() // delete intermediate file sai 59 | } 60 | 61 | add_task_to_graph( in, out, group, "BWA\\n(SE)", grp_color_bwa ) 62 | 63 | return out 64 | } 65 | 66 | string[] bwa_PE( string fastq1, string fastq2, string o_dir, string log_o_dir, string group, int nth_bwa ) { 67 | 68 | prefix := replace_dir( rm_ext( fastq1, ["fastq","fq"] ), o_dir ) + ".PE2SE" 69 | prefix2 := replace_dir( prefix, log_o_dir ) 70 | bam := "$prefix.bam" 71 | qc := "$prefix2.flagstat.qc" 72 | 73 | in := [ fastq1, fastq2 ] 74 | out := [ bam, qc ] 75 | 76 | if ( out <- in ) { // compare file timestamps of in and out (to check if job is already done or not) 77 | 78 | nth_bwa_aln := distribute_nonzero( nth_bwa, [1,1] ) 79 | 80 | // parallel jobs 81 | sai1 := bwa_aln( fastq1, o_dir, group+"_1", nth_bwa_aln[0] ) 82 | sai2 := bwa_aln( fastq2, o_dir, group+"_2", nth_bwa_aln[1] ) 83 | 84 | wait 85 | 86 | bwa_sam_PE( fastq1, fastq2, sai1, sai2, o_dir, log_o_dir, group, nth_bwa ) 87 | wait 88 | 89 | sai1.rm() // delete intermediate file sai1, sai2 90 | sai2.rm() 91 | } 92 | 93 | add_task_to_graph( in, out, group, "BWA\\n(PE)", grp_color_bwa ) 94 | 95 | return out 96 | } 97 | 98 | string bwa_aln( string fastq, string o_dir, string group, int nth_bwa ) { 99 | 100 | prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir ) 101 | sai := "$prefix.sai" 102 | 103 | in := [ fastq ] 104 | 
out := sai 105 | 106 | taskName:= "bwa_aln " + group 107 | cpus := (nth_bwa==1) ? -1 : nth_bwa; mem := get_res_mem(mem_bwa,nth_bwa); timeout := get_res_wt(wt_bwa) 108 | 109 | wait_par( cpus ) 110 | 111 | tid := task( out<-in ) { 112 | 113 | sys $shcmd_init 114 | 115 | //# Map reads to create raw SAM file 116 | sys bwa aln $param_bwa_aln -t $nth_bwa $bwa_idx $fastq > $sai 117 | 118 | sys $shcmd_finalize 119 | } 120 | 121 | register_par( tid, cpus ) 122 | 123 | add_task_to_graph( in, out, group ) 124 | 125 | return out 126 | } 127 | 128 | string[] bwa_sam( string fastq, string sai, string o_dir, string log_o_dir, string group, int nth_bwa ) { 129 | 130 | prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir ) 131 | prefix2 := replace_dir( prefix, log_o_dir ) 132 | bam := "$prefix.bam" 133 | qc := "$prefix2.flagstat.qc" 134 | 135 | in := [ fastq, sai ] 136 | out := [ bam, qc ] 137 | 138 | taskName:= "bwa_sam " + group 139 | cpus := nth_bwa; mem := get_res_mem(mem_bwa,nth_bwa); timeout := get_res_wt(wt_bwa) 140 | 141 | wait_par( cpus ) 142 | 143 | tid := task( out<-in ) { 144 | 145 | sys $shcmd_init 146 | 147 | sys bwa samse $bwa_idx $sai $fastq | samtools view -Su - | samtools sort - $prefix 148 | sys samtools index $bam 149 | sys samtools flagstat $bam > $qc 150 | //sys bwa samse $bwa_idx $sai $fastq | samtools view -Su /dev/stdin \ 151 | // | sambamba sort -t 1 /dev/stdin -o $bam 152 | //sys sambamba flagstat -t 1 $bam > $qc 153 | 154 | sys $shcmd_finalize 155 | } 156 | 157 | register_par( tid, cpus ) 158 | 159 | add_task_to_graph( in, out, group ) 160 | 161 | return out 162 | } 163 | 164 | string[] bwa_sam_PE( string fastq1, string fastq2, string sai1, string sai2, string o_dir, string log_o_dir, string group, int nth_bwa ) { 165 | 166 | prefix := replace_dir( rm_ext( fastq1, ["fastq","fq"] ), o_dir ) + ".PE2SE" 167 | prefix2 := replace_dir( prefix, log_o_dir ) 168 | sam := "$prefix.sam.gz" 169 | badcigar:= "$prefix.badReads" 170 | bam := "$prefix.bam" 171 
| qc := "$prefix2.flagstat.qc" 172 | 173 | in := [ fastq1, fastq2, sai1, sai2 ] 174 | out := [ bam, qc ] 175 | 176 | taskName:= "bwa_sam_PE " + group 177 | cpus := nth_bwa; mem := get_res_mem(mem_bwa,nth_bwa); timeout := get_res_wt(wt_bwa) 178 | 179 | wait_par( cpus ) 180 | 181 | tid := task( out<-in ) { 182 | 183 | sys $shcmd_init 184 | 185 | sys bwa sampe $bwa_idx $sai1 $sai2 $fastq1 $fastq2 | pigz -p $nth_bwa -nc > $sam 186 | 187 | //# Remove read pairs with bad CIGAR strings and sort by position 188 | 189 | //# Find bad CIGAR read names 190 | //sys zcat $sam \ 191 | // | awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t"; }' \ 192 | // | sort | uniq > $badcigar 193 | 194 | sys pigz -p $nth_bwa -cd $sam \ 195 | | awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t"; }' \ 196 | | sort | uniq > $badcigar 197 | 198 | //# Remove bad CIGAR read pairs 199 | sys if [ $(cat $badcigar | wc -l) -gt 0 ]; then \ 200 | zcat $sam | grep -v -F -f $badcigar | samtools view -Su - | samtools sort - $prefix; \ 201 | else \ 202 | samtools view -Su $sam | samtools sort - $prefix; \ 203 | fi 204 | //sys if [ $(cat $badcigar | wc -l) -gt 0 ]; then \ 205 | // pigz -p $nth_bwa -cd $sam | grep -v -F -f $badcigar | samtools view -Su /dev/stdin \ 206 | // | sambamba sort -t 1 /dev/stdin -o $bam; \ 207 | // else \ 208 | // pigz -p $nth_bwa -cd $sam | samtools view -Su /dev/stdin | sambamba sort -t 1 /dev/stdin -o $bam; \ 209 | // fi 210 | 211 | sys samtools flagstat $bam > $qc 212 | sys samtools index $bam 213 | 214 | //sys sambamba flagstat -t 1 > $qc 215 | 216 | sys rm -f $badcigar $sam 217 | 218 | sys $shcmd_finalize 219 | } 220 | 221 | register_par( tid, cpus ) 222 | 223 | 
add_task_to_graph( in, out, group ) 224 | 225 | return out 226 | } 227 | -------------------------------------------------------------------------------- /modules/conf.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "sys.bds" 5 | 6 | 7 | help == configuration file settings 8 | c := "" help Configuration file path. 9 | env := "$script_dir/default.env" help Environment file path. 10 | 11 | 12 | string{} conf // map for configuration 13 | 14 | 15 | init_conf() 16 | 17 | 18 | void init_conf() { 19 | if ( is_cmd_line_arg_empty() ) \ 20 | print( "\nWarning: No parameters are given (specify cmd. line arguments or configuration file)!\n\n") 21 | if ( is_first_arg_conf() ) c = args[0] 22 | 23 | add_to_conf( c, "" ) // then read conf. file 24 | env = get_conf_val( env, ["env"] ) 25 | if ( path_exists( env ) ) add_to_conf( env, hostname ) 26 | add_to_conf( c, "" ) // read conf again to override 27 | 28 | print( "\n\n== configuration file info\n") 29 | print( "Hostname\t\t\t: $hostname\n") 30 | print( "Configuration file\t\t: $c\n" ) 31 | print( "Environment file\t\t: $env\n" ) 32 | } 33 | 34 | string{} read_conf( string file, string section ) { 35 | section = section.trim() 36 | string{} ret 37 | 38 | if ( file == "" ) return ret 39 | lines := file.read().split("\n") 40 | 41 | can_read := (section=="") ? true : false 42 | found_section := (section=="") ? 
true : false 43 | for ( string line : lines ) { 44 | line = rm_comment( line.trim() ) 45 | if ( line == "" ) continue 46 | 47 | if ( line.startsWith( "[" ) && line.endsWith( "]" ) ) { 48 | line2 := line.substr(1,line.length()-1) 49 | string[] hostnames 50 | string group 51 | // find group if exists 52 | arr := line2.split(":") 53 | if ( arr.size() > 1 ) group = arr[1].trim() 54 | hostnames = arr[0].split(",") 55 | if ( section == "" ) { 56 | can_read = false 57 | } 58 | else { 59 | for ( string host : hostnames ) { 60 | host = host.trim() 61 | if ( match_str( section, host ) ) { // one asterisk (wildcard chr: *) is allowed in hostname string 62 | if ( section == group ) { 63 | error("Recursion (section name == group) found in a conf. or an env. file!"+\ 64 | " (file: $file, section: $section, group: $group)\n") 65 | } 66 | else if ( group != "" ) { 67 | print("\tReading parameters from section group($group) in file($file)...\n") 68 | return read_conf( file, group ) 69 | } 70 | else { 71 | print("\tReading parameters from section ($host) in file($file)...\n") 72 | found_section = true 73 | can_read = true 74 | break; 75 | } 76 | } 77 | else { 78 | can_read = false 79 | } 80 | } 81 | } 82 | continue 83 | } 84 | 85 | if ( can_read ) { 86 | string key, val 87 | (key, val) = parse_conf_line( line ) 88 | ret{ key } = val 89 | } 90 | } 91 | if ( !found_section && section != "default" ) return read_conf( file, "default" ) 92 | 93 | return ret 94 | } 95 | 96 | string{} read_conf( string file ) { 97 | return read_conf( file, "" ) 98 | } 99 | 100 | void add_to_conf( string file, string section ) { 101 | 102 | tmp := read_conf( file, section ) 103 | 104 | for( string k : tmp.keys() ) conf{k} = tmp{k} 105 | } 106 | 107 | void add_to_conf( string file ) { 108 | tmp := read_conf( file ) 109 | for( string k : tmp.keys() ) { 110 | conf{k} = tmp{k} 111 | } 112 | } 113 | 114 | string[] parse_conf_line( string line ) { 115 | delims := [ "=", "\t" ] 116 | delim_found := false 117 | 
string key, val 118 | for ( string delim : delims ) { 119 | idx := line.indexOf( delim ) 120 | if ( idx > -1 ) { 121 | key = line.substr( 0, idx ).trim().toLower() 122 | val = line.substr( idx+1 ).trim() 123 | delim_found = true 124 | break 125 | } 126 | } 127 | if ( !delim_found ) error("No delimiter (=,\\t) found in line ($line) in the configruation file.\n") 128 | return [key, val] 129 | } 130 | 131 | int get_conf_val_int( int curr_val, string key ) { 132 | string{} tmp 133 | return parse_int( get_conf_val( curr_val, key, tmp ) ) 134 | } 135 | 136 | int get_conf_val_int( int curr_val, string[] keys ) { 137 | string{} tmp 138 | return parse_int( get_conf_val( curr_val, keys, tmp ) ) 139 | } 140 | 141 | bool get_conf_val_bool( bool curr_val, string key ) { 142 | string{} tmp 143 | return parse_bool( get_conf_val( curr_val, key, tmp ) ) 144 | } 145 | 146 | bool get_conf_val_bool( bool curr_val, string[] keys ) { 147 | string{} tmp 148 | return parse_bool( get_conf_val( curr_val, keys, tmp ) ) 149 | } 150 | 151 | real get_conf_val_real( real curr_val, string key ) { 152 | string{} tmp 153 | return parse_real( get_conf_val( curr_val, key, tmp ) ) 154 | } 155 | 156 | real get_conf_val_real( real curr_val, string[] keys ) { 157 | string{} tmp 158 | return parse_real( get_conf_val( curr_val, keys, tmp ) ) 159 | } 160 | 161 | int get_conf_val_int( int curr_val, string key, string{} _conf ) { 162 | return parse_int( get_conf_val( curr_val, key, _conf ) ) 163 | } 164 | 165 | int get_conf_val_int( int curr_val, string[] keys, string{} _conf ) { 166 | return parse_int( get_conf_val( curr_val, keys, _conf ) ) 167 | } 168 | 169 | bool get_conf_val_bool( bool curr_val, string key, string{} _conf ) { 170 | return parse_bool( get_conf_val( curr_val, key, _conf ) ) 171 | } 172 | 173 | bool get_conf_val_bool( bool curr_val, string[] keys, string{} _conf ) { 174 | return parse_bool( get_conf_val( curr_val, keys, _conf ) ) 175 | } 176 | 177 | real get_conf_val_real( real curr_val, 
string key, string{} _conf ) { 178 | return parse_real( get_conf_val( curr_val, key, _conf ) ) 179 | } 180 | 181 | real get_conf_val_real( real curr_val, string[] keys, string{} _conf ) { 182 | return parse_real( get_conf_val( curr_val, keys, _conf ) ) 183 | } 184 | 185 | string get_conf_val( string curr_val, string key, string{} _conf ) { 186 | key = key.toLower().trim() 187 | if ( cmd_line_arg_has_key( key ) ) return curr_val 188 | if ( _conf.size() == 0 ) { 189 | if ( conf.hasKey( key ) ) { 190 | return (conf{ key } != "") ? substitute_var( rm_comment( conf{ key } ) ) : curr_val 191 | } 192 | } 193 | else { 194 | if ( _conf.hasKey( key ) ) { 195 | return (_conf{ key } != "") ? substitute_var( rm_comment( _conf{ key } ) ) : curr_val 196 | } 197 | } 198 | return curr_val 199 | } 200 | 201 | string substitute_var( string var ) { 202 | var = var.replace("\$script_dir","$script_dir").replace("\${script_dir}","$script_dir") 203 | var = var.replace("~/","$HOME/").replace("\$HOME","$HOME").replace("\${HOME}","$HOME") 204 | return var 205 | } 206 | 207 | string get_conf_val( string curr_val, string[] keys, string{} _conf ) { 208 | for ( string key : keys ) { 209 | val := get_conf_val( curr_val, key, _conf ) 210 | if ( val != curr_val ) return val 211 | } 212 | return curr_val 213 | } 214 | 215 | string get_conf_val( string curr_val, string key ) { 216 | string{} tmp 217 | return get_conf_val( curr_val, key, tmp ) 218 | } 219 | 220 | string get_conf_val( string curr_val, string[] keys ) { 221 | string{} tmp 222 | return get_conf_val( curr_val, keys, tmp ) 223 | } 224 | 225 | bool has_conf_key( string key, string{} _conf ) { 226 | key = key.toLower() 227 | return (_conf.size()==0) ? 
conf.hasKey( key ) : _conf.hasKey( key ) 228 | } 229 | 230 | bool has_conf_key( string key ) { 231 | string{} tmp 232 | return has_conf_key( key, tmp ) // FIX: was has_conf_key( key ) — the 1-arg overload called itself, infinite recursion; pass the empty map to reach the 2-arg overload (same wrapper pattern as get_conf_val) 233 | } 234 | 235 | bool conf_file_exists() { 236 | if ( c!="" ) return c.exists() 237 | return false 238 | } 239 | 240 | bool has_key_in_conf_or_cmd_line( string key ) { 241 | return cmd_line_arg_has_key( key )// || has_conf_key( key ) 242 | } 243 | 244 | -------------------------------------------------------------------------------- /modules/input.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "input_fastq.bds" 5 | include "input_bam.bds" 6 | include "input_tagalign.bds" 7 | include "input_peak.bds" 8 | 9 | 10 | help == input endedness settings (SE or PE) : 11 | se := false help Singled-ended data set. To specify it for each replicate, '-se[REP_ID]' for exp. reps, '-ctl_se[CTL_ID]' for control. 12 | pe := false help Paired end data set. To specify it for each replicate, '-pe[REP_ID]' for exp. reps, '-ctl_pe[CTL_ID]' for controls. 13 | 14 | default_is_pe := false // default is se 15 | 16 | 17 | init_input() 18 | 19 | void init_input() { 20 | se = get_conf_val_bool( se, ["se"] ) 21 | pe = get_conf_val_bool( pe, ["pe"] ) 22 | } 23 | 24 | //// ctl==0: exp. replicate, ctl==1: control 25 | 26 | void chk_input( bool true_rep, bool no_pseudo_rep ) { 27 | if ( is_input_peak() ) { 28 | 29 | chk_input_peak( true_rep, no_pseudo_rep ) 30 | return 31 | } 32 | print( "\n\n== checking input files ...\n\n" ); 33 | 34 | string[] data_all 35 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 36 | if ( ctl==1 && !ctl_exists() ) continue 37 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 38 | string[] data 39 | 40 | prefix := (ctl==1) ? "Control " : "" 41 | suffix := is_pe( ctl, rep ) ? 
" (PE)" : " (SE)" 42 | 43 | if ( is_input_fastq( ctl, rep ) ) { 44 | prefix = prefix + "Rep$rep fastq" + suffix 45 | fastqs := get_fastqs( ctl, rep ) 46 | if ( fastqs.size()==0 ) { 47 | data.push( "" ) 48 | } 49 | else { 50 | for ( string fastq : fastqs ) data.push( fastq ) 51 | } 52 | } 53 | else if ( is_input_bam( ctl, rep ) ) { 54 | prefix = prefix +"Rep$rep bam" + suffix 55 | data.push( get_bam( ctl, rep ) ) 56 | } 57 | else if ( is_input_filt_bam( ctl, rep ) ) { 58 | prefix = prefix +"Rep$rep filt_bam" + suffix 59 | data.push( get_filt_bam( ctl, rep ) ) 60 | } 61 | else if ( is_input_tag( ctl, rep ) ) { 62 | prefix = prefix + "Rep$rep tagalign" + suffix 63 | data.push( get_tag( ctl, rep ) ) 64 | } 65 | 66 | print("$prefix :\n") 67 | for ( string s : data ) { 68 | print("\t$s\n") 69 | if ( (s != "") && !path_exists(s) ) error("\t\tFile not found!\n") 70 | } 71 | 72 | // if data is missing 73 | if ( data[0] == "" ) { 74 | if ( (rep>=2) && (ctl==1) ) \ 75 | print( "\tWarning: $prefix missing! using control 1 for calling peaks on replicate $rep\n") 76 | else if ( (rep==2) && (ctl==0) ) \ 77 | print( "\tWarning: $prefix missing! peak will be called for replicate 1 only\n") 78 | else \ 79 | error( "\t$prefix missing!\n") 80 | continue 81 | } 82 | // check any duplicate input filename 83 | for ( string s : data ) { 84 | if ( is_in_array( get_basename( s ), get_basename( data_all ) ) ) \ 85 | error( "\t$prefix has duplicate filename!\n") 86 | } 87 | data_all = merge( data_all, data ) 88 | } 89 | } 90 | } 91 | 92 | string[] get_input_files( int ctl, int rep ) { 93 | string[] empty 94 | 95 | if ( is_input_fastq( ctl, rep ) ) { 96 | return get_fastqs( ctl, rep ) 97 | } 98 | else if ( is_input_bam( ctl, rep ) ) { 99 | bam := get_bam( ctl, rep ) 100 | return bam=="" ? empty : [bam] 101 | } 102 | else if ( is_input_filt_bam( ctl, rep ) ) { 103 | filt_bam := get_filt_bam( ctl, rep ) 104 | return filt_bam=="" ? 
empty : [filt_bam] 105 | } 106 | else if ( is_input_tag( ctl, rep ) ) { 107 | tag := get_tag( ctl, rep ) 108 | return tag=="" ? empty : [tag] 109 | } 110 | else { 111 | return empty 112 | } 113 | } 114 | 115 | string[] get_input_files( int rep ) { 116 | return get_input_files( 0, rep ) 117 | } 118 | 119 | bool input_file_exists( int ctl, int rep ) { 120 | string[] input_files = get_input_files( ctl, rep ) 121 | return input_files.size() > 0 122 | } 123 | 124 | bool input_file_exists( int rep ) { 125 | return input_file_exists( 0, rep ) 126 | } 127 | 128 | int get_num_rep( int ctl ) { 129 | rep := 1 130 | while( get_input_files( ctl, rep ).size() > 0 ) rep++ 131 | 132 | num_rep := rep-1 133 | return num_rep 134 | } 135 | 136 | int get_num_rep() { 137 | return is_input_peak() ? get_num_rep_peak() : get_num_rep( 0 ) 138 | } 139 | 140 | bool is_pe( int ctl, int rep ) { 141 | if ( pe ) return true 142 | if ( se ) return false 143 | 144 | key_pe := ( ctl > 0 ? "ctl_pe" : "pe" ) + rep 145 | key_pe_ctl := "ctl_pe" 146 | key_se := ( ctl > 0 ? 
"ctl_se" : "se" ) + rep 147 | 148 | if ( cmd_line_arg_has_key( key_pe ) ) { 149 | return true 150 | } 151 | else if ( cmd_line_arg_has_key( key_se ) ) { 152 | return false 153 | } 154 | else if ( ctl==1 && cmd_line_arg_has_key( key_pe_ctl ) ) { 155 | return true 156 | } 157 | else { 158 | if ( conf.hasKey( key_pe ) && parse_bool( conf{ key_pe } ) ) return true 159 | if ( conf.hasKey( key_se ) && parse_bool( conf{ key_se } ) ) return false 160 | if ( ctl==1 && conf.hasKey( key_pe_ctl ) && parse_bool( conf{ key_pe_ctl } ) ) return true 161 | } 162 | 163 | if ( is_input_fastq( ctl, rep ) ) { 164 | fastqs := get_fastq( ctl, rep, 2 ) 165 | return fastqs.size() > 0 166 | } 167 | 168 | if ( default_is_pe ) return true 169 | else return false 170 | } 171 | 172 | bool is_se( int ctl, int rep ) { 173 | return !is_pe( ctl, rep ) 174 | } 175 | 176 | bool is_pe( int rep ) { 177 | return is_pe( 0, rep ) 178 | } 179 | 180 | bool is_se( int rep ) { 181 | return !is_pe( 0, rep ) 182 | } 183 | 184 | bool has_input_fastq() { 185 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 186 | if ( ctl==1 && !ctl_exists() ) continue 187 | 188 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 189 | if ( is_input_fastq( ctl, rep ) ) return true 190 | } 191 | } 192 | return false 193 | } 194 | 195 | bool has_pe_input_fastq() { 196 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 197 | if ( ctl==1 && !ctl_exists() ) continue 198 | 199 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 200 | if ( is_input_fastq( ctl, rep ) && is_pe( ctl, rep ) ) return true 201 | } 202 | } 203 | return false 204 | } 205 | 206 | bool has_pe_input_tag( int ctl ) { 207 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 208 | 209 | if ( is_input_tag( ctl, rep ) && is_pe( ctl, rep ) ) return true 210 | } 211 | return false 212 | } 213 | 214 | bool has_pe_input_tag() { 215 | return has_pe_input_tag( 0 ) 216 | } 217 
| 218 | bool has_pe() { 219 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 220 | if ( ctl==1 && !ctl_exists() ) continue 221 | 222 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 223 | if ( is_pe( ctl, rep ) ) return true 224 | } 225 | } 226 | return false 227 | } 228 | 229 | bool has_se() { 230 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 231 | if ( ctl==1 && !ctl_exists() ) continue 232 | 233 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 234 | if ( !is_pe( ctl, rep ) ) return true 235 | } 236 | } 237 | return false 238 | } 239 | 240 | bool ctl_exists() { 241 | return input_file_exists( 1, 1 ) 242 | } 243 | 244 | string get_long_group_name( int ctl, int rep ) { 245 | return ( (ctl>0) ? "Control " : "Replicate ") + rep 246 | } 247 | 248 | string get_long_group_name( int rep ) { 249 | return "Replicate "+ rep 250 | } 251 | 252 | string get_group_name( int ctl, int rep ) { 253 | return ( (ctl>0) ? 
"ctl" : "rep") + rep 254 | } 255 | 256 | string get_group_name( int rep ) { 257 | return "rep" + rep 258 | } 259 | -------------------------------------------------------------------------------- /species/sherlock.conf: -------------------------------------------------------------------------------- 1 | [mm9] 2 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/mm9/mm9.chrom.sizes 3 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/mm9/seq 4 | gensz = mm 5 | umap = /home/groups/cherry/encode/pipeline_genome_data/mm9/globalmap_k20tok54 6 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/mm9/bwa_index/mm9.fa 7 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/mm9/bowtie2_index/mm9.fa 8 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/mm9/mm9.fa 9 | blacklist = /home/groups/cherry/encode/pipeline_genome_data/mm9/mm9-blacklist.bed.gz 10 | # data for ATAQC 11 | tss_enrich = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/mm9_gencode_tss_unique.bed.gz 12 | dnase = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/mm9_univ_dhs_ucsc.from_mm10.bed.gz 13 | prom = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/tss_mm9_master.from_mm10.bed.gz 14 | enh = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/mm9_enh_dhs_ucsc.from_mm10.bed.gz 15 | reg2map = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/dnase_avgs_merged_named.fseq.vals.gz 16 | reg2map_bed = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/mm9_dhs_universal_ucsc_v1.bed.gz 17 | roadmap_meta = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/accession_to_name.txt 18 | 19 | [mm10, mm10_ENCODE, mm10_ENCODE3] # from ENCODE portal 20 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/mm10/mm10.chrom.sizes 21 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/mm10/seq 22 | gensz = mm 23 | bwa_idx = 
/home/groups/cherry/encode/pipeline_genome_data/mm10/bwa_index/mm10_no_alt_analysis_set_ENCODE.fasta 24 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/mm10/bowtie2_index/mm10_no_alt_analysis_set_ENCODE.fasta 25 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta 26 | blacklist = /home/groups/cherry/encode/pipeline_genome_data/mm10/mm10.blacklist.bed.gz 27 | # data for ATAQC 28 | tss_enrich = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_gencode_tss_unique.bed.gz 29 | dnase = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_univ_dhs_ucsc.bed.gz 30 | prom = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/tss_mm10_master.bed.gz 31 | enh = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_enh_dhs_ucsc.bed.gz 32 | reg2map = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_formatted.txt.gz 33 | reg2map_bed = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_celltype_compare_subsample.bed.gz 34 | roadmap_meta = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_metadata.txt 35 | ENCODE_assembly = mm10 36 | 37 | [hg19] 38 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/hg19/hg19.chrom.sizes 39 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/hg19/seq 40 | gensz = hs 41 | umap = /home/groups/cherry/encode/pipeline_genome_data/hg19/globalmap_k20tok54 42 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/hg19/bwa_index/male.hg19.fa 43 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/hg19/bowtie2_index/male.hg19.fa 44 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/hg19/male.hg19.fa 45 | blacklist = /home/groups/cherry/encode/pipeline_genome_data/hg19/wgEncodeDacMapabilityConsensusExcludable.bed.gz 46 | # data for ATAQC 47 | tss_enrich = 
/home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/hg19_gencode_tss_unique.bed.gz 48 | dnase = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.bed.gz 49 | prom = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_prom_p2.bed.gz 50 | enh = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_enh_p2.bed.gz 51 | reg2map = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/dnase_avgs_reg2map_p10_merged_named.pvals.gz 52 | roadmap_meta = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/eid_to_mnemonic.txt 53 | 54 | [hg38, hg38_ENCODE, hg38_ENCODE3] # from ENCODE portal 55 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/hg38/hg38.chrom.sizes 56 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/hg38/seq 57 | gensz = hs 58 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/hg38/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 59 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/hg38/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 60 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 61 | blacklist = /home/groups/cherry/encode/pipeline_genome_data/hg38/hg38.blacklist.bed.gz 62 | # data for ATAQC 63 | tss_enrich = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/hg38_gencode_tss_unique.bed.gz 64 | dnase = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz 65 | prom = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz 66 | enh = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz 67 | reg2map = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz 68 | 
reg2map_bed = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz 69 | roadmap_meta = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt 70 | ENCODE_assembly = GRCh38 71 | 72 | [dm3] # installed by install_genome_data.sh 73 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/dm3/dm3.chrom.sizes 74 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/dm3/seq 75 | gensz = 168736537 76 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/dm3/bwa_index/dm3.fa 77 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/dm3/bowtie2_index/dm3.fa 78 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/dm3/dm3.fa 79 | 80 | [pantro5] # installed by install_genome_data.sh 81 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/pantro5/pantro5.chrom.sizes 82 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/pantro5/seq 83 | gensz = 3231170666 84 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/pantro5/bwa_index/panTro5.fa 85 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/pantro5/bowtie2_index/panTro5.fa 86 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/pantro5/panTro5.fa 87 | 88 | [macam7] # installed by install_genome_data.sh 89 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/macam7/macam7.chrom.sizes 90 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/macam7/seq 91 | gensz = 2817542206 92 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/macam7/bwa_index/MacaM_Rhesus_Genome_v7.fasta 93 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/macam7/bowtie2_index/MacaM_Rhesus_Genome_v7.fasta 94 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/macam7/MacaM_Rhesus_Genome_v7.fasta 95 | nonamecheck = true # for bedtools >= 2.24. 
this prevents name convention error in bedtools intersect 96 | 97 | [saccer3] # installed by install_genome_data.sh 98 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/saccer3/saccer3.chrom.sizes 99 | seq = /home/groups/cherry/encode/pipeline_genome_data/saccer3/seq 100 | gensz = 12157105 101 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/saccer3/bwa_index/sacCer3.fa 102 | bwt2_idx= /home/groups/cherry/encode/pipeline_genome_data/saccer3/bowtie2_index/sacCer3.fa 103 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/saccer3/sacCer3.fa 104 | 105 | -------------------------------------------------------------------------------- /species/kundaje.conf: -------------------------------------------------------------------------------- 1 | [mm9] 2 | chrsz = /mnt/data/pipeline_genome_data/mm9/mm9.chrom.sizes 3 | seq_dir = /mnt/data/pipeline_genome_data/mm9/seq 4 | gensz = mm 5 | umap = /mnt/data/pipeline_genome_data/mm9/globalmap_k20tok54 6 | bwa_idx = /mnt/data/pipeline_genome_data/mm9/bwa_index/mm9.fa 7 | bwt2_idx = /mnt/data/pipeline_genome_data/mm9/bowtie2_index/mm9.fa 8 | ref_fa = /mnt/data/pipeline_genome_data/mm9/mm9.fa 9 | blacklist = /mnt/data/pipeline_genome_data/mm9/mm9-blacklist.bed.gz 10 | # data for ATAQC 11 | tss_enrich = /mnt/data/pipeline_genome_data/mm9/ataqc/mm9_gencode_tss_unique.bed.gz 12 | dnase = /mnt/data/pipeline_genome_data/mm9/ataqc/mm9_univ_dhs_ucsc.from_mm10.bed.gz 13 | prom = /mnt/data/pipeline_genome_data/mm9/ataqc/tss_mm9_master.from_mm10.bed.gz 14 | enh = /mnt/data/pipeline_genome_data/mm9/ataqc/mm9_enh_dhs_ucsc.from_mm10.bed.gz 15 | reg2map = /mnt/data/pipeline_genome_data/mm9/ataqc/dnase_avgs_merged_named.fseq.vals.gz 16 | roadmap_meta = /mnt/data/pipeline_genome_data/mm9/ataqc/accession_to_name.txt 17 | 18 | [mm10, mm10_ENCODE, mm10_ENCODE3] # from ENCODE portal 19 | chrsz = /mnt/data/pipeline_genome_data/mm10/mm10.chrom.sizes 20 | seq_dir = /mnt/data/pipeline_genome_data/mm10/seq 21 | gensz = mm 22 | bwa_idx 
= /mnt/data/pipeline_genome_data/mm10/bwa_index/mm10_no_alt_analysis_set_ENCODE.fasta 23 | bwt2_idx = /mnt/data/pipeline_genome_data/mm10/bowtie2_index/mm10_no_alt_analysis_set_ENCODE.fasta 24 | ref_fa = /mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta 25 | blacklist = /mnt/data/pipeline_genome_data/mm10/mm10.blacklist.bed.gz 26 | # data for ATAQC 27 | tss_enrich = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_gencode_tss_unique.bed.gz 28 | dnase = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_univ_dhs_ucsc.bed.gz 29 | prom = /mnt/data/pipeline_genome_data/mm10/ataqc/tss_mm10_master.bed.gz 30 | enh = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_enh_dhs_ucsc.bed.gz 31 | reg2map = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_formatted.txt.gz 32 | reg2map_bed = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_celltype_compare_subsample.bed.gz 33 | roadmap_meta = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_metadata.txt 34 | ENCODE_assembly = mm10 35 | 36 | [hg19] 37 | chrsz = /mnt/data/pipeline_genome_data/hg19/hg19.chrom.sizes 38 | seq_dir = /mnt/data/pipeline_genome_data/hg19/seq 39 | gensz = hs 40 | umap = /mnt/data/pipeline_genome_data/hg19/globalmap_k20tok54 41 | bwa_idx = /mnt/data/pipeline_genome_data/hg19/bwa_index/male.hg19.fa 42 | bwt2_idx = /mnt/data/pipeline_genome_data/hg19/bowtie2_index/male.hg19.fa 43 | ref_fa = /mnt/data/pipeline_genome_data/hg19/male.hg19.fa 44 | blacklist = /mnt/data/pipeline_genome_data/hg19/wgEncodeDacMapabilityConsensusExcludable.bed.gz 45 | 46 | mappability_map_peakseq = /mnt/data/pipeline_genome_data/hg19/Mapability_HG.txt 47 | 48 | # data for ATAQC 49 | tss_enrich = /mnt/data/pipeline_genome_data/hg19/ataqc/hg19_gencode_tss_unique.bed.gz 50 | dnase = /mnt/data/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.bed.gz 51 | prom = /mnt/data/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_prom_p2.bed.gz 52 | enh = 
/mnt/data/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_enh_p2.bed.gz 53 | reg2map = /mnt/data/pipeline_genome_data/hg19/ataqc/dnase_avgs_reg2map_p10_merged_named.pvals.gz 54 | roadmap_meta = /mnt/data/pipeline_genome_data/hg19/ataqc/eid_to_mnemonic.txt 55 | 56 | [hg38, hg38_ENCODE, hg38_ENCODE3] # from ENCODE portal 57 | chrsz = /mnt/data/pipeline_genome_data/hg38/hg38.chrom.sizes 58 | seq_dir = /mnt/data/pipeline_genome_data/hg38/seq 59 | gensz = hs 60 | bwa_idx = /mnt/data/pipeline_genome_data/hg38/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 61 | bwt2_idx = /mnt/data/pipeline_genome_data/hg38/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 62 | ref_fa = /mnt/data/pipeline_genome_data/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 63 | blacklist = /mnt/data/pipeline_genome_data/hg38/hg38.blacklist.bed.gz 64 | # data for ATAQC 65 | tss_enrich = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_gencode_tss_unique.bed.gz 66 | dnase = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz 67 | prom = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz 68 | enh = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz 69 | reg2map = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz 70 | reg2map_bed = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz 71 | roadmap_meta = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt 72 | ENCODE_assembly = GRCh38 73 | 74 | [hg38_chr19_chrM] # hg38 with chr19 and chrM only 75 | chrsz = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/hg38_chr19_chrM.chrom.sizes 76 | seq_dir = /mnt/data/pipeline_genome_data/hg38/seq 77 | gensz = hs 78 | bwa_idx = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.chr19_chrM.fasta 
79 | bwt2_idx = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.chr19_chrM.fasta 80 | ref_fa = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/GRCh38_no_alt_analysis_set_GCA_000001405.15.chr19_chrM.fasta 81 | blacklist = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/hg38.blacklist.bed.gz 82 | # data for ATAQC 83 | tss_enrich = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_gencode_tss_unique.bed.gz 84 | dnase = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz 85 | prom = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz 86 | enh = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz 87 | reg2map = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz 88 | reg2map_bed = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz 89 | roadmap_meta = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt 90 | ENCODE_assembly = GRCh38 91 | 92 | [dm3] # installed by install_genome_data.sh 93 | chrsz = /mnt/data/pipeline_genome_data/dm3/dm3.chrom.sizes 94 | seq_dir = /mnt/data/pipeline_genome_data/dm3/seq 95 | gensz = 168736537 96 | bwa_idx = /mnt/data/pipeline_genome_data/dm3/bwa_index/dm3.fa 97 | bwt2_idx = /mnt/data/pipeline_genome_data/dm3/bowtie2_index/dm3.fa 98 | ref_fa = /mnt/data/pipeline_genome_data/dm3/dm3.fa 99 | 100 | [pantro5] # installed by install_genome_data.sh 101 | chrsz = /mnt/data/pipeline_genome_data/pantro5/pantro5.chrom.sizes 102 | seq_dir = /mnt/data/pipeline_genome_data/pantro5/seq 103 | gensz = 3231170666 104 | bwa_idx = /mnt/data/pipeline_genome_data/pantro5/bwa_index/panTro5.fa 105 | bwt2_idx = /mnt/data/pipeline_genome_data/pantro5/bowtie2_index/panTro5.fa 106 | ref_fa = /mnt/data/pipeline_genome_data/pantro5/panTro5.fa 107 | 108 | [macam7] # installed by 
install_genome_data.sh 109 | chrsz = /mnt/data/pipeline_genome_data/macam7/macam7.chrom.sizes 110 | seq_dir = /mnt/data/pipeline_genome_data/macam7/seq 111 | gensz = 2817542206 112 | bwa_idx = /mnt/data/pipeline_genome_data/macam7/bwa_index/MacaM_Rhesus_Genome_v7.fasta 113 | bwt2_idx = /mnt/data/pipeline_genome_data/macam7/bowtie2_index/MacaM_Rhesus_Genome_v7.fasta 114 | ref_fa = /mnt/data/pipeline_genome_data/macam7/MacaM_Rhesus_Genome_v7.fasta 115 | nonamecheck = true # for bedtools >= 2.24. this prevents name convention error in bedtools intersect 116 | 117 | [saccer3] # installed by install_genome_data.sh 118 | chrsz = /mnt/data/pipeline_genome_data/saccer3/saccer3.chrom.sizes 119 | seq = /mnt/data/pipeline_genome_data/saccer3/seq 120 | gensz = 12157105 121 | bwa_idx = /mnt/data/pipeline_genome_data/saccer3/bwa_index/sacCer3.fa 122 | bwt2_idx= /mnt/data/pipeline_genome_data/saccer3/bowtie2_index/sacCer3.fa 123 | ref_fa = /mnt/data/pipeline_genome_data/saccer3/sacCer3.fa 124 | 125 | -------------------------------------------------------------------------------- /examples/scripts/make_bds_cmds_PE.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import os 4 | import sys 5 | import operator 6 | 7 | 8 | def get_files_by_file_size(dirname, reverse=False): 9 | """ Return list of file paths in directory sorted by file size """ 10 | 11 | # Get list of files 12 | filepaths = [] 13 | for basename in os.listdir(dirname): 14 | filename = os.path.join(dirname, basename) 15 | if os.path.isfile(filename): 16 | filepaths.append(filename) 17 | 18 | # Re-populate list with filename, size tuples 19 | for i in xrange(len(filepaths)): 20 | filepaths[i] = (filepaths[i], os.path.getsize(filepaths[i])) 21 | 22 | # Sort list by file size 23 | # If reverse=True sort from largest to smallest 24 | # If reverse=False sort from smallest to largest 25 | filepaths.sort(key=lambda filename: filename[1], reverse=reverse) 26 
| 27 | # Re-populate list with just filenames 28 | for i in xrange(len(filepaths)): 29 | filepaths[i] = filepaths[i][0] 30 | 31 | return filepaths 32 | 33 | fsize = dict() 34 | mp = dict() 35 | 36 | order=["ATF7", "FOS", "ATF2", "CREB1", "E2F1", "EGR1", "TCF12", "TCF7L2", "NANOG", "FOXA2", "HNF4A", "FOXA1", "TAF1", "GABPA", "CEBPB", "REST", "MAX", "CTCF", "MYC", "SPI1", "JUND", "MAFK", "GATA3", "FOSL2", "YY1", "ZNF143", "E2F6", "RFX5", "SIX5", "ATF3", "RCOR1", "TBP", "SRF", "TEAD4", "EP300", "STAT3", "ARID3A"] 37 | 38 | lst = get_files_by_file_size(os.getcwd(), True) 39 | #lst.sort() 40 | 41 | blacklist_ctl = [] #["GM12878","SK-N-SH","K562","HeLa-S3","GM20000","GM13977","HL-60","pancreas"] 42 | 43 | #ctl_to_subsample = ["CONTROL.K562.unpaired.fastq.gz", "CONTROL.GM12878.unpaired.fastq.gz", "CONTROL.HepG2.unpaired.fastq.gz", "CONTROL.SK-N-SH.unpaired.fastq.gz", "CONTROL.HeLa-S3.unpaired.fastq.gz", "CONTROL.H1-hESC.unpaired.fastq.gz", "CONTROL.MCF-7.unpaired.fastq.gz", "CONTROL.A549.unpaired.fastq.gz", "CONTROL.liver.BSID_ENCBS401URL.unpaired.fastq.gz", "CONTROL.Panc1.unpaired.fastq.gz", "CONTROL.HCT116.unpaired.fastq.gz", "CONTROL.liver.BSID_ENCBS046RNA.unpaired.fastq.gz", "CONTROL.PC-3.unpaired.fastq.gz", "CONTROL.B_cell.unpaired.fastq.gz", "CONTROL.fibroblast_of_lung.unpaired.fastq.gz", "CONTROL.endothelial_cell_of_umbilical_vein.unpaired.fastq.gz"] 44 | ctl_to_subsample = ["CONTROL.K562.unpaired.fastq.gz", "CONTROL.GM12878.unpaired.fastq.gz", "CONTROL.HepG2.unpaired.fastq.gz", "CONTROL.SK-N-SH.unpaired.fastq.gz", "CONTROL.HeLa-S3.unpaired.fastq.gz", "CONTROL.H1-hESC.unpaired.fastq.gz", "CONTROL.MCF-7.unpaired.fastq.gz", "CONTROL.A549.unpaired.fastq.gz", "CONTROL.liver.BSID_ENCBS401URL.unpaired.fastq.gz", "CONTROL.Panc1.unpaired.fastq.gz", "CONTROL.HCT116.unpaired.fastq.gz", "CONTROL.liver.BSID_ENCBS046RNA.unpaired.fastq.gz", "CONTROL.PC-3.unpaired.fastq.gz", "CONTROL.B_cell.unpaired.fastq.gz", "CONTROL.fibroblast_of_lung.unpaired.fastq.gz", 
"CONTROL.endothelial_cell_of_umbilical_vein.unpaired.fastq.gz", "CONTROL.astrocyte.unpaired.fastq.gz", "CONTROL.NT2_D1.unpaired.fastq.gz", "CONTROL.myotube.unpaired.fastq.gz", "CONTROL.induced_pluripotent_stem_cell.unpaired.fastq.gz", "CONTROL.GM12892.unpaired.fastq.gz", "CONTROL.HL-60.unpaired.fastq.gz", "CONTROL.foreskin_fibroblast.unpaired.fastq.gz", "CONTROL.IMR-90.unpaired.fastq.gz", "CONTROL.T47D.unpaired.fastq.gz"] 45 | 46 | f = open("CTCF_blacklist.txt") 47 | blacklist_fastq = f.read().splitlines() 48 | 49 | #print blacklist_fastq 50 | #sys.exit(1) 51 | 52 | for i in lst: 53 | prefix = os.path.basename(i).rsplit(".BSID",1)[0] 54 | 55 | if ".unpaired." in i or not "CHIPseq" in i: 56 | continue 57 | 58 | if os.path.basename(i) in blacklist_fastq: 59 | continue 60 | 61 | prefixCTL = prefix.rsplit(".")[2] 62 | 63 | if prefixCTL in blacklist_ctl: 64 | continue 65 | 66 | filesize = os.path.getsize(i) 67 | 68 | idx = 1 69 | for o in order: 70 | if "."+o+"." in prefix: 71 | break; 72 | idx = idx + 1 73 | 74 | if prefix in mp.keys(): 75 | mp[prefix].append( os.path.basename(i) ) 76 | fsize[prefix] = (fsize[prefix][0] + filesize, idx) 77 | else: 78 | mp[prefix] = [] 79 | mp[prefix].append( os.path.basename(i) ) 80 | fsize[prefix] = (filesize, idx) 81 | 82 | sorted_fsize = sorted(fsize.items(), key=operator.itemgetter(1),reverse=True) 83 | 84 | i = 0 85 | 86 | sorted_fsize3 = sorted(sorted_fsize, key=operator.itemgetter(2),reverse=False) 87 | sorted_fsize4 = sorted(sorted_fsize3, key=operator.itemgetter(3),reverse=False) 88 | 89 | cnt = 0 90 | for tup in sorted_fsize4: 91 | key = tup[0] 92 | cnt = cnt + 1 93 | 94 | length = len(mp[key]) 95 | 96 | filesize = tup[1] 97 | o = tup[2] 98 | svr = tup[3] 99 | #nth = filesize/1000000000 100 | nth = filesize/500000000 101 | if nth == 0: 102 | nth = 1 103 | 104 | svr_name = "NONE" 105 | if svr==1: 106 | svr_name="scg" 107 | elif svr==2: 108 | svr_name="nandi" 109 | elif svr==3: 110 | svr_name="mitra" 111 | elif svr==4: 112 | 
svr_name="kali" 113 | elif svr==5: 114 | svr_name="kadru" 115 | elif svr==6: 116 | svr_name="wotan" 117 | else: 118 | svr_name="NULL" 119 | 120 | print "#"+ str(cnt) + " , " + str(filesize) + ", order: " + str(o) + ", svr: " + svr_name + ", old #: " + str(tup[4]) 121 | print "NTH="+str(nth)+"; SUFFIX=\""+key+"\"" 122 | 123 | if length == 4: 124 | print "FASTQ1=$DATA/DREAM_challenge/"+os.path.basename(mp[key][0]) 125 | print "FASTQ2=$DATA/DREAM_challenge/"+os.path.basename(mp[key][1]) 126 | print "FASTQ3=$DATA/DREAM_challenge/"+os.path.basename(mp[key][2]) 127 | print "FASTQ4=$DATA/DREAM_challenge/"+os.path.basename(mp[key][2]) 128 | elif length == 3: 129 | print "FASTQ1=$DATA/DREAM_challenge/"+os.path.basename(mp[key][0]) 130 | print "FASTQ2=$DATA/DREAM_challenge/"+os.path.basename(mp[key][1]) 131 | print "FASTQ3=$DATA/DREAM_challenge/"+os.path.basename(mp[key][2]) 132 | elif length == 2: 133 | print "FASTQ1=$DATA/DREAM_challenge/"+os.path.basename(mp[key][0]) 134 | print "FASTQ2=$DATA/DREAM_challenge/"+os.path.basename(mp[key][1]) 135 | elif length == 1: 136 | print "FASTQ1=$DATA/DREAM_challenge/"+os.path.basename(mp[key][0]) 137 | else: 138 | print "LEN>3: " + str( length ) 139 | 140 | prefixCTL = key.rsplit(".")[2] 141 | 142 | foundCTL = False 143 | 144 | lst2 = get_files_by_file_size("/srv/gsfs0/scratch/leepc12/data/DREAM_challenge", True) 145 | 146 | for k in lst2: 147 | if "."+prefixCTL+"." 
in k and "CONTROL" in k and k.endswith("unpaired.fastq.gz"): 148 | print "CTL_FASTQ=$DATA/DREAM_challenge/"+os.path.basename(k) 149 | print "WORK=$RUN/DREAM_challenge/$SUFFIX; mkdir -p $WORK/out/align; mkdir -p $WORK/out/qc" 150 | print "cd $WORK/out/align; ln -s ../../../../DREAM_challenge_ctl/CONTROL."+prefixCTL+"/out/align/rep1 ctl1" 151 | print "cd $WORK/out/qc; ln -s ../../../../DREAM_challenge_ctl/CONTROL."+prefixCTL+"/out/qc/rep1 ctl1" 152 | print "cd $WORK;" 153 | str_FASTQ = " " 154 | if length == 3: 155 | str_FASTQ = " -fastq1 $FASTQ1 -fastq2 $FASTQ2 -fastq3 $FASTQ3 " 156 | if length == 2: 157 | str_FASTQ = " -fastq1 $FASTQ1 -fastq2 $FASTQ2 " 158 | if length == 1: 159 | str_FASTQ = " -fastq1 $FASTQ1 " 160 | 161 | subsample = "" 162 | if os.path.basename(k) in ctl_to_subsample: 163 | subsample = " -subsample_ctl 40000000 " 164 | 165 | print "bds_scr ${SUFFIX//\//_} $CODE/bds_atac/chipseq/chipseq.bds -callpeak spp -no_naive_overlap -species hg19 -nth $NTH " + str_FASTQ + "-ctl_fastq $CTL_FASTQ -title ${SUFFIX//\//_} -url_base http://mitra.stanford.edu/kundaje/leepc12/DREAM_challenge/$SUFFIX/out" + subsample 166 | if os.path.basename(k) in ctl_to_subsample: 167 | print "##SUBSAMPLE!" 168 | print "sleep 5" 169 | print 170 | foundCTL = True 171 | 172 | if not foundCTL: 173 | print "#NOT FOUND! 
(SE, CTL)" 174 | print 175 | -------------------------------------------------------------------------------- /utils/trimfastq.py: -------------------------------------------------------------------------------- 1 | ################################## 2 | # # 3 | # Last modified 2017/11/08 # 4 | # # 5 | # Georgi Marinov # 6 | # # 7 | ################################## 8 | 9 | import sys 10 | import os 11 | 12 | # try: 13 | # import psyco 14 | # psyco.full() 15 | # except: 16 | # pass 17 | 18 | def run(): 19 | 20 | if len(sys.argv) < 2: 21 | print 'usage: python %s [-trim5 bp] [-flowcellID flowcell] [-addEnd 1 | 2] [-replace string newstring | blank] [-renameIDs prefix] [-stdout]' % sys.argv[0] 22 | print '\tthe -trim5 option will trim additional bp from the 5 end, i.e. if you want the middle 36bp of 38bp reads, use 36 as bp to keep and 1 as the trim5 argument' 23 | print '\tUse - to specify standard input, the script will print to standard output by default' 24 | print '\tThe script can read compressed files as long as they have the correct suffix - .bz2 or .gz' 25 | sys.exit(1) 26 | 27 | inputfilename = sys.argv[1] 28 | doMax=False 29 | if sys.argv[2] == 'max': 30 | doMax=True 31 | trim='max' 32 | else: 33 | trim = int(sys.argv[2]) 34 | outputfilename = inputfilename.split('/')[-1].split('.fastq')[0] + '.' 
+str(trim)+'mers.fastq' 35 | doFlowcellID=False 36 | 37 | doStdOut=True 38 | # doStdOut=False 39 | # if '-stdout' in sys.argv: 40 | # doStdOut = True 41 | 42 | if '-flowcellID' in sys.argv: 43 | doFlowcellID=True 44 | flowcellID=sys.argv[sys.argv.index('-flowcellID')+1] 45 | if doStdOut: 46 | pass 47 | else: 48 | print 'will include flowcell ID', flowcellID, 'in reads headers' 49 | 50 | doRenameIDs = False 51 | if '-renameIDs' in sys.argv: 52 | doRenameIDs = True 53 | RID = '@' + sys.argv[sys.argv.index('-renameIDs') + 1] 54 | 55 | dotrim5=False 56 | if '-trim5' in sys.argv: 57 | dotrim5=True 58 | trim5=int(sys.argv[sys.argv.index('-trim5')+1]) 59 | if doStdOut: 60 | pass 61 | else: 62 | print 'will trim ', trim5, 'bp from the 5-end' 63 | outputfilename = inputfilename.split('.fastq')[0] + '.' +str(trim)+'bp-5prim-trim.fastq' 64 | 65 | doAddEnd=False 66 | if '-addEnd' in sys.argv: 67 | doAddEnd=True 68 | END=sys.argv[sys.argv.index('-addEnd')+1] 69 | if doStdOut: 70 | pass 71 | else: 72 | print 'will add', '/'+END, 'to read IDs' 73 | 74 | doReplace=False 75 | if '-replace' in sys.argv: 76 | doReplace=True 77 | oldstring=sys.argv[sys.argv.index('-replace')+1] 78 | newstring=sys.argv[sys.argv.index('-replace')+2] 79 | if newstring == 'blank': 80 | newstring='' 81 | if doStdOut: 82 | pass 83 | else: 84 | print 'will replace', oldstring, 'with', newstring, 'in read IDs' 85 | 86 | i=0 87 | shorter=0 88 | 89 | if doStdOut: 90 | pass 91 | else: 92 | outfile = open(outputfilename, 'w') 93 | 94 | doStdIn = False 95 | if inputfilename != '-': 96 | if inputfilename.endswith('.bz2'): 97 | cmd = 'bzip2 -cd ' + inputfilename 98 | elif inputfilename.endswith('.gz'): 99 | cmd = 'gunzip -c ' + inputfilename 100 | else: 101 | cmd = 'cat ' + inputfilename 102 | p = os.popen(cmd, "r") 103 | else: 104 | doStdIn = True 105 | 106 | line = 'line' 107 | 108 | if dotrim5: 109 | i=1 110 | j=0 111 | while line != '': 112 | if doStdIn: 113 | line = sys.stdin.readline() 114 | else: 115 | line = 
p.readline() 116 | if line == '': 117 | break 118 | if i==1 and line[0]=='@': 119 | if doFlowcellID and flowcellID not in line: 120 | ID='@'+flowcellID+'_'+line.replace(' ','_')[1:-1]+'\n' 121 | else: 122 | ID=line.replace(' ','_') 123 | if doReplace: 124 | ID=ID.replace(oldstring,newstring) 125 | if doRenameIDs: 126 | ID = RID + str(j) 127 | if doAddEnd: 128 | ID=ID.strip()+'/'+END+'\n' 129 | i=2 130 | continue 131 | if i==2: 132 | i=3 133 | sequence=line[trim5:len(line)].strip() 134 | continue 135 | if i==3 and line[0]=='+': 136 | plus='+\n' 137 | i=4 138 | continue 139 | if i==4: 140 | scores=line 141 | i=1 142 | scores=line[trim5:len(line)].strip() 143 | scores=scores[0:trim] 144 | j=j+1 145 | if j % 5000000 == 0: 146 | if doStdOut: 147 | pass 148 | else: 149 | print str(j/1000000) + 'M reads processed' 150 | if doMax: 151 | sequence=sequence.replace('.','N') 152 | else: 153 | sequence=sequence[0:trim].replace('.','N')+'\n' 154 | if doStdOut: 155 | print ID.strip() 156 | print sequence.strip() 157 | print plus.strip() 158 | print scores 159 | else: 160 | outfile.write(ID.strip()+'\n') 161 | outfile.write(sequence.strip()+'\n') 162 | outfile.write(plus.strip()+'\n') 163 | outfile.write(scores + '\n') 164 | continue 165 | else: 166 | i=1 167 | j=0 168 | while line != '': 169 | if doStdIn: 170 | line = sys.stdin.readline() 171 | else: 172 | line = p.readline() 173 | if line == '': 174 | break 175 | if i==1 and line[0]=='@': 176 | if doFlowcellID and flowcellID not in line: 177 | ID='@'+flowcellID+'_'+line.replace(' ','_')[1:-1]+'\n' 178 | else: 179 | ID=line.replace(' ','_') 180 | if doReplace: 181 | ID=ID.replace(oldstring,newstring) 182 | if doRenameIDs: 183 | ID = RID + str(j) 184 | if doAddEnd: 185 | ID=ID.strip()+'/'+END+'\n' 186 | i=2 187 | continue 188 | if i==2: 189 | i=3 190 | j=j+1 191 | if j % 5000000 == 0: 192 | if doStdOut: 193 | pass 194 | else: 195 | print str(j/1000000) + 'M reads processed' 196 | if doMax: 197 | sequence=line 198 | else: 199 | if 
len(line.strip())0: 245 | print shorter, 'sequences shorter than desired length' 246 | run() 247 | 248 | -------------------------------------------------------------------------------- /etc/Read_Distribution_ChIP-exo.txt: -------------------------------------------------------------------------------- 1 | -150 1.4340596390173622E-4 2 | -149 1.4639789385892905E-4 3 | -148 1.5480693225929023E-4 4 | -147 1.7000129172168031E-4 5 | -146 1.8955643091445948E-4 6 | -145 2.10099620018363E-4 7 | -144 2.2825812921412606E-4 8 | -143 2.4065922868248376E-4 9 | -142 2.4490845059535867E-4 10 | -141 2.42524375089422E-4 11 | -140 2.3600384429253273E-4 12 | -139 2.2784370033254942E-4 13 | -138 2.2054078533733076E-4 14 | -137 2.160103191155617E-4 15 | -136 2.1384103219923188E-4 16 | -135 2.130400328011573E-4 17 | -134 2.1261442913415377E-4 18 | -133 2.115713294110372E-4 19 | -132 2.0924529587337638E-4 20 | -131 2.0628070687775117E-4 21 | -130 2.0364939480949432E-4 22 | -129 2.0232319205393872E-4 23 | -128 2.03273930996417E-4 24 | -127 2.0698831057396183E-4 25 | -126 2.1201249593040522E-4 26 | -125 2.1640751876127905E-4 27 | -124 2.1823441076211522E-4 28 | -123 2.155542036284456E-4 29 | -122 2.0726322585061546E-4 30 | -121 1.955989930982237E-4 31 | -120 1.8363431783568262E-4 32 | -119 1.7444201252740454E-4 33 | -118 1.7109488963780177E-4 34 | -117 1.755322875141391E-4 35 | -116 1.851596480350912E-4 36 | -115 1.9624893896218526E-4 37 | -114 2.0507212805694847E-4 38 | -113 2.079011830809079E-4 39 | -112 2.0271781417655047E-4 40 | -111 1.9434270101020137E-4 41 | -110 1.893062656291457E-4 42 | -109 1.9413893008066825E-4 43 | -108 2.1537111641205402E-4 44 | -107 2.5662859206778457E-4 45 | -106 3.099185060811279E-4 46 | -105 3.643433528825486E-4 47 | -104 4.090056269025112E-4 48 | -103 4.3300782257148034E-4 49 | -102 4.290558500005738E-4 50 | -101 4.042692820235224E-4 51 | -100 3.693711071547101E-4 52 | -99 3.3508431390852095E-4 53 | -98 3.1213189079933875E-4 54 | -97 3.0854936066793305E-4 
55 | -96 3.2162238366061427E-4 56 | -95 3.4594915425007844E-4 57 | -94 3.761278669090213E-4 58 | -93 4.0675671611013883E-4 59 | -92 4.334808134009706E-4 60 | -91 4.561329386284308E-4 61 | -90 4.7559278871427704E-4 62 | -89 4.927400605802674E-4 63 | -88 5.084544511481597E-4 64 | -87 5.230262134493347E-4 65 | -86 5.343878249536639E-4 66 | -85 5.398823192406418E-4 67 | -84 5.368527298897633E-4 68 | -83 5.226420904805226E-4 69 | -82 4.96347367884576E-4 70 | -81 4.640812621422249E-4 71 | -80 4.337104065859328E-4 72 | -79 4.1310143454816267E-4 73 | -78 4.10120979361378E-4 74 | -77 4.299382988964986E-4 75 | -76 4.669331491782717E-4 76 | -75 5.127879107699016E-4 77 | -74 5.591849642345921E-4 78 | -73 5.978066901355474E-4 79 | -72 6.224861874065494E-4 80 | -71 6.356594284636885E-4 81 | -70 6.419131040936336E-4 82 | -69 6.458339050830534E-4 83 | -68 6.520085222186162E-4 84 | -67 6.642177211211218E-4 85 | -66 6.830185667478948E-4 86 | -65 7.08162198890391E-4 87 | -64 7.393997573400663E-4 88 | -63 7.764823818883762E-4 89 | -62 8.191343740323966E-4 90 | -61 8.669726820916808E-4 91 | -60 9.195874160914019E-4 92 | -59 9.765686860567337E-4 93 | -58 0.0010375066020128487 94 | -57 0.0011017330590527004 95 | -56 0.00116754709254036 96 | -55 0.0012329895229076788 97 | -54 0.0012961011705865087 98 | -53 0.0013549228560087003 99 | -52 0.0014098242005801068 100 | -51 0.0014704900296025876 101 | -50 0.0015489339693520027 102 | -49 0.0016571696461042138 103 | -48 0.0018072106861350802 104 | -47 0.0020044404349732823 105 | -46 0.002227721115158772 106 | -45 0.0024492846684843195 107 | -44 0.0026413630367426947 108 | -43 0.0027761881617266704 109 | -42 0.002836351287151588 110 | -41 0.002845880864423082 111 | -40 0.0028391646468693605 112 | -39 0.00285059038781863 113 | -38 0.0029145458405990974 114 | -37 0.003054999456921908 115 | -36 0.003254242482029955 116 | -35 0.003484146859549071 117 | -34 0.0037165845331050868 118 | -33 0.003923427446323833 119 | -32 0.004091558501282984 120 | -31 
0.0042679044338675715 121 | -30 0.004514402938414466 122 | -29 0.004892991709260541 123 | -28 0.005465608440742671 124 | -27 0.006242892200593844 125 | -26 0.007030287550131535 126 | -25 0.007581940424069329 127 | -24 0.007651996757120818 128 | -23 0.006994602483999588 129 | -22 0.00556114033057625 130 | -21 0.004091940187349483 131 | -20 0.003524568735974988 132 | -19 0.004796592658108461 133 | -18 0.0088455786354056 134 | -17 0.016163107580678765 135 | -16 0.02545681733136696 136 | -15 0.034988359956065855 137 | -14 0.0430193875233711 138 | -13 0.04781155210187838 139 | -12 0.048136600329994075 140 | -11 0.044806657125367494 141 | -10 0.03914394197545865 142 | -9 0.032470674367727596 143 | -8 0.026109073789634344 144 | -7 0.021118332556040682 145 | -6 0.017505534291415395 146 | -5 0.015014735447629032 147 | -4 0.013389992476552141 148 | -3 0.012375361830055257 149 | -2 0.011749289178588132 150 | -1 0.011427777066917324 151 | 0 0.0113612172583886 152 | 1 0.011500001516347726 153 | 2 0.011794521604140462 154 | 3 0.012179966617556161 155 | 4 0.012530714982158502 156 | 5 0.012705942455954753 157 | 6 0.012564824796952177 158 | 7 0.01196653776315804 159 | 8 0.010832880494232516 160 | 9 0.009336145656447381 161 | 10 0.007711249297727332 162 | 11 0.00619310746599705 163 | 12 0.005016636209181219 164 | 13 0.004352368176832731 165 | 14 0.0041133024250172585 166 | 15 0.004148054611428677 167 | 16 0.004305240393760864 168 | 17 0.004433475429707692 169 | 18 0.004414343407977009 170 | 19 0.004261300141332552 171 | 20 0.004020769473552029 172 | 21 0.003739175248413146 173 | 22 0.0034629413096936134 174 | 23 0.0032292248729375262 175 | 24 0.003038116640754531 176 | 25 0.002880440687520661 177 | 26 0.002747021087611949 178 | 27 0.0026286819154044297 179 | 28 0.0025185949491149514 180 | 29 0.002419322782323626 181 | 30 0.00233577571245138 182 | 31 0.002272864036919143 183 | 32 0.0022354980531478397 184 | 33 0.0022249651042246296 185 | 34 0.002228060715901593 186 | 35 
0.0022279574595970427 187 | 36 0.002207827906729291 188 | 37 0.0021508446287166493 189 | 38 0.0020466541811146616 190 | 39 0.0019107990560277952 191 | 40 0.0017652957296977485 192 | 41 0.00163216067836622 193 | 42 0.0015334103782749082 194 | 43 0.0014847743229281654 195 | 44 0.0014768340748809537 196 | 45 0.0014938842139508909 197 | 46 0.0015202193199555926 198 | 47 0.0015401339727126753 199 | 48 0.0015409295209052893 200 | 49 0.0015219343886787202 201 | 50 0.0014854837690437867 202 | 51 0.0014339128550113077 203 | 52 0.0013695568395921023 204 | 53 0.0012956931215641026 205 | 54 0.0012193679227736943 206 | 55 0.0011485696708343764 207 | 56 0.0010912867933596476 208 | 57 0.0010555077179630066 209 | 58 0.00104649552274266 210 | 59 0.0010586118877356464 211 | 60 0.0010834931434637114 212 | 61 0.0011127756204486014 213 | 62 0.0011380956492120624 214 | 63 0.0011531119928568795 215 | 64 0.0011595731448099929 216 | 65 0.0011612500310793827 217 | 66 0.0011619135776730276 218 | 67 0.001165334710598908 219 | 68 0.001173884139795982 220 | 69 0.00118433171092713 221 | 70 0.0011920470535862109 222 | 71 0.0011923997973670843 223 | 72 0.001180759571863609 224 | 73 0.0011542417959203714 225 | 74 0.0011169450453848613 226 | 75 0.001074713685355297 227 | 76 0.0010333920809298947 228 | 77 9.988245972068718E-4 229 | 78 9.751657482580574E-4 230 | 79 9.598106440497264E-4 231 | 80 9.484645435217661E-4 232 | 81 9.368327056140632E-4 233 | 82 9.206203892665048E-4 234 | 83 8.967710191298164E-4 235 | 84 8.671806826980791E-4 236 | 85 8.349836331762125E-4 237 | 86 8.033141237691362E-4 238 | 87 7.753064076817695E-4 239 | 88 7.533521846596292E-4 240 | 89 7.368729406106167E-4 241 | 90 7.245476079832308E-4 242 | 91 7.150551192259704E-4 243 | 92 7.070744067873337E-4 244 | 93 6.991828586780328E-4 245 | 94 6.895516851576321E-4 246 | 95 6.762505520479096E-4 247 | 96 6.573491251706434E-4 248 | 97 6.309170703476112E-4 249 | 98 5.96478379599652E-4 250 | 99 5.593743497438486E-4 251 | 100 
5.264006037963452E-4 252 | 101 5.043527647732853E-4 253 | 102 5.000264556908131E-4 254 | 103 5.174638557939435E-4 255 | 104 5.496933692431754E-4 256 | 105 5.869899564278788E-4 257 | 106 6.196285777374238E-4 258 | 107 6.378841935611802E-4 259 | 108 6.348481563449292E-4 260 | 109 6.148773867600951E-4 261 | 110 5.851451975345133E-4 262 | 111 5.528249013960191E-4 263 | 112 5.250898110724477E-4 264 | 113 5.077012573189973E-4 265 | 114 5.007726430003173E-4 266 | 115 5.030053890084198E-4 267 | 116 5.131009162353166E-4 268 | 117 5.297606455730198E-4 269 | 118 5.517066043350753E-4 270 | 119 5.777432455211638E-4 271 | 120 6.066956285525004E-4 272 | 121 6.373888128502998E-4 273 | 122 6.686478578357767E-4 274 | 123 6.987252261003888E-4 275 | 124 7.235829929165654E-4 276 | 125 7.386106367269791E-4 277 | 126 7.391976359743019E-4 278 | 127 7.207334691012063E-4 279 | 128 6.81260612525679E-4 280 | 129 6.294335345669648E-4 281 | 130 5.765597015196229E-4 282 | 131 5.339465796782125E-4 283 | 132 5.129016353372928E-4 284 | 133 5.205071308008424E-4 285 | 134 5.46944512410516E-4 286 | 135 5.78170022517388E-4 287 | 136 6.001399034725329E-4 288 | 137 5.988103976270243E-4 289 | 138 5.648419108744506E-4 290 | 139 5.077115032784558E-4 291 | 140 4.416003984451976E-4 292 | 141 3.8068981998083413E-4 293 | 142 3.391609914915229E-4 294 | 143 3.2717597183875294E-4 295 | 144 3.388201609053363E-4 296 | 145 3.641597938294164E-4 297 | 146 3.932611057491365E-4 298 | 147 4.1619033180263966E-4 299 | 148 4.254656610532796E-4 300 | 149 4.234130982652516E-4 301 | 150 4.1481060212796147E-4 302 | -------------------------------------------------------------------------------- /modules/postalign_bed.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | // has functions related to tagalign, and helps getting tagalign from configruation file or command line 
// argument

help == postalign bed/tagalign settings
mem_shuf	:= "12G"	help Max. memory for UNIX shuf (default: 12G).
no_random_source := false	help Disable --random-source for UNIX shuf. Hot fix for end of file error.


init_postalign_bed()


// Pull module settings from the configuration file (falling back to the
// command-line defaults declared above) and echo them to the log.
void init_postalign_bed() {

	// fraglen0 	= get_conf_val_bool( fraglen0, ["fraglen0"] )
	mem_shuf = get_conf_val( mem_shuf, ["mem_shuf"] )
	no_random_source = get_conf_val_bool( no_random_source, ["no_random_source"] )

	print("\n\n== postalign bed/tagalign settings\n")
	print( "Max. memory for UNIX shuf\t\t\t: $mem_shuf\n")
	print( "No --random-source for UNIX shuf\t\t: $no_random_source\n")
}

// Randomly subsample a gzipped tagAlign/BED file down to $nlines reads
// (single-ended: one read per line).  If $non_mito, chrM reads are filtered
// out before sampling.  Returns the path of the subsampled .tagAlign.gz
// written under $o_dir.
string subsample_tag( string tag, int nlines, bool non_mito, string o_dir, string group ) {

	prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir )
	nreads_per_mill := metric_prefix( nlines )

	subsampled_tag := "$prefix."+(non_mito?"no_chrM.":"")+"$nreads_per_mill.tagAlign.gz"
	non_mito_param := non_mito ? "grep -v \"chrM\" | " : ""
	// Seed shuf deterministically from the input's uncompressed byte count so
	// reruns on the same input reproduce the same sample (unless disabled).
	random_source_param := no_random_source ? "" : "--random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null)"

	in := [ tag ]
	out := subsampled_tag

	taskName:= "subsample_tag " + group
	mem := get_res_mem(mem_shuf,1)

	wait_par( cpus )

	tid := task( out<-in ) {

		sys $shcmd_init

		//# Subsample tagAlign file
		sys zcat $tag | \
			$non_mito_param shuf -n $nlines $random_source_param | gzip -nc > $subsampled_tag

		sys $shcmd_finalize
	}

	register_par( tid, cpus )

	add_task_to_graph( in, out, group )

	return out
}

// Paired-end version of subsample_tag().  A read pair occupies two
// consecutive lines, so pairs are first joined onto single (12-column)
// lines, those joined records are subsampled as units, then split back
// into two lines per pair so mates are never separated.
string subsample_tag_PE( string tag, int nlines, bool non_mito, string o_dir, string group ) {

	prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir )
	nreads_per_mill := metric_prefix( nlines )

	subsampled_tag := "$prefix."+(non_mito?"no_chrM.":"")+"$nreads_per_mill.tagAlign.gz"
	non_mito_param := non_mito ? "grep -v \"chrM\" | " : ""
	// Same deterministic shuf seeding as in subsample_tag().
	random_source_param := no_random_source ? "" : "--random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null)"

	joined := "$prefix.joined" // temporary file
	joined_subsampled := "$prefix.joined.subsampled" // temporary file

	in := [ tag ]
	out := subsampled_tag

	taskName:= "subsample_tag_PE " + group
	mem := get_res_mem(mem_shuf,1)

	wait_par( cpus )

	tid := task( out<-in ) {

		sys $shcmd_init

		// join consecutive two lines into one
		sys zcat $tag | sed 'N;s/\n/\t/' > $joined

		//# Subsample the joined (one-pair-per-line) records
		sys cat $joined | $non_mito_param shuf -n $nlines $random_source_param > $joined_subsampled

		//# Split each joined record back into two tagAlign lines
		sys awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\t%s\t%s\t%s\n%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' $joined_subsampled | \
			gzip -nc > $subsampled_tag

		sys rm -f $joined $joined_subsampled

		sys $shcmd_finalize
	}

	register_par( tid, cpus )

	add_task_to_graph( in, out, group )

	return out
}

// Adjusts the read-ends in a read BED by Tn5 offsets:
// +4 bp on the start of plus-strand reads, -5 bp on the end of
// minus-strand reads (standard ATAC-seq Tn5 insertion-site shift).
string tn5_shift_tag( string tag, string o_dir, string group ) {

	prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir )
	//shifted_tag := "$prefix.shifted.tagAlign.gz"
	shifted_tag := "$prefix.tn5.tagAlign.gz"

	in := [ tag ]
	out := shifted_tag

	taskName:= "shift_tag " + group

	wait_par( cpus )

	tid := task( out<-in ) {

		sys $shcmd_init

		sys zcat $tag | awk -F '\t' 'BEGIN {OFS = FS}{ if ($6 == "+") {$2 = $2 + 4} else if ($6 == "-") {$3 = $3 - 5} print $0}' | gzip -nc > $shifted_tag

		sys $shcmd_finalize
	}

	register_par( tid, cpus )

	add_task_to_graph( in, out, group )

	return out
}

// make spr(self_pseudo_replicate)
// Shuffles the reads of a single-ended tagAlign and splits them into two
// equal halves (pseudo-replicates).  Returns [ tag_pr1, tag_pr2 ].
// NOTE(review): the "read pairs"/"BEDPE" wording in the shell comments
// below looks copied from the PE variant; this function operates on
// one-read-per-line input — confirm before relying on those comments.
string[] spr( string tag, string pr1_o_dir, string pr2_o_dir, string group ) {

	prefix_pr1 := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), pr1_o_dir )
	prefix_pr2 := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), pr2_o_dir )
	tag_pr1 := "$prefix_pr1.pr1.tagAlign.gz"
	tag_pr2 := "$prefix_pr2.pr2.tagAlign.gz"
	// Deterministic shuf seeding, as in subsample_tag().
	random_source_param := no_random_source ? "" : "--random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null)"

	in := [ tag ]
	out := [ tag_pr1, tag_pr2 ]

	taskName:= "spr " + group
	mem := get_res_mem(mem_shuf,1)

	wait_par( cpus )

	tid := task( out<-in ) {

		sys $shcmd_init

		//# Get total number of read pairs
		sys nlines=$( zcat $tag | wc -l )
		sys nlines=$(( (nlines + 1) / 2 ))

		//# Shuffle and split BEDPE file into 2 equal parts
		//# Will produce $PR_PREFIX00 and $PR_PREFIX01
		sys zcat $tag | shuf $random_source_param | split -d -l $((nlines)) - $prefix_pr1.

		//# Compress each half into its pseudo-replicate tagAlign
		sys gzip -nc $prefix_pr1.00 > $tag_pr1
		sys rm -f $prefix_pr1.00
		sys gzip -nc $prefix_pr1.01 > $tag_pr2
		sys rm -f $prefix_pr1.01

		sys $shcmd_finalize
	}

	register_par( tid, cpus )

	add_task_to_graph( in, out, ["$group PR 1", "$group PR 2"] )

	return out
}

// Paired-end version of spr(): mates (two consecutive lines) are joined
// onto single lines before shuffling/splitting so each pseudo-replicate
// keeps read pairs intact, then split back into per-read lines.
string[] spr_tag_PE( string tag, string pr1_o_dir, string pr2_o_dir, string group ) {

	prefix_pr1 := replace_dir( rm_ext( tag, ["bed","tagAlign"] ), pr1_o_dir )
	prefix_pr2 := replace_dir( rm_ext( tag, ["bed","tagAlign"] ), pr2_o_dir )

	joined := "$prefix_pr1.joined" // temporary file

	tag_pr1 := "$prefix_pr1.pr1.tagAlign.gz"
	tag_pr2 := "$prefix_pr2.pr2.tagAlign.gz"
	// Deterministic shuf seeding, as in subsample_tag().
	random_source_param := no_random_source ? "" : "--random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null)"

	in := [ tag ]
	out := [ tag_pr1, tag_pr2 ]

	taskName:= "spr_tag_PE " + group
	mem := get_res_mem(mem_shuf,1)

	wait_par( cpus )

	tid := task( out<-in ) {

		sys $shcmd_init

		// join consecutive two lines into one
		sys zcat $tag | sed 'N;s/\n/\t/' > $joined

		//# Get total number of read pairs
		sys nlines=$( cat $joined | wc -l )
		sys nlines=$(( (nlines + 1) / 2 ))

		//# Shuffle and split temporary combined file into 2 equal parts
		//# Will produce $PR_PREFIX00 and $PR_PREFIX01
		sys cat $joined | shuf $random_source_param | split -d -l $((nlines)) - $prefix_pr1.

		//# Convert read pairs to reads into standard tagAlign file
		sys awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\t%s\t%s\t%s\n%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' "$prefix_pr1.00" | \
			gzip -nc > $tag_pr1
		sys rm -f $prefix_pr1.00
		sys awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\t%s\t%s\t%s\n%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' "$prefix_pr1.01" | \
			gzip -nc > $tag_pr2
		sys rm -f $prefix_pr1.01

		sys rm -f $joined

		sys $shcmd_finalize
	}

	register_par( tid, cpus )

	add_task_to_graph( in, out, ["$group PR 1", "$group PR 2"] )

	return out
}

// Concatenate two tagAlign files into one pooled .tagAlign.gz under $o_dir.
string pool_tag( string tag1, string tag2, string o_dir, string group ) {
	// LINUX has a limit on filename length (255), so keep the pooled name as short as possible:
	// merge the two basenames only when both are short, otherwise fall back to "<first>_pooled".
	string tag_pooled
	if ( get_basename(tag1).length() < 50 && get_basename(tag2).length() < 50 ) {
		prefix := "$o_dir/" + merge_basename_wo_ext( tag1, tag2, ["tagAlign","tag","bed"] )
		tag_pooled = "$prefix.tagAlign.gz"
	}
	else {
		prefix := replace_dir( rm_ext( tag1, ["bed","tagAlign"] ), o_dir )
		tag_pooled = "$prefix"+"_pooled.tagAlign.gz"
	}

	in := [ tag1, tag2 ]
	out := tag_pooled

	taskName:= "pool_tag " + group

	wait_par( cpus )

	tid := task( out<-in ) {

		sys $shcmd_init
		sys zcat $tag1 $tag2 | gzip -nc > $tag_pooled

		sys $shcmd_finalize
	}

	register_par( tid, cpus )

	add_task_to_graph( in, out, group )

	return out
}

// Variadic overload of pool_tag(): concatenate any number of tagAlign
// files into one pooled .tagAlign.gz under $o_dir.
string pool_tag( string[] tags, string o_dir, string group ) {
	// LINUX has a limit on filename length (255), make it as short as possible
	// NOTE(review): tags.size() <= 2 admits a single-element array, but the
	// condition then reads tags[1] — verify callers never pass fewer than
	// two tags, or this short-circuit may index out of range.
	string tag_pooled
	if ( tags.size() <= 2 && get_basename(tags[0]).length() < 50 && get_basename(tags[1]).length() < 50 ) {
		prefix := "$o_dir/" + merge_basename_wo_ext( tags[0], tags[1], ["tagAlign","tag","bed"] )
		tag_pooled = "$prefix.tagAlign.gz"
	}
	else {
		prefix := replace_dir( rm_ext( tags[0], ["bed","tagAlign"] ), o_dir )
		tag_pooled = "$prefix"+"_pooled.tagAlign.gz"
	}
	tags_str := array_to_str( tags, " " ) // join

	in := tags
	out := tag_pooled

	taskName:= "pool_tag " + group

	wait_par( cpus )

	tid := task( out<-in ) {

		sys $shcmd_init

		sys zcat $tags_str | gzip -nc > $tag_pooled

		sys $shcmd_finalize
	}

	register_par( tid, cpus )

	add_task_to_graph( in, out, group )

	return out
}