├── .gitignore ├── .gitattributes ├── modules ├── module_template.bds ├── pipeline_template.bds ├── align_multimapping.bds ├── callpeak_blacklist_filter.bds ├── align_trim_fastq.bds ├── output.bds ├── git.bds ├── callpeak_bigbed.bds ├── input_tagalign.bds ├── callpeak_filter.bds ├── species.bds ├── input_peak.bds ├── input_bam.bds ├── input_fastq.bds ├── cluster.bds ├── callpeak_gem.bds ├── parallel.bds ├── callpeak_spp.bds ├── sys.bds ├── filetable.bds ├── postalign_xcor.bds ├── callpeak_peakseq.bds ├── env.bds ├── align_bwa.bds ├── conf.bds ├── input.bds └── postalign_bed.bds ├── requirements_py3.txt ├── uninstall_dependencies.sh ├── utils ├── kill_scr ├── broadpeak.py ├── narrowpeak.py ├── narrowpeak_idr.py ├── gappedpeak.py ├── clusterGeneric │ ├── run.pl │ ├── stat.pl │ ├── kill.pl │ └── postMortemInfo.pl ├── get_read_length_from_fastq.py ├── axt_dirfiles.py ├── reassemble.py ├── bds_scr_5min ├── bds_scr ├── assign_multimappers.py ├── ucsc_ensGene.py ├── ucsc_simplegene.py ├── parse_summary_qc_recursively.py └── trimfastq.py ├── examples ├── ENCSR936XTK_SE.json ├── ENCSR936XTK_PE.json ├── multiple_data_type.sh ├── example.env ├── encode_test.sh ├── start_from_peaks.sh ├── chipseq_test.sh ├── example2.sh └── scripts │ └── make_bds_cmds_PE.py ├── html ├── jquery.treetable.css └── rpt_header.html ├── etc ├── broadPeak.as ├── narrowPeak.as ├── gappedPeak.as └── Read_Distribution_ChIP-exo.txt ├── bds.config ├── requirements.txt ├── LICENSE.md ├── example_conf.json ├── default.env ├── example_conf_full.json ├── install_dependencies.sh └── species ├── scg.conf ├── sherlock.conf └── kundaje.conf /.gitignore: -------------------------------------------------------------------------------- 1 | *.chp 2 | .*.swp 3 | .nfs* 4 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.bds linguist-language=Java 2 | 
-------------------------------------------------------------------------------- /modules/module_template.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "parallel.bds" 5 | include "report.bds" 6 | -------------------------------------------------------------------------------- /modules/pipeline_template.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "git.bds" 5 | include "parallel.bds" 6 | include "report.bds" 7 | -------------------------------------------------------------------------------- /requirements_py3.txt: -------------------------------------------------------------------------------- 1 | nomkl 2 | python ==3.5.0 3 | numpy ==1.11.3 4 | idr ==2.0.3 5 | bedtools ==2.26.0 6 | pigz 7 | java-jdk ==8.0.92 8 | matplotlib ==1.5.1 9 | -------------------------------------------------------------------------------- /uninstall_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## conda environment name 4 | 5 | ENV_NAME=aquas_chipseq 6 | ENV_NAME_PY3=aquas_chipseq_py3 7 | 8 | conda env remove --name ${ENV_NAME} -y 9 | conda env remove --name ${ENV_NAME_PY3} -y 10 | -------------------------------------------------------------------------------- /utils/kill_scr: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 1 ]; then 4 | echo 5 | echo "Kill a screen with name [SCR_NAME]" 6 | echo "Usage : kill_scr [SCR_NAME]" 7 | echo 8 | screen -ls 9 | exit 1 10 | fi 11 | 12 | screen -X -R $1 quit 13 | -------------------------------------------------------------------------------- /examples/ENCSR936XTK_SE.json: -------------------------------------------------------------------------------- 1 | { 2 | "out_dir" : "ENCSR936XTK/SE", 3 | "se" : 
true, 4 | "fastq1" : "rep1.fastq.gz", 5 | "fastq2" : "rep2.fastq.gz", 6 | "ctl_fastq1" : "ctl1.fastq.gz", 7 | "ctl_fastq2" : "ctl2.fastq.gz", 8 | "species" : "hg38_ENCODE", 9 | "nth" : 8, 10 | "use_pooled_ctl" : true 11 | } 12 | -------------------------------------------------------------------------------- /examples/ENCSR936XTK_PE.json: -------------------------------------------------------------------------------- 1 | { 2 | "out_dir" : "ENCSR936XTK/PE", 3 | "pe" : true, 4 | "fastq1_1" : "rep1-R1.fastq.gz", 5 | "fastq1_2" : "rep1-R2.fastq.gz", 6 | "fastq2_1" : "rep2-R1.fastq.gz", 7 | "fastq2_2" : "rep2-R2.fastq.gz", 8 | "ctl_fastq1_1" : "ctl1-R1.fastq.gz", 9 | "ctl_fastq1_2" : "ctl1-R2.fastq.gz", 10 | "ctl_fastq2_1" : "ctl2-R1.fastq.gz", 11 | "ctl_fastq2_2" : "ctl2-R2.fastq.gz", 12 | "species" : "hg38_ENCODE", 13 | "nth" : 16, 14 | "use_pooled_ctl" : true 15 | } 16 | -------------------------------------------------------------------------------- /modules/align_multimapping.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == align multimapping settings 8 | multimapping := 0 help # alignments reported for multimapping (default: 0). 
9 | 10 | 11 | init_align_multimapping() 12 | 13 | 14 | void init_align_multimapping() { 15 | multimapping = get_conf_val_int( multimapping, ["multimapping"] ) 16 | 17 | print("\n\n== align multimapping settings\n") 18 | print( "# alignments reported for multimapping\t: $multimapping\n") 19 | } 20 | -------------------------------------------------------------------------------- /html/jquery.treetable.css: -------------------------------------------------------------------------------- 1 | table.treetable span.indenter { 2 | display: inline-block; 3 | margin: 0; 4 | padding: 0; 5 | text-align: right; 6 | 7 | /* Disable text selection of nodes (for better D&D UX) */ 8 | user-select: none; 9 | -khtml-user-select: none; 10 | -moz-user-select: none; 11 | -o-user-select: none; 12 | -webkit-user-select: none; 13 | 14 | /* Force content-box box model for indenter (Bootstrap compatibility) */ 15 | -webkit-box-sizing: content-box; 16 | -moz-box-sizing: content-box; 17 | box-sizing: content-box; 18 | 19 | width: 19px; 20 | } 21 | 22 | table.treetable span.indenter a { 23 | background-position: left center; 24 | background-repeat: no-repeat; 25 | display: inline-block; 26 | text-decoration: none; 27 | width: 19px; 28 | } 29 | -------------------------------------------------------------------------------- /utils/broadpeak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,os 4 | 5 | if len(sys.argv)!=3: 6 | print ' ' 7 | sys.exit() 8 | 9 | infile,outfile=sys.argv[1:] 10 | 11 | # all values on 9th field are -1, exclude them 12 | 13 | id=1 14 | fout=open(outfile,'w') 15 | with open(infile) as fin: 16 | for line in fin: 17 | lst=line.rstrip().split('\t') 18 | fout.write('{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]}],id:{1},'.format(lst,id)) 19 | id+=1 20 | if len(lst[3])>1: 21 | fout.write('name:"'+lst[3]+'",') 22 | if lst[5]!='.': 23 | fout.write('strand:"'+lst[5]+'",') 24 | fout.write('\n') 25 | 
fout.close() 26 | 27 | os.system('sort -k1,1 -k2,2n '+outfile+' > '+outfile+'.srt') 28 | os.system('mv '+outfile+'.srt'+' '+outfile) 29 | os.system('bgzip -f '+outfile) 30 | os.system('tabix -f -p bed '+outfile+'.gz') 31 | -------------------------------------------------------------------------------- /examples/multiple_data_type.sh: -------------------------------------------------------------------------------- 1 | FASTQ1_1=/srv/gsfs0/scratch/leepc12/data/DREAM_challenge_hidden/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS866ZXX.BSREP1.TECHREP1.FILEIDENCFF164ZYB.R1.fastq.gz 2 | FASTQ1_2=/srv/gsfs0/scratch/leepc12/data/DREAM_challenge_hidden/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS866ZXX.BSREP1.TECHREP1.FILEIDENCFF431EXX.R2.fastq.gz 3 | TAG2=/srv/gsfs0/scratch/leepc12/run/DREAM_challenge_hidden/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO/out/align/rep2/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS705BBA.BSREP2.TECHREP1.FILEIDENCFF478QIY.R1.PE2SE.nodup.tagAlign.gz 4 | CTL_BAM1=/srv/gsfs0/scratch/leepc12/run/DREAM_challenge_hidden/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO/out/align/ctl1/CONTROL.MCF-7.R1.PE2SE.bam 5 | 6 | bds /home/leepc12/bds_atac/chipseq/chipseq.bds -species hg19 -pe -fastq1_1 $FASTQ1_1 -fastq1_2 $FASTQ1_2 -tag2 $TAG2 -ctl_bam1 $CTL_BAM1 7 | -------------------------------------------------------------------------------- /utils/narrowpeak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,os 4 | 5 | if len(sys.argv)!=3: 6 | print ' ' 7 | sys.exit() 8 | 9 | infile,outfile=sys.argv[1:] 10 | 11 | id=1 12 | fout=open(outfile,'w') 13 | with open(infile) as fin: 14 | for line in fin: 15 | lst=line.rstrip().split('\t') 16 | fout.write('{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]},{0[8]}],id:{1},'.format(lst,id)) 17 | id+=1 18 | if len(lst[3])>1: 19 | fout.write('name:"'+lst[3]+'",') 20 | if lst[5]!='.': 21 | fout.write('strand:"'+lst[5]+'",') 22 | if lst[9]!='-1': 23 | 
fout.write('sbstroke:['+lst[9]+']') 24 | fout.write('\n') 25 | 26 | fout.close() 27 | 28 | os.system('sort -k1,1 -k2,2n '+outfile+' > '+outfile+'.srt') 29 | os.system('mv '+outfile+'.srt'+' '+outfile) 30 | os.system('bgzip -f '+outfile) 31 | os.system('tabix -f -p bed '+outfile+'.gz') 32 | -------------------------------------------------------------------------------- /etc/broadPeak.as: -------------------------------------------------------------------------------- 1 | table broadPeak 2 | "BED6+3 Peaks of signal enrichment based on pooled, normalized (interpreted) data." 3 | ( 4 | string chrom; "Reference sequence chromosome or scaffold" 5 | uint chromStart; "Start position in chromosome" 6 | uint chromEnd; "End position in chromosome" 7 | string name; "Name given to a region (preferably unique). Use . if no name is assigned." 8 | uint score; "Indicates how dark the peak will be displayed in the browser (0-1000)" 9 | char[1] strand; "+ or - or . for unknown" 10 | float signalValue; "Measurement of average enrichment for the region" 11 | float pValue; "Statistical significance of signal value (-log10). Set to -1 if not used." 12 | float qValue; "Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used." 13 | ) 14 | -------------------------------------------------------------------------------- /bds.config: -------------------------------------------------------------------------------- 1 | # default system (local, sge, ...) 2 | system = local 3 | 4 | # shell env. 
5 | taskShell = /bin/bash -e 6 | sysShell = /bin/bash -e -c 7 | 8 | # regex to get pid 9 | pidRegex = "(\\d+)" 10 | 11 | # checkpoint disabled, show full commands/stderr/stdout on task, filter out commands including "export" from task hint 12 | disableCheckpoint = true 13 | taskMaxHintLen = 300 14 | showTaskCode = true 15 | tailLines = 100000000 16 | filterOutTaskHint = export 17 | clusterPostMortemDisabled = true # prevent error on scg3/4 18 | 19 | # SGE 20 | sge.pe = shm 21 | sge.mem = h_vmem 22 | sge.timeout = h_rt 23 | sge.timeout2 = s_rt 24 | clusterRunAdditionalArgs = -V 25 | 26 | # SLURM (using generic cluster) 27 | clusterGenericRun = ~/.bds/clusterGeneric/run.pl 28 | clusterGenericKill = ~/.bds/clusterGeneric/kill.pl 29 | clusterGenericStat = ~/.bds/clusterGeneric/stat.pl 30 | clusterGenericPostMortemInfo = ~/.bds/clusterGeneric/postMortemInfo.pl 31 | 32 | -------------------------------------------------------------------------------- /etc/narrowPeak.as: -------------------------------------------------------------------------------- 1 | table narrowPeak 2 | "BED6+4 Peaks of signal enrichment based on pooled, normalized (interpreted) data." 3 | ( 4 | string chrom; "Reference sequence chromosome or scaffold" 5 | uint chromStart; "Start position in chromosome" 6 | uint chromEnd; "End position in chromosome" 7 | string name; "Name given to a region (preferably unique). Use . if no name is assigned" 8 | uint score; "Indicates how dark the peak will be displayed in the browser (0-1000) " 9 | char[1] strand; "+ or - or . for unknown" 10 | float signalValue; "Measurement of average enrichment for the region" 11 | float pValue; "Statistical significance of signal value (-log10). Set to -1 if not used." 12 | float qValue; "Statistical significance with multiple-test correction applied (FDR -log10). Set to -1 if not used." 13 | int peak; "Point-source called for this peak; 0-based offset from chromStart. Set to -1 if no point-source called." 
14 | ) 15 | -------------------------------------------------------------------------------- /utils/narrowpeak_idr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # show -log10(GLOBAL IDR SCORE) instead of narrowpeak pval 4 | 5 | import sys,os 6 | 7 | if len(sys.argv)!=3: 8 | print ' ' 9 | sys.exit() 10 | 11 | infile,outfile=sys.argv[1:] 12 | 13 | id=1 14 | fout=open(outfile,'w') 15 | with open(infile) as fin: 16 | for line in fin: 17 | lst=line.rstrip().split('\t') 18 | fout.write('{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]},{0[8]},{0[10]},{0[11]}],id:{1},'.format(lst,id)) 19 | id+=1 20 | if len(lst[3])>1: 21 | fout.write('name:"'+lst[3]+'",') 22 | else: 23 | fout.write('name:"'+str(id)+'",') 24 | if lst[5]!='.': 25 | fout.write('strand:"'+lst[5]+'",') 26 | if lst[9]!='-1': 27 | fout.write('sbstroke:['+lst[9]+']') 28 | fout.write('\n') 29 | 30 | fout.close() 31 | 32 | os.system('sort -k1,1 -k2,2n '+outfile+' > '+outfile+'.srt') 33 | os.system('mv '+outfile+'.srt'+' '+outfile) 34 | os.system('bgzip -f '+outfile) 35 | os.system('tabix -f -p bed '+outfile+'.gz') 36 | -------------------------------------------------------------------------------- /utils/gappedpeak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,os 4 | 5 | if len(sys.argv)!=3: 6 | print ' ' 7 | sys.exit() 8 | 9 | infile,outfile=sys.argv[1:] 10 | 11 | id=1 12 | fout=open(outfile,'w') 13 | with open(infile) as fin: 14 | for line in fin: 15 | lst=line.rstrip().split('\t') 16 | fout.write('{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[12]},{0[13]},{0[14]}],id:{1},struct:{{thin:[[{0[1]},{0[2]}]],thick:['.format(lst,id)) 17 | id+=1 18 | a=int(lst[1]) 19 | sizes=lst[10].split(',') 20 | starts=lst[11].split(',') 21 | for i in range(len(sizes)): 22 | fout.write('[{0},{1}],'.format(a+int(starts[i]),a+int(starts[i])+int(sizes[i]))) 23 | fout.write(']},') 24 | 25 | 
if len(lst[3])>1: 26 | fout.write('name:"'+lst[3]+'",') 27 | if lst[5]!='.': 28 | fout.write('strand:"'+lst[5]+'",') 29 | fout.write('\n') 30 | 31 | fout.close() 32 | 33 | os.system('sort -k1,1 -k2,2n '+outfile+' > '+outfile+'.srt') 34 | os.system('mv '+outfile+'.srt'+' '+outfile) 35 | os.system('bgzip -f '+outfile) 36 | os.system('tabix -f -p bed '+outfile+'.gz') 37 | -------------------------------------------------------------------------------- /modules/callpeak_blacklist_filter.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | string blacklist_filter_peak( string filetype, string peak, string o_dir, string group ) { 9 | 10 | prefix := replace_dir( rm_ext( peak, \ 11 | ["narrowPeak","narrowpeak",\ 12 | "broadPeak","broadpeak",\ 13 | "regionPeak","regionpeak",\ 14 | "gappedPeak","gappedpeak",filetype] )\ 15 | , o_dir ) 16 | filtered:= "$prefix.filt.$filetype.gz" 17 | 18 | in := [ peak ] 19 | out := filtered 20 | 21 | taskName:= "blacklist_filter " + group 22 | //timeout := 3600 // to get queued fast 23 | system := "local" 24 | 25 | wait_par( cpus ) 26 | 27 | tid := task( out<-in ) { 28 | 29 | sys $shcmd_init 30 | 31 | sys bedtools intersect -v -a <(zcat -f $peak) -b <(zcat -f $blacklist) \ 32 | | awk 'BEGIN{OFS="\t"} {if ($5>1000) $5=1000; print $0}' \ 33 | | grep -P 'chr[\dXY]+[ \t]' | gzip -nc > $filtered 34 | 35 | sys $shcmd_finalize 36 | } 37 | 38 | register_par( tid, cpus ) 39 | 40 | return out 41 | } 42 | -------------------------------------------------------------------------------- /utils/clusterGeneric/run.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use POSIX; 4 | 5 | die "Error: Missing arguments.\nUsage: run.pl timeout cpus mem queue saveStdout saveStderr cmd arg1 ... 
argN\n" if $#ARGV < 6 ; 6 | 7 | $timeout = shift @ARGV; 8 | $cpus = shift @ARGV; 9 | $mem = shift @ARGV; 10 | $queue = shift @ARGV; 11 | $saveStdout = shift @ARGV; 12 | $saveStderr = shift @ARGV; 13 | $cmd = join(' ', @ARGV); 14 | 15 | $qsub = "sbatch --export=ALL "; 16 | $qsub .= "-n 1 --ntasks-per-node=1 --cpus-per-task=$cpus " if( $cpus > 0 ); 17 | if( $mem > 0 ) { 18 | $mem = ceil($mem/1000000); # MB 19 | $qsub .= "--mem-per-cpu $mem "; 20 | } 21 | if( $timeout > 0 ) { 22 | $timeout = ceil($timeout/60); # minute 23 | $qsub .= "-t $timeout "; 24 | } 25 | if ( $queue ne "" ) { 26 | $qsub .= "-p $queue " 27 | } 28 | 29 | $pid = open QSUB, " | $qsub"; 30 | die "Cannot run command '$qsub'\n" if ! kill(0, $pid); # Check that process exists 31 | print QSUB "#!/bin/sh \n"; # SLURM sbatch needs this shebang... 32 | print QSUB "$cmd\n"; # Send cluster's task via qsub's STDIN 33 | close QSUB; 34 | 35 | exit(0); 36 | 37 | -------------------------------------------------------------------------------- /modules/align_trim_fastq.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == fastq trimmer settings 9 | trim_bp := 50 help Number of basepairs after trimming fastqs (default: 50). 
10 | 11 | grp_color_trim_fq := "skyblue" 12 | 13 | init_align_trim_fastq() 14 | 15 | void init_align_trim_fastq() { 16 | trim_bp = get_conf_val_int( trim_bp, ["trim_bp"] ) 17 | 18 | print("\n\n== fastq trimmer settings\n") 19 | print( "Number of basepairs after trimming\t\t: $trim_bp\n") 20 | } 21 | 22 | string trim_fastq( string fastq, string o_dir, string group ) { 23 | 24 | prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir ) 25 | trimmed := "$prefix.trim_"+metric_prefix(trim_bp)+"bp.fastq.gz" 26 | 27 | in := [ fastq ] 28 | out := trimmed 29 | taskName:= "trim_fq " + group 30 | wait_par( cpus ) 31 | 32 | tid := task( out<-in ) { 33 | sys $shcmd_init 34 | sys python $(which trimfastq.py) $fastq $trim_bp | gzip -nc > $trimmed 35 | sys $shcmd_finalize 36 | } 37 | 38 | add_task_to_graph( in, out, group, "TRIM-FQ", grp_color_trim_fq ) 39 | 40 | return out 41 | } -------------------------------------------------------------------------------- /utils/get_read_length_from_fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # code extracted from Daniel Kim's ATAQC module (run_ataqc.py) 3 | 4 | import os, sys, re, gzip 5 | 6 | def getFileHandle(filename, mode="r"): 7 | if (re.search('.gz$',filename) or re.search('.gzip',filename)): 8 | if (mode=="r"): 9 | mode="rb"; 10 | return gzip.open(filename,mode) 11 | else: 12 | return open(filename,mode) 13 | 14 | def get_read_length(fastq_file): 15 | ''' 16 | Get read length out of fastq file 17 | ''' 18 | total_reads_to_consider = 1000000 19 | line_num = 0 20 | total_reads_considered = 0 21 | max_length = 0 22 | with getFileHandle(fastq_file, 'rb') as fp: 23 | for line in fp: 24 | if line_num % 4 == 1: 25 | if len(line.strip()) > max_length: 26 | max_length = len(line.strip()) 27 | total_reads_considered += 1 28 | if total_reads_considered >= total_reads_to_consider: 29 | break 30 | line_num += 1 31 | 32 | return int(max_length) 33 | 34 | def main(): 
35 | print(get_read_length(sys.argv[1])) 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /html/rpt_header.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # channels : defaults, r, bioconda 2 | 3 | nomkl 4 | samtools ==1.2 5 | htslib ==1.4 # 1.5 in bioconda needed libbz2.so.1.0 6 | bedtools ==2.26.0 #2.22 # 2.21.0 7 | picard ==1.126 # wanted 1.129 here but doesn't exist. instead 1.139 has backward compatibility issue, so take 1.126 8 | ucsc-fetchchromsizes 9 | ucsc-wigtobigwig 10 | ucsc-bedgraphtobigwig 11 | ucsc-bigwiginfo 12 | ucsc-bedclip 13 | ucsc-bedtobigbed 14 | ucsc-twobittofa 15 | macs2 ==2.1.1.20160309 #2.1.0 (no binaries for OSX) 16 | boost ==1.57.0 17 | openblas ==0.2.19 18 | numpy ==1.11.3 #1.13.3 #1.10.2 (no binaries for OSX) #1.9.0, 1.8.2 conflicts with ATAQC 19 | matplotlib ==1.5.1 20 | six==1.10.0 # to fix (ImportError: cannot import name _thread) 21 | python-dateutil==2.6.1 22 | libgfortran==3.0 23 | graphviz ==2.38.0 24 | libtool 25 | ghostscript # pdf2png 26 | pigz 27 | zlib 28 | sambamba ==0.6.6 # to fix seg fault error in 0.6.1 29 | r ==3.2.2 30 | r-snow 31 | r-snowfall 32 | r-bitops 33 | r-catools 34 | bioconductor-rsamtools 35 | r-spp ==1.13 36 | #glibc #segmentation fault in conda with openssl 37 | pyfaidx ==0.4.7.1 38 | 39 | bwa ==0.7.13 40 | deeptools ==2.5.4 #2.2.3 does not support plotFingerprint --outQualityMetrics 41 | #openssl ==1.0.2g-0 42 | openssl==1.0.2p 43 | -------------------------------------------------------------------------------- /utils/clusterGeneric/stat.pl: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/perl 2 | 3 | #------------------------------------------------------------------------------- 4 | # BDS generic cluster example 5 | # 6 | # This is a trivial example of the 'cluster generic' interface implementation. 7 | # The commands implemented in this example simply pass the propper arguments 8 | # to qsub, qdel or qstat commands. 9 | # This is intended as a toy example, since bds can do this directly (but 10 | # it's a good starting point to extend your own implementation). 11 | # 12 | # This script is executed in order to show the jobID of all jobs currently 13 | # scheduled in the cluster 14 | # 15 | # Script's output: 16 | # This script is expected to print all jobs currently scheduled or 17 | # running in the cluster (e.g. qstat). One per line. The FIRST column 18 | # should be the jobID (columns are spce or tab separated). Other 19 | # columns may exists (but are currently ignored). 20 | # 21 | # Command line arguments: 22 | # None 23 | # 24 | # Pablo Cingolani 25 | #------------------------------------------------------------------------------- 26 | 27 | #--- 28 | # Execute cluster command to show all tasks 29 | #--- 30 | $exitCode = system "squeue"; 31 | 32 | # OK 33 | exit($exitCode); 34 | -------------------------------------------------------------------------------- /etc/gappedPeak.as: -------------------------------------------------------------------------------- 1 | table gappedPeak 2 | "This format is used to provide called regions of signal enrichment based on pooled, normalized (interpreted) data where the regions may be spliced or incorporate gaps in the genomic sequence. It is a BED12+3 format." 3 | ( 4 | string chrom; "Reference sequence chromosome or scaffold" 5 | uint chromStart; "Pseudogene alignment start position" 6 | uint chromEnd; "Pseudogene alignment end position" 7 | string name; "Name of pseudogene" 8 | uint score; "Score of pseudogene with gene (0-1000)" 9 | char[1] strand; "+ or - or . 
for unknown" 10 | uint thickStart; "Start of where display should be thick (start codon)" 11 | uint thickEnd; "End of where display should be thick (stop codon)" 12 | uint reserved; "Always zero for now" 13 | int blockCount; "Number of blocks" 14 | int[blockCount] blockSizes; "Comma separated list of block sizes" 15 | int[blockCount] chromStarts; "Start positions relative to chromStart" 16 | float signalValue; "Measurement of average enrichment for the region" 17 | float pValue; "Statistical significance of signal value (-log10). Set to -1 if not used." 18 | float qValue; "Statistical significance with multiple-test correction applied (FDR). Set to -1 if not used." 19 | ) 20 | -------------------------------------------------------------------------------- /modules/output.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == output/title settings 8 | out_dir := "out" help Output directory (default: out). 9 | title := "" help Prefix for HTML report and outputs without given prefix. 10 | 11 | 12 | init_output() 13 | 14 | 15 | void init_output() { 16 | out_dir = get_conf_val( out_dir, ["out_dir"] ) 17 | title = get_conf_val( title, ["title"] ) 18 | 19 | if ( title == "" ) { // if title is empty, use directory name as a title 20 | dirname := get_basename( get_path(out_dir) ) 21 | if ( dirname == "out" ) { // if output folder is default one (out), then use parent dir. 
name 22 | dirname = get_basename( rm_str_at_end( get_path(out_dir), "/out" ) ) 23 | } 24 | title = dirname 25 | } 26 | if ( !is_cmd_line_arg_empty() ) out_dir = mkdir( out_dir ) // create output directory and get absolute path for it 27 | title = replace_illegal_chrs( title ) 28 | 29 | print("\n\n== output directory/title info\n") 30 | print( "Output dir.\t\t\t: $out_dir\n" ) 31 | print( "Title (prefix)\t\t\t: $title\n" ) 32 | } 33 | 34 | string get_rel_path( string path ) { // get relative path according to $out_dir 35 | rel_path := path.path().replace( out_dir.path(), "." ) 36 | if ( rel_path == path.path() ) return path //"" 37 | else return rel_path 38 | } 39 | -------------------------------------------------------------------------------- /utils/clusterGeneric/kill.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #------------------------------------------------------------------------------- 4 | # BDS generic cluster example 5 | # 6 | # This is a trivial example of the 'cluster generic' interface implementation. 7 | # The commands implemented in this example simply pass the propper arguments 8 | # to qsub, qdel or qstat commands. 9 | # This is intended as a toy example, since bds can do this directly (but 10 | # it's a good starting point to extend your own implementation). 11 | # 12 | # The script is called when a task is killed 13 | # 14 | # Script's output: 15 | # None 16 | # 17 | # Command line arguments: 18 | # jobId: This is the jobId returned as the first line in 'clusterGenericRun' 19 | # script (i.e. 
the jobID provided by the cluster management system) 20 | # 21 | # Pablo Cingolani 22 | #------------------------------------------------------------------------------- 23 | 24 | #--- 25 | # Parse command line arguments 26 | #--- 27 | die "Error: Missing arguments.\nUsage: kill.pl jobId\n" if $#ARGV < 0 ; 28 | #$jobId = shift @ARGV; 29 | $jobId = join(' ', @ARGV); 30 | 31 | #--- 32 | # Execute cluster command to kill task 33 | #--- 34 | $exitCode = system "scancel $jobId"; 35 | 36 | # OK 37 | exit($exitCode); 38 | 39 | -------------------------------------------------------------------------------- /modules/git.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "sys.bds" 5 | 6 | 7 | latest_git_commit_sha1 := "" // to show latest git commit sha1/date 8 | latest_git_commit_date := "" 9 | 10 | 11 | init_git() 12 | 13 | 14 | void init_git() { // print latest git commit info 15 | script_file_paths := get_script_file_paths() 16 | for ( string path : script_file_paths ) { 17 | if ( path.exists() && "$path/.git".exists() ) { 18 | 19 | latest_git_commit_sha1 = get_stdout("cd $path; git rev-parse HEAD") 20 | latest_git_commit_date = get_stdout("cd $path; git show -s --format=%cd --date=local $latest_git_commit_sha1") 21 | break; 22 | } 23 | } 24 | 25 | print("\n\n== git info\n") 26 | if ( latest_git_commit_sha1 == "" ) \ 27 | print( "Latest git commit\t\t: not under git control\n" ) 28 | else \ 29 | print( "Latest git commit\t\t: $latest_git_commit_sha1 ($latest_git_commit_date)\n" ) 30 | } 31 | 32 | string html_pipeline_version( string git_url_prefix ) { 33 | string html 34 | if ( latest_git_commit_sha1 != "" ) { 35 | html += "
Pipeline version

" 36 | html += "Latest git commit SHA1: "+\ 37 | "$latest_git_commit_sha1"+\ 38 | " ($latest_git_commit_date)\n" 39 | html += "


\n" 40 | } 41 | 42 | return html 43 | } 44 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | BSD-3-Clause License 2 | 3 | Copyright (c) 2016, Kundaje Lab 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 11 | 12 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /utils/axt_dirfiles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,glob,gzip,os 4 | 5 | # axt format: http://genome.ucsc.edu/goldenPath/help/axt.html 6 | 7 | if len(sys.argv)!=3: 8 | print ' Run under the dir of gzipped Axt files, presumably one for each target chr but that doesn\'t matter' 9 | sys.exit() 10 | 11 | chrsize={} 12 | with open(sys.argv[1]) as fin: 13 | for line in fin: 14 | lst=line.rstrip().split('\t') 15 | chrsize[lst[0]]=int(lst[1]) 16 | 17 | 18 | OF=sys.argv[2] 19 | 20 | fout=open(OF,'w') 21 | 22 | id=1 23 | 24 | for f in glob.glob('*'): 25 | fin=gzip.GzipFile(f,'r') 26 | line=fin.readline() 27 | while line: 28 | if line[0]!='#': 29 | lst=line.rstrip().split() 30 | # query start/stop 31 | a=0 32 | b=0 33 | if lst[7]=='+': 34 | a=int(lst[5])-1 35 | b=lst[6] 36 | else: 37 | c=chrsize[lst[4]] 38 | a=c-int(lst[6]) 39 | b=c-int(lst[5])+1 40 | 41 | fout.write('{0[1]}\t{2}\t{0[3]}\tid:{1},genomealign:{{chr:"{0[4]}",start:{3},stop:{4},strand:"{0[7]}",targetseq:'.format( 42 | lst, 43 | id, 44 | int(lst[2])-1, 45 | a, 46 | b 47 | )) 48 | id+=1 49 | line=fin.readline().rstrip() 50 | fout.write('"'+line+'",queryseq:') 51 | line=fin.readline().rstrip() 52 | fout.write('"'+line+'"}\n') 53 | fin.readline() 54 | line=fin.readline() 55 | 56 | 57 | fout.close() 58 | 59 | 60 | os.system('sort -k1,1 -k2,2n '+OF+' > xx') 61 | os.system('mv xx '+OF) 62 | os.system('bgzip -f '+OF) 63 | os.system('tabix -f -p bed '+OF+'.gz') 64 | -------------------------------------------------------------------------------- /examples/example.env: -------------------------------------------------------------------------------- 1 | ## Get your hostname by `hostname -f` 2 | 3 | [your_hostname] 4 | 5 | mod_chipseq = bwa/0.7.7 samtools/0.1.19 bedtools/2.19.1 ucsc_tools/3.0.9 picard-tools/1.92 MACS2/2.1.0 java/latest 6 | 7 | 
addpath_chipseq = /srv/gsfs0/scratch/leepc12/software/idrCode:/srv/gsfs0/scratch/leepc12/software/phantompeakqualtools:/srv/gsfs0/scratch/leepc12/software/idr/bin:/srv/gsfs0/scratch/leepc12/software/align2rawsignal/bin:/srv/gsfs0/scratch/leepc12/software/gem:/srv/gsfs0/scratch/leepc12/software/deepTools/bin:/srv/gsfs0/scratch/leepc12/software/R-2.15.1/bin:/srv/gsfs0/scratch/leepc12/software/python3.4/bin:/srv/gsfs0/scratch/leepc12/software/python2.7/bin 8 | 9 | shcmd_chipseq = export GEMROOT=/srv/gsfs0/scratch/leepc12/software/gem; export GEM=/srv/gsfs0/scratch/leepc12/software/gem/gem.jar; export LAPACK=/srv/gsfs0/scratch/leepc12/software/blas/lapack-*/liblapack.a; export _JAVA_OPTIONS='-Xms256M -Xmx512M -XX:ParallelGCThreads=1'; export MAX_JAVA_MEM='8G'; export MALLOC_ARENA_MAX=4; MCRROOT=/srv/gsfs0/scratch/leepc12/software/MATLAB_Compiler_Runtime/v714; LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRROOT}/runtime/glnxa64; LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRROOT}/bin/glnxa64; MCRJRE=${MCRROOT}/sys/java/jre/glnxa64/jre/lib/amd64; LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}/native_threads; LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}/server; LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${MCRJRE}; XAPPLRESDIR=${MCRROOT}/X11/app-defaults; export LD_LIBRARY_PATH; export XAPPLRESDIR; 10 | 11 | species_file = $script_dir/species/scg3.conf 12 | 13 | use_sys_default = true # unlimited resource 14 | -------------------------------------------------------------------------------- /modules/callpeak_bigbed.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | string peak_to_bigbed( string filetype, string peak, string o_dir, string group ) { 9 | 10 | prefix := replace_dir( rm_ext( peak, \ 11 | ["narrowPeak","narrowpeak",\ 12 | "broadPeak","broadpeak",\ 13 | "gappedPeak","gappedpeak",filetype] )\ 14 | , o_dir ) 15 | bigbed := 
"$prefix.$filetype.bb" 16 | 17 | bed_param := _get_bed_param( filetype ) 18 | 19 | in := [ peak ] 20 | out := bigbed 21 | 22 | taskName:= "peak_to_bigbed " + group 23 | system := "local" 24 | 25 | wait_par( cpus ) 26 | 27 | tid := task( out<-in ) { 28 | 29 | sys $shcmd_init 30 | 31 | sys cat $chrsz | grep -P 'chr[\dXY]+[ \t]' > $bigbed.chrsz.tmp 32 | sys zcat $peak | sort -k1,1 -k2,2n > $bigbed.tmp 33 | sys bedClip $bigbed.tmp $bigbed.chrsz.tmp $bigbed.tmp2 34 | 35 | sys bedToBigBed $bed_param $bigbed.tmp2 $bigbed.chrsz.tmp $bigbed 36 | sys rm -f $bigbed.tmp $bigbed.tmp2 $bigbed.chrsz.tmp 37 | 38 | sys $shcmd_finalize 39 | } 40 | 41 | register_par( tid, cpus ) 42 | 43 | return out 44 | } 45 | 46 | string _get_bed_param( string filetype ) { 47 | 48 | if ( filetype.toLower() == "narrowpeak" ) { 49 | return "-type=bed6+4 -as=$script_dir/etc/narrowPeak.as" 50 | } 51 | else if ( filetype.toLower() == "broadpeak") { 52 | return "-type=bed6+3 -as=$script_dir/etc/broadPeak.as" 53 | } 54 | else if ( filetype.toLower() == "gappedpeak") { 55 | return "-type=bed12+3 -as=$script_dir/etc/gappedPeak.as" 56 | } 57 | else { 58 | error("Unsupported peak file type! ($filetype)\n") 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /utils/clusterGeneric/postMortemInfo.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #------------------------------------------------------------------------------- 4 | # BDS generic cluster example 5 | # 6 | # This is a trivial example of the 'cluster generic' interface implementation. 7 | # The commands implemented in this example simply pass the propper arguments 8 | # to qsub, qdel or qstat commands. 9 | # This is intended as a toy example, since bds can do this directly (but 10 | # it's a good starting point to extend your own implementation). 
11 | # 12 | # The following command is executed in order to get information of a recently 13 | # finished jobId. This information is typically used for debuging and it added 14 | # to bds's output. 15 | # 16 | # Script's output: 17 | # The output is not parsed, it is stored and later shown 18 | # in bds's report. Is should contain information relevant 19 | # to the job's execution (e.g. "qstat -f $jobId" or 20 | # "checkjob -v $jobId") 21 | # 22 | # Command line arguments: 23 | # jobId: This is the jobId returned as the first line in 'clusterGenericRun' 24 | # script (i.e. the jobID provided by the cluster management system) 25 | # 26 | # Pablo Cingolani 27 | #------------------------------------------------------------------------------- 28 | 29 | #--- 30 | # Parse command line arguments 31 | #--- 32 | die "Error: Missing arguments.\nUsage: postMortemInfo.pl jobId\n" if $#ARGV < 0 ; 33 | $jobId = shift @ARGV; 34 | 35 | #--- 36 | # Execute cluster command to show task details 37 | #--- 38 | $exitCode = system "squeue -j $jobId"; 39 | 40 | # OK 41 | exit($exitCode); 42 | 43 | -------------------------------------------------------------------------------- /example_conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "screen" : "", 3 | "dry_run" : false, 4 | "type" : "TF", 5 | "final_stage" : "idr", 6 | "out_dir" : "out", 7 | "title" : "", 8 | "input_endedness" : { 9 | "se" : false, 10 | "pe" : false 11 | }, 12 | "input_files" : { 13 | }, 14 | "species" : { 15 | "species" : "" 16 | }, 17 | "cluster" : { 18 | "use_system" : "local", 19 | "q" : "" 20 | }, 21 | "resource" : { 22 | "nth" : 8, 23 | "no_par" : false, 24 | "wt" : "5h50m", 25 | "memory" : "7G", 26 | "wt_dedup" : "23h", 27 | "mem_dedup" : "12G", 28 | "mem_shuf" : "12G", 29 | "wt_bwa" : "47h", 30 | "mem_bwa" : "12G", 31 | "wt_macs2" : "23h", 32 | "mem_macs2" : "15G", 33 | "wt_spp" : "47h", 34 | "mem_spp" : "12G" 35 | }, 36 | "alignment" : { 37 | "aligner" : 
"bwa", 38 | "bwa" : { 39 | "param_bwa_aln" : "-q 5 -l 32 -k 2" 40 | }, 41 | "filter" : { 42 | "dup_marker" : "picard", 43 | "anon_filt_bam" : false, 44 | "mapq_thresh" : 30, 45 | "no_dup_removal" : false 46 | }, 47 | "subsample" : { 48 | "subsample_chip" : "0", 49 | "subsample_ctl" : "0" 50 | } 51 | }, 52 | "cross_corr_analysis" : { 53 | "no_xcor" : false, 54 | "subsample_xcor" : "15M", 55 | "speak_xcor" : -1 56 | }, 57 | "callpeak" : { 58 | "peak_caller" : "spp", 59 | "ctl_depth_ratio" : 1.2, 60 | "use_pooled_ctl" : false, 61 | "true_rep" : false, 62 | "no_pseudo_rep" : false, 63 | "spp" : { 64 | "npeak_spp" : 300000 65 | }, 66 | "macs2" : { 67 | "pval_thresh_macs2" : 0.01 68 | }, 69 | "idr" : { 70 | "idr_thresh" : 0.05 71 | }, 72 | "naive_overlap" : { 73 | "nonamecheck" : false 74 | } 75 | }, 76 | "visualization" : { 77 | "url_base" : "" 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /utils/reassemble.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys 4 | 5 | if len(sys.argv)!=3: 6 | print ' ' 7 | sys.exit() 8 | 9 | infile,outn=sys.argv[1:] 10 | 11 | aliencoord=0 12 | alienchrid=1 13 | id1=1 14 | id2=1 15 | fn1=outn+'_native' 16 | fn2=outn+'_alien' 17 | fout1=open(fn1,'w') 18 | fout2=open(fn2,'w') 19 | 20 | chrname='scaffold_' 21 | 22 | with open(infile) as fin: 23 | for line in fin: 24 | lst=line.rstrip().split('\t') 25 | if len(lst)==1: 26 | print '{2}{0}:{1}'.format(alienchrid,aliencoord,chrname) 27 | aliencoord=0 28 | alienchrid+=1 29 | continue 30 | a=int(lst[1]) 31 | b=int(lst[2]) 32 | 33 | if a>=b: 34 | print 'wrong line: '+line 35 | sys.exit() 36 | 37 | # native 38 | fout1.write('{0}\t{1}\t{2}\tid:{3},genomealign:{{chr:"{8}{4}",start:{5},stop:{6},strand:"{7}"}}\n'.format( 39 | lst[0],a,b, 40 | id1, 41 | alienchrid, 42 | aliencoord, 43 | aliencoord+b-a, 44 | lst[3], 45 | chrname 46 | )) 47 | id1+=1 48 | # alien 49 | 
fout2.write('{8}{0}\t{1}\t{2}\tid:{3},genomealign:{{chr:"{4}",start:{5},stop:{6},strand:"{7}"}}\n'.format( 50 | alienchrid, 51 | aliencoord, 52 | aliencoord+b-a, 53 | id2, 54 | lst[0],a,b, 55 | lst[3], 56 | chrname 57 | )) 58 | id2+=1 59 | aliencoord+=b-a 60 | 61 | print '{2}{0}:{1}'.format(alienchrid,aliencoord,chrname) 62 | 63 | fout1.close() 64 | fout2.close() 65 | 66 | import os 67 | 68 | os.system('sort -k1,1 -k2,2n '+fn1+' > x') 69 | os.system('mv x '+fn1) 70 | os.system('bgzip -f '+fn1) 71 | os.system('tabix -f -p bed '+fn1+'.gz') 72 | 73 | os.system('sort -k1,1 -k2,2n '+fn2+' > x') 74 | os.system('mv x '+fn2) 75 | os.system('bgzip -f '+fn2) 76 | os.system('tabix -f -p bed '+fn2+'.gz') 77 | -------------------------------------------------------------------------------- /examples/encode_test.sh: -------------------------------------------------------------------------------- 1 | TITLE=ENCSR011PEI 2 | FASTQ1=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR011PEI/rep1/ENCFF282GDI_ENCFF316FIQ.fastq.gz 3 | FASTQ2=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR011PEI/rep2/ENCFF959EDS_ENCFF740WEF.fastq.gz 4 | CTL_FASTQ1=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR011PEI/ctl1/ENCFF728HNA.fastq.gz 5 | WORKDIR=/srv/scratch/shared/surya/leepc12/run/ENCODE_test/$TITLE 6 | mkdir -p $WORKDIR; cd $WORKDIR 7 | bds_scr $TITLE /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -title $TITLE -nth 8 -species hg38 -fastq1 $FASTQ1 -fastq2 $FASTQ2 -ctl_fastq1 $CTL_FASTQ1 8 | sleep 1 9 | 10 | TITLE=ENCSR017GBO 11 | FASTQ1=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR017GBO/rep1/ENCFF697GAP_ENCFF713DPD.fastq.gz 12 | FASTQ2=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR017GBO/rep2/ENCFF987WCU.fastq.gz 13 | CTL_FASTQ1=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR017GBO/ctl1/ENCFF894RGF_ENCFF414HWA.fastq.gz 14 | WORKDIR=/srv/scratch/shared/surya/leepc12/run/ENCODE_test/$TITLE 15 | mkdir -p $WORKDIR; cd $WORKDIR 16 | bds_scr 
$TITLE /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -title $TITLE -nth 8 -species hg38 -fastq1 $FASTQ1 -fastq2 $FASTQ2 -ctl_fastq1 $CTL_FASTQ1 17 | sleep 1 18 | 19 | TITLE=ENCSR290MUH 20 | FASTQ1=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR290MUH/rep1/ENCFF861PLD_ENCFF346WZR.fastq.gz 21 | FASTQ2=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR290MUH/rep2/ENCFF701VYF_ENCFF385UYP.fastq.gz 22 | CTL_FASTQ1=/srv/scratch/shared/surya/leepc12/data/ENCODE_test/ENCSR290MUH/ctl1/ENCFF617CIJ_ENCFF322SHV.fastq.gz 23 | WORKDIR=/srv/scratch/shared/surya/leepc12/run/ENCODE_test/$TITLE 24 | mkdir -p $WORKDIR; cd $WORKDIR 25 | bds_scr $TITLE /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -title $TITLE -nth 8 -species hg38 -fastq1 $FASTQ1 -fastq2 $FASTQ2 -ctl_fastq1 $CTL_FASTQ1 26 | sleep 1 27 | 28 | -------------------------------------------------------------------------------- /utils/bds_scr_5min: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 2 ]; then 4 | echo 5 | echo "Create a detached screen for a BDS script and redirect stdout/stderr to a log file." 6 | echo "If you skip [LOG_FILE_NAME], a log file [SCR_NAME].log will be generated on the working directory." 7 | echo "If a log file already exists, stdout/stderr will be appended to it." 8 | echo "Monitor a log file with 'tail -f [LOG_FILE_NAME]'" 9 | echo 10 | echo "Usage: bds_scr [SCR_NAME] [LOG_FILE_NAME] [BDS_PARAM]" 11 | echo " Example: bds_scr TEST ~/TEST.log -s sge chipseq.bds -fastq1 ..." 12 | echo 13 | exit 0 14 | fi 15 | 16 | SCR_NAME=$1.BDS 17 | 18 | #if [ $(screen -ls $SCR_NAME | grep 'No Sockets' | wc -l) != "1" ]; then 19 | if [ $(screen -ls | grep -P "[\t ]\d+.$SCR_NAME" | wc -l) != "0" ]; then 20 | echo "error: A screen named $SCR_NAME already exists." 
21 | exit 1 22 | else 23 | echo "[SCR_NAME] : $SCR_NAME" 24 | fi 25 | 26 | if [[ $2 == -* || $2 == *.bds ]]; then # LOG_FILE_NAME skipped 27 | LOG_FILE_NAME="$PWD/$SCR_NAME.log" 28 | PARAM_START_IDX=2 29 | elif [[ $3 == -* || $3 == *.bds ]]; then 30 | LOG_FILE_NAME=$2 31 | PARAM_START_IDX=3 32 | else 33 | echo "error: [BDS_PARAM] is wrong." 34 | exit 2 35 | fi 36 | 37 | if [ $(find $LOG_FILE_NAME -mmin -5 | wc -l) != "0" ]; then 38 | echo "error: log file handle is open or very fresh (modified in past 5 minutes)." 39 | exit 3 40 | fi 41 | 42 | PARAM= 43 | 44 | for ((i=$PARAM_START_IDX;i<=$#;i++)); do 45 | PARAM="$PARAM ${!i}" 46 | done 47 | 48 | echo "[LOG_FILE_NAME] : $LOG_FILE_NAME" 49 | echo "[BDS_PARAM] : $PARAM" 50 | 51 | mkdir -p $(dirname $LOG_FILE_NAME) 52 | 53 | echo "" 54 | echo "===== Created a new screen ====" >> $LOG_FILE_NAME 55 | echo "DATE : $(date)" >> $LOG_FILE_NAME 56 | echo "[HOST] : $(hostname -f)" >> $LOG_FILE_NAME 57 | echo "[SCR_NAME] : $SCR_NAME" >> $LOG_FILE_NAME 58 | echo "[BDS_PARAM] : $PARAM" >> $LOG_FILE_NAME 59 | echo "" >> $LOG_FILE_NAME 60 | 61 | screen -Sdm $SCR_NAME bash -c "bds &>>$LOG_FILE_NAME $PARAM $>>$LOG_FILE_NAME" 62 | 63 | -------------------------------------------------------------------------------- /utils/bds_scr: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 2 ]; then 4 | echo 5 | echo "Create a detached screen for a BDS script and redirect stdout/stderr to a log file." 6 | echo "If you skip [LOG_FILE_NAME], a log file [SCR_NAME].log will be generated on the working directory." 7 | echo "If a log file already exists, stdout/stderr will be appended to it." 8 | echo "Monitor a log file with 'tail -f [LOG_FILE_NAME]'" 9 | echo 10 | echo "Usage: bds_scr [SCR_NAME] [LOG_FILE_NAME] [BDS_PARAM]" 11 | echo " Example: bds_scr TEST ~/TEST.log -s sge chipseq.bds -fastq1 ..." 
12 | echo 13 | exit 0 14 | fi 15 | 16 | SCR_NAME="$1".BDS 17 | 18 | #if [ $(screen -ls $SCR_NAME | grep 'No Sockets' | wc -l) != "1" ]; then 19 | if [ $(screen -ls | grep -P "[\t ]\d+.$SCR_NAME" | wc -l) != "0" ]; then 20 | echo "error: A screen named $SCR_NAME already exists." 21 | exit 1 22 | else 23 | echo "[SCR_NAME] : $SCR_NAME" 24 | fi 25 | 26 | if [[ $2 == -* || $2 == *.bds ]]; then # LOG_FILE_NAME skipped 27 | LOG_FILE_NAME="$PWD/$SCR_NAME.log" 28 | PARAM_START_IDX=2 29 | elif [[ $3 == -* || $3 == *.bds ]]; then 30 | LOG_FILE_NAME=$2 31 | PARAM_START_IDX=3 32 | else 33 | echo "error: [BDS_PARAM] is wrong." 34 | exit 1 35 | fi 36 | 37 | PARAM= 38 | 39 | if [ $(find $LOG_FILE_NAME -mmin -2 2> /dev/null | wc -l) != "0" ]; then 40 | echo "error: log file handle is open or very fresh (modified in past 2 minutes)." 41 | exit 3 42 | fi 43 | 44 | for ((i=$PARAM_START_IDX;i<=$#;i++)); do 45 | PARAM="$PARAM ${!i}" 46 | done 47 | 48 | echo "[HOST] : $(hostname -f)" 49 | echo "[LOG_FILE_NAME] : $LOG_FILE_NAME" 50 | echo "[BDS_PARAM] : $PARAM" 51 | 52 | mkdir -p $(dirname $LOG_FILE_NAME) 53 | 54 | echo "" 55 | echo "===== Created a new screen ====" >> $LOG_FILE_NAME 56 | echo "[DATE] : $(date)" >> $LOG_FILE_NAME 57 | echo "[HOST] : $(hostname -f)" >> $LOG_FILE_NAME 58 | echo "[SCR_NAME] : $SCR_NAME" >> $LOG_FILE_NAME 59 | echo "[BDS_PARAM] : $PARAM" >> $LOG_FILE_NAME 60 | echo "" >> $LOG_FILE_NAME 61 | 62 | screen -Sdm $SCR_NAME bash -c "bds &>>$LOG_FILE_NAME $PARAM $>>$LOG_FILE_NAME" 63 | 64 | -------------------------------------------------------------------------------- /modules/input_tagalign.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == tagalign input definition : 8 | help For replicate '-tag[REP_ID]', For control '-ctl_tag[REP_ID]'. 9 | 10 | 11 | string get_tag( int ctl, int rep ) { 12 | 13 | key := ( ctl > 0 ? 
"ctl_tag" : "tag" ) + "_rep" + rep 14 | key2 := ( ctl > 0 ? "ctl_tagalign" : "tagalign" ) + "_rep" + rep 15 | 16 | key3 := ( ctl > 0 ? "ctl_tag" : "tag" ) + rep 17 | key4 := ( ctl > 0 ? "ctl_tagalign" : "tagalign" ) + rep 18 | 19 | key5 := ( ctl > 0 ? "ctl_tag" : "tag" ) 20 | key6 := ( ctl > 0 ? "ctl_tagalign" : "tagalign" ) 21 | 22 | if ( cmd_line_arg_has_key( key ) ) { 23 | return get_path( get_cmd_line_arg_val( key ) ) 24 | } 25 | else if ( cmd_line_arg_has_key( key2 ) ) { 26 | return get_path( get_cmd_line_arg_val( key2 ) ) 27 | } 28 | else if ( cmd_line_arg_has_key( key3 ) ) { 29 | return get_path( get_cmd_line_arg_val( key3 ) ) 30 | } 31 | else if ( cmd_line_arg_has_key( key4 ) ) { 32 | return get_path( get_cmd_line_arg_val( key4 ) ) 33 | } 34 | else if ( (rep==1) && cmd_line_arg_has_key( key5 ) ) { 35 | return get_path( get_cmd_line_arg_val( key5 ) ) 36 | } 37 | else if ( (rep==1) && cmd_line_arg_has_key( key6 ) ) { 38 | return get_path( get_cmd_line_arg_val( key6 ) ) 39 | } 40 | else if ( conf.hasKey( key ) ) { 41 | return get_path( conf{ key } ) 42 | } 43 | else if ( conf.hasKey( key2 ) ) { 44 | return get_path( conf{ key2 } ) 45 | } 46 | else if ( conf.hasKey( key3 ) ) { 47 | return get_path( conf{ key3 } ) 48 | } 49 | else if ( conf.hasKey( key4 ) ) { 50 | return get_path( conf{ key4 } ) 51 | } 52 | else if ( (rep==1) && conf.hasKey( key5 ) ) { 53 | return get_path( conf{ key5 } ) 54 | } 55 | else if ( (rep==1) && conf.hasKey( key6 ) ) { 56 | return get_path( conf{ key6 } ) 57 | } 58 | return "" 59 | } 60 | 61 | string get_tag( int rep ) { 62 | 63 | return get_tag( 0, rep ) 64 | } 65 | 66 | bool is_input_tag( int ctl, int rep ) { 67 | 68 | return get_tag( ctl, rep ) != "" 69 | } 70 | 71 | bool is_input_tag( int rep ) { 72 | 73 | return is_input_tag( 0, rep ) 74 | } 75 | -------------------------------------------------------------------------------- /modules/callpeak_filter.bds: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == callpeak etc settings 9 | npeak_filt := 500000 help # top peaks filtered from a narrow peak files (default: 500000). 10 | 11 | 12 | init_callpeak_etc() 13 | 14 | 15 | void init_callpeak_etc() { 16 | 17 | npeak_filt = get_conf_val_int( npeak_filt, ["npeak_filt"] ) 18 | 19 | print("\n\n== callpeak etc settings\n") 20 | print( "# of top peaks to pick up in peak files\t: $npeak_filt\n") 21 | } 22 | 23 | // sort in a descending order of p-value and take top $npeak_filt peaks 24 | string filt_top_peaks( string filetype, string peakfile, string o_dir, string group ) { 25 | 26 | prefix := replace_dir( rm_ext( peakfile, \ 27 | ["narrowPeak","gappedPeak","broadPeak","regionPeak"] ), o_dir ) 28 | ext := get_actual_ext( peakfile ) 29 | peakfile_filt := "$prefix."+metric_prefix( npeak_filt )+".$ext" 30 | sort_param := _get_sort_param( filetype ) 31 | 32 | in := [ peakfile ] 33 | out := peakfile_filt 34 | 35 | taskName:= "filt_top_peaks " + group 36 | timeout := 3600 // to get queued fast 37 | system := "local" 38 | 39 | wait_par( cpus ) 40 | 41 | tid := task( out<-in ) { 42 | 43 | sys $shcmd_init 44 | 45 | // sort -grk8 returns non-zero exit code when 8th columns of any line pair are equal 46 | sys set +o pipefail 47 | 48 | // sort by 8th (-log10(pval) ) column and take top $npeak_filt lines 49 | sys zcat $peakfile | sort $sort_param | head -n $npeak_filt | gzip -nc > $peakfile_filt 50 | 51 | sys $shcmd_finalize 52 | } 53 | 54 | register_par( tid, cpus ) 55 | 56 | add_task_to_graph( in, out, group ) 57 | 58 | return out 59 | } 60 | 61 | string _get_sort_param( string filetype ) { 62 | 63 | if ( filetype.toLower() == "narrowpeak" || filetype.toLower() == "regionpeak" || filetype.toLower() == "broadpeak" ) { 64 | // p-value is at 8th column 65 | return "-s -grk8,8" 66 | } 
67 | else if ( filetype.toLower() == "gappedpeak") { 68 | // p-value is at 14th column 69 | return "-s -grk14,14" 70 | } 71 | else { 72 | error("Unsupport peak file type! ($filetype)\n") 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /examples/start_from_peaks.sh: -------------------------------------------------------------------------------- 1 | OUT=/srv/gsfs0/scratch/leepc12/run/DREAM_challenge_hidden/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO/out 2 | 3 | peak1=$OUT/peak/spp/rep1/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS866ZXX.BSREP1.TECHREP1.FILEIDENCFF164ZYB.R1.PE2SE.nodup.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 4 | peak2=$OUT/peak/spp/rep2/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS705BBA.BSREP2.TECHREP1.FILEIDENCFF478QIY.R1.PE2SE.nodup.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 5 | peak_pooled=$OUT/peak/spp/pooled_rep/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS866ZXX.BSREP1.TECHREP1.FILEIDENCFF164ZYB.R1.PE2SE.nodup_pooled.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 6 | peak1_pr1=$OUT/peak/spp/pseudo_reps/rep1/pr1/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS866ZXX.BSREP1.TECHREP1.FILEIDENCFF164ZYB.R1.PE2SE.nodup.pr1.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 7 | peak1_pr2=$OUT/peak/spp/pseudo_reps/rep1/pr2/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS866ZXX.BSREP1.TECHREP1.FILEIDENCFF164ZYB.R1.PE2SE.nodup.pr2.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 8 | peak2_pr1=$OUT/peak/spp/pseudo_reps/rep2/pr1/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS705BBA.BSREP2.TECHREP1.FILEIDENCFF478QIY.R1.PE2SE.nodup.pr1.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 9 | 
peak2_pr2=$OUT/peak/spp/pseudo_reps/rep2/pr2/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS705BBA.BSREP2.TECHREP1.FILEIDENCFF478QIY.R1.PE2SE.nodup.pr2.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 10 | peak_ppr1=$OUT/peak/spp/pooled_pseudo_reps/ppr1/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS866ZXX.BSREP1.TECHREP1.FILEIDENCFF164ZYB.R1.PE2SE.nodup.pr1_pooled.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 11 | peak_ppr2=$OUT/peak/spp/pooled_pseudo_reps/ppr2/CHIPseq.ATF2.MCF-7.EXPID_ENCSR881UOO.BSID_ENCBS866ZXX.BSREP1.TECHREP1.FILEIDENCFF164ZYB.R1.PE2SE.nodup.pr2_pooled.tagAlign_x_CONTROL.MCF-7.R1.PE2SE.nodup.40M.tagAlign.regionPeak.gz 12 | 13 | bds $CODE/bds_atac/chipseq/chipseq.bds -species hg19 \ 14 | -peak1 $peak1 -peak2 $peak2 -peak_pooled $peak_pooled \ 15 | -peak1_pr1 $peak1_pr1 -peak1_pr2 $peak1_pr2 -peak2_pr1 $peak2_pr1 -peak2_pr2 $peak1_pr2 \ 16 | -peak_ppr1 $peak_ppr1 -peak_ppr2 $peak_ppr2 17 | 18 | -------------------------------------------------------------------------------- /modules/species.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == species settings 8 | species := "" help Species. need to specify '-species_file' too if you have not installed genome database with 'install_genome_data.sh'. 9 | species_file := "" help Species file path. 10 | species_browser := "" help Species name in WashU genome browser. 11 | 12 | ref_fa := "" help Reference genome sequence fasta. 13 | chrsz := "" help Chromosome sizes file path (use fetchChromSizes from UCSC tools). 14 | blacklist := "" help Blacklist bed. 15 | seq_dir := "" help Reference genome sequence directory path (where chr*.fa exist). 
16 | 17 | init_species() 18 | 19 | void init_species() { 20 | 21 | species = get_conf_val( species, ["species"] ) 22 | species_file = get_conf_val( species_file, ["species_file"] ) 23 | 24 | _read_species() 25 | 26 | species_browser = get_conf_val( species_browser,["species_browser"] ) 27 | 28 | ref_fa = get_conf_val( ref_fa, ["ref_fa"] ) 29 | chrsz = get_conf_val( chrsz, ["chrsz"] ) 30 | blacklist = get_conf_val( blacklist, ["blacklist"] ) 31 | seq_dir = get_conf_val( seq_dir, ["seq_dir"]) 32 | 33 | if ( species_browser == "" ) species_browser = species 34 | 35 | print("\n\n== species settings\n") 36 | print( "Species\t\t\t\t: $species\n" ) 37 | print( "Species file\t\t\t: $species_file\n\n" ) 38 | print( "Species name (WashU browser)\t: $species_browser\n" ) 39 | print( "Ref. genome seq. fasta\t\t: $ref_fa\n" ) 40 | print( "Chr. sizes file\t\t\t: $chrsz\n" ) 41 | print( "Black list bed\t\t\t: $blacklist\n" ) 42 | print( "Ref. genome seq. dir.\t\t: $seq_dir\n" ) 43 | } 44 | 45 | void _read_species() { // check for species configruation files 46 | // value for key will be overriden as loop goes. so the last element in species_paths has the priority 47 | string[] species_paths 48 | if ( env != "" ) species_paths.add( env ) 49 | if ( c != "" ) species_paths.add( c ) 50 | species_paths.add( species_file ) 51 | 52 | for ( string path : species_paths ) { 53 | if ( path.exists() ) { 54 | add_to_conf( path, species ) 55 | } 56 | } 57 | } 58 | 59 | 60 | // temp 61 | /* 62 | bwt_idx := "" help Bowtie index (full path prefix of *.1.ebwt file). 63 | bwt_idx = get_conf_val( bwt_idx, ["bwt_idx"] ) 64 | print( "Bowtie index\t\t\t: $bwt_idx\n" ) 65 | */ 66 | -------------------------------------------------------------------------------- /utils/assign_multimappers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # piped script to take multimappers and randomly assign 4 | # requires a qname sorted file!! 
5 | 6 | import sys 7 | import random 8 | import argparse 9 | 10 | def parse_args(): 11 | ''' 12 | Gives options 13 | ''' 14 | parser = argparse.ArgumentParser(description='Saves reads below a alignment threshold and discards all others') 15 | parser.add_argument('-k', help='Alignment number cutoff') 16 | parser.add_argument('--paired-end', dest='paired_ended', action='store_true', help='Data is paired-end') 17 | args = parser.parse_args() 18 | alignment_cutoff = int(args.k) 19 | paired_ended = args.paired_ended 20 | 21 | return alignment_cutoff, paired_ended 22 | 23 | 24 | if __name__ == "__main__": 25 | ''' 26 | Runs the filtering step of choosing multimapped reads 27 | ''' 28 | 29 | [alignment_cutoff, paired_ended] = parse_args() 30 | 31 | if paired_ended: 32 | alignment_cutoff = int(alignment_cutoff) * 2 33 | 34 | # Store each line in sam file as a list of reads, 35 | # where each read is a list of elements to easily 36 | # modify or grab things 37 | current_reads = [] 38 | current_qname = '' 39 | 40 | for line in sys.stdin: 41 | 42 | read_elems = line.strip().split('\t') 43 | 44 | if read_elems[0].startswith('@'): 45 | sys.stdout.write(line) 46 | continue 47 | 48 | # Keep taking lines that have the same qname 49 | if read_elems[0] == current_qname: 50 | # Add line to current reads 51 | current_reads.append(line) 52 | pass 53 | else: 54 | # Discard if there are more than the alignment cutoff 55 | if len(current_reads) >= alignment_cutoff: 56 | current_reads = [line] 57 | current_qname = read_elems[0] 58 | elif len(current_reads) > 0: 59 | # Just output all reads, which are then filtered with 60 | # samtools 61 | for read in current_reads: 62 | sys.stdout.write(str(read)) 63 | 64 | # And then discard 65 | current_reads = [line] 66 | current_qname = read_elems[0] 67 | else: 68 | # First read in file 69 | current_reads.append(line) 70 | current_qname = read_elems[0] 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- 
/modules/input_peak.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == narrow peak input definition : 8 | help For true replicates, use '-peak1' and '-peak2', 9 | help For pooled replicates, use '-peak_pooled', 10 | help For two PR (self-pseudo-replicates), use '-peak[REP_ID]_pr1' and '-peak[REP_ID]_pr2' 11 | help For two PPR (pooled pseudo-replicates), use '-peak_ppr1' and '-peak_ppr2' 12 | 13 | 14 | void chk_input_peak( bool true_rep, bool no_pseudo_rep ) { 15 | 16 | if ( !is_input_peak() ) return // read peaks here 17 | 18 | for ( int rep=0; rep<=get_num_rep_peak(); rep++) { // rep==0 : pooled 19 | if ( get_num_rep_peak() == 1 && rep==0 ) continue // if only one replicate, skip reading pooled rep 20 | 21 | for (int pse=0; pse<=2; pse++) { // pse(pseudo)==0 : true rep, pse==1,2 : self-pseudo rep 1,2 22 | if ( true_rep && pse > 0 ) continue 23 | if ( no_pseudo_rep && rep != 0 && pse > 0 ) continue 24 | 25 | peak_ := get_peak(rep,pse) 26 | suffix1 := rep==0 ? "replicate" : "replicate $rep" 27 | suffix2 := rep==0 ? "pseudo-replicate $pse" : "pseudo-replicate $pse for replicate $rep" 28 | prefix := (rep==0 ? "pooled " : "") + (pse==0 ? suffix1 : suffix2) 29 | 30 | print( "$prefix: \n\t$peak_"+"\n") 31 | if ( !path_exists( peak_ ) ) error("\t\tFile not found!\n") 32 | } 33 | } 34 | } 35 | 36 | string get_peak( int rep, int pse ) { // rep==0 : pooled peak, pse==0 : true replicate 37 | 38 | if ( pse > 2 ) error ("\nget_peak() : pse should not be larger than 2!") 39 | 40 | string key, key2 41 | if ( rep == 0 ) { 42 | key = ( pse == 0 ? "peak_pooled" : ("peak_ppr" + pse) ) 43 | key2 = key 44 | } 45 | else { 46 | key = "peak" + rep + ( pse == 0 ? "" : ("_pr" + pse) ) 47 | key2 = "peak_rep" + rep + ( pse == 0 ? 
"" : ("_pr" + pse) ) 48 | } 49 | 50 | if ( cmd_line_arg_has_key( key ) ) { 51 | return get_path( get_cmd_line_arg_val( key ) ) 52 | } 53 | else if ( cmd_line_arg_has_key( key2 ) ) { 54 | return get_path( get_cmd_line_arg_val( key2 ) ) 55 | } 56 | else if ( conf.hasKey( key ) ) { 57 | return get_path( conf{ key } ) 58 | } 59 | else if ( conf.hasKey( key2 ) ) { 60 | return get_path( conf{ key2 } ) 61 | } 62 | 63 | return "" 64 | } 65 | 66 | bool is_input_peak() { 67 | 68 | return get_peak( 1, 0 ) != "" 69 | } 70 | 71 | int get_num_rep_peak() { 72 | 73 | rep := 1 74 | 75 | while( get_peak( rep, 0 ) != "" ) rep++ 76 | 77 | return rep-1 78 | } 79 | -------------------------------------------------------------------------------- /utils/ucsc_ensGene.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,os 4 | sys.path.append('/home/xzhou/subtleKnife/script/genescript') 5 | import parseUcscgenestruct 6 | 7 | if len(sys.argv)!=3: 8 | print ' knownToEnsembl.txt and kgXref.txt must be under current dir' 9 | sys.exit() 10 | 11 | 12 | aa={} 13 | with open('knownToEnsembl.txt') as fin: 14 | for line in fin: 15 | lst=line.rstrip().split('\t') 16 | aa[lst[0]]=lst[1] 17 | 18 | symbol={} 19 | desc={} 20 | with open('kgXref.txt') as fin: 21 | for line in fin: 22 | lst=line.rstrip().split('\t') 23 | if lst[0] in aa: 24 | ens=aa[lst[0]] 25 | if len(lst[4])>0: 26 | symbol[ens]=lst[4] 27 | if len(lst[7])>0: 28 | desc[ens]=lst[7] 29 | 30 | 31 | ucsc,tkname=sys.argv[1:] 32 | 33 | 34 | 35 | # dump 36 | fout=open(tkname,'w') 37 | fout2=open(tkname+'_load','w') 38 | 39 | id=1 40 | with open(ucsc) as fin: 41 | for line in fin: 42 | lst=line.rstrip().split('\t') 43 | g=parseUcscgenestruct.parse(lst,True) 44 | name=lst[1] 45 | fout.write('{0}\t{1}\t{2}\tname:"{3}",id:{4},strand:"{5}",'.format( 46 | g['chrom'], 47 | g['start'], 48 | g['stop'], 49 | name, 50 | id, 51 | g['strand'])) 52 | id+=1 53 | if 'thin' in g or 
'thick' in g: 54 | fout.write('struct:{') 55 | if 'thin' in g: 56 | fout.write('thin:[') 57 | for x in g['thin']: 58 | fout.write('[{0},{1}],'.format(x[0],x[1])) 59 | fout.write('],') 60 | if 'thick' in g: 61 | fout.write('thick:[') 62 | for x in g['thick']: 63 | fout.write('[{0},{1}],'.format(x[0],x[1])) 64 | fout.write('],') 65 | fout.write('},') 66 | # desc 67 | if name in desc: 68 | fout.write('desc:"'+desc[name]+'",') 69 | if name in symbol: 70 | fout.write('name2:"'+symbol[name]+'"') 71 | fout2.write('{0}\t{1}\t{2}\t{3}\n'.format(g['chrom'],g['start'],g['stop'],symbol[name])) 72 | fout.write('\n') 73 | fout2.write('{0}\t{1}\t{2}\t{3}\n'.format(g['chrom'],g['start'],g['stop'],name)) 74 | 75 | 76 | fout2.close() 77 | fout.close() 78 | 79 | import os 80 | os.system('sort -k1,1 -k2,2n '+tkname+' > x') 81 | os.system('mv x '+tkname) 82 | os.system('bgzip -f '+tkname) 83 | os.system('tabix -f -p bed '+tkname+'.gz') 84 | 85 | print ''' 86 | drop table if exists {0}; 87 | create table {0} ( 88 | chrom varchar(20) not null, 89 | start int unsigned not null, 90 | stop int unsigned not null, 91 | name varchar(100) not null 92 | ) ENGINE=MyISAM DEFAULT CHARSET=latin1; 93 | load data local infile '{0}_load' into table {0}; 94 | create index name on {0} (name); 95 | '''.format(tkname) 96 | 97 | -------------------------------------------------------------------------------- /modules/input_bam.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == bam input (raw or filtered) definition : 8 | help Raw bam : For replicate '-bam[REP_ID]', For control '-ctl_bam[REP_ID]'. 9 | help Filtered bam : For replicate '-filt_bam[REP_ID]', For control '-ctl_filt_bam[REP_ID]'. 10 | 11 | 12 | string get_bam( int ctl, int rep ) { 13 | 14 | key := ( ctl > 0 ? "ctl_bam" : "bam" ) + "_rep" + rep 15 | key2 := ( ctl > 0 ? "ctl_bam" : "bam" ) + rep 16 | key3 := ( ctl > 0 ? 
"ctl_bam" : "bam" ) 17 | 18 | if ( cmd_line_arg_has_key( key ) ) { 19 | return get_path( get_cmd_line_arg_val( key ) ) 20 | } 21 | else if ( cmd_line_arg_has_key( key2 ) ) { 22 | return get_path( get_cmd_line_arg_val( key2 ) ) 23 | } 24 | else if ( (rep==1) && cmd_line_arg_has_key( key3 ) ) { 25 | return get_path( get_cmd_line_arg_val( key3 ) ) 26 | } 27 | else if ( conf.hasKey( key ) ) { 28 | return get_path( conf{ key } ) 29 | } 30 | else if ( conf.hasKey( key2 ) ) { 31 | return get_path( conf{ key2 } ) 32 | } 33 | else if ( (rep==1) && conf.hasKey( key3 ) ) { 34 | return get_path( conf{ key3 } ) 35 | } 36 | return "" 37 | } 38 | 39 | string get_bam( int rep ) { 40 | 41 | return get_bam( 0, rep ) 42 | } 43 | 44 | string get_filt_bam( int ctl, int rep ) { 45 | 46 | key := ( ctl > 0 ? "ctl_filt_bam" : "filt_bam" ) + "_rep" + rep 47 | key2 := ( ctl > 0 ? "ctl_filt_bam" : "filt_bam" ) + rep 48 | key3 := ( ctl > 0 ? "ctl_filt_bam" : "filt_bam" ) 49 | 50 | if ( cmd_line_arg_has_key( key ) ) { 51 | return get_path( get_cmd_line_arg_val( key ) ) 52 | } 53 | else if ( cmd_line_arg_has_key( key2 ) ) { 54 | return get_path( get_cmd_line_arg_val( key2 ) ) 55 | } 56 | else if ( (rep==1) && cmd_line_arg_has_key( key3 ) ) { 57 | return get_path( get_cmd_line_arg_val( key3 ) ) 58 | } 59 | else if ( conf.hasKey( key ) ) { 60 | return get_path( conf{ key } ) 61 | } 62 | else if ( conf.hasKey( key2 ) ) { 63 | return get_path( conf{ key2 } ) 64 | } 65 | else if ( (rep==1) && conf.hasKey( key3 ) ) { 66 | return get_path( conf{ key3 } ) 67 | } 68 | return "" 69 | } 70 | 71 | string get_filt_bam( int rep ) { 72 | 73 | return get_filt_bam( 0, rep ) 74 | } 75 | 76 | bool is_input_bam( int ctl, int rep ) { 77 | 78 | return get_bam( ctl, rep ) != "" 79 | } 80 | 81 | bool is_input_bam( int rep ) { 82 | 83 | return is_input_bam( 0, rep ) 84 | } 85 | 86 | bool is_input_filt_bam( int ctl, int rep ) { 87 | 88 | return get_filt_bam( ctl, rep ) != "" 89 | } 90 | 91 | bool is_input_filt_bam( int 
rep ) { 92 | 93 | return is_input_filt_bam( 0, rep ) 94 | } 95 | -------------------------------------------------------------------------------- /utils/ucsc_simplegene.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,os 4 | sys.path.append('/home/xzhou/subtleKnife/script/genescript') 5 | import parseUcscgenestruct 6 | 7 | if len(sys.argv)!=3: 8 | print ' ' 9 | sys.exit() 10 | 11 | ucsc,tkname=sys.argv[1:] 12 | 13 | 14 | symbol={} 15 | desc={} 16 | i=0 17 | if os.path.exists('refLink.txt'): 18 | ''' 19 | 0 symbol 20 | 1 desc 21 | 2 name 22 | 3 name 23 | ''' 24 | with open('refLink.txt') as fin: 25 | for line in fin: 26 | lst=line.rstrip().split('\t') 27 | if len(lst)<4: continue 28 | w=lst[1].replace('"','') 29 | #w=w.replace("'",'') 30 | desc[lst[2]]=w 31 | desc[lst[3]]=w 32 | symbol[lst[2]]=lst[0] 33 | symbol[lst[3]]=lst[0] 34 | i+=1 35 | print 'refLink: '+str(i) 36 | 37 | 38 | # dump 39 | fout=open(tkname,'w') 40 | fout2=open(tkname+'_load','w') 41 | 42 | id=1 43 | with open(ucsc) as fin: 44 | for line in fin: 45 | lst=line.rstrip().split('\t') 46 | g=parseUcscgenestruct.parse(lst,True) 47 | name=lst[1] 48 | fout.write('{0}\t{1}\t{2}\tname:"{3}",id:{4},strand:"{5}",'.format( 49 | g['chrom'], 50 | g['start'], 51 | g['stop'], 52 | name, 53 | id, 54 | g['strand'])) 55 | id+=1 56 | if 'thin' in g or 'thick' in g: 57 | fout.write('struct:{') 58 | if 'thin' in g: 59 | fout.write('thin:[') 60 | for x in g['thin']: 61 | fout.write('[{0},{1}],'.format(x[0],x[1])) 62 | fout.write('],') 63 | if 'thick' in g: 64 | fout.write('thick:[') 65 | for x in g['thick']: 66 | fout.write('[{0},{1}],'.format(x[0],x[1])) 67 | fout.write('],') 68 | fout.write('},') 69 | # desc 70 | if name in desc: 71 | fout.write('desc:"'+desc[name]+'",') 72 | if name in symbol: 73 | fout.write('name2:"'+symbol[name]+'"') 74 | fout2.write('{0}\t{1}\t{2}\t{3}\n'.format(g['chrom'],g['start'],g['stop'],symbol[name])) 75 | 
fout.write('\n') 76 | fout2.write('{0}\t{1}\t{2}\t{3}\n'.format(g['chrom'],g['start'],g['stop'],name)) 77 | 78 | 79 | fout2.close() 80 | fout.close() 81 | 82 | import os 83 | os.system('sort -k1,1 -k2,2n '+tkname+' > x') 84 | os.system('mv x '+tkname) 85 | os.system('bgzip -f '+tkname) 86 | os.system('tabix -f -p bed '+tkname+'.gz') 87 | 88 | print ''' 89 | drop table if exists {0}; 90 | create table {0} ( 91 | chrom varchar(20) not null, 92 | start int unsigned not null, 93 | stop int unsigned not null, 94 | name varchar(100) not null 95 | ) ENGINE=MyISAM DEFAULT CHARSET=latin1; 96 | load data local infile '{0}_load' into table {0}; 97 | create index name on {0} (name); 98 | '''.format(tkname) 99 | 100 | -------------------------------------------------------------------------------- /default.env: -------------------------------------------------------------------------------- 1 | ## Get hostname with the following command: 2 | ## $ hostname -f 3 | ## 4 | ## Configure an environment per hostname: 5 | ## [hostname1] 6 | ## ... 7 | ## 8 | ## Use the same environment for multiple hostnames: 9 | ## [hostname2, hostname3, ...] 10 | ## ... 11 | ## 12 | ## Using group 13 | ## [hostname1, hostname2, ... : group] 14 | ## [group] 15 | ## ... 16 | ## 17 | ## Using an asterisk in hostnames (IMPORTANT: only one * is allowed in hostnames) 18 | ## 19 | ## [host*name1] 20 | ## 21 | ## [*hostname2, hostname3*] 22 | 23 | # Stanford Kundaje group clusters (out of SGE) 24 | [vayu, mitra, durga] 25 | conda_env = aquas_chipseq 26 | conda_env_py3 = aquas_chipseq_py3 27 | conda_bin_dir = /software/miniconda3/bin 28 | species_file = $script_dir/species/kundaje.conf 29 | unlimited_mem_wt= true # unlimited max. 
memory and walltime on Kundaje clusters 30 | nice = 10 31 | nth = 4 32 | 33 | # Stanford Kundaje group clusters (controlled with SGE) 34 | [nandi, kali, amold, wotan, kadru, surya, indra, brahma] 35 | conda_env = aquas_chipseq 36 | conda_env_py3 = aquas_chipseq_py3 37 | conda_bin_dir = /software/miniconda3/bin 38 | species_file = $script_dir/species/kundaje.conf 39 | unlimited_mem_wt= true # unlimited max. memory and walltime on Kundaje clusters 40 | system = sge # force to use SGE (Sun Grid Engine) 41 | nice = 20 42 | nth = 4 43 | 44 | # Stanford NEW SCG 45 | [*.scg.stanford.edu, dper730xd*, hppsl230s*, dper910*, sgiuv*, sgisummit*, smsx10srw*] 46 | conda_env = aquas_chipseq 47 | conda_env_py3 = aquas_chipseq_py3 48 | species_file = $script_dir/species/scg.conf 49 | nth = 4 # number of threads for each pipeline 50 | system = slurm # force to use SLURM SCG 51 | q_for_slurm_account = true # use --account instead of -p (partition) 52 | cluster_task_delay = 10 # for NFS delayed write 53 | 54 | # Stanford OLD SCG : login node, computing nodes, file transfer servers 55 | [scg*.stanford.edu, scg*.local, carmack.stanford.edu, crick.stanford.edu] 56 | conda_env = aquas_chipseq 57 | conda_env_py3 = aquas_chipseq_py3 58 | species_file = $script_dir/species/scg.conf 59 | nth = 8 # number of threads for each pipeline run 60 | wt_spp = 72h # walltime for spp 61 | system = sge # force to use SGE (Sun Grid Engine) on SCG3/4 even though a user doesn't explicitly specify SGE on command line with 'bds -s sge chipseq.bds ...' 
62 | cluster_task_delay = 10 63 | 64 | # Stanford Sherlock clusters 65 | [sherlock*.stanford.edu, sh-*.local, sh-*.int, sh-ln*.stanford.edu] 66 | conda_env = aquas_chipseq 67 | conda_env_py3 = aquas_chipseq_py3 68 | species_file = $script_dir/species/sherlock.conf 69 | nth = 8 # number of threads for each pipeline run 70 | wt_spp = 47h # walltime for spp 71 | system = slurm # force to use SLURM 72 | cluster_task_delay = 30 73 | 74 | 75 | # default (if no section with hostname is found) 76 | [default] 77 | conda_env = aquas_chipseq 78 | conda_env_py3 = aquas_chipseq_py3 79 | species_file = # use your own species file here. (DEF_SPECIES_FILE: DO NOT REMOVE THIS COMMENT!) 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /example_conf_full.json: -------------------------------------------------------------------------------- 1 | { 2 | "screen" : "", 3 | "dry_run" : false, 4 | "type" : "TF", 5 | "final_stage" : "idr", 6 | "out_dir" : "out", 7 | "title" : "", 8 | "input_endedness" : { 9 | "se" : false, 10 | "pe" : false 11 | }, 12 | "input_files" : { 13 | }, 14 | "species" : { 15 | "species" : "", 16 | "species_file" : "", 17 | "species_browser" : "", 18 | "ref_fa" : "", 19 | "chrsz" : "", 20 | "blacklist" : "", 21 | "gensz" : "" 22 | }, 23 | "cluster" : { 24 | "system" : "local", 25 | "nice" : 0, 26 | "retrial" : 0, 27 | "q" : "" 28 | }, 29 | "resource" : { 30 | "nth" : 8, 31 | "no_par" : false, 32 | "wt" : "5h50m", 33 | "memory" : "7G", 34 | "unlimited_mem_wt" : false, 35 | "wt_dedup" : "23h", 36 | "mem_dedup" : "12G", 37 | "mem_shuf" : "12G", 38 | "wt_bwa" : "47h", 39 | "mem_bwa" : "12G", 40 | "wt_macs2" : "23h", 41 | "mem_macs2" : "15G", 42 | "wt_spp" : "47h", 43 | "mem_spp" : "12G" 44 | }, 45 | "alignment" : { 46 | "aligner" : "bwa", 47 | "bwa" : { 48 | "param_bwa_aln" : "-q 5 -l 32 -k 2", 49 | "bwa_idx" : "" 50 | }, 51 | "filter" : { 52 | "dup_marker" : "picard", 53 | "anon_filt_bam" : false, 54 | "mapq_thresh" : 
30, 55 | "rm_chr_from_tag" : "", 56 | "no_dup_removal" : false 57 | }, 58 | "subsample" : { 59 | "subsample_chip" : "0", 60 | "subsample_ctl" : "0" 61 | } 62 | }, 63 | "cross_corr_analysis" : { 64 | "no_xcor" : false, 65 | "subsample_xcor" : "15M", 66 | "speak_xcor" : -1, 67 | "extra_param_xcor" : "" 68 | }, 69 | "callpeak" : { 70 | "peak_caller" : "spp", 71 | "ctl_depth_ratio" : 1.2, 72 | "use_pooled_ctl" : false, 73 | "true_rep" : false, 74 | "no_pseudo_rep" : false, 75 | "spp" : { 76 | "cap_num_peak_spp" : 300000, 77 | "max_ppsize_spp" : "", 78 | "speak_spp" : -1, 79 | "extra_param_spp" : "" 80 | }, 81 | "macs2" : { 82 | "pval_thresh_macs2" : 0.01, 83 | "keep_dup_macs2" : "all", 84 | "extsize_macs2" : -1, 85 | "shift_macs2" : 0, 86 | "extra_param_macs2" : "" 87 | }, 88 | "idr" : { 89 | "idr_suffix" : false, 90 | "idr_rank" : "", 91 | "idr_thresh" : 0.05 92 | }, 93 | "naive_overlap" : { 94 | "nonamecheck" : false 95 | } 96 | }, 97 | "signal_track" : { 98 | "sig_trk_for_pooled_rep_only" : false 99 | }, 100 | "bds_configuration" : { 101 | "env" : "$script_dir/default.env" 102 | }, 103 | "visualization" : { 104 | "url_base" : "" 105 | }, 106 | "ENCODE_accession" : { 107 | "ENCODE_accession" : "", 108 | "ENCODE_award_rfa" : "", 109 | "ENCODE_assay_category" : "", 110 | "ENCODE_assay_title" : "", 111 | "ENCODE_award" : "", 112 | "ENCODE_lab" : "", 113 | "ENCODE_assembly" : "", 114 | "ENCODE_alias_prefix" : "KLAB_PIPELINE" 115 | }, 116 | "shell_environment" : { 117 | "conda" : { 118 | "conda_env" : "", 119 | "conda_env_py3" : "", 120 | "conda_bin_dir" : "" 121 | }, 122 | "modules" : { 123 | "mod" : "", 124 | "shcmd" : "", 125 | "addpath" : "" 126 | } 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /modules/input_fastq.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == fastq input definition 
: 8 | help Single-ended : For replicate '-fastq[REP_ID]', For control '-ctl_fastq[REP_ID]' 9 | help Paired end : For replicate '-fastq[REP_ID]_[PAIR_ID]', For control '-ctl_fastq[REP_ID]_[PAIR_ID]' 10 | 11 | 12 | 13 | string[] get_fastqs( int ctl, int rep ) { // if paired-end return [PE1, PE2], elseif single-end else return [PE1], else [] 14 | 15 | string[] ret 16 | for ( int pe=1; pe<=2; pe++ ) { 17 | ret += get_fastq( ctl, rep, pe ) 18 | } 19 | 20 | return ret 21 | } 22 | 23 | string[] get_fastqs( int rep ) { // if paired-end return [PE1, PE2], elseif single-end else return [PE1], else [] 24 | 25 | return get_fastqs( 0, rep ) 26 | } 27 | 28 | string[] get_fastq( int ctl, int rep, int p ) { 29 | 30 | // allow up to 10 fastqs to be pooled (i.e. fastq1 fastq1:2 fastq1:3, ...) 31 | string[] suffix 32 | suffix.add("") 33 | for ( int i=1; i<=99; i++ ) { 34 | suffix.add(":$i") 35 | } 36 | 37 | string[] result 38 | for ( int i=0; i 0 ? "ctl_fastq" : "fastq" ) + "_rep" + rep 40 | key := key_wo_p + "_p" + p + suffix[i] 41 | key_wo_p += suffix[i] 42 | 43 | key_wo_p2 := ( ctl > 0 ? "ctl_fastq" : "fastq" ) + rep 44 | key2 := key_wo_p2 + "_" + p + suffix[i] 45 | key_wo_p2 += suffix[i] 46 | 47 | key_wo_p3 := ( ctl > 0 ? 
"ctl_fastq" : "fastq" ) 48 | key3 := key_wo_p3 + "_" + p + suffix[i] 49 | key_wo_p3 += suffix[i] 50 | 51 | if ( (p==1) && cmd_line_arg_has_key( key_wo_p ) ) { 52 | result.add( get_path( get_cmd_line_arg_val( key_wo_p ) ) ) 53 | } 54 | else if ( (p==1) && cmd_line_arg_has_key( key_wo_p2 ) ) { 55 | result.add( get_path( get_cmd_line_arg_val( key_wo_p2 ) ) ) 56 | } 57 | else if ( (p==1) && (rep==1) && cmd_line_arg_has_key( key_wo_p3 ) ) { 58 | result.add( get_path( get_cmd_line_arg_val( key_wo_p3 ) ) ) 59 | } 60 | else if ( cmd_line_arg_has_key( key ) ) { 61 | result.add( get_path( get_cmd_line_arg_val( key ) ) ) 62 | } 63 | else if ( cmd_line_arg_has_key( key2 ) ) { 64 | result.add( get_path( get_cmd_line_arg_val( key2 ) ) ) 65 | } 66 | else if ( (rep==1) && cmd_line_arg_has_key( key3 ) ) { 67 | result.add( get_path( get_cmd_line_arg_val( key3 ) ) ) 68 | } 69 | else if ( (p==1) && conf.hasKey( key_wo_p ) ) { 70 | result.add( get_path( conf{ key_wo_p } ) ) 71 | } 72 | else if ( (p==1) && conf.hasKey( key_wo_p2 ) ) { 73 | result.add( get_path( conf{ key_wo_p2 } ) ) 74 | } 75 | else if ( (p==1) && (rep==1) && conf.hasKey( key_wo_p3 ) ) { 76 | result.add( get_path( conf{ key_wo_p3 } ) ) 77 | } 78 | else if ( conf.hasKey( key ) ) { 79 | result.add( get_path( conf{ key } ) ) 80 | } 81 | else if ( conf.hasKey( key2 ) ) { 82 | result.add( get_path( conf{ key2 } ) ) 83 | } 84 | else if ( (rep==1) && conf.hasKey( key3 ) ) { 85 | result.add( get_path( conf{ key3 } ) ) 86 | } 87 | } 88 | 89 | return result 90 | } 91 | 92 | string[] get_fastq( int rep, int p ) { 93 | 94 | return get_fastq( 0, rep, p ) 95 | } 96 | 97 | bool is_input_fastq( int ctl, int rep ) { 98 | 99 | fastqs := get_fastqs( ctl, rep ) 100 | if ( fastqs.size() > 0 ) return true 101 | return false 102 | } 103 | 104 | bool is_input_fastq( int rep ) { 105 | 106 | return is_input_fastq( 0, rep ) 107 | } 108 | -------------------------------------------------------------------------------- /modules/cluster.bds: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == cluster/system/resource settings 8 | wt := "5h50m" help Walltime for all single-threaded tasks (example: 8:10:00, 3h, 3600, default: 5h50m, 5:50:00). 9 | memory := "7G" help Maximum memory for all single-threaded tasks (equivalent to '-mem', example: 4.5G, 1024M, default: 7G). 10 | use_system := "local" help Force to use a system (equivalent to 'bds -s [SYSTEM_NAME] ...', any system defined in bds.config can be used). 11 | nice := 0 help Set process priority for all tasks (default: 0; -20 (highest) ~ 19 (lowest) ). 12 | retrial := 0 help # of Retrial for failed tasks (default: 0). 13 | q := "" help Submit tasks to a specified cluster queue. 14 | q_for_slurm_account := false help Use --account instead of -p (partition) for SLURM only. 15 | unlimited_mem_wt:= false help Use unlimited max. memory and walltime. 16 | java_tmp_dir := "\${TMPDIR}" help Java temporary directory. (change it when you get 'Disk quota exceeded' error in Java, default: ${TMPDIR}). 
17 | 18 | init_cluster() 19 | 20 | 21 | void init_cluster() { 22 | wt = get_conf_val( wt, ["wt"] ) 23 | memory = get_conf_val( memory, ["memory","mem"] ) 24 | use_system = get_conf_val( use_system, ["use_system","system"] ) 25 | nice = get_conf_val_int( nice, ["nice"] ) 26 | retrial = get_conf_val_int( retrial, ["retrial","retry"] ) 27 | q = get_conf_val( q, ["q"] ) 28 | unlimited_mem_wt= get_conf_val_bool( unlimited_mem_wt, ["unlimited_mem_wt"] ) 29 | q_for_slurm_account= get_conf_val_bool( q_for_slurm_account, ["q_for_slurm_account"] ) 30 | java_tmp_dir = get_conf_val( java_tmp_dir, ["java_tmp_dir"] ) 31 | 32 | if ( cmd_line_arg_has_key("mem") ) memory = get_cmd_line_arg_val( "mem" ) 33 | if ( cmd_line_arg_has_key("system") ) use_system = get_cmd_line_arg_val( "system" ) 34 | if ( nice <= -20 ) nice = -20 35 | if ( nice > 19 ) nice = 19 36 | if ( use_system != "" ) system = use_system.toLower() 37 | if ( system == "slurm" || system == "generic" ) { // for new SCG, which uses --account instead of -p (partition) 38 | system = "generic" 39 | if ( q != "" ) { 40 | if ( q_for_slurm_account ) { 41 | queue = "--account $q" 42 | } 43 | else { 44 | queue = "-p $q" 45 | } 46 | } 47 | } 48 | else if ( q != "" ) { 49 | queue = q 50 | } 51 | 52 | // cpus, mem and timeout are pre-declared BDS variables for default resource settings 53 | mem = get_res_mem(memory,1) 54 | timeout = get_res_wt(wt) 55 | retry = retrial 56 | 57 | // do not modify this (BDS timeout; how long BDS will wait for tasks to be queued on the cluster) 58 | walltimeout = 3600*24*100 // timeout var. in BigDataScript (100 days, jobs will never be stopped by BDS due to BDS timeout) 59 | 60 | print("\n\n== cluster/system info\n") 61 | print( "Walltime (general)\t\t: $wt\n" ) 62 | print( "Max. 
memory (general)\t\t: $memory\n" ) 63 | print( "Force to use a system\t\t: $use_system\n" ) 64 | print( "Process priority (niceness)\t: $nice\n" ) 65 | print( "Retiral for failed tasks\t: $retrial\n" ) 66 | print( "Submit tasks to a cluster queue\t: $q\n" ) 67 | print( "Unlimited cluster mem./walltime\t: $unlimited_mem_wt\n") 68 | print( "Use --acount instead of SLURM partition\t\t: $q_for_slurm_account\n") 69 | print( "Java temporary directory\t\t: $java_tmp_dir\n") 70 | } 71 | 72 | int get_res_wt( string str ) { 73 | return (unlimited_mem_wt || is_system_local() ) ? -1 : parse_time( str ) 74 | } 75 | 76 | int get_res_mem( string str, int n ) { 77 | if ( n < 1 ) n = 1 78 | return (unlimited_mem_wt || is_system_local() ) ? -1 : parse_mem( str )/n 79 | } 80 | 81 | int get_res_mem( string str ) { 82 | return get_res_mem( str , 1 ) 83 | } 84 | 85 | bool is_system_sge() { 86 | return system == "sge" 87 | } 88 | 89 | bool is_system_local() { 90 | return system == "local" 91 | } 92 | 93 | bool is_system_generic() { 94 | return system == "generic" 95 | } 96 | 97 | bool is_system_slurm() { 98 | // slurm uses generic cluster, it's configured in bds.config and ./utils/clusterGeneral 99 | return system == "generic" 100 | } 101 | -------------------------------------------------------------------------------- /modules/callpeak_gem.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == callpeak gem settings 9 | npeak_gem := 300000 help Threshold on # of peaks for GEM (default: 300000). 10 | k_min_gem := 6 help Minimum length of k-mers (--k_min in GEM, default: 6). 11 | k_max_gem := 13 help Maximum length of k-mers (--k_max in GEM, default: 13). 12 | q_val_thresh_gem:= 0.0 help Q-value threshold (--q in GEM, default: 0). 
13 | read_dist_gem := "$script_dir/etc/Read_Distribution_default.txt" help Read distribution txt file for GEM (default: $script_dir/etc/Read_Distribution_default.txt). 14 | extra_param_gem := "" help Extra parameters for GEM. 15 | wt_gem := "47h" help Walltime for GEM (default: 47h, 47:00:00). 16 | mem_gem := "15G" help Max. memory for GEM (default: 15G). 17 | 18 | grp_color_gem := "skyblue" 19 | 20 | 21 | init_callpeak_gem() 22 | 23 | 24 | void init_callpeak_gem() { 25 | 26 | npeak_gem = get_conf_val_int( npeak_gem, ["npeak_gem"] ) 27 | k_min_gem = get_conf_val_int( k_min_gem, ["k_min_gem"] ) 28 | k_max_gem = get_conf_val_int( k_max_gem, ["k_max_gem"] ) 29 | q_val_thresh_gem= get_conf_val_real( q_val_thresh_gem, ["q_val_thresh_gem"] ) 30 | read_dist_gem = get_conf_val( read_dist_gem, ["read_dist_gem"] ) 31 | extra_param_gem = get_conf_val( extra_param_gem, ["extra_param_gem"] ) 32 | wt_gem = get_conf_val( wt_gem, ["walltime_gem", "wt_gem", "timeout_gem"] ) 33 | mem_gem = get_conf_val( mem_gem, ["memory_gem", "mem_gem"] ) 34 | 35 | print("\n\n== callpeak gem settings\n") 36 | print( "Threshold for # peak in GEM\t\t: $npeak_gem\n") 37 | print( "Min. length of k-mers in GEM\t\t: $k_min_gem\n") 38 | print( "Max. length of k-mers in GEM\t\t: $k_max_gem\n") 39 | print( "Q-value threshold for GEM\t\t: $q_val_thresh_gem\n") 40 | print( "Read distribution txt for GEM\t\t: $read_dist_gem\n") 41 | print( "Extra parameters for GEM\t:$extra_param_gem\n") 42 | print( "Walltime (GEM)\t\t\t: $wt_gem\n") 43 | print( "Max. memory (GEM)\t\t: $mem_gem\n") 44 | } 45 | 46 | void chk_callpeak_gem() { 47 | if ( !path_exists( "$seq_dir/chr1.fa") && !path_exists( "$seq_dir/chr1.fasta") ) \ 48 | error("\nReference genome sequence directory doesn't exists! (file: $seq_dir/chr1.fa)\n") 49 | } 50 | 51 | string[] gem( string tag, string ctl_tag, string o_dir, string group, int nth_gem ) { 52 | prefix := ctl_tag ? 
("$o_dir/" + make_x_basename_wo_gz( tag, ctl_tag, "" ) ) \ 53 | : replace_dir( rm_ext( tag, "tagAlign" ), o_dir ) 54 | tag_tmp := replace_dir( rm_ext( tag, "tagAlign" ), o_dir ) + ".tmp.bed" 55 | ctl_tag_tmp := replace_dir( rm_ext( ctl_tag, "tagAlign" ), o_dir ) + ".tmp.bed" 56 | npeakfile := "$prefix.narrowPeak.gz" 57 | npeakfile_tmp := "$prefix/"+prefix.baseName()+".GEM_events.narrowPeak" 58 | ctl_cmd := ctl_tag ? "zcat $ctl_tag > $ctl_tag_tmp" : "echo" 59 | ctl_param := ctl_tag ? "--ctrl $ctl_tag_tmp" : "" 60 | 61 | in := [ tag, ctl_tag ] 62 | out := [ npeakfile ] 63 | 64 | max_java_heap := binary_prefix( (mem==-1) ? parse_mem( mem_gem ) : (mem*4)/5 ) 65 | taskName:= "gem " + group 66 | cpus := (nth_gem==1) ? -1 : nth_gem; mem := get_res_mem(mem_gem,nth_gem); timeout := get_res_wt(wt_gem) 67 | 68 | wait_par( cpus ) 69 | 70 | tid := task( out<-in ) { 71 | 72 | sys $shcmd_init_py3 73 | 74 | sys zcat $tag > $tag_tmp 75 | sys $ctl_cmd 76 | // # ============================= 77 | // # See http://wiki.encodedcc.org/index.php/GPS/GEM of additional information 78 | // # ============================= 79 | sys export _JAVA_OPTIONS="-Xms256M -Xmx$max_java_heap -XX:ParallelGCThreads=1" 80 | 81 | // removed --s 2400000000 since, can guess from chrsz 82 | sys java -jar $(which gem.jar) --g $chrsz --d $read_dist_gem \ 83 | --expt $tag_tmp $ctl_param --f BED --out $prefix \ 84 | --genome $seq_dir --k_min $k_min_gem --k_max $k_max_gem --outNP \ 85 | --t $nth_gem --q $q_val_thresh_gem $extra_param_gem 86 | 87 | // # ============================= 88 | // # Sort peaks by signal value and truncate peaks to top 300K 89 | // # ============================= 90 | sys sort -k7nr,7nr $npeakfile_tmp | head -n $npeak_gem | gzip -nc > $npeakfile 91 | sys rm -f $tag_tmp $ctl_tag_tmp 92 | 93 | sys $shcmd_finalize 94 | } 95 | 96 | register_par( tid, cpus ) 97 | 98 | add_task_to_graph( in, out, group, "GEM", grp_color_gem ) 99 | 100 | return out 101 | } 102 | 
-------------------------------------------------------------------------------- /modules/parallel.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == parallelization settings 8 | no_par := false help Serialize all tasks (individual tasks can still use multiple threads up to '-nth'). 9 | nth := 8 help Maximum # threads for a pipeline. (default: 8). 10 | 11 | string[] _tids_all // array of task ids currently running 12 | int{} _nth_tasks // key: task id, value: # of threads for the task 13 | 14 | 15 | init_parallel() 16 | 17 | 18 | void init_parallel() { 19 | no_par = get_conf_val_bool( no_par, ["no_par"] ) 20 | nth = get_conf_val_int( nth, ["nth"] ) 21 | 22 | if ( nth > 32 ) error("Maximum # threads (-nth) for a pipeline should not exceed 32!") 23 | if ( nth <= 1 ) { 24 | print("\nWarning: Maximum # threads (-nth) for a pipeline is <= 1. Turning off parallelization... 
(-no_par)") 25 | nth = 1 26 | no_par = true 27 | } 28 | 29 | // pre-declared BDS variable 30 | cpus = -1 // With cpus==-1, BDS does not pass number of threads to cluster engine (SGE, SLURM, ...), which means single-threaded 31 | 32 | print("\n\n== parallelization info\n") 33 | print( "No parallel jobs\t\t: $no_par\n" ) 34 | print( "Maximum # threads \t\t: $nth\n" ) 35 | } 36 | 37 | void wait_par( int nth_task ) { 38 | if ( nth_task < 1 ) nth_task = 1 39 | 40 | while ( true ) { 41 | sleep( rand()*1.0 + 0.5 ) 42 | _tids_all_ := _tids_all // make dummy array for thread safety 43 | 44 | string[] tids_running 45 | int nth_running 46 | for ( string tid : _tids_all_ ) { // get total # threads for currently running tasks, and find the oldest task 47 | if ( !tid.isDone() ) { 48 | tids_running.add( tid ) 49 | nth_running = nth_running + _nth_tasks{tid} 50 | } 51 | } 52 | 53 | if ( tids_running.size() == 0 ) { 54 | break 55 | } 56 | else if ( no_par || (nth_running+nth_task) > nth ) { 57 | loop_cnt := 0 58 | while( true ) { // wait until one of running tasks finishes 59 | break_loop := false 60 | for ( string tid : tids_running ) { 61 | if ( tid.isDone() ) { 62 | break_loop = true 63 | break 64 | } 65 | } 66 | if ( break_loop ) break 67 | sleep( rand() + 0.5 ) 68 | } 69 | sleep( rand()*1.0 + 0.5 ) 70 | } 71 | else { 72 | break 73 | } 74 | } 75 | } 76 | 77 | void register_par( string tid, int nth_task ) { 78 | if ( nth_task < 1 ) nth_task = 1 79 | if ( tid == "" ) return 80 | 81 | _tids_all.add(tid) 82 | _nth_tasks{tid} = nth_task 83 | } 84 | 85 | int{} distribute_nonzero( int n, int{} weight ) { // distribute integer n according to weight 86 | int{} ret 87 | 88 | int sum 89 | for ( int w : weight ) sum += w 90 | if ( sum == 0 ) error("distribute_nth: sum is zero. 
check if input file size is 0?\n") 91 | for ( string key : weight.keys() ) { 92 | w := weight{key} 93 | ret{key} = (n*w)/sum 94 | 95 | if ( ret{key} == 0 ) ret{key} = 1 96 | } 97 | 98 | while( true ) { 99 | int sum2 100 | for ( string key : weight.keys() ) sum2 += ret{key} 101 | if ( n > sum2 ) { 102 | string key_to_plus 103 | int max_diff = 0 104 | for ( string key : weight.keys() ) { 105 | diff := n*weight{key}-ret{key}*sum 106 | if ( diff > max_diff ) { 107 | key_to_plus = key 108 | max_diff = diff 109 | } 110 | } 111 | ret{key_to_plus}++ 112 | } 113 | else { 114 | break 115 | } 116 | } 117 | 118 | print("Distributing $n to ... \n") 119 | print(ret) 120 | print("\n") 121 | return ret 122 | } 123 | 124 | int[] distribute_nonzero( int n, int[] weight ) { // distribute integer n according to weight 125 | int[] ret 126 | 127 | int sum 128 | for ( int w : weight ) sum += w 129 | if ( sum == 0 ) error("distribute_nth: sum is zero. check if input file size is 0?\n") 130 | for ( int i=0; i sum2 ) { 140 | int id_to_plus 141 | int max_diff = 0 142 | for ( int i=0; i max_diff ) { 145 | id_to_plus = i 146 | max_diff = diff 147 | } 148 | } 149 | ret[id_to_plus]++ 150 | } 151 | else { 152 | break 153 | } 154 | } 155 | 156 | print("Distributing $n to ... 
\n") 157 | print(ret) 158 | print("\n") 159 | return ret 160 | } 161 | -------------------------------------------------------------------------------- /install_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Stop on error 3 | set -e 4 | 5 | ## conda environment name 6 | 7 | ENV_NAME=aquas_chipseq 8 | ENV_NAME_PY3=aquas_chipseq_py3 9 | 10 | INSTALL_GEM=1 11 | INSTALL_PEAKSEQ=1 12 | 13 | ## install packages from official channels (bioconda and r) 14 | 15 | conda create -n ${ENV_NAME} --file requirements.txt -y -c defaults -c bioconda -c r -c bcbio -c daler -c asmeurer 16 | conda create -n ${ENV_NAME_PY3} --file requirements_py3.txt -y -c defaults -c bioconda -c r -c bcbio -c daler -c asmeurer 17 | 18 | ### bash function definition 19 | 20 | function add_to_activate { 21 | if [[ ! -f $CONDA_INIT ]]; then 22 | echo > $CONDA_INIT 23 | fi 24 | for i in "${CONTENTS[@]}"; do 25 | if [[ $(grep "$i" "$CONDA_INIT" | wc -l ) == 0 ]]; then 26 | echo $i >> "$CONDA_INIT" 27 | fi 28 | done 29 | } 30 | 31 | ## install useful tools for BigDataScript 32 | 33 | mkdir -p $HOME/.bds 34 | cp -f ./utils/bds_scr ./utils/bds_scr_5min ./utils/kill_scr bds.config $HOME/.bds/ 35 | cp -rf ./utils/clusterGeneric/ $HOME/.bds/ 36 | 37 | ## install additional packages 38 | 39 | source activate ${ENV_NAME} 40 | 41 | conda uninstall graphviz -y # graphviz in bioconda has segmentation fault bug 42 | conda install graphviz -c anaconda -y 43 | 44 | conda install ucsc-bedgraphtobigwig -c bioconda -y 45 | conda install ucsc-bedtobigbed -c bioconda -y 46 | 47 | #CONDA_BIN=$(dirname $(which activate))/../envs/${ENV_NAME}/bin 48 | #CONDA_BIN=$(dirname $(which activate)) 49 | CONDA_BIN=$(dirname $(which bedtools)) 50 | CONDA_EXTRA="$CONDA_BIN/../extra" 51 | CONDA_ACTIVATE_D="$CONDA_BIN/../etc/conda/activate.d" 52 | CONDA_INIT="$CONDA_ACTIVATE_D/init.sh" 53 | CONDA_LIB="$CONDA_BIN/../lib" 54 | if [[ $(find $CONDA_LIB -name '*egg-info*' 
-not -perm -o+r | wc -l ) > 0 ]]; then 55 | find $CONDA_LIB -name '*egg-info*' -not -perm -o+r -exec dirname {} \; | xargs chmod o+r -R 56 | fi 57 | 58 | mkdir -p $CONDA_EXTRA $CONDA_ACTIVATE_D 59 | 60 | ### install Anshul's phantompeakqualtool 61 | echo $CONDA_EXTRA 62 | cd $CONDA_EXTRA 63 | git clone https://github.com/kundajelab/phantompeakqualtools 64 | chmod 755 -R phantompeakqualtools 65 | CONTENTS=("export PATH=$CONDA_EXTRA/phantompeakqualtools:\$PATH") 66 | add_to_activate 67 | 68 | ### disable locally installed python package lookup 69 | CONTENTS=("export PYTHONNOUSERSITE=True") 70 | add_to_activate 71 | #CONTENTS=("export PYTHONPATH=$CONDA_LIB/python2.7/site-packages:\$PYTHONPATH") 72 | #add_to_activate 73 | 74 | ### decompress MACS2 python egg 75 | #cd $CONDA_LIB/python2.7/site-packages 76 | #unzip -o MACS2-2.1.1.20160309-py2.7-linux-x86_64.egg 77 | 78 | # install PeakSeq 79 | if [[ ${INSTALL_PEAKSEQ} == 1 ]]; then 80 | cd $CONDA_EXTRA 81 | wget http://archive.gersteinlab.org/proj/PeakSeq/Scoring_ChIPSeq/Code/C/PeakSeq_1.31.zip -N --no-check-certificate 82 | unzip PeakSeq_1.31.zip 83 | rm -f PeakSeq_1.31.zip 84 | cd PeakSeq 85 | make 86 | chmod 755 bin/PeakSeq 87 | cd $CONDA_BIN 88 | ln -s $CONDA_EXTRA/PeakSeq/bin/PeakSeq 89 | fi 90 | 91 | source deactivate 92 | 93 | 94 | source activate ${ENV_NAME_PY3} 95 | 96 | # CONDA_BIN=$(dirname $(which activate))/../envs/${ENV_NAME_PY3}/bin 97 | #CONDA_BIN=$(dirname $(which activate)) 98 | CONDA_BIN=$(dirname $(which bedtools)) 99 | CONDA_EXTRA="$CONDA_BIN/../extra" 100 | CONDA_ACTIVATE_D="$CONDA_BIN/../etc/conda/activate.d" 101 | CONDA_INIT="$CONDA_ACTIVATE_D/init.sh" 102 | CONDA_LIB="$CONDA_BIN/../lib" 103 | if [[ $(find $CONDA_LIB -name '*egg-info*' -not -perm -o+r | wc -l ) > 0 ]]; then 104 | find $CONDA_LIB -name '*egg-info*' -not -perm -o+r -exec dirname {} \; | xargs chmod o+r -R 105 | fi 106 | 107 | mkdir -p $CONDA_EXTRA $CONDA_ACTIVATE_D 108 | 109 | ### uninstall IDR 2.0.4 and install the latest one 110 | 
conda uninstall idr -y 111 | cd $CONDA_EXTRA 112 | git clone --branch 2.0.4.2 git://github.com/kundajelab/idr 113 | cd idr 114 | python3 setup.py install 115 | cd $CONDA_EXTRA 116 | rm -rf idr 117 | 118 | ### disable locally installed python package lookup 119 | CONTENTS=("export PYTHONNOUSERSITE=True") 120 | add_to_activate 121 | CONTENTS=("export PYTHONPATH=$CONDA_LIB/python3.5/site-packages:\$PYTHONPATH") 122 | add_to_activate 123 | 124 | # install GEM 125 | if [[ ${INSTALL_GEM} == 1 ]]; then 126 | cd $CONDA_EXTRA 127 | wget http://groups.csail.mit.edu/cgs/gem/download/gem.v3.0.tar.gz -N --no-check-certificate 128 | tar zxvf gem.v3.0.tar.gz 129 | rm -f gem.v3.0.tar.gz 130 | cd gem 131 | chmod 755 gem.jar 132 | cd $CONDA_BIN 133 | ln -s $CONDA_EXTRA/gem/gem.jar 134 | fi 135 | 136 | source deactivate 137 | 138 | 139 | echo == Installing dependencies has been successfully done. == 140 | -------------------------------------------------------------------------------- /modules/callpeak_spp.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == callpeak spp settings 9 | cap_num_peak_spp := "300K" help Cap number of peaks (-npeak= in run_spp.R) (default: 300000). 10 | max_ppsize_spp := "" help R stack size (R parameter --max-ppsize=; between 5000 and 5000000) for SPP. 11 | speak_spp := -1 help User-defined cross-corr. peak strandshift (-speak= in run_spp.R). Use -1 to get from upstream cross-corr. analysis (default: -1). 12 | extra_param_spp := "" help Extra parameters for SPP (run_spp.R, peak calling only). 13 | wt_spp := "47h" help Walltime for spp (default: 47h, 47:00:00). 14 | mem_spp := "12G" help Max. memory for spp (default: 12G). 
15 | 16 | 17 | grp_color_spp := "skyblue" 18 | 19 | 20 | init_callpeak_spp() 21 | 22 | 23 | void init_callpeak_spp() { 24 | 25 | cap_num_peak_spp = get_conf_val( cap_num_peak_spp, ["cap_num_peak_spp"] ) 26 | wt_spp = get_conf_val( wt_spp, ["walltime_spp", "wt_spp", "timeout_spp"] ) 27 | mem_spp = get_conf_val( mem_spp, ["memory_spp", "mem_spp"] ) 28 | max_ppsize_spp = get_conf_val( max_ppsize_spp, ["max_ppsize_spp"] ) 29 | speak_spp = get_conf_val_int( speak_spp, ["speak_spp"] ) 30 | extra_param_spp = get_conf_val( extra_param_spp,["extra_param_spp"] ) 31 | 32 | print("\n\n== callpeak spp settings\n") 33 | print( "Threshold for # peak\t\t: $cap_num_peak_spp\n") 34 | print( "Walltime (spp)\t\t\t: $wt_spp\n") 35 | print( "Max. memory (spp)\t\t: $mem_spp\n") 36 | print( "Stack size for run_spp.R\t\t:$max_ppsize_spp\n") 37 | print( "Use-defined cross-corr. peak strandshift; if -1, use frag. len.\t:$speak_spp\n") 38 | print( "Extra parameters for run_spp.R\t:$extra_param_spp\n") 39 | } 40 | 41 | string[] spp( string tag, string ctl_tag, string frag_len, string o_dir, string group, int nth_spp ) { 42 | 43 | if ( ctl_tag == "" ) error("missing file: control tagalign!") 44 | if ( frag_len == "" ) error("missing parameter: fragment length!") 45 | 46 | int_cap_num_peak_spp := parse_number( cap_num_peak_spp ) 47 | 48 | prefix_vs := "$o_dir/" + make_vs_basename_wo_gz( tag, ctl_tag, "" ) 49 | prefix_x := "$o_dir/" + make_x_basename_wo_gz( tag, ctl_tag, "" ) 50 | rpeakfile_vs := "$prefix_vs.regionPeak.gz" 51 | rpeakfile := "$prefix_x.regionPeak.gz" 52 | filt_rpeakfile := "$prefix_x.filt.regionPeak.gz" 53 | ccscore := "$prefix_x.ccscore" 54 | pdf_tmp := replace_dir( rm_ext( tag, ["gz"] ), o_dir ) + ".pdf" 55 | pdf := "$prefix_x.pdf" 56 | param_speak := speak_spp > -1 ? "-speak=$speak_spp" : "-speak=$frag_len" 57 | extra_param := max_ppsize_spp ? 
"--max-ppsize=$max_ppsize_spp " : "" 58 | if ( extra_param_spp ) extra_param += extra_param_spp 59 | 60 | blacklist_exists := path_exists(blacklist) 61 | 62 | in := [ tag, ctl_tag ] 63 | out := [ rpeakfile, ccscore, pdf ] 64 | 65 | taskName:= "spp " + group 66 | cpus := (nth_spp==1) ? -1 : nth_spp; mem := get_res_mem(mem_spp,nth_spp); timeout := get_res_wt(wt_spp) 67 | 68 | wait_par( cpus ) 69 | 70 | tid := task( out<-in ) { 71 | 72 | sys $shcmd_init 73 | 74 | // # if phantompeakqualtools is an old version, use run_spp_nodups.R. new version has run_spp.R only 75 | sys if [ $(which run_spp_nodups.R 2> /dev/null | wc -l || echo) == "1" ]; then RUN_SPP=$(which run_spp_nodups.R); \ 76 | else RUN_SPP=$(which run_spp.R); \ 77 | fi 78 | 79 | sys Rscript $extra_param ${RUN_SPP} -c=$tag -p=$nth_spp -i=$ctl_tag \ 80 | -npeak=$int_cap_num_peak_spp -odir=$o_dir $param_speak -savr -savp -rf -out=$ccscore 81 | 82 | // Bug fix (we have scientific representation of chr coord., possible bug in run_spp.R?): 83 | sys zcat $rpeakfile_vs | awk 'BEGIN{OFS="\t"}{ if ($2<0) $2=0; print $1,int($2),int($3),$4,$5,$6,$7,$8,$9,$10;}' | gzip -f -nc > $rpeakfile 84 | 85 | sys rm -f $rpeakfile_vs 86 | 87 | sys mv $pdf_tmp $pdf 88 | 89 | // if compressed output file is empty (spp error), remove it 90 | sys if [ $(zcat $rpeakfile | wc -l ) == "0" ]; then rm -f $rpeakfile; fi 91 | 92 | // if no rpeak file, do something to return non-zero exit code 93 | sys if [ ! 
-f $rpeakfile ]; then error_in_spp_output_peak_does_not_exist; fi 94 | 95 | sys if [[ $blacklist_exists == "true" ]]; then \ 96 | bedtools intersect -v -a <(zcat -f $rpeakfile) -b <(zcat -f $blacklist) \ 97 | | awk 'BEGIN{OFS="\t"} {if ($5>1000) $5=1000; print $0}' | grep -P 'chr[\dXY]+[ \t]' \ 98 | | gzip -nc > $filt_rpeakfile; \ 99 | fi 100 | 101 | sys $shcmd_finalize 102 | } 103 | 104 | register_par( tid, cpus ) 105 | 106 | add_task_to_graph( in, out, group, "SPP", grp_color_spp ) 107 | 108 | return out 109 | } 110 | -------------------------------------------------------------------------------- /modules/sys.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "string.bds" 5 | 6 | helpUnsorted := true // do not sort help 7 | 8 | 9 | script_path := "" 10 | script_dir := "" 11 | 12 | hostname := "" 13 | 14 | // pipeline seeks for executables in the BDS script directory (local git repo) and $PATH 15 | // Add more relative path here if you want to keep your .py .sh .R visible to UNIX `which` as executables. 
16 | // Relative paths defined here are according to your script path (not your working directory but where .bds exists) 17 | // Make sure that you chmod 755 your .py .R .sh 18 | _rel_script_file_paths := [".","modules","utils"] 19 | 20 | 21 | init_base() 22 | 23 | 24 | void init_base() { 25 | script_path = "$ppwd/$programPath" 26 | if (!script_path.exists()) script_path = "$programPath" 27 | 28 | script_dir = script_path.dirName() 29 | hostname = get_hostname() 30 | } 31 | 32 | //// script file path 33 | 34 | string[] get_script_file_paths( string suffix ) { 35 | string[] ret 36 | for ( string path : _rel_script_file_paths ) { 37 | path = "$script_dir/$path" 38 | if ( path.exists() ) { 39 | ret.add( path + suffix ) 40 | if ( path.dirName().endsWith( "modules" ) ) ret.add( "$path/../$suffix" ) 41 | } 42 | } 43 | return ret 44 | } 45 | 46 | string[] get_script_file_paths() { 47 | return get_script_file_paths( "" ) 48 | } 49 | 50 | //// command line argument functions 51 | 52 | bool cmd_line_arg_has_key( string key ) { 53 | key = key.toLower() 54 | for ( string arg : args ) { 55 | if ( ("-"+key) == arg.toLower().trim() ) return true 56 | } 57 | return false 58 | } 59 | 60 | bool is_cmd_line_arg_empty() { 61 | return args.size()==0 62 | } 63 | 64 | bool is_first_arg_conf() { 65 | if ( (args.size()>0) && (!args[0].startsWith("-")) ) { 66 | if ( args.size()==1 ) { 67 | return true 68 | } 69 | else { 70 | return args[1].startsWith("-") 71 | } 72 | } 73 | return false 74 | } 75 | 76 | string get_cmd_line_arg_val( string key ) { 77 | key = key.toLower() 78 | for (int i=0; i< args.size(); i++) { 79 | arg := args[i] 80 | if ( ("-"+key) == arg.toLower().trim() ) { 81 | if ( i==(args.size()-1) ) break 82 | next_arg := args[i+1] 83 | 84 | if ( next_arg.startsWith("-") ) break 85 | return next_arg 86 | } 87 | } 88 | return "" 89 | } 90 | 91 | //// functions for file I/O 92 | 93 | string get_path( string str ) { // get absolute path (remove / if exists at end) 94 | if (str.trim() 
== "") return "" 95 | base := rm_str_at_end( str, "/" ).path() 96 | return base 97 | } 98 | 99 | string mkdir( string str ) { 100 | if (str.trim() == "") return "" 101 | // make filename full path and mkdir -p 102 | path := get_path( str ) 103 | if ( path.exists() ) { 104 | return path 105 | } 106 | else { 107 | path.mkdir() 108 | return path 109 | } 110 | } 111 | 112 | bool path_exists( string path ) { 113 | if ( path!="" ) { 114 | if ( path.exists() ) { 115 | if ( path.isFile() ) { 116 | if ( path.size() > 0 ) return true 117 | } 118 | else { 119 | return true 120 | } 121 | } 122 | } 123 | return false 124 | } 125 | 126 | string copy( string file, string o_dir ) { 127 | file_new := replace_dir( file, o_dir ) 128 | system := "local" // do not use cluster engine for this task 129 | taskName:= "copy file" 130 | 131 | task ( file_new <- file ) { 132 | 133 | sys cp --remove-destination $file $file_new 134 | sys while [ ! -f $file_new ]; do echo FOUND DELAYED WRITE, WAITING...; sleep 0.1; done 135 | } 136 | 137 | return file_new 138 | } 139 | 140 | string get_stdout( string cmd ) { 141 | rnd := randInt() 142 | cmd_ := "cmd_$rnd".path() 143 | sys $cmd &> $cmd_ || true 144 | ret := cmd_.read() 145 | sys rm -f $cmd_ 146 | return rm_str_at_end(ret,"\n") 147 | } 148 | 149 | string get_shell_var( string var ) { 150 | var_ := "var_$var".path() 151 | sys echo "${$var}" > $var_ 152 | ret := var_.read() 153 | sys rm -f $var_ 154 | return ret 155 | } 156 | 157 | string get_md5sum( string file ) { 158 | return get_stdout( "md5sum $file | awk '{print $1}'" ) 159 | } 160 | 161 | int get_num_lines( string file ) { 162 | if ( !path_exists( file ) ) { 163 | error("get_no_lines(): File doesn't exist! 
($file)") 164 | } 165 | else { 166 | if ( file.toLower().endsWith(".gz") ) { // check if compressed or not 167 | return get_stdout( "zcat $file | wc -l" ).parseInt() 168 | } 169 | else { 170 | return get_stdout( "cat $file | wc -l" ).parseInt() 171 | } 172 | } 173 | } 174 | 175 | string get_hostname() { 176 | out := get_stdout("hostname -f").replace("\n","") 177 | if (out.startsWith("hostname: ")) return "default" 178 | else return out 179 | } -------------------------------------------------------------------------------- /modules/filetable.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "output.bds" 5 | 6 | 7 | int{} _label_rank 8 | 9 | string{} _filetable_label // key: hierarchy 10 | string{} _filetable_path 11 | int{} _filetable_rank 12 | 13 | string{} _filetable_input 14 | int _curr_rank = 0 15 | 16 | 17 | 18 | void add_label_to_table( string label ) { 19 | _label_rank{ label } = _curr_rank++ 20 | } 21 | 22 | void add_file_to_table( string[] paths, string[] hrchys ) { 23 | for ( int i=0; iExpand all   " + \ 50 | "Collapse all" + \ 51 | " FilesPath " 52 | 53 | _construct_filetable() 54 | 55 | sorted_hrchy := _find_children_and_sort( "" ) 56 | for ( string hrchy : sorted_hrchy ) { 57 | parent := _get_parent( hrchy ) 58 | label := _filetable_label{ hrchy } 59 | path := _filetable_path.hasKey( hrchy ) ? _filetable_path{ hrchy } : "" 60 | if ( parent == "" ) \ 61 | html += " $label "+ html_link_url( path ) +"" 62 | else \ 63 | html += " $label "+ html_link_url( path ) +"" 64 | } 65 | html += "" 66 | html += "
\n" 67 | return html 68 | } 69 | 70 | string html_link_url( string path ) { 71 | rel_path := get_rel_path( path ) 72 | if ( rel_path.startsWith("./") ) \ 73 | return "" + rel_path + "
" 74 | else \ 75 | return rel_path + "
" 76 | } 77 | 78 | void _construct_filetable() { 79 | for( string hrchy : _filetable_input.keys() ) { 80 | _construct_filetable( hrchy, _filetable_input{ hrchy } ) 81 | } 82 | } 83 | 84 | // returns rank of item 85 | void _construct_filetable( string hrchy, string path ) { 86 | if ( hrchy == "" ) return 87 | if ( _filetable_label.hasKey( hrchy ) ) return 88 | 89 | curr := _get_curr( hrchy ) 90 | parent := _get_parent( hrchy ) 91 | _filetable_label{hrchy} = curr //map_label.hasKey(curr) ? map_label{curr} : curr 92 | _filetable_path{hrchy} = path 93 | if ( parent != "" ) _construct_filetable( parent, "" ) 94 | } 95 | 96 | string[] _get_children( string hrchy ) { // not including grand ones 97 | string[] children 98 | 99 | for ( string hrchy_ : _filetable_label.keys() ) { 100 | if ( hrchy == "" ) { 101 | if ( hrchy_.indexOf("/") < 0 ) \ 102 | children.push( hrchy_ ) 103 | } 104 | else if ( hrchy_.toLower().startsWith( hrchy.toLower() + "/" ) ) { 105 | 106 | if ( hrchy_.lastIndexOf("/") <= hrchy.length() ) \ 107 | children.push( hrchy_ ) 108 | } 109 | } 110 | return children 111 | } 112 | 113 | string[] _find_children_and_sort( string hrchy ) { 114 | string[] ret 115 | children := _get_children( hrchy ) 116 | if ( children.size() == 0 ) return ret 117 | 118 | // for bubble sort 119 | int[] ranks 120 | for ( string child : children ) { 121 | curr := _get_curr( child ) 122 | ranks.add( _label_rank.hasKey(curr) ? 
_label_rank{curr} : 0 ) 123 | } 124 | sorted := _bubble_sort( ranks, children ) 125 | for ( string child : sorted ) { 126 | ret = ret + [child] + _find_children_and_sort( child ) 127 | } 128 | return ret 129 | } 130 | 131 | string _get_parent( string hrchy ) { // "a/b/c" return a/b 132 | return hrchy.substr( 0, hrchy.lastIndexOf("/") ) 133 | } 134 | 135 | string _get_curr( string hrchy ) { // "a/b/c" return c 136 | return hrchy.substr( hrchy.lastIndexOf("/")+1 ) 137 | } 138 | 139 | string[] _bubble_sort( int[] a, string[] s ) { // sorting algorithm 140 | if ( a.size() != s.size() ) error("Array sizes do not match in _bubble_sort()!") 141 | 142 | int temp; //for swapping 143 | string temp2; 144 | n := a.size() 145 | for (int i = 0 ; i < n - 1 ; i++) { 146 | 147 | for (int j = 0 ; j < n - 1 ; j++) { 148 | 149 | if ( a[j] > a[j + 1] ) { 150 | temp = a[j]; 151 | a[j]=a[j + 1]; 152 | a[j + 1] = temp; 153 | 154 | temp2 = s[j]; 155 | s[j]=s[j + 1]; 156 | s[j + 1] = temp2; 157 | } 158 | } 159 | } 160 | return s 161 | } 162 | -------------------------------------------------------------------------------- /modules/postalign_xcor.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == postalign bed/tagalign settings 9 | fraglen0 := false help (LEGACY PARAM) Set predefined fragment length as zero for cross corr. analysis (add -speak=0 to run_spp.R). 10 | speak_xcor := -1 help Set user-defined cross-corr. peak strandshift (-speak= in run_spp.R). Use -1 to disable (default: -1). 11 | max_ppsize_xcor := "" help R stack size (R parameter --max-ppsize=; between 5000 and 5000000) for cross corr. analysis. 12 | extra_param_xcor := "" help Set extra parameters for run_spp.R (cross-corr. analysis only). 13 | mem_xcor := "15G" help Max. memory for cross-corr. analysis (default: 15G). 
14 | 15 | grp_color_xcor := "yellowgreen" 16 | 17 | init_postalign_xcor() 18 | 19 | 20 | void init_postalign_xcor() { 21 | 22 | fraglen0 = get_conf_val_bool( fraglen0, ["fraglen0"] ) 23 | speak_xcor = get_conf_val_int( speak_xcor, ["speak_xcor"] ) 24 | extra_param_xcor= get_conf_val( extra_param_xcor, ["extra_param_xcor"] ) 25 | mem_xcor = get_conf_val( mem_xcor, ["mem_xcor"] ) 26 | max_ppsize_xcor = get_conf_val( max_ppsize_xcor, ["max_ppsize_xcor"] ) 27 | 28 | // backward compatibility 29 | if ( speak_xcor == -1 && fraglen0 ) speak_xcor = 0 30 | 31 | print("\n\n== postalign cross-corr. analysis settings\n") 32 | print( "Max. memory for UNIX shuf\t\t\t: $mem_shuf\n") 33 | print( "User-defined cross-corr. peak strandshift\t: $speak_xcor\n") 34 | print( "Extra parameters for cross-corr. analysis\t: $extra_param_xcor\n") 35 | print( "Max. memory for cross-corr. analysis\t\t: $mem_xcor\n") 36 | print( "Stack size for cross-corr. analysis\t\t:$max_ppsize_xcor\n") 37 | } 38 | 39 | string subsample_tag_PE_for_xcor( string tag, int nlines, bool non_mito, string o_dir, string group ) { 40 | 41 | prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir ) 42 | nreads_per_mill := metric_prefix( nlines ) 43 | 44 | subsampled_tag := "$prefix."+(non_mito?"no_chrM.":"")+"$nreads_per_mill.R1.tagAlign.gz" 45 | non_mito_param := non_mito ? 
"grep -v \"chrM\" | " : "" 46 | 47 | joined := "$prefix.joined" // temporary file 48 | joined_subsampled := "$prefix.joined.subsampled" // temporary file 49 | 50 | in := [ tag ] 51 | out := subsampled_tag 52 | 53 | taskName:= "subsample_tag_PE_4_xcor " + group 54 | mem := get_res_mem(mem_shuf,1) 55 | 56 | wait_par( cpus ) 57 | 58 | tid := task( out<-in ) { 59 | 60 | sys $shcmd_init 61 | 62 | // join consecutive two lines into one 63 | sys zcat $tag | sed 'N;s/\n/\t/' > $joined 64 | 65 | //# Shuffle and split temporary combined file into 2 equal parts 66 | //# Will produce $PR_PREFIX00 and $PR_PREFIX01 67 | sys cat $joined | $non_mito_param shuf -n $nlines --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null) > $joined_subsampled 68 | 69 | //# Subsample tagAlign file 70 | sys awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$4,$5,$6}' $joined_subsampled | \ 71 | gzip -nc > $subsampled_tag 72 | 73 | sys rm -f $joined $joined_subsampled 74 | 75 | sys $shcmd_finalize 76 | } 77 | 78 | register_par( tid, cpus ) 79 | 80 | add_task_to_graph( in, out, group ) 81 | 82 | return out 83 | } 84 | 85 | string[] xcor( string tag, string o_dir, string group, int nth_xcor ) { 86 | 87 | // misc. 88 | prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir ) 89 | xcor_score := "$prefix.cc.qc" 90 | xcor_plot := "$prefix.cc.plot.pdf" 91 | param_speak := speak_xcor > -1 ? "-speak=$speak_xcor" : "" 92 | extra_param := max_ppsize_xcor ? "--max-ppsize=$max_ppsize_xcor " : "" 93 | 94 | in := [ tag ] 95 | out := [ xcor_score, xcor_plot ] 96 | 97 | taskName:= "xcor " + group 98 | cpus := (nth_xcor==1) ? -1 : nth_xcor; mem := get_res_mem(mem_xcor,nth_xcor); 99 | 100 | wait_par( cpus ) 101 | 102 | tid := task( out<-in ) { 103 | 104 | sys $shcmd_init 105 | 106 | // # if phantompeakqualtools is an old version, use run_spp_nodups.R. 
new version has run_spp.R only 107 | sys if [[ $(which run_spp_nodups.R 2> /dev/null | wc -l || echo) == "1" ]]; then RUN_SPP=$(which run_spp_nodups.R); \ 108 | else RUN_SPP=$(which run_spp.R); \ 109 | fi 110 | 111 | //# CCSCORE FILE format 112 | //# Filename numReads estFragLen correstFragLen PhantomPeak corrphantomPeak argmincorr mincorr phantomPeakCoef relPhantomPeakCoef QualityTag 113 | sys Rscript $extra_param ${RUN_SPP} -rf \ 114 | -c=$tag -p=$nth_xcor \ 115 | -filtchr=chrM -savp=$xcor_plot -out=$xcor_score $param_speak $extra_param_xcor 116 | sys sed -r 's/,[^\t]+//g' $xcor_score > $xcor_score.tmp 117 | sys mv $xcor_score.tmp $xcor_score 118 | 119 | sys $shcmd_finalize 120 | } 121 | 122 | register_par( tid, cpus ) 123 | 124 | add_task_to_graph( in, out, group, "XCOR", grp_color_xcor ) 125 | 126 | return out 127 | } 128 | 129 | string get_fraglen( string xcor_score ) { // get FRAGLEN (3rd column of cc score file) for spp(-speak=$FRAGLEN) 130 | 131 | cols := xcor_score.read().split("\t") 132 | return cols[2] 133 | } 134 | -------------------------------------------------------------------------------- /examples/chipseq_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | WORK=/srv/scratch/shared/mitra/leepc12/run/chipseq_test/CTCF/Snyder_CTCF_GM12878_PE 4 | mkdir -p $WORK; cd $WORK 5 | bds_scr Snyder_CTCF_GM12878_PE /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -pe -nth 12 -species hg19 -title CTCF_Snyder_CTCF_GM12878_PE \ 6 | -fastq1_1 /srv/scratch/shared/mitra/leepc12/data/CTCF/PE/Snyder_CTCF_GM12878_PE_Rep1_1.fastq.gz \ 7 | -fastq1_2 /srv/scratch/shared/mitra/leepc12/data/CTCF/PE/Snyder_CTCF_GM12878_PE_Rep1_2.fastq.gz \ 8 | -fastq2_1 /srv/scratch/shared/mitra/leepc12/data/CTCF/PE/Snyder_CTCF_GM12878_PE_Rep2_1.fastq.gz \ 9 | -fastq2_2 /srv/scratch/shared/mitra/leepc12/data/CTCF/PE/Snyder_CTCF_GM12878_PE_Rep2_2.fastq.gz \ 10 | -ctl_fastq1_1 
/srv/scratch/shared/mitra/leepc12/data/CTCF/PE/Snyder_Input_GM12878_PE_1.fastq.gz \ 11 | -ctl_fastq1_2 /srv/scratch/shared/mitra/leepc12/data/CTCF/PE/Snyder_Input_GM12878_PE_2.fastq.gz \ 12 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/CTCF/Snyder_CTCF_GM12878_PE/out 13 | 14 | WORK=/srv/scratch/shared/mitra/leepc12/run/chipseq_test/CTCF/Snyder_CTCF_GM12878_SE 15 | mkdir -p $WORK; cd $WORK 16 | bds_scr Snyder_CTCF_GM12878_SE /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -se -nth 6 -species hg19 -title CTCF_Snyder_CTCF_GM12878_SE \ 17 | -fastq1 /srv/scratch/shared/mitra/leepc12/data/CTCF/SE/Snyder_CTCF_GM12878_SE_Rep1.fastq.gz \ 18 | -fastq2 /srv/scratch/shared/mitra/leepc12/data/CTCF/SE/Snyder_CTCF_GM12878_SE_Rep2.fastq.gz \ 19 | -ctl_fastq1 /srv/scratch/shared/mitra/leepc12/data/CTCF/SE/Snyder_Input_GM12878_SE.fastq.gz \ 20 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/CTCF/Snyder_CTCF_GM12878_SE/out 21 | 22 | WORK=/srv/scratch/shared/mitra/leepc12/run/chipseq_test/GATA2/HAIB_GATA2_K562_SE 23 | mkdir -p $WORK; cd $WORK 24 | bds_scr HAIB_GATA2_K562_SE /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -se -nth 6 -species hg19 -title GATA2_HAIB_GATA2_K562_SE \ 25 | -fastq1 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_GATA2_K562_SE_Rep1.fastq.gz \ 26 | -fastq2 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_GATA2_K562_SE_Rep2.fastq.gz \ 27 | -ctl_fastq1 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_Input_K562_SE_Rep1.fastq.gz \ 28 | -ctl_fastq2 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_Input_K562_SE_Rep2.fastq.gz \ 29 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/GATA2/HAIB_GATA2_K562_SE/out 30 | 31 | WORK=/srv/scratch/shared/mitra/leepc12/run/chipseq_test/GATA2/UCD_GATA2_K562_SE 32 | mkdir -p $WORK; cd $WORK 33 | bds_scr UCD_GATA2_K562_SE /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -se 
-nth 6 -species hg19 -title GATA2_UCD_GATA2_K562_SE \ 34 | -fastq1 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_GATA2_K562_SE_Rep1.fastq.gz \ 35 | -fastq2 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_GATA2_K562_SE_Rep2.fastq.gz \ 36 | -ctl_fastq1 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_Input_K562_SE_Rep1.fastq.gz \ 37 | -ctl_fastq2 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_Input_K562_SE_Rep2.fastq.gz \ 38 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/GATA2/UCD_GATA2_K562_SE/out 39 | 40 | WORK=/srv/scratch/shared/mitra/leepc12/run/chipseq_test/GATA2/UChicago_GATA2_K562_SE 41 | mkdir -p $WORK; cd $WORK 42 | bds_scr UChicago_GATA2_K562_SE /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -se -nth 6 -species hg19 -title GATA2_UChicago_GATA2_K562_SE \ 43 | -fastq1 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/UChicago_GATA2_K562_SE/UChicago_GATA2_K562_SE_Rep1.fastq.gz \ 44 | -fastq2 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/UChicago_GATA2_K562_SE/UChicago_GATA2_K562_SE_Rep2.fastq.gz \ 45 | -ctl_fastq1 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/UChicago_GATA2_K562_SE/UChicago_Input_K562_SE_Rep1.fastq.gz \ 46 | -ctl_fastq2 /srv/scratch/shared/mitra/leepc12/data/GATA2/SE/UChicago_GATA2_K562_SE/UChicago_Input_K562_SE_Rep2.fastq.gz \ 47 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/GATA2/UChicago_GATA2_K562_SE/out 48 | 49 | WORK=/srv/scratch/shared/mitra/leepc12/run/chipseq_test/YY1/HudsonAlpha 50 | mkdir -p $WORK; cd $WORK 51 | bds_scr HudsonAlpha /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -se -nth 4 -species hg19 -title YY1_HudsonAlpha \ 52 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/YY1/HudsonAlpha/out \ 53 | -fastq1 /srv/scratch/shared/mitra/leepc12/data/YY1/HudsonAlpha/ENCFF000OHH.fastq.gz \ 54 | -fastq2 
/srv/scratch/shared/mitra/leepc12/data/YY1/HudsonAlpha/ENCFF000OHO.fastq.gz \ 55 | -ctl_fastq1 /srv/scratch/shared/mitra/leepc12/data/YY1/HudsonAlpha/ENCFF000ODP.fastq.gz 56 | 57 | WORK=/srv/scratch/shared/mitra/leepc12/run/chipseq_test/YY1/Sydh 58 | mkdir -p $WORK; cd $WORK 59 | bds_scr Sydh /users/leepc12/code/TF_chipseq_pipeline/chipseq.bds -se -nth 6 -species hg19 -title YY1_Sydh \ 60 | -fastq1 /srv/scratch/shared/mitra/leepc12/data/YY1/Sydh/ENCFF000WGS.fastq.gz \ 61 | -fastq2 /srv/scratch/shared/mitra/leepc12/data/YY1/Sydh/ENCFF000WGT.fastq.gz \ 62 | -ctl_fastq1 /srv/scratch/shared/mitra/leepc12/data/YY1/Sydh/ENCFF000VWV.fastq.gz \ 63 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/YY1/Sydh/out 64 | 65 | 66 | -------------------------------------------------------------------------------- /modules/callpeak_peakseq.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == callpeak PeakSeq settings 9 | target_fdr_peakseq := 0.05 help Target FDR for PeakSeq (default: 0.05). 10 | n_sim_peakseq := 10 help Number of simulations for PeakSeq (default: 10). 11 | enrich_mapped_fraglen_peakseq := -1 help Enrichment mapped fragment length for PeakSeq. Use -1 to get from upstream cross-corr. analysis (default: -1). 12 | min_interpeak_dist_peakseq := -1 help Minimum interpeak distance for PeakSeq. Use -1 to get from upstream cross-corr. analysis (default: -1). 13 | mappability_map_peakseq := "" help Mappability map file for PeakSeq (http://archive.gersteinlab.org/proj/PeakSeq/Mappability_Map). 14 | max_qval_peakseq := 0.1 help Maximum Q-value for PeakSeq (default: 0.1). 15 | bckgrnd_model_peakseq := "Simulated" help Background model for PeakSeq (default: Simulated). 16 | extra_param_peakseq := "" help Extra parameters for PeakSeq. 
17 | wt_peakseq := "47h" help Walltime for PeakSeq (default: 47h, 47:00:00). 18 | mem_peakseq := "12G" help Max. memory for PeakSeq (default: 12G). 19 | 20 | 21 | grp_color_peakseq := "pink" 22 | 23 | 24 | init_callpeak_peakseq() 25 | 26 | 27 | void init_callpeak_peakseq() { 28 | 29 | target_fdr_peakseq = get_conf_val_real( target_fdr_peakseq, ["target_fdr_peakseq"] ) 30 | n_sim_peakseq = get_conf_val_int( n_sim_peakseq, ["n_sim_peakseq"] ) 31 | enrich_mapped_fraglen_peakseq = get_conf_val_int( enrich_mapped_fraglen_peakseq, ["enrich_mapped_fraglen_peakseq"] ) 32 | min_interpeak_dist_peakseq = get_conf_val_int( min_interpeak_dist_peakseq, ["min_interpeak_dist_peakseq"] ) 33 | mappability_map_peakseq = get_conf_val( mappability_map_peakseq, ["mappability_map_peakseq"] ) 34 | max_qval_peakseq = get_conf_val_real( max_qval_peakseq, ["max_qval_peakseq"] ) 35 | bckgrnd_model_peakseq = get_conf_val( bckgrnd_model_peakseq, ["bckgrnd_model_peakseq"] ) 36 | extra_param_peakseq = get_conf_val( extra_param_peakseq, ["extra_param_peakseq"] ) 37 | wt_peakseq = get_conf_val( wt_peakseq, ["walltime_peakseq", "wt_peakseq", "timeout_peakseq"] ) 38 | mem_peakseq = get_conf_val( mem_peakseq, ["memory_peakseq", "mem_peakseq"] ) 39 | 40 | print("\n\n== callpeak PeakSeq settings\n") 41 | print( "Target FDR for PeakSeq\t\t\t:$target_fdr_peakseq\n") 42 | print( "Number of simulations for PeakSeq\t:$n_sim_peakseq\n") 43 | print( "Enrichment mapped frag. len. for PeakSeq\t:$enrich_mapped_fraglen_peakseq\n") 44 | print( "Minimum interpeak distance for PeakSeq\t:$min_interpeak_dist_peakseq\n") 45 | print( "Mappability map file for PeakSeq\t:$mappability_map_peakseq\n") 46 | print( "Maximum Q-value for PeakSeq\t\t:$max_qval_peakseq\n") 47 | print( "Background model for PeakSeq\t\t:$bckgrnd_model_peakseq\n") 48 | print( "Extra parameters for PeakSeq\t\t:$extra_param_peakseq\n") 49 | print( "Walltime (PeakSeq)\t\t\t: $wt_peakseq\n") 50 | print( "Max. 
memory (PeakSeq)\t\t: $mem_peakseq\n") 51 | } 52 | 53 | void chk_callpeak_peakseq() { 54 | if ( !path_exists( mappability_map_peakseq ) ) \ 55 | error("\nMappability map file for PeakSeq does not exists! (file: $mappability_map_peakseq)\n") 56 | } 57 | 58 | string[] peakseq( string tag, string ctl_tag, string frag_len, string o_dir, string group ) { 59 | if ( frag_len == "" ) error("missing parameter: fragment length!") 60 | prefix := ctl_tag ? ("$o_dir/" + make_x_basename_wo_gz( tag, ctl_tag, "" ) ) \ 61 | : replace_dir( rm_ext( tag, "tagAlign" ), o_dir ) 62 | tmp_chip_dir := "$prefix.tmp_chip_dir" 63 | tmp_ctl_dir := ctl_tag ? "$prefix.tmp_ctl_dir" : "" 64 | config_file := get_peakseq_conf_dat( prefix, frag_len, tmp_chip_dir, tmp_ctl_dir ) 65 | rpeakfile := "$prefix.regionPeak.gz" 66 | make_tmp_ctl_dir := ctl_tag ? "zcat $ctl_tag | PeakSeq -preprocess tagAlign stdin $tmp_ctl_dir" : "" 67 | 68 | in := [ tag, ctl_tag ] 69 | out := [ rpeakfile ] 70 | 71 | taskName:= "peakseq " + group 72 | mem := get_res_mem(mem_peakseq,1); timeout := get_res_wt(wt_peakseq) 73 | 74 | wait_par( cpus ) 75 | 76 | tid := task( out<-in ) { 77 | 78 | sys $shcmd_init 79 | 80 | // # ============================= 81 | // # The chip and input reads (chip.bam and input.bam) should be preprocessed before running: 82 | // # ============================= 83 | sys cd $o_dir 84 | sys mkdir -p $tmp_chip_dir 85 | sys mkdir -p $tmp_ctl_dir 86 | sys zcat $tag | PeakSeq -preprocess tagAlign stdin $tmp_chip_dir 87 | sys $make_tmp_ctl_dir 88 | 89 | // # ============================= 90 | // # Then it is necessary to setup the configuration file (config.dat). An example configuration file is included with the PeakSeq download. 
An example: 91 | // # ============================= 92 | 93 | // # ============================= 94 | // #Finally, the peaks are called using the configuration file: 95 | // # ============================= 96 | sys PeakSeq -peak_select $config_file $extra_param_peakseq 97 | sys rm -rf $tmp_chip_dir $tmp_ctl_dir 98 | 99 | sys $shcmd_finalize 100 | } 101 | 102 | register_par( tid, cpus ) 103 | 104 | add_task_to_graph( in, out, group, "PEAKSEQ", grp_color_peakseq ) 105 | 106 | return out 107 | } 108 | 109 | string get_peakseq_conf_dat( string prefix, string frag_len, string chipseq_dir, string ctl_dir ) { 110 | out := "$prefix.peakseq.config.dat" 111 | basename := prefix.baseName() 112 | contents := "" 113 | contents += "Experiment_id $basename\n" 114 | contents += "Enrichment_mapped_fragment_length $frag_len\n" 115 | contents += "target_FDR $target_fdr_peakseq\n" 116 | contents += "N_Simulations $n_sim_peakseq\n" 117 | contents += "Minimum_interpeak_distance $frag_len\n" 118 | contents += "Mappability_map_file $mappability_map_peakseq\n" 119 | contents += "ChIP_Seq_reads_data_dirs $chipseq_dir\n" 120 | if ( ctl_dir ) contents += "Input_reads_data_dirs $ctl_dir\n" 121 | contents += "max_Qvalue $max_qval_peakseq\n" 122 | contents += "Background_model $bckgrnd_model_peakseq\n" 123 | out.write(contents) 124 | return out 125 | } -------------------------------------------------------------------------------- /utils/parse_summary_qc_recursively.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # written by Jin Lee, 2016 4 | 5 | import os 6 | import sys 7 | import re 8 | import argparse 9 | import json 10 | import subprocess 11 | from collections import OrderedDict 12 | 13 | parser = argparse.ArgumentParser(prog='ENCODE_summary.json parser for QC', \ 14 | description='Recursively find ENCODE_summary.json, parse it and make a TSV spreadsheet of QC metrics.') 15 | parser.add_argument('--out-file', 
type=argparse.FileType('w'), default=sys.stdout, \ 16 | help='Output TSV filename)') 17 | parser.add_argument('--search-dir', type=str, default='.', \ 18 | help='Root directory to search for ENCODE_summary.json') 19 | parser.add_argument('--json-file', type=str, default='ENCODE_summary.json', \ 20 | help='Specify json file name to be parsed') 21 | 22 | args = parser.parse_args() 23 | 24 | # find all qc_summary.json recursively 25 | # json_files = [os.path.join(dp, f) for dp, dn, filenames in os.walk(os.getcwd()) \ 26 | # for f in filenames if os.path.splitext(f)[1] == 'qc_summary.json'] 27 | 28 | # find all ENCODE_summary.json recursively 29 | json_files = subprocess.check_output("find -L %s -name %s" % (args.search_dir,args.json_file), \ 30 | shell=True ).strip().split('\n') 31 | # read json 32 | jsons = [] 33 | for json_file in json_files: 34 | with open(json_file,'r') as f: 35 | jsons.append( json.load(f, object_pairs_hook=OrderedDict) ) 36 | 37 | # sort 38 | # sorted_jsons = sorted(jsons, key = lambda x: (\ 39 | # x['ENCODE_award_rfa'], \ 40 | # x['ENCODE_assay_category'], \ 41 | # x['ENCODE_assay_title'], \ 42 | # x['species'], \ 43 | # x['title'])) 44 | 45 | # look at headers first 46 | headers = OrderedDict() 47 | headers['common'] = [\ 48 | 'ENCODE award rfa',\ 49 | 'ENCODE assay category',\ 50 | 'ENCODE assay title',\ 51 | 'species',\ 52 | 'title',\ 53 | 'replicate'] 54 | 55 | # first take longest header for each qc_type 56 | for json in jsons: 57 | for qc_file in json['qc_files']: 58 | qc_type = qc_file['qc_type'] 59 | if qc_type == 'pbc_PE': 60 | qc_type = 'pbc' 61 | qc_file['qc_type'] = qc_type 62 | header_list = qc_file['header'].split('\t') 63 | if not qc_type in headers or len(headers[qc_type])=0) || (k=="mod") ) { // concat. 
module 58 | if ( init_mod != "" ) { 59 | trimmed := val.trim().replace("module add ","").replace( ":", " " ).replace( ";", " " ).replace( ",", " " ).trim() 60 | trimmed = trimmed.replace( " ", " ").replace( " ", " ") 61 | module = module + " " + trimmed 62 | } 63 | } 64 | else if ( k.indexOf("shcmd")>=0 ) { 65 | shellcmd = shellcmd + " " + val + ";" 66 | } 67 | else if ( k.indexOf("addpath")>=0 ) { 68 | path = path + val.trim().replace(",",":").replace(";",":").replace(" ",":").replace(":::",":").replace("::",":") + ":" 69 | } 70 | } 71 | 72 | // read from cmd. line arg. 73 | if ( mod!="" ) { 74 | string module_header = ". $init_mod;" 75 | if ( init_mod != "" ) { // if /etc/profile.d/modules.sh exists 76 | trimmed := mod.trim().replace("module add ","").replace( ":", " " ).replace( ";", " " ).replace(","," " ).trim() 77 | trimmed = trimmed.replace( " ", " ").replace( " ", " ") 78 | module = module + " " + trimmed 79 | } 80 | } 81 | if ( shcmd!="" ) shellcmd = shellcmd + shcmd.trim() + "; " 82 | if ( addpath!="" ) path = path + \ 83 | addpath.trim().replace(",",":").replace(";",":").replace(" ",":").replace(":::",":").replace("::",":") + ":" 84 | if ( module !="" ) module = ". 
$init_mod; module add " + module + ";" 85 | 86 | // check script directories to add to PATH 87 | script_file_paths := get_script_file_paths() 88 | for ( string _path : script_file_paths ) { 89 | if ( _path.exists() ) { 90 | path = path + _path + ":" 91 | } 92 | } 93 | 94 | if ( conda_bin_dir ) conda_bin_dir += "/" 95 | if ( path !="" ) path = " export PATH=$path:\${PATH}:/bin:/usr/bin:/usr/local/bin:\${HOME}/.bds;" 96 | // add conda env 97 | if ( conda_env != "" ) conda_py2 = \ 98 | "if [[ -f $(which $conda_bin_dir"+"conda) && $($conda_bin_dir"+"conda env list | grep $conda_env | wc -l) != \"0\" ]];"+\ 99 | " then source $conda_bin_dir"+"activate $conda_env; sleep $delay_conda_env; fi; " 100 | if ( conda_env_py3 != "" ) conda_py3 = \ 101 | "if [[ -f $(which $conda_bin_dir"+"conda) && $($conda_bin_dir"+"conda env list | grep $conda_env_py3 | wc -l) != \"0\" ]];"+\ 102 | " then source $conda_bin_dir"+"activate $conda_env_py3; sleep $delay_conda_env; fi; " 103 | 104 | // additional initialization 105 | shcmd_init_ := module + path + shellcmd 106 | shcmd_init_ += "; set -o pipefail" // to catch and stop on non-zero exit code in a UNIX pipe 107 | shcmd_init_ += "; STARTTIME=$(date +%s)" // to check running time for a task 108 | if ( nice != 0 ) shcmd_init_ += "; if (( $(nice)<$nice )); then renice -n $nice $$; fi" // to set process priority (niceness) 109 | 110 | shcmd_init_ = shcmd_init_.replace( ": :", ":" ).replace( "::", ":" ).replace( "; ;", ";" ).replace( ";;", ";" ) 111 | shcmd_init = conda_py2 + shcmd_init_ 112 | shcmd_init_py3 = conda_py3 + shcmd_init_ 113 | 114 | if ( is_system_local() ) { 115 | shcmd_finalize = "TASKTIME=$[$(date +%s)-${STARTTIME}]; echo \"Task has finished (${TASKTIME} seconds).\"; "+\ 116 | "sleep $cluster_task_delay" 117 | } 118 | else { 119 | shcmd_finalize = "TASKTIME=$[$(date +%s)-${STARTTIME}]; if [ ${TASKTIME} -lt $cluster_task_min_len ]; "+\ 120 | "then echo \"Waiting for $[$cluster_task_min_len-${TASKTIME}] seconds.\";"+\ 121 | " 
sleep $[$cluster_task_min_len-${TASKTIME}]; sleep $cluster_task_delay; fi" 122 | } 123 | 124 | print("\n\n== shell environment info\n") 125 | print( "Conda env. \t\t\t: $conda_env\n" ) 126 | print( "Conda env. for python3\t\t: $conda_env_py3\n" ) 127 | print( "Conda bin. directory\t\t: $conda_bin_dir\n" ) 128 | print( "\nShell cmd. for init.\t\t: $shcmd_init\n" ) 129 | print( "\nShell cmd. for init.(py3)\t: $shcmd_init_py3\n" ) 130 | print( "\nShell cmd. for fin.\t\t: $shcmd_finalize\n" ) 131 | print( "\nCluster task min. len.\t\t: $cluster_task_min_len\n" ) 132 | print( "\nCluster task delay\t\t\t: $cluster_task_delay\n" ) 133 | } 134 | 135 | -------------------------------------------------------------------------------- /examples/example2.sh: -------------------------------------------------------------------------------- 1 | screen -RD HAIB_GATA2_K562_SE 2 | 3 | DATA_ROOT=/srv/scratch/leepc12/data; WORK_DIR=/srv/scratch/leepc12/run/chipseq_test; SUFFIX=GATA2/HAIB_GATA2_K562_SE 4 | mkdir -p ${WORK_DIR}/$SUFFIX; cd ${WORK_DIR}/$SUFFIX; 5 | bds /users/leepc12/code/bds_atac/chipseq/chipseq.bds \ 6 | -fastq1 ${DATA_ROOT}/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_GATA2_K562_SE_Rep1.fastq.gz \ 7 | -fastq2 ${DATA_ROOT}/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_GATA2_K562_SE_Rep2.fastq.gz \ 8 | -ctl_fastq1 ${DATA_ROOT}/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_Input_K562_SE_Rep1.fastq.gz \ 9 | -ctl_fastq2 ${DATA_ROOT}/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_Input_K562_SE_Rep2.fastq.gz \ 10 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/${SUFFIX}/out \ 11 | -species hg19 -bwa_idx ${DATA_ROOT}/CTCF/bwa_index/male.hg19.fa.gz -chrsz ${DATA_ROOT}/CTCF/male.hg19.chrom.sizes 12 | 13 | screen -RD Snyder_CTCF_GM12878_PE 14 | 15 | DATA_ROOT=/srv/scratch/leepc12/data; WORK_DIR=/srv/scratch/leepc12/run/chipseq_test; SUFFIX=CTCF/Snyder_CTCF_GM12878_PE 16 | mkdir -p ${WORK_DIR}/$SUFFIX; cd ${WORK_DIR}/$SUFFIX; 17 | bds /users/leepc12/code/bds_atac/chipseq/chipseq.bds \ 18 | -fastq1_1 
${DATA_ROOT}/CTCF/PE/Snyder_CTCF_GM12878_PE_Rep1_1.fastq.gz \ 19 | -fastq1_2 ${DATA_ROOT}/CTCF/PE/Snyder_CTCF_GM12878_PE_Rep1_2.fastq.gz \ 20 | -fastq2_1 ${DATA_ROOT}/CTCF/PE/Snyder_CTCF_GM12878_PE_Rep2_1.fastq.gz \ 21 | -fastq2_2 ${DATA_ROOT}/CTCF/PE/Snyder_CTCF_GM12878_PE_Rep2_2.fastq.gz \ 22 | -ctl_fastq1_1 ${DATA_ROOT}/CTCF/PE/Snyder_Input_GM12878_PE_1.fastq.gz \ 23 | -ctl_fastq1_2 ${DATA_ROOT}/CTCF/PE/Snyder_Input_GM12878_PE_2.fastq.gz \ 24 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/${SUFFIX}/out \ 25 | -species hg19 -bwa_idx ${DATA_ROOT}/CTCF/bwa_index/male.hg19.fa.gz -chrsz ${DATA_ROOT}/CTCF/male.hg19.chrom.sizes 26 | 27 | screen -RD UCD_GATA2_K562_SE 28 | 29 | DATA_ROOT=/srv/scratch/leepc12/data; WORK_DIR=/srv/scratch/leepc12/run/chipseq_test; SUFFIX=GATA2/UCD_GATA2_K562_SE 30 | mkdir -p ${WORK_DIR}/$SUFFIX; cd ${WORK_DIR}/$SUFFIX; 31 | bds /users/leepc12/code/bds_atac/chipseq/chipseq.bds \ 32 | -species hg19 -bwa_idx ${DATA_ROOT}/CTCF/bwa_index/male.hg19.fa.gz -chrsz ${DATA_ROOT}/CTCF/male.hg19.chrom.sizes \ 33 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/${SUFFIX}/out \ 34 | -fastq1 ${DATA_ROOT}/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_GATA2_K562_SE_Rep1.fastq.gz \ 35 | -fastq2 ${DATA_ROOT}/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_GATA2_K562_SE_Rep2.fastq.gz \ 36 | -ctl_fastq1 ${DATA_ROOT}/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_Input_K562_SE_Rep1.fastq.gz \ 37 | -ctl_fastq2 ${DATA_ROOT}/GATA2/SE/HAIB_GATA2_K562_SE/HAIB_Input_K562_SE_Rep2.fastq.gz 38 | 39 | screen -RD UChicago_GATA2_K562_SE 40 | 41 | DATA_ROOT=/srv/scratch/leepc12/data; WORK_DIR=/srv/scratch/leepc12/run/chipseq_test; SUFFIX=GATA2/UChicago_GATA2_K562_SE 42 | mkdir -p ${WORK_DIR}/$SUFFIX; cd ${WORK_DIR}/$SUFFIX; 43 | bds /users/leepc12/code/bds_atac/chipseq/chipseq.bds \ 44 | -species hg19 -bwa_idx ${DATA_ROOT}/CTCF/bwa_index/male.hg19.fa.gz -chrsz ${DATA_ROOT}/CTCF/male.hg19.chrom.sizes \ 45 | -url_base 
http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/${SUFFIX}/out \ 46 | -fastq1 ${DATA_ROOT}/GATA2/SE/UChicago_GATA2_K562_SE/UChicago_GATA2_K562_SE_Rep1.fastq.gz \ 47 | -fastq2 ${DATA_ROOT}/GATA2/SE/UChicago_GATA2_K562_SE/UChicago_GATA2_K562_SE_Rep2.fastq.gz \ 48 | -ctl_fastq1 ${DATA_ROOT}/GATA2/SE/UChicago_GATA2_K562_SE/UChicago_Input_K562_SE_Rep1.fastq.gz \ 49 | -ctl_fastq2 ${DATA_ROOT}/GATA2/SE/UChicago_GATA2_K562_SE/UChicago_Input_K562_SE_Rep2.fastq.gz 50 | 51 | screen -RD Snyder_CTCF_GM12878_SE 52 | 53 | DATA_ROOT=/srv/scratch/leepc12/data; WORK_DIR=/srv/scratch/leepc12/run/chipseq_test; SUFFIX=CTCF/Snyder_CTCF_GM12878_SE 54 | mkdir -p ${WORK_DIR}/$SUFFIX; cd ${WORK_DIR}/$SUFFIX; 55 | bds /users/leepc12/code/bds_atac/chipseq/chipseq.bds \ 56 | -species hg19 -bwa_idx ${DATA_ROOT}/CTCF/bwa_index/male.hg19.fa.gz -chrsz ${DATA_ROOT}/CTCF/male.hg19.chrom.sizes \ 57 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/${SUFFIX}/out \ 58 | -fastq1 ${DATA_ROOT}/CTCF/SE/Snyder_CTCF_GM12878_SE_Rep1.fastq.gz \ 59 | -fastq2 ${DATA_ROOT}/CTCF/SE/Snyder_CTCF_GM12878_SE_Rep2.fastq.gz \ 60 | -ctl_fastq1 ${DATA_ROOT}/CTCF/SE/Snyder_Input_GM12878_SE.fastq.gz 61 | 62 | screen -RD HudsonAlpha 63 | 64 | DATA_ROOT=/srv/scratch/leepc12/data; WORK_DIR=/srv/scratch/leepc12/run/chipseq_test; SUFFIX=YY1/HudsonAlpha 65 | mkdir -p ${WORK_DIR}/$SUFFIX; cd ${WORK_DIR}/$SUFFIX; 66 | bds /users/leepc12/code/bds_atac/chipseq/chipseq.bds \ 67 | -species hg19 -bwa_idx ${DATA_ROOT}/CTCF/bwa_index/male.hg19.fa.gz -chrsz ${DATA_ROOT}/CTCF/male.hg19.chrom.sizes \ 68 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/${SUFFIX}/out \ 69 | -fastq1 ${DATA_ROOT}/YY1/HudsonAlpha/ENCFF000OHH.fastq.gz \ 70 | -fastq2 ${DATA_ROOT}/YY1/HudsonAlpha/ENCFF000OHO.fastq.gz \ 71 | -ctl_fastq1 ${DATA_ROOT}/YY1/HudsonAlpha/ENCFF000ODP.fastq.gz 72 | 73 | screen -RD Sydh 74 | 75 | DATA_ROOT=/srv/scratch/leepc12/data; WORK_DIR=/srv/scratch/leepc12/run/chipseq_test; SUFFIX=YY1/Sydh 76 | 
mkdir -p ${WORK_DIR}/$SUFFIX; cd ${WORK_DIR}/$SUFFIX; 77 | bds /users/leepc12/code/bds_atac/chipseq/chipseq.bds \ 78 | -url_base http://mitra.stanford.edu/kundaje/leepc12/chipseq_test/${SUFFIX}/out \ 79 | -species hg19 -bwa_idx ${DATA_ROOT}/CTCF/bwa_index/male.hg19.fa.gz -chrsz ${DATA_ROOT}/CTCF/male.hg19.chrom.sizes \ 80 | -fastq1 ${DATA_ROOT}/YY1/Sydh/ENCFF000WGS.fastq.gz \ 81 | -fastq2 ${DATA_ROOT}/YY1/Sydh/ENCFF000WGT.fastq.gz \ 82 | -ctl_fastq1 ${DATA_ROOT}/YY1/Sydh/ENCFF000VWV.fastq.gz 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | WORK_DIR_OLD=/srv/scratch/leepc12/run/TF_chipseq_pipeline_test/$SUFFIX 115 | mkdir -p $WORK_DIR/$SUFFIX/out/align/rep1; cp -pr $WORK_DIR_OLD/out/align_rep1/*.bam $WORK_DIR/$SUFFIX/out/align/rep1/ 116 | mkdir -p $WORK_DIR/$SUFFIX/out/align/rep1; cp -pr $WORK_DIR_OLD/out/align_rep1/*.flagstat.qc $WORK_DIR/$SUFFIX/out/align/rep1/ 117 | mkdir -p $WORK_DIR/$SUFFIX/out/align/rep2; cp -pr $WORK_DIR_OLD/out/align_rep2/*.bam $WORK_DIR/$SUFFIX/out/align/rep2/ 118 | mkdir -p $WORK_DIR/$SUFFIX/out/align/rep2; cp -pr $WORK_DIR_OLD/out/align_rep2/*.flagstat.qc $WORK_DIR/$SUFFIX/out/align/rep2/ 119 | mkdir -p $WORK_DIR/$SUFFIX/out/align/ctl1; cp -pr $WORK_DIR_OLD/out/align_ctl_rep1/*.bam $WORK_DIR/$SUFFIX/out/align/ctl1/ 120 | mkdir -p $WORK_DIR/$SUFFIX/out/align/ctl1; cp -pr $WORK_DIR_OLD/out/align_ctl_rep1/*.flagstat.qc $WORK_DIR/$SUFFIX/out/align/ctl1/ 121 | mkdir -p $WORK_DIR/$SUFFIX/out/align/ctl2; cp -pr $WORK_DIR_OLD/out/align_ctl_rep2/*.bam $WORK_DIR/$SUFFIX/out/align/ctl2/ 122 | mkdir -p $WORK_DIR/$SUFFIX/out/align/ctl2; cp -pr $WORK_DIR_OLD/out/align_ctl_rep2/*.flagstat.qc $WORK_DIR/$SUFFIX/out/align/ctl2/ 123 | find . -name '*.nmsrt.bam' -delete 124 | find . 
-name '*.nodup.bam' -delete 125 | 126 | -------------------------------------------------------------------------------- /species/scg.conf: -------------------------------------------------------------------------------- 1 | [mm9] 2 | chrsz = /reference/ENCODE/pipeline_genome_data/mm9/mm9.chrom.sizes 3 | seq_dir = /reference/ENCODE/pipeline_genome_data/mm9/seq 4 | gensz = mm 5 | umap = /reference/ENCODE/pipeline_genome_data/mm9/globalmap_k20tok54 6 | bwa_idx = /reference/ENCODE/pipeline_genome_data/mm9/bwa_index/mm9.fa 7 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/mm9/bowtie2_index/mm9.fa 8 | ref_fa = /reference/ENCODE/pipeline_genome_data/mm9/mm9.fa 9 | blacklist = /reference/ENCODE/pipeline_genome_data/mm9/mm9-blacklist.bed.gz 10 | # data for ATAQC 11 | tss_enrich = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/mm9_gencode_tss_unique.bed.gz 12 | dnase = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/mm9_univ_dhs_ucsc.from_mm10.bed.gz 13 | prom = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/tss_mm9_master.from_mm10.bed.gz 14 | enh = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/mm9_enh_dhs_ucsc.from_mm10.bed.gz 15 | reg2map = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/dnase_avgs_merged_named.fseq.vals.gz 16 | reg2map_bed = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/mm9_dhs_universal_ucsc_v1.bed.gz 17 | roadmap_meta = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/accession_to_name.txt 18 | 19 | [mm10, mm10_ENCODE, mm10_ENCODE3] # from ENCODE portal 20 | chrsz = /reference/ENCODE/pipeline_genome_data/mm10/mm10.chrom.sizes 21 | seq_dir = /reference/ENCODE/pipeline_genome_data/mm10/seq 22 | gensz = mm 23 | bwa_idx = /reference/ENCODE/pipeline_genome_data/mm10/bwa_index/mm10_no_alt_analysis_set_ENCODE.fasta 24 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/mm10/bowtie2_index/mm10_no_alt_analysis_set_ENCODE.fasta 25 | ref_fa = /reference/ENCODE/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta 26 | blacklist = 
/reference/ENCODE/pipeline_genome_data/mm10/mm10.blacklist.bed.gz 27 | # data for ATAQC 28 | tss_enrich = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_gencode_tss_unique.bed.gz 29 | dnase = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_univ_dhs_ucsc.bed.gz 30 | prom = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/tss_mm10_master.bed.gz 31 | enh = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_enh_dhs_ucsc.bed.gz 32 | reg2map = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_formatted.txt.gz 33 | reg2map_bed = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_celltype_compare_subsample.bed.gz 34 | roadmap_meta = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_metadata.txt 35 | ENCODE_assembly = mm10 36 | 37 | [hg19] 38 | chrsz = /reference/ENCODE/pipeline_genome_data/hg19/hg19.chrom.sizes 39 | seq_dir = /reference/ENCODE/pipeline_genome_data/hg19/seq 40 | gensz = hs 41 | umap = /reference/ENCODE/pipeline_genome_data/hg19/globalmap_k20tok54 42 | bwa_idx = /reference/ENCODE/pipeline_genome_data/hg19/bwa_index/male.hg19.fa 43 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/hg19/bowtie2_index/male.hg19.fa 44 | ref_fa = /reference/ENCODE/pipeline_genome_data/hg19/male.hg19.fa 45 | blacklist = /reference/ENCODE/pipeline_genome_data/hg19/wgEncodeDacMapabilityConsensusExcludable.bed.gz 46 | # data for ATAQC 47 | tss_enrich = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/hg19_gencode_tss_unique.bed.gz 48 | dnase = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.bed.gz 49 | prom = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_prom_p2.bed.gz 50 | enh = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_enh_p2.bed.gz 51 | reg2map = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/dnase_avgs_reg2map_p10_merged_named.pvals.gz 52 | roadmap_meta = 
/reference/ENCODE/pipeline_genome_data/hg19/ataqc/eid_to_mnemonic.txt 53 | 54 | [hg38, hg38_ENCODE, hg38_ENCODE3] # from ENCODE portal 55 | chrsz = /reference/ENCODE/pipeline_genome_data/hg38/hg38.chrom.sizes 56 | seq_dir = /reference/ENCODE/pipeline_genome_data/hg38/seq 57 | gensz = hs 58 | bwa_idx = /reference/ENCODE/pipeline_genome_data/hg38/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 59 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/hg38/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 60 | ref_fa = /reference/ENCODE/pipeline_genome_data/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 61 | blacklist = /reference/ENCODE/pipeline_genome_data/hg38/hg38.blacklist.bed.gz 62 | # data for ATAQC 63 | tss_enrich = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/hg38_gencode_tss_unique.bed.gz 64 | dnase = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz 65 | prom = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz 66 | enh = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz 67 | reg2map = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz 68 | reg2map_bed = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz 69 | roadmap_meta = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt 70 | ENCODE_assembly = GRCh38 71 | 72 | [dm3] # installed by install_genome_data.sh 73 | chrsz = /reference/ENCODE/pipeline_genome_data/dm3/dm3.chrom.sizes 74 | seq_dir = /reference/ENCODE/pipeline_genome_data/dm3/seq 75 | gensz = 168736537 76 | bwa_idx = /reference/ENCODE/pipeline_genome_data/dm3/bwa_index/dm3.fa 77 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/dm3/bowtie2_index/dm3.fa 78 | ref_fa = /reference/ENCODE/pipeline_genome_data/dm3/dm3.fa 79 | 80 | 
[pantro5] # installed by install_genome_data.sh 81 | chrsz = /reference/ENCODE/pipeline_genome_data/pantro5/pantro5.chrom.sizes 82 | seq_dir = /reference/ENCODE/pipeline_genome_data/pantro5/seq 83 | gensz = 3231170666 84 | bwa_idx = /reference/ENCODE/pipeline_genome_data/pantro5/bwa_index/panTro5.fa 85 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/pantro5/bowtie2_index/panTro5.fa 86 | ref_fa = /reference/ENCODE/pipeline_genome_data/pantro5/panTro5.fa 87 | 88 | [macam7] # installed by install_genome_data.sh 89 | chrsz = /reference/ENCODE/pipeline_genome_data/macam7/macam7.chrom.sizes 90 | seq_dir = /reference/ENCODE/pipeline_genome_data/macam7/seq 91 | gensz = 2817542206 92 | bwa_idx = /reference/ENCODE/pipeline_genome_data/macam7/bwa_index/MacaM_Rhesus_Genome_v7.fasta 93 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/macam7/bowtie2_index/MacaM_Rhesus_Genome_v7.fasta 94 | ref_fa = /reference/ENCODE/pipeline_genome_data/macam7/MacaM_Rhesus_Genome_v7.fasta 95 | nonamecheck = true # for bedtools >= 2.24. 
this prevents name convention error in bedtools intersect 96 | 97 | [saccer3] # installed by install_genome_data.sh 98 | chrsz = /reference/ENCODE/pipeline_genome_data/saccer3/saccer3.chrom.sizes 99 | seq_dir = /reference/ENCODE/pipeline_genome_data/saccer3/seq 100 | gensz = 12157105 101 | bwa_idx = /reference/ENCODE/pipeline_genome_data/saccer3/bwa_index/sacCer3.fa 102 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/saccer3/bowtie2_index/sacCer3.fa 103 | ref_fa = /reference/ENCODE/pipeline_genome_data/saccer3/sacCer3.fa 104 | 105 | -------------------------------------------------------------------------------- /modules/align_bwa.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == align bwa settings (requirements: -bwa_idx) 9 | param_bwa_aln := "-q 5 -l 32 -k 2" help Parameters for bwa aln (default: "-q 5 -l 32 -k 2"). 10 | bwa_idx := "" help BWA index (full path prefix of *.bwt file) . 11 | wt_bwa := "47h" help Walltime for bwa (default: 47, 47:00:00). 12 | mem_bwa := "12G" help Max. memory for bwa (default: 12G). 13 | 14 | 15 | grp_color_bwa := "salmon" 16 | 17 | 18 | init_align_bwa() 19 | 20 | 21 | void init_align_bwa() { 22 | 23 | param_bwa_aln = get_conf_val( param_bwa_aln, ["param_bwa_aln"] ) 24 | bwa_idx = get_conf_val( bwa_idx, ["bwa_idx"] ) 25 | wt_bwa = get_conf_val( wt_bwa, ["wt_bwa"] ) 26 | mem_bwa = get_conf_val( mem_bwa, ["mem_bwa"] ) 27 | 28 | print("\n\n== align bwa settings\n") 29 | print( "Param. for bwa\t\t\t: $param_bwa_aln\n") 30 | print( "BWA index\t\t\t: $bwa_idx\n" ) 31 | print( "Walltime (bwa)\t\t\t: $wt_bwa\n") 32 | print( "Max. memory (bwa)\t\t: $mem_bwa\n") 33 | } 34 | 35 | void chk_align_bwa() { 36 | 37 | if ( !path_exists("$bwa_idx.bwt") ) error("\nBwa index (-bwa_idx) doesn't exists!
(file: $bwa_idx.bwt)\n") 38 | } 39 | 40 | string[] bwa( string fastq, string o_dir, string log_o_dir, string group, int nth_bwa ) { 41 | 42 | prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir ) 43 | prefix2 := replace_dir( prefix, log_o_dir ) 44 | bam := "$prefix.bam" 45 | qc := "$prefix2.flagstat.qc" 46 | 47 | in := [ fastq ] 48 | out := [ bam, qc ] 49 | 50 | if ( out <- in ) { // compare file timestamps of in and out (to check if job is already done or not) 51 | 52 | sai := bwa_aln( fastq, o_dir, group, nth_bwa ) 53 | wait 54 | 55 | bwa_sam( fastq, sai, o_dir, log_o_dir, group, nth_bwa ) 56 | wait 57 | 58 | sai.rm() // delete intermediate file sai 59 | } 60 | 61 | add_task_to_graph( in, out, group, "BWA\\n(SE)", grp_color_bwa ) 62 | 63 | return out 64 | } 65 | 66 | string[] bwa_PE( string fastq1, string fastq2, string o_dir, string log_o_dir, string group, int nth_bwa ) { 67 | 68 | prefix := replace_dir( rm_ext( fastq1, ["fastq","fq"] ), o_dir ) + ".PE2SE" 69 | prefix2 := replace_dir( prefix, log_o_dir ) 70 | bam := "$prefix.bam" 71 | qc := "$prefix2.flagstat.qc" 72 | 73 | in := [ fastq1, fastq2 ] 74 | out := [ bam, qc ] 75 | 76 | if ( out <- in ) { // compare file timestamps of in and out (to check if job is already done or not) 77 | 78 | nth_bwa_aln := distribute_nonzero( nth_bwa, [1,1] ) 79 | 80 | // parallel jobs 81 | sai1 := bwa_aln( fastq1, o_dir, group+"_1", nth_bwa_aln[0] ) 82 | sai2 := bwa_aln( fastq2, o_dir, group+"_2", nth_bwa_aln[1] ) 83 | 84 | wait 85 | 86 | bwa_sam_PE( fastq1, fastq2, sai1, sai2, o_dir, log_o_dir, group, nth_bwa ) 87 | wait 88 | 89 | sai1.rm() // delete intermediate file sai1, sai2 90 | sai2.rm() 91 | } 92 | 93 | add_task_to_graph( in, out, group, "BWA\\n(PE)", grp_color_bwa ) 94 | 95 | return out 96 | } 97 | 98 | string bwa_aln( string fastq, string o_dir, string group, int nth_bwa ) { 99 | 100 | prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir ) 101 | sai := "$prefix.sai" 102 | 103 | in := [ fastq ] 104 | 
out := sai 105 | 106 | taskName:= "bwa_aln " + group 107 | cpus := (nth_bwa==1) ? -1 : nth_bwa; mem := get_res_mem(mem_bwa,nth_bwa); timeout := get_res_wt(wt_bwa) 108 | 109 | wait_par( cpus ) 110 | 111 | tid := task( out<-in ) { 112 | 113 | sys $shcmd_init 114 | 115 | //# Map reads to create raw SAM file 116 | sys bwa aln $param_bwa_aln -t $nth_bwa $bwa_idx $fastq > $sai 117 | 118 | sys $shcmd_finalize 119 | } 120 | 121 | register_par( tid, cpus ) 122 | 123 | add_task_to_graph( in, out, group ) 124 | 125 | return out 126 | } 127 | 128 | string[] bwa_sam( string fastq, string sai, string o_dir, string log_o_dir, string group, int nth_bwa ) { 129 | 130 | prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir ) 131 | prefix2 := replace_dir( prefix, log_o_dir ) 132 | bam := "$prefix.bam" 133 | qc := "$prefix2.flagstat.qc" 134 | 135 | in := [ fastq, sai ] 136 | out := [ bam, qc ] 137 | 138 | taskName:= "bwa_sam " + group 139 | cpus := nth_bwa; mem := get_res_mem(mem_bwa,nth_bwa); timeout := get_res_wt(wt_bwa) 140 | 141 | wait_par( cpus ) 142 | 143 | tid := task( out<-in ) { 144 | 145 | sys $shcmd_init 146 | 147 | sys bwa samse $bwa_idx $sai $fastq | samtools view -Su - | samtools sort - $prefix 148 | sys samtools index $bam 149 | sys samtools flagstat $bam > $qc 150 | //sys bwa samse $bwa_idx $sai $fastq | samtools view -Su /dev/stdin \ 151 | // | sambamba sort -t 1 /dev/stdin -o $bam 152 | //sys sambamba flagstat -t 1 $bam > $qc 153 | 154 | sys $shcmd_finalize 155 | } 156 | 157 | register_par( tid, cpus ) 158 | 159 | add_task_to_graph( in, out, group ) 160 | 161 | return out 162 | } 163 | 164 | string[] bwa_sam_PE( string fastq1, string fastq2, string sai1, string sai2, string o_dir, string log_o_dir, string group, int nth_bwa ) { 165 | 166 | prefix := replace_dir( rm_ext( fastq1, ["fastq","fq"] ), o_dir ) + ".PE2SE" 167 | prefix2 := replace_dir( prefix, log_o_dir ) 168 | sam := "$prefix.sam.gz" 169 | badcigar:= "$prefix.badReads" 170 | bam := "$prefix.bam" 171 
| qc := "$prefix2.flagstat.qc" 172 | 173 | in := [ fastq1, fastq2, sai1, sai2 ] 174 | out := [ bam, qc ] 175 | 176 | taskName:= "bwa_sam_PE " + group 177 | cpus := nth_bwa; mem := get_res_mem(mem_bwa,nth_bwa); timeout := get_res_wt(wt_bwa) 178 | 179 | wait_par( cpus ) 180 | 181 | tid := task( out<-in ) { 182 | 183 | sys $shcmd_init 184 | 185 | sys bwa sampe $bwa_idx $sai1 $sai2 $fastq1 $fastq2 | pigz -p $nth_bwa -nc > $sam 186 | 187 | //# Remove read pairs with bad CIGAR strings and sort by position 188 | 189 | //# Find bad CIGAR read names 190 | //sys zcat $sam \ 191 | // | awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t"; }' \ 192 | // | sort | uniq > $badcigar 193 | 194 | sys pigz -p $nth_bwa -cd $sam \ 195 | | awk 'BEGIN {FS="\t" ; OFS="\t"} ! /^@/ && $6!="*" { cigar=$6; gsub("[0-9]+D","",cigar); n = split(cigar,vals,"[A-Z]"); s = 0; for (i=1;i<=n;i++) s=s+vals[i]; seqlen=length($10) ; if (s!=seqlen) print $1"\t"; }' \ 196 | | sort | uniq > $badcigar 197 | 198 | //# Remove bad CIGAR read pairs 199 | sys if [ $(cat $badcigar | wc -l) -gt 0 ]; then \ 200 | zcat $sam | grep -v -F -f $badcigar | samtools view -Su - | samtools sort - $prefix; \ 201 | else \ 202 | samtools view -Su $sam | samtools sort - $prefix; \ 203 | fi 204 | //sys if [ $(cat $badcigar | wc -l) -gt 0 ]; then \ 205 | // pigz -p $nth_bwa -cd $sam | grep -v -F -f $badcigar | samtools view -Su /dev/stdin \ 206 | // | sambamba sort -t 1 /dev/stdin -o $bam; \ 207 | // else \ 208 | // pigz -p $nth_bwa -cd $sam | samtools view -Su /dev/stdin | sambamba sort -t 1 /dev/stdin -o $bam; \ 209 | // fi 210 | 211 | sys samtools flagstat $bam > $qc 212 | sys samtools index $bam 213 | 214 | //sys sambamba flagstat -t 1 > $qc 215 | 216 | sys rm -f $badcigar $sam 217 | 218 | sys $shcmd_finalize 219 | } 220 | 221 | register_par( tid, cpus ) 222 | 223 | 
add_task_to_graph( in, out, group ) 224 | 225 | return out 226 | } 227 | -------------------------------------------------------------------------------- /modules/conf.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "sys.bds" 5 | 6 | 7 | help == configuration file settings 8 | c := "" help Configuration file path. 9 | env := "$script_dir/default.env" help Environment file path. 10 | 11 | 12 | string{} conf // map for configuration 13 | 14 | 15 | init_conf() 16 | 17 | 18 | void init_conf() { 19 | if ( is_cmd_line_arg_empty() ) \ 20 | print( "\nWarning: No parameters are given (specify cmd. line arguments or configuration file)!\n\n") 21 | if ( is_first_arg_conf() ) c = args[0] 22 | 23 | add_to_conf( c, "" ) // then read conf. file 24 | env = get_conf_val( env, ["env"] ) 25 | if ( path_exists( env ) ) add_to_conf( env, hostname ) 26 | add_to_conf( c, "" ) // read conf again to override 27 | 28 | print( "\n\n== configuration file info\n") 29 | print( "Hostname\t\t\t: $hostname\n") 30 | print( "Configuration file\t\t: $c\n" ) 31 | print( "Environment file\t\t: $env\n" ) 32 | } 33 | 34 | string{} read_conf( string file, string section ) { 35 | section = section.trim() 36 | string{} ret 37 | 38 | if ( file == "" ) return ret 39 | lines := file.read().split("\n") 40 | 41 | can_read := (section=="") ? true : false 42 | found_section := (section=="") ? 
true : false 43 | for ( string line : lines ) { 44 | line = rm_comment( line.trim() ) 45 | if ( line == "" ) continue 46 | 47 | if ( line.startsWith( "[" ) && line.endsWith( "]" ) ) { 48 | line2 := line.substr(1,line.length()-1) 49 | string[] hostnames 50 | string group 51 | // find group if exists 52 | arr := line2.split(":") 53 | if ( arr.size() > 1 ) group = arr[1].trim() 54 | hostnames = arr[0].split(",") 55 | if ( section == "" ) { 56 | can_read = false 57 | } 58 | else { 59 | for ( string host : hostnames ) { 60 | host = host.trim() 61 | if ( match_str( section, host ) ) { // one asterisk (wildcard chr: *) is allowed in hostname string 62 | if ( section == group ) { 63 | error("Recursion (section name == group) found in a conf. or an env. file!"+\ 64 | " (file: $file, section: $section, group: $group)\n") 65 | } 66 | else if ( group != "" ) { 67 | print("\tReading parameters from section group($group) in file($file)...\n") 68 | return read_conf( file, group ) 69 | } 70 | else { 71 | print("\tReading parameters from section ($host) in file($file)...\n") 72 | found_section = true 73 | can_read = true 74 | break; 75 | } 76 | } 77 | else { 78 | can_read = false 79 | } 80 | } 81 | } 82 | continue 83 | } 84 | 85 | if ( can_read ) { 86 | string key, val 87 | (key, val) = parse_conf_line( line ) 88 | ret{ key } = val 89 | } 90 | } 91 | if ( !found_section && section != "default" ) return read_conf( file, "default" ) 92 | 93 | return ret 94 | } 95 | 96 | string{} read_conf( string file ) { 97 | return read_conf( file, "" ) 98 | } 99 | 100 | void add_to_conf( string file, string section ) { 101 | 102 | tmp := read_conf( file, section ) 103 | 104 | for( string k : tmp.keys() ) conf{k} = tmp{k} 105 | } 106 | 107 | void add_to_conf( string file ) { 108 | tmp := read_conf( file ) 109 | for( string k : tmp.keys() ) { 110 | conf{k} = tmp{k} 111 | } 112 | } 113 | 114 | string[] parse_conf_line( string line ) { 115 | delims := [ "=", "\t" ] 116 | delim_found := false 117 | 
string key, val 118 | for ( string delim : delims ) { 119 | idx := line.indexOf( delim ) 120 | if ( idx > -1 ) { 121 | key = line.substr( 0, idx ).trim().toLower() 122 | val = line.substr( idx+1 ).trim() 123 | delim_found = true 124 | break 125 | } 126 | } 127 | if ( !delim_found ) error("No delimiter (=,\\t) found in line ($line) in the configruation file.\n") 128 | return [key, val] 129 | } 130 | 131 | int get_conf_val_int( int curr_val, string key ) { 132 | string{} tmp 133 | return parse_int( get_conf_val( curr_val, key, tmp ) ) 134 | } 135 | 136 | int get_conf_val_int( int curr_val, string[] keys ) { 137 | string{} tmp 138 | return parse_int( get_conf_val( curr_val, keys, tmp ) ) 139 | } 140 | 141 | bool get_conf_val_bool( bool curr_val, string key ) { 142 | string{} tmp 143 | return parse_bool( get_conf_val( curr_val, key, tmp ) ) 144 | } 145 | 146 | bool get_conf_val_bool( bool curr_val, string[] keys ) { 147 | string{} tmp 148 | return parse_bool( get_conf_val( curr_val, keys, tmp ) ) 149 | } 150 | 151 | real get_conf_val_real( real curr_val, string key ) { 152 | string{} tmp 153 | return parse_real( get_conf_val( curr_val, key, tmp ) ) 154 | } 155 | 156 | real get_conf_val_real( real curr_val, string[] keys ) { 157 | string{} tmp 158 | return parse_real( get_conf_val( curr_val, keys, tmp ) ) 159 | } 160 | 161 | int get_conf_val_int( int curr_val, string key, string{} _conf ) { 162 | return parse_int( get_conf_val( curr_val, key, _conf ) ) 163 | } 164 | 165 | int get_conf_val_int( int curr_val, string[] keys, string{} _conf ) { 166 | return parse_int( get_conf_val( curr_val, keys, _conf ) ) 167 | } 168 | 169 | bool get_conf_val_bool( bool curr_val, string key, string{} _conf ) { 170 | return parse_bool( get_conf_val( curr_val, key, _conf ) ) 171 | } 172 | 173 | bool get_conf_val_bool( bool curr_val, string[] keys, string{} _conf ) { 174 | return parse_bool( get_conf_val( curr_val, keys, _conf ) ) 175 | } 176 | 177 | real get_conf_val_real( real curr_val, 
string key, string{} _conf ) { 178 | return parse_real( get_conf_val( curr_val, key, _conf ) ) 179 | } 180 | 181 | real get_conf_val_real( real curr_val, string[] keys, string{} _conf ) { 182 | return parse_real( get_conf_val( curr_val, keys, _conf ) ) 183 | } 184 | 185 | string get_conf_val( string curr_val, string key, string{} _conf ) { 186 | key = key.toLower().trim() 187 | if ( cmd_line_arg_has_key( key ) ) return curr_val 188 | if ( _conf.size() == 0 ) { 189 | if ( conf.hasKey( key ) ) { 190 | return (conf{ key } != "") ? substitute_var( rm_comment( conf{ key } ) ) : curr_val 191 | } 192 | } 193 | else { 194 | if ( _conf.hasKey( key ) ) { 195 | return (_conf{ key } != "") ? substitute_var( rm_comment( _conf{ key } ) ) : curr_val 196 | } 197 | } 198 | return curr_val 199 | } 200 | 201 | string substitute_var( string var ) { 202 | var = var.replace("\$script_dir","$script_dir").replace("\${script_dir}","$script_dir") 203 | var = var.replace("~/","$HOME/").replace("\$HOME","$HOME").replace("\${HOME}","$HOME") 204 | return var 205 | } 206 | 207 | string get_conf_val( string curr_val, string[] keys, string{} _conf ) { 208 | for ( string key : keys ) { 209 | val := get_conf_val( curr_val, key, _conf ) 210 | if ( val != curr_val ) return val 211 | } 212 | return curr_val 213 | } 214 | 215 | string get_conf_val( string curr_val, string key ) { 216 | string{} tmp 217 | return get_conf_val( curr_val, key, tmp ) 218 | } 219 | 220 | string get_conf_val( string curr_val, string[] keys ) { 221 | string{} tmp 222 | return get_conf_val( curr_val, keys, tmp ) 223 | } 224 | 225 | bool has_conf_key( string key, string{} _conf ) { 226 | key = key.toLower() 227 | return (_conf.size()==0) ? 
conf.hasKey( key ) : _conf.hasKey( key ) 228 | } 229 | 230 | bool has_conf_key( string key ) { 231 | string{} tmp 232 | return has_conf_key( key, tmp ) // FIX: was has_conf_key( key ) — the 1-arg overload called itself, infinite recursion; pass the empty map to reach the 2-arg overload (same wrapper pattern as get_conf_val) 233 | } 234 | 235 | bool conf_file_exists() { 236 | if ( c!="" ) return c.exists() 237 | return false 238 | } 239 | 240 | bool has_key_in_conf_or_cmd_line( string key ) { 241 | return cmd_line_arg_has_key( key )// || has_conf_key( key ) 242 | } 243 | 244 | -------------------------------------------------------------------------------- /modules/input.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "input_fastq.bds" 5 | include "input_bam.bds" 6 | include "input_tagalign.bds" 7 | include "input_peak.bds" 8 | 9 | 10 | help == input endedness settings (SE or PE) : 11 | se := false help Singled-ended data set. To specify it for each replicate, '-se[REP_ID]' for exp. reps, '-ctl_se[CTL_ID]' for control. 12 | pe := false help Paired end data set. To specify it for each replicate, '-pe[REP_ID]' for exp. reps, '-ctl_pe[CTL_ID]' for controls. 13 | 14 | default_is_pe := false // default is se 15 | 16 | 17 | init_input() 18 | 19 | void init_input() { 20 | se = get_conf_val_bool( se, ["se"] ) 21 | pe = get_conf_val_bool( pe, ["pe"] ) 22 | } 23 | 24 | //// ctl==0: exp. replicate, ctl==1: control 25 | 26 | void chk_input( bool true_rep, bool no_pseudo_rep ) { 27 | if ( is_input_peak() ) { 28 | 29 | chk_input_peak( true_rep, no_pseudo_rep ) 30 | return 31 | } 32 | print( "\n\n== checking input files ...\n\n" ); 33 | 34 | string[] data_all 35 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 36 | if ( ctl==1 && !ctl_exists() ) continue 37 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 38 | string[] data 39 | 40 | prefix := (ctl==1) ? "Control " : "" 41 | suffix := is_pe( ctl, rep ) ? 
" (PE)" : " (SE)" 42 | 43 | if ( is_input_fastq( ctl, rep ) ) { 44 | prefix = prefix + "Rep$rep fastq" + suffix 45 | fastqs := get_fastqs( ctl, rep ) 46 | if ( fastqs.size()==0 ) { 47 | data.push( "" ) 48 | } 49 | else { 50 | for ( string fastq : fastqs ) data.push( fastq ) 51 | } 52 | } 53 | else if ( is_input_bam( ctl, rep ) ) { 54 | prefix = prefix +"Rep$rep bam" + suffix 55 | data.push( get_bam( ctl, rep ) ) 56 | } 57 | else if ( is_input_filt_bam( ctl, rep ) ) { 58 | prefix = prefix +"Rep$rep filt_bam" + suffix 59 | data.push( get_filt_bam( ctl, rep ) ) 60 | } 61 | else if ( is_input_tag( ctl, rep ) ) { 62 | prefix = prefix + "Rep$rep tagalign" + suffix 63 | data.push( get_tag( ctl, rep ) ) 64 | } 65 | 66 | print("$prefix :\n") 67 | for ( string s : data ) { 68 | print("\t$s\n") 69 | if ( (s != "") && !path_exists(s) ) error("\t\tFile not found!\n") 70 | } 71 | 72 | // if data is missing 73 | if ( data[0] == "" ) { 74 | if ( (rep>=2) && (ctl==1) ) \ 75 | print( "\tWarning: $prefix missing! using control 1 for calling peaks on replicate $rep\n") 76 | else if ( (rep==2) && (ctl==0) ) \ 77 | print( "\tWarning: $prefix missing! peak will be called for replicate 1 only\n") 78 | else \ 79 | error( "\t$prefix missing!\n") 80 | continue 81 | } 82 | // check any duplicate input filename 83 | for ( string s : data ) { 84 | if ( is_in_array( get_basename( s ), get_basename( data_all ) ) ) \ 85 | error( "\t$prefix has duplicate filename!\n") 86 | } 87 | data_all = merge( data_all, data ) 88 | } 89 | } 90 | } 91 | 92 | string[] get_input_files( int ctl, int rep ) { 93 | string[] empty 94 | 95 | if ( is_input_fastq( ctl, rep ) ) { 96 | return get_fastqs( ctl, rep ) 97 | } 98 | else if ( is_input_bam( ctl, rep ) ) { 99 | bam := get_bam( ctl, rep ) 100 | return bam=="" ? empty : [bam] 101 | } 102 | else if ( is_input_filt_bam( ctl, rep ) ) { 103 | filt_bam := get_filt_bam( ctl, rep ) 104 | return filt_bam=="" ? 
empty : [filt_bam] 105 | } 106 | else if ( is_input_tag( ctl, rep ) ) { 107 | tag := get_tag( ctl, rep ) 108 | return tag=="" ? empty : [tag] 109 | } 110 | else { 111 | return empty 112 | } 113 | } 114 | 115 | string[] get_input_files( int rep ) { 116 | return get_input_files( 0, rep ) 117 | } 118 | 119 | bool input_file_exists( int ctl, int rep ) { 120 | string[] input_files = get_input_files( ctl, rep ) 121 | return input_files.size() > 0 122 | } 123 | 124 | bool input_file_exists( int rep ) { 125 | return input_file_exists( 0, rep ) 126 | } 127 | 128 | int get_num_rep( int ctl ) { 129 | rep := 1 130 | while( get_input_files( ctl, rep ).size() > 0 ) rep++ 131 | 132 | num_rep := rep-1 133 | return num_rep 134 | } 135 | 136 | int get_num_rep() { 137 | return is_input_peak() ? get_num_rep_peak() : get_num_rep( 0 ) 138 | } 139 | 140 | bool is_pe( int ctl, int rep ) { 141 | if ( pe ) return true 142 | if ( se ) return false 143 | 144 | key_pe := ( ctl > 0 ? "ctl_pe" : "pe" ) + rep 145 | key_pe_ctl := "ctl_pe" 146 | key_se := ( ctl > 0 ? 
"ctl_se" : "se" ) + rep 147 | 148 | if ( cmd_line_arg_has_key( key_pe ) ) { 149 | return true 150 | } 151 | else if ( cmd_line_arg_has_key( key_se ) ) { 152 | return false 153 | } 154 | else if ( ctl==1 && cmd_line_arg_has_key( key_pe_ctl ) ) { 155 | return true 156 | } 157 | else { 158 | if ( conf.hasKey( key_pe ) && parse_bool( conf{ key_pe } ) ) return true 159 | if ( conf.hasKey( key_se ) && parse_bool( conf{ key_se } ) ) return false 160 | if ( ctl==1 && conf.hasKey( key_pe_ctl ) && parse_bool( conf{ key_pe_ctl } ) ) return true 161 | } 162 | 163 | if ( is_input_fastq( ctl, rep ) ) { 164 | fastqs := get_fastq( ctl, rep, 2 ) 165 | return fastqs.size() > 0 166 | } 167 | 168 | if ( default_is_pe ) return true 169 | else return false 170 | } 171 | 172 | bool is_se( int ctl, int rep ) { 173 | return !is_pe( ctl, rep ) 174 | } 175 | 176 | bool is_pe( int rep ) { 177 | return is_pe( 0, rep ) 178 | } 179 | 180 | bool is_se( int rep ) { 181 | return !is_pe( 0, rep ) 182 | } 183 | 184 | bool has_input_fastq() { 185 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 186 | if ( ctl==1 && !ctl_exists() ) continue 187 | 188 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 189 | if ( is_input_fastq( ctl, rep ) ) return true 190 | } 191 | } 192 | return false 193 | } 194 | 195 | bool has_pe_input_fastq() { 196 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 197 | if ( ctl==1 && !ctl_exists() ) continue 198 | 199 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 200 | if ( is_input_fastq( ctl, rep ) && is_pe( ctl, rep ) ) return true 201 | } 202 | } 203 | return false 204 | } 205 | 206 | bool has_pe_input_tag( int ctl ) { 207 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 208 | 209 | if ( is_input_tag( ctl, rep ) && is_pe( ctl, rep ) ) return true 210 | } 211 | return false 212 | } 213 | 214 | bool has_pe_input_tag() { 215 | return has_pe_input_tag( 0 ) 216 | } 217 
| 218 | bool has_pe() { 219 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 220 | if ( ctl==1 && !ctl_exists() ) continue 221 | 222 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 223 | if ( is_pe( ctl, rep ) ) return true 224 | } 225 | } 226 | return false 227 | } 228 | 229 | bool has_se() { 230 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 231 | if ( ctl==1 && !ctl_exists() ) continue 232 | 233 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 234 | if ( !is_pe( ctl, rep ) ) return true 235 | } 236 | } 237 | return false 238 | } 239 | 240 | bool ctl_exists() { 241 | return input_file_exists( 1, 1 ) 242 | } 243 | 244 | string get_long_group_name( int ctl, int rep ) { 245 | return ( (ctl>0) ? "Control " : "Replicate ") + rep 246 | } 247 | 248 | string get_long_group_name( int rep ) { 249 | return "Replicate "+ rep 250 | } 251 | 252 | string get_group_name( int ctl, int rep ) { 253 | return ( (ctl>0) ? 
"ctl" : "rep") + rep 254 | } 255 | 256 | string get_group_name( int rep ) { 257 | return "rep" + rep 258 | } 259 | -------------------------------------------------------------------------------- /species/sherlock.conf: -------------------------------------------------------------------------------- 1 | [mm9] 2 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/mm9/mm9.chrom.sizes 3 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/mm9/seq 4 | gensz = mm 5 | umap = /home/groups/cherry/encode/pipeline_genome_data/mm9/globalmap_k20tok54 6 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/mm9/bwa_index/mm9.fa 7 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/mm9/bowtie2_index/mm9.fa 8 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/mm9/mm9.fa 9 | blacklist = /home/groups/cherry/encode/pipeline_genome_data/mm9/mm9-blacklist.bed.gz 10 | # data for ATAQC 11 | tss_enrich = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/mm9_gencode_tss_unique.bed.gz 12 | dnase = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/mm9_univ_dhs_ucsc.from_mm10.bed.gz 13 | prom = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/tss_mm9_master.from_mm10.bed.gz 14 | enh = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/mm9_enh_dhs_ucsc.from_mm10.bed.gz 15 | reg2map = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/dnase_avgs_merged_named.fseq.vals.gz 16 | reg2map_bed = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/mm9_dhs_universal_ucsc_v1.bed.gz 17 | roadmap_meta = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/accession_to_name.txt 18 | 19 | [mm10, mm10_ENCODE, mm10_ENCODE3] # from ENCODE portal 20 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/mm10/mm10.chrom.sizes 21 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/mm10/seq 22 | gensz = mm 23 | bwa_idx = 
/home/groups/cherry/encode/pipeline_genome_data/mm10/bwa_index/mm10_no_alt_analysis_set_ENCODE.fasta 24 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/mm10/bowtie2_index/mm10_no_alt_analysis_set_ENCODE.fasta 25 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta 26 | blacklist = /home/groups/cherry/encode/pipeline_genome_data/mm10/mm10.blacklist.bed.gz 27 | # data for ATAQC 28 | tss_enrich = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_gencode_tss_unique.bed.gz 29 | dnase = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_univ_dhs_ucsc.bed.gz 30 | prom = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/tss_mm10_master.bed.gz 31 | enh = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_enh_dhs_ucsc.bed.gz 32 | reg2map = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_formatted.txt.gz 33 | reg2map_bed = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_celltype_compare_subsample.bed.gz 34 | roadmap_meta = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_metadata.txt 35 | ENCODE_assembly = mm10 36 | 37 | [hg19] 38 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/hg19/hg19.chrom.sizes 39 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/hg19/seq 40 | gensz = hs 41 | umap = /home/groups/cherry/encode/pipeline_genome_data/hg19/globalmap_k20tok54 42 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/hg19/bwa_index/male.hg19.fa 43 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/hg19/bowtie2_index/male.hg19.fa 44 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/hg19/male.hg19.fa 45 | blacklist = /home/groups/cherry/encode/pipeline_genome_data/hg19/wgEncodeDacMapabilityConsensusExcludable.bed.gz 46 | # data for ATAQC 47 | tss_enrich = 
/home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/hg19_gencode_tss_unique.bed.gz 48 | dnase = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.bed.gz 49 | prom = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_prom_p2.bed.gz 50 | enh = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_enh_p2.bed.gz 51 | reg2map = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/dnase_avgs_reg2map_p10_merged_named.pvals.gz 52 | roadmap_meta = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/eid_to_mnemonic.txt 53 | 54 | [hg38, hg38_ENCODE, hg38_ENCODE3] # from ENCODE portal 55 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/hg38/hg38.chrom.sizes 56 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/hg38/seq 57 | gensz = hs 58 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/hg38/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 59 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/hg38/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 60 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 61 | blacklist = /home/groups/cherry/encode/pipeline_genome_data/hg38/hg38.blacklist.bed.gz 62 | # data for ATAQC 63 | tss_enrich = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/hg38_gencode_tss_unique.bed.gz 64 | dnase = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz 65 | prom = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz 66 | enh = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz 67 | reg2map = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz 68 | 
reg2map_bed = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz 69 | roadmap_meta = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt 70 | ENCODE_assembly = GRCh38 71 | 72 | [dm3] # installed by install_genome_data.sh 73 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/dm3/dm3.chrom.sizes 74 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/dm3/seq 75 | gensz = 168736537 76 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/dm3/bwa_index/dm3.fa 77 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/dm3/bowtie2_index/dm3.fa 78 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/dm3/dm3.fa 79 | 80 | [pantro5] # installed by install_genome_data.sh 81 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/pantro5/pantro5.chrom.sizes 82 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/pantro5/seq 83 | gensz = 3231170666 84 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/pantro5/bwa_index/panTro5.fa 85 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/pantro5/bowtie2_index/panTro5.fa 86 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/pantro5/panTro5.fa 87 | 88 | [macam7] # installed by install_genome_data.sh 89 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/macam7/macam7.chrom.sizes 90 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/macam7/seq 91 | gensz = 2817542206 92 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/macam7/bwa_index/MacaM_Rhesus_Genome_v7.fasta 93 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/macam7/bowtie2_index/MacaM_Rhesus_Genome_v7.fasta 94 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/macam7/MacaM_Rhesus_Genome_v7.fasta 95 | nonamecheck = true # for bedtools >= 2.24. 
this prevents name convention error in bedtools intersect 96 | 97 | [saccer3] # installed by install_genome_data.sh 98 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/saccer3/saccer3.chrom.sizes 99 | seq = /home/groups/cherry/encode/pipeline_genome_data/saccer3/seq 100 | gensz = 12157105 101 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/saccer3/bwa_index/sacCer3.fa 102 | bwt2_idx= /home/groups/cherry/encode/pipeline_genome_data/saccer3/bowtie2_index/sacCer3.fa 103 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/saccer3/sacCer3.fa 104 | 105 | -------------------------------------------------------------------------------- /species/kundaje.conf: -------------------------------------------------------------------------------- 1 | [mm9] 2 | chrsz = /mnt/data/pipeline_genome_data/mm9/mm9.chrom.sizes 3 | seq_dir = /mnt/data/pipeline_genome_data/mm9/seq 4 | gensz = mm 5 | umap = /mnt/data/pipeline_genome_data/mm9/globalmap_k20tok54 6 | bwa_idx = /mnt/data/pipeline_genome_data/mm9/bwa_index/mm9.fa 7 | bwt2_idx = /mnt/data/pipeline_genome_data/mm9/bowtie2_index/mm9.fa 8 | ref_fa = /mnt/data/pipeline_genome_data/mm9/mm9.fa 9 | blacklist = /mnt/data/pipeline_genome_data/mm9/mm9-blacklist.bed.gz 10 | # data for ATAQC 11 | tss_enrich = /mnt/data/pipeline_genome_data/mm9/ataqc/mm9_gencode_tss_unique.bed.gz 12 | dnase = /mnt/data/pipeline_genome_data/mm9/ataqc/mm9_univ_dhs_ucsc.from_mm10.bed.gz 13 | prom = /mnt/data/pipeline_genome_data/mm9/ataqc/tss_mm9_master.from_mm10.bed.gz 14 | enh = /mnt/data/pipeline_genome_data/mm9/ataqc/mm9_enh_dhs_ucsc.from_mm10.bed.gz 15 | reg2map = /mnt/data/pipeline_genome_data/mm9/ataqc/dnase_avgs_merged_named.fseq.vals.gz 16 | roadmap_meta = /mnt/data/pipeline_genome_data/mm9/ataqc/accession_to_name.txt 17 | 18 | [mm10, mm10_ENCODE, mm10_ENCODE3] # from ENCODE portal 19 | chrsz = /mnt/data/pipeline_genome_data/mm10/mm10.chrom.sizes 20 | seq_dir = /mnt/data/pipeline_genome_data/mm10/seq 21 | gensz = mm 22 | bwa_idx 
= /mnt/data/pipeline_genome_data/mm10/bwa_index/mm10_no_alt_analysis_set_ENCODE.fasta 23 | bwt2_idx = /mnt/data/pipeline_genome_data/mm10/bowtie2_index/mm10_no_alt_analysis_set_ENCODE.fasta 24 | ref_fa = /mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta 25 | blacklist = /mnt/data/pipeline_genome_data/mm10/mm10.blacklist.bed.gz 26 | # data for ATAQC 27 | tss_enrich = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_gencode_tss_unique.bed.gz 28 | dnase = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_univ_dhs_ucsc.bed.gz 29 | prom = /mnt/data/pipeline_genome_data/mm10/ataqc/tss_mm10_master.bed.gz 30 | enh = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_enh_dhs_ucsc.bed.gz 31 | reg2map = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_formatted.txt.gz 32 | reg2map_bed = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_celltype_compare_subsample.bed.gz 33 | roadmap_meta = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_metadata.txt 34 | ENCODE_assembly = mm10 35 | 36 | [hg19] 37 | chrsz = /mnt/data/pipeline_genome_data/hg19/hg19.chrom.sizes 38 | seq_dir = /mnt/data/pipeline_genome_data/hg19/seq 39 | gensz = hs 40 | umap = /mnt/data/pipeline_genome_data/hg19/globalmap_k20tok54 41 | bwa_idx = /mnt/data/pipeline_genome_data/hg19/bwa_index/male.hg19.fa 42 | bwt2_idx = /mnt/data/pipeline_genome_data/hg19/bowtie2_index/male.hg19.fa 43 | ref_fa = /mnt/data/pipeline_genome_data/hg19/male.hg19.fa 44 | blacklist = /mnt/data/pipeline_genome_data/hg19/wgEncodeDacMapabilityConsensusExcludable.bed.gz 45 | 46 | mappability_map_peakseq = /mnt/data/pipeline_genome_data/hg19/Mapability_HG.txt 47 | 48 | # data for ATAQC 49 | tss_enrich = /mnt/data/pipeline_genome_data/hg19/ataqc/hg19_gencode_tss_unique.bed.gz 50 | dnase = /mnt/data/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.bed.gz 51 | prom = /mnt/data/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_prom_p2.bed.gz 52 | enh = 
/mnt/data/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_enh_p2.bed.gz 53 | reg2map = /mnt/data/pipeline_genome_data/hg19/ataqc/dnase_avgs_reg2map_p10_merged_named.pvals.gz 54 | roadmap_meta = /mnt/data/pipeline_genome_data/hg19/ataqc/eid_to_mnemonic.txt 55 | 56 | [hg38, hg38_ENCODE, hg38_ENCODE3] # from ENCODE portal 57 | chrsz = /mnt/data/pipeline_genome_data/hg38/hg38.chrom.sizes 58 | seq_dir = /mnt/data/pipeline_genome_data/hg38/seq 59 | gensz = hs 60 | bwa_idx = /mnt/data/pipeline_genome_data/hg38/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 61 | bwt2_idx = /mnt/data/pipeline_genome_data/hg38/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 62 | ref_fa = /mnt/data/pipeline_genome_data/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 63 | blacklist = /mnt/data/pipeline_genome_data/hg38/hg38.blacklist.bed.gz 64 | # data for ATAQC 65 | tss_enrich = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_gencode_tss_unique.bed.gz 66 | dnase = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz 67 | prom = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz 68 | enh = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz 69 | reg2map = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz 70 | reg2map_bed = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz 71 | roadmap_meta = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt 72 | ENCODE_assembly = GRCh38 73 | 74 | [hg38_chr19_chrM] # hg38 with chr19 and chrM only 75 | chrsz = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/hg38_chr19_chrM.chrom.sizes 76 | seq_dir = /mnt/data/pipeline_genome_data/hg38/seq 77 | gensz = hs 78 | bwa_idx = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.chr19_chrM.fasta 
79 | bwt2_idx = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.chr19_chrM.fasta 80 | ref_fa = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/GRCh38_no_alt_analysis_set_GCA_000001405.15.chr19_chrM.fasta 81 | blacklist = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/hg38.blacklist.bed.gz 82 | # data for ATAQC 83 | tss_enrich = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_gencode_tss_unique.bed.gz 84 | dnase = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz 85 | prom = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz 86 | enh = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz 87 | reg2map = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz 88 | reg2map_bed = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz 89 | roadmap_meta = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt 90 | ENCODE_assembly = GRCh38 91 | 92 | [dm3] # installed by install_genome_data.sh 93 | chrsz = /mnt/data/pipeline_genome_data/dm3/dm3.chrom.sizes 94 | seq_dir = /mnt/data/pipeline_genome_data/dm3/seq 95 | gensz = 168736537 96 | bwa_idx = /mnt/data/pipeline_genome_data/dm3/bwa_index/dm3.fa 97 | bwt2_idx = /mnt/data/pipeline_genome_data/dm3/bowtie2_index/dm3.fa 98 | ref_fa = /mnt/data/pipeline_genome_data/dm3/dm3.fa 99 | 100 | [pantro5] # installed by install_genome_data.sh 101 | chrsz = /mnt/data/pipeline_genome_data/pantro5/pantro5.chrom.sizes 102 | seq_dir = /mnt/data/pipeline_genome_data/pantro5/seq 103 | gensz = 3231170666 104 | bwa_idx = /mnt/data/pipeline_genome_data/pantro5/bwa_index/panTro5.fa 105 | bwt2_idx = /mnt/data/pipeline_genome_data/pantro5/bowtie2_index/panTro5.fa 106 | ref_fa = /mnt/data/pipeline_genome_data/pantro5/panTro5.fa 107 | 108 | [macam7] # installed by 
install_genome_data.sh 109 | chrsz = /mnt/data/pipeline_genome_data/macam7/macam7.chrom.sizes 110 | seq_dir = /mnt/data/pipeline_genome_data/macam7/seq 111 | gensz = 2817542206 112 | bwa_idx = /mnt/data/pipeline_genome_data/macam7/bwa_index/MacaM_Rhesus_Genome_v7.fasta 113 | bwt2_idx = /mnt/data/pipeline_genome_data/macam7/bowtie2_index/MacaM_Rhesus_Genome_v7.fasta 114 | ref_fa = /mnt/data/pipeline_genome_data/macam7/MacaM_Rhesus_Genome_v7.fasta 115 | nonamecheck = true # for bedtools >= 2.24. this prevents name convention error in bedtools intersect 116 | 117 | [saccer3] # installed by install_genome_data.sh 118 | chrsz = /mnt/data/pipeline_genome_data/saccer3/saccer3.chrom.sizes 119 | seq = /mnt/data/pipeline_genome_data/saccer3/seq 120 | gensz = 12157105 121 | bwa_idx = /mnt/data/pipeline_genome_data/saccer3/bwa_index/sacCer3.fa 122 | bwt2_idx= /mnt/data/pipeline_genome_data/saccer3/bowtie2_index/sacCer3.fa 123 | ref_fa = /mnt/data/pipeline_genome_data/saccer3/sacCer3.fa 124 | 125 | -------------------------------------------------------------------------------- /examples/scripts/make_bds_cmds_PE.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import os 4 | import sys 5 | import operator 6 | 7 | 8 | def get_files_by_file_size(dirname, reverse=False): 9 | """ Return list of file paths in directory sorted by file size """ 10 | 11 | # Get list of files 12 | filepaths = [] 13 | for basename in os.listdir(dirname): 14 | filename = os.path.join(dirname, basename) 15 | if os.path.isfile(filename): 16 | filepaths.append(filename) 17 | 18 | # Re-populate list with filename, size tuples 19 | for i in xrange(len(filepaths)): 20 | filepaths[i] = (filepaths[i], os.path.getsize(filepaths[i])) 21 | 22 | # Sort list by file size 23 | # If reverse=True sort from largest to smallest 24 | # If reverse=False sort from smallest to largest 25 | filepaths.sort(key=lambda filename: filename[1], reverse=reverse) 26 
| 27 | # Re-populate list with just filenames 28 | for i in xrange(len(filepaths)): 29 | filepaths[i] = filepaths[i][0] 30 | 31 | return filepaths 32 | 33 | fsize = dict() 34 | mp = dict() 35 | 36 | order=["ATF7", "FOS", "ATF2", "CREB1", "E2F1", "EGR1", "TCF12", "TCF7L2", "NANOG", "FOXA2", "HNF4A", "FOXA1", "TAF1", "GABPA", "CEBPB", "REST", "MAX", "CTCF", "MYC", "SPI1", "JUND", "MAFK", "GATA3", "FOSL2", "YY1", "ZNF143", "E2F6", "RFX5", "SIX5", "ATF3", "RCOR1", "TBP", "SRF", "TEAD4", "EP300", "STAT3", "ARID3A"] 37 | 38 | lst = get_files_by_file_size(os.getcwd(), True) 39 | #lst.sort() 40 | 41 | blacklist_ctl = [] #["GM12878","SK-N-SH","K562","HeLa-S3","GM20000","GM13977","HL-60","pancreas"] 42 | 43 | #ctl_to_subsample = ["CONTROL.K562.unpaired.fastq.gz", "CONTROL.GM12878.unpaired.fastq.gz", "CONTROL.HepG2.unpaired.fastq.gz", "CONTROL.SK-N-SH.unpaired.fastq.gz", "CONTROL.HeLa-S3.unpaired.fastq.gz", "CONTROL.H1-hESC.unpaired.fastq.gz", "CONTROL.MCF-7.unpaired.fastq.gz", "CONTROL.A549.unpaired.fastq.gz", "CONTROL.liver.BSID_ENCBS401URL.unpaired.fastq.gz", "CONTROL.Panc1.unpaired.fastq.gz", "CONTROL.HCT116.unpaired.fastq.gz", "CONTROL.liver.BSID_ENCBS046RNA.unpaired.fastq.gz", "CONTROL.PC-3.unpaired.fastq.gz", "CONTROL.B_cell.unpaired.fastq.gz", "CONTROL.fibroblast_of_lung.unpaired.fastq.gz", "CONTROL.endothelial_cell_of_umbilical_vein.unpaired.fastq.gz"] 44 | ctl_to_subsample = ["CONTROL.K562.unpaired.fastq.gz", "CONTROL.GM12878.unpaired.fastq.gz", "CONTROL.HepG2.unpaired.fastq.gz", "CONTROL.SK-N-SH.unpaired.fastq.gz", "CONTROL.HeLa-S3.unpaired.fastq.gz", "CONTROL.H1-hESC.unpaired.fastq.gz", "CONTROL.MCF-7.unpaired.fastq.gz", "CONTROL.A549.unpaired.fastq.gz", "CONTROL.liver.BSID_ENCBS401URL.unpaired.fastq.gz", "CONTROL.Panc1.unpaired.fastq.gz", "CONTROL.HCT116.unpaired.fastq.gz", "CONTROL.liver.BSID_ENCBS046RNA.unpaired.fastq.gz", "CONTROL.PC-3.unpaired.fastq.gz", "CONTROL.B_cell.unpaired.fastq.gz", "CONTROL.fibroblast_of_lung.unpaired.fastq.gz", 
"CONTROL.endothelial_cell_of_umbilical_vein.unpaired.fastq.gz", "CONTROL.astrocyte.unpaired.fastq.gz", "CONTROL.NT2_D1.unpaired.fastq.gz", "CONTROL.myotube.unpaired.fastq.gz", "CONTROL.induced_pluripotent_stem_cell.unpaired.fastq.gz", "CONTROL.GM12892.unpaired.fastq.gz", "CONTROL.HL-60.unpaired.fastq.gz", "CONTROL.foreskin_fibroblast.unpaired.fastq.gz", "CONTROL.IMR-90.unpaired.fastq.gz", "CONTROL.T47D.unpaired.fastq.gz"] 45 | 46 | f = open("CTCF_blacklist.txt") 47 | blacklist_fastq = f.read().splitlines() 48 | 49 | #print blacklist_fastq 50 | #sys.exit(1) 51 | 52 | for i in lst: 53 | prefix = os.path.basename(i).rsplit(".BSID",1)[0] 54 | 55 | if ".unpaired." in i or not "CHIPseq" in i: 56 | continue 57 | 58 | if os.path.basename(i) in blacklist_fastq: 59 | continue 60 | 61 | prefixCTL = prefix.rsplit(".")[2] 62 | 63 | if prefixCTL in blacklist_ctl: 64 | continue 65 | 66 | filesize = os.path.getsize(i) 67 | 68 | idx = 1 69 | for o in order: 70 | if "."+o+"." in prefix: 71 | break; 72 | idx = idx + 1 73 | 74 | if prefix in mp.keys(): 75 | mp[prefix].append( os.path.basename(i) ) 76 | fsize[prefix] = (fsize[prefix][0] + filesize, idx) 77 | else: 78 | mp[prefix] = [] 79 | mp[prefix].append( os.path.basename(i) ) 80 | fsize[prefix] = (filesize, idx) 81 | 82 | sorted_fsize = sorted(fsize.items(), key=operator.itemgetter(1),reverse=True) 83 | 84 | i = 0 85 | 86 | sorted_fsize3 = sorted(sorted_fsize, key=operator.itemgetter(2),reverse=False) 87 | sorted_fsize4 = sorted(sorted_fsize3, key=operator.itemgetter(3),reverse=False) 88 | 89 | cnt = 0 90 | for tup in sorted_fsize4: 91 | key = tup[0] 92 | cnt = cnt + 1 93 | 94 | length = len(mp[key]) 95 | 96 | filesize = tup[1] 97 | o = tup[2] 98 | svr = tup[3] 99 | #nth = filesize/1000000000 100 | nth = filesize/500000000 101 | if nth == 0: 102 | nth = 1 103 | 104 | svr_name = "NONE" 105 | if svr==1: 106 | svr_name="scg" 107 | elif svr==2: 108 | svr_name="nandi" 109 | elif svr==3: 110 | svr_name="mitra" 111 | elif svr==4: 112 | 
svr_name="kali" 113 | elif svr==5: 114 | svr_name="kadru" 115 | elif svr==6: 116 | svr_name="wotan" 117 | else: 118 | svr_name="NULL" 119 | 120 | print "#"+ str(cnt) + " , " + str(filesize) + ", order: " + str(o) + ", svr: " + svr_name + ", old #: " + str(tup[4]) 121 | print "NTH="+str(nth)+"; SUFFIX=\""+key+"\"" 122 | 123 | if length == 4: 124 | print "FASTQ1=$DATA/DREAM_challenge/"+os.path.basename(mp[key][0]) 125 | print "FASTQ2=$DATA/DREAM_challenge/"+os.path.basename(mp[key][1]) 126 | print "FASTQ3=$DATA/DREAM_challenge/"+os.path.basename(mp[key][2]) 127 | print "FASTQ4=$DATA/DREAM_challenge/"+os.path.basename(mp[key][2]) 128 | elif length == 3: 129 | print "FASTQ1=$DATA/DREAM_challenge/"+os.path.basename(mp[key][0]) 130 | print "FASTQ2=$DATA/DREAM_challenge/"+os.path.basename(mp[key][1]) 131 | print "FASTQ3=$DATA/DREAM_challenge/"+os.path.basename(mp[key][2]) 132 | elif length == 2: 133 | print "FASTQ1=$DATA/DREAM_challenge/"+os.path.basename(mp[key][0]) 134 | print "FASTQ2=$DATA/DREAM_challenge/"+os.path.basename(mp[key][1]) 135 | elif length == 1: 136 | print "FASTQ1=$DATA/DREAM_challenge/"+os.path.basename(mp[key][0]) 137 | else: 138 | print "LEN>3: " + str( length ) 139 | 140 | prefixCTL = key.rsplit(".")[2] 141 | 142 | foundCTL = False 143 | 144 | lst2 = get_files_by_file_size("/srv/gsfs0/scratch/leepc12/data/DREAM_challenge", True) 145 | 146 | for k in lst2: 147 | if "."+prefixCTL+"." 
in k and "CONTROL" in k and k.endswith("unpaired.fastq.gz"): 148 | print "CTL_FASTQ=$DATA/DREAM_challenge/"+os.path.basename(k) 149 | print "WORK=$RUN/DREAM_challenge/$SUFFIX; mkdir -p $WORK/out/align; mkdir -p $WORK/out/qc" 150 | print "cd $WORK/out/align; ln -s ../../../../DREAM_challenge_ctl/CONTROL."+prefixCTL+"/out/align/rep1 ctl1" 151 | print "cd $WORK/out/qc; ln -s ../../../../DREAM_challenge_ctl/CONTROL."+prefixCTL+"/out/qc/rep1 ctl1" 152 | print "cd $WORK;" 153 | str_FASTQ = " " 154 | if length == 3: 155 | str_FASTQ = " -fastq1 $FASTQ1 -fastq2 $FASTQ2 -fastq3 $FASTQ3 " 156 | if length == 2: 157 | str_FASTQ = " -fastq1 $FASTQ1 -fastq2 $FASTQ2 " 158 | if length == 1: 159 | str_FASTQ = " -fastq1 $FASTQ1 " 160 | 161 | subsample = "" 162 | if os.path.basename(k) in ctl_to_subsample: 163 | subsample = " -subsample_ctl 40000000 " 164 | 165 | print "bds_scr ${SUFFIX//\//_} $CODE/bds_atac/chipseq/chipseq.bds -callpeak spp -no_naive_overlap -species hg19 -nth $NTH " + str_FASTQ + "-ctl_fastq $CTL_FASTQ -title ${SUFFIX//\//_} -url_base http://mitra.stanford.edu/kundaje/leepc12/DREAM_challenge/$SUFFIX/out" + subsample 166 | if os.path.basename(k) in ctl_to_subsample: 167 | print "##SUBSAMPLE!" 168 | print "sleep 5" 169 | print 170 | foundCTL = True 171 | 172 | if not foundCTL: 173 | print "#NOT FOUND! 
(SE, CTL)" 174 | print 175 | -------------------------------------------------------------------------------- /utils/trimfastq.py: -------------------------------------------------------------------------------- 1 | ################################## 2 | # # 3 | # Last modified 2017/11/08 # 4 | # # 5 | # Georgi Marinov # 6 | # # 7 | ################################## 8 | 9 | import sys 10 | import os 11 | 12 | # try: 13 | # import psyco 14 | # psyco.full() 15 | # except: 16 | # pass 17 | 18 | def run(): 19 | 20 | if len(sys.argv) < 2: 21 | print 'usage: python %s [-trim5 bp] [-flowcellID flowcell] [-addEnd 1 | 2] [-replace string newstring | blank] [-renameIDs prefix] [-stdout]' % sys.argv[0] 22 | print '\tthe -trim5 option will trim additional bp from the 5 end, i.e. if you want the middle 36bp of 38bp reads, use 36 as bp to keep and 1 as the trim5 argument' 23 | print '\tUse - to specify standard input, the script will print to standard output by default' 24 | print '\tThe script can read compressed files as long as they have the correct suffix - .bz2 or .gz' 25 | sys.exit(1) 26 | 27 | inputfilename = sys.argv[1] 28 | doMax=False 29 | if sys.argv[2] == 'max': 30 | doMax=True 31 | trim='max' 32 | else: 33 | trim = int(sys.argv[2]) 34 | outputfilename = inputfilename.split('/')[-1].split('.fastq')[0] + '.' 
+str(trim)+'mers.fastq' 35 | doFlowcellID=False 36 | 37 | doStdOut=True 38 | # doStdOut=False 39 | # if '-stdout' in sys.argv: 40 | # doStdOut = True 41 | 42 | if '-flowcellID' in sys.argv: 43 | doFlowcellID=True 44 | flowcellID=sys.argv[sys.argv.index('-flowcellID')+1] 45 | if doStdOut: 46 | pass 47 | else: 48 | print 'will include flowcell ID', flowcellID, 'in reads headers' 49 | 50 | doRenameIDs = False 51 | if '-renameIDs' in sys.argv: 52 | doRenameIDs = True 53 | RID = '@' + sys.argv[sys.argv.index('-renameIDs') + 1] 54 | 55 | dotrim5=False 56 | if '-trim5' in sys.argv: 57 | dotrim5=True 58 | trim5=int(sys.argv[sys.argv.index('-trim5')+1]) 59 | if doStdOut: 60 | pass 61 | else: 62 | print 'will trim ', trim5, 'bp from the 5-end' 63 | outputfilename = inputfilename.split('.fastq')[0] + '.' +str(trim)+'bp-5prim-trim.fastq' 64 | 65 | doAddEnd=False 66 | if '-addEnd' in sys.argv: 67 | doAddEnd=True 68 | END=sys.argv[sys.argv.index('-addEnd')+1] 69 | if doStdOut: 70 | pass 71 | else: 72 | print 'will add', '/'+END, 'to read IDs' 73 | 74 | doReplace=False 75 | if '-replace' in sys.argv: 76 | doReplace=True 77 | oldstring=sys.argv[sys.argv.index('-replace')+1] 78 | newstring=sys.argv[sys.argv.index('-replace')+2] 79 | if newstring == 'blank': 80 | newstring='' 81 | if doStdOut: 82 | pass 83 | else: 84 | print 'will replace', oldstring, 'with', newstring, 'in read IDs' 85 | 86 | i=0 87 | shorter=0 88 | 89 | if doStdOut: 90 | pass 91 | else: 92 | outfile = open(outputfilename, 'w') 93 | 94 | doStdIn = False 95 | if inputfilename != '-': 96 | if inputfilename.endswith('.bz2'): 97 | cmd = 'bzip2 -cd ' + inputfilename 98 | elif inputfilename.endswith('.gz'): 99 | cmd = 'gunzip -c ' + inputfilename 100 | else: 101 | cmd = 'cat ' + inputfilename 102 | p = os.popen(cmd, "r") 103 | else: 104 | doStdIn = True 105 | 106 | line = 'line' 107 | 108 | if dotrim5: 109 | i=1 110 | j=0 111 | while line != '': 112 | if doStdIn: 113 | line = sys.stdin.readline() 114 | else: 115 | line = 
p.readline() 116 | if line == '': 117 | break 118 | if i==1 and line[0]=='@': 119 | if doFlowcellID and flowcellID not in line: 120 | ID='@'+flowcellID+'_'+line.replace(' ','_')[1:-1]+'\n' 121 | else: 122 | ID=line.replace(' ','_') 123 | if doReplace: 124 | ID=ID.replace(oldstring,newstring) 125 | if doRenameIDs: 126 | ID = RID + str(j) 127 | if doAddEnd: 128 | ID=ID.strip()+'/'+END+'\n' 129 | i=2 130 | continue 131 | if i==2: 132 | i=3 133 | sequence=line[trim5:len(line)].strip() 134 | continue 135 | if i==3 and line[0]=='+': 136 | plus='+\n' 137 | i=4 138 | continue 139 | if i==4: 140 | scores=line 141 | i=1 142 | scores=line[trim5:len(line)].strip() 143 | scores=scores[0:trim] 144 | j=j+1 145 | if j % 5000000 == 0: 146 | if doStdOut: 147 | pass 148 | else: 149 | print str(j/1000000) + 'M reads processed' 150 | if doMax: 151 | sequence=sequence.replace('.','N') 152 | else: 153 | sequence=sequence[0:trim].replace('.','N')+'\n' 154 | if doStdOut: 155 | print ID.strip() 156 | print sequence.strip() 157 | print plus.strip() 158 | print scores 159 | else: 160 | outfile.write(ID.strip()+'\n') 161 | outfile.write(sequence.strip()+'\n') 162 | outfile.write(plus.strip()+'\n') 163 | outfile.write(scores + '\n') 164 | continue 165 | else: 166 | i=1 167 | j=0 168 | while line != '': 169 | if doStdIn: 170 | line = sys.stdin.readline() 171 | else: 172 | line = p.readline() 173 | if line == '': 174 | break 175 | if i==1 and line[0]=='@': 176 | if doFlowcellID and flowcellID not in line: 177 | ID='@'+flowcellID+'_'+line.replace(' ','_')[1:-1]+'\n' 178 | else: 179 | ID=line.replace(' ','_') 180 | if doReplace: 181 | ID=ID.replace(oldstring,newstring) 182 | if doRenameIDs: 183 | ID = RID + str(j) 184 | if doAddEnd: 185 | ID=ID.strip()+'/'+END+'\n' 186 | i=2 187 | continue 188 | if i==2: 189 | i=3 190 | j=j+1 191 | if j % 5000000 == 0: 192 | if doStdOut: 193 | pass 194 | else: 195 | print str(j/1000000) + 'M reads processed' 196 | if doMax: 197 | sequence=line 198 | else: 199 | if 
len(line.strip())0: 245 | print shorter, 'sequences shorter than desired length' 246 | run() 247 | 248 | -------------------------------------------------------------------------------- /etc/Read_Distribution_ChIP-exo.txt: -------------------------------------------------------------------------------- 1 | -150 1.4340596390173622E-4 2 | -149 1.4639789385892905E-4 3 | -148 1.5480693225929023E-4 4 | -147 1.7000129172168031E-4 5 | -146 1.8955643091445948E-4 6 | -145 2.10099620018363E-4 7 | -144 2.2825812921412606E-4 8 | -143 2.4065922868248376E-4 9 | -142 2.4490845059535867E-4 10 | -141 2.42524375089422E-4 11 | -140 2.3600384429253273E-4 12 | -139 2.2784370033254942E-4 13 | -138 2.2054078533733076E-4 14 | -137 2.160103191155617E-4 15 | -136 2.1384103219923188E-4 16 | -135 2.130400328011573E-4 17 | -134 2.1261442913415377E-4 18 | -133 2.115713294110372E-4 19 | -132 2.0924529587337638E-4 20 | -131 2.0628070687775117E-4 21 | -130 2.0364939480949432E-4 22 | -129 2.0232319205393872E-4 23 | -128 2.03273930996417E-4 24 | -127 2.0698831057396183E-4 25 | -126 2.1201249593040522E-4 26 | -125 2.1640751876127905E-4 27 | -124 2.1823441076211522E-4 28 | -123 2.155542036284456E-4 29 | -122 2.0726322585061546E-4 30 | -121 1.955989930982237E-4 31 | -120 1.8363431783568262E-4 32 | -119 1.7444201252740454E-4 33 | -118 1.7109488963780177E-4 34 | -117 1.755322875141391E-4 35 | -116 1.851596480350912E-4 36 | -115 1.9624893896218526E-4 37 | -114 2.0507212805694847E-4 38 | -113 2.079011830809079E-4 39 | -112 2.0271781417655047E-4 40 | -111 1.9434270101020137E-4 41 | -110 1.893062656291457E-4 42 | -109 1.9413893008066825E-4 43 | -108 2.1537111641205402E-4 44 | -107 2.5662859206778457E-4 45 | -106 3.099185060811279E-4 46 | -105 3.643433528825486E-4 47 | -104 4.090056269025112E-4 48 | -103 4.3300782257148034E-4 49 | -102 4.290558500005738E-4 50 | -101 4.042692820235224E-4 51 | -100 3.693711071547101E-4 52 | -99 3.3508431390852095E-4 53 | -98 3.1213189079933875E-4 54 | -97 3.0854936066793305E-4 
55 | -96 3.2162238366061427E-4 56 | -95 3.4594915425007844E-4 57 | -94 3.761278669090213E-4 58 | -93 4.0675671611013883E-4 59 | -92 4.334808134009706E-4 60 | -91 4.561329386284308E-4 61 | -90 4.7559278871427704E-4 62 | -89 4.927400605802674E-4 63 | -88 5.084544511481597E-4 64 | -87 5.230262134493347E-4 65 | -86 5.343878249536639E-4 66 | -85 5.398823192406418E-4 67 | -84 5.368527298897633E-4 68 | -83 5.226420904805226E-4 69 | -82 4.96347367884576E-4 70 | -81 4.640812621422249E-4 71 | -80 4.337104065859328E-4 72 | -79 4.1310143454816267E-4 73 | -78 4.10120979361378E-4 74 | -77 4.299382988964986E-4 75 | -76 4.669331491782717E-4 76 | -75 5.127879107699016E-4 77 | -74 5.591849642345921E-4 78 | -73 5.978066901355474E-4 79 | -72 6.224861874065494E-4 80 | -71 6.356594284636885E-4 81 | -70 6.419131040936336E-4 82 | -69 6.458339050830534E-4 83 | -68 6.520085222186162E-4 84 | -67 6.642177211211218E-4 85 | -66 6.830185667478948E-4 86 | -65 7.08162198890391E-4 87 | -64 7.393997573400663E-4 88 | -63 7.764823818883762E-4 89 | -62 8.191343740323966E-4 90 | -61 8.669726820916808E-4 91 | -60 9.195874160914019E-4 92 | -59 9.765686860567337E-4 93 | -58 0.0010375066020128487 94 | -57 0.0011017330590527004 95 | -56 0.00116754709254036 96 | -55 0.0012329895229076788 97 | -54 0.0012961011705865087 98 | -53 0.0013549228560087003 99 | -52 0.0014098242005801068 100 | -51 0.0014704900296025876 101 | -50 0.0015489339693520027 102 | -49 0.0016571696461042138 103 | -48 0.0018072106861350802 104 | -47 0.0020044404349732823 105 | -46 0.002227721115158772 106 | -45 0.0024492846684843195 107 | -44 0.0026413630367426947 108 | -43 0.0027761881617266704 109 | -42 0.002836351287151588 110 | -41 0.002845880864423082 111 | -40 0.0028391646468693605 112 | -39 0.00285059038781863 113 | -38 0.0029145458405990974 114 | -37 0.003054999456921908 115 | -36 0.003254242482029955 116 | -35 0.003484146859549071 117 | -34 0.0037165845331050868 118 | -33 0.003923427446323833 119 | -32 0.004091558501282984 120 | -31 
0.0042679044338675715 121 | -30 0.004514402938414466 122 | -29 0.004892991709260541 123 | -28 0.005465608440742671 124 | -27 0.006242892200593844 125 | -26 0.007030287550131535 126 | -25 0.007581940424069329 127 | -24 0.007651996757120818 128 | -23 0.006994602483999588 129 | -22 0.00556114033057625 130 | -21 0.004091940187349483 131 | -20 0.003524568735974988 132 | -19 0.004796592658108461 133 | -18 0.0088455786354056 134 | -17 0.016163107580678765 135 | -16 0.02545681733136696 136 | -15 0.034988359956065855 137 | -14 0.0430193875233711 138 | -13 0.04781155210187838 139 | -12 0.048136600329994075 140 | -11 0.044806657125367494 141 | -10 0.03914394197545865 142 | -9 0.032470674367727596 143 | -8 0.026109073789634344 144 | -7 0.021118332556040682 145 | -6 0.017505534291415395 146 | -5 0.015014735447629032 147 | -4 0.013389992476552141 148 | -3 0.012375361830055257 149 | -2 0.011749289178588132 150 | -1 0.011427777066917324 151 | 0 0.0113612172583886 152 | 1 0.011500001516347726 153 | 2 0.011794521604140462 154 | 3 0.012179966617556161 155 | 4 0.012530714982158502 156 | 5 0.012705942455954753 157 | 6 0.012564824796952177 158 | 7 0.01196653776315804 159 | 8 0.010832880494232516 160 | 9 0.009336145656447381 161 | 10 0.007711249297727332 162 | 11 0.00619310746599705 163 | 12 0.005016636209181219 164 | 13 0.004352368176832731 165 | 14 0.0041133024250172585 166 | 15 0.004148054611428677 167 | 16 0.004305240393760864 168 | 17 0.004433475429707692 169 | 18 0.004414343407977009 170 | 19 0.004261300141332552 171 | 20 0.004020769473552029 172 | 21 0.003739175248413146 173 | 22 0.0034629413096936134 174 | 23 0.0032292248729375262 175 | 24 0.003038116640754531 176 | 25 0.002880440687520661 177 | 26 0.002747021087611949 178 | 27 0.0026286819154044297 179 | 28 0.0025185949491149514 180 | 29 0.002419322782323626 181 | 30 0.00233577571245138 182 | 31 0.002272864036919143 183 | 32 0.0022354980531478397 184 | 33 0.0022249651042246296 185 | 34 0.002228060715901593 186 | 35 
0.0022279574595970427 187 | 36 0.002207827906729291 188 | 37 0.0021508446287166493 189 | 38 0.0020466541811146616 190 | 39 0.0019107990560277952 191 | 40 0.0017652957296977485 192 | 41 0.00163216067836622 193 | 42 0.0015334103782749082 194 | 43 0.0014847743229281654 195 | 44 0.0014768340748809537 196 | 45 0.0014938842139508909 197 | 46 0.0015202193199555926 198 | 47 0.0015401339727126753 199 | 48 0.0015409295209052893 200 | 49 0.0015219343886787202 201 | 50 0.0014854837690437867 202 | 51 0.0014339128550113077 203 | 52 0.0013695568395921023 204 | 53 0.0012956931215641026 205 | 54 0.0012193679227736943 206 | 55 0.0011485696708343764 207 | 56 0.0010912867933596476 208 | 57 0.0010555077179630066 209 | 58 0.00104649552274266 210 | 59 0.0010586118877356464 211 | 60 0.0010834931434637114 212 | 61 0.0011127756204486014 213 | 62 0.0011380956492120624 214 | 63 0.0011531119928568795 215 | 64 0.0011595731448099929 216 | 65 0.0011612500310793827 217 | 66 0.0011619135776730276 218 | 67 0.001165334710598908 219 | 68 0.001173884139795982 220 | 69 0.00118433171092713 221 | 70 0.0011920470535862109 222 | 71 0.0011923997973670843 223 | 72 0.001180759571863609 224 | 73 0.0011542417959203714 225 | 74 0.0011169450453848613 226 | 75 0.001074713685355297 227 | 76 0.0010333920809298947 228 | 77 9.988245972068718E-4 229 | 78 9.751657482580574E-4 230 | 79 9.598106440497264E-4 231 | 80 9.484645435217661E-4 232 | 81 9.368327056140632E-4 233 | 82 9.206203892665048E-4 234 | 83 8.967710191298164E-4 235 | 84 8.671806826980791E-4 236 | 85 8.349836331762125E-4 237 | 86 8.033141237691362E-4 238 | 87 7.753064076817695E-4 239 | 88 7.533521846596292E-4 240 | 89 7.368729406106167E-4 241 | 90 7.245476079832308E-4 242 | 91 7.150551192259704E-4 243 | 92 7.070744067873337E-4 244 | 93 6.991828586780328E-4 245 | 94 6.895516851576321E-4 246 | 95 6.762505520479096E-4 247 | 96 6.573491251706434E-4 248 | 97 6.309170703476112E-4 249 | 98 5.96478379599652E-4 250 | 99 5.593743497438486E-4 251 | 100 
5.264006037963452E-4 252 | 101 5.043527647732853E-4 253 | 102 5.000264556908131E-4 254 | 103 5.174638557939435E-4 255 | 104 5.496933692431754E-4 256 | 105 5.869899564278788E-4 257 | 106 6.196285777374238E-4 258 | 107 6.378841935611802E-4 259 | 108 6.348481563449292E-4 260 | 109 6.148773867600951E-4 261 | 110 5.851451975345133E-4 262 | 111 5.528249013960191E-4 263 | 112 5.250898110724477E-4 264 | 113 5.077012573189973E-4 265 | 114 5.007726430003173E-4 266 | 115 5.030053890084198E-4 267 | 116 5.131009162353166E-4 268 | 117 5.297606455730198E-4 269 | 118 5.517066043350753E-4 270 | 119 5.777432455211638E-4 271 | 120 6.066956285525004E-4 272 | 121 6.373888128502998E-4 273 | 122 6.686478578357767E-4 274 | 123 6.987252261003888E-4 275 | 124 7.235829929165654E-4 276 | 125 7.386106367269791E-4 277 | 126 7.391976359743019E-4 278 | 127 7.207334691012063E-4 279 | 128 6.81260612525679E-4 280 | 129 6.294335345669648E-4 281 | 130 5.765597015196229E-4 282 | 131 5.339465796782125E-4 283 | 132 5.129016353372928E-4 284 | 133 5.205071308008424E-4 285 | 134 5.46944512410516E-4 286 | 135 5.78170022517388E-4 287 | 136 6.001399034725329E-4 288 | 137 5.988103976270243E-4 289 | 138 5.648419108744506E-4 290 | 139 5.077115032784558E-4 291 | 140 4.416003984451976E-4 292 | 141 3.8068981998083413E-4 293 | 142 3.391609914915229E-4 294 | 143 3.2717597183875294E-4 295 | 144 3.388201609053363E-4 296 | 145 3.641597938294164E-4 297 | 146 3.932611057491365E-4 298 | 147 4.1619033180263966E-4 299 | 148 4.254656610532796E-4 300 | 149 4.234130982652516E-4 301 | 150 4.1481060212796147E-4 302 | -------------------------------------------------------------------------------- /modules/postalign_bed.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | // has functions related to tagalign, and helps getting tagalign from configruation file or command line 
// argument

help == postalign bed/tagalign settings
mem_shuf	:= "12G"	help Max. memory for UNIX shuf (default: 12G).
no_random_source := false	help Disable --random-source for UNIX shuf. Hot fix for end of file error.


init_postalign_bed()


// Pull module settings from the configuration file (falling back to the
// command-line defaults declared above) and echo them to the log.
void init_postalign_bed() {

	// fraglen0 	= get_conf_val_bool( fraglen0, ["fraglen0"] )
	mem_shuf = get_conf_val( mem_shuf, ["mem_shuf"] )
	no_random_source = get_conf_val_bool( no_random_source, ["no_random_source"] )

	print("\n\n== postalign bed/tagalign settings\n")
	print( "Max. memory for UNIX shuf\t\t\t: $mem_shuf\n")
	print( "No --random-source for UNIX shuf\t\t: $no_random_source\n")
}

// Randomly subsample a gzipped tagAlign/BED file down to $nlines reads
// (single-ended: one read per line).  If $non_mito, chrM reads are filtered
// out before sampling.  Returns the path of the subsampled .tagAlign.gz
// written under $o_dir.
string subsample_tag( string tag, int nlines, bool non_mito, string o_dir, string group ) {

	prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir )
	nreads_per_mill := metric_prefix( nlines )

	subsampled_tag := "$prefix."+(non_mito?"no_chrM.":"")+"$nreads_per_mill.tagAlign.gz"
	non_mito_param := non_mito ? "grep -v \"chrM\" | " : ""
	// Seed shuf deterministically from the input's uncompressed byte count so
	// reruns on the same input reproduce the same sample (unless disabled).
	random_source_param := no_random_source ? "" : "--random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null)"

	in := [ tag ]
	out := subsampled_tag

	taskName:= "subsample_tag " + group
	mem := get_res_mem(mem_shuf,1)

	wait_par( cpus )

	tid := task( out<-in ) {

		sys $shcmd_init

		//# Subsample tagAlign file
		sys zcat $tag | \
			$non_mito_param shuf -n $nlines $random_source_param | gzip -nc > $subsampled_tag

		sys $shcmd_finalize
	}

	register_par( tid, cpus )

	add_task_to_graph( in, out, group )

	return out
}

// Paired-end version of subsample_tag().  A read pair occupies two
// consecutive lines, so pairs are first joined onto single (12-column)
// lines, those joined records are subsampled as units, then split back
// into two lines per pair so mates are never separated.
string subsample_tag_PE( string tag, int nlines, bool non_mito, string o_dir, string group ) {

	prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir )
	nreads_per_mill := metric_prefix( nlines )

	subsampled_tag := "$prefix."+(non_mito?"no_chrM.":"")+"$nreads_per_mill.tagAlign.gz"
	non_mito_param := non_mito ? "grep -v \"chrM\" | " : ""
	// Same deterministic shuf seeding as in subsample_tag().
	random_source_param := no_random_source ? "" : "--random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null)"

	joined := "$prefix.joined" // temporary file
	joined_subsampled := "$prefix.joined.subsampled" // temporary file

	in := [ tag ]
	out := subsampled_tag

	taskName:= "subsample_tag_PE " + group
	mem := get_res_mem(mem_shuf,1)

	wait_par( cpus )

	tid := task( out<-in ) {

		sys $shcmd_init

		// join consecutive two lines into one
		sys zcat $tag | sed 'N;s/\n/\t/' > $joined

		//# Subsample the joined (one-pair-per-line) records
		sys cat $joined | $non_mito_param shuf -n $nlines $random_source_param > $joined_subsampled

		//# Split each joined record back into two tagAlign lines
		sys awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\t%s\t%s\t%s\n%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' $joined_subsampled | \
			gzip -nc > $subsampled_tag

		sys rm -f $joined $joined_subsampled

		sys $shcmd_finalize
	}

	register_par( tid, cpus )

	add_task_to_graph( in, out, group )

	return out
}

// Adjusts the read-ends in a read BED by Tn5 offsets:
// +4 bp on the start of plus-strand reads, -5 bp on the end of
// minus-strand reads (standard ATAC-seq Tn5 insertion-site shift).
string tn5_shift_tag( string tag, string o_dir, string group ) {

	prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir )
	//shifted_tag := "$prefix.shifted.tagAlign.gz"
	shifted_tag := "$prefix.tn5.tagAlign.gz"

	in := [ tag ]
	out := shifted_tag

	taskName:= "shift_tag " + group

	wait_par( cpus )

	tid := task( out<-in ) {

		sys $shcmd_init

		sys zcat $tag | awk -F '\t' 'BEGIN {OFS = FS}{ if ($6 == "+") {$2 = $2 + 4} else if ($6 == "-") {$3 = $3 - 5} print $0}' | gzip -nc > $shifted_tag

		sys $shcmd_finalize
	}

	register_par( tid, cpus )

	add_task_to_graph( in, out, group )

	return out
}

// make spr(self_pseudo_replicate)
// Shuffles the reads of a single-ended tagAlign and splits them into two
// equal halves (pseudo-replicates).  Returns [ tag_pr1, tag_pr2 ].
// NOTE(review): the "read pairs"/"BEDPE" wording in the shell comments
// below looks copied from the PE variant; this function operates on
// one-read-per-line input — confirm before relying on those comments.
string[] spr( string tag, string pr1_o_dir, string pr2_o_dir, string group ) {

	prefix_pr1 := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), pr1_o_dir )
	prefix_pr2 := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), pr2_o_dir )
	tag_pr1 := "$prefix_pr1.pr1.tagAlign.gz"
	tag_pr2 := "$prefix_pr2.pr2.tagAlign.gz"
	// Deterministic shuf seeding, as in subsample_tag().
	random_source_param := no_random_source ? "" : "--random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null)"

	in := [ tag ]
	out := [ tag_pr1, tag_pr2 ]

	taskName:= "spr " + group
	mem := get_res_mem(mem_shuf,1)

	wait_par( cpus )

	tid := task( out<-in ) {

		sys $shcmd_init

		//# Get total number of read pairs
		sys nlines=$( zcat $tag | wc -l )
		sys nlines=$(( (nlines + 1) / 2 ))

		//# Shuffle and split BEDPE file into 2 equal parts
		//# Will produce $PR_PREFIX00 and $PR_PREFIX01
		sys zcat $tag | shuf $random_source_param | split -d -l $((nlines)) - $prefix_pr1.

		//# Compress each half into its pseudo-replicate tagAlign
		sys gzip -nc $prefix_pr1.00 > $tag_pr1
		sys rm -f $prefix_pr1.00
		sys gzip -nc $prefix_pr1.01 > $tag_pr2
		sys rm -f $prefix_pr1.01

		sys $shcmd_finalize
	}

	register_par( tid, cpus )

	add_task_to_graph( in, out, ["$group PR 1", "$group PR 2"] )

	return out
}

// Paired-end version of spr(): mates (two consecutive lines) are joined
// onto single lines before shuffling/splitting so each pseudo-replicate
// keeps read pairs intact, then split back into per-read lines.
string[] spr_tag_PE( string tag, string pr1_o_dir, string pr2_o_dir, string group ) {

	prefix_pr1 := replace_dir( rm_ext( tag, ["bed","tagAlign"] ), pr1_o_dir )
	prefix_pr2 := replace_dir( rm_ext( tag, ["bed","tagAlign"] ), pr2_o_dir )

	joined := "$prefix_pr1.joined" // temporary file

	tag_pr1 := "$prefix_pr1.pr1.tagAlign.gz"
	tag_pr2 := "$prefix_pr2.pr2.tagAlign.gz"
	// Deterministic shuf seeding, as in subsample_tag().
	random_source_param := no_random_source ? "" : "--random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null)"

	in := [ tag ]
	out := [ tag_pr1, tag_pr2 ]

	taskName:= "spr_tag_PE " + group
	mem := get_res_mem(mem_shuf,1)

	wait_par( cpus )

	tid := task( out<-in ) {

		sys $shcmd_init

		// join consecutive two lines into one
		sys zcat $tag | sed 'N;s/\n/\t/' > $joined

		//# Get total number of read pairs
		sys nlines=$( cat $joined | wc -l )
		sys nlines=$(( (nlines + 1) / 2 ))

		//# Shuffle and split temporary combined file into 2 equal parts
		//# Will produce $PR_PREFIX00 and $PR_PREFIX01
		sys cat $joined | shuf $random_source_param | split -d -l $((nlines)) - $prefix_pr1.

		//# Convert read pairs to reads into standard tagAlign file
		sys awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\t%s\t%s\t%s\n%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' "$prefix_pr1.00" | \
			gzip -nc > $tag_pr1
		sys rm -f $prefix_pr1.00
		sys awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\t%s\t%s\t%s\n%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' "$prefix_pr1.01" | \
			gzip -nc > $tag_pr2
		sys rm -f $prefix_pr1.01

		sys rm -f $joined

		sys $shcmd_finalize
	}

	register_par( tid, cpus )

	add_task_to_graph( in, out, ["$group PR 1", "$group PR 2"] )

	return out
}

// Concatenate two tagAlign files into one pooled .tagAlign.gz under $o_dir.
string pool_tag( string tag1, string tag2, string o_dir, string group ) {
	// LINUX has a limit on filename length (255), so keep the pooled name as short as possible:
	// merge the two basenames only when both are short, otherwise fall back to "<first>_pooled".
	string tag_pooled
	if ( get_basename(tag1).length() < 50 && get_basename(tag2).length() < 50 ) {
		prefix := "$o_dir/" + merge_basename_wo_ext( tag1, tag2, ["tagAlign","tag","bed"] )
		tag_pooled = "$prefix.tagAlign.gz"
	}
	else {
		prefix := replace_dir( rm_ext( tag1, ["bed","tagAlign"] ), o_dir )
		tag_pooled = "$prefix"+"_pooled.tagAlign.gz"
	}

	in := [ tag1, tag2 ]
	out := tag_pooled

	taskName:= "pool_tag " + group

	wait_par( cpus )

	tid := task( out<-in ) {

		sys $shcmd_init
		sys zcat $tag1 $tag2 | gzip -nc > $tag_pooled

		sys $shcmd_finalize
	}

	register_par( tid, cpus )

	add_task_to_graph( in, out, group )

	return out
}

// Variadic overload of pool_tag(): concatenate any number of tagAlign
// files into one pooled .tagAlign.gz under $o_dir.
string pool_tag( string[] tags, string o_dir, string group ) {
	// LINUX has a limit on filename length (255), make it as short as possible
	// NOTE(review): tags.size() <= 2 admits a single-element array, but the
	// condition then reads tags[1] — verify callers never pass fewer than
	// two tags, or this short-circuit may index out of range.
	string tag_pooled
	if ( tags.size() <= 2 && get_basename(tags[0]).length() < 50 && get_basename(tags[1]).length() < 50 ) {
		prefix := "$o_dir/" + merge_basename_wo_ext( tags[0], tags[1], ["tagAlign","tag","bed"] )
		tag_pooled = "$prefix.tagAlign.gz"
	}
	else {
		prefix := replace_dir( rm_ext( tags[0], ["bed","tagAlign"] ), o_dir )
		tag_pooled = "$prefix"+"_pooled.tagAlign.gz"
	}
	tags_str := array_to_str( tags, " " ) // join

	in := tags
	out := tag_pooled

	taskName:= "pool_tag " + group

	wait_par( cpus )

	tid := task( out<-in ) {

		sys $shcmd_init

		sys zcat $tags_str | gzip -nc > $tag_pooled

		sys $shcmd_finalize
	}

	register_par( tid, cpus )

	add_task_to_graph( in, out, group )

	return out
}