├── .gitignore
├── .gitmodules
├── LICENSE.md
├── README.md
├── adapter_trimmer.bds
├── atac.bds
├── bds.config
├── default.env
├── etc
│   ├── broadPeak.as
│   ├── gappedPeak.as
│   └── narrowPeak.as
├── examples
│   ├── ENCODE
│   │   └── download_ENCODE_Snyder.py
│   ├── atac_shi_new2.sh
│   ├── bfremin.sh
│   ├── example.sh
│   └── training-camp-2016.sh
├── html
│   ├── jquery.treetable.css
│   ├── jquery.treetable.js
│   ├── jquery.treetable.theme.default.css
│   └── rpt_header.html
├── install_dependencies.sh
├── install_genome_data.sh
├── modules
│   ├── ENCODE_accession.bds
│   ├── align_bowtie2.bds
│   ├── align_etc.bds
│   ├── align_multimapping.bds
│   ├── align_trim_adapter.bds
│   ├── ataqc.bds
│   ├── callpeak_bigbed.bds
│   ├── callpeak_blacklist_filter.bds
│   ├── callpeak_idr.bds
│   ├── callpeak_macs2_atac.bds
│   ├── callpeak_naive_overlap.bds
│   ├── cluster.bds
│   ├── conf.bds
│   ├── env.bds
│   ├── filetable.bds
│   ├── git.bds
│   ├── graphviz.bds
│   ├── input.bds
│   ├── input_adapter.bds
│   ├── input_bam.bds
│   ├── input_fastq.bds
│   ├── input_peak.bds
│   ├── input_tagalign.bds
│   ├── log_parser.bds
│   ├── module_template.bds
│   ├── output.bds
│   ├── parallel.bds
│   ├── pipeline_template.bds
│   ├── postalign_bam.bds
│   ├── postalign_bed.bds
│   ├── postalign_xcor.bds
│   ├── report.bds
│   ├── species.bds
│   ├── string.bds
│   └── sys.bds
├── requirements.txt
├── requirements_py3.txt
├── species
│   ├── kundaje.conf
│   ├── scg.conf
│   └── sherlock.conf
├── uninstall_dependencies.sh
└── utils
    ├── assign_multimappers.py
    ├── axt_dirfiles.py
    ├── bds_scr
    ├── bds_scr_5min
    ├── broadpeak.py
    ├── clusterGeneric
    │   ├── kill.pl
    │   ├── postMortemInfo.pl
    │   ├── run.pl
    │   └── stat.pl
    ├── detect_adapter.py
    ├── gappedpeak.py
    ├── get_read_length_from_fastq.py
    ├── kill_scr
    ├── narrowpeak.py
    ├── narrowpeak_idr.py
    ├── parse_summary_ENCODE_accession_recursively.py
    ├── parse_summary_ENCODE_qc_recursively.py
    ├── parse_summary_qc_recursively.py
    ├── reassemble.py
    ├── trimAdapters.py
    ├── ucsc_ensGene.py
    └── ucsc_simplegene.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.chp
2 | .*.swp
3 | .nfs*
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "ataqc"]
2 |     path = ataqc
3 |     url = https://github.com/kundajelab/ataqc
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | BSD-3-Clause License
2 | 
3 | Copyright (c) 2016, Kundaje Lab
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
7 | 
8 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
9 | 
10 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
11 | 
12 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/adapter_trimmer.bds:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bds
2 | #vim: syntax=java
3 | 
4 | 
5 | help == trimmer pipeline settings
6 | 
7 | save_to_indv_dir:= false  help Save trimmed fastqs to individual directory for each replicate.
8 | old_trimmer := false  help Use legacy trim adapters (trim_galore and trimAdapter.py).
9 | 
10 | 
11 | help() // show help contexts
12 | 
13 | include "modules/pipeline_template.bds"
14 | include "modules/input.bds"
15 | include "modules/input_adapter.bds"
16 | 
17 | include "modules/align_trim_adapter.bds"
18 | 
19 | 
20 | main()
21 | 
22 | 
23 | void main() { // trimmer pipeline starts here
24 | 
25 |     init_trimmer()
26 |     chk_input( true, false )
27 |     chk_adapters()
28 |     trim_adapters()
29 | }
30 | 
31 | void init_trimmer() {
32 | 
33 |     save_to_indv_dir = get_conf_val_bool( save_to_indv_dir, ["save_to_indv_dir"] )
34 |     old_trimmer = get_conf_val_bool( old_trimmer, ["old_trimmer"] )
35 | 
36 |     print( "\n\n== trimmer settings\n")
37 |     print( "Save trimmed fastqs to individual directory for each replicate\t: $save_to_indv_dir\n" )
38 |     print( "Use old trim adapters\t\t\t: $old_trimmer\n" )
39 | }
40 | 
41 | void chk_adapters() {
42 | 
43 |     print( "\n== checking adapters to be trimmed ...\n" );
44 | 
45 |     // check adapters
46 |     for ( int rep=1; rep <= get_num_rep(); rep++) {
47 | 
48 |         string prefix
49 |         if ( is_input_fastq( rep ) ) {
50 | 
51 |             if ( !old_trimmer ) { // check adapters
52 |                 adapters := get_adapters( rep )
53 | 
54 |                 prefix += "Replicate $rep adapters : "
55 | 
56 |                 if ( adapters.size()==0 ) {
57 |                     prefix += "automatically detected"
58 |                 }
59 |                 else {
60 |                     for ( int i=0; i
[rest of adapter_trimmer.bds from this point lost to markup stripping during extraction]
--------------------------------------------------------------------------------
[/html/rpt_header.html: markup stripped during extraction; only bare line numbers survived]
--------------------------------------------------------------------------------
/install_dependencies.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Stop on error
3 | set -e
4 | 
5 | ## conda environment name
6 | 
7 | ENV_NAME=bds_atac
8 | ENV_NAME_PY3=bds_atac_py3
9 | 
10 | INSTALL_GEM=0
11 | INSTALL_PEAKSEQ=0
12 | 
13 | ## install packages from official channels (bioconda and r)
14 | 
15 | conda create -n ${ENV_NAME} --file requirements.txt -y -c defaults -c bioconda -c r -c bcbio -c daler -c asmeurer
16 | conda create -n ${ENV_NAME_PY3} --file requirements_py3.txt -y -c defaults -c bioconda -c r -c bcbio -c daler -c asmeurer
17 | 
18 | ### bash function definition
19 | 
20 | function add_to_activate {
21 | if [[ !
-f $CONDA_INIT ]]; then 22 | echo > $CONDA_INIT 23 | fi 24 | for i in "${CONTENTS[@]}"; do 25 | if [[ $(grep "$i" "$CONDA_INIT" | wc -l ) == 0 ]]; then 26 | echo $i >> "$CONDA_INIT" 27 | fi 28 | done 29 | } 30 | 31 | ## install useful tools for BigDataScript 32 | 33 | mkdir -p $HOME/.bds 34 | cp -f ./utils/bds_scr ./utils/bds_scr_5min ./utils/kill_scr bds.config $HOME/.bds/ 35 | cp -rf ./utils/clusterGeneric/ $HOME/.bds/ 36 | 37 | ## install additional packages 38 | 39 | source activate ${ENV_NAME} 40 | 41 | conda uninstall graphviz -y # graphviz in bioconda has segmentation fault bug 42 | conda install graphviz -c anaconda -y 43 | 44 | conda install ucsc-bedgraphtobigwig -c bioconda -y 45 | conda install ucsc-bedtobigbed -c bioconda -y 46 | 47 | #CONDA_BIN=$(dirname $(which activate))/../envs/${ENV_NAME}/bin 48 | #CONDA_BIN=$(dirname $(which activate)) 49 | CONDA_BIN=$(dirname $(which bedtools)) 50 | CONDA_EXTRA="$CONDA_BIN/../extra" 51 | CONDA_ACTIVATE_D="$CONDA_BIN/../etc/conda/activate.d" 52 | CONDA_INIT="$CONDA_ACTIVATE_D/init.sh" 53 | CONDA_LIB="$CONDA_BIN/../lib" 54 | if [[ $(find $CONDA_LIB -name '*egg-info*' -not -perm -o+r | wc -l ) > 0 ]]; then 55 | find $CONDA_LIB -name '*egg-info*' -not -perm -o+r -exec dirname {} \; | xargs chmod o+r -R 56 | fi 57 | 58 | mkdir -p $CONDA_EXTRA $CONDA_ACTIVATE_D 59 | 60 | ### install Anshul's phantompeakqualtool 61 | echo $CONDA_EXTRA 62 | cd $CONDA_EXTRA 63 | git clone https://github.com/kundajelab/phantompeakqualtools 64 | chmod 755 -R phantompeakqualtools 65 | CONTENTS=("export PATH=$CONDA_EXTRA/phantompeakqualtools:\$PATH") 66 | add_to_activate 67 | 68 | ### disable locally installed python package lookup 69 | CONTENTS=("export PYTHONNOUSERSITE=True") 70 | add_to_activate 71 | #CONTENTS=("export PYTHONPATH=$CONDA_LIB/python2.7/site-packages:\$PYTHONPATH") 72 | #add_to_activate 73 | 74 | ### decompress MACS2 python egg 75 | #cd $CONDA_LIB/python2.7/site-packages 76 | #unzip -o MACS2-2.1.1.20160309-py2.7-linux-x86_64.egg 77 | 78 | # install PeakSeq 79 | if [[ ${INSTALL_PEAKSEQ} == 1 ]]; then 80 | cd $CONDA_EXTRA 81 | wget http://archive.gersteinlab.org/proj/PeakSeq/Scoring_ChIPSeq/Code/C/PeakSeq_1.31.zip -N --no-check-certificate 82 | unzip PeakSeq_1.31.zip 83 | rm -f PeakSeq_1.31.zip 84 | cd PeakSeq 85 | make 86 | chmod 755 bin/PeakSeq 87 | cd $CONDA_BIN 88 | ln -s $CONDA_EXTRA/PeakSeq/bin/PeakSeq 89 | fi 90 | 91 | source deactivate 92 | 93 | 94 | source activate ${ENV_NAME_PY3} 95 | 96 | #CONDA_BIN=$(dirname $(which activate))/../envs/${ENV_NAME_PY3}/bin 97 | #CONDA_BIN=$(dirname $(which activate)) 98 | CONDA_BIN=$(dirname $(which bedtools)) 99 | CONDA_EXTRA="$CONDA_BIN/../extra" 100 | CONDA_ACTIVATE_D="$CONDA_BIN/../etc/conda/activate.d" 101 | CONDA_INIT="$CONDA_ACTIVATE_D/init.sh" 102 | CONDA_LIB="$CONDA_BIN/../lib" 103 | if [[ $(find $CONDA_LIB -name '*egg-info*' -not -perm -o+r | wc -l ) > 0 ]]; then 104 | find $CONDA_LIB -name '*egg-info*' -not -perm -o+r -exec dirname {} \; | xargs chmod o+r -R 105 | fi 106 | 107 | mkdir -p $CONDA_EXTRA $CONDA_ACTIVATE_D 108 | 109 | ### uninstall IDR 2.0.4 and install the latest one 110 | conda uninstall idr -y 111 | cd $CONDA_EXTRA 112 | git clone --branch 2.0.4.2 git://github.com/kundajelab/idr 113 | cd idr 114 | python3 setup.py install 115 | cd $CONDA_EXTRA 116 | rm -rf idr 117 | 118 | ### disable locally installed python package lookup 119 | CONTENTS=("export PYTHONNOUSERSITE=True") 120 | add_to_activate 121 | CONTENTS=("export PYTHONPATH=$CONDA_LIB/python3.5/site-packages:\$PYTHONPATH") 122 | 
add_to_activate 123 | 124 | # install GEM 125 | if [[ ${INSTALL_GEM} == 1 ]]; then 126 | cd $CONDA_EXTRA 127 | wget http://groups.csail.mit.edu/cgs/gem/download/gem.v3.0.tar.gz -N --no-check-certificate 128 | tar zxvf gem.v3.0.tar.gz 129 | rm -f gem.v3.0.tar.gz 130 | cd gem 131 | chmod 755 gem.jar 132 | cd $CONDA_BIN 133 | ln -s $CONDA_EXTRA/gem/gem.jar 134 | fi 135 | 136 | source deactivate 137 | 138 | 139 | echo == Installing dependencies has been successfully done. == 140 | -------------------------------------------------------------------------------- /modules/align_bowtie2.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "align_multimapping.bds" 6 | include "module_template.bds" 7 | 8 | 9 | help == align bowtie2 settings (requirements: -bwt2_idx) 10 | bwt2_idx := "" help Bowtie2 index (full path prefix of *.1.bt2 file). 11 | scoremin_bwt2 := "" help Replacement --score-min for bowtie2. 12 | wt_bwt2 := "47h" help Walltime for bowtie2 (default: 47h, 47:00:00). 13 | mem_bwt2 := "12G" help Max. memory for bowtie2 (default: 12G). 14 | extra_param_bwt2:= "" help Extra parameter for bowtie2. 15 | no_idx_on_mem_bwt2 := false help Disable loading index on memory by removing --mm flag for bowtie2. 16 | 17 | grp_color_bwt2 := "salmon" 18 | 19 | 20 | init_align_bwt2() 21 | 22 | 23 | void init_align_bwt2() { 24 | 25 | bwt2_idx = get_conf_val( bwt2_idx, ["bwt2_idx"] ) 26 | scoremin_bwt2 = get_conf_val( scoremin_bwt2, ["scoremin_bwt2"] ) 27 | wt_bwt2 = get_conf_val( wt_bwt2, ["wt_bwt2"] ) 28 | mem_bwt2 = get_conf_val( mem_bwt2, ["mem_bwt2"] ) 29 | extra_param_bwt2= get_conf_val( extra_param_bwt2,["extra_param_bwt2"] ) 30 | no_idx_on_mem_bwt2 = get_conf_val_bool( no_idx_on_mem_bwt2, ["no_idx_on_mem_bwt2"] ) 31 | 32 | print("\n\n== align bowtie2 settings\n") 33 | print( "Bowtie2 index\t\t\t: $bwt2_idx\n" ) 34 | print( "Replacement --score-min for bowtie2\t: $scoremin_bwt2\n" ) 35 | print( "Walltime (bowtie2)\t\t: $wt_bwt2\n") 36 | print( "Max. memory (bowtie2)\t\t: $mem_bwt2\n") 37 | print( "Extra param. (bowtie2)\t\t: $extra_param_bwt2\n") 38 | print( "Disable index on memory (bowtie2)\t: $no_idx_on_mem_bwt2\n") 39 | } 40 | 41 | void chk_align_bwt2() { 42 | 43 | if ( !path_exists("$bwt2_idx.1.bt2") && !path_exists("$bwt2_idx.1.bt2l") ) { 44 | error("Bowtie2 index (-bwt2_idx) doesn't exists! (file: $bwt2_idx.1.bt2 or $bwt2_idx.1.bt2l)\n") 45 | } 46 | } 47 | 48 | string[] bowtie2( string fastq, string o_dir, string log_o_dir, string group, int nth_bwt2 ) { 49 | 50 | prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir ) 51 | prefix2 := replace_dir( prefix, log_o_dir ) 52 | bam := "$prefix.bam" 53 | log := "$prefix2.align.log" 54 | param := multimapping>0 ? "-k $multimapping" : "" 55 | param3 := scoremin_bwt2 ? "--score-min $scoremin_bwt2" : "" 56 | 57 | in := [ fastq ] 58 | out := [ bam, log ] 59 | 60 | taskName:= "bowtie2 "+group 61 | cpus := (nth_bwt2==1) ? 
-1 : nth_bwt2; mem := get_res_mem(mem_bwt2,nth_bwt2); timeout := get_res_wt(wt_bwt2) 62 | 63 | wait_par( cpus ) 64 | 65 | tid := task( out<-in ) { 66 | 67 | sys $shcmd_init 68 | 69 | //sys bowtie2 $param -x $bwt2_idx --threads $nth_bwt2 -U <(zcat -f $fastq) 2> $log | \ 70 | // samtools view -Su /dev/stdin | sambamba sort -t 1 /dev/stdin -o $bam 71 | sys bowtie2 $extra_param_bwt2 $param $param3 --local -x $bwt2_idx --threads $nth_bwt2 -U <(zcat -f $fastq) 2> $log | \ 72 | samtools view -Su /dev/stdin | samtools sort - $prefix 73 | sys cat $log 74 | sys samtools index $bam 75 | 76 | sys $shcmd_finalize 77 | } 78 | 79 | register_par( tid, cpus ) 80 | 81 | add_task_to_graph( in, out, group, "BOWTIE2\\n(SE)", grp_color_bwt2 ) 82 | 83 | return out 84 | } 85 | 86 | string[] bowtie2_csem( string fastq, string o_dir, string log_o_dir, string group, int nth_bwt2 ) { 87 | 88 | prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir ) 89 | prefix2 := replace_dir( prefix, log_o_dir ) 90 | sam := "$prefix.sam" 91 | log := "$prefix2.align.log" 92 | srt_bam := "$prefix.csem.sorted.bam" 93 | srt_bam_prefix := "$prefix.csem" 94 | bam := "$prefix.csem.bam" 95 | param := multimapping ? "-k $multimapping" : "" 96 | param3 := scoremin_bwt2 ? "--score-min $scoremin_bwt2" : "" 97 | 98 | in := [ fastq ] 99 | out := [ bam, log ] 100 | 101 | taskName:= "bowtie2_csem "+group 102 | cpus := (nth_bwt2==1) ? -1 : nth_bwt2; mem := get_res_mem(mem_bwt2,nth_bwt2); timeout := get_res_wt(wt_bwt2) 103 | 104 | wait_par( cpus ) 105 | 106 | tid := task( out<-in ) { 107 | 108 | sys $shcmd_init 109 | 110 | sys bowtie2 $extra_param_bwt2 $param $param3 --local -x $bwt2_idx --threads $nth_bwt2 -U <(zcat -f $fastq) 2> $log > $sam 111 | sys cat $log 112 | sys run-csem --sam -p $nth_bwt2 $sam 100 $srt_bam_prefix 113 | 114 | sys mv $srt_bam $bam 115 | 116 | sys sambamba index -t $nth_bwt2 $bam 117 | 118 | sys rm -f $sam 119 | 120 | sys $shcmd_finalize 121 | } 122 | 123 | register_par( tid, cpus ) 124 | 125 | add_task_to_graph( in, out, group, "BOWTIE2\\n(CSEM)", grp_color_bwt2 ) 126 | 127 | return out 128 | } 129 | 130 | string[] bowtie2_PE( string fastq1, string fastq2, string o_dir, string log_o_dir, string group, int nth_bwt2 ) { 131 | 132 | prefix := replace_dir( rm_ext( fastq1, ["fastq","fq"] ), o_dir ) + ".PE2SE" 133 | prefix2 := replace_dir( prefix, log_o_dir ) 134 | bam := "$prefix.bam" 135 | log := "$prefix2.align.log" 136 | param := multimapping ? "-k $multimapping" : "" 137 | param3 := scoremin_bwt2 ? "--score-min $scoremin_bwt2" : "" 138 | param_mm := no_idx_on_mem_bwt2 ? "" : "--mm" 139 | 140 | in := [ fastq1, fastq2 ] 141 | out := [ bam, log ] 142 | 143 | taskName:= "bowtie2_PE "+group 144 | cpus := (nth_bwt2==1) ? 
-1 : nth_bwt2; mem := get_res_mem(mem_bwt2,nth_bwt2); timeout := get_res_wt(wt_bwt2) 145 | 146 | wait_par( cpus ) 147 | 148 | tid := task( out<-in ) { 149 | 150 | sys $shcmd_init 151 | 152 | //sys bowtie2 $param -X2000 $param_mm --threads $nth_bwt2 -x $bwt2_idx \ 153 | // -1 $fastq1 -2 $fastq2 2>$log | \ 154 | // samtools view -Su /dev/stdin | sambamba sort -t 1 /dev/stdin -o $bam 155 | sys bowtie2 $extra_param_bwt2 $param $param3 -X2000 $param_mm --local --threads $nth_bwt2 -x $bwt2_idx \ 156 | -1 $fastq1 -2 $fastq2 2>$log | \ 157 | samtools view -Su /dev/stdin | samtools sort - $prefix 158 | sys cat $log 159 | sys samtools index $bam 160 | 161 | sys $shcmd_finalize 162 | } 163 | 164 | register_par( tid, cpus ) 165 | 166 | add_task_to_graph( in, out, group, "BOWTIE2\\n(PE)", grp_color_bwt2 ) 167 | 168 | return out 169 | } 170 | -------------------------------------------------------------------------------- /modules/align_etc.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | 6 | string get_read_length_log( string fastq, string o_dir, string group ) { 7 | 8 | prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir ) 9 | log := "$prefix.read_length.txt" 10 | 11 | in := [ fastq ] 12 | out := log 13 | 14 | taskName:= "read_length "+group 15 | wait_par( cpus ) 16 | 17 | tid := task( out<-in ) { 18 | 19 | sys $shcmd_init 20 | 21 | sys python $(which get_read_length_from_fastq.py) $fastq > $log 22 | 23 | sys $shcmd_finalize 24 | } 25 | 26 | register_par( tid, cpus ) 27 | 28 | add_task_to_graph( in, out, group ) 29 | 30 | return out 31 | } 32 | -------------------------------------------------------------------------------- /modules/align_multimapping.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == align multimapping settings 8 | multimapping := 0 help # alignments reported for multimapping (default: 0). 9 | 10 | 11 | init_align_multimapping() 12 | 13 | 14 | void init_align_multimapping() { 15 | multimapping = get_conf_val_int( multimapping, ["multimapping"] ) 16 | 17 | print("\n\n== align multimapping settings\n") 18 | print( "# alignments reported for multimapping\t: $multimapping\n") 19 | } 20 | -------------------------------------------------------------------------------- /modules/align_trim_adapter.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == adapter trimmer settings 9 | adapter_err_rate := "0.10" help Maximum allowed adapter error rate (# errors divided by the length of the matching adapter region, default: 0.10). 10 | min_trim_len := 5 help Minimum trim length for cutadapt -m, throwing away processed reads shorter than this (default: 5). 11 | 12 | wt_trim := "23h" help Walltime for adapter trimming (default: 23h, 23:00:00). 13 | mem_trim := "12G" help Max. memory for adapter trimming (default: 12G). 
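// The settings above feed the cutadapt calls in trim_adapters() and
// trim_adapters_PE() below. A minimal single-end sketch of the generated
// command with the defaults above, assuming an explicitly given adapter
// (the sequence shown is illustrative only; in the pipeline it comes from
// the -adapter input or from detect_adapter.py):
//
//   cutadapt -m 5 -e 0.10 -a AGATCGGAAGAGC rep1.fastq.gz | gzip -nc > rep1.trim.fastq.gz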
14 | 
15 | 
16 | grp_color_trim_adapter := "darkorange"
17 | 
18 | 
19 | init_align_trim_adapter()
20 | 
21 | 
22 | void init_align_trim_adapter() {
23 |     adapter_err_rate = get_conf_val( adapter_err_rate, ["adapter_err_rate"] )
24 |     min_trim_len = get_conf_val_int( min_trim_len, ["min_trim_len"] )
25 |     wt_trim = get_conf_val( wt_trim, ["wt_trim"] )
26 |     mem_trim = get_conf_val( mem_trim, ["mem_trim"] )
27 | 
28 |     print("\n\n== adapter trimmer settings\n")
29 |     print( "Maximum allowed error rate for cutadapt\t: $adapter_err_rate\n")
30 |     print( "Minimum trim. length for cutadapt -m\t: $min_trim_len\n")
31 |     print( "Walltime (adapter trimming)\t\t: $wt_trim\n")
32 |     print( "Max. memory (adapter trimming)\t\t: $mem_trim\n")
33 | }
34 | 
35 | // also returns tid
36 | string[] detect_adapter( string fastq, string o_dir, string group ) {
37 | 
38 |     prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir )
39 |     log := "$prefix.adapter.txt"
40 | 
41 |     in := [ fastq ]
42 |     out := log
43 | 
44 |     taskName:= "detect_adapter $group"
45 | 
46 |     system := "local"
47 | 
48 |     tid := task( out<-in ) {
49 | 
50 |         sys $shcmd_init_py3
51 | 
52 |         sys python3 $(which detect_adapter.py) $fastq > $log
53 | 
54 |         sys $shcmd_finalize
55 |     }
56 | 
57 |     return [out, tid]
58 | }
59 | 
60 | string parse_adapter_log( string log ) {
61 |     string adapter
62 |     lines := log.readLines()
63 |     for ( int i=0; i<lines.size(); i++) {
64 |         if ( lines[i].indexOf("Adapter") > -1 ) {
65 |             if ( i+1>lines.size()-1 ) {
66 |                 adapter = ""
67 |             }
68 |             else {
69 |                 line := lines[i+1]
70 |                 arr := line.split("\t")
71 |                 if (arr.size()<3) adapter = ""
72 |                 else adapter = arr[2]
73 |             }
74 |             break;
75 |         }
76 |     }
77 |     //if ( adapter == "" ) error("No adapter found ($log)!")
78 |     return adapter
79 | }
80 | 
81 | string trim_adapters( string fastq, string adapter, string o_dir, string group, string graph_suffix ) {
82 |     if ( !adapter ) return fastq
83 | 
84 |     prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir )
85 |     p_gz := "$prefix"+".trim.fastq.gz"
86 | 
87 |     in := [ fastq ]
88 |     out := p_gz
89 | 
90 |     param_min_trim_len := min_trim_len==0 ? "" : "-m $min_trim_len"
91 | 
92 |     taskName:= "trim_adapters $group"
93 |     mem := get_res_mem(mem_trim,1); timeout := get_res_wt(wt_trim)
94 | 
95 |     wait_par( cpus )
96 | 
97 |     tid := task( out<-in ) {
98 | 
99 |         sys $shcmd_init
100 | 
101 |         sys cutadapt $param_min_trim_len -e $adapter_err_rate -a $adapter $fastq | gzip -nc > $p_gz
102 | 
103 |         sys $shcmd_finalize
104 |     }
105 | 
106 |     register_par( tid, cpus )
107 | 
108 |     add_task_to_graph( in, out, group, "CUT-\\nADAPT$graph_suffix", grp_color_trim_adapter )
109 | 
110 |     return out
111 | }
112 | 
113 | string[] trim_adapters_PE( string fastq1, string fastq2, string adapter1, string adapter2, string o_dir, \
114 |         string group, string graph_suffix ) {
115 |     if ( !adapter1 || !adapter2 ) return [fastq1, fastq2]
116 | 
117 |     prefix1 := replace_dir( rm_ext( fastq1, ["fastq","fq"] ), o_dir )
118 |     prefix2 := replace_dir( rm_ext( fastq2, ["fastq","fq"] ), o_dir )
119 |     p1 := "$prefix1.trim.fastq"
120 |     p2 := "$prefix2.trim.fastq"
121 |     p1_gz := "$p1.gz"
122 |     p2_gz := "$p2.gz"
123 | 
124 |     in := [ fastq1, fastq2 ]
125 |     out := [ p1_gz, p2_gz ]
126 | 
127 |     param_min_trim_len := min_trim_len==0 ?
"" : "-m $min_trim_len" 128 | 129 | taskName:= "trim_adapters_PE " + group 130 | mem := get_res_mem(mem_trim,1); timeout := get_res_wt(wt_trim) 131 | 132 | wait_par( cpus ) 133 | 134 | tid := task( out<-in ) { 135 | 136 | sys $shcmd_init 137 | 138 | sys cutadapt $param_min_trim_len -e $adapter_err_rate -a $adapter1 -A $adapter2 -o $p1 -p $p2 $fastq1 $fastq2 139 | 140 | sys gzip -f $p1 141 | sys gzip -f $p2 142 | 143 | sys $shcmd_finalize 144 | } 145 | 146 | register_par( tid, cpus ) 147 | 148 | add_task_to_graph( in, out, group, "CUT-\\nADAPT$graph_suffix", grp_color_trim_adapter ) 149 | 150 | return out 151 | } 152 | 153 | string trim_adapters_old( string fastq, string o_dir, string group, string graph_suffix ) { 154 | 155 | prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir ) 156 | p := "$prefix"+"_trimmed.fq" 157 | p_gz := "$p.gz" 158 | p2 := "$prefix"+"_trimmed.fastq" 159 | p2_gz := "$p2.gz" 160 | in := [ fastq ] 161 | out := p2_gz 162 | 163 | taskName:= "trim_adapters " + group 164 | mem := get_res_mem(mem_trim,1); timeout := get_res_wt(wt_trim) 165 | 166 | wait_par( cpus ) 167 | 168 | tid := task( out<-in ) { 169 | 170 | sys $shcmd_init 171 | 172 | sys trim_galore $fastq -o $o_dir --dont_gzip 173 | sys gzip -f $p 174 | sys mv $p_gz $p2_gz 175 | sys rm -f $p_gz $p 176 | 177 | sys $shcmd_finalize 178 | } 179 | 180 | register_par( tid, cpus ) 181 | 182 | add_task_to_graph( in, out, group, "TRIM GALORE\\n$graph_suffix", grp_color_trim_adapter ) 183 | 184 | return out 185 | } 186 | 187 | string[] trim_adapters_PE_old( string fastq1, string fastq2, string o_dir, string group, string graph_suffix ) { 188 | 189 | prefix1 := replace_dir( rm_ext( fastq1, ["fastq","fq"] ), o_dir ) 190 | prefix2 := replace_dir( rm_ext( fastq2, ["fastq","fq"] ), o_dir ) 191 | p1 := "$prefix1.trim.fastq" 192 | p2 := "$prefix2.trim.fastq" 193 | p1_gz := "$p1.gz" 194 | p2_gz := "$p2.gz" 195 | 196 | in := [ fastq1, fastq2 ] 197 | out := [ p1_gz, p2_gz ] 198 | 199 | taskName:= "trim_adapters_PE " + group 200 | mem := get_res_mem(mem_trim,1); timeout := get_res_wt(wt_trim) 201 | 202 | wait_par( cpus ) 203 | 204 | tid := task( out<-in ) { 205 | 206 | sys $shcmd_init 207 | 208 | sys cd $o_dir 209 | sys $(which trimAdapters.py) -a $fastq1 -b $fastq2 210 | sys gzip -f $p1 211 | sys gzip -f $p2 212 | 213 | sys $shcmd_finalize 214 | } 215 | 216 | register_par( tid, cpus ) 217 | 218 | add_task_to_graph( in, out, group, "TRIMADAPTOR.PY\\n$graph_suffix", grp_color_trim_adapter ) 219 | 220 | return out 221 | } 222 | -------------------------------------------------------------------------------- /modules/ataqc.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | // depends on external git repo: https://github.com/kundajelab/ataqc 5 | // needs to have $script_dir/ataqc/run_ataqc.py 6 | 7 | include "species.bds" 8 | 9 | 10 | help == ATAQC settings 11 | tss_enrich := "" help TSS enrichment bed for ataqc. 12 | dnase := "" help DNase bed (open chromatin region file) for ataqc. 13 | prom := "" help Promoter bed (promoter region file) for ataqc. 14 | enh := "" help Enhancer bed (enhancer region file) for ataqc. 15 | reg2map := "" help Reg2map (file with cell type signals) for ataqc. 16 | reg2map_bed := "" help Reg2map_bed (file of regions used to generate reg2map signals) for ataqc. 17 | roadmap_meta := "" help Roadmap metadata for ataqc. 18 | 19 | mem_ataqc := "20G" help Max. memory for ATAQC (default: 20G). 
20 | wt_ataqc := "47h" help Walltime for ATAQC (default: 47h, 47:00:00). 21 | 22 | grp_color_ataqc := "pink" 23 | 24 | init_ataqc() 25 | 26 | void init_ataqc() { 27 | 28 | tss_enrich = get_conf_val( tss_enrich, ["tss_enrich"] ) 29 | dnase = get_conf_val( dnase, ["dnase"] ) 30 | prom = get_conf_val( prom, ["prom"] ) 31 | enh = get_conf_val( enh, ["enh"] ) 32 | reg2map = get_conf_val( reg2map, ["reg2map"] ) 33 | reg2map_bed = get_conf_val( reg2map_bed, ["reg2map_bed"] ) 34 | roadmap_meta = get_conf_val( roadmap_meta, ["roadmap_meta"] ) 35 | 36 | mem_ataqc = get_conf_val( mem_ataqc, ["mem_ataqc"] ) 37 | wt_ataqc = get_conf_val( wt_ataqc, ["wt_ataqc"] ) 38 | 39 | if ( reg2map_bed == "" ) reg2map_bed = dnase 40 | 41 | print("\n\n== ATAQC settings\n") 42 | print( "TSS enrichment bed\t\t: $tss_enrich\n" ) 43 | print( "DNase bed for ataqc\t\t: $dnase\n" ) 44 | print( "Promoter bed for ataqc\t\t: $prom\n" ) 45 | print( "Enhancer bed for ataqc\t\t: $enh\n" ) 46 | print( "Reg2map for ataqc\t\t\t: $reg2map\n" ) 47 | print( "Reg2map_bed for ataqc\t\t: $reg2map_bed\n" ) 48 | print( "Roadmap metadata for ataqc\t: $roadmap_meta\n" ) 49 | print( "Max. memory for ATAQC\t\t\t: $mem_ataqc\n") 50 | print( "Walltime for ATAQC\t\t\t: $wt_ataqc\n") 51 | } 52 | 53 | bool chk_ataqc() { 54 | 55 | print("\nChecking parameters and data files for ATAQC. \n\n") 56 | disable_ataqc := false 57 | 58 | if ( species == "" ) { print("Warning: Genome name is missing ( '-species [GENOME_NAME; hg19, mm9, ...]' )!\n" ); disable_ataqc = true } 59 | if ( ref_fa == "" ) { print("Warning: Specify your reference genome .fa ('-ref_fa [FA]')!\n"); disable_ataqc = true } 60 | if ( tss_enrich == "" ) { print("Warning: TSS enrichment bed is missing ( '-tss_enrich' )!\n"); disable_ataqc = true } 61 | if ( dnase == "" ) { print("Warning: DNase bed is missing ( '-dnase' )!\n"); disable_ataqc = true } 62 | if ( blacklist == "" ) { print("Warning: Blacklist bed is missing ( '-blacklist' )!\n"); disable_ataqc = true } 63 | if ( prom == "" ) { print("Warning: Promoter bed is missing ( '-prom' )!\n"); disable_ataqc = true } 64 | if ( enh == "" ) { print("Warning: Enhancer bed is missing ( '-enh' )!\n"); disable_ataqc = true } 65 | // if ( reg2map == "" ) { print("Warning: reg2map is missing ( '-reg2map' )!\n"); disable_ataqc = true } 66 | // if ( reg2map_bed == "" ){ print("Warning: reg2map_bed is missing ( '-reg2map_bed' )!\n"); disable_ataqc = true } 67 | // if ( roadmap_meta == "" ) { print("Warning: Roadmap metadata are missing ( '-roadmap_meta' )!\n"); disable_ataqc = true } 68 | 69 | if ( disable_ataqc ) { 70 | print("\nDisabling ATAQC...\n") 71 | return false 72 | } 73 | return true 74 | } 75 | 76 | string[] ataqc( string fastq1, string fastq2, string bam, string align_log, string pbc_log, \ 77 | string dup_log, string filt_bam, string bed, string bigwig, string peak, \ 78 | string peak_naive_overlap, string idr_peak, string o_dir, string group ) { 79 | 80 | prefix := replace_dir( rm_ext( fastq1, ["fastq","fq"] ), o_dir ) + ( (fastq2!="") ? ".PE2SE" : "" ) 81 | 82 | html := "$prefix"+"_qc.html" 83 | txt := "$prefix"+"_qc.txt" 84 | prefix_basename := get_basename( prefix ) 85 | 86 | param_fastq := (fastq2!="") ? " --fastq1 $fastq1 --fastq2 $fastq2" : " --fastq1 $fastq1" 87 | param_overlap := (peak_naive_overlap!="") ? " --naive_overlap_peaks $peak_naive_overlap" : "" 88 | param_idr := (idr_peak!="") ? " --idr_peaks $idr_peak" : "" 89 | param_use_sambamba := (use_sambamba_markdup) ? 
" --use_sambamba_markdup" : "" 90 | 91 | param_reg2map := (reg2map!="") ? " --reg2map $reg2map" : "" 92 | param_reg2map_bed := (reg2map_bed!="") ? " --reg2map_bed $reg2map_bed" : "" 93 | param_meta := (roadmap_meta!="") ? " --meta $roadmap_meta" : "" 94 | 95 | species_ataqc := species.indexOf("_")>=0 ? (species.split("_"))[0] : species 96 | print("species_ataqc: $species_ataqc\n") 97 | 98 | in := (fastq2!="") ? [ fastq1, fastq2, bam, align_log, pbc_log, dup_log, filt_bam, bed, bigwig, peak ] \ 99 | : [ fastq1, bam, align_log, pbc_log, dup_log, filt_bam, bed, bigwig, peak ] 100 | out := [ html, txt ] //, gc_plot, hist_graph, lg_vplot, vplot, signal ] 101 | 102 | taskName:= "ataqc "+group 103 | mem := get_res_mem(mem_ataqc,1) 104 | max_java_heap := binary_prefix( (mem==-1) ? parse_mem( mem_ataqc ) : (mem*3)/4 ) 105 | timeout := get_res_wt(wt_ataqc) 106 | 107 | wait_par( cpus ) 108 | 109 | tid := task( out<-in ) { 110 | 111 | sys $shcmd_init 112 | 113 | // To prevent java heap space error (Exception in thread "main" java.lang.OutOfMemoryError: Java heap space) 114 | sys export _JAVA_OPTIONS="-Xms256M -Xmx$max_java_heap -XX:ParallelGCThreads=1" 115 | // sys if [ -d "${TMPDIR}" ]; then \ 116 | sys if [ "$java_tmp_dir" != "" ] && [ -d "$java_tmp_dir" ]; then \ 117 | export _JAVA_OPTIONS="${_JAVA_OPTIONS} -Djava.io.tmpdir=$java_tmp_dir"; \ 118 | fi 119 | sys cd $o_dir 120 | 121 | // # if PICARDROOT is not defined, then look into ../shared/picard* (default picard dir. in bioconda) 122 | sys if [ -f "$(which picard)" ]; then export PICARDROOT="$(dirname $(which picard))/../share/picard"*; fi 123 | 124 | sys $script_dir/ataqc/run_ataqc.py \ 125 | --workdir $o_dir \ 126 | --outdir $o_dir \ 127 | --outprefix $prefix_basename \ 128 | --genome $species_ataqc \ 129 | --chromsizes $chrsz \ 130 | --ref $ref_fa \ 131 | --tss $tss_enrich \ 132 | --dnase $dnase \ 133 | --blacklist $blacklist \ 134 | --prom $prom \ 135 | --enh $enh \ 136 | --pbc $pbc_log\ 137 | $param_fastq \ 138 | --alignedbam $bam \ 139 | --alignmentlog $align_log \ 140 | --coordsortbam $bam \ 141 | --duplog $dup_log \ 142 | --finalbam $filt_bam \ 143 | --finalbed $bed \ 144 | --bigwig $bigwig \ 145 | --peaks $peak $param_overlap $param_idr $param_use_sambamba \ 146 | $param_reg2map $param_reg2map_bed $param_meta 147 | sys rm -f test.log test.png 148 | 149 | sys $shcmd_finalize 150 | } 151 | 152 | register_par( tid, cpus ) 153 | 154 | add_task_to_graph( in, out, group, "ATAQC", grp_color_ataqc ) 155 | 156 | return out 157 | } 158 | 159 | -------------------------------------------------------------------------------- /modules/callpeak_bigbed.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | string peak_to_bigbed( string filetype, string peak, string o_dir, string group ) { 9 | 10 | prefix := replace_dir( rm_ext( peak, \ 11 | ["narrowPeak","narrowpeak",\ 12 | "broadPeak","broadpeak",\ 13 | "gappedPeak","gappedpeak",filetype] )\ 14 | , o_dir ) 15 | bigbed := "$prefix.$filetype.bb" 16 | 17 | bed_param := _get_bed_param( filetype ) 18 | 19 | in := [ peak ] 20 | out := bigbed 21 | 22 | taskName:= "peak_to_bigbed " + group 23 | system := "local" 24 | 25 | wait_par( cpus ) 26 | 27 | tid := task( out<-in ) { 28 | 29 | sys $shcmd_init 30 | 31 | sys cat $chrsz | grep -P 'chr[\dXY]+[ \t]' > $bigbed.chrsz.tmp 32 | sys zcat $peak | sort -k1,1 -k2,2n > $bigbed.tmp 33 | sys bedClip $bigbed.tmp 
$bigbed.chrsz.tmp $bigbed.tmp2 34 | 35 | sys bedToBigBed $bed_param $bigbed.tmp2 $bigbed.chrsz.tmp $bigbed 36 | sys rm -f $bigbed.tmp $bigbed.tmp2 $bigbed.chrsz.tmp 37 | 38 | sys $shcmd_finalize 39 | } 40 | 41 | register_par( tid, cpus ) 42 | 43 | return out 44 | } 45 | 46 | string _get_bed_param( string filetype ) { 47 | 48 | if ( filetype.toLower() == "narrowpeak" ) { 49 | return "-type=bed6+4 -as=$script_dir/etc/narrowPeak.as" 50 | } 51 | else if ( filetype.toLower() == "broadpeak") { 52 | return "-type=bed6+3 -as=$script_dir/etc/broadPeak.as" 53 | } 54 | else if ( filetype.toLower() == "gappedpeak") { 55 | return "-type=bed12+3 -as=$script_dir/etc/gappedPeak.as" 56 | } 57 | else { 58 | error("Unsupported peak file type! ($filetype)\n") 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /modules/callpeak_blacklist_filter.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | string blacklist_filter_peak( string filetype, string peak, string o_dir, string group ) { 9 | 10 | prefix := replace_dir( rm_ext( peak, \ 11 | ["narrowPeak","narrowpeak",\ 12 | "broadPeak","broadpeak",\ 13 | "regionPeak","regionpeak",\ 14 | "gappedPeak","gappedpeak",filetype] )\ 15 | , o_dir ) 16 | filtered:= "$prefix.filt.$filetype.gz" 17 | 18 | in := [ peak ] 19 | out := filtered 20 | 21 | taskName:= "blacklist_filter " + group 22 | //timeout := 3600 // to get queued fast 23 | system := "local" 24 | 25 | wait_par( cpus ) 26 | 27 | tid := task( out<-in ) { 28 | 29 | sys $shcmd_init 30 | 31 | sys bedtools intersect -v -a <(zcat -f $peak) -b <(zcat -f $blacklist) \ 32 | | awk 'BEGIN{OFS="\t"} {if ($5>1000) $5=1000; print $0}' \ 33 | | grep -P 'chr[\dXY]+[ \t]' | gzip -nc > $filtered 34 | 35 | sys $shcmd_finalize 36 | } 37 | 38 | register_par( tid, cpus ) 39 | 40 | return out 41 | } 42 | -------------------------------------------------------------------------------- /modules/callpeak_macs2_atac.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == callpeak macs2 settings (requirements: -chrsz -gensz) 9 | gensz := "" help Genome size; hs for human, mm for mouse. 10 | wt_macs2 := "23h" help Walltime for MACS2 (default: 23h, 23:00:00). 11 | mem_macs2 := "15G" help Max. memory for MACS2 (default: 15G). 12 | cap_num_peak_macs2 := "300K" help Cap number of peaks by taking top N peaks for MACS2 (default: 300K). 13 | extra_param_macs2 := "" help Extra parameters for macs2 callpeak. 14 | 15 | 16 | grp_color_macs2 := "lightgreen" 17 | 18 | 19 | init_callpeak_macs2() 20 | 21 | 22 | void init_callpeak_macs2() { 23 | 24 | gensz = get_conf_val( gensz, ["gensz"]) 25 | wt_macs2 = get_conf_val( wt_macs2, ["wt_macs2"] ) 26 | mem_macs2 = get_conf_val( mem_macs2, ["mem_macs2"] ) 27 | cap_num_peak_macs2 = get_conf_val( cap_num_peak_macs2, ["cap_num_peak_macs2"] ) 28 | extra_param_macs2 = get_conf_val( extra_param_macs2, ["extra_param_macs2"] ) 29 | 30 | print("\n\n== callpeak macs2 settings\n") 31 | print( "Genome size (hs,mm)\t\t: $gensz\n" ) 32 | print( "Walltime (macs2)\t\t: $wt_macs2\n") 33 | print( "Max. 
memory (macs2)\t\t: $mem_macs2\n") 34 | print( "Cap number of peaks (macs2)\t: $cap_num_peak_macs2\n") 35 | print( "Extra parameters for macs2 callpeak\t\t: $extra_param_macs2\n") 36 | } 37 | 38 | void chk_callpeak_macs2() { 39 | 40 | if ( gensz == "" ) error("Genome size (-gensz) must be defined! (mm for mouse, hs for human)") 41 | if ( !path_exists( chrsz ) ) error("Chromosome size file (-chrsz) is missing! (file: $chrsz)") 42 | } 43 | 44 | string[] macs2_atac_npeak_and_signal( string tag, string smooth_window, string pval_thresh, bool make_sig, \ 45 | string peak_o_dir, string sig_o_dir, string group ) { 46 | 47 | prefix := replace_dir( rm_ext( tag, ["bed","tagAlign"] ), peak_o_dir ) + ".pf" 48 | prefix_sig := replace_dir( prefix, sig_o_dir ) 49 | int_cap_num_peak_macs2 := parse_number( cap_num_peak_macs2 ) 50 | // peaks 51 | peakfile := "$prefix.pval$pval_thresh.$cap_num_peak_macs2.narrowPeak.gz" 52 | 53 | fc_bedgraph := "$prefix.fc.signal.bedgraph" 54 | fc_bedgraph_srt := "$prefix.fc.signal.srt.bedgraph" 55 | fc_bigwig := "$prefix_sig.fc.signal.bigwig" 56 | 57 | pval_bedgraph := "$prefix.pval.signal.bedgraph" 58 | pval_bedgraph_srt := "$prefix.pval.signal.srt.bedgraph" 59 | pval_bigwig := "$prefix_sig.pval.signal.bigwig" 60 | 61 | shiftsize := round( smooth_window.parseReal()/2.0 ) 62 | 63 | blacklist_exists := path_exists(blacklist) 64 | 65 | in := [ tag ] 66 | // out := make_sig ? [ peakfile, gpeakfile, fc_bigwig, pval_bigwig ] : [ peakfile, gpeakfile ] 67 | out := make_sig ? [ peakfile, fc_bigwig, pval_bigwig ] : [ peakfile ] 68 | 69 | taskName:= "macs2 n/s " + group 70 | mem := get_res_mem(mem_macs2,1); timeout := get_res_wt(wt_macs2) 71 | 72 | wait_par( cpus ) 73 | 74 | tid := task( out<-in ) { 75 | 76 | sys $shcmd_init 77 | sys export LC_COLLATE=C 78 | 79 | sys macs2 callpeak \ 80 | -t $tag -f BED -n "$prefix" -g "$gensz" -p $pval_thresh \ 81 | --nomodel --shift -$shiftsize --extsize $smooth_window -B --SPMR --keep-dup all --call-summits $extra_param_macs2 82 | 83 | //# Sort by Col8 in descending order and replace long peak names in Column 4 with Peak_ 84 | // sys sort -k 8gr,8gr "$prefix"_peaks.narrowPeak | awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}' | head -n $int_cap_num_peak_macs2 | gzip -nc > $peakfile 85 | sys sort -k 8gr,8gr "$prefix"_peaks.narrowPeak | awk 'BEGIN{OFS="\t"}{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; print $0}' > $peakfile.tmp 86 | // sys zcat -f "$prefix"_peaks.narrowPeak | sort -k 8gr,8gr | head -n $int_cap_num_peak_macs2 | awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}' | gzip -nc > $peakfile 87 | sys head -n $int_cap_num_peak_macs2 $peakfile.tmp | gzip -nc > $peakfile 88 | sys rm -f $peakfile.tmp 89 | sys rm -f "$prefix"_peaks.narrowPeak 90 | sys rm -f "$prefix"_summits.bed 91 | 92 | sys if [[ $make_sig == "false" ]]; then \ 93 | rm -f "$prefix"_treat_pileup.bdg "$prefix"_control_lambda.bdg; \ 94 | $shcmd_finalize; \ 95 | exit; \ 96 | fi 97 | 98 | sys macs2 bdgcmp -t "$prefix"_treat_pileup.bdg -c "$prefix"_control_lambda.bdg \ 99 | --o-prefix "$prefix" -m FE 100 | sys slopBed -i "$prefix"_FE.bdg -g "$chrsz" -b 0 | bedClip stdin "$chrsz" $fc_bedgraph 101 | sys rm -f "$prefix"_FE.bdg 102 | 103 | sys LC_COLLATE=C sort -S 4G -k1,1 -k2,2n $fc_bedgraph > $fc_bedgraph_srt 104 | sys bedGraphToBigWig $fc_bedgraph_srt "$chrsz" "$fc_bigwig" 105 | sys rm -f $fc_bedgraph $fc_bedgraph_srt 106 | 107 | //# sval counts the number of tags per million in the (compressed) BED file 108 | sys sval=$(wc -l <(zcat -f "$tag") | awk '{printf "%f", $1/1000000}') 109 | 110 | 
sys macs2 bdgcmp \ 111 | -t "$prefix"_treat_pileup.bdg -c "$prefix"_control_lambda.bdg \ 112 | --o-prefix "$prefix" -m ppois -S "${sval}" 113 | sys slopBed -i "$prefix"_ppois.bdg -g "$chrsz" -b 0 | bedClip stdin "$chrsz" $pval_bedgraph 114 | sys rm -f "$prefix"_ppois.bdg 115 | 116 | sys LC_COLLATE=C sort -S 4G -k1,1 -k2,2n $pval_bedgraph > $pval_bedgraph_srt 117 | sys bedGraphToBigWig $pval_bedgraph_srt "$chrsz" "$pval_bigwig" 118 | sys rm -f $pval_bedgraph $pval_bedgraph_srt 119 | 120 | sys rm -f "$prefix"_treat_pileup.bdg "$prefix"_control_lambda.bdg 121 | 122 | sys $shcmd_finalize 123 | } 124 | 125 | register_par( tid, cpus ) 126 | 127 | add_task_to_graph( in, out, group, "MACS2\\np-val<$pval_thresh", grp_color_macs2 ) 128 | 129 | return out 130 | } 131 | 132 | string[] macs2_atac_gpeak_and_bpeak( string tag, string smooth_window, string pval_thresh, \ 133 | string peak_o_dir, string group ) { 134 | 135 | prefix := replace_dir( rm_ext( tag, ["bed","tagAlign"] ), peak_o_dir ) + ".pf" 136 | // peaks 137 | bpeakfile := "$prefix.pval$pval_thresh.$cap_num_peak_macs2.broadPeak.gz" 138 | gpeakfile := "$prefix.pval$pval_thresh.$cap_num_peak_macs2.gappedPeak.gz" 139 | int_cap_num_peak_macs2 := parse_number( cap_num_peak_macs2 ) 140 | // peaks 141 | 142 | shiftsize := round( smooth_window.parseReal()/2.0 ) 143 | 144 | blacklist_exists := path_exists(blacklist) 145 | 146 | in := [ tag ] 147 | // out := make_sig ? [ peakfile, gpeakfile, fc_bigwig, pval_bigwig ] : [ peakfile, gpeakfile ] 148 | out := [ gpeakfile, bpeakfile ] 149 | 150 | taskName:= "macs2 g/b " + group 151 | mem := get_res_mem(mem_macs2,1); timeout := get_res_wt(wt_macs2) 152 | 153 | wait_par( cpus ) 154 | 155 | tid := task( out<-in ) { 156 | 157 | sys $shcmd_init 158 | sys export LC_COLLATE=C 159 | 160 | // .tmp is to prevent file race condition with macs2_atac_npeak_and_signal 161 | sys macs2 callpeak \ 162 | -t $tag -f BED -n "$prefix.tmp" -g "$gensz" -p $pval_thresh \ 163 | --nomodel --shift -$shiftsize --extsize $smooth_window --broad --keep-dup all $extra_param_macs2 164 | 165 | //# Sort by Col8 in descending order and replace long peak names in Column 4 with Peak_ 166 | sys sort -k 8gr,8gr "$prefix.tmp"_peaks.broadPeak | awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}' > $bpeakfile.tmp 167 | sys sort -k 14gr,14gr "$prefix.tmp"_peaks.gappedPeak | awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}' > $gpeakfile.tmp 168 | sys head -n $int_cap_num_peak_macs2 $bpeakfile.tmp | gzip -nc > $bpeakfile 169 | sys head -n $int_cap_num_peak_macs2 $gpeakfile.tmp | gzip -nc > $gpeakfile 170 | sys rm -f $bpeakfile.tmp $gpeakfile.tmp 171 | 172 | sys rm -f "$prefix.tmp"_peaks.broadPeak 173 | sys rm -f "$prefix.tmp"_peaks.gappedPeak 174 | sys rm -f "$prefix.tmp"_peaks.xls 175 | sys rm -f "$prefix.tmp"_summits.bed 176 | sys rm -f "$prefix.tmp"_treat_pileup.bdg "$prefix.tmp"_control_lambda.bdg 177 | 178 | sys $shcmd_finalize 179 | } 180 | 181 | register_par( tid, cpus ) 182 | 183 | add_task_to_graph( in, out, group, "MACS2\\np-val<$pval_thresh", grp_color_macs2 ) 184 | 185 | return out 186 | } 187 | -------------------------------------------------------------------------------- /modules/cluster.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == cluster/system/resource settings 8 | wt := "5h50m" help Walltime for all single-threaded tasks (example: 8:10:00, 3h, 3600, default: 5h50m, 5:50:00). 
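// These walltime/memory settings are global defaults only: init_cluster() below
// maps them onto BDS's pre-declared per-task resources (timeout/mem, via
// parse_time()/parse_mem() on cluster systems; unlimited on local runs), and
// module-level settings such as -wt_bwt2 or -mem_macs2 take precedence for
// their own tasks. Illustrative override:
//
//   bds atac.bds -wt 24h -memory 20G ...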
9 | memory := "7G"  help Maximum memory for all single-threaded tasks (equivalent to '-mem', example: 4.5G, 1024M, default: 7G).
10 | use_system := "local"  help Force to use a system (equivalent to 'bds -s [SYSTEM_NAME] ...', any system defined in bds.config can be used).
11 | nice := 0  help Set process priority for all tasks (default: 0; -20 (highest) ~ 19 (lowest) ).
12 | retrial := 0  help # of retrials for failed tasks (default: 0).
13 | q := ""  help Submit tasks to a specified cluster queue.
14 | q_for_slurm_account := false  help Use --account instead of -p (partition) for SLURM only.
15 | unlimited_mem_wt:= false  help Use unlimited max. memory and walltime.
16 | java_tmp_dir := "\${TMPDIR}"  help Java temporary directory (change it when you get a 'Disk quota exceeded' error from Java, default: ${TMPDIR}).
17 | 
18 | init_cluster()
19 | 
20 | 
21 | void init_cluster() {
22 |     wt = get_conf_val( wt, ["wt"] )
23 |     memory = get_conf_val( memory, ["memory","mem"] )
24 |     use_system = get_conf_val( use_system, ["use_system","system"] )
25 |     nice = get_conf_val_int( nice, ["nice"] )
26 |     retrial = get_conf_val_int( retrial, ["retrial","retry"] )
27 |     q = get_conf_val( q, ["q"] )
28 |     unlimited_mem_wt= get_conf_val_bool( unlimited_mem_wt, ["unlimited_mem_wt"] )
29 |     q_for_slurm_account= get_conf_val_bool( q_for_slurm_account, ["q_for_slurm_account"] )
30 |     java_tmp_dir = get_conf_val( java_tmp_dir, ["java_tmp_dir"] )
31 | 
32 |     if ( cmd_line_arg_has_key("mem") ) memory = get_cmd_line_arg_val( "mem" )
33 |     if ( cmd_line_arg_has_key("system") ) use_system = get_cmd_line_arg_val( "system" )
34 |     if ( nice <= -20 ) nice = -20
35 |     if ( nice > 19 ) nice = 19
36 |     if ( use_system != "" ) system = use_system.toLower()
37 |     if ( system == "slurm" || system == "generic" ) { // for new SCG, which uses --account instead of -p (partition)
38 |         system = "generic"
39 |         if ( q != "" ) {
40 |             if ( q_for_slurm_account ) {
41 |                 queue = "--account $q"
42 |             }
43 |             else {
44 |                 queue = "-p $q"
45 |             }
46 |         }
47 |     }
48 |     else if ( q != "" ) {
49 |         queue = q
50 |     }
51 | 
52 |     // cpus, mem and timeout are pre-declared BDS variables for default resource settings
53 |     mem = get_res_mem(memory,1)
54 |     timeout = get_res_wt(wt)
55 |     retry = retrial
56 | 
57 |     // do not modify this (BDS timeout; how long BDS will wait for tasks to be queued on the cluster)
58 |     walltimeout = 3600*24*100 // timeout var. in BigDataScript (100 days, jobs will never be stopped by BDS due to BDS timeout)
59 | 
60 |     print("\n\n== cluster/system info\n")
61 |     print( "Walltime (general)\t\t: $wt\n" )
62 |     print( "Max. memory (general)\t\t: $memory\n" )
63 |     print( "Force to use a system\t\t: $use_system\n" )
64 |     print( "Process priority (niceness)\t: $nice\n" )
65 |     print( "Retrial for failed tasks\t: $retrial\n" )
66 |     print( "Submit tasks to a cluster queue\t: $q\n" )
67 |     print( "Unlimited cluster mem./walltime\t: $unlimited_mem_wt\n")
68 |     print( "Use --account instead of SLURM partition\t\t: $q_for_slurm_account\n")
69 |     print( "Java temporary directory\t\t: $java_tmp_dir\n")
70 | }
71 | 
72 | int get_res_wt( string str ) {
73 |     return (unlimited_mem_wt || is_system_local() ) ? -1 : parse_time( str )
74 | }
75 | 
76 | int get_res_mem( string str, int n ) {
77 |     if ( n < 1 ) n = 1
78 |     return (unlimited_mem_wt || is_system_local() ) ?
-1 : parse_mem( str )/n 79 | } 80 | 81 | int get_res_mem( string str ) { 82 | return get_res_mem( str , 1 ) 83 | } 84 | 85 | bool is_system_sge() { 86 | return system == "sge" 87 | } 88 | 89 | bool is_system_local() { 90 | return system == "local" 91 | } 92 | 93 | bool is_system_generic() { 94 | return system == "generic" 95 | } 96 | 97 | bool is_system_slurm() { 98 | // slurm uses generic cluster, it's configured in bds.config and ./utils/clusterGeneral 99 | return system == "generic" 100 | } 101 | -------------------------------------------------------------------------------- /modules/conf.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "sys.bds" 5 | 6 | 7 | help == configuration file settings 8 | c := "" help Configuration file path. 9 | env := "$script_dir/default.env" help Environment file path. 10 | 11 | 12 | string{} conf // map for configuration 13 | 14 | 15 | init_conf() 16 | 17 | 18 | void init_conf() { 19 | if ( is_cmd_line_arg_empty() ) \ 20 | print( "\nWarning: No parameters are given (specify cmd. line arguments or configuration file)!\n\n") 21 | if ( is_first_arg_conf() ) c = args[0] 22 | 23 | add_to_conf( c, "" ) // then read conf. file 24 | env = get_conf_val( env, ["env"] ) 25 | if ( path_exists( env ) ) add_to_conf( env, hostname ) 26 | add_to_conf( c, "" ) // read conf again to override 27 | 28 | print( "\n\n== configuration file info\n") 29 | print( "Hostname\t\t\t: $hostname\n") 30 | print( "Configuration file\t\t: $c\n" ) 31 | print( "Environment file\t\t: $env\n" ) 32 | } 33 | 34 | string{} read_conf( string file, string section ) { 35 | section = section.trim() 36 | string{} ret 37 | 38 | if ( file == "" ) return ret 39 | lines := file.read().split("\n") 40 | 41 | can_read := (section=="") ? true : false 42 | found_section := (section=="") ? true : false 43 | for ( string line : lines ) { 44 | line = rm_comment( line.trim() ) 45 | if ( line == "" ) continue 46 | 47 | if ( line.startsWith( "[" ) && line.endsWith( "]" ) ) { 48 | line2 := line.substr(1,line.length()-1) 49 | string[] hostnames 50 | string group 51 | // find group if exists 52 | arr := line2.split(":") 53 | if ( arr.size() > 1 ) group = arr[1].trim() 54 | hostnames = arr[0].split(",") 55 | if ( section == "" ) { 56 | can_read = false 57 | } 58 | else { 59 | for ( string host : hostnames ) { 60 | host = host.trim() 61 | if ( match_str( section, host ) ) { // one asterisk (wildcard chr: *) is allowed in hostname string 62 | if ( section == group ) { 63 | error("Recursion (section name == group) found in a conf. or an env. 
file!"+\ 64 | " (file: $file, section: $section, group: $group)\n") 65 | } 66 | else if ( group != "" ) { 67 | print("\tReading parameters from section group($group) in file($file)...\n") 68 | return read_conf( file, group ) 69 | } 70 | else { 71 | print("\tReading parameters from section ($host) in file($file)...\n") 72 | found_section = true 73 | can_read = true 74 | break; 75 | } 76 | } 77 | else { 78 | can_read = false 79 | } 80 | } 81 | } 82 | continue 83 | } 84 | 85 | if ( can_read ) { 86 | string key, val 87 | (key, val) = parse_conf_line( line ) 88 | ret{ key } = val 89 | } 90 | } 91 | if ( !found_section && section != "default" ) return read_conf( file, "default" ) 92 | 93 | return ret 94 | } 95 | 96 | string{} read_conf( string file ) { 97 | return read_conf( file, "" ) 98 | } 99 | 100 | void add_to_conf( string file, string section ) { 101 | 102 | tmp := read_conf( file, section ) 103 | 104 | for( string k : tmp.keys() ) conf{k} = tmp{k} 105 | } 106 | 107 | void add_to_conf( string file ) { 108 | tmp := read_conf( file ) 109 | for( string k : tmp.keys() ) { 110 | conf{k} = tmp{k} 111 | } 112 | } 113 | 114 | string[] parse_conf_line( string line ) { 115 | delims := [ "=", "\t" ] 116 | delim_found := false 117 | string key, val 118 | for ( string delim : delims ) { 119 | idx := line.indexOf( delim ) 120 | if ( idx > -1 ) { 121 | key = line.substr( 0, idx ).trim().toLower() 122 | val = line.substr( idx+1 ).trim() 123 | delim_found = true 124 | break 125 | } 126 | } 127 | if ( !delim_found ) error("No delimiter (=,\\t) found in line ($line) in the configruation file.\n") 128 | return [key, val] 129 | } 130 | 131 | int get_conf_val_int( int curr_val, string key ) { 132 | string{} tmp 133 | return parse_int( get_conf_val( curr_val, key, tmp ) ) 134 | } 135 | 136 | int get_conf_val_int( int curr_val, string[] keys ) { 137 | string{} tmp 138 | return parse_int( get_conf_val( curr_val, keys, tmp ) ) 139 | } 140 | 141 | bool get_conf_val_bool( bool curr_val, string key ) { 142 | string{} tmp 143 | return parse_bool( get_conf_val( curr_val, key, tmp ) ) 144 | } 145 | 146 | bool get_conf_val_bool( bool curr_val, string[] keys ) { 147 | string{} tmp 148 | return parse_bool( get_conf_val( curr_val, keys, tmp ) ) 149 | } 150 | 151 | real get_conf_val_real( real curr_val, string key ) { 152 | string{} tmp 153 | return parse_real( get_conf_val( curr_val, key, tmp ) ) 154 | } 155 | 156 | real get_conf_val_real( real curr_val, string[] keys ) { 157 | string{} tmp 158 | return parse_real( get_conf_val( curr_val, keys, tmp ) ) 159 | } 160 | 161 | int get_conf_val_int( int curr_val, string key, string{} _conf ) { 162 | return parse_int( get_conf_val( curr_val, key, _conf ) ) 163 | } 164 | 165 | int get_conf_val_int( int curr_val, string[] keys, string{} _conf ) { 166 | return parse_int( get_conf_val( curr_val, keys, _conf ) ) 167 | } 168 | 169 | bool get_conf_val_bool( bool curr_val, string key, string{} _conf ) { 170 | return parse_bool( get_conf_val( curr_val, key, _conf ) ) 171 | } 172 | 173 | bool get_conf_val_bool( bool curr_val, string[] keys, string{} _conf ) { 174 | return parse_bool( get_conf_val( curr_val, keys, _conf ) ) 175 | } 176 | 177 | real get_conf_val_real( real curr_val, string key, string{} _conf ) { 178 | return parse_real( get_conf_val( curr_val, key, _conf ) ) 179 | } 180 | 181 | real get_conf_val_real( real curr_val, string[] keys, string{} _conf ) { 182 | return parse_real( get_conf_val( curr_val, keys, _conf ) ) 183 | } 184 | 185 | string get_conf_val( string curr_val, 
string key, string{} _conf ) { 186 | key = key.toLower().trim() 187 | if ( cmd_line_arg_has_key( key ) ) return curr_val 188 | if ( _conf.size() == 0 ) { 189 | if ( conf.hasKey( key ) ) { 190 | return (conf{ key } != "") ? substitute_var( rm_comment( conf{ key } ) ) : curr_val 191 | } 192 | } 193 | else { 194 | if ( _conf.hasKey( key ) ) { 195 | return (_conf{ key } != "") ? substitute_var( rm_comment( _conf{ key } ) ) : curr_val 196 | } 197 | } 198 | return curr_val 199 | } 200 | 201 | string substitute_var( string var ) { 202 | var = var.replace("\$script_dir","$script_dir").replace("\${script_dir}","$script_dir") 203 | var = var.replace("~/","$HOME/").replace("\$HOME","$HOME").replace("\${HOME}","$HOME") 204 | return var 205 | } 206 | 207 | string get_conf_val( string curr_val, string[] keys, string{} _conf ) { 208 | for ( string key : keys ) { 209 | val := get_conf_val( curr_val, key, _conf ) 210 | if ( val != curr_val ) return val 211 | } 212 | return curr_val 213 | } 214 | 215 | string get_conf_val( string curr_val, string key ) { 216 | string{} tmp 217 | return get_conf_val( curr_val, key, tmp ) 218 | } 219 | 220 | string get_conf_val( string curr_val, string[] keys ) { 221 | string{} tmp 222 | return get_conf_val( curr_val, keys, tmp ) 223 | } 224 | 225 | bool has_conf_key( string key, string{} _conf ) { 226 | key = key.toLower() 227 | return (_conf.size()==0) ? conf.hasKey( key ) : _conf.hasKey( key ) 228 | } 229 | 230 | bool has_conf_key( string key ) { 231 | string{} tmp 232 | return has_conf_key( key ) 233 | } 234 | 235 | bool conf_file_exists() { 236 | if ( c!="" ) return c.exists() 237 | return false 238 | } 239 | 240 | bool has_key_in_conf_or_cmd_line( string key ) { 241 | return cmd_line_arg_has_key( key )// || has_conf_key( key ) 242 | } 243 | 244 | -------------------------------------------------------------------------------- /modules/env.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "cluster.bds" 5 | 6 | help == shell environment settings 7 | mod := "" help Modules separated by ; (example: "bowtie/2.2.4; bwa/0.7.7; picard-tools/1.92"). 8 | shcmd := "" help Shell commands separated by ;. Shell var. must be written as ${VAR} not as $VAR (example: "export PATH=${PATH}:/usr/test; VAR=test"). 9 | addpath := "" help Path separated by ; or : to be PREPENDED to \$PATH (example: "/bin/test:${HOME}/utils"). 10 | conda_env := "" help Anaconda Python (or Miniconda) environment name for all softwares including Python2. 11 | conda_env_py3 := "" help Anaconda Python (or Miniconda) environment name for Python3. 12 | conda_bin_dir := "" help Anaconda Python (or Miniconda) bin directory. 13 | cluster_task_min_len := 60 help Minimum length for a cluster job in seconds (dealing with NFS delayed write, default: 60). 14 | cluster_task_delay := 0 help Constant delay for every job in seconds (dealing with NFS delayed write, default: 0). 
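// Illustrative invocation wiring these shell-environment settings together
// (module names and paths are placeholders, not defaults; the conda env names
// match the ones created by install_dependencies.sh):
//
//   bds atac.bds -mod "bowtie/2.2.4; samtools/1.2" -addpath "${HOME}/utils" \
//       -conda_env bds_atac -conda_env_py3 bds_atac_py3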
15 | 16 | shcmd_init := "" // Shell command executed prior to all BigDataScript tasks (use this for setting up shell environment) 17 | shcmd_init_py3 := "" // for softwares using python3 18 | shcmd_finalize := "" 19 | 20 | delay_conda_env := 5 // wait for 5 seconds for conda env activation 21 | 22 | 23 | init_env() 24 | 25 | 26 | void init_env() { 27 | conda_env = get_conf_val( conda_env, ["conda_env"] ) 28 | conda_env_py3 = get_conf_val( conda_env_py3, ["conda_env_py3"] ) 29 | conda_bin_dir = get_conf_val( conda_bin_dir, ["conda_bin_dir"] ) 30 | cluster_task_min_len = get_conf_val_int( cluster_task_min_len, ["cluster_task_min_len"] ) 31 | cluster_task_delay = get_conf_val_int( cluster_task_delay, ["cluster_task_delay"] ) 32 | 33 | // environment modules (sh,bash) init. file paths 34 | init_mods := ["/etc/profile.d/modules.sh", "/etc/profile.d/modules.bash"] 35 | init_mod := "" // module init. shell script found among the above list 36 | 37 | moduleshome := get_shell_var("MODULESHOME").replace("\n","") // get shell var MODULESHOME if exists 38 | if (moduleshome!="") init_mods.add("$moduleshome/init/bash") 39 | 40 | string shellcmd, module, path, conda_py2, conda_py3 41 | 42 | for ( string file : init_mods ) { // find env. modules init script 43 | if ( file.exists() ) { 44 | init_mod = file 45 | break 46 | } 47 | } 48 | 49 | if ( init_mod == "" ) { 50 | print("\n\nInfo: Environments module not found on your system " + \ 51 | "(e.g. /etc/profile.d/modules.sh). Ignoring shell env. parameters like '-mod'. \n") 52 | } 53 | 54 | // read from conf./env. file 55 | for( string k : conf.keys() ) { 56 | string val = conf{k} 57 | if ( (k.indexOf("mod_")>=0) || (k=="mod") ) { // concat. module 58 | if ( init_mod != "" ) { 59 | trimmed := val.trim().replace("module add ","").replace( ":", " " ).replace( ";", " " ).replace( ",", " " ).trim() 60 | trimmed = trimmed.replace( " ", " ").replace( " ", " ") 61 | module = module + " " + trimmed 62 | } 63 | } 64 | else if ( k.indexOf("shcmd")>=0 ) { 65 | shellcmd = shellcmd + " " + val + ";" 66 | } 67 | else if ( k.indexOf("addpath")>=0 ) { 68 | path = path + val.trim().replace(",",":").replace(";",":").replace(" ",":").replace(":::",":").replace("::",":") + ":" 69 | } 70 | } 71 | 72 | // read from cmd. line arg. 73 | if ( mod!="" ) { 74 | string module_header = ". $init_mod;" 75 | if ( init_mod != "" ) { // if /etc/profile.d/modules.sh exists 76 | trimmed := mod.trim().replace("module add ","").replace( ":", " " ).replace( ";", " " ).replace(","," " ).trim() 77 | trimmed = trimmed.replace( " ", " ").replace( " ", " ") 78 | module = module + " " + trimmed 79 | } 80 | } 81 | if ( shcmd!="" ) shellcmd = shellcmd + shcmd.trim() + "; " 82 | if ( addpath!="" ) path = path + \ 83 | addpath.trim().replace(",",":").replace(";",":").replace(" ",":").replace(":::",":").replace("::",":") + ":" 84 | if ( module !="" ) module = ". 
$init_mod; module add " + module + ";" 85 | 86 | // check script directories to add to PATH 87 | script_file_paths := get_script_file_paths() 88 | for ( string _path : script_file_paths ) { 89 | if ( _path.exists() ) { 90 | path = path + _path + ":" 91 | } 92 | } 93 | 94 | if ( conda_bin_dir ) conda_bin_dir += "/" 95 | if ( path !="" ) path = " export PATH=$path:\${PATH}:/bin:/usr/bin:/usr/local/bin:\${HOME}/.bds;" 96 | // add conda env 97 | if ( conda_env != "" ) conda_py2 = \ 98 | "if [[ -f $(which $conda_bin_dir"+"conda) && $($conda_bin_dir"+"conda env list | grep $conda_env | wc -l) != \"0\" ]];"+\ 99 | " then source $conda_bin_dir"+"activate $conda_env; sleep $delay_conda_env; fi; " 100 | if ( conda_env_py3 != "" ) conda_py3 = \ 101 | "if [[ -f $(which $conda_bin_dir"+"conda) && $($conda_bin_dir"+"conda env list | grep $conda_env_py3 | wc -l) != \"0\" ]];"+\ 102 | " then source $conda_bin_dir"+"activate $conda_env_py3; sleep $delay_conda_env; fi; " 103 | 104 | // additional initialization 105 | shcmd_init_ := module + path + shellcmd 106 | shcmd_init_ += "; set -o pipefail" // to catch and stop on non-zero exit code in a UNIX pipe 107 | shcmd_init_ += "; STARTTIME=$(date +%s)" // to check running time for a task 108 | if ( nice != 0 ) shcmd_init_ += "; if (( $(nice)<$nice )); then renice -n $nice $$; fi" // to set process priority (niceness) 109 | 110 | shcmd_init_ = shcmd_init_.replace( ": :", ":" ).replace( "::", ":" ).replace( "; ;", ";" ).replace( ";;", ";" ) 111 | shcmd_init = conda_py2 + shcmd_init_ 112 | shcmd_init_py3 = conda_py3 + shcmd_init_ 113 | 114 | if ( is_system_local() ) { 115 | shcmd_finalize = "TASKTIME=$[$(date +%s)-${STARTTIME}]; echo \"Task has finished (${TASKTIME} seconds).\"; "+\ 116 | "sleep $cluster_task_delay" 117 | } 118 | else { 119 | shcmd_finalize = "TASKTIME=$[$(date +%s)-${STARTTIME}]; if [ ${TASKTIME} -lt $cluster_task_min_len ]; "+\ 120 | "then echo \"Waiting for $[$cluster_task_min_len-${TASKTIME}] seconds.\";"+\ 121 | " sleep $[$cluster_task_min_len-${TASKTIME}]; sleep $cluster_task_delay; fi" 122 | } 123 | 124 | print("\n\n== shell environment info\n") 125 | print( "Conda env. \t\t\t: $conda_env\n" ) 126 | print( "Conda env. for python3\t\t: $conda_env_py3\n" ) 127 | print( "Conda bin. directory\t\t: $conda_bin_dir\n" ) 128 | print( "\nShell cmd. for init.\t\t: $shcmd_init\n" ) 129 | print( "\nShell cmd. for init.(py3)\t: $shcmd_init_py3\n" ) 130 | print( "\nShell cmd. for fin.\t\t: $shcmd_finalize\n" ) 131 | print( "\nCluster task min. 
len.\t\t: $cluster_task_min_len\n" ) 132 | print( "\nCluster task delay\t\t\t: $cluster_task_delay\n" ) 133 | } 134 | 135 | -------------------------------------------------------------------------------- /modules/filetable.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "output.bds" 5 | 6 | 7 | int{} _label_rank 8 | 9 | string{} _filetable_label // key: hierarchy 10 | string{} _filetable_path 11 | int{} _filetable_rank 12 | 13 | string{} _filetable_input 14 | int _curr_rank = 0 15 | 16 | 17 | 18 | void add_label_to_table( string label ) { 19 | _label_rank{ label } = _curr_rank++ 20 | } 21 | 22 | void add_file_to_table( string[] paths, string[] hrchys ) { 23 | for ( int i=0; iExpand all   " + \ 50 | "Collapse all" + \ 51 | " FilesPath " 52 | 53 | _construct_filetable() 54 | 55 | sorted_hrchy := _find_children_and_sort( "" ) 56 | for ( string hrchy : sorted_hrchy ) { 57 | parent := _get_parent( hrchy ) 58 | label := _filetable_label{ hrchy } 59 | path := _filetable_path.hasKey( hrchy ) ? _filetable_path{ hrchy } : "" 60 | if ( parent == "" ) \ 61 | html += " $label "+ html_link_url( path ) +"" 62 | else \ 63 | html += " $label "+ html_link_url( path ) +"" 64 | } 65 | html += "" 66 | html += "
</tbody></table>\n"
 67 | 	return html
 68 | }
 69 | 
 70 | string html_link_url( string path ) {
 71 | 	rel_path := get_rel_path( path )
 72 | 	if ( rel_path.startsWith("./") ) \
 73 | 		return "<a href='$rel_path'>" + rel_path + "</a>"
 74 | 	else \
 75 | 		return rel_path + "<br>
" 76 | } 77 | 78 | void _construct_filetable() { 79 | for( string hrchy : _filetable_input.keys() ) { 80 | _construct_filetable( hrchy, _filetable_input{ hrchy } ) 81 | } 82 | } 83 | 84 | // returns rank of item 85 | void _construct_filetable( string hrchy, string path ) { 86 | if ( hrchy == "" ) return 87 | if ( _filetable_label.hasKey( hrchy ) ) return 88 | 89 | curr := _get_curr( hrchy ) 90 | parent := _get_parent( hrchy ) 91 | _filetable_label{hrchy} = curr //map_label.hasKey(curr) ? map_label{curr} : curr 92 | _filetable_path{hrchy} = path 93 | if ( parent != "" ) _construct_filetable( parent, "" ) 94 | } 95 | 96 | string[] _get_children( string hrchy ) { // not including grand ones 97 | string[] children 98 | 99 | for ( string hrchy_ : _filetable_label.keys() ) { 100 | if ( hrchy == "" ) { 101 | if ( hrchy_.indexOf("/") < 0 ) \ 102 | children.push( hrchy_ ) 103 | } 104 | else if ( hrchy_.toLower().startsWith( hrchy.toLower() + "/" ) ) { 105 | 106 | if ( hrchy_.lastIndexOf("/") <= hrchy.length() ) \ 107 | children.push( hrchy_ ) 108 | } 109 | } 110 | return children 111 | } 112 | 113 | string[] _find_children_and_sort( string hrchy ) { 114 | string[] ret 115 | children := _get_children( hrchy ) 116 | if ( children.size() == 0 ) return ret 117 | 118 | // for bubble sort 119 | int[] ranks 120 | for ( string child : children ) { 121 | curr := _get_curr( child ) 122 | ranks.add( _label_rank.hasKey(curr) ? _label_rank{curr} : 0 ) 123 | } 124 | sorted := _bubble_sort( ranks, children ) 125 | for ( string child : sorted ) { 126 | ret = ret + [child] + _find_children_and_sort( child ) 127 | } 128 | return ret 129 | } 130 | 131 | string _get_parent( string hrchy ) { // "a/b/c" return a/b 132 | return hrchy.substr( 0, hrchy.lastIndexOf("/") ) 133 | } 134 | 135 | string _get_curr( string hrchy ) { // "a/b/c" return c 136 | return hrchy.substr( hrchy.lastIndexOf("/")+1 ) 137 | } 138 | 139 | string[] _bubble_sort( int[] a, string[] s ) { // sorting algorithm 140 | if ( a.size() != s.size() ) error("Array sizes do not match in _bubble_sort()!") 141 | 142 | int temp; //for swapping 143 | string temp2; 144 | n := a.size() 145 | for (int i = 0 ; i < n - 1 ; i++) { 146 | 147 | for (int j = 0 ; j < n - 1 ; j++) { 148 | 149 | if ( a[j] > a[j + 1] ) { 150 | temp = a[j]; 151 | a[j]=a[j + 1]; 152 | a[j + 1] = temp; 153 | 154 | temp2 = s[j]; 155 | s[j]=s[j + 1]; 156 | s[j + 1] = temp2; 157 | } 158 | } 159 | } 160 | return s 161 | } 162 | -------------------------------------------------------------------------------- /modules/git.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "sys.bds" 5 | 6 | 7 | latest_git_commit_sha1 := "" // to show latest git commit sha1/date 8 | latest_git_commit_date := "" 9 | 10 | 11 | init_git() 12 | 13 | 14 | void init_git() { // print latest git commit info 15 | script_file_paths := get_script_file_paths() 16 | for ( string path : script_file_paths ) { 17 | if ( path.exists() && "$path/.git".exists() ) { 18 | 19 | latest_git_commit_sha1 = get_stdout("cd $path; git rev-parse HEAD") 20 | latest_git_commit_date = get_stdout("cd $path; git show -s --format=%cd --date=local $latest_git_commit_sha1") 21 | break; 22 | } 23 | } 24 | 25 | print("\n\n== git info\n") 26 | if ( latest_git_commit_sha1 == "" ) \ 27 | print( "Latest git commit\t\t: not under git control\n" ) 28 | else \ 29 | print( "Latest git commit\t\t: $latest_git_commit_sha1 ($latest_git_commit_date)\n" ) 30 | } 31 | 32 | 
string html_pipeline_version( string git_url_prefix ) {
 33 | 	string html
 34 | 	if ( latest_git_commit_sha1 != "" ) {
 35 | 		html += "<b>Pipeline version</b><br>"
 36 | 		html += "Latest git commit SHA1: "+\
 37 | 			"$latest_git_commit_sha1"+\
 38 | 			" ($latest_git_commit_date)\n"
 39 | 		html += "<br><br><br>
\n" 40 | } 41 | 42 | return html 43 | } 44 | -------------------------------------------------------------------------------- /modules/input.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "input_fastq.bds" 5 | include "input_bam.bds" 6 | include "input_tagalign.bds" 7 | include "input_peak.bds" 8 | 9 | 10 | help == input endedness settings (SE or PE) : 11 | se := false help Singled-ended data set. To specify it for each replicate, '-se[REP_ID]' for exp. reps, '-ctl_se[CTL_ID]' for control. 12 | pe := false help Paired end data set. To specify it for each replicate, '-pe[REP_ID]' for exp. reps, '-ctl_pe[CTL_ID]' for controls. 13 | 14 | default_is_pe := false // default is se 15 | 16 | 17 | init_input() 18 | 19 | void init_input() { 20 | se = get_conf_val_bool( se, ["se"] ) 21 | pe = get_conf_val_bool( pe, ["pe"] ) 22 | } 23 | 24 | //// ctl==0: exp. replicate, ctl==1: control 25 | 26 | void chk_input( bool true_rep, bool no_pseudo_rep ) { 27 | if ( is_input_peak() ) { 28 | 29 | chk_input_peak( true_rep, no_pseudo_rep ) 30 | return 31 | } 32 | print( "\n\n== checking input files ...\n\n" ); 33 | 34 | string[] data_all 35 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 36 | if ( ctl==1 && !ctl_exists() ) continue 37 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 38 | string[] data 39 | 40 | prefix := (ctl==1) ? "Control " : "" 41 | suffix := is_pe( ctl, rep ) ? " (PE)" : " (SE)" 42 | 43 | if ( is_input_fastq( ctl, rep ) ) { 44 | prefix = prefix + "Rep$rep fastq" + suffix 45 | fastqs := get_fastqs( ctl, rep ) 46 | if ( fastqs.size()==0 ) { 47 | data.push( "" ) 48 | } 49 | else { 50 | for ( string fastq : fastqs ) data.push( fastq ) 51 | } 52 | } 53 | else if ( is_input_bam( ctl, rep ) ) { 54 | prefix = prefix +"Rep$rep bam" + suffix 55 | data.push( get_bam( ctl, rep ) ) 56 | } 57 | else if ( is_input_filt_bam( ctl, rep ) ) { 58 | prefix = prefix +"Rep$rep filt_bam" + suffix 59 | data.push( get_filt_bam( ctl, rep ) ) 60 | } 61 | else if ( is_input_tag( ctl, rep ) ) { 62 | prefix = prefix + "Rep$rep tagalign" + suffix 63 | data.push( get_tag( ctl, rep ) ) 64 | } 65 | 66 | print("$prefix :\n") 67 | for ( string s : data ) { 68 | print("\t$s\n") 69 | if ( (s != "") && !path_exists(s) ) error("\t\tFile not found!\n") 70 | } 71 | 72 | // if data is missing 73 | if ( data[0] == "" ) { 74 | if ( (rep>=2) && (ctl==1) ) \ 75 | print( "\tWarning: $prefix missing! using control 1 for calling peaks on replicate $rep\n") 76 | else if ( (rep==2) && (ctl==0) ) \ 77 | print( "\tWarning: $prefix missing! peak will be called for replicate 1 only\n") 78 | else \ 79 | error( "\t$prefix missing!\n") 80 | continue 81 | } 82 | // check any duplicate input filename 83 | for ( string s : data ) { 84 | if ( is_in_array( get_basename( s ), get_basename( data_all ) ) ) \ 85 | error( "\t$prefix has duplicate filename!\n") 86 | } 87 | data_all = merge( data_all, data ) 88 | } 89 | } 90 | } 91 | 92 | string[] get_input_files( int ctl, int rep ) { 93 | string[] empty 94 | 95 | if ( is_input_fastq( ctl, rep ) ) { 96 | return get_fastqs( ctl, rep ) 97 | } 98 | else if ( is_input_bam( ctl, rep ) ) { 99 | bam := get_bam( ctl, rep ) 100 | return bam=="" ? empty : [bam] 101 | } 102 | else if ( is_input_filt_bam( ctl, rep ) ) { 103 | filt_bam := get_filt_bam( ctl, rep ) 104 | return filt_bam=="" ? 
empty : [filt_bam] 105 | } 106 | else if ( is_input_tag( ctl, rep ) ) { 107 | tag := get_tag( ctl, rep ) 108 | return tag=="" ? empty : [tag] 109 | } 110 | else { 111 | return empty 112 | } 113 | } 114 | 115 | string[] get_input_files( int rep ) { 116 | return get_input_files( 0, rep ) 117 | } 118 | 119 | bool input_file_exists( int ctl, int rep ) { 120 | string[] input_files = get_input_files( ctl, rep ) 121 | return input_files.size() > 0 122 | } 123 | 124 | bool input_file_exists( int rep ) { 125 | return input_file_exists( 0, rep ) 126 | } 127 | 128 | int get_num_rep( int ctl ) { 129 | rep := 1 130 | while( get_input_files( ctl, rep ).size() > 0 ) rep++ 131 | 132 | num_rep := rep-1 133 | return num_rep 134 | } 135 | 136 | int get_num_rep() { 137 | return is_input_peak() ? get_num_rep_peak() : get_num_rep( 0 ) 138 | } 139 | 140 | bool is_pe( int ctl, int rep ) { 141 | if ( pe ) return true 142 | if ( se ) return false 143 | 144 | key_pe := ( ctl > 0 ? "ctl_pe" : "pe" ) + rep 145 | key_pe_ctl := "ctl_pe" 146 | key_se := ( ctl > 0 ? "ctl_se" : "se" ) + rep 147 | 148 | if ( cmd_line_arg_has_key( key_pe ) ) { 149 | return true 150 | } 151 | else if ( cmd_line_arg_has_key( key_se ) ) { 152 | return false 153 | } 154 | else if ( ctl==1 && cmd_line_arg_has_key( key_pe_ctl ) ) { 155 | return true 156 | } 157 | else { 158 | if ( conf.hasKey( key_pe ) && parse_bool( conf{ key_pe } ) ) return true 159 | if ( conf.hasKey( key_se ) && parse_bool( conf{ key_se } ) ) return false 160 | if ( ctl==1 && conf.hasKey( key_pe_ctl ) && parse_bool( conf{ key_pe_ctl } ) ) return true 161 | } 162 | 163 | if ( is_input_fastq( ctl, rep ) ) { 164 | fastqs := get_fastq( ctl, rep, 2 ) 165 | return fastqs.size() > 0 166 | } 167 | 168 | if ( default_is_pe ) return true 169 | else return false 170 | } 171 | 172 | bool is_se( int ctl, int rep ) { 173 | return !is_pe( ctl, rep ) 174 | } 175 | 176 | bool is_pe( int rep ) { 177 | return is_pe( 0, rep ) 178 | } 179 | 180 | bool is_se( int rep ) { 181 | return !is_pe( 0, rep ) 182 | } 183 | 184 | bool has_input_fastq() { 185 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 186 | if ( ctl==1 && !ctl_exists() ) continue 187 | 188 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 189 | if ( is_input_fastq( ctl, rep ) ) return true 190 | } 191 | } 192 | return false 193 | } 194 | 195 | bool has_pe_input_fastq() { 196 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 197 | if ( ctl==1 && !ctl_exists() ) continue 198 | 199 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 200 | if ( is_input_fastq( ctl, rep ) && is_pe( ctl, rep ) ) return true 201 | } 202 | } 203 | return false 204 | } 205 | 206 | bool has_pe_input_tag( int ctl ) { 207 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 208 | 209 | if ( is_input_tag( ctl, rep ) && is_pe( ctl, rep ) ) return true 210 | } 211 | return false 212 | } 213 | 214 | bool has_pe_input_tag() { 215 | return has_pe_input_tag( 0 ) 216 | } 217 | 218 | bool has_pe() { 219 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 220 | if ( ctl==1 && !ctl_exists() ) continue 221 | 222 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 223 | if ( is_pe( ctl, rep ) ) return true 224 | } 225 | } 226 | return false 227 | } 228 | 229 | bool has_se() { 230 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 231 | if ( ctl==1 && !ctl_exists() ) continue 232 | 
233 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 234 | if ( !is_pe( ctl, rep ) ) return true 235 | } 236 | } 237 | return false 238 | } 239 | 240 | bool ctl_exists() { 241 | return input_file_exists( 1, 1 ) 242 | } 243 | 244 | string get_long_group_name( int ctl, int rep ) { 245 | return ( (ctl>0) ? "Control " : "Replicate ") + rep 246 | } 247 | 248 | string get_long_group_name( int rep ) { 249 | return "Replicate "+ rep 250 | } 251 | 252 | string get_group_name( int ctl, int rep ) { 253 | return ( (ctl>0) ? "ctl" : "rep") + rep 254 | } 255 | 256 | string get_group_name( int rep ) { 257 | return "rep" + rep 258 | } 259 | -------------------------------------------------------------------------------- /modules/input_adapter.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == adapter sequence definition : 8 | help Single-ended : For replicate '-adapter[REP_ID]' 9 | help Paired end : For replicate '-adapter[REP_ID]_[PAIR_ID]' 10 | 11 | 12 | string{} get_adapter( int ctl, int rep, int p ) { 13 | // allow up to 99 adapters to be pooled (i.e. adapter1 adapter1:2 adapter1:3, ...) 14 | string{} result 15 | for ( int i=0; i<100; i++ ) { 16 | suffix := i ? ":$i" : "" 17 | key_wo_p := ( ctl > 0 ? "ctl_adapter" : "adapter" ) + "_rep" + rep 18 | key := key_wo_p + "_p" + p + suffix 19 | key_wo_p += suffix 20 | 21 | key_wo_p2 := ( ctl > 0 ? "ctl_adapter" : "adapter" ) + rep 22 | key2 := key_wo_p2 + "_" + p + suffix 23 | key_wo_p2 += suffix 24 | 25 | key_wo_p3 := ( ctl > 0 ? "ctl_adapter" : "adapter" ) 26 | key3 := key_wo_p3 + "_" + p + suffix 27 | key_wo_p3 += suffix 28 | 29 | formatted_i := format_digit(i,2) 30 | if ( (p==1) && cmd_line_arg_has_key( key_wo_p ) ) { 31 | result{formatted_i}= get_cmd_line_arg_val( key_wo_p ) 32 | } 33 | else if ( (p==1) && cmd_line_arg_has_key( key_wo_p2 ) ) { 34 | result{formatted_i}= get_cmd_line_arg_val( key_wo_p2 ) 35 | } 36 | else if ( (p==1) && (rep==1) && cmd_line_arg_has_key( key_wo_p3 ) ) { 37 | result{formatted_i}= get_cmd_line_arg_val( key_wo_p3 ) 38 | } 39 | else if ( cmd_line_arg_has_key( key ) ) { 40 | result{formatted_i}= get_cmd_line_arg_val( key ) 41 | } 42 | else if ( cmd_line_arg_has_key( key2 ) ) { 43 | result{formatted_i}= get_cmd_line_arg_val( key2 ) 44 | } 45 | else if ( (rep==1) && cmd_line_arg_has_key( key3 ) ) { 46 | result{formatted_i}= get_cmd_line_arg_val( key3 ) 47 | } 48 | else if ( (p==1) && conf.hasKey( key_wo_p ) ) { 49 | result{formatted_i}= conf{ key_wo_p } 50 | } 51 | else if ( (p==1) && conf.hasKey( key_wo_p2 ) ) { 52 | result{formatted_i}= conf{ key_wo_p2 } 53 | } 54 | else if ( (p==1) && (rep==1) && conf.hasKey( key_wo_p3 ) ) { 55 | result{formatted_i}= conf{ key_wo_p3 } 56 | } 57 | else if ( conf.hasKey( key ) ) { 58 | result{formatted_i}= conf{ key } 59 | } 60 | else if ( conf.hasKey( key2 ) ) { 61 | result{formatted_i}= conf{ key2 } 62 | } 63 | else if ( (rep==1) && conf.hasKey( key3 ) ) { 64 | result{formatted_i}= conf{ key3 } 65 | } 66 | } 67 | return result 68 | } 69 | 70 | int get_num_rep_adapter( int ctl ) { 71 | rep := 1 72 | while( get_adapter( ctl, rep, 1 ).size() > 0 ) rep++ 73 | 74 | num_rep := rep-1 75 | return num_rep 76 | } 77 | 78 | int get_num_rep_adapter() { 79 | return get_num_rep_adapter( 0 ) 80 | } 81 | -------------------------------------------------------------------------------- /modules/input_bam.bds: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == bam input (raw or filtered) definition : 8 | help Raw bam : For replicate '-bam[REP_ID]', For control '-ctl_bam[REP_ID]'. 9 | help Filtered bam : For replicate '-filt_bam[REP_ID]', For control '-ctl_filt_bam[REP_ID]'. 10 | 11 | 12 | string get_bam( int ctl, int rep ) { 13 | 14 | key := ( ctl > 0 ? "ctl_bam" : "bam" ) + "_rep" + rep 15 | key2 := ( ctl > 0 ? "ctl_bam" : "bam" ) + rep 16 | key3 := ( ctl > 0 ? "ctl_bam" : "bam" ) 17 | 18 | if ( cmd_line_arg_has_key( key ) ) { 19 | return get_path( get_cmd_line_arg_val( key ) ) 20 | } 21 | else if ( cmd_line_arg_has_key( key2 ) ) { 22 | return get_path( get_cmd_line_arg_val( key2 ) ) 23 | } 24 | else if ( (rep==1) && cmd_line_arg_has_key( key3 ) ) { 25 | return get_path( get_cmd_line_arg_val( key3 ) ) 26 | } 27 | else if ( conf.hasKey( key ) ) { 28 | return get_path( conf{ key } ) 29 | } 30 | else if ( conf.hasKey( key2 ) ) { 31 | return get_path( conf{ key2 } ) 32 | } 33 | else if ( (rep==1) && conf.hasKey( key3 ) ) { 34 | return get_path( conf{ key3 } ) 35 | } 36 | return "" 37 | } 38 | 39 | string get_bam( int rep ) { 40 | 41 | return get_bam( 0, rep ) 42 | } 43 | 44 | string get_filt_bam( int ctl, int rep ) { 45 | 46 | key := ( ctl > 0 ? "ctl_filt_bam" : "filt_bam" ) + "_rep" + rep 47 | key2 := ( ctl > 0 ? "ctl_filt_bam" : "filt_bam" ) + rep 48 | key3 := ( ctl > 0 ? "ctl_filt_bam" : "filt_bam" ) 49 | 50 | if ( cmd_line_arg_has_key( key ) ) { 51 | return get_path( get_cmd_line_arg_val( key ) ) 52 | } 53 | else if ( cmd_line_arg_has_key( key2 ) ) { 54 | return get_path( get_cmd_line_arg_val( key2 ) ) 55 | } 56 | else if ( (rep==1) && cmd_line_arg_has_key( key3 ) ) { 57 | return get_path( get_cmd_line_arg_val( key3 ) ) 58 | } 59 | else if ( conf.hasKey( key ) ) { 60 | return get_path( conf{ key } ) 61 | } 62 | else if ( conf.hasKey( key2 ) ) { 63 | return get_path( conf{ key2 } ) 64 | } 65 | else if ( (rep==1) && conf.hasKey( key3 ) ) { 66 | return get_path( conf{ key3 } ) 67 | } 68 | return "" 69 | } 70 | 71 | string get_filt_bam( int rep ) { 72 | 73 | return get_filt_bam( 0, rep ) 74 | } 75 | 76 | bool is_input_bam( int ctl, int rep ) { 77 | 78 | return get_bam( ctl, rep ) != "" 79 | } 80 | 81 | bool is_input_bam( int rep ) { 82 | 83 | return is_input_bam( 0, rep ) 84 | } 85 | 86 | bool is_input_filt_bam( int ctl, int rep ) { 87 | 88 | return get_filt_bam( ctl, rep ) != "" 89 | } 90 | 91 | bool is_input_filt_bam( int rep ) { 92 | 93 | return is_input_filt_bam( 0, rep ) 94 | } 95 | -------------------------------------------------------------------------------- /modules/input_fastq.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == fastq input definition : 8 | help Single-ended : For replicate '-fastq[REP_ID]', For control '-ctl_fastq[REP_ID]' 9 | help Paired end : For replicate '-fastq[REP_ID]_[PAIR_ID]', For control '-ctl_fastq[REP_ID]_[PAIR_ID]' 10 | 11 | string{} get_fastq( int ctl, int rep, int p ) { 12 | // allow up to 99 fastqs to be pooled (i.e. fastq1 fastq1:2 fastq1:3, ...) 13 | string{} result 14 | for ( int i=0; i<100; i++ ) { 15 | suffix := i ? ":$i" : "" 16 | key_wo_p := ( ctl > 0 ? "ctl_fastq" : "fastq" ) + "_rep" + rep 17 | key := key_wo_p + "_p" + p + suffix 18 | key_wo_p += suffix 19 | 20 | key_wo_p2 := ( ctl > 0 ? 
"ctl_fastq" : "fastq" ) + rep 21 | key2 := key_wo_p2 + "_" + p + suffix 22 | key_wo_p2 += suffix 23 | 24 | key_wo_p3 := ( ctl > 0 ? "ctl_fastq" : "fastq" ) 25 | key3 := key_wo_p3 + "_" + p + suffix 26 | key_wo_p3 += suffix 27 | 28 | formatted_i := format_digit(i,2) 29 | if ( (p==1) && cmd_line_arg_has_key( key_wo_p ) ) { 30 | result{formatted_i}= get_path( get_cmd_line_arg_val( key_wo_p ) ) 31 | } 32 | else if ( (p==1) && cmd_line_arg_has_key( key_wo_p2 ) ) { 33 | result{formatted_i}= get_path( get_cmd_line_arg_val( key_wo_p2 ) ) 34 | } 35 | else if ( (p==1) && (rep==1) && cmd_line_arg_has_key( key_wo_p3 ) ) { 36 | result{formatted_i}= get_path( get_cmd_line_arg_val( key_wo_p3 ) ) 37 | } 38 | else if ( cmd_line_arg_has_key( key ) ) { 39 | result{formatted_i}= get_path( get_cmd_line_arg_val( key ) ) 40 | } 41 | else if ( cmd_line_arg_has_key( key2 ) ) { 42 | result{formatted_i}= get_path( get_cmd_line_arg_val( key2 ) ) 43 | } 44 | else if ( (rep==1) && cmd_line_arg_has_key( key3 ) ) { 45 | result{formatted_i}= get_path( get_cmd_line_arg_val( key3 ) ) 46 | } 47 | else if ( (p==1) && conf.hasKey( key_wo_p ) ) { 48 | result{formatted_i}= get_path( conf{ key_wo_p } ) 49 | } 50 | else if ( (p==1) && conf.hasKey( key_wo_p2 ) ) { 51 | result{formatted_i}= get_path( conf{ key_wo_p2 } ) 52 | } 53 | else if ( (p==1) && (rep==1) && conf.hasKey( key_wo_p3 ) ) { 54 | result{formatted_i}= get_path( conf{ key_wo_p3 } ) 55 | } 56 | else if ( conf.hasKey( key ) ) { 57 | result{formatted_i}= get_path( conf{ key } ) 58 | } 59 | else if ( conf.hasKey( key2 ) ) { 60 | result{formatted_i}= get_path( conf{ key2 } ) 61 | } 62 | else if ( (rep==1) && conf.hasKey( key3 ) ) { 63 | result{formatted_i}= get_path( conf{ key3 } ) 64 | } 65 | } 66 | return result 67 | } 68 | 69 | string[] get_fastqs( int ctl, int rep ) { 70 | string[] result 71 | for (int p=1;p<=2;p++) { 72 | for ( string fastq : get_fastq( ctl, rep, p ) ) { 73 | result.add( fastq ) 74 | } 75 | } 76 | return result 77 | } 78 | 79 | string[] get_fastqs( int rep ) { 80 | return get_fastqs( 0, rep ) 81 | } 82 | 83 | bool is_input_fastq( int ctl, int rep ) { 84 | fastqs := get_fastq( ctl, rep, 1 ) 85 | if ( fastqs.size() > 0 ) return true 86 | return false 87 | } 88 | 89 | bool is_input_fastq( int rep ) { 90 | return is_input_fastq( 0, rep ) 91 | } 92 | -------------------------------------------------------------------------------- /modules/input_peak.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == narrow peak input definition : 8 | help For true replicates, use '-peak1' and '-peak2', 9 | help For pooled replicates, use '-peak_pooled', 10 | help For two PR (self-pseudo-replicates), use '-peak[REP_ID]_pr1' and '-peak[REP_ID]_pr2' 11 | help For two PPR (pooled pseudo-replicates), use '-peak_ppr1' and '-peak_ppr2' 12 | 13 | 14 | void chk_input_peak( bool true_rep, bool no_pseudo_rep ) { 15 | 16 | if ( !is_input_peak() ) return // read peaks here 17 | 18 | for ( int rep=0; rep<=get_num_rep_peak(); rep++) { // rep==0 : pooled 19 | if ( get_num_rep_peak() == 1 && rep==0 ) continue // if only one replicate, skip reading pooled rep 20 | 21 | for (int pse=0; pse<=2; pse++) { // pse(pseudo)==0 : true rep, pse==1,2 : self-pseudo rep 1,2 22 | if ( true_rep && pse > 0 ) continue 23 | if ( no_pseudo_rep && rep != 0 && pse > 0 ) continue 24 | 25 | peak_ := get_peak(rep,pse) 26 | suffix1 := rep==0 ? 
"replicate" : "replicate $rep" 27 | suffix2 := rep==0 ? "pseudo-replicate $pse" : "pseudo-replicate $pse for replicate $rep" 28 | prefix := (rep==0 ? "pooled " : "") + (pse==0 ? suffix1 : suffix2) 29 | 30 | print( "$prefix: \n\t$peak_"+"\n") 31 | if ( !path_exists( peak_ ) ) error("\t\tFile not found!\n") 32 | } 33 | } 34 | } 35 | 36 | string get_peak( int rep, int pse ) { // rep==0 : pooled peak, pse==0 : true replicate 37 | 38 | if ( pse > 2 ) error ("\nget_peak() : pse should not be larger than 2!") 39 | 40 | string key, key2 41 | if ( rep == 0 ) { 42 | key = ( pse == 0 ? "peak_pooled" : ("peak_ppr" + pse) ) 43 | key2 = key 44 | } 45 | else { 46 | key = "peak" + rep + ( pse == 0 ? "" : ("_pr" + pse) ) 47 | key2 = "peak_rep" + rep + ( pse == 0 ? "" : ("_pr" + pse) ) 48 | } 49 | 50 | if ( cmd_line_arg_has_key( key ) ) { 51 | return get_path( get_cmd_line_arg_val( key ) ) 52 | } 53 | else if ( cmd_line_arg_has_key( key2 ) ) { 54 | return get_path( get_cmd_line_arg_val( key2 ) ) 55 | } 56 | else if ( conf.hasKey( key ) ) { 57 | return get_path( conf{ key } ) 58 | } 59 | else if ( conf.hasKey( key2 ) ) { 60 | return get_path( conf{ key2 } ) 61 | } 62 | 63 | return "" 64 | } 65 | 66 | bool is_input_peak() { 67 | 68 | return get_peak( 1, 0 ) != "" 69 | } 70 | 71 | int get_num_rep_peak() { 72 | 73 | rep := 1 74 | 75 | while( get_peak( rep, 0 ) != "" ) rep++ 76 | 77 | return rep-1 78 | } 79 | -------------------------------------------------------------------------------- /modules/input_tagalign.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == tagalign input definition : 8 | help For replicate '-tag[REP_ID]', For control '-ctl_tag[REP_ID]'. 9 | 10 | 11 | string get_tag( int ctl, int rep ) { 12 | 13 | key := ( ctl > 0 ? "ctl_tag" : "tag" ) + "_rep" + rep 14 | key2 := ( ctl > 0 ? "ctl_tagalign" : "tagalign" ) + "_rep" + rep 15 | 16 | key3 := ( ctl > 0 ? "ctl_tag" : "tag" ) + rep 17 | key4 := ( ctl > 0 ? "ctl_tagalign" : "tagalign" ) + rep 18 | 19 | key5 := ( ctl > 0 ? "ctl_tag" : "tag" ) 20 | key6 := ( ctl > 0 ? 
"ctl_tagalign" : "tagalign" ) 21 | 22 | if ( cmd_line_arg_has_key( key ) ) { 23 | return get_path( get_cmd_line_arg_val( key ) ) 24 | } 25 | else if ( cmd_line_arg_has_key( key2 ) ) { 26 | return get_path( get_cmd_line_arg_val( key2 ) ) 27 | } 28 | else if ( cmd_line_arg_has_key( key3 ) ) { 29 | return get_path( get_cmd_line_arg_val( key3 ) ) 30 | } 31 | else if ( cmd_line_arg_has_key( key4 ) ) { 32 | return get_path( get_cmd_line_arg_val( key4 ) ) 33 | } 34 | else if ( (rep==1) && cmd_line_arg_has_key( key5 ) ) { 35 | return get_path( get_cmd_line_arg_val( key5 ) ) 36 | } 37 | else if ( (rep==1) && cmd_line_arg_has_key( key6 ) ) { 38 | return get_path( get_cmd_line_arg_val( key6 ) ) 39 | } 40 | else if ( conf.hasKey( key ) ) { 41 | return get_path( conf{ key } ) 42 | } 43 | else if ( conf.hasKey( key2 ) ) { 44 | return get_path( conf{ key2 } ) 45 | } 46 | else if ( conf.hasKey( key3 ) ) { 47 | return get_path( conf{ key3 } ) 48 | } 49 | else if ( conf.hasKey( key4 ) ) { 50 | return get_path( conf{ key4 } ) 51 | } 52 | else if ( (rep==1) && conf.hasKey( key5 ) ) { 53 | return get_path( conf{ key5 } ) 54 | } 55 | else if ( (rep==1) && conf.hasKey( key6 ) ) { 56 | return get_path( conf{ key6 } ) 57 | } 58 | return "" 59 | } 60 | 61 | string get_tag( int rep ) { 62 | 63 | return get_tag( 0, rep ) 64 | } 65 | 66 | bool is_input_tag( int ctl, int rep ) { 67 | 68 | return get_tag( ctl, rep ) != "" 69 | } 70 | 71 | bool is_input_tag( int rep ) { 72 | 73 | return is_input_tag( 0, rep ) 74 | } 75 | -------------------------------------------------------------------------------- /modules/module_template.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "parallel.bds" 5 | include "report.bds" 6 | -------------------------------------------------------------------------------- /modules/output.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == output/title settings 8 | out_dir := "out" help Output directory (default: out). 9 | title := "" help Prefix for HTML report and outputs without given prefix. 10 | 11 | 12 | init_output() 13 | 14 | 15 | void init_output() { 16 | out_dir = get_conf_val( out_dir, ["out_dir"] ) 17 | title = get_conf_val( title, ["title"] ) 18 | 19 | if ( title == "" ) { // if title is empty, use directory name as a title 20 | dirname := get_basename( get_path(out_dir) ) 21 | if ( dirname == "out" ) { // if output folder is default one (out), then use parent dir. name 22 | dirname = get_basename( rm_str_at_end( get_path(out_dir), "/out" ) ) 23 | } 24 | title = dirname 25 | } 26 | if ( !is_cmd_line_arg_empty() ) out_dir = mkdir( out_dir ) // create output directory and get absolute path for it 27 | title = replace_illegal_chrs( title ) 28 | 29 | print("\n\n== output directory/title info\n") 30 | print( "Output dir.\t\t\t: $out_dir\n" ) 31 | print( "Title (prefix)\t\t\t: $title\n" ) 32 | } 33 | 34 | string get_rel_path( string path ) { // get relative path according to $out_dir 35 | rel_path := path.path().replace( out_dir.path(), "." 
) 36 | if ( rel_path == path.path() ) return path //"" 37 | else return rel_path 38 | } 39 | -------------------------------------------------------------------------------- /modules/parallel.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == parallelization settings 8 | no_par := false help Serialize all tasks (individual tasks can still use multiple threads up to '-nth'). 9 | nth := 8 help Maximum # threads for a pipeline. (default: 8). 10 | 11 | string[] _tids_all // array of task ids currently running 12 | int{} _nth_tasks // key: task id, value: # of threads for the task 13 | 14 | 15 | init_parallel() 16 | 17 | 18 | void init_parallel() { 19 | no_par = get_conf_val_bool( no_par, ["no_par"] ) 20 | nth = get_conf_val_int( nth, ["nth"] ) 21 | 22 | if ( nth > 32 ) error("Maximum # threads (-nth) for a pipeline should not exceed 32!") 23 | if ( nth <= 1 ) { 24 | print("\nWarning: Maximum # threads (-nth) for a pipeline is <= 1. Turning off parallelization... (-no_par)") 25 | nth = 1 26 | no_par = true 27 | } 28 | 29 | // pre-declared BDS variable 30 | cpus = -1 // With cpus==-1, BDS does not pass number of threads to cluster engine (SGE, SLURM, ...), which means single-threaded 31 | 32 | print("\n\n== parallelization info\n") 33 | print( "No parallel jobs\t\t: $no_par\n" ) 34 | print( "Maximum # threads \t\t: $nth\n" ) 35 | } 36 | 37 | void wait_par( int nth_task ) { 38 | if ( nth_task < 1 ) nth_task = 1 39 | 40 | while ( true ) { 41 | sleep( rand()*1.0 + 0.5 ) 42 | _tids_all_ := _tids_all // make dummy array for thread safety 43 | 44 | string[] tids_running 45 | int nth_running 46 | for ( string tid : _tids_all_ ) { // get total # threads for currently running tasks, and find the oldest task 47 | if ( !tid.isDone() ) { 48 | tids_running.add( tid ) 49 | nth_running = nth_running + _nth_tasks{tid} 50 | } 51 | } 52 | 53 | if ( tids_running.size() == 0 ) { 54 | break 55 | } 56 | else if ( no_par || (nth_running+nth_task) > nth ) { 57 | loop_cnt := 0 58 | while( true ) { // wait until one of running tasks finishes 59 | break_loop := false 60 | for ( string tid : tids_running ) { 61 | if ( tid.isDone() ) { 62 | break_loop = true 63 | break 64 | } 65 | } 66 | if ( break_loop ) break 67 | sleep( rand() + 0.5 ) 68 | } 69 | sleep( rand()*1.0 + 0.5 ) 70 | } 71 | else { 72 | break 73 | } 74 | } 75 | } 76 | 77 | void register_par( string tid, int nth_task ) { 78 | if ( nth_task < 1 ) nth_task = 1 79 | if ( tid == "" ) return 80 | 81 | _tids_all.add(tid) 82 | _nth_tasks{tid} = nth_task 83 | } 84 | 85 | int{} distribute_nonzero( int n, int{} weight ) { // distribute integer n according to weight 86 | int{} ret 87 | 88 | int sum 89 | for ( int w : weight ) sum += w 90 | if ( sum == 0 ) error("distribute_nth: sum is zero. check if input file size is 0?\n") 91 | for ( string key : weight.keys() ) { 92 | w := weight{key} 93 | ret{key} = (n*w)/sum 94 | 95 | if ( ret{key} == 0 ) ret{key} = 1 96 | } 97 | 98 | while( true ) { 99 | int sum2 100 | for ( string key : weight.keys() ) sum2 += ret{key} 101 | if ( n > sum2 ) { 102 | string key_to_plus 103 | int max_diff = 0 104 | for ( string key : weight.keys() ) { 105 | diff := n*weight{key}-ret{key}*sum 106 | if ( diff > max_diff ) { 107 | key_to_plus = key 108 | max_diff = diff 109 | } 110 | } 111 | ret{key_to_plus}++ 112 | } 113 | else { 114 | break 115 | } 116 | } 117 | 118 | print("Distributing $n to ... 
\n") 119 | print(ret) 120 | print("\n") 121 | return ret 122 | } 123 | 124 | int[] distribute_nonzero( int n, int[] weight ) { // distribute integer n according to weight 125 | int[] ret 126 | 127 | int sum 128 | for ( int w : weight ) sum += w 129 | if ( sum == 0 ) error("distribute_nth: sum is zero. check if input file size is 0?\n") 130 | for ( int i=0; i sum2 ) { 140 | int id_to_plus 141 | int max_diff = 0 142 | for ( int i=0; i max_diff ) { 145 | id_to_plus = i 146 | max_diff = diff 147 | } 148 | } 149 | ret[id_to_plus]++ 150 | } 151 | else { 152 | break 153 | } 154 | } 155 | 156 | print("Distributing $n to ... \n") 157 | print(ret) 158 | print("\n") 159 | return ret 160 | } 161 | -------------------------------------------------------------------------------- /modules/pipeline_template.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "git.bds" 5 | include "parallel.bds" 6 | include "report.bds" 7 | -------------------------------------------------------------------------------- /modules/postalign_bed.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | // has functions related to tagalign, and helps getting tagalign from configruation file or command line argument 9 | 10 | help == postalign bed/tagalign settings 11 | mem_shuf := "12G" help Max. memory for UNIX shuf (default: 12G). 12 | no_random_source := false help Disable --random-source for UNIX shuf. Hot fix for end of file error. 13 | 14 | 15 | init_postalign_bed() 16 | 17 | 18 | void init_postalign_bed() { 19 | 20 | // fraglen0 = get_conf_val_bool( fraglen0, ["fraglen0"] ) 21 | mem_shuf = get_conf_val( mem_shuf, ["mem_shuf"] ) 22 | no_random_source = get_conf_val_bool( no_random_source, ["no_random_source"] ) 23 | 24 | print("\n\n== postalign bed/tagalign settings\n") 25 | print( "Max. memory for UNIX shuf\t\t\t: $mem_shuf\n") 26 | print( "No --random-source for UNIX shuf\t\t: $no_random_source\n") 27 | } 28 | 29 | string subsample_tag( string tag, int nlines, bool non_mito, string o_dir, string group ) { 30 | 31 | prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir ) 32 | nreads_per_mill := metric_prefix( nlines ) 33 | 34 | subsampled_tag := "$prefix."+(non_mito?"no_chrM.":"")+"$nreads_per_mill.tagAlign.gz" 35 | non_mito_param := non_mito ? "grep -v \"chrM\" | " : "" 36 | random_source_param := no_random_source ? 
"" : "--random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null)" 37 | 38 | in := [ tag ] 39 | out := subsampled_tag 40 | 41 | taskName:= "subsample_tag " + group 42 | mem := get_res_mem(mem_shuf,1) 43 | 44 | wait_par( cpus ) 45 | 46 | tid := task( out<-in ) { 47 | 48 | sys $shcmd_init 49 | 50 | //# Subsample tagAlign file 51 | sys zcat $tag | \ 52 | $non_mito_param shuf -n $nlines $random_source_param | gzip -nc > $subsampled_tag 53 | 54 | sys $shcmd_finalize 55 | } 56 | 57 | register_par( tid, cpus ) 58 | 59 | add_task_to_graph( in, out, group ) 60 | 61 | return out 62 | } 63 | 64 | string subsample_tag_PE( string tag, int nlines, bool non_mito, string o_dir, string group ) { 65 | 66 | prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir ) 67 | nreads_per_mill := metric_prefix( nlines ) 68 | 69 | subsampled_tag := "$prefix."+(non_mito?"no_chrM.":"")+"$nreads_per_mill.tagAlign.gz" 70 | non_mito_param := non_mito ? "grep -v \"chrM\" | " : "" 71 | random_source_param := no_random_source ? "" : "--random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null)" 72 | 73 | joined := "$prefix.joined" // temporary file 74 | joined_subsampled := "$prefix.joined.subsampled" // temporary file 75 | 76 | in := [ tag ] 77 | out := subsampled_tag 78 | 79 | taskName:= "subsample_tag_PE " + group 80 | mem := get_res_mem(mem_shuf,1) 81 | 82 | wait_par( cpus ) 83 | 84 | tid := task( out<-in ) { 85 | 86 | sys $shcmd_init 87 | 88 | // join consecutive two lines into one 89 | sys zcat $tag | sed 'N;s/\n/\t/' > $joined 90 | 91 | //# Shuffle and split temporary combined file into 2 equal parts 92 | //# Will produce $PR_PREFIX00 and $PR_PREFIX01 93 | sys cat $joined | $non_mito_param shuf -n $nlines $random_source_param > $joined_subsampled 94 | 95 | //# Subsample tagAlign file 96 | sys awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\t%s\t%s\t%s\n%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' $joined_subsampled | \ 97 | gzip -nc > $subsampled_tag 98 | 99 | sys rm -f $joined $joined_subsampled 100 | 101 | sys $shcmd_finalize 102 | } 103 | 104 | register_par( tid, cpus ) 105 | 106 | add_task_to_graph( in, out, group ) 107 | 108 | return out 109 | } 110 | 111 | // Adjusts the read-ends in a read BED by Tn5 offsets 112 | string tn5_shift_tag( string tag, string o_dir, string group ) { 113 | 114 | prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir ) 115 | //shifted_tag := "$prefix.shifted.tagAlign.gz" 116 | shifted_tag := "$prefix.tn5.tagAlign.gz" 117 | 118 | in := [ tag ] 119 | out := shifted_tag 120 | 121 | taskName:= "shift_tag " + group 122 | 123 | wait_par( cpus ) 124 | 125 | tid := task( out<-in ) { 126 | 127 | sys $shcmd_init 128 | 129 | sys zcat $tag | awk -F '\t' 'BEGIN {OFS = FS}{ if ($6 == "+") {$2 = $2 + 4} else if ($6 == "-") {$3 = $3 - 5} print $0}' | gzip -nc > $shifted_tag 130 | 131 | sys $shcmd_finalize 132 | } 133 | 134 | register_par( tid, cpus ) 135 | 136 | add_task_to_graph( in, out, group ) 137 | 138 | return out 139 | } 140 | 141 | // make spr(self_pseudo_replicate) 142 | string[] spr( string tag, string pr1_o_dir, string pr2_o_dir, string group ) { 143 | 144 | prefix_pr1 := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), pr1_o_dir ) 145 | prefix_pr2 := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), pr2_o_dir ) 146 | tag_pr1 := "$prefix_pr1.pr1.tagAlign.gz" 147 | tag_pr2 := "$prefix_pr2.pr2.tagAlign.gz" 148 | random_source_param := no_random_source ? 
"" : "--random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null)" 149 | 150 | in := [ tag ] 151 | out := [ tag_pr1, tag_pr2 ] 152 | 153 | taskName:= "spr " + group 154 | mem := get_res_mem(mem_shuf,1) 155 | 156 | wait_par( cpus ) 157 | 158 | tid := task( out<-in ) { 159 | 160 | sys $shcmd_init 161 | 162 | //# Get total number of read pairs 163 | sys nlines=$( zcat $tag | wc -l ) 164 | sys nlines=$(( (nlines + 1) / 2 )) 165 | 166 | //# Shuffle and split BEDPE file into 2 equal parts 167 | //# Will produce $PR_PREFIX00 and $PR_PREFIX01 168 | sys zcat $tag | shuf $random_source_param | split -d -l $((nlines)) - $prefix_pr1. 169 | 170 | //# Convert read pairs to reads into standard tagAlign file 171 | sys gzip -nc $prefix_pr1.00 > $tag_pr1 172 | sys rm -f $prefix_pr1.00 173 | sys gzip -nc $prefix_pr1.01 > $tag_pr2 174 | sys rm -f $prefix_pr1.01 175 | 176 | sys $shcmd_finalize 177 | } 178 | 179 | register_par( tid, cpus ) 180 | 181 | add_task_to_graph( in, out, ["$group PR 1", "$group PR 2"] ) 182 | 183 | return out 184 | } 185 | 186 | string[] spr_tag_PE( string tag, string pr1_o_dir, string pr2_o_dir, string group ) { 187 | 188 | prefix_pr1 := replace_dir( rm_ext( tag, ["bed","tagAlign"] ), pr1_o_dir ) 189 | prefix_pr2 := replace_dir( rm_ext( tag, ["bed","tagAlign"] ), pr2_o_dir ) 190 | 191 | joined := "$prefix_pr1.joined" // temporary file 192 | 193 | tag_pr1 := "$prefix_pr1.pr1.tagAlign.gz" 194 | tag_pr2 := "$prefix_pr2.pr2.tagAlign.gz" 195 | random_source_param := no_random_source ? "" : "--random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null)" 196 | 197 | in := [ tag ] 198 | out := [ tag_pr1, tag_pr2 ] 199 | 200 | taskName:= "spr_tag_PE " + group 201 | mem := get_res_mem(mem_shuf,1) 202 | 203 | wait_par( cpus ) 204 | 205 | tid := task( out<-in ) { 206 | 207 | sys $shcmd_init 208 | 209 | // join consecutive two lines into one 210 | sys zcat $tag | sed 'N;s/\n/\t/' > $joined 211 | 212 | //# Get total number of read pairs 213 | sys nlines=$( cat $joined | wc -l ) 214 | sys nlines=$(( (nlines + 1) / 2 )) 215 | 216 | //# Shuffle and split temporary combined file into 2 equal parts 217 | //# Will produce $PR_PREFIX00 and $PR_PREFIX01 218 | sys cat $joined | shuf $random_source_param | split -d -l $((nlines)) - $prefix_pr1. 
219 | 220 | //# Convert read pairs to reads into standard tagAlign file 221 | sys awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\t%s\t%s\t%s\n%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' "$prefix_pr1.00" | \ 222 | gzip -nc > $tag_pr1 223 | sys rm -f $prefix_pr1.00 224 | sys awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\t%s\t%s\t%s\n%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' "$prefix_pr1.01" | \ 225 | gzip -nc > $tag_pr2 226 | sys rm -f $prefix_pr1.01 227 | 228 | sys rm -f $joined 229 | 230 | sys $shcmd_finalize 231 | } 232 | 233 | register_par( tid, cpus ) 234 | 235 | add_task_to_graph( in, out, ["$group PR 1", "$group PR 2"] ) 236 | 237 | return out 238 | } 239 | 240 | string pool_tag( string tag1, string tag2, string o_dir, string group ) { 241 | // LINUX has limit on filename length (255), make it as shorter as possible 242 | string tag_pooled 243 | if ( get_basename(tag1).length() < 50 && get_basename(tag2).length() < 50 ) { 244 | prefix := "$o_dir/" + merge_basename_wo_ext( tag1, tag2, ["tagAlign","tag","bed"] ) 245 | tag_pooled = "$prefix.tagAlign.gz" 246 | } 247 | else { 248 | prefix := replace_dir( rm_ext( tag1, ["bed","tagAlign"] ), o_dir ) 249 | tag_pooled = "$prefix"+"_pooled.tagAlign.gz" 250 | } 251 | 252 | in := [ tag1, tag2 ] 253 | out := tag_pooled 254 | 255 | taskName:= "pool_tag " + group 256 | 257 | wait_par( cpus ) 258 | 259 | tid := task( out<-in ) { 260 | 261 | sys $shcmd_init 262 | sys zcat $tag1 $tag2 | gzip -nc > $tag_pooled 263 | 264 | sys $shcmd_finalize 265 | } 266 | 267 | register_par( tid, cpus ) 268 | 269 | add_task_to_graph( in, out, group ) 270 | 271 | return out 272 | } 273 | 274 | string pool_tag( string[] tags, string o_dir, string group ) { 275 | // LINUX has limit on filename length (255), make it as short as possible 276 | string tag_pooled 277 | if ( tags.size() <= 2 && get_basename(tags[0]).length() < 50 && get_basename(tags[1]).length() < 50 ) { 278 | prefix := "$o_dir/" + merge_basename_wo_ext( tags[0], tags[1], ["tagAlign","tag","bed"] ) 279 | tag_pooled = "$prefix.tagAlign.gz" 280 | } 281 | else { 282 | prefix := replace_dir( rm_ext( tags[0], ["bed","tagAlign"] ), o_dir ) 283 | tag_pooled = "$prefix"+"_pooled.tagAlign.gz" 284 | } 285 | tags_str := array_to_str( tags, " " ) // join 286 | 287 | in := tags 288 | out := tag_pooled 289 | 290 | taskName:= "pool_tag " + group 291 | 292 | wait_par( cpus ) 293 | 294 | tid := task( out<-in ) { 295 | 296 | sys $shcmd_init 297 | 298 | sys zcat $tags_str | gzip -nc > $tag_pooled 299 | 300 | sys $shcmd_finalize 301 | } 302 | 303 | register_par( tid, cpus ) 304 | 305 | add_task_to_graph( in, out, group ) 306 | 307 | return out 308 | } 309 | -------------------------------------------------------------------------------- /modules/postalign_xcor.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == postalign bed/tagalign settings 9 | fraglen0 := false help (LEGACY PARAM) Set predefined fragment length as zero for cross corr. analysis (add -speak=0 to run_spp.R). 10 | speak_xcor := -1 help Set user-defined cross-corr. peak strandshift (-speak= in run_spp.R). Use -1 to disable (default: -1). 11 | max_ppsize_xcor := "" help R stack size (R parameter --max-ppsize=; between 5000 and 5000000) for cross corr. analysis. 12 | extra_param_xcor := "" help Set extra parameters for run_spp.R (cross-corr. analysis only). 
13 | mem_xcor := "15G" help Max. memory for cross-corr. analysis (default: 15G). 14 | 15 | grp_color_xcor := "yellowgreen" 16 | 17 | init_postalign_xcor() 18 | 19 | 20 | void init_postalign_xcor() { 21 | 22 | fraglen0 = get_conf_val_bool( fraglen0, ["fraglen0"] ) 23 | speak_xcor = get_conf_val_int( speak_xcor, ["speak_xcor"] ) 24 | extra_param_xcor= get_conf_val( extra_param_xcor, ["extra_param_xcor"] ) 25 | mem_xcor = get_conf_val( mem_xcor, ["mem_xcor"] ) 26 | max_ppsize_xcor = get_conf_val( max_ppsize_xcor, ["max_ppsize_xcor"] ) 27 | 28 | // backward compatibility 29 | if ( speak_xcor == -1 && fraglen0 ) speak_xcor = 0 30 | 31 | print("\n\n== postalign cross-corr. analysis settings\n") 32 | print( "Max. memory for UNIX shuf\t\t\t: $mem_shuf\n") 33 | print( "User-defined cross-corr. peak strandshift\t: $speak_xcor\n") 34 | print( "Extra parameters for cross-corr. analysis\t: $extra_param_xcor\n") 35 | print( "Max. memory for cross-corr. analysis\t\t: $mem_xcor\n") 36 | print( "Stack size for cross-corr. analysis\t\t:$max_ppsize_xcor\n") 37 | } 38 | 39 | string subsample_tag_PE_for_xcor( string tag, int nlines, bool non_mito, string o_dir, string group ) { 40 | 41 | prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir ) 42 | nreads_per_mill := metric_prefix( nlines ) 43 | 44 | subsampled_tag := "$prefix."+(non_mito?"no_chrM.":"")+"$nreads_per_mill.R1.tagAlign.gz" 45 | non_mito_param := non_mito ? "grep -v \"chrM\" | " : "" 46 | 47 | joined := "$prefix.joined" // temporary file 48 | joined_subsampled := "$prefix.joined.subsampled" // temporary file 49 | 50 | in := [ tag ] 51 | out := subsampled_tag 52 | 53 | taskName:= "subsample_tag_PE_4_xcor " + group 54 | mem := get_res_mem(mem_shuf,1) 55 | 56 | wait_par( cpus ) 57 | 58 | tid := task( out<-in ) { 59 | 60 | sys $shcmd_init 61 | 62 | // join consecutive two lines into one 63 | sys zcat $tag | sed 'N;s/\n/\t/' > $joined 64 | 65 | //# Shuffle and split temporary combined file into 2 equal parts 66 | //# Will produce $PR_PREFIX00 and $PR_PREFIX01 67 | sys cat $joined | $non_mito_param shuf -n $nlines --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null) > $joined_subsampled 68 | 69 | //# Subsample tagAlign file 70 | sys awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$4,$5,$6}' $joined_subsampled | \ 71 | gzip -nc > $subsampled_tag 72 | 73 | sys rm -f $joined $joined_subsampled 74 | 75 | sys $shcmd_finalize 76 | } 77 | 78 | register_par( tid, cpus ) 79 | 80 | add_task_to_graph( in, out, group ) 81 | 82 | return out 83 | } 84 | 85 | string[] xcor( string tag, string o_dir, string group, int nth_xcor ) { 86 | 87 | // misc. 88 | prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir ) 89 | xcor_score := "$prefix.cc.qc" 90 | xcor_plot := "$prefix.cc.plot.pdf" 91 | param_speak := speak_xcor > -1 ? "-speak=$speak_xcor" : "" 92 | extra_param := max_ppsize_xcor ? "--max-ppsize=$max_ppsize_xcor " : "" 93 | 94 | in := [ tag ] 95 | out := [ xcor_score, xcor_plot ] 96 | 97 | taskName:= "xcor " + group 98 | cpus := (nth_xcor==1) ? -1 : nth_xcor; mem := get_res_mem(mem_xcor,nth_xcor); 99 | 100 | wait_par( cpus ) 101 | 102 | tid := task( out<-in ) { 103 | 104 | sys $shcmd_init 105 | 106 | // # if phantompeakqualtools is an old version, use run_spp_nodups.R. 
new version has run_spp.R only 107 | sys if [[ $(which run_spp_nodups.R 2> /dev/null | wc -l || echo) == "1" ]]; then RUN_SPP=$(which run_spp_nodups.R); \ 108 | else RUN_SPP=$(which run_spp.R); \ 109 | fi 110 | 111 | //# CCSCORE FILE format 112 | //# Filename numReads estFragLen correstFragLen PhantomPeak corrphantomPeak argmincorr mincorr phantomPeakCoef relPhantomPeakCoef QualityTag 113 | sys Rscript $extra_param ${RUN_SPP} -rf \ 114 | -c=$tag -p=$nth_xcor \ 115 | -filtchr=chrM -savp=$xcor_plot -out=$xcor_score $param_speak $extra_param_xcor 116 | sys sed -r 's/,[^\t]+//g' $xcor_score > $xcor_score.tmp 117 | sys mv $xcor_score.tmp $xcor_score 118 | 119 | sys $shcmd_finalize 120 | } 121 | 122 | register_par( tid, cpus ) 123 | 124 | add_task_to_graph( in, out, group, "XCOR", grp_color_xcor ) 125 | 126 | return out 127 | } 128 | 129 | string get_fraglen( string xcor_score ) { // get FRAGLEN (3rd column of cc score file) for spp(-speak=$FRAGLEN) 130 | 131 | cols := xcor_score.read().split("\t") 132 | return cols[2] 133 | } 134 | -------------------------------------------------------------------------------- /modules/species.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == species settings 8 | species := "" help Species. need to specify '-species_file' too if you have not installed genome database with 'install_genome_data.sh'. 9 | species_file := "" help Species file path. 10 | species_browser := "" help Species name in WashU genome browser. 11 | 12 | ref_fa := "" help Reference genome sequence fasta. 13 | chrsz := "" help Chromosome sizes file path (use fetchChromSizes from UCSC tools). 14 | blacklist := "" help Blacklist bed. 15 | seq_dir := "" help Reference genome sequence directory path (where chr*.fa exist). 16 | 17 | init_species() 18 | 19 | void init_species() { 20 | 21 | species = get_conf_val( species, ["species"] ) 22 | species_file = get_conf_val( species_file, ["species_file"] ) 23 | 24 | _read_species() 25 | 26 | species_browser = get_conf_val( species_browser,["species_browser"] ) 27 | 28 | ref_fa = get_conf_val( ref_fa, ["ref_fa"] ) 29 | chrsz = get_conf_val( chrsz, ["chrsz"] ) 30 | blacklist = get_conf_val( blacklist, ["blacklist"] ) 31 | seq_dir = get_conf_val( seq_dir, ["seq_dir"]) 32 | 33 | if ( species_browser == "" ) species_browser = species 34 | 35 | print("\n\n== species settings\n") 36 | print( "Species\t\t\t\t: $species\n" ) 37 | print( "Species file\t\t\t: $species_file\n\n" ) 38 | print( "Species name (WashU browser)\t: $species_browser\n" ) 39 | print( "Ref. genome seq. fasta\t\t: $ref_fa\n" ) 40 | print( "Chr. sizes file\t\t\t: $chrsz\n" ) 41 | print( "Black list bed\t\t\t: $blacklist\n" ) 42 | print( "Ref. genome seq. dir.\t\t: $seq_dir\n" ) 43 | } 44 | 45 | void _read_species() { // check for species configruation files 46 | // value for key will be overriden as loop goes. so the last element in species_paths has the priority 47 | string[] species_paths 48 | if ( env != "" ) species_paths.add( env ) 49 | if ( c != "" ) species_paths.add( c ) 50 | species_paths.add( species_file ) 51 | 52 | for ( string path : species_paths ) { 53 | if ( path.exists() ) { 54 | add_to_conf( path, species ) 55 | } 56 | } 57 | } 58 | 59 | 60 | // temp 61 | /* 62 | bwt_idx := "" help Bowtie index (full path prefix of *.1.ebwt file). 
63 | bwt_idx = get_conf_val( bwt_idx, ["bwt_idx"] ) 64 | print( "Bowtie index\t\t\t: $bwt_idx\n" ) 65 | */ 66 | -------------------------------------------------------------------------------- /modules/sys.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "string.bds" 5 | 6 | helpUnsorted := true // do not sort help 7 | 8 | 9 | script_path := "" 10 | script_dir := "" 11 | 12 | hostname := "" 13 | 14 | // pipeline seeks for executables in the BDS script directory (local git repo) and $PATH 15 | // Add more relative path here if you want to keep your .py .sh .R visible to UNIX `which` as executables. 16 | // Relative paths defined here are according to your script path (not your working directory but where .bds exists) 17 | // Make sure that you chmod 755 your .py .R .sh 18 | _rel_script_file_paths := [".","modules","utils"] 19 | 20 | 21 | init_base() 22 | 23 | 24 | void init_base() { 25 | script_path = "$ppwd/$programPath" 26 | if (!script_path.exists()) script_path = "$programPath" 27 | 28 | script_dir = script_path.dirName() 29 | hostname = get_hostname() 30 | } 31 | 32 | //// script file path 33 | 34 | string[] get_script_file_paths( string suffix ) { 35 | string[] ret 36 | for ( string path : _rel_script_file_paths ) { 37 | path = "$script_dir/$path" 38 | if ( path.exists() ) { 39 | ret.add( path + suffix ) 40 | if ( path.dirName().endsWith( "modules" ) ) ret.add( "$path/../$suffix" ) 41 | } 42 | } 43 | return ret 44 | } 45 | 46 | string[] get_script_file_paths() { 47 | return get_script_file_paths( "" ) 48 | } 49 | 50 | //// command line argument functions 51 | 52 | bool cmd_line_arg_has_key( string key ) { 53 | key = key.toLower() 54 | for ( string arg : args ) { 55 | if ( ("-"+key) == arg.toLower().trim() ) return true 56 | } 57 | return false 58 | } 59 | 60 | bool is_cmd_line_arg_empty() { 61 | return args.size()==0 62 | } 63 | 64 | bool is_first_arg_conf() { 65 | if ( (args.size()>0) && (!args[0].startsWith("-")) ) { 66 | if ( args.size()==1 ) { 67 | return true 68 | } 69 | else { 70 | return args[1].startsWith("-") 71 | } 72 | } 73 | return false 74 | } 75 | 76 | string get_cmd_line_arg_val( string key ) { 77 | key = key.toLower() 78 | for (int i=0; i< args.size(); i++) { 79 | arg := args[i] 80 | if ( ("-"+key) == arg.toLower().trim() ) { 81 | if ( i==(args.size()-1) ) break 82 | next_arg := args[i+1] 83 | 84 | if ( next_arg.startsWith("-") ) break 85 | return next_arg 86 | } 87 | } 88 | return "" 89 | } 90 | 91 | //// functions for file I/O 92 | 93 | string get_path( string str ) { // get absolute path (remove / if exists at end) 94 | if (str.trim() == "") return "" 95 | base := rm_str_at_end( str, "/" ).path() 96 | return base 97 | } 98 | 99 | string mkdir( string str ) { 100 | if (str.trim() == "") return "" 101 | // make filename full path and mkdir -p 102 | path := get_path( str ) 103 | if ( path.exists() ) { 104 | return path 105 | } 106 | else { 107 | path.mkdir() 108 | return path 109 | } 110 | } 111 | 112 | bool path_exists( string path ) { 113 | if ( path!="" ) { 114 | if ( path.exists() ) { 115 | if ( path.isFile() ) { 116 | if ( path.size() > 0 ) return true 117 | } 118 | else { 119 | return true 120 | } 121 | } 122 | } 123 | return false 124 | } 125 | 126 | string copy( string file, string o_dir ) { 127 | file_new := replace_dir( file, o_dir ) 128 | system := "local" // do not use cluster engine for this task 129 | taskName:= "copy file" 130 | 131 | task ( file_new <- file 
) { 132 | 133 | sys cp --remove-destination $file $file_new 134 | sys while [ ! -f $file_new ]; do echo FOUND DELAYED WRITE, WAITING...; sleep 0.1; done 135 | } 136 | 137 | return file_new 138 | } 139 | 140 | string get_stdout( string cmd ) { 141 | rnd := randInt() 142 | cmd_ := "cmd_$rnd".path() 143 | sys $cmd &> $cmd_ || true 144 | ret := cmd_.read() 145 | sys rm -f $cmd_ 146 | return rm_str_at_end(ret,"\n") 147 | } 148 | 149 | string get_shell_var( string var ) { 150 | var_ := "var_$var".path() 151 | sys echo "${$var}" > $var_ 152 | ret := var_.read() 153 | sys rm -f $var_ 154 | return ret 155 | } 156 | 157 | string get_md5sum( string file ) { 158 | return get_stdout( "md5sum $file | awk '{print $1}'" ) 159 | } 160 | 161 | int get_num_lines( string file ) { 162 | if ( !path_exists( file ) ) { 163 | error("get_no_lines(): File doesn't exist! ($file)") 164 | } 165 | else { 166 | if ( file.toLower().endsWith(".gz") ) { // check if compressed or not 167 | return get_stdout( "zcat $file | wc -l" ).parseInt() 168 | } 169 | else { 170 | return get_stdout( "cat $file | wc -l" ).parseInt() 171 | } 172 | } 173 | } 174 | 175 | string get_hostname() { 176 | out := get_stdout("hostname -f").replace("\n","") 177 | if (out.startsWith("hostname: ")) return "default" 178 | else return out 179 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # channels : defaults, r, bioconda 2 | 3 | nomkl 4 | samtools ==1.2 5 | htslib ==1.4 # 1.5 in bioconda needed libbz2.so.1.0 6 | bedtools ==2.26.0 #2.22 # 2.21.0 7 | picard ==1.126 # wanted 1.129 here but doesn't exist. instead 1.139 has backward compatibility issue, so take 1.126 8 | ucsc-fetchchromsizes 9 | ucsc-wigtobigwig 10 | ucsc-bedgraphtobigwig 11 | ucsc-bigwiginfo 12 | ucsc-bedclip 13 | ucsc-bedtobigbed 14 | ucsc-twobittofa 15 | macs2 ==2.1.1.20160309 #2.1.0 (no binaries for OSX) 16 | boost ==1.57.0 17 | openblas ==0.2.19 18 | numpy ==1.11.3 #1.13.3 #1.10.2 (no binaries for OSX) #1.9.0, 1.8.2 conflicts with ATAQC 19 | matplotlib ==1.5.1 20 | six==1.10.0 # to fix (ImportError: cannot import name _thread) 21 | python-dateutil==2.6.1 22 | libgfortran==3.0 23 | graphviz ==2.38.0 24 | libtool 25 | ghostscript # pdf2png 26 | pigz 27 | zlib 28 | sambamba ==0.6.6 # to fix seg fault error in 0.6.1 29 | r ==3.2.2 30 | r-snow 31 | r-snowfall 32 | r-bitops 33 | r-catools 34 | bioconductor-rsamtools 35 | r-spp ==1.13 36 | #glibc #segmentation fault in conda with openssl 37 | pyfaidx ==0.4.7.1 38 | 39 | cutadapt ==1.9.1 40 | preseq ==2.0.3 41 | trim-galore ==0.4.1 # for old trimmer 42 | python-levenshtein # for old trimmer (trimAdapter.py) 43 | 44 | bowtie2 ==2.2.6 45 | ncurses 46 | ucsc-bigWigAverageOverBed 47 | gnuplot #==5.0.3 48 | scipy # ==0.17.0: to fix 'undefined symbol: PyUnicodeUCS2_DecodeUTF8' 49 | pandas #==0.18.0 #==0.16.1 # ataqc 50 | metaseq #==0.5.6 # ataqc 51 | jinja2 # ataqc 52 | gsl # for preseq 53 | pysam==0.8.2.1 # 0.8.3, 0.9 from bioconda has an issue with ATAQC (segmentation fault), need to use -c bcbio 54 | pybedtools==0.6.9 # same issue as in pysam 55 | openssl==1.0.2p 56 | -------------------------------------------------------------------------------- /requirements_py3.txt: -------------------------------------------------------------------------------- 1 | nomkl 2 | python ==3.5.0 3 | numpy ==1.11.3 4 | idr ==2.0.3 5 | bedtools ==2.26.0 6 | pigz 7 | java-jdk ==8.0.92 8 | matplotlib ==1.5.1 9 | 
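Both requirement files above follow the same minimal grammar: one conda package per line, an optional '==' version pin (possibly preceded by whitespace, e.g. "samtools ==1.2"), and '#' to the end of a line as a comment. A hypothetical parser sketch, for illustration only (parse_requirements is an invented name, not pipeline code):

# Hypothetical helper: returns (package, pin) pairs, where pin is None for
# unpinned packages taken as-is from the configured conda channels.
def parse_requirements(path):
    pins = []
    with open(path) as f:
        for raw in f:
            line = raw.split("#", 1)[0].strip()  # drop comments and blank lines
            if not line:
                continue
            if "==" in line:
                name, version = (s.strip() for s in line.split("==", 1))
            else:
                name, version = line, None
            pins.append((name, version))
    return pins

print(parse_requirements("requirements.txt"))  # [('nomkl', None), ('samtools', '1.2'), ...]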
-------------------------------------------------------------------------------- /species/kundaje.conf: -------------------------------------------------------------------------------- 1 | [mm9] 2 | chrsz = /mnt/data/pipeline_genome_data/mm9/mm9.chrom.sizes 3 | seq_dir = /mnt/data/pipeline_genome_data/mm9/seq 4 | gensz = mm 5 | umap = /mnt/data/pipeline_genome_data/mm9/globalmap_k20tok54 6 | bwa_idx = /mnt/data/pipeline_genome_data/mm9/bwa_index/mm9.fa 7 | bwt2_idx = /mnt/data/pipeline_genome_data/mm9/bowtie2_index/mm9.fa 8 | ref_fa = /mnt/data/pipeline_genome_data/mm9/mm9.fa 9 | blacklist = /mnt/data/pipeline_genome_data/mm9/mm9-blacklist.bed.gz 10 | # data for ATAQC 11 | tss_enrich = /mnt/data/pipeline_genome_data/mm9/ataqc/mm9_gencode_tss_unique.bed.gz 12 | dnase = /mnt/data/pipeline_genome_data/mm9/ataqc/mm9_univ_dhs_ucsc.from_mm10.bed.gz 13 | prom = /mnt/data/pipeline_genome_data/mm9/ataqc/tss_mm9_master.from_mm10.bed.gz 14 | enh = /mnt/data/pipeline_genome_data/mm9/ataqc/mm9_enh_dhs_ucsc.from_mm10.bed.gz 15 | reg2map = /mnt/data/pipeline_genome_data/mm9/ataqc/dnase_avgs_merged_named.fseq.vals.gz 16 | roadmap_meta = /mnt/data/pipeline_genome_data/mm9/ataqc/accession_to_name.txt 17 | 18 | [mm10, mm10_ENCODE, mm10_ENCODE3] # from ENCODE portal 19 | chrsz = /mnt/data/pipeline_genome_data/mm10/mm10.chrom.sizes 20 | seq_dir = /mnt/data/pipeline_genome_data/mm10/seq 21 | gensz = mm 22 | bwa_idx = /mnt/data/pipeline_genome_data/mm10/bwa_index/mm10_no_alt_analysis_set_ENCODE.fasta 23 | bwt2_idx = /mnt/data/pipeline_genome_data/mm10/bowtie2_index/mm10_no_alt_analysis_set_ENCODE.fasta 24 | ref_fa = /mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta 25 | blacklist = /mnt/data/pipeline_genome_data/mm10/mm10.blacklist.bed.gz 26 | # data for ATAQC 27 | tss_enrich = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_gencode_tss_unique.bed.gz 28 | dnase = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_univ_dhs_ucsc.bed.gz 29 | prom = /mnt/data/pipeline_genome_data/mm10/ataqc/tss_mm10_master.bed.gz 30 | enh = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_enh_dhs_ucsc.bed.gz 31 | reg2map = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_formatted.txt.gz 32 | reg2map_bed = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_celltype_compare_subsample.bed.gz 33 | roadmap_meta = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_metadata.txt 34 | ENCODE_assembly = mm10 35 | 36 | [hg19] 37 | chrsz = /mnt/data/pipeline_genome_data/hg19/hg19.chrom.sizes 38 | seq_dir = /mnt/data/pipeline_genome_data/hg19/seq 39 | gensz = hs 40 | umap = /mnt/data/pipeline_genome_data/hg19/globalmap_k20tok54 41 | bwa_idx = /mnt/data/pipeline_genome_data/hg19/bwa_index/male.hg19.fa 42 | bwt2_idx = /mnt/data/pipeline_genome_data/hg19/bowtie2_index/male.hg19.fa 43 | ref_fa = /mnt/data/pipeline_genome_data/hg19/male.hg19.fa 44 | blacklist = /mnt/data/pipeline_genome_data/hg19/wgEncodeDacMapabilityConsensusExcludable.bed.gz 45 | 46 | mappability_map_peakseq = /mnt/data/pipeline_genome_data/hg19/Mapability_HG.txt 47 | 48 | # data for ATAQC 49 | tss_enrich = /mnt/data/pipeline_genome_data/hg19/ataqc/hg19_gencode_tss_unique.bed.gz 50 | dnase = /mnt/data/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.bed.gz 51 | prom = /mnt/data/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_prom_p2.bed.gz 52 | enh = /mnt/data/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_enh_p2.bed.gz 53 | reg2map = 
/mnt/data/pipeline_genome_data/hg19/ataqc/dnase_avgs_reg2map_p10_merged_named.pvals.gz 54 | roadmap_meta = /mnt/data/pipeline_genome_data/hg19/ataqc/eid_to_mnemonic.txt 55 | 56 | [hg38, hg38_ENCODE, hg38_ENCODE3] # from ENCODE portal 57 | chrsz = /mnt/data/pipeline_genome_data/hg38/hg38.chrom.sizes 58 | seq_dir = /mnt/data/pipeline_genome_data/hg38/seq 59 | gensz = hs 60 | bwa_idx = /mnt/data/pipeline_genome_data/hg38/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 61 | bwt2_idx = /mnt/data/pipeline_genome_data/hg38/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 62 | ref_fa = /mnt/data/pipeline_genome_data/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 63 | blacklist = /mnt/data/pipeline_genome_data/hg38/hg38.blacklist.bed.gz 64 | # data for ATAQC 65 | tss_enrich = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_gencode_tss_unique.bed.gz 66 | dnase = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz 67 | prom = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz 68 | enh = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz 69 | reg2map = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz 70 | reg2map_bed = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz 71 | roadmap_meta = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt 72 | ENCODE_assembly = GRCh38 73 | 74 | [hg38_chr19_chrM] # hg38 with chr19 and chrM only 75 | chrsz = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/hg38_chr19_chrM.chrom.sizes 76 | seq_dir = /mnt/data/pipeline_genome_data/hg38/seq 77 | gensz = hs 78 | bwa_idx = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.chr19_chrM.fasta 79 | bwt2_idx = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.chr19_chrM.fasta 80 | ref_fa = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/GRCh38_no_alt_analysis_set_GCA_000001405.15.chr19_chrM.fasta 81 | blacklist = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/hg38.blacklist.bed.gz 82 | # data for ATAQC 83 | tss_enrich = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_gencode_tss_unique.bed.gz 84 | dnase = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz 85 | prom = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz 86 | enh = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz 87 | reg2map = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz 88 | reg2map_bed = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz 89 | roadmap_meta = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt 90 | ENCODE_assembly = GRCh38 91 | 92 | [dm3] # installed by install_genome_data.sh 93 | chrsz = /mnt/data/pipeline_genome_data/dm3/dm3.chrom.sizes 94 | seq_dir = /mnt/data/pipeline_genome_data/dm3/seq 95 | gensz = 168736537 96 | bwa_idx = /mnt/data/pipeline_genome_data/dm3/bwa_index/dm3.fa 97 | bwt2_idx = /mnt/data/pipeline_genome_data/dm3/bowtie2_index/dm3.fa 98 | ref_fa = /mnt/data/pipeline_genome_data/dm3/dm3.fa 99 | 100 | [pantro5] # installed by install_genome_data.sh 101 | chrsz = /mnt/data/pipeline_genome_data/pantro5/pantro5.chrom.sizes 102 | 
seq_dir = /mnt/data/pipeline_genome_data/pantro5/seq 103 | gensz = 3231170666 104 | bwa_idx = /mnt/data/pipeline_genome_data/pantro5/bwa_index/panTro5.fa 105 | bwt2_idx = /mnt/data/pipeline_genome_data/pantro5/bowtie2_index/panTro5.fa 106 | ref_fa = /mnt/data/pipeline_genome_data/pantro5/panTro5.fa 107 | 108 | [macam7] # installed by install_genome_data.sh 109 | chrsz = /mnt/data/pipeline_genome_data/macam7/macam7.chrom.sizes 110 | seq_dir = /mnt/data/pipeline_genome_data/macam7/seq 111 | gensz = 2817542206 112 | bwa_idx = /mnt/data/pipeline_genome_data/macam7/bwa_index/MacaM_Rhesus_Genome_v7.fasta 113 | bwt2_idx = /mnt/data/pipeline_genome_data/macam7/bowtie2_index/MacaM_Rhesus_Genome_v7.fasta 114 | ref_fa = /mnt/data/pipeline_genome_data/macam7/MacaM_Rhesus_Genome_v7.fasta 115 | nonamecheck = true # for bedtools >= 2.24. this prevents name convention error in bedtools intersect 116 | 117 | [saccer3] # installed by install_genome_data.sh 118 | chrsz = /mnt/data/pipeline_genome_data/saccer3/saccer3.chrom.sizes 119 | seq = /mnt/data/pipeline_genome_data/saccer3/seq 120 | gensz = 12157105 121 | bwa_idx = /mnt/data/pipeline_genome_data/saccer3/bwa_index/sacCer3.fa 122 | bwt2_idx= /mnt/data/pipeline_genome_data/saccer3/bowtie2_index/sacCer3.fa 123 | ref_fa = /mnt/data/pipeline_genome_data/saccer3/sacCer3.fa 124 | 125 | -------------------------------------------------------------------------------- /species/scg.conf: -------------------------------------------------------------------------------- 1 | [mm9] 2 | chrsz = /reference/ENCODE/pipeline_genome_data/mm9/mm9.chrom.sizes 3 | seq_dir = /reference/ENCODE/pipeline_genome_data/mm9/seq 4 | gensz = mm 5 | umap = /reference/ENCODE/pipeline_genome_data/mm9/globalmap_k20tok54 6 | bwa_idx = /reference/ENCODE/pipeline_genome_data/mm9/bwa_index/mm9.fa 7 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/mm9/bowtie2_index/mm9.fa 8 | ref_fa = /reference/ENCODE/pipeline_genome_data/mm9/mm9.fa 9 | blacklist = /reference/ENCODE/pipeline_genome_data/mm9/mm9-blacklist.bed.gz 10 | # data for ATAQC 11 | tss_enrich = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/mm9_gencode_tss_unique.bed.gz 12 | dnase = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/mm9_univ_dhs_ucsc.from_mm10.bed.gz 13 | prom = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/tss_mm9_master.from_mm10.bed.gz 14 | enh = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/mm9_enh_dhs_ucsc.from_mm10.bed.gz 15 | reg2map = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/dnase_avgs_merged_named.fseq.vals.gz 16 | reg2map_bed = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/mm9_dhs_universal_ucsc_v1.bed.gz 17 | roadmap_meta = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/accession_to_name.txt 18 | 19 | [mm10, mm10_ENCODE, mm10_ENCODE3] # from ENCODE portal 20 | chrsz = /reference/ENCODE/pipeline_genome_data/mm10/mm10.chrom.sizes 21 | seq_dir = /reference/ENCODE/pipeline_genome_data/mm10/seq 22 | gensz = mm 23 | bwa_idx = /reference/ENCODE/pipeline_genome_data/mm10/bwa_index/mm10_no_alt_analysis_set_ENCODE.fasta 24 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/mm10/bowtie2_index/mm10_no_alt_analysis_set_ENCODE.fasta 25 | ref_fa = /reference/ENCODE/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta 26 | blacklist = /reference/ENCODE/pipeline_genome_data/mm10/mm10.blacklist.bed.gz 27 | # data for ATAQC 28 | tss_enrich = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_gencode_tss_unique.bed.gz 29 | dnase = 
/reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_univ_dhs_ucsc.bed.gz 30 | prom = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/tss_mm10_master.bed.gz 31 | enh = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_enh_dhs_ucsc.bed.gz 32 | reg2map = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_formatted.txt.gz 33 | reg2map_bed = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_celltype_compare_subsample.bed.gz 34 | roadmap_meta = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_metadata.txt 35 | ENCODE_assembly = mm10 36 | 37 | [hg19] 38 | chrsz = /reference/ENCODE/pipeline_genome_data/hg19/hg19.chrom.sizes 39 | seq_dir = /reference/ENCODE/pipeline_genome_data/hg19/seq 40 | gensz = hs 41 | umap = /reference/ENCODE/pipeline_genome_data/hg19/globalmap_k20tok54 42 | bwa_idx = /reference/ENCODE/pipeline_genome_data/hg19/bwa_index/male.hg19.fa 43 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/hg19/bowtie2_index/male.hg19.fa 44 | ref_fa = /reference/ENCODE/pipeline_genome_data/hg19/male.hg19.fa 45 | blacklist = /reference/ENCODE/pipeline_genome_data/hg19/wgEncodeDacMapabilityConsensusExcludable.bed.gz 46 | # data for ATAQC 47 | tss_enrich = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/hg19_gencode_tss_unique.bed.gz 48 | dnase = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.bed.gz 49 | prom = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_prom_p2.bed.gz 50 | enh = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_enh_p2.bed.gz 51 | reg2map = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/dnase_avgs_reg2map_p10_merged_named.pvals.gz 52 | roadmap_meta = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/eid_to_mnemonic.txt 53 | 54 | [hg38, hg38_ENCODE, hg38_ENCODE3] # from ENCODE portal 55 | chrsz = /reference/ENCODE/pipeline_genome_data/hg38/hg38.chrom.sizes 56 | seq_dir = /reference/ENCODE/pipeline_genome_data/hg38/seq 57 | gensz = hs 58 | bwa_idx = /reference/ENCODE/pipeline_genome_data/hg38/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 59 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/hg38/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 60 | ref_fa = /reference/ENCODE/pipeline_genome_data/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 61 | blacklist = /reference/ENCODE/pipeline_genome_data/hg38/hg38.blacklist.bed.gz 62 | # data for ATAQC 63 | tss_enrich = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/hg38_gencode_tss_unique.bed.gz 64 | dnase = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz 65 | prom = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz 66 | enh = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz 67 | reg2map = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz 68 | reg2map_bed = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz 69 | roadmap_meta = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt 70 | ENCODE_assembly = GRCh38 71 | 72 | [dm3] # installed by install_genome_data.sh 73 | chrsz = /reference/ENCODE/pipeline_genome_data/dm3/dm3.chrom.sizes 74 | seq_dir = /reference/ENCODE/pipeline_genome_data/dm3/seq 75 | gensz = 168736537 76 | bwa_idx = 
/reference/ENCODE/pipeline_genome_data/dm3/bwa_index/dm3.fa 77 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/dm3/bowtie2_index/dm3.fa 78 | ref_fa = /reference/ENCODE/pipeline_genome_data/dm3/dm3.fa 79 | 80 | [pantro5] # installed by install_genome_data.sh 81 | chrsz = /reference/ENCODE/pipeline_genome_data/pantro5/pantro5.chrom.sizes 82 | seq_dir = /reference/ENCODE/pipeline_genome_data/pantro5/seq 83 | gensz = 3231170666 84 | bwa_idx = /reference/ENCODE/pipeline_genome_data/pantro5/bwa_index/panTro5.fa 85 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/pantro5/bowtie2_index/panTro5.fa 86 | ref_fa = /reference/ENCODE/pipeline_genome_data/pantro5/panTro5.fa 87 | 88 | [macam7] # installed by install_genome_data.sh 89 | chrsz = /reference/ENCODE/pipeline_genome_data/macam7/macam7.chrom.sizes 90 | seq_dir = /reference/ENCODE/pipeline_genome_data/macam7/seq 91 | gensz = 2817542206 92 | bwa_idx = /reference/ENCODE/pipeline_genome_data/macam7/bwa_index/MacaM_Rhesus_Genome_v7.fasta 93 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/macam7/bowtie2_index/MacaM_Rhesus_Genome_v7.fasta 94 | ref_fa = /reference/ENCODE/pipeline_genome_data/macam7/MacaM_Rhesus_Genome_v7.fasta 95 | nonamecheck = true # for bedtools >= 2.24. this prevents name convention error in bedtools intersect 96 | 97 | [saccer3] # installed by install_genome_data.sh 98 | chrsz = /reference/ENCODE/pipeline_genome_data/saccer3/saccer3.chrom.sizes 99 | seq = /reference/ENCODE/pipeline_genome_data/saccer3/seq 100 | gensz = 12157105 101 | bwa_idx = /reference/ENCODE/pipeline_genome_data/saccer3/bwa_index/sacCer3.fa 102 | bwt2_idx= /reference/ENCODE/pipeline_genome_data/saccer3/bowtie2_index/sacCer3.fa 103 | ref_fa = /reference/ENCODE/pipeline_genome_data/saccer3/sacCer3.fa 104 | 105 | -------------------------------------------------------------------------------- /species/sherlock.conf: -------------------------------------------------------------------------------- 1 | [mm9] 2 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/mm9/mm9.chrom.sizes 3 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/mm9/seq 4 | gensz = mm 5 | umap = /home/groups/cherry/encode/pipeline_genome_data/mm9/globalmap_k20tok54 6 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/mm9/bwa_index/mm9.fa 7 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/mm9/bowtie2_index/mm9.fa 8 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/mm9/mm9.fa 9 | blacklist = /home/groups/cherry/encode/pipeline_genome_data/mm9/mm9-blacklist.bed.gz 10 | # data for ATAQC 11 | tss_enrich = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/mm9_gencode_tss_unique.bed.gz 12 | dnase = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/mm9_univ_dhs_ucsc.from_mm10.bed.gz 13 | prom = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/tss_mm9_master.from_mm10.bed.gz 14 | enh = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/mm9_enh_dhs_ucsc.from_mm10.bed.gz 15 | reg2map = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/dnase_avgs_merged_named.fseq.vals.gz 16 | reg2map_bed = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/mm9_dhs_universal_ucsc_v1.bed.gz 17 | roadmap_meta = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/accession_to_name.txt 18 | 19 | [mm10, mm10_ENCODE, mm10_ENCODE3] # from ENCODE portal 20 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/mm10/mm10.chrom.sizes 21 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/mm10/seq 22 | gensz = mm 23 | 
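# NOTE: gensz appears to follow the MACS2 genome-size convention used by the
# peak caller: the shorthands 'hs'/'mm' for human/mouse, or an explicit
# effective genome length in bp for other assemblies (see the
# dm3/pantro5/macam7/saccer3 sections below).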
bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/mm10/bwa_index/mm10_no_alt_analysis_set_ENCODE.fasta 24 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/mm10/bowtie2_index/mm10_no_alt_analysis_set_ENCODE.fasta 25 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta 26 | blacklist = /home/groups/cherry/encode/pipeline_genome_data/mm10/mm10.blacklist.bed.gz 27 | # data for ATAQC 28 | tss_enrich = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_gencode_tss_unique.bed.gz 29 | dnase = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_univ_dhs_ucsc.bed.gz 30 | prom = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/tss_mm10_master.bed.gz 31 | enh = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_enh_dhs_ucsc.bed.gz 32 | reg2map = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_formatted.txt.gz 33 | reg2map_bed = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_celltype_compare_subsample.bed.gz 34 | roadmap_meta = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_metadata.txt 35 | ENCODE_assembly = mm10 36 | 37 | [hg19] 38 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/hg19/hg19.chrom.sizes 39 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/hg19/seq 40 | gensz = hs 41 | umap = /home/groups/cherry/encode/pipeline_genome_data/hg19/globalmap_k20tok54 42 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/hg19/bwa_index/male.hg19.fa 43 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/hg19/bowtie2_index/male.hg19.fa 44 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/hg19/male.hg19.fa 45 | blacklist = /home/groups/cherry/encode/pipeline_genome_data/hg19/wgEncodeDacMapabilityConsensusExcludable.bed.gz 46 | # data for ATAQC 47 | tss_enrich = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/hg19_gencode_tss_unique.bed.gz 48 | dnase = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.bed.gz 49 | prom = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_prom_p2.bed.gz 50 | enh = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_enh_p2.bed.gz 51 | reg2map = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/dnase_avgs_reg2map_p10_merged_named.pvals.gz 52 | roadmap_meta = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/eid_to_mnemonic.txt 53 | 54 | [hg38, hg38_ENCODE, hg38_ENCODE3] # from ENCODE portal 55 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/hg38/hg38.chrom.sizes 56 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/hg38/seq 57 | gensz = hs 58 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/hg38/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 59 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/hg38/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 60 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 61 | blacklist = /home/groups/cherry/encode/pipeline_genome_data/hg38/hg38.blacklist.bed.gz 62 | # data for ATAQC 63 | tss_enrich = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/hg38_gencode_tss_unique.bed.gz 64 | dnase = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz 65 | prom 
= /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz 66 | enh = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz 67 | reg2map = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz 68 | reg2map_bed = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz 69 | roadmap_meta = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt 70 | ENCODE_assembly = GRCh38 71 | 72 | [dm3] # installed by install_genome_data.sh 73 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/dm3/dm3.chrom.sizes 74 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/dm3/seq 75 | gensz = 168736537 76 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/dm3/bwa_index/dm3.fa 77 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/dm3/bowtie2_index/dm3.fa 78 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/dm3/dm3.fa 79 | 80 | [pantro5] # installed by install_genome_data.sh 81 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/pantro5/pantro5.chrom.sizes 82 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/pantro5/seq 83 | gensz = 3231170666 84 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/pantro5/bwa_index/panTro5.fa 85 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/pantro5/bowtie2_index/panTro5.fa 86 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/pantro5/panTro5.fa 87 | 88 | [macam7] # installed by install_genome_data.sh 89 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/macam7/macam7.chrom.sizes 90 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/macam7/seq 91 | gensz = 2817542206 92 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/macam7/bwa_index/MacaM_Rhesus_Genome_v7.fasta 93 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/macam7/bowtie2_index/MacaM_Rhesus_Genome_v7.fasta 94 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/macam7/MacaM_Rhesus_Genome_v7.fasta 95 | nonamecheck = true # for bedtools >= 2.24. this prevents name convention error in bedtools intersect 96 | 97 | [saccer3] # installed by install_genome_data.sh 98 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/saccer3/saccer3.chrom.sizes 99 | seq = /home/groups/cherry/encode/pipeline_genome_data/saccer3/seq 100 | gensz = 12157105 101 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/saccer3/bwa_index/sacCer3.fa 102 | bwt2_idx= /home/groups/cherry/encode/pipeline_genome_data/saccer3/bowtie2_index/sacCer3.fa 103 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/saccer3/sacCer3.fa 104 | 105 | -------------------------------------------------------------------------------- /uninstall_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## conda environment name 4 | 5 | ENV_NAME=bds_atac 6 | ENV_NAME_PY3=bds_atac_py3 7 | 8 | conda env remove --name ${ENV_NAME} -y 9 | conda env remove --name ${ENV_NAME_PY3} -y 10 | -------------------------------------------------------------------------------- /utils/assign_multimappers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # piped script to take multimappers and randomly assign 4 | # requires a qname sorted file!! 
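#
# Summary of the stream filter below (describing the code as written):
# SAM records from stdin are grouped by QNAME; header lines ('@...') pass
# straight through. A QNAME group with fewer than -k alignments (the cutoff
# is doubled for paired-end data, since each alignment spans two lines) is
# written out in full for downstream filtering with samtools; groups at or
# above the cutoff are discarded entirely.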
5 | 6 | import sys 7 | import random 8 | import argparse 9 | 10 | def parse_args(): 11 | ''' 12 | Gives options 13 | ''' 14 | parser = argparse.ArgumentParser(description='Saves reads below an alignment threshold and discards all others') 15 | parser.add_argument('-k', help='Alignment number cutoff') 16 | parser.add_argument('--paired-end', dest='paired_ended', action='store_true', help='Data is paired-end') 17 | args = parser.parse_args() 18 | alignment_cutoff = int(args.k) 19 | paired_ended = args.paired_ended 20 | 21 | return alignment_cutoff, paired_ended 22 | 23 | 24 | if __name__ == "__main__": 25 | ''' 26 | Runs the filtering step of choosing multimapped reads 27 | ''' 28 | 29 | [alignment_cutoff, paired_ended] = parse_args() 30 | 31 | if paired_ended: 32 | alignment_cutoff = int(alignment_cutoff) * 2 33 | 34 | # Store each line in sam file as a list of reads, 35 | # where each read is a list of elements to easily 36 | # modify or grab things 37 | current_reads = [] 38 | current_qname = '' 39 | 40 | for line in sys.stdin: 41 | 42 | read_elems = line.strip().split('\t') 43 | 44 | if read_elems[0].startswith('@'): 45 | sys.stdout.write(line) 46 | continue 47 | 48 | # Keep taking lines that have the same qname 49 | if read_elems[0] == current_qname: 50 | # Add line to current reads 51 | current_reads.append(line) 52 | pass 53 | else: 54 | # Discard if there are more than the alignment cutoff 55 | if len(current_reads) >= alignment_cutoff: 56 | current_reads = [line] 57 | current_qname = read_elems[0] 58 | elif len(current_reads) > 0: 59 | # Just output all reads, which are then filtered with 60 | # samtools 61 | for read in current_reads: 62 | sys.stdout.write(str(read)) 63 | 64 | # And then discard 65 | current_reads = [line] 66 | current_qname = read_elems[0] 67 | else: 68 | # First read in file 69 | current_reads.append(line) 70 | current_qname = read_elems[0] 71 | 72 | # Flush the last QNAME group; the loop above only emits a completed 73 | # group when the next one starts, so handle the final group here 74 | if 0 < len(current_reads) < alignment_cutoff: 75 | for read in current_reads: 76 | sys.stdout.write(str(read)) 77 | -------------------------------------------------------------------------------- /utils/axt_dirfiles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,glob,gzip,os 4 | 5 | # axt format: http://genome.ucsc.edu/goldenPath/help/axt.html 6 | 7 | if len(sys.argv)!=3: 8 | print '<query chrom size file> <output file>\n Run under the dir of gzipped Axt files, presumably one for each target chr but that doesn\'t matter' 9 | sys.exit() 10 | 11 | chrsize={} 12 | with open(sys.argv[1]) as fin: 13 | for line in fin: 14 | lst=line.rstrip().split('\t') 15 | chrsize[lst[0]]=int(lst[1]) 16 | 17 | 18 | OF=sys.argv[2] 19 | 20 | fout=open(OF,'w') 21 | 22 | id=1 23 | 24 | for f in glob.glob('*'): 25 | fin=gzip.GzipFile(f,'r') 26 | line=fin.readline() 27 | while line: 28 | if line[0]!='#': 29 | lst=line.rstrip().split() 30 | # query start/stop 31 | a=0 32 | b=0 33 | if lst[7]=='+': 34 | a=int(lst[5])-1 35 | b=lst[6] 36 | else: 37 | c=chrsize[lst[4]] 38 | a=c-int(lst[6]) 39 | b=c-int(lst[5])+1 40 | 41 | fout.write('{0[1]}\t{2}\t{0[3]}\tid:{1},genomealign:{{chr:"{0[4]}",start:{3},stop:{4},strand:"{0[7]}",targetseq:'.format( 42 | lst, 43 | id, 44 | int(lst[2])-1, 45 | a, 46 | b 47 | )) 48 | id+=1 49 | line=fin.readline().rstrip() 50 | fout.write('"'+line+'",queryseq:') 51 | line=fin.readline().rstrip() 52 | fout.write('"'+line+'"}\n') 53 | fin.readline() 54 | line=fin.readline() 55 | 56 | 57 | fout.close() 58 | 59 | 60 | os.system('sort -k1,1 -k2,2n '+OF+' > xx') 61 | os.system('mv xx '+OF) 62 | os.system('bgzip -f '+OF) 63 | os.system('tabix -f -p bed '+OF+'.gz') 64 | 
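As an aside, the '-' strand branch above (a = size - end, b = size - start + 1) implements the standard axt coordinate flip: query coordinates are 1-based inclusive and, for '-' alignments, are given on the reverse-complemented query sequence, so they are mirrored against the chromosome length to obtain forward-strand, 0-based half-open intervals. A self-contained restatement of that arithmetic (illustrative only; the function name is ours, not part of the repo):

# Restates the strand-aware conversion used in axt_dirfiles.py above.
def axt_query_to_bed(start_1based, end_1based, strand, chrom_size):
    if strand == '+':
        return start_1based - 1, end_1based
    # '-' strand: mirror the interval onto the forward strand
    return chrom_size - end_1based, chrom_size - start_1based + 1

# e.g. on a 10 bp query chromosome, bases 1..3 of the '-' strand
# correspond to the forward-strand BED interval (7, 10)
assert axt_query_to_bed(1, 3, '-', 10) == (7, 10)
assert axt_query_to_bed(1, 3, '+', 10) == (0, 3)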
-------------------------------------------------------------------------------- /utils/bds_scr: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 2 ]; then 4 | echo 5 | echo "Create a detached screen for a BDS script and redirect stdout/stderr to a log file." 6 | echo "If you skip [LOG_FILE_NAME], a log file [SCR_NAME].log will be generated on the working directory." 7 | echo "If a log file already exists, stdout/stderr will be appended to it." 8 | echo "Monitor a log file with 'tail -f [LOG_FILE_NAME]'" 9 | echo 10 | echo "Usage: bds_scr [SCR_NAME] [LOG_FILE_NAME] [BDS_PARAM]" 11 | echo " Example: bds_scr TEST ~/TEST.log -s sge chipseq.bds -fastq1 ..." 12 | echo 13 | exit 0 14 | fi 15 | 16 | SCR_NAME="$1".BDS 17 | 18 | #if [ $(screen -ls $SCR_NAME | grep 'No Sockets' | wc -l) != "1" ]; then 19 | if [ $(screen -ls | grep -P "[\t ]\d+.$SCR_NAME" | wc -l) != "0" ]; then 20 | echo "error: A screen named $SCR_NAME already exists." 21 | exit 1 22 | else 23 | echo "[SCR_NAME] : $SCR_NAME" 24 | fi 25 | 26 | if [[ $2 == -* || $2 == *.bds ]]; then # LOG_FILE_NAME skipped 27 | LOG_FILE_NAME="$PWD/$SCR_NAME.log" 28 | PARAM_START_IDX=2 29 | elif [[ $3 == -* || $3 == *.bds ]]; then 30 | LOG_FILE_NAME=$2 31 | PARAM_START_IDX=3 32 | else 33 | echo "error: [BDS_PARAM] is wrong." 34 | exit 1 35 | fi 36 | 37 | PARAM= 38 | 39 | if [ $(find $LOG_FILE_NAME -mmin -2 2> /dev/null | wc -l) != "0" ]; then 40 | echo "error: log file handle is open or very fresh (modified in past 2 minutes)." 41 | exit 3 42 | fi 43 | 44 | for ((i=$PARAM_START_IDX;i<=$#;i++)); do 45 | PARAM="$PARAM ${!i}" 46 | done 47 | 48 | echo "[HOST] : $(hostname -f)" 49 | echo "[LOG_FILE_NAME] : $LOG_FILE_NAME" 50 | echo "[BDS_PARAM] : $PARAM" 51 | 52 | mkdir -p $(dirname $LOG_FILE_NAME) 53 | 54 | echo "" 55 | echo "===== Created a new screen ====" >> $LOG_FILE_NAME 56 | echo "[DATE] : $(date)" >> $LOG_FILE_NAME 57 | echo "[HOST] : $(hostname -f)" >> $LOG_FILE_NAME 58 | echo "[SCR_NAME] : $SCR_NAME" >> $LOG_FILE_NAME 59 | echo "[BDS_PARAM] : $PARAM" >> $LOG_FILE_NAME 60 | echo "" >> $LOG_FILE_NAME 61 | 62 | screen -Sdm $SCR_NAME bash -c "bds &>>$LOG_FILE_NAME $PARAM $>>$LOG_FILE_NAME" 63 | 64 | -------------------------------------------------------------------------------- /utils/bds_scr_5min: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 2 ]; then 4 | echo 5 | echo "Create a detached screen for a BDS script and redirect stdout/stderr to a log file." 6 | echo "If you skip [LOG_FILE_NAME], a log file [SCR_NAME].log will be generated on the working directory." 7 | echo "If a log file already exists, stdout/stderr will be appended to it." 8 | echo "Monitor a log file with 'tail -f [LOG_FILE_NAME]'" 9 | echo 10 | echo "Usage: bds_scr [SCR_NAME] [LOG_FILE_NAME] [BDS_PARAM]" 11 | echo " Example: bds_scr TEST ~/TEST.log -s sge chipseq.bds -fastq1 ..." 12 | echo 13 | exit 0 14 | fi 15 | 16 | SCR_NAME=$1.BDS 17 | 18 | #if [ $(screen -ls $SCR_NAME | grep 'No Sockets' | wc -l) != "1" ]; then 19 | if [ $(screen -ls | grep -P "[\t ]\d+.$SCR_NAME" | wc -l) != "0" ]; then 20 | echo "error: A screen named $SCR_NAME already exists." 
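# 'screen -ls' lists sessions as "<pid>.<name>", so the grep above detects a
# live session with this exact name; refuse to start a duplicate.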
21 | exit 1 22 | else 23 | echo "[SCR_NAME] : $SCR_NAME" 24 | fi 25 | 26 | if [[ $2 == -* || $2 == *.bds ]]; then # LOG_FILE_NAME skipped 27 | LOG_FILE_NAME="$PWD/$SCR_NAME.log" 28 | PARAM_START_IDX=2 29 | elif [[ $3 == -* || $3 == *.bds ]]; then 30 | LOG_FILE_NAME=$2 31 | PARAM_START_IDX=3 32 | else 33 | echo "error: [BDS_PARAM] is wrong." 34 | exit 2 35 | fi 36 | 37 | if [ $(find $LOG_FILE_NAME -mmin -5 | wc -l) != "0" ]; then 38 | echo "error: log file handle is open or very fresh (modified in past 5 minutes)." 39 | exit 3 40 | fi 41 | 42 | PARAM= 43 | 44 | for ((i=$PARAM_START_IDX;i<=$#;i++)); do 45 | PARAM="$PARAM ${!i}" 46 | done 47 | 48 | echo "[LOG_FILE_NAME] : $LOG_FILE_NAME" 49 | echo "[BDS_PARAM] : $PARAM" 50 | 51 | mkdir -p $(dirname $LOG_FILE_NAME) 52 | 53 | echo "" 54 | echo "===== Created a new screen ====" >> $LOG_FILE_NAME 55 | echo "[DATE] : $(date)" >> $LOG_FILE_NAME 56 | echo "[HOST] : $(hostname -f)" >> $LOG_FILE_NAME 57 | echo "[SCR_NAME] : $SCR_NAME" >> $LOG_FILE_NAME 58 | echo "[BDS_PARAM] : $PARAM" >> $LOG_FILE_NAME 59 | echo "" >> $LOG_FILE_NAME 60 | 61 | screen -Sdm $SCR_NAME bash -c "bds &>>$LOG_FILE_NAME $PARAM $>>$LOG_FILE_NAME" 62 | 63 | -------------------------------------------------------------------------------- /utils/broadpeak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,os 4 | 5 | if len(sys.argv)!=3: 6 | print '<infile> <outfile>' 7 | sys.exit() 8 | 9 | infile,outfile=sys.argv[1:] 10 | 11 | # all values on 9th field are -1, exclude them 12 | 13 | id=1 14 | fout=open(outfile,'w') 15 | with open(infile) as fin: 16 | for line in fin: 17 | lst=line.rstrip().split('\t') 18 | fout.write('{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]}],id:{1},'.format(lst,id)) 19 | id+=1 20 | if len(lst[3])>1: 21 | fout.write('name:"'+lst[3]+'",') 22 | if lst[5]!='.': 23 | fout.write('strand:"'+lst[5]+'",') 24 | fout.write('\n') 25 | fout.close() 26 | 27 | os.system('sort -k1,1 -k2,2n '+outfile+' > '+outfile+'.srt') 28 | os.system('mv '+outfile+'.srt'+' '+outfile) 29 | os.system('bgzip -f '+outfile) 30 | os.system('tabix -f -p bed '+outfile+'.gz') 31 | -------------------------------------------------------------------------------- /utils/clusterGeneric/kill.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #------------------------------------------------------------------------------- 4 | # BDS generic cluster example 5 | # 6 | # This is a trivial example of the 'cluster generic' interface implementation. 7 | # The commands implemented in this example simply pass the proper arguments 8 | # to qsub, qdel or qstat commands. 9 | # This is intended as a toy example, since bds can do this directly (but 10 | # it's a good starting point to extend your own implementation). 11 | # 12 | # The script is called when a task is killed 13 | # 14 | # Script's output: 15 | # None 16 | # 17 | # Command line arguments: 18 | # jobId: This is the jobId returned as the first line in 'clusterGenericRun' 19 | # script (i.e. 
the jobID provided by the cluster management system) 20 | # 21 | # Pablo Cingolani 22 | #------------------------------------------------------------------------------- 23 | 24 | #--- 25 | # Parse command line arguments 26 | #--- 27 | die "Error: Missing arguments.\nUsage: kill.pl jobId\n" if $#ARGV < 0 ; 28 | #$jobId = shift @ARGV; 29 | $jobId = join(' ', @ARGV); 30 | 31 | #--- 32 | # Execute cluster command to kill task 33 | #--- 34 | $exitCode = system "scancel $jobId"; 35 | 36 | # OK 37 | exit($exitCode); 38 | 39 | -------------------------------------------------------------------------------- /utils/clusterGeneric/postMortemInfo.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #------------------------------------------------------------------------------- 4 | # BDS generic cluster example 5 | # 6 | # This is a trivial example of the 'cluster generic' interface implementation. 7 | # The commands implemented in this example simply pass the proper arguments 8 | # to qsub, qdel or qstat commands. 9 | # This is intended as a toy example, since bds can do this directly (but 10 | # it's a good starting point to extend your own implementation). 11 | # 12 | # The following command is executed in order to get information on a recently 13 | # finished jobId. This information is typically used for debugging and is added 14 | # to bds's output. 15 | # 16 | # Script's output: 17 | # The output is not parsed; it is stored and later shown 18 | # in bds's report. It should contain information relevant 19 | # to the job's execution (e.g. "qstat -f $jobId" or 20 | # "checkjob -v $jobId") 21 | # 22 | # Command line arguments: 23 | # jobId: This is the jobId returned as the first line in 'clusterGenericRun' 24 | # script (i.e. the jobID provided by the cluster management system) 25 | # 26 | # Pablo Cingolani 27 | #------------------------------------------------------------------------------- 28 | 29 | #--- 30 | # Parse command line arguments 31 | #--- 32 | die "Error: Missing arguments.\nUsage: postMortemInfo.pl jobId\n" if $#ARGV < 0 ; 33 | $jobId = shift @ARGV; 34 | 35 | #--- 36 | # Execute cluster command to show task details 37 | #--- 38 | $exitCode = system "squeue -j $jobId"; 39 | 40 | # OK 41 | exit($exitCode); 42 | 43 | -------------------------------------------------------------------------------- /utils/clusterGeneric/run.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use POSIX; 4 | 5 | die "Error: Missing arguments.\nUsage: run.pl timeout cpus mem queue saveStdout saveStderr cmd arg1 ... argN\n" if $#ARGV < 6 ; 6 | 7 | $timeout = shift @ARGV; 8 | $cpus = shift @ARGV; 9 | $mem = shift @ARGV; 10 | $queue = shift @ARGV; 11 | $saveStdout = shift @ARGV; 12 | $saveStderr = shift @ARGV; 13 | $cmd = join(' ', @ARGV); 14 | 15 | $qsub = "sbatch --export=ALL "; 16 | $qsub .= "-n 1 --ntasks-per-node=1 --cpus-per-task=$cpus " if( $cpus > 0 ); 17 | if( $mem > 0 ) { 18 | $mem = ceil($mem/1000000); # MB 19 | $qsub .= "--mem-per-cpu $mem "; 20 | } 21 | if( $timeout > 0 ) { 22 | $timeout = ceil($timeout/60); # minute 23 | $qsub .= "-t $timeout "; 24 | } 25 | if ( $queue ne "" ) { 26 | $qsub .= "-p $queue " 27 | } 28 | 29 | $pid = open QSUB, " | $qsub"; 30 | die "Cannot run command '$qsub'\n" if ! kill(0, $pid); # Check that process exists 31 | print QSUB "#!/bin/sh \n"; # SLURM sbatch needs this shebang... 
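# Note: the task body is streamed to sbatch on STDIN through the pipe opened
# above, so no temporary job-script file is ever written to disk.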
32 | print QSUB "$cmd\n"; # Send cluster's task via qsub's STDIN 33 | close QSUB; 34 | 35 | exit(0); 36 | 37 | -------------------------------------------------------------------------------- /utils/clusterGeneric/stat.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #------------------------------------------------------------------------------- 4 | # BDS generic cluster example 5 | # 6 | # This is a trivial example of the 'cluster generic' interface implementation. 7 | # The commands implemented in this example simply pass the proper arguments 8 | # to qsub, qdel or qstat commands. 9 | # This is intended as a toy example, since bds can do this directly (but 10 | # it's a good starting point to extend your own implementation). 11 | # 12 | # This script is executed in order to show the jobID of all jobs currently 13 | # scheduled in the cluster 14 | # 15 | # Script's output: 16 | # This script is expected to print all jobs currently scheduled or 17 | # running in the cluster (e.g. qstat). One per line. The FIRST column 18 | # should be the jobID (columns are space or tab separated). Other 19 | # columns may exist (but are currently ignored). 20 | # 21 | # Command line arguments: 22 | # None 23 | # 24 | # Pablo Cingolani 25 | #------------------------------------------------------------------------------- 26 | 27 | #--- 28 | # Execute cluster command to show all tasks 29 | #--- 30 | $exitCode = system "squeue"; 31 | 32 | # OK 33 | exit($exitCode); 34 | -------------------------------------------------------------------------------- /utils/detect_adapter.py: -------------------------------------------------------------------------------- 1 | # written by Nathan Boley, from https://github.com/nboley/GGR_code 2 | 3 | import sys 4 | import gzip 5 | 6 | VERBOSE = False 7 | 8 | adapters = { 9 | 'Illumina': b'AGATCGGAAGAGC', 10 | 'Nextera ': b'CTGTCTCTTATA', 11 | 'smallRNA': b'TGGAATTCTCGG' 12 | } 13 | 14 | def detect_adapters_and_cnts(fname, max_n_lines=1000000): 15 | adapter_cnts = { 16 | 'Illumina': 0, 17 | 'Nextera ': 0, 18 | 'smallRNA': 0 19 | } 20 | 21 | with gzip.open(fname) as fp: 22 | # read the first million sequences or to the end of the file -- whichever 23 | # comes first, and then use the adapter for trimming which was found to 24 | # occur most often 25 | for seq_index, line in enumerate(fp): 26 | if seq_index >= max_n_lines: break 27 | if seq_index%4 != 1: continue 28 | for key in adapters: 29 | if line.find(adapters[key]) > -1: 30 | adapter_cnts[key] += 1 31 | 32 | observed_adapters = [ 33 | adapter for adapter, cnt in sorted( 34 | adapter_cnts.items(), key=lambda x: -x[1]) 35 | if cnt > 0 36 | ] 37 | return observed_adapters, adapter_cnts, seq_index//4 38 | 39 | def detect_most_likely_adapter(fname): 40 | observed_adapters, adapter_cnts, n_obs_adapters = detect_adapters_and_cnts(fname) 41 | if observed_adapters: 42 | best_adapter = observed_adapters[0] 43 | else: 44 | best_adapter = "" 45 | 46 | if VERBOSE: 47 | print("\n\nAUTO-DETECTING ADAPTER TYPE\n===========================") 48 | print("Attempting to auto-detect adapter type from the first 1 million sequences of the first file (>> {} <<)\n".format( 49 | fname) 50 | ) 51 | print("Found perfect matches for the following adapter sequences:") 52 | print("Adapter type\tCount\tSequence\tSequences analysed\tPercentage") 53 | for adapter in observed_adapters: 54 | print("{}\t{}\t{}\t{}\t\t\t{:.2%}".format( 55 | adapter, 56 | adapter_cnts[adapter], 57 | 
adapters[adapter].decode(), 58 | n_obs_adapters, 59 | adapter_cnts[adapter]/n_obs_adapters) 60 | ) 61 | return best_adapter 62 | 63 | def main(): 64 | global VERBOSE 65 | VERBOSE = True 66 | best_adapter = detect_most_likely_adapter(sys.argv[1]) 67 | print(best_adapter) 68 | 69 | if __name__ == '__main__': 70 | main() 71 | -------------------------------------------------------------------------------- /utils/gappedpeak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,os 4 | 5 | if len(sys.argv)!=3: 6 | print '<infile> <outfile>' 7 | sys.exit() 8 | 9 | infile,outfile=sys.argv[1:] 10 | 11 | id=1 12 | fout=open(outfile,'w') 13 | with open(infile) as fin: 14 | for line in fin: 15 | lst=line.rstrip().split('\t') 16 | fout.write('{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[12]},{0[13]},{0[14]}],id:{1},struct:{{thin:[[{0[1]},{0[2]}]],thick:['.format(lst,id)) 17 | id+=1 18 | a=int(lst[1]) 19 | sizes=lst[10].split(',') 20 | starts=lst[11].split(',') 21 | for i in range(len(sizes)): 22 | fout.write('[{0},{1}],'.format(a+int(starts[i]),a+int(starts[i])+int(sizes[i]))) 23 | fout.write(']},') 24 | 25 | if len(lst[3])>1: 26 | fout.write('name:"'+lst[3]+'",') 27 | if lst[5]!='.': 28 | fout.write('strand:"'+lst[5]+'",') 29 | fout.write('\n') 30 | 31 | fout.close() 32 | 33 | os.system('sort -k1,1 -k2,2n '+outfile+' > '+outfile+'.srt') 34 | os.system('mv '+outfile+'.srt'+' '+outfile) 35 | os.system('bgzip -f '+outfile) 36 | os.system('tabix -f -p bed '+outfile+'.gz') 37 | -------------------------------------------------------------------------------- /utils/get_read_length_from_fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # code extracted from Daniel Kim's ATAQC module (run_ataqc.py) 3 | 4 | import os, sys, re, gzip 5 | 6 | def getFileHandle(filename, mode="r"): 7 | if (re.search('.gz$',filename) or re.search('.gzip',filename)): 8 | if (mode=="r"): 9 | mode="rb"; 10 | return gzip.open(filename,mode) 11 | else: 12 | return open(filename,mode) 13 | 14 | def get_read_length(fastq_file): 15 | ''' 16 | Get read length out of fastq file 17 | ''' 18 | total_reads_to_consider = 1000000 19 | line_num = 0 20 | total_reads_considered = 0 21 | max_length = 0 22 | with getFileHandle(fastq_file, 'rb') as fp: 23 | for line in fp: 24 | if line_num % 4 == 1: 25 | if len(line.strip()) > max_length: 26 | max_length = len(line.strip()) 27 | total_reads_considered += 1 28 | if total_reads_considered >= total_reads_to_consider: 29 | break 30 | line_num += 1 31 | 32 | return int(max_length) 33 | 34 | def main(): 35 | print(get_read_length(sys.argv[1])) 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /utils/kill_scr: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 1 ]; then 4 | echo 5 | echo "Kill a screen with name [SCR_NAME]" 6 | echo "Usage : kill_scr [SCR_NAME]" 7 | echo 8 | screen -ls 9 | exit 1 10 | fi 11 | 12 | screen -X -R $1 quit 13 | -------------------------------------------------------------------------------- /utils/narrowpeak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,os 4 | 5 | if len(sys.argv)!=3: 6 | print '<infile> <outfile>' 7 | sys.exit() 8 | 9 | infile,outfile=sys.argv[1:] 10 | 11 | id=1 12 | fout=open(outfile,'w') 13 | with open(infile) as fin: 14 | 
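    # narrowPeak is BED6+4 (see etc/narrowPeak.as): chrom, start, end, name,
    # score, strand, then signalValue, pValue, qValue and the summit offset.
    # Columns 7-9 are carried below as scorelst; the summit offset
    # (column 10, -1 if absent) becomes sbstroke.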
for line in fin: 15 | lst=line.rstrip().split('\t') 16 | fout.write('{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]},{0[8]}],id:{1},'.format(lst,id)) 17 | id+=1 18 | if len(lst[3])>1: 19 | fout.write('name:"'+lst[3]+'",') 20 | if lst[5]!='.': 21 | fout.write('strand:"'+lst[5]+'",') 22 | if lst[9]!='-1': 23 | fout.write('sbstroke:['+lst[9]+']') 24 | fout.write('\n') 25 | 26 | fout.close() 27 | 28 | os.system('sort -k1,1 -k2,2n '+outfile+' > '+outfile+'.srt') 29 | os.system('mv '+outfile+'.srt'+' '+outfile) 30 | os.system('bgzip -f '+outfile) 31 | os.system('tabix -f -p bed '+outfile+'.gz') 32 | -------------------------------------------------------------------------------- /utils/narrowpeak_idr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # show -log10(GLOBAL IDR SCORE) instead of narrowpeak pval 4 | 5 | import sys,os 6 | 7 | if len(sys.argv)!=3: 8 | print '<infile> <outfile>' 9 | sys.exit() 10 | 11 | infile,outfile=sys.argv[1:] 12 | 13 | id=1 14 | fout=open(outfile,'w') 15 | with open(infile) as fin: 16 | for line in fin: 17 | lst=line.rstrip().split('\t') 18 | fout.write('{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]},{0[8]},{0[10]},{0[11]}],id:{1},'.format(lst,id)) 19 | id+=1 20 | if len(lst[3])>1: 21 | fout.write('name:"'+lst[3]+'",') 22 | else: 23 | fout.write('name:"'+str(id)+'",') 24 | if lst[5]!='.': 25 | fout.write('strand:"'+lst[5]+'",') 26 | if lst[9]!='-1': 27 | fout.write('sbstroke:['+lst[9]+']') 28 | fout.write('\n') 29 | 30 | fout.close() 31 | 32 | os.system('sort -k1,1 -k2,2n '+outfile+' > '+outfile+'.srt') 33 | os.system('mv '+outfile+'.srt'+' '+outfile) 34 | os.system('bgzip -f '+outfile) 35 | os.system('tabix -f -p bed '+outfile+'.gz') 36 | -------------------------------------------------------------------------------- /utils/parse_summary_ENCODE_accession_recursively.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # written by Jin Lee, 2016 4 | 5 | import os, sys 6 | import json 7 | import subprocess 8 | import collections 9 | import argparse 10 | 11 | parser = argparse.ArgumentParser(prog='ENCODE_summary.json parser for ENCODE accession', \ 12 | description='Recursively find ENCODE_summary.json, parse it and make a CSV for uploading to the ENCODE portal. Use https://github.com/ENCODE-DCC/pyencoded-tools/blob/master/ENCODE_submit_files.py for uploading.') 13 | parser.add_argument('--out-file', type=argparse.FileType('w'), default=sys.stdout, \ 14 | help='Output CSV filename') 15 | parser.add_argument('--search-dir', type=str, default='.', \ 16 | help='Root directory to search for ENCODE_summary.json') 17 | parser.add_argument('--json-file', type=str, default='ENCODE_summary.json', \ 18 | help='Specify json file name to be parsed') 19 | parser.add_argument('--sort-by-genome-and-exp', dest='sort_by_genome_and_exp', action='store_true', \ 20 | help='Sort rows by genomes and ENCODE experiment accession ID') 21 | group_accession_ids = parser.add_mutually_exclusive_group() 22 | group_accession_ids.add_argument('--ignored-accession-ids-file', type=str, \ 23 | help='Accession IDs in this text file will be ignored. (1 acc. ID per line)') 24 | group_accession_ids.add_argument('--accession-ids-file', type=str, \ 25 | help='Only accession IDs in this text file will be downloaded. (1 acc. ID per line). 
Others will be ignored.') 26 | parser.set_defaults(sort_by_genome_and_exp=False) 27 | 28 | args = parser.parse_args() 29 | 30 | # loaded ignored accession list 31 | ignored_accession_ids = [] 32 | if args.ignored_accession_ids_file and os.path.isfile(args.ignored_accession_ids_file): 33 | with open(args.ignored_accession_ids_file,'r') as f: 34 | ignored_accession_ids = f.read().splitlines() 35 | ignored_accession_ids = \ 36 | [accession_id for accession_id in ignored_accession_ids if accession_id and not accession_id.startswith("#") ] 37 | accession_ids = [] 38 | if args.accession_ids_file and os.path.isfile(args.accession_ids_file): 39 | with open(args.accession_ids_file,'r') as f: 40 | accession_ids = f.read().splitlines() 41 | accession_ids = \ 42 | [accession_id for accession_id in accession_ids if accession_id and not accession_id.startswith("#") ] 43 | 44 | # find all ENCODE_summary.json recursively 45 | json_files = subprocess.check_output("find -L %s -name %s" % (args.search_dir,args.json_file), \ 46 | shell=True ).strip().split('\n') 47 | # read json 48 | jsons = [] 49 | for json_file in json_files: 50 | with open(json_file,'r') as f: 51 | jsons.append( json.load(f) ) 52 | 53 | # look at headers first 54 | raw_headers = list() 55 | 56 | for json in jsons: 57 | if not 'data_files' in json: 58 | continue 59 | data_files = json['data_files'] 60 | for data_file in data_files: 61 | for key in data_file: 62 | if not key in raw_headers: 63 | raw_headers.append( key ) 64 | # sort header 65 | order_by_header = collections.defaultdict(int, \ 66 | { 67 | 'file_format':20, 68 | 'file_format_type':19, 69 | 'output_type':18, 70 | 'dataset':17, 71 | 'assembly':16, 72 | 'aliases:array':15, 73 | 'derived_from:array':14, 74 | 'md5sum':13, 75 | 'award':12, 76 | 'lab':11, 77 | 'submitted_file_name':10, 78 | }) 79 | 80 | headers = sorted(raw_headers, key=lambda x: order_by_header[x], reverse=True) 81 | 82 | # write header 83 | args.out_file.write( ','.join( headers ) +'\n') 84 | 85 | lines = list() 86 | 87 | def find_submitted_file_name( submitted_file_name ): 88 | # recursively find file under a working directory and return path relative to working dir. 89 | files = subprocess.check_output("find . -type f -name '%s'" % (submitted_file_name), \ 90 | shell=True ).strip().split('\n') 91 | return files[0] 92 | 93 | # for each replicate, write contents 94 | for json in jsons: 95 | if not 'data_files' in json: 96 | continue 97 | if ignored_accession_ids and json['ENCODE_accession'] in ignored_accession_ids: continue 98 | if accession_ids and not json['ENCODE_accession'] in accession_ids: continue 99 | data_files = json['data_files'] 100 | for data_file in data_files: 101 | line = collections.OrderedDict() 102 | for key in headers: 103 | if key in data_file: 104 | if key == 'submitted_file_name': 105 | line[key] = find_submitted_file_name( data_file[key] ) 106 | metadata_file = line[key]+'.meta' 107 | if os.path.exists(metadata_file): 108 | with open(metadata_file,mode='r') as f: 109 | for l in f: 110 | if 'md5sum' in l: 111 | md5sum = l.split('=')[1].strip() 112 | if md5sum != data_file['md5sum']: 113 | print('Warning: In accession {}, md5sum of file {} does not match! 
(json:{}, actual:{})'.format( 114 | json['ENCODE_accession'], line[key], data_file['md5sum'], md5sum )) 115 | else: 116 | line[key] = data_file[key] 117 | else: 118 | line[key] = "" 119 | lines.append(line) 120 | 121 | order_by_file_format = collections.defaultdict(int, \ 122 | { 123 | 'bam':20, 124 | 'tagAlign':19, 125 | 'bigWig':18, 126 | 'bed':17, 127 | 'bigBed':16, 128 | }) 129 | order_by_output_type = collections.defaultdict(int, \ 130 | { 131 | 'alignments':20, 132 | 'unfiltered alignments':19, 133 | 'signal p-value':18, 134 | 'fold change over control':17, 135 | 'filtered peaks':16, 136 | 'replicated peaks':15, 137 | 'idr thresholded peaks':14, 138 | 'optimal idr thresholded peaks':13, 139 | 'conservative idr thresholded peaks':12, 140 | }) 141 | 142 | # sort lines 143 | sorted_lines = sorted(lines, key = lambda x: (\ 144 | order_by_file_format[x['file_format']],\ 145 | order_by_output_type[x['output_type']]), reverse=True) 146 | 147 | if args.sort_by_genome_and_exp: 148 | sorted_lines = sorted(sorted_lines, key = lambda x: (\ 149 | x['assembly'],\ 150 | x['dataset']) ) 151 | 152 | for line in sorted_lines: 153 | result = '' 154 | for key in headers: 155 | result += (line[key]+ ('' if key==headers[-1] else ',')) 156 | args.out_file.write( result + '\n' ) 157 | -------------------------------------------------------------------------------- /utils/parse_summary_ENCODE_qc_recursively.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # written by Jin Lee, 2016 4 | 5 | import os, sys 6 | import json 7 | import subprocess 8 | import collections 9 | import argparse 10 | import xlwt 11 | 12 | parser = argparse.ArgumentParser(prog='ENCODE_summary.json parser for ENCODE QC import', \ 13 | description='Recursively find ENCODE_summary.json, parse it and make an excel file for importing quality metrics to the ENCODE portal. Use https://github.com/ENCODE-DCC/pyencoded-tools/blob/master/ENCODE_import_data.py for uploading.') 14 | parser.add_argument('out_file', metavar='out-file', type=str, \ 15 | help='Output Excel filename (extention should be .xls, not .xlsx)') 16 | parser.add_argument('--search-dir', type=str, default='.', \ 17 | help='Root directory to search for ENCODE_summary.json') 18 | parser.add_argument('--json-file', type=str, default='ENCODE_summary.json', \ 19 | help='Specify json file name to be parsed') 20 | parser.add_argument('--sort-by-genome-and-exp', dest='sort_by_genome_and_exp', action='store_true', \ 21 | help='Sort rows by genomes and ENCODE experiment accession ID') 22 | group_accession_ids = parser.add_mutually_exclusive_group() 23 | group_accession_ids.add_argument('--ignored-accession-ids-file', type=str, \ 24 | help='Accession IDs in this text file will be ignored. (1 acc. ID per line)') 25 | group_accession_ids.add_argument('--accession-ids-file', type=str, \ 26 | help='Only accession IDs in this text file will be parsed. (1 acc. ID per line). 
Others will be ignored.') 27 | parser.set_defaults(sort_by_genome_and_exp=False) 28 | 29 | args = parser.parse_args() 30 | 31 | # loaded ignored accession list 32 | ignored_accession_ids = [] 33 | if args.ignored_accession_ids_file and os.path.isfile(args.ignored_accession_ids_file): 34 | with open(args.ignored_accession_ids_file,'r') as f: 35 | ignored_accession_ids = f.read().splitlines() 36 | ignored_accession_ids = \ 37 | [accession_id for accession_id in ignored_accession_ids if accession_id and not accession_id.startswith("#") ] 38 | accession_ids = [] 39 | if args.accession_ids_file and os.path.isfile(args.accession_ids_file): 40 | with open(args.accession_ids_file,'r') as f: 41 | accession_ids = f.read().splitlines() 42 | accession_ids = \ 43 | [accession_id for accession_id in accession_ids if accession_id and not accession_id.startswith("#") ] 44 | 45 | # find all ENCODE_summary.json recursively 46 | json_files = subprocess.check_output("find -L %s -name %s" % (args.search_dir,args.json_file), \ 47 | shell=True ).strip().split('\n') 48 | # read json 49 | jsons = [] 50 | for json_file in json_files: 51 | with open(json_file,'r') as f: 52 | jsons.append( json.load(f) ) 53 | 54 | # look at headers first 55 | raw_headers = dict() 56 | 57 | for json in jsons: 58 | if ignored_accession_ids and json['ENCODE_accession'] in ignored_accession_ids: continue 59 | if accession_ids and not json['ENCODE_accession'] in accession_ids: continue 60 | 61 | if not 'ENCODE_quality_metrics' in json: continue 62 | data_files = json['ENCODE_quality_metrics'] 63 | for data_file in data_files: 64 | print data_file 65 | ENCODE_qc_type = data_file["ENCODE_qc_type"] 66 | if not raw_headers.has_key( "ENCODE_qc_type" ): 67 | raw_headers[ ENCODE_qc_type ] = list() 68 | for key in data_file: 69 | if key == "ENCODE_qc_type": continue 70 | if not key in raw_headers[ ENCODE_qc_type ]: 71 | raw_headers[ ENCODE_qc_type ].append( key ) 72 | 73 | # write header (fhs=file handles) 74 | workbook = xlwt.Workbook() 75 | sheets = {} 76 | 77 | cnt=0 78 | for ENCODE_qc_type in raw_headers: 79 | title = "".join([word.title().replace("Idr","IDR") for word in ENCODE_qc_type.split("_")]) 80 | print "Creating a sheet with name: ", title 81 | # sheet = workbook.add_sheet(str(cnt)) 82 | sheet = workbook.add_sheet(title) 83 | sheets[ENCODE_qc_type] = sheet 84 | for i, header in enumerate(raw_headers[ENCODE_qc_type]): 85 | sheet.write(0,i,header) 86 | cnt+=1 87 | # fh = open( "%s.%s.tsv" % (args.out_file_prefix,ENCODE_qc_type) ,'w') 88 | # fh.write(delimiter.join(raw_headers[ENCODE_qc_type])) 89 | # fh.write("\n") 90 | # fhs[ENCODE_qc_type] = fh 91 | 92 | # for each replicate, write contents 93 | lines = dict() 94 | for json in jsons: 95 | if ignored_accession_ids and json['ENCODE_accession'] in ignored_accession_ids: continue 96 | if accession_ids and not json['ENCODE_accession'] in accession_ids: continue 97 | data_files = json['ENCODE_quality_metrics'] 98 | for data_file in data_files: 99 | ENCODE_qc_type = data_file["ENCODE_qc_type"] 100 | if not lines.has_key(ENCODE_qc_type): 101 | lines[ENCODE_qc_type] = list() 102 | line = collections.OrderedDict() 103 | for key in raw_headers[ENCODE_qc_type]: 104 | if key in data_file: 105 | line[key] = data_file[key] 106 | else: 107 | line[key] = "" 108 | lines[ENCODE_qc_type].append(line) 109 | 110 | def is_float(s): 111 | try: 112 | float(s) 113 | return True 114 | except ValueError: 115 | return False 116 | 117 | def is_int(s): 118 | try: 119 | int(s) 120 | return True 121 | except 
122 |         return False
123 | 
124 | def is_bool(s):
125 |     if s.lower() in ['true','t','false','f']:
126 |         return True
127 |     else:
128 |         return False
129 | 
130 | sorted_lines = lines
131 | for ENCODE_qc_type in sorted_lines:
132 |     data = sorted_lines[ENCODE_qc_type]
133 |     sheet = sheets[ENCODE_qc_type]
134 |     row = 1
135 |     for line in data:
136 |         for col, key in enumerate(line):
137 |             val = line[key]
138 |             if key.endswith('_pct'):
139 |                 val += "%"
140 |             if val.startswith('null') or val.startswith('N/A:N/A'):
141 |                 val = "null"
142 |             if is_int(val):
143 |                 val = int(val)
144 |             elif is_float(val):
145 |                 val = float(val)
146 |             # elif is_bool(val):
147 |             # else:
148 |             #     style = xlwt.easyxf()
149 |             sheet.write(row, col, label=val)
150 |         row += 1
151 | 
152 | workbook.save(args.out_file)
153 | 
--------------------------------------------------------------------------------
/utils/parse_summary_qc_recursively.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | 
3 | # written by Jin Lee, 2016
4 | 
5 | import os
6 | import sys
7 | import re
8 | import argparse
9 | import json
10 | import subprocess
11 | from collections import OrderedDict
12 | 
13 | parser = argparse.ArgumentParser(prog='ENCODE_summary.json parser for QC', \
14 |     description='Recursively find ENCODE_summary.json, parse it and make a TSV spreadsheet of QC metrics.')
15 | parser.add_argument('--out-file', type=argparse.FileType('w'), default=sys.stdout, \
16 |     help='Output TSV filename')
17 | parser.add_argument('--search-dir', type=str, default='.', \
18 |     help='Root directory to search for ENCODE_summary.json')
19 | parser.add_argument('--json-file', type=str, default='ENCODE_summary.json', \
20 |     help='Specify json file name to be parsed')
21 | 
22 | args = parser.parse_args()
23 | 
24 | # find all qc_summary.json recursively
25 | # json_files = [os.path.join(dp, f) for dp, dn, filenames in os.walk(os.getcwd()) \
26 | #     for f in filenames if os.path.splitext(f)[1] == 'qc_summary.json']
27 | 
28 | # find all ENCODE_summary.json recursively
29 | json_files = subprocess.check_output("find -L %s -name %s" % (args.search_dir,args.json_file), \
30 |     shell=True ).strip().split('\n')
31 | # read json
32 | jsons = []
33 | for json_file in json_files:
34 |     with open(json_file,'r') as f:
35 |         jsons.append( json.load(f, object_pairs_hook=OrderedDict) )
36 | 
37 | # sort
38 | # sorted_jsons = sorted(jsons, key = lambda x: (\
39 | #     x['ENCODE_award_rfa'], \
40 | #     x['ENCODE_assay_category'], \
41 | #     x['ENCODE_assay_title'], \
42 | #     x['species'], \
43 | #     x['title']))
44 | 
45 | # look at headers first
46 | headers = OrderedDict()
47 | headers['common'] = [\
48 |     'ENCODE award rfa',\
49 |     'ENCODE assay category',\
50 |     'ENCODE assay title',\
51 |     'species',\
52 |     'title',\
53 |     'replicate']
54 | 
55 | # first take longest header for each qc_type
56 | for json in jsons:
57 |     for qc_file in json['qc_files']:
58 |         qc_type = qc_file['qc_type']
59 |         if qc_type == 'pbc_PE':
60 |             qc_type = 'pbc'
61 |             qc_file['qc_type'] = qc_type
62 |         header_list = qc_file['header'].split('\t')
63 |         if not qc_type in headers or len(headers[qc_type]) < len(header_list):
64 |             headers[qc_type] = header_list
--------------------------------------------------------------------------------
/utils/reassemble.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | 
3 | import sys
4 | 
5 | if len(sys.argv)!=3:
6 |     print '<infile> <outname>'
7 |     sys.exit()
8 | 
9 | infile,outn=sys.argv[1:]
10 | 
11 | aliencoord=0
12 | alienchrid=1
13 | id1=1
14 | id2=1
15 | fn1=outn+'_native'
16 | fn2=outn+'_alien'
17 | fout1=open(fn1,'w')
18 | fout2=open(fn2,'w')
19 | 
20 | chrname='scaffold_'
21 | 
22 | with open(infile) as fin:
23 |     for line in fin:
24 |         lst=line.rstrip().split('\t')
25 |         if len(lst)==1:
26 |             print '{2}{0}:{1}'.format(alienchrid,aliencoord,chrname)
27 |             aliencoord=0
28 |             alienchrid+=1
29 |             continue
30 |         a=int(lst[1])
31 |         b=int(lst[2])
32 | 
33 |         if a>=b:
34 |             print 'wrong line: '+line
35 |             sys.exit()
36 | 
37 |         # native
38 |         fout1.write('{0}\t{1}\t{2}\tid:{3},genomealign:{{chr:"{8}{4}",start:{5},stop:{6},strand:"{7}"}}\n'.format(
39 |             lst[0],a,b,
40 |             id1,
41 |             alienchrid,
42 |             aliencoord,
43 |             aliencoord+b-a,
44 |             lst[3],
45 |             chrname
46 |             ))
47 |         id1+=1
48 |         # alien
49 |         fout2.write('{8}{0}\t{1}\t{2}\tid:{3},genomealign:{{chr:"{4}",start:{5},stop:{6},strand:"{7}"}}\n'.format(
50 |             alienchrid,
51 |             aliencoord,
52 |             aliencoord+b-a,
53 |             id2,
54 |             lst[0],a,b,
55 |             lst[3],
56 |             chrname
57 |             ))
58 |         id2+=1
59 |         aliencoord+=b-a
60 | 
61 | print '{2}{0}:{1}'.format(alienchrid,aliencoord,chrname)
62 | 
63 | fout1.close()
64 | fout2.close()
65 | 
66 | import os
67 | 
68 | os.system('sort -k1,1 -k2,2n '+fn1+' > x')
69 | os.system('mv x '+fn1)
70 | os.system('bgzip -f '+fn1)
71 | os.system('tabix -f -p bed '+fn1+'.gz')
72 | 
73 | os.system('sort -k1,1 -k2,2n '+fn2+' > x')
74 | os.system('mv x '+fn2)
75 | os.system('bgzip -f '+fn2)
76 | os.system('tabix -f -p bed '+fn2+'.gz')
77 | 
--------------------------------------------------------------------------------
/utils/trimAdapters.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2.7
2 | 
3 | # Author: Jason Buenrostro, Stanford University
4 | # The following program will compress daisy chain seq data into single molecules
5 | 
6 | ##### IMPORT MODULES #####
7 | # import necessary for python
8 | import os
9 | import re
10 | import sys
11 | import gzip
12 | import string
13 | import Levenshtein
14 | from optparse import OptionParser
15 | 
16 | ##### DEFINE FUNCTIONS #####
17 | # Reverse complement
18 | complement = string.maketrans('ATCGN', 'TAGCN')
19 | def reverse_complement(sequence):
20 |     return sequence.upper().translate(complement)[::-1]
21 | 
22 | # Align with mismatch, find first and move on, assumes only one
23 | def fuzz_align(s_seq,l_seq,mismatch):
24 |     for i, base in enumerate(l_seq):  # loop through equal size windows
25 |         l_subset = l_seq[i:i+len(s_seq)]
26 |         dist = Levenshtein.distance(l_subset, s_seq)
27 |         if dist <= mismatch:  # find first then break
28 |             return i, dist
29 |             break
30 | 
31 | # added by Jin Lee for hot fix (output name bug)
32 | def rreplace(s, old, new, occurrence):
33 |     li = s.rsplit(old, occurrence)
34 |     return new.join(li)
35 | 
36 | #### OPTIONS ####
37 | # define options
38 | opts = OptionParser()
39 | usage = "usage: %prog [options] [inputs] This will trim adapters"
40 | opts = OptionParser(usage=usage)
41 | opts.add_option("-a", help=" Accepts fastq or fastq.gz")
42 | opts.add_option("-b", help=" Accepts fastq or fastq.gz")
43 | options, arguments = opts.parse_args()
44 | 
45 | # return usage information if no argvs given AND they're not available in the environment
46 | # command line arguments always override environment variables
47 | if len(sys.argv)==1:
48 |     p1_in = os.environ.get('P1_IN')
49 |     p2_in = os.environ.get('P2_IN')
50 |     if (p1_in is None) or (p2_in is None):
51 |         os.system(sys.argv[0]+" --help")
52 |         sys.exit()
53 | else:
54 |     ##### INPUTS AND OUTPUTS #####
55 |     # name input and outputs
56 |     p1_in = options.a
57 |     p2_in = options.b
58 | 
59 | # name outputs and print to working dir
60 | p1_file = p1_in.split('/')[-1]
61 | p2_file = p2_in.split('/')[-1]
62 | p1_out = re.sub(".fastq", ".trim.fastq", p1_file)
63 | p2_out = re.sub(".fastq", ".trim.fastq", p2_file)
".trim.fastq", p2_file) 64 | 65 | #check for file type and open input file 66 | append = p1_in.split('.')[-1] 67 | if append == "fastq": 68 | p1_rds = open(p1_in,'r') 69 | p2_rds = open(p2_in,'r') 70 | p1_out = re.sub(".fastq", ".trim.fastq", p1_file) 71 | p2_out = re.sub(".fastq", ".trim.fastq", p2_file) 72 | elif append == "fq": 73 | p1_rds = open(p1_in,'r') 74 | p2_rds = open(p2_in,'r') 75 | p1_out = re.sub(".fq", ".trim.fastq", p1_file) 76 | p2_out = re.sub(".fq", ".trim.fastq", p2_file) 77 | elif append == "gz": 78 | p1_rds = gzip.open(p1_in,'r') 79 | p2_rds = gzip.open(p2_in,'r') 80 | p1_out = re.sub(".fastq.gz", ".trim.fastq", p1_file) 81 | p2_out = re.sub(".fastq.gz", ".trim.fastq", p2_file) 82 | p1_out = re.sub(".fq.gz", ".trim.fastq", p1_out) 83 | p2_out = re.sub(".fq.gz", ".trim.fastq", p2_out) 84 | else: 85 | sys.exit("ERROR! The input file2 must be a .fastq or .fastq.gz") 86 | 87 | ##### SCRIPT ##### 88 | # initialize variables 89 | i=0;j=0;k=0;tot_b=0;count=1 90 | n=20 # match seq 91 | mismatch=1 # only allow 0-1 mismatches for now, if allow two then gets mis indexed, to fix this need to change fuzz_align to save L as a vector and reiterate to find 2nd 92 | 93 | # initilize write files 94 | r1_write = open(p1_out, 'w') 95 | r2_write = open(p2_out, 'w') 96 | 97 | while 1: 98 | # read lines 99 | p1_line = p1_rds.readline() 100 | p2_line = p2_rds.readline() 101 | 102 | # break if at end of file 103 | if not p1_line: 104 | break 105 | 106 | # load fastq into memory 107 | if count ==1: 108 | seqhead1 = p1_line 109 | seqhead2 = p2_line 110 | elif count ==2: 111 | seq1 = p1_line.rstrip() 112 | seq2 = p2_line.rstrip() 113 | elif count ==3: 114 | qualhead1 = p1_line 115 | qualhead2 = p2_line 116 | elif count ==4: 117 | qual1 = p1_line.rstrip() 118 | qual2 = p2_line.rstrip() 119 | 120 | # align reads to themselves 121 | i = i+1 # total reads 122 | rc_seq2 = reverse_complement(seq2[0:n]) 123 | idx = seq1.rfind(rc_seq2) # look for perfect match 124 | if idx > 0: 125 | j = j+1 # 0 mismatchs 126 | elif mismatch>0: 127 | hold = fuzz_align(rc_seq2,seq1,mismatch) # else allow for mismatch 128 | if hold: 129 | idx,mis=hold 130 | if mis == 1: 131 | k=k+1 # 1 mismatch 132 | 133 | # trim reads if idx exist 134 | if idx > 0: 135 | # keep track on how much trimming 136 | tot_b = tot_b+len(seq2[idx+n:-1]) #track total bases trimmed 137 | 138 | # trim data 139 | seq1 = seq1[0:idx+n-1] # modified to sub1 because some aligners (bowtie) dont like perfectly overlapping reads 140 | seq2 = seq2[0:idx+n-1] 141 | qual1 = qual1[0:idx+n-1] 142 | qual2 = qual2[0:idx+n-1] 143 | 144 | # print read1 145 | r1_write.write(seqhead1) 146 | r1_write.write(seq1+"\n") 147 | r1_write.write(qualhead1) 148 | r1_write.write(qual1+"\n") 149 | 150 | # print read2 151 | r2_write.write(seqhead2) 152 | r2_write.write(seq2+"\n") 153 | r2_write.write(qualhead2) 154 | r2_write.write(qual2+"\n") 155 | 156 | # increment count 157 | count = count + 1 158 | if count == 5: 159 | count = 1 160 | else: 161 | count = count 162 | 163 | # close files to write the file 164 | r1_write.close() 165 | r2_write.close() 166 | p1_rds.close() 167 | p2_rds.close() 168 | 169 | # write file output names for passing into next step of pipeline 170 | # !!! DO NOT WRITE ANYTHING ELSE TO STDOUT AFTER THIS !!! 
171 | sys.stdout.write(p1_out + '\n')
172 | sys.stdout.write(p2_out + '\n')
173 | 
174 | # give summary
175 | sys.stderr.write(str(i)+" sequences total\n")
176 | sys.stderr.write(str(j)+" sequences trimmed with 0 mismatches\n")
177 | sys.stderr.write(str(k)+" sequences trimmed with 1 mismatch\n")
178 | sys.stderr.write(str(tot_b/(j+k))+" mean number of bases trimmed for reads requiring trimming\n")
--------------------------------------------------------------------------------
/utils/ucsc_ensGene.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | 
3 | import sys,os
4 | sys.path.append('/home/xzhou/subtleKnife/script/genescript')
5 | import parseUcscgenestruct
6 | 
7 | if len(sys.argv)!=3:
8 |     print ' knownToEnsembl.txt and kgXref.txt must be under current dir'
9 |     sys.exit()
10 | 
11 | 
12 | aa={}
13 | with open('knownToEnsembl.txt') as fin:
14 |     for line in fin:
15 |         lst=line.rstrip().split('\t')
16 |         aa[lst[0]]=lst[1]
17 | 
18 | symbol={}
19 | desc={}
20 | with open('kgXref.txt') as fin:
21 |     for line in fin:
22 |         lst=line.rstrip().split('\t')
23 |         if lst[0] in aa:
24 |             ens=aa[lst[0]]
25 |             if len(lst[4])>0:
26 |                 symbol[ens]=lst[4]
27 |             if len(lst[7])>0:
28 |                 desc[ens]=lst[7]
29 | 
30 | 
31 | ucsc,tkname=sys.argv[1:]
32 | 
33 | 
34 | 
35 | # dump
36 | fout=open(tkname,'w')
37 | fout2=open(tkname+'_load','w')
38 | 
39 | id=1
40 | with open(ucsc) as fin:
41 |     for line in fin:
42 |         lst=line.rstrip().split('\t')
43 |         g=parseUcscgenestruct.parse(lst,True)
44 |         name=lst[1]
45 |         fout.write('{0}\t{1}\t{2}\tname:"{3}",id:{4},strand:"{5}",'.format(
46 |             g['chrom'],
47 |             g['start'],
48 |             g['stop'],
49 |             name,
50 |             id,
51 |             g['strand']))
52 |         id+=1
53 |         if 'thin' in g or 'thick' in g:
54 |             fout.write('struct:{')
55 |             if 'thin' in g:
56 |                 fout.write('thin:[')
57 |                 for x in g['thin']:
58 |                     fout.write('[{0},{1}],'.format(x[0],x[1]))
59 |                 fout.write('],')
60 |             if 'thick' in g:
61 |                 fout.write('thick:[')
62 |                 for x in g['thick']:
63 |                     fout.write('[{0},{1}],'.format(x[0],x[1]))
64 |                 fout.write('],')
65 |             fout.write('},')
66 |         # desc
67 |         if name in desc:
68 |             fout.write('desc:"'+desc[name]+'",')
69 |         if name in symbol:
70 |             fout.write('name2:"'+symbol[name]+'"')
71 |             fout2.write('{0}\t{1}\t{2}\t{3}\n'.format(g['chrom'],g['start'],g['stop'],symbol[name]))
72 |         fout.write('\n')
73 |         fout2.write('{0}\t{1}\t{2}\t{3}\n'.format(g['chrom'],g['start'],g['stop'],name))
74 | 
75 | 
76 | fout2.close()
77 | fout.close()
78 | 
79 | import os
80 | os.system('sort -k1,1 -k2,2n '+tkname+' > x')
81 | os.system('mv x '+tkname)
82 | os.system('bgzip -f '+tkname)
83 | os.system('tabix -f -p bed '+tkname+'.gz')
84 | 
85 | print '''
86 | drop table if exists {0};
87 | create table {0} (
88 | chrom varchar(20) not null,
89 | start int unsigned not null,
90 | stop int unsigned not null,
91 | name varchar(100) not null
92 | ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
93 | load data local infile '{0}_load' into table {0};
94 | create index name on {0} (name);
95 | '''.format(tkname)
96 | 
97 | 
--------------------------------------------------------------------------------
/utils/ucsc_simplegene.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | 
3 | import sys,os
4 | sys.path.append('/home/xzhou/subtleKnife/script/genescript')
5 | import parseUcscgenestruct
6 | 
7 | if len(sys.argv)!=3:
8 |     print ' '
9 |     sys.exit()
10 | 
11 | ucsc,tkname=sys.argv[1:]
12 | 
13 | 
14 | symbol={}
15 | desc={}
16 | i=0
17 | if os.path.exists('refLink.txt'):
18 |     '''
19 |     0 symbol
20 |     1 desc
21 |     2 name
22 |     3 name
23 |     '''
24 |     with open('refLink.txt') as fin:
25 |         for line in fin:
26 |             lst=line.rstrip().split('\t')
27 |             if len(lst)<4: continue
28 |             w=lst[1].replace('"','')
29 |             #w=w.replace("'",'')
30 |             desc[lst[2]]=w
31 |             desc[lst[3]]=w
32 |             symbol[lst[2]]=lst[0]
33 |             symbol[lst[3]]=lst[0]
34 |             i+=1
35 |     print 'refLink: '+str(i)
36 | 
37 | 
38 | # dump
39 | fout=open(tkname,'w')
40 | fout2=open(tkname+'_load','w')
41 | 
42 | id=1
43 | with open(ucsc) as fin:
44 |     for line in fin:
45 |         lst=line.rstrip().split('\t')
46 |         g=parseUcscgenestruct.parse(lst,True)
47 |         name=lst[1]
48 |         fout.write('{0}\t{1}\t{2}\tname:"{3}",id:{4},strand:"{5}",'.format(
49 |             g['chrom'],
50 |             g['start'],
51 |             g['stop'],
52 |             name,
53 |             id,
54 |             g['strand']))
55 |         id+=1
56 |         if 'thin' in g or 'thick' in g:
57 |             fout.write('struct:{')
58 |             if 'thin' in g:
59 |                 fout.write('thin:[')
60 |                 for x in g['thin']:
61 |                     fout.write('[{0},{1}],'.format(x[0],x[1]))
62 |                 fout.write('],')
63 |             if 'thick' in g:
64 |                 fout.write('thick:[')
65 |                 for x in g['thick']:
66 |                     fout.write('[{0},{1}],'.format(x[0],x[1]))
67 |                 fout.write('],')
68 |             fout.write('},')
69 |         # desc
70 |         if name in desc:
71 |             fout.write('desc:"'+desc[name]+'",')
72 |         if name in symbol:
73 |             fout.write('name2:"'+symbol[name]+'"')
74 |             fout2.write('{0}\t{1}\t{2}\t{3}\n'.format(g['chrom'],g['start'],g['stop'],symbol[name]))
75 |         fout.write('\n')
76 |         fout2.write('{0}\t{1}\t{2}\t{3}\n'.format(g['chrom'],g['start'],g['stop'],name))
77 | 
78 | 
79 | fout2.close()
80 | fout.close()
81 | 
82 | import os
83 | os.system('sort -k1,1 -k2,2n '+tkname+' > x')
84 | os.system('mv x '+tkname)
85 | os.system('bgzip -f '+tkname)
86 | os.system('tabix -f -p bed '+tkname+'.gz')
87 | 
88 | print '''
89 | drop table if exists {0};
90 | create table {0} (
91 | chrom varchar(20) not null,
92 | start int unsigned not null,
93 | stop int unsigned not null,
94 | name varchar(100) not null
95 | ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
96 | load data local infile '{0}_load' into table {0};
97 | create index name on {0} (name);
98 | '''.format(tkname)
99 | 
100 | 
--------------------------------------------------------------------------------
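Usage sketch for the stand-alone utilities above. This is not a file in the repository, only a minimal illustration of the flags and positional arguments the scripts themselves define; file names such as rep1_R1.fastq.gz, the output directory out/, and the gene-table/track name knownGene are hypothetical placeholders.

    # paired-end adapter trimming; writes *.trim.fastq to the working directory
    # and prints the two trimmed file names on stdout for the next pipeline step
    python2 utils/trimAdapters.py -a rep1_R1.fastq.gz -b rep1_R2.fastq.gz

    # the same run driven by environment variables, which the script consults
    # only when it is invoked with no arguments
    P1_IN=rep1_R1.fastq.gz P2_IN=rep1_R2.fastq.gz python2 utils/trimAdapters.py

    # gather ENCODE QC metrics from a pipeline output tree into an .xls workbook
    python2 utils/parse_summary_ENCODE_qc_recursively.py qc_metrics.xls --search-dir out/

    # build a bgzipped, tabix-indexed browser track from a UCSC gene table;
    # refLink.txt is read from the working directory if present, and the MySQL
    # loader statements for the *_load file are printed to stdout
    python2 utils/ucsc_simplegene.py knownGene.txt knownGene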