├── .gitignore
├── .gitmodules
├── LICENSE.md
├── README.md
├── adapter_trimmer.bds
├── atac.bds
├── bds.config
├── default.env
├── etc
│   ├── broadPeak.as
│   ├── gappedPeak.as
│   └── narrowPeak.as
├── examples
│   ├── ENCODE
│   │   └── download_ENCODE_Snyder.py
│   ├── atac_shi_new2.sh
│   ├── bfremin.sh
│   ├── example.sh
│   └── training-camp-2016.sh
├── html
│   ├── jquery.treetable.css
│   ├── jquery.treetable.js
│   ├── jquery.treetable.theme.default.css
│   └── rpt_header.html
├── install_dependencies.sh
├── install_genome_data.sh
├── modules
│   ├── ENCODE_accession.bds
│   ├── align_bowtie2.bds
│   ├── align_etc.bds
│   ├── align_multimapping.bds
│   ├── align_trim_adapter.bds
│   ├── ataqc.bds
│   ├── callpeak_bigbed.bds
│   ├── callpeak_blacklist_filter.bds
│   ├── callpeak_idr.bds
│   ├── callpeak_macs2_atac.bds
│   ├── callpeak_naive_overlap.bds
│   ├── cluster.bds
│   ├── conf.bds
│   ├── env.bds
│   ├── filetable.bds
│   ├── git.bds
│   ├── graphviz.bds
│   ├── input.bds
│   ├── input_adapter.bds
│   ├── input_bam.bds
│   ├── input_fastq.bds
│   ├── input_peak.bds
│   ├── input_tagalign.bds
│   ├── log_parser.bds
│   ├── module_template.bds
│   ├── output.bds
│   ├── parallel.bds
│   ├── pipeline_template.bds
│   ├── postalign_bam.bds
│   ├── postalign_bed.bds
│   ├── postalign_xcor.bds
│   ├── report.bds
│   ├── species.bds
│   ├── string.bds
│   └── sys.bds
├── requirements.txt
├── requirements_py3.txt
├── species
│   ├── kundaje.conf
│   ├── scg.conf
│   └── sherlock.conf
├── uninstall_dependencies.sh
└── utils
    ├── assign_multimappers.py
    ├── axt_dirfiles.py
    ├── bds_scr
    ├── bds_scr_5min
    ├── broadpeak.py
    ├── clusterGeneric
    │   ├── kill.pl
    │   ├── postMortemInfo.pl
    │   ├── run.pl
    │   └── stat.pl
    ├── detect_adapter.py
    ├── gappedpeak.py
    ├── get_read_length_from_fastq.py
    ├── kill_scr
    ├── narrowpeak.py
    ├── narrowpeak_idr.py
    ├── parse_summary_ENCODE_accession_recursively.py
    ├── parse_summary_ENCODE_qc_recursively.py
    ├── parse_summary_qc_recursively.py
    ├── reassemble.py
    ├── trimAdapters.py
    ├── ucsc_ensGene.py
    └── ucsc_simplegene.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.chp
2 | .*.swp
3 | .nfs*
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "ataqc"]
2 |     path = ataqc
3 |     url = https://github.com/kundajelab/ataqc
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | BSD-3-Clause License
2 | 
3 | Copyright (c) 2016, Kundaje Lab
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
7 | 
8 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
9 | 
10 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
11 | 
12 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/adapter_trimmer.bds:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bds
2 | #vim: syntax=java
3 | 
4 | 
5 | help == trimmer pipeline settings
6 | 
7 | save_to_indv_dir:= false  help Save trimmed fastqs to individual directory for each replicate.
8 | old_trimmer := false  help Use legacy trim adapters (trim_galore and trimAdapter.py).
9 | 
10 | 
11 | help() // show help contexts
12 | 
13 | include "modules/pipeline_template.bds"
14 | include "modules/input.bds"
15 | include "modules/input_adapter.bds"
16 | 
17 | include "modules/align_trim_adapter.bds"
18 | 
19 | 
20 | main()
21 | 
22 | 
23 | void main() { // trimmer pipeline starts here
24 | 
25 |     init_trimmer()
26 |     chk_input( true, false )
27 |     chk_adapters()
28 |     trim_adapters()
29 | }
30 | 
31 | void init_trimmer() {
32 | 
33 |     save_to_indv_dir = get_conf_val_bool( save_to_indv_dir, ["save_to_indv_dir"] )
34 |     old_trimmer = get_conf_val_bool( old_trimmer, ["old_trimmer"] )
35 | 
36 |     print( "\n\n== trimmer settings\n")
37 |     print( "Save trimmed fastqs to individual directory for each replicate\t: $save_to_indv_dir\n" )
38 |     print( "Use old trim adapters\t\t\t: $old_trimmer\n" )
39 | }
40 | 
41 | void chk_adapters() {
42 | 
43 |     print( "\n== checking adapters to be trimmed ...\n" );
44 | 
45 |     // check adapters
46 |     for ( int rep=1; rep <= get_num_rep(); rep++) {
47 | 
48 |         string prefix
49 |         if ( is_input_fastq( rep ) ) {
50 | 
51 |             if ( !old_trimmer ) { // check adapters
52 |                 adapters := get_adapters( rep )
53 | 
54 |                 prefix += "Replicate $rep adapters : "
55 | 
56 |                 if ( adapters.size()==0 ) {
57 |                     prefix += "automatically detected"
58 |                 }
59 |                 else {
60 |                     for ( int i=0; i
[rest of adapter_trimmer.bds from this point lost to markup stripping during extraction]
--------------------------------------------------------------------------------
[/html/rpt_header.html: markup stripped during extraction; only bare line numbers survived]
--------------------------------------------------------------------------------
/install_dependencies.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Stop on error
3 | set -e
4 | 
5 | ## conda environment name
6 | 
7 | ENV_NAME=bds_atac
8 | ENV_NAME_PY3=bds_atac_py3
9 | 
10 | INSTALL_GEM=0
11 | INSTALL_PEAKSEQ=0
12 | 
13 | ## install packages from official channels (bioconda and r)
14 | 
15 | conda create -n ${ENV_NAME} --file requirements.txt -y -c defaults -c bioconda -c r -c bcbio -c daler -c asmeurer
16 | conda create -n ${ENV_NAME_PY3} --file requirements_py3.txt -y -c defaults -c bioconda -c r -c bcbio -c daler -c asmeurer
17 | 
18 | ### bash function definition
19 | 
20 | function add_to_activate {
21 | if [[ !
-f $CONDA_INIT ]]; then 22 | echo > $CONDA_INIT 23 | fi 24 | for i in "${CONTENTS[@]}"; do 25 | if [[ $(grep "$i" "$CONDA_INIT" | wc -l ) == 0 ]]; then 26 | echo $i >> "$CONDA_INIT" 27 | fi 28 | done 29 | } 30 | 31 | ## install useful tools for BigDataScript 32 | 33 | mkdir -p $HOME/.bds 34 | cp -f ./utils/bds_scr ./utils/bds_scr_5min ./utils/kill_scr bds.config $HOME/.bds/ 35 | cp -rf ./utils/clusterGeneric/ $HOME/.bds/ 36 | 37 | ## install additional packages 38 | 39 | source activate ${ENV_NAME} 40 | 41 | conda uninstall graphviz -y # graphviz in bioconda has segmentation fault bug 42 | conda install graphviz -c anaconda -y 43 | 44 | conda install ucsc-bedgraphtobigwig -c bioconda -y 45 | conda install ucsc-bedtobigbed -c bioconda -y 46 | 47 | #CONDA_BIN=$(dirname $(which activate))/../envs/${ENV_NAME}/bin 48 | #CONDA_BIN=$(dirname $(which activate)) 49 | CONDA_BIN=$(dirname $(which bedtools)) 50 | CONDA_EXTRA="$CONDA_BIN/../extra" 51 | CONDA_ACTIVATE_D="$CONDA_BIN/../etc/conda/activate.d" 52 | CONDA_INIT="$CONDA_ACTIVATE_D/init.sh" 53 | CONDA_LIB="$CONDA_BIN/../lib" 54 | if [[ $(find $CONDA_LIB -name '*egg-info*' -not -perm -o+r | wc -l ) > 0 ]]; then 55 | find $CONDA_LIB -name '*egg-info*' -not -perm -o+r -exec dirname {} \; | xargs chmod o+r -R 56 | fi 57 | 58 | mkdir -p $CONDA_EXTRA $CONDA_ACTIVATE_D 59 | 60 | ### install Anshul's phantompeakqualtool 61 | echo $CONDA_EXTRA 62 | cd $CONDA_EXTRA 63 | git clone https://github.com/kundajelab/phantompeakqualtools 64 | chmod 755 -R phantompeakqualtools 65 | CONTENTS=("export PATH=$CONDA_EXTRA/phantompeakqualtools:\$PATH") 66 | add_to_activate 67 | 68 | ### disable locally installed python package lookup 69 | CONTENTS=("export PYTHONNOUSERSITE=True") 70 | add_to_activate 71 | #CONTENTS=("export PYTHONPATH=$CONDA_LIB/python2.7/site-packages:\$PYTHONPATH") 72 | #add_to_activate 73 | 74 | ### decompress MACS2 python egg 75 | #cd $CONDA_LIB/python2.7/site-packages 76 | #unzip -o MACS2-2.1.1.20160309-py2.7-linux-x86_64.egg 77 | 78 | # install PeakSeq 79 | if [[ ${INSTALL_PEAKSEQ} == 1 ]]; then 80 | cd $CONDA_EXTRA 81 | wget http://archive.gersteinlab.org/proj/PeakSeq/Scoring_ChIPSeq/Code/C/PeakSeq_1.31.zip -N --no-check-certificate 82 | unzip PeakSeq_1.31.zip 83 | rm -f PeakSeq_1.31.zip 84 | cd PeakSeq 85 | make 86 | chmod 755 bin/PeakSeq 87 | cd $CONDA_BIN 88 | ln -s $CONDA_EXTRA/PeakSeq/bin/PeakSeq 89 | fi 90 | 91 | source deactivate 92 | 93 | 94 | source activate ${ENV_NAME_PY3} 95 | 96 | #CONDA_BIN=$(dirname $(which activate))/../envs/${ENV_NAME_PY3}/bin 97 | #CONDA_BIN=$(dirname $(which activate)) 98 | CONDA_BIN=$(dirname $(which bedtools)) 99 | CONDA_EXTRA="$CONDA_BIN/../extra" 100 | CONDA_ACTIVATE_D="$CONDA_BIN/../etc/conda/activate.d" 101 | CONDA_INIT="$CONDA_ACTIVATE_D/init.sh" 102 | CONDA_LIB="$CONDA_BIN/../lib" 103 | if [[ $(find $CONDA_LIB -name '*egg-info*' -not -perm -o+r | wc -l ) > 0 ]]; then 104 | find $CONDA_LIB -name '*egg-info*' -not -perm -o+r -exec dirname {} \; | xargs chmod o+r -R 105 | fi 106 | 107 | mkdir -p $CONDA_EXTRA $CONDA_ACTIVATE_D 108 | 109 | ### uninstall IDR 2.0.4 and install the latest one 110 | conda uninstall idr -y 111 | cd $CONDA_EXTRA 112 | git clone --branch 2.0.4.2 git://github.com/kundajelab/idr 113 | cd idr 114 | python3 setup.py install 115 | cd $CONDA_EXTRA 116 | rm -rf idr 117 | 118 | ### disable locally installed python package lookup 119 | CONTENTS=("export PYTHONNOUSERSITE=True") 120 | add_to_activate 121 | CONTENTS=("export PYTHONPATH=$CONDA_LIB/python3.5/site-packages:\$PYTHONPATH") 122 | 
add_to_activate 123 | 124 | # install GEM 125 | if [[ ${INSTALL_GEM} == 1 ]]; then 126 | cd $CONDA_EXTRA 127 | wget http://groups.csail.mit.edu/cgs/gem/download/gem.v3.0.tar.gz -N --no-check-certificate 128 | tar zxvf gem.v3.0.tar.gz 129 | rm -f gem.v3.0.tar.gz 130 | cd gem 131 | chmod 755 gem.jar 132 | cd $CONDA_BIN 133 | ln -s $CONDA_EXTRA/gem/gem.jar 134 | fi 135 | 136 | source deactivate 137 | 138 | 139 | echo == Installing dependencies has been successfully done. == 140 | -------------------------------------------------------------------------------- /modules/align_bowtie2.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "align_multimapping.bds" 6 | include "module_template.bds" 7 | 8 | 9 | help == align bowtie2 settings (requirements: -bwt2_idx) 10 | bwt2_idx := "" help Bowtie2 index (full path prefix of *.1.bt2 file). 11 | scoremin_bwt2 := "" help Replacement --score-min for bowtie2. 12 | wt_bwt2 := "47h" help Walltime for bowtie2 (default: 47h, 47:00:00). 13 | mem_bwt2 := "12G" help Max. memory for bowtie2 (default: 12G). 14 | extra_param_bwt2:= "" help Extra parameter for bowtie2. 15 | no_idx_on_mem_bwt2 := false help Disable loading index on memory by removing --mm flag for bowtie2. 16 | 17 | grp_color_bwt2 := "salmon" 18 | 19 | 20 | init_align_bwt2() 21 | 22 | 23 | void init_align_bwt2() { 24 | 25 | bwt2_idx = get_conf_val( bwt2_idx, ["bwt2_idx"] ) 26 | scoremin_bwt2 = get_conf_val( scoremin_bwt2, ["scoremin_bwt2"] ) 27 | wt_bwt2 = get_conf_val( wt_bwt2, ["wt_bwt2"] ) 28 | mem_bwt2 = get_conf_val( mem_bwt2, ["mem_bwt2"] ) 29 | extra_param_bwt2= get_conf_val( extra_param_bwt2,["extra_param_bwt2"] ) 30 | no_idx_on_mem_bwt2 = get_conf_val_bool( no_idx_on_mem_bwt2, ["no_idx_on_mem_bwt2"] ) 31 | 32 | print("\n\n== align bowtie2 settings\n") 33 | print( "Bowtie2 index\t\t\t: $bwt2_idx\n" ) 34 | print( "Replacement --score-min for bowtie2\t: $scoremin_bwt2\n" ) 35 | print( "Walltime (bowtie2)\t\t: $wt_bwt2\n") 36 | print( "Max. memory (bowtie2)\t\t: $mem_bwt2\n") 37 | print( "Extra param. (bowtie2)\t\t: $extra_param_bwt2\n") 38 | print( "Disable index on memory (bowtie2)\t: $no_idx_on_mem_bwt2\n") 39 | } 40 | 41 | void chk_align_bwt2() { 42 | 43 | if ( !path_exists("$bwt2_idx.1.bt2") && !path_exists("$bwt2_idx.1.bt2l") ) { 44 | error("Bowtie2 index (-bwt2_idx) doesn't exists! (file: $bwt2_idx.1.bt2 or $bwt2_idx.1.bt2l)\n") 45 | } 46 | } 47 | 48 | string[] bowtie2( string fastq, string o_dir, string log_o_dir, string group, int nth_bwt2 ) { 49 | 50 | prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir ) 51 | prefix2 := replace_dir( prefix, log_o_dir ) 52 | bam := "$prefix.bam" 53 | log := "$prefix2.align.log" 54 | param := multimapping>0 ? "-k $multimapping" : "" 55 | param3 := scoremin_bwt2 ? "--score-min $scoremin_bwt2" : "" 56 | 57 | in := [ fastq ] 58 | out := [ bam, log ] 59 | 60 | taskName:= "bowtie2 "+group 61 | cpus := (nth_bwt2==1) ? 
-1 : nth_bwt2; mem := get_res_mem(mem_bwt2,nth_bwt2); timeout := get_res_wt(wt_bwt2) 62 | 63 | wait_par( cpus ) 64 | 65 | tid := task( out<-in ) { 66 | 67 | sys $shcmd_init 68 | 69 | //sys bowtie2 $param -x $bwt2_idx --threads $nth_bwt2 -U <(zcat -f $fastq) 2> $log | \ 70 | // samtools view -Su /dev/stdin | sambamba sort -t 1 /dev/stdin -o $bam 71 | sys bowtie2 $extra_param_bwt2 $param $param3 --local -x $bwt2_idx --threads $nth_bwt2 -U <(zcat -f $fastq) 2> $log | \ 72 | samtools view -Su /dev/stdin | samtools sort - $prefix 73 | sys cat $log 74 | sys samtools index $bam 75 | 76 | sys $shcmd_finalize 77 | } 78 | 79 | register_par( tid, cpus ) 80 | 81 | add_task_to_graph( in, out, group, "BOWTIE2\\n(SE)", grp_color_bwt2 ) 82 | 83 | return out 84 | } 85 | 86 | string[] bowtie2_csem( string fastq, string o_dir, string log_o_dir, string group, int nth_bwt2 ) { 87 | 88 | prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir ) 89 | prefix2 := replace_dir( prefix, log_o_dir ) 90 | sam := "$prefix.sam" 91 | log := "$prefix2.align.log" 92 | srt_bam := "$prefix.csem.sorted.bam" 93 | srt_bam_prefix := "$prefix.csem" 94 | bam := "$prefix.csem.bam" 95 | param := multimapping ? "-k $multimapping" : "" 96 | param3 := scoremin_bwt2 ? "--score-min $scoremin_bwt2" : "" 97 | 98 | in := [ fastq ] 99 | out := [ bam, log ] 100 | 101 | taskName:= "bowtie2_csem "+group 102 | cpus := (nth_bwt2==1) ? -1 : nth_bwt2; mem := get_res_mem(mem_bwt2,nth_bwt2); timeout := get_res_wt(wt_bwt2) 103 | 104 | wait_par( cpus ) 105 | 106 | tid := task( out<-in ) { 107 | 108 | sys $shcmd_init 109 | 110 | sys bowtie2 $extra_param_bwt2 $param $param3 --local -x $bwt2_idx --threads $nth_bwt2 -U <(zcat -f $fastq) 2> $log > $sam 111 | sys cat $log 112 | sys run-csem --sam -p $nth_bwt2 $sam 100 $srt_bam_prefix 113 | 114 | sys mv $srt_bam $bam 115 | 116 | sys sambamba index -t $nth_bwt2 $bam 117 | 118 | sys rm -f $sam 119 | 120 | sys $shcmd_finalize 121 | } 122 | 123 | register_par( tid, cpus ) 124 | 125 | add_task_to_graph( in, out, group, "BOWTIE2\\n(CSEM)", grp_color_bwt2 ) 126 | 127 | return out 128 | } 129 | 130 | string[] bowtie2_PE( string fastq1, string fastq2, string o_dir, string log_o_dir, string group, int nth_bwt2 ) { 131 | 132 | prefix := replace_dir( rm_ext( fastq1, ["fastq","fq"] ), o_dir ) + ".PE2SE" 133 | prefix2 := replace_dir( prefix, log_o_dir ) 134 | bam := "$prefix.bam" 135 | log := "$prefix2.align.log" 136 | param := multimapping ? "-k $multimapping" : "" 137 | param3 := scoremin_bwt2 ? "--score-min $scoremin_bwt2" : "" 138 | param_mm := no_idx_on_mem_bwt2 ? "" : "--mm" 139 | 140 | in := [ fastq1, fastq2 ] 141 | out := [ bam, log ] 142 | 143 | taskName:= "bowtie2_PE "+group 144 | cpus := (nth_bwt2==1) ? 
-1 : nth_bwt2; mem := get_res_mem(mem_bwt2,nth_bwt2); timeout := get_res_wt(wt_bwt2) 145 | 146 | wait_par( cpus ) 147 | 148 | tid := task( out<-in ) { 149 | 150 | sys $shcmd_init 151 | 152 | //sys bowtie2 $param -X2000 $param_mm --threads $nth_bwt2 -x $bwt2_idx \ 153 | // -1 $fastq1 -2 $fastq2 2>$log | \ 154 | // samtools view -Su /dev/stdin | sambamba sort -t 1 /dev/stdin -o $bam 155 | sys bowtie2 $extra_param_bwt2 $param $param3 -X2000 $param_mm --local --threads $nth_bwt2 -x $bwt2_idx \ 156 | -1 $fastq1 -2 $fastq2 2>$log | \ 157 | samtools view -Su /dev/stdin | samtools sort - $prefix 158 | sys cat $log 159 | sys samtools index $bam 160 | 161 | sys $shcmd_finalize 162 | } 163 | 164 | register_par( tid, cpus ) 165 | 166 | add_task_to_graph( in, out, group, "BOWTIE2\\n(PE)", grp_color_bwt2 ) 167 | 168 | return out 169 | } 170 | -------------------------------------------------------------------------------- /modules/align_etc.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | 6 | string get_read_length_log( string fastq, string o_dir, string group ) { 7 | 8 | prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir ) 9 | log := "$prefix.read_length.txt" 10 | 11 | in := [ fastq ] 12 | out := log 13 | 14 | taskName:= "read_length "+group 15 | wait_par( cpus ) 16 | 17 | tid := task( out<-in ) { 18 | 19 | sys $shcmd_init 20 | 21 | sys python $(which get_read_length_from_fastq.py) $fastq > $log 22 | 23 | sys $shcmd_finalize 24 | } 25 | 26 | register_par( tid, cpus ) 27 | 28 | add_task_to_graph( in, out, group ) 29 | 30 | return out 31 | } 32 | -------------------------------------------------------------------------------- /modules/align_multimapping.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == align multimapping settings 8 | multimapping := 0 help # alignments reported for multimapping (default: 0). 9 | 10 | 11 | init_align_multimapping() 12 | 13 | 14 | void init_align_multimapping() { 15 | multimapping = get_conf_val_int( multimapping, ["multimapping"] ) 16 | 17 | print("\n\n== align multimapping settings\n") 18 | print( "# alignments reported for multimapping\t: $multimapping\n") 19 | } 20 | -------------------------------------------------------------------------------- /modules/align_trim_adapter.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == adapter trimmer settings 9 | adapter_err_rate := "0.10" help Maximum allowed adapter error rate (# errors divided by the length of the matching adapter region, default: 0.10). 10 | min_trim_len := 5 help Minimum trim length for cutadapt -m, throwing away processed reads shorter than this (default: 5). 11 | 12 | wt_trim := "23h" help Walltime for adapter trimming (default: 23h, 23:00:00). 13 | mem_trim := "12G" help Max. memory for adapter trimming (default: 12G). 
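// The settings above feed the cutadapt calls in trim_adapters() and
// trim_adapters_PE() below. A minimal single-end sketch of the generated
// command with the defaults above, assuming an explicitly given adapter
// (the sequence shown is illustrative only; in the pipeline it comes from
// the -adapter input or from detect_adapter.py):
//
//   cutadapt -m 5 -e 0.10 -a AGATCGGAAGAGC rep1.fastq.gz | gzip -nc > rep1.trim.fastq.gz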
14 | 
15 | 
16 | grp_color_trim_adapter := "darkorange"
17 | 
18 | 
19 | init_align_trim_adapter()
20 | 
21 | 
22 | void init_align_trim_adapter() {
23 |     adapter_err_rate = get_conf_val( adapter_err_rate, ["adapter_err_rate"] )
24 |     min_trim_len = get_conf_val_int( min_trim_len, ["min_trim_len"] )
25 |     wt_trim = get_conf_val( wt_trim, ["wt_trim"] )
26 |     mem_trim = get_conf_val( mem_trim, ["mem_trim"] )
27 | 
28 |     print("\n\n== adapter trimmer settings\n")
29 |     print( "Maximum allowed error rate for cutadapt\t: $adapter_err_rate\n")
30 |     print( "Minimum trim. length for cutadapt -m\t: $min_trim_len\n")
31 |     print( "Walltime (adapter trimming)\t\t: $wt_trim\n")
32 |     print( "Max. memory (adapter trimming)\t\t: $mem_trim\n")
33 | }
34 | 
35 | // also returns tid
36 | string[] detect_adapter( string fastq, string o_dir, string group ) {
37 | 
38 |     prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir )
39 |     log := "$prefix.adapter.txt"
40 | 
41 |     in := [ fastq ]
42 |     out := log
43 | 
44 |     taskName:= "detect_adapter $group"
45 | 
46 |     system := "local"
47 | 
48 |     tid := task( out<-in ) {
49 | 
50 |         sys $shcmd_init_py3
51 | 
52 |         sys python3 $(which detect_adapter.py) $fastq > $log
53 | 
54 |         sys $shcmd_finalize
55 |     }
56 | 
57 |     return [out, tid]
58 | }
59 | 
60 | string parse_adapter_log( string log ) {
61 |     string adapter
62 |     lines := log.readLines()
63 |     for ( int i=0; i<lines.size(); i++) {
64 |         if ( lines[i].indexOf("Adapter") > -1 ) {
65 |             if ( i+1>lines.size()-1 ) {
66 |                 adapter = ""
67 |             }
68 |             else {
69 |                 line := lines[i+1]
70 |                 arr := line.split("\t")
71 |                 if (arr.size()<3) adapter = ""
72 |                 else adapter = arr[2]
73 |             }
74 |             break;
75 |         }
76 |     }
77 |     //if ( adapter == "" ) error("No adapter found ($log)!")
78 |     return adapter
79 | }
80 | 
81 | string trim_adapters( string fastq, string adapter, string o_dir, string group, string graph_suffix ) {
82 |     if ( !adapter ) return fastq
83 | 
84 |     prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir )
85 |     p_gz := "$prefix"+".trim.fastq.gz"
86 | 
87 |     in := [ fastq ]
88 |     out := p_gz
89 | 
90 |     param_min_trim_len := min_trim_len==0 ? "" : "-m $min_trim_len"
91 | 
92 |     taskName:= "trim_adapters $group"
93 |     mem := get_res_mem(mem_trim,1); timeout := get_res_wt(wt_trim)
94 | 
95 |     wait_par( cpus )
96 | 
97 |     tid := task( out<-in ) {
98 | 
99 |         sys $shcmd_init
100 | 
101 |         sys cutadapt $param_min_trim_len -e $adapter_err_rate -a $adapter $fastq | gzip -nc > $p_gz
102 | 
103 |         sys $shcmd_finalize
104 |     }
105 | 
106 |     register_par( tid, cpus )
107 | 
108 |     add_task_to_graph( in, out, group, "CUT-\\nADAPT$graph_suffix", grp_color_trim_adapter )
109 | 
110 |     return out
111 | }
112 | 
113 | string[] trim_adapters_PE( string fastq1, string fastq2, string adapter1, string adapter2, string o_dir, \
114 |         string group, string graph_suffix ) {
115 |     if ( !adapter1 || !adapter2 ) return [fastq1, fastq2]
116 | 
117 |     prefix1 := replace_dir( rm_ext( fastq1, ["fastq","fq"] ), o_dir )
118 |     prefix2 := replace_dir( rm_ext( fastq2, ["fastq","fq"] ), o_dir )
119 |     p1 := "$prefix1.trim.fastq"
120 |     p2 := "$prefix2.trim.fastq"
121 |     p1_gz := "$p1.gz"
122 |     p2_gz := "$p2.gz"
123 | 
124 |     in := [ fastq1, fastq2 ]
125 |     out := [ p1_gz, p2_gz ]
126 | 
127 |     param_min_trim_len := min_trim_len==0 ?
"" : "-m $min_trim_len" 128 | 129 | taskName:= "trim_adapters_PE " + group 130 | mem := get_res_mem(mem_trim,1); timeout := get_res_wt(wt_trim) 131 | 132 | wait_par( cpus ) 133 | 134 | tid := task( out<-in ) { 135 | 136 | sys $shcmd_init 137 | 138 | sys cutadapt $param_min_trim_len -e $adapter_err_rate -a $adapter1 -A $adapter2 -o $p1 -p $p2 $fastq1 $fastq2 139 | 140 | sys gzip -f $p1 141 | sys gzip -f $p2 142 | 143 | sys $shcmd_finalize 144 | } 145 | 146 | register_par( tid, cpus ) 147 | 148 | add_task_to_graph( in, out, group, "CUT-\\nADAPT$graph_suffix", grp_color_trim_adapter ) 149 | 150 | return out 151 | } 152 | 153 | string trim_adapters_old( string fastq, string o_dir, string group, string graph_suffix ) { 154 | 155 | prefix := replace_dir( rm_ext( fastq, ["fastq","fq"] ), o_dir ) 156 | p := "$prefix"+"_trimmed.fq" 157 | p_gz := "$p.gz" 158 | p2 := "$prefix"+"_trimmed.fastq" 159 | p2_gz := "$p2.gz" 160 | in := [ fastq ] 161 | out := p2_gz 162 | 163 | taskName:= "trim_adapters " + group 164 | mem := get_res_mem(mem_trim,1); timeout := get_res_wt(wt_trim) 165 | 166 | wait_par( cpus ) 167 | 168 | tid := task( out<-in ) { 169 | 170 | sys $shcmd_init 171 | 172 | sys trim_galore $fastq -o $o_dir --dont_gzip 173 | sys gzip -f $p 174 | sys mv $p_gz $p2_gz 175 | sys rm -f $p_gz $p 176 | 177 | sys $shcmd_finalize 178 | } 179 | 180 | register_par( tid, cpus ) 181 | 182 | add_task_to_graph( in, out, group, "TRIM GALORE\\n$graph_suffix", grp_color_trim_adapter ) 183 | 184 | return out 185 | } 186 | 187 | string[] trim_adapters_PE_old( string fastq1, string fastq2, string o_dir, string group, string graph_suffix ) { 188 | 189 | prefix1 := replace_dir( rm_ext( fastq1, ["fastq","fq"] ), o_dir ) 190 | prefix2 := replace_dir( rm_ext( fastq2, ["fastq","fq"] ), o_dir ) 191 | p1 := "$prefix1.trim.fastq" 192 | p2 := "$prefix2.trim.fastq" 193 | p1_gz := "$p1.gz" 194 | p2_gz := "$p2.gz" 195 | 196 | in := [ fastq1, fastq2 ] 197 | out := [ p1_gz, p2_gz ] 198 | 199 | taskName:= "trim_adapters_PE " + group 200 | mem := get_res_mem(mem_trim,1); timeout := get_res_wt(wt_trim) 201 | 202 | wait_par( cpus ) 203 | 204 | tid := task( out<-in ) { 205 | 206 | sys $shcmd_init 207 | 208 | sys cd $o_dir 209 | sys $(which trimAdapters.py) -a $fastq1 -b $fastq2 210 | sys gzip -f $p1 211 | sys gzip -f $p2 212 | 213 | sys $shcmd_finalize 214 | } 215 | 216 | register_par( tid, cpus ) 217 | 218 | add_task_to_graph( in, out, group, "TRIMADAPTOR.PY\\n$graph_suffix", grp_color_trim_adapter ) 219 | 220 | return out 221 | } 222 | -------------------------------------------------------------------------------- /modules/ataqc.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | // depends on external git repo: https://github.com/kundajelab/ataqc 5 | // needs to have $script_dir/ataqc/run_ataqc.py 6 | 7 | include "species.bds" 8 | 9 | 10 | help == ATAQC settings 11 | tss_enrich := "" help TSS enrichment bed for ataqc. 12 | dnase := "" help DNase bed (open chromatin region file) for ataqc. 13 | prom := "" help Promoter bed (promoter region file) for ataqc. 14 | enh := "" help Enhancer bed (enhancer region file) for ataqc. 15 | reg2map := "" help Reg2map (file with cell type signals) for ataqc. 16 | reg2map_bed := "" help Reg2map_bed (file of regions used to generate reg2map signals) for ataqc. 17 | roadmap_meta := "" help Roadmap metadata for ataqc. 18 | 19 | mem_ataqc := "20G" help Max. memory for ATAQC (default: 20G). 
20 | wt_ataqc := "47h" help Walltime for ATAQC (default: 47h, 47:00:00). 21 | 22 | grp_color_ataqc := "pink" 23 | 24 | init_ataqc() 25 | 26 | void init_ataqc() { 27 | 28 | tss_enrich = get_conf_val( tss_enrich, ["tss_enrich"] ) 29 | dnase = get_conf_val( dnase, ["dnase"] ) 30 | prom = get_conf_val( prom, ["prom"] ) 31 | enh = get_conf_val( enh, ["enh"] ) 32 | reg2map = get_conf_val( reg2map, ["reg2map"] ) 33 | reg2map_bed = get_conf_val( reg2map_bed, ["reg2map_bed"] ) 34 | roadmap_meta = get_conf_val( roadmap_meta, ["roadmap_meta"] ) 35 | 36 | mem_ataqc = get_conf_val( mem_ataqc, ["mem_ataqc"] ) 37 | wt_ataqc = get_conf_val( wt_ataqc, ["wt_ataqc"] ) 38 | 39 | if ( reg2map_bed == "" ) reg2map_bed = dnase 40 | 41 | print("\n\n== ATAQC settings\n") 42 | print( "TSS enrichment bed\t\t: $tss_enrich\n" ) 43 | print( "DNase bed for ataqc\t\t: $dnase\n" ) 44 | print( "Promoter bed for ataqc\t\t: $prom\n" ) 45 | print( "Enhancer bed for ataqc\t\t: $enh\n" ) 46 | print( "Reg2map for ataqc\t\t\t: $reg2map\n" ) 47 | print( "Reg2map_bed for ataqc\t\t: $reg2map_bed\n" ) 48 | print( "Roadmap metadata for ataqc\t: $roadmap_meta\n" ) 49 | print( "Max. memory for ATAQC\t\t\t: $mem_ataqc\n") 50 | print( "Walltime for ATAQC\t\t\t: $wt_ataqc\n") 51 | } 52 | 53 | bool chk_ataqc() { 54 | 55 | print("\nChecking parameters and data files for ATAQC. \n\n") 56 | disable_ataqc := false 57 | 58 | if ( species == "" ) { print("Warning: Genome name is missing ( '-species [GENOME_NAME; hg19, mm9, ...]' )!\n" ); disable_ataqc = true } 59 | if ( ref_fa == "" ) { print("Warning: Specify your reference genome .fa ('-ref_fa [FA]')!\n"); disable_ataqc = true } 60 | if ( tss_enrich == "" ) { print("Warning: TSS enrichment bed is missing ( '-tss_enrich' )!\n"); disable_ataqc = true } 61 | if ( dnase == "" ) { print("Warning: DNase bed is missing ( '-dnase' )!\n"); disable_ataqc = true } 62 | if ( blacklist == "" ) { print("Warning: Blacklist bed is missing ( '-blacklist' )!\n"); disable_ataqc = true } 63 | if ( prom == "" ) { print("Warning: Promoter bed is missing ( '-prom' )!\n"); disable_ataqc = true } 64 | if ( enh == "" ) { print("Warning: Enhancer bed is missing ( '-enh' )!\n"); disable_ataqc = true } 65 | // if ( reg2map == "" ) { print("Warning: reg2map is missing ( '-reg2map' )!\n"); disable_ataqc = true } 66 | // if ( reg2map_bed == "" ){ print("Warning: reg2map_bed is missing ( '-reg2map_bed' )!\n"); disable_ataqc = true } 67 | // if ( roadmap_meta == "" ) { print("Warning: Roadmap metadata are missing ( '-roadmap_meta' )!\n"); disable_ataqc = true } 68 | 69 | if ( disable_ataqc ) { 70 | print("\nDisabling ATAQC...\n") 71 | return false 72 | } 73 | return true 74 | } 75 | 76 | string[] ataqc( string fastq1, string fastq2, string bam, string align_log, string pbc_log, \ 77 | string dup_log, string filt_bam, string bed, string bigwig, string peak, \ 78 | string peak_naive_overlap, string idr_peak, string o_dir, string group ) { 79 | 80 | prefix := replace_dir( rm_ext( fastq1, ["fastq","fq"] ), o_dir ) + ( (fastq2!="") ? ".PE2SE" : "" ) 81 | 82 | html := "$prefix"+"_qc.html" 83 | txt := "$prefix"+"_qc.txt" 84 | prefix_basename := get_basename( prefix ) 85 | 86 | param_fastq := (fastq2!="") ? " --fastq1 $fastq1 --fastq2 $fastq2" : " --fastq1 $fastq1" 87 | param_overlap := (peak_naive_overlap!="") ? " --naive_overlap_peaks $peak_naive_overlap" : "" 88 | param_idr := (idr_peak!="") ? " --idr_peaks $idr_peak" : "" 89 | param_use_sambamba := (use_sambamba_markdup) ? 
" --use_sambamba_markdup" : "" 90 | 91 | param_reg2map := (reg2map!="") ? " --reg2map $reg2map" : "" 92 | param_reg2map_bed := (reg2map_bed!="") ? " --reg2map_bed $reg2map_bed" : "" 93 | param_meta := (roadmap_meta!="") ? " --meta $roadmap_meta" : "" 94 | 95 | species_ataqc := species.indexOf("_")>=0 ? (species.split("_"))[0] : species 96 | print("species_ataqc: $species_ataqc\n") 97 | 98 | in := (fastq2!="") ? [ fastq1, fastq2, bam, align_log, pbc_log, dup_log, filt_bam, bed, bigwig, peak ] \ 99 | : [ fastq1, bam, align_log, pbc_log, dup_log, filt_bam, bed, bigwig, peak ] 100 | out := [ html, txt ] //, gc_plot, hist_graph, lg_vplot, vplot, signal ] 101 | 102 | taskName:= "ataqc "+group 103 | mem := get_res_mem(mem_ataqc,1) 104 | max_java_heap := binary_prefix( (mem==-1) ? parse_mem( mem_ataqc ) : (mem*3)/4 ) 105 | timeout := get_res_wt(wt_ataqc) 106 | 107 | wait_par( cpus ) 108 | 109 | tid := task( out<-in ) { 110 | 111 | sys $shcmd_init 112 | 113 | // To prevent java heap space error (Exception in thread "main" java.lang.OutOfMemoryError: Java heap space) 114 | sys export _JAVA_OPTIONS="-Xms256M -Xmx$max_java_heap -XX:ParallelGCThreads=1" 115 | // sys if [ -d "${TMPDIR}" ]; then \ 116 | sys if [ "$java_tmp_dir" != "" ] && [ -d "$java_tmp_dir" ]; then \ 117 | export _JAVA_OPTIONS="${_JAVA_OPTIONS} -Djava.io.tmpdir=$java_tmp_dir"; \ 118 | fi 119 | sys cd $o_dir 120 | 121 | // # if PICARDROOT is not defined, then look into ../shared/picard* (default picard dir. in bioconda) 122 | sys if [ -f "$(which picard)" ]; then export PICARDROOT="$(dirname $(which picard))/../share/picard"*; fi 123 | 124 | sys $script_dir/ataqc/run_ataqc.py \ 125 | --workdir $o_dir \ 126 | --outdir $o_dir \ 127 | --outprefix $prefix_basename \ 128 | --genome $species_ataqc \ 129 | --chromsizes $chrsz \ 130 | --ref $ref_fa \ 131 | --tss $tss_enrich \ 132 | --dnase $dnase \ 133 | --blacklist $blacklist \ 134 | --prom $prom \ 135 | --enh $enh \ 136 | --pbc $pbc_log\ 137 | $param_fastq \ 138 | --alignedbam $bam \ 139 | --alignmentlog $align_log \ 140 | --coordsortbam $bam \ 141 | --duplog $dup_log \ 142 | --finalbam $filt_bam \ 143 | --finalbed $bed \ 144 | --bigwig $bigwig \ 145 | --peaks $peak $param_overlap $param_idr $param_use_sambamba \ 146 | $param_reg2map $param_reg2map_bed $param_meta 147 | sys rm -f test.log test.png 148 | 149 | sys $shcmd_finalize 150 | } 151 | 152 | register_par( tid, cpus ) 153 | 154 | add_task_to_graph( in, out, group, "ATAQC", grp_color_ataqc ) 155 | 156 | return out 157 | } 158 | 159 | -------------------------------------------------------------------------------- /modules/callpeak_bigbed.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | string peak_to_bigbed( string filetype, string peak, string o_dir, string group ) { 9 | 10 | prefix := replace_dir( rm_ext( peak, \ 11 | ["narrowPeak","narrowpeak",\ 12 | "broadPeak","broadpeak",\ 13 | "gappedPeak","gappedpeak",filetype] )\ 14 | , o_dir ) 15 | bigbed := "$prefix.$filetype.bb" 16 | 17 | bed_param := _get_bed_param( filetype ) 18 | 19 | in := [ peak ] 20 | out := bigbed 21 | 22 | taskName:= "peak_to_bigbed " + group 23 | system := "local" 24 | 25 | wait_par( cpus ) 26 | 27 | tid := task( out<-in ) { 28 | 29 | sys $shcmd_init 30 | 31 | sys cat $chrsz | grep -P 'chr[\dXY]+[ \t]' > $bigbed.chrsz.tmp 32 | sys zcat $peak | sort -k1,1 -k2,2n > $bigbed.tmp 33 | sys bedClip $bigbed.tmp 
$bigbed.chrsz.tmp $bigbed.tmp2 34 | 35 | sys bedToBigBed $bed_param $bigbed.tmp2 $bigbed.chrsz.tmp $bigbed 36 | sys rm -f $bigbed.tmp $bigbed.tmp2 $bigbed.chrsz.tmp 37 | 38 | sys $shcmd_finalize 39 | } 40 | 41 | register_par( tid, cpus ) 42 | 43 | return out 44 | } 45 | 46 | string _get_bed_param( string filetype ) { 47 | 48 | if ( filetype.toLower() == "narrowpeak" ) { 49 | return "-type=bed6+4 -as=$script_dir/etc/narrowPeak.as" 50 | } 51 | else if ( filetype.toLower() == "broadpeak") { 52 | return "-type=bed6+3 -as=$script_dir/etc/broadPeak.as" 53 | } 54 | else if ( filetype.toLower() == "gappedpeak") { 55 | return "-type=bed12+3 -as=$script_dir/etc/gappedPeak.as" 56 | } 57 | else { 58 | error("Unsupported peak file type! ($filetype)\n") 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /modules/callpeak_blacklist_filter.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | string blacklist_filter_peak( string filetype, string peak, string o_dir, string group ) { 9 | 10 | prefix := replace_dir( rm_ext( peak, \ 11 | ["narrowPeak","narrowpeak",\ 12 | "broadPeak","broadpeak",\ 13 | "regionPeak","regionpeak",\ 14 | "gappedPeak","gappedpeak",filetype] )\ 15 | , o_dir ) 16 | filtered:= "$prefix.filt.$filetype.gz" 17 | 18 | in := [ peak ] 19 | out := filtered 20 | 21 | taskName:= "blacklist_filter " + group 22 | //timeout := 3600 // to get queued fast 23 | system := "local" 24 | 25 | wait_par( cpus ) 26 | 27 | tid := task( out<-in ) { 28 | 29 | sys $shcmd_init 30 | 31 | sys bedtools intersect -v -a <(zcat -f $peak) -b <(zcat -f $blacklist) \ 32 | | awk 'BEGIN{OFS="\t"} {if ($5>1000) $5=1000; print $0}' \ 33 | | grep -P 'chr[\dXY]+[ \t]' | gzip -nc > $filtered 34 | 35 | sys $shcmd_finalize 36 | } 37 | 38 | register_par( tid, cpus ) 39 | 40 | return out 41 | } 42 | -------------------------------------------------------------------------------- /modules/callpeak_macs2_atac.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == callpeak macs2 settings (requirements: -chrsz -gensz) 9 | gensz := "" help Genome size; hs for human, mm for mouse. 10 | wt_macs2 := "23h" help Walltime for MACS2 (default: 23h, 23:00:00). 11 | mem_macs2 := "15G" help Max. memory for MACS2 (default: 15G). 12 | cap_num_peak_macs2 := "300K" help Cap number of peaks by taking top N peaks for MACS2 (default: 300K). 13 | extra_param_macs2 := "" help Extra parameters for macs2 callpeak. 14 | 15 | 16 | grp_color_macs2 := "lightgreen" 17 | 18 | 19 | init_callpeak_macs2() 20 | 21 | 22 | void init_callpeak_macs2() { 23 | 24 | gensz = get_conf_val( gensz, ["gensz"]) 25 | wt_macs2 = get_conf_val( wt_macs2, ["wt_macs2"] ) 26 | mem_macs2 = get_conf_val( mem_macs2, ["mem_macs2"] ) 27 | cap_num_peak_macs2 = get_conf_val( cap_num_peak_macs2, ["cap_num_peak_macs2"] ) 28 | extra_param_macs2 = get_conf_val( extra_param_macs2, ["extra_param_macs2"] ) 29 | 30 | print("\n\n== callpeak macs2 settings\n") 31 | print( "Genome size (hs,mm)\t\t: $gensz\n" ) 32 | print( "Walltime (macs2)\t\t: $wt_macs2\n") 33 | print( "Max. 
memory (macs2)\t\t: $mem_macs2\n") 34 | print( "Cap number of peaks (macs2)\t: $cap_num_peak_macs2\n") 35 | print( "Extra parameters for macs2 callpeak\t\t: $extra_param_macs2\n") 36 | } 37 | 38 | void chk_callpeak_macs2() { 39 | 40 | if ( gensz == "" ) error("Genome size (-gensz) must be defined! (mm for mouse, hs for human)") 41 | if ( !path_exists( chrsz ) ) error("Chromosome size file (-chrsz) is missing! (file: $chrsz)") 42 | } 43 | 44 | string[] macs2_atac_npeak_and_signal( string tag, string smooth_window, string pval_thresh, bool make_sig, \ 45 | string peak_o_dir, string sig_o_dir, string group ) { 46 | 47 | prefix := replace_dir( rm_ext( tag, ["bed","tagAlign"] ), peak_o_dir ) + ".pf" 48 | prefix_sig := replace_dir( prefix, sig_o_dir ) 49 | int_cap_num_peak_macs2 := parse_number( cap_num_peak_macs2 ) 50 | // peaks 51 | peakfile := "$prefix.pval$pval_thresh.$cap_num_peak_macs2.narrowPeak.gz" 52 | 53 | fc_bedgraph := "$prefix.fc.signal.bedgraph" 54 | fc_bedgraph_srt := "$prefix.fc.signal.srt.bedgraph" 55 | fc_bigwig := "$prefix_sig.fc.signal.bigwig" 56 | 57 | pval_bedgraph := "$prefix.pval.signal.bedgraph" 58 | pval_bedgraph_srt := "$prefix.pval.signal.srt.bedgraph" 59 | pval_bigwig := "$prefix_sig.pval.signal.bigwig" 60 | 61 | shiftsize := round( smooth_window.parseReal()/2.0 ) 62 | 63 | blacklist_exists := path_exists(blacklist) 64 | 65 | in := [ tag ] 66 | // out := make_sig ? [ peakfile, gpeakfile, fc_bigwig, pval_bigwig ] : [ peakfile, gpeakfile ] 67 | out := make_sig ? [ peakfile, fc_bigwig, pval_bigwig ] : [ peakfile ] 68 | 69 | taskName:= "macs2 n/s " + group 70 | mem := get_res_mem(mem_macs2,1); timeout := get_res_wt(wt_macs2) 71 | 72 | wait_par( cpus ) 73 | 74 | tid := task( out<-in ) { 75 | 76 | sys $shcmd_init 77 | sys export LC_COLLATE=C 78 | 79 | sys macs2 callpeak \ 80 | -t $tag -f BED -n "$prefix" -g "$gensz" -p $pval_thresh \ 81 | --nomodel --shift -$shiftsize --extsize $smooth_window -B --SPMR --keep-dup all --call-summits $extra_param_macs2 82 | 83 | //# Sort by Col8 in descending order and replace long peak names in Column 4 with Peak_ 84 | // sys sort -k 8gr,8gr "$prefix"_peaks.narrowPeak | awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}' | head -n $int_cap_num_peak_macs2 | gzip -nc > $peakfile 85 | sys sort -k 8gr,8gr "$prefix"_peaks.narrowPeak | awk 'BEGIN{OFS="\t"}{$4="Peak_"NR; if ($2<0) $2=0; if ($3<0) $3=0; print $0}' > $peakfile.tmp 86 | // sys zcat -f "$prefix"_peaks.narrowPeak | sort -k 8gr,8gr | head -n $int_cap_num_peak_macs2 | awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}' | gzip -nc > $peakfile 87 | sys head -n $int_cap_num_peak_macs2 $peakfile.tmp | gzip -nc > $peakfile 88 | sys rm -f $peakfile.tmp 89 | sys rm -f "$prefix"_peaks.narrowPeak 90 | sys rm -f "$prefix"_summits.bed 91 | 92 | sys if [[ $make_sig == "false" ]]; then \ 93 | rm -f "$prefix"_treat_pileup.bdg "$prefix"_control_lambda.bdg; \ 94 | $shcmd_finalize; \ 95 | exit; \ 96 | fi 97 | 98 | sys macs2 bdgcmp -t "$prefix"_treat_pileup.bdg -c "$prefix"_control_lambda.bdg \ 99 | --o-prefix "$prefix" -m FE 100 | sys slopBed -i "$prefix"_FE.bdg -g "$chrsz" -b 0 | bedClip stdin "$chrsz" $fc_bedgraph 101 | sys rm -f "$prefix"_FE.bdg 102 | 103 | sys LC_COLLATE=C sort -S 4G -k1,1 -k2,2n $fc_bedgraph > $fc_bedgraph_srt 104 | sys bedGraphToBigWig $fc_bedgraph_srt "$chrsz" "$fc_bigwig" 105 | sys rm -f $fc_bedgraph $fc_bedgraph_srt 106 | 107 | //# sval counts the number of tags per million in the (compressed) BED file 108 | sys sval=$(wc -l <(zcat -f "$tag") | awk '{printf "%f", $1/1000000}') 109 | 110 | 
sys macs2 bdgcmp \ 111 | -t "$prefix"_treat_pileup.bdg -c "$prefix"_control_lambda.bdg \ 112 | --o-prefix "$prefix" -m ppois -S "${sval}" 113 | sys slopBed -i "$prefix"_ppois.bdg -g "$chrsz" -b 0 | bedClip stdin "$chrsz" $pval_bedgraph 114 | sys rm -f "$prefix"_ppois.bdg 115 | 116 | sys LC_COLLATE=C sort -S 4G -k1,1 -k2,2n $pval_bedgraph > $pval_bedgraph_srt 117 | sys bedGraphToBigWig $pval_bedgraph_srt "$chrsz" "$pval_bigwig" 118 | sys rm -f $pval_bedgraph $pval_bedgraph_srt 119 | 120 | sys rm -f "$prefix"_treat_pileup.bdg "$prefix"_control_lambda.bdg 121 | 122 | sys $shcmd_finalize 123 | } 124 | 125 | register_par( tid, cpus ) 126 | 127 | add_task_to_graph( in, out, group, "MACS2\\np-val<$pval_thresh", grp_color_macs2 ) 128 | 129 | return out 130 | } 131 | 132 | string[] macs2_atac_gpeak_and_bpeak( string tag, string smooth_window, string pval_thresh, \ 133 | string peak_o_dir, string group ) { 134 | 135 | prefix := replace_dir( rm_ext( tag, ["bed","tagAlign"] ), peak_o_dir ) + ".pf" 136 | // peaks 137 | bpeakfile := "$prefix.pval$pval_thresh.$cap_num_peak_macs2.broadPeak.gz" 138 | gpeakfile := "$prefix.pval$pval_thresh.$cap_num_peak_macs2.gappedPeak.gz" 139 | int_cap_num_peak_macs2 := parse_number( cap_num_peak_macs2 ) 140 | // peaks 141 | 142 | shiftsize := round( smooth_window.parseReal()/2.0 ) 143 | 144 | blacklist_exists := path_exists(blacklist) 145 | 146 | in := [ tag ] 147 | // out := make_sig ? [ peakfile, gpeakfile, fc_bigwig, pval_bigwig ] : [ peakfile, gpeakfile ] 148 | out := [ gpeakfile, bpeakfile ] 149 | 150 | taskName:= "macs2 g/b " + group 151 | mem := get_res_mem(mem_macs2,1); timeout := get_res_wt(wt_macs2) 152 | 153 | wait_par( cpus ) 154 | 155 | tid := task( out<-in ) { 156 | 157 | sys $shcmd_init 158 | sys export LC_COLLATE=C 159 | 160 | // .tmp is to prevent file race condition with macs2_atac_npeak_and_signal 161 | sys macs2 callpeak \ 162 | -t $tag -f BED -n "$prefix.tmp" -g "$gensz" -p $pval_thresh \ 163 | --nomodel --shift -$shiftsize --extsize $smooth_window --broad --keep-dup all $extra_param_macs2 164 | 165 | //# Sort by Col8 in descending order and replace long peak names in Column 4 with Peak_ 166 | sys sort -k 8gr,8gr "$prefix.tmp"_peaks.broadPeak | awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}' > $bpeakfile.tmp 167 | sys sort -k 14gr,14gr "$prefix.tmp"_peaks.gappedPeak | awk 'BEGIN{OFS="\t"}{$4="Peak_"NR ; print $0}' > $gpeakfile.tmp 168 | sys head -n $int_cap_num_peak_macs2 $bpeakfile.tmp | gzip -nc > $bpeakfile 169 | sys head -n $int_cap_num_peak_macs2 $gpeakfile.tmp | gzip -nc > $gpeakfile 170 | sys rm -f $bpeakfile.tmp $gpeakfile.tmp 171 | 172 | sys rm -f "$prefix.tmp"_peaks.broadPeak 173 | sys rm -f "$prefix.tmp"_peaks.gappedPeak 174 | sys rm -f "$prefix.tmp"_peaks.xls 175 | sys rm -f "$prefix.tmp"_summits.bed 176 | sys rm -f "$prefix.tmp"_treat_pileup.bdg "$prefix.tmp"_control_lambda.bdg 177 | 178 | sys $shcmd_finalize 179 | } 180 | 181 | register_par( tid, cpus ) 182 | 183 | add_task_to_graph( in, out, group, "MACS2\\np-val<$pval_thresh", grp_color_macs2 ) 184 | 185 | return out 186 | } 187 | -------------------------------------------------------------------------------- /modules/cluster.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == cluster/system/resource settings 8 | wt := "5h50m" help Walltime for all single-threaded tasks (example: 8:10:00, 3h, 3600, default: 5h50m, 5:50:00). 
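// These walltime/memory settings are global defaults only: init_cluster() below
// maps them onto BDS's pre-declared per-task resources (timeout/mem, via
// parse_time()/parse_mem() on cluster systems; unlimited on local runs), and
// module-level settings such as -wt_bwt2 or -mem_macs2 take precedence for
// their own tasks. Illustrative override:
//
//   bds atac.bds -wt 24h -memory 20G ...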
9 | memory := "7G"  help Maximum memory for all single-threaded tasks (equivalent to '-mem', example: 4.5G, 1024M, default: 7G).
10 | use_system := "local"  help Force to use a system (equivalent to 'bds -s [SYSTEM_NAME] ...', any system defined in bds.config can be used).
11 | nice := 0  help Set process priority for all tasks (default: 0; -20 (highest) ~ 19 (lowest) ).
12 | retrial := 0  help # of retrials for failed tasks (default: 0).
13 | q := ""  help Submit tasks to a specified cluster queue.
14 | q_for_slurm_account := false  help Use --account instead of -p (partition) for SLURM only.
15 | unlimited_mem_wt:= false  help Use unlimited max. memory and walltime.
16 | java_tmp_dir := "\${TMPDIR}"  help Java temporary directory (change it when you get a 'Disk quota exceeded' error from Java, default: ${TMPDIR}).
17 | 
18 | init_cluster()
19 | 
20 | 
21 | void init_cluster() {
22 |     wt = get_conf_val( wt, ["wt"] )
23 |     memory = get_conf_val( memory, ["memory","mem"] )
24 |     use_system = get_conf_val( use_system, ["use_system","system"] )
25 |     nice = get_conf_val_int( nice, ["nice"] )
26 |     retrial = get_conf_val_int( retrial, ["retrial","retry"] )
27 |     q = get_conf_val( q, ["q"] )
28 |     unlimited_mem_wt= get_conf_val_bool( unlimited_mem_wt, ["unlimited_mem_wt"] )
29 |     q_for_slurm_account= get_conf_val_bool( q_for_slurm_account, ["q_for_slurm_account"] )
30 |     java_tmp_dir = get_conf_val( java_tmp_dir, ["java_tmp_dir"] )
31 | 
32 |     if ( cmd_line_arg_has_key("mem") ) memory = get_cmd_line_arg_val( "mem" )
33 |     if ( cmd_line_arg_has_key("system") ) use_system = get_cmd_line_arg_val( "system" )
34 |     if ( nice <= -20 ) nice = -20
35 |     if ( nice > 19 ) nice = 19
36 |     if ( use_system != "" ) system = use_system.toLower()
37 |     if ( system == "slurm" || system == "generic" ) { // for new SCG, which uses --account instead of -p (partition)
38 |         system = "generic"
39 |         if ( q != "" ) {
40 |             if ( q_for_slurm_account ) {
41 |                 queue = "--account $q"
42 |             }
43 |             else {
44 |                 queue = "-p $q"
45 |             }
46 |         }
47 |     }
48 |     else if ( q != "" ) {
49 |         queue = q
50 |     }
51 | 
52 |     // cpus, mem and timeout are pre-declared BDS variables for default resource settings
53 |     mem = get_res_mem(memory,1)
54 |     timeout = get_res_wt(wt)
55 |     retry = retrial
56 | 
57 |     // do not modify this (BDS timeout; how long BDS will wait for tasks to be queued on the cluster)
58 |     walltimeout = 3600*24*100 // timeout var. in BigDataScript (100 days, jobs will never be stopped by BDS due to BDS timeout)
59 | 
60 |     print("\n\n== cluster/system info\n")
61 |     print( "Walltime (general)\t\t: $wt\n" )
62 |     print( "Max. memory (general)\t\t: $memory\n" )
63 |     print( "Force to use a system\t\t: $use_system\n" )
64 |     print( "Process priority (niceness)\t: $nice\n" )
65 |     print( "Retrial for failed tasks\t: $retrial\n" )
66 |     print( "Submit tasks to a cluster queue\t: $q\n" )
67 |     print( "Unlimited cluster mem./walltime\t: $unlimited_mem_wt\n")
68 |     print( "Use --account instead of SLURM partition\t\t: $q_for_slurm_account\n")
69 |     print( "Java temporary directory\t\t: $java_tmp_dir\n")
70 | }
71 | 
72 | int get_res_wt( string str ) {
73 |     return (unlimited_mem_wt || is_system_local() ) ? -1 : parse_time( str )
74 | }
75 | 
76 | int get_res_mem( string str, int n ) {
77 |     if ( n < 1 ) n = 1
78 |     return (unlimited_mem_wt || is_system_local() ) ?
-1 : parse_mem( str )/n 79 | } 80 | 81 | int get_res_mem( string str ) { 82 | return get_res_mem( str , 1 ) 83 | } 84 | 85 | bool is_system_sge() { 86 | return system == "sge" 87 | } 88 | 89 | bool is_system_local() { 90 | return system == "local" 91 | } 92 | 93 | bool is_system_generic() { 94 | return system == "generic" 95 | } 96 | 97 | bool is_system_slurm() { 98 | // slurm uses generic cluster, it's configured in bds.config and ./utils/clusterGeneral 99 | return system == "generic" 100 | } 101 | -------------------------------------------------------------------------------- /modules/conf.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "sys.bds" 5 | 6 | 7 | help == configuration file settings 8 | c := "" help Configuration file path. 9 | env := "$script_dir/default.env" help Environment file path. 10 | 11 | 12 | string{} conf // map for configuration 13 | 14 | 15 | init_conf() 16 | 17 | 18 | void init_conf() { 19 | if ( is_cmd_line_arg_empty() ) \ 20 | print( "\nWarning: No parameters are given (specify cmd. line arguments or configuration file)!\n\n") 21 | if ( is_first_arg_conf() ) c = args[0] 22 | 23 | add_to_conf( c, "" ) // then read conf. file 24 | env = get_conf_val( env, ["env"] ) 25 | if ( path_exists( env ) ) add_to_conf( env, hostname ) 26 | add_to_conf( c, "" ) // read conf again to override 27 | 28 | print( "\n\n== configuration file info\n") 29 | print( "Hostname\t\t\t: $hostname\n") 30 | print( "Configuration file\t\t: $c\n" ) 31 | print( "Environment file\t\t: $env\n" ) 32 | } 33 | 34 | string{} read_conf( string file, string section ) { 35 | section = section.trim() 36 | string{} ret 37 | 38 | if ( file == "" ) return ret 39 | lines := file.read().split("\n") 40 | 41 | can_read := (section=="") ? true : false 42 | found_section := (section=="") ? true : false 43 | for ( string line : lines ) { 44 | line = rm_comment( line.trim() ) 45 | if ( line == "" ) continue 46 | 47 | if ( line.startsWith( "[" ) && line.endsWith( "]" ) ) { 48 | line2 := line.substr(1,line.length()-1) 49 | string[] hostnames 50 | string group 51 | // find group if exists 52 | arr := line2.split(":") 53 | if ( arr.size() > 1 ) group = arr[1].trim() 54 | hostnames = arr[0].split(",") 55 | if ( section == "" ) { 56 | can_read = false 57 | } 58 | else { 59 | for ( string host : hostnames ) { 60 | host = host.trim() 61 | if ( match_str( section, host ) ) { // one asterisk (wildcard chr: *) is allowed in hostname string 62 | if ( section == group ) { 63 | error("Recursion (section name == group) found in a conf. or an env. 
file!"+\ 64 | " (file: $file, section: $section, group: $group)\n") 65 | } 66 | else if ( group != "" ) { 67 | print("\tReading parameters from section group($group) in file($file)...\n") 68 | return read_conf( file, group ) 69 | } 70 | else { 71 | print("\tReading parameters from section ($host) in file($file)...\n") 72 | found_section = true 73 | can_read = true 74 | break; 75 | } 76 | } 77 | else { 78 | can_read = false 79 | } 80 | } 81 | } 82 | continue 83 | } 84 | 85 | if ( can_read ) { 86 | string key, val 87 | (key, val) = parse_conf_line( line ) 88 | ret{ key } = val 89 | } 90 | } 91 | if ( !found_section && section != "default" ) return read_conf( file, "default" ) 92 | 93 | return ret 94 | } 95 | 96 | string{} read_conf( string file ) { 97 | return read_conf( file, "" ) 98 | } 99 | 100 | void add_to_conf( string file, string section ) { 101 | 102 | tmp := read_conf( file, section ) 103 | 104 | for( string k : tmp.keys() ) conf{k} = tmp{k} 105 | } 106 | 107 | void add_to_conf( string file ) { 108 | tmp := read_conf( file ) 109 | for( string k : tmp.keys() ) { 110 | conf{k} = tmp{k} 111 | } 112 | } 113 | 114 | string[] parse_conf_line( string line ) { 115 | delims := [ "=", "\t" ] 116 | delim_found := false 117 | string key, val 118 | for ( string delim : delims ) { 119 | idx := line.indexOf( delim ) 120 | if ( idx > -1 ) { 121 | key = line.substr( 0, idx ).trim().toLower() 122 | val = line.substr( idx+1 ).trim() 123 | delim_found = true 124 | break 125 | } 126 | } 127 | if ( !delim_found ) error("No delimiter (=,\\t) found in line ($line) in the configruation file.\n") 128 | return [key, val] 129 | } 130 | 131 | int get_conf_val_int( int curr_val, string key ) { 132 | string{} tmp 133 | return parse_int( get_conf_val( curr_val, key, tmp ) ) 134 | } 135 | 136 | int get_conf_val_int( int curr_val, string[] keys ) { 137 | string{} tmp 138 | return parse_int( get_conf_val( curr_val, keys, tmp ) ) 139 | } 140 | 141 | bool get_conf_val_bool( bool curr_val, string key ) { 142 | string{} tmp 143 | return parse_bool( get_conf_val( curr_val, key, tmp ) ) 144 | } 145 | 146 | bool get_conf_val_bool( bool curr_val, string[] keys ) { 147 | string{} tmp 148 | return parse_bool( get_conf_val( curr_val, keys, tmp ) ) 149 | } 150 | 151 | real get_conf_val_real( real curr_val, string key ) { 152 | string{} tmp 153 | return parse_real( get_conf_val( curr_val, key, tmp ) ) 154 | } 155 | 156 | real get_conf_val_real( real curr_val, string[] keys ) { 157 | string{} tmp 158 | return parse_real( get_conf_val( curr_val, keys, tmp ) ) 159 | } 160 | 161 | int get_conf_val_int( int curr_val, string key, string{} _conf ) { 162 | return parse_int( get_conf_val( curr_val, key, _conf ) ) 163 | } 164 | 165 | int get_conf_val_int( int curr_val, string[] keys, string{} _conf ) { 166 | return parse_int( get_conf_val( curr_val, keys, _conf ) ) 167 | } 168 | 169 | bool get_conf_val_bool( bool curr_val, string key, string{} _conf ) { 170 | return parse_bool( get_conf_val( curr_val, key, _conf ) ) 171 | } 172 | 173 | bool get_conf_val_bool( bool curr_val, string[] keys, string{} _conf ) { 174 | return parse_bool( get_conf_val( curr_val, keys, _conf ) ) 175 | } 176 | 177 | real get_conf_val_real( real curr_val, string key, string{} _conf ) { 178 | return parse_real( get_conf_val( curr_val, key, _conf ) ) 179 | } 180 | 181 | real get_conf_val_real( real curr_val, string[] keys, string{} _conf ) { 182 | return parse_real( get_conf_val( curr_val, keys, _conf ) ) 183 | } 184 | 185 | string get_conf_val( string curr_val, 
string key, string{} _conf ) { 186 | key = key.toLower().trim() 187 | if ( cmd_line_arg_has_key( key ) ) return curr_val 188 | if ( _conf.size() == 0 ) { 189 | if ( conf.hasKey( key ) ) { 190 | return (conf{ key } != "") ? substitute_var( rm_comment( conf{ key } ) ) : curr_val 191 | } 192 | } 193 | else { 194 | if ( _conf.hasKey( key ) ) { 195 | return (_conf{ key } != "") ? substitute_var( rm_comment( _conf{ key } ) ) : curr_val 196 | } 197 | } 198 | return curr_val 199 | } 200 | 201 | string substitute_var( string var ) { 202 | var = var.replace("\$script_dir","$script_dir").replace("\${script_dir}","$script_dir") 203 | var = var.replace("~/","$HOME/").replace("\$HOME","$HOME").replace("\${HOME}","$HOME") 204 | return var 205 | } 206 | 207 | string get_conf_val( string curr_val, string[] keys, string{} _conf ) { 208 | for ( string key : keys ) { 209 | val := get_conf_val( curr_val, key, _conf ) 210 | if ( val != curr_val ) return val 211 | } 212 | return curr_val 213 | } 214 | 215 | string get_conf_val( string curr_val, string key ) { 216 | string{} tmp 217 | return get_conf_val( curr_val, key, tmp ) 218 | } 219 | 220 | string get_conf_val( string curr_val, string[] keys ) { 221 | string{} tmp 222 | return get_conf_val( curr_val, keys, tmp ) 223 | } 224 | 225 | bool has_conf_key( string key, string{} _conf ) { 226 | key = key.toLower() 227 | return (_conf.size()==0) ? conf.hasKey( key ) : _conf.hasKey( key ) 228 | } 229 | 230 | bool has_conf_key( string key ) { 231 | string{} tmp 232 | return has_conf_key( key ) 233 | } 234 | 235 | bool conf_file_exists() { 236 | if ( c!="" ) return c.exists() 237 | return false 238 | } 239 | 240 | bool has_key_in_conf_or_cmd_line( string key ) { 241 | return cmd_line_arg_has_key( key )// || has_conf_key( key ) 242 | } 243 | 244 | -------------------------------------------------------------------------------- /modules/env.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "cluster.bds" 5 | 6 | help == shell environment settings 7 | mod := "" help Modules separated by ; (example: "bowtie/2.2.4; bwa/0.7.7; picard-tools/1.92"). 8 | shcmd := "" help Shell commands separated by ;. Shell var. must be written as ${VAR} not as $VAR (example: "export PATH=${PATH}:/usr/test; VAR=test"). 9 | addpath := "" help Path separated by ; or : to be PREPENDED to \$PATH (example: "/bin/test:${HOME}/utils"). 10 | conda_env := "" help Anaconda Python (or Miniconda) environment name for all softwares including Python2. 11 | conda_env_py3 := "" help Anaconda Python (or Miniconda) environment name for Python3. 12 | conda_bin_dir := "" help Anaconda Python (or Miniconda) bin directory. 13 | cluster_task_min_len := 60 help Minimum length for a cluster job in seconds (dealing with NFS delayed write, default: 60). 14 | cluster_task_delay := 0 help Constant delay for every job in seconds (dealing with NFS delayed write, default: 0). 
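// Illustrative invocation wiring these shell-environment settings together
// (module names and paths are placeholders, not defaults; the conda env names
// match the ones created by install_dependencies.sh):
//
//   bds atac.bds -mod "bowtie/2.2.4; samtools/1.2" -addpath "${HOME}/utils" \
//       -conda_env bds_atac -conda_env_py3 bds_atac_py3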
15 | 16 | shcmd_init := "" // Shell command executed prior to all BigDataScript tasks (use this for setting up shell environment) 17 | shcmd_init_py3 := "" // for softwares using python3 18 | shcmd_finalize := "" 19 | 20 | delay_conda_env := 5 // wait for 5 seconds for conda env activation 21 | 22 | 23 | init_env() 24 | 25 | 26 | void init_env() { 27 | conda_env = get_conf_val( conda_env, ["conda_env"] ) 28 | conda_env_py3 = get_conf_val( conda_env_py3, ["conda_env_py3"] ) 29 | conda_bin_dir = get_conf_val( conda_bin_dir, ["conda_bin_dir"] ) 30 | cluster_task_min_len = get_conf_val_int( cluster_task_min_len, ["cluster_task_min_len"] ) 31 | cluster_task_delay = get_conf_val_int( cluster_task_delay, ["cluster_task_delay"] ) 32 | 33 | // environment modules (sh,bash) init. file paths 34 | init_mods := ["/etc/profile.d/modules.sh", "/etc/profile.d/modules.bash"] 35 | init_mod := "" // module init. shell script found among the above list 36 | 37 | moduleshome := get_shell_var("MODULESHOME").replace("\n","") // get shell var MODULESHOME if exists 38 | if (moduleshome!="") init_mods.add("$moduleshome/init/bash") 39 | 40 | string shellcmd, module, path, conda_py2, conda_py3 41 | 42 | for ( string file : init_mods ) { // find env. modules init script 43 | if ( file.exists() ) { 44 | init_mod = file 45 | break 46 | } 47 | } 48 | 49 | if ( init_mod == "" ) { 50 | print("\n\nInfo: Environments module not found on your system " + \ 51 | "(e.g. /etc/profile.d/modules.sh). Ignoring shell env. parameters like '-mod'. \n") 52 | } 53 | 54 | // read from conf./env. file 55 | for( string k : conf.keys() ) { 56 | string val = conf{k} 57 | if ( (k.indexOf("mod_")>=0) || (k=="mod") ) { // concat. module 58 | if ( init_mod != "" ) { 59 | trimmed := val.trim().replace("module add ","").replace( ":", " " ).replace( ";", " " ).replace( ",", " " ).trim() 60 | trimmed = trimmed.replace( " ", " ").replace( " ", " ") 61 | module = module + " " + trimmed 62 | } 63 | } 64 | else if ( k.indexOf("shcmd")>=0 ) { 65 | shellcmd = shellcmd + " " + val + ";" 66 | } 67 | else if ( k.indexOf("addpath")>=0 ) { 68 | path = path + val.trim().replace(",",":").replace(";",":").replace(" ",":").replace(":::",":").replace("::",":") + ":" 69 | } 70 | } 71 | 72 | // read from cmd. line arg. 73 | if ( mod!="" ) { 74 | string module_header = ". $init_mod;" 75 | if ( init_mod != "" ) { // if /etc/profile.d/modules.sh exists 76 | trimmed := mod.trim().replace("module add ","").replace( ":", " " ).replace( ";", " " ).replace(","," " ).trim() 77 | trimmed = trimmed.replace( " ", " ").replace( " ", " ") 78 | module = module + " " + trimmed 79 | } 80 | } 81 | if ( shcmd!="" ) shellcmd = shellcmd + shcmd.trim() + "; " 82 | if ( addpath!="" ) path = path + \ 83 | addpath.trim().replace(",",":").replace(";",":").replace(" ",":").replace(":::",":").replace("::",":") + ":" 84 | if ( module !="" ) module = ". 
$init_mod; module add " + module + ";" 85 | 86 | // check script directories to add to PATH 87 | script_file_paths := get_script_file_paths() 88 | for ( string _path : script_file_paths ) { 89 | if ( _path.exists() ) { 90 | path = path + _path + ":" 91 | } 92 | } 93 | 94 | if ( conda_bin_dir ) conda_bin_dir += "/" 95 | if ( path !="" ) path = " export PATH=$path:\${PATH}:/bin:/usr/bin:/usr/local/bin:\${HOME}/.bds;" 96 | // add conda env 97 | if ( conda_env != "" ) conda_py2 = \ 98 | "if [[ -f $(which $conda_bin_dir"+"conda) && $($conda_bin_dir"+"conda env list | grep $conda_env | wc -l) != \"0\" ]];"+\ 99 | " then source $conda_bin_dir"+"activate $conda_env; sleep $delay_conda_env; fi; " 100 | if ( conda_env_py3 != "" ) conda_py3 = \ 101 | "if [[ -f $(which $conda_bin_dir"+"conda) && $($conda_bin_dir"+"conda env list | grep $conda_env_py3 | wc -l) != \"0\" ]];"+\ 102 | " then source $conda_bin_dir"+"activate $conda_env_py3; sleep $delay_conda_env; fi; " 103 | 104 | // additional initialization 105 | shcmd_init_ := module + path + shellcmd 106 | shcmd_init_ += "; set -o pipefail" // to catch and stop on non-zero exit code in a UNIX pipe 107 | shcmd_init_ += "; STARTTIME=$(date +%s)" // to check running time for a task 108 | if ( nice != 0 ) shcmd_init_ += "; if (( $(nice)<$nice )); then renice -n $nice $$; fi" // to set process priority (niceness) 109 | 110 | shcmd_init_ = shcmd_init_.replace( ": :", ":" ).replace( "::", ":" ).replace( "; ;", ";" ).replace( ";;", ";" ) 111 | shcmd_init = conda_py2 + shcmd_init_ 112 | shcmd_init_py3 = conda_py3 + shcmd_init_ 113 | 114 | if ( is_system_local() ) { 115 | shcmd_finalize = "TASKTIME=$[$(date +%s)-${STARTTIME}]; echo \"Task has finished (${TASKTIME} seconds).\"; "+\ 116 | "sleep $cluster_task_delay" 117 | } 118 | else { 119 | shcmd_finalize = "TASKTIME=$[$(date +%s)-${STARTTIME}]; if [ ${TASKTIME} -lt $cluster_task_min_len ]; "+\ 120 | "then echo \"Waiting for $[$cluster_task_min_len-${TASKTIME}] seconds.\";"+\ 121 | " sleep $[$cluster_task_min_len-${TASKTIME}]; sleep $cluster_task_delay; fi" 122 | } 123 | 124 | print("\n\n== shell environment info\n") 125 | print( "Conda env. \t\t\t: $conda_env\n" ) 126 | print( "Conda env. for python3\t\t: $conda_env_py3\n" ) 127 | print( "Conda bin. directory\t\t: $conda_bin_dir\n" ) 128 | print( "\nShell cmd. for init.\t\t: $shcmd_init\n" ) 129 | print( "\nShell cmd. for init.(py3)\t: $shcmd_init_py3\n" ) 130 | print( "\nShell cmd. for fin.\t\t: $shcmd_finalize\n" ) 131 | print( "\nCluster task min. 
len.\t\t: $cluster_task_min_len\n" ) 132 | print( "\nCluster task delay\t\t\t: $cluster_task_delay\n" ) 133 | } 134 | 135 | -------------------------------------------------------------------------------- /modules/filetable.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "output.bds" 5 | 6 | 7 | int{} _label_rank 8 | 9 | string{} _filetable_label // key: hierarchy 10 | string{} _filetable_path 11 | int{} _filetable_rank 12 | 13 | string{} _filetable_input 14 | int _curr_rank = 0 15 | 16 | 17 | 18 | void add_label_to_table( string label ) { 19 | _label_rank{ label } = _curr_rank++ 20 | } 21 | 22 | void add_file_to_table( string[] paths, string[] hrchys ) { 23 | for ( int i=0; iExpand all   " + \ 50 | "Collapse all" + \ 51 | " FilesPath " 52 | 53 | _construct_filetable() 54 | 55 | sorted_hrchy := _find_children_and_sort( "" ) 56 | for ( string hrchy : sorted_hrchy ) { 57 | parent := _get_parent( hrchy ) 58 | label := _filetable_label{ hrchy } 59 | path := _filetable_path.hasKey( hrchy ) ? _filetable_path{ hrchy } : "" 60 | if ( parent == "" ) \ 61 | html += " $label "+ html_link_url( path ) +"" 62 | else \ 63 | html += " $label "+ html_link_url( path ) +"" 64 | } 65 | html += "" 66 | html += "
</tbody></table>\n"
 67 | 	return html
 68 | }
 69 | 
 70 | string html_link_url( string path ) {
 71 | 	rel_path := get_rel_path( path )
 72 | 	if ( rel_path.startsWith("./") ) \
 73 | 		return "<a href='$rel_path'>" + rel_path + "</a>"
 74 | 	else \
 75 | 		return rel_path + "<br>
" 76 | } 77 | 78 | void _construct_filetable() { 79 | for( string hrchy : _filetable_input.keys() ) { 80 | _construct_filetable( hrchy, _filetable_input{ hrchy } ) 81 | } 82 | } 83 | 84 | // returns rank of item 85 | void _construct_filetable( string hrchy, string path ) { 86 | if ( hrchy == "" ) return 87 | if ( _filetable_label.hasKey( hrchy ) ) return 88 | 89 | curr := _get_curr( hrchy ) 90 | parent := _get_parent( hrchy ) 91 | _filetable_label{hrchy} = curr //map_label.hasKey(curr) ? map_label{curr} : curr 92 | _filetable_path{hrchy} = path 93 | if ( parent != "" ) _construct_filetable( parent, "" ) 94 | } 95 | 96 | string[] _get_children( string hrchy ) { // not including grand ones 97 | string[] children 98 | 99 | for ( string hrchy_ : _filetable_label.keys() ) { 100 | if ( hrchy == "" ) { 101 | if ( hrchy_.indexOf("/") < 0 ) \ 102 | children.push( hrchy_ ) 103 | } 104 | else if ( hrchy_.toLower().startsWith( hrchy.toLower() + "/" ) ) { 105 | 106 | if ( hrchy_.lastIndexOf("/") <= hrchy.length() ) \ 107 | children.push( hrchy_ ) 108 | } 109 | } 110 | return children 111 | } 112 | 113 | string[] _find_children_and_sort( string hrchy ) { 114 | string[] ret 115 | children := _get_children( hrchy ) 116 | if ( children.size() == 0 ) return ret 117 | 118 | // for bubble sort 119 | int[] ranks 120 | for ( string child : children ) { 121 | curr := _get_curr( child ) 122 | ranks.add( _label_rank.hasKey(curr) ? _label_rank{curr} : 0 ) 123 | } 124 | sorted := _bubble_sort( ranks, children ) 125 | for ( string child : sorted ) { 126 | ret = ret + [child] + _find_children_and_sort( child ) 127 | } 128 | return ret 129 | } 130 | 131 | string _get_parent( string hrchy ) { // "a/b/c" return a/b 132 | return hrchy.substr( 0, hrchy.lastIndexOf("/") ) 133 | } 134 | 135 | string _get_curr( string hrchy ) { // "a/b/c" return c 136 | return hrchy.substr( hrchy.lastIndexOf("/")+1 ) 137 | } 138 | 139 | string[] _bubble_sort( int[] a, string[] s ) { // sorting algorithm 140 | if ( a.size() != s.size() ) error("Array sizes do not match in _bubble_sort()!") 141 | 142 | int temp; //for swapping 143 | string temp2; 144 | n := a.size() 145 | for (int i = 0 ; i < n - 1 ; i++) { 146 | 147 | for (int j = 0 ; j < n - 1 ; j++) { 148 | 149 | if ( a[j] > a[j + 1] ) { 150 | temp = a[j]; 151 | a[j]=a[j + 1]; 152 | a[j + 1] = temp; 153 | 154 | temp2 = s[j]; 155 | s[j]=s[j + 1]; 156 | s[j + 1] = temp2; 157 | } 158 | } 159 | } 160 | return s 161 | } 162 | -------------------------------------------------------------------------------- /modules/git.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "sys.bds" 5 | 6 | 7 | latest_git_commit_sha1 := "" // to show latest git commit sha1/date 8 | latest_git_commit_date := "" 9 | 10 | 11 | init_git() 12 | 13 | 14 | void init_git() { // print latest git commit info 15 | script_file_paths := get_script_file_paths() 16 | for ( string path : script_file_paths ) { 17 | if ( path.exists() && "$path/.git".exists() ) { 18 | 19 | latest_git_commit_sha1 = get_stdout("cd $path; git rev-parse HEAD") 20 | latest_git_commit_date = get_stdout("cd $path; git show -s --format=%cd --date=local $latest_git_commit_sha1") 21 | break; 22 | } 23 | } 24 | 25 | print("\n\n== git info\n") 26 | if ( latest_git_commit_sha1 == "" ) \ 27 | print( "Latest git commit\t\t: not under git control\n" ) 28 | else \ 29 | print( "Latest git commit\t\t: $latest_git_commit_sha1 ($latest_git_commit_date)\n" ) 30 | } 31 | 32 | 
string html_pipeline_version( string git_url_prefix ) {
 33 | 	string html
 34 | 	if ( latest_git_commit_sha1 != "" ) {
 35 | 		html += "<b>Pipeline version</b><br>"
 36 | 		html += "Latest git commit SHA1: "+\
 37 | 			"$latest_git_commit_sha1"+\
 38 | 			" ($latest_git_commit_date)\n"
 39 | 		html += "<br><br><br>
\n" 40 | } 41 | 42 | return html 43 | } 44 | -------------------------------------------------------------------------------- /modules/input.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "input_fastq.bds" 5 | include "input_bam.bds" 6 | include "input_tagalign.bds" 7 | include "input_peak.bds" 8 | 9 | 10 | help == input endedness settings (SE or PE) : 11 | se := false help Singled-ended data set. To specify it for each replicate, '-se[REP_ID]' for exp. reps, '-ctl_se[CTL_ID]' for control. 12 | pe := false help Paired end data set. To specify it for each replicate, '-pe[REP_ID]' for exp. reps, '-ctl_pe[CTL_ID]' for controls. 13 | 14 | default_is_pe := false // default is se 15 | 16 | 17 | init_input() 18 | 19 | void init_input() { 20 | se = get_conf_val_bool( se, ["se"] ) 21 | pe = get_conf_val_bool( pe, ["pe"] ) 22 | } 23 | 24 | //// ctl==0: exp. replicate, ctl==1: control 25 | 26 | void chk_input( bool true_rep, bool no_pseudo_rep ) { 27 | if ( is_input_peak() ) { 28 | 29 | chk_input_peak( true_rep, no_pseudo_rep ) 30 | return 31 | } 32 | print( "\n\n== checking input files ...\n\n" ); 33 | 34 | string[] data_all 35 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 36 | if ( ctl==1 && !ctl_exists() ) continue 37 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 38 | string[] data 39 | 40 | prefix := (ctl==1) ? "Control " : "" 41 | suffix := is_pe( ctl, rep ) ? " (PE)" : " (SE)" 42 | 43 | if ( is_input_fastq( ctl, rep ) ) { 44 | prefix = prefix + "Rep$rep fastq" + suffix 45 | fastqs := get_fastqs( ctl, rep ) 46 | if ( fastqs.size()==0 ) { 47 | data.push( "" ) 48 | } 49 | else { 50 | for ( string fastq : fastqs ) data.push( fastq ) 51 | } 52 | } 53 | else if ( is_input_bam( ctl, rep ) ) { 54 | prefix = prefix +"Rep$rep bam" + suffix 55 | data.push( get_bam( ctl, rep ) ) 56 | } 57 | else if ( is_input_filt_bam( ctl, rep ) ) { 58 | prefix = prefix +"Rep$rep filt_bam" + suffix 59 | data.push( get_filt_bam( ctl, rep ) ) 60 | } 61 | else if ( is_input_tag( ctl, rep ) ) { 62 | prefix = prefix + "Rep$rep tagalign" + suffix 63 | data.push( get_tag( ctl, rep ) ) 64 | } 65 | 66 | print("$prefix :\n") 67 | for ( string s : data ) { 68 | print("\t$s\n") 69 | if ( (s != "") && !path_exists(s) ) error("\t\tFile not found!\n") 70 | } 71 | 72 | // if data is missing 73 | if ( data[0] == "" ) { 74 | if ( (rep>=2) && (ctl==1) ) \ 75 | print( "\tWarning: $prefix missing! using control 1 for calling peaks on replicate $rep\n") 76 | else if ( (rep==2) && (ctl==0) ) \ 77 | print( "\tWarning: $prefix missing! peak will be called for replicate 1 only\n") 78 | else \ 79 | error( "\t$prefix missing!\n") 80 | continue 81 | } 82 | // check any duplicate input filename 83 | for ( string s : data ) { 84 | if ( is_in_array( get_basename( s ), get_basename( data_all ) ) ) \ 85 | error( "\t$prefix has duplicate filename!\n") 86 | } 87 | data_all = merge( data_all, data ) 88 | } 89 | } 90 | } 91 | 92 | string[] get_input_files( int ctl, int rep ) { 93 | string[] empty 94 | 95 | if ( is_input_fastq( ctl, rep ) ) { 96 | return get_fastqs( ctl, rep ) 97 | } 98 | else if ( is_input_bam( ctl, rep ) ) { 99 | bam := get_bam( ctl, rep ) 100 | return bam=="" ? empty : [bam] 101 | } 102 | else if ( is_input_filt_bam( ctl, rep ) ) { 103 | filt_bam := get_filt_bam( ctl, rep ) 104 | return filt_bam=="" ? 
empty : [filt_bam] 105 | } 106 | else if ( is_input_tag( ctl, rep ) ) { 107 | tag := get_tag( ctl, rep ) 108 | return tag=="" ? empty : [tag] 109 | } 110 | else { 111 | return empty 112 | } 113 | } 114 | 115 | string[] get_input_files( int rep ) { 116 | return get_input_files( 0, rep ) 117 | } 118 | 119 | bool input_file_exists( int ctl, int rep ) { 120 | string[] input_files = get_input_files( ctl, rep ) 121 | return input_files.size() > 0 122 | } 123 | 124 | bool input_file_exists( int rep ) { 125 | return input_file_exists( 0, rep ) 126 | } 127 | 128 | int get_num_rep( int ctl ) { 129 | rep := 1 130 | while( get_input_files( ctl, rep ).size() > 0 ) rep++ 131 | 132 | num_rep := rep-1 133 | return num_rep 134 | } 135 | 136 | int get_num_rep() { 137 | return is_input_peak() ? get_num_rep_peak() : get_num_rep( 0 ) 138 | } 139 | 140 | bool is_pe( int ctl, int rep ) { 141 | if ( pe ) return true 142 | if ( se ) return false 143 | 144 | key_pe := ( ctl > 0 ? "ctl_pe" : "pe" ) + rep 145 | key_pe_ctl := "ctl_pe" 146 | key_se := ( ctl > 0 ? "ctl_se" : "se" ) + rep 147 | 148 | if ( cmd_line_arg_has_key( key_pe ) ) { 149 | return true 150 | } 151 | else if ( cmd_line_arg_has_key( key_se ) ) { 152 | return false 153 | } 154 | else if ( ctl==1 && cmd_line_arg_has_key( key_pe_ctl ) ) { 155 | return true 156 | } 157 | else { 158 | if ( conf.hasKey( key_pe ) && parse_bool( conf{ key_pe } ) ) return true 159 | if ( conf.hasKey( key_se ) && parse_bool( conf{ key_se } ) ) return false 160 | if ( ctl==1 && conf.hasKey( key_pe_ctl ) && parse_bool( conf{ key_pe_ctl } ) ) return true 161 | } 162 | 163 | if ( is_input_fastq( ctl, rep ) ) { 164 | fastqs := get_fastq( ctl, rep, 2 ) 165 | return fastqs.size() > 0 166 | } 167 | 168 | if ( default_is_pe ) return true 169 | else return false 170 | } 171 | 172 | bool is_se( int ctl, int rep ) { 173 | return !is_pe( ctl, rep ) 174 | } 175 | 176 | bool is_pe( int rep ) { 177 | return is_pe( 0, rep ) 178 | } 179 | 180 | bool is_se( int rep ) { 181 | return !is_pe( 0, rep ) 182 | } 183 | 184 | bool has_input_fastq() { 185 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 186 | if ( ctl==1 && !ctl_exists() ) continue 187 | 188 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 189 | if ( is_input_fastq( ctl, rep ) ) return true 190 | } 191 | } 192 | return false 193 | } 194 | 195 | bool has_pe_input_fastq() { 196 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 197 | if ( ctl==1 && !ctl_exists() ) continue 198 | 199 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 200 | if ( is_input_fastq( ctl, rep ) && is_pe( ctl, rep ) ) return true 201 | } 202 | } 203 | return false 204 | } 205 | 206 | bool has_pe_input_tag( int ctl ) { 207 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 208 | 209 | if ( is_input_tag( ctl, rep ) && is_pe( ctl, rep ) ) return true 210 | } 211 | return false 212 | } 213 | 214 | bool has_pe_input_tag() { 215 | return has_pe_input_tag( 0 ) 216 | } 217 | 218 | bool has_pe() { 219 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 220 | if ( ctl==1 && !ctl_exists() ) continue 221 | 222 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 223 | if ( is_pe( ctl, rep ) ) return true 224 | } 225 | } 226 | return false 227 | } 228 | 229 | bool has_se() { 230 | for ( int ctl=0; ctl <= 1; ctl++) { // iterate through replicates (0: not control, 1: control) 231 | if ( ctl==1 && !ctl_exists() ) continue 232 | 
233 | for ( int rep=1; rep <= get_num_rep( ctl ); rep++) { 234 | if ( !is_pe( ctl, rep ) ) return true 235 | } 236 | } 237 | return false 238 | } 239 | 240 | bool ctl_exists() { 241 | return input_file_exists( 1, 1 ) 242 | } 243 | 244 | string get_long_group_name( int ctl, int rep ) { 245 | return ( (ctl>0) ? "Control " : "Replicate ") + rep 246 | } 247 | 248 | string get_long_group_name( int rep ) { 249 | return "Replicate "+ rep 250 | } 251 | 252 | string get_group_name( int ctl, int rep ) { 253 | return ( (ctl>0) ? "ctl" : "rep") + rep 254 | } 255 | 256 | string get_group_name( int rep ) { 257 | return "rep" + rep 258 | } 259 | -------------------------------------------------------------------------------- /modules/input_adapter.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == adapter sequence definition : 8 | help Single-ended : For replicate '-adapter[REP_ID]' 9 | help Paired end : For replicate '-adapter[REP_ID]_[PAIR_ID]' 10 | 11 | 12 | string{} get_adapter( int ctl, int rep, int p ) { 13 | // allow up to 99 adapters to be pooled (i.e. adapter1 adapter1:2 adapter1:3, ...) 14 | string{} result 15 | for ( int i=0; i<100; i++ ) { 16 | suffix := i ? ":$i" : "" 17 | key_wo_p := ( ctl > 0 ? "ctl_adapter" : "adapter" ) + "_rep" + rep 18 | key := key_wo_p + "_p" + p + suffix 19 | key_wo_p += suffix 20 | 21 | key_wo_p2 := ( ctl > 0 ? "ctl_adapter" : "adapter" ) + rep 22 | key2 := key_wo_p2 + "_" + p + suffix 23 | key_wo_p2 += suffix 24 | 25 | key_wo_p3 := ( ctl > 0 ? "ctl_adapter" : "adapter" ) 26 | key3 := key_wo_p3 + "_" + p + suffix 27 | key_wo_p3 += suffix 28 | 29 | formatted_i := format_digit(i,2) 30 | if ( (p==1) && cmd_line_arg_has_key( key_wo_p ) ) { 31 | result{formatted_i}= get_cmd_line_arg_val( key_wo_p ) 32 | } 33 | else if ( (p==1) && cmd_line_arg_has_key( key_wo_p2 ) ) { 34 | result{formatted_i}= get_cmd_line_arg_val( key_wo_p2 ) 35 | } 36 | else if ( (p==1) && (rep==1) && cmd_line_arg_has_key( key_wo_p3 ) ) { 37 | result{formatted_i}= get_cmd_line_arg_val( key_wo_p3 ) 38 | } 39 | else if ( cmd_line_arg_has_key( key ) ) { 40 | result{formatted_i}= get_cmd_line_arg_val( key ) 41 | } 42 | else if ( cmd_line_arg_has_key( key2 ) ) { 43 | result{formatted_i}= get_cmd_line_arg_val( key2 ) 44 | } 45 | else if ( (rep==1) && cmd_line_arg_has_key( key3 ) ) { 46 | result{formatted_i}= get_cmd_line_arg_val( key3 ) 47 | } 48 | else if ( (p==1) && conf.hasKey( key_wo_p ) ) { 49 | result{formatted_i}= conf{ key_wo_p } 50 | } 51 | else if ( (p==1) && conf.hasKey( key_wo_p2 ) ) { 52 | result{formatted_i}= conf{ key_wo_p2 } 53 | } 54 | else if ( (p==1) && (rep==1) && conf.hasKey( key_wo_p3 ) ) { 55 | result{formatted_i}= conf{ key_wo_p3 } 56 | } 57 | else if ( conf.hasKey( key ) ) { 58 | result{formatted_i}= conf{ key } 59 | } 60 | else if ( conf.hasKey( key2 ) ) { 61 | result{formatted_i}= conf{ key2 } 62 | } 63 | else if ( (rep==1) && conf.hasKey( key3 ) ) { 64 | result{formatted_i}= conf{ key3 } 65 | } 66 | } 67 | return result 68 | } 69 | 70 | int get_num_rep_adapter( int ctl ) { 71 | rep := 1 72 | while( get_adapter( ctl, rep, 1 ).size() > 0 ) rep++ 73 | 74 | num_rep := rep-1 75 | return num_rep 76 | } 77 | 78 | int get_num_rep_adapter() { 79 | return get_num_rep_adapter( 0 ) 80 | } 81 | -------------------------------------------------------------------------------- /modules/input_bam.bds: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == bam input (raw or filtered) definition : 8 | help Raw bam : For replicate '-bam[REP_ID]', For control '-ctl_bam[REP_ID]'. 9 | help Filtered bam : For replicate '-filt_bam[REP_ID]', For control '-ctl_filt_bam[REP_ID]'. 10 | 11 | 12 | string get_bam( int ctl, int rep ) { 13 | 14 | key := ( ctl > 0 ? "ctl_bam" : "bam" ) + "_rep" + rep 15 | key2 := ( ctl > 0 ? "ctl_bam" : "bam" ) + rep 16 | key3 := ( ctl > 0 ? "ctl_bam" : "bam" ) 17 | 18 | if ( cmd_line_arg_has_key( key ) ) { 19 | return get_path( get_cmd_line_arg_val( key ) ) 20 | } 21 | else if ( cmd_line_arg_has_key( key2 ) ) { 22 | return get_path( get_cmd_line_arg_val( key2 ) ) 23 | } 24 | else if ( (rep==1) && cmd_line_arg_has_key( key3 ) ) { 25 | return get_path( get_cmd_line_arg_val( key3 ) ) 26 | } 27 | else if ( conf.hasKey( key ) ) { 28 | return get_path( conf{ key } ) 29 | } 30 | else if ( conf.hasKey( key2 ) ) { 31 | return get_path( conf{ key2 } ) 32 | } 33 | else if ( (rep==1) && conf.hasKey( key3 ) ) { 34 | return get_path( conf{ key3 } ) 35 | } 36 | return "" 37 | } 38 | 39 | string get_bam( int rep ) { 40 | 41 | return get_bam( 0, rep ) 42 | } 43 | 44 | string get_filt_bam( int ctl, int rep ) { 45 | 46 | key := ( ctl > 0 ? "ctl_filt_bam" : "filt_bam" ) + "_rep" + rep 47 | key2 := ( ctl > 0 ? "ctl_filt_bam" : "filt_bam" ) + rep 48 | key3 := ( ctl > 0 ? "ctl_filt_bam" : "filt_bam" ) 49 | 50 | if ( cmd_line_arg_has_key( key ) ) { 51 | return get_path( get_cmd_line_arg_val( key ) ) 52 | } 53 | else if ( cmd_line_arg_has_key( key2 ) ) { 54 | return get_path( get_cmd_line_arg_val( key2 ) ) 55 | } 56 | else if ( (rep==1) && cmd_line_arg_has_key( key3 ) ) { 57 | return get_path( get_cmd_line_arg_val( key3 ) ) 58 | } 59 | else if ( conf.hasKey( key ) ) { 60 | return get_path( conf{ key } ) 61 | } 62 | else if ( conf.hasKey( key2 ) ) { 63 | return get_path( conf{ key2 } ) 64 | } 65 | else if ( (rep==1) && conf.hasKey( key3 ) ) { 66 | return get_path( conf{ key3 } ) 67 | } 68 | return "" 69 | } 70 | 71 | string get_filt_bam( int rep ) { 72 | 73 | return get_filt_bam( 0, rep ) 74 | } 75 | 76 | bool is_input_bam( int ctl, int rep ) { 77 | 78 | return get_bam( ctl, rep ) != "" 79 | } 80 | 81 | bool is_input_bam( int rep ) { 82 | 83 | return is_input_bam( 0, rep ) 84 | } 85 | 86 | bool is_input_filt_bam( int ctl, int rep ) { 87 | 88 | return get_filt_bam( ctl, rep ) != "" 89 | } 90 | 91 | bool is_input_filt_bam( int rep ) { 92 | 93 | return is_input_filt_bam( 0, rep ) 94 | } 95 | -------------------------------------------------------------------------------- /modules/input_fastq.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == fastq input definition : 8 | help Single-ended : For replicate '-fastq[REP_ID]', For control '-ctl_fastq[REP_ID]' 9 | help Paired end : For replicate '-fastq[REP_ID]_[PAIR_ID]', For control '-ctl_fastq[REP_ID]_[PAIR_ID]' 10 | 11 | string{} get_fastq( int ctl, int rep, int p ) { 12 | // allow up to 99 fastqs to be pooled (i.e. fastq1 fastq1:2 fastq1:3, ...) 13 | string{} result 14 | for ( int i=0; i<100; i++ ) { 15 | suffix := i ? ":$i" : "" 16 | key_wo_p := ( ctl > 0 ? "ctl_fastq" : "fastq" ) + "_rep" + rep 17 | key := key_wo_p + "_p" + p + suffix 18 | key_wo_p += suffix 19 | 20 | key_wo_p2 := ( ctl > 0 ? 
"ctl_fastq" : "fastq" ) + rep 21 | key2 := key_wo_p2 + "_" + p + suffix 22 | key_wo_p2 += suffix 23 | 24 | key_wo_p3 := ( ctl > 0 ? "ctl_fastq" : "fastq" ) 25 | key3 := key_wo_p3 + "_" + p + suffix 26 | key_wo_p3 += suffix 27 | 28 | formatted_i := format_digit(i,2) 29 | if ( (p==1) && cmd_line_arg_has_key( key_wo_p ) ) { 30 | result{formatted_i}= get_path( get_cmd_line_arg_val( key_wo_p ) ) 31 | } 32 | else if ( (p==1) && cmd_line_arg_has_key( key_wo_p2 ) ) { 33 | result{formatted_i}= get_path( get_cmd_line_arg_val( key_wo_p2 ) ) 34 | } 35 | else if ( (p==1) && (rep==1) && cmd_line_arg_has_key( key_wo_p3 ) ) { 36 | result{formatted_i}= get_path( get_cmd_line_arg_val( key_wo_p3 ) ) 37 | } 38 | else if ( cmd_line_arg_has_key( key ) ) { 39 | result{formatted_i}= get_path( get_cmd_line_arg_val( key ) ) 40 | } 41 | else if ( cmd_line_arg_has_key( key2 ) ) { 42 | result{formatted_i}= get_path( get_cmd_line_arg_val( key2 ) ) 43 | } 44 | else if ( (rep==1) && cmd_line_arg_has_key( key3 ) ) { 45 | result{formatted_i}= get_path( get_cmd_line_arg_val( key3 ) ) 46 | } 47 | else if ( (p==1) && conf.hasKey( key_wo_p ) ) { 48 | result{formatted_i}= get_path( conf{ key_wo_p } ) 49 | } 50 | else if ( (p==1) && conf.hasKey( key_wo_p2 ) ) { 51 | result{formatted_i}= get_path( conf{ key_wo_p2 } ) 52 | } 53 | else if ( (p==1) && (rep==1) && conf.hasKey( key_wo_p3 ) ) { 54 | result{formatted_i}= get_path( conf{ key_wo_p3 } ) 55 | } 56 | else if ( conf.hasKey( key ) ) { 57 | result{formatted_i}= get_path( conf{ key } ) 58 | } 59 | else if ( conf.hasKey( key2 ) ) { 60 | result{formatted_i}= get_path( conf{ key2 } ) 61 | } 62 | else if ( (rep==1) && conf.hasKey( key3 ) ) { 63 | result{formatted_i}= get_path( conf{ key3 } ) 64 | } 65 | } 66 | return result 67 | } 68 | 69 | string[] get_fastqs( int ctl, int rep ) { 70 | string[] result 71 | for (int p=1;p<=2;p++) { 72 | for ( string fastq : get_fastq( ctl, rep, p ) ) { 73 | result.add( fastq ) 74 | } 75 | } 76 | return result 77 | } 78 | 79 | string[] get_fastqs( int rep ) { 80 | return get_fastqs( 0, rep ) 81 | } 82 | 83 | bool is_input_fastq( int ctl, int rep ) { 84 | fastqs := get_fastq( ctl, rep, 1 ) 85 | if ( fastqs.size() > 0 ) return true 86 | return false 87 | } 88 | 89 | bool is_input_fastq( int rep ) { 90 | return is_input_fastq( 0, rep ) 91 | } 92 | -------------------------------------------------------------------------------- /modules/input_peak.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == narrow peak input definition : 8 | help For true replicates, use '-peak1' and '-peak2', 9 | help For pooled replicates, use '-peak_pooled', 10 | help For two PR (self-pseudo-replicates), use '-peak[REP_ID]_pr1' and '-peak[REP_ID]_pr2' 11 | help For two PPR (pooled pseudo-replicates), use '-peak_ppr1' and '-peak_ppr2' 12 | 13 | 14 | void chk_input_peak( bool true_rep, bool no_pseudo_rep ) { 15 | 16 | if ( !is_input_peak() ) return // read peaks here 17 | 18 | for ( int rep=0; rep<=get_num_rep_peak(); rep++) { // rep==0 : pooled 19 | if ( get_num_rep_peak() == 1 && rep==0 ) continue // if only one replicate, skip reading pooled rep 20 | 21 | for (int pse=0; pse<=2; pse++) { // pse(pseudo)==0 : true rep, pse==1,2 : self-pseudo rep 1,2 22 | if ( true_rep && pse > 0 ) continue 23 | if ( no_pseudo_rep && rep != 0 && pse > 0 ) continue 24 | 25 | peak_ := get_peak(rep,pse) 26 | suffix1 := rep==0 ? 
"replicate" : "replicate $rep" 27 | suffix2 := rep==0 ? "pseudo-replicate $pse" : "pseudo-replicate $pse for replicate $rep" 28 | prefix := (rep==0 ? "pooled " : "") + (pse==0 ? suffix1 : suffix2) 29 | 30 | print( "$prefix: \n\t$peak_"+"\n") 31 | if ( !path_exists( peak_ ) ) error("\t\tFile not found!\n") 32 | } 33 | } 34 | } 35 | 36 | string get_peak( int rep, int pse ) { // rep==0 : pooled peak, pse==0 : true replicate 37 | 38 | if ( pse > 2 ) error ("\nget_peak() : pse should not be larger than 2!") 39 | 40 | string key, key2 41 | if ( rep == 0 ) { 42 | key = ( pse == 0 ? "peak_pooled" : ("peak_ppr" + pse) ) 43 | key2 = key 44 | } 45 | else { 46 | key = "peak" + rep + ( pse == 0 ? "" : ("_pr" + pse) ) 47 | key2 = "peak_rep" + rep + ( pse == 0 ? "" : ("_pr" + pse) ) 48 | } 49 | 50 | if ( cmd_line_arg_has_key( key ) ) { 51 | return get_path( get_cmd_line_arg_val( key ) ) 52 | } 53 | else if ( cmd_line_arg_has_key( key2 ) ) { 54 | return get_path( get_cmd_line_arg_val( key2 ) ) 55 | } 56 | else if ( conf.hasKey( key ) ) { 57 | return get_path( conf{ key } ) 58 | } 59 | else if ( conf.hasKey( key2 ) ) { 60 | return get_path( conf{ key2 } ) 61 | } 62 | 63 | return "" 64 | } 65 | 66 | bool is_input_peak() { 67 | 68 | return get_peak( 1, 0 ) != "" 69 | } 70 | 71 | int get_num_rep_peak() { 72 | 73 | rep := 1 74 | 75 | while( get_peak( rep, 0 ) != "" ) rep++ 76 | 77 | return rep-1 78 | } 79 | -------------------------------------------------------------------------------- /modules/input_tagalign.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == tagalign input definition : 8 | help For replicate '-tag[REP_ID]', For control '-ctl_tag[REP_ID]'. 9 | 10 | 11 | string get_tag( int ctl, int rep ) { 12 | 13 | key := ( ctl > 0 ? "ctl_tag" : "tag" ) + "_rep" + rep 14 | key2 := ( ctl > 0 ? "ctl_tagalign" : "tagalign" ) + "_rep" + rep 15 | 16 | key3 := ( ctl > 0 ? "ctl_tag" : "tag" ) + rep 17 | key4 := ( ctl > 0 ? "ctl_tagalign" : "tagalign" ) + rep 18 | 19 | key5 := ( ctl > 0 ? "ctl_tag" : "tag" ) 20 | key6 := ( ctl > 0 ? 
"ctl_tagalign" : "tagalign" ) 21 | 22 | if ( cmd_line_arg_has_key( key ) ) { 23 | return get_path( get_cmd_line_arg_val( key ) ) 24 | } 25 | else if ( cmd_line_arg_has_key( key2 ) ) { 26 | return get_path( get_cmd_line_arg_val( key2 ) ) 27 | } 28 | else if ( cmd_line_arg_has_key( key3 ) ) { 29 | return get_path( get_cmd_line_arg_val( key3 ) ) 30 | } 31 | else if ( cmd_line_arg_has_key( key4 ) ) { 32 | return get_path( get_cmd_line_arg_val( key4 ) ) 33 | } 34 | else if ( (rep==1) && cmd_line_arg_has_key( key5 ) ) { 35 | return get_path( get_cmd_line_arg_val( key5 ) ) 36 | } 37 | else if ( (rep==1) && cmd_line_arg_has_key( key6 ) ) { 38 | return get_path( get_cmd_line_arg_val( key6 ) ) 39 | } 40 | else if ( conf.hasKey( key ) ) { 41 | return get_path( conf{ key } ) 42 | } 43 | else if ( conf.hasKey( key2 ) ) { 44 | return get_path( conf{ key2 } ) 45 | } 46 | else if ( conf.hasKey( key3 ) ) { 47 | return get_path( conf{ key3 } ) 48 | } 49 | else if ( conf.hasKey( key4 ) ) { 50 | return get_path( conf{ key4 } ) 51 | } 52 | else if ( (rep==1) && conf.hasKey( key5 ) ) { 53 | return get_path( conf{ key5 } ) 54 | } 55 | else if ( (rep==1) && conf.hasKey( key6 ) ) { 56 | return get_path( conf{ key6 } ) 57 | } 58 | return "" 59 | } 60 | 61 | string get_tag( int rep ) { 62 | 63 | return get_tag( 0, rep ) 64 | } 65 | 66 | bool is_input_tag( int ctl, int rep ) { 67 | 68 | return get_tag( ctl, rep ) != "" 69 | } 70 | 71 | bool is_input_tag( int rep ) { 72 | 73 | return is_input_tag( 0, rep ) 74 | } 75 | -------------------------------------------------------------------------------- /modules/module_template.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "parallel.bds" 5 | include "report.bds" 6 | -------------------------------------------------------------------------------- /modules/output.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == output/title settings 8 | out_dir := "out" help Output directory (default: out). 9 | title := "" help Prefix for HTML report and outputs without given prefix. 10 | 11 | 12 | init_output() 13 | 14 | 15 | void init_output() { 16 | out_dir = get_conf_val( out_dir, ["out_dir"] ) 17 | title = get_conf_val( title, ["title"] ) 18 | 19 | if ( title == "" ) { // if title is empty, use directory name as a title 20 | dirname := get_basename( get_path(out_dir) ) 21 | if ( dirname == "out" ) { // if output folder is default one (out), then use parent dir. name 22 | dirname = get_basename( rm_str_at_end( get_path(out_dir), "/out" ) ) 23 | } 24 | title = dirname 25 | } 26 | if ( !is_cmd_line_arg_empty() ) out_dir = mkdir( out_dir ) // create output directory and get absolute path for it 27 | title = replace_illegal_chrs( title ) 28 | 29 | print("\n\n== output directory/title info\n") 30 | print( "Output dir.\t\t\t: $out_dir\n" ) 31 | print( "Title (prefix)\t\t\t: $title\n" ) 32 | } 33 | 34 | string get_rel_path( string path ) { // get relative path according to $out_dir 35 | rel_path := path.path().replace( out_dir.path(), "." 
) 36 | if ( rel_path == path.path() ) return path //"" 37 | else return rel_path 38 | } 39 | -------------------------------------------------------------------------------- /modules/parallel.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == parallelization settings 8 | no_par := false help Serialize all tasks (individual tasks can still use multiple threads up to '-nth'). 9 | nth := 8 help Maximum # threads for a pipeline. (default: 8). 10 | 11 | string[] _tids_all // array of task ids currently running 12 | int{} _nth_tasks // key: task id, value: # of threads for the task 13 | 14 | 15 | init_parallel() 16 | 17 | 18 | void init_parallel() { 19 | no_par = get_conf_val_bool( no_par, ["no_par"] ) 20 | nth = get_conf_val_int( nth, ["nth"] ) 21 | 22 | if ( nth > 32 ) error("Maximum # threads (-nth) for a pipeline should not exceed 32!") 23 | if ( nth <= 1 ) { 24 | print("\nWarning: Maximum # threads (-nth) for a pipeline is <= 1. Turning off parallelization... (-no_par)") 25 | nth = 1 26 | no_par = true 27 | } 28 | 29 | // pre-declared BDS variable 30 | cpus = -1 // With cpus==-1, BDS does not pass number of threads to cluster engine (SGE, SLURM, ...), which means single-threaded 31 | 32 | print("\n\n== parallelization info\n") 33 | print( "No parallel jobs\t\t: $no_par\n" ) 34 | print( "Maximum # threads \t\t: $nth\n" ) 35 | } 36 | 37 | void wait_par( int nth_task ) { 38 | if ( nth_task < 1 ) nth_task = 1 39 | 40 | while ( true ) { 41 | sleep( rand()*1.0 + 0.5 ) 42 | _tids_all_ := _tids_all // make dummy array for thread safety 43 | 44 | string[] tids_running 45 | int nth_running 46 | for ( string tid : _tids_all_ ) { // get total # threads for currently running tasks, and find the oldest task 47 | if ( !tid.isDone() ) { 48 | tids_running.add( tid ) 49 | nth_running = nth_running + _nth_tasks{tid} 50 | } 51 | } 52 | 53 | if ( tids_running.size() == 0 ) { 54 | break 55 | } 56 | else if ( no_par || (nth_running+nth_task) > nth ) { 57 | loop_cnt := 0 58 | while( true ) { // wait until one of running tasks finishes 59 | break_loop := false 60 | for ( string tid : tids_running ) { 61 | if ( tid.isDone() ) { 62 | break_loop = true 63 | break 64 | } 65 | } 66 | if ( break_loop ) break 67 | sleep( rand() + 0.5 ) 68 | } 69 | sleep( rand()*1.0 + 0.5 ) 70 | } 71 | else { 72 | break 73 | } 74 | } 75 | } 76 | 77 | void register_par( string tid, int nth_task ) { 78 | if ( nth_task < 1 ) nth_task = 1 79 | if ( tid == "" ) return 80 | 81 | _tids_all.add(tid) 82 | _nth_tasks{tid} = nth_task 83 | } 84 | 85 | int{} distribute_nonzero( int n, int{} weight ) { // distribute integer n according to weight 86 | int{} ret 87 | 88 | int sum 89 | for ( int w : weight ) sum += w 90 | if ( sum == 0 ) error("distribute_nth: sum is zero. check if input file size is 0?\n") 91 | for ( string key : weight.keys() ) { 92 | w := weight{key} 93 | ret{key} = (n*w)/sum 94 | 95 | if ( ret{key} == 0 ) ret{key} = 1 96 | } 97 | 98 | while( true ) { 99 | int sum2 100 | for ( string key : weight.keys() ) sum2 += ret{key} 101 | if ( n > sum2 ) { 102 | string key_to_plus 103 | int max_diff = 0 104 | for ( string key : weight.keys() ) { 105 | diff := n*weight{key}-ret{key}*sum 106 | if ( diff > max_diff ) { 107 | key_to_plus = key 108 | max_diff = diff 109 | } 110 | } 111 | ret{key_to_plus}++ 112 | } 113 | else { 114 | break 115 | } 116 | } 117 | 118 | print("Distributing $n to ... 
\n") 119 | print(ret) 120 | print("\n") 121 | return ret 122 | } 123 | 124 | int[] distribute_nonzero( int n, int[] weight ) { // distribute integer n according to weight 125 | int[] ret 126 | 127 | int sum 128 | for ( int w : weight ) sum += w 129 | if ( sum == 0 ) error("distribute_nth: sum is zero. check if input file size is 0?\n") 130 | for ( int i=0; i sum2 ) { 140 | int id_to_plus 141 | int max_diff = 0 142 | for ( int i=0; i max_diff ) { 145 | id_to_plus = i 146 | max_diff = diff 147 | } 148 | } 149 | ret[id_to_plus]++ 150 | } 151 | else { 152 | break 153 | } 154 | } 155 | 156 | print("Distributing $n to ... \n") 157 | print(ret) 158 | print("\n") 159 | return ret 160 | } 161 | -------------------------------------------------------------------------------- /modules/pipeline_template.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "git.bds" 5 | include "parallel.bds" 6 | include "report.bds" 7 | -------------------------------------------------------------------------------- /modules/postalign_bed.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | // has functions related to tagalign, and helps getting tagalign from configruation file or command line argument 9 | 10 | help == postalign bed/tagalign settings 11 | mem_shuf := "12G" help Max. memory for UNIX shuf (default: 12G). 12 | no_random_source := false help Disable --random-source for UNIX shuf. Hot fix for end of file error. 13 | 14 | 15 | init_postalign_bed() 16 | 17 | 18 | void init_postalign_bed() { 19 | 20 | // fraglen0 = get_conf_val_bool( fraglen0, ["fraglen0"] ) 21 | mem_shuf = get_conf_val( mem_shuf, ["mem_shuf"] ) 22 | no_random_source = get_conf_val_bool( no_random_source, ["no_random_source"] ) 23 | 24 | print("\n\n== postalign bed/tagalign settings\n") 25 | print( "Max. memory for UNIX shuf\t\t\t: $mem_shuf\n") 26 | print( "No --random-source for UNIX shuf\t\t: $no_random_source\n") 27 | } 28 | 29 | string subsample_tag( string tag, int nlines, bool non_mito, string o_dir, string group ) { 30 | 31 | prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir ) 32 | nreads_per_mill := metric_prefix( nlines ) 33 | 34 | subsampled_tag := "$prefix."+(non_mito?"no_chrM.":"")+"$nreads_per_mill.tagAlign.gz" 35 | non_mito_param := non_mito ? "grep -v \"chrM\" | " : "" 36 | random_source_param := no_random_source ? 
"" : "--random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null)" 37 | 38 | in := [ tag ] 39 | out := subsampled_tag 40 | 41 | taskName:= "subsample_tag " + group 42 | mem := get_res_mem(mem_shuf,1) 43 | 44 | wait_par( cpus ) 45 | 46 | tid := task( out<-in ) { 47 | 48 | sys $shcmd_init 49 | 50 | //# Subsample tagAlign file 51 | sys zcat $tag | \ 52 | $non_mito_param shuf -n $nlines $random_source_param | gzip -nc > $subsampled_tag 53 | 54 | sys $shcmd_finalize 55 | } 56 | 57 | register_par( tid, cpus ) 58 | 59 | add_task_to_graph( in, out, group ) 60 | 61 | return out 62 | } 63 | 64 | string subsample_tag_PE( string tag, int nlines, bool non_mito, string o_dir, string group ) { 65 | 66 | prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir ) 67 | nreads_per_mill := metric_prefix( nlines ) 68 | 69 | subsampled_tag := "$prefix."+(non_mito?"no_chrM.":"")+"$nreads_per_mill.tagAlign.gz" 70 | non_mito_param := non_mito ? "grep -v \"chrM\" | " : "" 71 | random_source_param := no_random_source ? "" : "--random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null)" 72 | 73 | joined := "$prefix.joined" // temporary file 74 | joined_subsampled := "$prefix.joined.subsampled" // temporary file 75 | 76 | in := [ tag ] 77 | out := subsampled_tag 78 | 79 | taskName:= "subsample_tag_PE " + group 80 | mem := get_res_mem(mem_shuf,1) 81 | 82 | wait_par( cpus ) 83 | 84 | tid := task( out<-in ) { 85 | 86 | sys $shcmd_init 87 | 88 | // join consecutive two lines into one 89 | sys zcat $tag | sed 'N;s/\n/\t/' > $joined 90 | 91 | //# Shuffle and split temporary combined file into 2 equal parts 92 | //# Will produce $PR_PREFIX00 and $PR_PREFIX01 93 | sys cat $joined | $non_mito_param shuf -n $nlines $random_source_param > $joined_subsampled 94 | 95 | //# Subsample tagAlign file 96 | sys awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\t%s\t%s\t%s\n%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' $joined_subsampled | \ 97 | gzip -nc > $subsampled_tag 98 | 99 | sys rm -f $joined $joined_subsampled 100 | 101 | sys $shcmd_finalize 102 | } 103 | 104 | register_par( tid, cpus ) 105 | 106 | add_task_to_graph( in, out, group ) 107 | 108 | return out 109 | } 110 | 111 | // Adjusts the read-ends in a read BED by Tn5 offsets 112 | string tn5_shift_tag( string tag, string o_dir, string group ) { 113 | 114 | prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir ) 115 | //shifted_tag := "$prefix.shifted.tagAlign.gz" 116 | shifted_tag := "$prefix.tn5.tagAlign.gz" 117 | 118 | in := [ tag ] 119 | out := shifted_tag 120 | 121 | taskName:= "shift_tag " + group 122 | 123 | wait_par( cpus ) 124 | 125 | tid := task( out<-in ) { 126 | 127 | sys $shcmd_init 128 | 129 | sys zcat $tag | awk -F '\t' 'BEGIN {OFS = FS}{ if ($6 == "+") {$2 = $2 + 4} else if ($6 == "-") {$3 = $3 - 5} print $0}' | gzip -nc > $shifted_tag 130 | 131 | sys $shcmd_finalize 132 | } 133 | 134 | register_par( tid, cpus ) 135 | 136 | add_task_to_graph( in, out, group ) 137 | 138 | return out 139 | } 140 | 141 | // make spr(self_pseudo_replicate) 142 | string[] spr( string tag, string pr1_o_dir, string pr2_o_dir, string group ) { 143 | 144 | prefix_pr1 := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), pr1_o_dir ) 145 | prefix_pr2 := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), pr2_o_dir ) 146 | tag_pr1 := "$prefix_pr1.pr1.tagAlign.gz" 147 | tag_pr2 := "$prefix_pr2.pr2.tagAlign.gz" 148 | random_source_param := no_random_source ? 
"" : "--random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null)" 149 | 150 | in := [ tag ] 151 | out := [ tag_pr1, tag_pr2 ] 152 | 153 | taskName:= "spr " + group 154 | mem := get_res_mem(mem_shuf,1) 155 | 156 | wait_par( cpus ) 157 | 158 | tid := task( out<-in ) { 159 | 160 | sys $shcmd_init 161 | 162 | //# Get total number of read pairs 163 | sys nlines=$( zcat $tag | wc -l ) 164 | sys nlines=$(( (nlines + 1) / 2 )) 165 | 166 | //# Shuffle and split BEDPE file into 2 equal parts 167 | //# Will produce $PR_PREFIX00 and $PR_PREFIX01 168 | sys zcat $tag | shuf $random_source_param | split -d -l $((nlines)) - $prefix_pr1. 169 | 170 | //# Convert read pairs to reads into standard tagAlign file 171 | sys gzip -nc $prefix_pr1.00 > $tag_pr1 172 | sys rm -f $prefix_pr1.00 173 | sys gzip -nc $prefix_pr1.01 > $tag_pr2 174 | sys rm -f $prefix_pr1.01 175 | 176 | sys $shcmd_finalize 177 | } 178 | 179 | register_par( tid, cpus ) 180 | 181 | add_task_to_graph( in, out, ["$group PR 1", "$group PR 2"] ) 182 | 183 | return out 184 | } 185 | 186 | string[] spr_tag_PE( string tag, string pr1_o_dir, string pr2_o_dir, string group ) { 187 | 188 | prefix_pr1 := replace_dir( rm_ext( tag, ["bed","tagAlign"] ), pr1_o_dir ) 189 | prefix_pr2 := replace_dir( rm_ext( tag, ["bed","tagAlign"] ), pr2_o_dir ) 190 | 191 | joined := "$prefix_pr1.joined" // temporary file 192 | 193 | tag_pr1 := "$prefix_pr1.pr1.tagAlign.gz" 194 | tag_pr2 := "$prefix_pr2.pr2.tagAlign.gz" 195 | random_source_param := no_random_source ? "" : "--random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null)" 196 | 197 | in := [ tag ] 198 | out := [ tag_pr1, tag_pr2 ] 199 | 200 | taskName:= "spr_tag_PE " + group 201 | mem := get_res_mem(mem_shuf,1) 202 | 203 | wait_par( cpus ) 204 | 205 | tid := task( out<-in ) { 206 | 207 | sys $shcmd_init 208 | 209 | // join consecutive two lines into one 210 | sys zcat $tag | sed 'N;s/\n/\t/' > $joined 211 | 212 | //# Get total number of read pairs 213 | sys nlines=$( cat $joined | wc -l ) 214 | sys nlines=$(( (nlines + 1) / 2 )) 215 | 216 | //# Shuffle and split temporary combined file into 2 equal parts 217 | //# Will produce $PR_PREFIX00 and $PR_PREFIX01 218 | sys cat $joined | shuf $random_source_param | split -d -l $((nlines)) - $prefix_pr1. 
219 | 220 | //# Convert read pairs to reads into standard tagAlign file 221 | sys awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\t%s\t%s\t%s\n%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' "$prefix_pr1.00" | \ 222 | gzip -nc > $tag_pr1 223 | sys rm -f $prefix_pr1.00 224 | sys awk 'BEGIN{OFS="\t"}{printf "%s\t%s\t%s\t%s\t%s\t%s\n%s\t%s\t%s\t%s\t%s\t%s\n",$1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' "$prefix_pr1.01" | \ 225 | gzip -nc > $tag_pr2 226 | sys rm -f $prefix_pr1.01 227 | 228 | sys rm -f $joined 229 | 230 | sys $shcmd_finalize 231 | } 232 | 233 | register_par( tid, cpus ) 234 | 235 | add_task_to_graph( in, out, ["$group PR 1", "$group PR 2"] ) 236 | 237 | return out 238 | } 239 | 240 | string pool_tag( string tag1, string tag2, string o_dir, string group ) { 241 | // LINUX has limit on filename length (255), make it as shorter as possible 242 | string tag_pooled 243 | if ( get_basename(tag1).length() < 50 && get_basename(tag2).length() < 50 ) { 244 | prefix := "$o_dir/" + merge_basename_wo_ext( tag1, tag2, ["tagAlign","tag","bed"] ) 245 | tag_pooled = "$prefix.tagAlign.gz" 246 | } 247 | else { 248 | prefix := replace_dir( rm_ext( tag1, ["bed","tagAlign"] ), o_dir ) 249 | tag_pooled = "$prefix"+"_pooled.tagAlign.gz" 250 | } 251 | 252 | in := [ tag1, tag2 ] 253 | out := tag_pooled 254 | 255 | taskName:= "pool_tag " + group 256 | 257 | wait_par( cpus ) 258 | 259 | tid := task( out<-in ) { 260 | 261 | sys $shcmd_init 262 | sys zcat $tag1 $tag2 | gzip -nc > $tag_pooled 263 | 264 | sys $shcmd_finalize 265 | } 266 | 267 | register_par( tid, cpus ) 268 | 269 | add_task_to_graph( in, out, group ) 270 | 271 | return out 272 | } 273 | 274 | string pool_tag( string[] tags, string o_dir, string group ) { 275 | // LINUX has limit on filename length (255), make it as short as possible 276 | string tag_pooled 277 | if ( tags.size() <= 2 && get_basename(tags[0]).length() < 50 && get_basename(tags[1]).length() < 50 ) { 278 | prefix := "$o_dir/" + merge_basename_wo_ext( tags[0], tags[1], ["tagAlign","tag","bed"] ) 279 | tag_pooled = "$prefix.tagAlign.gz" 280 | } 281 | else { 282 | prefix := replace_dir( rm_ext( tags[0], ["bed","tagAlign"] ), o_dir ) 283 | tag_pooled = "$prefix"+"_pooled.tagAlign.gz" 284 | } 285 | tags_str := array_to_str( tags, " " ) // join 286 | 287 | in := tags 288 | out := tag_pooled 289 | 290 | taskName:= "pool_tag " + group 291 | 292 | wait_par( cpus ) 293 | 294 | tid := task( out<-in ) { 295 | 296 | sys $shcmd_init 297 | 298 | sys zcat $tags_str | gzip -nc > $tag_pooled 299 | 300 | sys $shcmd_finalize 301 | } 302 | 303 | register_par( tid, cpus ) 304 | 305 | add_task_to_graph( in, out, group ) 306 | 307 | return out 308 | } 309 | -------------------------------------------------------------------------------- /modules/postalign_xcor.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "species.bds" 5 | include "module_template.bds" 6 | 7 | 8 | help == postalign bed/tagalign settings 9 | fraglen0 := false help (LEGACY PARAM) Set predefined fragment length as zero for cross corr. analysis (add -speak=0 to run_spp.R). 10 | speak_xcor := -1 help Set user-defined cross-corr. peak strandshift (-speak= in run_spp.R). Use -1 to disable (default: -1). 11 | max_ppsize_xcor := "" help R stack size (R parameter --max-ppsize=; between 5000 and 5000000) for cross corr. analysis. 12 | extra_param_xcor := "" help Set extra parameters for run_spp.R (cross-corr. analysis only). 
13 | mem_xcor := "15G" help Max. memory for cross-corr. analysis (default: 15G). 14 | 15 | grp_color_xcor := "yellowgreen" 16 | 17 | init_postalign_xcor() 18 | 19 | 20 | void init_postalign_xcor() { 21 | 22 | fraglen0 = get_conf_val_bool( fraglen0, ["fraglen0"] ) 23 | speak_xcor = get_conf_val_int( speak_xcor, ["speak_xcor"] ) 24 | extra_param_xcor= get_conf_val( extra_param_xcor, ["extra_param_xcor"] ) 25 | mem_xcor = get_conf_val( mem_xcor, ["mem_xcor"] ) 26 | max_ppsize_xcor = get_conf_val( max_ppsize_xcor, ["max_ppsize_xcor"] ) 27 | 28 | // backward compatibility 29 | if ( speak_xcor == -1 && fraglen0 ) speak_xcor = 0 30 | 31 | print("\n\n== postalign cross-corr. analysis settings\n") 32 | print( "Max. memory for UNIX shuf\t\t\t: $mem_shuf\n") 33 | print( "User-defined cross-corr. peak strandshift\t: $speak_xcor\n") 34 | print( "Extra parameters for cross-corr. analysis\t: $extra_param_xcor\n") 35 | print( "Max. memory for cross-corr. analysis\t\t: $mem_xcor\n") 36 | print( "Stack size for cross-corr. analysis\t\t:$max_ppsize_xcor\n") 37 | } 38 | 39 | string subsample_tag_PE_for_xcor( string tag, int nlines, bool non_mito, string o_dir, string group ) { 40 | 41 | prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir ) 42 | nreads_per_mill := metric_prefix( nlines ) 43 | 44 | subsampled_tag := "$prefix."+(non_mito?"no_chrM.":"")+"$nreads_per_mill.R1.tagAlign.gz" 45 | non_mito_param := non_mito ? "grep -v \"chrM\" | " : "" 46 | 47 | joined := "$prefix.joined" // temporary file 48 | joined_subsampled := "$prefix.joined.subsampled" // temporary file 49 | 50 | in := [ tag ] 51 | out := subsampled_tag 52 | 53 | taskName:= "subsample_tag_PE_4_xcor " + group 54 | mem := get_res_mem(mem_shuf,1) 55 | 56 | wait_par( cpus ) 57 | 58 | tid := task( out<-in ) { 59 | 60 | sys $shcmd_init 61 | 62 | // join consecutive two lines into one 63 | sys zcat $tag | sed 'N;s/\n/\t/' > $joined 64 | 65 | //# Shuffle and split temporary combined file into 2 equal parts 66 | //# Will produce $PR_PREFIX00 and $PR_PREFIX01 67 | sys cat $joined | $non_mito_param shuf -n $nlines --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f $tag | wc -c) -nosalt /dev/null) > $joined_subsampled 68 | 69 | //# Subsample tagAlign file 70 | sys awk 'BEGIN{OFS="\t"}{print $1,$2,$3,$4,$5,$6}' $joined_subsampled | \ 71 | gzip -nc > $subsampled_tag 72 | 73 | sys rm -f $joined $joined_subsampled 74 | 75 | sys $shcmd_finalize 76 | } 77 | 78 | register_par( tid, cpus ) 79 | 80 | add_task_to_graph( in, out, group ) 81 | 82 | return out 83 | } 84 | 85 | string[] xcor( string tag, string o_dir, string group, int nth_xcor ) { 86 | 87 | // misc. 88 | prefix := replace_dir( rm_ext( tag, ["tagAlign","tag","bed"] ), o_dir ) 89 | xcor_score := "$prefix.cc.qc" 90 | xcor_plot := "$prefix.cc.plot.pdf" 91 | param_speak := speak_xcor > -1 ? "-speak=$speak_xcor" : "" 92 | extra_param := max_ppsize_xcor ? "--max-ppsize=$max_ppsize_xcor " : "" 93 | 94 | in := [ tag ] 95 | out := [ xcor_score, xcor_plot ] 96 | 97 | taskName:= "xcor " + group 98 | cpus := (nth_xcor==1) ? -1 : nth_xcor; mem := get_res_mem(mem_xcor,nth_xcor); 99 | 100 | wait_par( cpus ) 101 | 102 | tid := task( out<-in ) { 103 | 104 | sys $shcmd_init 105 | 106 | // # if phantompeakqualtools is an old version, use run_spp_nodups.R. 
new version has run_spp.R only 107 | sys if [[ $(which run_spp_nodups.R 2> /dev/null | wc -l || echo) == "1" ]]; then RUN_SPP=$(which run_spp_nodups.R); \ 108 | else RUN_SPP=$(which run_spp.R); \ 109 | fi 110 | 111 | //# CCSCORE FILE format 112 | //# Filename numReads estFragLen correstFragLen PhantomPeak corrphantomPeak argmincorr mincorr phantomPeakCoef relPhantomPeakCoef QualityTag 113 | sys Rscript $extra_param ${RUN_SPP} -rf \ 114 | -c=$tag -p=$nth_xcor \ 115 | -filtchr=chrM -savp=$xcor_plot -out=$xcor_score $param_speak $extra_param_xcor 116 | sys sed -r 's/,[^\t]+//g' $xcor_score > $xcor_score.tmp 117 | sys mv $xcor_score.tmp $xcor_score 118 | 119 | sys $shcmd_finalize 120 | } 121 | 122 | register_par( tid, cpus ) 123 | 124 | add_task_to_graph( in, out, group, "XCOR", grp_color_xcor ) 125 | 126 | return out 127 | } 128 | 129 | string get_fraglen( string xcor_score ) { // get FRAGLEN (3rd column of cc score file) for spp(-speak=$FRAGLEN) 130 | 131 | cols := xcor_score.read().split("\t") 132 | return cols[2] 133 | } 134 | -------------------------------------------------------------------------------- /modules/species.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "conf.bds" 5 | 6 | 7 | help == species settings 8 | species := "" help Species. need to specify '-species_file' too if you have not installed genome database with 'install_genome_data.sh'. 9 | species_file := "" help Species file path. 10 | species_browser := "" help Species name in WashU genome browser. 11 | 12 | ref_fa := "" help Reference genome sequence fasta. 13 | chrsz := "" help Chromosome sizes file path (use fetchChromSizes from UCSC tools). 14 | blacklist := "" help Blacklist bed. 15 | seq_dir := "" help Reference genome sequence directory path (where chr*.fa exist). 16 | 17 | init_species() 18 | 19 | void init_species() { 20 | 21 | species = get_conf_val( species, ["species"] ) 22 | species_file = get_conf_val( species_file, ["species_file"] ) 23 | 24 | _read_species() 25 | 26 | species_browser = get_conf_val( species_browser,["species_browser"] ) 27 | 28 | ref_fa = get_conf_val( ref_fa, ["ref_fa"] ) 29 | chrsz = get_conf_val( chrsz, ["chrsz"] ) 30 | blacklist = get_conf_val( blacklist, ["blacklist"] ) 31 | seq_dir = get_conf_val( seq_dir, ["seq_dir"]) 32 | 33 | if ( species_browser == "" ) species_browser = species 34 | 35 | print("\n\n== species settings\n") 36 | print( "Species\t\t\t\t: $species\n" ) 37 | print( "Species file\t\t\t: $species_file\n\n" ) 38 | print( "Species name (WashU browser)\t: $species_browser\n" ) 39 | print( "Ref. genome seq. fasta\t\t: $ref_fa\n" ) 40 | print( "Chr. sizes file\t\t\t: $chrsz\n" ) 41 | print( "Black list bed\t\t\t: $blacklist\n" ) 42 | print( "Ref. genome seq. dir.\t\t: $seq_dir\n" ) 43 | } 44 | 45 | void _read_species() { // check for species configruation files 46 | // value for key will be overriden as loop goes. so the last element in species_paths has the priority 47 | string[] species_paths 48 | if ( env != "" ) species_paths.add( env ) 49 | if ( c != "" ) species_paths.add( c ) 50 | species_paths.add( species_file ) 51 | 52 | for ( string path : species_paths ) { 53 | if ( path.exists() ) { 54 | add_to_conf( path, species ) 55 | } 56 | } 57 | } 58 | 59 | 60 | // temp 61 | /* 62 | bwt_idx := "" help Bowtie index (full path prefix of *.1.ebwt file). 
63 | bwt_idx = get_conf_val( bwt_idx, ["bwt_idx"] ) 64 | print( "Bowtie index\t\t\t: $bwt_idx\n" ) 65 | */ 66 | -------------------------------------------------------------------------------- /modules/sys.bds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bds 2 | #vim: syntax=java 3 | 4 | include "string.bds" 5 | 6 | helpUnsorted := true // do not sort help 7 | 8 | 9 | script_path := "" 10 | script_dir := "" 11 | 12 | hostname := "" 13 | 14 | // pipeline seeks for executables in the BDS script directory (local git repo) and $PATH 15 | // Add more relative path here if you want to keep your .py .sh .R visible to UNIX `which` as executables. 16 | // Relative paths defined here are according to your script path (not your working directory but where .bds exists) 17 | // Make sure that you chmod 755 your .py .R .sh 18 | _rel_script_file_paths := [".","modules","utils"] 19 | 20 | 21 | init_base() 22 | 23 | 24 | void init_base() { 25 | script_path = "$ppwd/$programPath" 26 | if (!script_path.exists()) script_path = "$programPath" 27 | 28 | script_dir = script_path.dirName() 29 | hostname = get_hostname() 30 | } 31 | 32 | //// script file path 33 | 34 | string[] get_script_file_paths( string suffix ) { 35 | string[] ret 36 | for ( string path : _rel_script_file_paths ) { 37 | path = "$script_dir/$path" 38 | if ( path.exists() ) { 39 | ret.add( path + suffix ) 40 | if ( path.dirName().endsWith( "modules" ) ) ret.add( "$path/../$suffix" ) 41 | } 42 | } 43 | return ret 44 | } 45 | 46 | string[] get_script_file_paths() { 47 | return get_script_file_paths( "" ) 48 | } 49 | 50 | //// command line argument functions 51 | 52 | bool cmd_line_arg_has_key( string key ) { 53 | key = key.toLower() 54 | for ( string arg : args ) { 55 | if ( ("-"+key) == arg.toLower().trim() ) return true 56 | } 57 | return false 58 | } 59 | 60 | bool is_cmd_line_arg_empty() { 61 | return args.size()==0 62 | } 63 | 64 | bool is_first_arg_conf() { 65 | if ( (args.size()>0) && (!args[0].startsWith("-")) ) { 66 | if ( args.size()==1 ) { 67 | return true 68 | } 69 | else { 70 | return args[1].startsWith("-") 71 | } 72 | } 73 | return false 74 | } 75 | 76 | string get_cmd_line_arg_val( string key ) { 77 | key = key.toLower() 78 | for (int i=0; i< args.size(); i++) { 79 | arg := args[i] 80 | if ( ("-"+key) == arg.toLower().trim() ) { 81 | if ( i==(args.size()-1) ) break 82 | next_arg := args[i+1] 83 | 84 | if ( next_arg.startsWith("-") ) break 85 | return next_arg 86 | } 87 | } 88 | return "" 89 | } 90 | 91 | //// functions for file I/O 92 | 93 | string get_path( string str ) { // get absolute path (remove / if exists at end) 94 | if (str.trim() == "") return "" 95 | base := rm_str_at_end( str, "/" ).path() 96 | return base 97 | } 98 | 99 | string mkdir( string str ) { 100 | if (str.trim() == "") return "" 101 | // make filename full path and mkdir -p 102 | path := get_path( str ) 103 | if ( path.exists() ) { 104 | return path 105 | } 106 | else { 107 | path.mkdir() 108 | return path 109 | } 110 | } 111 | 112 | bool path_exists( string path ) { 113 | if ( path!="" ) { 114 | if ( path.exists() ) { 115 | if ( path.isFile() ) { 116 | if ( path.size() > 0 ) return true 117 | } 118 | else { 119 | return true 120 | } 121 | } 122 | } 123 | return false 124 | } 125 | 126 | string copy( string file, string o_dir ) { 127 | file_new := replace_dir( file, o_dir ) 128 | system := "local" // do not use cluster engine for this task 129 | taskName:= "copy file" 130 | 131 | task ( file_new <- file 
) { 132 | 133 | sys cp --remove-destination $file $file_new 134 | sys while [ ! -f $file_new ]; do echo FOUND DELAYED WRITE, WAITING...; sleep 0.1; done 135 | } 136 | 137 | return file_new 138 | } 139 | 140 | string get_stdout( string cmd ) { 141 | rnd := randInt() 142 | cmd_ := "cmd_$rnd".path() 143 | sys $cmd &> $cmd_ || true 144 | ret := cmd_.read() 145 | sys rm -f $cmd_ 146 | return rm_str_at_end(ret,"\n") 147 | } 148 | 149 | string get_shell_var( string var ) { 150 | var_ := "var_$var".path() 151 | sys echo "${$var}" > $var_ 152 | ret := var_.read() 153 | sys rm -f $var_ 154 | return ret 155 | } 156 | 157 | string get_md5sum( string file ) { 158 | return get_stdout( "md5sum $file | awk '{print $1}'" ) 159 | } 160 | 161 | int get_num_lines( string file ) { 162 | if ( !path_exists( file ) ) { 163 | error("get_no_lines(): File doesn't exist! ($file)") 164 | } 165 | else { 166 | if ( file.toLower().endsWith(".gz") ) { // check if compressed or not 167 | return get_stdout( "zcat $file | wc -l" ).parseInt() 168 | } 169 | else { 170 | return get_stdout( "cat $file | wc -l" ).parseInt() 171 | } 172 | } 173 | } 174 | 175 | string get_hostname() { 176 | out := get_stdout("hostname -f").replace("\n","") 177 | if (out.startsWith("hostname: ")) return "default" 178 | else return out 179 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # channels : defaults, r, bioconda 2 | 3 | nomkl 4 | samtools ==1.2 5 | htslib ==1.4 # 1.5 in bioconda needed libbz2.so.1.0 6 | bedtools ==2.26.0 #2.22 # 2.21.0 7 | picard ==1.126 # wanted 1.129 here but doesn't exist. instead 1.139 has backward compatibility issue, so take 1.126 8 | ucsc-fetchchromsizes 9 | ucsc-wigtobigwig 10 | ucsc-bedgraphtobigwig 11 | ucsc-bigwiginfo 12 | ucsc-bedclip 13 | ucsc-bedtobigbed 14 | ucsc-twobittofa 15 | macs2 ==2.1.1.20160309 #2.1.0 (no binaries for OSX) 16 | boost ==1.57.0 17 | openblas ==0.2.19 18 | numpy ==1.11.3 #1.13.3 #1.10.2 (no binaries for OSX) #1.9.0, 1.8.2 conflicts with ATAQC 19 | matplotlib ==1.5.1 20 | six==1.10.0 # to fix (ImportError: cannot import name _thread) 21 | python-dateutil==2.6.1 22 | libgfortran==3.0 23 | graphviz ==2.38.0 24 | libtool 25 | ghostscript # pdf2png 26 | pigz 27 | zlib 28 | sambamba ==0.6.6 # to fix seg fault error in 0.6.1 29 | r ==3.2.2 30 | r-snow 31 | r-snowfall 32 | r-bitops 33 | r-catools 34 | bioconductor-rsamtools 35 | r-spp ==1.13 36 | #glibc #segmentation fault in conda with openssl 37 | pyfaidx ==0.4.7.1 38 | 39 | cutadapt ==1.9.1 40 | preseq ==2.0.3 41 | trim-galore ==0.4.1 # for old trimmer 42 | python-levenshtein # for old trimmer (trimAdapter.py) 43 | 44 | bowtie2 ==2.2.6 45 | ncurses 46 | ucsc-bigWigAverageOverBed 47 | gnuplot #==5.0.3 48 | scipy # ==0.17.0: to fix 'undefined symbol: PyUnicodeUCS2_DecodeUTF8' 49 | pandas #==0.18.0 #==0.16.1 # ataqc 50 | metaseq #==0.5.6 # ataqc 51 | jinja2 # ataqc 52 | gsl # for preseq 53 | pysam==0.8.2.1 # 0.8.3, 0.9 from bioconda has an issue with ATAQC (segmentation fault), need to use -c bcbio 54 | pybedtools==0.6.9 # same issue as in pysam 55 | openssl==1.0.2p 56 | -------------------------------------------------------------------------------- /requirements_py3.txt: -------------------------------------------------------------------------------- 1 | nomkl 2 | python ==3.5.0 3 | numpy ==1.11.3 4 | idr ==2.0.3 5 | bedtools ==2.26.0 6 | pigz 7 | java-jdk ==8.0.92 8 | matplotlib ==1.5.1 9 | 
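Both requirement files above follow the same minimal grammar: one conda package per line, an optional '==' version pin (possibly preceded by whitespace, e.g. "samtools ==1.2"), and '#' to the end of a line as a comment. A hypothetical parser sketch, for illustration only (parse_requirements is an invented name, not pipeline code):

# Hypothetical helper: returns (package, pin) pairs, where pin is None for
# unpinned packages taken as-is from the configured conda channels.
def parse_requirements(path):
    pins = []
    with open(path) as f:
        for raw in f:
            line = raw.split("#", 1)[0].strip()  # drop comments and blank lines
            if not line:
                continue
            if "==" in line:
                name, version = (s.strip() for s in line.split("==", 1))
            else:
                name, version = line, None
            pins.append((name, version))
    return pins

print(parse_requirements("requirements.txt"))  # [('nomkl', None), ('samtools', '1.2'), ...]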
-------------------------------------------------------------------------------- /species/kundaje.conf: -------------------------------------------------------------------------------- 1 | [mm9] 2 | chrsz = /mnt/data/pipeline_genome_data/mm9/mm9.chrom.sizes 3 | seq_dir = /mnt/data/pipeline_genome_data/mm9/seq 4 | gensz = mm 5 | umap = /mnt/data/pipeline_genome_data/mm9/globalmap_k20tok54 6 | bwa_idx = /mnt/data/pipeline_genome_data/mm9/bwa_index/mm9.fa 7 | bwt2_idx = /mnt/data/pipeline_genome_data/mm9/bowtie2_index/mm9.fa 8 | ref_fa = /mnt/data/pipeline_genome_data/mm9/mm9.fa 9 | blacklist = /mnt/data/pipeline_genome_data/mm9/mm9-blacklist.bed.gz 10 | # data for ATAQC 11 | tss_enrich = /mnt/data/pipeline_genome_data/mm9/ataqc/mm9_gencode_tss_unique.bed.gz 12 | dnase = /mnt/data/pipeline_genome_data/mm9/ataqc/mm9_univ_dhs_ucsc.from_mm10.bed.gz 13 | prom = /mnt/data/pipeline_genome_data/mm9/ataqc/tss_mm9_master.from_mm10.bed.gz 14 | enh = /mnt/data/pipeline_genome_data/mm9/ataqc/mm9_enh_dhs_ucsc.from_mm10.bed.gz 15 | reg2map = /mnt/data/pipeline_genome_data/mm9/ataqc/dnase_avgs_merged_named.fseq.vals.gz 16 | roadmap_meta = /mnt/data/pipeline_genome_data/mm9/ataqc/accession_to_name.txt 17 | 18 | [mm10, mm10_ENCODE, mm10_ENCODE3] # from ENCODE portal 19 | chrsz = /mnt/data/pipeline_genome_data/mm10/mm10.chrom.sizes 20 | seq_dir = /mnt/data/pipeline_genome_data/mm10/seq 21 | gensz = mm 22 | bwa_idx = /mnt/data/pipeline_genome_data/mm10/bwa_index/mm10_no_alt_analysis_set_ENCODE.fasta 23 | bwt2_idx = /mnt/data/pipeline_genome_data/mm10/bowtie2_index/mm10_no_alt_analysis_set_ENCODE.fasta 24 | ref_fa = /mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta 25 | blacklist = /mnt/data/pipeline_genome_data/mm10/mm10.blacklist.bed.gz 26 | # data for ATAQC 27 | tss_enrich = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_gencode_tss_unique.bed.gz 28 | dnase = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_univ_dhs_ucsc.bed.gz 29 | prom = /mnt/data/pipeline_genome_data/mm10/ataqc/tss_mm10_master.bed.gz 30 | enh = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_enh_dhs_ucsc.bed.gz 31 | reg2map = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_formatted.txt.gz 32 | reg2map_bed = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_celltype_compare_subsample.bed.gz 33 | roadmap_meta = /mnt/data/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_metadata.txt 34 | ENCODE_assembly = mm10 35 | 36 | [hg19] 37 | chrsz = /mnt/data/pipeline_genome_data/hg19/hg19.chrom.sizes 38 | seq_dir = /mnt/data/pipeline_genome_data/hg19/seq 39 | gensz = hs 40 | umap = /mnt/data/pipeline_genome_data/hg19/globalmap_k20tok54 41 | bwa_idx = /mnt/data/pipeline_genome_data/hg19/bwa_index/male.hg19.fa 42 | bwt2_idx = /mnt/data/pipeline_genome_data/hg19/bowtie2_index/male.hg19.fa 43 | ref_fa = /mnt/data/pipeline_genome_data/hg19/male.hg19.fa 44 | blacklist = /mnt/data/pipeline_genome_data/hg19/wgEncodeDacMapabilityConsensusExcludable.bed.gz 45 | 46 | mappability_map_peakseq = /mnt/data/pipeline_genome_data/hg19/Mapability_HG.txt 47 | 48 | # data for ATAQC 49 | tss_enrich = /mnt/data/pipeline_genome_data/hg19/ataqc/hg19_gencode_tss_unique.bed.gz 50 | dnase = /mnt/data/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.bed.gz 51 | prom = /mnt/data/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_prom_p2.bed.gz 52 | enh = /mnt/data/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_enh_p2.bed.gz 53 | reg2map = 
/mnt/data/pipeline_genome_data/hg19/ataqc/dnase_avgs_reg2map_p10_merged_named.pvals.gz 54 | roadmap_meta = /mnt/data/pipeline_genome_data/hg19/ataqc/eid_to_mnemonic.txt 55 | 56 | [hg38, hg38_ENCODE, hg38_ENCODE3] # from ENCODE portal 57 | chrsz = /mnt/data/pipeline_genome_data/hg38/hg38.chrom.sizes 58 | seq_dir = /mnt/data/pipeline_genome_data/hg38/seq 59 | gensz = hs 60 | bwa_idx = /mnt/data/pipeline_genome_data/hg38/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 61 | bwt2_idx = /mnt/data/pipeline_genome_data/hg38/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 62 | ref_fa = /mnt/data/pipeline_genome_data/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 63 | blacklist = /mnt/data/pipeline_genome_data/hg38/hg38.blacklist.bed.gz 64 | # data for ATAQC 65 | tss_enrich = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_gencode_tss_unique.bed.gz 66 | dnase = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz 67 | prom = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz 68 | enh = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz 69 | reg2map = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz 70 | reg2map_bed = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz 71 | roadmap_meta = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt 72 | ENCODE_assembly = GRCh38 73 | 74 | [hg38_chr19_chrM] # hg38 with chr19 and chrM only 75 | chrsz = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/hg38_chr19_chrM.chrom.sizes 76 | seq_dir = /mnt/data/pipeline_genome_data/hg38/seq 77 | gensz = hs 78 | bwa_idx = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.chr19_chrM.fasta 79 | bwt2_idx = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.chr19_chrM.fasta 80 | ref_fa = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/GRCh38_no_alt_analysis_set_GCA_000001405.15.chr19_chrM.fasta 81 | blacklist = /mnt/data/pipeline_genome_data/hg38_chr19_chrM/hg38.blacklist.bed.gz 82 | # data for ATAQC 83 | tss_enrich = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_gencode_tss_unique.bed.gz 84 | dnase = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz 85 | prom = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz 86 | enh = /mnt/data/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz 87 | reg2map = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz 88 | reg2map_bed = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz 89 | roadmap_meta = /mnt/data/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt 90 | ENCODE_assembly = GRCh38 91 | 92 | [dm3] # installed by install_genome_data.sh 93 | chrsz = /mnt/data/pipeline_genome_data/dm3/dm3.chrom.sizes 94 | seq_dir = /mnt/data/pipeline_genome_data/dm3/seq 95 | gensz = 168736537 96 | bwa_idx = /mnt/data/pipeline_genome_data/dm3/bwa_index/dm3.fa 97 | bwt2_idx = /mnt/data/pipeline_genome_data/dm3/bowtie2_index/dm3.fa 98 | ref_fa = /mnt/data/pipeline_genome_data/dm3/dm3.fa 99 | 100 | [pantro5] # installed by install_genome_data.sh 101 | chrsz = /mnt/data/pipeline_genome_data/pantro5/pantro5.chrom.sizes 102 | 
seq_dir = /mnt/data/pipeline_genome_data/pantro5/seq 103 | gensz = 3231170666 104 | bwa_idx = /mnt/data/pipeline_genome_data/pantro5/bwa_index/panTro5.fa 105 | bwt2_idx = /mnt/data/pipeline_genome_data/pantro5/bowtie2_index/panTro5.fa 106 | ref_fa = /mnt/data/pipeline_genome_data/pantro5/panTro5.fa 107 | 108 | [macam7] # installed by install_genome_data.sh 109 | chrsz = /mnt/data/pipeline_genome_data/macam7/macam7.chrom.sizes 110 | seq_dir = /mnt/data/pipeline_genome_data/macam7/seq 111 | gensz = 2817542206 112 | bwa_idx = /mnt/data/pipeline_genome_data/macam7/bwa_index/MacaM_Rhesus_Genome_v7.fasta 113 | bwt2_idx = /mnt/data/pipeline_genome_data/macam7/bowtie2_index/MacaM_Rhesus_Genome_v7.fasta 114 | ref_fa = /mnt/data/pipeline_genome_data/macam7/MacaM_Rhesus_Genome_v7.fasta 115 | nonamecheck = true # for bedtools >= 2.24. this prevents name convention error in bedtools intersect 116 | 117 | [saccer3] # installed by install_genome_data.sh 118 | chrsz = /mnt/data/pipeline_genome_data/saccer3/saccer3.chrom.sizes 119 | seq = /mnt/data/pipeline_genome_data/saccer3/seq 120 | gensz = 12157105 121 | bwa_idx = /mnt/data/pipeline_genome_data/saccer3/bwa_index/sacCer3.fa 122 | bwt2_idx= /mnt/data/pipeline_genome_data/saccer3/bowtie2_index/sacCer3.fa 123 | ref_fa = /mnt/data/pipeline_genome_data/saccer3/sacCer3.fa 124 | 125 | -------------------------------------------------------------------------------- /species/scg.conf: -------------------------------------------------------------------------------- 1 | [mm9] 2 | chrsz = /reference/ENCODE/pipeline_genome_data/mm9/mm9.chrom.sizes 3 | seq_dir = /reference/ENCODE/pipeline_genome_data/mm9/seq 4 | gensz = mm 5 | umap = /reference/ENCODE/pipeline_genome_data/mm9/globalmap_k20tok54 6 | bwa_idx = /reference/ENCODE/pipeline_genome_data/mm9/bwa_index/mm9.fa 7 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/mm9/bowtie2_index/mm9.fa 8 | ref_fa = /reference/ENCODE/pipeline_genome_data/mm9/mm9.fa 9 | blacklist = /reference/ENCODE/pipeline_genome_data/mm9/mm9-blacklist.bed.gz 10 | # data for ATAQC 11 | tss_enrich = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/mm9_gencode_tss_unique.bed.gz 12 | dnase = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/mm9_univ_dhs_ucsc.from_mm10.bed.gz 13 | prom = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/tss_mm9_master.from_mm10.bed.gz 14 | enh = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/mm9_enh_dhs_ucsc.from_mm10.bed.gz 15 | reg2map = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/dnase_avgs_merged_named.fseq.vals.gz 16 | reg2map_bed = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/mm9_dhs_universal_ucsc_v1.bed.gz 17 | roadmap_meta = /reference/ENCODE/pipeline_genome_data/mm9/ataqc/accession_to_name.txt 18 | 19 | [mm10, mm10_ENCODE, mm10_ENCODE3] # from ENCODE portal 20 | chrsz = /reference/ENCODE/pipeline_genome_data/mm10/mm10.chrom.sizes 21 | seq_dir = /reference/ENCODE/pipeline_genome_data/mm10/seq 22 | gensz = mm 23 | bwa_idx = /reference/ENCODE/pipeline_genome_data/mm10/bwa_index/mm10_no_alt_analysis_set_ENCODE.fasta 24 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/mm10/bowtie2_index/mm10_no_alt_analysis_set_ENCODE.fasta 25 | ref_fa = /reference/ENCODE/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta 26 | blacklist = /reference/ENCODE/pipeline_genome_data/mm10/mm10.blacklist.bed.gz 27 | # data for ATAQC 28 | tss_enrich = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_gencode_tss_unique.bed.gz 29 | dnase = 
/reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_univ_dhs_ucsc.bed.gz 30 | prom = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/tss_mm10_master.bed.gz 31 | enh = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_enh_dhs_ucsc.bed.gz 32 | reg2map = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_formatted.txt.gz 33 | reg2map_bed = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_celltype_compare_subsample.bed.gz 34 | roadmap_meta = /reference/ENCODE/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_metadata.txt 35 | ENCODE_assembly = mm10 36 | 37 | [hg19] 38 | chrsz = /reference/ENCODE/pipeline_genome_data/hg19/hg19.chrom.sizes 39 | seq_dir = /reference/ENCODE/pipeline_genome_data/hg19/seq 40 | gensz = hs 41 | umap = /reference/ENCODE/pipeline_genome_data/hg19/globalmap_k20tok54 42 | bwa_idx = /reference/ENCODE/pipeline_genome_data/hg19/bwa_index/male.hg19.fa 43 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/hg19/bowtie2_index/male.hg19.fa 44 | ref_fa = /reference/ENCODE/pipeline_genome_data/hg19/male.hg19.fa 45 | blacklist = /reference/ENCODE/pipeline_genome_data/hg19/wgEncodeDacMapabilityConsensusExcludable.bed.gz 46 | # data for ATAQC 47 | tss_enrich = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/hg19_gencode_tss_unique.bed.gz 48 | dnase = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.bed.gz 49 | prom = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_prom_p2.bed.gz 50 | enh = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_enh_p2.bed.gz 51 | reg2map = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/dnase_avgs_reg2map_p10_merged_named.pvals.gz 52 | roadmap_meta = /reference/ENCODE/pipeline_genome_data/hg19/ataqc/eid_to_mnemonic.txt 53 | 54 | [hg38, hg38_ENCODE, hg38_ENCODE3] # from ENCODE portal 55 | chrsz = /reference/ENCODE/pipeline_genome_data/hg38/hg38.chrom.sizes 56 | seq_dir = /reference/ENCODE/pipeline_genome_data/hg38/seq 57 | gensz = hs 58 | bwa_idx = /reference/ENCODE/pipeline_genome_data/hg38/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 59 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/hg38/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 60 | ref_fa = /reference/ENCODE/pipeline_genome_data/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 61 | blacklist = /reference/ENCODE/pipeline_genome_data/hg38/hg38.blacklist.bed.gz 62 | # data for ATAQC 63 | tss_enrich = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/hg38_gencode_tss_unique.bed.gz 64 | dnase = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz 65 | prom = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz 66 | enh = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz 67 | reg2map = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz 68 | reg2map_bed = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz 69 | roadmap_meta = /reference/ENCODE/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt 70 | ENCODE_assembly = GRCh38 71 | 72 | [dm3] # installed by install_genome_data.sh 73 | chrsz = /reference/ENCODE/pipeline_genome_data/dm3/dm3.chrom.sizes 74 | seq_dir = /reference/ENCODE/pipeline_genome_data/dm3/seq 75 | gensz = 168736537 76 | bwa_idx = 
/reference/ENCODE/pipeline_genome_data/dm3/bwa_index/dm3.fa 77 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/dm3/bowtie2_index/dm3.fa 78 | ref_fa = /reference/ENCODE/pipeline_genome_data/dm3/dm3.fa 79 | 80 | [pantro5] # installed by install_genome_data.sh 81 | chrsz = /reference/ENCODE/pipeline_genome_data/pantro5/pantro5.chrom.sizes 82 | seq_dir = /reference/ENCODE/pipeline_genome_data/pantro5/seq 83 | gensz = 3231170666 84 | bwa_idx = /reference/ENCODE/pipeline_genome_data/pantro5/bwa_index/panTro5.fa 85 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/pantro5/bowtie2_index/panTro5.fa 86 | ref_fa = /reference/ENCODE/pipeline_genome_data/pantro5/panTro5.fa 87 | 88 | [macam7] # installed by install_genome_data.sh 89 | chrsz = /reference/ENCODE/pipeline_genome_data/macam7/macam7.chrom.sizes 90 | seq_dir = /reference/ENCODE/pipeline_genome_data/macam7/seq 91 | gensz = 2817542206 92 | bwa_idx = /reference/ENCODE/pipeline_genome_data/macam7/bwa_index/MacaM_Rhesus_Genome_v7.fasta 93 | bwt2_idx = /reference/ENCODE/pipeline_genome_data/macam7/bowtie2_index/MacaM_Rhesus_Genome_v7.fasta 94 | ref_fa = /reference/ENCODE/pipeline_genome_data/macam7/MacaM_Rhesus_Genome_v7.fasta 95 | nonamecheck = true # for bedtools >= 2.24. this prevents name convention error in bedtools intersect 96 | 97 | [saccer3] # installed by install_genome_data.sh 98 | chrsz = /reference/ENCODE/pipeline_genome_data/saccer3/saccer3.chrom.sizes 99 | seq = /reference/ENCODE/pipeline_genome_data/saccer3/seq 100 | gensz = 12157105 101 | bwa_idx = /reference/ENCODE/pipeline_genome_data/saccer3/bwa_index/sacCer3.fa 102 | bwt2_idx= /reference/ENCODE/pipeline_genome_data/saccer3/bowtie2_index/sacCer3.fa 103 | ref_fa = /reference/ENCODE/pipeline_genome_data/saccer3/sacCer3.fa 104 | 105 | -------------------------------------------------------------------------------- /species/sherlock.conf: -------------------------------------------------------------------------------- 1 | [mm9] 2 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/mm9/mm9.chrom.sizes 3 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/mm9/seq 4 | gensz = mm 5 | umap = /home/groups/cherry/encode/pipeline_genome_data/mm9/globalmap_k20tok54 6 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/mm9/bwa_index/mm9.fa 7 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/mm9/bowtie2_index/mm9.fa 8 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/mm9/mm9.fa 9 | blacklist = /home/groups/cherry/encode/pipeline_genome_data/mm9/mm9-blacklist.bed.gz 10 | # data for ATAQC 11 | tss_enrich = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/mm9_gencode_tss_unique.bed.gz 12 | dnase = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/mm9_univ_dhs_ucsc.from_mm10.bed.gz 13 | prom = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/tss_mm9_master.from_mm10.bed.gz 14 | enh = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/mm9_enh_dhs_ucsc.from_mm10.bed.gz 15 | reg2map = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/dnase_avgs_merged_named.fseq.vals.gz 16 | reg2map_bed = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/mm9_dhs_universal_ucsc_v1.bed.gz 17 | roadmap_meta = /home/groups/cherry/encode/pipeline_genome_data/mm9/ataqc/accession_to_name.txt 18 | 19 | [mm10, mm10_ENCODE, mm10_ENCODE3] # from ENCODE portal 20 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/mm10/mm10.chrom.sizes 21 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/mm10/seq 22 | gensz = mm 23 | 
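# NOTE: gensz appears to follow the MACS2 genome-size convention used by the
# peak caller: the shorthands 'hs'/'mm' for human/mouse, or an explicit
# effective genome length in bp for other assemblies (see the
# dm3/pantro5/macam7/saccer3 sections below).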
bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/mm10/bwa_index/mm10_no_alt_analysis_set_ENCODE.fasta 24 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/mm10/bowtie2_index/mm10_no_alt_analysis_set_ENCODE.fasta 25 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta 26 | blacklist = /home/groups/cherry/encode/pipeline_genome_data/mm10/mm10.blacklist.bed.gz 27 | # data for ATAQC 28 | tss_enrich = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_gencode_tss_unique.bed.gz 29 | dnase = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_univ_dhs_ucsc.bed.gz 30 | prom = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/tss_mm10_master.bed.gz 31 | enh = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_enh_dhs_ucsc.bed.gz 32 | reg2map = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_formatted.txt.gz 33 | reg2map_bed = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_celltype_compare_subsample.bed.gz 34 | roadmap_meta = /home/groups/cherry/encode/pipeline_genome_data/mm10/ataqc/mm10_dnase_avg_fseq_signal_metadata.txt 35 | ENCODE_assembly = mm10 36 | 37 | [hg19] 38 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/hg19/hg19.chrom.sizes 39 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/hg19/seq 40 | gensz = hs 41 | umap = /home/groups/cherry/encode/pipeline_genome_data/hg19/globalmap_k20tok54 42 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/hg19/bwa_index/male.hg19.fa 43 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/hg19/bowtie2_index/male.hg19.fa 44 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/hg19/male.hg19.fa 45 | blacklist = /home/groups/cherry/encode/pipeline_genome_data/hg19/wgEncodeDacMapabilityConsensusExcludable.bed.gz 46 | # data for ATAQC 47 | tss_enrich = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/hg19_gencode_tss_unique.bed.gz 48 | dnase = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.bed.gz 49 | prom = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_prom_p2.bed.gz 50 | enh = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/reg2map_honeybadger2_dnase_enh_p2.bed.gz 51 | reg2map = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/dnase_avgs_reg2map_p10_merged_named.pvals.gz 52 | roadmap_meta = /home/groups/cherry/encode/pipeline_genome_data/hg19/ataqc/eid_to_mnemonic.txt 53 | 54 | [hg38, hg38_ENCODE, hg38_ENCODE3] # from ENCODE portal 55 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/hg38/hg38.chrom.sizes 56 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/hg38/seq 57 | gensz = hs 58 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/hg38/bwa_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 59 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/hg38/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 60 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta 61 | blacklist = /home/groups/cherry/encode/pipeline_genome_data/hg38/hg38.blacklist.bed.gz 62 | # data for ATAQC 63 | tss_enrich = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/hg38_gencode_tss_unique.bed.gz 64 | dnase = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz 65 | prom 
= /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz 66 | enh = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz 67 | reg2map = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz 68 | reg2map_bed = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz 69 | roadmap_meta = /home/groups/cherry/encode/pipeline_genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt 70 | ENCODE_assembly = GRCh38 71 | 72 | [dm3] # installed by install_genome_data.sh 73 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/dm3/dm3.chrom.sizes 74 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/dm3/seq 75 | gensz = 168736537 76 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/dm3/bwa_index/dm3.fa 77 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/dm3/bowtie2_index/dm3.fa 78 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/dm3/dm3.fa 79 | 80 | [pantro5] # installed by install_genome_data.sh 81 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/pantro5/pantro5.chrom.sizes 82 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/pantro5/seq 83 | gensz = 3231170666 84 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/pantro5/bwa_index/panTro5.fa 85 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/pantro5/bowtie2_index/panTro5.fa 86 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/pantro5/panTro5.fa 87 | 88 | [macam7] # installed by install_genome_data.sh 89 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/macam7/macam7.chrom.sizes 90 | seq_dir = /home/groups/cherry/encode/pipeline_genome_data/macam7/seq 91 | gensz = 2817542206 92 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/macam7/bwa_index/MacaM_Rhesus_Genome_v7.fasta 93 | bwt2_idx = /home/groups/cherry/encode/pipeline_genome_data/macam7/bowtie2_index/MacaM_Rhesus_Genome_v7.fasta 94 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/macam7/MacaM_Rhesus_Genome_v7.fasta 95 | nonamecheck = true # for bedtools >= 2.24. this prevents name convention error in bedtools intersect 96 | 97 | [saccer3] # installed by install_genome_data.sh 98 | chrsz = /home/groups/cherry/encode/pipeline_genome_data/saccer3/saccer3.chrom.sizes 99 | seq = /home/groups/cherry/encode/pipeline_genome_data/saccer3/seq 100 | gensz = 12157105 101 | bwa_idx = /home/groups/cherry/encode/pipeline_genome_data/saccer3/bwa_index/sacCer3.fa 102 | bwt2_idx= /home/groups/cherry/encode/pipeline_genome_data/saccer3/bowtie2_index/sacCer3.fa 103 | ref_fa = /home/groups/cherry/encode/pipeline_genome_data/saccer3/sacCer3.fa 104 | 105 | -------------------------------------------------------------------------------- /uninstall_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## conda environment name 4 | 5 | ENV_NAME=bds_atac 6 | ENV_NAME_PY3=bds_atac_py3 7 | 8 | conda env remove --name ${ENV_NAME} -y 9 | conda env remove --name ${ENV_NAME_PY3} -y 10 | -------------------------------------------------------------------------------- /utils/assign_multimappers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # piped script to take multimappers and randomly assign 4 | # requires a qname sorted file!! 
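#
# Summary of the stream filter below (describing the code as written):
# SAM records from stdin are grouped by QNAME; header lines ('@...') pass
# straight through. A QNAME group with fewer than -k alignments (the cutoff
# is doubled for paired-end data, since each alignment spans two lines) is
# written out in full for downstream filtering with samtools; groups at or
# above the cutoff are discarded entirely.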
5 | 6 | import sys 7 | import random 8 | import argparse 9 | 10 | def parse_args(): 11 | ''' 12 | Gives options 13 | ''' 14 | parser = argparse.ArgumentParser(description='Saves reads below an alignment threshold and discards all others') 15 | parser.add_argument('-k', help='Alignment number cutoff') 16 | parser.add_argument('--paired-end', dest='paired_ended', action='store_true', help='Data is paired-end') 17 | args = parser.parse_args() 18 | alignment_cutoff = int(args.k) 19 | paired_ended = args.paired_ended 20 | 21 | return alignment_cutoff, paired_ended 22 | 23 | 24 | if __name__ == "__main__": 25 | ''' 26 | Runs the filtering step of choosing multimapped reads 27 | ''' 28 | 29 | [alignment_cutoff, paired_ended] = parse_args() 30 | 31 | if paired_ended: 32 | alignment_cutoff = int(alignment_cutoff) * 2 33 | 34 | # Store each line in sam file as a list of reads, 35 | # where each read is a list of elements to easily 36 | # modify or grab things 37 | current_reads = [] 38 | current_qname = '' 39 | 40 | for line in sys.stdin: 41 | 42 | read_elems = line.strip().split('\t') 43 | 44 | if read_elems[0].startswith('@'): 45 | sys.stdout.write(line) 46 | continue 47 | 48 | # Keep taking lines that have the same qname 49 | if read_elems[0] == current_qname: 50 | # Add line to current reads 51 | current_reads.append(line) 52 | pass 53 | else: 54 | # Discard if there are more than the alignment cutoff 55 | if len(current_reads) >= alignment_cutoff: 56 | current_reads = [line] 57 | current_qname = read_elems[0] 58 | elif len(current_reads) > 0: 59 | # Just output all reads, which are then filtered with 60 | # samtools 61 | for read in current_reads: 62 | sys.stdout.write(str(read)) 63 | 64 | # And then discard 65 | current_reads = [line] 66 | current_qname = read_elems[0] 67 | else: 68 | # First read in file 69 | current_reads.append(line) 70 | current_qname = read_elems[0] 71 | 72 | # Flush the last QNAME group; the loop above only emits a completed 73 | # group when the next one starts, so handle the final group here 74 | if 0 < len(current_reads) < alignment_cutoff: 75 | for read in current_reads: 76 | sys.stdout.write(str(read)) 77 | -------------------------------------------------------------------------------- /utils/axt_dirfiles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,glob,gzip,os 4 | 5 | # axt format: http://genome.ucsc.edu/goldenPath/help/axt.html 6 | 7 | if len(sys.argv)!=3: 8 | print '<query chrom size file> <output file>\n Run under the dir of gzipped Axt files, presumably one for each target chr but that doesn\'t matter' 9 | sys.exit() 10 | 11 | chrsize={} 12 | with open(sys.argv[1]) as fin: 13 | for line in fin: 14 | lst=line.rstrip().split('\t') 15 | chrsize[lst[0]]=int(lst[1]) 16 | 17 | 18 | OF=sys.argv[2] 19 | 20 | fout=open(OF,'w') 21 | 22 | id=1 23 | 24 | for f in glob.glob('*'): 25 | fin=gzip.GzipFile(f,'r') 26 | line=fin.readline() 27 | while line: 28 | if line[0]!='#': 29 | lst=line.rstrip().split() 30 | # query start/stop 31 | a=0 32 | b=0 33 | if lst[7]=='+': 34 | a=int(lst[5])-1 35 | b=lst[6] 36 | else: 37 | c=chrsize[lst[4]] 38 | a=c-int(lst[6]) 39 | b=c-int(lst[5])+1 40 | 41 | fout.write('{0[1]}\t{2}\t{0[3]}\tid:{1},genomealign:{{chr:"{0[4]}",start:{3},stop:{4},strand:"{0[7]}",targetseq:'.format( 42 | lst, 43 | id, 44 | int(lst[2])-1, 45 | a, 46 | b 47 | )) 48 | id+=1 49 | line=fin.readline().rstrip() 50 | fout.write('"'+line+'",queryseq:') 51 | line=fin.readline().rstrip() 52 | fout.write('"'+line+'"}\n') 53 | fin.readline() 54 | line=fin.readline() 55 | 56 | 57 | fout.close() 58 | 59 | 60 | os.system('sort -k1,1 -k2,2n '+OF+' > xx') 61 | os.system('mv xx '+OF) 62 | os.system('bgzip -f '+OF) 63 | os.system('tabix -f -p bed '+OF+'.gz') 64 | 
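As an aside, the '-' strand branch above (a = size - end, b = size - start + 1) implements the standard axt coordinate flip: query coordinates are 1-based inclusive and, for '-' alignments, are given on the reverse-complemented query sequence, so they are mirrored against the chromosome length to obtain forward-strand, 0-based half-open intervals. A self-contained restatement of that arithmetic (illustrative only; the function name is ours, not part of the repo):

# Restates the strand-aware conversion used in axt_dirfiles.py above.
def axt_query_to_bed(start_1based, end_1based, strand, chrom_size):
    if strand == '+':
        return start_1based - 1, end_1based
    # '-' strand: mirror the interval onto the forward strand
    return chrom_size - end_1based, chrom_size - start_1based + 1

# e.g. on a 10 bp query chromosome, bases 1..3 of the '-' strand
# correspond to the forward-strand BED interval (7, 10)
assert axt_query_to_bed(1, 3, '-', 10) == (7, 10)
assert axt_query_to_bed(1, 3, '+', 10) == (0, 3)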
-------------------------------------------------------------------------------- /utils/bds_scr: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 2 ]; then 4 | echo 5 | echo "Create a detached screen for a BDS script and redirect stdout/stderr to a log file." 6 | echo "If you skip [LOG_FILE_NAME], a log file [SCR_NAME].log will be generated on the working directory." 7 | echo "If a log file already exists, stdout/stderr will be appended to it." 8 | echo "Monitor a log file with 'tail -f [LOG_FILE_NAME]'" 9 | echo 10 | echo "Usage: bds_scr [SCR_NAME] [LOG_FILE_NAME] [BDS_PARAM]" 11 | echo " Example: bds_scr TEST ~/TEST.log -s sge chipseq.bds -fastq1 ..." 12 | echo 13 | exit 0 14 | fi 15 | 16 | SCR_NAME="$1".BDS 17 | 18 | #if [ $(screen -ls $SCR_NAME | grep 'No Sockets' | wc -l) != "1" ]; then 19 | if [ $(screen -ls | grep -P "[\t ]\d+.$SCR_NAME" | wc -l) != "0" ]; then 20 | echo "error: A screen named $SCR_NAME already exists." 21 | exit 1 22 | else 23 | echo "[SCR_NAME] : $SCR_NAME" 24 | fi 25 | 26 | if [[ $2 == -* || $2 == *.bds ]]; then # LOG_FILE_NAME skipped 27 | LOG_FILE_NAME="$PWD/$SCR_NAME.log" 28 | PARAM_START_IDX=2 29 | elif [[ $3 == -* || $3 == *.bds ]]; then 30 | LOG_FILE_NAME=$2 31 | PARAM_START_IDX=3 32 | else 33 | echo "error: [BDS_PARAM] is wrong." 34 | exit 1 35 | fi 36 | 37 | PARAM= 38 | 39 | if [ $(find $LOG_FILE_NAME -mmin -2 2> /dev/null | wc -l) != "0" ]; then 40 | echo "error: log file handle is open or very fresh (modified in past 2 minutes)." 41 | exit 3 42 | fi 43 | 44 | for ((i=$PARAM_START_IDX;i<=$#;i++)); do 45 | PARAM="$PARAM ${!i}" 46 | done 47 | 48 | echo "[HOST] : $(hostname -f)" 49 | echo "[LOG_FILE_NAME] : $LOG_FILE_NAME" 50 | echo "[BDS_PARAM] : $PARAM" 51 | 52 | mkdir -p $(dirname $LOG_FILE_NAME) 53 | 54 | echo "" 55 | echo "===== Created a new screen ====" >> $LOG_FILE_NAME 56 | echo "[DATE] : $(date)" >> $LOG_FILE_NAME 57 | echo "[HOST] : $(hostname -f)" >> $LOG_FILE_NAME 58 | echo "[SCR_NAME] : $SCR_NAME" >> $LOG_FILE_NAME 59 | echo "[BDS_PARAM] : $PARAM" >> $LOG_FILE_NAME 60 | echo "" >> $LOG_FILE_NAME 61 | 62 | screen -Sdm $SCR_NAME bash -c "bds &>>$LOG_FILE_NAME $PARAM $>>$LOG_FILE_NAME" 63 | 64 | -------------------------------------------------------------------------------- /utils/bds_scr_5min: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 2 ]; then 4 | echo 5 | echo "Create a detached screen for a BDS script and redirect stdout/stderr to a log file." 6 | echo "If you skip [LOG_FILE_NAME], a log file [SCR_NAME].log will be generated on the working directory." 7 | echo "If a log file already exists, stdout/stderr will be appended to it." 8 | echo "Monitor a log file with 'tail -f [LOG_FILE_NAME]'" 9 | echo 10 | echo "Usage: bds_scr [SCR_NAME] [LOG_FILE_NAME] [BDS_PARAM]" 11 | echo " Example: bds_scr TEST ~/TEST.log -s sge chipseq.bds -fastq1 ..." 12 | echo 13 | exit 0 14 | fi 15 | 16 | SCR_NAME=$1.BDS 17 | 18 | #if [ $(screen -ls $SCR_NAME | grep 'No Sockets' | wc -l) != "1" ]; then 19 | if [ $(screen -ls | grep -P "[\t ]\d+.$SCR_NAME" | wc -l) != "0" ]; then 20 | echo "error: A screen named $SCR_NAME already exists." 
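# 'screen -ls' lists sessions as "<pid>.<name>", so the grep above detects a
# live session with this exact name; refuse to start a duplicate.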
21 | exit 1 22 | else 23 | echo "[SCR_NAME] : $SCR_NAME" 24 | fi 25 | 26 | if [[ $2 == -* || $2 == *.bds ]]; then # LOG_FILE_NAME skipped 27 | LOG_FILE_NAME="$PWD/$SCR_NAME.log" 28 | PARAM_START_IDX=2 29 | elif [[ $3 == -* || $3 == *.bds ]]; then 30 | LOG_FILE_NAME=$2 31 | PARAM_START_IDX=3 32 | else 33 | echo "error: [BDS_PARAM] is wrong." 34 | exit 2 35 | fi 36 | 37 | if [ $(find $LOG_FILE_NAME -mmin -5 | wc -l) != "0" ]; then 38 | echo "error: log file handle is open or very fresh (modified in past 5 minutes)." 39 | exit 3 40 | fi 41 | 42 | PARAM= 43 | 44 | for ((i=$PARAM_START_IDX;i<=$#;i++)); do 45 | PARAM="$PARAM ${!i}" 46 | done 47 | 48 | echo "[LOG_FILE_NAME] : $LOG_FILE_NAME" 49 | echo "[BDS_PARAM] : $PARAM" 50 | 51 | mkdir -p $(dirname $LOG_FILE_NAME) 52 | 53 | echo "" 54 | echo "===== Created a new screen ====" >> $LOG_FILE_NAME 55 | echo "[DATE] : $(date)" >> $LOG_FILE_NAME 56 | echo "[HOST] : $(hostname -f)" >> $LOG_FILE_NAME 57 | echo "[SCR_NAME] : $SCR_NAME" >> $LOG_FILE_NAME 58 | echo "[BDS_PARAM] : $PARAM" >> $LOG_FILE_NAME 59 | echo "" >> $LOG_FILE_NAME 60 | 61 | screen -Sdm $SCR_NAME bash -c "bds &>>$LOG_FILE_NAME $PARAM $>>$LOG_FILE_NAME" 62 | 63 | -------------------------------------------------------------------------------- /utils/broadpeak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,os 4 | 5 | if len(sys.argv)!=3: 6 | print '<infile> <outfile>' 7 | sys.exit() 8 | 9 | infile,outfile=sys.argv[1:] 10 | 11 | # all values on 9th field are -1, exclude them 12 | 13 | id=1 14 | fout=open(outfile,'w') 15 | with open(infile) as fin: 16 | for line in fin: 17 | lst=line.rstrip().split('\t') 18 | fout.write('{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]}],id:{1},'.format(lst,id)) 19 | id+=1 20 | if len(lst[3])>1: 21 | fout.write('name:"'+lst[3]+'",') 22 | if lst[5]!='.': 23 | fout.write('strand:"'+lst[5]+'",') 24 | fout.write('\n') 25 | fout.close() 26 | 27 | os.system('sort -k1,1 -k2,2n '+outfile+' > '+outfile+'.srt') 28 | os.system('mv '+outfile+'.srt'+' '+outfile) 29 | os.system('bgzip -f '+outfile) 30 | os.system('tabix -f -p bed '+outfile+'.gz') 31 | -------------------------------------------------------------------------------- /utils/clusterGeneric/kill.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #------------------------------------------------------------------------------- 4 | # BDS generic cluster example 5 | # 6 | # This is a trivial example of the 'cluster generic' interface implementation. 7 | # The commands implemented in this example simply pass the proper arguments 8 | # to qsub, qdel or qstat commands. 9 | # This is intended as a toy example, since bds can do this directly (but 10 | # it's a good starting point to extend your own implementation). 11 | # 12 | # The script is called when a task is killed 13 | # 14 | # Script's output: 15 | # None 16 | # 17 | # Command line arguments: 18 | # jobId: This is the jobId returned as the first line in 'clusterGenericRun' 19 | # script (i.e. 
the jobID provided by the cluster management system) 20 | # 21 | # Pablo Cingolani 22 | #------------------------------------------------------------------------------- 23 | 24 | #--- 25 | # Parse command line arguments 26 | #--- 27 | die "Error: Missing arguments.\nUsage: kill.pl jobId\n" if $#ARGV < 0 ; 28 | #$jobId = shift @ARGV; 29 | $jobId = join(' ', @ARGV); 30 | 31 | #--- 32 | # Execute cluster command to kill task 33 | #--- 34 | $exitCode = system "scancel $jobId"; 35 | 36 | # OK 37 | exit($exitCode); 38 | 39 | -------------------------------------------------------------------------------- /utils/clusterGeneric/postMortemInfo.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #------------------------------------------------------------------------------- 4 | # BDS generic cluster example 5 | # 6 | # This is a trivial example of the 'cluster generic' interface implementation. 7 | # The commands implemented in this example simply pass the proper arguments 8 | # to qsub, qdel or qstat commands. 9 | # This is intended as a toy example, since bds can do this directly (but 10 | # it's a good starting point to extend your own implementation). 11 | # 12 | # The following command is executed in order to get information on a recently 13 | # finished jobId. This information is typically used for debugging and is added 14 | # to bds's output. 15 | # 16 | # Script's output: 17 | # The output is not parsed; it is stored and later shown 18 | # in bds's report. It should contain information relevant 19 | # to the job's execution (e.g. "qstat -f $jobId" or 20 | # "checkjob -v $jobId") 21 | # 22 | # Command line arguments: 23 | # jobId: This is the jobId returned as the first line in 'clusterGenericRun' 24 | # script (i.e. the jobID provided by the cluster management system) 25 | # 26 | # Pablo Cingolani 27 | #------------------------------------------------------------------------------- 28 | 29 | #--- 30 | # Parse command line arguments 31 | #--- 32 | die "Error: Missing arguments.\nUsage: postMortemInfo.pl jobId\n" if $#ARGV < 0 ; 33 | $jobId = shift @ARGV; 34 | 35 | #--- 36 | # Execute cluster command to show task details 37 | #--- 38 | $exitCode = system "squeue -j $jobId"; 39 | 40 | # OK 41 | exit($exitCode); 42 | 43 | -------------------------------------------------------------------------------- /utils/clusterGeneric/run.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use POSIX; 4 | 5 | die "Error: Missing arguments.\nUsage: run.pl timeout cpus mem queue saveStdout saveStderr cmd arg1 ... argN\n" if $#ARGV < 6 ; 6 | 7 | $timeout = shift @ARGV; 8 | $cpus = shift @ARGV; 9 | $mem = shift @ARGV; 10 | $queue = shift @ARGV; 11 | $saveStdout = shift @ARGV; 12 | $saveStderr = shift @ARGV; 13 | $cmd = join(' ', @ARGV); 14 | 15 | $qsub = "sbatch --export=ALL "; 16 | $qsub .= "-n 1 --ntasks-per-node=1 --cpus-per-task=$cpus " if( $cpus > 0 ); 17 | if( $mem > 0 ) { 18 | $mem = ceil($mem/1000000); # MB 19 | $qsub .= "--mem-per-cpu $mem "; 20 | } 21 | if( $timeout > 0 ) { 22 | $timeout = ceil($timeout/60); # minute 23 | $qsub .= "-t $timeout "; 24 | } 25 | if ( $queue ne "" ) { 26 | $qsub .= "-p $queue " 27 | } 28 | 29 | $pid = open QSUB, " | $qsub"; 30 | die "Cannot run command '$qsub'\n" if ! kill(0, $pid); # Check that process exists 31 | print QSUB "#!/bin/sh \n"; # SLURM sbatch needs this shebang... 
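# Note: the task body is streamed to sbatch on STDIN through the pipe opened
# above, so no temporary job-script file is ever written to disk.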
32 | print QSUB "$cmd\n"; # Send cluster's task via qsub's STDIN 33 | close QSUB; 34 | 35 | exit(0); 36 | 37 | -------------------------------------------------------------------------------- /utils/clusterGeneric/stat.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | #------------------------------------------------------------------------------- 4 | # BDS generic cluster example 5 | # 6 | # This is a trivial example of the 'cluster generic' interface implementation. 7 | # The commands implemented in this example simply pass the proper arguments 8 | # to qsub, qdel or qstat commands. 9 | # This is intended as a toy example, since bds can do this directly (but 10 | # it's a good starting point to extend your own implementation). 11 | # 12 | # This script is executed in order to show the jobID of all jobs currently 13 | # scheduled in the cluster 14 | # 15 | # Script's output: 16 | # This script is expected to print all jobs currently scheduled or 17 | # running in the cluster (e.g. qstat). One per line. The FIRST column 18 | # should be the jobID (columns are space or tab separated). Other 19 | # columns may exist (but are currently ignored). 20 | # 21 | # Command line arguments: 22 | # None 23 | # 24 | # Pablo Cingolani 25 | #------------------------------------------------------------------------------- 26 | 27 | #--- 28 | # Execute cluster command to show all tasks 29 | #--- 30 | $exitCode = system "squeue"; 31 | 32 | # OK 33 | exit($exitCode); 34 | -------------------------------------------------------------------------------- /utils/detect_adapter.py: -------------------------------------------------------------------------------- 1 | # written by Nathan Boley, from https://github.com/nboley/GGR_code 2 | 3 | import sys 4 | import gzip 5 | 6 | VERBOSE = False 7 | 8 | adapters = { 9 | 'Illumina': b'AGATCGGAAGAGC', 10 | 'Nextera ': b'CTGTCTCTTATA', 11 | 'smallRNA': b'TGGAATTCTCGG' 12 | } 13 | 14 | def detect_adapters_and_cnts(fname, max_n_lines=1000000): 15 | adapter_cnts = { 16 | 'Illumina': 0, 17 | 'Nextera ': 0, 18 | 'smallRNA': 0 19 | } 20 | 21 | with gzip.open(fname) as fp: 22 | # read the first million sequences or to the end of the file -- whichever 23 | # comes first, and then use the adapter for trimming which was found to 24 | # occur most often 25 | for seq_index, line in enumerate(fp): 26 | if seq_index >= max_n_lines: break 27 | if seq_index%4 != 1: continue 28 | for key in adapters: 29 | if line.find(adapters[key]) > -1: 30 | adapter_cnts[key] += 1 31 | 32 | observed_adapters = [ 33 | adapter for adapter, cnt in sorted( 34 | adapter_cnts.items(), key=lambda x: -x[1]) 35 | if cnt > 0 36 | ] 37 | return observed_adapters, adapter_cnts, seq_index//4 38 | 39 | def detect_most_likely_adapter(fname): 40 | observed_adapters, adapter_cnts, n_obs_adapters = detect_adapters_and_cnts(fname) 41 | if observed_adapters: 42 | best_adapter = observed_adapters[0] 43 | else: 44 | best_adapter = "" 45 | 46 | if VERBOSE: 47 | print("\n\nAUTO-DETECTING ADAPTER TYPE\n===========================") 48 | print("Attempting to auto-detect adapter type from the first 1 million sequences of the first file (>> {} <<)\n".format( 49 | fname) 50 | ) 51 | print("Found perfect matches for the following adapter sequences:") 52 | print("Adapter type\tCount\tSequence\tSequences analysed\tPercentage") 53 | for adapter in observed_adapters: 54 | print("{}\t{}\t{}\t{}\t\t\t{:.2%}".format( 55 | adapter, 56 | adapter_cnts[adapter], 57 | 
adapters[adapter].decode(), 58 | n_obs_adapters, 59 | adapter_cnts[adapter]/n_obs_adapters) 60 | ) 61 | return best_adapter 62 | 63 | def main(): 64 | global VERBOSE 65 | VERBOSE = True 66 | best_adapter = detect_most_likely_adapter(sys.argv[1]) 67 | print(best_adapter) 68 | 69 | if __name__ == '__main__': 70 | main() 71 | -------------------------------------------------------------------------------- /utils/gappedpeak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,os 4 | 5 | if len(sys.argv)!=3: 6 | print '<infile> <outfile>' 7 | sys.exit() 8 | 9 | infile,outfile=sys.argv[1:] 10 | 11 | id=1 12 | fout=open(outfile,'w') 13 | with open(infile) as fin: 14 | for line in fin: 15 | lst=line.rstrip().split('\t') 16 | fout.write('{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[12]},{0[13]},{0[14]}],id:{1},struct:{{thin:[[{0[1]},{0[2]}]],thick:['.format(lst,id)) 17 | id+=1 18 | a=int(lst[1]) 19 | sizes=lst[10].split(',') 20 | starts=lst[11].split(',') 21 | for i in range(len(sizes)): 22 | fout.write('[{0},{1}],'.format(a+int(starts[i]),a+int(starts[i])+int(sizes[i]))) 23 | fout.write(']},') 24 | 25 | if len(lst[3])>1: 26 | fout.write('name:"'+lst[3]+'",') 27 | if lst[5]!='.': 28 | fout.write('strand:"'+lst[5]+'",') 29 | fout.write('\n') 30 | 31 | fout.close() 32 | 33 | os.system('sort -k1,1 -k2,2n '+outfile+' > '+outfile+'.srt') 34 | os.system('mv '+outfile+'.srt'+' '+outfile) 35 | os.system('bgzip -f '+outfile) 36 | os.system('tabix -f -p bed '+outfile+'.gz') 37 | -------------------------------------------------------------------------------- /utils/get_read_length_from_fastq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # code extracted from Daniel Kim's ATAQC module (run_ataqc.py) 3 | 4 | import os, sys, re, gzip 5 | 6 | def getFileHandle(filename, mode="r"): 7 | if (re.search('.gz$',filename) or re.search('.gzip',filename)): 8 | if (mode=="r"): 9 | mode="rb"; 10 | return gzip.open(filename,mode) 11 | else: 12 | return open(filename,mode) 13 | 14 | def get_read_length(fastq_file): 15 | ''' 16 | Get read length out of fastq file 17 | ''' 18 | total_reads_to_consider = 1000000 19 | line_num = 0 20 | total_reads_considered = 0 21 | max_length = 0 22 | with getFileHandle(fastq_file, 'rb') as fp: 23 | for line in fp: 24 | if line_num % 4 == 1: 25 | if len(line.strip()) > max_length: 26 | max_length = len(line.strip()) 27 | total_reads_considered += 1 28 | if total_reads_considered >= total_reads_to_consider: 29 | break 30 | line_num += 1 31 | 32 | return int(max_length) 33 | 34 | def main(): 35 | print(get_read_length(sys.argv[1])) 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /utils/kill_scr: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -lt 1 ]; then 4 | echo 5 | echo "Kill a screen with name [SCR_NAME]" 6 | echo "Usage : kill_scr [SCR_NAME]" 7 | echo 8 | screen -ls 9 | exit 1 10 | fi 11 | 12 | screen -X -R $1 quit 13 | -------------------------------------------------------------------------------- /utils/narrowpeak.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | import sys,os 4 | 5 | if len(sys.argv)!=3: 6 | print '<infile> <outfile>' 7 | sys.exit() 8 | 9 | infile,outfile=sys.argv[1:] 10 | 11 | id=1 12 | fout=open(outfile,'w') 13 | with open(infile) as fin: 14 | 
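    # narrowPeak is BED6+4 (see etc/narrowPeak.as): chrom, start, end, name,
    # score, strand, then signalValue, pValue, qValue and the summit offset.
    # Columns 7-9 are carried below as scorelst; the summit offset
    # (column 10, -1 if absent) becomes sbstroke.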
for line in fin: 15 | lst=line.rstrip().split('\t') 16 | fout.write('{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]},{0[8]}],id:{1},'.format(lst,id)) 17 | id+=1 18 | if len(lst[3])>1: 19 | fout.write('name:"'+lst[3]+'",') 20 | if lst[5]!='.': 21 | fout.write('strand:"'+lst[5]+'",') 22 | if lst[9]!='-1': 23 | fout.write('sbstroke:['+lst[9]+']') 24 | fout.write('\n') 25 | 26 | fout.close() 27 | 28 | os.system('sort -k1,1 -k2,2n '+outfile+' > '+outfile+'.srt') 29 | os.system('mv '+outfile+'.srt'+' '+outfile) 30 | os.system('bgzip -f '+outfile) 31 | os.system('tabix -f -p bed '+outfile+'.gz') 32 | -------------------------------------------------------------------------------- /utils/narrowpeak_idr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # show -log10(GLOBAL IDR SCORE) instead of narrowpeak pval 4 | 5 | import sys,os 6 | 7 | if len(sys.argv)!=3: 8 | print '<infile> <outfile>' 9 | sys.exit() 10 | 11 | infile,outfile=sys.argv[1:] 12 | 13 | id=1 14 | fout=open(outfile,'w') 15 | with open(infile) as fin: 16 | for line in fin: 17 | lst=line.rstrip().split('\t') 18 | fout.write('{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]},{0[8]},{0[10]},{0[11]}],id:{1},'.format(lst,id)) 19 | id+=1 20 | if len(lst[3])>1: 21 | fout.write('name:"'+lst[3]+'",') 22 | else: 23 | fout.write('name:"'+str(id)+'",') 24 | if lst[5]!='.': 25 | fout.write('strand:"'+lst[5]+'",') 26 | if lst[9]!='-1': 27 | fout.write('sbstroke:['+lst[9]+']') 28 | fout.write('\n') 29 | 30 | fout.close() 31 | 32 | os.system('sort -k1,1 -k2,2n '+outfile+' > '+outfile+'.srt') 33 | os.system('mv '+outfile+'.srt'+' '+outfile) 34 | os.system('bgzip -f '+outfile) 35 | os.system('tabix -f -p bed '+outfile+'.gz') 36 | -------------------------------------------------------------------------------- /utils/parse_summary_ENCODE_accession_recursively.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # written by Jin Lee, 2016 4 | 5 | import os, sys 6 | import json 7 | import subprocess 8 | import collections 9 | import argparse 10 | 11 | parser = argparse.ArgumentParser(prog='ENCODE_summary.json parser for ENCODE accession', \ 12 | description='Recursively find ENCODE_summary.json, parse it and make a CSV for uploading to the ENCODE portal. Use https://github.com/ENCODE-DCC/pyencoded-tools/blob/master/ENCODE_submit_files.py for uploading.') 13 | parser.add_argument('--out-file', type=argparse.FileType('w'), default=sys.stdout, \ 14 | help='Output CSV filename') 15 | parser.add_argument('--search-dir', type=str, default='.', \ 16 | help='Root directory to search for ENCODE_summary.json') 17 | parser.add_argument('--json-file', type=str, default='ENCODE_summary.json', \ 18 | help='Specify json file name to be parsed') 19 | parser.add_argument('--sort-by-genome-and-exp', dest='sort_by_genome_and_exp', action='store_true', \ 20 | help='Sort rows by genomes and ENCODE experiment accession ID') 21 | group_accession_ids = parser.add_mutually_exclusive_group() 22 | group_accession_ids.add_argument('--ignored-accession-ids-file', type=str, \ 23 | help='Accession IDs in this text file will be ignored. (1 acc. ID per line)') 24 | group_accession_ids.add_argument('--accession-ids-file', type=str, \ 25 | help='Only accession IDs in this text file will be downloaded. (1 acc. ID per line). 
Others will be ignored.') 26 | parser.set_defaults(sort_by_genome_and_exp=False) 27 | 28 | args = parser.parse_args() 29 | 30 | # loaded ignored accession list 31 | ignored_accession_ids = [] 32 | if args.ignored_accession_ids_file and os.path.isfile(args.ignored_accession_ids_file): 33 | with open(args.ignored_accession_ids_file,'r') as f: 34 | ignored_accession_ids = f.read().splitlines() 35 | ignored_accession_ids = \ 36 | [accession_id for accession_id in ignored_accession_ids if accession_id and not accession_id.startswith("#") ] 37 | accession_ids = [] 38 | if args.accession_ids_file and os.path.isfile(args.accession_ids_file): 39 | with open(args.accession_ids_file,'r') as f: 40 | accession_ids = f.read().splitlines() 41 | accession_ids = \ 42 | [accession_id for accession_id in accession_ids if accession_id and not accession_id.startswith("#") ] 43 | 44 | # find all ENCODE_summary.json recursively 45 | json_files = subprocess.check_output("find -L %s -name %s" % (args.search_dir,args.json_file), \ 46 | shell=True ).strip().split('\n') 47 | # read json 48 | jsons = [] 49 | for json_file in json_files: 50 | with open(json_file,'r') as f: 51 | jsons.append( json.load(f) ) 52 | 53 | # look at headers first 54 | raw_headers = list() 55 | 56 | for json in jsons: 57 | if not 'data_files' in json: 58 | continue 59 | data_files = json['data_files'] 60 | for data_file in data_files: 61 | for key in data_file: 62 | if not key in raw_headers: 63 | raw_headers.append( key ) 64 | # sort header 65 | order_by_header = collections.defaultdict(int, \ 66 | { 67 | 'file_format':20, 68 | 'file_format_type':19, 69 | 'output_type':18, 70 | 'dataset':17, 71 | 'assembly':16, 72 | 'aliases:array':15, 73 | 'derived_from:array':14, 74 | 'md5sum':13, 75 | 'award':12, 76 | 'lab':11, 77 | 'submitted_file_name':10, 78 | }) 79 | 80 | headers = sorted(raw_headers, key=lambda x: order_by_header[x], reverse=True) 81 | 82 | # write header 83 | args.out_file.write( ','.join( headers ) +'\n') 84 | 85 | lines = list() 86 | 87 | def find_submitted_file_name( submitted_file_name ): 88 | # recursively find file under a working directory and return path relative to working dir. 89 | files = subprocess.check_output("find . -type f -name '%s'" % (submitted_file_name), \ 90 | shell=True ).strip().split('\n') 91 | return files[0] 92 | 93 | # for each replicate, write contents 94 | for json in jsons: 95 | if not 'data_files' in json: 96 | continue 97 | if ignored_accession_ids and json['ENCODE_accession'] in ignored_accession_ids: continue 98 | if accession_ids and not json['ENCODE_accession'] in accession_ids: continue 99 | data_files = json['data_files'] 100 | for data_file in data_files: 101 | line = collections.OrderedDict() 102 | for key in headers: 103 | if key in data_file: 104 | if key == 'submitted_file_name': 105 | line[key] = find_submitted_file_name( data_file[key] ) 106 | metadata_file = line[key]+'.meta' 107 | if os.path.exists(metadata_file): 108 | with open(metadata_file,mode='r') as f: 109 | for l in f: 110 | if 'md5sum' in l: 111 | md5sum = l.split('=')[1].strip() 112 | if md5sum != data_file['md5sum']: 113 | print('Warning: In accession {}, md5sum of file {} does not match! 
(json:{}, actual:{})'.format( 114 | json['ENCODE_accession'], line[key], data_file['md5sum'], md5sum )) 115 | else: 116 | line[key] = data_file[key] 117 | else: 118 | line[key] = "" 119 | lines.append(line) 120 | 121 | order_by_file_format = collections.defaultdict(int, \ 122 | { 123 | 'bam':20, 124 | 'tagAlign':19, 125 | 'bigWig':18, 126 | 'bed':17, 127 | 'bigBed':16, 128 | }) 129 | order_by_output_type = collections.defaultdict(int, \ 130 | { 131 | 'alignments':20, 132 | 'unfiltered alignments':19, 133 | 'signal p-value':18, 134 | 'fold change over control':17, 135 | 'filtered peaks':16, 136 | 'replicated peaks':15, 137 | 'idr thresholded peaks':14, 138 | 'optimal idr thresholded peaks':13, 139 | 'conservative idr thresholded peaks':12, 140 | }) 141 | 142 | # sort lines 143 | sorted_lines = sorted(lines, key = lambda x: (\ 144 | order_by_file_format[x['file_format']],\ 145 | order_by_output_type[x['output_type']]), reverse=True) 146 | 147 | if args.sort_by_genome_and_exp: 148 | sorted_lines = sorted(sorted_lines, key = lambda x: (\ 149 | x['assembly'],\ 150 | x['dataset']) ) 151 | 152 | for line in sorted_lines: 153 | result = '' 154 | for key in headers: 155 | result += (line[key]+ ('' if key==headers[-1] else ',')) 156 | args.out_file.write( result + '\n' ) 157 | -------------------------------------------------------------------------------- /utils/parse_summary_ENCODE_qc_recursively.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | # written by Jin Lee, 2016 4 | 5 | import os, sys 6 | import json 7 | import subprocess 8 | import collections 9 | import argparse 10 | import xlwt 11 | 12 | parser = argparse.ArgumentParser(prog='ENCODE_summary.json parser for ENCODE QC import', \ 13 | description='Recursively find ENCODE_summary.json, parse it and make an excel file for importing quality metrics to the ENCODE portal. Use https://github.com/ENCODE-DCC/pyencoded-tools/blob/master/ENCODE_import_data.py for uploading.') 14 | parser.add_argument('out_file', metavar='out-file', type=str, \ 15 | help='Output Excel filename (extention should be .xls, not .xlsx)') 16 | parser.add_argument('--search-dir', type=str, default='.', \ 17 | help='Root directory to search for ENCODE_summary.json') 18 | parser.add_argument('--json-file', type=str, default='ENCODE_summary.json', \ 19 | help='Specify json file name to be parsed') 20 | parser.add_argument('--sort-by-genome-and-exp', dest='sort_by_genome_and_exp', action='store_true', \ 21 | help='Sort rows by genomes and ENCODE experiment accession ID') 22 | group_accession_ids = parser.add_mutually_exclusive_group() 23 | group_accession_ids.add_argument('--ignored-accession-ids-file', type=str, \ 24 | help='Accession IDs in this text file will be ignored. (1 acc. ID per line)') 25 | group_accession_ids.add_argument('--accession-ids-file', type=str, \ 26 | help='Only accession IDs in this text file will be parsed. (1 acc. ID per line). 
Others will be ignored.') 27 | parser.set_defaults(sort_by_genome_and_exp=False) 28 | 29 | args = parser.parse_args() 30 | 31 | # loaded ignored accession list 32 | ignored_accession_ids = [] 33 | if args.ignored_accession_ids_file and os.path.isfile(args.ignored_accession_ids_file): 34 | with open(args.ignored_accession_ids_file,'r') as f: 35 | ignored_accession_ids = f.read().splitlines() 36 | ignored_accession_ids = \ 37 | [accession_id for accession_id in ignored_accession_ids if accession_id and not accession_id.startswith("#") ] 38 | accession_ids = [] 39 | if args.accession_ids_file and os.path.isfile(args.accession_ids_file): 40 | with open(args.accession_ids_file,'r') as f: 41 | accession_ids = f.read().splitlines() 42 | accession_ids = \ 43 | [accession_id for accession_id in accession_ids if accession_id and not accession_id.startswith("#") ] 44 | 45 | # find all ENCODE_summary.json recursively 46 | json_files = subprocess.check_output("find -L %s -name %s" % (args.search_dir,args.json_file), \ 47 | shell=True ).strip().split('\n') 48 | # read json 49 | jsons = [] 50 | for json_file in json_files: 51 | with open(json_file,'r') as f: 52 | jsons.append( json.load(f) ) 53 | 54 | # look at headers first 55 | raw_headers = dict() 56 | 57 | for json in jsons: 58 | if ignored_accession_ids and json['ENCODE_accession'] in ignored_accession_ids: continue 59 | if accession_ids and not json['ENCODE_accession'] in accession_ids: continue 60 | 61 | if not 'ENCODE_quality_metrics' in json: continue 62 | data_files = json['ENCODE_quality_metrics'] 63 | for data_file in data_files: 64 | print data_file 65 | ENCODE_qc_type = data_file["ENCODE_qc_type"] 66 | if not raw_headers.has_key( "ENCODE_qc_type" ): 67 | raw_headers[ ENCODE_qc_type ] = list() 68 | for key in data_file: 69 | if key == "ENCODE_qc_type": continue 70 | if not key in raw_headers[ ENCODE_qc_type ]: 71 | raw_headers[ ENCODE_qc_type ].append( key ) 72 | 73 | # write header (fhs=file handles) 74 | workbook = xlwt.Workbook() 75 | sheets = {} 76 | 77 | cnt=0 78 | for ENCODE_qc_type in raw_headers: 79 | title = "".join([word.title().replace("Idr","IDR") for word in ENCODE_qc_type.split("_")]) 80 | print "Creating a sheet with name: ", title 81 | # sheet = workbook.add_sheet(str(cnt)) 82 | sheet = workbook.add_sheet(title) 83 | sheets[ENCODE_qc_type] = sheet 84 | for i, header in enumerate(raw_headers[ENCODE_qc_type]): 85 | sheet.write(0,i,header) 86 | cnt+=1 87 | # fh = open( "%s.%s.tsv" % (args.out_file_prefix,ENCODE_qc_type) ,'w') 88 | # fh.write(delimiter.join(raw_headers[ENCODE_qc_type])) 89 | # fh.write("\n") 90 | # fhs[ENCODE_qc_type] = fh 91 | 92 | # for each replicate, write contents 93 | lines = dict() 94 | for json in jsons: 95 | if ignored_accession_ids and json['ENCODE_accession'] in ignored_accession_ids: continue 96 | if accession_ids and not json['ENCODE_accession'] in accession_ids: continue 97 | data_files = json['ENCODE_quality_metrics'] 98 | for data_file in data_files: 99 | ENCODE_qc_type = data_file["ENCODE_qc_type"] 100 | if not lines.has_key(ENCODE_qc_type): 101 | lines[ENCODE_qc_type] = list() 102 | line = collections.OrderedDict() 103 | for key in raw_headers[ENCODE_qc_type]: 104 | if key in data_file: 105 | line[key] = data_file[key] 106 | else: 107 | line[key] = "" 108 | lines[ENCODE_qc_type].append(line) 109 | 110 | def is_float(s): 111 | try: 112 | float(s) 113 | return True 114 | except ValueError: 115 | return False 116 | 117 | def is_int(s): 118 | try: 119 | int(s) 120 | return True 121 | except 
122 |         return False
123 | 
124 | def is_bool(s):
125 |     if s.lower() in ['true','t','false','f']:
126 |         return True
127 |     else:
128 |         return False
129 | 
130 | sorted_lines = lines
131 | for ENCODE_qc_type in sorted_lines:
132 |     data = sorted_lines[ENCODE_qc_type]
133 |     sheet = sheets[ENCODE_qc_type]
134 |     row = 1
135 |     for line in data:
136 |         for col, key in enumerate(line):
137 |             val = line[key]
138 |             if key.endswith('_pct'):
139 |                 val += "%"
140 |             if val.startswith('null') or val.startswith('N/A:N/A'):
141 |                 val = "null"
142 |             if is_int(val):
143 |                 val = int(val)
144 |             elif is_float(val):
145 |                 val = float(val)
146 |             # elif is_bool(val):
147 |             # else:
148 |             #     style = xlwt.easyxf()
149 |             sheet.write(row, col, label=val)
150 |         row += 1
151 | 
152 | workbook.save(args.out_file)
153 | 
--------------------------------------------------------------------------------
/utils/parse_summary_qc_recursively.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | 
3 | # written by Jin Lee, 2016
4 | 
5 | import os
6 | import sys
7 | import re
8 | import argparse
9 | import json
10 | import subprocess
11 | from collections import OrderedDict
12 | 
13 | parser = argparse.ArgumentParser(prog='ENCODE_summary.json parser for QC', \
14 |     description='Recursively find ENCODE_summary.json, parse it and make a TSV spreadsheet of QC metrics.')
15 | parser.add_argument('--out-file', type=argparse.FileType('w'), default=sys.stdout, \
16 |     help='Output TSV filename')
17 | parser.add_argument('--search-dir', type=str, default='.', \
18 |     help='Root directory to search for ENCODE_summary.json')
19 | parser.add_argument('--json-file', type=str, default='ENCODE_summary.json', \
20 |     help='Specify json file name to be parsed')
21 | 
22 | args = parser.parse_args()
23 | 
24 | # find all qc_summary.json recursively
25 | # json_files = [os.path.join(dp, f) for dp, dn, filenames in os.walk(os.getcwd()) \
26 | #     for f in filenames if os.path.splitext(f)[1] == 'qc_summary.json']
27 | 
28 | # find all ENCODE_summary.json recursively
29 | json_files = subprocess.check_output("find -L %s -name %s" % (args.search_dir,args.json_file), \
30 |     shell=True ).strip().split('\n')
31 | # read json
32 | jsons = []
33 | for json_file in json_files:
34 |     with open(json_file,'r') as f:
35 |         jsons.append( json.load(f, object_pairs_hook=OrderedDict) )
36 | 
37 | # sort
38 | # sorted_jsons = sorted(jsons, key = lambda x: (\
39 | #     x['ENCODE_award_rfa'], \
40 | #     x['ENCODE_assay_category'], \
41 | #     x['ENCODE_assay_title'], \
42 | #     x['species'], \
43 | #     x['title']))
44 | 
45 | # look at headers first
46 | headers = OrderedDict()
47 | headers['common'] = [\
48 |     'ENCODE award rfa',\
49 |     'ENCODE assay category',\
50 |     'ENCODE assay title',\
51 |     'species',\
52 |     'title',\
53 |     'replicate']
54 | 
55 | # first take longest header for each qc_type
56 | for json in jsons:
57 |     for qc_file in json['qc_files']:
58 |         qc_type = qc_file['qc_type']
59 |         if qc_type == 'pbc_PE':
60 |             qc_type = 'pbc'
61 |             qc_file['qc_type'] = qc_type
62 |         header_list = qc_file['header'].split('\t')
63 |         if not qc_type in headers or len(headers[qc_type]) < len(header_list):
64 |             headers[qc_type] = header_list
--------------------------------------------------------------------------------
/utils/reassemble.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | 
3 | import sys
4 | 
5 | if len(sys.argv)!=3:
6 |     print '<infile> <outname>'
7 |     sys.exit()
8 | 
9 | infile,outn=sys.argv[1:]
10 | 
11 | aliencoord=0
12 | alienchrid=1
13 | id1=1
14 | id2=1
15 | fn1=outn+'_native'
16 | fn2=outn+'_alien'
17 | fout1=open(fn1,'w')
18 | fout2=open(fn2,'w')
19 | 
20 | chrname='scaffold_'
21 | 
22 | with open(infile) as fin:
23 |     for line in fin:
24 |         lst=line.rstrip().split('\t')
25 |         if len(lst)==1:
26 |             print '{2}{0}:{1}'.format(alienchrid,aliencoord,chrname)
27 |             aliencoord=0
28 |             alienchrid+=1
29 |             continue
30 |         a=int(lst[1])
31 |         b=int(lst[2])
32 | 
33 |         if a>=b:
34 |             print 'wrong line: '+line
35 |             sys.exit()
36 | 
37 |         # native
38 |         fout1.write('{0}\t{1}\t{2}\tid:{3},genomealign:{{chr:"{8}{4}",start:{5},stop:{6},strand:"{7}"}}\n'.format(
39 |             lst[0],a,b,
40 |             id1,
41 |             alienchrid,
42 |             aliencoord,
43 |             aliencoord+b-a,
44 |             lst[3],
45 |             chrname
46 |             ))
47 |         id1+=1
48 |         # alien
49 |         fout2.write('{8}{0}\t{1}\t{2}\tid:{3},genomealign:{{chr:"{4}",start:{5},stop:{6},strand:"{7}"}}\n'.format(
50 |             alienchrid,
51 |             aliencoord,
52 |             aliencoord+b-a,
53 |             id2,
54 |             lst[0],a,b,
55 |             lst[3],
56 |             chrname
57 |             ))
58 |         id2+=1
59 |         aliencoord+=b-a
60 | 
61 | print '{2}{0}:{1}'.format(alienchrid,aliencoord,chrname)
62 | 
63 | fout1.close()
64 | fout2.close()
65 | 
66 | import os
67 | 
68 | os.system('sort -k1,1 -k2,2n '+fn1+' > x')
69 | os.system('mv x '+fn1)
70 | os.system('bgzip -f '+fn1)
71 | os.system('tabix -f -p bed '+fn1+'.gz')
72 | 
73 | os.system('sort -k1,1 -k2,2n '+fn2+' > x')
74 | os.system('mv x '+fn2)
75 | os.system('bgzip -f '+fn2)
76 | os.system('tabix -f -p bed '+fn2+'.gz')
77 | 
--------------------------------------------------------------------------------
/utils/trimAdapters.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2.7
2 | 
3 | # Author: Jason Buenrostro, Stanford University
4 | # The following program will compress daisy chain seq data into single molecules
5 | 
6 | ##### IMPORT MODULES #####
7 | # import necessary for python
8 | import os
9 | import re
10 | import sys
11 | import gzip
12 | import string
13 | import Levenshtein
14 | from optparse import OptionParser
15 | 
16 | ##### DEFINE FUNCTIONS #####
17 | # Reverse complement
18 | complement = string.maketrans('ATCGN', 'TAGCN')
19 | def reverse_complement(sequence):
20 |     return sequence.upper().translate(complement)[::-1]
21 | 
22 | # Align with mismatch, find first and move on, assumes only one
23 | def fuzz_align(s_seq,l_seq,mismatch):
24 |     for i, base in enumerate(l_seq):  # loop through equal size windows
25 |         l_subset = l_seq[i:i+len(s_seq)]
26 |         dist = Levenshtein.distance(l_subset, s_seq)
27 |         if dist <= mismatch:  # find first then break
28 |             return i, dist
29 |             break
30 | 
31 | # added by Jin Lee for hot fix (output name bug)
32 | def rreplace(s, old, new, occurrence):
33 |     li = s.rsplit(old, occurrence)
34 |     return new.join(li)
35 | 
36 | #### OPTIONS ####
37 | # define options
38 | opts = OptionParser()
39 | usage = "usage: %prog [options] [inputs] This will trim adapters"
40 | opts = OptionParser(usage=usage)
41 | opts.add_option("-a", help=" Accepts fastq or fastq.gz")
42 | opts.add_option("-b", help=" Accepts fastq or fastq.gz")
43 | options, arguments = opts.parse_args()
44 | 
45 | # return usage information if no argvs given AND they're not available in the environment
46 | # command line arguments always override environment variables
47 | if len(sys.argv)==1:
48 |     p1_in = os.environ.get('P1_IN')
49 |     p2_in = os.environ.get('P2_IN')
50 |     if (p1_in is None) or (p2_in is None):
51 |         os.system(sys.argv[0]+" --help")
52 |         sys.exit()
53 | else:
54 |     ##### INPUTS AND OUTPUTS #####
55 |     # name input and outputs
56 |     p1_in = options.a
57 |     p2_in = options.b
58 | 
59 | # name outputs and print to working dir
60 | p1_file = p1_in.split('/')[-1]
61 | p2_file = p2_in.split('/')[-1]
62 | p1_out = re.sub(".fastq", ".trim.fastq", p1_file)
63 | p2_out = re.sub(".fastq", ".trim.fastq", p2_file)
".trim.fastq", p2_file) 64 | 65 | #check for file type and open input file 66 | append = p1_in.split('.')[-1] 67 | if append == "fastq": 68 | p1_rds = open(p1_in,'r') 69 | p2_rds = open(p2_in,'r') 70 | p1_out = re.sub(".fastq", ".trim.fastq", p1_file) 71 | p2_out = re.sub(".fastq", ".trim.fastq", p2_file) 72 | elif append == "fq": 73 | p1_rds = open(p1_in,'r') 74 | p2_rds = open(p2_in,'r') 75 | p1_out = re.sub(".fq", ".trim.fastq", p1_file) 76 | p2_out = re.sub(".fq", ".trim.fastq", p2_file) 77 | elif append == "gz": 78 | p1_rds = gzip.open(p1_in,'r') 79 | p2_rds = gzip.open(p2_in,'r') 80 | p1_out = re.sub(".fastq.gz", ".trim.fastq", p1_file) 81 | p2_out = re.sub(".fastq.gz", ".trim.fastq", p2_file) 82 | p1_out = re.sub(".fq.gz", ".trim.fastq", p1_out) 83 | p2_out = re.sub(".fq.gz", ".trim.fastq", p2_out) 84 | else: 85 | sys.exit("ERROR! The input file2 must be a .fastq or .fastq.gz") 86 | 87 | ##### SCRIPT ##### 88 | # initialize variables 89 | i=0;j=0;k=0;tot_b=0;count=1 90 | n=20 # match seq 91 | mismatch=1 # only allow 0-1 mismatches for now, if allow two then gets mis indexed, to fix this need to change fuzz_align to save L as a vector and reiterate to find 2nd 92 | 93 | # initilize write files 94 | r1_write = open(p1_out, 'w') 95 | r2_write = open(p2_out, 'w') 96 | 97 | while 1: 98 | # read lines 99 | p1_line = p1_rds.readline() 100 | p2_line = p2_rds.readline() 101 | 102 | # break if at end of file 103 | if not p1_line: 104 | break 105 | 106 | # load fastq into memory 107 | if count ==1: 108 | seqhead1 = p1_line 109 | seqhead2 = p2_line 110 | elif count ==2: 111 | seq1 = p1_line.rstrip() 112 | seq2 = p2_line.rstrip() 113 | elif count ==3: 114 | qualhead1 = p1_line 115 | qualhead2 = p2_line 116 | elif count ==4: 117 | qual1 = p1_line.rstrip() 118 | qual2 = p2_line.rstrip() 119 | 120 | # align reads to themselves 121 | i = i+1 # total reads 122 | rc_seq2 = reverse_complement(seq2[0:n]) 123 | idx = seq1.rfind(rc_seq2) # look for perfect match 124 | if idx > 0: 125 | j = j+1 # 0 mismatchs 126 | elif mismatch>0: 127 | hold = fuzz_align(rc_seq2,seq1,mismatch) # else allow for mismatch 128 | if hold: 129 | idx,mis=hold 130 | if mis == 1: 131 | k=k+1 # 1 mismatch 132 | 133 | # trim reads if idx exist 134 | if idx > 0: 135 | # keep track on how much trimming 136 | tot_b = tot_b+len(seq2[idx+n:-1]) #track total bases trimmed 137 | 138 | # trim data 139 | seq1 = seq1[0:idx+n-1] # modified to sub1 because some aligners (bowtie) dont like perfectly overlapping reads 140 | seq2 = seq2[0:idx+n-1] 141 | qual1 = qual1[0:idx+n-1] 142 | qual2 = qual2[0:idx+n-1] 143 | 144 | # print read1 145 | r1_write.write(seqhead1) 146 | r1_write.write(seq1+"\n") 147 | r1_write.write(qualhead1) 148 | r1_write.write(qual1+"\n") 149 | 150 | # print read2 151 | r2_write.write(seqhead2) 152 | r2_write.write(seq2+"\n") 153 | r2_write.write(qualhead2) 154 | r2_write.write(qual2+"\n") 155 | 156 | # increment count 157 | count = count + 1 158 | if count == 5: 159 | count = 1 160 | else: 161 | count = count 162 | 163 | # close files to write the file 164 | r1_write.close() 165 | r2_write.close() 166 | p1_rds.close() 167 | p2_rds.close() 168 | 169 | # write file output names for passing into next step of pipeline 170 | # !!! DO NOT WRITE ANYTHING ELSE TO STDOUT AFTER THIS !!! 
171 | sys.stdout.write(p1_out + '\n')
172 | sys.stdout.write(p2_out + '\n')
173 | 
174 | # give summary
175 | sys.stderr.write(str(i)+" sequences total\n")
176 | sys.stderr.write(str(j)+" sequences trimmed with 0 mismatches\n")
177 | sys.stderr.write(str(k)+" sequences trimmed with 1 mismatch\n")
178 | sys.stderr.write(str(tot_b/(j+k))+" mean number of bases trimmed for reads requiring trimming\n")
--------------------------------------------------------------------------------
/utils/ucsc_ensGene.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | 
3 | import sys,os
4 | sys.path.append('/home/xzhou/subtleKnife/script/genescript')
5 | import parseUcscgenestruct
6 | 
7 | if len(sys.argv)!=3:
8 |     print ' knownToEnsembl.txt and kgXref.txt must be under current dir'
9 |     sys.exit()
10 | 
11 | 
12 | aa={}
13 | with open('knownToEnsembl.txt') as fin:
14 |     for line in fin:
15 |         lst=line.rstrip().split('\t')
16 |         aa[lst[0]]=lst[1]
17 | 
18 | symbol={}
19 | desc={}
20 | with open('kgXref.txt') as fin:
21 |     for line in fin:
22 |         lst=line.rstrip().split('\t')
23 |         if lst[0] in aa:
24 |             ens=aa[lst[0]]
25 |             if len(lst[4])>0:
26 |                 symbol[ens]=lst[4]
27 |             if len(lst[7])>0:
28 |                 desc[ens]=lst[7]
29 | 
30 | 
31 | ucsc,tkname=sys.argv[1:]
32 | 
33 | 
34 | 
35 | # dump
36 | fout=open(tkname,'w')
37 | fout2=open(tkname+'_load','w')
38 | 
39 | id=1
40 | with open(ucsc) as fin:
41 |     for line in fin:
42 |         lst=line.rstrip().split('\t')
43 |         g=parseUcscgenestruct.parse(lst,True)
44 |         name=lst[1]
45 |         fout.write('{0}\t{1}\t{2}\tname:"{3}",id:{4},strand:"{5}",'.format(
46 |             g['chrom'],
47 |             g['start'],
48 |             g['stop'],
49 |             name,
50 |             id,
51 |             g['strand']))
52 |         id+=1
53 |         if 'thin' in g or 'thick' in g:
54 |             fout.write('struct:{')
55 |             if 'thin' in g:
56 |                 fout.write('thin:[')
57 |                 for x in g['thin']:
58 |                     fout.write('[{0},{1}],'.format(x[0],x[1]))
59 |                 fout.write('],')
60 |             if 'thick' in g:
61 |                 fout.write('thick:[')
62 |                 for x in g['thick']:
63 |                     fout.write('[{0},{1}],'.format(x[0],x[1]))
64 |                 fout.write('],')
65 |             fout.write('},')
66 |         # desc
67 |         if name in desc:
68 |             fout.write('desc:"'+desc[name]+'",')
69 |         if name in symbol:
70 |             fout.write('name2:"'+symbol[name]+'"')
71 |             fout2.write('{0}\t{1}\t{2}\t{3}\n'.format(g['chrom'],g['start'],g['stop'],symbol[name]))
72 |         fout.write('\n')
73 |         fout2.write('{0}\t{1}\t{2}\t{3}\n'.format(g['chrom'],g['start'],g['stop'],name))
74 | 
75 | 
76 | fout2.close()
77 | fout.close()
78 | 
79 | import os
80 | os.system('sort -k1,1 -k2,2n '+tkname+' > x')
81 | os.system('mv x '+tkname)
82 | os.system('bgzip -f '+tkname)
83 | os.system('tabix -f -p bed '+tkname+'.gz')
84 | 
85 | print '''
86 | drop table if exists {0};
87 | create table {0} (
88 | chrom varchar(20) not null,
89 | start int unsigned not null,
90 | stop int unsigned not null,
91 | name varchar(100) not null
92 | ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
93 | load data local infile '{0}_load' into table {0};
94 | create index name on {0} (name);
95 | '''.format(tkname)
96 | 
97 | 
--------------------------------------------------------------------------------
/utils/ucsc_simplegene.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | 
3 | import sys,os
4 | sys.path.append('/home/xzhou/subtleKnife/script/genescript')
5 | import parseUcscgenestruct
6 | 
7 | if len(sys.argv)!=3:
8 |     print ' '
9 |     sys.exit()
10 | 
11 | ucsc,tkname=sys.argv[1:]
12 | 
13 | 
14 | symbol={}
15 | desc={}
16 | i=0
17 | if os.path.exists('refLink.txt'):
18 |     '''
19 |     0 symbol
20 |     1 desc
21 |     2 name
22 |     3 name
23 |     '''
24 |     with open('refLink.txt') as fin:
25 |         for line in fin:
26 |             lst=line.rstrip().split('\t')
27 |             if len(lst)<4: continue
28 |             w=lst[1].replace('"','')
29 |             #w=w.replace("'",'')
30 |             desc[lst[2]]=w
31 |             desc[lst[3]]=w
32 |             symbol[lst[2]]=lst[0]
33 |             symbol[lst[3]]=lst[0]
34 |             i+=1
35 |     print 'refLink: '+str(i)
36 | 
37 | 
38 | # dump
39 | fout=open(tkname,'w')
40 | fout2=open(tkname+'_load','w')
41 | 
42 | id=1
43 | with open(ucsc) as fin:
44 |     for line in fin:
45 |         lst=line.rstrip().split('\t')
46 |         g=parseUcscgenestruct.parse(lst,True)
47 |         name=lst[1]
48 |         fout.write('{0}\t{1}\t{2}\tname:"{3}",id:{4},strand:"{5}",'.format(
49 |             g['chrom'],
50 |             g['start'],
51 |             g['stop'],
52 |             name,
53 |             id,
54 |             g['strand']))
55 |         id+=1
56 |         if 'thin' in g or 'thick' in g:
57 |             fout.write('struct:{')
58 |             if 'thin' in g:
59 |                 fout.write('thin:[')
60 |                 for x in g['thin']:
61 |                     fout.write('[{0},{1}],'.format(x[0],x[1]))
62 |                 fout.write('],')
63 |             if 'thick' in g:
64 |                 fout.write('thick:[')
65 |                 for x in g['thick']:
66 |                     fout.write('[{0},{1}],'.format(x[0],x[1]))
67 |                 fout.write('],')
68 |             fout.write('},')
69 |         # desc
70 |         if name in desc:
71 |             fout.write('desc:"'+desc[name]+'",')
72 |         if name in symbol:
73 |             fout.write('name2:"'+symbol[name]+'"')
74 |             fout2.write('{0}\t{1}\t{2}\t{3}\n'.format(g['chrom'],g['start'],g['stop'],symbol[name]))
75 |         fout.write('\n')
76 |         fout2.write('{0}\t{1}\t{2}\t{3}\n'.format(g['chrom'],g['start'],g['stop'],name))
77 | 
78 | 
79 | fout2.close()
80 | fout.close()
81 | 
82 | import os
83 | os.system('sort -k1,1 -k2,2n '+tkname+' > x')
84 | os.system('mv x '+tkname)
85 | os.system('bgzip -f '+tkname)
86 | os.system('tabix -f -p bed '+tkname+'.gz')
87 | 
88 | print '''
89 | drop table if exists {0};
90 | create table {0} (
91 | chrom varchar(20) not null,
92 | start int unsigned not null,
93 | stop int unsigned not null,
94 | name varchar(100) not null
95 | ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
96 | load data local infile '{0}_load' into table {0};
97 | create index name on {0} (name);
98 | '''.format(tkname)
99 | 
100 | 
--------------------------------------------------------------------------------
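Usage sketch for the stand-alone utilities above. This is not a file in the repository, only a minimal illustration of the flags and positional arguments the scripts themselves define; file names such as rep1_R1.fastq.gz, the output directory out/, and the gene-table/track name knownGene are hypothetical placeholders.

    # paired-end adapter trimming; writes *.trim.fastq to the working directory
    # and prints the two trimmed file names on stdout for the next pipeline step
    python2 utils/trimAdapters.py -a rep1_R1.fastq.gz -b rep1_R2.fastq.gz

    # the same run driven by environment variables, which the script consults
    # only when it is invoked with no arguments
    P1_IN=rep1_R1.fastq.gz P2_IN=rep1_R2.fastq.gz python2 utils/trimAdapters.py

    # gather ENCODE QC metrics from a pipeline output tree into an .xls workbook
    python2 utils/parse_summary_ENCODE_qc_recursively.py qc_metrics.xls --search-dir out/

    # build a bgzipped, tabix-indexed browser track from a UCSC gene table;
    # refLink.txt is read from the working directory if present, and the MySQL
    # loader statements for the *_load file are printed to stdout
    python2 utils/ucsc_simplegene.py knownGene.txt knownGene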