├── .circleci └── config.yml ├── .dockerignore ├── .editorconfig ├── .github └── ISSUE_TEMPLATE │ └── bug_report.md ├── .gitignore ├── LICENSE ├── README.md ├── atac.croo.v5.json ├── atac.wdl ├── dev ├── build_on_dx_dockerhub.sh ├── docker_image │ ├── Dockerfile │ └── mysql │ │ └── init_user.sql ├── example_input_json │ ├── aws │ │ └── ENCSR356KRQ_subsampled_aws.json │ ├── caper │ │ └── ENCSR356KRQ_subsampled_caper.json │ ├── gcp │ │ └── ENCSR356KRQ_subsampled_gcp.json │ ├── klab │ │ ├── ENCSR356KRQ_klab.json │ │ ├── ENCSR356KRQ_subsampled_klab.json │ │ ├── ENCSR356KRQ_subsampled_start_from_bam_klab.json │ │ └── ENCSR889WQX_subsampled_klab.json │ ├── scg │ │ └── ENCSR356KRQ_subsampled_scg.json │ └── sherlock │ │ └── ENCSR356KRQ_subsampled_sherlock.json └── test │ ├── README.md │ ├── run_cromwell_server_on_gc.sh │ ├── test_py │ └── __init__.py │ ├── test_task │ ├── .gitignore │ ├── compare_md5sum.wdl │ ├── test_annot_enrich.json │ ├── test_annot_enrich.wdl │ ├── test_bam2ta.json │ ├── test_bam2ta.wdl │ ├── test_bowtie2.json │ ├── test_bowtie2.wdl │ ├── test_compare_signal_to_roadmap.json │ ├── test_compare_signal_to_roadmap.wdl │ ├── test_count_signal_track.json │ ├── test_count_signal_track.wdl │ ├── test_filter.json │ ├── test_filter.wdl │ ├── test_frac_mito.json │ ├── test_frac_mito.wdl │ ├── test_fraglen_stat_pe.json │ ├── test_fraglen_stat_pe.wdl │ ├── test_gc_bias.json │ ├── test_gc_bias.wdl │ ├── test_idr.json │ ├── test_idr.wdl │ ├── test_jsd.json │ ├── test_jsd.wdl │ ├── test_macs2.json │ ├── test_macs2.wdl │ ├── test_macs2_signal_track.json │ ├── test_macs2_signal_track.wdl │ ├── test_overlap.json │ ├── test_overlap.wdl │ ├── test_pool_ta.json │ ├── test_pool_ta.wdl │ ├── test_preseq.json │ ├── test_preseq.wdl │ ├── test_reproducibility.json │ ├── test_reproducibility.wdl │ ├── test_spr.json │ ├── test_spr.wdl │ ├── test_tss_enrich.json │ ├── test_tss_enrich.wdl │ ├── test_xcor.json │ └── test_xcor.wdl │ └── test_workflow │ ├── .gitignore │ ├── 
ENCSR356KRQ.json │ ├── ENCSR356KRQ_subsampled.json │ ├── ENCSR356KRQ_subsampled_chr19_only.json │ ├── ENCSR356KRQ_subsampled_start_from_bam.json │ ├── ENCSR889WQX.json │ ├── ENCSR889WQX_subsampled.json │ ├── ENCSR889WQX_subsampled_chr19_only.json │ ├── ENCSR889WQX_subsampled_unrep.json │ └── ref_output │ ├── sync.sh │ ├── v1.1.4 │ ├── ENCSR356KRQ_qc.json │ ├── ENCSR356KRQ_subsampled_chr19_only_qc.json │ ├── ENCSR356KRQ_subsampled_qc.json │ ├── ENCSR889WQX_qc.json │ ├── ENCSR889WQX_subsampled_chr19_only_qc.json │ └── ENCSR889WQX_subsampled_qc.json │ ├── v1.1.5 │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ ├── ENCSR356KRQ_subsampled_chr19_only │ │ └── qc.json │ ├── ENCSR889WQX │ │ └── qc.json │ ├── ENCSR889WQX_subsampled │ │ └── qc.json │ └── ENCSR889WQX_subsampled_chr19_only │ │ └── qc.json │ ├── v1.1.6.a │ ├── ENCSR356KRQ │ │ └── qc.json │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ ├── ENCSR356KRQ_subsampled_chr19_only │ │ └── qc.json │ ├── ENCSR889WQX │ │ └── qc.json │ ├── ENCSR889WQX_subsampled │ │ └── qc.json │ └── ENCSR889WQX_subsampled_chr19_only │ │ └── qc.json │ ├── v1.1.6 │ ├── ENCSR356KRQ │ │ └── qc.json │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ ├── ENCSR356KRQ_subsampled_chr19_only │ │ └── qc.json │ ├── ENCSR889WQX │ │ └── qc.json │ ├── ENCSR889WQX_subsampled │ │ └── qc.json │ └── ENCSR889WQX_subsampled_chr19_only │ │ └── qc.json │ ├── v1.1.7.2 │ └── ENCSR356KRQ_subsampled │ │ └── qc.json │ ├── v1.3.0 │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ └── ENCSR889WQX_subsampled │ │ └── qc.json │ ├── v1.4.0 │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ └── ENCSR889WQX_subsampled │ │ └── qc.json │ ├── v1.4.1 │ └── ENCSR889WQX_subsampled │ │ └── qc.json │ ├── v1.5.0 │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ ├── ENCSR356KRQ_subsampled_start_from_bam │ │ └── qc.json │ ├── ENCSR889WQX_subsampled │ │ └── qc.json │ └── ENCSR889WQX_subsampled_unrep │ │ └── qc.json │ ├── v1.6.0 │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ ├── ENCSR889WQX_subsampled │ │ 
└── qc.json │ └── ENCSR889WQX_subsampled_unrep │ │ └── qc.json │ ├── v1.7.0 │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ ├── ENCSR356KRQ_subsampled_start_from_bam │ │ └── qc.json │ ├── ENCSR889WQX_subsampled │ │ └── qc.json │ └── ENCSR889WQX_subsampled_unrep │ │ └── qc.json │ ├── v1.8.0 │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ ├── ENCSR356KRQ_subsampled_start_from_bam │ │ └── qc.json │ ├── ENCSR889WQX_subsampled │ │ └── qc.json │ └── ENCSR889WQX_subsampled_unrep │ │ └── qc.json │ ├── v2.1.0 │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ └── ENCSR356KRQ_subsampled_start_from_bam │ │ └── qc.json │ └── v2.2.2 │ ├── ENCSR356KRQ_subsampled │ └── qc.json │ └── ENCSR356KRQ_subsampled_start_from_bam │ └── qc.json ├── docs ├── build_genome_database.md ├── example_output │ ├── v1.1.4 │ │ └── qc.json │ └── v1.1.5 │ │ └── qc.json ├── how_to_config_sge.md ├── input.md ├── input_short.md ├── install_conda.md ├── tutorial_dx_cli.md └── tutorial_dx_web.md ├── example_input_json ├── ENCSR356KRQ_subsampled.json ├── dx │ ├── ENCSR356KRQ_subsampled_dx.json │ ├── ENCSR356KRQ_subsampled_rep1_dx.json │ ├── template_general.json │ ├── template_hg19.json │ ├── template_hg38.json │ ├── template_mm10.json │ └── template_mm9.json ├── dx_azure │ ├── ENCSR356KRQ_subsampled_dx_azure.json │ ├── template_general.json │ ├── template_hg19.json │ ├── template_hg38.json │ ├── template_mm10.json │ └── template_mm9.json ├── template.full.json ├── template.json └── terra │ └── ENCSR356KRQ_subsampled.terra.json ├── scripts ├── build_genome_data.sh ├── download_genome_data.sh ├── install_conda_env.sh ├── requirements.macs2.txt ├── requirements.python2.txt ├── requirements.spp.txt ├── requirements.txt ├── uninstall_conda_env.sh └── update_conda_env.sh └── src ├── assign_multimappers.py ├── detect_adapter.py ├── dev_check_sync_atac.sh ├── encode_lib_blacklist_filter.py ├── encode_lib_common.py ├── encode_lib_frip.py ├── encode_lib_genomic.py ├── encode_lib_log_parser.py ├── encode_lib_qc_category.py 
├── encode_task_annot_enrich.py ├── encode_task_bam2ta.py ├── encode_task_bam_to_pbam.py ├── encode_task_bowtie2.py ├── encode_task_bwa.py ├── encode_task_choose_ctl.py ├── encode_task_compare_signal_to_roadmap.py ├── encode_task_count_signal_track.py ├── encode_task_filter.py ├── encode_task_frac_mito.py ├── encode_task_fraglen_stat_pe.py ├── encode_task_gc_bias.py ├── encode_task_idr.py ├── encode_task_jsd.py ├── encode_task_macs2_atac.py ├── encode_task_macs2_chip.py ├── encode_task_macs2_signal_track_atac.py ├── encode_task_macs2_signal_track_chip.py ├── encode_task_merge_fastq.py ├── encode_task_overlap.py ├── encode_task_pool_ta.py ├── encode_task_post_align.py ├── encode_task_post_call_peak_atac.py ├── encode_task_post_call_peak_chip.py ├── encode_task_preseq.py ├── encode_task_qc_report.py ├── encode_task_reproducibility.py ├── encode_task_spp.py ├── encode_task_spr.py ├── encode_task_subsample_ctl.py ├── encode_task_trim_adapter.py ├── encode_task_trim_fastq.py ├── encode_task_trimmomatic.py ├── encode_task_tss_enrich.py ├── encode_task_xcor.py └── trimfastq.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .editorconfig 3 | .git 4 | .gitignore 5 | cromwell-executions 6 | cromwell-workflow-logs 7 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | [*.{wdl,json,conf}] 2 | indent_style = tab 3 | indent_size = 4 4 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## **Describe the bug** 11 | A clear and concise description of what the problem is. 
12 | 13 | ## **OS/Platform** 14 | - OS/Platform: [e.g. Ubuntu 18.04, Google Cloud, Stanford Sherlock/SCG cluster, ...] 15 | - Conda version: If you used Conda (`$ conda --version`). 16 | - Pipeline version: [e.g. v1.8.0] 17 | - Caper version: [e.g. v1.2.0] 18 | 19 | ## **Caper configuration file** 20 | Paste contents of `~/.caper/default.conf`. 21 | ```ini 22 | PASTE CAPER CONF CONTENTS HERE 23 | ``` 24 | 25 | ## **Input JSON file** 26 | Paste contents of your input JSON file. 27 | ```json 28 | PASTE INPUT JSON CONTENTS HERE 29 | ``` 30 | 31 | ## **Troubleshooting result** 32 | 33 | If you ran `caper run` without Caper server then Caper automatically runs a troubleshooter for failed workflows. Find troubleshooting result in the bottom of Caper's screen log. 34 | 35 | If you ran `caper submit` with a running Caper server then first find your workflow ID (1st column) with `caper list` and run `caper debug [WORKFLOW_ID]`. 36 | 37 | Paste troubleshooting result. 38 | ``` 39 | PASTE TROUBLESHOOTING RESULT HERE 40 | ``` 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files/ 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # temp fastqs 104 | fastqs/ 105 | 106 | # cromwell temp dirs 107 | cromwell-executions/ 108 | cromwell-workflow-logs/ 109 | cromwell*.jar 110 | 111 | #test.wdl 112 | #test_google.wdl 113 | output_*.json 114 | test/*_standalone.wdl 115 | .DS_Store 116 | test_genome* 117 | test_sample 118 | *.tar 119 | tmp 120 | tmp_db* 121 | *.local.json 122 | temp_db* 123 | cromwell.out 124 | cromwell.out.* 125 | .dev 126 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 ENCODE DCC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, 
merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /dev/build_on_dx_dockerhub.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | WDL=atac.wdl 5 | VER=$(cat ${WDL} | grep "String pipeline_ver = " | awk '{gsub("'"'"'",""); print $4}') 6 | DXWDL=~/dxWDL-v1.50.jar 7 | 8 | # general 9 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines" -f -folder \ 10 | /ATAC-seq/workflows/$VER/general -defaults example_input_json/dx/template_general.json 11 | 12 | # hg38 13 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines" -f -folder \ 14 | /ATAC-seq/workflows/$VER/hg38 -defaults example_input_json/dx/template_hg38.json 15 | 16 | # hg19 17 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines" -f -folder \ 18 | /ATAC-seq/workflows/$VER/hg19 -defaults example_input_json/dx/template_hg19.json 19 | 20 | # mm10 21 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines" -f -folder \ 22 | /ATAC-seq/workflows/$VER/mm10 -defaults 
example_input_json/dx/template_mm10.json 23 | 24 | # mm9 25 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines" -f -folder \ 26 | /ATAC-seq/workflows/$VER/mm9 -defaults example_input_json/dx/template_mm9.json 27 | 28 | # test sample 29 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines" -f -folder \ 30 | /ATAC-seq/workflows/$VER/test_ENCSR356KRQ_subsampled -defaults example_input_json/dx/ENCSR356KRQ_subsampled_dx.json 31 | 32 | # test sample (single rep) 33 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines" -f -folder \ 34 | /ATAC-seq/workflows/$VER/test_ENCSR356KRQ_subsampled_rep1 -defaults example_input_json/dx/ENCSR356KRQ_subsampled_rep1_dx.json 35 | 36 | ## DX Azure 37 | 38 | # general 39 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines Azure" -f -folder \ 40 | /ATAC-seq/workflows/$VER/general -defaults example_input_json/dx_azure/template_general.json 41 | 42 | # hg38 43 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines Azure" -f -folder \ 44 | /ATAC-seq/workflows/$VER/hg38 -defaults example_input_json/dx_azure/template_hg38.json 45 | 46 | # hg19 47 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines Azure" -f -folder \ 48 | /ATAC-seq/workflows/$VER/hg19 -defaults example_input_json/dx_azure/template_hg19.json 49 | 50 | # mm10 51 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines Azure" -f -folder \ 52 | /ATAC-seq/workflows/$VER/mm10 -defaults example_input_json/dx_azure/template_mm10.json 53 | 54 | # mm9 55 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines Azure" -f -folder \ 56 | /ATAC-seq/workflows/$VER/mm9 -defaults example_input_json/dx_azure/template_mm9.json 57 | 58 | # test sample 59 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines Azure" -f -folder \ 60 | 
/ATAC-seq/workflows/$VER/test_ENCSR356KRQ_subsampled -defaults example_input_json/dx_azure/ENCSR356KRQ_subsampled_dx_azure.json 61 | -------------------------------------------------------------------------------- /dev/docker_image/mysql/init_user.sql: -------------------------------------------------------------------------------- 1 | CREATE USER 'cromwell'@'localhost' IDENTIFIED BY 'cromwell'; 2 | GRANT ALL PRIVILEGES ON cromwell_db.* TO 'cromwell'@'localhost' WITH GRANT OPTION; 3 | CREATE USER 'cromwell'@'%' IDENTIFIED BY 'cromwell'; 4 | GRANT ALL PRIVILEGES ON cromwell_db.* TO 'cromwell'@'%' WITH GRANT OPTION; -------------------------------------------------------------------------------- /dev/example_input_json/aws/ENCSR356KRQ_subsampled_aws.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "s3://encode-pipeline-genome-data/genome_tsv/v1/hg38_aws.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | 
"s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | -------------------------------------------------------------------------------- /dev/example_input_json/caper/ENCSR356KRQ_subsampled_caper.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 
3 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v1/hg38_caper.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | 
"https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | -------------------------------------------------------------------------------- /dev/example_input_json/gcp/ENCSR356KRQ_subsampled_gcp.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "gs://encode-pipeline-genome-data/genome_tsv/v1/hg38_gcp.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | -------------------------------------------------------------------------------- /dev/example_input_json/klab/ENCSR356KRQ_klab.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/hg38_klab.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair1/ENCFF341MYG.fastq.gz", 6 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair1/ENCFF106QGY.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair2/ENCFF248EJF.fastq.gz", 10 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair2/ENCFF368TYI.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF641SFZ.fastq.gz", 14 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF751XTV.fastq.gz", 15 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF927LSG.fastq.gz", 16 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF859BDM.fastq.gz", 17 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF193RRC.fastq.gz", 18 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF366DFI.fastq.gz" 19 
| ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF031ARQ.fastq.gz", 22 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF590SYZ.fastq.gz", 23 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF734PEQ.fastq.gz", 24 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF007USV.fastq.gz", 25 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF886FSC.fastq.gz", 26 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF573UXK.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | -------------------------------------------------------------------------------- /dev/example_input_json/klab/ENCSR356KRQ_subsampled_klab.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/hg38_klab.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | 
"/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | 
-------------------------------------------------------------------------------- /dev/example_input_json/klab/ENCSR356KRQ_subsampled_start_from_bam_klab.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/hg38_klab.tsv", 4 | "atac.nodup_bams" : [ 5 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/bam_subsampled/rep1/ENCFF341MYG.subsampled.400.trim.merged.nodup.no_chrM_MT.bam", 6 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/bam_subsampled/rep2/ENCFF641SFZ.subsampled.400.trim.merged.nodup.no_chrM_MT.bam" 7 | ], 8 | "atac.read_len" : [76, 76], 9 | "atac.paired_end" : true, 10 | "atac.auto_detect_adapter" : true, 11 | "atac.enable_xcor" : true, 12 | "atac.title" : "ENCSR356KRQ (subsampled 1/400, starting from NODUP_BAMs with specified read_len)", 13 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 14 | } 15 | -------------------------------------------------------------------------------- /dev/example_input_json/klab/ENCSR889WQX_subsampled_klab.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/mm10_klab.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF439VSY.subsampled.400.fastq.gz", 6 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF325FCQ.subsampled.400.fastq.gz", 7 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF683IQS.subsampled.400.fastq.gz", 8 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF744CHW.subsampled.400.fastq.gz" 9 | ], 10 | 
"atac.fastqs_rep2_R1" : [ 11 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep2/ENCFF463QCX.subsampled.400.fastq.gz", 12 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep2/ENCFF992TSA.subsampled.400.fastq.gz" 13 | ], 14 | "atac.paired_end" : false, 15 | "atac.auto_detect_adapter" : true, 16 | "atac.enable_xcor" : true, 17 | "atac.enable_tss_enrich" : false, 18 | "atac.title" : "ENCSR889WQX (subsampled 1/400 reads)", 19 | "atac.description" : "ATAC-seq on Mus musculus C57BL/6 frontal cortex adult" 20 | } 21 | -------------------------------------------------------------------------------- /dev/example_input_json/scg/ENCSR356KRQ_subsampled_scg.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "/reference/ENCODE/pipeline_genome_data/genome_tsv/v1/hg38_scg.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | 
"/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | -------------------------------------------------------------------------------- /dev/example_input_json/sherlock/ENCSR356KRQ_subsampled_sherlock.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : 
"/home/groups/cherry/encode/pipeline_genome_data/genome_tsv/v1/hg38_sherlock.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | 
"/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | -------------------------------------------------------------------------------- /dev/test/README.md: -------------------------------------------------------------------------------- 1 | ENCODE ATAC-seq pipeline test 2 | =================================================== 3 | 4 | # Task level test (local) 5 | 6 | This test requires `atac-seq-pipeline-test-data` directory in `test_task/`. Git glone [a data repo](https://github.com/leepc12/atac-seq-pipeline-test-data) on `test_task/`. This repo has 1/400 subsampled test samples and chr19-chrM only bowtie2 indices and other genome data for hg38 and mm10. Make sure that you have `cromwell-31.jar` in your `$PATH` as an executable (`chmod +x`) and `Docker` installed on your system. 7 | ``` 8 | $ cd test_task/ 9 | $ git clone https://github.com/encode-dcc/atac-seq-pipeline-test-data 10 | ``` 11 | 12 | Each task in `../atac.wdl` has a corresponding pair of tester WDL/JSON (`[TASK_NAME].WDL` and [TASK_NAME].json`). You can also specify your own docker image to test each task. 
13 | ``` 14 | $ cd test_task/ 15 | $ ./test.sh [WDL] [INPUT_JSON] [DOCKER_IMAGE](optional) 16 | ``` 17 | 18 | # Workflow level test (on GC) 19 | 20 | Make sure that you have a Cromwell server running on GC. This shell script will submit `../atac.wdl` to the server and wait for a response (`result.json`). There are two input JSON files (original and subsampled) for each endedness (SE and PE). You can also check all outputs on GC bucket `gs://encode-pipeline-test-runs`. 21 | ``` 22 | $ cd test_workflow/ 23 | $ ./test_atac.sh [INPUT_JSON] [QC_JSON_TO_COMPARE] [DOCKER_IMAGE](optional) 24 | ``` 25 | 26 | Jenkins must do the following: 27 | ``` 28 | $ cd test_workflow/ 29 | # For master branch (full test sample, ~24hr) 30 | $ ./test_atac.sh ENCSR356KRQ.json ref_output/ENCSR356KRQ_qc.json [NEW_DOCKER_IMAGE] 31 | $ ./test_atac.sh ENCSR889WQX.json ref_output/ENCSR889WQX_qc.json [NEW_DOCKER_IMAGE] 32 | # For develop branch (1/400 subsampled and chr19 only test sample ~30mins) 33 | $ ./test_atac.sh ENCSR356KRQ_subsampled.json ref_output/ENCSR356KRQ_subsampled_chr19_only_qc.json [NEW_DOCKER_IMAGE] 34 | $ ./test_atac.sh ENCSR889WQX_subsampled.json ref_output/ENCSR889WQX_subsampled_chr19_only_qc.json [NEW_DOCKER_IMAGE] 35 | ``` 36 | 37 | `test_atac.sh` will generate the following files to validate pipeline outputs. Jenkins must check if `PREFIX.qc_json_diff.txt` is empty or not. 38 | * `PREFIX.result.json`: all outputs of `atac.wdl`. 39 | * `PREFIX.result.qc.json`: qc summary JSON file `qc.json` of `atac.wdl`. 40 | * `PREFIX.qc_json_diff.txt`: diff between `PREFIX.result.qc.json` and reference in `ref_output/`. 41 | 42 | # How to run a Cromwell server on GC 43 | 44 | 1) Create/restart an instance with the following settings. 45 | * name : `encode-cromwell-test-server`. 46 | * resource: 1vCPU and 4GB memory 47 | * zone: `us-west1-a`. 48 | * image: `Ubuntu 16.04 (xenial)` 49 | * disk: `Standard persistent disk 20GB` 50 | * Network tags: add a tag `cromwell-server`. 
51 | * Cloud API access scopes: `Allow full access to all Cloud APIs`. 52 | * External IP (optional): any static IP address. 53 | 54 | 2) SSH to the instance and run the following to install Docker and Java 8: 55 | ``` 56 | $ sudo apt-get update 57 | $ sudo apt-get install docker.io default-jre 58 | $ sudo usermod -aG docker $USER 59 | ``` 60 | 61 | 3) Log out and log back in. 62 | 63 | 4) Install cromwell. 64 | ``` 65 | $ cd 66 | $ wget https://github.com/broadinstitute/cromwell/releases/download/31/cromwell-31.jar 67 | $ chmod +x cromwell*.jar 68 | $ echo "export PATH=\$PATH:\$HOME">> ~/.bashrc 69 | $ source ~/.bashrc 70 | ``` 71 | 72 | 5) Clone pipeline, make DB directory (where metadata of all pipelines are stored) and run `MySQL` container. 73 | ``` 74 | $ cd 75 | $ git clone https://github.com/ENCODE-DCC/atac-seq-pipeline 76 | $ mkdir cromwell_db 77 | $ docker run -d --name mysql-cromwell -v $HOME/cromwell_db:/var/lib/mysql -v $HOME/atac-seq-pipeline/docker_image/mysql:/docker-entrypoint-initdb.d -e MYSQL_ROOT_PASSWORD=cromwell -e MYSQL_DATABASE=cromwell_db --publish 3306:3306 mysql 78 | $ docker ps 79 | ``` 80 | 81 | 6) Run Cromwell server 82 | ``` 83 | $ cd $HOME/atac-seq-pipeline 84 | $ git checkout develop_test_jenkins 85 | $ cd test 86 | $ screen -RD cromwell # make screen for cromwell server 87 | $ bash run_cromwell_server_on_gc.sh 88 | ``` 89 | 90 | 7) Firewall settings to open port 8000 91 | * Go to Google Cloud Console 92 | * Choose your Project. 93 | * Choose Networking > VPC network 94 | * Choose "Firewall rules" 95 | * Choose Create Firewall Rule `encode-cromwell-test-server-open-port-8000`. 96 | * Targets: `Specified target tags`. 97 | * Target tags: cromwell-server 98 | * Source IP ranges: `0.0.0.0/0` (CIDR notation for allowed IP range) 99 | * Protocols and Ports: `Specified protocols and ports` with `tcp:8000`. 
100 | -------------------------------------------------------------------------------- /dev/test/run_cromwell_server_on_gc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -f "cromwell-32.jar" ]; then 4 | echo "Skip downloading cromwell." 5 | else 6 | wget -N -c https://github.com/broadinstitute/cromwell/releases/download/32/cromwell-32.jar 7 | fi 8 | CROMWELL_JAR=cromwell-32.jar 9 | BACKEND_CONF=../backends/backend_with_db.conf 10 | BACKEND=google 11 | GC_PROJ=encode-dcc-1016 12 | GC_ROOT=gs://encode-pipeline-test-runs 13 | 14 | java -Dconfig.file=${BACKEND_CONF} -Dbackend.default=${BACKEND} -Dbackend.providers.google.config.project=${GC_PROJ} \ 15 | -Dbackend.providers.google.config.root=${GC_ROOT} -jar ${CROMWELL_JAR} server 16 | -------------------------------------------------------------------------------- /dev/test/test_py/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ENCODE-DCC/atac-seq-pipeline/47ba8dff9c332e24b48e767303e9fcac98589cf2/dev/test/test_py/__init__.py -------------------------------------------------------------------------------- /dev/test/test_task/.gitignore: -------------------------------------------------------------------------------- 1 | atac-seq-pipeline-test-data 2 | *.result.json 3 | *.metadata.json 4 | *wf_opt.json 5 | cromwell*.jar 6 | *.fasta 7 | *.fa 8 | *.gz 9 | *.docker.json 10 | -------------------------------------------------------------------------------- /dev/test/test_task/compare_md5sum.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task compare_md5sum { 4 | input { 5 | Array[String] labels 6 | Array[File] files 7 | Array[File] ref_files 8 | } 9 | command <<< 10 | python <>> 107 | output { 108 | Map[String,String] match = read_map('match.tsv') # key:label, val:match 109 | Boolean match_overall = 
read_boolean('match_overall.txt') 110 | File json = glob('result.json')[0] # details (json file) 111 | String json_str = read_string('result.json') # details (string) 112 | } 113 | runtime { 114 | cpu : 1 115 | memory : '4000 MB' 116 | time : 1 117 | disks : 'local-disk 50 HDD' 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /dev/test/test_task/test_annot_enrich.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_annot_enrich.blacklist" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38_chr19_chrM/hg38.blacklist.bed.gz", 3 | "test_annot_enrich.dnase" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz", 4 | "test_annot_enrich.prom" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz", 5 | "test_annot_enrich.enh" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz", 6 | 7 | "test_annot_enrich.ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/ataqc/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 8 | 9 | "test_annot_enrich.ref_annot_enrich_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_annot_enrich/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.annot_enrich.qc" 10 | } 11 | -------------------------------------------------------------------------------- /dev/test/test_task/test_annot_enrich.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | 
import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | 6 | 7 | workflow test_annot_enrich { 8 | input { 9 | File ta 10 | File blacklist 11 | File dnase 12 | File prom 13 | File enh 14 | File ref_annot_enrich_qc 15 | String docker 16 | } 17 | RuntimeEnvironment runtime_environment = { 18 | "docker": docker, 19 | "singularity": "", 20 | "conda": "" 21 | } 22 | 23 | call atac.annot_enrich { input : 24 | ta = ta, 25 | blacklist = blacklist, 26 | dnase = dnase, 27 | prom = prom, 28 | enh = enh, 29 | runtime_environment = runtime_environment, 30 | } 31 | 32 | call compare_md5sum.compare_md5sum { input : 33 | labels = [ 34 | 'test_annot_enrich_qc', 35 | ], 36 | files = [ 37 | annot_enrich.annot_enrich_qc, 38 | ], 39 | ref_files = [ 40 | ref_annot_enrich_qc, 41 | ], 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /dev/test/test_task/test_bam2ta.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_bam2ta.pe_nodup_bam" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/nodup_bams/rep1/ENCFF341MYG.subsampled.400.trim.merged.nodup.bam", 3 | "test_bam2ta.se_nodup_bam" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/nodup_bams/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.bam", 4 | 5 | "test_bam2ta.ref_pe_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bam2ta/pe/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 6 | "test_bam2ta.ref_pe_ta_disable_tn5_shift" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bam2ta/pe/disable_tn5_shift/ENCFF341MYG.subsampled.400.trim.merged.nodup.tagAlign.gz", 7 | "test_bam2ta.ref_pe_ta_subsample" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bam2ta/pe/subsample/fix_PIP-917/ENCFF341MYG.subsampled.400.trim.merged.nodup.5K.tn5.tagAlign.gz", 8 | 9 | "test_bam2ta.ref_se_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bam2ta/se/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 10 | "test_bam2ta.ref_se_ta_disable_tn5_shift" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bam2ta/se/disable_tn5_shift/ENCFF439VSY.subsampled.400.trim.merged.nodup.tagAlign.gz", 11 | "test_bam2ta.ref_se_ta_subsample" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bam2ta/se/subsample/ENCFF439VSY.subsampled.400.trim.merged.nodup.5K.tn5.tagAlign.gz", 12 | 13 | "test_bam2ta.bam2ta_subsample" : 5000 14 | } 15 | -------------------------------------------------------------------------------- /dev/test/test_task/test_bam2ta.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_bam2ta { 6 | input { 7 | Int bam2ta_subsample 8 | 9 | String pe_nodup_bam 10 | String se_nodup_bam 11 | 12 | String ref_pe_ta 13 | String ref_pe_ta_disable_tn5_shift 14 | String ref_pe_ta_subsample 15 | String ref_se_ta 16 | String ref_se_ta_disable_tn5_shift 17 | String ref_se_ta_subsample 18 | String mito_chr_name = 'chrM' 19 | 20 | Int bam2ta_cpu = 1 21 | Float bam2ta_mem_factor = 0.0 22 | Int bam2ta_time_hr = 6 23 | Float bam2ta_disk_factor = 4.0 24 | String docker 25 | } 26 | RuntimeEnvironment runtime_environment = { 27 | "docker": docker, 28 | "singularity": "", 29 | "conda": "" 30 | } 31 | 32 | call atac.bam2ta as pe_bam2ta { input : 33 | bam = pe_nodup_bam, 34 | disable_tn5_shift = false, 35 | subsample = 0, 36 
| paired_end = true, 37 | mito_chr_name = mito_chr_name, 38 | 39 | cpu = bam2ta_cpu, 40 | mem_factor = bam2ta_mem_factor, 41 | time_hr = bam2ta_time_hr, 42 | disk_factor = bam2ta_disk_factor, 43 | runtime_environment = runtime_environment, 44 | } 45 | call atac.bam2ta as pe_bam2ta_disable_tn5_shift { input : 46 | bam = pe_nodup_bam, 47 | disable_tn5_shift = true, 48 | subsample = 0, 49 | paired_end = true, 50 | mito_chr_name = mito_chr_name, 51 | 52 | cpu = bam2ta_cpu, 53 | mem_factor = bam2ta_mem_factor, 54 | time_hr = bam2ta_time_hr, 55 | disk_factor = bam2ta_disk_factor, 56 | runtime_environment = runtime_environment, 57 | } 58 | call atac.bam2ta as pe_bam2ta_subsample { input : 59 | bam = pe_nodup_bam, 60 | disable_tn5_shift = false, 61 | subsample = bam2ta_subsample, 62 | paired_end = true, 63 | mito_chr_name = mito_chr_name, 64 | 65 | cpu = bam2ta_cpu, 66 | mem_factor = bam2ta_mem_factor, 67 | time_hr = bam2ta_time_hr, 68 | disk_factor = bam2ta_disk_factor, 69 | runtime_environment = runtime_environment, 70 | } 71 | call atac.bam2ta as se_bam2ta { input : 72 | bam = se_nodup_bam, 73 | disable_tn5_shift = false, 74 | subsample = 0, 75 | paired_end = false, 76 | mito_chr_name = mito_chr_name, 77 | 78 | cpu = bam2ta_cpu, 79 | mem_factor = bam2ta_mem_factor, 80 | time_hr = bam2ta_time_hr, 81 | disk_factor = bam2ta_disk_factor, 82 | runtime_environment = runtime_environment, 83 | } 84 | call atac.bam2ta as se_bam2ta_disable_tn5_shift { input : 85 | bam = se_nodup_bam, 86 | disable_tn5_shift = true, 87 | subsample = 0, 88 | paired_end = false, 89 | mito_chr_name = mito_chr_name, 90 | 91 | cpu = bam2ta_cpu, 92 | mem_factor = bam2ta_mem_factor, 93 | time_hr = bam2ta_time_hr, 94 | disk_factor = bam2ta_disk_factor, 95 | runtime_environment = runtime_environment, 96 | } 97 | call atac.bam2ta as se_bam2ta_subsample { input : 98 | bam = se_nodup_bam, 99 | disable_tn5_shift = false, 100 | subsample = bam2ta_subsample, 101 | paired_end = false, 102 | mito_chr_name = 
mito_chr_name, 103 | 104 | cpu = bam2ta_cpu, 105 | mem_factor = bam2ta_mem_factor, 106 | time_hr = bam2ta_time_hr, 107 | disk_factor = bam2ta_disk_factor, 108 | runtime_environment = runtime_environment, 109 | } 110 | 111 | call compare_md5sum.compare_md5sum { input : 112 | labels = [ 113 | 'pe_bam2ta', 114 | 'pe_bam2ta_disable_tn5_shift', 115 | 'pe_bam2ta_subsample', 116 | 'se_bam2ta', 117 | 'se_bam2ta_disable_tn5_shift', 118 | 'se_bam2ta_subsample', 119 | ], 120 | files = [ 121 | pe_bam2ta.ta, 122 | pe_bam2ta_disable_tn5_shift.ta, 123 | pe_bam2ta_subsample.ta, 124 | se_bam2ta.ta, 125 | se_bam2ta_disable_tn5_shift.ta, 126 | se_bam2ta_subsample.ta, 127 | ], 128 | ref_files = [ 129 | ref_pe_ta, 130 | ref_pe_ta_disable_tn5_shift, 131 | ref_pe_ta_subsample, 132 | ref_se_ta, 133 | ref_se_ta_disable_tn5_shift, 134 | ref_se_ta_subsample, 135 | ], 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /dev/test/test_task/test_bowtie2.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_bowtie2.pe_fastqs_R1" : [ 3 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/fastqs/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 4 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/fastqs/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 5 | ], 6 | "test_bowtie2.pe_fastqs_R2" : [ 7 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/fastqs/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 8 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/fastqs/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 9 | ], 10 | "test_bowtie2.se_fastqs_R1" : [ 11 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/fastqs/rep1/ENCFF439VSY.subsampled.400.fastq.gz", 12 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/fastqs/rep1/ENCFF325FCQ.subsampled.400.fastq.gz", 13 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/fastqs/rep1/ENCFF683IQS.subsampled.400.fastq.gz", 14 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/fastqs/rep1/ENCFF744CHW.subsampled.400.fastq.gz" 15 | ], 16 | "test_bowtie2.pe_bowtie2_idx_tar" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38_chr19_chrM/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.chr19_chrM.fasta.tar", 17 | "test_bowtie2.se_bowtie2_idx_tar" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/bowtie2_index/mm10_no_alt_analysis_set_ENCODE.chr19_chrM.fasta.tar", 18 | 19 | "test_bowtie2.se_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10_chr19_chrM.chrom.sizes", 20 | "test_bowtie2.pe_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38_chr19_chrM/hg38_chr19_chrM.chrom.sizes", 21 | 22 | "test_bowtie2.ref_pe_flagstat" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bowtie2/pe/multimapping/merge_fastqs_R1_ENCFF341MYG.subsampled.400.trim.merged.samstats.qc", 23 | "test_bowtie2.ref_pe_flagstat_no_multimapping" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bowtie2/pe/no_multimapping/merge_fastqs_R1_ENCFF341MYG.subsampled.400.trim.merged.samstats.qc", 24 | "test_bowtie2.ref_se_flagstat" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bowtie2/se/multimapping/merge_fastqs_R1_ENCFF439VSY.subsampled.400.trim.merged.samstats.qc", 25 | "test_bowtie2.ref_se_flagstat_no_multimapping" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bowtie2/se/no_multimapping/merge_fastqs_R1_ENCFF439VSY.subsampled.400.trim.merged.samstats.qc", 26 | 27 | "test_bowtie2.multimapping" : 4 28 | } 29 | -------------------------------------------------------------------------------- /dev/test/test_task/test_bowtie2.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_bowtie2 { 6 | input { 7 | Int multimapping 8 | 9 | Array[String] pe_fastqs_R1 10 | Array[String] pe_fastqs_R2 11 | Array[String] se_fastqs_R1 12 | 13 | String se_chrsz 14 | String pe_chrsz 15 | String cutadapt_param = '-e 0.1 -m 5' 16 | 17 | # we don't compare BAM because BAM's header includes date 18 | # hence md5sums don't match all the time 19 | String ref_pe_flagstat 20 | String ref_pe_flagstat_no_multimapping 21 | 22 | String ref_se_flagstat 23 | String ref_se_flagstat_no_multimapping 24 | 25 | String pe_bowtie2_idx_tar 26 | String se_bowtie2_idx_tar 27 | 28 | Int bowtie2_cpu = 1 29 | Float bowtie2_mem_factor = 0.0 30 | Int bowtie2_time_hr = 48 31 | Float bowtie2_disk_factor = 6.0 32 | String docker 33 | } 34 | RuntimeEnvironment runtime_environment = { 35 | "docker": docker, 36 | "singularity": "", 37 | "conda": "" 38 | } 39 | 40 | call atac.align as pe_bowtie2 { input : 41 | aligner = 'bowtie2', 42 | idx_tar = pe_bowtie2_idx_tar, 43 | mito_chr_name = 'chrM', 44 | fastqs_R1 = pe_fastqs_R1, 45 | fastqs_R2 = pe_fastqs_R2, 46 | adapters_R1 = [], 47 | adapters_R2 = [], 48 | cutadapt_param = cutadapt_param, 49 | multimapping = multimapping, 50 | 
paired_end = true, 51 | chrsz = pe_chrsz, 52 | auto_detect_adapter = true, 53 | 54 | cpu = bowtie2_cpu, 55 | mem_factor = bowtie2_mem_factor, 56 | time_hr = bowtie2_time_hr, 57 | disk_factor = bowtie2_disk_factor, 58 | runtime_environment = runtime_environment, 59 | } 60 | call atac.align as pe_bowtie2_no_multimapping { input : 61 | aligner = 'bowtie2', 62 | idx_tar = pe_bowtie2_idx_tar, 63 | mito_chr_name = 'chrM', 64 | fastqs_R1 = pe_fastqs_R1, 65 | fastqs_R2 = pe_fastqs_R2, 66 | adapters_R1 = [], 67 | adapters_R2 = [], 68 | cutadapt_param = cutadapt_param, 69 | multimapping = 0, 70 | paired_end = true, 71 | chrsz = pe_chrsz, 72 | auto_detect_adapter = true, 73 | 74 | cpu = bowtie2_cpu, 75 | mem_factor = bowtie2_mem_factor, 76 | time_hr = bowtie2_time_hr, 77 | disk_factor = bowtie2_disk_factor, 78 | runtime_environment = runtime_environment, 79 | } 80 | call atac.align as se_bowtie2 { input : 81 | aligner = 'bowtie2', 82 | idx_tar = se_bowtie2_idx_tar, 83 | mito_chr_name = 'chrM', 84 | fastqs_R1 = se_fastqs_R1, 85 | fastqs_R2 = [], 86 | adapters_R1 = [], 87 | adapters_R2 = [], 88 | cutadapt_param = cutadapt_param, 89 | multimapping = multimapping, 90 | paired_end = false, 91 | chrsz = se_chrsz, 92 | auto_detect_adapter = true, 93 | 94 | cpu = bowtie2_cpu, 95 | mem_factor = bowtie2_mem_factor, 96 | time_hr = bowtie2_time_hr, 97 | disk_factor = bowtie2_disk_factor, 98 | runtime_environment = runtime_environment, 99 | } 100 | call atac.align as se_bowtie2_no_multimapping { input : 101 | aligner = 'bowtie2', 102 | idx_tar = se_bowtie2_idx_tar, 103 | mito_chr_name = 'chrM', 104 | fastqs_R1 = se_fastqs_R1, 105 | fastqs_R2 = [], 106 | adapters_R1 = [], 107 | adapters_R2 = [], 108 | cutadapt_param = cutadapt_param, 109 | multimapping = 0, 110 | paired_end = false, 111 | chrsz = se_chrsz, 112 | auto_detect_adapter = true, 113 | 114 | cpu = bowtie2_cpu, 115 | mem_factor = bowtie2_mem_factor, 116 | time_hr = bowtie2_time_hr, 117 | disk_factor = bowtie2_disk_factor, 118 | 
runtime_environment = runtime_environment, 119 | } 120 | 121 | call compare_md5sum.compare_md5sum { input : 122 | labels = [ 123 | 'pe_bowtie2', 124 | 'pe_bowtie2_no_multimapping', 125 | 'se_bowtie2', 126 | 'se_bowtie2_no_multimapping', 127 | ], 128 | files = [ 129 | pe_bowtie2.samstat_qc, 130 | pe_bowtie2_no_multimapping.samstat_qc, 131 | se_bowtie2.samstat_qc, 132 | se_bowtie2_no_multimapping.samstat_qc, 133 | ], 134 | ref_files = [ 135 | ref_pe_flagstat, 136 | ref_pe_flagstat_no_multimapping, 137 | ref_se_flagstat, 138 | ref_se_flagstat_no_multimapping, 139 | ], 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /dev/test/test_task/test_compare_signal_to_roadmap.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_compare_signal_to_roadmap.reg2map_bed" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz", 3 | "test_compare_signal_to_roadmap.reg2map" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz", 4 | "test_compare_signal_to_roadmap.roadmap_meta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt", 5 | 6 | "test_compare_signal_to_roadmap.pval_bw" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/ataqc/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.pval.signal.bigwig", 7 | 8 | "test_compare_signal_to_roadmap.ref_roadmap_compare_log" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_compare_signal_to_roadmap/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.pval.signal.roadmap_compare.log" 9 | } 10 | 
-------------------------------------------------------------------------------- /dev/test/test_task/test_compare_signal_to_roadmap.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_compare_signal_to_roadmap { 6 | input { 7 | File pval_bw 8 | File reg2map_bed 9 | File reg2map 10 | File roadmap_meta 11 | 12 | File ref_roadmap_compare_log 13 | String docker 14 | } 15 | RuntimeEnvironment runtime_environment = { 16 | "docker": docker, 17 | "singularity": "", 18 | "conda": "" 19 | } 20 | 21 | call atac.compare_signal_to_roadmap { input : 22 | pval_bw = pval_bw, 23 | 24 | reg2map_bed = reg2map_bed, 25 | reg2map = reg2map, 26 | roadmap_meta = roadmap_meta, 27 | runtime_environment = runtime_environment, 28 | } 29 | 30 | call compare_md5sum.compare_md5sum { input : 31 | labels = [ 32 | 'ref_roadmap_compare_log', 33 | ], 34 | files = [ 35 | compare_signal_to_roadmap.roadmap_compare_log, 36 | ], 37 | ref_files = [ 38 | ref_roadmap_compare_log, 39 | ], 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /dev/test/test_task/test_count_signal_track.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_count_signal_track.se_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38_chr19_chrM/hg38_chr19_chrM.chrom.sizes", 3 | 4 | "test_count_signal_track.se_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 5 | 6 | "test_count_signal_track.ref_se_count_signal_track_pos_bw" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_count_signal_track/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.positive.bigwig", 7 | "test_count_signal_track.ref_se_count_signal_track_neg_bw" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_count_signal_track/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.negative.bigwig" 8 | } 9 | -------------------------------------------------------------------------------- /dev/test/test_task/test_count_signal_track.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_count_signal_track { 6 | input { 7 | String se_ta 8 | 9 | String ref_se_count_signal_track_pos_bw 10 | String ref_se_count_signal_track_neg_bw 11 | 12 | String se_chrsz 13 | String docker 14 | } 15 | RuntimeEnvironment runtime_environment = { 16 | "docker": docker, 17 | "singularity": "", 18 | "conda": "" 19 | } 20 | 21 | call atac.count_signal_track as se_count_signal_track { input : 22 | ta = se_ta, 23 | chrsz = se_chrsz, 24 | runtime_environment = runtime_environment, 25 | } 26 | 27 | call compare_md5sum.compare_md5sum { input : 28 | labels = [ 29 | 'se_count_signal_track_pos_bw', 30 | 'se_count_signal_track_neg_bw', 31 | ], 32 | files = [ 33 | se_count_signal_track.pos_bw, 34 | se_count_signal_track.neg_bw, 35 | ], 36 | ref_files = [ 37 | ref_se_count_signal_track_pos_bw, 38 | ref_se_count_signal_track_neg_bw, 39 | ], 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /dev/test/test_task/test_filter.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_filter.se_chrsz" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10_chr19_chrM.chrom.sizes", 3 | "test_filter.pe_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38_chr19_chrM/hg38_chr19_chrM.chrom.sizes", 4 | "test_filter.pe_bam" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/bams/rep1/ENCFF341MYG.subsampled.400.trim.merged.bam", 5 | "test_filter.pe_bam_no_multimapping" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/bams_no_multimapping/rep1/ENCFF341MYG.subsampled.400.trim.merged.bam", 6 | "test_filter.se_bam" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/bams/rep1/ENCFF439VSY.subsampled.400.trim.merged.bam", 7 | "test_filter.se_bam_no_multimapping" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/bams_no_multimapping/rep1/ENCFF439VSY.subsampled.400.trim.merged.bam", 8 | 9 | "test_filter.ref_pe_nodup_samstat_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_filter/pe/multimapping/ENCFF341MYG.subsampled.400.trim.merged.nodup.no_chrM.samstats.qc", 10 | "test_filter.ref_pe_nodup_samstat_qc_no_multimapping" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_filter/pe/no_multimapping/ENCFF341MYG.subsampled.400.trim.merged.nodup.no_chrM.samstats.qc", 11 | "test_filter.ref_pe_filt_samstat_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_filter/pe/no_dup_removal/ENCFF341MYG.subsampled.400.trim.merged.filt.no_chrM.samstats.qc", 12 | 13 | "test_filter.ref_se_nodup_samstat_qc" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_filter/se/multimapping/ENCFF439VSY.subsampled.400.trim.merged.nodup.no_chrM.samstats.qc", 14 | "test_filter.ref_se_nodup_samstat_qc_no_multimapping" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_filter/se/no_multimapping/ENCFF439VSY.subsampled.400.trim.merged.nodup.no_chrM.samstats.qc", 15 | "test_filter.ref_se_filt_samstat_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_filter/se/no_dup_removal/ENCFF439VSY.subsampled.400.trim.merged.filt.no_chrM.samstats.qc", 16 | 17 | "test_filter.multimapping" : 4 18 | } 19 | -------------------------------------------------------------------------------- /dev/test/test_task/test_frac_mito.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_frac_mito.non_mito_samstat" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/samstat_qcs/ENCFF341MYG.subsampled.400.trim.merged.non_mito.samstats.qc", 3 | "test_frac_mito.mito_samstat" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/samstat_qcs/ENCFF341MYG.subsampled.400.trim.merged.samstats.qc", 4 | "test_frac_mito.ref_frac_mito_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_frac_mito/ENCFF341MYG.subsampled.400.trim.merged.frac_mito.qc" 5 | } 6 | -------------------------------------------------------------------------------- /dev/test/test_task/test_frac_mito.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_frac_mito { 6 | input { 7 | File non_mito_samstat 8 | File mito_samstat 9 | 10 | File 
ref_frac_mito_qc 11 | String docker 12 | } 13 | RuntimeEnvironment runtime_environment = { 14 | "docker": docker, 15 | "singularity": "", 16 | "conda": "" 17 | } 18 | 19 | call atac.frac_mito as frac_mito { input: 20 | non_mito_samstat = non_mito_samstat, 21 | mito_samstat = mito_samstat, 22 | runtime_environment = runtime_environment, 23 | } 24 | 25 | call compare_md5sum.compare_md5sum { input : 26 | labels = [ 27 | 'frac_mito', 28 | ], 29 | files = [ 30 | frac_mito.frac_mito_qc, 31 | ], 32 | ref_files = [ 33 | ref_frac_mito_qc, 34 | ], 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /dev/test/test_task/test_fraglen_stat_pe.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_fraglen_stat_pe.nodup_bam" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/ataqc/ENCFF341MYG.subsampled.400.trim.merged.nodup.bam", 3 | 4 | "test_fraglen_stat_pe.ref_nucleosomal_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_fraglen_stat_pe/ENCFF341MYG.subsampled.400.trim.merged.nodup.nucleosomal.qc" 5 | } 6 | -------------------------------------------------------------------------------- /dev/test/test_task/test_fraglen_stat_pe.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_fraglen_stat_pe { 6 | input { 7 | File nodup_bam 8 | 9 | File ref_nucleosomal_qc 10 | String docker 11 | } 12 | RuntimeEnvironment runtime_environment = { 13 | "docker": docker, 14 | "singularity": "", 15 | "conda": "" 16 | } 17 | 18 | call atac.fraglen_stat_pe { input : 19 | nodup_bam = nodup_bam, 20 | picard_java_heap = '4G', 21 | runtime_environment = runtime_environment, 22 | } 23 | 24 | call compare_md5sum.compare_md5sum { input : 25 | 
labels = [ 26 | 'test_nucleosomal_qc', 27 | ], 28 | files = [ 29 | fraglen_stat_pe.nucleosomal_qc, 30 | ], 31 | ref_files = [ 32 | ref_nucleosomal_qc, 33 | ], 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /dev/test/test_task/test_gc_bias.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_gc_bias.ref_fa" : "GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz", 3 | "test_gc_bias.nodup_bam" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/ataqc/ENCFF341MYG.subsampled.400.trim.merged.nodup.bam", 4 | 5 | "test_gc_bias.ref_gc_log" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_gc_bias/ENCFF341MYG.subsampled.400.trim.merged.nodup.gc.txt" 6 | } 7 | -------------------------------------------------------------------------------- /dev/test/test_task/test_gc_bias.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_gc_bias { 6 | input { 7 | File nodup_bam 8 | 9 | File ref_fa 10 | 11 | File ref_gc_log 12 | String docker 13 | } 14 | RuntimeEnvironment runtime_environment = { 15 | "docker": docker, 16 | "singularity": "", 17 | "conda": "" 18 | } 19 | 20 | call atac.gc_bias { input : 21 | nodup_bam = nodup_bam, 22 | ref_fa = ref_fa, 23 | picard_java_heap = '4G', 24 | runtime_environment = runtime_environment, 25 | } 26 | 27 | call remove_comments_from_gc_log { input : 28 | gc_log = gc_bias.gc_log 29 | } 30 | 31 | call remove_comments_from_gc_log as remove_comments_from_gc_log_ref { input : 32 | gc_log = ref_gc_log 33 | } 34 | 35 | call compare_md5sum.compare_md5sum { input : 36 | labels = [ 37 | 'test_gc_log', 38 | ], 39 | files = [ 40 | remove_comments_from_gc_log.filt_gc_log, 41 | ], 42 | ref_files = [ 43 | 
remove_comments_from_gc_log_ref.filt_gc_log, 44 | ], 45 | } 46 | } 47 | 48 | task remove_comments_from_gc_log { 49 | input { 50 | File gc_log 51 | } 52 | command { 53 | zcat -f ${gc_log} | grep -v '# ' \ 54 | > ${basename(gc_log) + '.date_filt_out'} 55 | } 56 | output { 57 | File filt_gc_log = glob('*.date_filt_out')[0] 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /dev/test/test_task/test_idr.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_idr.se_blacklist" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10.blacklist.bed.gz", 3 | "test_idr.se_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10_chr19_chrM.chrom.sizes", 4 | 5 | "test_idr.se_peak_rep1" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/peaks/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pval0.01.300K.narrowPeak.gz", 6 | "test_idr.se_peak_rep2" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/peaks/rep2/ENCFF463QCX.subsampled.400.trim.merged.nodup.tn5.pval0.01.300K.narrowPeak.gz", 7 | "test_idr.se_peak_pooled" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/peaks/pooled_rep/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pooled.pval0.01.300K.narrowPeak.gz", 8 | "test_idr.se_ta_pooled" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/pooled_rep/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pooled.tagAlign.gz", 9 | 10 | "test_idr.ref_se_idr_peak" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_idr/rep1-rep2.idr0.05.narrowPeak.gz", 11 | "test_idr.ref_se_idr_bfilt_peak" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_idr/rep1-rep2.idr0.05.bfilt.narrowPeak.gz", 12 | "test_idr.ref_se_idr_frip_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_idr/rep1-rep2.idr0.05.bfilt.frip.qc", 13 | 14 | "test_idr.idr_thresh" : 0.05 15 | } 16 | -------------------------------------------------------------------------------- /dev/test/test_task/test_idr.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_idr { 6 | input { 7 | Float idr_thresh 8 | 9 | String se_peak_rep1 10 | String se_peak_rep2 11 | String se_peak_pooled 12 | String se_ta_pooled 13 | 14 | String ref_se_idr_peak 15 | String ref_se_idr_bfilt_peak 16 | String ref_se_idr_frip_qc 17 | 18 | String se_blacklist 19 | String se_chrsz 20 | 21 | String regex_bfilt_peak_chr_name = 'chr[\\dXY]+' 22 | String docker 23 | } 24 | RuntimeEnvironment runtime_environment = { 25 | "docker": docker, 26 | "singularity": "", 27 | "conda": "" 28 | } 29 | 30 | call atac.idr as se_idr { input : 31 | prefix = 'rep1-rep2', 32 | peak1 = se_peak_rep1, 33 | peak2 = se_peak_rep2, 34 | peak_pooled = se_peak_pooled, 35 | idr_thresh = idr_thresh, 36 | peak_type = 'narrowPeak', 37 | rank = 'p.value', 38 | blacklist = se_blacklist, 39 | chrsz = se_chrsz, 40 | regex_bfilt_peak_chr_name = regex_bfilt_peak_chr_name, 41 | ta = se_ta_pooled, 42 | runtime_environment = runtime_environment, 43 | } 44 | 45 | call compare_md5sum.compare_md5sum { input : 46 | labels = [ 47 | 'se_idr_peak', 48 | 'se_idr_bfilt_peak', 49 | 'se_idr_frip_qc', 50 | ], 51 | files = [se_idr.idr_peak, 52 | se_idr.bfilt_idr_peak, 53 | se_idr.frip_qc, 54 | ], 55 | ref_files = [ 56 | ref_se_idr_peak, 57 | ref_se_idr_bfilt_peak, 58 | ref_se_idr_frip_qc, 59 | ], 60 | } 61 | } 62 | 
-------------------------------------------------------------------------------- /dev/test/test_task/test_jsd.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_jsd.se_nodup_bams" : [ 3 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/nodup_bams/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.bam" 4 | ], 5 | "test_jsd.se_blacklist" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10.blacklist.bed.gz", 6 | 7 | "test_jsd.se_fake_blacklist" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/fake_blacklist/mm10.whole_chr19.blacklist.bed.gz", 8 | 9 | "test_jsd.ref_se_jsd_logs" : [ 10 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_jsd/rep1.ENCFF439VSY.subsampled.400.trim.merged.nodup.bfilt.jsd.qc" 11 | ], 12 | "test_jsd.ref_se_jsd_fake_blacklist_logs" : [ 13 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_jsd/fake_blacklist/rep1.ENCFF439VSY.subsampled.400.trim.merged.nodup.bfilt.jsd.qc" 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /dev/test/test_task/test_jsd.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_jsd { 6 | input { 7 | Array[File] se_nodup_bams 8 | File se_blacklist 9 | File se_fake_blacklist 10 | Array[File] ref_se_jsd_logs 11 | Array[File] ref_se_jsd_fake_blacklist_logs 12 | # task level test data (BAM) is generated from BWA 13 | # so we keep using 30 here, this should be 255 for bowtie2 BAMs 14 | Int mapq_thresh = 30 15 | 16 | Int jsd_cpu = 1 17 | Float jsd_mem_factor = 0.0 18 | Int jsd_time_hr = 12 19 | Float 
jsd_disk_factor = 2.0 20 | String docker 21 | } 22 | RuntimeEnvironment runtime_environment = { 23 | "docker": docker, 24 | "singularity": "", 25 | "conda": "" 26 | } 27 | 28 | call atac.jsd as se_jsd { input : 29 | nodup_bams = se_nodup_bams, 30 | blacklist = se_blacklist, 31 | mapq_thresh = mapq_thresh, 32 | 33 | cpu = jsd_cpu, 34 | mem_factor = jsd_mem_factor, 35 | time_hr = jsd_time_hr, 36 | disk_factor = jsd_disk_factor, 37 | runtime_environment = runtime_environment, 38 | } 39 | 40 | call atac.jsd as se_jsd_fake_blacklist { input : 41 | nodup_bams = se_nodup_bams, 42 | blacklist = se_fake_blacklist, 43 | mapq_thresh = mapq_thresh, 44 | 45 | cpu = jsd_cpu, 46 | mem_factor = jsd_mem_factor, 47 | time_hr = jsd_time_hr, 48 | disk_factor = jsd_disk_factor, 49 | runtime_environment = runtime_environment, 50 | } 51 | 52 | # take first 8 columns (values in other columns are random) 53 | #scatter(i in range(2)){ 54 | # call take_8_cols { input : 55 | # f = se_jsd.jsd_qcs[i], 56 | # } 57 | # call take_8_cols as ref_take_8_cols { input : 58 | # f = ref_se_jsd_logs[i], 59 | # } 60 | #} 61 | 62 | call compare_md5sum.compare_md5sum { input : 63 | labels = [ 64 | 'se_jsd_rep1', 65 | 'se_jsd_fake_blacklist_rep1', 66 | ], 67 | files = [ 68 | #take_8_cols.out[0], 69 | se_jsd.jsd_qcs[0], 70 | se_jsd_fake_blacklist.jsd_qcs[0], 71 | ], 72 | ref_files = [ 73 | #ref_take_8_cols.out[0], 74 | ref_se_jsd_logs[0], 75 | ref_se_jsd_fake_blacklist_logs[0], 76 | ], 77 | } 78 | } 79 | 80 | task take_8_cols { 81 | input { 82 | File f 83 | } 84 | command { 85 | cut -f 1-8 ${f} > out.txt 86 | } 87 | output { 88 | File out = 'out.txt' 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /dev/test/test_task/test_macs2.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_macs2.se_blacklist" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10.blacklist.bed.gz", 3 | "test_macs2.se_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10_chr19_chrM.chrom.sizes", 4 | "test_macs2.se_gensz" : "mm", 5 | 6 | "test_macs2.se_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 7 | 8 | "test_macs2.ref_se_macs2_npeak" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_macs2/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pval0.01.300K.narrowPeak.gz", 9 | "test_macs2.ref_se_macs2_bfilt_npeak" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_macs2/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pval0.01.300K.bfilt.narrowPeak.gz", 10 | "test_macs2.ref_se_macs2_frip_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_macs2/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pval0.01.300K.bfilt.frip.qc", 11 | 12 | "test_macs2.cap_num_peak" : 300000, 13 | "test_macs2.pval_thresh" : 0.01, 14 | "test_macs2.smooth_win" : 150 15 | } 16 | -------------------------------------------------------------------------------- /dev/test/test_task/test_macs2.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_macs2 { 6 | input { 7 | Int cap_num_peak 8 | Float pval_thresh 9 | Int smooth_win 10 | 11 | # test macs2 for SE set only 12 | String se_ta 13 | 14 | String ref_se_macs2_npeak # raw narrow-peak 15 | String ref_se_macs2_bfilt_npeak # blacklist filtered narrow-peak 16 | String ref_se_macs2_frip_qc 
17 | 18 | String se_blacklist 19 | String se_chrsz 20 | String se_gensz 21 | 22 | String regex_bfilt_peak_chr_name = 'chr[\\dXY]+' 23 | 24 | Float macs2_mem_factor = 2.0 25 | Int macs2_time_hr = 24 26 | Float macs2_disk_factor = 15.0 27 | String docker 28 | } 29 | RuntimeEnvironment runtime_environment = { 30 | "docker": docker, 31 | "singularity": "", 32 | "conda": "" 33 | } 34 | 35 | call atac.call_peak as se_macs2 { input : 36 | peak_caller = 'macs2', 37 | peak_type = 'narrowPeak', 38 | ta = se_ta, 39 | gensz = se_gensz, 40 | chrsz = se_chrsz, 41 | cap_num_peak = cap_num_peak, 42 | pval_thresh = pval_thresh, 43 | smooth_win = smooth_win, 44 | blacklist = se_blacklist, 45 | regex_bfilt_peak_chr_name = regex_bfilt_peak_chr_name, 46 | 47 | cpu = 2, 48 | mem_factor = macs2_mem_factor, 49 | time_hr = macs2_time_hr, 50 | disk_factor = macs2_disk_factor, 51 | runtime_environment = runtime_environment, 52 | } 53 | 54 | call compare_md5sum.compare_md5sum { input : 55 | labels = [ 56 | 'se_macs2_npeak', 57 | 'se_macs2_bfilt_npeak', 58 | 'se_macs2_frip_qc', 59 | ], 60 | files = [ 61 | se_macs2.peak, 62 | se_macs2.bfilt_peak, 63 | se_macs2.frip_qc, 64 | ], 65 | ref_files = [ 66 | ref_se_macs2_npeak, 67 | ref_se_macs2_bfilt_npeak, 68 | ref_se_macs2_frip_qc, 69 | ], 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /dev/test/test_task/test_macs2_signal_track.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_macs2_signal_track.se_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10_chr19_chrM.chrom.sizes", 3 | "test_macs2_signal_track.se_gensz" : "mm", 4 | 5 | "test_macs2_signal_track.se_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 6 | 7 | 
"test_macs2_signal_track.ref_se_macs2_pval_bw" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_macs2/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pval.signal.bigwig", 8 | 9 | "test_macs2_signal_track.pval_thresh" : 0.01, 10 | "test_macs2_signal_track.smooth_win" : 150 11 | } 12 | -------------------------------------------------------------------------------- /dev/test/test_task/test_macs2_signal_track.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_macs2_signal_track { 6 | input { 7 | Float pval_thresh 8 | Int smooth_win 9 | 10 | # test macs2 for SE set only 11 | String se_ta 12 | 13 | String ref_se_macs2_pval_bw # p-val signal 14 | 15 | String se_chrsz 16 | String se_gensz 17 | 18 | Float macs2_mem_factor = 0.0 19 | Int macs2_time_hr = 24 20 | Float macs2_disk_factor = 40.0 21 | String docker 22 | } 23 | RuntimeEnvironment runtime_environment = { 24 | "docker": docker, 25 | "singularity": "", 26 | "conda": "" 27 | } 28 | 29 | call atac.macs2_signal_track as se_macs2_signal_track { input : 30 | ta = se_ta, 31 | gensz = se_gensz, 32 | chrsz = se_chrsz, 33 | pval_thresh = pval_thresh, 34 | smooth_win = smooth_win, 35 | 36 | mem_factor = macs2_mem_factor, 37 | time_hr = macs2_time_hr, 38 | disk_factor = macs2_disk_factor, 39 | runtime_environment = runtime_environment, 40 | } 41 | 42 | call compare_md5sum.compare_md5sum { input : 43 | labels = [ 44 | 'se_macs2_pval_bw', 45 | ], 46 | files = [ 47 | se_macs2_signal_track.pval_bw, 48 | ], 49 | ref_files = [ 50 | ref_se_macs2_pval_bw, 51 | ], 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /dev/test/test_task/test_overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_overlap.se_blacklist" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10.blacklist.bed.gz", 3 | "test_overlap.se_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10_chr19_chrM.chrom.sizes", 4 | 5 | "test_overlap.se_peak_rep1" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/peaks/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pval0.01.300K.narrowPeak.gz", 6 | "test_overlap.se_peak_rep2" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/peaks/rep2/ENCFF463QCX.subsampled.400.trim.merged.nodup.tn5.pval0.01.300K.narrowPeak.gz", 7 | "test_overlap.se_peak_pooled" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/peaks/pooled_rep/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pooled.pval0.01.300K.narrowPeak.gz", 8 | "test_overlap.se_ta_pooled" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/pooled_rep/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pooled.tagAlign.gz", 9 | 10 | "test_overlap.ref_se_overlap_peak" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_overlap/rep1-rep2.overlap.narrowPeak.gz", 11 | "test_overlap.ref_se_overlap_bfilt_peak" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_overlap/rep1-rep2.overlap.bfilt.narrowPeak.gz", 12 | "test_overlap.ref_se_overlap_frip_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_overlap/rep1-rep2.overlap.bfilt.frip.qc" 13 | } 14 | -------------------------------------------------------------------------------- /dev/test/test_task/test_overlap.wdl: 
-------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_overlap { 6 | input { 7 | String se_peak_rep1 # test overlap,idr for SE set only 8 | String se_peak_rep2 9 | String se_peak_pooled 10 | String se_ta_pooled 11 | 12 | String ref_se_overlap_peak 13 | String ref_se_overlap_bfilt_peak 14 | String ref_se_overlap_frip_qc 15 | 16 | String se_blacklist 17 | String se_chrsz 18 | 19 | String regex_bfilt_peak_chr_name = 'chr[\\dXY]+' 20 | String docker 21 | } 22 | RuntimeEnvironment runtime_environment = { 23 | "docker": docker, 24 | "singularity": "", 25 | "conda": "" 26 | } 27 | 28 | call atac.overlap as se_overlap { input : 29 | prefix = 'rep1-rep2', 30 | peak1 = se_peak_rep1, 31 | peak2 = se_peak_rep2, 32 | peak_pooled = se_peak_pooled, 33 | peak_type = 'narrowPeak', 34 | blacklist = se_blacklist, 35 | regex_bfilt_peak_chr_name = regex_bfilt_peak_chr_name, 36 | chrsz = se_chrsz, 37 | ta = se_ta_pooled, 38 | runtime_environment = runtime_environment, 39 | } 40 | 41 | call compare_md5sum.compare_md5sum { input : 42 | labels = [ 43 | 'se_overlap_peak', 44 | 'se_overlap_bfilt_peak', 45 | 'se_overlap_frip_qc', 46 | ], 47 | files = [ 48 | se_overlap.overlap_peak, 49 | se_overlap.bfilt_overlap_peak, 50 | se_overlap.frip_qc, 51 | ], 52 | ref_files = [ 53 | ref_se_overlap_peak, 54 | ref_se_overlap_bfilt_peak, 55 | ref_se_overlap_frip_qc, 56 | ], 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /dev/test/test_task/test_pool_ta.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_pool_ta.se_ta_rep1" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 3 | "test_pool_ta.se_ta_rep2" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/rep2/ENCFF463QCX.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 4 | 5 | "test_pool_ta.ref_se_pooled_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_pool_ta/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pooled.tagAlign.gz" 6 | } 7 | -------------------------------------------------------------------------------- /dev/test/test_task/test_pool_ta.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_pool_ta { 6 | input { 7 | String se_ta_rep1 8 | String se_ta_rep2 9 | 10 | String ref_se_pooled_ta 11 | String docker 12 | } 13 | RuntimeEnvironment runtime_environment = { 14 | "docker": docker, 15 | "singularity": "", 16 | "conda": "" 17 | } 18 | 19 | call atac.pool_ta as se_pool_ta { input : 20 | tas = [se_ta_rep1, se_ta_rep2], 21 | runtime_environment = runtime_environment, 22 | } 23 | 24 | call compare_md5sum.compare_md5sum { input : 25 | labels = [ 26 | 'se_pool_ta', 27 | ], 28 | files = [ 29 | se_pool_ta.ta_pooled, 30 | ], 31 | ref_files = [ 32 | ref_se_pooled_ta, 33 | ], 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /dev/test/test_task/test_preseq.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_preseq.paired_end" : true, 3 | "test_preseq.bam" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/ataqc/ENCFF341MYG.subsampled.400.trim.merged.bam", 4 | 5 | "test_preseq.ref_picard_est_lib_size_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_preseq/ENCFF341MYG.subsampled.400.trim.merged.picard_est_lib_size.qc", 6 | 
"test_preseq.ref_preseq_log" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_preseq/ENCFF341MYG.subsampled.400.trim.merged.preseq.log" 7 | } 8 | -------------------------------------------------------------------------------- /dev/test/test_task/test_preseq.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_preseq { 6 | input { 7 | File bam 8 | Boolean paired_end 9 | 10 | File ref_picard_est_lib_size_qc 11 | File ref_preseq_log 12 | 13 | Float preseq_mem_factor = 0.0 14 | Float preseq_disk_factor = 5.0 15 | String docker 16 | } 17 | RuntimeEnvironment runtime_environment = { 18 | "docker": docker, 19 | "singularity": "", 20 | "conda": "" 21 | } 22 | 23 | call atac.preseq { input : 24 | paired_end = paired_end, 25 | bam = bam, 26 | mem_factor = preseq_mem_factor, 27 | disk_factor = preseq_disk_factor, 28 | picard_java_heap = '4G', 29 | runtime_environment = runtime_environment, 30 | } 31 | 32 | call compare_md5sum.compare_md5sum { input : 33 | labels = [ 34 | 'test_picard_est_lib_size_qc', 35 | 'test_preseq_log', 36 | ], 37 | files = select_all([ 38 | preseq.picard_est_lib_size_qc, 39 | preseq.preseq_log, 40 | ]), 41 | ref_files = [ 42 | ref_picard_est_lib_size_qc, 43 | ref_preseq_log, 44 | ], 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /dev/test/test_task/test_reproducibility.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_reproducibility.se_overlap_peak_rep1_vs_rep2" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/overlap_peaks/rep1-rep2.overlap.bfilt.narrowPeak.gz", 3 | "test_reproducibility.se_overlap_peak_rep1_pr" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/overlap_peaks/rep1-pr.overlap.bfilt.narrowPeak.gz", 4 | "test_reproducibility.se_overlap_peak_rep2_pr" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/overlap_peaks/rep2-pr.overlap.bfilt.narrowPeak.gz", 5 | "test_reproducibility.se_overlap_peak_ppr" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/overlap_peaks/ppr.overlap.bfilt.narrowPeak.gz", 6 | "test_reproducibility.se_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10_chr19_chrM.chrom.sizes", 7 | 8 | "test_reproducibility.ref_se_reproducibility_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_reproducibility/overlap.reproducibility.qc" 9 | } 10 | -------------------------------------------------------------------------------- /dev/test/test_task/test_reproducibility.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_reproducibility { 6 | input { 7 | String se_overlap_peak_rep1_vs_rep2 8 | String se_overlap_peak_rep1_pr 9 | String se_overlap_peak_rep2_pr 10 | String se_overlap_peak_ppr 11 | String se_chrsz 12 | 13 | String ref_se_reproducibility_qc 14 | String docker 15 | } 16 | RuntimeEnvironment runtime_environment = { 17 | "docker": docker, 18 | "singularity": "", 19 | "conda": "" 20 | } 21 | 22 | call atac.reproducibility as se_reproducibility { input : 23 | prefix = 'overlap', 24 | peaks = [se_overlap_peak_rep1_vs_rep2], 25 | peaks_pr = [se_overlap_peak_rep1_pr, se_overlap_peak_rep2_pr], 26 | peak_ppr = se_overlap_peak_ppr, 27 | peak_type = 'narrowPeak', 28 | chrsz = se_chrsz, 29 | runtime_environment = 
runtime_environment, 30 | } 31 | 32 | call compare_md5sum.compare_md5sum { input : 33 | labels = [ 34 | 'se_reproducibility', 35 | ], 36 | files = [ 37 | se_reproducibility.reproducibility_qc, 38 | ], 39 | ref_files = [ 40 | ref_se_reproducibility_qc, 41 | ], 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /dev/test/test_task/test_spr.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_spr.pe_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/tas/rep1/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 3 | "test_spr.se_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 4 | 5 | "test_spr.ref_pe_ta_pr1" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_spr/pe/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.pr1.tagAlign.gz", 6 | "test_spr.ref_pe_ta_pr2" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_spr/pe/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.pr2.tagAlign.gz", 7 | "test_spr.ref_se_ta_pr1" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_spr/se/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pr1.tagAlign.gz", 8 | "test_spr.ref_se_ta_pr2" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_spr/se/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pr2.tagAlign.gz", 9 | 10 | "test_spr.ref_pe_seed_10_ta_pr1" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_spr/pe/pseudoreplication_random_seed_10/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.pr1.tagAlign.gz", 11 | "test_spr.ref_pe_seed_10_ta_pr2" 
: "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_spr/pe/pseudoreplication_random_seed_10/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.pr2.tagAlign.gz", 12 | "test_spr.ref_se_seed_10_ta_pr1" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_spr/se/pseudoreplication_random_seed_10/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pr1.tagAlign.gz", 13 | "test_spr.ref_se_seed_10_ta_pr2" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_spr/se/pseudoreplication_random_seed_10/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pr2.tagAlign.gz" 14 | } 15 | -------------------------------------------------------------------------------- /dev/test/test_task/test_spr.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_spr { 6 | input { 7 | File pe_ta 8 | File se_ta 9 | 10 | File ref_pe_ta_pr1 11 | File ref_pe_ta_pr2 12 | File ref_se_ta_pr1 13 | File ref_se_ta_pr2 14 | File ref_pe_seed_10_ta_pr1 15 | File ref_pe_seed_10_ta_pr2 16 | File ref_se_seed_10_ta_pr1 17 | File ref_se_seed_10_ta_pr2 18 | 19 | Float spr_mem_factor = 0.0 20 | Float spr_disk_factor = 6.0 21 | String docker 22 | } 23 | RuntimeEnvironment runtime_environment = { 24 | "docker": docker, 25 | "singularity": "", 26 | "conda": "" 27 | } 28 | 29 | call atac.spr as pe_spr { input : 30 | ta = pe_ta, 31 | paired_end = true, 32 | pseudoreplication_random_seed = 0, 33 | mem_factor = spr_mem_factor, 34 | disk_factor = spr_disk_factor, 35 | runtime_environment = runtime_environment, 36 | } 37 | call atac.spr as se_spr { input : 38 | ta = se_ta, 39 | paired_end = false, 40 | pseudoreplication_random_seed = 0, 41 | mem_factor = spr_mem_factor, 42 | disk_factor = spr_disk_factor, 43 | 
runtime_environment = runtime_environment, 44 | } 45 | call atac.spr as pe_spr_seed_10 { input : 46 | ta = pe_ta, 47 | paired_end = true, 48 | pseudoreplication_random_seed = 10, 49 | mem_factor = spr_mem_factor, 50 | disk_factor = spr_disk_factor, 51 | runtime_environment = runtime_environment, 52 | } 53 | call atac.spr as se_spr_seed_10 { input : 54 | ta = se_ta, 55 | paired_end = false, 56 | pseudoreplication_random_seed = 10, 57 | mem_factor = spr_mem_factor, 58 | disk_factor = spr_disk_factor, 59 | runtime_environment = runtime_environment, 60 | } 61 | 62 | call compare_md5sum.compare_md5sum { input : 63 | labels = [ 64 | 'pe_spr_pr1', 65 | 'pe_spr_pr2', 66 | 'se_spr_pr1', 67 | 'se_spr_pr2', 68 | 'pe_spr_seed_10_pr1', 69 | 'pe_spr_seed_10_pr2', 70 | 'se_spr_seed_10_pr1', 71 | 'se_spr_seed_10_pr2', 72 | ], 73 | files = [ 74 | pe_spr.ta_pr1, 75 | pe_spr.ta_pr2, 76 | se_spr.ta_pr1, 77 | se_spr.ta_pr2, 78 | pe_spr_seed_10.ta_pr1, 79 | pe_spr_seed_10.ta_pr2, 80 | se_spr_seed_10.ta_pr1, 81 | se_spr_seed_10.ta_pr2, 82 | ], 83 | ref_files = [ 84 | ref_pe_ta_pr1, 85 | ref_pe_ta_pr2, 86 | ref_se_ta_pr1, 87 | ref_se_ta_pr2, 88 | ref_pe_seed_10_ta_pr1, 89 | ref_pe_seed_10_ta_pr2, 90 | ref_se_seed_10_ta_pr1, 91 | ref_se_seed_10_ta_pr2, 92 | ], 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /dev/test/test_task/test_tss_enrich.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_tss_enrich.tss" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38/ataqc/ENCFF766FGL.bed.gz", 3 | "test_tss_enrich.chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38/hg38.chrom.sizes", 4 | 5 | "test_tss_enrich.read_len_log" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/ataqc/ENCFF341MYG.subsampled.400.trim.merged.read_length.txt", 
6 | "test_tss_enrich.nodup_bam" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/ataqc/ENCFF341MYG.subsampled.400.trim.merged.nodup.bam", 7 | 8 | "test_tss_enrich.ref_tss_enrich_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_tss_enrich/ENCFF341MYG.subsampled.400.trim.merged.nodup.tss_enrich.qc" 9 | } 10 | -------------------------------------------------------------------------------- /dev/test/test_task/test_tss_enrich.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_tss_enrich { 6 | input { 7 | File read_len_log 8 | File nodup_bam 9 | File tss 10 | File chrsz 11 | 12 | File ref_tss_enrich_qc 13 | String docker 14 | } 15 | RuntimeEnvironment runtime_environment = { 16 | "docker": docker, 17 | "singularity": "", 18 | "conda": "" 19 | } 20 | 21 | Int? 
read_len_ = read_int(read_len_log) 22 | 23 | call atac.tss_enrich { input : 24 | read_len = read_len_, 25 | nodup_bam = nodup_bam, 26 | chrsz = chrsz, 27 | tss = tss, 28 | runtime_environment = runtime_environment, 29 | } 30 | 31 | call compare_md5sum.compare_md5sum { input : 32 | labels = [ 33 | 'test_tss_enrich_qc', 34 | ], 35 | files = [ 36 | tss_enrich.tss_enrich_qc, 37 | ], 38 | ref_files = [ 39 | ref_tss_enrich_qc, 40 | ], 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /dev/test/test_task/test_xcor.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_xcor.pe_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/tas/rep1/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 3 | "test_xcor.se_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 4 | 5 | "test_xcor.ref_pe_xcor_log" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_xcor/pe/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.no_chrM.R1.25M.cc.qc", 6 | "test_xcor.ref_pe_xcor_log_subsample" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_xcor/pe/subsample/fix_PIP-917/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.no_chrM.R1.5K.cc.qc", 7 | "test_xcor.ref_se_xcor_log" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_xcor/se/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.25M.cc.qc", 8 | "test_xcor.ref_se_xcor_log_subsample" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_xcor/se/subsample/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.no_chrM.5K.cc.qc", 9 | 10 | "test_xcor.xcor_subsample" : 5000 11 
| } 12 | -------------------------------------------------------------------------------- /dev/test/test_task/test_xcor.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_xcor { 6 | input { 7 | Int xcor_subsample 8 | Int xcor_subsample_default = 25000000 9 | 10 | String pe_ta 11 | String se_ta 12 | 13 | String ref_pe_xcor_log 14 | String ref_pe_xcor_log_subsample 15 | String ref_se_xcor_log 16 | String ref_se_xcor_log_subsample 17 | String mito_chr_name = 'chrM' 18 | 19 | Int xcor_cpu = 2 20 | Float xcor_mem_factor = 0.0 21 | Int xcor_time_hr = 6 22 | Float xcor_disk_factor = 1.5 23 | String docker 24 | } 25 | RuntimeEnvironment runtime_environment = { 26 | "docker": docker, 27 | "singularity": "", 28 | "conda": "" 29 | } 30 | 31 | call atac.xcor as pe_xcor { input : 32 | ta = pe_ta, 33 | subsample = xcor_subsample_default, 34 | paired_end = true, 35 | mito_chr_name = mito_chr_name, 36 | 37 | cpu = xcor_cpu, 38 | mem_factor = xcor_mem_factor, 39 | time_hr = xcor_time_hr, 40 | disk_factor = xcor_disk_factor, 41 | runtime_environment = runtime_environment, 42 | } 43 | call atac.xcor as pe_xcor_subsample { input : 44 | ta = pe_ta, 45 | subsample = xcor_subsample, 46 | paired_end = true, 47 | mito_chr_name = mito_chr_name, 48 | 49 | cpu = xcor_cpu, 50 | mem_factor = xcor_mem_factor, 51 | time_hr = xcor_time_hr, 52 | disk_factor = xcor_disk_factor, 53 | runtime_environment = runtime_environment, 54 | } 55 | call atac.xcor as se_xcor { input : 56 | ta = se_ta, 57 | subsample = xcor_subsample_default, 58 | paired_end = false, 59 | mito_chr_name = mito_chr_name, 60 | 61 | cpu = xcor_cpu, 62 | mem_factor = xcor_mem_factor, 63 | time_hr = xcor_time_hr, 64 | disk_factor = xcor_disk_factor, 65 | runtime_environment = runtime_environment, 66 | } 67 | call atac.xcor as se_xcor_subsample { input : 68 | ta = se_ta, 69 | 
subsample = xcor_subsample, 70 | paired_end = false, 71 | mito_chr_name = mito_chr_name, 72 | 73 | cpu = xcor_cpu, 74 | mem_factor = xcor_mem_factor, 75 | time_hr = xcor_time_hr, 76 | disk_factor = xcor_disk_factor, 77 | runtime_environment = runtime_environment, 78 | } 79 | 80 | call compare_md5sum.compare_md5sum { input : 81 | labels = [ 82 | 'pe_xcor', 83 | 'pe_xcor_subsample', 84 | 'se_xcor', 85 | 'se_xcor_subsample', 86 | ], 87 | files = [ 88 | pe_xcor.score, 89 | pe_xcor_subsample.score, 90 | se_xcor.score, 91 | se_xcor_subsample.score, 92 | ], 93 | ref_files = [ 94 | ref_pe_xcor_log, 95 | ref_pe_xcor_log_subsample, 96 | ref_se_xcor_log, 97 | ref_se_xcor_log_subsample, 98 | ], 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /dev/test/test_workflow/.gitignore: -------------------------------------------------------------------------------- 1 | *qc_json_diff.txt 2 | *qc_json_match.txt 3 | *.result.json 4 | *.result.qc.json 5 | *.status.json 6 | *.metadata.json 7 | *.submit.json 8 | *.test_atac_wf_opt.json 9 | cromwell*.jar 10 | tmp_secret_key.json 11 | -------------------------------------------------------------------------------- /dev/test/test_workflow/ENCSR356KRQ.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output/v1.1.6/ENCSR356KRQ/qc.json", 3 | "atac.pipeline_type" : "atac", 4 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/hg38.tsv", 5 | "atac.fastqs_rep1_R1" : [ 6 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair1/ENCFF341MYG.fastq.gz", 7 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair1/ENCFF106QGY.fastq.gz" 8 | ], 9 | "atac.fastqs_rep1_R2" : [ 10 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair2/ENCFF248EJF.fastq.gz", 11 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair2/ENCFF368TYI.fastq.gz" 12 | ], 13 | "atac.fastqs_rep2_R1" : [ 14 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF641SFZ.fastq.gz", 15 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF751XTV.fastq.gz", 16 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF927LSG.fastq.gz", 17 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF859BDM.fastq.gz", 18 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF193RRC.fastq.gz", 19 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF366DFI.fastq.gz" 20 | ], 21 | "atac.fastqs_rep2_R2" : [ 22 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF031ARQ.fastq.gz", 23 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF590SYZ.fastq.gz", 24 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF734PEQ.fastq.gz", 25 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF007USV.fastq.gz", 26 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF886FSC.fastq.gz", 27 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF573UXK.fastq.gz" 28 | ], 29 | "atac.paired_end" : true, 30 | "atac.auto_detect_adapter" : true, 31 | "atac.enable_xcor" : true, 32 | "atac.enable_tss_enrich" : true, 33 | "atac.title" : "ENCSR356KRQ", 34 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation", 35 | 
"atac.align_cpu" : 8 36 | } 37 | -------------------------------------------------------------------------------- /dev/test/test_workflow/ENCSR356KRQ_subsampled.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output/v2.2.2/ENCSR356KRQ_subsampled/qc.json", 3 | "atac.pipeline_type" : "atac", 4 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/hg38.tsv", 5 | "atac.fastqs_rep1_R1" : [ 6 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 7 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 8 | ], 9 | "atac.fastqs_rep1_R2" : [ 10 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 11 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 12 | ], 13 | "atac.fastqs_rep2_R1" : [ 14 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 15 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 16 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 17 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 18 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 19 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 20 | ], 21 | "atac.fastqs_rep2_R2" : [ 22 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 23 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 24 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 25 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 26 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 27 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 28 | ], 29 | "atac.paired_end" : true, 30 | "atac.auto_detect_adapter" : true, 31 | "atac.enable_xcor" : true, 32 | "atac.title" : "ENCSR356KRQ (subsampled 1/400 reads)", 33 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 34 | } 35 | -------------------------------------------------------------------------------- /dev/test/test_workflow/ENCSR356KRQ_subsampled_chr19_only.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output/v1.1.7.2/ENCSR356KRQ_subsampled_chr19_only/qc.json", 3 | "atac.pipeline_type" : "atac", 4 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/hg38_chr19_chrM.tsv", 5 | "atac.fastqs_rep1_R1" : [ 6 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 7 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 8 | ], 9 | "atac.fastqs_rep1_R2" : [ 10 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 11 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 12 | ], 13 | "atac.fastqs_rep2_R1" : [ 14 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 15 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 16 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 17 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 18 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 19 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 20 | ], 21 | "atac.fastqs_rep2_R2" : [ 22 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 23 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 24 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 25 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 26 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 27 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 28 | ], 29 | "atac.paired_end" : true, 30 | "atac.auto_detect_adapter" : true, 31 | "atac.enable_xcor" : true, 32 | "atac.enable_tss_enrich" : false, 33 | "atac.title" : "ENCSR356KRQ (subsampled 1/400 reads, chr19_chrM only)", 34 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 35 | } 36 | -------------------------------------------------------------------------------- /dev/test/test_workflow/ENCSR356KRQ_subsampled_start_from_bam.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output/v2.2.2/ENCSR356KRQ_subsampled_start_from_bam/qc.json", 3 | "atac.pipeline_type" : "atac", 4 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/hg38.tsv", 5 | "atac.read_len" : [76, 76], 6 | "atac.nodup_bams" : [ 7 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/bam_subsampled/rep1/ENCFF341MYG.subsampled.400.trim.merged.nodup.no_chrM_MT.bam", 8 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/bam_subsampled/rep2/ENCFF641SFZ.subsampled.400.trim.merged.nodup.no_chrM_MT.bam" 9 | ], 10 | "atac.paired_end" : true, 11 | "atac.auto_detect_adapter" : true, 12 | "atac.enable_xcor" : true, 13 | "atac.title" : "ENCSR356KRQ (subsampled 1/400 reads, starting from BAM)", 14 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 15 | } 16 | 
-------------------------------------------------------------------------------- /dev/test/test_workflow/ENCSR889WQX.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output/v1.1.6/ENCSR889WQX/qc.json", 3 | "atac.pipeline_type" : "atac", 4 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/mm10.tsv", 5 | "atac.fastqs_rep1_R1" : [ 6 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq/rep1/ENCFF439VSY.fastq.gz", 7 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq/rep1/ENCFF325FCQ.fastq.gz", 8 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq/rep1/ENCFF683IQS.fastq.gz", 9 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq/rep1/ENCFF744CHW.fastq.gz" 10 | ], 11 | "atac.fastqs_rep2_R1" : [ 12 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq/rep2/ENCFF463QCX.fastq.gz", 13 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq/rep2/ENCFF992TSA.fastq.gz" 14 | ], 15 | "atac.paired_end" : false, 16 | "atac.auto_detect_adapter" : true, 17 | "atac.enable_xcor" : true, 18 | "atac.enable_tss_enrich" : true, 19 | "atac.title" : "ENCSR889WQX", 20 | "atac.description" : "ATAC-seq on Mus musculus C57BL/6 frontal cortex adult" 21 | } 22 | -------------------------------------------------------------------------------- /dev/test/test_workflow/ENCSR889WQX_subsampled.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output/v1.8.0/ENCSR889WQX_subsampled/qc.json", 3 | "atac.pipeline_type" : "atac", 4 | "atac.genome_tsv" : 
"https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/mm10.tsv", 5 | "atac.fastqs_rep1_R1" : [ 6 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF439VSY.subsampled.400.fastq.gz", 7 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF325FCQ.subsampled.400.fastq.gz", 8 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF683IQS.subsampled.400.fastq.gz", 9 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF744CHW.subsampled.400.fastq.gz" 10 | ], 11 | "atac.fastqs_rep2_R1" : [ 12 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep2/ENCFF463QCX.subsampled.400.fastq.gz", 13 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep2/ENCFF992TSA.subsampled.400.fastq.gz" 14 | ], 15 | "atac.paired_end" : false, 16 | "atac.auto_detect_adapter" : true, 17 | "atac.enable_xcor" : true, 18 | "atac.enable_tss_enrich" : false, 19 | "atac.title" : "ENCSR889WQX (subsampled 1/400 reads)", 20 | "atac.description" : "ATAC-seq on Mus musculus C57BL/6 frontal cortex adult" 21 | } 22 | -------------------------------------------------------------------------------- /dev/test/test_workflow/ENCSR889WQX_subsampled_chr19_only.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output/v1.1.6.a/ENCSR889WQX_subsampled_chr19_only/qc.json", 3 | "atac.pipeline_type" : "atac", 4 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/mm10_chr19_chrM.tsv", 5 | "atac.fastqs_rep1_R1" : [ 6 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF439VSY.subsampled.400.fastq.gz", 7 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF325FCQ.subsampled.400.fastq.gz", 8 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF683IQS.subsampled.400.fastq.gz", 9 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF744CHW.subsampled.400.fastq.gz" 10 | ], 11 | "atac.fastqs_rep2_R1" : [ 12 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep2/ENCFF463QCX.subsampled.400.fastq.gz", 13 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep2/ENCFF992TSA.subsampled.400.fastq.gz" 14 | ], 15 | "atac.paired_end" : false, 16 | "atac.auto_detect_adapter" : true, 17 | "atac.enable_xcor" : true, 18 | "atac.enable_tss_enrich" : false, 19 | "atac.title" : "ENCSR889WQX (subsampled 1/400 reads, chr19_chrM only)", 20 | "atac.description" : "ATAC-seq on Mus musculus C57BL/6 frontal cortex adult" 21 | } 22 | -------------------------------------------------------------------------------- /dev/test/test_workflow/ENCSR889WQX_subsampled_unrep.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output/v1.8.0/ENCSR889WQX_subsampled_unrep/qc.json", 3 | "atac.pipeline_type" : "atac", 4 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/mm10.tsv", 5 | "atac.fastqs_rep1_R1" : [ 6 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF439VSY.subsampled.400.fastq.gz", 7 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF325FCQ.subsampled.400.fastq.gz", 8 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF683IQS.subsampled.400.fastq.gz", 9 
| "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF744CHW.subsampled.400.fastq.gz" 10 | ], 11 | "atac.paired_end" : false, 12 | "atac.auto_detect_adapter" : true, 13 | "atac.enable_xcor" : true, 14 | "atac.enable_tss_enrich" : false, 15 | "atac.title" : "ENCSR889WQX (subsampled 1/400 reads, unrep)", 16 | "atac.description" : "ATAC-seq on Mus musculus C57BL/6 frontal cortex adult" 17 | } 18 | -------------------------------------------------------------------------------- /dev/test/test_workflow/ref_output/sync.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gsutil -m rsync -r -d . gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output 4 | -------------------------------------------------------------------------------- /docs/build_genome_database.md: -------------------------------------------------------------------------------- 1 | ## How to download genome database 2 | 3 | 1. Choose `GENOME` from `hg19`, `hg38`, `mm9` and `mm10` and specify a destination directory. 4 | ```bash 5 | $ bash scripts/download_genome_data.sh [GENOME] [DESTINATION_DIR] 6 | ``` 7 | 2. Find a TSV file on the destination directory and use it for `"atac.genome_tsv"` in your input JSON. 8 | 9 | # How to build genome database 10 | 11 | 1. [Install Conda](https://conda.io/miniconda.html). 12 | 13 | 2. Install pipeline's Conda environment. 14 | ```bash 15 | $ bash scripts/uninstall_conda_env.sh # to remove any existing pipeline env 16 | $ bash scripts/install_conda_env.sh 17 | ``` 18 | 19 | 3. Choose `GENOME` from `hg19`, `hg38`, `mm9` and `mm10` and specify a destination directory. This will take several hours. We recommend not to run this installer on a login node of your cluster. It will take >8GB memory and >2h time. 20 | ```bash 21 | $ conda activate encd-atac 22 | $ bash scripts/build_genome_data.sh [GENOME] [DESTINATION_DIR] 23 | ``` 24 | 25 | 3. 
Find a TSV file on the destination directory and use it for `"atac.genome_tsv"` in your input JSON. 26 | 27 | 28 | ## How to build genome database for your own genome 29 | 30 | 1. You can build your own genome database if your reference genome has one of the following file types. 31 | * `.fasta.gz` 32 | * `.fa.gz` 33 | * `.fasta.bz2` 34 | * `.fa.bz2` 35 | * `.2bit` 36 | 37 | 2. Get a URL for your reference genome. You may need to upload it to somewhere on the internet. 38 | 39 | 3. Get a URL for a gzipped blacklist BED file for your genome. If you don't have one then skip this step. An example blacklist for hg38 is [here](https://www.encodeproject.org/files/ENCFF356LFX/@@download/ENCFF356LFX.bed.gz). 40 | 41 | 4. Find the following lines in `scripts/build_genome_data.sh` and modify them as follows. Give a good name `[YOUR_OWN_GENOME]` for your genome. For `MITO_CHR_NAME` use a correct mitochondrial chromosome name of your genome (e.g. `chrM` or `MT`). For `REGEX_BFILT_PEAK_CHR_NAME` Perl style regular expression must be used to keep regular chromosome names only in a blacklist filtered (`.bfilt.`) peaks files. These `.bfilt.` peak files are considered final peaks output of the pipeline and peaks BED files for genome browser tracks (`.bigBed` and `.hammock.gz`) are converted from these `.bfilt.` peaks files. Chromosome name filtering with `REGEX_BFILT_PEAK_CHR_NAME` will be done even without the blacklist itself. 42 | ```bash 43 | ... 44 | 45 | elif [[ $GENOME == "YOUR_OWN_GENOME" ]]; then 46 | # Perl style regular expression to keep regular chromosomes only. 47 | # this reg-ex will be applied to peaks after blacklist filtering (b-filt) with "grep -P". 48 | # so that b-filt peak file (.bfilt.*Peak.gz) will only have chromosomes matching with this pattern 49 | # this reg-ex will work even without a blacklist. 50 | # you will still be able to find a .bfilt. peak file 51 | REGEX_BFILT_PEAK_CHR_NAME="chr[\dXY]+" 52 | # mitochondrial chromosome name (e.g. 
chrM, MT) 53 | MITO_CHR_NAME="chrM" 54 | # URL for your reference FASTA (fasta, fasta.gz, fa, fa.gz, 2bit) 55 | REF_FA="https://some.where.com/your.genome.fa.gz" 56 | # 3-col blacklist BED file to filter out overlapping peaks from b-filt peak file (.bfilt.*Peak.gz file). 57 | # leave it empty if you don't have one 58 | BLACKLIST= 59 | ... 60 | ``` 61 | 62 | 5. Specify a destination directory for your genome database and run the installer. This will take several hours. 63 | ```bash 64 | $ bash scripts/build_genome_data.sh [YOUR_OWN_GENOME] [DESTINATION_DIR] 65 | ``` 66 | 67 | 6. Find a TSV file in the destination directory and use it for `"atac.genome_tsv"` in your input JSON. 68 | -------------------------------------------------------------------------------- /docs/how_to_config_sge.md: -------------------------------------------------------------------------------- 1 | # How to configure SGE for pipeline 2 | 3 | 1. List all parallel environments (PE) on your SGE. 4 | ```bash 5 | $ qconf -spl 6 | ``` 7 | 8 | 2. If you don't have one then ask your system admin to add a new one with name `shm`. 9 | ```bash 10 | $ sudo qconf -ap shm 11 | ``` 12 | 13 | 3. Give a large number to `slots` for your PE. 14 | ```bash 15 | $ sudo qconf -mp shm 16 | pe_name shm 17 | slots 999 18 | ... 19 | ``` 20 | 21 | 4. List all queues on your SGE. 22 | ```bash 23 | $ qconf -sql 24 | ``` 25 | 26 | 5. Ask your system admin to connect PE to your queue. 27 | ```bash 28 | $ sudo qconf -mq [QUEUE_NAME] 29 | ... 30 | pe_list make shm 31 | ... 32 | ``` 33 | -------------------------------------------------------------------------------- /docs/install_conda.md: -------------------------------------------------------------------------------- 1 | # How to install pipeline's Conda environment 2 | 3 | If you do not have miniconda (or anaconda) installed, follow the instructions below in steps 1 - 4 to install miniconda. 
4 | 5 | **IF YOU ALREADY HAVE ANACONDA OR MINICONDA INSTALLED, SKIP TO STEP 5** 6 | 7 | 1) Download [Miniconda installer](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh). Use default answers to all questions except for the first and last. 8 | ```bash 9 | $ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 10 | $ bash Miniconda3-latest-Linux-x86_64.sh 11 | ``` 12 | 13 | Type `yes` to the first question. 14 | ```bash 15 | Do you accept the license terms? [yes|no] 16 | [no] >>> yes 17 | ``` 18 | 19 | Type `yes` to the last question. 20 | ```bash 21 | Do you wish the installer to initialize Miniconda3 22 | by running conda init? [yes|no] 23 | [no] >>> yes 24 | ``` 25 | 26 | 2) **IMPORTANT**: Close your session and re-login. If you skip this step then pipeline's Conda environment will be messed up with base Conda environment. 27 | 28 | 3) **IMPORTANT**: Disable auto activation of base Conda environment. 29 | ```bash 30 | $ conda config --set auto_activate_base false 31 | ``` 32 | 33 | 4) **IMPORTANT**: Close your session and re-login. 34 | 35 | 5) Install pipeline's Conda environment. Add `mamba` to the install command line to resolve conflicts much faster. 36 | 37 | ```bash 38 | $ bash scripts/uninstall_conda_env.sh # uninstall it for clean-install 39 | $ bash scripts/install_conda_env.sh mamba # remove mamba if it does not work 40 | ``` 41 | 42 | > **WARNING**: DO NOT PROCEED TO RUN PIPELINES UNTIL YOU SEE THE FOLLOWING SUCCESS MESSAGE OR PIPELINE WILL NOT WORK. 43 | ```bash 44 | === All done successfully === 45 | ``` 46 | 47 | 6) Activate pipeline's Conda environment before running a pipeline. 48 | ```bash 49 | $ conda activate encode-atac-seq-pipeline 50 | 51 | $ caper run ... 52 | $ caper server ...
53 | ``` 54 | -------------------------------------------------------------------------------- /docs/tutorial_dx_cli.md: -------------------------------------------------------------------------------- 1 | # Tutorial for DNAnexus Platform (CLI) 2 | 3 | All test samples and genome data are shared on our public DNAnexus project. You don't have to download any data for testing our pipeline on DNAnexus platform. 4 | 5 | There are two methods to run our pipeline on DNAnexus. 6 | 7 | 1) Building your own DX workflow from `atac.wdl` with dxWDL (CLI) 8 | 2) [Using a pre-built DX workflow on our public DX project (Web UI)](tutorial_dx_web.md) 9 | 10 | This document describes instruction for the item 1). 11 | 12 | 1. Sign up for a [DNAnexus account](https://platform.DNAnexus.com/register). 13 | 14 | 2. Create a new [DX project](https://platform.DNAnexus.com/projects) with name `[YOUR_PROJECT_NAME]` by clicking on "+New Project" on the top left. 15 | 16 | 3. Download dxWDL. 17 | ```bash 18 | $ cd 19 | $ wget https://github.com/dnanexus/dxWDL/releases/download/v1.46.4/dxWDL-v1.46.4.jar 20 | $ chmod +rx dxWDL-v1.46.4.jar 21 | ``` 22 | 23 | 4. Git clone this pipeline. 24 | ```bash 25 | $ cd 26 | $ git clone https://github.com/ENCODE-DCC/atac-seq-pipeline 27 | ``` 28 | 29 | 5. Move to pipeline's directory. 30 | ```bash 31 | $ cd atac-seq-pipeline 32 | ``` 33 | 34 | 6. Choose an appropriate input for your project (AWS or Azure): 35 | * AWS 36 | ```bash 37 | $ INPUT=example_input_json/dx/ENCSR356KRQ_subsampled_dx.json 38 | ``` 39 | * Azure 40 | ```bash 41 | $ INPUT=example_input_json/dx_azure/ENCSR356KRQ_subsampled_dx_azure.json 42 | ``` 43 | 44 | 7. Make a WDL for DNAnexus use only. The original WDL will not work with inputs (e.g. BAMs, TAs) other than FASTQs. Then compile `atac.dx.wdl` with an input JSON for the SUBSAMPLED (1/400) paired-end sample of [ENCSR356KRQ](https://www.encodeproject.org/experiments/ENCSR356KRQ/). 
45 | ```bash 46 | $ cp atac.wdl atac.dx.wdl 47 | $ sed -i 's/Array\[File?\] bams = \[\]/Array\[File\] bams = \[\]/g' atac.dx.wdl 48 | $ sed -i 's/Array\[File?\] nodup_bams = \[\]/Array\[File\] nodup_bams = \[\]/g' atac.dx.wdl 49 | $ sed -i 's/Array\[File?\] tas = \[\]/Array\[File\] tas = \[\]/g' atac.dx.wdl 50 | ``` 51 | 52 | ```bash 53 | $ WDL=atac.dx.wdl 54 | $ DXWDL=dxWDL-v1.46.4.jar 55 | $ PROJECT=[YOUR_PROJECT_NAME] 56 | $ OUT_FOLDER=/test_sample_atac_ENCSR356KRQ_subsampled 57 | $ DOCKER=$(cat ${WDL} | grep caper_docker | awk 'BEGIN{FS="'\''"} {print $2}') 58 | 59 | $ java -jar ${DXWDL} compile ${WDL} -project ${PROJECT} -f -folder ${OUT_FOLDER} -defaults ${INPUT} -extras <(echo "{\"default_runtime_attributes\":{\"docker\":\"${DOCKER}\"}}") 60 | ``` 61 | 62 | 8. Go to DNAnexus [project page](https://platform.DNAnexus.com/projects) and click on your project. 63 | 64 | 9. Move to the directory `/test_sample_atac_ENCSR356KRQ_subsampled`. 65 | 66 | 10. You will find a DX workflow `atac` with all parameters pre-defined. Click on it. 67 | 68 | 11. Specify an output directory by clicking "Workflow Actions" on the top right. Click on "Set output folder" and choose an output folder. 69 | 70 | 12. Click on "Run as Analysis..." and you will be automatically redirected to the "Monitor" tab. 71 | 72 | 13. It will take about an hour. You will be able to find all outputs on your output folder. Final QC report (`qc.html`)/JSON (`qc.json`) will be found on it. 73 | 74 | 14. See full specification for [input JSON file](input.md). 75 | -------------------------------------------------------------------------------- /docs/tutorial_dx_web.md: -------------------------------------------------------------------------------- 1 | # Tutorial for DNAnexus Platform (web) 2 | 3 | All test samples and genome data are shared on our public DNAnexus project. You don't have to download any data for testing our pipeline on DNAnexus platform. 
4 | 5 | There are two methods to run our pipeline on DNAnexus. 6 | 7 | 1) [Building your own DX workflow from `atac.wdl` with dxWDL (CLI)](tutorial_dx_cli.md) 8 | 2) Using a pre-built DX workflow on our public DX project (Web UI) 9 | 10 | This document describes instruction for the item 2). 11 | 12 | 1. Sign up for a [DNAnexus account](https://platform.DNAnexus.com/register). 13 | 14 | 2. Create a new [DX project](https://platform.DNAnexus.com/projects) by clicking on "+New Project" on the top left. 15 | 16 | 3. Move to one of the following workflow directories according to the platform you have chosen for your project (AWS or Azure). These DX workflows are pre-built with all parameters defined. 17 | 18 | * [AWS test workflow](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ATAC-seq/workflows): Use `[LATEST_VER]/test_ENCSR356KRQ_subsampled`. 19 | * [Azure test workflow](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ATAC-seq/workflows): Use `[LATEST_VER]/test_ENCSR356KRQ_subsampled`. 20 | 21 | 4. Copy it to your project by right-clicking on the DX workflow `atac` and choose "Copy". 22 | 23 | 5. Choose your project and create a folder for the test run by clicking on the "Folder+" icon. 24 | 25 | 6. Click on "Copy into this folder" on the bottom left. 26 | 27 | 7. Move to the target folder and click on the DX workflow `atac`. 28 | 29 | 8. Specify an output directory by clicking "Workflow Actions" on the top right. Click on "Set output folder" and choose an output folder. 30 | 31 | 9. Click on "Run as Analysis..." and you will be automatically redirected to the "Monitor" tab. 32 | 33 | 10. It will take about an hour. You will be able to find all outputs on your output folder. Final QC report (`qc.html`)/JSON (`qc.json`) will be found on it. 34 | 35 | 11. See full specification for [input JSON file](input.md). 36 | 37 | 38 | ## Extras for advanced users 39 | 40 | 1. DNAnexus allows only one copy of a workflow per project.
The example workflow in the previous section is pre-built for the subsampled test sample [ENCSR356KRQ](https://www.encodeproject.org/experiments/ENCSR356KRQ/) with all parameters defined already. 41 | 42 | 2. Choose your main platform (AWS or Azure). Move to [ENCODE ATAC-seq pipeline repository for AWS](https://platform.dnanexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ATAC-seq/workflows) or [ENCODE ATAC-seq pipeline repository for Azure](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ATAC-seq/workflows). 43 | 44 | 3. Choose a folder with the latest available version. 45 | 46 | 4. Copy one of the following workflows according to the platform you have chosen for your project. 47 | > **IMPORTANT**: Make sure that you have chosen a correct platform (AWS or Azure) for your project. 48 | 49 | * general: General workflow without pre-defined reference genome. 50 | * hg38: Workflow with pre-defined hg38 reference genome. 51 | * hg19: Workflow with pre-defined hg19 reference genome. 52 | 53 | 5. Click on the DX workflow `atac`. 54 | 55 | 6. Specify your input files (FASTQs, BAMs, TAG-ALIGNs, ...) on the top left. For example, click on the item "fastqs_rep1_R1" and choose your R1 FASTQ file for replicate 1. See details [here](input.md) for other input types. 56 | 57 | 7. Choose a reference genome. See details [here](input.md). 58 | 59 | 8. Click on "Run as Analysis..." and you will be automatically redirected to the "Monitor" tab.
60 | 61 | -------------------------------------------------------------------------------- /example_input_json/ENCSR356KRQ_subsampled.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/hg38.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | 
"https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | -------------------------------------------------------------------------------- /example_input_json/dx/ENCSR356KRQ_subsampled_dx.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-genome-data/genome_tsv/v4/hg38.dx.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | 
"dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | 
"dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_tss_enrich" : false, 31 | "atac.enable_xcor" : true, 32 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 33 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 34 | } 35 | -------------------------------------------------------------------------------- /example_input_json/dx/ENCSR356KRQ_subsampled_rep1_dx.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-genome-data/genome_tsv/v4/hg38.dx.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | 
"dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.paired_end" : true, 13 | "atac.auto_detect_adapter" : true, 14 | "atac.enable_tss_enrich" : false, 15 | "atac.enable_xcor" : true, 16 | "atac.title" : "ENCSR356KRQ (unreplicated, subsampled 1/400)", 17 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 18 | } 19 | -------------------------------------------------------------------------------- /example_input_json/dx/template_general.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac" 3 | } 4 | -------------------------------------------------------------------------------- /example_input_json/dx/template_hg19.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-genome-data/genome_tsv/v1/hg19_dx.tsv" 4 | } 5 | -------------------------------------------------------------------------------- /example_input_json/dx/template_hg38.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-genome-data/genome_tsv/v4/hg38.dx.tsv" 4 | } 5 | -------------------------------------------------------------------------------- /example_input_json/dx/template_mm10.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : 
"dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-genome-data/genome_tsv/v4/mm10.dx.tsv" 4 | } 5 | -------------------------------------------------------------------------------- /example_input_json/dx/template_mm9.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-genome-data/genome_tsv/v1/mm9_dx.tsv" 4 | } 5 | -------------------------------------------------------------------------------- /example_input_json/dx_azure/ENCSR356KRQ_subsampled_dx_azure.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-genome-data/genome_tsv/v4/hg38.dx_azure.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | 
"dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_tss_enrich" : false, 31 | "atac.enable_xcor" : true, 32 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 33 | "atac.description" : "ATAC-seq on primary keratinocytes in 
day 0.0 of differentiation" 34 | } 35 | -------------------------------------------------------------------------------- /example_input_json/dx_azure/template_general.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac" 3 | } 4 | -------------------------------------------------------------------------------- /example_input_json/dx_azure/template_hg19.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-genome-data/genome_tsv/v1/hg19_dx_azure.tsv" 4 | } 5 | -------------------------------------------------------------------------------- /example_input_json/dx_azure/template_hg38.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-genome-data/genome_tsv/v4/hg38.dx_azure.tsv" 4 | } 5 | -------------------------------------------------------------------------------- /example_input_json/dx_azure/template_mm10.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-genome-data/genome_tsv/v4/mm10.dx_azure.tsv" 4 | } 5 | -------------------------------------------------------------------------------- /example_input_json/dx_azure/template_mm9.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-genome-data/genome_tsv/v1/mm9_dx_azure.tsv" 4 | } 5 | -------------------------------------------------------------------------------- /example_input_json/template.full.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "atac.title" : "Example (paired end)", 3 | "atac.description" : "This is a template input JSON for paired ended sample.", 4 | 5 | "atac.pipeline_type" : "atac", 6 | "atac.align_only" : false, 7 | "atac.true_rep_only" : false, 8 | 9 | "atac.genome_tsv" : "/path_to_genome_data/hg38/hg38.tsv", 10 | 11 | "atac.paired_end" : true, 12 | "atac.paired_ends" : [true, true], 13 | 14 | "atac.fastqs_rep1_R1" : [ "rep1_R1_L1.fastq.gz", "rep1_R1_L2.fastq.gz", "rep1_R1_L3.fastq.gz" ], 15 | "atac.fastqs_rep1_R2" : [ "rep1_R2_L1.fastq.gz", "rep1_R2_L2.fastq.gz", "rep1_R2_L3.fastq.gz" ], 16 | "atac.fastqs_rep2_R1" : [ "rep2_R1_L1.fastq.gz", "rep2_R1_L2.fastq.gz" ], 17 | "atac.fastqs_rep2_R2" : [ "rep2_R2_L1.fastq.gz", "rep2_R2_L2.fastq.gz" ], 18 | 19 | "atac.read_len" : [], 20 | "atac.pseudoreplication_random_seed" : 0, 21 | 22 | "atac.auto_detect_adapter" : false, 23 | "atac.adapter" : "AATTCCGG", 24 | "atac.adapters_rep1_R1" : [ "AATTCCGG", "AATTCCGG", "AATTCCGG" ], 25 | "atac.adapters_rep1_R2" : [ "AATTCCGG", "AATTCCGG" ], 26 | "atac.adapters_rep2_R1" : [ "AATTCCGG", "AATTCCGG", "AATTCCGG" ], 27 | "atac.adapters_rep2_R2" : [ "AATTCCGG", "AATTCCGG" ], 28 | "atac.cutadapt_param" : "-e 0.1 -m 5", 29 | 30 | "atac.multimapping" : 4, 31 | 32 | "atac.mapq_thresh" : 30, 33 | "atac.dup_marker" : "picard", 34 | "atac.no_dup_removal" : false, 35 | 36 | "atac.subsample_reads" : 0, 37 | "atac.xcor_subsample_reads" : 25000000, 38 | 39 | "atac.cap_num_peak" : 300000, 40 | "atac.pval_thresh" : 0.01, 41 | "atac.smooth_win" : 150, 42 | 43 | "atac.enable_idr" : true, 44 | "atac.idr_thresh" : 0.05, 45 | 46 | "atac.enable_xcor" : false, 47 | "atac.enable_count_signal_track" : false, 48 | 49 | "atac.filter_chrs" : ["chrM", "MT"], 50 | 51 | "atac.enable_preseq" : false, 52 | "atac.enable_compare_to_roadmap" : false, 53 | "atac.enable_tss_enrich" : true, 54 | "atac.enable_gc_bias" : true, 55 | 56 | 
"atac.align_cpu" : 6, 57 | "atac.align_mem_factor" : 0.15, 58 | "atac.align_time_hr" : 48, 59 | "atac.align_disk_factor" : 8.0, 60 | 61 | "atac.filter_cpu" : 4, 62 | "atac.filter_mem_factor" : 0.4, 63 | "atac.filter_time_hr" : 24, 64 | "atac.filter_disk_factor" : 8.0, 65 | 66 | "atac.bam2ta_cpu" : 2, 67 | "atac.bam2ta_mem_factor" : 0.3, 68 | "atac.bam2ta_time_hr" : 12, 69 | "atac.bam2ta_disk_factor" : 4.0, 70 | 71 | "atac.spr_mem_factor" : 13.5, 72 | "atac.spr_disk_factor" : 18.0, 73 | 74 | "atac.jsd_cpu" : 4, 75 | "atac.jsd_mem_factor" : 0.1, 76 | "atac.jsd_time_hr" : 12, 77 | "atac.jsd_disk_factor" : 2.0, 78 | 79 | "atac.xcor_cpu" : 2, 80 | "atac.xcor_mem_factor" : 1.0, 81 | "atac.xcor_time_hr" : 6, 82 | "atac.xcor_disk_factor" : 4.5, 83 | 84 | "atac.call_peak_cpu" : 2, 85 | "atac.call_peak_mem_factor" : 2.0, 86 | "atac.call_peak_time_hr" : 24, 87 | "atac.call_peak_disk_factor" : 30.0, 88 | 89 | "atac.macs2_signal_track_mem_factor" : 12.0, 90 | "atac.macs2_signal_track_time_hr" : 24, 91 | "atac.macs2_signal_track_disk_factor" : 80.0, 92 | 93 | "atac.preseq_mem_factor" : 0.5, 94 | "atac.preseq_disk_factor" : 5.0 95 | } 96 | -------------------------------------------------------------------------------- /example_input_json/template.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.title" : "Example (paired end)", 3 | "atac.description" : "This is a template input JSON for paired ended sample.", 4 | 5 | "atac.pipeline_type" : "atac", 6 | "atac.align_only" : false, 7 | "atac.true_rep_only" : false, 8 | 9 | "atac.genome_tsv" : "/path_to_genome_data/hg38/hg38.tsv", 10 | 11 | "atac.paired_end" : true, 12 | 13 | "atac.fastqs_rep1_R1" : [ "rep1_R1_L1.fastq.gz", "rep1_R1_L2.fastq.gz", "rep1_R1_L3.fastq.gz" ], 14 | "atac.fastqs_rep1_R2" : [ "rep1_R2_L1.fastq.gz", "rep1_R2_L2.fastq.gz", "rep1_R2_L3.fastq.gz" ], 15 | "atac.fastqs_rep2_R1" : [ "rep2_R1_L1.fastq.gz", "rep2_R1_L2.fastq.gz" ], 16 | "atac.fastqs_rep2_R2" : [ 
"rep2_R2_L1.fastq.gz", "rep2_R2_L2.fastq.gz" ], 17 | 18 | "atac.auto_detect_adapter" : false, 19 | "atac.adapter" : "AATTCCGG", 20 | "atac.adapters_rep1_R1" : [ "AATTCCGG", "AATTCCGG", "AATTCCGG" ], 21 | "atac.adapters_rep1_R2" : [ "AATTCCGG", "AATTCCGG" ], 22 | "atac.adapters_rep2_R1" : [ "AATTCCGG", "AATTCCGG", "AATTCCGG" ], 23 | "atac.adapters_rep2_R2" : [ "AATTCCGG", "AATTCCGG" ], 24 | 25 | "atac.multimapping" : 4 26 | } 27 | -------------------------------------------------------------------------------- /example_input_json/terra/ENCSR356KRQ_subsampled.terra.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "gs://encode-pipeline-genome-data/genome_tsv/v4/hg38.terra.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | -------------------------------------------------------------------------------- /scripts/install_conda_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e # Stop on error 3 | 4 | install_ucsc_tools_369() { 5 | # takes in conda env name and find conda bin 6 | CONDA_BIN=$(conda run -n $1 bash -c "echo \$(dirname \$(which python))") 7 | curl -o 
"$CONDA_BIN/fetchChromSizes" "https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/fetchChromSizes" 8 | curl -o "$CONDA_BIN/wigToBigWig" "https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/wigToBigWig" 9 | curl -o "$CONDA_BIN/bedGraphToBigWig" "https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/bedGraphToBigWig" 10 | curl -o "$CONDA_BIN/bigWigInfo" "https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/bigWigInfo" 11 | curl -o "$CONDA_BIN/bedClip" "https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/bedClip" 12 | curl -o "$CONDA_BIN/bedToBigBed" "https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/bedToBigBed" 13 | curl -o "$CONDA_BIN/twoBitToFa" "https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/twoBitToFa" 14 | curl -o "$CONDA_BIN/bigWigAverageOverBed" "https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/bigWigAverageOverBed" 15 | 16 | chmod +x "$CONDA_BIN/fetchChromSizes" 17 | chmod +x "$CONDA_BIN/wigToBigWig" 18 | chmod +x "$CONDA_BIN/bedGraphToBigWig" 19 | chmod +x "$CONDA_BIN/bigWigInfo" 20 | chmod +x "$CONDA_BIN/bedClip" 21 | chmod +x "$CONDA_BIN/bedToBigBed" 22 | chmod +x "$CONDA_BIN/twoBitToFa" 23 | chmod +x "$CONDA_BIN/bigWigAverageOverBed" 24 | } 25 | 26 | SH_SCRIPT_DIR=$(cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd) 27 | 28 | echo "$(date): Installing pipeline's Conda environments..." 
29 | 30 | conda create -n encd-atac --file ${SH_SCRIPT_DIR}/requirements.txt \ 31 | --override-channels -c bioconda -c defaults -y 32 | 33 | conda create -n encd-atac-macs2 --file ${SH_SCRIPT_DIR}/requirements.macs2.txt \ 34 | --override-channels -c bioconda -c defaults -y 35 | 36 | conda create -n encd-atac-py2 --file ${SH_SCRIPT_DIR}/requirements.python2.txt \ 37 | --override-channels -c conda-forge -c bioconda -c defaults -y 38 | 39 | conda create -n encd-atac-spp --file ${SH_SCRIPT_DIR}/requirements.spp.txt \ 40 | -c r -c bioconda -c defaults -y 41 | 42 | # adhoc fix for the following issues: 43 | # - https://github.com/ENCODE-DCC/chip-seq-pipeline2/issues/259 44 | # - https://github.com/ENCODE-DCC/chip-seq-pipeline2/issues/265 45 | # force-install readline 6.2, ncurses 5.9 from conda-forge (ignoring dependencies) 46 | #conda install -n encd-atac-spp --no-deps --no-update-deps -y \ 47 | # readline==6.2 ncurses==5.9 -c conda-forge 48 | 49 | CONDA_BIN=$(conda run -n encd-atac-spp bash -c "echo \$(dirname \$(which python))") 50 | 51 | echo "$(date): Installing phantompeakqualtools in Conda environments..." 52 | RUN_SPP="https://raw.githubusercontent.com/kundajelab/phantompeakqualtools/1.2.2/run_spp.R" 53 | conda run -n encd-atac-spp bash -c \ 54 | "curl -o $CONDA_BIN/run_spp.R $RUN_SPP && chmod +x $CONDA_BIN/run_spp.R" 55 | 56 | echo "$(date): Installing R packages in Conda environments..." 
57 | CRAN="https://cran.r-project.org/" 58 | conda run -n encd-atac-spp bash -c \ 59 | "Rscript -e \"install.packages('snow', repos='$CRAN')\"" 60 | conda run -n encd-atac-spp bash -c \ 61 | "Rscript -e \"install.packages('snowfall', repos='$CRAN')\"" 62 | conda run -n encd-atac-spp bash -c \ 63 | "Rscript -e \"install.packages('bitops', repos='$CRAN')\"" 64 | conda run -n encd-atac-spp bash -c \ 65 | "Rscript -e \"install.packages('caTools', repos='$CRAN')\"" 66 | conda run -n encd-atac-spp bash -c \ 67 | "Rscript -e \"install.packages('BiocManager', repos='$CRAN')\"" 68 | conda run -n encd-atac-spp bash -c \ 69 | "Rscript -e \"require('BiocManager'); BiocManager::install('Rsamtools'); BiocManager::install('Rcpp')\"" 70 | 71 | echo "$(date): Installing R spp 1.15.5 in Conda environments..." 72 | SPP="https://cran.r-project.org/src/contrib/Archive/spp/spp_1.15.5.tar.gz" 73 | SPP_BASENAME=$(basename $SPP) 74 | curl -o "$CONDA_BIN/$SPP_BASENAME" "$SPP" 75 | conda run -n encd-atac-spp bash -c \ 76 | "Rscript -e \"install.packages('$CONDA_BIN/$SPP_BASENAME')\"" 77 | 78 | echo "$(date): Installing USCS tools (v369)..." 79 | install_ucsc_tools_369 encd-atac 80 | install_ucsc_tools_369 encd-atac-spp 81 | install_ucsc_tools_369 encd-atac-macs2 82 | 83 | echo "$(date): Done successfully." 84 | echo 85 | echo "If you see openssl,readline,ncurses lib errors while running pipelines" 86 | echo "then switch to Singularity method. Conda method will not work on your system." 
87 | 88 | bash ${SH_SCRIPT_DIR}/update_conda_env.sh 89 | -------------------------------------------------------------------------------- /scripts/requirements.macs2.txt: -------------------------------------------------------------------------------- 1 | # Conda environment for tasks (macs2, macs2_signal_track) in atac/chip 2 | 3 | nomkl # using MKL can change MACS2 output randomly on different platforms 4 | python >=3 5 | 6 | macs2 ==2.2.4 7 | bedtools ==2.29.0 8 | bedops ==2.4.39 9 | pybedtools ==0.8.0 10 | pybigwig ==0.3.13 11 | tabix 12 | 13 | matplotlib 14 | ghostscript 15 | 16 | -------------------------------------------------------------------------------- /scripts/requirements.python2.txt: -------------------------------------------------------------------------------- 1 | # Conda environment for python2-based tasks (tss_enrich) in atac/chip 2 | # (metaseq is still in py2) 3 | 4 | python ==2.7.16 5 | 6 | biopython ==1.76 7 | metaseq ==0.5.6 8 | samtools ==1.9 9 | gffutils ==0.10.1 # 0.11.0 is not py2 compatible 10 | 11 | python-dateutil ==2.8.0 12 | grep 13 | tar 14 | ghostscript 15 | -------------------------------------------------------------------------------- /scripts/requirements.spp.txt: -------------------------------------------------------------------------------- 1 | # Conda environment for tasks (spp, xcor) in atac/chip 2 | # some packages (phantompeakquals, r-spp) will be installed separately 3 | # couldn't resolve all conda conflicts 4 | 5 | python >=3 6 | bedtools ==2.29.0 7 | bedops ==2.4.39 8 | 9 | r-base ==3.6.1 10 | 11 | tabix 12 | 13 | matplotlib 14 | pandas 15 | numpy 16 | ghostscript 17 | 18 | -------------------------------------------------------------------------------- /scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | # default Conda environment for atac/chip 2 | 3 | python >=3 4 | bwa ==0.7.17 5 | bowtie2 ==2.3.4.3 6 | tbb ==2020.2 # use old version to fix libtbb.so.2 
error for bowtie2 7 | samtools ==1.9 8 | htslib ==1.9 9 | bedtools ==2.29.0 10 | sambamba ==0.6.6 11 | 12 | pysam ==0.15.3 13 | pybedtools ==0.8.0 14 | pybigwig ==0.3.13 15 | 16 | deeptools ==3.3.1 17 | cutadapt ==2.5 18 | preseq ==2.0.3 19 | pyfaidx ==0.5.5.2 20 | bedops ==2.4.39 21 | 22 | ptools_bin 23 | 24 | jsondiff ==1.1.1 25 | ghostscript 26 | tabix 27 | matplotlib 28 | numpy 29 | scikit-learn 30 | scipy 31 | pandas 32 | jinja2 33 | gsl 34 | 35 | samstats ==0.2.1 36 | idr ==2.0.4.2 37 | 38 | java-jdk 39 | 40 | picard ==2.20.7 41 | trimmomatic ==0.39 42 | 43 | -------------------------------------------------------------------------------- /scripts/uninstall_conda_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PIPELINE_CONDA_ENVS=( 4 | encd-atac 5 | encd-atac-macs2 6 | encd-atac-spp 7 | encd-atac-py2 8 | ) 9 | for PIPELINE_CONDA_ENV in "${PIPELINE_CONDA_ENVS[@]}" 10 | do 11 | conda env remove -n ${PIPELINE_CONDA_ENV} -y 12 | done 13 | -------------------------------------------------------------------------------- /scripts/update_conda_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e # Stop on error 3 | 4 | SH_SCRIPT_DIR=$(cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd) 5 | SRC_DIR=${SH_SCRIPT_DIR}/../src 6 | 7 | PIPELINE_CONDA_ENVS=( 8 | encd-atac 9 | encd-atac-macs2 10 | encd-atac-spp 11 | encd-atac-py2 12 | ) 13 | chmod u+rx ${SRC_DIR}/*.py 14 | 15 | echo "$(date): Updating WDL task wrappers on each Conda environment..." 16 | for PIPELINE_CONDA_ENV in "${PIPELINE_CONDA_ENVS[@]}" 17 | do 18 | CONDA_BIN=$(dirname $(conda run -n ${PIPELINE_CONDA_ENV} which python)) 19 | echo -e "$(date): Transferring WDL task wrappers to ${CONDA_BIN}..." 
def parse_args():
    '''
    Parse command-line options.

    Returns:
        (alignment_cutoff, paired_ended): maximum number of alignments a
        read may have and still be kept, and whether input is paired-end.
    '''
    parser = argparse.ArgumentParser(
        description='Saves reads below a alignment threshold and discards all others')
    parser.add_argument('-k', help='Alignment number cutoff')
    parser.add_argument('--paired-end', dest='paired_ended',
                        action='store_true', help='Data is paired-end')
    args = parser.parse_args()
    alignment_cutoff = int(args.k)
    paired_ended = args.paired_ended

    return alignment_cutoff, paired_ended


def filter_multimappers(lines, alignment_cutoff):
    '''
    Filter a qname-sorted SAM stream (iterable of text lines).

    Header lines (starting with '@') are passed through unchanged.
    Alignment lines are grouped by qname (column 1); a group is emitted
    only if it contains at most `alignment_cutoff` lines, otherwise the
    whole group is discarded.  Surviving lines are yielded in input order.

    Requires qname-sorted input so that each group is contiguous.
    '''
    current_reads = []
    current_qname = ''

    for line in lines:
        read_elems = line.strip().split('\t')

        if read_elems[0].startswith('@'):
            yield line
            continue

        if read_elems[0] == current_qname:
            # same group: keep accumulating
            current_reads.append(line)
            continue

        # qname changed: emit the previous group if it is small enough
        # (groups larger than the cutoff are discarded entirely; kept
        # groups are further filtered downstream with samtools)
        if len(current_reads) and len(current_reads) <= alignment_cutoff:
            for read in current_reads:
                yield read

        current_reads = [line]
        current_qname = read_elems[0]

    # BUGFIX: the original loop never flushed the final qname group at
    # EOF, silently dropping the last read (pair) of every input file.
    if len(current_reads) and len(current_reads) <= alignment_cutoff:
        for read in current_reads:
            yield read


if __name__ == "__main__":
    '''
    Runs the filtering step of choosing multimapped reads.
    Reads a qname-sorted SAM from stdin and writes survivors to stdout.
    '''
    alignment_cutoff, paired_ended = parse_args()

    if paired_ended:
        # a pair contributes two SAM lines per alignment
        alignment_cutoff = int(alignment_cutoff) * 2

    for out_line in filter_multimappers(sys.stdin, alignment_cutoff):
        sys.stdout.write(str(out_line))
def main():
    """CLI entry point: print the auto-detected adapter for the FASTQ at argv[1]."""
    global VERBOSE
    VERBOSE = False  # stay quiet so stdout carries only the adapter sequence
    print(detect_most_likely_adapter(sys.argv[1]))


if __name__ == '__main__':
    main()
diff ../../atac-seq-pipeline/src/encode_task_pool_ta.py encode_task_pool_ta.py 15 | diff ../../atac-seq-pipeline/src/encode_task_qc_report.py encode_task_qc_report.py 16 | diff ../../atac-seq-pipeline/src/encode_task_reproducibility.py encode_task_reproducibility.py 17 | diff ../../atac-seq-pipeline/src/encode_task_spr.py encode_task_spr.py 18 | diff ../../atac-seq-pipeline/src/encode_task_xcor.py encode_task_xcor.py 19 | diff ../../atac-seq-pipeline/src/encode_task_jsd.py encode_task_jsd.py 20 | diff ../../atac-seq-pipeline/src/encode_task_gc_bias.py encode_task_gc_bias.py 21 | 22 | -------------------------------------------------------------------------------- /src/encode_lib_blacklist_filter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ENCODE DCC blacklist filter wrapper 4 | # Author: Jin Lee (leepc12@gmail.com) 5 | 6 | import sys 7 | import os 8 | import argparse 9 | from encode_lib_common import ( 10 | get_ext, get_num_lines, gunzip, log, mkdir_p, 11 | rm_f, run_shell_cmd, strip_ext, strip_ext_bam) 12 | 13 | 14 | def parse_arguments(): 15 | parser = argparse.ArgumentParser(prog='ENCODE DCC Blacklist filter.') 16 | parser.add_argument('peak', type=str, help='Peak file.') 17 | parser.add_argument('--blacklist', type=str, 18 | help='Blacklist BED file.') 19 | parser.add_argument('--regex-bfilt-peak-chr-name', 20 | help='Keep chromosomes matching this pattern only ' 21 | 'in .bfilt. 
def blacklist_filter(peak, blacklist, regex_bfilt_peak_chr_name, out_dir):
    """Remove blacklist-overlapping peaks and keep chromosomes matching a regex.

    Writes a gzipped '<prefix>.bfilt.<ext>.gz' file next to the output dir.
    When a usable blacklist is given, peaks overlapping any blacklist region
    are dropped (`bedtools intersect -v`) and column 5 is capped at 1000;
    otherwise the peak file is only passed through the chromosome-name grep.

    Args:
        peak: peak file (gzipped BED-like; extension preserved in output).
        blacklist: blacklist BED file; None/'' or an empty file disables it.
        regex_bfilt_peak_chr_name: Perl regex; only lines whose text matches
            '<regex>\\b' are kept.  None is treated as ''.
        out_dir: output directory.

    Returns:
        Path of the filtered, gzipped peak file.
    """
    prefix = os.path.join(
        out_dir,
        os.path.basename(strip_ext(peak)))
    peak_ext = get_ext(peak)
    filtered = '{}.bfilt.{}.gz'.format(prefix, peak_ext)
    if regex_bfilt_peak_chr_name is None:
        regex_bfilt_peak_chr_name = ''

    # No-op blacklist path: also taken when either input file is empty,
    # since bedtools would have nothing to subtract.
    if blacklist is None or blacklist == '' or get_num_lines(peak) == 0 \
            or get_num_lines(blacklist) == 0:
        # NOTE(review): with an empty regex the grep pattern degenerates to
        # '\b', which matches any line containing a word character — i.e.
        # effectively a pass-through; confirm this is the intended default.
        cmd = 'zcat -f {} | '
        cmd += 'grep -P \'{}\\b\' | '
        cmd += 'gzip -nc > {}'
        cmd = cmd.format(
            peak,
            regex_bfilt_peak_chr_name,
            filtered)
        run_shell_cmd(cmd)
    else:
        # due to bedtools bug when .gz is given for -a and -b
        tmp1 = gunzip(peak, 'tmp1', out_dir)
        tmp2 = gunzip(blacklist, 'tmp2', out_dir)

        # subtract blacklist regions, cap the score column (5th) at 1000,
        # then apply the chromosome-name filter and recompress
        cmd = 'bedtools intersect -nonamecheck -v -a {} -b {} | '
        cmd += 'awk \'BEGIN{{OFS="\\t"}} '
        cmd += '{{if ($5>1000) $5=1000; print $0}}\' | '
        cmd += 'grep -P \'{}\\b\' | '
        cmd += 'gzip -nc > {}'
        cmd = cmd.format(
            tmp1,  # peak
            tmp2,  # blacklist
            regex_bfilt_peak_chr_name,  # regex
            filtered)
        run_shell_cmd(cmd)
        rm_f([tmp1, tmp2])
    return filtered
def main():
    """CLI entry point: blacklist-filter a peak file into --out-dir."""
    # read params
    args = parse_arguments()
    log.info('Initializing and making output directory...')

    # make out_dir (root of all outputs)
    mkdir_p(args.out_dir)

    # reproducibility QC
    log.info('Filtering peak with blacklist...')
    # BUGFIX: args has no attribute `keep_irregular_chr`; argparse derives
    # `regex_bfilt_peak_chr_name` from the --regex-bfilt-peak-chr-name
    # option, so the original raised AttributeError on every invocation.
    blacklist_filter(
        args.peak, args.blacklist,
        args.regex_bfilt_peak_chr_name, args.out_dir)

    log.info('All done.')


if __name__ == '__main__':
    main()
def frip_shifted(ta, peak, chrsz, fraglen, out_dir):
    """Compute shifted FRiP (Fraction of Reads in Peaks) for ChIP-seq.

    Each read in the TAGALIGN is extended by half the fragment length in
    both directions (strand-aware `bedtools slop`, clipped to valid
    coordinates) before intersecting with the peak file.

    Args:
        ta: TAGALIGN file (gzipped BED-like).
        peak: peak file (gzipped); empty file yields a fraction of 0.
        chrsz: 2-col chromosome sizes file.
        fraglen: estimated fragment length (int).
        out_dir: output directory.

    Returns:
        Path of the written '<prefix>.frip.qc' file (single fraction).
    """
    prefix = os.path.join(out_dir,
                          os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)
    # BUGFIX: use floor division. This code predates Python 3; with true
    # division `(fraglen+1)/2` yields a float (e.g. 50.5) that is pasted
    # into `bedtools slop -l/-r`, which expects integer base pairs unless
    # -pct is given.
    half_fraglen = (fraglen + 1) // 2

    if get_num_lines(peak) == 0:
        # empty peak file: zero reads can fall in peaks
        val1 = 0.0
    else:
        # due to bedtools bug when .gz is given for -a and -b
        tmp2 = gunzip(peak, 'tmp2', out_dir)

        # extend reads by +/- half_fraglen (strand-aware), drop invalid
        # intervals, then count reads overlapping any peak
        cmd = 'bedtools slop -i {} -g {} '
        cmd += '-s -l {} -r {} | '
        cmd += 'awk \'{{if ($2>=0 && $3>=0 && $2<=$3) print $0}}\' | '
        cmd += 'bedtools intersect -nonamecheck -a stdin -b {} '
        cmd += '-wa -u | wc -l'
        cmd = cmd.format(
            ta,
            chrsz,
            -half_fraglen,
            half_fraglen,
            tmp2)  # peak
        val1 = run_shell_cmd(cmd)
        rm_f(tmp2)
    val2 = get_num_lines(ta)
    write_txt(frip_qc, str(float(val1)/float(val2)))
    return frip_qc
def get_fract_reads_in_regions(reads_bed, regions_bed):
    """Count reads overlapping a region set, and the fraction thereof.

    Sorts and merges `regions_bed`, then counts the reads in `reads_bed`
    that overlap any merged region (`bedtools intersect -u`).

    Args:
        reads_bed: TAG-ALIGN/BED file of reads.
        regions_bed: BED file of annotated regions.

    Returns:
        (intersect_read_count, fract_reads) tuple; fract_reads is 0.0 for
        an empty reads file.
    """
    # uses new run_shell_cmd
    cmd = "bedtools sort -i {} | "
    cmd += "bedtools merge -i stdin | "
    cmd += "bedtools intersect -u -nonamecheck -a {} -b stdin | "
    cmd += "wc -l"
    cmd = cmd.format(regions_bed, reads_bed)
    intersect_read_count = int(run_shell_cmd(cmd))
    total_read_count = get_num_lines(reads_bed)
    # BUGFIX(robustness): guard against an empty reads file, which made
    # the original raise ZeroDivisionError.
    if total_read_count == 0:
        fract_reads = 0.0
    else:
        fract_reads = float(intersect_read_count) / total_read_count

    return intersect_read_count, fract_reads
def parse_arguments():
    """Build and parse CLI arguments for BAM-to-pBAM conversion.

    Returns:
        argparse.Namespace with bam, ref_fa, delete_original_bam,
        out_dir and log_level; also configures the module logger.
    """
    parser = argparse.ArgumentParser(prog='ENCODE bam to pbam',
                                     description='')
    parser.add_argument('bam', type=str,
                        help='Path for BAM.')
    parser.add_argument('--ref-fa', type=str,
                        help='Path for reference fasta.')
    parser.add_argument('--delete-original-bam', action='store_true',
                        help='Delete original BAM after conversion.')
    parser.add_argument('--out-dir', default='', type=str,
                        help='Output directory.')
    parser.add_argument('--log-level', default='INFO',
                        # BUGFIX: 'CRITICAL' was listed twice and the list
                        # was out of severity order; the accepted set of
                        # values is unchanged.
                        choices=['NOTSET', 'DEBUG', 'INFO',
                                 'WARNING', 'ERROR', 'CRITICAL'],
                        help='Log level')
    args = parser.parse_args()

    log.setLevel(args.log_level)
    log.info(sys.argv)
    return args
mkdir_p(args.out_dir) 48 | 49 | # generate read length file 50 | log.info('Converting BAM into pBAM...') 51 | bam_to_pbam(args.bam, args.ref_fa, args.out_dir) 52 | 53 | if args.delete_original_bam: 54 | log.info('Deleting original BAM...') 55 | rm_f(args.bam) 56 | 57 | log.info('List all files in output directory...') 58 | ls_l(args.out_dir) 59 | 60 | log.info('All done.') 61 | 62 | 63 | if __name__ == '__main__': 64 | main() 65 | -------------------------------------------------------------------------------- /src/encode_task_count_signal_track.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ENCODE DCC Count signal track generation 4 | # Author: Jin Lee (leepc12@gmail.com) 5 | 6 | import sys 7 | import os 8 | import argparse 9 | from encode_lib_common import ( 10 | log, ls_l, mkdir_p, rm_f, run_shell_cmd, strip_ext_ta, 11 | get_gnu_sort_param, 12 | ) 13 | 14 | 15 | def parse_arguments(): 16 | parser = argparse.ArgumentParser( 17 | prog='ENCODE DCC Count signal track generation') 18 | parser.add_argument('ta', type=str, 19 | help='Path for TAGALIGN file.') 20 | parser.add_argument('--chrsz', type=str, 21 | help='2-col chromosome sizes file.') 22 | parser.add_argument('--mem-gb', type=float, default=4.0, 23 | help='Max. memory for this job in GB. ' 24 | 'This will be used to determine GNU sort -S (defaulting to 0.5 of this value). 
def count_signal_track(ta, chrsz, mem_gb, out_dir):
    """Generate stranded 5'-end count signal bigWig tracks from a TAGALIGN.

    For each strand, the TAGALIGN is coordinate-sorted, converted to a
    5'-end coverage bedGraph (`bedtools genomecov -5 -bg -strand`), then
    converted to bigWig with `bedGraphToBigWig`.  Intermediate bedGraphs
    are deleted before returning.

    Args:
        ta: TAGALIGN file (gzipped BED-like).
        chrsz: 2-col chromosome sizes file.
        mem_gb: total memory budget in GB; half of it is handed to GNU
            sort via get_gnu_sort_param.
        out_dir: output directory.

    Returns:
        (pos_bw, neg_bw): paths of the +strand and -strand bigWig files.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    pos_bw = '{}.positive.bigwig'.format(prefix)
    neg_bw = '{}.negative.bigwig'.format(prefix)
    # temporary files
    pos_bedgraph = '{}.positive.bedgraph'.format(prefix)
    neg_bedgraph = '{}.negative.bedgraph'.format(prefix)

    temp_files = []

    # + strand: sort, then 5'-end single-base coverage as bedGraph
    run_shell_cmd(
        'zcat -f {ta} | sort -k1,1 -k2,2n {sort_param} | '
        'bedtools genomecov -5 -bg -strand + -g {chrsz} -i stdin > {pos_bedgraph}'.format(
            ta=ta,
            sort_param=get_gnu_sort_param(mem_gb * 1024 ** 3, ratio=0.5),
            chrsz=chrsz,
            pos_bedgraph=pos_bedgraph,
        )
    )

    # - strand: same pipeline with -strand -
    run_shell_cmd(
        'zcat -f {ta} | sort -k1,1 -k2,2n {sort_param} | '
        'bedtools genomecov -5 -bg -strand - -g {chrsz} -i stdin > {neg_bedgraph}'.format(
            ta=ta,
            sort_param=get_gnu_sort_param(mem_gb * 1024 ** 3, ratio=0.5),
            chrsz=chrsz,
            neg_bedgraph=neg_bedgraph,
        )
    )

    # convert each bedGraph to bigWig
    run_shell_cmd(
        'bedGraphToBigWig {pos_bedgraph} {chrsz} {pos_bw}'.format(
            pos_bedgraph=pos_bedgraph,
            chrsz=chrsz,
            pos_bw=pos_bw,
        )
    )

    run_shell_cmd(
        'bedGraphToBigWig {neg_bedgraph} {chrsz} {neg_bw}'.format(
            neg_bedgraph=neg_bedgraph,
            chrsz=chrsz,
            neg_bw=neg_bw,
        )
    )

    # remove temporary files
    temp_files.append(pos_bedgraph)
    temp_files.append(neg_bedgraph)
    rm_f(temp_files)

    return pos_bw, neg_bw
def main(): 94 | # read params 95 | args = parse_arguments() 96 | 97 | log.info('Initializing and making output directory...') 98 | mkdir_p(args.out_dir) 99 | 100 | log.info('Generating count signal tracks...') 101 | pos_bw, neg_bw = count_signal_track( 102 | args.ta, 103 | args.chrsz, 104 | args.mem_gb, 105 | args.out_dir 106 | ) 107 | 108 | log.info('List all files in output directory...') 109 | ls_l(args.out_dir) 110 | 111 | log.info('All done.') 112 | 113 | 114 | if __name__ == '__main__': 115 | main() 116 | -------------------------------------------------------------------------------- /src/encode_task_frac_mito.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ENCODE frac mito 4 | # Author: Jin Lee (leepc12@gmail.com) 5 | 6 | import sys 7 | import os 8 | import argparse 9 | from encode_lib_common import ( 10 | log, ls_l, mkdir_p, strip_ext) 11 | from encode_lib_log_parser import parse_flagstat_qc 12 | 13 | 14 | def parse_arguments(): 15 | parser = argparse.ArgumentParser( 16 | prog='ENCODE frac mito', 17 | description='Calculates fraction of mito reads') 18 | parser.add_argument('non_mito_samstat', type=str, 19 | help='Path for SAMstats log file') 20 | parser.add_argument('mito_samstat', type=str, 21 | help='Path for SAMstats log file (mito only)') 22 | parser.add_argument('--out-dir', default='', type=str, 23 | help='Output directory.') 24 | parser.add_argument('--log-level', default='INFO', 25 | choices=['NOTSET', 'DEBUG', 'INFO', 26 | 'WARNING', 'CRITICAL', 'ERROR', 27 | 'CRITICAL'], 28 | help='Log level') 29 | args = parser.parse_args() 30 | 31 | log.setLevel(args.log_level) 32 | log.info(sys.argv) 33 | return args 34 | 35 | 36 | def frac_mito(non_mito_samstat, mito_samstat, out_dir): 37 | prefix = os.path.join( 38 | out_dir, 39 | os.path.basename(strip_ext(non_mito_samstat, 40 | 'non_mito.samstats.qc'))) 41 | frac_mito_qc = '{}.frac_mito.qc'.format(prefix) 42 | 43 | 
non_mito_samstat_dict = parse_flagstat_qc(non_mito_samstat) 44 | mito_samstat_dict = parse_flagstat_qc(mito_samstat) 45 | 46 | if 'mapped' in non_mito_samstat_dict: 47 | # backward compatibility (old key name was 'total') 48 | key_mapped = 'mapped' 49 | elif 'mapped_reads' in non_mito_samstat_dict: 50 | key_mapped = 'mapped_reads' 51 | Rn = non_mito_samstat_dict[key_mapped] 52 | 53 | if 'mapped' in mito_samstat_dict: 54 | # backward compatibility (old key name was 'total') 55 | key_mapped = 'mapped' 56 | elif 'mapped_reads' in mito_samstat_dict: 57 | key_mapped = 'mapped_reads' 58 | Rm = mito_samstat_dict[key_mapped] 59 | 60 | frac = float(Rm)/float(Rn + Rm) 61 | with open(frac_mito_qc, 'w') as fp: 62 | fp.write('non_mito_reads\t{}\n'.format(Rn)) 63 | fp.write('mito_reads\t{}\n'.format(Rm)) 64 | fp.write('frac_mito_reads\t{}\n'.format(frac)) 65 | 66 | return frac_mito_qc 67 | 68 | 69 | def main(): 70 | # read params 71 | args = parse_arguments() 72 | log.info('Initializing and making output directory...') 73 | mkdir_p(args.out_dir) 74 | 75 | frac_mito(args.non_mito_samstat, 76 | args.mito_samstat, 77 | args.out_dir) 78 | 79 | log.info('List all files in output directory...') 80 | ls_l(args.out_dir) 81 | 82 | log.info('All done.') 83 | 84 | 85 | if __name__ == '__main__': 86 | main() 87 | -------------------------------------------------------------------------------- /src/encode_task_jsd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ENCODE DCC fingerprint/JSD plot wrapper 4 | # Author: Jin Lee (leepc12@gmail.com) 5 | 6 | import sys 7 | import os 8 | import argparse 9 | from encode_lib_common import ( 10 | log, ls_l, mkdir_p, rm_f, run_shell_cmd, strip_ext_bam) 11 | from encode_lib_genomic import ( 12 | samtools_index) 13 | 14 | from encode_lib_blacklist_filter import blacklist_filter_bam 15 | 16 | 17 | def parse_arguments(): 18 | parser = argparse.ArgumentParser( 19 | prog='ENCODE DCC 
def fingerprint(bams, ctl_bam, blacklist, mapq_thresh, nth, out_dir):
    """Run deepTools plotFingerprint on blacklist-filtered BAMs and split
    its quality-metrics log into one JSD QC file per experiment replicate.

    Args:
        bams: list of filtered experiment BAM paths (rep1, rep2, ...).
        ctl_bam: filtered control BAM path, or '' to run without control.
        blacklist: blacklist BED used to filter every BAM first.
        mapq_thresh: MAPQ cutoff (plotFingerprint --minMappingQuality).
        nth: number of threads.
        out_dir: output directory.

    Returns:
        (plot_png, jsd_qcs): fingerprint plot path and per-replicate QC paths.
    """
    # blacklist-filter then index every BAM (plotFingerprint needs .bai)
    filtered_bams = []
    for bam in bams:
        filtered_bam = blacklist_filter_bam(bam, blacklist, out_dir)
        samtools_index(filtered_bam, nth)
        filtered_bams.append(filtered_bam)
    filtered_ctl_bam = None
    if ctl_bam:
        filtered_ctl_bam = blacklist_filter_bam(ctl_bam, blacklist, out_dir)
        samtools_index(filtered_ctl_bam, nth)

    prefix = os.path.join(out_dir,
                          os.path.basename(strip_ext_bam(bams[0])))
    plot_png = '{}.jsd_plot.png'.format(prefix)
    tmp_log = '{}.jsd.tmp'.format(prefix)

    labels = []
    bam_paths = []
    jsd_qcs = []
    for i, bam in enumerate(filtered_bams):
        prefix_ = os.path.join(out_dir,
                               os.path.basename(strip_ext_bam(bam)))
        # bug fix: the old "'rep{}.{}'.format(i+1, prefix_)" prepended
        # 'repN.' to the WHOLE path (prefix_ already contains out_dir),
        # pointing into a non-existent 'repN.<out_dir>' directory whenever
        # out_dir is non-empty; open() below then failed.
        jsd_qcs.append('{}.jsd.qc'.format(prefix_))
        labels.append('rep{}'.format(i+1))  # repN
        bam_paths.append(bam)
    # add control
    if filtered_ctl_bam:
        labels.append('ctl1')
        bam_paths.append(filtered_ctl_bam)

    cmd = 'LC_ALL=en_US.UTF-8 LANG=en_US.UTF-8 plotFingerprint -b {} '
    if filtered_ctl_bam:
        cmd += '--JSDsample {} '.format(filtered_ctl_bam)
    cmd += '--labels {} '
    cmd += '--outQualityMetrics {} '
    cmd += '--minMappingQuality {} '
    cmd += '-T "Fingerprints of different samples" '
    cmd += '--numberOfProcessors {} '
    cmd += '--plotFile {}'
    cmd = cmd.format(
        ' '.join(bam_paths),
        ' '.join(labels),
        tmp_log,
        mapq_thresh,
        nth,
        plot_png)
    run_shell_cmd(cmd)

    # remove intermediate files (blacklist-filtered BAM)
    if filtered_ctl_bam:
        rm_f(filtered_ctl_bam)
    rm_f(filtered_bams)

    # parse tmp_log to get jsd_qc for each exp replicate
    # (line 0 is the header; one data line per labelled sample)
    with open(tmp_log, 'r') as fp:
        for i, line in enumerate(fp.readlines()):  # i is rep_id-1
            if i == 0:
                continue
            if i > len(jsd_qcs):
                # trailing control line(s): no per-replicate QC for them
                break
            with open(jsd_qcs[i-1], 'w') as fp2:
                # removing repN from lines
                fp2.write('\t'.join(line.strip().split('\t')[1:]))
    rm_f(tmp_log)
    return plot_png, jsd_qcs
def merge_fastqs(fastqs, end, out_dir):
    """Concatenate gzipped FASTQs into one gzipped file under $out_dir/$end.

    The merged filename is derived from the first FASTQ's basename; a
    '.merged' tag is inserted only when more than one input is combined.
    """
    end_dir = os.path.join(out_dir, end)
    mkdir_p(end_dir)
    base = os.path.basename(strip_ext_fastq(fastqs[0]))
    prefix = os.path.join(end_dir, base)

    suffix = '.merged.fastq.gz' if len(fastqs) > 1 else '.fastq.gz'
    merged = '{}{}'.format(prefix, suffix)

    run_shell_cmd(
        'zcat -f {} | gzip -nc > {}'.format(' '.join(fastqs), merged))
    return merged
def pool_ta(tas, col, basename_prefix, out_dir):
    """Pool (concatenate) two or more TAGALIGN/BED files into one gzip.

    If `col` is given, only the first `col` columns are kept. The output
    basename comes from `basename_prefix` when provided, otherwise from
    the first input's stripped basename. Raises ValueError for < 2 inputs.
    """
    if len(tas) <= 1:
        raise ValueError('Needs at least two TAs (or BEDs) to be pooled.')

    if basename_prefix is not None:
        basename = basename_prefix
    else:
        basename = os.path.basename(strip_ext_ta(tas[0]))
    pooled_ta = '{}.pooled.tagAlign.gz'.format(
        os.path.join(out_dir, basename))

    # build the shell pipeline stage by stage
    stages = ['zcat -f {}'.format(' '.join(tas))]
    if col is not None:
        stages.append('cut -f 1-{}'.format(col))
    stages.append('gzip -nc > {}'.format(pooled_ta))
    run_shell_cmd(' | '.join(stages))
    return pooled_ta
def parse_arguments():
    """Build and parse command-line arguments for the post-align task.

    Returns:
        argparse.Namespace with fastq, bam, chrsz, mito_chr_name, nth,
        mem_gb, out_dir and log_level.
    """
    parser = argparse.ArgumentParser(prog='ENCODE post align',
                                     description='')
    parser.add_argument('fastq', type=str,
                        help='Path for FASTQ R1')
    parser.add_argument('bam', type=str,
                        help='Path for BAM')
    parser.add_argument(
        '--chrsz', type=str,
        help='2-col chromosome sizes file. If not given then '
             # typo fix: 'calcaulted' -> 'calculated'
             'SAMstats on mito-free BAM will not be calculated.')
    parser.add_argument('--mito-chr-name', default='chrM',
                        help='Mito chromosome name.')
    parser.add_argument('--nth', type=int, default=1,
                        help='Number of threads to parallelize.')
    parser.add_argument('--mem-gb', type=float,
                        help='Max. memory for samtools sort in GB. '
                             'It should be total memory for this task (not memory per thread).')
    parser.add_argument('--out-dir', default='', type=str,
                        help='Output directory.')
    # fix: the choices list contained 'CRITICAL' twice
    parser.add_argument('--log-level', default='INFO',
                        choices=['NOTSET', 'DEBUG', 'INFO',
                                 'WARNING', 'ERROR', 'CRITICAL'],
                        help='Log level')
    args = parser.parse_args()

    log.setLevel(args.log_level)
    log.info(sys.argv)
    return args
def parse_arguments():
    """Build and parse command-line arguments for post_call_peak (atac).

    Returns:
        argparse.Namespace; args.blacklist is normalized to '' when it is
        None or points at a 'null' placeholder file.
    """
    parser = argparse.ArgumentParser(prog='ENCODE post_call_peak (atac)',
                                     description='')
    parser.add_argument(
        'peak', type=str,
        help='Path for PEAK file. Peak filename should be "*.*Peak.gz". '
             'e.g. rep1.narrowPeak.gz')
    parser.add_argument('--ta', type=str,
                        help='TAG-ALIGN file.')
    parser.add_argument('--peak-type', type=str, required=True,
                        choices=['narrowPeak', 'regionPeak',
                                 'broadPeak', 'gappedPeak'],
                        help='Peak file type.')
    parser.add_argument('--chrsz', type=str,
                        help='2-col chromosome sizes file.')
    parser.add_argument('--blacklist', type=str,
                        help='Blacklist BED file.')
    parser.add_argument('--regex-bfilt-peak-chr-name',
                        help='Keep chromosomes matching this pattern only '
                             'in .bfilt. peak files.')
    parser.add_argument('--mem-gb', type=float, default=4.0,
                        help='Max. memory for this job in GB. '
                             'This will be used to determine GNU sort -S (defaulting to 0.5 of this value). '
                             'It should be total memory for this task (not memory per thread).')
    parser.add_argument('--out-dir', default='', type=str,
                        help='Output directory.')
    # fix: the choices list contained 'CRITICAL' twice
    parser.add_argument('--log-level', default='INFO',
                        choices=['NOTSET', 'DEBUG', 'INFO',
                                 'WARNING', 'ERROR', 'CRITICAL'],
                        help='Log level')
    args = parser.parse_args()
    if args.blacklist is None or args.blacklist.endswith('null'):
        args.blacklist = ''

    log.setLevel(args.log_level)
    log.info(sys.argv)
    return args
def parse_arguments():
    """Build and parse command-line arguments for post_call_peak (chip).

    Returns:
        argparse.Namespace; args.blacklist is normalized to '' when it is
        None or points at a 'null' placeholder file.
    """
    parser = argparse.ArgumentParser(prog='ENCODE post_call_peak (chip)',
                                     description='')
    parser.add_argument('peak', type=str,
                        help='Path for PEAK file. Peak filename should be "*.*Peak.gz". '
                             'e.g. rep1.narrowPeak.gz')
    parser.add_argument('--ta', type=str,
                        help='TAG-ALIGN file.')
    parser.add_argument('--peak-type', type=str, required=True,
                        choices=['narrowPeak', 'regionPeak',
                                 'broadPeak', 'gappedPeak'],
                        help='Peak file type.')
    parser.add_argument('--fraglen', type=int, required=True,
                        help='Fragment length.')
    parser.add_argument('--chrsz', type=str,
                        help='2-col chromosome sizes file.')
    parser.add_argument('--blacklist', type=str,
                        help='Blacklist BED file.')
    parser.add_argument('--regex-bfilt-peak-chr-name',
                        help='Keep chromosomes matching this pattern only '
                             'in .bfilt. peak files.')
    parser.add_argument('--mem-gb', type=float, default=4.0,
                        help='Max. memory for this job in GB. '
                             'This will be used to determine GNU sort -S (defaulting to 0.5 of this value). '
                             'It should be total memory for this task (not memory per thread).')
    parser.add_argument('--out-dir', default='', type=str,
                        help='Output directory.')
    # fix: the choices list contained 'CRITICAL' twice
    parser.add_argument('--log-level', default='INFO',
                        choices=['NOTSET', 'DEBUG', 'INFO',
                                 'WARNING', 'ERROR', 'CRITICAL'],
                        help='Log level')
    args = parser.parse_args()
    if args.blacklist is None or args.blacklist.endswith('null'):
        args.blacklist = ''

    log.setLevel(args.log_level)
    log.info(sys.argv)
    return args
def parse_arguments():
    """Build and parse command-line arguments for the control TAG-ALIGN
    subsampler.

    Raises:
        ValueError: if --subsample is not a positive integer.
    """
    parser = argparse.ArgumentParser(
        prog='ENCODE DCC control TAG-ALIGN subsampler.'
        'This script does not check if number of reads in TA is higher than '
        'subsampling number (--subsample). '
        'If number of reads in TA is lower than subsampling number then '
        'TA will be just shuffled.')
    parser.add_argument('ta', type=str,
                        help='Path for control TAGALIGN file.')
    parser.add_argument('--paired-end', action="store_true",
                        help='Paired-end TAGALIGN.')
    parser.add_argument('--subsample', default=0, type=int,
                        help='Number of reads to subsample.')
    parser.add_argument('--out-dir', default='', type=str,
                        help='Output directory.')
    # fix: the choices list contained 'CRITICAL' twice
    parser.add_argument('--log-level', default='INFO',
                        choices=['NOTSET', 'DEBUG', 'INFO',
                                 'WARNING', 'ERROR', 'CRITICAL'],
                        help='Log level')
    args = parser.parse_args()
    # fix: 'if not args.subsample' only rejected 0; a negative value
    # slipped through to the downstream subsampling command.
    if args.subsample <= 0:
        raise ValueError('--subsample should be a positive integer.')

    log.setLevel(args.log_level)
    log.info(sys.argv)
    return args
def trim_fastq(fastq, trim_bp, out_dir):
    """Trim every read in a FASTQ down to trim_bp basepairs.

    Runs trimfastq.py and gzips its output. If the output stream carries
    the 'sequences shorter than desired length' warning (presumably
    written by trimfastq.py when reads are already shorter than trim_bp
    — behavior inherited from the original code), the untrimmed FASTQ is
    copied over the trimmed output instead.
    """
    basename = os.path.basename(strip_ext_fastq(fastq))
    prefix = os.path.join(out_dir, basename)
    trimmed = '{}.trim_{}bp.fastq.gz'.format(prefix, trim_bp)

    run_shell_cmd(
        'python $(which trimfastq.py) {} {} | gzip -nc > {}'.format(
            fastq, trim_bp, trimmed))

    # count occurrences of the too-short warning in the output stream
    count_cmd = (
        'zcat -f {} | (grep \'sequences shorter than desired length\' '
        '|| true) | wc -l'
    ).format(trimmed)
    if int(run_shell_cmd(count_cmd)) > 0:
        # reads were shorter than trim_bp: fall back to the original FASTQ
        copy_f_to_f(fastq, trimmed)

    return trimmed
--------------------------------------------------------------------------------