├── .circleci └── config.yml ├── .dockerignore ├── .editorconfig ├── .github └── ISSUE_TEMPLATE │ └── bug_report.md ├── .gitignore ├── LICENSE ├── README.md ├── atac.croo.v5.json ├── atac.wdl ├── dev ├── build_on_dx_dockerhub.sh ├── docker_image │ ├── Dockerfile │ └── mysql │ │ └── init_user.sql ├── example_input_json │ ├── aws │ │ └── ENCSR356KRQ_subsampled_aws.json │ ├── caper │ │ └── ENCSR356KRQ_subsampled_caper.json │ ├── gcp │ │ └── ENCSR356KRQ_subsampled_gcp.json │ ├── klab │ │ ├── ENCSR356KRQ_klab.json │ │ ├── ENCSR356KRQ_subsampled_klab.json │ │ ├── ENCSR356KRQ_subsampled_start_from_bam_klab.json │ │ └── ENCSR889WQX_subsampled_klab.json │ ├── scg │ │ └── ENCSR356KRQ_subsampled_scg.json │ └── sherlock │ │ └── ENCSR356KRQ_subsampled_sherlock.json └── test │ ├── README.md │ ├── run_cromwell_server_on_gc.sh │ ├── test_py │ └── __init__.py │ ├── test_task │ ├── .gitignore │ ├── compare_md5sum.wdl │ ├── test_annot_enrich.json │ ├── test_annot_enrich.wdl │ ├── test_bam2ta.json │ ├── test_bam2ta.wdl │ ├── test_bowtie2.json │ ├── test_bowtie2.wdl │ ├── test_compare_signal_to_roadmap.json │ ├── test_compare_signal_to_roadmap.wdl │ ├── test_count_signal_track.json │ ├── test_count_signal_track.wdl │ ├── test_filter.json │ ├── test_filter.wdl │ ├── test_frac_mito.json │ ├── test_frac_mito.wdl │ ├── test_fraglen_stat_pe.json │ ├── test_fraglen_stat_pe.wdl │ ├── test_gc_bias.json │ ├── test_gc_bias.wdl │ ├── test_idr.json │ ├── test_idr.wdl │ ├── test_jsd.json │ ├── test_jsd.wdl │ ├── test_macs2.json │ ├── test_macs2.wdl │ ├── test_macs2_signal_track.json │ ├── test_macs2_signal_track.wdl │ ├── test_overlap.json │ ├── test_overlap.wdl │ ├── test_pool_ta.json │ ├── test_pool_ta.wdl │ ├── test_preseq.json │ ├── test_preseq.wdl │ ├── test_reproducibility.json │ ├── test_reproducibility.wdl │ ├── test_spr.json │ ├── test_spr.wdl │ ├── test_tss_enrich.json │ ├── test_tss_enrich.wdl │ ├── test_xcor.json │ └── test_xcor.wdl │ └── test_workflow │ ├── .gitignore │ ├── 
ENCSR356KRQ.json │ ├── ENCSR356KRQ_subsampled.json │ ├── ENCSR356KRQ_subsampled_chr19_only.json │ ├── ENCSR356KRQ_subsampled_start_from_bam.json │ ├── ENCSR889WQX.json │ ├── ENCSR889WQX_subsampled.json │ ├── ENCSR889WQX_subsampled_chr19_only.json │ ├── ENCSR889WQX_subsampled_unrep.json │ └── ref_output │ ├── sync.sh │ ├── v1.1.4 │ ├── ENCSR356KRQ_qc.json │ ├── ENCSR356KRQ_subsampled_chr19_only_qc.json │ ├── ENCSR356KRQ_subsampled_qc.json │ ├── ENCSR889WQX_qc.json │ ├── ENCSR889WQX_subsampled_chr19_only_qc.json │ └── ENCSR889WQX_subsampled_qc.json │ ├── v1.1.5 │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ ├── ENCSR356KRQ_subsampled_chr19_only │ │ └── qc.json │ ├── ENCSR889WQX │ │ └── qc.json │ ├── ENCSR889WQX_subsampled │ │ └── qc.json │ └── ENCSR889WQX_subsampled_chr19_only │ │ └── qc.json │ ├── v1.1.6.a │ ├── ENCSR356KRQ │ │ └── qc.json │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ ├── ENCSR356KRQ_subsampled_chr19_only │ │ └── qc.json │ ├── ENCSR889WQX │ │ └── qc.json │ ├── ENCSR889WQX_subsampled │ │ └── qc.json │ └── ENCSR889WQX_subsampled_chr19_only │ │ └── qc.json │ ├── v1.1.6 │ ├── ENCSR356KRQ │ │ └── qc.json │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ ├── ENCSR356KRQ_subsampled_chr19_only │ │ └── qc.json │ ├── ENCSR889WQX │ │ └── qc.json │ ├── ENCSR889WQX_subsampled │ │ └── qc.json │ └── ENCSR889WQX_subsampled_chr19_only │ │ └── qc.json │ ├── v1.1.7.2 │ └── ENCSR356KRQ_subsampled │ │ └── qc.json │ ├── v1.3.0 │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ └── ENCSR889WQX_subsampled │ │ └── qc.json │ ├── v1.4.0 │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ └── ENCSR889WQX_subsampled │ │ └── qc.json │ ├── v1.4.1 │ └── ENCSR889WQX_subsampled │ │ └── qc.json │ ├── v1.5.0 │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ ├── ENCSR356KRQ_subsampled_start_from_bam │ │ └── qc.json │ ├── ENCSR889WQX_subsampled │ │ └── qc.json │ └── ENCSR889WQX_subsampled_unrep │ │ └── qc.json │ ├── v1.6.0 │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ ├── ENCSR889WQX_subsampled │ │ 
└── qc.json │ └── ENCSR889WQX_subsampled_unrep │ │ └── qc.json │ ├── v1.7.0 │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ ├── ENCSR356KRQ_subsampled_start_from_bam │ │ └── qc.json │ ├── ENCSR889WQX_subsampled │ │ └── qc.json │ └── ENCSR889WQX_subsampled_unrep │ │ └── qc.json │ ├── v1.8.0 │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ ├── ENCSR356KRQ_subsampled_start_from_bam │ │ └── qc.json │ ├── ENCSR889WQX_subsampled │ │ └── qc.json │ └── ENCSR889WQX_subsampled_unrep │ │ └── qc.json │ ├── v2.1.0 │ ├── ENCSR356KRQ_subsampled │ │ └── qc.json │ └── ENCSR356KRQ_subsampled_start_from_bam │ │ └── qc.json │ └── v2.2.2 │ ├── ENCSR356KRQ_subsampled │ └── qc.json │ └── ENCSR356KRQ_subsampled_start_from_bam │ └── qc.json ├── docs ├── build_genome_database.md ├── example_output │ ├── v1.1.4 │ │ └── qc.json │ └── v1.1.5 │ │ └── qc.json ├── how_to_config_sge.md ├── input.md ├── input_short.md ├── install_conda.md ├── tutorial_dx_cli.md └── tutorial_dx_web.md ├── example_input_json ├── ENCSR356KRQ_subsampled.json ├── dx │ ├── ENCSR356KRQ_subsampled_dx.json │ ├── ENCSR356KRQ_subsampled_rep1_dx.json │ ├── template_general.json │ ├── template_hg19.json │ ├── template_hg38.json │ ├── template_mm10.json │ └── template_mm9.json ├── dx_azure │ ├── ENCSR356KRQ_subsampled_dx_azure.json │ ├── template_general.json │ ├── template_hg19.json │ ├── template_hg38.json │ ├── template_mm10.json │ └── template_mm9.json ├── template.full.json ├── template.json └── terra │ └── ENCSR356KRQ_subsampled.terra.json ├── scripts ├── build_genome_data.sh ├── download_genome_data.sh ├── install_conda_env.sh ├── requirements.macs2.txt ├── requirements.python2.txt ├── requirements.spp.txt ├── requirements.txt ├── uninstall_conda_env.sh └── update_conda_env.sh └── src ├── assign_multimappers.py ├── detect_adapter.py ├── dev_check_sync_atac.sh ├── encode_lib_blacklist_filter.py ├── encode_lib_common.py ├── encode_lib_frip.py ├── encode_lib_genomic.py ├── encode_lib_log_parser.py ├── encode_lib_qc_category.py 
├── encode_task_annot_enrich.py ├── encode_task_bam2ta.py ├── encode_task_bam_to_pbam.py ├── encode_task_bowtie2.py ├── encode_task_bwa.py ├── encode_task_choose_ctl.py ├── encode_task_compare_signal_to_roadmap.py ├── encode_task_count_signal_track.py ├── encode_task_filter.py ├── encode_task_frac_mito.py ├── encode_task_fraglen_stat_pe.py ├── encode_task_gc_bias.py ├── encode_task_idr.py ├── encode_task_jsd.py ├── encode_task_macs2_atac.py ├── encode_task_macs2_chip.py ├── encode_task_macs2_signal_track_atac.py ├── encode_task_macs2_signal_track_chip.py ├── encode_task_merge_fastq.py ├── encode_task_overlap.py ├── encode_task_pool_ta.py ├── encode_task_post_align.py ├── encode_task_post_call_peak_atac.py ├── encode_task_post_call_peak_chip.py ├── encode_task_preseq.py ├── encode_task_qc_report.py ├── encode_task_reproducibility.py ├── encode_task_spp.py ├── encode_task_spr.py ├── encode_task_subsample_ctl.py ├── encode_task_trim_adapter.py ├── encode_task_trim_fastq.py ├── encode_task_trimmomatic.py ├── encode_task_tss_enrich.py ├── encode_task_xcor.py └── trimfastq.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .editorconfig 3 | .git 4 | .gitignore 5 | cromwell-executions 6 | cromwell-workflow-logs 7 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | [*.{wdl,json,conf}] 2 | indent_style = tab 3 | indent_size = 4 4 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## **Describe the bug** 11 | A clear and concise description of what the problem is. 
12 | 13 | ## **OS/Platform** 14 | - OS/Platform: [e.g. Ubuntu 18.04, Google Cloud, Stanford Sherlock/SCG cluster, ...] 15 | - Conda version: If you used Conda (`$ conda --version`). 16 | - Pipeline version: [e.g. v1.8.0] 17 | - Caper version: [e.g. v1.2.0] 18 | 19 | ## **Caper configuration file** 20 | Paste contents of `~/.caper/default.conf`. 21 | ```ini 22 | PASTE CAPER CONF CONTENTS HERE 23 | ``` 24 | 25 | ## **Input JSON file** 26 | Paste contents of your input JSON file. 27 | ```json 28 | PASTE INPUT JSON CONTENTS HERE 29 | ``` 30 | 31 | ## **Troubleshooting result** 32 | 33 | If you ran `caper run` without Caper server then Caper automatically runs a troubleshooter for failed workflows. Find troubleshooting result in the bottom of Caper's screen log. 34 | 35 | If you ran `caper submit` with a running Caper server then first find your workflow ID (1st column) with `caper list` and run `caper debug [WORKFLOW_ID]`. 36 | 37 | Paste troubleshooting result. 38 | ``` 39 | PASTE TROUBLESHOOTING RESULT HERE 40 | ``` 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files/ 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # temp fastqs 104 | fastqs/ 105 | 106 | # cromwell temp dirs 107 | cromwell-executions/ 108 | cromwell-workflow-logs/ 109 | cromwell*.jar 110 | 111 | #test.wdl 112 | #test_google.wdl 113 | output_*.json 114 | test/*_standalone.wdl 115 | .DS_Store 116 | test_genome* 117 | test_sample 118 | *.tar 119 | tmp 120 | tmp_db* 121 | *.local.json 122 | temp_db* 123 | cromwell.out 124 | cromwell.out.* 125 | .dev 126 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 ENCODE DCC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, 
merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /dev/build_on_dx_dockerhub.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | WDL=atac.wdl 5 | VER=$(cat ${WDL} | grep "String pipeline_ver = " | awk '{gsub("'"'"'",""); print $4}') 6 | DXWDL=~/dxWDL-v1.50.jar 7 | 8 | # general 9 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines" -f -folder \ 10 | /ATAC-seq/workflows/$VER/general -defaults example_input_json/dx/template_general.json 11 | 12 | # hg38 13 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines" -f -folder \ 14 | /ATAC-seq/workflows/$VER/hg38 -defaults example_input_json/dx/template_hg38.json 15 | 16 | # hg19 17 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines" -f -folder \ 18 | /ATAC-seq/workflows/$VER/hg19 -defaults example_input_json/dx/template_hg19.json 19 | 20 | # mm10 21 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines" -f -folder \ 22 | /ATAC-seq/workflows/$VER/mm10 -defaults 
example_input_json/dx/template_mm10.json 23 | 24 | # mm9 25 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines" -f -folder \ 26 | /ATAC-seq/workflows/$VER/mm9 -defaults example_input_json/dx/template_mm9.json 27 | 28 | # test sample 29 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines" -f -folder \ 30 | /ATAC-seq/workflows/$VER/test_ENCSR356KRQ_subsampled -defaults example_input_json/dx/ENCSR356KRQ_subsampled_dx.json 31 | 32 | # test sample (single rep) 33 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines" -f -folder \ 34 | /ATAC-seq/workflows/$VER/test_ENCSR356KRQ_subsampled_rep1 -defaults example_input_json/dx/ENCSR356KRQ_subsampled_rep1_dx.json 35 | 36 | ## DX Azure 37 | 38 | # general 39 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines Azure" -f -folder \ 40 | /ATAC-seq/workflows/$VER/general -defaults example_input_json/dx_azure/template_general.json 41 | 42 | # hg38 43 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines Azure" -f -folder \ 44 | /ATAC-seq/workflows/$VER/hg38 -defaults example_input_json/dx_azure/template_hg38.json 45 | 46 | # hg19 47 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines Azure" -f -folder \ 48 | /ATAC-seq/workflows/$VER/hg19 -defaults example_input_json/dx_azure/template_hg19.json 49 | 50 | # mm10 51 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines Azure" -f -folder \ 52 | /ATAC-seq/workflows/$VER/mm10 -defaults example_input_json/dx_azure/template_mm10.json 53 | 54 | # mm9 55 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines Azure" -f -folder \ 56 | /ATAC-seq/workflows/$VER/mm9 -defaults example_input_json/dx_azure/template_mm9.json 57 | 58 | # test sample 59 | java -jar ${DXWDL} compile ${WDL} -project "ENCODE Uniform Processing Pipelines Azure" -f -folder \ 60 | 
/ATAC-seq/workflows/$VER/test_ENCSR356KRQ_subsampled -defaults example_input_json/dx_azure/ENCSR356KRQ_subsampled_dx_azure.json 61 | -------------------------------------------------------------------------------- /dev/docker_image/mysql/init_user.sql: -------------------------------------------------------------------------------- 1 | CREATE USER 'cromwell'@'localhost' IDENTIFIED BY 'cromwell'; 2 | GRANT ALL PRIVILEGES ON cromwell_db.* TO 'cromwell'@'localhost' WITH GRANT OPTION; 3 | CREATE USER 'cromwell'@'%' IDENTIFIED BY 'cromwell'; 4 | GRANT ALL PRIVILEGES ON cromwell_db.* TO 'cromwell'@'%' WITH GRANT OPTION; -------------------------------------------------------------------------------- /dev/example_input_json/aws/ENCSR356KRQ_subsampled_aws.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "s3://encode-pipeline-genome-data/genome_tsv/v1/hg38_aws.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | 
"s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "s3://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | -------------------------------------------------------------------------------- /dev/example_input_json/caper/ENCSR356KRQ_subsampled_caper.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 
3 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v1/hg38_caper.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | 
"https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | -------------------------------------------------------------------------------- /dev/example_input_json/gcp/ENCSR356KRQ_subsampled_gcp.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "gs://encode-pipeline-genome-data/genome_tsv/v1/hg38_gcp.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | -------------------------------------------------------------------------------- /dev/example_input_json/klab/ENCSR356KRQ_klab.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/hg38_klab.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair1/ENCFF341MYG.fastq.gz", 6 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair1/ENCFF106QGY.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair2/ENCFF248EJF.fastq.gz", 10 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair2/ENCFF368TYI.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF641SFZ.fastq.gz", 14 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF751XTV.fastq.gz", 15 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF927LSG.fastq.gz", 16 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF859BDM.fastq.gz", 17 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF193RRC.fastq.gz", 18 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF366DFI.fastq.gz" 19 
| ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF031ARQ.fastq.gz", 22 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF590SYZ.fastq.gz", 23 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF734PEQ.fastq.gz", 24 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF007USV.fastq.gz", 25 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF886FSC.fastq.gz", 26 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF573UXK.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | -------------------------------------------------------------------------------- /dev/example_input_json/klab/ENCSR356KRQ_subsampled_klab.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/hg38_klab.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | 
"/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | 
-------------------------------------------------------------------------------- /dev/example_input_json/klab/ENCSR356KRQ_subsampled_start_from_bam_klab.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/hg38_klab.tsv", 4 | "atac.nodup_bams" : [ 5 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/bam_subsampled/rep1/ENCFF341MYG.subsampled.400.trim.merged.nodup.no_chrM_MT.bam", 6 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/bam_subsampled/rep2/ENCFF641SFZ.subsampled.400.trim.merged.nodup.no_chrM_MT.bam" 7 | ], 8 | "atac.read_len" : [76, 76], 9 | "atac.paired_end" : true, 10 | "atac.auto_detect_adapter" : true, 11 | "atac.enable_xcor" : true, 12 | "atac.title" : "ENCSR356KRQ (subsampled 1/400, starting from NODUP_BAMs with specified read_len)", 13 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 14 | } 15 | -------------------------------------------------------------------------------- /dev/example_input_json/klab/ENCSR889WQX_subsampled_klab.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "/mnt/data/pipeline_genome_data/genome_tsv/v1/mm10_klab.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF439VSY.subsampled.400.fastq.gz", 6 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF325FCQ.subsampled.400.fastq.gz", 7 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF683IQS.subsampled.400.fastq.gz", 8 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF744CHW.subsampled.400.fastq.gz" 9 | ], 10 | 
"atac.fastqs_rep2_R1" : [ 11 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep2/ENCFF463QCX.subsampled.400.fastq.gz", 12 | "/mnt/data/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep2/ENCFF992TSA.subsampled.400.fastq.gz" 13 | ], 14 | "atac.paired_end" : false, 15 | "atac.auto_detect_adapter" : true, 16 | "atac.enable_xcor" : true, 17 | "atac.enable_tss_enrich" : false, 18 | "atac.title" : "ENCSR889WQX (subsampled 1/400 reads)", 19 | "atac.description" : "ATAC-seq on Mus musculus C57BL/6 frontal cortex adult" 20 | } 21 | -------------------------------------------------------------------------------- /dev/example_input_json/scg/ENCSR356KRQ_subsampled_scg.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "/reference/ENCODE/pipeline_genome_data/genome_tsv/v1/hg38_scg.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | 
"/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "/reference/ENCODE/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | -------------------------------------------------------------------------------- /dev/example_input_json/sherlock/ENCSR356KRQ_subsampled_sherlock.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : 
"/home/groups/cherry/encode/pipeline_genome_data/genome_tsv/v1/hg38_sherlock.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | 
"/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "/home/groups/cherry/encode/pipeline_test_samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | -------------------------------------------------------------------------------- /dev/test/README.md: -------------------------------------------------------------------------------- 1 | ENCODE ATAC-seq pipeline test 2 | =================================================== 3 | 4 | # Task level test (local) 5 | 6 | This test requires `atac-seq-pipeline-test-data` directory in `test_task/`. Git glone [a data repo](https://github.com/leepc12/atac-seq-pipeline-test-data) on `test_task/`. This repo has 1/400 subsampled test samples and chr19-chrM only bowtie2 indices and other genome data for hg38 and mm10. Make sure that you have `cromwell-31.jar` in your `$PATH` as an executable (`chmod +x`) and `Docker` installed on your system. 7 | ``` 8 | $ cd test_task/ 9 | $ git clone https://github.com/encode-dcc/atac-seq-pipeline-test-data 10 | ``` 11 | 12 | Each task in `../atac.wdl` has a corresponding pair of tester WDL/JSON (`[TASK_NAME].WDL` and [TASK_NAME].json`). You can also specify your own docker image to test each task. 
13 | ``` 14 | $ cd test_task/ 15 | $ ./test.sh [WDL] [INPUT_JSON] [DOCKER_IMAGE](optional) 16 | ``` 17 | 18 | # Workflow level test (on GC) 19 | 20 | Make sure that you have a Cromwell server running on GC. This shell script will submit `../atac.wdl` to the server and wait for a response (`result.json`). There are two input JSON files (original and subsampled) for each endedness (SE and PE). You can also check all outputs on GC bucket `gs://encode-pipeline-test-runs`. 21 | ``` 22 | $ cd test_workflow/ 23 | $ ./test_atac.sh [INPUT_JSON] [QC_JSON_TO_COMPARE] [DOCKER_IMAGE](optional) 24 | ``` 25 | 26 | Jenkins must do the following: 27 | ``` 28 | $ cd test_workflow/ 29 | # For master branch (full test sample, ~24hr) 30 | $ ./test_atac.sh ENCSR356KRQ.json ref_output/ENCSR356KRQ_qc.json [NEW_DOCKER_IMAGE] 31 | $ ./test_atac.sh ENCSR889WQX.json ref_output/ENCSR889WQX_qc.json [NEW_DOCKER_IMAGE] 32 | # For develop branch (1/400 subsampled and chr19 only test sample ~30mins) 33 | $ ./test_atac.sh ENCSR356KRQ_subsampled.json ref_output/ENCSR356KRQ_subsampled_chr19_only_qc.json [NEW_DOCKER_IMAGE] 34 | $ ./test_atac.sh ENCSR889WQX_subsampled.json ref_output/ENCSR889WQX_subsampled_chr19_only_qc.json [NEW_DOCKER_IMAGE] 35 | ``` 36 | 37 | `test_atac.sh` will generate the following files to validate pipeline outputs. Jenkins must check if `PREFIX.qc_json_diff.txt` is empty or not. 38 | * `PREFIX.result.json`: all outputs of `atac.wdl`. 39 | * `PREFIX.result.qc.json`: qc summary JSON file `qc.json` of `atac.wdl`. 40 | * `PREFIX.qc_json_diff.txt`: diff between `PREFIX.result.qc.json` and reference in `ref_output/`. 41 | 42 | # How to run a Cromwell server on GC 43 | 44 | 1) Create/restart an instance with the following settings. 45 | * name : `encode-cromwell-test-server`. 46 | * resource: 1vCPU and 4GB memory 47 | * zone: `us-west1-a`. 48 | * image: `Ubuntu 16.04 (xenial)` 49 | * disk: `Standard persistent disk 20GB` 50 | * Network tags: add a tag `cromwell-server`. 
51 | * Cloud API access scopes: `Allow full access to all Cloud APIs`. 52 | * External IP (optional): any static IP address. 53 | 54 | 2) SSH to the instance and run the following to install Docker and Java 8: 55 | ``` 56 | $ sudo apt-get update 57 | $ sudo apt-get install docker.io default-jre 58 | $ sudo usermod -aG docker $USER 59 | ``` 60 | 61 | 3) Log out and log back in. 62 | 63 | 4) Install cromwell. 64 | ``` 65 | $ cd 66 | $ wget https://github.com/broadinstitute/cromwell/releases/download/31/cromwell-31.jar 67 | $ chmod +x cromwell*.jar 68 | $ echo "export PATH=\$PATH:\$HOME">> ~/.bashrc 69 | $ source ~/.bashrc 70 | ``` 71 | 72 | 5) Clone pipeline, make DB directory (where metadata of all pipelines are stored) and run `MySQL` container. 73 | ``` 74 | $ cd 75 | $ git clone https://github.com/ENCODE-DCC/atac-seq-pipeline 76 | $ mkdir cromwell_db 77 | $ docker run -d --name mysql-cromwell -v $HOME/cromwell_db:/var/lib/mysql -v $HOME/atac-seq-pipeline/docker_image/mysql:/docker-entrypoint-initdb.d -e MYSQL_ROOT_PASSWORD=cromwell -e MYSQL_DATABASE=cromwell_db --publish 3306:3306 mysql 78 | $ docker ps 79 | ``` 80 | 81 | 6) Run Cromwell server 82 | ``` 83 | $ cd $HOME/atac-seq-pipeline 84 | $ git checkout develop_test_jenkins 85 | $ cd test 86 | $ screen -RD cromwell # make screen for cromwell server 87 | $ bash run_cromwell_server_on_gc.sh 88 | ``` 89 | 90 | 7) Firewall settings to open port 8000 91 | * Go to Google Cloud Console 92 | * Choose your Project. 93 | * Choose Networking > VPC network 94 | * Choose "Firewall rules" 95 | * Choose Create Firewall Rule `encode-cromwell-test-server-open-port-8000`. 96 | * Targets: `Specified target tags`. 97 | * Target tags: cromwell-server 98 | * Source IP ranges: `0.0.0.0/0` (CIDR notation for allowed IP range) 99 | * Protocols and Ports: `Specified protocols and ports` with `tcp:8000`. 
100 | -------------------------------------------------------------------------------- /dev/test/run_cromwell_server_on_gc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -f "cromwell-32.jar" ]; then 4 | echo "Skip downloading cromwell." 5 | else 6 | wget -N -c https://github.com/broadinstitute/cromwell/releases/download/32/cromwell-32.jar 7 | fi 8 | CROMWELL_JAR=cromwell-32.jar 9 | BACKEND_CONF=../backends/backend_with_db.conf 10 | BACKEND=google 11 | GC_PROJ=encode-dcc-1016 12 | GC_ROOT=gs://encode-pipeline-test-runs 13 | 14 | java -Dconfig.file=${BACKEND_CONF} -Dbackend.default=${BACKEND} -Dbackend.providers.google.config.project=${GC_PROJ} \ 15 | -Dbackend.providers.google.config.root=${GC_ROOT} -jar ${CROMWELL_JAR} server 16 | -------------------------------------------------------------------------------- /dev/test/test_py/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ENCODE-DCC/atac-seq-pipeline/47ba8dff9c332e24b48e767303e9fcac98589cf2/dev/test/test_py/__init__.py -------------------------------------------------------------------------------- /dev/test/test_task/.gitignore: -------------------------------------------------------------------------------- 1 | atac-seq-pipeline-test-data 2 | *.result.json 3 | *.metadata.json 4 | *wf_opt.json 5 | cromwell*.jar 6 | *.fasta 7 | *.fa 8 | *.gz 9 | *.docker.json 10 | -------------------------------------------------------------------------------- /dev/test/test_task/compare_md5sum.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | 3 | task compare_md5sum { 4 | input { 5 | Array[String] labels 6 | Array[File] files 7 | Array[File] ref_files 8 | } 9 | command <<< 10 | python <>> 107 | output { 108 | Map[String,String] match = read_map('match.tsv') # key:label, val:match 109 | Boolean match_overall = 
read_boolean('match_overall.txt') 110 | File json = glob('result.json')[0] # details (json file) 111 | String json_str = read_string('result.json') # details (string) 112 | } 113 | runtime { 114 | cpu : 1 115 | memory : '4000 MB' 116 | time : 1 117 | disks : 'local-disk 50 HDD' 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /dev/test/test_task/test_annot_enrich.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_annot_enrich.blacklist" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38_chr19_chrM/hg38.blacklist.bed.gz", 3 | "test_annot_enrich.dnase" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_all_p10_ucsc.hg19_to_hg38.bed.gz", 4 | "test_annot_enrich.prom" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_prom_p2.hg19_to_hg38.bed.gz", 5 | "test_annot_enrich.enh" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38/ataqc/reg2map_honeybadger2_dnase_enh_p2.hg19_to_hg38.bed.gz", 6 | 7 | "test_annot_enrich.ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/ataqc/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 8 | 9 | "test_annot_enrich.ref_annot_enrich_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_annot_enrich/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.annot_enrich.qc" 10 | } 11 | -------------------------------------------------------------------------------- /dev/test/test_task/test_annot_enrich.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | 
import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | 6 | 7 | workflow test_annot_enrich { 8 | input { 9 | File ta 10 | File blacklist 11 | File dnase 12 | File prom 13 | File enh 14 | File ref_annot_enrich_qc 15 | String docker 16 | } 17 | RuntimeEnvironment runtime_environment = { 18 | "docker": docker, 19 | "singularity": "", 20 | "conda": "" 21 | } 22 | 23 | call atac.annot_enrich { input : 24 | ta = ta, 25 | blacklist = blacklist, 26 | dnase = dnase, 27 | prom = prom, 28 | enh = enh, 29 | runtime_environment = runtime_environment, 30 | } 31 | 32 | call compare_md5sum.compare_md5sum { input : 33 | labels = [ 34 | 'test_annot_enrich_qc', 35 | ], 36 | files = [ 37 | annot_enrich.annot_enrich_qc, 38 | ], 39 | ref_files = [ 40 | ref_annot_enrich_qc, 41 | ], 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /dev/test/test_task/test_bam2ta.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_bam2ta.pe_nodup_bam" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/nodup_bams/rep1/ENCFF341MYG.subsampled.400.trim.merged.nodup.bam", 3 | "test_bam2ta.se_nodup_bam" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/nodup_bams/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.bam", 4 | 5 | "test_bam2ta.ref_pe_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bam2ta/pe/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 6 | "test_bam2ta.ref_pe_ta_disable_tn5_shift" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bam2ta/pe/disable_tn5_shift/ENCFF341MYG.subsampled.400.trim.merged.nodup.tagAlign.gz", 7 | "test_bam2ta.ref_pe_ta_subsample" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bam2ta/pe/subsample/fix_PIP-917/ENCFF341MYG.subsampled.400.trim.merged.nodup.5K.tn5.tagAlign.gz", 8 | 9 | "test_bam2ta.ref_se_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bam2ta/se/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 10 | "test_bam2ta.ref_se_ta_disable_tn5_shift" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bam2ta/se/disable_tn5_shift/ENCFF439VSY.subsampled.400.trim.merged.nodup.tagAlign.gz", 11 | "test_bam2ta.ref_se_ta_subsample" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bam2ta/se/subsample/ENCFF439VSY.subsampled.400.trim.merged.nodup.5K.tn5.tagAlign.gz", 12 | 13 | "test_bam2ta.bam2ta_subsample" : 5000 14 | } 15 | -------------------------------------------------------------------------------- /dev/test/test_task/test_bam2ta.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_bam2ta { 6 | input { 7 | Int bam2ta_subsample 8 | 9 | String pe_nodup_bam 10 | String se_nodup_bam 11 | 12 | String ref_pe_ta 13 | String ref_pe_ta_disable_tn5_shift 14 | String ref_pe_ta_subsample 15 | String ref_se_ta 16 | String ref_se_ta_disable_tn5_shift 17 | String ref_se_ta_subsample 18 | String mito_chr_name = 'chrM' 19 | 20 | Int bam2ta_cpu = 1 21 | Float bam2ta_mem_factor = 0.0 22 | Int bam2ta_time_hr = 6 23 | Float bam2ta_disk_factor = 4.0 24 | String docker 25 | } 26 | RuntimeEnvironment runtime_environment = { 27 | "docker": docker, 28 | "singularity": "", 29 | "conda": "" 30 | } 31 | 32 | call atac.bam2ta as pe_bam2ta { input : 33 | bam = pe_nodup_bam, 34 | disable_tn5_shift = false, 35 | subsample = 0, 36 
| paired_end = true, 37 | mito_chr_name = mito_chr_name, 38 | 39 | cpu = bam2ta_cpu, 40 | mem_factor = bam2ta_mem_factor, 41 | time_hr = bam2ta_time_hr, 42 | disk_factor = bam2ta_disk_factor, 43 | runtime_environment = runtime_environment, 44 | } 45 | call atac.bam2ta as pe_bam2ta_disable_tn5_shift { input : 46 | bam = pe_nodup_bam, 47 | disable_tn5_shift = true, 48 | subsample = 0, 49 | paired_end = true, 50 | mito_chr_name = mito_chr_name, 51 | 52 | cpu = bam2ta_cpu, 53 | mem_factor = bam2ta_mem_factor, 54 | time_hr = bam2ta_time_hr, 55 | disk_factor = bam2ta_disk_factor, 56 | runtime_environment = runtime_environment, 57 | } 58 | call atac.bam2ta as pe_bam2ta_subsample { input : 59 | bam = pe_nodup_bam, 60 | disable_tn5_shift = false, 61 | subsample = bam2ta_subsample, 62 | paired_end = true, 63 | mito_chr_name = mito_chr_name, 64 | 65 | cpu = bam2ta_cpu, 66 | mem_factor = bam2ta_mem_factor, 67 | time_hr = bam2ta_time_hr, 68 | disk_factor = bam2ta_disk_factor, 69 | runtime_environment = runtime_environment, 70 | } 71 | call atac.bam2ta as se_bam2ta { input : 72 | bam = se_nodup_bam, 73 | disable_tn5_shift = false, 74 | subsample = 0, 75 | paired_end = false, 76 | mito_chr_name = mito_chr_name, 77 | 78 | cpu = bam2ta_cpu, 79 | mem_factor = bam2ta_mem_factor, 80 | time_hr = bam2ta_time_hr, 81 | disk_factor = bam2ta_disk_factor, 82 | runtime_environment = runtime_environment, 83 | } 84 | call atac.bam2ta as se_bam2ta_disable_tn5_shift { input : 85 | bam = se_nodup_bam, 86 | disable_tn5_shift = true, 87 | subsample = 0, 88 | paired_end = false, 89 | mito_chr_name = mito_chr_name, 90 | 91 | cpu = bam2ta_cpu, 92 | mem_factor = bam2ta_mem_factor, 93 | time_hr = bam2ta_time_hr, 94 | disk_factor = bam2ta_disk_factor, 95 | runtime_environment = runtime_environment, 96 | } 97 | call atac.bam2ta as se_bam2ta_subsample { input : 98 | bam = se_nodup_bam, 99 | disable_tn5_shift = false, 100 | subsample = bam2ta_subsample, 101 | paired_end = false, 102 | mito_chr_name = 
mito_chr_name, 103 | 104 | cpu = bam2ta_cpu, 105 | mem_factor = bam2ta_mem_factor, 106 | time_hr = bam2ta_time_hr, 107 | disk_factor = bam2ta_disk_factor, 108 | runtime_environment = runtime_environment, 109 | } 110 | 111 | call compare_md5sum.compare_md5sum { input : 112 | labels = [ 113 | 'pe_bam2ta', 114 | 'pe_bam2ta_disable_tn5_shift', 115 | 'pe_bam2ta_subsample', 116 | 'se_bam2ta', 117 | 'se_bam2ta_disable_tn5_shift', 118 | 'se_bam2ta_subsample', 119 | ], 120 | files = [ 121 | pe_bam2ta.ta, 122 | pe_bam2ta_disable_tn5_shift.ta, 123 | pe_bam2ta_subsample.ta, 124 | se_bam2ta.ta, 125 | se_bam2ta_disable_tn5_shift.ta, 126 | se_bam2ta_subsample.ta, 127 | ], 128 | ref_files = [ 129 | ref_pe_ta, 130 | ref_pe_ta_disable_tn5_shift, 131 | ref_pe_ta_subsample, 132 | ref_se_ta, 133 | ref_se_ta_disable_tn5_shift, 134 | ref_se_ta_subsample, 135 | ], 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /dev/test/test_task/test_bowtie2.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_bowtie2.pe_fastqs_R1" : [ 3 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/fastqs/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 4 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/fastqs/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 5 | ], 6 | "test_bowtie2.pe_fastqs_R2" : [ 7 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/fastqs/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 8 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/fastqs/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 9 | ], 10 | "test_bowtie2.se_fastqs_R1" : [ 11 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/fastqs/rep1/ENCFF439VSY.subsampled.400.fastq.gz", 12 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/fastqs/rep1/ENCFF325FCQ.subsampled.400.fastq.gz", 13 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/fastqs/rep1/ENCFF683IQS.subsampled.400.fastq.gz", 14 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/fastqs/rep1/ENCFF744CHW.subsampled.400.fastq.gz" 15 | ], 16 | "test_bowtie2.pe_bowtie2_idx_tar" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38_chr19_chrM/bowtie2_index/GRCh38_no_alt_analysis_set_GCA_000001405.15.chr19_chrM.fasta.tar", 17 | "test_bowtie2.se_bowtie2_idx_tar" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/bowtie2_index/mm10_no_alt_analysis_set_ENCODE.chr19_chrM.fasta.tar", 18 | 19 | "test_bowtie2.se_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10_chr19_chrM.chrom.sizes", 20 | "test_bowtie2.pe_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38_chr19_chrM/hg38_chr19_chrM.chrom.sizes", 21 | 22 | "test_bowtie2.ref_pe_flagstat" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bowtie2/pe/multimapping/merge_fastqs_R1_ENCFF341MYG.subsampled.400.trim.merged.samstats.qc", 23 | "test_bowtie2.ref_pe_flagstat_no_multimapping" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bowtie2/pe/no_multimapping/merge_fastqs_R1_ENCFF341MYG.subsampled.400.trim.merged.samstats.qc", 24 | "test_bowtie2.ref_se_flagstat" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bowtie2/se/multimapping/merge_fastqs_R1_ENCFF439VSY.subsampled.400.trim.merged.samstats.qc", 25 | "test_bowtie2.ref_se_flagstat_no_multimapping" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_bowtie2/se/no_multimapping/merge_fastqs_R1_ENCFF439VSY.subsampled.400.trim.merged.samstats.qc", 26 | 27 | "test_bowtie2.multimapping" : 4 28 | } 29 | -------------------------------------------------------------------------------- /dev/test/test_task/test_bowtie2.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_bowtie2 { 6 | input { 7 | Int multimapping 8 | 9 | Array[String] pe_fastqs_R1 10 | Array[String] pe_fastqs_R2 11 | Array[String] se_fastqs_R1 12 | 13 | String se_chrsz 14 | String pe_chrsz 15 | String cutadapt_param = '-e 0.1 -m 5' 16 | 17 | # we don't compare BAM because BAM's header includes date 18 | # hence md5sums don't match all the time 19 | String ref_pe_flagstat 20 | String ref_pe_flagstat_no_multimapping 21 | 22 | String ref_se_flagstat 23 | String ref_se_flagstat_no_multimapping 24 | 25 | String pe_bowtie2_idx_tar 26 | String se_bowtie2_idx_tar 27 | 28 | Int bowtie2_cpu = 1 29 | Float bowtie2_mem_factor = 0.0 30 | Int bowtie2_time_hr = 48 31 | Float bowtie2_disk_factor = 6.0 32 | String docker 33 | } 34 | RuntimeEnvironment runtime_environment = { 35 | "docker": docker, 36 | "singularity": "", 37 | "conda": "" 38 | } 39 | 40 | call atac.align as pe_bowtie2 { input : 41 | aligner = 'bowtie2', 42 | idx_tar = pe_bowtie2_idx_tar, 43 | mito_chr_name = 'chrM', 44 | fastqs_R1 = pe_fastqs_R1, 45 | fastqs_R2 = pe_fastqs_R2, 46 | adapters_R1 = [], 47 | adapters_R2 = [], 48 | cutadapt_param = cutadapt_param, 49 | multimapping = multimapping, 50 | 
paired_end = true, 51 | chrsz = pe_chrsz, 52 | auto_detect_adapter = true, 53 | 54 | cpu = bowtie2_cpu, 55 | mem_factor = bowtie2_mem_factor, 56 | time_hr = bowtie2_time_hr, 57 | disk_factor = bowtie2_disk_factor, 58 | runtime_environment = runtime_environment, 59 | } 60 | call atac.align as pe_bowtie2_no_multimapping { input : 61 | aligner = 'bowtie2', 62 | idx_tar = pe_bowtie2_idx_tar, 63 | mito_chr_name = 'chrM', 64 | fastqs_R1 = pe_fastqs_R1, 65 | fastqs_R2 = pe_fastqs_R2, 66 | adapters_R1 = [], 67 | adapters_R2 = [], 68 | cutadapt_param = cutadapt_param, 69 | multimapping = 0, 70 | paired_end = true, 71 | chrsz = pe_chrsz, 72 | auto_detect_adapter = true, 73 | 74 | cpu = bowtie2_cpu, 75 | mem_factor = bowtie2_mem_factor, 76 | time_hr = bowtie2_time_hr, 77 | disk_factor = bowtie2_disk_factor, 78 | runtime_environment = runtime_environment, 79 | } 80 | call atac.align as se_bowtie2 { input : 81 | aligner = 'bowtie2', 82 | idx_tar = se_bowtie2_idx_tar, 83 | mito_chr_name = 'chrM', 84 | fastqs_R1 = se_fastqs_R1, 85 | fastqs_R2 = [], 86 | adapters_R1 = [], 87 | adapters_R2 = [], 88 | cutadapt_param = cutadapt_param, 89 | multimapping = multimapping, 90 | paired_end = false, 91 | chrsz = se_chrsz, 92 | auto_detect_adapter = true, 93 | 94 | cpu = bowtie2_cpu, 95 | mem_factor = bowtie2_mem_factor, 96 | time_hr = bowtie2_time_hr, 97 | disk_factor = bowtie2_disk_factor, 98 | runtime_environment = runtime_environment, 99 | } 100 | call atac.align as se_bowtie2_no_multimapping { input : 101 | aligner = 'bowtie2', 102 | idx_tar = se_bowtie2_idx_tar, 103 | mito_chr_name = 'chrM', 104 | fastqs_R1 = se_fastqs_R1, 105 | fastqs_R2 = [], 106 | adapters_R1 = [], 107 | adapters_R2 = [], 108 | cutadapt_param = cutadapt_param, 109 | multimapping = 0, 110 | paired_end = false, 111 | chrsz = se_chrsz, 112 | auto_detect_adapter = true, 113 | 114 | cpu = bowtie2_cpu, 115 | mem_factor = bowtie2_mem_factor, 116 | time_hr = bowtie2_time_hr, 117 | disk_factor = bowtie2_disk_factor, 118 | 
runtime_environment = runtime_environment, 119 | } 120 | 121 | call compare_md5sum.compare_md5sum { input : 122 | labels = [ 123 | 'pe_bowtie2', 124 | 'pe_bowtie2_no_multimapping', 125 | 'se_bowtie2', 126 | 'se_bowtie2_no_multimapping', 127 | ], 128 | files = [ 129 | pe_bowtie2.samstat_qc, 130 | pe_bowtie2_no_multimapping.samstat_qc, 131 | se_bowtie2.samstat_qc, 132 | se_bowtie2_no_multimapping.samstat_qc, 133 | ], 134 | ref_files = [ 135 | ref_pe_flagstat, 136 | ref_pe_flagstat_no_multimapping, 137 | ref_se_flagstat, 138 | ref_se_flagstat_no_multimapping, 139 | ], 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /dev/test/test_task/test_compare_signal_to_roadmap.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_compare_signal_to_roadmap.reg2map_bed" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38/ataqc/hg38_celltype_compare_subsample.bed.gz", 3 | "test_compare_signal_to_roadmap.reg2map" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_formatted.txt.gz", 4 | "test_compare_signal_to_roadmap.roadmap_meta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38/ataqc/hg38_dnase_avg_fseq_signal_metadata.txt", 5 | 6 | "test_compare_signal_to_roadmap.pval_bw" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/ataqc/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.pval.signal.bigwig", 7 | 8 | "test_compare_signal_to_roadmap.ref_roadmap_compare_log" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_compare_signal_to_roadmap/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.pval.signal.roadmap_compare.log" 9 | } 10 | 
-------------------------------------------------------------------------------- /dev/test/test_task/test_compare_signal_to_roadmap.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_compare_signal_to_roadmap { 6 | input { 7 | File pval_bw 8 | File reg2map_bed 9 | File reg2map 10 | File roadmap_meta 11 | 12 | File ref_roadmap_compare_log 13 | String docker 14 | } 15 | RuntimeEnvironment runtime_environment = { 16 | "docker": docker, 17 | "singularity": "", 18 | "conda": "" 19 | } 20 | 21 | call atac.compare_signal_to_roadmap { input : 22 | pval_bw = pval_bw, 23 | 24 | reg2map_bed = reg2map_bed, 25 | reg2map = reg2map, 26 | roadmap_meta = roadmap_meta, 27 | runtime_environment = runtime_environment, 28 | } 29 | 30 | call compare_md5sum.compare_md5sum { input : 31 | labels = [ 32 | 'ref_roadmap_compare_log', 33 | ], 34 | files = [ 35 | compare_signal_to_roadmap.roadmap_compare_log, 36 | ], 37 | ref_files = [ 38 | ref_roadmap_compare_log, 39 | ], 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /dev/test/test_task/test_count_signal_track.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_count_signal_track.se_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38_chr19_chrM/hg38_chr19_chrM.chrom.sizes", 3 | 4 | "test_count_signal_track.se_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 5 | 6 | "test_count_signal_track.ref_se_count_signal_track_pos_bw" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_count_signal_track/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.positive.bigwig", 7 | "test_count_signal_track.ref_se_count_signal_track_neg_bw" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_count_signal_track/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.negative.bigwig" 8 | } 9 | -------------------------------------------------------------------------------- /dev/test/test_task/test_count_signal_track.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_count_signal_track { 6 | input { 7 | String se_ta 8 | 9 | String ref_se_count_signal_track_pos_bw 10 | String ref_se_count_signal_track_neg_bw 11 | 12 | String se_chrsz 13 | String docker 14 | } 15 | RuntimeEnvironment runtime_environment = { 16 | "docker": docker, 17 | "singularity": "", 18 | "conda": "" 19 | } 20 | 21 | call atac.count_signal_track as se_count_signal_track { input : 22 | ta = se_ta, 23 | chrsz = se_chrsz, 24 | runtime_environment = runtime_environment, 25 | } 26 | 27 | call compare_md5sum.compare_md5sum { input : 28 | labels = [ 29 | 'se_count_signal_track_pos_bw', 30 | 'se_count_signal_track_neg_bw', 31 | ], 32 | files = [ 33 | se_count_signal_track.pos_bw, 34 | se_count_signal_track.neg_bw, 35 | ], 36 | ref_files = [ 37 | ref_se_count_signal_track_pos_bw, 38 | ref_se_count_signal_track_neg_bw, 39 | ], 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /dev/test/test_task/test_filter.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_filter.se_chrsz" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10_chr19_chrM.chrom.sizes", 3 | "test_filter.pe_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38_chr19_chrM/hg38_chr19_chrM.chrom.sizes", 4 | "test_filter.pe_bam" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/bams/rep1/ENCFF341MYG.subsampled.400.trim.merged.bam", 5 | "test_filter.pe_bam_no_multimapping" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/bams_no_multimapping/rep1/ENCFF341MYG.subsampled.400.trim.merged.bam", 6 | "test_filter.se_bam" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/bams/rep1/ENCFF439VSY.subsampled.400.trim.merged.bam", 7 | "test_filter.se_bam_no_multimapping" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/bams_no_multimapping/rep1/ENCFF439VSY.subsampled.400.trim.merged.bam", 8 | 9 | "test_filter.ref_pe_nodup_samstat_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_filter/pe/multimapping/ENCFF341MYG.subsampled.400.trim.merged.nodup.no_chrM.samstats.qc", 10 | "test_filter.ref_pe_nodup_samstat_qc_no_multimapping" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_filter/pe/no_multimapping/ENCFF341MYG.subsampled.400.trim.merged.nodup.no_chrM.samstats.qc", 11 | "test_filter.ref_pe_filt_samstat_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_filter/pe/no_dup_removal/ENCFF341MYG.subsampled.400.trim.merged.filt.no_chrM.samstats.qc", 12 | 13 | "test_filter.ref_se_nodup_samstat_qc" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_filter/se/multimapping/ENCFF439VSY.subsampled.400.trim.merged.nodup.no_chrM.samstats.qc", 14 | "test_filter.ref_se_nodup_samstat_qc_no_multimapping" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_filter/se/no_multimapping/ENCFF439VSY.subsampled.400.trim.merged.nodup.no_chrM.samstats.qc", 15 | "test_filter.ref_se_filt_samstat_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_filter/se/no_dup_removal/ENCFF439VSY.subsampled.400.trim.merged.filt.no_chrM.samstats.qc", 16 | 17 | "test_filter.multimapping" : 4 18 | } 19 | -------------------------------------------------------------------------------- /dev/test/test_task/test_frac_mito.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_frac_mito.non_mito_samstat" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/samstat_qcs/ENCFF341MYG.subsampled.400.trim.merged.non_mito.samstats.qc", 3 | "test_frac_mito.mito_samstat" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/samstat_qcs/ENCFF341MYG.subsampled.400.trim.merged.samstats.qc", 4 | "test_frac_mito.ref_frac_mito_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_frac_mito/ENCFF341MYG.subsampled.400.trim.merged.frac_mito.qc" 5 | } 6 | -------------------------------------------------------------------------------- /dev/test/test_task/test_frac_mito.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_frac_mito { 6 | input { 7 | File non_mito_samstat 8 | File mito_samstat 9 | 10 | File 
ref_frac_mito_qc 11 | String docker 12 | } 13 | RuntimeEnvironment runtime_environment = { 14 | "docker": docker, 15 | "singularity": "", 16 | "conda": "" 17 | } 18 | 19 | call atac.frac_mito as frac_mito { input: 20 | non_mito_samstat = non_mito_samstat, 21 | mito_samstat = mito_samstat, 22 | runtime_environment = runtime_environment, 23 | } 24 | 25 | call compare_md5sum.compare_md5sum { input : 26 | labels = [ 27 | 'frac_mito', 28 | ], 29 | files = [ 30 | frac_mito.frac_mito_qc, 31 | ], 32 | ref_files = [ 33 | ref_frac_mito_qc, 34 | ], 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /dev/test/test_task/test_fraglen_stat_pe.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_fraglen_stat_pe.nodup_bam" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/ataqc/ENCFF341MYG.subsampled.400.trim.merged.nodup.bam", 3 | 4 | "test_fraglen_stat_pe.ref_nucleosomal_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_fraglen_stat_pe/ENCFF341MYG.subsampled.400.trim.merged.nodup.nucleosomal.qc" 5 | } 6 | -------------------------------------------------------------------------------- /dev/test/test_task/test_fraglen_stat_pe.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_fraglen_stat_pe { 6 | input { 7 | File nodup_bam 8 | 9 | File ref_nucleosomal_qc 10 | String docker 11 | } 12 | RuntimeEnvironment runtime_environment = { 13 | "docker": docker, 14 | "singularity": "", 15 | "conda": "" 16 | } 17 | 18 | call atac.fraglen_stat_pe { input : 19 | nodup_bam = nodup_bam, 20 | picard_java_heap = '4G', 21 | runtime_environment = runtime_environment, 22 | } 23 | 24 | call compare_md5sum.compare_md5sum { input : 25 | 
labels = [ 26 | 'test_nucleosomal_qc', 27 | ], 28 | files = [ 29 | fraglen_stat_pe.nucleosomal_qc, 30 | ], 31 | ref_files = [ 32 | ref_nucleosomal_qc, 33 | ], 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /dev/test/test_task/test_gc_bias.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_gc_bias.ref_fa" : "GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta.gz", 3 | "test_gc_bias.nodup_bam" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/ataqc/ENCFF341MYG.subsampled.400.trim.merged.nodup.bam", 4 | 5 | "test_gc_bias.ref_gc_log" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_gc_bias/ENCFF341MYG.subsampled.400.trim.merged.nodup.gc.txt" 6 | } 7 | -------------------------------------------------------------------------------- /dev/test/test_task/test_gc_bias.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_gc_bias { 6 | input { 7 | File nodup_bam 8 | 9 | File ref_fa 10 | 11 | File ref_gc_log 12 | String docker 13 | } 14 | RuntimeEnvironment runtime_environment = { 15 | "docker": docker, 16 | "singularity": "", 17 | "conda": "" 18 | } 19 | 20 | call atac.gc_bias { input : 21 | nodup_bam = nodup_bam, 22 | ref_fa = ref_fa, 23 | picard_java_heap = '4G', 24 | runtime_environment = runtime_environment, 25 | } 26 | 27 | call remove_comments_from_gc_log { input : 28 | gc_log = gc_bias.gc_log 29 | } 30 | 31 | call remove_comments_from_gc_log as remove_comments_from_gc_log_ref { input : 32 | gc_log = ref_gc_log 33 | } 34 | 35 | call compare_md5sum.compare_md5sum { input : 36 | labels = [ 37 | 'test_gc_log', 38 | ], 39 | files = [ 40 | remove_comments_from_gc_log.filt_gc_log, 41 | ], 42 | ref_files = [ 43 | 
remove_comments_from_gc_log_ref.filt_gc_log, 44 | ], 45 | } 46 | } 47 | 48 | task remove_comments_from_gc_log { 49 | input { 50 | File gc_log 51 | } 52 | command { 53 | zcat -f ${gc_log} | grep -v '# ' \ 54 | > ${basename(gc_log) + '.date_filt_out'} 55 | } 56 | output { 57 | File filt_gc_log = glob('*.date_filt_out')[0] 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /dev/test/test_task/test_idr.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_idr.se_blacklist" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10.blacklist.bed.gz", 3 | "test_idr.se_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10_chr19_chrM.chrom.sizes", 4 | 5 | "test_idr.se_peak_rep1" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/peaks/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pval0.01.300K.narrowPeak.gz", 6 | "test_idr.se_peak_rep2" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/peaks/rep2/ENCFF463QCX.subsampled.400.trim.merged.nodup.tn5.pval0.01.300K.narrowPeak.gz", 7 | "test_idr.se_peak_pooled" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/peaks/pooled_rep/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pooled.pval0.01.300K.narrowPeak.gz", 8 | "test_idr.se_ta_pooled" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/pooled_rep/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pooled.tagAlign.gz", 9 | 10 | "test_idr.ref_se_idr_peak" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_idr/rep1-rep2.idr0.05.narrowPeak.gz", 11 | "test_idr.ref_se_idr_bfilt_peak" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_idr/rep1-rep2.idr0.05.bfilt.narrowPeak.gz", 12 | "test_idr.ref_se_idr_frip_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_idr/rep1-rep2.idr0.05.bfilt.frip.qc", 13 | 14 | "test_idr.idr_thresh" : 0.05 15 | } 16 | -------------------------------------------------------------------------------- /dev/test/test_task/test_idr.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_idr { 6 | input { 7 | Float idr_thresh 8 | 9 | String se_peak_rep1 10 | String se_peak_rep2 11 | String se_peak_pooled 12 | String se_ta_pooled 13 | 14 | String ref_se_idr_peak 15 | String ref_se_idr_bfilt_peak 16 | String ref_se_idr_frip_qc 17 | 18 | String se_blacklist 19 | String se_chrsz 20 | 21 | String regex_bfilt_peak_chr_name = 'chr[\\dXY]+' 22 | String docker 23 | } 24 | RuntimeEnvironment runtime_environment = { 25 | "docker": docker, 26 | "singularity": "", 27 | "conda": "" 28 | } 29 | 30 | call atac.idr as se_idr { input : 31 | prefix = 'rep1-rep2', 32 | peak1 = se_peak_rep1, 33 | peak2 = se_peak_rep2, 34 | peak_pooled = se_peak_pooled, 35 | idr_thresh = idr_thresh, 36 | peak_type = 'narrowPeak', 37 | rank = 'p.value', 38 | blacklist = se_blacklist, 39 | chrsz = se_chrsz, 40 | regex_bfilt_peak_chr_name = regex_bfilt_peak_chr_name, 41 | ta = se_ta_pooled, 42 | runtime_environment = runtime_environment, 43 | } 44 | 45 | call compare_md5sum.compare_md5sum { input : 46 | labels = [ 47 | 'se_idr_peak', 48 | 'se_idr_bfilt_peak', 49 | 'se_idr_frip_qc', 50 | ], 51 | files = [se_idr.idr_peak, 52 | se_idr.bfilt_idr_peak, 53 | se_idr.frip_qc, 54 | ], 55 | ref_files = [ 56 | ref_se_idr_peak, 57 | ref_se_idr_bfilt_peak, 58 | ref_se_idr_frip_qc, 59 | ], 60 | } 61 | } 62 | 
-------------------------------------------------------------------------------- /dev/test/test_task/test_jsd.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_jsd.se_nodup_bams" : [ 3 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/nodup_bams/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.bam" 4 | ], 5 | "test_jsd.se_blacklist" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10.blacklist.bed.gz", 6 | 7 | "test_jsd.se_fake_blacklist" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/fake_blacklist/mm10.whole_chr19.blacklist.bed.gz", 8 | 9 | "test_jsd.ref_se_jsd_logs" : [ 10 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_jsd/rep1.ENCFF439VSY.subsampled.400.trim.merged.nodup.bfilt.jsd.qc" 11 | ], 12 | "test_jsd.ref_se_jsd_fake_blacklist_logs" : [ 13 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_jsd/fake_blacklist/rep1.ENCFF439VSY.subsampled.400.trim.merged.nodup.bfilt.jsd.qc" 14 | ] 15 | } 16 | -------------------------------------------------------------------------------- /dev/test/test_task/test_jsd.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_jsd { 6 | input { 7 | Array[File] se_nodup_bams 8 | File se_blacklist 9 | File se_fake_blacklist 10 | Array[File] ref_se_jsd_logs 11 | Array[File] ref_se_jsd_fake_blacklist_logs 12 | # task level test data (BAM) is generated from BWA 13 | # so we keep using 30 here, this should be 255 for bowtie2 BAMs 14 | Int mapq_thresh = 30 15 | 16 | Int jsd_cpu = 1 17 | Float jsd_mem_factor = 0.0 18 | Int jsd_time_hr = 12 19 | Float 
jsd_disk_factor = 2.0 20 | String docker 21 | } 22 | RuntimeEnvironment runtime_environment = { 23 | "docker": docker, 24 | "singularity": "", 25 | "conda": "" 26 | } 27 | 28 | call atac.jsd as se_jsd { input : 29 | nodup_bams = se_nodup_bams, 30 | blacklist = se_blacklist, 31 | mapq_thresh = mapq_thresh, 32 | 33 | cpu = jsd_cpu, 34 | mem_factor = jsd_mem_factor, 35 | time_hr = jsd_time_hr, 36 | disk_factor = jsd_disk_factor, 37 | runtime_environment = runtime_environment, 38 | } 39 | 40 | call atac.jsd as se_jsd_fake_blacklist { input : 41 | nodup_bams = se_nodup_bams, 42 | blacklist = se_fake_blacklist, 43 | mapq_thresh = mapq_thresh, 44 | 45 | cpu = jsd_cpu, 46 | mem_factor = jsd_mem_factor, 47 | time_hr = jsd_time_hr, 48 | disk_factor = jsd_disk_factor, 49 | runtime_environment = runtime_environment, 50 | } 51 | 52 | # take first 8 columns (values in other columns are random) 53 | #scatter(i in range(2)){ 54 | # call take_8_cols { input : 55 | # f = se_jsd.jsd_qcs[i], 56 | # } 57 | # call take_8_cols as ref_take_8_cols { input : 58 | # f = ref_se_jsd_logs[i], 59 | # } 60 | #} 61 | 62 | call compare_md5sum.compare_md5sum { input : 63 | labels = [ 64 | 'se_jsd_rep1', 65 | 'se_jsd_fake_blacklist_rep1', 66 | ], 67 | files = [ 68 | #take_8_cols.out[0], 69 | se_jsd.jsd_qcs[0], 70 | se_jsd_fake_blacklist.jsd_qcs[0], 71 | ], 72 | ref_files = [ 73 | #ref_take_8_cols.out[0], 74 | ref_se_jsd_logs[0], 75 | ref_se_jsd_fake_blacklist_logs[0], 76 | ], 77 | } 78 | } 79 | 80 | task take_8_cols { 81 | input { 82 | File f 83 | } 84 | command { 85 | cut -f 1-8 ${f} > out.txt 86 | } 87 | output { 88 | File out = 'out.txt' 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /dev/test/test_task/test_macs2.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_macs2.se_blacklist" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10.blacklist.bed.gz", 3 | "test_macs2.se_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10_chr19_chrM.chrom.sizes", 4 | "test_macs2.se_gensz" : "mm", 5 | 6 | "test_macs2.se_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 7 | 8 | "test_macs2.ref_se_macs2_npeak" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_macs2/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pval0.01.300K.narrowPeak.gz", 9 | "test_macs2.ref_se_macs2_bfilt_npeak" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_macs2/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pval0.01.300K.bfilt.narrowPeak.gz", 10 | "test_macs2.ref_se_macs2_frip_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_macs2/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pval0.01.300K.bfilt.frip.qc", 11 | 12 | "test_macs2.cap_num_peak" : 300000, 13 | "test_macs2.pval_thresh" : 0.01, 14 | "test_macs2.smooth_win" : 150 15 | } 16 | -------------------------------------------------------------------------------- /dev/test/test_task/test_macs2.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_macs2 { 6 | input { 7 | Int cap_num_peak 8 | Float pval_thresh 9 | Int smooth_win 10 | 11 | # test macs2 for SE set only 12 | String se_ta 13 | 14 | String ref_se_macs2_npeak # raw narrow-peak 15 | String ref_se_macs2_bfilt_npeak # blacklist filtered narrow-peak 16 | String ref_se_macs2_frip_qc 
17 | 18 | String se_blacklist 19 | String se_chrsz 20 | String se_gensz 21 | 22 | String regex_bfilt_peak_chr_name = 'chr[\\dXY]+' 23 | 24 | Float macs2_mem_factor = 2.0 25 | Int macs2_time_hr = 24 26 | Float macs2_disk_factor = 15.0 27 | String docker 28 | } 29 | RuntimeEnvironment runtime_environment = { 30 | "docker": docker, 31 | "singularity": "", 32 | "conda": "" 33 | } 34 | 35 | call atac.call_peak as se_macs2 { input : 36 | peak_caller = 'macs2', 37 | peak_type = 'narrowPeak', 38 | ta = se_ta, 39 | gensz = se_gensz, 40 | chrsz = se_chrsz, 41 | cap_num_peak = cap_num_peak, 42 | pval_thresh = pval_thresh, 43 | smooth_win = smooth_win, 44 | blacklist = se_blacklist, 45 | regex_bfilt_peak_chr_name = regex_bfilt_peak_chr_name, 46 | 47 | cpu = 2, 48 | mem_factor = macs2_mem_factor, 49 | time_hr = macs2_time_hr, 50 | disk_factor = macs2_disk_factor, 51 | runtime_environment = runtime_environment, 52 | } 53 | 54 | call compare_md5sum.compare_md5sum { input : 55 | labels = [ 56 | 'se_macs2_npeak', 57 | 'se_macs2_bfilt_npeak', 58 | 'se_macs2_frip_qc', 59 | ], 60 | files = [ 61 | se_macs2.peak, 62 | se_macs2.bfilt_peak, 63 | se_macs2.frip_qc, 64 | ], 65 | ref_files = [ 66 | ref_se_macs2_npeak, 67 | ref_se_macs2_bfilt_npeak, 68 | ref_se_macs2_frip_qc, 69 | ], 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /dev/test/test_task/test_macs2_signal_track.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_macs2_signal_track.se_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10_chr19_chrM.chrom.sizes", 3 | "test_macs2_signal_track.se_gensz" : "mm", 4 | 5 | "test_macs2_signal_track.se_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 6 | 7 | 
"test_macs2_signal_track.ref_se_macs2_pval_bw" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_macs2/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pval.signal.bigwig", 8 | 9 | "test_macs2_signal_track.pval_thresh" : 0.01, 10 | "test_macs2_signal_track.smooth_win" : 150 11 | } 12 | -------------------------------------------------------------------------------- /dev/test/test_task/test_macs2_signal_track.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_macs2_signal_track { 6 | input { 7 | Float pval_thresh 8 | Int smooth_win 9 | 10 | # test macs2 for SE set only 11 | String se_ta 12 | 13 | String ref_se_macs2_pval_bw # p-val signal 14 | 15 | String se_chrsz 16 | String se_gensz 17 | 18 | Float macs2_mem_factor = 0.0 19 | Int macs2_time_hr = 24 20 | Float macs2_disk_factor = 40.0 21 | String docker 22 | } 23 | RuntimeEnvironment runtime_environment = { 24 | "docker": docker, 25 | "singularity": "", 26 | "conda": "" 27 | } 28 | 29 | call atac.macs2_signal_track as se_macs2_signal_track { input : 30 | ta = se_ta, 31 | gensz = se_gensz, 32 | chrsz = se_chrsz, 33 | pval_thresh = pval_thresh, 34 | smooth_win = smooth_win, 35 | 36 | mem_factor = macs2_mem_factor, 37 | time_hr = macs2_time_hr, 38 | disk_factor = macs2_disk_factor, 39 | runtime_environment = runtime_environment, 40 | } 41 | 42 | call compare_md5sum.compare_md5sum { input : 43 | labels = [ 44 | 'se_macs2_pval_bw', 45 | ], 46 | files = [ 47 | se_macs2_signal_track.pval_bw, 48 | ], 49 | ref_files = [ 50 | ref_se_macs2_pval_bw, 51 | ], 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /dev/test/test_task/test_overlap.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_overlap.se_blacklist" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10.blacklist.bed.gz", 3 | "test_overlap.se_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10_chr19_chrM.chrom.sizes", 4 | 5 | "test_overlap.se_peak_rep1" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/peaks/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pval0.01.300K.narrowPeak.gz", 6 | "test_overlap.se_peak_rep2" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/peaks/rep2/ENCFF463QCX.subsampled.400.trim.merged.nodup.tn5.pval0.01.300K.narrowPeak.gz", 7 | "test_overlap.se_peak_pooled" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/peaks/pooled_rep/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pooled.pval0.01.300K.narrowPeak.gz", 8 | "test_overlap.se_ta_pooled" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/pooled_rep/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pooled.tagAlign.gz", 9 | 10 | "test_overlap.ref_se_overlap_peak" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_overlap/rep1-rep2.overlap.narrowPeak.gz", 11 | "test_overlap.ref_se_overlap_bfilt_peak" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_overlap/rep1-rep2.overlap.bfilt.narrowPeak.gz", 12 | "test_overlap.ref_se_overlap_frip_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_overlap/rep1-rep2.overlap.bfilt.frip.qc" 13 | } 14 | -------------------------------------------------------------------------------- /dev/test/test_task/test_overlap.wdl: 
-------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_overlap { 6 | input { 7 | String se_peak_rep1 # test overlap,idr for SE set only 8 | String se_peak_rep2 9 | String se_peak_pooled 10 | String se_ta_pooled 11 | 12 | String ref_se_overlap_peak 13 | String ref_se_overlap_bfilt_peak 14 | String ref_se_overlap_frip_qc 15 | 16 | String se_blacklist 17 | String se_chrsz 18 | 19 | String regex_bfilt_peak_chr_name = 'chr[\\dXY]+' 20 | String docker 21 | } 22 | RuntimeEnvironment runtime_environment = { 23 | "docker": docker, 24 | "singularity": "", 25 | "conda": "" 26 | } 27 | 28 | call atac.overlap as se_overlap { input : 29 | prefix = 'rep1-rep2', 30 | peak1 = se_peak_rep1, 31 | peak2 = se_peak_rep2, 32 | peak_pooled = se_peak_pooled, 33 | peak_type = 'narrowPeak', 34 | blacklist = se_blacklist, 35 | regex_bfilt_peak_chr_name = regex_bfilt_peak_chr_name, 36 | chrsz = se_chrsz, 37 | ta = se_ta_pooled, 38 | runtime_environment = runtime_environment, 39 | } 40 | 41 | call compare_md5sum.compare_md5sum { input : 42 | labels = [ 43 | 'se_overlap_peak', 44 | 'se_overlap_bfilt_peak', 45 | 'se_overlap_frip_qc', 46 | ], 47 | files = [ 48 | se_overlap.overlap_peak, 49 | se_overlap.bfilt_overlap_peak, 50 | se_overlap.frip_qc, 51 | ], 52 | ref_files = [ 53 | ref_se_overlap_peak, 54 | ref_se_overlap_bfilt_peak, 55 | ref_se_overlap_frip_qc, 56 | ], 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /dev/test/test_task/test_pool_ta.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_pool_ta.se_ta_rep1" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 3 | "test_pool_ta.se_ta_rep2" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/rep2/ENCFF463QCX.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 4 | 5 | "test_pool_ta.ref_se_pooled_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_pool_ta/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pooled.tagAlign.gz" 6 | } 7 | -------------------------------------------------------------------------------- /dev/test/test_task/test_pool_ta.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_pool_ta { 6 | input { 7 | String se_ta_rep1 8 | String se_ta_rep2 9 | 10 | String ref_se_pooled_ta 11 | String docker 12 | } 13 | RuntimeEnvironment runtime_environment = { 14 | "docker": docker, 15 | "singularity": "", 16 | "conda": "" 17 | } 18 | 19 | call atac.pool_ta as se_pool_ta { input : 20 | tas = [se_ta_rep1, se_ta_rep2], 21 | runtime_environment = runtime_environment, 22 | } 23 | 24 | call compare_md5sum.compare_md5sum { input : 25 | labels = [ 26 | 'se_pool_ta', 27 | ], 28 | files = [ 29 | se_pool_ta.ta_pooled, 30 | ], 31 | ref_files = [ 32 | ref_se_pooled_ta, 33 | ], 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /dev/test/test_task/test_preseq.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_preseq.paired_end" : true, 3 | "test_preseq.bam" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/ataqc/ENCFF341MYG.subsampled.400.trim.merged.bam", 4 | 5 | "test_preseq.ref_picard_est_lib_size_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_preseq/ENCFF341MYG.subsampled.400.trim.merged.picard_est_lib_size.qc", 6 | 
"test_preseq.ref_preseq_log" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_preseq/ENCFF341MYG.subsampled.400.trim.merged.preseq.log" 7 | } 8 | -------------------------------------------------------------------------------- /dev/test/test_task/test_preseq.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_preseq { 6 | input { 7 | File bam 8 | Boolean paired_end 9 | 10 | File ref_picard_est_lib_size_qc 11 | File ref_preseq_log 12 | 13 | Float preseq_mem_factor = 0.0 14 | Float preseq_disk_factor = 5.0 15 | String docker 16 | } 17 | RuntimeEnvironment runtime_environment = { 18 | "docker": docker, 19 | "singularity": "", 20 | "conda": "" 21 | } 22 | 23 | call atac.preseq { input : 24 | paired_end = paired_end, 25 | bam = bam, 26 | mem_factor = preseq_mem_factor, 27 | disk_factor = preseq_disk_factor, 28 | picard_java_heap = '4G', 29 | runtime_environment = runtime_environment, 30 | } 31 | 32 | call compare_md5sum.compare_md5sum { input : 33 | labels = [ 34 | 'test_picard_est_lib_size_qc', 35 | 'test_preseq_log', 36 | ], 37 | files = select_all([ 38 | preseq.picard_est_lib_size_qc, 39 | preseq.preseq_log, 40 | ]), 41 | ref_files = [ 42 | ref_picard_est_lib_size_qc, 43 | ref_preseq_log, 44 | ], 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /dev/test/test_task/test_reproducibility.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_reproducibility.se_overlap_peak_rep1_vs_rep2" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/overlap_peaks/rep1-rep2.overlap.bfilt.narrowPeak.gz", 3 | "test_reproducibility.se_overlap_peak_rep1_pr" : 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/overlap_peaks/rep1-pr.overlap.bfilt.narrowPeak.gz", 4 | "test_reproducibility.se_overlap_peak_rep2_pr" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/overlap_peaks/rep2-pr.overlap.bfilt.narrowPeak.gz", 5 | "test_reproducibility.se_overlap_peak_ppr" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/overlap_peaks/ppr.overlap.bfilt.narrowPeak.gz", 6 | "test_reproducibility.se_chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/mm10_chr19_chrM/mm10_chr19_chrM.chrom.sizes", 7 | 8 | "test_reproducibility.ref_se_reproducibility_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_reproducibility/overlap.reproducibility.qc" 9 | } 10 | -------------------------------------------------------------------------------- /dev/test/test_task/test_reproducibility.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_reproducibility { 6 | input { 7 | String se_overlap_peak_rep1_vs_rep2 8 | String se_overlap_peak_rep1_pr 9 | String se_overlap_peak_rep2_pr 10 | String se_overlap_peak_ppr 11 | String se_chrsz 12 | 13 | String ref_se_reproducibility_qc 14 | String docker 15 | } 16 | RuntimeEnvironment runtime_environment = { 17 | "docker": docker, 18 | "singularity": "", 19 | "conda": "" 20 | } 21 | 22 | call atac.reproducibility as se_reproducibility { input : 23 | prefix = 'overlap', 24 | peaks = [se_overlap_peak_rep1_vs_rep2], 25 | peaks_pr = [se_overlap_peak_rep1_pr, se_overlap_peak_rep2_pr], 26 | peak_ppr = se_overlap_peak_ppr, 27 | peak_type = 'narrowPeak', 28 | chrsz = se_chrsz, 29 | runtime_environment = 
runtime_environment, 30 | } 31 | 32 | call compare_md5sum.compare_md5sum { input : 33 | labels = [ 34 | 'se_reproducibility', 35 | ], 36 | files = [ 37 | se_reproducibility.reproducibility_qc, 38 | ], 39 | ref_files = [ 40 | ref_se_reproducibility_qc, 41 | ], 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /dev/test/test_task/test_spr.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_spr.pe_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/tas/rep1/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 3 | "test_spr.se_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 4 | 5 | "test_spr.ref_pe_ta_pr1" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_spr/pe/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.pr1.tagAlign.gz", 6 | "test_spr.ref_pe_ta_pr2" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_spr/pe/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.pr2.tagAlign.gz", 7 | "test_spr.ref_se_ta_pr1" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_spr/se/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pr1.tagAlign.gz", 8 | "test_spr.ref_se_ta_pr2" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_spr/se/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pr2.tagAlign.gz", 9 | 10 | "test_spr.ref_pe_seed_10_ta_pr1" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_spr/pe/pseudoreplication_random_seed_10/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.pr1.tagAlign.gz", 11 | "test_spr.ref_pe_seed_10_ta_pr2" 
: "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_spr/pe/pseudoreplication_random_seed_10/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.pr2.tagAlign.gz", 12 | "test_spr.ref_se_seed_10_ta_pr1" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_spr/se/pseudoreplication_random_seed_10/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pr1.tagAlign.gz", 13 | "test_spr.ref_se_seed_10_ta_pr2" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_spr/se/pseudoreplication_random_seed_10/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.pr2.tagAlign.gz" 14 | } 15 | -------------------------------------------------------------------------------- /dev/test/test_task/test_spr.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_spr { 6 | input { 7 | File pe_ta 8 | File se_ta 9 | 10 | File ref_pe_ta_pr1 11 | File ref_pe_ta_pr2 12 | File ref_se_ta_pr1 13 | File ref_se_ta_pr2 14 | File ref_pe_seed_10_ta_pr1 15 | File ref_pe_seed_10_ta_pr2 16 | File ref_se_seed_10_ta_pr1 17 | File ref_se_seed_10_ta_pr2 18 | 19 | Float spr_mem_factor = 0.0 20 | Float spr_disk_factor = 6.0 21 | String docker 22 | } 23 | RuntimeEnvironment runtime_environment = { 24 | "docker": docker, 25 | "singularity": "", 26 | "conda": "" 27 | } 28 | 29 | call atac.spr as pe_spr { input : 30 | ta = pe_ta, 31 | paired_end = true, 32 | pseudoreplication_random_seed = 0, 33 | mem_factor = spr_mem_factor, 34 | disk_factor = spr_disk_factor, 35 | runtime_environment = runtime_environment, 36 | } 37 | call atac.spr as se_spr { input : 38 | ta = se_ta, 39 | paired_end = false, 40 | pseudoreplication_random_seed = 0, 41 | mem_factor = spr_mem_factor, 42 | disk_factor = spr_disk_factor, 43 | 
runtime_environment = runtime_environment, 44 | } 45 | call atac.spr as pe_spr_seed_10 { input : 46 | ta = pe_ta, 47 | paired_end = true, 48 | pseudoreplication_random_seed = 10, 49 | mem_factor = spr_mem_factor, 50 | disk_factor = spr_disk_factor, 51 | runtime_environment = runtime_environment, 52 | } 53 | call atac.spr as se_spr_seed_10 { input : 54 | ta = se_ta, 55 | paired_end = false, 56 | pseudoreplication_random_seed = 10, 57 | mem_factor = spr_mem_factor, 58 | disk_factor = spr_disk_factor, 59 | runtime_environment = runtime_environment, 60 | } 61 | 62 | call compare_md5sum.compare_md5sum { input : 63 | labels = [ 64 | 'pe_spr_pr1', 65 | 'pe_spr_pr2', 66 | 'se_spr_pr1', 67 | 'se_spr_pr2', 68 | 'pe_spr_seed_10_pr1', 69 | 'pe_spr_seed_10_pr2', 70 | 'se_spr_seed_10_pr1', 71 | 'se_spr_seed_10_pr2', 72 | ], 73 | files = [ 74 | pe_spr.ta_pr1, 75 | pe_spr.ta_pr2, 76 | se_spr.ta_pr1, 77 | se_spr.ta_pr2, 78 | pe_spr_seed_10.ta_pr1, 79 | pe_spr_seed_10.ta_pr2, 80 | se_spr_seed_10.ta_pr1, 81 | se_spr_seed_10.ta_pr2, 82 | ], 83 | ref_files = [ 84 | ref_pe_ta_pr1, 85 | ref_pe_ta_pr2, 86 | ref_se_ta_pr1, 87 | ref_se_ta_pr2, 88 | ref_pe_seed_10_ta_pr1, 89 | ref_pe_seed_10_ta_pr2, 90 | ref_se_seed_10_ta_pr1, 91 | ref_se_seed_10_ta_pr2, 92 | ], 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /dev/test/test_task/test_tss_enrich.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_tss_enrich.tss" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38/ataqc/ENCFF766FGL.bed.gz", 3 | "test_tss_enrich.chrsz" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/genome_data/hg38/hg38.chrom.sizes", 4 | 5 | "test_tss_enrich.read_len_log" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/ataqc/ENCFF341MYG.subsampled.400.trim.merged.read_length.txt", 
6 | "test_tss_enrich.nodup_bam" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/ataqc/ENCFF341MYG.subsampled.400.trim.merged.nodup.bam", 7 | 8 | "test_tss_enrich.ref_tss_enrich_qc" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_tss_enrich/ENCFF341MYG.subsampled.400.trim.merged.nodup.tss_enrich.qc" 9 | } 10 | -------------------------------------------------------------------------------- /dev/test/test_task/test_tss_enrich.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_tss_enrich { 6 | input { 7 | File read_len_log 8 | File nodup_bam 9 | File tss 10 | File chrsz 11 | 12 | File ref_tss_enrich_qc 13 | String docker 14 | } 15 | RuntimeEnvironment runtime_environment = { 16 | "docker": docker, 17 | "singularity": "", 18 | "conda": "" 19 | } 20 | 21 | Int? 
read_len_ = read_int(read_len_log) 22 | 23 | call atac.tss_enrich { input : 24 | read_len = read_len_, 25 | nodup_bam = nodup_bam, 26 | chrsz = chrsz, 27 | tss = tss, 28 | runtime_environment = runtime_environment, 29 | } 30 | 31 | call compare_md5sum.compare_md5sum { input : 32 | labels = [ 33 | 'test_tss_enrich_qc', 34 | ], 35 | files = [ 36 | tss_enrich.tss_enrich_qc, 37 | ], 38 | ref_files = [ 39 | ref_tss_enrich_qc, 40 | ], 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /dev/test/test_task/test_xcor.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_xcor.pe_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/pe/tas/rep1/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 3 | "test_xcor.se_ta" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/input/se/tas/rep1/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.tagAlign.gz", 4 | 5 | "test_xcor.ref_pe_xcor_log" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_xcor/pe/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.no_chrM.R1.25M.cc.qc", 6 | "test_xcor.ref_pe_xcor_log_subsample" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_xcor/pe/subsample/fix_PIP-917/ENCFF341MYG.subsampled.400.trim.merged.nodup.tn5.no_chrM.R1.5K.cc.qc", 7 | "test_xcor.ref_se_xcor_log" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_xcor/se/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.25M.cc.qc", 8 | "test_xcor.ref_se_xcor_log_subsample" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/atac-seq-pipeline-test-data/ref_output/test_xcor/se/subsample/ENCFF439VSY.subsampled.400.trim.merged.nodup.tn5.no_chrM.5K.cc.qc", 9 | 10 | "test_xcor.xcor_subsample" : 5000 11 
| } 12 | -------------------------------------------------------------------------------- /dev/test/test_task/test_xcor.wdl: -------------------------------------------------------------------------------- 1 | version 1.0 2 | import '../../../atac.wdl' as atac 3 | import 'compare_md5sum.wdl' as compare_md5sum 4 | 5 | workflow test_xcor { 6 | input { 7 | Int xcor_subsample 8 | Int xcor_subsample_default = 25000000 9 | 10 | String pe_ta 11 | String se_ta 12 | 13 | String ref_pe_xcor_log 14 | String ref_pe_xcor_log_subsample 15 | String ref_se_xcor_log 16 | String ref_se_xcor_log_subsample 17 | String mito_chr_name = 'chrM' 18 | 19 | Int xcor_cpu = 2 20 | Float xcor_mem_factor = 0.0 21 | Int xcor_time_hr = 6 22 | Float xcor_disk_factor = 1.5 23 | String docker 24 | } 25 | RuntimeEnvironment runtime_environment = { 26 | "docker": docker, 27 | "singularity": "", 28 | "conda": "" 29 | } 30 | 31 | call atac.xcor as pe_xcor { input : 32 | ta = pe_ta, 33 | subsample = xcor_subsample_default, 34 | paired_end = true, 35 | mito_chr_name = mito_chr_name, 36 | 37 | cpu = xcor_cpu, 38 | mem_factor = xcor_mem_factor, 39 | time_hr = xcor_time_hr, 40 | disk_factor = xcor_disk_factor, 41 | runtime_environment = runtime_environment, 42 | } 43 | call atac.xcor as pe_xcor_subsample { input : 44 | ta = pe_ta, 45 | subsample = xcor_subsample, 46 | paired_end = true, 47 | mito_chr_name = mito_chr_name, 48 | 49 | cpu = xcor_cpu, 50 | mem_factor = xcor_mem_factor, 51 | time_hr = xcor_time_hr, 52 | disk_factor = xcor_disk_factor, 53 | runtime_environment = runtime_environment, 54 | } 55 | call atac.xcor as se_xcor { input : 56 | ta = se_ta, 57 | subsample = xcor_subsample_default, 58 | paired_end = false, 59 | mito_chr_name = mito_chr_name, 60 | 61 | cpu = xcor_cpu, 62 | mem_factor = xcor_mem_factor, 63 | time_hr = xcor_time_hr, 64 | disk_factor = xcor_disk_factor, 65 | runtime_environment = runtime_environment, 66 | } 67 | call atac.xcor as se_xcor_subsample { input : 68 | ta = se_ta, 69 | 
subsample = xcor_subsample, 70 | paired_end = false, 71 | mito_chr_name = mito_chr_name, 72 | 73 | cpu = xcor_cpu, 74 | mem_factor = xcor_mem_factor, 75 | time_hr = xcor_time_hr, 76 | disk_factor = xcor_disk_factor, 77 | runtime_environment = runtime_environment, 78 | } 79 | 80 | call compare_md5sum.compare_md5sum { input : 81 | labels = [ 82 | 'pe_xcor', 83 | 'pe_xcor_subsample', 84 | 'se_xcor', 85 | 'se_xcor_subsample', 86 | ], 87 | files = [ 88 | pe_xcor.score, 89 | pe_xcor_subsample.score, 90 | se_xcor.score, 91 | se_xcor_subsample.score, 92 | ], 93 | ref_files = [ 94 | ref_pe_xcor_log, 95 | ref_pe_xcor_log_subsample, 96 | ref_se_xcor_log, 97 | ref_se_xcor_log_subsample, 98 | ], 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /dev/test/test_workflow/.gitignore: -------------------------------------------------------------------------------- 1 | *qc_json_diff.txt 2 | *qc_json_match.txt 3 | *.result.json 4 | *.result.qc.json 5 | *.status.json 6 | *.metadata.json 7 | *.submit.json 8 | *.test_atac_wf_opt.json 9 | cromwell*.jar 10 | tmp_secret_key.json 11 | -------------------------------------------------------------------------------- /dev/test/test_workflow/ENCSR356KRQ.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output/v1.1.6/ENCSR356KRQ/qc.json", 3 | "atac.pipeline_type" : "atac", 4 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/hg38.tsv", 5 | "atac.fastqs_rep1_R1" : [ 6 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair1/ENCFF341MYG.fastq.gz", 7 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair1/ENCFF106QGY.fastq.gz" 8 | ], 9 | "atac.fastqs_rep1_R2" : [ 10 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair2/ENCFF248EJF.fastq.gz", 11 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep1/pair2/ENCFF368TYI.fastq.gz" 12 | ], 13 | "atac.fastqs_rep2_R1" : [ 14 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF641SFZ.fastq.gz", 15 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF751XTV.fastq.gz", 16 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF927LSG.fastq.gz", 17 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF859BDM.fastq.gz", 18 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF193RRC.fastq.gz", 19 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair1/ENCFF366DFI.fastq.gz" 20 | ], 21 | "atac.fastqs_rep2_R2" : [ 22 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF031ARQ.fastq.gz", 23 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF590SYZ.fastq.gz", 24 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF734PEQ.fastq.gz", 25 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF007USV.fastq.gz", 26 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF886FSC.fastq.gz", 27 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq/rep2/pair2/ENCFF573UXK.fastq.gz" 28 | ], 29 | "atac.paired_end" : true, 30 | "atac.auto_detect_adapter" : true, 31 | "atac.enable_xcor" : true, 32 | "atac.enable_tss_enrich" : true, 33 | "atac.title" : "ENCSR356KRQ", 34 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation", 35 | 
"atac.align_cpu" : 8 36 | } 37 | -------------------------------------------------------------------------------- /dev/test/test_workflow/ENCSR356KRQ_subsampled.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output/v2.2.2/ENCSR356KRQ_subsampled/qc.json", 3 | "atac.pipeline_type" : "atac", 4 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/hg38.tsv", 5 | "atac.fastqs_rep1_R1" : [ 6 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 7 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 8 | ], 9 | "atac.fastqs_rep1_R2" : [ 10 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 11 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 12 | ], 13 | "atac.fastqs_rep2_R1" : [ 14 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 15 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 16 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 17 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 18 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 19 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 20 | ], 21 | "atac.fastqs_rep2_R2" : [ 22 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 23 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 24 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 25 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 26 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 27 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 28 | ], 29 | "atac.paired_end" : true, 30 | "atac.auto_detect_adapter" : true, 31 | "atac.enable_xcor" : true, 32 | "atac.title" : "ENCSR356KRQ (subsampled 1/400 reads)", 33 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 34 | } 35 | -------------------------------------------------------------------------------- /dev/test/test_workflow/ENCSR356KRQ_subsampled_chr19_only.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output/v1.1.7.2/ENCSR356KRQ_subsampled_chr19_only/qc.json", 3 | "atac.pipeline_type" : "atac", 4 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/hg38_chr19_chrM.tsv", 5 | "atac.fastqs_rep1_R1" : [ 6 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 7 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 8 | ], 9 | "atac.fastqs_rep1_R2" : [ 10 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 11 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 12 | ], 13 | "atac.fastqs_rep2_R1" : [ 14 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 15 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 16 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 17 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 18 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 19 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 20 | ], 21 | "atac.fastqs_rep2_R2" : [ 22 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 23 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 24 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 25 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 26 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 27 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 28 | ], 29 | "atac.paired_end" : true, 30 | "atac.auto_detect_adapter" : true, 31 | "atac.enable_xcor" : true, 32 | "atac.enable_tss_enrich" : false, 33 | "atac.title" : "ENCSR356KRQ (subsampled 1/400 reads, chr19_chrM only)", 34 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 35 | } 36 | -------------------------------------------------------------------------------- /dev/test/test_workflow/ENCSR356KRQ_subsampled_start_from_bam.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output/v2.2.2/ENCSR356KRQ_subsampled_start_from_bam/qc.json", 3 | "atac.pipeline_type" : "atac", 4 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/hg38.tsv", 5 | "atac.read_len" : [76, 76], 6 | "atac.nodup_bams" : [ 7 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/bam_subsampled/rep1/ENCFF341MYG.subsampled.400.trim.merged.nodup.no_chrM_MT.bam", 8 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/bam_subsampled/rep2/ENCFF641SFZ.subsampled.400.trim.merged.nodup.no_chrM_MT.bam" 9 | ], 10 | "atac.paired_end" : true, 11 | "atac.auto_detect_adapter" : true, 12 | "atac.enable_xcor" : true, 13 | "atac.title" : "ENCSR356KRQ (subsampled 1/400 reads, starting from BAM)", 14 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 15 | } 16 | 
-------------------------------------------------------------------------------- /dev/test/test_workflow/ENCSR889WQX.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output/v1.1.6/ENCSR889WQX/qc.json", 3 | "atac.pipeline_type" : "atac", 4 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/mm10.tsv", 5 | "atac.fastqs_rep1_R1" : [ 6 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq/rep1/ENCFF439VSY.fastq.gz", 7 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq/rep1/ENCFF325FCQ.fastq.gz", 8 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq/rep1/ENCFF683IQS.fastq.gz", 9 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq/rep1/ENCFF744CHW.fastq.gz" 10 | ], 11 | "atac.fastqs_rep2_R1" : [ 12 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq/rep2/ENCFF463QCX.fastq.gz", 13 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq/rep2/ENCFF992TSA.fastq.gz" 14 | ], 15 | "atac.paired_end" : false, 16 | "atac.auto_detect_adapter" : true, 17 | "atac.enable_xcor" : true, 18 | "atac.enable_tss_enrich" : true, 19 | "atac.title" : "ENCSR889WQX", 20 | "atac.description" : "ATAC-seq on Mus musculus C57BL/6 frontal cortex adult" 21 | } 22 | -------------------------------------------------------------------------------- /dev/test/test_workflow/ENCSR889WQX_subsampled.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output/v1.8.0/ENCSR889WQX_subsampled/qc.json", 3 | "atac.pipeline_type" : "atac", 4 | "atac.genome_tsv" : 
"https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/mm10.tsv", 5 | "atac.fastqs_rep1_R1" : [ 6 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF439VSY.subsampled.400.fastq.gz", 7 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF325FCQ.subsampled.400.fastq.gz", 8 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF683IQS.subsampled.400.fastq.gz", 9 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF744CHW.subsampled.400.fastq.gz" 10 | ], 11 | "atac.fastqs_rep2_R1" : [ 12 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep2/ENCFF463QCX.subsampled.400.fastq.gz", 13 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep2/ENCFF992TSA.subsampled.400.fastq.gz" 14 | ], 15 | "atac.paired_end" : false, 16 | "atac.auto_detect_adapter" : true, 17 | "atac.enable_xcor" : true, 18 | "atac.enable_tss_enrich" : false, 19 | "atac.title" : "ENCSR889WQX (subsampled 1/400 reads)", 20 | "atac.description" : "ATAC-seq on Mus musculus C57BL/6 frontal cortex adult" 21 | } 22 | -------------------------------------------------------------------------------- /dev/test/test_workflow/ENCSR889WQX_subsampled_chr19_only.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output/v1.1.6.a/ENCSR889WQX_subsampled_chr19_only/qc.json", 3 | "atac.pipeline_type" : "atac", 4 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/mm10_chr19_chrM.tsv", 5 | "atac.fastqs_rep1_R1" : [ 6 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF439VSY.subsampled.400.fastq.gz", 7 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF325FCQ.subsampled.400.fastq.gz", 8 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF683IQS.subsampled.400.fastq.gz", 9 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF744CHW.subsampled.400.fastq.gz" 10 | ], 11 | "atac.fastqs_rep2_R1" : [ 12 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep2/ENCFF463QCX.subsampled.400.fastq.gz", 13 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep2/ENCFF992TSA.subsampled.400.fastq.gz" 14 | ], 15 | "atac.paired_end" : false, 16 | "atac.auto_detect_adapter" : true, 17 | "atac.enable_xcor" : true, 18 | "atac.enable_tss_enrich" : false, 19 | "atac.title" : "ENCSR889WQX (subsampled 1/400 reads, chr19_chrM only)", 20 | "atac.description" : "ATAC-seq on Mus musculus C57BL/6 frontal cortex adult" 21 | } 22 | -------------------------------------------------------------------------------- /dev/test/test_workflow/ENCSR889WQX_subsampled_unrep.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output/v1.8.0/ENCSR889WQX_subsampled_unrep/qc.json", 3 | "atac.pipeline_type" : "atac", 4 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/mm10.tsv", 5 | "atac.fastqs_rep1_R1" : [ 6 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF439VSY.subsampled.400.fastq.gz", 7 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF325FCQ.subsampled.400.fastq.gz", 8 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF683IQS.subsampled.400.fastq.gz", 9 
| "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR889WQX/fastq_subsampled/rep1/ENCFF744CHW.subsampled.400.fastq.gz" 10 | ], 11 | "atac.paired_end" : false, 12 | "atac.auto_detect_adapter" : true, 13 | "atac.enable_xcor" : true, 14 | "atac.enable_tss_enrich" : false, 15 | "atac.title" : "ENCSR889WQX (subsampled 1/400 reads, unrep)", 16 | "atac.description" : "ATAC-seq on Mus musculus C57BL/6 frontal cortex adult" 17 | } 18 | -------------------------------------------------------------------------------- /dev/test/test_workflow/ref_output/sync.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gsutil -m rsync -r -d . gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ref_output 4 | -------------------------------------------------------------------------------- /docs/build_genome_database.md: -------------------------------------------------------------------------------- 1 | ## How to download genome database 2 | 3 | 1. Choose `GENOME` from `hg19`, `hg38`, `mm9` and `mm10` and specify a destination directory. 4 | ```bash 5 | $ bash scripts/download_genome_data.sh [GENOME] [DESTINATION_DIR] 6 | ``` 7 | 2. Find a TSV file on the destination directory and use it for `"atac.genome_tsv"` in your input JSON. 8 | 9 | # How to build genome database 10 | 11 | 1. [Install Conda](https://conda.io/miniconda.html). 12 | 13 | 2. Install pipeline's Conda environment. 14 | ```bash 15 | $ bash scripts/uninstall_conda_env.sh # to remove any existing pipeline env 16 | $ bash scripts/install_conda_env.sh 17 | ``` 18 | 19 | 3. Choose `GENOME` from `hg19`, `hg38`, `mm9` and `mm10` and specify a destination directory. This will take several hours. We recommend not to run this installer on a login node of your cluster. It will take >8GB memory and >2h time. 20 | ```bash 21 | $ conda activate encd-atac 22 | $ bash scripts/build_genome_data.sh [GENOME] [DESTINATION_DIR] 23 | ``` 24 | 25 | 3. 
Find a TSV file on the destination directory and use it for `"atac.genome_tsv"` in your input JSON. 26 | 27 | 28 | ## How to build genome database for your own genome 29 | 30 | 1. You can build your own genome database if your reference genome has one of the following file types. 31 | * `.fasta.gz` 32 | * `.fa.gz` 33 | * `.fasta.bz2` 34 | * `.fa.bz2` 35 | * `.2bit` 36 | 37 | 2. Get a URL for your reference genome. You may need to upload it to somewhere on the internet. 38 | 39 | 3. Get a URL for a gzipped blacklist BED file for your genome. If you don't have one then skip this step. An example blacklist for hg38 is [here](https://www.encodeproject.org/files/ENCFF356LFX/@@download/ENCFF356LFX.bed.gz). 40 | 41 | 4. Find the following lines in `scripts/build_genome_data.sh` and modify them as follows. Give a good name `[YOUR_OWN_GENOME]` for your genome. For `MITO_CHR_NAME` use a correct mitochondrial chromosome name of your genome (e.g. `chrM` or `MT`). For `REGEX_BFILT_PEAK_CHR_NAME` Perl style regular expression must be used to keep regular chromosome names only in a blacklist filtered (`.bfilt.`) peaks files. These `.bfilt.` peak files are considered final peaks output of the pipeline and peaks BED files for genome browser tracks (`.bigBed` and `.hammock.gz`) are converted from these `.bfilt.` peaks files. Chromosome name filtering with `REGEX_BFILT_PEAK_CHR_NAME` will be done even without the blacklist itself. 42 | ```bash 43 | ... 44 | 45 | elif [[ $GENOME == "YOUR_OWN_GENOME" ]]; then 46 | # Perl style regular expression to keep regular chromosomes only. 47 | # this reg-ex will be applied to peaks after blacklist filtering (b-filt) with "grep -P". 48 | # so that b-filt peak file (.bfilt.*Peak.gz) will only have chromosomes matching with this pattern 49 | # this reg-ex will work even without a blacklist. 50 | # you will still be able to find a .bfilt. peak file 51 | REGEX_BFILT_PEAK_CHR_NAME="chr[\dXY]+" 52 | # mitochondrial chromosome name (e.g. 
chrM, MT) 53 | MITO_CHR_NAME="chrM" 54 | # URL for your reference FASTA (fasta, fasta.gz, fa, fa.gz, 2bit) 55 | REF_FA="https://some.where.com/your.genome.fa.gz" 56 | # 3-col blacklist BED file to filter out overlapping peaks from b-filt peak file (.bfilt.*Peak.gz file). 57 | # leave it empty if you don't have one 58 | BLACKLIST= 59 | ... 60 | ``` 61 | 62 | 5. Specify a destination directory for your genome database and run the installer. This will take several hours. 63 | ```bash 64 | $ bash scripts/build_genome_data.sh [YOUR_OWN_GENOME] [DESTINATION_DIR] 65 | ``` 66 | 67 | 6. Find a TSV file in the destination directory and use it for `"atac.genome_tsv"` in your input JSON. 68 | -------------------------------------------------------------------------------- /docs/how_to_config_sge.md: -------------------------------------------------------------------------------- 1 | # How to configure SGE for pipeline 2 | 3 | 1. List all parallel environments (PE) on your SGE. 4 | ```bash 5 | $ qconf -spl 6 | ``` 7 | 8 | 2. If you don't have one then ask your system admin to add a new one with name `shm`. 9 | ```bash 10 | $ sudo qconf -ap shm 11 | ``` 12 | 13 | 3. Give a large number to `slots` for your PE. 14 | ```bash 15 | $ sudo qconf -mp shm 16 | pe_name shm 17 | slots 999 18 | ... 19 | ``` 20 | 21 | 4. List all queues on your SGE. 22 | ```bash 23 | $ qconf -sql 24 | ``` 25 | 26 | 5. Ask your system admin to connect PE to your queue. 27 | ```bash 28 | $ sudo qconf -mq [QUEUE_NAME] 29 | ... 30 | pe_list make shm 31 | ... 32 | ``` 33 | -------------------------------------------------------------------------------- /docs/install_conda.md: -------------------------------------------------------------------------------- 1 | # How to install pipeline's Conda environment 2 | 3 | If you do not have miniconda (or anaconda) installed, follow the instructions below in steps 1 - 4 to install miniconda. 
4 | 5 | **IF YOU ALREADY HAVE ANACONDA OR MINICONDA INSTALLED, SKIP TO STEP 5** 6 | 7 | 1) Download [Miniconda installer](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh). Use default answers to all questions except for the first and last. 8 | ```bash 9 | $ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 10 | $ bash Miniconda3-latest-Linux-x86_64.sh 11 | ``` 12 | 13 | Type `yes` to the first question. 14 | ```bash 15 | Do you accept the license terms? [yes|no] 16 | [no] >>> yes 17 | ``` 18 | 19 | Type `yes` to the last question. 20 | ```bash 21 | Do you wish the installer to initialize Miniconda3 22 | by running conda init? [yes|no] 23 | [no] >>> yes 24 | ``` 25 | 26 | 2) **IMPORTANT**: Close your session and re-login. If you skip this step then pipeline's Conda environment will be messed up with base Conda environment. 27 | 28 | 3) **IMPORTANT**: Disable auto activation of base Conda environment. 29 | ```bash 30 | $ conda config --set auto_activate_base false 31 | ``` 32 | 33 | 4) **IMPORTANT**: Close your session and re-login. 34 | 35 | 5) Install pipeline's Conda environment. Add `mamba` to the install command line to resolve conflicts much faster. 36 | 37 | ```bash 38 | $ bash scripts/uninstall_conda_env.sh # uninstall it for clean-install 39 | $ bash scripts/install_conda_env.sh mamba # remove mamba if it does not work 40 | ``` 41 | 42 | > **WARNING**: DO NOT PROCEED TO RUN PIPELINES UNTIL YOU SEE THE FOLLOWING SUCCESS MESSAGE OR PIPELINE WILL NOT WORK. 43 | ```bash 44 | === All done successfully === 45 | ``` 46 | 47 | 6) Activate pipeline's Conda environment before running a pipeline. 48 | ```bash 49 | $ conda activate encode-atac-seq-pipeline 50 | 51 | $ caper run ... 52 | $ caper server ...
53 | ``` 54 | -------------------------------------------------------------------------------- /docs/tutorial_dx_cli.md: -------------------------------------------------------------------------------- 1 | # Tutorial for DNAnexus Platform (CLI) 2 | 3 | All test samples and genome data are shared on our public DNAnexus project. You don't have to download any data for testing our pipeline on DNAnexus platform. 4 | 5 | There are two methods to run our pipeline on DNAnexus. 6 | 7 | 1) Building your own DX workflow from `atac.wdl` with dxWDL (CLI) 8 | 2) [Using a pre-built DX workflow on our public DX project (Web UI)](tutorial_dx_web.md) 9 | 10 | This document describes instruction for the item 1). 11 | 12 | 1. Sign up for a [DNAnexus account](https://platform.DNAnexus.com/register). 13 | 14 | 2. Create a new [DX project](https://platform.DNAnexus.com/projects) with name `[YOUR_PROJECT_NAME]` by clicking on "+New Project" on the top left. 15 | 16 | 3. Download dxWDL. 17 | ```bash 18 | $ cd 19 | $ wget https://github.com/dnanexus/dxWDL/releases/download/v1.46.4/dxWDL-v1.46.4.jar 20 | $ chmod +rx dxWDL-v1.46.4.jar 21 | ``` 22 | 23 | 4. Git clone this pipeline. 24 | ```bash 25 | $ cd 26 | $ git clone https://github.com/ENCODE-DCC/atac-seq-pipeline 27 | ``` 28 | 29 | 5. Move to pipeline's directory. 30 | ```bash 31 | $ cd atac-seq-pipeline 32 | ``` 33 | 34 | 6. Choose an appropriate input for your project (AWS or Azure): 35 | * AWS 36 | ```bash 37 | $ INPUT=example_input_json/dx/ENCSR356KRQ_subsampled_dx.json 38 | ``` 39 | * Azure 40 | ```bash 41 | $ INPUT=example_input_json/dx_azure/ENCSR356KRQ_subsampled_dx_azure.json 42 | ``` 43 | 44 | 7. Make a WDL for DNAnexus use only. The original WDL will not work with inputs (e.g. BAMs, TAs) other than FASTQs. Then compile `atac.dx.wdl` with an input JSON for the SUBSAMPLED (1/400) paired-end sample of [ENCSR356KRQ](https://www.encodeproject.org/experiments/ENCSR356KRQ/). 
45 | ```bash 46 | $ cp atac.wdl atac.dx.wdl 47 | $ sed -i 's/Array\[File?\] bams = \[\]/Array\[File\] bams = \[\]/g' atac.dx.wdl 48 | $ sed -i 's/Array\[File?\] nodup_bams = \[\]/Array\[File\] nodup_bams = \[\]/g' atac.dx.wdl 49 | $ sed -i 's/Array\[File?\] tas = \[\]/Array\[File\] tas = \[\]/g' atac.dx.wdl 50 | ``` 51 | 52 | ```bash 53 | $ WDL=atac.dx.wdl 54 | $ DXWDL=dxWDL-v1.46.4.jar 55 | $ PROJECT=[YOUR_PROJECT_NAME] 56 | $ OUT_FOLDER=/test_sample_atac_ENCSR356KRQ_subsampled 57 | $ DOCKER=$(cat ${WDL} | grep caper_docker | awk 'BEGIN{FS="'\''"} {print $2}') 58 | 59 | $ java -jar ${DXWDL} compile ${WDL} -project ${PROJECT} -f -folder ${OUT_FOLDER} -defaults ${INPUT} -extras <(echo "{\"default_runtime_attributes\":{\"docker\":\"${DOCKER}\"}}") 60 | ``` 61 | 62 | 8. Go to DNAnexus [project page](https://platform.DNAnexus.com/projects) and click on your project. 63 | 64 | 9. Move to the directory `/test_sample_atac_ENCSR356KRQ_subsampled`. 65 | 66 | 10. You will find a DX workflow `atac` with all parameters pre-defined. Click on it. 67 | 68 | 11. Specify an output directory by clicking "Workflow Actions" on the top right. Click on "Set output folder" and choose an output folder. 69 | 70 | 12. Click on "Run as Analysis..." and you will be automatically redirected to the "Monitor" tab. 71 | 72 | 13. It will take about an hour. You will be able to find all outputs on your output folder. Final QC report (`qc.html`)/JSON (`qc.json`) will be found on it. 73 | 74 | 14. See full specification for [input JSON file](input.md). 75 | -------------------------------------------------------------------------------- /docs/tutorial_dx_web.md: -------------------------------------------------------------------------------- 1 | # Tutorial for DNAnexus Platform (web) 2 | 3 | All test samples and genome data are shared on our public DNAnexus project. You don't have to download any data for testing our pipeline on DNAnexus platform. 
4 | 5 | There are two methods to run our pipeline on DNAnexus. 6 | 7 | 1) [Building your own DX workflow from `atac.wdl` with dxWDL (CLI)](tutorial_dx_cli.md) 8 | 2) Using a pre-built DX workflow on our public DX project (Web UI) 9 | 10 | This document describes instruction for the item 2). 11 | 12 | 1. Sign up for a [DNAnexus account](https://platform.DNAnexus.com/register). 13 | 14 | 2. Create a new [DX project](https://platform.DNAnexus.com/projects) by clicking on "+New Project" on the top left. 15 | 16 | 3. Move to one of the following workflow directories according to the platform you have chosen for your project (AWS or Azure). These DX workflows are pre-built with all parameters defined. 17 | 18 | * [AWS test workflow](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ATAC-seq/workflows): Use `[LATEST_VER]/test_ENCSR356KRQ_subsampled`. 19 | * [Azure test workflow](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ATAC-seq/workflows): Use `[LATEST_VER]/test_ENCSR356KRQ_subsampled`. 20 | 21 | 4. Copy it to your project by right-clicking on the DX workflow `atac` and choose "Copy". 22 | 23 | 5. Choose your project and create a folder for the test run by clicking on the "Folder+" icon. 24 | 25 | 6. Click on "Copy into this folder" on the bottom left. 26 | 27 | 7. Move to the target folder and click on the DX workflow `atac`. 28 | 29 | 8. Specify an output directory by clicking "Workflow Actions" on the top right. Click on "Set output folder" and choose an output folder. 30 | 31 | 9. Click on "Run as Analysis..." and you will be automatically redirected to the "Monitor" tab. 32 | 33 | 10. It will take about an hour. You will be able to find all outputs on your output folder. Final QC report (`qc.html`)/JSON (`qc.json`) will be found on it. 34 | 35 | 11. See full specification for [input JSON file](input.md). 36 | 37 | 38 | ## Extras for advanced users 39 | 40 | 1. DNAnexus allows only one copy of a workflow per project.
The example workflow in the previous section is pre-built for the subsampled test sample [ENCSR356KRQ](https://www.encodeproject.org/experiments/ENCSR356KRQ/) with all parameters defined already. 41 | 42 | 2. Choose your main platform (AWS or Azure). Move to [ENCODE ATAC-seq pipeline repository for AWS](https://platform.dnanexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ATAC-seq/workflows) or [ENCODE ATAC-seq pipeline repository for Azure](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ATAC-seq/workflows). 43 | 44 | 3. Choose a folder with the latest available version. 45 | 46 | 4. Copy one of the following workflows according to the platform you have chosen for your project. 47 | > **IMPORTANT**: Make sure that you have chosen a correct platform (AWS or Azure) for your project. 48 | 49 | * general: General workflow without pre-defined reference genome. 50 | * hg38: Workflow with pre-defined hg38 reference genome. 51 | * hg19: Workflow with pre-defined hg19 reference genome. 52 | 53 | 5. Click on the DX workflow `atac`. 54 | 55 | 6. Specify your input files (FASTQs, BAMs, TAG-ALIGNs, ...) on the top left. For example, click on the item "fastqs_rep1_R1" and choose your R1 FASTQ file for replicate 1. See details [here](input.md) for other input types. 56 | 57 | 7. Choose a reference genome. See details [here](input.md). 58 | 59 | 8. Click on "Run as Analysis..." and you will be automatically redirected to the "Monitor" tab.
60 | 61 | -------------------------------------------------------------------------------- /example_input_json/ENCSR356KRQ_subsampled.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "https://storage.googleapis.com/encode-pipeline-genome-data/genome_tsv/v4/hg38.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | 
"https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "https://storage.googleapis.com/encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | -------------------------------------------------------------------------------- /example_input_json/dx/ENCSR356KRQ_subsampled_dx.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-genome-data/genome_tsv/v4/hg38.dx.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | 
"dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | 
"dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_tss_enrich" : false, 31 | "atac.enable_xcor" : true, 32 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 33 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 34 | } 35 | -------------------------------------------------------------------------------- /example_input_json/dx/ENCSR356KRQ_subsampled_rep1_dx.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-genome-data/genome_tsv/v4/hg38.dx.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | 
"dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.paired_end" : true, 13 | "atac.auto_detect_adapter" : true, 14 | "atac.enable_tss_enrich" : false, 15 | "atac.enable_xcor" : true, 16 | "atac.title" : "ENCSR356KRQ (unreplicated, subsampled 1/400)", 17 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 18 | } 19 | -------------------------------------------------------------------------------- /example_input_json/dx/template_general.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac" 3 | } 4 | -------------------------------------------------------------------------------- /example_input_json/dx/template_hg19.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-genome-data/genome_tsv/v1/hg19_dx.tsv" 4 | } 5 | -------------------------------------------------------------------------------- /example_input_json/dx/template_hg38.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-genome-data/genome_tsv/v4/hg38.dx.tsv" 4 | } 5 | -------------------------------------------------------------------------------- /example_input_json/dx/template_mm10.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : 
"dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-genome-data/genome_tsv/v4/mm10.dx.tsv" 4 | } 5 | -------------------------------------------------------------------------------- /example_input_json/dx/template_mm9.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:/pipeline-genome-data/genome_tsv/v1/mm9_dx.tsv" 4 | } 5 | -------------------------------------------------------------------------------- /example_input_json/dx_azure/ENCSR356KRQ_subsampled_dx_azure.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-genome-data/genome_tsv/v4/hg38.dx_azure.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | 
"dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_tss_enrich" : false, 31 | "atac.enable_xcor" : true, 32 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 33 | "atac.description" : "ATAC-seq on primary keratinocytes in 
day 0.0 of differentiation" 34 | } 35 | -------------------------------------------------------------------------------- /example_input_json/dx_azure/template_general.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac" 3 | } 4 | -------------------------------------------------------------------------------- /example_input_json/dx_azure/template_hg19.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-genome-data/genome_tsv/v1/hg19_dx_azure.tsv" 4 | } 5 | -------------------------------------------------------------------------------- /example_input_json/dx_azure/template_hg38.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-genome-data/genome_tsv/v4/hg38.dx_azure.tsv" 4 | } 5 | -------------------------------------------------------------------------------- /example_input_json/dx_azure/template_mm10.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-genome-data/genome_tsv/v4/mm10.dx_azure.tsv" 4 | } 5 | -------------------------------------------------------------------------------- /example_input_json/dx_azure/template_mm9.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:/pipeline-genome-data/genome_tsv/v1/mm9_dx_azure.tsv" 4 | } 5 | -------------------------------------------------------------------------------- /example_input_json/template.full.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "atac.title" : "Example (paired end)", 3 | "atac.description" : "This is a template input JSON for paired ended sample.", 4 | 5 | "atac.pipeline_type" : "atac", 6 | "atac.align_only" : false, 7 | "atac.true_rep_only" : false, 8 | 9 | "atac.genome_tsv" : "/path_to_genome_data/hg38/hg38.tsv", 10 | 11 | "atac.paired_end" : true, 12 | "atac.paired_ends" : [true, true], 13 | 14 | "atac.fastqs_rep1_R1" : [ "rep1_R1_L1.fastq.gz", "rep1_R1_L2.fastq.gz", "rep1_R1_L3.fastq.gz" ], 15 | "atac.fastqs_rep1_R2" : [ "rep1_R2_L1.fastq.gz", "rep1_R2_L2.fastq.gz", "rep1_R2_L3.fastq.gz" ], 16 | "atac.fastqs_rep2_R1" : [ "rep2_R1_L1.fastq.gz", "rep2_R1_L2.fastq.gz" ], 17 | "atac.fastqs_rep2_R2" : [ "rep2_R2_L1.fastq.gz", "rep2_R2_L2.fastq.gz" ], 18 | 19 | "atac.read_len" : [], 20 | "atac.pseudoreplication_random_seed" : 0, 21 | 22 | "atac.auto_detect_adapter" : false, 23 | "atac.adapter" : "AATTCCGG", 24 | "atac.adapters_rep1_R1" : [ "AATTCCGG", "AATTCCGG", "AATTCCGG" ], 25 | "atac.adapters_rep1_R2" : [ "AATTCCGG", "AATTCCGG" ], 26 | "atac.adapters_rep2_R1" : [ "AATTCCGG", "AATTCCGG", "AATTCCGG" ], 27 | "atac.adapters_rep2_R2" : [ "AATTCCGG", "AATTCCGG" ], 28 | "atac.cutadapt_param" : "-e 0.1 -m 5", 29 | 30 | "atac.multimapping" : 4, 31 | 32 | "atac.mapq_thresh" : 30, 33 | "atac.dup_marker" : "picard", 34 | "atac.no_dup_removal" : false, 35 | 36 | "atac.subsample_reads" : 0, 37 | "atac.xcor_subsample_reads" : 25000000, 38 | 39 | "atac.cap_num_peak" : 300000, 40 | "atac.pval_thresh" : 0.01, 41 | "atac.smooth_win" : 150, 42 | 43 | "atac.enable_idr" : true, 44 | "atac.idr_thresh" : 0.05, 45 | 46 | "atac.enable_xcor" : false, 47 | "atac.enable_count_signal_track" : false, 48 | 49 | "atac.filter_chrs" : ["chrM", "MT"], 50 | 51 | "atac.enable_preseq" : false, 52 | "atac.enable_compare_to_roadmap" : false, 53 | "atac.enable_tss_enrich" : true, 54 | "atac.enable_gc_bias" : true, 55 | 56 | 
"atac.align_cpu" : 6, 57 | "atac.align_mem_factor" : 0.15, 58 | "atac.align_time_hr" : 48, 59 | "atac.align_disk_factor" : 8.0, 60 | 61 | "atac.filter_cpu" : 4, 62 | "atac.filter_mem_factor" : 0.4, 63 | "atac.filter_time_hr" : 24, 64 | "atac.filter_disk_factor" : 8.0, 65 | 66 | "atac.bam2ta_cpu" : 2, 67 | "atac.bam2ta_mem_factor" : 0.3, 68 | "atac.bam2ta_time_hr" : 12, 69 | "atac.bam2ta_disk_factor" : 4.0, 70 | 71 | "atac.spr_mem_factor" : 13.5, 72 | "atac.spr_disk_factor" : 18.0, 73 | 74 | "atac.jsd_cpu" : 4, 75 | "atac.jsd_mem_factor" : 0.1, 76 | "atac.jsd_time_hr" : 12, 77 | "atac.jsd_disk_factor" : 2.0, 78 | 79 | "atac.xcor_cpu" : 2, 80 | "atac.xcor_mem_factor" : 1.0, 81 | "atac.xcor_time_hr" : 6, 82 | "atac.xcor_disk_factor" : 4.5, 83 | 84 | "atac.call_peak_cpu" : 2, 85 | "atac.call_peak_mem_factor" : 2.0, 86 | "atac.call_peak_time_hr" : 24, 87 | "atac.call_peak_disk_factor" : 30.0, 88 | 89 | "atac.macs2_signal_track_mem_factor" : 12.0, 90 | "atac.macs2_signal_track_time_hr" : 24, 91 | "atac.macs2_signal_track_disk_factor" : 80.0, 92 | 93 | "atac.preseq_mem_factor" : 0.5, 94 | "atac.preseq_disk_factor" : 5.0 95 | } 96 | -------------------------------------------------------------------------------- /example_input_json/template.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.title" : "Example (paired end)", 3 | "atac.description" : "This is a template input JSON for paired ended sample.", 4 | 5 | "atac.pipeline_type" : "atac", 6 | "atac.align_only" : false, 7 | "atac.true_rep_only" : false, 8 | 9 | "atac.genome_tsv" : "/path_to_genome_data/hg38/hg38.tsv", 10 | 11 | "atac.paired_end" : true, 12 | 13 | "atac.fastqs_rep1_R1" : [ "rep1_R1_L1.fastq.gz", "rep1_R1_L2.fastq.gz", "rep1_R1_L3.fastq.gz" ], 14 | "atac.fastqs_rep1_R2" : [ "rep1_R2_L1.fastq.gz", "rep1_R2_L2.fastq.gz", "rep1_R2_L3.fastq.gz" ], 15 | "atac.fastqs_rep2_R1" : [ "rep2_R1_L1.fastq.gz", "rep2_R1_L2.fastq.gz" ], 16 | "atac.fastqs_rep2_R2" : [ 
"rep2_R2_L1.fastq.gz", "rep2_R2_L2.fastq.gz" ], 17 | 18 | "atac.auto_detect_adapter" : false, 19 | "atac.adapter" : "AATTCCGG", 20 | "atac.adapters_rep1_R1" : [ "AATTCCGG", "AATTCCGG", "AATTCCGG" ], 21 | "atac.adapters_rep1_R2" : [ "AATTCCGG", "AATTCCGG" ], 22 | "atac.adapters_rep2_R1" : [ "AATTCCGG", "AATTCCGG", "AATTCCGG" ], 23 | "atac.adapters_rep2_R2" : [ "AATTCCGG", "AATTCCGG" ], 24 | 25 | "atac.multimapping" : 4 26 | } 27 | -------------------------------------------------------------------------------- /example_input_json/terra/ENCSR356KRQ_subsampled.terra.json: -------------------------------------------------------------------------------- 1 | { 2 | "atac.pipeline_type" : "atac", 3 | "atac.genome_tsv" : "gs://encode-pipeline-genome-data/genome_tsv/v4/hg38.terra.tsv", 4 | "atac.fastqs_rep1_R1" : [ 5 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF341MYG.subsampled.400.fastq.gz", 6 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair1/ENCFF106QGY.subsampled.400.fastq.gz" 7 | ], 8 | "atac.fastqs_rep1_R2" : [ 9 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF248EJF.subsampled.400.fastq.gz", 10 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep1/pair2/ENCFF368TYI.subsampled.400.fastq.gz" 11 | ], 12 | "atac.fastqs_rep2_R1" : [ 13 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF641SFZ.subsampled.400.fastq.gz", 14 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF751XTV.subsampled.400.fastq.gz", 15 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF927LSG.subsampled.400.fastq.gz", 16 | 
"gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF859BDM.subsampled.400.fastq.gz", 17 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF193RRC.subsampled.400.fastq.gz", 18 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair1/ENCFF366DFI.subsampled.400.fastq.gz" 19 | ], 20 | "atac.fastqs_rep2_R2" : [ 21 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF031ARQ.subsampled.400.fastq.gz", 22 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF590SYZ.subsampled.400.fastq.gz", 23 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF734PEQ.subsampled.400.fastq.gz", 24 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF007USV.subsampled.400.fastq.gz", 25 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF886FSC.subsampled.400.fastq.gz", 26 | "gs://encode-pipeline-test-samples/encode-atac-seq-pipeline/ENCSR356KRQ/fastq_subsampled/rep2/pair2/ENCFF573UXK.subsampled.400.fastq.gz" 27 | ], 28 | "atac.paired_end" : true, 29 | "atac.auto_detect_adapter" : true, 30 | "atac.enable_xcor" : true, 31 | "atac.title" : "ENCSR356KRQ (subsampled 1/400)", 32 | "atac.description" : "ATAC-seq on primary keratinocytes in day 0.0 of differentiation" 33 | } 34 | -------------------------------------------------------------------------------- /scripts/install_conda_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e # Stop on error 3 | 4 | install_ucsc_tools_369() { 5 | # takes in conda env name and find conda bin 6 | CONDA_BIN=$(conda run -n $1 bash -c "echo \$(dirname \$(which python))") 7 | curl -o 
"$CONDA_BIN/fetchChromSizes" "https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/fetchChromSizes" 8 | curl -o "$CONDA_BIN/wigToBigWig" "https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/wigToBigWig" 9 | curl -o "$CONDA_BIN/bedGraphToBigWig" "https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/bedGraphToBigWig" 10 | curl -o "$CONDA_BIN/bigWigInfo" "https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/bigWigInfo" 11 | curl -o "$CONDA_BIN/bedClip" "https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/bedClip" 12 | curl -o "$CONDA_BIN/bedToBigBed" "https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/bedToBigBed" 13 | curl -o "$CONDA_BIN/twoBitToFa" "https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/twoBitToFa" 14 | curl -o "$CONDA_BIN/bigWigAverageOverBed" "https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64.v369/bigWigAverageOverBed" 15 | 16 | chmod +x "$CONDA_BIN/fetchChromSizes" 17 | chmod +x "$CONDA_BIN/wigToBigWig" 18 | chmod +x "$CONDA_BIN/bedGraphToBigWig" 19 | chmod +x "$CONDA_BIN/bigWigInfo" 20 | chmod +x "$CONDA_BIN/bedClip" 21 | chmod +x "$CONDA_BIN/bedToBigBed" 22 | chmod +x "$CONDA_BIN/twoBitToFa" 23 | chmod +x "$CONDA_BIN/bigWigAverageOverBed" 24 | } 25 | 26 | SH_SCRIPT_DIR=$(cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd) 27 | 28 | echo "$(date): Installing pipeline's Conda environments..." 
29 | 30 | conda create -n encd-atac --file ${SH_SCRIPT_DIR}/requirements.txt \ 31 | --override-channels -c bioconda -c defaults -y 32 | 33 | conda create -n encd-atac-macs2 --file ${SH_SCRIPT_DIR}/requirements.macs2.txt \ 34 | --override-channels -c bioconda -c defaults -y 35 | 36 | conda create -n encd-atac-py2 --file ${SH_SCRIPT_DIR}/requirements.python2.txt \ 37 | --override-channels -c conda-forge -c bioconda -c defaults -y 38 | 39 | conda create -n encd-atac-spp --file ${SH_SCRIPT_DIR}/requirements.spp.txt \ 40 | -c r -c bioconda -c defaults -y 41 | 42 | # adhoc fix for the following issues: 43 | # - https://github.com/ENCODE-DCC/chip-seq-pipeline2/issues/259 44 | # - https://github.com/ENCODE-DCC/chip-seq-pipeline2/issues/265 45 | # force-install readline 6.2, ncurses 5.9 from conda-forge (ignoring dependencies) 46 | #conda install -n encd-atac-spp --no-deps --no-update-deps -y \ 47 | # readline==6.2 ncurses==5.9 -c conda-forge 48 | 49 | CONDA_BIN=$(conda run -n encd-atac-spp bash -c "echo \$(dirname \$(which python))") 50 | 51 | echo "$(date): Installing phantompeakqualtools in Conda environments..." 52 | RUN_SPP="https://raw.githubusercontent.com/kundajelab/phantompeakqualtools/1.2.2/run_spp.R" 53 | conda run -n encd-atac-spp bash -c \ 54 | "curl -o $CONDA_BIN/run_spp.R $RUN_SPP && chmod +x $CONDA_BIN/run_spp.R" 55 | 56 | echo "$(date): Installing R packages in Conda environments..." 
57 | CRAN="https://cran.r-project.org/" 58 | conda run -n encd-atac-spp bash -c \ 59 | "Rscript -e \"install.packages('snow', repos='$CRAN')\"" 60 | conda run -n encd-atac-spp bash -c \ 61 | "Rscript -e \"install.packages('snowfall', repos='$CRAN')\"" 62 | conda run -n encd-atac-spp bash -c \ 63 | "Rscript -e \"install.packages('bitops', repos='$CRAN')\"" 64 | conda run -n encd-atac-spp bash -c \ 65 | "Rscript -e \"install.packages('caTools', repos='$CRAN')\"" 66 | conda run -n encd-atac-spp bash -c \ 67 | "Rscript -e \"install.packages('BiocManager', repos='$CRAN')\"" 68 | conda run -n encd-atac-spp bash -c \ 69 | "Rscript -e \"require('BiocManager'); BiocManager::install('Rsamtools'); BiocManager::install('Rcpp')\"" 70 | 71 | echo "$(date): Installing R spp 1.15.5 in Conda environments..." 72 | SPP="https://cran.r-project.org/src/contrib/Archive/spp/spp_1.15.5.tar.gz" 73 | SPP_BASENAME=$(basename $SPP) 74 | curl -o "$CONDA_BIN/$SPP_BASENAME" "$SPP" 75 | conda run -n encd-atac-spp bash -c \ 76 | "Rscript -e \"install.packages('$CONDA_BIN/$SPP_BASENAME')\"" 77 | 78 | echo "$(date): Installing USCS tools (v369)..." 79 | install_ucsc_tools_369 encd-atac 80 | install_ucsc_tools_369 encd-atac-spp 81 | install_ucsc_tools_369 encd-atac-macs2 82 | 83 | echo "$(date): Done successfully." 84 | echo 85 | echo "If you see openssl,readline,ncurses lib errors while running pipelines" 86 | echo "then switch to Singularity method. Conda method will not work on your system." 
87 | 88 | bash ${SH_SCRIPT_DIR}/update_conda_env.sh 89 | -------------------------------------------------------------------------------- /scripts/requirements.macs2.txt: -------------------------------------------------------------------------------- 1 | # Conda environment for tasks (macs2, macs2_signal_track) in atac/chip 2 | 3 | nomkl # using MKL can change MACS2 output randomly on different platforms 4 | python >=3 5 | 6 | macs2 ==2.2.4 7 | bedtools ==2.29.0 8 | bedops ==2.4.39 9 | pybedtools ==0.8.0 10 | pybigwig ==0.3.13 11 | tabix 12 | 13 | matplotlib 14 | ghostscript 15 | 16 | -------------------------------------------------------------------------------- /scripts/requirements.python2.txt: -------------------------------------------------------------------------------- 1 | # Conda environment for python2-based tasks (tss_enrich) in atac/chip 2 | # (metaseq is still in py2) 3 | 4 | python ==2.7.16 5 | 6 | biopython ==1.76 7 | metaseq ==0.5.6 8 | samtools ==1.9 9 | gffutils ==0.10.1 # 0.11.0 is not py2 compatible 10 | 11 | python-dateutil ==2.8.0 12 | grep 13 | tar 14 | ghostscript 15 | -------------------------------------------------------------------------------- /scripts/requirements.spp.txt: -------------------------------------------------------------------------------- 1 | # Conda environment for tasks (spp, xcor) in atac/chip 2 | # some packages (phantompeakquals, r-spp) will be installed separately 3 | # couldn't resolve all conda conflicts 4 | 5 | python >=3 6 | bedtools ==2.29.0 7 | bedops ==2.4.39 8 | 9 | r-base ==3.6.1 10 | 11 | tabix 12 | 13 | matplotlib 14 | pandas 15 | numpy 16 | ghostscript 17 | 18 | -------------------------------------------------------------------------------- /scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | # default Conda environment for atac/chip 2 | 3 | python >=3 4 | bwa ==0.7.17 5 | bowtie2 ==2.3.4.3 6 | tbb ==2020.2 # use old version to fix libtbb.so.2 
error for bowtie2 7 | samtools ==1.9 8 | htslib ==1.9 9 | bedtools ==2.29.0 10 | sambamba ==0.6.6 11 | 12 | pysam ==0.15.3 13 | pybedtools ==0.8.0 14 | pybigwig ==0.3.13 15 | 16 | deeptools ==3.3.1 17 | cutadapt ==2.5 18 | preseq ==2.0.3 19 | pyfaidx ==0.5.5.2 20 | bedops ==2.4.39 21 | 22 | ptools_bin 23 | 24 | jsondiff ==1.1.1 25 | ghostscript 26 | tabix 27 | matplotlib 28 | numpy 29 | scikit-learn 30 | scipy 31 | pandas 32 | jinja2 33 | gsl 34 | 35 | samstats ==0.2.1 36 | idr ==2.0.4.2 37 | 38 | java-jdk 39 | 40 | picard ==2.20.7 41 | trimmomatic ==0.39 42 | 43 | -------------------------------------------------------------------------------- /scripts/uninstall_conda_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PIPELINE_CONDA_ENVS=( 4 | encd-atac 5 | encd-atac-macs2 6 | encd-atac-spp 7 | encd-atac-py2 8 | ) 9 | for PIPELINE_CONDA_ENV in "${PIPELINE_CONDA_ENVS[@]}" 10 | do 11 | conda env remove -n ${PIPELINE_CONDA_ENV} -y 12 | done 13 | -------------------------------------------------------------------------------- /scripts/update_conda_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e # Stop on error 3 | 4 | SH_SCRIPT_DIR=$(cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd) 5 | SRC_DIR=${SH_SCRIPT_DIR}/../src 6 | 7 | PIPELINE_CONDA_ENVS=( 8 | encd-atac 9 | encd-atac-macs2 10 | encd-atac-spp 11 | encd-atac-py2 12 | ) 13 | chmod u+rx ${SRC_DIR}/*.py 14 | 15 | echo "$(date): Updating WDL task wrappers on each Conda environment..." 16 | for PIPELINE_CONDA_ENV in "${PIPELINE_CONDA_ENVS[@]}" 17 | do 18 | CONDA_BIN=$(dirname $(conda run -n ${PIPELINE_CONDA_ENV} which python)) 19 | echo -e "$(date): Transferring WDL task wrappers to ${CONDA_BIN}..." 
def parse_args():
    '''
    Parse command-line options.

    Returns:
        (alignment_cutoff, paired_ended): maximum number of alignments a
        read may have and still be kept, and whether input is paired-end.
    '''
    parser = argparse.ArgumentParser(
        description='Saves reads below a alignment threshold and discards all others')
    parser.add_argument('-k', help='Alignment number cutoff')
    parser.add_argument('--paired-end', dest='paired_ended',
                        action='store_true', help='Data is paired-end')
    args = parser.parse_args()
    alignment_cutoff = int(args.k)
    paired_ended = args.paired_ended

    return alignment_cutoff, paired_ended


def filter_multimappers(lines, alignment_cutoff):
    '''
    Filter a qname-sorted SAM stream (iterable of text lines).

    Header lines (starting with '@') are passed through unchanged.
    Alignment lines are grouped by qname (column 1); a group is emitted
    only if it contains at most `alignment_cutoff` lines, otherwise the
    whole group is discarded.  Surviving lines are yielded in input order.

    Requires qname-sorted input so that each group is contiguous.
    '''
    current_reads = []
    current_qname = ''

    for line in lines:
        read_elems = line.strip().split('\t')

        if read_elems[0].startswith('@'):
            yield line
            continue

        if read_elems[0] == current_qname:
            # same group: keep accumulating
            current_reads.append(line)
            continue

        # qname changed: emit the previous group if it is small enough
        # (groups larger than the cutoff are discarded entirely; kept
        # groups are further filtered downstream with samtools)
        if len(current_reads) and len(current_reads) <= alignment_cutoff:
            for read in current_reads:
                yield read

        current_reads = [line]
        current_qname = read_elems[0]

    # BUGFIX: the original loop never flushed the final qname group at
    # EOF, silently dropping the last read (pair) of every input file.
    if len(current_reads) and len(current_reads) <= alignment_cutoff:
        for read in current_reads:
            yield read


if __name__ == "__main__":
    '''
    Runs the filtering step of choosing multimapped reads.
    Reads a qname-sorted SAM from stdin and writes survivors to stdout.
    '''
    alignment_cutoff, paired_ended = parse_args()

    if paired_ended:
        # a pair contributes two SAM lines per alignment
        alignment_cutoff = int(alignment_cutoff) * 2

    for out_line in filter_multimappers(sys.stdin, alignment_cutoff):
        sys.stdout.write(str(out_line))
def main():
    """CLI entry point: print the auto-detected adapter for the FASTQ at argv[1]."""
    global VERBOSE
    VERBOSE = False  # stay quiet so stdout carries only the adapter sequence
    print(detect_most_likely_adapter(sys.argv[1]))


if __name__ == '__main__':
    main()
diff ../../atac-seq-pipeline/src/encode_task_pool_ta.py encode_task_pool_ta.py 15 | diff ../../atac-seq-pipeline/src/encode_task_qc_report.py encode_task_qc_report.py 16 | diff ../../atac-seq-pipeline/src/encode_task_reproducibility.py encode_task_reproducibility.py 17 | diff ../../atac-seq-pipeline/src/encode_task_spr.py encode_task_spr.py 18 | diff ../../atac-seq-pipeline/src/encode_task_xcor.py encode_task_xcor.py 19 | diff ../../atac-seq-pipeline/src/encode_task_jsd.py encode_task_jsd.py 20 | diff ../../atac-seq-pipeline/src/encode_task_gc_bias.py encode_task_gc_bias.py 21 | 22 | -------------------------------------------------------------------------------- /src/encode_lib_blacklist_filter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ENCODE DCC blacklist filter wrapper 4 | # Author: Jin Lee (leepc12@gmail.com) 5 | 6 | import sys 7 | import os 8 | import argparse 9 | from encode_lib_common import ( 10 | get_ext, get_num_lines, gunzip, log, mkdir_p, 11 | rm_f, run_shell_cmd, strip_ext, strip_ext_bam) 12 | 13 | 14 | def parse_arguments(): 15 | parser = argparse.ArgumentParser(prog='ENCODE DCC Blacklist filter.') 16 | parser.add_argument('peak', type=str, help='Peak file.') 17 | parser.add_argument('--blacklist', type=str, 18 | help='Blacklist BED file.') 19 | parser.add_argument('--regex-bfilt-peak-chr-name', 20 | help='Keep chromosomes matching this pattern only ' 21 | 'in .bfilt. 
def blacklist_filter(peak, blacklist, regex_bfilt_peak_chr_name, out_dir):
    """Remove blacklist-overlapping peaks and keep chromosomes matching a regex.

    Writes a gzipped '<prefix>.bfilt.<ext>.gz' file next to the output dir.
    When a usable blacklist is given, peaks overlapping any blacklist region
    are dropped (`bedtools intersect -v`) and column 5 is capped at 1000;
    otherwise the peak file is only passed through the chromosome-name grep.

    Args:
        peak: peak file (gzipped BED-like; extension preserved in output).
        blacklist: blacklist BED file; None/'' or an empty file disables it.
        regex_bfilt_peak_chr_name: Perl regex; only lines whose text matches
            '<regex>\\b' are kept.  None is treated as ''.
        out_dir: output directory.

    Returns:
        Path of the filtered, gzipped peak file.
    """
    prefix = os.path.join(
        out_dir,
        os.path.basename(strip_ext(peak)))
    peak_ext = get_ext(peak)
    filtered = '{}.bfilt.{}.gz'.format(prefix, peak_ext)
    if regex_bfilt_peak_chr_name is None:
        regex_bfilt_peak_chr_name = ''

    # No-op blacklist path: also taken when either input file is empty,
    # since bedtools would have nothing to subtract.
    if blacklist is None or blacklist == '' or get_num_lines(peak) == 0 \
            or get_num_lines(blacklist) == 0:
        # NOTE(review): with an empty regex the grep pattern degenerates to
        # '\b', which matches any line containing a word character — i.e.
        # effectively a pass-through; confirm this is the intended default.
        cmd = 'zcat -f {} | '
        cmd += 'grep -P \'{}\\b\' | '
        cmd += 'gzip -nc > {}'
        cmd = cmd.format(
            peak,
            regex_bfilt_peak_chr_name,
            filtered)
        run_shell_cmd(cmd)
    else:
        # due to bedtools bug when .gz is given for -a and -b
        tmp1 = gunzip(peak, 'tmp1', out_dir)
        tmp2 = gunzip(blacklist, 'tmp2', out_dir)

        # subtract blacklist regions, cap the score column (5th) at 1000,
        # then apply the chromosome-name filter and recompress
        cmd = 'bedtools intersect -nonamecheck -v -a {} -b {} | '
        cmd += 'awk \'BEGIN{{OFS="\\t"}} '
        cmd += '{{if ($5>1000) $5=1000; print $0}}\' | '
        cmd += 'grep -P \'{}\\b\' | '
        cmd += 'gzip -nc > {}'
        cmd = cmd.format(
            tmp1,  # peak
            tmp2,  # blacklist
            regex_bfilt_peak_chr_name,  # regex
            filtered)
        run_shell_cmd(cmd)
        rm_f([tmp1, tmp2])
    return filtered
def main():
    """CLI entry point: blacklist-filter a peak file into --out-dir."""
    # read params
    args = parse_arguments()
    log.info('Initializing and making output directory...')

    # make out_dir (root of all outputs)
    mkdir_p(args.out_dir)

    # reproducibility QC
    log.info('Filtering peak with blacklist...')
    # BUGFIX: args has no attribute `keep_irregular_chr`; argparse derives
    # `regex_bfilt_peak_chr_name` from the --regex-bfilt-peak-chr-name
    # option, so the original raised AttributeError on every invocation.
    blacklist_filter(
        args.peak, args.blacklist,
        args.regex_bfilt_peak_chr_name, args.out_dir)

    log.info('All done.')


if __name__ == '__main__':
    main()
def frip_shifted(ta, peak, chrsz, fraglen, out_dir):
    """Compute shifted FRiP (Fraction of Reads in Peaks) for ChIP-seq.

    Each read in the TAGALIGN is extended by half the fragment length in
    both directions (strand-aware `bedtools slop`, clipped to valid
    coordinates) before intersecting with the peak file.

    Args:
        ta: TAGALIGN file (gzipped BED-like).
        peak: peak file (gzipped); empty file yields a fraction of 0.
        chrsz: 2-col chromosome sizes file.
        fraglen: estimated fragment length (int).
        out_dir: output directory.

    Returns:
        Path of the written '<prefix>.frip.qc' file (single fraction).
    """
    prefix = os.path.join(out_dir,
                          os.path.basename(strip_ext(peak)))
    frip_qc = '{}.frip.qc'.format(prefix)
    # BUGFIX: use floor division. This code predates Python 3; with true
    # division `(fraglen+1)/2` yields a float (e.g. 50.5) that is pasted
    # into `bedtools slop -l/-r`, which expects integer base pairs unless
    # -pct is given.
    half_fraglen = (fraglen + 1) // 2

    if get_num_lines(peak) == 0:
        # empty peak file: zero reads can fall in peaks
        val1 = 0.0
    else:
        # due to bedtools bug when .gz is given for -a and -b
        tmp2 = gunzip(peak, 'tmp2', out_dir)

        # extend reads by +/- half_fraglen (strand-aware), drop invalid
        # intervals, then count reads overlapping any peak
        cmd = 'bedtools slop -i {} -g {} '
        cmd += '-s -l {} -r {} | '
        cmd += 'awk \'{{if ($2>=0 && $3>=0 && $2<=$3) print $0}}\' | '
        cmd += 'bedtools intersect -nonamecheck -a stdin -b {} '
        cmd += '-wa -u | wc -l'
        cmd = cmd.format(
            ta,
            chrsz,
            -half_fraglen,
            half_fraglen,
            tmp2)  # peak
        val1 = run_shell_cmd(cmd)
        rm_f(tmp2)
    val2 = get_num_lines(ta)
    write_txt(frip_qc, str(float(val1)/float(val2)))
    return frip_qc
def get_fract_reads_in_regions(reads_bed, regions_bed):
    """Count reads overlapping a region set, and the fraction thereof.

    Sorts and merges `regions_bed`, then counts the reads in `reads_bed`
    that overlap any merged region (`bedtools intersect -u`).

    Args:
        reads_bed: TAG-ALIGN/BED file of reads.
        regions_bed: BED file of annotated regions.

    Returns:
        (intersect_read_count, fract_reads) tuple; fract_reads is 0.0 for
        an empty reads file.
    """
    # uses new run_shell_cmd
    cmd = "bedtools sort -i {} | "
    cmd += "bedtools merge -i stdin | "
    cmd += "bedtools intersect -u -nonamecheck -a {} -b stdin | "
    cmd += "wc -l"
    cmd = cmd.format(regions_bed, reads_bed)
    intersect_read_count = int(run_shell_cmd(cmd))
    total_read_count = get_num_lines(reads_bed)
    # BUGFIX(robustness): guard against an empty reads file, which made
    # the original raise ZeroDivisionError.
    if total_read_count == 0:
        fract_reads = 0.0
    else:
        fract_reads = float(intersect_read_count) / total_read_count

    return intersect_read_count, fract_reads
def parse_arguments():
    """Build and parse CLI arguments for BAM-to-pBAM conversion.

    Returns:
        argparse.Namespace with bam, ref_fa, delete_original_bam,
        out_dir and log_level; also configures the module logger.
    """
    parser = argparse.ArgumentParser(prog='ENCODE bam to pbam',
                                     description='')
    parser.add_argument('bam', type=str,
                        help='Path for BAM.')
    parser.add_argument('--ref-fa', type=str,
                        help='Path for reference fasta.')
    parser.add_argument('--delete-original-bam', action='store_true',
                        help='Delete original BAM after conversion.')
    parser.add_argument('--out-dir', default='', type=str,
                        help='Output directory.')
    parser.add_argument('--log-level', default='INFO',
                        # BUGFIX: 'CRITICAL' was listed twice and the list
                        # was out of severity order; the accepted set of
                        # values is unchanged.
                        choices=['NOTSET', 'DEBUG', 'INFO',
                                 'WARNING', 'ERROR', 'CRITICAL'],
                        help='Log level')
    args = parser.parse_args()

    log.setLevel(args.log_level)
    log.info(sys.argv)
    return args
mkdir_p(args.out_dir) 48 | 49 | # generate read length file 50 | log.info('Converting BAM into pBAM...') 51 | bam_to_pbam(args.bam, args.ref_fa, args.out_dir) 52 | 53 | if args.delete_original_bam: 54 | log.info('Deleting original BAM...') 55 | rm_f(args.bam) 56 | 57 | log.info('List all files in output directory...') 58 | ls_l(args.out_dir) 59 | 60 | log.info('All done.') 61 | 62 | 63 | if __name__ == '__main__': 64 | main() 65 | -------------------------------------------------------------------------------- /src/encode_task_count_signal_track.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ENCODE DCC Count signal track generation 4 | # Author: Jin Lee (leepc12@gmail.com) 5 | 6 | import sys 7 | import os 8 | import argparse 9 | from encode_lib_common import ( 10 | log, ls_l, mkdir_p, rm_f, run_shell_cmd, strip_ext_ta, 11 | get_gnu_sort_param, 12 | ) 13 | 14 | 15 | def parse_arguments(): 16 | parser = argparse.ArgumentParser( 17 | prog='ENCODE DCC Count signal track generation') 18 | parser.add_argument('ta', type=str, 19 | help='Path for TAGALIGN file.') 20 | parser.add_argument('--chrsz', type=str, 21 | help='2-col chromosome sizes file.') 22 | parser.add_argument('--mem-gb', type=float, default=4.0, 23 | help='Max. memory for this job in GB. ' 24 | 'This will be used to determine GNU sort -S (defaulting to 0.5 of this value). 
def count_signal_track(ta, chrsz, mem_gb, out_dir):
    """Generate stranded 5'-end count signal bigWig tracks from a TAGALIGN.

    For each strand, the TAGALIGN is coordinate-sorted, converted to a
    5'-end coverage bedGraph (`bedtools genomecov -5 -bg -strand`), then
    converted to bigWig with `bedGraphToBigWig`.  Intermediate bedGraphs
    are deleted before returning.

    Args:
        ta: TAGALIGN file (gzipped BED-like).
        chrsz: 2-col chromosome sizes file.
        mem_gb: total memory budget in GB; half of it is handed to GNU
            sort via get_gnu_sort_param.
        out_dir: output directory.

    Returns:
        (pos_bw, neg_bw): paths of the +strand and -strand bigWig files.
    """
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta)))
    pos_bw = '{}.positive.bigwig'.format(prefix)
    neg_bw = '{}.negative.bigwig'.format(prefix)
    # temporary files
    pos_bedgraph = '{}.positive.bedgraph'.format(prefix)
    neg_bedgraph = '{}.negative.bedgraph'.format(prefix)

    temp_files = []

    # + strand: sort, then 5'-end single-base coverage as bedGraph
    run_shell_cmd(
        'zcat -f {ta} | sort -k1,1 -k2,2n {sort_param} | '
        'bedtools genomecov -5 -bg -strand + -g {chrsz} -i stdin > {pos_bedgraph}'.format(
            ta=ta,
            sort_param=get_gnu_sort_param(mem_gb * 1024 ** 3, ratio=0.5),
            chrsz=chrsz,
            pos_bedgraph=pos_bedgraph,
        )
    )

    # - strand: same pipeline with -strand -
    run_shell_cmd(
        'zcat -f {ta} | sort -k1,1 -k2,2n {sort_param} | '
        'bedtools genomecov -5 -bg -strand - -g {chrsz} -i stdin > {neg_bedgraph}'.format(
            ta=ta,
            sort_param=get_gnu_sort_param(mem_gb * 1024 ** 3, ratio=0.5),
            chrsz=chrsz,
            neg_bedgraph=neg_bedgraph,
        )
    )

    # convert each bedGraph to bigWig
    run_shell_cmd(
        'bedGraphToBigWig {pos_bedgraph} {chrsz} {pos_bw}'.format(
            pos_bedgraph=pos_bedgraph,
            chrsz=chrsz,
            pos_bw=pos_bw,
        )
    )

    run_shell_cmd(
        'bedGraphToBigWig {neg_bedgraph} {chrsz} {neg_bw}'.format(
            neg_bedgraph=neg_bedgraph,
            chrsz=chrsz,
            neg_bw=neg_bw,
        )
    )

    # remove temporary files
    temp_files.append(pos_bedgraph)
    temp_files.append(neg_bedgraph)
    rm_f(temp_files)

    return pos_bw, neg_bw
def main(): 94 | # read params 95 | args = parse_arguments() 96 | 97 | log.info('Initializing and making output directory...') 98 | mkdir_p(args.out_dir) 99 | 100 | log.info('Generating count signal tracks...') 101 | pos_bw, neg_bw = count_signal_track( 102 | args.ta, 103 | args.chrsz, 104 | args.mem_gb, 105 | args.out_dir 106 | ) 107 | 108 | log.info('List all files in output directory...') 109 | ls_l(args.out_dir) 110 | 111 | log.info('All done.') 112 | 113 | 114 | if __name__ == '__main__': 115 | main() 116 | -------------------------------------------------------------------------------- /src/encode_task_frac_mito.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ENCODE frac mito 4 | # Author: Jin Lee (leepc12@gmail.com) 5 | 6 | import sys 7 | import os 8 | import argparse 9 | from encode_lib_common import ( 10 | log, ls_l, mkdir_p, strip_ext) 11 | from encode_lib_log_parser import parse_flagstat_qc 12 | 13 | 14 | def parse_arguments(): 15 | parser = argparse.ArgumentParser( 16 | prog='ENCODE frac mito', 17 | description='Calculates fraction of mito reads') 18 | parser.add_argument('non_mito_samstat', type=str, 19 | help='Path for SAMstats log file') 20 | parser.add_argument('mito_samstat', type=str, 21 | help='Path for SAMstats log file (mito only)') 22 | parser.add_argument('--out-dir', default='', type=str, 23 | help='Output directory.') 24 | parser.add_argument('--log-level', default='INFO', 25 | choices=['NOTSET', 'DEBUG', 'INFO', 26 | 'WARNING', 'CRITICAL', 'ERROR', 27 | 'CRITICAL'], 28 | help='Log level') 29 | args = parser.parse_args() 30 | 31 | log.setLevel(args.log_level) 32 | log.info(sys.argv) 33 | return args 34 | 35 | 36 | def frac_mito(non_mito_samstat, mito_samstat, out_dir): 37 | prefix = os.path.join( 38 | out_dir, 39 | os.path.basename(strip_ext(non_mito_samstat, 40 | 'non_mito.samstats.qc'))) 41 | frac_mito_qc = '{}.frac_mito.qc'.format(prefix) 42 | 43 | 
non_mito_samstat_dict = parse_flagstat_qc(non_mito_samstat) 44 | mito_samstat_dict = parse_flagstat_qc(mito_samstat) 45 | 46 | if 'mapped' in non_mito_samstat_dict: 47 | # backward compatibility (old key name was 'total') 48 | key_mapped = 'mapped' 49 | elif 'mapped_reads' in non_mito_samstat_dict: 50 | key_mapped = 'mapped_reads' 51 | Rn = non_mito_samstat_dict[key_mapped] 52 | 53 | if 'mapped' in mito_samstat_dict: 54 | # backward compatibility (old key name was 'total') 55 | key_mapped = 'mapped' 56 | elif 'mapped_reads' in mito_samstat_dict: 57 | key_mapped = 'mapped_reads' 58 | Rm = mito_samstat_dict[key_mapped] 59 | 60 | frac = float(Rm)/float(Rn + Rm) 61 | with open(frac_mito_qc, 'w') as fp: 62 | fp.write('non_mito_reads\t{}\n'.format(Rn)) 63 | fp.write('mito_reads\t{}\n'.format(Rm)) 64 | fp.write('frac_mito_reads\t{}\n'.format(frac)) 65 | 66 | return frac_mito_qc 67 | 68 | 69 | def main(): 70 | # read params 71 | args = parse_arguments() 72 | log.info('Initializing and making output directory...') 73 | mkdir_p(args.out_dir) 74 | 75 | frac_mito(args.non_mito_samstat, 76 | args.mito_samstat, 77 | args.out_dir) 78 | 79 | log.info('List all files in output directory...') 80 | ls_l(args.out_dir) 81 | 82 | log.info('All done.') 83 | 84 | 85 | if __name__ == '__main__': 86 | main() 87 | -------------------------------------------------------------------------------- /src/encode_task_jsd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ENCODE DCC fingerprint/JSD plot wrapper 4 | # Author: Jin Lee (leepc12@gmail.com) 5 | 6 | import sys 7 | import os 8 | import argparse 9 | from encode_lib_common import ( 10 | log, ls_l, mkdir_p, rm_f, run_shell_cmd, strip_ext_bam) 11 | from encode_lib_genomic import ( 12 | samtools_index) 13 | 14 | from encode_lib_blacklist_filter import blacklist_filter_bam 15 | 16 | 17 | def parse_arguments(): 18 | parser = argparse.ArgumentParser( 19 | prog='ENCODE DCC 
def fingerprint(bams, ctl_bam, blacklist, mapq_thresh, nth, out_dir):
    """Run deepTools plotFingerprint on blacklist-filtered BAMs and split
    its quality-metrics log into one JSD QC file per experiment replicate.

    Args:
        bams: list of filtered experiment BAM paths (rep1, rep2, ...).
        ctl_bam: filtered control BAM path, or '' to run without control.
        blacklist: blacklist BED used to filter every BAM first.
        mapq_thresh: MAPQ cutoff (plotFingerprint --minMappingQuality).
        nth: number of threads.
        out_dir: output directory.

    Returns:
        (plot_png, jsd_qcs): fingerprint plot path and per-replicate QC paths.
    """
    # blacklist-filter then index every BAM (plotFingerprint needs .bai)
    filtered_bams = []
    for bam in bams:
        filtered_bam = blacklist_filter_bam(bam, blacklist, out_dir)
        samtools_index(filtered_bam, nth)
        filtered_bams.append(filtered_bam)
    filtered_ctl_bam = None
    if ctl_bam:
        filtered_ctl_bam = blacklist_filter_bam(ctl_bam, blacklist, out_dir)
        samtools_index(filtered_ctl_bam, nth)

    prefix = os.path.join(out_dir,
                          os.path.basename(strip_ext_bam(bams[0])))
    plot_png = '{}.jsd_plot.png'.format(prefix)
    tmp_log = '{}.jsd.tmp'.format(prefix)

    labels = []
    bam_paths = []
    jsd_qcs = []
    for i, bam in enumerate(filtered_bams):
        prefix_ = os.path.join(out_dir,
                               os.path.basename(strip_ext_bam(bam)))
        # bug fix: the old "'rep{}.{}'.format(i+1, prefix_)" prepended
        # 'repN.' to the WHOLE path (prefix_ already contains out_dir),
        # pointing into a non-existent 'repN.<out_dir>' directory whenever
        # out_dir is non-empty; open() below then failed.
        jsd_qcs.append('{}.jsd.qc'.format(prefix_))
        labels.append('rep{}'.format(i+1))  # repN
        bam_paths.append(bam)
    # add control
    if filtered_ctl_bam:
        labels.append('ctl1')
        bam_paths.append(filtered_ctl_bam)

    cmd = 'LC_ALL=en_US.UTF-8 LANG=en_US.UTF-8 plotFingerprint -b {} '
    if filtered_ctl_bam:
        cmd += '--JSDsample {} '.format(filtered_ctl_bam)
    cmd += '--labels {} '
    cmd += '--outQualityMetrics {} '
    cmd += '--minMappingQuality {} '
    cmd += '-T "Fingerprints of different samples" '
    cmd += '--numberOfProcessors {} '
    cmd += '--plotFile {}'
    cmd = cmd.format(
        ' '.join(bam_paths),
        ' '.join(labels),
        tmp_log,
        mapq_thresh,
        nth,
        plot_png)
    run_shell_cmd(cmd)

    # remove intermediate files (blacklist-filtered BAM)
    if filtered_ctl_bam:
        rm_f(filtered_ctl_bam)
    rm_f(filtered_bams)

    # parse tmp_log to get jsd_qc for each exp replicate
    # (line 0 is the header; one data line per labelled sample)
    with open(tmp_log, 'r') as fp:
        for i, line in enumerate(fp.readlines()):  # i is rep_id-1
            if i == 0:
                continue
            if i > len(jsd_qcs):
                # trailing control line(s): no per-replicate QC for them
                break
            with open(jsd_qcs[i-1], 'w') as fp2:
                # removing repN from lines
                fp2.write('\t'.join(line.strip().split('\t')[1:]))
    rm_f(tmp_log)
    return plot_png, jsd_qcs
def merge_fastqs(fastqs, end, out_dir):
    """Concatenate gzipped FASTQs into one gzipped file under $out_dir/$end.

    The merged filename is derived from the first FASTQ's basename; a
    '.merged' tag is inserted only when more than one input is combined.
    """
    end_dir = os.path.join(out_dir, end)
    mkdir_p(end_dir)
    base = os.path.basename(strip_ext_fastq(fastqs[0]))
    prefix = os.path.join(end_dir, base)

    suffix = '.merged.fastq.gz' if len(fastqs) > 1 else '.fastq.gz'
    merged = '{}{}'.format(prefix, suffix)

    run_shell_cmd(
        'zcat -f {} | gzip -nc > {}'.format(' '.join(fastqs), merged))
    return merged
def pool_ta(tas, col, basename_prefix, out_dir):
    """Pool (concatenate) two or more TAGALIGN/BED files into one gzip.

    If `col` is given, only the first `col` columns are kept. The output
    basename comes from `basename_prefix` when provided, otherwise from
    the first input's stripped basename. Raises ValueError for < 2 inputs.
    """
    if len(tas) <= 1:
        raise ValueError('Needs at least two TAs (or BEDs) to be pooled.')

    if basename_prefix is not None:
        basename = basename_prefix
    else:
        basename = os.path.basename(strip_ext_ta(tas[0]))
    pooled_ta = '{}.pooled.tagAlign.gz'.format(
        os.path.join(out_dir, basename))

    # build the shell pipeline stage by stage
    stages = ['zcat -f {}'.format(' '.join(tas))]
    if col is not None:
        stages.append('cut -f 1-{}'.format(col))
    stages.append('gzip -nc > {}'.format(pooled_ta))
    run_shell_cmd(' | '.join(stages))
    return pooled_ta
def parse_arguments():
    """Build and parse command-line arguments for the post-align task.

    Returns:
        argparse.Namespace with fastq, bam, chrsz, mito_chr_name, nth,
        mem_gb, out_dir and log_level.
    """
    parser = argparse.ArgumentParser(prog='ENCODE post align',
                                     description='')
    parser.add_argument('fastq', type=str,
                        help='Path for FASTQ R1')
    parser.add_argument('bam', type=str,
                        help='Path for BAM')
    parser.add_argument(
        '--chrsz', type=str,
        help='2-col chromosome sizes file. If not given then '
             # typo fix: 'calcaulted' -> 'calculated'
             'SAMstats on mito-free BAM will not be calculated.')
    parser.add_argument('--mito-chr-name', default='chrM',
                        help='Mito chromosome name.')
    parser.add_argument('--nth', type=int, default=1,
                        help='Number of threads to parallelize.')
    parser.add_argument('--mem-gb', type=float,
                        help='Max. memory for samtools sort in GB. '
                             'It should be total memory for this task (not memory per thread).')
    parser.add_argument('--out-dir', default='', type=str,
                        help='Output directory.')
    # fix: the choices list contained 'CRITICAL' twice
    parser.add_argument('--log-level', default='INFO',
                        choices=['NOTSET', 'DEBUG', 'INFO',
                                 'WARNING', 'ERROR', 'CRITICAL'],
                        help='Log level')
    args = parser.parse_args()

    log.setLevel(args.log_level)
    log.info(sys.argv)
    return args
def parse_arguments():
    """Build and parse command-line arguments for post_call_peak (atac).

    Returns:
        argparse.Namespace; args.blacklist is normalized to '' when it is
        None or points at a 'null' placeholder file.
    """
    parser = argparse.ArgumentParser(prog='ENCODE post_call_peak (atac)',
                                     description='')
    parser.add_argument(
        'peak', type=str,
        help='Path for PEAK file. Peak filename should be "*.*Peak.gz". '
             'e.g. rep1.narrowPeak.gz')
    parser.add_argument('--ta', type=str,
                        help='TAG-ALIGN file.')
    parser.add_argument('--peak-type', type=str, required=True,
                        choices=['narrowPeak', 'regionPeak',
                                 'broadPeak', 'gappedPeak'],
                        help='Peak file type.')
    parser.add_argument('--chrsz', type=str,
                        help='2-col chromosome sizes file.')
    parser.add_argument('--blacklist', type=str,
                        help='Blacklist BED file.')
    parser.add_argument('--regex-bfilt-peak-chr-name',
                        help='Keep chromosomes matching this pattern only '
                             'in .bfilt. peak files.')
    parser.add_argument('--mem-gb', type=float, default=4.0,
                        help='Max. memory for this job in GB. '
                             'This will be used to determine GNU sort -S (defaulting to 0.5 of this value). '
                             'It should be total memory for this task (not memory per thread).')
    parser.add_argument('--out-dir', default='', type=str,
                        help='Output directory.')
    # fix: the choices list contained 'CRITICAL' twice
    parser.add_argument('--log-level', default='INFO',
                        choices=['NOTSET', 'DEBUG', 'INFO',
                                 'WARNING', 'ERROR', 'CRITICAL'],
                        help='Log level')
    args = parser.parse_args()
    if args.blacklist is None or args.blacklist.endswith('null'):
        args.blacklist = ''

    log.setLevel(args.log_level)
    log.info(sys.argv)
    return args
def parse_arguments():
    """Build and parse command-line arguments for post_call_peak (chip).

    Returns:
        argparse.Namespace; args.blacklist is normalized to '' when it is
        None or points at a 'null' placeholder file.
    """
    parser = argparse.ArgumentParser(prog='ENCODE post_call_peak (chip)',
                                     description='')
    parser.add_argument('peak', type=str,
                        help='Path for PEAK file. Peak filename should be "*.*Peak.gz". '
                             'e.g. rep1.narrowPeak.gz')
    parser.add_argument('--ta', type=str,
                        help='TAG-ALIGN file.')
    parser.add_argument('--peak-type', type=str, required=True,
                        choices=['narrowPeak', 'regionPeak',
                                 'broadPeak', 'gappedPeak'],
                        help='Peak file type.')
    parser.add_argument('--fraglen', type=int, required=True,
                        help='Fragment length.')
    parser.add_argument('--chrsz', type=str,
                        help='2-col chromosome sizes file.')
    parser.add_argument('--blacklist', type=str,
                        help='Blacklist BED file.')
    parser.add_argument('--regex-bfilt-peak-chr-name',
                        help='Keep chromosomes matching this pattern only '
                             'in .bfilt. peak files.')
    parser.add_argument('--mem-gb', type=float, default=4.0,
                        help='Max. memory for this job in GB. '
                             'This will be used to determine GNU sort -S (defaulting to 0.5 of this value). '
                             'It should be total memory for this task (not memory per thread).')
    parser.add_argument('--out-dir', default='', type=str,
                        help='Output directory.')
    # fix: the choices list contained 'CRITICAL' twice
    parser.add_argument('--log-level', default='INFO',
                        choices=['NOTSET', 'DEBUG', 'INFO',
                                 'WARNING', 'ERROR', 'CRITICAL'],
                        help='Log level')
    args = parser.parse_args()
    if args.blacklist is None or args.blacklist.endswith('null'):
        args.blacklist = ''

    log.setLevel(args.log_level)
    log.info(sys.argv)
    return args
def parse_arguments():
    """Build and parse command-line arguments for the control TAG-ALIGN
    subsampler.

    Raises:
        ValueError: if --subsample is not a positive integer.
    """
    parser = argparse.ArgumentParser(
        prog='ENCODE DCC control TAG-ALIGN subsampler.'
        'This script does not check if number of reads in TA is higher than '
        'subsampling number (--subsample). '
        'If number of reads in TA is lower than subsampling number then '
        'TA will be just shuffled.')
    parser.add_argument('ta', type=str,
                        help='Path for control TAGALIGN file.')
    parser.add_argument('--paired-end', action="store_true",
                        help='Paired-end TAGALIGN.')
    parser.add_argument('--subsample', default=0, type=int,
                        help='Number of reads to subsample.')
    parser.add_argument('--out-dir', default='', type=str,
                        help='Output directory.')
    # fix: the choices list contained 'CRITICAL' twice
    parser.add_argument('--log-level', default='INFO',
                        choices=['NOTSET', 'DEBUG', 'INFO',
                                 'WARNING', 'ERROR', 'CRITICAL'],
                        help='Log level')
    args = parser.parse_args()
    # fix: 'if not args.subsample' only rejected 0; a negative value
    # slipped through to the downstream subsampling command.
    if args.subsample <= 0:
        raise ValueError('--subsample should be a positive integer.')

    log.setLevel(args.log_level)
    log.info(sys.argv)
    return args
def trim_fastq(fastq, trim_bp, out_dir):
    """Trim every read in a FASTQ down to trim_bp basepairs.

    Runs trimfastq.py and gzips its output. If the output stream carries
    the 'sequences shorter than desired length' warning (presumably
    written by trimfastq.py when reads are already shorter than trim_bp
    — behavior inherited from the original code), the untrimmed FASTQ is
    copied over the trimmed output instead.
    """
    basename = os.path.basename(strip_ext_fastq(fastq))
    prefix = os.path.join(out_dir, basename)
    trimmed = '{}.trim_{}bp.fastq.gz'.format(prefix, trim_bp)

    run_shell_cmd(
        'python $(which trimfastq.py) {} {} | gzip -nc > {}'.format(
            fastq, trim_bp, trimmed))

    # count occurrences of the too-short warning in the output stream
    count_cmd = (
        'zcat -f {} | (grep \'sequences shorter than desired length\' '
        '|| true) | wc -l'
    ).format(trimmed)
    if int(run_shell_cmd(count_cmd)) > 0:
        # reads were shorter than trim_bp: fall back to the original FASTQ
        copy_f_to_f(fastq, trimmed)

    return trimmed
--------------------------------------------------------------------------------