├── .dockerignore ├── .editorconfig ├── .github └── ISSUE_TEMPLATE │ ├── bug_report.yml │ ├── config.yml │ ├── feature_request.yml │ └── question.yml ├── .gitignore ├── .gitlab-ci.yml ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── bin ├── create_pairs_report.py ├── workflow-glue └── workflow_glue │ ├── __init__.py │ ├── check_bam_headers_in_dir.py │ ├── check_sample_sheet.py │ ├── check_xam_index.py │ ├── configure_igv.py │ ├── get_max_depth_locus.py │ ├── report.py │ ├── tests │ ├── __init__.py │ └── test_test.py │ └── util.py ├── data └── OPTIONAL_FILE ├── docs ├── 01_brief_description.md ├── 02_introduction.md ├── 03_compute_requirements.md ├── 04_install_and_run.md ├── 05_related_protocols.md ├── 06_input_example.md ├── 06_input_parameters.md ├── 07_outputs.md ├── 08_pipeline_overview.md ├── 09_troubleshooting.md ├── 10_FAQ.md └── 11_other.md ├── lib ├── ArgumentParser.groovy ├── CWUtil.groovy ├── NfcoreSchema.groovy ├── NfcoreTemplate.groovy ├── Pinguscript.groovy ├── WorkflowMain.groovy ├── common.nf ├── ingress.nf └── nfcore_external_java_deps.jar ├── main.nf ├── modules └── local │ ├── 4dn.nf │ ├── common.nf │ └── pore-c.nf ├── nextflow.config ├── nextflow_schema.json ├── output_definition.json ├── subworkflows └── local │ └── prepare_genome.nf └── test_data ├── bams ├── barcode01 │ └── porec_test.concatemers.bam └── barcode02 │ └── porec_test.concatemers.bam ├── bams_dir ├── shard_0001.bam ├── shard_0002.bam ├── shard_0003.bam └── shard_0004.bam ├── porec_test.concatemers.bam ├── porec_test.concatemers.fastq ├── porec_test.fasta ├── porec_test.fasta.fai ├── porec_test.monomer.fastq ├── porec_test.params.json ├── porec_test.phased_variants.vcf.gz ├── porec_test.phased_variants.vcf.gz.tbi ├── porec_test_no_index.phased_variants.vcf.gz ├── sample_sheet.csv ├── sample_sheet_cutter.csv ├── sample_sheet_no_tbi.csv ├── sample_sheet_no_vcf.csv └── tests.pairs.stats.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | bin 3 | CHANGELOG.md 4 | data 5 | lib 6 | LICENSE 7 | main.nf 8 | nextflow.config 9 | README.md 10 | test_data 11 | # we typically run tests with outputs to these: 12 | output 13 | work 14 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | end_of_line = lf 6 | insert_final_newline = true 7 | trim_trailing_whitespace = true 8 | indent_size = 4 9 | indent_style = space 10 | 11 | [*.{md,yml,yaml,html,css,scss,js,cff}] 12 | indent_size = 2 13 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: File a bug report 3 | labels: ["triage"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thanks for taking the time to fill out this bug report! 9 | 10 | 11 | - type: markdown 12 | attributes: 13 | value: | 14 | # Background 15 | - type: dropdown 16 | id: os 17 | attributes: 18 | label: Operating System 19 | description: What operating system are you running? 
20 | options: 21 | - Windows 10 22 | - Windows 11 23 | - macOS 24 | - Ubuntu 22.04 25 | - CentOS 7 26 | - Other Linux (please specify below) 27 | validations: 28 | required: true 29 | - type: input 30 | id: other-os 31 | attributes: 32 | label: Other Linux 33 | placeholder: e.g. Fedora 38 34 | - type: input 35 | id: version 36 | attributes: 37 | label: Workflow Version 38 | description: This is most easily found in the workflow output log 39 | placeholder: v1.2.3 40 | validations: 41 | required: true 42 | - type: dropdown 43 | id: execution 44 | attributes: 45 | label: Workflow Execution 46 | description: Where are you running the workflow? 47 | options: 48 | - EPI2ME Desktop (Local) 49 | - EPI2ME Desktop (Cloud) 50 | - Command line (Local) 51 | - Command line (Cluster) 52 | - Other (please describe) 53 | validations: 54 | required: true 55 | - type: input 56 | id: other-workflow-execution 57 | attributes: 58 | label: Other workflow execution 59 | description: If "Other", please describe 60 | placeholder: Tell us where / how you are running the workflow. 61 | 62 | - type: markdown 63 | attributes: 64 | value: | 65 | # EPI2ME Desktop Application 66 | If you are using the application please provide the following. 67 | - type: input 68 | id: labs-version 69 | attributes: 70 | label: EPI2ME Version 71 | description: Available from the application settings page. 72 | placeholder: v5.1.1 73 | validations: 74 | required: false 75 | 76 | 77 | - type: markdown 78 | attributes: 79 | value: | 80 | # Command-line execution 81 | If you are using nextflow on a command-line, please provide the following. 82 | - type: textarea 83 | id: cli-command 84 | attributes: 85 | label: CLI command run 86 | description: Please tell us the command you are running 87 | placeholder: e.g. nextflow run epi2me-labs/wf-human-variations -profile standard --fastq my-reads/fastq 88 | validations: 89 | required: false 90 | - type: dropdown 91 | id: profile 92 | attributes: 93 | label: Workflow Execution - CLI Execution Profile 94 | description: Which execution profile are you using? If you are using a custom profile or nextflow configuration, please give details below. 95 | options: 96 | - standard (default) 97 | - singularity 98 | - custom 99 | validations: 100 | required: false 101 | 102 | 103 | - type: markdown 104 | attributes: 105 | value: | 106 | # Report details 107 | - type: textarea 108 | id: what-happened 109 | attributes: 110 | label: What happened? 111 | description: Also tell us, what did you expect to happen? 112 | placeholder: Tell us what you see! 113 | validations: 114 | required: true 115 | - type: textarea 116 | id: logs 117 | attributes: 118 | label: Relevant log output 119 | description: For CLI execution please include the full output from running nextflow. For execution from the EPI2ME application please copy the contents of the "Workflow logs" panel from the "Logs" tab corresponding to your workflow instance. (This will be automatically formatted into code, so no need for backticks). 120 | render: shell 121 | validations: 122 | required: true 123 | - type: textarea 124 | id: activity-log 125 | attributes: 126 | label: Application activity log entry 127 | description: For use with the EPI2ME application please see the Settings > View Activity Log page, and copy the contents of any items listed in red using the Copy to clipboard button. 
128 | render: shell 129 | validations: 130 | required: false 131 | - type: dropdown 132 | id: run-demo 133 | attributes: 134 | label: Were you able to successfully run the latest version of the workflow with the demo data? 135 | description: For CLI execution, were you able to successfully run the workflow using the demo data available in the [Install and run](./README.md#install-and-run) section of the `README.md`? For execution in the EPI2ME application, were you able to successfully run the workflow via the "Use demo data" button? 136 | options: 137 | - 'yes' 138 | - 'no' 139 | - other (please describe below) 140 | validations: 141 | required: true 142 | - type: textarea 143 | id: demo-other 144 | attributes: 145 | label: Other demo data information 146 | render: shell 147 | validations: 148 | required: false 149 | 150 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Nanopore customer support 4 | url: https://nanoporetech.com/contact 5 | about: For general support, including bioinformatics questions. 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: Feature request 2 | description: Suggest an idea for this project 3 | labels: ["feature request"] 4 | body: 5 | 6 | - type: textarea 7 | id: question1 8 | attributes: 9 | label: Is your feature related to a problem? 10 | placeholder: A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 11 | validations: 12 | required: true 13 | - type: textarea 14 | id: question2 15 | attributes: 16 | label: Describe the solution you'd like 17 | placeholder: A clear and concise description of what you want to happen. 18 | validations: 19 | required: true 20 | - type: textarea 21 | id: question3 22 | attributes: 23 | label: Describe alternatives you've considered 24 | placeholder: A clear and concise description of any alternative solutions or features you've considered. 25 | validations: 26 | required: true 27 | - type: textarea 28 | id: question4 29 | attributes: 30 | label: Additional context 31 | placeholder: Add any other context about the feature request here. 32 | validations: 33 | required: false 34 | 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.yml: -------------------------------------------------------------------------------- 1 | name: Question 2 | description: Ask a generic question about this project unrelated to features or bugs. 3 | labels: ["question"] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Please reserve this form for issues not related to bugs or feature requests. If our developers deem your questions to be related to bugs or features you will be asked to fill in the appropriate form. 9 | - type: textarea 10 | id: question1 11 | attributes: 12 | label: Ask away! 13 | placeholder: | 14 | Bad question: How do I use this workflow in my HPC cluster? 15 | Good question: My HPC cluster uses a GridEngine scheduler. Can you point me to documentation for how to use your workflows to efficiently submit jobs to my cluster? 
16 | validations: 17 | required: true 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | nextflow 2 | .nextflow* 3 | template-workflow 4 | .*.swp 5 | .*.swo 6 | *.pyc 7 | *.pyo 8 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | # Include shared CI 2 | include: 3 | - project: "epi2melabs/ci-templates" 4 | file: "wf-containers.yaml" 5 | 6 | 7 | variables: 8 | # Workflow inputs given to nextflow. 9 | # The workflow should define `--out_dir`, the CI template sets this. 10 | # Only common file inputs and option values need to be given here 11 | # (not things such as -profile) 12 | NF_BEFORE_SCRIPT: "mkdir -p ${CI_PROJECT_NAME}/data/ && wget -O ${CI_PROJECT_NAME}/data/wf-pore-c-demo.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-pore-c/wf-pore-c-demo.tar.gz && tar -xzvf ${CI_PROJECT_NAME}/data/wf-pore-c-demo.tar.gz -C ${CI_PROJECT_NAME}/data/" 13 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 14 | --bam ${CI_PROJECT_NAME}/data/wf-pore-c-demo/porec_test.concatemers.bam --chunk_size 100 --ref \ 15 | ${CI_PROJECT_NAME}/data/wf-pore-c-demo/porec_test.fasta \ 16 | --cutter NlaIII \ 17 | --vcf ${CI_PROJECT_NAME}/data/wf-pore-c-demo/porec_test.phased_variants.vcf.gz \ 18 | --paired_end_minimum_distance 100 --paired_end_maximum_distance 200 --hi_c --mcool --paired_end" 19 | CI_FLAVOUR: "new" 20 | 21 | macos-run: 22 | tags: 23 | - macos 24 | - x86 25 | 26 | docker-run: 27 | parallel: 28 | matrix: 29 | - MATRIX_NAME: [ 30 | "no-sample-sheet-chunk", "sample-sheet", "sample-sheet-cutter", 31 | "fastq", "chromunity", "input-dir-hic", "demo", 32 | "sample-sheet-and-chunk-size", "vcf-no-tbi", 33 | "sample-sheet-no-vcf", "sample-sheet-vcf-no-tbi"] 34 | rules: 35 | - if: ($CI_COMMIT_BRANCH == null || $CI_COMMIT_BRANCH == "dev-template") 36 | when: never 37 | - if: $MATRIX_NAME == "no-sample-sheet-chunk" 38 | variables: 39 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 40 | --bam test_data/porec_test.concatemers.bam --chunk_size 100 --ref \ 41 | test_data/porec_test.fasta \ 42 | --cutter NlaIII \ 43 | --vcf test_data/porec_test.phased_variants.vcf.gz --pairs \ 44 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 45 | --paired_end_maximum_distance 200 --bed" 46 | - if: $MATRIX_NAME == "sample-sheet" 47 | variables: 48 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 49 | --bam test_data/bams --ref \ 50 | test_data/porec_test.fasta \ 51 | --pairs --chunk_size 0 \ 52 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 53 | --paired_end_maximum_distance 200 --sample_sheet test_data/sample_sheet.csv \ 54 | --max_monomers 8" 55 | NF_IGNORE_PROCESSES: "index_bam" 56 | - if: $MATRIX_NAME == "sample-sheet-cutter" 57 | variables: 58 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 59 | --bam test_data/bams --ref \ 60 | test_data/porec_test.fasta \ 61 | --pairs --chunk_size 0 \ 62 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 63 | --paired_end_maximum_distance 200 --sample_sheet test_data/sample_sheet_cutter.csv \ 64 | --max_monomers 8" 65 | NF_IGNORE_PROCESSES: "index_bam" 66 | - if: $MATRIX_NAME == "input-dir-hic" 67 | variables: 68 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 69 | --bam test_data/bams_dir 
--chunk_size 500 --ref \ 70 | test_data/porec_test.fasta \ 71 | --cutter NlaIII \ 72 | --vcf test_data/porec_test.phased_variants.vcf.gz \ 73 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 74 | --paired_end_maximum_distance 200 --hi_c" 75 | - if: $MATRIX_NAME == "fastq" 76 | variables: 77 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 78 | --fastq test_data/porec_test.concatemers.fastq --ref \ 79 | test_data/porec_test.fasta \ 80 | --cutter NlaIII \ 81 | --vcf test_data/porec_test.phased_variants.vcf.gz --pairs \ 82 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 83 | --paired_end_maximum_distance 200" 84 | NF_IGNORE_PROCESSES: "index_bam" 85 | - if: $MATRIX_NAME == "chromunity" 86 | variables: 87 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 88 | --fastq test_data/porec_test.concatemers.fastq --ref \ 89 | test_data/porec_test.fasta \ 90 | --mcool_resolutions 1000,2000,3000 --cutter NlaIII \ 91 | --vcf test_data/porec_test.phased_variants.vcf.gz --pairs \ 92 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 93 | --paired_end_maximum_distance 200 --chromunity --chromunity_merge_distance 5 --mcool" 94 | NF_IGNORE_PROCESSES: "index_bam" 95 | - if: $MATRIX_NAME == "demo" 96 | variables: 97 | NF_BEFORE_SCRIPT: "mkdir -p ${CI_PROJECT_NAME}/data/ && wget -O ${CI_PROJECT_NAME}/data/wf-pore-c-demo.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-pore-c/wf-pore-c-demo.tar.gz && tar -xzvf ${CI_PROJECT_NAME}/data/wf-pore-c-demo.tar.gz -C ${CI_PROJECT_NAME}/data/" 98 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 99 | --bam ${CI_PROJECT_NAME}/data/wf-pore-c-demo/porec_test.concatemers.bam --chunk_size 100 --ref \ 100 | ${CI_PROJECT_NAME}/data/wf-pore-c-demo/porec_test.fasta \ 101 | --cutter NlaIII \ 102 | --vcf ${CI_PROJECT_NAME}/data/wf-pore-c-demo/porec_test.phased_variants.vcf.gz \ 103 | --paired_end_minimum_distance 100 --paired_end_maximum_distance 200 --hi_c --mcool --paired_end" 104 | - if: $MATRIX_NAME == "sample-sheet-and-chunk-size" 105 | variables: 106 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 107 | --bam test_data/bams --ref \ 108 | test_data/porec_test.fasta \ 109 | --pairs --chunk_size 100 \ 110 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 111 | --paired_end_maximum_distance 200 --sample_sheet test_data/sample_sheet.csv" 112 | - if: $MATRIX_NAME == "vcf-no-tbi" 113 | variables: 114 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 115 | --fastq test_data/porec_test.concatemers.fastq --ref \ 116 | test_data/porec_test.fasta \ 117 | --cutter NlaIII \ 118 | --vcf test_data/porec_test_no_index.phased_variants.vcf.gz --pairs \ 119 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 120 | --paired_end_maximum_distance 20" 121 | - if: $MATRIX_NAME == "sample-sheet-no-vcf" 122 | variables: 123 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 124 | --bam test_data/bams --ref \ 125 | test_data/porec_test.fasta \ 126 | --pairs --chunk_size 100 \ 127 | --chromunity --paired_end --filter_pairs --paired_end_minimum_distance 100 \ 128 | --paired_end_maximum_distance 200 --sample_sheet test_data/sample_sheet_no_vcf.csv" 129 | - if: $MATRIX_NAME == "sample-sheet-vcf-no-tbi" 130 | variables: 131 | NF_WORKFLOW_OPTS: "-executor.\\$$local.memory 32GB \ 132 | --bam test_data/bams --ref \ 133 | test_data/porec_test.fasta \ 134 | --pairs --chunk_size 100 \ 135 | --chromunity --paired_end 
--filter_pairs --paired_end_minimum_distance 100 \ 136 | --paired_end_maximum_distance 200 \ 137 | --sample_sheet test_data/sample_sheet_no_tbi.csv" -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: docs_readme 5 | name: docs_readme 6 | entry: parse_docs -p docs -e .md -s 01_brief_description 02_introduction 03_compute_requirements 04_install_and_run 05_related_protocols 06_input_example 06_input_parameters 07_outputs 08_pipeline_overview 09_troubleshooting 10_FAQ 11_other -ot README.md -od output_definition.json -ns nextflow_schema.json 7 | language: python 8 | always_run: true 9 | pass_filenames: false 10 | additional_dependencies: 11 | - epi2melabs==0.0.57 12 | - id: build_models 13 | name: build_models 14 | entry: datamodel-codegen --strict-nullable --base-class workflow_glue.results_schema_helpers.BaseModel --use-schema-description --disable-timestamp --input results_schema.yml --input-file-type openapi --output bin/workflow_glue/results_schema.py 15 | language: python 16 | files: 'results_schema.yml' 17 | pass_filenames: false 18 | additional_dependencies: 19 | - datamodel-code-generator 20 | - repo: https://github.com/pycqa/flake8 21 | rev: 5.0.4 22 | hooks: 23 | - id: flake8 24 | pass_filenames: false 25 | additional_dependencies: 26 | - flake8-rst-docstrings 27 | - flake8-docstrings 28 | - flake8-import-order 29 | - flake8-forbid-visual-indent 30 | - pep8-naming 31 | - flake8-no-types 32 | - flake8-builtins 33 | - flake8-absolute-import 34 | - flake8-print 35 | args: [ 36 | "bin", 37 | "--import-order-style=google", 38 | "--statistics", 39 | "--max-line-length=88", 40 | "--extend-exclude=bin/workflow_glue/results_schema.py", 41 | ] 42 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [v1.3.0] 8 | ### Added 9 | - Workflow will generate VCF index file if it doesn't exist. 10 | ### Fixed 11 | - Set format of `--bam` and `--fastq` in schema to `path`, to enable directories to be selected as input in the EPI2ME application. 12 | ### Removed 13 | - Empty alignment stats plots which are not relevant to this workflow. 14 | 15 | ### [v1.2.2] 16 | ### Fixed 17 | - Capitalised modified base tags additionally removed from monomers if no modified bases for a monomer. 18 | 19 | ## [v1.2.1] 20 | ### Fixed 21 | - bamindex fetch error when running more than one sample and `--chunk_size` is greater than 0. 22 | 23 | ## [v1.2.0] 24 | ### Fixed 25 | - `--bed` parameter will now output BED file using the paired_end BAM file. 26 | - Reduce memory usage of BED file creation and sorting. 27 | - Increased memory allocation for `prepare_hic` and `merge_mcool` processes. 28 | - If sample sheet provided and cutter column not present the workflow will instead use `--cutter` parameter. 29 | ### Changed 30 | - Bump pore-c-py to v2.1.4 to prevent issues with modified base tags and strip minimap2 tags from inputs. 
31 | ### Added 32 | - Reduce peak memory usage of minimap2 by adding `--cap-kalloc 100m --cap-sw-mem 50m` to the minimap2 command in the `digest_align_annotate` process. 33 | 34 | ## [v1.1.0] 35 | ### Added 36 | - `--bed` parameter which if set to true will output a BED file that is compatible with downstream tools including the scaffolder [Yahs](https://github.com/c-zhou/yahs). 37 | - `--pairtools_chunksize` parameter which exposes pairtools dedup chunksize parameter, in case peak memory usage of hi_c process needs to be reduced. 38 | - `digest_align_annotate` process uses dedicated pore_c_py container. 39 | - `--max_monomers` parameter, which is set to 250 by default, will filter out any reads that have more than this number of monomers. These reads will not be included in the analysis. 40 | - Output a `filtered_out/{alias}.bam` with any reads that are filtered out due to the max_monomers parameter. 41 | 42 | ## [v1.0.0] 43 | ### Changed 44 | - New documentation. 45 | 46 | ## [v0.2.0] 47 | ### Fixed 48 | - Pairtools merge step single quote the input directory so it will not error with Argument list too long. 49 | - Chromunity parquet files now contain the correct column names. 50 | ### Changed 51 | - `--ubam` parameter has been renamed `--bam` 52 | - All other ubam related parameters have been renamed with bam for consistency 53 | - The `--bam_map_threads`, `--digest_annotate_threads` and `bam_bam2fq_threads` threading parameters are now automatically extracted from the `--threads` specifying the maximum number of threads to use for a process. 54 | ### Removed 55 | - Default local executor CPU and RAM limits. 56 | 57 | ## [v0.1.1] 58 | ### Changed 59 | - If `--hi_c` parameter set to true the pairs file will be created. 60 | 61 | ## [v0.1.0] 62 | ### Changed 63 | - GitHub issue templates 64 | - Nextflow minimum version 23.04.2. 65 | - `--sample_id` parameter has been changed to `--sample` for consistency. 66 | - `--summary_json` optional parameter with default set to true, to include an annotation summary json in outputs. 67 | - Remove `--params_sheet` parameter and add all per sample parameters to sample_sheet. 68 | 69 | ### Added 70 | - `--hi_c` optional parameter with default set to false, to include a `.hic` output file which is compatible with [Juice box](https://www.aidenlab.org/juicebox/). 71 | 72 | ## [v0.0.8] 73 | * Improve schema parameter explanations and output file descriptions in the README. 74 | * Add a default `--chunk_size` parameter value of 25000. 75 | * Update fastcat which removes need to index ubam. 76 | * Enum choices are enumerated in the `--help` output. 77 | * Enum choices are enumerated as part of the error message when a user has selected an invalid choice. 78 | * Bumped minimum required Nextflow version to 22.10.8. 79 | 80 | ### Fixed 81 | - Replaced `--threads` option in fastqingress with hardcoded values to remove warning about undefined `param.threads` 82 | 83 | ## [v0.0.7] 84 | ### Fixed 85 | - Testing for the cooler tool. 86 | 87 | ## [v0.0.6] 88 | ### Added 89 | - Configuration for running demo data in AWS 90 | 91 | ## [v0.0.5] 92 | ### Fixed 93 | - Broken heat map in the pairtools report. 94 | - Meta table repeated tabs. 95 | - Nextflow config example cmd. 96 | 97 | ### Added 98 | - Cutter parameter help text link to Restriction Enzyme options. 99 | 100 | ## [v0.0.4] 101 | ### Added 102 | - Changed LICENSE to Oxford Nanopore Technologies PLC. Public License Version 1.0. 
103 | - Test for Chromunity writer 104 | 105 | ### Fixed 106 | - Use latest pore-c-py package with fix for the modified bases digest step. 107 | 108 | ## [v0.0.3] 109 | ### Fixed 110 | - Reduce time by using bamindex instead of splitting bam. 111 | 112 | ### Changed 113 | - Replace input check with fastq ingress. 114 | - Parameters to input fastq or ubam. 115 | - Output a basic report. 116 | 117 | ## [v0.0.2] 118 | ### Fixed 119 | - Create pairs report handling of missing references in pairs file. 120 | 121 | ### Changed 122 | - Update Pore-c-py package used to v2.0.1 123 | - Improved performance 124 | - Use one pipe for digest, align and annotate processes. 125 | 126 | ## [v0.0.1] 127 | * First release of Wf-Pore-C 128 | 129 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Oxford Nanopore Technologies PLC. Public License Version 1.0 2 | ============================================================= 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor’s Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Executable Form" 25 | means any form of the work other than Source Code Form. 26 | 27 | 1.6. "Larger Work" 28 | means a work that combines Covered Software with other material, in 29 | a separate file or files, that is not Covered Software. 30 | 31 | 1.7. "License" 32 | means this document. 33 | 34 | 1.8. "Licensable" 35 | means having the right to grant, to the maximum extent possible, 36 | whether at the time of the initial grant or subsequently, any and 37 | all of the rights conveyed by this License. 38 | 39 | 1.9. "Modifications" 40 | means any of the following: 41 | 42 | (a) any file in Source Code Form that results from an addition to, 43 | deletion from, or modification of the contents of Covered 44 | Software; or 45 | (b) any new file in Source Code Form that contains any Covered 46 | Software. 47 | 48 | 1.10. "Research Purposes" 49 | means use for internal research and not intended for or directed 50 | towards commercial advantages or monetary compensation; provided, 51 | however, that monetary compensation does not include sponsored 52 | research of research funded by grants. 53 | 54 | 1.11 "Secondary License" 55 | means either the GNU General Public License, Version 2.0, the GNU 56 | Lesser General Public License, Version 2.1, the GNU Affero General 57 | Public License, Version 3.0, or any later versions of those 58 | licenses. 59 | 60 | 1.12. "Source Code Form" 61 | means the form of the work preferred for making modifications. 62 | 63 | 1.13. "You" (or "Your") 64 | means an individual or a legal entity exercising rights under this 65 | License. For legal entities, "You" includes any entity that 66 | controls, is controlled by, or is under common control with You. 
For 67 | purposes of this definition, "control" means (a) the power, direct 68 | or indirect, to cause the direction or management of such entity, 69 | whether by contract or otherwise, or (b) ownership of more than 70 | fifty percent (50%) of the outstanding shares or beneficial 71 | ownership of such entity. 72 | 73 | 2. License Grants and Conditions 74 | -------------------------------- 75 | 76 | 2.1. Grants 77 | 78 | Each Contributor hereby grants You a world-wide, royalty-free, 79 | non-exclusive license under Contributor copyrights Licensable by such 80 | Contributor to use, reproduce, make available, modify, display, 81 | perform, distribute, and otherwise exploit solely for Research Purposes 82 | its Contributions, either on an unmodified basis, with Modifications, 83 | or as part of a Larger Work. 84 | 85 | 2.2. Effective Date 86 | 87 | The licenses granted in Section 2.1 with respect to any Contribution 88 | become effective for each Contribution on the date the Contributor 89 | first distributes such Contribution. 90 | 91 | 2.3. Limitations on Grant Scope 92 | 93 | The licenses granted in this Section 2 are the only rights granted under 94 | this License. No additional rights or licenses will be implied from the 95 | distribution or licensing of Covered Software under this License. The 96 | License is incompatible with Secondary Licenses. Notwithstanding 97 | Section 2.1 above, no copyright license is granted: 98 | 99 | (a) for any code that a Contributor has removed from Covered Software; 100 | or 101 | 102 | (b) use of the Contributions or its Contributor Version other than for 103 | Research Purposes only; or 104 | 105 | (c) for infringements caused by: (i) Your and any other third party’s 106 | modifications of Covered Software, or (ii) the combination of its 107 | Contributions with other software (except as part of its Contributor 108 | Version). 109 | 110 | This License does not grant any rights in the patents, trademarks, 111 | service marks, or logos of any Contributor (except as may be necessary 112 | to comply with the notice requirements in Section 3.4). 113 | 114 | 2.4. Subsequent Licenses 115 | 116 | No Contributor makes additional grants as a result of Your choice to 117 | distribute the Covered Software under a subsequent version of this 118 | License (see Section 10.2) or under the terms of a Secondary License 119 | (if permitted under the terms of Section 3.3). 120 | 121 | 2.5. Representation 122 | 123 | Each Contributor represents that the Contributor believes its 124 | Contributions are its original creation(s) or it has sufficient rights 125 | to grant the rights to its Contributions conveyed by this License. 126 | 127 | 2.6. Fair Use 128 | 129 | This License is not intended to limit any rights You have under 130 | applicable copyright doctrines of fair use, fair dealing, or other 131 | equivalents. 132 | 133 | 2.7. Conditions 134 | 135 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 136 | in Section 2.1. 137 | 138 | 3. Responsibilities 139 | ------------------- 140 | 141 | 3.1. Distribution of Source Form 142 | 143 | All distribution of Covered Software in Source Code Form, including any 144 | Modifications that You create or to which You contribute, must be under 145 | the terms of this License. You must inform recipients that the Source 146 | Code Form of the Covered Software is governed by the terms of this 147 | License, and how they can obtain a copy of this License. 
You may not 148 | attempt to alter or restrict the recipients’ rights in the Source Code Form. 149 | 150 | 3.2. Distribution of Executable Form 151 | 152 | If You distribute Covered Software in Executable Form then: 153 | 154 | (a) such Covered Software must also be made available in Source Code 155 | Form, as described in Section 3.1, and You must inform recipients of 156 | the Executable Form how they can obtain a copy of such Source Code 157 | Form by reasonable means in a timely manner, at a charge no more 158 | than the cost of distribution to the recipient; and 159 | 160 | (b) You may distribute such Executable Form under the terms of this 161 | License. 162 | 163 | 3.3. Distribution of a Larger Work 164 | 165 | You may create and distribute a Larger Work under terms of Your choice, 166 | provided that You also comply with the requirements of this License for 167 | the Covered Software. The Larger Work may not be a combination of Covered 168 | Software with a work governed by one or more Secondary Licenses. 169 | 170 | 3.4. Notices 171 | 172 | You may not remove or alter the substance of any license notices 173 | (including copyright notices, patent notices, disclaimers of warranty, 174 | or limitations of liability) contained within the Source Code Form of 175 | the Covered Software, except that You may alter any license notices to 176 | the extent required to remedy known factual inaccuracies. 177 | 178 | 3.5. Application of Additional Terms 179 | 180 | You may not choose to offer, or charge a fee for use of the Covered 181 | Software or a fee for, warranty, support, indemnity or liability 182 | obligations to one or more recipients of Covered Software. You must 183 | make it absolutely clear that any such warranty, support, indemnity, or 184 | liability obligation is offered by You alone, and You hereby agree to 185 | indemnify every Contributor for any liability incurred by such 186 | Contributor as a result of warranty, support, indemnity or liability 187 | terms You offer. You may include additional disclaimers of warranty and 188 | limitations of liability specific to any jurisdiction. 189 | 190 | 4. Inability to Comply Due to Statute or Regulation 191 | --------------------------------------------------- 192 | 193 | If it is impossible for You to comply with any of the terms of this 194 | License with respect to some or all of the Covered Software due to 195 | statute, judicial order, or regulation then You must: (a) comply with 196 | the terms of this License to the maximum extent possible; and (b) 197 | describe the limitations and the code they affect. Such description must 198 | be placed in a text file included with all distributions of the Covered 199 | Software under this License. Except to the extent prohibited by statute 200 | or regulation, such description must be sufficiently detailed for a 201 | recipient of ordinary skill to be able to understand it. 202 | 203 | 5. Termination 204 | -------------- 205 | 206 | 5.1. The rights granted under this License will terminate automatically 207 | if You fail to comply with any of its terms. 208 | 209 | 5.2. If You initiate litigation against any entity by asserting an 210 | infringement claim (excluding declaratory judgment actions, 211 | counter-claims, and cross-claims) alleging that a Contributor Version 212 | directly or indirectly infringes, then the rights granted to 213 | You by any and all Contributors for the Covered Software under Section 214 | 2.1 of this License shall terminate. 215 | 216 | 5.3. 
In the event of termination under Sections 5.1 or 5.2 above, all 217 | end user license agreements (excluding distributors and resellers) which 218 | have been validly granted by You or Your distributors under this License 219 | prior to termination shall survive termination. 220 | 221 | ************************************************************************ 222 | * * 223 | * 6. Disclaimer of Warranty * 224 | * ------------------------- * 225 | * * 226 | * Covered Software is provided under this License on an "as is" * 227 | * basis, without warranty of any kind, either expressed, implied, or * 228 | * statutory, including, without limitation, warranties that the * 229 | * Covered Software is free of defects, merchantable, fit for a * 230 | * particular purpose or non-infringing. The entire risk as to the * 231 | * quality and performance of the Covered Software is with You. * 232 | * Should any Covered Software prove defective in any respect, You * 233 | * (not any Contributor) assume the cost of any necessary servicing, * 234 | * repair, or correction. This disclaimer of warranty constitutes an * 235 | * essential part of this License. No use of any Covered Software is * 236 | * authorized under this License except under this disclaimer. * 237 | * * 238 | ************************************************************************ 239 | 240 | ************************************************************************ 241 | * * 242 | * 7. Limitation of Liability * 243 | * -------------------------- * 244 | * * 245 | * Under no circumstances and under no legal theory, whether tort * 246 | * (including negligence), contract, or otherwise, shall any * 247 | * Contributor, or anyone who distributes Covered Software as * 248 | * permitted above, be liable to You for any direct, indirect, * 249 | * special, incidental, or consequential damages of any character * 250 | * including, without limitation, damages for lost profits, loss of * 251 | * goodwill, work stoppage, computer failure or malfunction, or any * 252 | * and all other commercial damages or losses, even if such party * 253 | * shall have been informed of the possibility of such damages. This * 254 | * limitation of liability shall not apply to liability for death or * 255 | * personal injury resulting from such party’s negligence to the * 256 | * extent applicable law prohibits such limitation, but in such event, * 257 | * and to the greatest extent permissible, damages will be limited to * 258 | * direct damages not to exceed one hundred dollars. Some * 259 | * jurisdictions do not allow the exclusion or limitation of * 260 | * incidental or consequential damages, so this exclusion and * 261 | * limitation may not apply to You. * 262 | * * 263 | ************************************************************************ 264 | 265 | 8. Litigation 266 | ------------- 267 | 268 | Any litigation relating to this License may be brought only in the 269 | courts of a jurisdiction where the defendant maintains its principal 270 | place of business and such litigation shall be governed by laws of that 271 | jurisdiction, without reference to its conflict-of-law provisions. 272 | Nothing in this Section shall prevent a party’s ability to bring 273 | cross-claims or counter-claims. 274 | 275 | 9. Miscellaneous 276 | ---------------- 277 | 278 | This License represents the complete agreement concerning the subject 279 | matter hereof. 
If any provision of this License is held to be 280 | unenforceable, such provision shall be reformed only to the extent 281 | necessary to make it enforceable. Any law or regulation which provides 282 | that the language of a contract shall be construed against the drafter 283 | shall not be used to construe this License against a Contributor. 284 | 285 | 10. Versions of the License 286 | --------------------------- 287 | 288 | 10.1. New Versions 289 | 290 | Oxford Nanopore Technologies PLC. is the license steward. Except as 291 | provided in Section 10.3, no one other than the license steward has the 292 | right to modify or publish new versions of this License. Each version 293 | will be given a distinguishing version number. 294 | 295 | 10.2. Effect of New Versions 296 | 297 | You may distribute the Covered Software under the terms of the version 298 | of the License under which You originally received the Covered Software, 299 | or under the terms of any subsequent version published by the license 300 | steward. 301 | 302 | 10.3. Modified Versions 303 | 304 | If you create software not governed by this License, and you want to 305 | create a new license for such software, you may create and use a 306 | modified version of this License if you rename the license and remove 307 | any references to the name of the license steward (except to note that 308 | such modified license differs from this License). 309 | 310 | Exhibit A - Source Code Form License Notice 311 | ------------------------------------------- 312 | 313 | This Source Code Form is subject to the terms of the Oxford Nanopore 314 | Technologies PLC. Public License, v. 1.0. Full licence can be found 315 | obtained from support@nanoporetech.com 316 | 317 | If it is not possible or desirable to put the notice in a particular 318 | file, then You may include the notice in a location (such as a LICENSE 319 | file in a relevant directory) where a recipient would be likely to look 320 | for such a notice. 321 | 322 | You may add additional accurate notices of copyright ownership. 
323 | -------------------------------------------------------------------------------- /bin/create_pairs_report.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Create pairs report.""" 3 | 4 | from collections import defaultdict 5 | from typing import List, Tuple 6 | 7 | import pandas as pd 8 | import panel as pn 9 | import typer 10 | import hvplot.pandas # noqa 11 | 12 | pn.extension() 13 | 14 | PAIR_TYPES = { 15 | "W": "walk", 16 | "N": "null", 17 | "X": "corrupt", 18 | "M": "multi", 19 | "R": "rescued", 20 | "U": "unique", 21 | "D": "duplicate", 22 | } 23 | # https://github.com/4dn-dcic/pairsqc/blob/master/pairsqc.py 24 | ORI_NAMES = dict(zip(["+-", "-+", "++", "--"], ["Inner", "Outer", "Right", "Left"])) 25 | 26 | 27 | # %% 28 | def _parse_totals_table(data=List[Tuple[str, str]]): 29 | """Parse totals table.""" 30 | res = [] 31 | total = 0 32 | for key, val in data: 33 | key = key.strip() 34 | if key == "total": 35 | section = "all" 36 | total = int(val) 37 | elif key in ("total_unmapped", "total_single_sided_mapped", "total_mapped"): 38 | section = "mapping" 39 | elif key in ("total_dups", "total_nodups"): 40 | section = "duplicates" 41 | elif key in ("cis", "trans"): 42 | section = "cis/trans" 43 | elif key.startswith("cis_"): 44 | section = "distance" 45 | else: 46 | raise ValueError(f"#{key}#") 47 | 48 | res.append((section, key, int(val))) 49 | df = pd.DataFrame(res, columns=["Section", "Type", "Count"]) 50 | df["Perc. of Total"] = df["Count"] / total * 100.0 51 | df["Perc. of Section"] = df.groupby("Section")["Count"].transform( 52 | lambda x: 100.0 * x / x.sum() 53 | ) 54 | return df 55 | 56 | 57 | def _parse_pair_types(data=List[Tuple[str, str]]): 58 | """Parse pair types.""" 59 | res = [] 60 | for code, val in data: 61 | left, right = code[0], code[1] 62 | label = f"{PAIR_TYPES[left]}-{PAIR_TYPES[right]}" 63 | res.append((code, left, right, label, int(val))) 64 | df = pd.DataFrame(res, columns=["code", "left", "right", "label", "pairs"]) 65 | df["perc"] = 100.0 * df["pairs"] / df["pairs"].sum() 66 | return df 67 | 68 | 69 | def _parse_chrom_freq(data=List[Tuple[str, str]]): 70 | """Parse chrom freq.""" 71 | res = [] 72 | for code, val in data: 73 | chr1, chr2 = code.split("/") 74 | res.append((chr1, chr2, int(val))) 75 | 76 | df = ( 77 | pd.DataFrame(res, columns=["chrom1", "chrom2", "count"]) 78 | .set_index(["chrom1", "chrom2"]) 79 | .sort_index() 80 | .unstack(fill_value=0) 81 | ) 82 | df = df.xs("count", axis=1) 83 | return df 84 | 85 | 86 | def _parse_summary(data=List[Tuple[str, str]]): 87 | """Parse summary.""" 88 | res = [] 89 | for key, val in data: 90 | res.append({"statistic": key, "value": float(val)}) 91 | return pd.DataFrame(res) 92 | 93 | 94 | def _parse_dist_freq(data=List[Tuple[str, str]]): 95 | """Parse dist freq.""" 96 | res = [] 97 | for key, val in data: 98 | interval, ori = key.split("/") 99 | interval = interval.strip() 100 | if interval.endswith("+"): 101 | bin_left = bin_right = interval[:-1] 102 | else: 103 | bin_left, bin_right = interval.split("-") 104 | res.append( 105 | (int(bin_left), int(bin_right), ori, ORI_NAMES[ori] + f" ({ori})", int(val)) 106 | ) 107 | res = pd.DataFrame( 108 | res, columns=["bin_left", "bin_right", "ori", "ori_name", "count"] 109 | ) 110 | return res 111 | 112 | 113 | def read_pairs_stats(path): 114 | """Read Pairs stats.""" 115 | _data = defaultdict(list) 116 | with open(path) as f: 117 | for i in f: 118 | if "/" not in i: 119 | table = "totals" 120 | 
else: 121 | table, i = i.split("/", 1) 122 | _data[table].append(tuple(i.strip().split("\t"))) 123 | totals = _parse_totals_table(_data["totals"]) 124 | pair_types = _parse_pair_types(_data["pair_types"]) 125 | chrom_freq = _parse_chrom_freq(_data["chrom_freq"]) 126 | summary = _parse_summary(_data["summary"]) 127 | dist_freq = _parse_dist_freq(_data["dist_freq"]) 128 | return totals, pair_types, chrom_freq, summary, dist_freq 129 | 130 | 131 | def main(pair_stats, report_html, show_chroms=None): 132 | """Entry point.""" 133 | totals, pair_types, chrom_freq, summary, dist_freq = read_pairs_stats( 134 | pair_stats) 135 | totals_pane = pn.Column( 136 | pn.Row( 137 | pn.pane.DataFrame(totals.set_index( 138 | ["Section", "Type"]), width=600), 139 | totals.query("Section == 'mapping'").hvplot.bar( 140 | x="Section", 141 | y="Perc. of Total", 142 | by="Type", 143 | hover_cols=["Count", "Perc. of Total"], 144 | stacked=True, 145 | width=400, 146 | title="Mapping Rate", 147 | ), 148 | ), 149 | totals.query("Section == 'distance'").hvplot.bar( 150 | x="Type", y="Perc. of Section", 151 | title="Genomic Distance Distribution" 152 | ), 153 | ) 154 | 155 | pair_type_pane = pn.Column( 156 | pair_types.hvplot.bar( 157 | x="label", y="perc", hover_cols=["pairs"], title="Pair Types" 158 | ), 159 | pn.pane.DataFrame(pair_types, width=600), 160 | ) 161 | show_chroms_columns = chrom_freq.columns 162 | show_chroms_index = chrom_freq.index 163 | if show_chroms is not None: 164 | show_chroms_columns = show_chroms_columns.intersection(show_chroms) 165 | show_chroms_index = show_chroms_index.intersection(show_chroms) 166 | chrom_freq = chrom_freq.reindex( 167 | index=show_chroms_index, columns=show_chroms_columns, fill_value=0 168 | ) 169 | chrom_contact_pane = pn.Row( 170 | chrom_freq.hvplot.heatmap( 171 | width=600, 172 | height=600, 173 | colorbar=False, 174 | rot=45, 175 | colormap="viridis", 176 | title="Contact Count", 177 | ), 178 | chrom_freq 179 | .pipe(lambda x: x.div(x.sum(axis=0), axis=1)) 180 | .hvplot.heatmap( 181 | width=600, 182 | height=600, 183 | colorbar=False, 184 | rot=45, 185 | colormap="viridis", 186 | title="Contact Proportion (normalized by Chromosome)", 187 | ), 188 | ) 189 | 190 | distance_pane = pn.Row( 191 | dist_freq.hvplot.line( 192 | x="bin_right", by="ori_name", y="count", logx=True) 193 | ) 194 | 195 | report = pn.Tabs( 196 | ("Pairs", totals_pane), 197 | ("Pair Types", pair_type_pane), 198 | ("Chrom Contacts", chrom_contact_pane), 199 | ("Distance", distance_pane), 200 | ) 201 | report.save(report_html) 202 | 203 | 204 | if __name__ == "__main__": 205 | typer.run(main) 206 | -------------------------------------------------------------------------------- /bin/workflow-glue: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Entrypoint of pseudo-package for all the code used in the workflow.""" 3 | 4 | from workflow_glue import cli 5 | 6 | if __name__ == "__main__": 7 | cli() 8 | -------------------------------------------------------------------------------- /bin/workflow_glue/__init__.py: -------------------------------------------------------------------------------- 1 | """Workflow Python code.""" 2 | import argparse 3 | import glob 4 | import importlib 5 | import os 6 | import sys 7 | 8 | from .util import _log_level, get_main_logger # noqa: ABS101 9 | 10 | 11 | __version__ = "0.0.1" 12 | _package_name = "workflow_glue" 13 | 14 | 15 | def get_components(allowed_components=None): 16 | """Find a list of workflow 
command scripts.""" 17 | logger = get_main_logger(_package_name) 18 | path = os.path.dirname(os.path.abspath(__file__)) 19 | components = dict() 20 | for fname in glob.glob(os.path.join(path, "*.py")): 21 | name = os.path.splitext(os.path.basename(fname))[0] 22 | if name in ("__init__", "util"): 23 | continue 24 | if allowed_components is not None and name not in allowed_components: 25 | continue 26 | 27 | # leniently attempt to import module 28 | try: 29 | mod = importlib.import_module(f"{_package_name}.{name}") 30 | except ModuleNotFoundError as e: 31 | # if imports cannot be satisfied, refuse to add the component 32 | # rather than exploding 33 | logger.warn(f"Could not load {name} due to missing module {e.name}") 34 | continue 35 | 36 | # if there's a main() and an argparser(), that's good enough for us. 37 | try: 38 | req = "main", "argparser" 39 | if all(callable(getattr(mod, x)) for x in req): 40 | components[name] = mod 41 | except Exception: 42 | pass 43 | return components 44 | 45 | 46 | def cli(): 47 | """Run workflow entry points.""" 48 | logger = get_main_logger(_package_name) 49 | logger.info("Bootstrapping CLI.") 50 | parser = argparse.ArgumentParser( 51 | 'wf-glue', 52 | parents=[_log_level()], 53 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 54 | 55 | parser.add_argument( 56 | '-v', '--version', action='version', 57 | version='%(prog)s {}'.format(__version__)) 58 | 59 | subparsers = parser.add_subparsers( 60 | title='subcommands', description='valid commands', 61 | help='additional help', dest='command') 62 | subparsers.required = True 63 | 64 | # importing everything can take time, try to shortcut 65 | if len(sys.argv) > 1: 66 | components = get_components(allowed_components=[sys.argv[1]]) 67 | if not sys.argv[1] in components: 68 | logger.warn("Importing all modules, this may take some time.") 69 | components = get_components() 70 | else: 71 | components = get_components() 72 | 73 | # add all module parsers to main CLI 74 | for name, module in components.items(): 75 | p = subparsers.add_parser( 76 | name.split(".")[-1], parents=[module.argparser()]) 77 | p.set_defaults(func=module.main) 78 | 79 | args = parser.parse_args() 80 | 81 | logger.info("Starting entrypoint.") 82 | args.func(args) 83 | -------------------------------------------------------------------------------- /bin/workflow_glue/check_bam_headers_in_dir.py: -------------------------------------------------------------------------------- 1 | """Check whether `@SQ` lines are the same in all (u)BAM headers in a directory.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from .util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("checkBamHdr") 14 | 15 | if not args.input_path.is_dir(): 16 | raise ValueError(f"Input path '{args.input_path}' must be a directory.") 17 | 18 | target_files = list(args.input_path.glob("*")) 19 | if not target_files: 20 | raise ValueError(f"No files found in input directory '{args.input_path}'.") 21 | # Loop over target files and check if there are `@SQ` lines in all headers or not. 22 | # Set `is_unaligned` accordingly. If there are mixed headers (either with some files 23 | # containing `@SQ` lines and some not or with different files containing different 24 | # `@SQ` lines), set `mixed_headers` to `True`. 25 | # Also check the `SO` field of the `@HD` line to determine whether the file is (un)sorted.
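# For illustration (hypothetical values): each header below is reduced to a list of
# dicts such as {"SN": "chr1", "LN": 248956422, "M5": None}, so files aligned to the
# same reference still compare equal even if other @SQ tags such as UR differ.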
26 | first_sq_lines = None 27 | mixed_headers = False 28 | sorted_xam = False 29 | for xam_file in target_files: 30 | # get the `@SQ` and `@HD` lines in the header 31 | with pysam.AlignmentFile(xam_file, check_sq=False) as f: 32 | # compare only the SN/LN/M5 elements of SQ to avoid labelling XAM with 33 | # same reference but different SQ.UR as mixed_header (see CW-4842) 34 | sq_lines = [{ 35 | "SN": sq["SN"], 36 | "LN": sq["LN"], 37 | "M5": sq.get("M5"), 38 | } for sq in f.header.get("SQ", [])] 39 | hd_lines = f.header.get("HD") 40 | # Check if it is sorted. 41 | # When there is more than one BAM, merging/sorting 42 | # will happen regardless of this flag. 43 | if hd_lines is not None and hd_lines.get('SO') == 'coordinate': 44 | sorted_xam = True 45 | if first_sq_lines is None: 46 | # this is the first file 47 | first_sq_lines = sq_lines 48 | else: 49 | # this is a subsequent file; check with the first `@SQ` lines 50 | if sq_lines != first_sq_lines: 51 | mixed_headers = True 52 | break 53 | 54 | # we set `is_unaligned` to `True` if there were no mixed headers and the last file 55 | # didn't have `@SQ` lines (as we can then be sure that none of the files did) 56 | is_unaligned = not mixed_headers and not sq_lines 57 | # write `is_unaligned` and `mixed_headers` out so that they can be set as env. 58 | # variables 59 | sys.stdout.write( 60 | f"IS_UNALIGNED={int(is_unaligned)};" + 61 | f"MIXED_HEADERS={int(mixed_headers)};" + 62 | f"IS_SORTED={int(sorted_xam)}" 63 | ) 64 | logger.info(f"Checked (u)BAM headers in '{args.input_path}'.") 65 | 66 | 67 | def argparser(): 68 | """Argument parser for entrypoint.""" 69 | parser = wf_parser("check_bam_headers_in_dir") 70 | parser.add_argument("input_path", type=Path, help="Path to target directory") 71 | return parser 72 | -------------------------------------------------------------------------------- /bin/workflow_glue/check_sample_sheet.py: -------------------------------------------------------------------------------- 1 | """Check if a sample sheet is valid.""" 2 | import codecs 3 | import csv 4 | import os 5 | import re 6 | import sys 7 | 8 | from .util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | # Some Excel users save their CSV as UTF-8 (and occasionally for a reason beyond my 12 | # comprehension, UTF-16); Excel then adds a byte order mark (unnecessarily for UTF-8 13 | # I should add). If we do not handle this with the correct encoding, the mark will 14 | # appear in the parsed data, causing the header to be malformed. 
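# For reference, the byte order marks handled here are UTF-8 0xEF 0xBB 0xBF and
# UTF-16 0xFF 0xFE / 0xFE 0xFF; `determine_codec` below peeks at the first few bytes
# of the file and, if a mark is found, returns a codec name that strips it on read.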
15 | # See CW-2310 16 | def determine_codec(f): 17 | """Peek at a file and return an appropriate reading codec.""" 18 | with open(f, 'rb') as f_bytes: 19 | # Could use chardet here if we need to expand codec support 20 | initial_bytes = f_bytes.read(8) 21 | 22 | for codec, encoding_name in [ 23 | [codecs.BOM_UTF8, "utf-8-sig"], # use the -sig codec to drop the mark 24 | [codecs.BOM_UTF16_BE, "utf-16"], # don't specify LE or BE to drop mark 25 | [codecs.BOM_UTF16_LE, "utf-16"], 26 | [codecs.BOM_UTF32_BE, "utf-32"], # handle 32 for completeness 27 | [codecs.BOM_UTF32_LE, "utf-32"], # again skip LE or BE to drop mark 28 | ]: 29 | if initial_bytes.startswith(codec): 30 | return encoding_name 31 | return None # will cause file to be opened with default encoding 32 | 33 | 34 | def main(args): 35 | """Run the entry point.""" 36 | logger = get_named_logger("checkSheet") 37 | 38 | barcodes = [] 39 | aliases = [] 40 | sample_types = [] 41 | analysis_groups = [] 42 | allowed_sample_types = [ 43 | "test_sample", "positive_control", "negative_control", "no_template_control" 44 | ] 45 | 46 | if not os.path.exists(args.sample_sheet) or not os.path.isfile(args.sample_sheet): 47 | sys.stdout.write("Could not open sample sheet file.") 48 | sys.exit() 49 | 50 | try: 51 | encoding = determine_codec(args.sample_sheet) 52 | with open(args.sample_sheet, "r", encoding=encoding) as f: 53 | try: 54 | # Excel files don't throw any error until here 55 | csv.Sniffer().sniff(f.readline()) 56 | f.seek(0) # return to initial position again 57 | except Exception as e: 58 | # Excel fails with UniCode error 59 | sys.stdout.write( 60 | "The sample sheet doesn't seem to be a CSV file.\n" 61 | "The sample sheet has to be a CSV file.\n" 62 | "Please verify that the sample sheet is a CSV file.\n" 63 | f"Parsing error: {e}" 64 | ) 65 | 66 | sys.exit() 67 | 68 | csv_reader = csv.DictReader(f) 69 | n_row = 0 70 | for row in csv_reader: 71 | n_row += 1 72 | if n_row == 1: 73 | n_cols = len(row) 74 | else: 75 | # check we got the same number of fields 76 | if len(row) != n_cols: 77 | sys.stdout.write( 78 | f"Unexpected number of cells in row number {n_row}" 79 | ) 80 | sys.exit() 81 | try: 82 | barcodes.append(row["barcode"]) 83 | except KeyError: 84 | sys.stdout.write("'barcode' column missing") 85 | sys.exit() 86 | try: 87 | aliases.append(row["alias"]) 88 | except KeyError: 89 | sys.stdout.write("'alias' column missing") 90 | sys.exit() 91 | try: 92 | sample_types.append(row["type"]) 93 | except KeyError: 94 | pass 95 | try: 96 | analysis_groups.append(row["analysis_group"]) 97 | except KeyError: 98 | pass 99 | except Exception as e: 100 | sys.stdout.write(f"Parsing error: {e}") 101 | sys.exit() 102 | 103 | # check barcodes are correct format 104 | for barcode in barcodes: 105 | if not re.match(r'^barcode\d\d+$', barcode): 106 | sys.stdout.write("values in 'barcode' column are incorrect format") 107 | sys.exit() 108 | 109 | # check barcodes are all the same length 110 | first_length = len(barcodes[0]) 111 | for barcode in barcodes[1:]: 112 | if len(barcode) != first_length: 113 | sys.stdout.write("values in 'barcode' column are different lengths") 114 | sys.exit() 115 | 116 | # check barcode and alias values are unique 117 | if len(barcodes) > len(set(barcodes)): 118 | sys.stdout.write("values in 'barcode' column not unique") 119 | sys.exit() 120 | if len(aliases) > len(set(aliases)): 121 | sys.stdout.write("values in 'alias' column not unique") 122 | sys.exit() 123 | 124 | if sample_types: 125 | # check if "type" column has 
unexpected values 126 | unexp_type_vals = set(sample_types) - set(allowed_sample_types) 127 | 128 | if unexp_type_vals: 129 | sys.stdout.write( 130 | f"found unexpected values in 'type' column: {unexp_type_vals}. " 131 | f"Allowed values are: {allowed_sample_types}" 132 | ) 133 | sys.exit() 134 | 135 | if args.required_sample_types: 136 | for required_type in args.required_sample_types: 137 | if required_type not in allowed_sample_types: 138 | sys.stdout.write(f"Not an allowed sample type: {required_type}") 139 | sys.exit() 140 | if sample_types.count(required_type) < 1: 141 | sys.stdout.write( 142 | f"Sample sheet requires at least 1 of {required_type}") 143 | sys.exit() 144 | if analysis_groups: 145 | # if there was a "analysis_group" column, make sure it had values for all 146 | # samples 147 | if not all(analysis_groups): 148 | sys.stdout.write( 149 | "if an 'analysis_group' column exists, it needs values in each row" 150 | ) 151 | sys.exit() 152 | 153 | logger.info(f"Checked sample sheet {args.sample_sheet}.") 154 | 155 | 156 | def argparser(): 157 | """Argument parser for entrypoint.""" 158 | parser = wf_parser("check_sample_sheet") 159 | parser.add_argument("sample_sheet", help="Sample sheet to check") 160 | parser.add_argument( 161 | "--required_sample_types", 162 | help="List of required sample types. Each sample type provided must " 163 | "appear at least once in the sample sheet", 164 | nargs="*" 165 | ) 166 | return parser 167 | -------------------------------------------------------------------------------- /bin/workflow_glue/check_xam_index.py: -------------------------------------------------------------------------------- 1 | """Validate a single (u)BAM file index.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pysam 7 | 8 | from .util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def validate_xam_index(xam_file): 12 | """Use fetch to validate the index. 13 | 14 | Invalid indexes will fail the call with a ValueError: 15 | ValueError: fetch called on bamfile without index 16 | """ 17 | with pysam.AlignmentFile(xam_file, check_sq=False) as alignments: 18 | try: 19 | alignments.fetch() 20 | has_valid_index = True 21 | except ValueError: 22 | has_valid_index = False 23 | return has_valid_index 24 | 25 | 26 | def main(args): 27 | """Run the entry point.""" 28 | logger = get_named_logger("checkBamIdx") 29 | 30 | # Check if a XAM has a valid index 31 | has_valid_index = validate_xam_index(args.input_xam) 32 | # write `has_valid_index` out so that they can be set as env. 33 | sys.stdout.write( 34 | f"HAS_VALID_INDEX={int(has_valid_index)}" 35 | ) 36 | logger.info(f"Checked (u)BAM index for: '{args.input_xam}'.") 37 | 38 | 39 | def argparser(): 40 | """Argument parser for entrypoint.""" 41 | parser = wf_parser("check_xam_index") 42 | parser.add_argument("input_xam", type=Path, help="Path to target XAM") 43 | return parser 44 | -------------------------------------------------------------------------------- /bin/workflow_glue/configure_igv.py: -------------------------------------------------------------------------------- 1 | """Create an IGV config file.""" 2 | 3 | from itertools import zip_longest 4 | import json 5 | import sys 6 | 7 | from .util import get_named_logger, wf_parser # noqa: ABS101 8 | 9 | 10 | def parse_fnames(fofn): 11 | """Parse list with filenames and return them grouped as ref-, XAM-, or VCF-related. 
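(Illustrative example with hypothetical names: a FOFN listing `ref.fa.gz`, `ref.fa.gz.fai`, `ref.fa.gz.gzi`, `sample.bam`, `sample.bam.bai`, `sample.vcf.gz` and `sample.vcf.gz.tbi`, one per line, yields the reference dict plus one XAM/index pair and one VCF/index pair.)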
12 | 13 | :param fofn: File with list of file names (one per line) 14 | :return: dict of reference-related filenames (with keys 'ref', 'fai', and '.gzi' and 15 | `None` as default values); lists of XAM- and VCF-related filenames 16 | """ 17 | ref_extensions = [".fasta", ".fasta.gz", ".fa", ".fa.gz", ".fna", ".fna.gz"] 18 | ref_dict = {} 19 | xams = [] 20 | xam_indices = [] 21 | vcfs = [] 22 | vcf_indices = [] 23 | with open(fofn, "r") as f: 24 | for line in f: 25 | fname = line.strip() 26 | if any(fname.endswith(ext) for ext in ref_extensions): 27 | ref_dict["ref"] = fname 28 | elif fname.endswith(".fai"): 29 | ref_dict["fai"] = fname 30 | elif fname.endswith(".gzi"): 31 | ref_dict["gzi"] = fname 32 | elif fname.endswith(".bam") or fname.endswith(".cram"): 33 | xams.append(fname) 34 | elif fname.endswith(".bai") or fname.endswith(".crai"): 35 | xam_indices.append(fname) 36 | elif fname.endswith(".vcf") or fname.endswith(".vcf.gz"): 37 | vcfs.append(fname) 38 | elif fname.endswith(".csi") or fname.endswith(".tbi"): 39 | vcf_indices.append(fname) 40 | # do some sanity checks 41 | if "ref" not in ref_dict: 42 | raise ValueError( 43 | "No reference file (i.e. file ending in one of " 44 | f"{ref_extensions} was found)." 45 | ) 46 | ref = ref_dict["ref"] 47 | if (gzi := ref_dict.get("gzi")) is not None: 48 | # since we got a '.gzi' index, make sure that the reference is actually 49 | # compressed 50 | if not ref_dict["ref"].endswith(".gz"): 51 | raise ValueError( 52 | f"Found GZI reference index '{gzi}', but the reference file " 53 | f"'{ref}' appears not to be compressed." 54 | ) 55 | if xam_indices: 56 | if len(xams) != len(xam_indices): 57 | raise ValueError("Got different number of XAM and XAM index files.") 58 | if vcf_indices: 59 | if len(vcfs) != len(vcf_indices): 60 | raise ValueError("Got different number of VCF and VCF index files.") 61 | if xams and vcfs: 62 | if len(xams) != len(vcfs): 63 | raise ValueError("Got different number of XAM and VCF files.") 64 | # if we got XAM or VCF indices, pair them up with their corresponding files (and 65 | # otherwise with `None`) 66 | xams_with_indices = zip_longest(xams, xam_indices) 67 | vcfs_with_indices = zip_longest(vcfs, vcf_indices) 68 | return ref_dict, xams_with_indices, vcfs_with_indices 69 | 70 | 71 | def get_reference_options(ref, fai=None, gzi=None): 72 | """Create dict with IGV reference options. 73 | 74 | :param ref: reference file name 75 | :param fai: name reference `.fai` index file 76 | :param gzi: name of `.gzi` index file for a compressed reference 77 | :return: dict with reference options 78 | """ 79 | # initialise the options dict and add the index attributes later 80 | ref_opts = { 81 | "id": "ref", 82 | "name": "ref", 83 | "wholeGenomeView": False, 84 | "fastaURL": ref, 85 | } 86 | if fai is not None: 87 | ref_opts["indexURL"] = fai 88 | if gzi is not None: 89 | ref_opts["compressedIndexURL"] = gzi 90 | return ref_opts 91 | 92 | 93 | def get_alignment_track(xam, xai=None, extra_opts=None): 94 | """Create dict with options for IGV alignment track. 
95 | 96 | :param xam: name of XAM file to be displayed 97 | :param xai: name of XAM index file 98 | :param extra_opts: dict of extra options for the alignment track 99 | :return: dict with alignment track options 100 | """ 101 | alignment_track_dict = { 102 | "name": xam, 103 | "type": "alignment", 104 | "format": xam.split(".")[-1], 105 | "url": xam, 106 | } 107 | # add the XAM index if present 108 | if xai is not None: 109 | alignment_track_dict["indexURL"] = xai 110 | alignment_track_dict.update(extra_opts or {}) 111 | return alignment_track_dict 112 | 113 | 114 | def get_variant_track(vcf, index=None, extra_opts=None): 115 | """Create dict with options for IGV variant track. 116 | 117 | :param vcf: name of VCF file to be displayed 118 | :param index: name of VCF index file (ending in `.csi` or `.tbi`) 119 | :param extra_opts: dict of extra options for the variant track 120 | :return: dict with variant track options 121 | """ 122 | variant_track_dict = { 123 | "name": vcf, 124 | "type": "variant", 125 | "format": "vcf", 126 | "url": vcf, 127 | } 128 | # add the VCF index if we got an index extension 129 | if index is not None: 130 | variant_track_dict["indexURL"] = index 131 | variant_track_dict.update(extra_opts or {}) 132 | return variant_track_dict 133 | 134 | 135 | def main(args): 136 | """Run the entry point.""" 137 | logger = get_named_logger("configIGV") 138 | 139 | # parse the FOFN 140 | ref_dict, xams_with_indices, vcfs_with_indices = parse_fnames(args.fofn) 141 | 142 | # initialise the IGV options dict with the reference options 143 | json_dict = {"reference": get_reference_options(**ref_dict)} 144 | 145 | # if we got JSON files with extra options for the alignment / variant tracks, read 146 | # them 147 | extra_alignment_opts = {} 148 | if args.extra_alignment_opts is not None: 149 | with open(args.extra_alignment_opts, "r") as f: 150 | extra_alignment_opts = json.load(f) 151 | extra_variant_opts = {} 152 | if args.extra_variant_opts is not None: 153 | with open(args.extra_variant_opts, "r") as f: 154 | extra_variant_opts = json.load(f) 155 | 156 | # now add the alignment and variant tracks 157 | json_dict["tracks"] = [] 158 | # we use `zip_longest` to make sure that variant and alignment tracks from the same 159 | # sample are added after each other 160 | for (vcf, vcf_index), (xam, xam_index) in zip_longest( 161 | vcfs_with_indices, xams_with_indices, fillvalue=(None, None) 162 | ): 163 | if vcf is not None: 164 | # add a variant track for the VCF 165 | json_dict["tracks"].append( 166 | get_variant_track(vcf, vcf_index, extra_variant_opts) 167 | ) 168 | if xam is not None: 169 | # add an alignment track for the XAM 170 | json_dict["tracks"].append( 171 | get_alignment_track(xam, xam_index, extra_alignment_opts) 172 | ) 173 | 174 | if args.locus is not None: 175 | json_dict["locus"] = args.locus 176 | 177 | json.dump(json_dict, sys.stdout, indent=4) 178 | 179 | logger.info("Printed IGV config JSON to STDOUT.") 180 | 181 | 182 | def argparser(): 183 | """Argument parser for entrypoint.""" 184 | parser = wf_parser("configure_igv") 185 | parser.add_argument( 186 | "--fofn", 187 | required=True, 188 | help=( 189 | "File with list of names of reference / XAM / VCF files and indices " 190 | "(one filename per line)" 191 | ), 192 | ) 193 | parser.add_argument( 194 | "--locus", 195 | help="Locus string to set initial genomic coordinates to display in IGV", 196 | ) 197 | parser.add_argument( 198 | "--extra-alignment-opts", 199 | help="JSON file with extra options for alignment 
tracks", 200 | ) 201 | parser.add_argument( 202 | "--extra-variant-opts", 203 | help="JSON file with extra options for variant tracks", 204 | ) 205 | return parser 206 | -------------------------------------------------------------------------------- /bin/workflow_glue/get_max_depth_locus.py: -------------------------------------------------------------------------------- 1 | """Find max depth window in a `mosdepth` regions BED file and write as locus string.""" 2 | 3 | from pathlib import Path 4 | import sys 5 | 6 | import pandas as pd 7 | 8 | from .util import get_named_logger, wf_parser # noqa: ABS101 9 | 10 | 11 | def main(args): 12 | """Run the entry point.""" 13 | logger = get_named_logger("getMaxDepth") 14 | 15 | # read the regions BED file 16 | df = pd.read_csv( 17 | args.depths_bed, sep="\t", header=None, names=["ref", "start", "end", "depth"] 18 | ) 19 | 20 | # get the window with the largest depth 21 | ref, start, end, depth = df.loc[df["depth"].idxmax()] 22 | 23 | # get the length of the reference of that window 24 | ref_length = df.query("ref == @ref")["end"].iloc[-1] 25 | 26 | # show the whole reference in case it's shorter than the desired locus size 27 | if ref_length < args.locus_size: 28 | start = 1 29 | end = ref_length 30 | else: 31 | # otherwise, show a region of the desired size around the window 32 | half_size = args.locus_size // 2 33 | mid = (start + end) // 2 34 | start = mid - half_size 35 | end = mid + half_size 36 | # check if the region starts below `1` or ends beyond the end of the reference 37 | if start < 1: 38 | start = 1 39 | end = args.locus_size 40 | if end > ref_length: 41 | start = ref_length - args.locus_size 42 | end = ref_length 43 | 44 | # write depth and locus string 45 | sys.stdout.write(f"{depth}\t{ref}:{start}-{end}") 46 | 47 | logger.info("Wrote locus with maximum depth to STDOUT.") 48 | 49 | 50 | def argparser(): 51 | """Argument parser for entrypoint.""" 52 | parser = wf_parser("get_max_depth_locus") 53 | parser.add_argument( 54 | "depths_bed", 55 | type=Path, 56 | help="path to mosdepth regions depth file (can be compressed)", 57 | ) 58 | parser.add_argument( 59 | "locus_size", type=int, help="size of the locus in basepairs (e.g. 
'2000')" 60 | ) 61 | return parser 62 | -------------------------------------------------------------------------------- /bin/workflow_glue/report.py: -------------------------------------------------------------------------------- 1 | """Create workflow report.""" 2 | import json 3 | 4 | from ezcharts.components import fastcat 5 | from ezcharts.components.reports import labs 6 | from ezcharts.layout.snippets import Tabs 7 | from ezcharts.layout.snippets.table import DataTable 8 | import pandas as pd 9 | 10 | from .util import get_named_logger, wf_parser # noqa: ABS101 11 | 12 | 13 | def main(args): 14 | """Run the entry point.""" 15 | logger = get_named_logger("Report") 16 | report = labs.LabsReport( 17 | "Workflow Pore C report", "wf-pore-c", 18 | args.params, args.versions, args.wf_version) 19 | 20 | with open(args.metadata) as metadata: 21 | sample_details = [{ 22 | 'sample': d['alias'], 23 | 'type': d['type'], 24 | 'barcode': d['barcode'] 25 | } for d in json.load(metadata)] 26 | 27 | if args.stats: 28 | with report.add_section("Read summary", "Read summary"): 29 | names = tuple(d['sample'] for d in sample_details) 30 | stats = tuple(args.stats) 31 | if len(stats) == 1: 32 | stats = stats[0] 33 | names = names[0] 34 | fastcat.SeqSummary( 35 | stats, sample_names=names, alignment_stats=False) 36 | 37 | with report.add_section("Sample Metadata", "Sample Metadata"): 38 | tabs = Tabs() 39 | for d in sample_details: 40 | with tabs.add_tab(d["sample"]): 41 | df = pd.DataFrame.from_dict(d, orient="index", columns=["Value"]) 42 | df.index.name = "Key" 43 | DataTable.from_pandas(df) 44 | 45 | report.write(args.report) 46 | logger.info(f"Report written to {args.report}.") 47 | 48 | 49 | def argparser(): 50 | """Argument parser for entrypoint.""" 51 | parser = wf_parser("report") 52 | parser.add_argument("report", help="Report output file") 53 | parser.add_argument( 54 | "--stats", nargs='+', 55 | help="Fastcat stats histogram directories, \ 56 | ordered as per entries in --metadata.") 57 | parser.add_argument( 58 | "--metadata", required=True, 59 | help="sample metadata JSON") 60 | parser.add_argument( 61 | "--versions", required=True, 62 | help="directory containing CSVs containing name,version.") 63 | parser.add_argument( 64 | "--params", required=True, 65 | help="A JSON file containing the workflow parameter key/values") 66 | parser.add_argument( 67 | "--wf_version", default='unknown', 68 | help="version of the executed workflow") 69 | return parser 70 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """__init__.py for the tests.""" 2 | -------------------------------------------------------------------------------- /bin/workflow_glue/tests/test_test.py: -------------------------------------------------------------------------------- 1 | """A dummy test.""" 2 | 3 | import argparse 4 | 5 | from workflow_glue import report 6 | 7 | 8 | def test(): 9 | """Just showing that we can import using the workflow-glue.""" 10 | assert isinstance(report.argparser(), argparse.ArgumentParser) 11 | -------------------------------------------------------------------------------- /bin/workflow_glue/util.py: -------------------------------------------------------------------------------- 1 | """The odd helper function.""" 2 | 3 | import argparse 4 | import logging 5 | 6 | _log_name = None 7 | 8 | 9 | def get_main_logger(name): 10 | """Create the top-level 
logger.""" 11 | global _log_name 12 | _log_name = name 13 | logging.basicConfig( 14 | format='[%(asctime)s - %(name)s] %(message)s', 15 | datefmt='%H:%M:%S', level=logging.INFO) 16 | return logging.getLogger(name) 17 | 18 | 19 | def get_named_logger(name): 20 | """Create a logger with a name. 21 | 22 | :param name: name of logger. 23 | """ 24 | name = name.ljust(10)[:10] # so logging is aligned 25 | logger = logging.getLogger('{}.{}'.format(_log_name, name)) 26 | return logger 27 | 28 | 29 | def wf_parser(name): 30 | """Make an argument parser for a workflow command.""" 31 | return argparse.ArgumentParser( 32 | name, 33 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, 34 | add_help=False) 35 | 36 | 37 | def _log_level(): 38 | """Parser to set logging level and acquire software version/commit.""" 39 | parser = argparse.ArgumentParser( 40 | formatter_class=argparse.ArgumentDefaultsHelpFormatter, add_help=False) 41 | 42 | modify_log_level = parser.add_mutually_exclusive_group() 43 | modify_log_level.add_argument( 44 | '--debug', action='store_const', 45 | dest='log_level', const=logging.DEBUG, default=logging.INFO, 46 | help='Verbose logging of debug information.') 47 | modify_log_level.add_argument( 48 | '--quiet', action='store_const', 49 | dest='log_level', const=logging.WARNING, default=logging.INFO, 50 | help='Minimal logging; warnings only.') 51 | 52 | return parser 53 | -------------------------------------------------------------------------------- /data/OPTIONAL_FILE: -------------------------------------------------------------------------------- 1 | # Nothing to see here. A sentinel file to replace real data. 2 | # e.g.: 3 | # 4 | # process run { 5 | # input: 6 | # path some_data 7 | # path extra_data 8 | # script: 9 | # def extra = extra_data.name != 'OPTIONAL_FILE' ? "--extra-data $opt" : '' 10 | # """ 11 | # command ${some_data} ${extra} 12 | # """ 13 | # } 14 | # 15 | # some_data = ... 16 | # extra_data = Channel.fromPath("$projectDir/data/OPTIONAL_FILE")) 17 | # run(some_data, extra_data) 18 | -------------------------------------------------------------------------------- /docs/01_brief_description.md: -------------------------------------------------------------------------------- 1 | Workflow for analysing Pore-c data for chromatin conformation capture. -------------------------------------------------------------------------------- /docs/02_introduction.md: -------------------------------------------------------------------------------- 1 | Pore-C is an end-to-end workflow unique to Oxford Nanopore which combines chromatin conformation capture (3C) with direct, long nanopore sequencing reads. With nanopore reads, long-range, multi-way contact information can be obtained. 2 | 3 | This workflow can be used for the following: 4 | 5 | * Pre-processing a reference genome or draft assembly to generate auxiliary files used in downstream analyses. 6 | * Creating virtual digests of Pore-c reads. 7 | * Filtering the raw reads to remove any that might break downstream tools. 8 | * Align virtually digested reads against a reference genome. 9 | * Processing results to filter spurious alignments, detect ligation junctions and assign fragments. 10 | * Outputting aligned, sorted and annotated BAM files. 11 | * Generating a contact map, which shows the intensity of the physical interaction between two genome regions. 12 | * Create output files for downstream analysis in the following formats. 
13 | - [Pairs format](https://github.com/4dn-dcic/pairix/blob/master/pairs_format_specification.md) 14 | - [Cooler format](https://mirnylab.github.io/cooler/) 15 | - [Hic format](https://github.com/aidenlab/juicer/wiki/) -------------------------------------------------------------------------------- /docs/03_compute_requirements.md: -------------------------------------------------------------------------------- 1 | Recommended requirements: 2 | 3 | + CPUs = 64 4 | + Memory = 128GB 5 | 6 | Minimum requirements: 7 | 8 | + CPUs = 8 9 | + Memory = 32GB 10 | 11 | Approximate run time: 12 hours for 100GB input BAM using the recommended resources, this will vary depending on number of monomers found per read. 12 | 13 | ARM processor support: False 14 | -------------------------------------------------------------------------------- /docs/04_install_and_run.md: -------------------------------------------------------------------------------- 1 | 2 | These are instructions to install and run the workflow on command line. 3 | You can also access the workflow via the 4 | [EPI2ME Desktop application](https://labs.epi2me.io/downloads/). 5 | 6 | The workflow uses [Nextflow](https://www.nextflow.io/) to manage 7 | compute and software resources, 8 | therefore Nextflow will need to be 9 | installed before attempting to run the workflow. 10 | 11 | The workflow can currently be run using either 12 | [Docker](https://www.docker.com/products/docker-desktop) 13 | or [Singularity](https://docs.sylabs.io/guides/3.0/user-guide/index.html) 14 | to provide isolation of the required software. 15 | Both methods are automated out-of-the-box provided 16 | either Docker or Singularity is installed. 17 | This is controlled by the 18 | [`-profile`](https://www.nextflow.io/docs/latest/config.html#config-profiles) 19 | parameter as exemplified below. 20 | 21 | It is not required to clone or download the git repository 22 | in order to run the workflow. 23 | More information on running EPI2ME workflows can 24 | be found on our [website](https://labs.epi2me.io/wfindex). 25 | 26 | The following command can be used to obtain the workflow. 27 | This will pull the repository in to the assets folder of 28 | Nextflow and provide a list of all parameters 29 | available for the workflow as well as an example command: 30 | 31 | ``` 32 | nextflow run epi2me-labs/wf-pore-c --help 33 | ``` 34 | To update a workflow to the latest version on the command line use 35 | the following command: 36 | ``` 37 | nextflow pull epi2me-labs/wf-pore-c 38 | ``` 39 | 40 | A demo dataset is provided for testing of the workflow. 
41 | It can be downloaded and unpacked using the following commands: 42 | ``` 43 | wget https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-pore-c/wf-pore-c-demo.tar.gz 44 | tar -xzvf wf-pore-c-demo.tar.gz 45 | ``` 46 | The workflow can then be run with the downloaded demo data using: 47 | ``` 48 | nextflow run epi2me-labs/wf-pore-c \ 49 | --bam 'wf-pore-c-demo/porec_test.concatemers.bam' \ 50 | --chunk_size 100 \ 51 | --cutter 'NlaIII' \ 52 | --hi_c \ 53 | --mcool \ 54 | --paired_end \ 55 | --paired_end_maximum_distance 200 \ 56 | --paired_end_minimum_distance 100 \ 57 | --phased_vcf 'wf-pore-c-demo/porec_test.phased_variants.vcf.gz' \ 58 | --ref 'wf-pore-c-demo/porec_test.fasta' \ 59 | --vcf 'wf-pore-c-demo/porec_test.phased_variants.vcf.gz' \ 60 | -profile standard 61 | ``` 62 | 63 | For further information about running a workflow on 64 | the command line see https://labs.epi2me.io/wfquickstart/ 65 | -------------------------------------------------------------------------------- /docs/05_related_protocols.md: -------------------------------------------------------------------------------- 1 | This workflow is designed to take input sequences that have been produced from [Oxford Nanopore Technologies](https://nanoporetech.com/) devices. 2 | 3 | Find related protocols in the [Nanopore community](https://community.nanoporetech.com/docs/). -------------------------------------------------------------------------------- /docs/06_input_example.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | This workflow accepts either FASTQ or unaligned BAM files as input. 4 | 5 | The FASTQ and BAM input parameters for this workflow accept one of three cases: (i) the path to a single FASTQ or BAM file; (ii) the path to a top-level directory containing FASTQ or BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ or BAM files. In the first and second cases (i and ii), a sample name can be supplied with `--sample`. In the last case (iii), the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. 6 | 7 | ``` 8 | (i) (ii) (iii) 9 | input_reads.fastq ─── input_directory ─── input_directory 10 | ├── reads0.fastq ├── barcode01 11 | └── reads1.fastq │ ├── reads0.fastq 12 | │ └── reads1.fastq 13 | ├── barcode02 14 | │ ├── reads0.fastq 15 | │ ├── reads1.fastq 16 | │ └── reads2.fastq 17 | └── barcode03 18 | └── reads0.fastq 19 | ``` -------------------------------------------------------------------------------- /docs/06_input_parameters.md: -------------------------------------------------------------------------------- 1 | ### Input Options 2 | 3 | | Nextflow parameter name | Type | Description | Help | Default | 4 | |--------------------------|------|-------------|------|---------| 5 | | bam | string | An unaligned BAM file containing Pore-C concatemer sequences. | This accepts one of three cases: (i) the path to a single BAM file; (ii) the path to a top-level directory containing BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain BAM files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | 6 | | fastq | string | FASTQ files to use in the analysis. 
| This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`. | | 7 | | sample_sheet | string | A CSV file used to map barcodes to sample aliases and optionally provide per-sample parameters. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files. | The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Optionally, a `cutter` column can contain the name of the enzyme used per sample (see the `--cutter` parameter for more details) and a `vcf` column can be used to provide a phased VCF file per sample if you require haplotagged alignments. | | 8 | | sample | string | A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files. | | | 9 | | analyse_unclassified | boolean | Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory. | If selected and if the input is a multiplex directory the workflow will also process the unclassified directory. | False | 10 | | ref | string | A FASTA file containing the reference genome to map against. | | | 11 | | vcf | string | An optional phased VCF file that will be used to haplotag alignments. | | | 12 | | cutter | string | The enzyme used in the restriction digest. | Any enzyme from the Biopython restriction dictionary can be used. See `https://github.com/biopython/biopython/blob/master/Bio/Restriction/Restriction_Dictionary.py`. This can also be defined per sample: see `--sample_sheet` parameter. | NlaIII | 13 | 14 | 15 | ### Output Options 16 | 17 | | Nextflow parameter name | Type | Description | Help | Default | 18 | |--------------------------|------|-------------|------|---------| 19 | | out_dir | string | Directory for output of all user-facing files. | | output | 20 | | hi_c | boolean | Output a Hi-C formatted file; will convert pairs format to a Hi-C (`.hic`) file which will be compatible with [juicer](https://github.com/aidenlab/juicer). | Load this file with [Juice box](https://www.aidenlab.org/juicebox/) for an alternative contact map visualisation. | False | 21 | | bed | boolean | Output a BED file of the paired-end BAM alignments for use with downstream tools. Setting this to true will also trigger creation of the paired-end BAM. | Will use the paired-end BAM to create a BED file compatible with downstream tools including scaffolding tool [Yahs](https://github.com/c-zhou/yahs). | False | 22 | 23 | 24 | ### Advanced Options 25 | 26 | | Nextflow parameter name | Type | Description | Help | Default | 27 | |--------------------------|------|-------------|------|---------| 28 | | chunk_size | integer | Process input in chunks of this number of reads. | To reduce per-process memory requirements for large datasets, process the inputs in chunks of reads. Set to 0 to process entire dataset in one go. | 20000 | 29 | | threads | integer | Set maximum number of threads to use for more intense processes (limited by config executor cpus). We recommend a minimum of 4, but if available 19. 
| | 4 | 30 | 31 | 32 | ### Pore-C Tools Options 33 | 34 | | Nextflow parameter name | Type | Description | Help | Default | 35 | |--------------------------|------|-------------|------|---------| 36 | | minimap2_settings | string | The minimap2 settings for mapping monomers. | | -x map-ont | 37 | | max_monomers | integer | The maximum number of monomers allowed for a read to be included in downstream analysis. | | 250 | 38 | | coverage | boolean | Calculate restriction-fragment coverage using mosdepth. | | False | 39 | | summary_json | boolean | Output pore-c-py annotation summary in json format. | | True | 40 | 41 | 42 | ### Chromunity Options 43 | 44 | | Nextflow parameter name | Type | Description | Help | Default | 45 | |--------------------------|------|-------------|------|---------| 46 | | chromunity | boolean | Create parquet files for Chromunity. | See the chromunity documentation for further details 'https://github.com/mskilab/chromunity'. | False | 47 | | chromunity_merge_distance | integer | Merge colinear alignments separated by less than this base pair distance into a single monomer. | | -1 | 48 | 49 | 50 | ### 4DN files Options 51 | 52 | | Nextflow parameter name | Type | Description | Help | Default | 53 | |--------------------------|------|-------------|------|---------| 54 | | pairs | boolean | Create a 4DN-format pairs file (also calculate stats). | Outputs a directory with a pairs stats report and a pairs file which can be used for downstream anaylsis. | False | 55 | | pairtools_chunksize | integer | Number of pairs to be processed in each chunk in the prepare_hic process which uses the pairtools dedup tool. | Reduce for lower memory footprint. Below 10,000 performance starts suffering significantly. | 100000 | 56 | | mcool | boolean | Create a multi-resolution cooler file. Will output the cooler formatted file which you can load with cooler. | See 'https://open2c.github.io/cooler' for more details. | False | 57 | | cool_bin_size | integer | The bin size of the cooler output file in base pairs. | See 'https://open2c.github.io/cooler' for more details. | 1000 | 58 | | mcool_resolutions | string | The resolutions of the mcool file in pixels (see cooler documentation for details). | Comma-separated list of target resolutions. Use suffixes B or N to specify a progression: B for binary (geometric steps of factor 2), N for nice (geometric steps of factor 10 interleaved with steps of 2 and 5). This is the equivalent of the `--resolutions` flag in cooler; see an example here 'https://cooler.readthedocs.io/en/latest/cli.html'. | 1000,2000,5000N | 59 | 60 | 61 | ### Paired-end BAM Options 62 | 63 | | Nextflow parameter name | Type | Description | Help | Default | 64 | |--------------------------|------|-------------|------|---------| 65 | | paired_end | boolean | Create mock paired-end BAM files. | | False | 66 | | filter_pairs | boolean | Filter paired-end reads using minimum and maximum distance parameters. | | False | 67 | | paired_end_minimum_distance | integer | Remove trans/cis pairs separated by a distance shorter than this. | | -1 | 68 | | paired_end_maximum_distance | integer | Remove trans/cis pairs separated by a distance greater than this. | | -1 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /docs/07_outputs.md: -------------------------------------------------------------------------------- 1 | Output files may be aggregated including information for all samples or provided per sample. 
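The `B`/`N` suffix notation accepted by the `mcool_resolutions` parameter in the 4DN options above can be hard to picture. The sketch below is an illustration only, not cooler's actual implementation: it expands a spec such as `1000,2000,5000N` into an explicit list of bin sizes, and the function name `expand_resolutions` and the `cap` argument are purely illustrative (in practice cooler chooses the upper bound itself from the data).

```
def expand_resolutions(spec, cap=1_000_000):
    """Expand a cooler-style resolutions string such as '1000,2000,5000N'.

    'B' extends the last value with a binary (x2) progression and 'N' with a
    "nice" 1-2-5 progression, as described for `mcool_resolutions`.
    """
    resolutions = []
    # the "nice" series: 1, 2, 5 interleaved with powers of ten
    nice = [m * 10 ** e for e in range(10) for m in (1, 2, 5)]
    for item in spec.split(","):
        suffix = item[-1] if item[-1] in "BN" else ""
        value = int(item.rstrip("BN"))
        resolutions.append(value)
        if suffix == "B":
            step = value * 2
            while step <= cap:
                resolutions.append(step)
                step *= 2
        elif suffix == "N":
            resolutions.extend(r for r in nice if value < r <= cap)
    return sorted(set(resolutions))


print(expand_resolutions("1000,2000,5000N", cap=100_000))
# [1000, 2000, 5000, 10000, 20000, 50000, 100000]
```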
Per-sample files will be prefixed with respective aliases and represented below as {{ alias }}. 2 | 3 | | Title | File path | Description | Per sample or aggregated | 4 | |-------|-----------|-------------|--------------------------| 5 | | workflow report | ./wf-template-report.html | Report for all samples. | aggregated | 6 | | Per file read stats | ./ingress_results/reads/fastcat_stats/per-file-stats.tsv | A TSV with per file read stats, including all samples. | aggregated | 7 | | Per read stats | ./ingress_results/reads/fastcat_stats/per-read-stats.tsv | A TSV with per read stats, including all samples. | aggregated | 8 | | Run ID's | ./ingress_results/reads/fastcat_stats/run_ids | List of run ID's present in reads. | aggregated | 9 | | Meta map json | ./ingress_results/reads/metamap.json | Meta data used in workflow presented in a JSON. | aggregated | 10 | | Concatenated sequence data | ./ingress_results/reads/{{ alias }}.fastq.gz | Per-sample reads concatenated in to one fastq file. | per-sample | 11 | | Coordinate-sorted Bam | ./bams/{{ alias }}.cs.bam | Coordinate-sorted Bam. | per-sample | 12 | | Coordinate-sorted Bam Index | ./bams/{{ alias }}.cs.bam.bai | Coordinate-sorted Bam Index. | per-sample | 13 | | Name-sorted Bam | ./bams/{{ alias }}.ns.bam | Name-sorted Bam. | per-sample | 14 | | Pairs file | ./pairs/{{ alias }}.pairs.gz | This file contains contact information in a human-readable tabular format, and can be used with downstream tools. See [Pairtools documentation](https://pairtools.readthedocs.io/en/latest/formats.html#pairs) for full specification. | per-sample | 15 | | Pairs summary stats file | ./pairs/{{ alias }}.pairs.stats.txt | Summary statistics of the pairs file. See this [overview](https://pairtools.readthedocs.io/en/latest/stats.html) for a full specification. | per-sample | 16 | | Pairs summary report | ./pairs/{{ alias }}.pairs.stats.html | Pairs html report with result including an interactive contact map and statistics. See [pairsqc documentation](https://github.com/4dn-dcic/pairsqc) for further details. | per-sample | 17 | | Multi-resolution cool file | ./cooler/{{ alias }}.mcool | Multi-resolution cool `.mcool` file which can be used with downstream tools to provide a high resolution genomic interaction matrix. See [Cool tools documentation](https://github.com/open2c/cooltools) for details on downstream analysis. | per-sample | 18 | | Paired-end BAM | ./paired_end/{{ alias }}.ns.bam | Mock paired end BAM. | per-sample | 19 | | Chromunity parquet files. | ./chromunity | Chromunity directory with parquet files which can be used with the Chromunity package. Chromunity enables the nomination and statistical evaluation of high order interactions. See [Chromunity documentation](http://mskilab.com/chromunity/tutorial.html) for further details. | per-sample | 20 | | Fragments BED | ./paireds/fragments.bed | File with the DNA fragments created from the virtual digest. | per-sample | 21 | | Hi-C for contact map | ./hi-c/{{ alias }}.hic | File which can be loaded into the [Juice box tool](https://www.aidenlab.org/juicebox/) for an alternative contact map visualisation. | per-sample | 22 | | Filtered out reads | ./filtered_out/{{ alias }}.bam | BAM file containing any reads that were filtered out at the digest step and not included in the analysis. 
| per-sample | 23 | -------------------------------------------------------------------------------- /docs/08_pipeline_overview.md: -------------------------------------------------------------------------------- 1 | ### 1. Concatenate input files and generate per-read stats 2 | 3 | This workflow accepts FASTQ or unaligned BAM as input. The [fastcat or bamstats](https://github.com/epi2me-labs/fastcat) tools are used to concatenate multi-file samples to be processed by the workflow. They also output per-read stats, including average read lengths and qualities. 4 | 5 | ### 2. Index reference 6 | 7 | The input reference genome is indexed with [Minimap2](https://github.com/lh3/minimap2). 8 | 9 | ### 3. Split input file 10 | 11 | The input reads are split into chunks for parallel processing using the `chunk_size` parameter, which defaults to 20,000 reads. 12 | 13 | ### 4. Digest reads 14 | 15 | Chimeric Pore-C reads are digested using the [Pore-c-py](https://github.com/epi2me-labs/pore-c-py) Python package. The enzyme provided via the `cutter` parameter is used by the Pore-c-py package to find the corresponding recognition sequence using the [Biopython](https://biopython.org/) restriction enzymes library. Any read containing more than `max_monomers` monomers (default: 250) is excluded at this stage, as it is assumed to have been created in error. 16 | 17 | ### 5. Align reads 18 | 19 | The monomers are then aligned against the reference genome using Minimap2. 20 | 21 | ### 6. Annotate 22 | 23 | The Pore-c-py package is then used again to filter spurious alignments, detect ligation junctions and assign fragments. The aligned segments are used to generate a "walk", which enumerates the alignment coordinates of the monomers comprising the chimeric read; this walk is used to annotate the alignments. 24 | 25 | ### 7. Output BAMs 26 | 27 | Pore-c-py outputs the tagged alignments as both a name-sorted and a coordinate-sorted BAM. If the `paired_end` parameter is selected, a mock paired-end BAM is also output for use with downstream tools such as [Pairtools](https://github.com/open2c/pairtools). At this stage, if the `chromunity` parameter is set to true, the annotation step also outputs the parquet files required for use with the downstream [Chromunity](https://github.com/mskilab-org/chromunity) tool. 28 | 29 | ### 8. Haplotag alignments 30 | 31 | If a phased VCF is provided using the `vcf` parameter, the output BAM is haplotagged using [WhatsHap](https://github.com/whatshap/whatshap). 32 | 33 | ### 9. Merge BAMs 34 | 35 | The output BAMs from each of the split chunks are merged and sorted per sample using [Samtools](https://www.htslib.org/doc/samtools-merge.html). 36 | 37 | ### 10. Calculate coverage 38 | 39 | If the `coverage` parameter is set to true, [Mosdepth](https://github.com/brentp/mosdepth) is used to calculate coverage across the input reference genome. 40 | 41 | ### 11. Additional output formats for downstream analysis 42 | 43 | The workflow will output several formats that can be used with downstream tools. 44 | 45 | + [Pairtools](https://github.com/open2c/pairtools) is used to create a pairs-format file and an HTML report containing a contact map and other statistics. Use the `pairs` parameter to generate this output (a minimal sketch of reading the pairs file follows below). 46 | 47 | + [Cooler](https://github.com/open2c/cooler) is used to output a multi-resolution contact map in cooler (`.mcool`) format. Use the `mcool` parameter to generate this output.
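The pairs file mentioned above is plain (optionally gzipped) text following the [4DN pairs specification](https://github.com/4dn-dcic/pairix/blob/master/pairs_format_specification.md): header lines start with `#`, and each data row carries at least `readID chrom1 pos1 chrom2 pos2 strand1 strand2`. As an illustration only, a stdlib-only sketch for tallying cis versus trans contacts from a workflow pairs output might look like the following; the filename `sample.pairs.gz` is hypothetical, and for real analyses pairtools itself should be preferred.

```
import gzip
from collections import Counter


def count_cis_trans(pairs_path):
    """Tally cis vs trans contacts from a 4DN .pairs(.gz) file (sketch only)."""
    opener = gzip.open if str(pairs_path).endswith(".gz") else open
    counts = Counter()
    with opener(pairs_path, "rt") as handle:
        for line in handle:
            if line.startswith("#"):
                # header / column-definition lines
                continue
            fields = line.rstrip("\n").split("\t")
            # columns: readID chrom1 pos1 chrom2 pos2 strand1 strand2 ...
            chrom1, chrom2 = fields[1], fields[3]
            counts["cis" if chrom1 == chrom2 else "trans"] += 1
    return counts


print(count_cis_trans("sample.pairs.gz"))
```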
48 | 49 | + [Juicer Tools](https://github.com/aidenlab/juicer) is used to create a `.hic` format file, which can be loaded into the [Juice box tool](https://www.aidenlab.org/juicebox/) for an alternative contact map visualisation. Use the `hi_c` parameter to generate this output. -------------------------------------------------------------------------------- /docs/09_troubleshooting.md: -------------------------------------------------------------------------------- 1 | 2 | + If the workflow fails, please run it with the demo dataset to ensure the workflow itself is working. This will help us determine whether the issue is related to the environment, the input parameters, or a bug. 3 | + See how to interpret some common Nextflow exit codes [here](https://labs.epi2me.io/trouble-shooting/). 4 | + If the workflow breaks with a memory error, try running the workflow again with a reduced `chunk_size` parameter. -------------------------------------------------------------------------------- /docs/10_FAQ.md: -------------------------------------------------------------------------------- 1 | 2 | *Does the workflow have support for a scaffolding tool?* - Currently we do not support any scaffolding tool, but you may like to try [Yahs](https://academic.oup.com/bioinformatics/article/39/1/btac808/6917071). 3 | 4 | If your question is not answered here, please report any issues or suggestions on the [GitHub issues](https://github.com/epi2me-labs/wf-pore-c/issues) page or start a discussion on the [community](https://community.nanoporetech.com/). -------------------------------------------------------------------------------- /docs/11_other.md: -------------------------------------------------------------------------------- 1 | + [Importing third-party workflows into EPI2ME Labs](https://labs.epi2me.io/nexflow-for-epi2melabs/) 2 | 3 | See the [EPI2ME website](https://labs.epi2me.io/) for lots of other resources and blog posts. -------------------------------------------------------------------------------- /lib/ArgumentParser.groovy: -------------------------------------------------------------------------------- 1 | /* Check arguments of a Nextflow function 2 | * 3 | * Nextflow script does not support the Groovy idiom: 4 | * 5 | * def function(Map args[:], arg1, arg2, ...) 6 | * 7 | * to support unordered kwargs. The methods here are designed 8 | * to reduce boilerplate while allowing Nextflow script to implement 9 | * 10 | * def function(Map args[:]) 11 | * 12 | * with required and default values. This is similar to some Python 13 | * libraries' (notably matplotlib) extensive use of things like: 14 | * 15 | * def function(*args, **kwargs) 16 | * 17 | * to implement generic APIs. Why do we want to do all this? Because 18 | * we want to write library code with a clean set of required parameters 19 | * but also extensible with non-required parameters that have default values. 20 | * This allows us to later add parameters without breaking existing code, 21 | * and is very common practice elsewhere.
22 | */ 23 | 24 | import java.util.Set 25 | 26 | class ArgumentParser { 27 | Set args 28 | Map kwargs 29 | String name 30 | 31 | /* Parse arguments, raising an error on unknown keys */ 32 | public Map parse_args(LinkedHashMap given_args) { 33 | Set opt_keys = kwargs.keySet() 34 | Set given_keys = given_args.keySet() 35 | check_required(given_keys) 36 | check_unknown(given_keys, opt_keys) 37 | return kwargs + given_args 38 | } 39 | 40 | /* Parse arguments, without raising an error for extra keys */ 41 | public Map parse_known_args(LinkedHashMap given_args) { 42 | Set opt_keys = kwargs.keySet() 43 | Set given_keys = given_args.keySet() 44 | check_required(given_keys) 45 | return kwargs + given_args 46 | } 47 | 48 | private void check_required(Set given) { 49 | Set missing_keys = args - given 50 | if (!missing_keys.isEmpty()) { 51 | throw new Exception("Missing arguments for function ${name}: ${missing_keys}") 52 | } 53 | } 54 | 55 | private void check_unknown(Set given, Set kwargs_keys) { 56 | Set extra_keys = given - (args + kwargs_keys) 57 | if (!extra_keys.isEmpty()) { 58 | throw new Exception("Unknown arguments provided to function ${name}: ${extra_keys}.") 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /lib/CWUtil.groovy: -------------------------------------------------------------------------------- 1 | /* Miscellaneous utilities for workflows from the ONT Customer Workflows Group. 2 | */ 3 | class CWUtil { 4 | 5 | /* Mutate the global Nextflow params map 6 | * 7 | * Occasionally, we may wish to mutate the value of a parameter provided 8 | * by the user. Typically, this leads to workflows with `params.my_param` 9 | * and `params._my_param` which is ripe for confusion. Instead, we can 10 | * mutate the parameter value in the Nextflow params ScriptMap itself 11 | * with the following call: 12 | * 13 | * CWUtil.mutateParam(params, k, v) 14 | * 15 | * This is possible as Groovy actually has a surprisingly loose 16 | * definition of "private", and allows us to call the private `allowNames` 17 | * method on the ScriptMap which removes the read-only status for a key set. 18 | * We can follow this up with a call to the private `put0` to reinsert 19 | * the key and mark it as read-only again. 20 | */ 21 | public static void mutateParam(nf_params, key, value) { 22 | Set s = [key] // must be a set to allow call to allowNames 23 | nf_params.allowNames(s) 24 | nf_params.put0(key, value) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /lib/NfcoreTemplate.groovy: -------------------------------------------------------------------------------- 1 | // 2 | // This file holds several functions used within the nf-core pipeline template. 3 | // 4 | 5 | // MIT License 6 | // 7 | // Copyright (c) 2018 nf-core 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a copy 10 | // of this software and associated documentation files (the "Software"), to deal 11 | // in the Software without restriction, including without limitation the rights 12 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | // copies of the Software, and to permit persons to whom the Software is 14 | // furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included in all 17 | // copies or substantial portions of the Software. 
18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | // SOFTWARE. 26 | 27 | 28 | import org.yaml.snakeyaml.Yaml 29 | 30 | class NfcoreTemplate { 31 | 32 | // 33 | // Check AWS Batch related parameters have been specified correctly 34 | // 35 | public static void awsBatch(workflow, params) { 36 | if (workflow.profile.contains('awsbatch')) { 37 | // Check params.awsqueue and params.awsregion have been set if running on AWSBatch 38 | assert (params.awsqueue && params.awsregion) : "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" 39 | // Check outdir paths to be S3 buckets if running on AWSBatch 40 | assert params.outdir.startsWith('s3:') : "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" 41 | } 42 | } 43 | 44 | // 45 | // Check params.hostnames 46 | // 47 | public static void hostName(workflow, params, log) { 48 | Map colors = logColours(params.monochrome_logs) 49 | if (params.hostnames) { 50 | try { 51 | def hostname = "hostname".execute().text.trim() 52 | params.hostnames.each { prof, hnames -> 53 | hnames.each { hname -> 54 | if (hostname.contains(hname) && !workflow.profile.contains(prof)) { 55 | log.info "=${colors.yellow}====================================================${colors.reset}=\n" + 56 | "${colors.yellow}WARN: You are running with `-profile $workflow.profile`\n" + 57 | " but your machine hostname is ${colors.white}'$hostname'${colors.reset}.\n" + 58 | " ${colors.yellow_bold}Please use `-profile $prof${colors.reset}`\n" + 59 | "=${colors.yellow}====================================================${colors.reset}=" 60 | } 61 | } 62 | } 63 | } catch (Exception e) { 64 | log.warn "[$workflow.manifest.name] Could not determine 'hostname' - skipping check. Reason: ${e.message}." 65 | } 66 | } 67 | } 68 | 69 | // 70 | // Generate version string 71 | // 72 | public static String version(workflow) { 73 | String version_string = "" 74 | 75 | if (workflow.manifest.version) { 76 | def prefix_v = workflow.manifest.version[0] != 'v' ? 
'v' : '' 77 | version_string += "${prefix_v}${workflow.manifest.version}" 78 | } 79 | 80 | if (workflow.commitId) { 81 | def git_shortsha = workflow.commitId.substring(0, 7) 82 | version_string += "-g${git_shortsha}" 83 | } 84 | 85 | return version_string 86 | } 87 | 88 | // 89 | // Construct and send completion email 90 | // 91 | public static void email(workflow, params, summary_params, projectDir, log, multiqc_report=[], fail_mapped_reads=[:]) { 92 | 93 | // Set up the e-mail variables 94 | def subject = "[$workflow.manifest.name] Successful: $workflow.runName" 95 | if (fail_mapped_reads.size() > 0) { 96 | subject = "[$workflow.manifest.name] Partially successful (${fail_mapped_reads.size()} skipped): $workflow.runName" 97 | } 98 | if (!workflow.success) { 99 | subject = "[$workflow.manifest.name] FAILED: $workflow.runName" 100 | } 101 | 102 | def summary = [:] 103 | for (group in summary_params.keySet()) { 104 | summary << summary_params[group] 105 | } 106 | 107 | def misc_fields = [:] 108 | misc_fields['Date Started'] = workflow.start 109 | misc_fields['Date Completed'] = workflow.complete 110 | misc_fields['Pipeline script file path'] = workflow.scriptFile 111 | misc_fields['Pipeline script hash ID'] = workflow.scriptId 112 | if (workflow.repository) misc_fields['Pipeline repository Git URL'] = workflow.repository 113 | if (workflow.commitId) misc_fields['Pipeline repository Git Commit'] = workflow.commitId 114 | if (workflow.revision) misc_fields['Pipeline Git branch/tag'] = workflow.revision 115 | misc_fields['Nextflow Version'] = workflow.nextflow.version 116 | misc_fields['Nextflow Build'] = workflow.nextflow.build 117 | misc_fields['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp 118 | 119 | def email_fields = [:] 120 | email_fields['version'] = NfcoreTemplate.version(workflow) 121 | email_fields['runName'] = workflow.runName 122 | email_fields['success'] = workflow.success 123 | email_fields['dateComplete'] = workflow.complete 124 | email_fields['duration'] = workflow.duration 125 | email_fields['exitStatus'] = workflow.exitStatus 126 | email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') 127 | email_fields['errorReport'] = (workflow.errorReport ?: 'None') 128 | email_fields['commandLine'] = workflow.commandLine 129 | email_fields['projectDir'] = workflow.projectDir 130 | email_fields['summary'] = summary << misc_fields 131 | email_fields['fail_mapped_reads'] = fail_mapped_reads.keySet() 132 | email_fields['min_mapped_reads'] = params.min_mapped_reads 133 | 134 | // On success try attach the multiqc report 135 | def mqc_report = null 136 | try { 137 | if (workflow.success && !params.skip_multiqc) { 138 | mqc_report = multiqc_report.getVal() 139 | if (mqc_report.getClass() == ArrayList && mqc_report.size() >= 1) { 140 | if (mqc_report.size() > 1) { 141 | log.warn "[$workflow.manifest.name] Found multiple reports from process 'MULTIQC', will use only one" 142 | } 143 | mqc_report = mqc_report[0] 144 | } 145 | } 146 | } catch (all) { 147 | if (multiqc_report) { 148 | log.warn "[$workflow.manifest.name] Could not attach MultiQC report to summary email" 149 | } 150 | } 151 | 152 | // Check if we are only sending emails on failure 153 | def email_address = params.email 154 | if (!params.email && params.email_on_fail && !workflow.success) { 155 | email_address = params.email_on_fail 156 | } 157 | 158 | // Render the TXT template 159 | def engine = new groovy.text.GStringTemplateEngine() 160 | def tf = new File("$projectDir/assets/email_template.txt") 161 | def 
txt_template = engine.createTemplate(tf).make(email_fields) 162 | def email_txt = txt_template.toString() 163 | 164 | // Render the HTML template 165 | def hf = new File("$projectDir/assets/email_template.html") 166 | def html_template = engine.createTemplate(hf).make(email_fields) 167 | def email_html = html_template.toString() 168 | 169 | // Render the sendmail template 170 | def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit 171 | def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ] 172 | def sf = new File("$projectDir/assets/sendmail_template.txt") 173 | def sendmail_template = engine.createTemplate(sf).make(smail_fields) 174 | def sendmail_html = sendmail_template.toString() 175 | 176 | // Send the HTML e-mail 177 | Map colors = logColours(params.monochrome_logs) 178 | if (email_address) { 179 | try { 180 | if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } 181 | // Try to send HTML e-mail using sendmail 182 | [ 'sendmail', '-t' ].execute() << sendmail_html 183 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (sendmail)-" 184 | } catch (all) { 185 | // Catch failures and try with plaintext 186 | def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] 187 | if ( mqc_report.size() <= max_multiqc_email_size.toBytes() ) { 188 | mail_cmd += [ '-A', mqc_report ] 189 | } 190 | mail_cmd.execute() << email_html 191 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (mail)-" 192 | } 193 | } 194 | 195 | // Write summary e-mail HTML to a file 196 | def output_d = new File("${params.outdir}/pipeline_info/") 197 | if (!output_d.exists()) { 198 | output_d.mkdirs() 199 | } 200 | def output_hf = new File(output_d, "pipeline_report.html") 201 | output_hf.withWriter { w -> w << email_html } 202 | def output_tf = new File(output_d, "pipeline_report.txt") 203 | output_tf.withWriter { w -> w << email_txt } 204 | } 205 | 206 | // 207 | // Print pipeline summary on completion 208 | // 209 | public static void summary(workflow, params, log, fail_mapped_reads=[:], pass_mapped_reads=[:]) { 210 | Map colors = logColours(params.monochrome_logs) 211 | 212 | if (pass_mapped_reads.size() > 0) { 213 | def idx = 0 214 | def samp_aln = '' 215 | def total_aln_count = pass_mapped_reads.size() + fail_mapped_reads.size() 216 | for (samp in pass_mapped_reads) { 217 | samp_aln += " ${samp.value}: ${samp.key}\n" 218 | idx += 1 219 | if (idx > 5) { 220 | samp_aln += " ..see pipeline reports for full list\n" 221 | break; 222 | } 223 | } 224 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} ${pass_mapped_reads.size()}/$total_aln_count samples passed Bowtie2 ${params.min_mapped_reads} mapped read threshold:\n${samp_aln}${colors.reset}-" 225 | } 226 | if (fail_mapped_reads.size() > 0) { 227 | def samp_aln = '' 228 | for (samp in fail_mapped_reads) { 229 | samp_aln += " ${samp.value}: ${samp.key}\n" 230 | } 231 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} ${fail_mapped_reads.size()} samples skipped since they failed Bowtie2 ${params.min_mapped_reads} mapped read threshold:\n${samp_aln}${colors.reset}-" 232 | } 233 | 234 | if (workflow.success) { 235 | if (workflow.stats.ignoredCount == 0) { 236 | log.info 
"-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" 237 | } else { 238 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" 239 | } 240 | } else { 241 | hostName(workflow, params, log) 242 | log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" 243 | } 244 | } 245 | 246 | // 247 | // ANSII Colours used for terminal logging 248 | // 249 | public static Map logColours(Boolean monochrome_logs) { 250 | Map colorcodes = [:] 251 | 252 | // Reset / Meta 253 | colorcodes['reset'] = monochrome_logs ? '' : "\033[0m" 254 | colorcodes['bold'] = monochrome_logs ? '' : "\033[1m" 255 | colorcodes['dim'] = monochrome_logs ? '' : "\033[2m" 256 | colorcodes['underlined'] = monochrome_logs ? '' : "\033[4m" 257 | colorcodes['blink'] = monochrome_logs ? '' : "\033[5m" 258 | colorcodes['reverse'] = monochrome_logs ? '' : "\033[7m" 259 | colorcodes['hidden'] = monochrome_logs ? '' : "\033[8m" 260 | 261 | // Regular Colors 262 | colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" 263 | colorcodes['red'] = monochrome_logs ? '' : "\033[0;31m" 264 | colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" 265 | colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" 266 | colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" 267 | colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" 268 | colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" 269 | colorcodes['white'] = monochrome_logs ? '' : "\033[0;37m" 270 | 271 | // Bold 272 | colorcodes['bblack'] = monochrome_logs ? '' : "\033[1;30m" 273 | colorcodes['bred'] = monochrome_logs ? '' : "\033[1;31m" 274 | colorcodes['bgreen'] = monochrome_logs ? '' : "\033[1;32m" 275 | colorcodes['byellow'] = monochrome_logs ? '' : "\033[1;33m" 276 | colorcodes['bblue'] = monochrome_logs ? '' : "\033[1;34m" 277 | colorcodes['bpurple'] = monochrome_logs ? '' : "\033[1;35m" 278 | colorcodes['bcyan'] = monochrome_logs ? '' : "\033[1;36m" 279 | colorcodes['bwhite'] = monochrome_logs ? '' : "\033[1;37m" 280 | 281 | // Underline 282 | colorcodes['ublack'] = monochrome_logs ? '' : "\033[4;30m" 283 | colorcodes['ured'] = monochrome_logs ? '' : "\033[4;31m" 284 | colorcodes['ugreen'] = monochrome_logs ? '' : "\033[4;32m" 285 | colorcodes['uyellow'] = monochrome_logs ? '' : "\033[4;33m" 286 | colorcodes['ublue'] = monochrome_logs ? '' : "\033[4;34m" 287 | colorcodes['upurple'] = monochrome_logs ? '' : "\033[4;35m" 288 | colorcodes['ucyan'] = monochrome_logs ? '' : "\033[4;36m" 289 | colorcodes['uwhite'] = monochrome_logs ? '' : "\033[4;37m" 290 | 291 | // High Intensity 292 | colorcodes['iblack'] = monochrome_logs ? '' : "\033[0;90m" 293 | colorcodes['ired'] = monochrome_logs ? '' : "\033[0;91m" 294 | colorcodes['igreen'] = monochrome_logs ? '' : "\033[0;92m" 295 | colorcodes['iyellow'] = monochrome_logs ? '' : "\033[0;93m" 296 | colorcodes['iblue'] = monochrome_logs ? '' : "\033[0;94m" 297 | colorcodes['ipurple'] = monochrome_logs ? '' : "\033[0;95m" 298 | colorcodes['icyan'] = monochrome_logs ? '' : "\033[0;96m" 299 | colorcodes['iwhite'] = monochrome_logs ? '' : "\033[0;97m" 300 | 301 | // Bold High Intensity 302 | colorcodes['biblack'] = monochrome_logs ? '' : "\033[1;90m" 303 | colorcodes['bired'] = monochrome_logs ? '' : "\033[1;91m" 304 | colorcodes['bigreen'] = monochrome_logs ? '' : "\033[1;92m" 305 | colorcodes['biyellow'] = monochrome_logs ? 
'' : "\033[1;93m" 306 | colorcodes['biblue'] = monochrome_logs ? '' : "\033[1;94m" 307 | colorcodes['bipurple'] = monochrome_logs ? '' : "\033[1;95m" 308 | colorcodes['bicyan'] = monochrome_logs ? '' : "\033[1;96m" 309 | colorcodes['biwhite'] = monochrome_logs ? '' : "\033[1;97m" 310 | 311 | return colorcodes 312 | } 313 | 314 | // 315 | // Does what is says on the tin 316 | // 317 | public static String dashedLine(monochrome_logs) { 318 | Map colors = logColours(monochrome_logs) 319 | return "${colors.dim}--------------------------------------------------------------------------------${colors.reset}" 320 | } 321 | 322 | // epi2me-labs logo 323 | public static String logo(workflow, monochrome_logs) { 324 | Map colors = NfcoreTemplate.logColours(monochrome_logs) 325 | String workflow_name = workflow.manifest.name.split("/")[1] 326 | String workflow_version = version(workflow) 327 | String.format( 328 | """ 329 | ${colors.igreen}|||||||||| ${colors.reset}${colors.dim}_____ ____ ___ ____ __ __ _____ _ _ 330 | ${colors.igreen}|||||||||| ${colors.reset}${colors.dim}| ____| _ \\_ _|___ \\| \\/ | ____| | | __ _| |__ ___ 331 | ${colors.yellow}||||| ${colors.reset}${colors.dim}| _| | |_) | | __) | |\\/| | _| _____| |/ _` | '_ \\/ __| 332 | ${colors.yellow}||||| ${colors.reset}${colors.dim}| |___| __/| | / __/| | | | |__|_____| | (_| | |_) \\__ \\ 333 | ${colors.iblue}|||||||||| ${colors.reset}${colors.dim}|_____|_| |___|_____|_| |_|_____| |_|\\__,_|_.__/|___/ 334 | ${colors.iblue}|||||||||| ${colors.reset}${colors.bold}${workflow_name} ${workflow_version}${colors.reset} 335 | ${NfcoreTemplate.dashedLine(monochrome_logs)} 336 | """.stripIndent() 337 | ) 338 | } 339 | } 340 | -------------------------------------------------------------------------------- /lib/Pinguscript.groovy: -------------------------------------------------------------------------------- 1 | import static groovy.json.JsonOutput.toJson 2 | import groovy.json.JsonBuilder 3 | import groovy.json.JsonSlurper 4 | 5 | 6 | class Pinguscript { 7 | 8 | // Send a ping for the start of a workflow 9 | public static void ping_start(nextflow, workflow, params) { 10 | wf_ping(nextflow, workflow, "start", null, params) 11 | } 12 | // Send a ping for a completed workflow (successful or otherwise) 13 | public static void ping_complete(nextflow, workflow, params) { 14 | wf_ping(nextflow, workflow, "end", null, params) 15 | } 16 | // Send a ping for a workflow error 17 | public static void ping_error(nextflow, workflow, params) { 18 | def error_message = workflow.errorMessage 19 | wf_ping(nextflow, workflow, "error", error_message, params) 20 | } 21 | // Shared handler to construct a ping JSON and send it 22 | private static String wf_ping(nextflow, workflow, event, error_message, params) { 23 | if (params.disable_ping) { 24 | return "{}" 25 | } 26 | def body_json = make_wf_ping(nextflow, workflow, event, error_message, params) 27 | send_ping_post("epilaby", body_json) 28 | } 29 | 30 | // Helper to removing keys from a map 31 | private static clean_meta(meta, keys_to_remove) { 32 | for (key in keys_to_remove) { 33 | if (meta.containsKey(key)) { 34 | meta.remove(key) 35 | } 36 | } 37 | } 38 | 39 | // Helper for fetching a key from the params map 40 | // seems pointless but you just know someone is going to end up writing meta.this ? meta.that 41 | private static get_meta(meta, key) { 42 | (meta.containsKey(key) && meta[key]) ? 
meta[key].toString() : null 43 | } 44 | 45 | // Construct workflow ping JSON 46 | private static String make_wf_ping(nextflow, workflow, event, error_message, params) { 47 | // cheeky deepcopy using json 48 | String paramsJSON = new JsonBuilder(params).toPrettyString() 49 | def params_data = new JsonSlurper().parseText(paramsJSON) 50 | 51 | // hostname 52 | def host = null 53 | try { 54 | host = InetAddress.getLocalHost().getHostName() 55 | } 56 | catch(Exception e) {} 57 | 58 | // OS 59 | // TODO check version on WSL 60 | def opsys = System.properties['os.name'].toLowerCase() 61 | def opver = System.properties['os.version'] 62 | if (opver.toLowerCase().contains("wsl")){ 63 | opsys = "wsl" 64 | } 65 | 66 | // placeholder for any future okta business 67 | // for now we'll use the guest_ sent to wf.epi2me_user 68 | def user = get_meta(params.wf, "epi2me_user") 69 | 70 | // drop cruft to save some precious bytes 71 | // affects the deep copy rather than original params 72 | clean_meta(params_data, [ 73 | "schema_ignore_params", 74 | ]) 75 | def ingress_ids = [] 76 | if (params_data.containsKey("wf")) { 77 | ingress_ids = params_data.wf["ingress.run_ids"] ?: [] 78 | clean_meta(params_data.wf, [ 79 | "agent", // we send this later 80 | "epi2me_instance", // we send this later 81 | "epi2me_user", // we send this later 82 | "example_cmd", 83 | "ingress.run_ids", // we will send this elsewhere 84 | ]) 85 | } 86 | 87 | // try and get runtime information 88 | def cpus = null 89 | try { 90 | cpus = Runtime.getRuntime().availableProcessors() 91 | } 92 | catch(Exception e) {} 93 | 94 | def workflow_success = null 95 | def workflow_exitcode = null 96 | if (event != "start") { 97 | workflow_success = workflow.success 98 | workflow_exitcode = workflow.exitStatus 99 | } 100 | 101 | /// build message 102 | def body_json = new JsonBuilder() 103 | body_json \ 104 | "tracking_id": [ 105 | "msg_id": UUID.randomUUID().toString(), 106 | "version": "3.0.0" 107 | ], 108 | "source": "workflow", 109 | "event": event, 110 | "params": params_data, 111 | // data will be null on start events, as ingress has not run 112 | "data": event != "start" ? 
[run_ids: ingress_ids] : null, 113 | "workflow": [ 114 | "name": workflow.manifest.name, 115 | "version": workflow.manifest.version, // could use NfcoreTemplate.version(workflow) 116 | "run_name": workflow.runName, // required to disambiguate sessions 117 | "session": workflow.sessionId, 118 | "profile": workflow.profile, 119 | "resume": workflow.resume, 120 | "error": error_message, // null if no error 121 | "success": workflow_success, 122 | "exitcode": workflow_exitcode, 123 | ], 124 | "env": [ 125 | "user": user, // placeholder for any future okta 126 | "hostname": host, 127 | "os": [ 128 | "name": opsys, 129 | "version": opver 130 | ], 131 | "resource": [ 132 | "cpus": cpus, 133 | "memory": null, // placeholder, no point asking via Runtime as it will just give us the Xmx size 134 | ], 135 | "agent": get_meta(params.wf, "agent"), // access via original params 136 | "epi2me": [ 137 | "instance": get_meta(params.wf, "epi2me_instance"), 138 | "user": user, 139 | ], 140 | "nextflow": [ 141 | "version": nextflow.version.toString(), 142 | "version_compat": nextflow.version.matches(workflow.manifest.nextflowVersion) 143 | ] 144 | ] 145 | return body_json 146 | } 147 | 148 | // Send a JSON payload to a given endpoint 149 | private static String send_ping_post(endpoint, body_json) { 150 | // Attempt to send payload and absorb any possible Exception gracefully 151 | String postResult 152 | boolean raise_exception = false 153 | try { 154 | ((HttpURLConnection)new URL("https://ping.oxfordnanoportal.com/${endpoint}").openConnection()).with({ 155 | requestMethod = 'POST' 156 | doOutput = true 157 | setConnectTimeout(5000) 158 | setReadTimeout(10000) 159 | setRequestProperty('Content-Type', 'application/json') 160 | setRequestProperty('accept', 'application/json') 161 | outputStream.withPrintWriter({printWriter -> 162 | printWriter.write(body_json.toString()) 163 | }) 164 | 165 | // Rethrow exceptions that imply we're not using this endpoint properly 166 | if(responseCode >= 400 && agent.toString() == "cw-ci") { 167 | raise_exception = true 168 | } 169 | // Accessing inputStream.text will raise an Exception for failed requests 170 | postResult = inputStream.text 171 | }) 172 | } 173 | catch(Exception e) { 174 | if(raise_exception) { throw e } 175 | } 176 | return (postResult) 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /lib/WorkflowMain.groovy: -------------------------------------------------------------------------------- 1 | // This file is based on the nf-core/tools pipeline-template. 2 | // Changes to this file must be propagated via wf-template. 
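//
// Usage sketch (illustrative, not part of the template itself): the entrypoint script
// calls this class once, before any workflow is invoked, to print help/version text
// and validate parameters, e.g. as wired up in main.nf:
//
//     WorkflowMain.initialise(workflow, params, log)
//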
3 | 4 | class WorkflowMain { 5 | 6 | // Citation string for pipeline 7 | public static String citation(workflow) { 8 | return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + 9 | "* The nf-core framework\n" + 10 | " https://doi.org/10.1038/s41587-020-0439-x\n\n" 11 | } 12 | 13 | // Generate help string 14 | public static String help(workflow, params, log) { 15 | String line_sep = ' \\ \n\t' 16 | String command_example = params.wf.example_cmd.join(line_sep) 17 | String command = 'nextflow run ' + workflow.manifest.name + line_sep + command_example 18 | String help_string = '' 19 | help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) 20 | help_string += NfcoreSchema.paramsHelp(workflow, params, command) 21 | help_string += '\n' + citation(workflow) + '\n' 22 | return help_string 23 | } 24 | 25 | // Generate parameter summary log string 26 | public static String paramsSummaryLog(workflow, params, log) { 27 | String workflow_version = NfcoreTemplate.version(workflow) 28 | String summary_log = '' 29 | summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) 30 | summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) 31 | summary_log += '\n' + citation(workflow) + '\n' 32 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 33 | summary_log += "\nThis is ${workflow.manifest.name} ${workflow_version}.\n" 34 | summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) 35 | return summary_log 36 | } 37 | 38 | // Validate parameters and print summary to screen 39 | public static void initialise(workflow, params, log) { 40 | // Print help to screen if required 41 | if (params.help) { 42 | log.info help(workflow, params, log) 43 | System.exit(0) 44 | } 45 | 46 | // Print workflow version and exit on --version 47 | if (params.version) { 48 | String workflow_version = NfcoreTemplate.version(workflow) 49 | log.info "${workflow.manifest.name} ${workflow_version}" 50 | System.exit(0) 51 | } 52 | 53 | // Explode on conda 54 | // conda.enabled seems to be backward compatible but wrap this 55 | // in a generic catch just in case 56 | try { 57 | if (workflow.session.config.conda.enabled) { 58 | log.error "Sorry, this workflow is not compatible with Conda, please use -profile standard (Docker) or -profile singularity." 
59 | System.exit(1) 60 | } 61 | } catch(Exception e) {} 62 | 63 | // Validate workflow parameters via the JSON schema 64 | if (params.validate_params) { 65 | NfcoreSchema.validateParameters(workflow, params, log) 66 | } 67 | 68 | // Print parameter summary log to screen 69 | log.info paramsSummaryLog(workflow, params, log) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /lib/common.nf: -------------------------------------------------------------------------------- 1 | import groovy.json.JsonBuilder 2 | 3 | process getParams { 4 | label "wf_common" 5 | cpus 1 6 | memory "2 GB" 7 | output: 8 | path "params.json" 9 | script: 10 | def paramsJSON = new JsonBuilder(params).toPrettyString() 11 | """ 12 | # Output nextflow params object to JSON 13 | echo '$paramsJSON' > params.json 14 | """ 15 | } 16 | 17 | process configure_igv { 18 | publishDir "${params.out_dir}/", mode: 'copy', pattern: 'igv.json', enabled: params.containsKey("igv") && params.igv 19 | label "wf_common" 20 | cpus 1 21 | memory "2 GB" 22 | input: 23 | // the python script will work out what to do with all the files based on their 24 | // extensions 25 | path "file-names.txt" 26 | val locus_str 27 | val aln_extra_opts 28 | val var_extra_opts 29 | output: path "igv.json" 30 | script: 31 | // the locus argument just makes sure that the initial view in IGV shows something 32 | // interesting 33 | String locus_arg = locus_str ? "--locus $locus_str" : "" 34 | // extra options for alignment tracks 35 | def aln_opts_json_str = \ 36 | aln_extra_opts ? new JsonBuilder(aln_extra_opts).toPrettyString() : "" 37 | String aln_extra_opts_arg = \ 38 | aln_extra_opts ? "--extra-alignment-opts extra-aln-opts.json" : "" 39 | // extra options for variant tracks 40 | def var_opts_json_str = \ 41 | var_extra_opts ? new JsonBuilder(var_extra_opts).toPrettyString() : "" 42 | String var_extra_opts_arg = \ 43 | var_extra_opts ? 
"--extra-vcf-opts extra-var-opts.json" : "" 44 | """ 45 | # write out JSON files with extra options for the alignment and variant tracks 46 | echo '$aln_opts_json_str' > extra-aln-opts.json 47 | echo '$var_opts_json_str' > extra-var-opts.json 48 | 49 | workflow-glue configure_igv \ 50 | --fofn file-names.txt \ 51 | $locus_arg \ 52 | $aln_extra_opts_arg \ 53 | $var_extra_opts_arg \ 54 | > igv.json 55 | """ 56 | } 57 | 58 | -------------------------------------------------------------------------------- /lib/nfcore_external_java_deps.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/lib/nfcore_external_java_deps.jar -------------------------------------------------------------------------------- /main.nf: -------------------------------------------------------------------------------- 1 | #!usr/bin/env nextflow 2 | import groovy.json.JsonBuilder 3 | nextflow.enable.dsl = 2 4 | include { 5 | fastq_ingress 6 | xam_ingress 7 | } from "./lib/ingress" 8 | include { 9 | index_ref_fai 10 | decompress_ref 11 | publish_artifact 12 | merge_namesorted_bams 13 | merge_namesorted_bams as merge_paired_end_bams 14 | merge_coordsorted_bams 15 | mosdepth_coverage 16 | get_filtered_out_bam 17 | index_vcf 18 | } from './modules/local/common' 19 | 20 | include { 21 | digest_align_annotate 22 | haplotagReads as haplotag_alignments 23 | merge_parquets_to_dataset 24 | } from './modules/local/pore-c' 25 | include { 26 | to_pairs_file 27 | pairsToCooler 28 | merge_mcools 29 | merge_pairs 30 | merge_pairs_stats 31 | create_restriction_bed 32 | pair_stats_report 33 | prepare_hic 34 | createBed 35 | mergeBed 36 | } from './modules/local/4dn' 37 | 38 | 39 | include { prepare_genome } from "./subworkflows/local/prepare_genome" 40 | 41 | OPTIONAL_FILE = file("$projectDir/data/OPTIONAL_FILE") 42 | 43 | // bamindex will work with bam or fastq format file as input 44 | process index_bam { 45 | label "wfporec" 46 | cpus 4 47 | memory "8 GB" 48 | input: 49 | tuple val(meta), path("concatemers.bam") 50 | val chunk_size 51 | output: 52 | tuple val(meta), path("concatemers.bam"), path("concatemers.bam.bci"), path("indexed_chunks.csv") 53 | shell: 54 | args = task.ext.args ?: " " 55 | """ 56 | bamindex build -c ${params.chunk_size} -t ${task.cpus} concatemers.bam 57 | bamindex dump concatemers.bam.bci > chunks.csv 58 | awk -F' ' -v OFS=' ' 'NR == 1 {print "ID", \$0; next} {print (NR-2), \$0}' chunks.csv > indexed_chunks.csv 59 | """ 60 | } 61 | 62 | 63 | process getVersions { 64 | label "wfporec" 65 | cpus 4 66 | memory "4 GB" 67 | output: 68 | path "versions.txt" 69 | script: 70 | """ 71 | fastcat --version | sed 's/^/fastcat,/' >> versions.txt 72 | mosdepth --version | sed 's/ /,/' >> versions.txt 73 | pairtools --version | sed 's/\\//g' >> versions.txt 74 | whatshap --version | sed 's/^/whatshap,/' >> versions.txt 75 | pore-c-py --version | sed 's/ /,/' >> versions.txt 76 | samtools --version | (head -n 1 && exit 0) | sed 's/ /,/' >> versions.txt 77 | """ 78 | } 79 | 80 | 81 | process getParams { 82 | label "wfporec" 83 | cpus 1 84 | memory "4 GB" 85 | output: 86 | path "params.json" 87 | script: 88 | String paramsJSON = new JsonBuilder(params).toPrettyString() 89 | """ 90 | # Output nextflow params object to JSON 91 | echo '$paramsJSON' > params.json 92 | """ 93 | } 94 | 95 | 96 | process makeReport { 97 | label "wf_common" 98 | cpus 4 99 | memory "15 GB" 100 | input: 101 | val metadata 102 | 
path(stats, stageAs: "stats_*") 103 | path "versions/*" 104 | path "params.json" 105 | val wf_version 106 | output: 107 | path "wf-pore-c-report.html" 108 | script: 109 | String report_name = "wf-pore-c-report.html" 110 | String metadata = new JsonBuilder(metadata).toPrettyString() 111 | """ 112 | echo '${metadata}' > metadata.json 113 | workflow-glue report $report_name \ 114 | --metadata metadata.json \ 115 | --stats $stats \ 116 | --versions versions \ 117 | --params params.json \ 118 | --wf_version $wf_version 119 | """ 120 | } 121 | 122 | // Creates a new directory named after the sample alias and moves the ingress results 123 | // into it. So output folder will contain alias named folders with stats. 124 | process collectIngressResultsInDir { 125 | label "wf_common" 126 | input: 127 | // inputs might be `OPTIONAL_FILE` --> stage in different sub-directories 128 | // to avoid name collisions 129 | tuple val(meta), 130 | path(stats, stageAs: "stats/*") 131 | output: 132 | // use sub-dir to avoid name clashes (in the unlikely event of a sample alias 133 | // being `reads` or `stats`) 134 | tuple path("out/*"), val("ingress_results") 135 | script: 136 | String outdir = "out/${meta["alias"]}" 137 | String metaJson = new JsonBuilder(meta).toPrettyString() 138 | String stats = stats.fileName.name == OPTIONAL_FILE.name ? "" : stats 139 | """ 140 | mkdir -p $outdir 141 | echo '$metaJson' > metamap.json 142 | mv metamap.json $stats $outdir 143 | """ 144 | } 145 | 146 | 147 | // See https://github.com/nextflow-io/nextflow/issues/1636. This is the only way to 148 | // publish files from a workflow whilst decoupling the publish from the process steps. 149 | // The process takes a tuple containing the filename and the name of a sub-directory to 150 | // put the file into. If the latter is `null`, puts it into the top-level directory. 151 | process publish { 152 | // publish inputs to output directory 153 | label "wfporec" 154 | cpus 1 155 | memory "4 GB" 156 | publishDir ( 157 | params.out_dir, 158 | mode: "copy", 159 | saveAs: { dirname ? "$dirname/$fname" : fname } 160 | ) 161 | input: 162 | tuple path(fname), val(dirname) 163 | output: 164 | path fname 165 | """ 166 | """ 167 | } 168 | 169 | // entrypointworkflow 170 | WorkflowMain.initialise(workflow, params, log) 171 | 172 | workflow POREC { 173 | main: 174 | Pinguscript.ping_start(nextflow, workflow, params) 175 | /// PREPARE INPUTS /// 176 | 177 | if (params.fastq) { 178 | sample_data = fastq_ingress([ 179 | "input":params.fastq, 180 | "sample":params.sample, 181 | "sample_sheet":params.sample_sheet, 182 | "analyse_unclassified":params.analyse_unclassified, 183 | "stats": true, 184 | "fastcat_extra_args": "", 185 | ]) 186 | // fastq_ingress doesn't have the index; add one extra null for compatibility. 187 | // We do not use variable name as assigning variable name with a tuple 188 | // not matching (e.g. meta, bam, bai, stats <- [meta, bam, stats]) causes 189 | // the workflow to crash. 190 | sample_data = sample_data 191 | .map{ 192 | it.size() == 4 ? 
it : [it[0], it[1], null, it[2]] 193 | } 194 | } else { 195 | // if we didn't get a `--fastq`, there must have been a `--bam` (as is codified 196 | // by the schema) 197 | sample_data = xam_ingress([ 198 | "input":params.bam, 199 | "sample":params.sample, 200 | "sample_sheet":params.sample_sheet, 201 | "analyse_unclassified":params.analyse_unclassified, 202 | "keep_unaligned": true, 203 | "stats": true, 204 | ]) 205 | } 206 | 207 | // create channel of input chimeric reads 208 | input_reads = sample_data.map{meta, path, index, stats -> [meta, path]} 209 | 210 | if (params.chunk_size > 0) { 211 | chunks = index_bam(input_reads, channel.value(params.chunk_size)) 212 | // create tuple for each region 213 | reads = chunks 214 | .map{meta, bam, bai, chunk_csv -> 215 | tuple(meta, bam, bai,chunk_csv.splitCsv(header: ['index','region', 'ref'], skip: 1 , sep:' '))} 216 | .transpose() 217 | .map{ meta, bam, bai, chunk_index -> 218 | [meta, bam, bai, chunk_index.index, chunk_index.ref]} 219 | } else { 220 | // Add optional file and nulls to satisfy channel structure. 221 | // These values are ignored in digest_align_annotate 222 | reads = input_reads.combine(Channel.of(tuple(OPTIONAL_FILE, null, null))) 223 | } 224 | if (!params.sample_sheet) { 225 | if (params.vcf){ 226 | // If vcf index does not exist create index 227 | vcf_channel = Channel.of(file(params.vcf, checkExists:true)) 228 | def candidate_tbi = file("${params.vcf}.tbi") 229 | vcf_file_tmp = input_reads.combine(vcf_channel).map{ meta, path, vcf -> [meta, vcf]} 230 | if (candidate_tbi.exists()){ 231 | tbi_file = Channel.of(candidate_tbi) 232 | vcf_file = vcf_channel 233 | } else { 234 | vcf = index_vcf(vcf_file_tmp) 235 | vcf_file = vcf.map{meta, vcf, tbi -> vcf}.flatten() 236 | tbi_file = vcf.map{meta, vcf, tbi -> tbi}.flatten() 237 | } 238 | } else { 239 | vcf_file = Channel.of(OPTIONAL_FILE) 240 | tbi_file = Channel.of(OPTIONAL_FILE) 241 | } 242 | ch_chunks = reads 243 | | combine(vcf_file) 244 | | combine(tbi_file) 245 | | map{meta, bam, index, chunk_index, chunk_ref, vcf, tbi -> 246 | if (!params.vcf){ 247 | vcf = null 248 | tbi = null 249 | } 250 | [meta + [cutter: params.cutter, vcf:vcf, tbi:tbi], 251 | bam, index, chunk_index, chunk_ref]} 252 | } else { 253 | // check if vcf exists if not set to null and haplotag will be skipped 254 | // Branch to get samples with vcf 255 | sample_data 256 | | map{ 257 | meta, path, index, stats -> 258 | def vcf_file = meta["vcf"] ? file(meta["vcf"], checkExists: true) : null 259 | def tbi_file = file(meta["vcf"] + '.tbi') 260 | def tbi = vcf_file && tbi_file.exists() ? 
tbi_file : null 261 | [meta, vcf_file, tbi] 262 | } 263 | | branch{ 264 | indexed_vcf: it[1] != null && it[2] != null 265 | unindexed_vcf: it[1] != null && it[2] == null 266 | no_vcf: true 267 | } | set{vcf_fork} 268 | // Index vcfs with no existing index 269 | vcf = index_vcf(vcf_fork.unindexed_vcf.map{meta, vcf, index -> [meta, vcf]}) 270 | // Combine back with any samples that have index 271 | vcf_index = vcf_fork.indexed_vcf.mix(vcf) 272 | // Combine back with samples that have no vcf 273 | per_sample = vcf_fork.no_vcf.mix(vcf_index) 274 | | map{meta, vcf, tbi -> [meta.alias, vcf, tbi]} 275 | // combine with output of ingress 276 | combined_samples = reads 277 | .map { [it[0]["alias"], *it] } 278 | .combine(per_sample, by: 0) 279 | .map { it[1..-1] } 280 | // add tuple values to meta data 281 | pre_chunks = combined_samples.map{meta, bam, index, chunk_index, chunk_ref, vcf_file, tbi_file -> 282 | [meta + [vcf:vcf_file, tbi:tbi_file], bam, index, chunk_index, chunk_ref]} 283 | // use params.cutter if it was missing from user provided sample_sheet 284 | ch_chunks = pre_chunks.map{ meta, bam, index, chunk_index, chunk_ref -> 285 | if (meta.cutter && params.cutter){ 286 | log.warn("Using cutter: ${meta.cutter} from sample sheet column for ${meta.alias}") 287 | } 288 | cutter = meta.cutter ?: params.cutter 289 | return [ meta + ["cutter": cutter], bam, index, chunk_index, chunk_ref] 290 | } 291 | } 292 | ref = prepare_genome(params.ref, params.minimap2_settings) 293 | 294 | /// RUN PORE-C TOOLS /// 295 | chunks_refs = ch_chunks.combine(ref.mmi).combine(ref.minimap2_settings) 296 | 297 | ch_annotated_monomers = digest_align_annotate(chunks_refs) 298 | 299 | // create a fork for samples that have phase info available 300 | ch_annotated_monomers.cs_bam 301 | .branch{ 302 | to_haplotag: it[0].vcf != null 303 | no_haplotag: it[0].vcf == null 304 | } 305 | .set { haplotag_fork } 306 | // haplotag bams when we have VCF available 307 | (haplotag_fork 308 | .to_haplotag // [meta, bam bai] 309 | .combine(ref.fasta) 310 | .combine(ref.fai) 311 | .map(i -> { 312 | [ 313 | i[0], // meta 314 | i[1], // bam 315 | i[2], // bai 316 | i[3], // fasta 317 | i[4], // fai 318 | i[0].vcf, // vcf 319 | i[0].tbi, // tbi 320 | ] 321 | })) | haplotag_alignments | set {haplotagged_monomers} 322 | 323 | // merge haplotagged and non-haplotagged coord-sorted bam chunks 324 | // back to single channel 325 | haplotag_fork 326 | .no_haplotag 327 | .mix(haplotagged_monomers.cs_bam) 328 | .set { cs_bam_chunks } 329 | 330 | /// MERGE PORE-C BAMS /// 331 | 332 | // merge coord-sorted bams by alias 333 | cs_bam = merge_coordsorted_bams( 334 | cs_bam_chunks.map(i -> [i[0], i[1]]) 335 | .groupTuple() 336 | ) 337 | // merge namesorted bams by alias 338 | ns_bam = merge_namesorted_bams( 339 | ch_annotated_monomers 340 | .ns_bam 341 | .map(i -> [i[0], i[1]]) 342 | .groupTuple() 343 | ) 344 | 345 | if (params.coverage || params.pairs || params.mcool || params.hi_c) { 346 | // for each cutter a bed file of the fragments 347 | digest_ch = create_restriction_bed( 348 | ch_chunks.map{meta, bam, index, chunk_index, chunk_ref -> meta.cutter} 349 | .unique() 350 | .combine(ref.fasta) 351 | .combine(ref.fai) 352 | ) 353 | } 354 | 355 | /// COVERAGE CALCULATIONS 356 | if (params.coverage) { 357 | // calculate coverage on the merged BAM 358 | digest_ch 359 | .cross( 360 | cs_bam 361 | .map(i -> [i[0].cutter, i[0], i[1], i[2]]) // [key, meta, bam, bai] 362 | ) 363 | .map(i -> [ 364 | i[1][1], // meta 365 | i[1][2], // bam 366 | i[1][3], // bai 
367 | i[0][2], // bed 368 | ]) | mosdepth_coverage | set{ coverage } 369 | } 370 | /// 4DN file formats 371 | if (params.pairs || params.mcool || params.hi_c) { 372 | (digest_ch 373 | .cross( 374 | ch_annotated_monomers 375 | .ns_bam 376 | .map(i -> [i[0].cutter, i[0], i[1]]) // [key, meta, bam] 377 | ) 378 | .map(i -> [ 379 | i[1][1], // meta 380 | i[1][2], // bam 381 | i[0][1], // fai 382 | i[0][2], // bed 383 | ]) 384 | ) | to_pairs_file | set {pair_chunks} 385 | 386 | if (params.mcool) { 387 | mcool_chunks = pairsToCooler( 388 | pair_chunks 389 | .pairs 390 | .combine(Channel.of(params.cool_bin_size)) 391 | ) 392 | mcool = merge_mcools( 393 | mcool_chunks 394 | .groupTuple() 395 | .combine(Channel.of(params.mcool_resolutions)) 396 | ) 397 | } 398 | if (params.pairs || params.hi_c) { 399 | unsorted_pairs = merge_pairs( 400 | pair_chunks.pairs.map(i -> [i[0], i[2]]).groupTuple() 401 | ) 402 | pairs_stats = merge_pairs_stats( 403 | pair_chunks.stats.groupTuple() 404 | ) 405 | pairs_report = pair_stats_report( 406 | pairs_stats 407 | ) 408 | 409 | } 410 | } 411 | /// CHROMUNITY 412 | if (params.chromunity) { 413 | chromunity_pq = merge_parquets_to_dataset( 414 | ch_annotated_monomers 415 | .chromunity_pq 416 | .groupTuple() 417 | ) 418 | } 419 | 420 | /// Paired end bams 421 | if (params.paired_end) { 422 | pe_bam = merge_paired_end_bams( 423 | ch_annotated_monomers 424 | .paired_end_bam 425 | .map(i -> [i[0], i[1]]) 426 | .groupTuple() 427 | ) 428 | } 429 | 430 | // Make a report 431 | software_versions = getVersions() 432 | workflow_params = getParams() 433 | 434 | // get metadata and stats files, keeping them ordered (could do with transpose I suppose) 435 | sample_data.multiMap{ meta, path, index, stats -> 436 | meta: meta 437 | stats: stats 438 | }.set { for_report } 439 | metadata = for_report.meta.collect() 440 | // create a file list of the stats, and signal if its empty or not 441 | stats = for_report.stats | collect 442 | report = makeReport( 443 | metadata, stats, software_versions, workflow_params, workflow.manifest.version 444 | ) 445 | 446 | if (params.hi_c){ 447 | hi_c = prepare_hic(merge_pairs.out.merged_pairs.combine(ref.fai)) 448 | } 449 | 450 | if (params.bed){ 451 | bed_chunks = createBed(ch_annotated_monomers.paired_end_bam) 452 | mergeBed(bed_chunks.groupTuple()) 453 | 454 | } 455 | 456 | 457 | sample_data 458 | | map { 459 | meta, path, index, stats -> 460 | if (stats) [ meta, stats ] 461 | } 462 | | collectIngressResultsInDir 463 | 464 | 465 | // Group together lists of filtered reads from all the processed chunks 466 | named_filtered_read_ids = ch_annotated_monomers.filtered_read_ids.groupTuple().map{ meta, read_ids -> tuple(meta.alias, read_ids)} 467 | named_reads = input_reads.map{ meta, reads -> tuple(meta.alias, reads)} 468 | // Combine with input reads 469 | filtered_reads = named_filtered_read_ids.join(named_reads, remainder:false) 470 | // Retrieve filtered out BAM from list of filtered reads per sample 471 | filtered_out = get_filtered_out_bam(filtered_reads) 472 | 473 | 474 | emit: 475 | name_sorted_bam = ns_bam 476 | coord_sorted_bam = cs_bam 477 | report = report 478 | ingress_results = collectIngressResultsInDir.out 479 | } 480 | 481 | workflow { 482 | if (params.containsKey("params_sheet")) { 483 | error = "`--params_sheet` parameter is deprecated. Use parameter `--sample_sheet` instead." 
484 | } 485 | POREC() 486 | publish(POREC.out.ingress_results) 487 | } 488 | 489 | workflow.onComplete { 490 | Pinguscript.ping_complete(nextflow, workflow, params) 491 | } 492 | workflow.onError { 493 | Pinguscript.ping_error(nextflow, workflow, params) 494 | } 495 | -------------------------------------------------------------------------------- /modules/local/4dn.nf: -------------------------------------------------------------------------------- 1 | #!usr/bin/env nextflow 2 | nextflow.enable.dsl = 2 3 | 4 | 5 | process to_pairs_file { 6 | label 'wfporec' 7 | cpus 2 8 | memory "8 GB" 9 | input: 10 | tuple val(meta), path("monomers.mm2.ns.bam"), path("fasta.fai"), path("fragments.bed") 11 | output: 12 | tuple val(meta), path("fasta.fai"), path("${meta.alias}.pairs.gz"), emit: "pairs" 13 | tuple val(meta), path("${meta.alias}.stats.txt"), emit: "stats" 14 | shell: 15 | def args = task.ext.args ?: "--drop-sam --drop-seq --expand --add-pair-index --add-columns mapq,pos5,pos3,cigar,read_len,matched_bp,algn_ref_span,algn_read_span,dist_to_5,dist_to_3,mismatches" 16 | """ 17 | pairtools parse2 \ 18 | --output-stats "${meta.alias}.stats.txt" \ 19 | -c "fasta.fai" --single-end --readid-transform 'readID.split(":")[0]' \ 20 | $args "monomers.mm2.ns.bam" > extract_pairs.tmp 21 | pairtools restrict -f "fragments.bed" -o "${meta.alias}.pairs.gz" extract_pairs.tmp 22 | rm -rf extract_pairs.tmp 23 | """ 24 | } 25 | 26 | 27 | process prepare_hic { 28 | label 'wfporec' 29 | cpus 2 30 | memory "31 GB" 31 | input: 32 | tuple val(meta), path("input.pairs.gz"), path("fasta.fai") 33 | output: 34 | path "${meta.alias}.hic", emit: hic 35 | """ 36 | cut -f1,2 fasta.fai > sizes.genome 37 | pairtools flip input.pairs.gz -c sizes.genome > flipped.pairs.tmp 38 | pairtools sort flipped.pairs.tmp > sorted.pairs.tmp 39 | pairtools dedup --chunksize ${params.pairtools_chunksize} sorted.pairs.tmp > dedup.pairs.tmp 40 | java -jar /home/epi2melabs/juicer_tools_1.22.01.jar pre dedup.pairs.tmp "${meta.alias}.hic" sizes.genome 41 | rm -rf "*.pairs.tmp" 42 | """ 43 | } 44 | 45 | process merge_pairs { 46 | label 'wfporec' 47 | cpus 2 48 | memory "8 GB" 49 | input: 50 | tuple val(meta), path('to_merge/{?}.gz') 51 | output: 52 | tuple val(meta), path("${prefix}.pairs.gz"), emit: merged_pairs 53 | shell: 54 | prefix = task.ext.prefix ?: "${meta.alias}" 55 | def args = task.ext.args ?: "--concatenate" 56 | """ 57 | # pass a quoted glob, pairtools will do its own globbing 58 | pairtools merge -o "${prefix}.pairs.gz" $args 'to_merge/*' 59 | """ 60 | } 61 | 62 | process merge_pairs_stats { 63 | label 'wfporec' 64 | cpus 2 65 | memory "4 GB" 66 | input: 67 | tuple val(meta), path('to_merge/src*.stats.txt') 68 | output: 69 | tuple val(meta), path("${prefix}.pairs.stats.txt") 70 | shell: 71 | prefix = task.ext.prefix ?: "${meta.alias}" 72 | def args = task.ext.args ?: "--merge " 73 | """ 74 | pairtools stats -o "${prefix}.pairs.stats.txt" $args to_merge/src*.stats.txt 75 | """ 76 | } 77 | 78 | process pair_stats_report { 79 | label 'wfporec' 80 | cpus 2 81 | memory "4 GB" 82 | input: 83 | tuple val(meta), path("pairs.stats.txt") 84 | output: 85 | tuple val(meta), path("${prefix}.pairs.stats.html") 86 | shell: 87 | prefix = task.ext.prefix ?: "${meta.alias}" 88 | """ 89 | create_pairs_report.py "pairs.stats.txt" "${prefix}.pairs.stats.html" 90 | """ 91 | } 92 | 93 | process create_restriction_bed { 94 | label 'wfporec' 95 | cpus 2 96 | memory "4 GB" 97 | input: 98 | tuple val(enzyme), path("reference.fasta"), path("reference.fasta.fai") 
99 | output: 100 | tuple val(enzyme), path("reference.fasta.fai"), path("fragments.bed") 101 | shell: 102 | def args = task.ext.args ?: " " 103 | """ 104 | cooler digest -o "fragments.bed" $args "reference.fasta.fai" "reference.fasta" $enzyme 105 | """ 106 | } 107 | 108 | process pairsToCooler { 109 | label 'wfporec' 110 | cpus 2 111 | memory "4 GB" 112 | input: 113 | tuple val(meta), path(fai), path(pairs), val(min_bin_width) 114 | output: 115 | tuple val(meta), path("${pairs.baseName}.cool") 116 | shell: 117 | """ 118 | cooler cload pairs -c1 2 -p1 3 -c2 4 -p2 5 $fai:${min_bin_width} $pairs ${pairs.baseName}.cool 119 | """ 120 | } 121 | 122 | process merge_mcools { 123 | label 'wfporec' 124 | cpus 2 125 | memory "15 GB" 126 | input: 127 | tuple val(meta), path('to_merge/src*.cool'), val(resolutions) 128 | output: 129 | tuple val(meta), path("${prefix}.mcool") 130 | shell: 131 | prefix = task.ext.prefix ?: "${meta.alias}" 132 | def args = task.ext.args ?: " " 133 | """ 134 | cooler merge ${prefix}.cool $args to_merge/src*.cool 135 | cooler zoomify -r ${resolutions} -o ${prefix}.mcool ${prefix}.cool 136 | """ 137 | } 138 | 139 | 140 | process createBed { 141 | label 'wfporec' 142 | cpus 2 143 | memory "4 GB" 144 | input: 145 | tuple val(meta), path("monomers.mm2.ns.bam") 146 | output: 147 | tuple val(meta), path("${meta.alias}.${task.index}.bed") 148 | // Use Sed to remove coordinates from monomer names 149 | // as only required for pairtools. 150 | """ 151 | bedtools bamtobed -i monomers.mm2.ns.bam > tmp.out.bed 152 | sed -E 's/:[0-9]+//g' tmp.out.bed > "${meta.alias}.${task.index}.bed" 153 | rm -rf tmp* 154 | """ 155 | } 156 | 157 | 158 | process mergeBed { 159 | label 'wfporec' 160 | cpus params.threads 161 | memory "16 GB" 162 | input: 163 | tuple val(meta), path('to_merge/src*.bed') 164 | output: 165 | tuple val(meta), path("${meta.alias}.bed") 166 | // Merge and sort by the monomer ID so contacts are grouped 167 | // and remove any duplicates. 168 | """ 169 | cat to_merge/* > tmp.bed 170 | sort --parallel=${task.cpus} -S 15G -k4,4 tmp.bed | uniq > "${meta.alias}.bed" 171 | rm -rf tmp* 172 | """ 173 | } 174 | -------------------------------------------------------------------------------- /modules/local/common.nf: -------------------------------------------------------------------------------- 1 | process index_ref_fai { 2 | label 'wfporec' 3 | memory "15 GB" 4 | cpus 1 5 | input: 6 | path "reference.fasta" 7 | output: 8 | path "reference.fasta.fai", emit: reference_index 9 | """ 10 | samtools faidx "reference.fasta" 11 | """ 12 | } 13 | 14 | process index_ref_mmi { 15 | label 'wfporec' 16 | memory "15 GB" 17 | cpus 4 18 | input: 19 | path "reference.fasta" 20 | val(minimap_settings) 21 | output: 22 | path "reference.fasta.mmi" 23 | """ 24 | minimap2 ${minimap_settings} -d "reference.fasta.mmi" "reference.fasta" 25 | """ 26 | } 27 | 28 | // NOTE -f required to compress symlink 29 | process decompress_ref { 30 | label 'wfporec' 31 | memory "4 GB" 32 | cpus 1 33 | input: 34 | path compressed_ref 35 | output: 36 | path "${compressed_ref.baseName}", emit: decompressed_ref 37 | """ 38 | gzip -df "${compressed_ref}" 39 | """ 40 | } 41 | 42 | // See https://github.com/nextflow-io/nextflow/issues/1636 43 | // This is the only way to publish files from a workflow whilst 44 | // decoupling the publish from the process steps. 
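//
// A minimal sketch of that decoupled-publish pattern (channel and process names here
// are illustrative only): the calling workflow gathers whichever files it wants to
// expose and pipes them through this process, rather than attaching publishDir to
// every analysis step:
//
//     workflow {
//         results_ch = some_analysis(input_ch)   // hypothetical upstream process
//         results_ch | publish_artifact
//     }
//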
45 | process publish_artifact { 46 | cpus 1 47 | memory "4 GB" 48 | label 'wfporec' 49 | publishDir "${params.out_dir}", mode: 'copy', pattern: "*" 50 | input: 51 | path fname 52 | output: 53 | path fname 54 | """ 55 | echo "Writing output files" 56 | """ 57 | } 58 | 59 | // TODO rewrite as single merge process 60 | process merge_namesorted_bams { 61 | label 'wfporec' 62 | cpus 2 63 | memory "4 GB" 64 | input: 65 | tuple val(meta), path('to_merge/src*.bam') 66 | output: 67 | tuple val(meta), path("${prefix}.${suffix}.bam") 68 | shell: 69 | suffix = task.ext.suffix ?: "ns" 70 | prefix = task.ext.prefix ?: "${meta.alias}" 71 | """ 72 | samtools cat --threads $task.cpus -o "${prefix}.${suffix}.bam" --no-PG to_merge/src*.bam 73 | """ 74 | } 75 | 76 | process merge_coordsorted_bams { 77 | label 'wfporec' 78 | memory "8 GB" 79 | cpus params.threads 80 | input: 81 | tuple val(meta), path('to_merge/src*.bam') 82 | output: 83 | tuple val(meta), path("${prefix}.bam"), path("${prefix}.bam.csi") 84 | shell: 85 | prefix = task.ext.prefix ?: "${meta.alias}.cs" 86 | """ 87 | samtools merge --threads $task.cpus -o "${prefix}.bam" -p --write-index --no-PG to_merge/src*.bam 88 | """ 89 | } 90 | 91 | process mosdepth_coverage { 92 | label 'wfporec' 93 | cpus params.threads 94 | memory "4 GB" 95 | input: 96 | tuple val(meta), 97 | path("concatemers.cs.bam"), 98 | path("concatemers.cs.bam.csi"), 99 | path("fragments.bed") 100 | output: 101 | tuple val(meta), 102 | path("${prefix}.per-base.d4"), 103 | emit: d4 104 | tuple val(meta), 105 | path("${prefix}.regions.bed.gz"), 106 | path("${prefix}.regions.bed.gz.csi"), 107 | emit: regions 108 | tuple val(meta), 109 | path("${prefix}.thresholds.bed.gz"), 110 | path("${prefix}.thresholds.bed.gz.csi"), 111 | emit: thresholds 112 | tuple val(meta), 113 | path("${prefix}.mosdepth.*"), 114 | emit: summaries 115 | shell: 116 | prefix = task.ext.prefix ?: "${meta.alias}" 117 | args = task.ext.args ?: "--thresholds 1,10,30,60,100" 118 | """ 119 | mosdepth --threads $task.cpus --d4 --by "fragments.bed" $args $prefix "concatemers.cs.bam" 120 | """ 121 | } 122 | 123 | 124 | process get_filtered_out_bam{ 125 | label "wfporec" 126 | cpus 1 127 | memory "15 GB" 128 | input: 129 | tuple val(alias), path ("filtered_files/?.txt"), path("concatemers.bam") 130 | output: 131 | path ("${alias}.filtered_out.bam") 132 | // Output the list of reads that were filtered out of the analysis in a BAM. 133 | """ 134 | find -L filtered_files -name '*.txt' -exec cat {} + > filtered.txt 135 | samtools view -N filtered.txt "concatemers.bam" > "${alias}".filtered_out.bam 136 | """ 137 | } 138 | 139 | 140 | process index_vcf { 141 | label 'wfporec' 142 | memory "4 GB" 143 | cpus 3 144 | input: 145 | tuple val(meta), path(vcf) 146 | output: 147 | tuple val(meta), path("porec.vcf.gz"), path("porec.vcf.gz.tbi") 148 | """ 149 | gzip -f -c -d "${vcf}" > "porec.vcf" 150 | bgzip --threads ${task.cpus} "porec.vcf" 151 | tabix "porec.vcf.gz" 152 | """ 153 | } 154 | -------------------------------------------------------------------------------- /modules/local/pore-c.nf: -------------------------------------------------------------------------------- 1 | process digest_align_annotate { 2 | label 'pore_c_py' 3 | errorStrategy = 'retry' 4 | memory { 15.GB * task.attempt } 5 | maxRetries 1 6 | errorStrategy { task.exitStatus in 137..140 ? 
'retry' : 'terminate' } 7 | cpus params.threads 8 | input: 9 | tuple val(meta), 10 | path("concatemers.bam"), 11 | path("concatemers.bam.bci"), 12 | val(chunk_index), val(chunk_ref), path("reference.fasta.mmi"), 13 | val(minimap2_settings) 14 | output: 15 | tuple val(meta), 16 | path("${meta.alias}_out.ns.bam"), 17 | emit: ns_bam 18 | tuple val(meta), 19 | path("${meta.alias}.cs.bam"), 20 | path("${meta.alias}.cs.bam.csi"), 21 | emit: cs_bam 22 | tuple val(meta), 23 | path("${meta.alias}.chromunity.parquet"), 24 | emit: chromunity_pq, optional: true 25 | tuple val(meta), 26 | path("${meta.alias}.pe.bam"), 27 | emit: paired_end_bam, optional: true 28 | tuple val(meta), 29 | path("filtered_reads.txt"), 30 | emit: filtered_read_ids, optional: true 31 | script: 32 | args = task.ext.args ?: " " 33 | if (params.chromunity) { 34 | args += "--chromunity " 35 | if (params.chromunity_merge_distance != null) { 36 | args += "--chromunity_merge_distance ${params.chromunity_merge_distance} " 37 | } 38 | } 39 | if (params.paired_end | params.bed) { 40 | args += "--paired_end " 41 | if (params.filter_pairs) { 42 | args += "--filter_pairs " 43 | if (params.paired_end_minimum_distance != null) { 44 | args += "--paired_end_minimum_distance ${params.paired_end_minimum_distance} " 45 | } 46 | if (params.paired_end_maximum_distance != null) { 47 | args += "--paired_end_maximum_distance ${params.paired_end_maximum_distance} " 48 | } 49 | } 50 | } 51 | if (params.summary_json) { 52 | args += "--summary " 53 | } 54 | def chunk = task.index - 1 55 | // 2 threads are recommended for each the pore-c-py processes 56 | def digest_annotate_threads = params.threads >= 8 ? 2 : 1 57 | // if possible use 3 for samtools (--threads 2 + 1) 58 | def samtools_threads = params.threads >= 8 ? 
2 : 1 59 | // calculate the left over threads for mapping and leave one as samtools will require 3 60 | def ubam_map_threads = params.threads - (digest_annotate_threads * 2) - samtools_threads - 1 61 | if (params.chunk_size > 0){ 62 | """ 63 | echo "${chunk_ref}" 64 | bamindex fetch --chunk=${chunk_index} "concatemers.bam" | 65 | pore-c-py digest "${meta.cutter}" --max_monomers ${params.max_monomers} --excluded_list "filtered_reads.txt" \ 66 | --header "concatemers.bam" \ 67 | --threads ${digest_annotate_threads} | 68 | samtools fastq --threads 1 -T '*' | 69 | minimap2 -ay -t ${ubam_map_threads} ${minimap2_settings} --cap-kalloc 100m --cap-sw-mem 50m \ 70 | "reference.fasta.mmi" - | 71 | pore-c-py annotate - "${meta.alias}" --monomers \ 72 | --threads ${digest_annotate_threads} --stdout ${args} | \ 73 | tee "${meta.alias}_out.ns.bam" | 74 | samtools sort -m 1G --threads ${samtools_threads} -u --write-index -o "${meta.alias}.cs.bam" - 75 | """ 76 | }else{ 77 | """ 78 | pore-c-py digest "concatemers.bam" "${meta.cutter}" --max_monomers ${params.max_monomers} --excluded_list "filtered_reads.txt" \ 79 | --header "concatemers.bam" \ 80 | --threads ${digest_annotate_threads} | 81 | samtools fastq --threads 1 -T '*' | 82 | minimap2 -ay -t ${ubam_map_threads} ${minimap2_settings} --cap-kalloc 100m --cap-sw-mem 50m \ 83 | "reference.fasta.mmi" - | 84 | pore-c-py annotate - "${meta.alias}" --monomers \ 85 | --threads ${digest_annotate_threads} --stdout ${args} | \ 86 | tee "${meta.alias}_out.ns.bam" | 87 | samtools sort -m 1G --threads ${samtools_threads} -u --write-index -o "${meta.alias}.cs.bam" - 88 | """ 89 | } 90 | 91 | } 92 | 93 | process haplotagReads { 94 | label 'wfporec' 95 | cpus 2 96 | memory "15 GB" 97 | input: 98 | tuple val(meta), 99 | path("concatemers.cs.bam"), 100 | path("concatemers.cs.bam.csi"), 101 | path("reference.fasta"), 102 | path("reference.fasta.fai"), 103 | path(phased_vcf), 104 | path(phased_vcf_tbi) 105 | output: 106 | tuple val(meta), 107 | path("${meta.alias}.ht.bam"), 108 | path("${meta.alias}.ht.bam.csi"), 109 | emit: "cs_bam" 110 | tuple val(meta), 111 | path("${meta.alias}.ht.txt.gz"), 112 | emit: "haplotagged_monomers" 113 | shell: 114 | args = task.ext.args ?: "--ignore-read-groups --skip-missing-contigs " 115 | """ 116 | whatshap haplotag --reference "reference.fasta" -o "${meta.alias}.ht.bam" \ 117 | --output-haplotag-list "${meta.alias}.ht.txt.gz" $args "$phased_vcf" "concatemers.cs.bam" 118 | samtools index -c "${meta.alias}.ht.bam" 119 | """ 120 | } 121 | 122 | /// gather individual parquets into a single directory 123 | process merge_parquets_to_dataset { 124 | label 'wfporec' 125 | cpus 2 126 | memory "4 GB" 127 | input: 128 | tuple val(meta), 129 | path("to_merge/part?????.parquet") 130 | output: 131 | tuple val(meta), 132 | path("$prefix"), 133 | emit: "parquets" 134 | shell: 135 | prefix = task.ext.prefix ?: "${meta.alias}.chromunity.parquet" 136 | """ 137 | mkdir $prefix 138 | cp to_merge/part*.parquet $prefix/ 139 | """ 140 | } 141 | -------------------------------------------------------------------------------- /nextflow.config: -------------------------------------------------------------------------------- 1 | // 2 | // Notes to End Users. 3 | // 4 | // The workflow should run without editing this configuration file, 5 | // however there may be instances in which you wish to edit this 6 | // file for compute performance or other reasons. 
Please see: 7 | // 8 | // https://nextflow.io/docs/latest/config.html#configuration 9 | // 10 | // for further help editing this file. 11 | 12 | params { 13 | help = false 14 | version = false 15 | bam = null 16 | fastq = null 17 | ref = null 18 | cutter = 'NlaIII' 19 | out_dir = 'output' 20 | chunk_size = 20000 21 | sample = null 22 | vcf = null 23 | pairs = false 24 | mcool = false 25 | mcool_resolutions = '1000,2000,5000N' // 4DN tuple default 26 | coverage = false 27 | 28 | minimap2_settings = '-x map-ont' 29 | threads = 4 30 | 31 | aws_image_prefix = null 32 | aws_queue = null 33 | disable_ping = false 34 | 35 | analyse_unclassified = false 36 | monochrome_logs = false 37 | validate_params = true 38 | show_hidden_params = false 39 | schema_ignore_params = 'show_hidden_params,validate_params,monochrome_logs,aws_queue,aws_image_prefix,wf' 40 | chromunity = false 41 | chromunity_merge_distance = -1 42 | cool_bin_size = 1000 43 | paired_end = false 44 | summary_json = true 45 | filter_pairs = false 46 | paired_end_minimum_distance = -1 47 | paired_end_maximum_distance = -1 48 | sample_sheet = null 49 | hi_c = false 50 | bed = false 51 | pairtools_chunksize = 100000 52 | max_monomers = 250 53 | 54 | wf { 55 | name = 'wf-pore-c' 56 | example_cmd = [ 57 | "--bam 'wf-pore-c-demo/porec_test.concatemers.bam'", 58 | "--chunk_size 100", 59 | "--cutter 'NlaIII'", 60 | "--hi_c", 61 | "--mcool", 62 | "--paired_end", 63 | "--paired_end_maximum_distance 200", 64 | "--paired_end_minimum_distance 100", 65 | "--phased_vcf 'wf-pore-c-demo/porec_test.phased_variants.vcf.gz'", 66 | "--ref 'wf-pore-c-demo/porec_test.fasta'", 67 | "--vcf 'wf-pore-c-demo/porec_test.phased_variants.vcf.gz'", 68 | ] 69 | common_sha = "shad28e55140f75a68f59bbecc74e880aeab16ab158" 70 | container_sha = 'sha3787c234c0cacf66a67fb77da223cc2e1cb0baf0' 71 | pore_c_py_sha = 'sha50378db56ddafe19f5e1d313ddb52dc70bbcc2bd' 72 | agent = null 73 | } 74 | } 75 | 76 | manifest { 77 | name = 'epi2me-labs/wf-pore-c' 78 | author = 'Oxford Nanopore Technologies' 79 | homePage = 'https://github.com/epi2me-labs/wf-pore-c' 80 | description = 'workflow for analysing pore-c data.' 81 | mainScript = 'main.nf' 82 | nextflowVersion = '>=23.04.2' 83 | version = 'v1.3.0' 84 | } 85 | 86 | epi2melabs { 87 | tags = "pore-c,contact,map,chromatin,conformation,capture" 88 | } 89 | 90 | // used by default for "standard" (docker) and singularity profiles, 91 | // other profiles may override. 
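//
// As a sketch (not shipped with this config): a user can override the resources or
// container of any labelled process from their own config supplied with
// `-c custom.config`, which Nextflow merges over the defaults below, e.g.
//
//     process {
//         withLabel:wfporec {
//             memory = '32 GB'   // hypothetical override
//         }
//     }
//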
92 | process { 93 | withLabel:wfporec { 94 | container = "ontresearch/wf-pore-c:${params.wf.container_sha}" 95 | } 96 | withLabel:pore_c_py { 97 | container = "ontresearch/pore-c-py:${params.wf.pore_c_py_sha}" 98 | } 99 | withLabel:wf_common { 100 | container = "ontresearch/wf-common:${params.wf.common_sha}" 101 | } 102 | shell = ['/bin/bash', '-euo', 'pipefail'] 103 | withName: "merge_coordsorted_bams|merge_namesorted_bams" { 104 | publishDir = [ 105 | path: { "${params.out_dir}/bams/" }, 106 | mode: "copy", 107 | pattern: '*.{bam,csi,bai}' 108 | ] 109 | } 110 | withName: "merge_pairs|merge_pairs_stats|merge_mcools|create_restriction_bed|pair_stats_report" { 111 | publishDir = [ 112 | path: { "${params.out_dir}/pairs/" }, 113 | mode: "copy", 114 | pattern: '*.{gz,stats.txt,bed,mcool,html}' 115 | ] 116 | } 117 | withName: "mosdepth_coverage" { 118 | publishDir = [ 119 | path: { "${params.out_dir}/coverage/" }, 120 | mode: "copy", 121 | pattern: '*.*' 122 | ] 123 | } 124 | withName: "merge_parquets_to_dataset" { 125 | publishDir = [ 126 | path: { "${params.out_dir}/chromunity/" }, 127 | mode: "copy", 128 | pattern: '*.*' 129 | ] 130 | } 131 | withName: "merge_paired_end_bams" { 132 | publishDir = [ 133 | path: { "${params.out_dir}/paired_end/" }, 134 | mode: "copy", 135 | pattern: '*.{bam,csi,bai}' 136 | ] 137 | } 138 | withName: "makeReport" { 139 | publishDir = [ 140 | path: { "${params.out_dir}/" }, 141 | mode: "copy", 142 | pattern: '*.{html}' 143 | ] 144 | } 145 | withName: "prepare_hic" { 146 | publishDir = [ 147 | path: { "${params.out_dir}/hi-c" }, 148 | mode: "copy", 149 | pattern: '*.{hic}' 150 | ] 151 | } 152 | withName: "mergeBed" { 153 | publishDir = [ 154 | path: { "${params.out_dir}/bed" }, 155 | mode: "copy", 156 | pattern: '*.{bed}' 157 | ] 158 | } 159 | withName: "get_filtered_out_bam" { 160 | publishDir = [ 161 | path: { "${params.out_dir}/filtered_out" }, 162 | mode: "copy", 163 | pattern: '*.{bam}' 164 | ] 165 | } 166 | } 167 | 168 | 169 | profiles { 170 | // the "standard" profile is used implicitely by nextflow 171 | // if no other profile is given on the CLI 172 | standard { 173 | docker { 174 | enabled = true 175 | // this ensures container is run as host user and group, but 176 | // also adds host user to the within-container group 177 | runOptions = "--user \$(id -u):\$(id -g) --group-add 100" 178 | } 179 | } 180 | 181 | // using singularity instead of docker 182 | singularity { 183 | singularity { 184 | enabled = true 185 | autoMounts = true 186 | } 187 | } 188 | 189 | 190 | // keep stub conda profile to prevent unknown profile warning so users get a better error 191 | conda { 192 | conda.enabled = true 193 | } 194 | 195 | // Using AWS batch. 
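    // For example (illustrative placeholder values only), such a run might be launched as:
    //     nextflow run epi2me-labs/wf-pore-c -profile awsbatch \
    //         --aws_queue <your-batch-queue> --aws_image_prefix <your-ecr-prefix> \
    //         --bam <input.bam> --ref <reference.fasta>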
196 | // May need to set aws.region and aws.batch.cliPath 197 | awsbatch { 198 | process { 199 | executor = 'awsbatch' 200 | queue = "${params.aws_queue}" 201 | memory = '8G' 202 | withLabel:wfporec { 203 | container = "${params.aws_image_prefix}-wf-pore-c:${params.wf.container_sha}" 204 | } 205 | withLabel:pore_c_py { 206 | container = "${params.aws_image_prefix}-pore-c-py:${params.wf.pore_c_py_sha}" 207 | } 208 | withLabel:wf_common { 209 | container = "${params.aws_image_prefix}-wf-common:${params.wf.common_sha}" 210 | } 211 | shell = ['/bin/bash', '-euo', 'pipefail'] 212 | } 213 | } 214 | aws.region = 'eu-west-1' 215 | aws.batch.cliPath = '/home/ec2-user/miniconda/bin/aws' 216 | 217 | // local profile for simplified development testing 218 | local { 219 | process.executor = 'local' 220 | } 221 | } 222 | 223 | 224 | timeline { 225 | enabled = true 226 | overwrite = true 227 | file = "${params.out_dir}/execution/timeline.html" 228 | } 229 | report { 230 | enabled = true 231 | overwrite = true 232 | file = "${params.out_dir}/execution/report.html" 233 | } 234 | trace { 235 | enabled = true 236 | overwrite = true 237 | file = "${params.out_dir}/execution/trace.txt" 238 | } 239 | 240 | env { 241 | PYTHONNOUSERSITE = 1 242 | JAVA_TOOL_OPTIONS = "-Xlog:disable -Xlog:all=warning:stderr" 243 | } 244 | -------------------------------------------------------------------------------- /nextflow_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/draft-07/schema", 3 | "$id": "https://raw.githubusercontent.com/./master/nextflow_schema.json", 4 | "title": "epi2me-labs/wf-pore-c", 5 | "workflow_title": "Pore-c Workflow", 6 | "description": "Workflow for analysing Pore-c data for chromatin conformation capture.", 7 | "demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-pore-c/wf-pore-c-demo.tar.gz", 8 | "aws_demo_url": "https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-pore-c/wf-pore-c-demo/aws.nextflow.config", 9 | "url": "https://github.com/epi2me-labs/wf-pore-c", 10 | "type": "object", 11 | "definitions": { 12 | "input_options": { 13 | "title": "Input Options", 14 | "type": "object", 15 | "fa_icon": "fas fa-terminal", 16 | "description": "Parameters for finding and handling input data for analysis.", 17 | "properties": { 18 | "bam": { 19 | "type": "string", 20 | "format": "path", 21 | "title": "Unaligned BAM", 22 | "description": "An unaligned BAM file containing Pore-C concatemer sequences.", 23 | "help_text": "This accepts one of three cases: (i) the path to a single BAM file; (ii) the path to a top-level directory containing BAM files; (iii) the path to a directory containing one level of sub-directories which in turn contain BAM files. In the first and second case, a sample name can be supplied with `--sample`. In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`." 24 | }, 25 | "fastq": { 26 | "type": "string", 27 | "format": "path", 28 | "title": "FASTQ", 29 | "description": "FASTQ files to use in the analysis.", 30 | "help_text": "This accepts one of three cases: (i) the path to a single FASTQ file; (ii) the path to a top-level directory containing FASTQ files; (iii) the path to a directory containing one level of sub-directories which in turn contain FASTQ files. In the first and second case, a sample name can be supplied with `--sample`. 
In the last case, the data is assumed to be multiplexed with the names of the sub-directories as barcodes. In this case, a sample sheet can be provided with `--sample_sheet`." 31 | }, 32 | "sample_sheet": { 33 | "type": "string", 34 | "format": "file-path", 35 | "title": "Sample sheet", 36 | "description": "A CSV file used to map barcodes to sample aliases and optionally provide per-sample parameters. The sample sheet can be provided when the input data is a directory containing sub-directories with FASTQ files.", 37 | "help_text": "The sample sheet is a CSV file with, minimally, columns named `barcode` and `alias`. Optionally, a `cutter` column can contain the name of the enzyme used per sample (see the `--cutter` parameter for more details) and a `vcf` column can be used to provide a phased VCF file per sample if you require haplotagged alignments." 38 | }, 39 | "sample": { 40 | "type": "string", 41 | "description": "A single sample name for non-multiplexed data. Permissible if passing a single .fastq(.gz) file or directory of .fastq(.gz) files." 42 | }, 43 | "analyse_unclassified": { 44 | "type": "boolean", 45 | "default": false, 46 | "description": "Analyse unclassified reads from input directory. By default the workflow will not process reads in the unclassified directory.", 47 | "help_text": "If selected and if the input is a multiplex directory the workflow will also process the unclassified directory." 48 | }, 49 | "ref": { 50 | "type": "string", 51 | "title": "Reference FASTA", 52 | "format": "file-path", 53 | "description": "A FASTA file containing the reference genome to map against." 54 | }, 55 | "vcf": { 56 | "type": "string", 57 | "title": "VCF", 58 | "format": "file-path", 59 | "description": "An optional phased VCF file that will be used to haplotag alignments." 60 | }, 61 | "cutter": { 62 | "type": "string", 63 | "default": "NlaIII", 64 | "description": "The enzyme used in the restriction digest.", 65 | "help_text": "Any enzyme from the Biopython restriction dictionary can be used. See `https://github.com/biopython/biopython/blob/master/Bio/Restriction/Restriction_Dictionary.py`. This can also be defined per sample: see `--sample_sheet` parameter." 66 | } 67 | }, 68 | "allOf": [ 69 | { 70 | "required": [ 71 | "ref" 72 | ] 73 | }, 74 | { 75 | "oneOf": [ 76 | { 77 | "required": [ 78 | "fastq" 79 | ] 80 | }, 81 | { 82 | "required": [ 83 | "bam" 84 | ] 85 | } 86 | ] 87 | } 88 | ] 89 | }, 90 | "output_options": { 91 | "title": "Output Options", 92 | "type": "object", 93 | "description": "Parameters for saving and naming workflow outputs.", 94 | "default": "", 95 | "properties": { 96 | "out_dir": { 97 | "type": "string", 98 | "default": "output", 99 | "format": "directory-path", 100 | "description": "Directory for output of all user-facing files." 101 | }, 102 | "hi_c": { 103 | "type": "boolean", 104 | "title": "Hi-C", 105 | "default": false, 106 | "description": "Output a Hi-C formatted file; will convert pairs format to a Hi-C (`.hic`) file which will be compatible with [juicer](https://github.com/aidenlab/juicer).", 107 | "help_text": "Load this file with [Juice box](https://www.aidenlab.org/juicebox/) for an alternative contact map visualisation." 108 | }, 109 | "bed": { 110 | "type": "boolean", 111 | "title": "BED", 112 | "default": false, 113 | "description": "Output a BED file of the paired-end BAM alignments for use with downstream tools. 
Setting this to true will also trigger creation of the paired-end BAM.", 114 | "help_text": "Will use the paired-end BAM to create a BED file compatible with downstream tools including scaffolding tool [Yahs](https://github.com/c-zhou/yahs)." 115 | 116 | 117 | } 118 | } 119 | }, 120 | "advanced_options": { 121 | "title": "Advanced Options", 122 | "type": "object", 123 | "description": "Avanced options for configuring processes inside the workflow.", 124 | "default": "", 125 | "properties": { 126 | "chunk_size": { 127 | "type": "integer", 128 | "default": 20000, 129 | "description": "Process input in chunks of this number of reads.", 130 | "help_text": "To reduce per-process memory requirements for large datasets, process the inputs in chunks of reads. Set to 0 to process entire dataset in one go." 131 | }, 132 | "threads": { 133 | "type": "integer", 134 | "default": 4, 135 | "description": "Set maximum number of threads to use for more intense processes (limited by config executor cpus). We recommend a minimum of 4, but if available 19.", 136 | "help": "Increasing this will speed up some individual processes, but reduce the number of processes that can run in parallel, potentially increasing the time for the workflow to run overall." 137 | } 138 | }, 139 | "help_text": "These advanced options do not need to be changed for typical use, but allow fine tuning of workflows for users who want more control over the workflow." 140 | }, 141 | "pore_c_tools_options": { 142 | "title": "Pore-C Tools Options", 143 | "type": "object", 144 | "description": "Parameters to control the pore-c tools", 145 | "properties": { 146 | "minimap2_settings": { 147 | "type": "string", 148 | "default": "-x map-ont", 149 | "description": "The minimap2 settings for mapping monomers." 150 | }, 151 | "max_monomers": { 152 | "type": "integer", 153 | "title": "Maximum monomers", 154 | "default": 250, 155 | "description": "The maximum number of monomers allowed for a read to be included in downstream analysis.", 156 | "help": "Any reads that have more than this number will be filtered out, and output in a per sample filtered_bam file." 157 | }, 158 | "coverage": { 159 | "type": "boolean", 160 | "default": false, 161 | "description": "Calculate restriction-fragment coverage using mosdepth." 162 | }, 163 | "summary_json": { 164 | "type": "boolean", 165 | "title": "Summary JSON", 166 | "default": true, 167 | "description": "Output pore-c-py annotation summary in json format." 168 | } 169 | } 170 | }, 171 | "chromunity_options": { 172 | "title": "Chromunity Options", 173 | "type": "object", 174 | "description": "Create files for Chromunity analyses.", 175 | "properties": { 176 | "chromunity": { 177 | "type": "boolean", 178 | "default": false, 179 | "description": "Create parquet files for Chromunity.", 180 | "help_text": "See the chromunity documentation for further details 'https://github.com/mskilab/chromunity'." 181 | }, 182 | "chromunity_merge_distance": { 183 | "type": "integer", 184 | "default": -1, 185 | "description": "Merge colinear alignments separated by less than this base pair distance into a single monomer." 
186 | } 187 | } 188 | }, 189 | "4dn_files_options": { 190 | "title": "4DN files Options", 191 | "type": "object", 192 | "description": "Create files for the 4D nucleome toolset.", 193 | "properties": { 194 | "pairs": { 195 | "type": "boolean", 196 | "default": false, 197 | "description": "Create a 4DN-format pairs file (also calculate stats).", 198 | "help_text": "Outputs a directory with a pairs stats report and a pairs file which can be used for downstream anaylsis." 199 | }, 200 | "pairtools_chunksize": { 201 | "type": "integer", 202 | "default": 100000, 203 | "description": "Number of pairs to be processed in each chunk in the prepare_hic process which uses the pairtools dedup tool.", 204 | "help_text": "Reduce for lower memory footprint. Below 10,000 performance starts suffering significantly." 205 | }, 206 | "mcool": { 207 | "type": "boolean", 208 | "default": false, 209 | "title": "Multi-resolution cooler file (mcool)", 210 | "description": "Create a multi-resolution cooler file. Will output the cooler formatted file which you can load with cooler.", 211 | "help_text": "See 'https://open2c.github.io/cooler' for more details." 212 | }, 213 | "cool_bin_size": { 214 | "type": "integer", 215 | "title": "Cooler file bin size", 216 | "default": 1000, 217 | "description": "The bin size of the cooler output file in base pairs.", 218 | "help_text": "See 'https://open2c.github.io/cooler' for more details." 219 | }, 220 | "mcool_resolutions": { 221 | "type": "string", 222 | "default": "1000,2000,5000N", 223 | "description": "The resolutions of the mcool file in pixels (see cooler documentation for details).", 224 | "help_text": "Comma-separated list of target resolutions. Use suffixes B or N to specify a progression: B for binary (geometric steps of factor 2), N for nice (geometric steps of factor 10 interleaved with steps of 2 and 5). This is the equivalent of the `--resolutions` flag in cooler; see an example here 'https://cooler.readthedocs.io/en/latest/cli.html'." 225 | } 226 | } 227 | }, 228 | "paired_end_bam_options": { 229 | "title": "Paired-end BAM Options", 230 | "type": "object", 231 | "description": "Create mock paired-end BAM files for legacy tools.", 232 | "properties": { 233 | "paired_end": { 234 | "type": "boolean", 235 | "title": "Paired-end BAM", 236 | "description": "Create mock paired-end BAM files.", 237 | "default": false 238 | }, 239 | "filter_pairs": { 240 | "type": "boolean", 241 | "default": false, 242 | "description": "Filter paired-end reads using minimum and maximum distance parameters." 243 | }, 244 | "paired_end_minimum_distance": { 245 | "type": "integer", 246 | "default": -1, 247 | "description": "Remove trans/cis pairs separated by a distance shorter than this." 248 | }, 249 | "paired_end_maximum_distance": { 250 | "type": "integer", 251 | "default": -1, 252 | "description": "Remove trans/cis pairs separated by a distance greater than this." 253 | } 254 | } 255 | }, 256 | "misc": { 257 | "title": "Misc", 258 | "type": "object", 259 | "description": "", 260 | "default": "", 261 | "properties": { 262 | "help": { 263 | "type": "boolean", 264 | "description": "Display help text.", 265 | "fa_icon": "fas fa-question-circle", 266 | "hidden": true, 267 | "default": false 268 | }, 269 | "disable_ping": { 270 | "type": "boolean", 271 | "default": false, 272 | "description": "Enable to prevent sending a workflow ping." 
273 | }, 274 | "version": { 275 | "type": "boolean", 276 | "default": false, 277 | "hidden": true 278 | } 279 | } 280 | } 281 | }, 282 | "allOf": [ 283 | { 284 | "$ref": "#/definitions/input_options" 285 | }, 286 | { 287 | "$ref": "#/definitions/output_options" 288 | }, 289 | { 290 | "$ref": "#/definitions/advanced_options" 291 | }, 292 | { 293 | "$ref": "#/definitions/pore_c_tools_options" 294 | }, 295 | { 296 | "$ref": "#/definitions/chromunity_options" 297 | }, 298 | { 299 | "$ref": "#/definitions/4dn_files_options" 300 | }, 301 | { 302 | "$ref": "#/definitions/paired_end_bam_options" 303 | }, 304 | { 305 | "$ref": "#/definitions/misc" 306 | } 307 | ], 308 | "properties": { 309 | "aws_image_prefix": { 310 | "type": "string", 311 | "hidden": true 312 | }, 313 | "aws_queue": { 314 | "type": "string", 315 | "hidden": true 316 | }, 317 | "monochrome_logs": { 318 | "type": "boolean" 319 | }, 320 | "validate_params": { 321 | "type": "boolean", 322 | "default": true 323 | }, 324 | "show_hidden_params": { 325 | "type": "boolean" 326 | } 327 | }, 328 | "resources": { 329 | "recommended": { 330 | "cpus": 64, 331 | "memory": "128GB" 332 | }, 333 | "minimum": { 334 | "cpus": 8, 335 | "memory": "32GB" 336 | }, 337 | "run_time": "12 hours for a 100GB input BAM using the recommended resources; this will vary depending on the number of monomers found per read.", 338 | "arm_support": false 339 | } 340 | } -------------------------------------------------------------------------------- /output_definition.json: -------------------------------------------------------------------------------- 1 | { 2 | "files": { 3 | "workflow-report": { 4 | "filepath": "./wf-template-report.html", 5 | "title": "Workflow report", 6 | "description": "Report for all samples.", 7 | "mime-type": "text/html", 8 | "optional": false, 9 | "type": "aggregated" 10 | }, 11 | "read-stats-per-file": { 12 | "filepath": "./ingress_results/reads/fastcat_stats/per-file-stats.tsv", 13 | "title": "Per file read stats", 14 | "description": "A TSV with per-file read stats, including all samples.", 15 | "mime-type": "text/tab-separated-values", 16 | "optional": false, 17 | "type": "aggregated" 18 | }, 19 | "read-stats-per-read": { 20 | "filepath": "./ingress_results/reads/fastcat_stats/per-read-stats.tsv", 21 | "title": "Per read stats", 22 | "description": "A TSV with per-read stats, including all samples.", 23 | "mime-type": "text/tab-separated-values", 24 | "optional": false, 25 | "type": "aggregated" 26 | }, 27 | "run-ids": { 28 | "filepath": "./ingress_results/reads/fastcat_stats/run_ids", 29 | "title": "Run IDs", 30 | "description": "List of run IDs present in the reads.", 31 | "mime-type": "text/plain", 32 | "optional": false, 33 | "type": "aggregated" 34 | }, 35 | "metamap": { 36 | "filepath": "./ingress_results/reads/metamap.json", 37 | "title": "Metamap JSON", 38 | "description": "Metadata used in the workflow, presented as JSON.", 39 | "mime-type": "text/json", 40 | "optional": false, 41 | "type": "aggregated" 42 | }, 43 | "sample-data": { 44 | "filepath": "./ingress_results/reads/{{ alias }}.fastq.gz", 45 | "title": "Concatenated sequence data", 46 | "description": "Per-sample reads concatenated into one FASTQ file.", 47 | "mime-type": "application/gzip", 48 | "optional": false, 49 | "type": "per-sample" 50 | }, 51 | "coord-sorted-bam": { 52 | "filepath": "./bams/{{ alias }}.cs.bam", 53 | "title": "Coordinate-sorted BAM", 54 | "description": "Coordinate-sorted BAM.", 55 | "mime-type": "application/gzip", 56 | "optional": false, 57 | "type":
"per-sample" 58 | }, 59 | "coord-sorted-bam-bai": { 60 | "filepath": "./bams/{{ alias }}.cs.bam.bai", 61 | "title": "Coordinate-sorted Bam Index", 62 | "description": "Coordinate-sorted Bam Index.", 63 | "mime-type": "application/octet-stream", 64 | "optional": false, 65 | "type": "per-sample" 66 | }, 67 | "name-sorted-bam": { 68 | "filepath": "./bams/{{ alias }}.ns.bam", 69 | "title": "Name-sorted Bam", 70 | "description": "Name-sorted Bam.", 71 | "mime-type": "application/octet-stream", 72 | "optional": false, 73 | "type": "per-sample" 74 | }, 75 | "pairs": { 76 | "filepath": "./pairs/{{ alias }}.pairs.gz", 77 | "title": "Pairs file", 78 | "description": "This file contains contact information in a human-readable tabular format, and can be used with downstream tools. See [Pairtools documentation](https://pairtools.readthedocs.io/en/latest/formats.html#pairs) for full specification.", 79 | "mime-type": "application/gzip", 80 | "optional": true, 81 | "type": "per-sample" 82 | }, 83 | "pairs-stats": { 84 | "filepath": "./pairs/{{ alias }}.pairs.stats.txt", 85 | "title": "Pairs summary stats file", 86 | "description": "Summary statistics of the pairs file. See this [overview](https://pairtools.readthedocs.io/en/latest/stats.html) for a full specification.", 87 | "mime-type": "text/plain", 88 | "optional": true, 89 | "type": "per-sample" 90 | }, 91 | "pairs-report": { 92 | "filepath": "./pairs/{{ alias }}.pairs.stats.html", 93 | "title": "Pairs summary report", 94 | "description": "Pairs html report with result including an interactive contact map and statistics. See [pairsqc documentation](https://github.com/4dn-dcic/pairsqc) for further details.", 95 | "mime-type": "text/html", 96 | "optional": true, 97 | "type": "per-sample" 98 | }, 99 | "mcool": { 100 | "filepath": "./cooler/{{ alias }}.mcool", 101 | "title": "Multi-resolution cool file", 102 | "description": "Multi-resolution cool `.mcool` file which can be used with downstream tools to provide a high resolution genomic interaction matrix. See [Cool tools documentation](https://github.com/open2c/cooltools) for details on downstream analysis.", 103 | "mime-type": "application/octet-stream", 104 | "optional": true, 105 | "type": "per-sample" 106 | }, 107 | "paired_end_bam": { 108 | "filepath": "./paired_end/{{ alias }}.ns.bam", 109 | "title": "Paired-end BAM", 110 | "description": "Mock paired end BAM.", 111 | "mime-type": "application/octet-stream", 112 | "optional": true, 113 | "type": "per-sample" 114 | }, 115 | "chromunity": { 116 | "filepath": "./chromunity", 117 | "title": "Chromunity parquet files.", 118 | "description": "Chromunity directory with parquet files which can be used with the Chromunity package. Chromunity enables the nomination and statistical evaluation of high order interactions. 
See [Chromunity documentation](http://mskilab.com/chromunity/tutorial.html) for further details.", 119 | "mime-type": "text/directory", 120 | "optional": true, 121 | "type": "per-sample" 122 | }, 123 | "fragments-bed": { 124 | "filepath": "./paireds/fragments.bed", 125 | "title": "Fragments BED", 126 | "description": "File with the DNA fragments created from the virtual digest.", 127 | "mime-type": "text/tab-separated-values", 128 | "optional": true, 129 | "type": "per-sample" 130 | }, 131 | "hi-c": { 132 | "filepath": "./hi-c/{{ alias }}.hic", 133 | "title": "Hi-C for contact map", 134 | "description": "File which can be loaded into the [Juice box tool](https://www.aidenlab.org/juicebox/) for an alternative contact map visualisation.", 135 | "mime-type": "application/octet-stream", 136 | "optional": true, 137 | "type": "per-sample" 138 | }, 139 | "filtered_out": { 140 | "filepath": "./filtered_out/{{ alias }}.bam", 141 | "title": "Filtered out reads", 142 | "description": "BAM file containing any reads that were filtered out at the digest step and not included in the analysis.", 143 | "mime-type": "application/octet-stream", 144 | "optional": true, 145 | "type": "per-sample" 146 | } 147 | } 148 | } -------------------------------------------------------------------------------- /subworkflows/local/prepare_genome.nf: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env nextflow 2 | import groovy.json.JsonBuilder 3 | nextflow.enable.dsl = 2 4 | 5 | include { 6 | index_ref_fai 7 | index_ref_mmi 8 | decompress_ref 9 | } from '../../modules/local/common' 10 | 11 | 12 | 13 | workflow prepare_genome { 14 | take: 15 | ref_param 16 | minimap2_settings 17 | main: 18 | // taken from wf-human-variation 19 | // Check ref and decompress if needed 20 | ref = null 21 | ref_index_fp = null 22 | if (ref_param.toLowerCase().endsWith('gz')) { 23 | // gzipped ref not supported by some downstream tools (pyfaidx, cram_cache) 24 | // easier to just decompress and pass it around rather than confusing the user 25 | decompress_ref(file(ref_param)) 26 | ref = decompress_ref.out.decompressed_ref 27 | } 28 | else { 29 | ref = Channel.fromPath(ref_param, checkIfExists: true) 30 | ref_index_fp = file(ref_param + '.fai') 31 | } 32 | // Create ref index if required 33 | if (!ref_index_fp || !ref_index_fp.exists()) { 34 | index_ref = index_ref_fai(ref) 35 | ref_index = index_ref.reference_index 36 | } 37 | else { 38 | ref_index = Channel.of(ref_index_fp) 39 | } 40 | ref_channel = ref.concat(ref_index).buffer(size: 2) 41 | // create a minimap2 index, not strictly necessary 42 | mmi = index_ref_mmi(ref, minimap2_settings) 43 | 44 | emit: 45 | fasta = ref 46 | fai = ref_index 47 | mmi = mmi 48 | minimap2_settings = minimap2_settings 49 | } 50 | -------------------------------------------------------------------------------- /test_data/bams/barcode01/porec_test.concatemers.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/bams/barcode01/porec_test.concatemers.bam -------------------------------------------------------------------------------- /test_data/bams/barcode02/porec_test.concatemers.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/bams/barcode02/porec_test.concatemers.bam 
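A note on the prepare_genome subworkflow above: it takes a reference path and a minimap2 settings string, decompresses a gzipped reference, ensures a .fai index exists and builds a minimap2 .mmi index, emitting fasta, fai and mmi channels. Below is a minimal, hypothetical sketch of how it could be included and called from a driver script; this is illustrative only and not the workflow's actual main.nf, and params.ref plus the hard-coded settings string are assumptions made for the example.

include { prepare_genome } from './subworkflows/local/prepare_genome'

workflow {
    // Prepare the reference: decompress if gzipped, create a .fai if missing, build an .mmi.
    prepare_genome(params.ref, "-x map-ont")
    // Downstream processes would consume these channels.
    prepare_genome.out.fasta.view { "reference: $it" }
    prepare_genome.out.fai.view { "faidx index: $it" }
    prepare_genome.out.mmi.view { "minimap2 index: $it" }
}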
-------------------------------------------------------------------------------- /test_data/bams_dir/shard_0001.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/bams_dir/shard_0001.bam -------------------------------------------------------------------------------- /test_data/bams_dir/shard_0002.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/bams_dir/shard_0002.bam -------------------------------------------------------------------------------- /test_data/bams_dir/shard_0003.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/bams_dir/shard_0003.bam -------------------------------------------------------------------------------- /test_data/bams_dir/shard_0004.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/bams_dir/shard_0004.bam -------------------------------------------------------------------------------- /test_data/porec_test.concatemers.bam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/porec_test.concatemers.bam -------------------------------------------------------------------------------- /test_data/porec_test.fasta: -------------------------------------------------------------------------------- 1 | >chr1 2 | 
AACCTGATCGGACTACCGCAGGATAGAAGCGGTTGCTTAAAGCCACAGCCGGGCAGCTGTGTCAAAGGTCCCACATACAATGAGCGCTATTCCCAGACGGTGTTCTGACTGCGAGATACGTGAATACTAACCTCCAAGGGGAAAGGAATCAATCACATTGTGTACCGGCCTACTTTGAACTTGCACGAACGTTAGCGATTTAATTAACCAAACCGAGAGTGCAGTCGAGTGAGGTACACGCTCGGACTGCGTGAATGGCGTTATGTGTTTATCGTCACGCTTCCACAATTAGACAAGAATGCTTCCCAGCTTATGTCAGTTAAGGAGTTAACGATCTGTCTATTGACCATCTCGTGTATTTAGCGGGGCAAACCGACGATCTACTCCGCTCAATGTTTACCAAAGAATCGTAACTAGGTGCAGTCTCTAGCTGGCCAGTCAAAGTGTTTTGCGTATTAAGAAGAAGATATTGCGTTTATGAGCTGACTGATCGGCAGTGAAAAATCTTTGGCATTTATGGGATCGATTAGTCGGGTATTGTTGCAACAGGCTGCTGCGAAAGCTATCCTTTATAGGCAAATGAGTGACGCGCGAACGCATGACGTCGTCAAGCGGGTCTCATCACTTTTATCGACAGTATCTCGTTTGATAATTGAACCATTTGCCGTGCCAAGCCGAGGCCGTATAACCAAGGCGCCGGCATTGACTACTGTAGTGATTATGCGATCTGTCCCGCATAACCGTCTATACTGGTCCTGAAAGGTGTTCGTTGTCGTTCGATTCAAGCCTCTACCGTCGCTGGTTGCTGGCGACATTGCATAGCATACCCATTCGCTATATCGAGCTGACGTTATTGGCTAACGCTTAGTGTGTCAAGTCCTGGTCTGGGAATGATCGGAGCAACGGGATGCTCAACGCGCTACGAATTAGTTGGTCCGGGAGCGAGCGTGTTGCGATCTAACTTCGTTCAGAGACAGGCCTGCCAATCCAACATAGCTATGTTCACGCTTGCATTTAATCTCGTCACCCACCGCACATTAACCGGGGAACGTACACCAGAATGTAGGGCTGGCGAATCAAGAAGGGCGGGGACCCACGGCATGCCTTCGGTTTTACTAGACACGCAGCATCCCATTTGGAGCTCGGGGTAGATCTGTCGGCCCGCGGGCTCGACCGTACCAAGAATACTGCGCGATGCATAGGACCTCGGAGGACTTTGCGACTATTATTACCGAAGAAGATTTTGTTCGTGCAATACGGTGTGTTTGAGGCCGGCCAAGTAGCATCTTGGAATTTATCCACTAACTATCCGAGCCTGGTTGGGATGCTGATCGATTGTTGACACCTCACATACGATTGCGCCATTTGGAAGGACTGGATTTGCGCTGTCAACCTGACTGGTCTAAGATTTGCCCGCGCAGTCCCATCGGTTGATGGAGAAGGTCCTTGTAACTTATCTACAATCTAAAAAAATCGAACAAGTTGGGATCGTCCACGGTTTTAGATGTGCGAGATCAACTAGGAACGGCAGAGAACAGTCTGACTACACGTGTGAGTTCGGATAACACGTGCACGCTGCCTCGCACGGAGGTTCGCAGGATGGCATCTCGGTTTGATCCTATGAGAGCCCTTTATCTTGGAACTGCCTGCGGTACAAGCGCGGGGCCGTCCCAAGCCAAGCCAGGTAGGTAACACCCCAAGTCGTTAGACGCCTGTTCGGATGGTAGTCCACGCGTTGCACTGTGCAAAAGACCAAGAATACGCGAGGGGTAAACGCGCTTGCTTAGGCTATCGAGACGAACGGTTCACTGATTCAGTGTTAGATGATAGAACACGGAAGACGCTAAGAACCAAAAGTAACGCATTACTATAAGGAGGTAATTGGCCGATGCACCCCCAGAACGTGAATAACTTGCAGTCGCTGGGGTCGACCATCGAAGAGAAACCCATCATTATTACTGGCCCCAAGTATCTCATCGGCAGGCCTGCGCGTCCACGCTACTAACATAGTTCTCAGAGTACTGTCCCATCAGTTGTTGCTCGCAATTCCCCGCTGGGGGCTTCCGCGAATAGGAGGCCAGCTATCCAGTCCCTATACCAGTGCTGTTAGCGTCGCTATTCTGGCCTCCTAAGCCACACGGTTCTGAGATTGTTATATGATCGCTCCTCCACAGCACGGATGGACGACAGAGCCTTCTGAGCAATCCATAAAGCGACCAATTAATCGCACCACGAATACCCTCTAACAAGGGCCTAGCTTGTTAAGTGGAAGAGGCTAAGGCCAATCAAGAGGCCAGCTCACAGTCCGGTGCTTCCAGGGGCCCCTCGCCTGTACAGTATCTCCTACGACATTAACGTCTAACCTTGTCATAAATGGATAGGGGTTGTACGCTGCTTTATGGTTTTTTCAGGTCTCACGCCGAGATCCAGCCCTGAAGCAATCTCTACGTACGCGACATTACAGTGACACGATCCTGCGTTGGAACAATGGGAAATCTTTATGGGAGATTTAATATATGTAGTTGGAGCTGTAAGGGCGTAAATTTGGCTGAGACGTGCCAGCGACTCTGCTCTGTTCGGATCATTTGGTCATTGAAGTCTCGAATTGCGGGGGCAATAACCCGAGGATCACAGTCTTGAACGAGGGTTCCTTGCCGATTTAACAACGTGTATTTGAGGTGTCTCTGTTATAACATTCGGCGTTGCATACGGACTGGGCTCAATATAGAGTTTTACTGTGTTTGAAAATTGAAGCGTCGAGTACTTACGCTCCCACTACTCGAACATCCTCCAAGCGGGCAGTTTGTGCAAAGGTTTCTTAAATCTATCACATTTTATAGACTACTCTAACGAGGATCTTCTGCAAAAATCCCAATTAAGTGTGATACTAGGGGTCGCCGTAGAAGAATGAATGCCATTCAAGGTTAGGTATCCACGACAGAAGCCATCGTAAATAGGCCGTCGATACAGGGTCGATGGAATTGTGGGCTCCAATGGGACATGTTGCCTAACGATGGGGACGCGTTTGTAAGGAAATCTGAAATTTCGACTACCTCCAGTCATCCACTCGCGGTACTTCTCTCGCAGTAGATTTACGTGTAAAAAATGTCCGATCTGGTATCCGAAGAGGGCGGGACCGCGACTTCGAACGCCAGATATCGGATGCTCTCGGTTAATGGAGGGTACCCCATCCTGCTACTTTGCCGAAGCCGCCAAACGTAGGAGTTAAGGCAATTAGCTGACAGAGACATATTGTCCACTCCTTGCGGATTTACTCCGTACAGACCCATCTACGGAATTCATCATAGACGATGGAATTAATCCACAGCTAAGACTACACAAAATACATAACTCCATCCGGGGCGGGCCAGCCGCGCACCCATTGTGTTACCGTGTAGGCCTACCATTATAACGTTGAGGACGCAAGGATCAGTTAAGCCTCCGATGGACTGTGAAAAGCAAGCAAGACCACGGCTAGTACGGTAATACTCTCTAGATGCTTAGCTCATCCGCACGCAACCACCCCATTCTTCTGATGCGGCAGCTAGGAGGGTA
CGACCCTTCGGGGCGGTTCATG 3 | >chr2 4 | ACGAGGGTCGGGGCACTGGACTTTGGAGCCCCTCGTGACAATGCAGGTTTTCAGCATCGTTTGTGAGGTGTGTTCTGTTTTACTTGAATGTAGCGAGTCGTTATTAGGCCCTGCGGCGCCGCATTTGGGTATCGCTTCCGGACACTTTATGGCCATCGCCCCGGTGTTGGACGGATATCGATACCAACAGGGAGTATTGTAGGGGCTTAGCAACAGACCTACATCCAGCTGCGAGCGGTCTCGAAAGGAATGTTATTGCATCACCGTCCGTCTCGTGATGTCCGTAAAAGATAGACGTGGCCTTGGCGGACCGGAGGAAGGTTGGGACGCAAGTCATCTCCAGCCACCGTAGTCTCTTCAACTTCCTGCCGCAGACCAGTACCGATCAGCGCTGGATTTTTCAGAACACACGAGGCGACCATACGCCAATGTTCGTATCCTTCGCTAACCACCACCACCGTGAGTTCAGCTAGATCCATTCTGTTTAATCCCTACAGGCACTCATTTTGGGCGAGCCATACCGTGAATGGAGTCATGCTTGTAGCGTCCGAGGTCCTGACGGCGTCGTTCTTCTCCGGTCTCCCGAGGCTAAGTGGACAATCGAGTATAGTCGGTATAGCGTTGCTGAAATCATAGTGGTCGGACTCTACTATGTAGGCGGTACGTAATGGCGATGCCTCTCAGTGGGATGCCCTTGTAAAAGACAGATTCAGCGATGTTAACTCTATCGTGTAAGCGCGAGGGGACTGTGCATTGATATGGTCCTTTGATTACTCAAATGGATCCGTACTAAACCCTCCGGAACGGTTACGGGACGCTGTGGCGCTAGGGACTTCCTGAAGAGTTATATTAGGATTCGTTCCGGGCCAAGGGGCTTTGCTATCAGGGCTGTTCGTCATCGCTCTATACCTATGACCGTATATGAGGACGTCAGGTGCTAGGAACACTGAGACATTTCTGAGTGTGGCGGCCCGCGCGGAAAGGTGAAGTAATTCCAATGCACAAAGGAGTAACGGTTCGTTCACTGGTCAAACTCAAGTGGTGGACTAGGACTGATTAGTTCCTGGGTGACATCACCAGCCCGTCGCGCATAGAACGCCGGCCGAGTGCCACACGACGTGCTCAGTGATTTATTTGCACTACAGTTACAGACAGGAGTGCGTCAAAGTCCCCCCCCAATATGCGAGTTTTAAGACCTTTGCTAGGTGGTTAACAACTGTGCGTCTAGACGTTATCTGACTATGTCCCGCTTTTGTGAAGTGACGCGCAATCTGAGGTGCCCGATATTGACCCCTCCTCGGGCTTGAGCGCAAGTCGGGTACCGCTAGTAGTACAAGGAGCAACGTTGTTTATTAGGGTATACTCAAAAAAAGAGGGATCTGGAGAAGTGAGTTACCTTGTCTAAGAATTATCCGGCTACAATAATAAGCGTCAAGGCAGCGGACGTTTCGACAGTCACTCGAAGACATAGGGGTACGGCAGTATCCACCTAGGGTCGCCCGTGATAACCTTGAGCCCTGGGATAGCCCGCATCATACCAATGGAATTTACTCTGACCATAATCTAGATAGCCTAGACTAGGATCTGCCCCGAAGCCGATATTCAGTCTGATACAAGAAACGTTATATGCCCCTATGATAAGCGTTGCCACTGGTCCCTACCGTAACAAGGCTTCAGTCTTCTGACGCGCTTCAGGGCTCATCGCTTGAGGGCGCAAAATTACTAGTAATGGACTCTATCTGCAAACTACAGCGCTACGTATAGAACTCGGCAGAGGGGATAATATATAAAACTGACGTTGTTTTAGAGCACCGAGATGAGCTTTTGTCTGATGAGCTCAAGAACGTACTTCATCCTCATACAAGATTTTAGGACGACCCCGGATGGGGGGGGAGACTGTATTCGATGCCTGGCCCAGTGTGCGTGCCATCGCAAGTGGCTGTACCGCAGCCCCTGAAACGAGTGCAAGTTGCTGGGACTATACAAATAAGTGGTCGAAGCCTATTTGCGTAGCACACGTCGCCATTCGGTGTAATTAGGCCGCCGTAATGTCTAAGTATGAGCTGACGACTTCAAGGTAAATTAGCACTTTAGTAAACCCAAGTTCAAACGTAGTTAATCAAGCCACTAATAACATTTCCCTTAAGGCATCGTAATCTGAAACTTCCACTGAGGGGTCAGGCACCGATCCTAATATGTCTTTTATCATTACACTCGCTACGCTGAGCACAGACGATAATGACCTGTATCGACTTTTCGATTGATTAGATCCAAAGATTGCAGAGGGTCTCGGCCGCCTGGTTTTAAGAATACGCAATGTAGCGTTTAGCGGATGTTCTGACGCCACTGCTCTGCTGGCGTCTGGATGGCAAGACTATTAGAGTGAGTGATGGCGTCAGCTGGCACCTCGGGGGAATTAGGTTTATAGTGCGCCTTGCACGCACAACTCCCAGTGAGGCGGCTGACTCGGGATACTTGCCGGACACTACAACTCCGGGGAAGCTCAGAGTCTCTTGCAGTAAGGCGGGCGGGTTTACACTGATCAGTGCCCCTCTCGGCGGGTGGTAAACGAGGCATTCAATCGCACAGCAAGAGAAAATCATAATTCAACCGAAAGAGTTAGAAAATCCCAAGACGAGGCGGGTTGGGCAATAAACCACTCAGTAACCTACAACAATCAAGTCCTCGCGGCCCACCAAATAGTGACCGCTTCTTAAATGTTTCTAATCATCAAGCAGCCTGTTCTTGCAGTCGTTTCCATCAAATTGGTTCTCTTAGAAAAATACTCGGTACCTGGCTTGCACTAAGTCGAAAAAATGGGCACAACGTAGACGCAGGGGCGAAATCAACGGGATACGTGTTGCGTCGTTACGCCCGCTTCCAATCAACCTACGGCTGCCTATGGGCGCAATTGCGGGGCTGTAGCTTCCTGACTTTATTGGTGCGGGTTCTATATGTGGTTAAAAGACGTTCTAGCTATTTTGGAATTGTAAATTCCCGGTTGTGACGCCATCACCTCACCTACCCCCGGTACTGGATGCTTGTCATATGCGACACGAGTCGGCACCACGAGCATTACCGAGTACGTATTTCTAGAACAAACTTACTATATGAAGGTCTTTAACCGAAGGTAGGGACAGGCCGACGGCTGAAAAAGTGCCAAGCCAAGTCCCCACTGTGGTGGAACTCAAGGGTGAGTGGACCTAAAGAGCCCAAAAGAATCAAGTGTCTAGGACTTCAATAAGCGCGCGGCAGTAAGAACAGTCGCACCCAACGGACTTTCCTGGGAGGCCTGTCTATCTGCTCATTCGTGTATTATCCCCTTTTGCAAGTGCCAGTGCGGCTAACCGTGGGATAGTGAGGGCAGGGATTGCACTCTCGTTGCCCTTCCCGAAGCAAGTACAAGAGATCACTCTGGTATGGTCATACTCAGAATGCACGGGCCCTCCGGTCGGGTCTGATGCGAAGCTGCCCTCCAGCTTCCATTCCGAAAGAGTACTATGACCAGGAACTCCCTA
CGACTATCTAACCCAGTACTCGCGACTTAACTATCTAGCGTTAACCTTTTGCCGGCCGACATTAACCCAAACCTAGAGCCGCAAGACGAACCCGTCCCGCGTACTTTAGGTCTAGCCTAGTCCGGTAATATAGGTCGATGTGGGCAGGGTTCTCGAGCCTAGATGTTCACTGACCGGGAGTAGGCCGACATCAGGCCCGAAGCCGAGGCAGTGTCCAATGGTATGACCCGCAGCACAATAATACGACATCCCCGAAAACATTAGGCTGACAAGAATCGTATTGCACCAACGCGGATAGTAGACTGCTCCTTGGGAATAATGTTAGTTTGTGGCAGTAGGAGGAGATTTAGAAGTTCTCTTCTGGTATCTCCCGCAACGCGGTTCCCGGGCGAGGGGAAGCCTGCGTCCTGGCGAAATTCTGCGTACCTATGTGGGCACCACGGTTAGGCAGTAACGTCTAGTAGCGCTACGGTATCGGGATACTGGCGGCCGTAGTGAATCATACACCTGGAGCCGGGCTCGTAAAGGAGTCTTCAGGTCCATTAATTTCGAATTCAGGGCCGCTTTGCGAGATCGCCGTAATCCTAGCGGGGTTTTCCTCTAAGGAGGTAGACGTGACCATCCCACAAATCAAAAGTCTGATGCCTGGAACCATACTTCAGGCGCCGTCTGAGACCCCTTGTGCGCAGTAAAATTGCTACTTTTTACAAGATTCGTGACCGGAGAGGTGGATGAGGCCCGGGATTAGACGAACGGTTTCCCGTAGGCCGCTACACGGGGGCGGGGCCAAACACTATGTGTATTGTCCCATAACGAGACTTTGCTGGCTGCTTCCCACCAGACGCAATTTAAGTCAATTTTATACAGTTGGGCTCTTGCCAGCACAATAGGCAGGTTCCTCAAAAATAACATCCCTCGGCGCCATAGAGCCGTACACCGTAGTCTGATATCCTGCGCGTCGTGTCTACAGAGTTGTTAAAGAAGGCTTATGCCGTTTTCGCACGCCTAGCACGGAACTCTAATTTCTATTAGAAAGAATCACTGGTTTTGACAAGGTTAGGACGGCTACACTGCCAAACCGCGCGCACAGCTTATCAAAGAGAGATTGAATGGCGTACACGTACTTCTCGGGACTAGGTCCACTCGAGACCCACCACTGAGGCACCCCGTCCCAGATTTATTCCTAGGACACTCCTACTAGTCAACGGAATTCTGTGGGACTCCGGCTGCCCTGTGCACCGATCTGTTCTAGTATCTATGTACCAGGCAGATTTAGTACACTGGAGAAATACGTCCTCCCGGGCAAAGCTGTTCGTCCCTTGCTTGTGGATCAGTGGAGGACAATATCTCAAAACCAACTCTTCAGGCTCGGTGTCGCATAGCTCGCTCCCAAGAGCTATACATTGCCATTCCCTGTCCTGCGGTAGGGGCTGCCGTCAGGTGCTGGACCGGTTCGTCGCGCGAACAGTGTCACATCGTTTGCCTTGTTATTGAGACGAACATACAGGGCGGCTCGATTTGCTTAGACAGGGCGGCGCACGAAACTGAAGCGGGCCGGGCATCTTTCGGGGTGACGTCATAATACTTGGGGGCGGAAAAAACCCTCCGGTGGGACCATACCGCGGCGTGATATCGAGAAGTTAGGCGAGACCTAAACGGAACATTCAGATACCTAAGATAACTGAATCCCTTAAACCGGTTCGGTGGAGTGCCGACCACATTAGGCCTCGTCAGACGTAGCGGCTATCGCTGGCTGTCGAGGTATGATACAGTTACAGTGTTGCTTACTGTATCCTGGGGGTCCTACACTGCCAGGAAAAGCAACTCCCACAACTTATTGTAACAACGCCTGCTGGATCTGGCTACGGAAGGGCACGAGCCATTGGCTGATATGCGTCCTAAAACGATTTCCCGTTTTACCATGACGCGGGGTGTTTGATCGAGTAAACTGGAGGGCACGACTTGAATTGCAGCAAGATCTACACGCGCTCATGATCTATTGAATCTCATGAGCGCACCAGGGTGTGGAACACACCAAAAACGAGGTGAGCCAGCCTAGCAATGGGTAAAGTGGGCGTACAACGACCAATGGTTCCCTGGACCGCAAAAGAAAGTGTCAGACAGGAGCATTCTGTACGGAGATACCCTACCTTAATCGCAACACACAGTAGACTCACACCTTCAACTATCACACAGTATAAAGCTCTCAGACTGATTAAGGTACTGTGTGATACTGAAAGCACGCTTAGTGGTGACCTGGCCACTAACGCACGGCTACGGCGCTTCGGCGCCTCCTCCAGGTCTGCGTGTGATACCGACTTGCTTATGAGTATAATAGGCTAGGCTCTTATTGTGTAGAGGGCCTCAAGACCTATTTGTTGTTAAGTCGTGGGCGTTGCATCCAGGAGTTATCTATGGTTGATACTCGCCCCAGCTTAAGCACTCTGAGTCCCGTCCATAATTTATGGCTAGACAGCGCTCGGACTTCCTTACCTAGTATATCCGTTCACAGCAGGAAGAACCCCATACTGAGCTATAGATTCTTTTCCGTCCTCTCAGCCAGAAGCTTTTTCAGATATCGTGCACAGGATAAGGGGAATTACATTCGCAAGGTTCTTACTCGCCAACTCAGGTTTGGCTATTACAAGTTGCAGCACACCGGAAAAAAGGGTATCCCGGAACTTTTTTTGTAGCCAGCAGGTCTCCAACGGAAAAGGGGAAGCCCCCACTTTTCCTGCTAACTTTCGAACGTGGCCTGTGGCGTGACCCATATATGTGAATGGAGCGTAATGGCCCCTAAAGTAACTTCTTCCGCCTAACACCAATGCTGCGTGTACTTAGTTCAATACCCTTGCAATATAGCGATATCAAGTAAGTCACTTCTAACTGCACTGAGTAATCGTGCGTGGCGTTGGCTTGAGACCGTGATGTGGCATGAATAAGTCTGACTGAATGCCCACCGGCTCCACCTTAGGTCAAACTCAGACGCGTTGCTCGGACCCCTCAATACGCAACTCATGGTGCTCCTATTCACGGGTTCTCGGGCCGCTATTGCGGTAACGATAACGCCAAGGTGTCAACAAGAAAGGAGCACGCCTCTCCATAATAACAACAAAACTATGGTAGAATGAACGTTAGTCAGCTCCGCATAGCATTCAGTGACCTGCTCGACTGCCGGTCCTGCAGCATAGTCCGTTGACGACCGGAGAGATCATAGACTGCAAACAGCTTTTAGCCTGATACATGGAAACGTACAAGAAAGGATGAGGAGAGGTCACAGGATATCAATCAGTTAAGACACTGCCCTACTCTTGTAATGTCGACGTTGGCTACCACCTAGAGGAGGTAGTAGTGGGGTTGGTACTCTTATAAAGGCACTGCCATTACGCGATCGCGATGTTTTTGACAGATTTTTTCTGAGGTCCAGTGCCGCTTAGATCCATTCGACCTCGGGGCATTCGCAGTGCTTAAGGGGTAGTAAGCCTATGCTACGTATAGGCTAGCGACCTTCTGCTCACTGCTTAGGCCATTCACCTAGGCGAGTTTCACTATTTC
CCTAGCGAACGGACTTGGGGTAGGGCTGCGGCCTAGTCGTGATGCGGGTTGATAAGCCGCGCGTCGGTTCACCCGGACCGTAGATTCGATCTCGGAGACTGTGGGGAGAACCCTCTGTTGGCCGGGGAAGTCGGGCTCCTCACTAATACCGTATGGGTGGTAGAAATAACAGATGGGATACCCTACTTGGTAAGGAGGGTGCAAGCCTTACACCTTGAAATTAGCCCAGAAGTATCTTATCGAGAACTGCTAGCCAATAATTGCCCGCTTGACCGCACGTATCAGGTAATGTGTACGCCACGTTGGCGCGGAGATACCTGATACGAAAAGAATCCGTTGGTCAGTCCTACAGTGTCTCTCTCGTCATCAAGCGCCCCTATCGAGCCGACGTCTCGTGCGAGTCGGCTCTCTGTGTACAAATTGCCCGCGGGAGCATCTGCAGCCTTCCGCCTTAATAGCCAAATGAGCATCCGCCATG 5 | -------------------------------------------------------------------------------- /test_data/porec_test.fasta.fai: -------------------------------------------------------------------------------- 1 | chr1 3577 6 3577 3578 2 | chr2 7551 3590 7551 7552 3 | -------------------------------------------------------------------------------- /test_data/porec_test.params.json: -------------------------------------------------------------------------------- 1 | {"seed": 42122, "genome_size": 10000, "num_chroms": 2, "cut_rate": 0.001, "enzyme": "NlaIII", "num_concatemers": 1000, "num_haplotypes": 2, "variant_density": 0.05, "p_cis": 0.8, "mean_frags_per_concatemer": 10, "max_frags_per_concatemer": 10} -------------------------------------------------------------------------------- /test_data/porec_test.phased_variants.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/porec_test.phased_variants.vcf.gz -------------------------------------------------------------------------------- /test_data/porec_test.phased_variants.vcf.gz.tbi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/porec_test.phased_variants.vcf.gz.tbi -------------------------------------------------------------------------------- /test_data/porec_test_no_index.phased_variants.vcf.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/epi2me-labs/wf-pore-c/10caa387bc5f75be324568712c82bc8801ea1b71/test_data/porec_test_no_index.phased_variants.vcf.gz -------------------------------------------------------------------------------- /test_data/sample_sheet.csv: -------------------------------------------------------------------------------- 1 | flow_cell_id,kit,experiment_id,barcode,alias,type,vcf 2 | FA026858,SQK-RBK004,sequencing_20200522,barcode01,s01,test_sample,test_data/porec_test.phased_variants.vcf.gz 3 | FA026858,SQK-RBK004,sequencing_20200522,barcode02,s02,test_sample,test_data/porec_test.phased_variants.vcf.gz -------------------------------------------------------------------------------- /test_data/sample_sheet_cutter.csv: -------------------------------------------------------------------------------- 1 | flow_cell_id,kit,experiment_id,barcode,alias,type,cutter,vcf 2 | FA026858,SQK-RBK004,sequencing_20200522,barcode01,s01,test_sample,NlaIII,test_data/porec_test.phased_variants.vcf.gz 3 | FA026858,SQK-RBK004,sequencing_20200522,barcode02,s02,test_sample,NlaIII,test_data/porec_test.phased_variants.vcf.gz -------------------------------------------------------------------------------- /test_data/sample_sheet_no_tbi.csv: -------------------------------------------------------------------------------- 1 | flow_cell_id,kit,experiment_id,barcode,alias,type,vcf 2 | 
FA026858,SQK-RBK004,sequencing_20200522,barcode01,s01,test_sample,test_data/porec_test_no_index.phased_variants.vcf.gz 3 | FA026858,SQK-RBK004,sequencing_20200522,barcode02,s02,test_sample,test_data/porec_test_no_index.phased_variants.vcf.gz -------------------------------------------------------------------------------- /test_data/sample_sheet_no_vcf.csv: -------------------------------------------------------------------------------- 1 | flow_cell_id,kit,experiment_id,barcode,alias,type 2 | FA026858,SQK-RBK004,sequencing_20200522,barcode01,s01,test_sample 3 | FA026858,SQK-RBK004,sequencing_20200522,barcode02,s02,test_sample --------------------------------------------------------------------------------